/* * * Copyright (C) 2023, KylinSoft Co., Ltd. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * */ #pragma once #include #include "MixSegment.hpp" //#include "IdfTrie.hpp" #include "idf-trie/idf-trie.h" namespace cppjieba { using namespace limonp; using namespace std; /*utf8*/ class KeywordExtractor { public: KeywordExtractor(const DictTrie* dictTrie, const HMMModel* model, const string& idfPath, const string& dat_cache_path, const string& stopWordPath) : segment_(dictTrie, model, stopWordPath), idf_trie_(idfPath, dat_cache_path){ } ~KeywordExtractor() { } void Extract(const string& sentence, vector& keywords, size_t topN) const { vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { keywords.push_back(topWords[i].word); } } void Extract(const string& sentence, vector >& keywords, size_t topN) const { vector topWords; Extract(sentence, topWords, topN); for (size_t i = 0; i < topWords.size(); i++) { keywords.push_back(pair(topWords[i].word, topWords[i].weight)); } } void Extract(const string& sentence, vector& keywords, size_t topN) const { unordered_map wordmap;//插入字符串与Word的map,相同string统计词频叠加权重 PreFilter pre_filter(symbols_, sentence); RuneStrArray::const_iterator null_p; WordRange range(null_p, null_p); bool isNull(false); while (pre_filter.Next(range, isNull)) { if (isNull) { continue; } segment_.CutToStr(sentence, range, wordmap); } keywords.clear(); keywords.reserve(wordmap.size()); for (unordered_map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { double idf = idf_trie_.Find(itr->first); if (-1 != idf) {//IDF词典查找 itr->second.weight *= idf; } else { itr->second.weight *= idf_trie_.GetIdfAverage(); } itr->second.word = itr->first; keywords.push_back(itr->second); } topN = min(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); keywords.resize(topN); } private: static bool Compare(const KeyWord& lhs, const KeyWord& rhs) { return lhs.weight > rhs.weight; } MixSegment segment_; IdfTrie idf_trie_; unordered_set symbols_; }; // class KeywordExtractor inline ostream& operator << (ostream& os, const KeyWord& word) { return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; } } // namespace cppjieba