ukui-search/libchinese-segmentation/cppjieba/KeywordExtractor.hpp

#pragma once

#include <algorithm>
#include <cmath>
#include <utility>
#include "MixSegment.hpp"
#include "IdfTrie.hpp"

namespace cppjieba {

using namespace limonp;
using namespace std;

/*utf8*/
class KeywordExtractor {
public:

    KeywordExtractor(const DictTrie* dictTrie,
                     const HMMModel* model,
                     const string& idfPath,
                     const string& dat_cache_path,
                     const string& stopWordPath)
        : segment_(dictTrie, model, stopWordPath),
        idf_trie_(idfPath,dat_cache_path){
    }
    ~KeywordExtractor() {
    }

    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);

        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(topWords[i].word);
        }
    }

    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);

        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
        }
    }

    void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {

        unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map，相同string统计词频叠加权重
        PreFilter pre_filter(symbols_, sentence);
        RuneStrArray::const_iterator null_p;
        WordRange range(null_p, null_p);
        bool isNull(false);
        while (pre_filter.Next(range, isNull)) {
            if (isNull) {
                continue;
            }
            segment_.CutToStr(sentence, range,  wordmap);
        }

        keywords.clear();
        keywords.reserve(wordmap.size());

        for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
            double idf = idf_trie_.Find(itr->first);
            if (-1 != idf) {//IDF词典查找
                itr->second.weight *= idf;
            } else {
                itr->second.weight *= idf_trie_.idfAverage_;
            }

            itr->second.word = itr->first;
            keywords.push_back(itr->second);
        }

        topN = min(topN, keywords.size());
        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
        keywords.resize(topN);
    }
private:

    static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
        return lhs.weight > rhs.weight;
    }

    MixSegment segment_;
    IdfTrie idf_trie_;


    unordered_set<Rune> symbols_;
}; // class KeywordExtractor

/// Writes a KeyWord to @p os as a JSON-like object with the fields
/// "word", "offset" and "weight". The word text is inserted as-is
/// (no escaping is applied here).
inline ostream& operator << (ostream& os, const KeyWord& word) {
    os << "{\"word\": \"" << word.word;
    os << "\", \"offset\": " << word.offsets;
    os << ", \"weight\": " << word.weight;
    os << "}";
    return os;
}

} // namespace cppjieba
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								#pragma once
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
 								#include <cmath>
 								#include "MixSegment.hpp"
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								#include "IdfTrie.hpp"
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
 								namespace cppjieba {
 								using namespace limonp;
 								using namespace std;
 								/*utf8*/
 								class KeywordExtractor {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								public:
 								    KeywordExtractor(const DictTrie* dictTrie,
 								                     const HMMModel* model,
 								                     const string& idfPath,
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								                     const string& dat_cache_path,
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								                     const string& stopWordPath)
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								        : segment_(dictTrie, model, stopWordPath),
 								        idf_trie_(idfPath,dat_cache_path){
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								    }
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								    ~KeywordExtractor() {
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								    }
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								        vector<KeyWord> topWords;
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        Extract(sentence, topWords, topN);
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
 								        for (size_t i = 0; i < topWords.size(); i++) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            keywords.push_back(topWords[i].word);
 								        }
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								    }
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								        vector<KeyWord> topWords;
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        Extract(sentence, topWords, topN);
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
 								        for (size_t i = 0; i < topWords.size(); i++) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
 								        }
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								    }
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								    void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								        unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map，相同string统计词频叠加权重
 								        PreFilter pre_filter(symbols_, sentence);
 								        RuneStrArray::const_iterator null_p;
 								        WordRange range(null_p, null_p);
 								        bool isNull(false);
 								        while (pre_filter.Next(range, isNull)) {
 								            if (isNull) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								                continue;
 								            }
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								            segment_.CutToStr(sentence, range,  wordmap);
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        }
 								        keywords.clear();
 								        keywords.reserve(wordmap.size());
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								        for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								            double idf = idf_trie_.Find(itr->first);
 								            if (-1 != idf) {//IDF词典查找
 								                itr->second.weight *= idf;
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            } else {
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								                itr->second.weight *= idf_trie_.idfAverage_;
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            }
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            itr->second.word = itr->first;
 								            keywords.push_back(itr->second);
 								        }
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        topN = min(topN, keywords.size());
 								        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
 								        keywords.resize(topN);
 								    }
 								private:
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								    static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        return lhs.weight > rhs.weight;
 								    }
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								    MixSegment segment_;
-												Optimization of IDF dictionary loading mode； Limit the maximum number of words segmentation; Other optimization;

											
										
										
											2021-06-23 15:50:19 +08:00
+								    IdfTrie idf_trie_;
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												优化多音字字典存储数据结构；部分代码及注释整理；

											
										
										
											2022-03-02 09:27:40 +08:00
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								    unordered_set<Rune> symbols_;
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								}; // class KeywordExtractor
-												Optimization of Jieba keyword extraction; Optimize memory and efficiency

											
										
										
											2021-06-07 15:37:06 +08:00
+								inline ostream& operator << (ostream& os, const KeyWord& word) {
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
 								           "}";
-												Add file content index and search funtion(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								}
 								} // namespace cppjieba