ukui-search/libchinese-segmentation/cppjieba/KeywordExtractor.hpp

/*
 *
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 */
#pragma once

#include <cmath>
#include <utility>
#include "MixSegment.hpp"
//#include "IdfTrie.hpp"
#include "idf-trie/idf-trie.h"

namespace cppjieba {

using namespace limonp;
using namespace std;

/*utf8*/
class KeywordExtractor {
public:

    KeywordExtractor(const DictTrie* dictTrie,
                     const HMMModel* model,
                     const string& idfPath,
                     const string& dat_cache_path,
                     const string& stopWordPath)
        : segment_(dictTrie, model, stopWordPath),
          idf_trie_(idfPath, dat_cache_path){
    }
    ~KeywordExtractor() {
    }

    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);

        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(topWords[i].word);
        }
    }

    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);

        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
        }
    }

    void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {

        unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map，相同string统计词频叠加权重
        PreFilter pre_filter(symbols_, sentence);
        RuneStrArray::const_iterator null_p;
        WordRange range(null_p, null_p);
        bool isNull(false);
        while (pre_filter.Next(range, isNull)) {
            if (isNull) {
                continue;
            }
            segment_.CutToStr(sentence, range,  wordmap);
        }

        keywords.clear();
        keywords.reserve(wordmap.size());

        for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
            double idf = idf_trie_.Find(itr->first);
            if (-1 != idf) {//IDF词典查找
                itr->second.weight *= idf;
            } else {
                itr->second.weight *= idf_trie_.GetIdfAverage();
            }

            itr->second.word = itr->first;
            keywords.push_back(itr->second);
        }

        topN = min(topN, keywords.size());
        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
        keywords.resize(topN);
    }
private:

    static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
        return lhs.weight > rhs.weight;
    }

    MixSegment segment_;
    IdfTrie idf_trie_;


    unordered_set<Rune> symbols_;
}; // class KeywordExtractor

/**
 * Writes a KeyWord to `os` as a JSON-like object with "word", "offset" and
 * "weight" fields. The word text is written verbatim (no escaping), and the
 * offsets are formatted by whatever stream operator is in scope for their type.
 *
 * @return the stream that was passed in, to allow chaining.
 */
inline ostream& operator << (ostream& os, const KeyWord& word) {
    os << "{\"word\": \"" << word.word;
    os << "\", \"offset\": " << word.offsets;
    os << ", \"weight\": " << word.weight;
    os << "}";
    return os;
}

} // namespace cppjieba