ukui-search/libchinese-segmentation/cppjieba/DictTrie.hpp

/*
 * The MIT License (MIT)
 *
 * Copyright (C) 2013 Yanyi Wu
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
*/
#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <stdint.h>
#include <string>
#include <unordered_set>
#include <vector>
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "DatTrie.hpp"
#include <QDebug>
namespace cppjieba {

using namespace limonp;

// NOTE(review): MAX_DOUBLE is not referenced anywhere in this header; presumably
// it is a sentinel used by other cppjieba headers -- confirm before removing.
const double MAX_DOUBLE = 3.14e+100;
// Number of space-separated columns every line of the default dictionary must
// contain (checked in DictTrie::LoadDefaultDict).
const size_t DICT_COLUMN_NUM = 3;
// Tag assigned to a user-dictionary word before any tag column is parsed.
const char* const UNKNOWN_TAG = "";

class DictTrie {
public:
    enum UserWordWeightOption {
        WordWeightMin,
        WordWeightMedian,
        WordWeightMax,
    }; // enum UserWordWeightOption

    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
        Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
    }

    ~DictTrie() {}

    const DatMemElem* Find(const string & word) const {
        return dat_.Find(word);
    }

    void FindDatDag(RuneStrArray::const_iterator begin,
              RuneStrArray::const_iterator end,
              vector<struct DatDag>&res,
              size_t max_word_len = MAX_WORD_LENGTH) const {
        dat_.Find(begin, end, res, max_word_len);
    }

    void FindWordRange(RuneStrArray::const_iterator begin,
              RuneStrArray::const_iterator end,
              vector<WordRange>& words,
              size_t max_word_len = MAX_WORD_LENGTH) const {
        dat_.Find(begin, end, words, max_word_len);
    }

    bool IsUserDictSingleChineseWord(const Rune& word) const {
        return IsIn(user_dict_single_chinese_word_, word);
    }

    double GetMinWeight() const {
        return dat_.GetMinWeight();
    }

    size_t GetTotalDictSize() const {
        return total_dict_size_;
    }

    void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
        vector<string> buf;
        DatElement node_info;
        Split(line, buf, " ");

        if (buf.size() == 0) {
            return;
        }

        node_info.word = buf[0];
        node_info.weight = user_word_default_weight_;
        node_info.tag = UNKNOWN_TAG;

        if (buf.size() == 2) {
            node_info.tag = buf[1];
        } else if (buf.size() == 3) {
            if (freq_sum_ > 0.0) {
                const int freq = atoi(buf[1].c_str());
                node_info.weight = log(1.0 * freq / freq_sum_);
                node_info.tag = buf[2];
            }
        }

        if (saveNodeInfo) {
            static_node_infos_.push_back(node_info);
        }

        if (Utf8CharNum(node_info.word) == 1) {
            RuneArray word;

            if (DecodeRunesInString(node_info.word, word)) {
                user_dict_single_chinese_word_.insert(word[0]);
            } else {
                XLOG(ERROR) << "Decode " << node_info.word << " failed.";
            }
        }
    }

    void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
        vector<string> files = limonp::Split(filePaths, "|;");

        for (size_t i = 0; i < files.size(); i++) {
            ifstream ifs(files[i].c_str());
            XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
            string line;

            for (; getline(ifs, line);) {
                if (line.size() == 0) {
                    continue;
                }

                InserUserDictNode(line, saveNodeInfo);
            }
        }
    }


private:
    void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
              UserWordWeightOption user_word_weight_opt) {
        const auto dict_list = dict_path + "|" + user_dict_paths;
        size_t file_size_sum = 0;
        const string md5 = CalcFileListMD5(dict_list, file_size_sum);
        total_dict_size_ = file_size_sum;

        if (dat_cache_path.empty()) {
            dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
        }
         dat_cache_path += VERSION;
        QString path = QString::fromStdString(dat_cache_path);
        qDebug() << "#########Dict path:" << path;
        if (dat_.InitAttachDat(dat_cache_path, md5)) {
            LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
            return;
        }

        LoadDefaultDict(dict_path);
        freq_sum_ = CalcFreqSum(static_node_infos_);
        CalculateWeight(static_node_infos_, freq_sum_);
        double min_weight = 0;
        SetStaticWordWeights(user_word_weight_opt, min_weight);
        dat_.SetMinWeight(min_weight);

        LoadUserDict(user_dict_paths);
        const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
        assert(build_ret);
        vector<DatElement>().swap(static_node_infos_);
    }

    void LoadDefaultDict(const string& filePath) {
        ifstream ifs(filePath.c_str());
        XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
        string line;
        vector<string> buf;

        for (; getline(ifs, line);) {
            Split(line, buf, " ");
            XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
            DatElement node_info;
            node_info.word = buf[0];
            node_info.weight = atof(buf[1].c_str());
            node_info.tag = buf[2];
            static_node_infos_.push_back(node_info);
        }
    }

    static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
        return lhs.weight < rhs.weight;
    }

    void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
        XCHECK(!static_node_infos_.empty());
        vector<DatElement> x = static_node_infos_;
        sort(x.begin(), x.end(), WeightCompare);
        if(x.empty()){
            return;
        }
        min_weight = x[0].weight;
        const double max_weight_ = x[x.size() - 1].weight;
        const double median_weight_ = x[x.size() / 2].weight;

        switch (option) {
            case WordWeightMin:
                user_word_default_weight_ = min_weight;
                break;

            case WordWeightMedian:
                user_word_default_weight_ = median_weight_;
                break;

            default:
                user_word_default_weight_ = max_weight_;
                break;
        }
    }

    double CalcFreqSum(const vector<DatElement>& node_infos) const {
        double sum = 0.0;

        for (size_t i = 0; i < node_infos.size(); i++) {
            sum += node_infos[i].weight;
        }

        return sum;
    }

    void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
        for (size_t i = 0; i < node_infos.size(); i++) {
            DatElement& node_info = node_infos[i];
            assert(node_info.weight > 0.0);
            node_info.weight = log(double(node_info.weight) / sum);
        }
    }

private:
    vector<DatElement> static_node_infos_;
    size_t total_dict_size_ = 0;
    DatTrie dat_;

    double freq_sum_;
    double user_word_default_weight_;
    unordered_set<Rune> user_dict_single_chinese_word_;
};
}
同步子项目 2024-01-30 14:42:09 +08:00			`/*`
同步子项目 2024-02-06 09:45:29 +08:00			`* The MIT License (MIT)`
同步子项目 2024-01-30 14:42:09 +08:00			`*`
同步子项目 2024-02-06 09:45:29 +08:00			`* Copyright (C) 2013 Yanyi Wu`
同步子项目 2024-01-30 14:42:09 +08:00			`* Copyright (C) 2023, KylinSoft Co., Ltd.`
			`*`
同步子项目 2024-02-06 09:45:29 +08:00			`* Permission is hereby granted, free of charge, to any person obtaining a`
			`* copy of this software and associated documentation files (the "Software"),`
			`* to deal in the Software without restriction, including without limitation`
			`* the rights to use, copy, modify, merge, publish, distribute, sublicense,`
			`* and/or sell copies of the Software, and to permit persons to whom the`
			`* Software is furnished to do so, subject to the following conditions:`
同步子项目 2024-01-30 14:42:09 +08:00			`*`
同步子项目 2024-02-06 09:45:29 +08:00			`* The above copyright notice and this permission notice shall be`
			`* included in all copies or substantial portions of the Software.`
同步子项目 2024-01-30 14:42:09 +08:00			`*`
同步子项目 2024-02-06 09:45:29 +08:00			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS`
			`* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL`
			`* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR`
			`* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,`
			`* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR`
			`* OTHER DEALINGS IN THE SOFTWARE.`
同步子项目 2024-01-30 14:42:09 +08:00			`*`
同步子项目 2024-02-06 09:45:29 +08:00			`*/`
删除子项目，upstream分支将只作为打包时同步代码的分支 2023-12-28 10:27:40 +08:00			`#pragma once`

			`#include <iostream>`
			`#include <fstream>`
			`#include <map>`
			`#include <string>`
			`#include <cstring>`
			`#include <cstdlib>`
			`#include <stdint.h>`
			`#include <cmath>`
			`#include <limits>`
			`#include "limonp/StringUtil.hpp"`
			`#include "limonp/Logging.hpp"`
			`#include "Unicode.hpp"`
			`#include "DatTrie.hpp"`
			`#include <QDebug>`
			`namespace cppjieba {`

			`using namespace limonp;`

			`const double MAX_DOUBLE = 3.14e+100;`
			`const size_t DICT_COLUMN_NUM = 3;`
			`const char* const UNKNOWN_TAG = "";`

			`class DictTrie {`
			`public:`
			`enum UserWordWeightOption {`
			`WordWeightMin,`
			`WordWeightMedian,`
			`WordWeightMax,`
			`}; // enum UserWordWeightOption`

			`DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",`
			`UserWordWeightOption user_word_weight_opt = WordWeightMedian) {`
			`Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);`
			`}`

			`~DictTrie() {}`

			`const DatMemElem* Find(const string & word) const {`
			`return dat_.Find(word);`
			`}`

			`void FindDatDag(RuneStrArray::const_iterator begin,`
			`RuneStrArray::const_iterator end,`
			`vector<struct DatDag>&res,`
			`size_t max_word_len = MAX_WORD_LENGTH) const {`
			`dat_.Find(begin, end, res, max_word_len);`
			`}`

			`void FindWordRange(RuneStrArray::const_iterator begin,`
			`RuneStrArray::const_iterator end,`
			`vector<WordRange>& words,`
			`size_t max_word_len = MAX_WORD_LENGTH) const {`
			`dat_.Find(begin, end, words, max_word_len);`
			`}`

			`bool IsUserDictSingleChineseWord(const Rune& word) const {`
			`return IsIn(user_dict_single_chinese_word_, word);`
			`}`

			`double GetMinWeight() const {`
			`return dat_.GetMinWeight();`
			`}`

			`size_t GetTotalDictSize() const {`
			`return total_dict_size_;`
			`}`

			`void InserUserDictNode(const string& line, bool saveNodeInfo = true) {`
			`vector<string> buf;`
			`DatElement node_info;`
			`Split(line, buf, " ");`

			`if (buf.size() == 0) {`
			`return;`
			`}`

			`node_info.word = buf[0];`
			`node_info.weight = user_word_default_weight_;`
			`node_info.tag = UNKNOWN_TAG;`

			`if (buf.size() == 2) {`
			`node_info.tag = buf[1];`
			`} else if (buf.size() == 3) {`
			`if (freq_sum_ > 0.0) {`
			`const int freq = atoi(buf[1].c_str());`
			`node_info.weight = log(1.0 * freq / freq_sum_);`
			`node_info.tag = buf[2];`
			`}`
			`}`

			`if (saveNodeInfo) {`
			`static_node_infos_.push_back(node_info);`
			`}`

			`if (Utf8CharNum(node_info.word) == 1) {`
			`RuneArray word;`

			`if (DecodeRunesInString(node_info.word, word)) {`
			`user_dict_single_chinese_word_.insert(word[0]);`
			`} else {`
			`XLOG(ERROR) << "Decode " << node_info.word << " failed.";`
			`}`
			`}`
			`}`

			`void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {`
			`vector<string> files = limonp::Split(filePaths, "\|;");`

			`for (size_t i = 0; i < files.size(); i++) {`
			`ifstream ifs(files[i].c_str());`
			`XCHECK(ifs.is_open()) << "open " << files[i] << " failed";`
			`string line;`

			`for (; getline(ifs, line);) {`
			`if (line.size() == 0) {`
			`continue;`
			`}`

			`InserUserDictNode(line, saveNodeInfo);`
			`}`
			`}`
			`}`


			`private:`
			`void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,`
			`UserWordWeightOption user_word_weight_opt) {`
			`const auto dict_list = dict_path + "\|" + user_dict_paths;`
			`size_t file_size_sum = 0;`
			`const string md5 = CalcFileListMD5(dict_list, file_size_sum);`
			`total_dict_size_ = file_size_sum;`

			`if (dat_cache_path.empty()) {`
			`dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下`
			`}`
			`dat_cache_path += VERSION;`
			`QString path = QString::fromStdString(dat_cache_path);`
			`qDebug() << "#########Dict path:" << path;`
			`if (dat_.InitAttachDat(dat_cache_path, md5)) {`
			`LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;`
			`return;`
			`}`

			`LoadDefaultDict(dict_path);`
			`freq_sum_ = CalcFreqSum(static_node_infos_);`
			`CalculateWeight(static_node_infos_, freq_sum_);`
			`double min_weight = 0;`
			`SetStaticWordWeights(user_word_weight_opt, min_weight);`
			`dat_.SetMinWeight(min_weight);`

			`LoadUserDict(user_dict_paths);`
			`const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);`
			`assert(build_ret);`
			`vector<DatElement>().swap(static_node_infos_);`
			`}`

			`void LoadDefaultDict(const string& filePath) {`
			`ifstream ifs(filePath.c_str());`
			`XCHECK(ifs.is_open()) << "open " << filePath << " failed.";`
			`string line;`
			`vector<string> buf;`

			`for (; getline(ifs, line);) {`
			`Split(line, buf, " ");`
			`XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;`
			`DatElement node_info;`
			`node_info.word = buf[0];`
			`node_info.weight = atof(buf[1].c_str());`
			`node_info.tag = buf[2];`
			`static_node_infos_.push_back(node_info);`
			`}`
			`}`

			`static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {`
			`return lhs.weight < rhs.weight;`
			`}`

			`void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {`
			`XCHECK(!static_node_infos_.empty());`
			`vector<DatElement> x = static_node_infos_;`
			`sort(x.begin(), x.end(), WeightCompare);`
			`if(x.empty()){`
			`return;`
			`}`
			`min_weight = x[0].weight;`
			`const double max_weight_ = x[x.size() - 1].weight;`
			`const double median_weight_ = x[x.size() / 2].weight;`

			`switch (option) {`
			`case WordWeightMin:`
			`user_word_default_weight_ = min_weight;`
			`break;`

			`case WordWeightMedian:`
			`user_word_default_weight_ = median_weight_;`
			`break;`

			`default:`
			`user_word_default_weight_ = max_weight_;`
			`break;`
			`}`
			`}`

			`double CalcFreqSum(const vector<DatElement>& node_infos) const {`
			`double sum = 0.0;`

			`for (size_t i = 0; i < node_infos.size(); i++) {`
			`sum += node_infos[i].weight;`
			`}`

			`return sum;`
			`}`

			`void CalculateWeight(vector<DatElement>& node_infos, double sum) const {`
			`for (size_t i = 0; i < node_infos.size(); i++) {`
			`DatElement& node_info = node_infos[i];`
			`assert(node_info.weight > 0.0);`
			`node_info.weight = log(double(node_info.weight) / sum);`
			`}`
			`}`

			`private:`
			`vector<DatElement> static_node_infos_;`
			`size_t total_dict_size_ = 0;`
			`DatTrie dat_;`

			`double freq_sum_;`
			`double user_word_default_weight_;`
			`unordered_set<Rune> user_dict_single_chinese_word_;`
			`};`
			`}`