#pragma once #include #include #include #include #include #include #include #include #include #include "limonp/StringUtil.hpp" #include "limonp/Logging.hpp" #include "Unicode.hpp" #include "DatTrie.hpp" #include namespace cppjieba { using namespace limonp; const size_t PINYIN_COLUMN_NUM = 2; class PinYinTrie { public: enum UserWordWeightOption { WordWeightMin, WordWeightMedian, WordWeightMax, }; // enum UserWordWeightOption PinYinTrie(const string& dict_path, const string & dat_cache_path = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) { Init(dict_path, dat_cache_path, user_word_weight_opt); } ~PinYinTrie() {} int getMultiTonResults(string word, QStringList &results) { if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) { for (auto i:qmap_chinese2pinyin[QString::fromStdString(word)]) results.push_back(i); return 0; } return -1; } int getSingleTonResult(string word, QString &result) { const PinYinMemElem * tmp = dat_.PinYinFind(word); if (tmp) { result = QString::fromStdString(tmp->GetTag()); return 0; } return -1; } bool contains(string &word) { if (qmap_chinese2pinyin.contains(QString::fromStdString(word)) or !dat_.PinYinFind(word)) return true; // if (map_chinese2pinyin.contains(word) // or !dat_.PinYinFind(word)) // return true; return false; } bool isMultiTone(const string &word) { if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) return true; // if (map_chinese2pinyin.contains(word)) // return true; return false; } size_t GetTotalDictSize() const { return total_dict_size_; } private: void Init(const string& dict_path, string dat_cache_path, UserWordWeightOption user_word_weight_opt) { size_t file_size_sum = 0; vector node_infos; const string md5 = CalcFileListMD5(dict_path, file_size_sum); total_dict_size_ = file_size_sum; if (dat_cache_path.empty()) { //未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519 dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache"; } QString path = QString::fromStdString(dat_cache_path); qDebug() << "#########PinYin path:" << path << file_size_sum; if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) { //多音字仍需遍历文件信息 LoadDefaultPinYin(node_infos, dict_path, true); return; } LoadDefaultPinYin(node_infos, dict_path, false); double min_weight = 0; dat_.SetMinWeight(min_weight); const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5); assert(build_ret); vector().swap(node_infos); } void LoadDefaultPinYin(vector &node_infos, const string& filePath, bool multiFlag) { ifstream ifs(filePath.c_str()); if(not ifs.is_open()){ return ; } XCHECK(ifs.is_open()) << "open " << filePath << " failed."; string line; vector buf; size_t lineno = 0; for (; getline(ifs, line); lineno++) { if (line.empty()) { XLOG(ERROR) << "lineno: " << lineno << " empty. skipped."; continue; } Split(line, buf, " "); if (buf.size() == PINYIN_COLUMN_NUM) { if (multiFlag) {//非多音字 continue; } PinYinElement node_info; node_info.word = buf[1]; node_info.tag = buf[0]; node_infos.push_back(node_info); } else {//多音字 QString content = QString::fromUtf8(line.c_str()); qmap_chinese2pinyin[content.split(" ").last().trimmed()] = content.split(" "); qmap_chinese2pinyin[content.split(" ").last().trimmed()].pop_back(); /* //std map string list list tmpList; for(int i = 0; i < buf.size() - 1; ++i){ tmpList.push_back(buf[i]); } map[buf[buf.size() - 1]] = tmpList; */ } } } private: QMap qmap_chinese2pinyin; //map> map_chinese2pinyin; size_t total_dict_size_ = 0; DatTrie dat_; }; }