ukui-search/libchinese-segmentation/cppjieba/PinYinTrie.hpp

173 lines
5.5 KiB
C++
Raw Normal View History

2024-01-30 14:42:09 +08:00
/*
*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
*/
#pragma once
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cstring>
#include <cstdlib>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "DatTrie.hpp"
#include <QDebug>
namespace cppjieba {
using namespace limonp;
const size_t PINYIN_COLUMN_NUM = 2;
class PinYinTrie {
public:
enum UserWordWeightOption {
WordWeightMin,
WordWeightMedian,
WordWeightMax,
}; // enum UserWordWeightOption
PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
Init(dict_path, dat_cache_path, user_word_weight_opt);
}
~PinYinTrie() {}
int getMultiTonResults(string word, QStringList &results) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) {
for (auto i:qmap_chinese2pinyin[QString::fromStdString(word)])
results.push_back(i);
return 0;
}
return -1;
}
int getSingleTonResult(string word, QString &result) {
const PinYinMemElem * tmp = dat_.PinYinFind(word);
if (tmp) {
result = QString::fromStdString(tmp->GetTag());
return 0;
}
return -1;
}
bool contains(string &word) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))
or !dat_.PinYinFind(word))
return true;
// if (map_chinese2pinyin.contains(word)
// or !dat_.PinYinFind(word))
// return true;
return false;
}
bool isMultiTone(const string &word) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
return true;
// if (map_chinese2pinyin.contains(word))
// return true;
return false;
}
size_t GetTotalDictSize() const {
return total_dict_size_;
}
private:
void Init(const string& dict_path, string dat_cache_path,
UserWordWeightOption user_word_weight_opt) {
size_t file_size_sum = 0;
vector<PinYinElement> node_infos;
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
}
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########PinYin path:" << path << file_size_sum;
if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
//多音字仍需遍历文件信息
LoadDefaultPinYin(node_infos, dict_path, true);
return;
}
LoadDefaultPinYin(node_infos, dict_path, false);
double min_weight = 0;
dat_.SetMinWeight(min_weight);
const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
assert(build_ret);
vector<PinYinElement>().swap(node_infos);
}
void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
ifstream ifs(filePath.c_str());
if(not ifs.is_open()){
return ;
}
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
string line;
vector<string> buf;
size_t lineno = 0;
for (; getline(ifs, line); lineno++) {
if (line.empty()) {
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
continue;
}
Split(line, buf, " ");
if (buf.size() == PINYIN_COLUMN_NUM) {
if (multiFlag) {//非多音字
continue;
}
PinYinElement node_info;
node_info.word = buf[1];
node_info.tag = buf[0];
node_infos.push_back(node_info);
} else {//多音字
QString content = QString::fromUtf8(line.c_str());
qmap_chinese2pinyin[content.split(" ").last().trimmed()] = content.split(" ");
qmap_chinese2pinyin[content.split(" ").last().trimmed()].pop_back();
/*
//std map string list
list<string> tmpList;
for(int i = 0; i < buf.size() - 1; ++i){
tmpList.push_back(buf[i]);
}
map[buf[buf.size() - 1]] = tmpList;
*/
}
}
}
private:
QMap<QString, QStringList> qmap_chinese2pinyin;
//map<string, list<string>> map_chinese2pinyin;
size_t total_dict_size_ = 0;
DatTrie dat_;
};
}