260 lines
8.2 KiB
C++
260 lines
8.2 KiB
C++
/*
 * The MIT License (MIT)
 *
 * Copyright (C) 2013 Yanyi Wu
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
|
|
#pragma once
|
|
|
|
#include <stdint.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <unordered_set>
#include <vector>

#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "DatTrie.hpp"
#include <QDebug>
|
|
namespace cppjieba {
|
|
|
|
using namespace limonp;
|
|
|
|
// Very large floating-point sentinel (3.14 * 10^100). Not referenced in this
// header; presumably consumed by other cppjieba components - verify before
// changing the value.
const double MAX_DOUBLE = 3.14e100;

// Number of space-separated columns every line of the default dictionary
// must contain (word, frequency, tag); enforced by DictTrie::LoadDefaultDict.
const size_t DICT_COLUMN_NUM = 3;

// Tag given to a user-dictionary word whose line does not supply one.
const char* const UNKNOWN_TAG = "";
|
|
|
|
class DictTrie {
|
|
public:
|
|
enum UserWordWeightOption {
|
|
WordWeightMin,
|
|
WordWeightMedian,
|
|
WordWeightMax,
|
|
}; // enum UserWordWeightOption
|
|
|
|
DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
|
|
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
|
Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
|
|
}
|
|
|
|
~DictTrie() {}
|
|
|
|
const DatMemElem* Find(const string & word) const {
|
|
return dat_.Find(word);
|
|
}
|
|
|
|
void FindDatDag(RuneStrArray::const_iterator begin,
|
|
RuneStrArray::const_iterator end,
|
|
vector<struct DatDag>&res,
|
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
|
dat_.Find(begin, end, res, max_word_len);
|
|
}
|
|
|
|
void FindWordRange(RuneStrArray::const_iterator begin,
|
|
RuneStrArray::const_iterator end,
|
|
vector<WordRange>& words,
|
|
size_t max_word_len = MAX_WORD_LENGTH) const {
|
|
dat_.Find(begin, end, words, max_word_len);
|
|
}
|
|
|
|
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
|
return IsIn(user_dict_single_chinese_word_, word);
|
|
}
|
|
|
|
double GetMinWeight() const {
|
|
return dat_.GetMinWeight();
|
|
}
|
|
|
|
size_t GetTotalDictSize() const {
|
|
return total_dict_size_;
|
|
}
|
|
|
|
void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
|
|
vector<string> buf;
|
|
DatElement node_info;
|
|
Split(line, buf, " ");
|
|
|
|
if (buf.size() == 0) {
|
|
return;
|
|
}
|
|
|
|
node_info.word = buf[0];
|
|
node_info.weight = user_word_default_weight_;
|
|
node_info.tag = UNKNOWN_TAG;
|
|
|
|
if (buf.size() == 2) {
|
|
node_info.tag = buf[1];
|
|
} else if (buf.size() == 3) {
|
|
if (freq_sum_ > 0.0) {
|
|
const int freq = atoi(buf[1].c_str());
|
|
node_info.weight = log(1.0 * freq / freq_sum_);
|
|
node_info.tag = buf[2];
|
|
}
|
|
}
|
|
|
|
if (saveNodeInfo) {
|
|
static_node_infos_.push_back(node_info);
|
|
}
|
|
|
|
if (Utf8CharNum(node_info.word) == 1) {
|
|
RuneArray word;
|
|
|
|
if (DecodeRunesInString(node_info.word, word)) {
|
|
user_dict_single_chinese_word_.insert(word[0]);
|
|
} else {
|
|
XLOG(ERROR) << "Decode " << node_info.word << " failed.";
|
|
}
|
|
}
|
|
}
|
|
|
|
void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
|
|
vector<string> files = limonp::Split(filePaths, "|;");
|
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
ifstream ifs(files[i].c_str());
|
|
XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
|
|
string line;
|
|
|
|
for (; getline(ifs, line);) {
|
|
if (line.size() == 0) {
|
|
continue;
|
|
}
|
|
|
|
InserUserDictNode(line, saveNodeInfo);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
private:
|
|
void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
|
|
UserWordWeightOption user_word_weight_opt) {
|
|
const auto dict_list = dict_path + "|" + user_dict_paths;
|
|
size_t file_size_sum = 0;
|
|
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
|
total_dict_size_ = file_size_sum;
|
|
|
|
if (dat_cache_path.empty()) {
|
|
dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
|
}
|
|
dat_cache_path += VERSION;
|
|
QString path = QString::fromStdString(dat_cache_path);
|
|
qDebug() << "#########Dict path:" << path;
|
|
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
|
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
|
return;
|
|
}
|
|
|
|
LoadDefaultDict(dict_path);
|
|
freq_sum_ = CalcFreqSum(static_node_infos_);
|
|
CalculateWeight(static_node_infos_, freq_sum_);
|
|
double min_weight = 0;
|
|
SetStaticWordWeights(user_word_weight_opt, min_weight);
|
|
dat_.SetMinWeight(min_weight);
|
|
|
|
LoadUserDict(user_dict_paths);
|
|
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
|
assert(build_ret);
|
|
vector<DatElement>().swap(static_node_infos_);
|
|
}
|
|
|
|
void LoadDefaultDict(const string& filePath) {
|
|
ifstream ifs(filePath.c_str());
|
|
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
|
string line;
|
|
vector<string> buf;
|
|
|
|
for (; getline(ifs, line);) {
|
|
Split(line, buf, " ");
|
|
XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
|
|
DatElement node_info;
|
|
node_info.word = buf[0];
|
|
node_info.weight = atof(buf[1].c_str());
|
|
node_info.tag = buf[2];
|
|
static_node_infos_.push_back(node_info);
|
|
}
|
|
}
|
|
|
|
static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
|
|
return lhs.weight < rhs.weight;
|
|
}
|
|
|
|
void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
|
|
XCHECK(!static_node_infos_.empty());
|
|
vector<DatElement> x = static_node_infos_;
|
|
sort(x.begin(), x.end(), WeightCompare);
|
|
if(x.empty()){
|
|
return;
|
|
}
|
|
min_weight = x[0].weight;
|
|
const double max_weight_ = x[x.size() - 1].weight;
|
|
const double median_weight_ = x[x.size() / 2].weight;
|
|
|
|
switch (option) {
|
|
case WordWeightMin:
|
|
user_word_default_weight_ = min_weight;
|
|
break;
|
|
|
|
case WordWeightMedian:
|
|
user_word_default_weight_ = median_weight_;
|
|
break;
|
|
|
|
default:
|
|
user_word_default_weight_ = max_weight_;
|
|
break;
|
|
}
|
|
}
|
|
|
|
double CalcFreqSum(const vector<DatElement>& node_infos) const {
|
|
double sum = 0.0;
|
|
|
|
for (size_t i = 0; i < node_infos.size(); i++) {
|
|
sum += node_infos[i].weight;
|
|
}
|
|
|
|
return sum;
|
|
}
|
|
|
|
void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
|
|
for (size_t i = 0; i < node_infos.size(); i++) {
|
|
DatElement& node_info = node_infos[i];
|
|
assert(node_info.weight > 0.0);
|
|
node_info.weight = log(double(node_info.weight) / sum);
|
|
}
|
|
}
|
|
|
|
private:
|
|
vector<DatElement> static_node_infos_;
|
|
size_t total_dict_size_ = 0;
|
|
DatTrie dat_;
|
|
|
|
double freq_sum_;
|
|
double user_word_default_weight_;
|
|
unordered_set<Rune> user_dict_single_chinese_word_;
|
|
};
|
|
}
|
|
|