优化多音字字典存储数据结构;部分代码及注释整理;
This commit is contained in:
parent
fb7811e417
commit
47af66e682
|
@ -46,16 +46,17 @@ struct SKeyWord {
|
||||||
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
|
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
|
||||||
public:
|
public:
|
||||||
static ChineseSegmentation *getInstance();
|
static ChineseSegmentation *getInstance();
|
||||||
~ChineseSegmentation();
|
|
||||||
QVector<SKeyWord> callSegement(std::string s);
|
QVector<SKeyWord> callSegement(std::string s);
|
||||||
//新添加callSegementStd函数,修改返回值为std::vector<cppjieba::KeywordExtractor::Word>并简化内部处理流程--jxx20210517
|
|
||||||
//修改函数入参形式为引用,去掉Qstring与std::string转换代码--jxx20210519
|
|
||||||
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
|
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
|
||||||
|
|
||||||
|
private:
|
||||||
|
explicit ChineseSegmentation();
|
||||||
|
~ChineseSegmentation();
|
||||||
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
|
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static QMutex m_mutex;
|
static QMutex m_mutex;
|
||||||
cppjieba::Jieba *m_jieba;
|
cppjieba::Jieba *m_jieba;
|
||||||
explicit ChineseSegmentation();
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -46,6 +46,16 @@ struct IdfElement {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// One single-reading dictionary entry: a word and the pinyin stored for it.
struct PinYinElement
{
    string word;  ///< Key inserted into the double-array trie.
    string tag;   ///< Value stored for the key (the pinyin text).

    /// Orders entries by `word` only.
    /// The right-hand operand is a PinYinElement (it used to be DatElement,
    /// which made `PinYinElement < PinYinElement` ill-formed), so the type can
    /// be used with std::sort and ordered containers.
    bool operator < (const PinYinElement & b) const {
        return this->word < b.word;
    }
};
|
||||||
|
|
||||||
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
||||||
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
||||||
}
|
}
|
||||||
|
@ -64,6 +74,19 @@ struct DatMemElem {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// Fixed-size record stored in the pinyin cache file, one per trie key.
struct PinYinMemElem {
    // Six characters plus the terminating NUL. The previous capacity of 6
    // left room for only five characters, which truncated six-letter
    // toneless syllables such as "zhuang", "chuang" and "shuang".
    // NOTE(review): sizeof(PinYinMemElem) is part of the cache file layout
    // (see the length assertion in InitPinYinAttachDat); cache files written
    // with the old size must be regenerated.
    char tag[7] = {};

    /// Stores `str` in `tag`, truncating it to sizeof(tag) - 1 bytes.
    /// The buffer is zeroed first, so the result is always NUL-terminated.
    void SetTag(const string & str) {
        memset(&tag[0], 0, sizeof(tag));
        strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
    }

    /// Returns the stored tag as a string (up to the first NUL byte).
    string GetTag() const {
        return &tag[0];
    }
};
|
||||||
|
|
||||||
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
|
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
|
||||||
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
|
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
|
||||||
}
|
}
|
||||||
|
@ -122,6 +145,17 @@ public:
|
||||||
return idf_elements_ptr_[ find_result.value ];
|
return idf_elements_ptr_[ find_result.value ];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Looks up `key` in the double-array trie with an exact match.
/// @return pointer to the matching pinyin record, or nullptr when the match
///         length is zero, the stored value is negative, or the value is not
///         below elements_num_.
const PinYinMemElem * PinYinFind(const string & key) const {
    JiebaDAT::result_pair_type match;
    dat_.exactMatchSearch(key.c_str(), match);

    const bool matched = (match.length != 0) && (match.value >= 0);
    if (!matched || static_cast<size_t>(match.value) >= elements_num_) {
        return nullptr;
    }

    // The trie value is the index of the record in the mapped element array.
    return pinyin_elements_ptr_ + match.value;
}
|
||||||
|
|
||||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||||
|
|
||||||
|
@ -167,6 +201,7 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||||
|
|
||||||
|
@ -208,7 +243,8 @@ public:
|
||||||
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
|
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}*/
|
||||||
|
|
||||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||||
vector<WordRange>& words, size_t max_word_len) const {
|
vector<WordRange>& words, size_t max_word_len) const {
|
||||||
|
|
||||||
|
@ -300,6 +336,11 @@ public:
|
||||||
return InitIdfAttachDat(dat_cache_file, md5);
|
return InitIdfAttachDat(dat_cache_file, md5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Writes a pinyin cache file built from `elements`, then attaches to it.
/// @return the result of InitPinYinAttachDat for the freshly written file.
bool InitBuildDat(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
    BuildDatCache(elements, dat_cache_file, md5);
    const bool attached = InitPinYinAttachDat(dat_cache_file, md5);
    return attached;
}
|
||||||
|
|
||||||
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
|
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
|
||||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||||
|
|
||||||
|
@ -362,6 +403,37 @@ public:
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool InitPinYinAttachDat(const string & dat_cache_file, const string & md5) {
|
||||||
|
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||||
|
|
||||||
|
if (mmap_fd_ < 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
|
||||||
|
assert(seek_off >= 0);
|
||||||
|
mmap_length_ = seek_off;
|
||||||
|
|
||||||
|
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
|
||||||
|
assert(MAP_FAILED != mmap_addr_);
|
||||||
|
|
||||||
|
assert(mmap_length_ >= sizeof(CacheFileHeader));
|
||||||
|
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
|
||||||
|
elements_num_ = header.elements_num;
|
||||||
|
min_weight_ = header.min_weight;
|
||||||
|
assert(sizeof(header.md5_hex) == md5.size());
|
||||||
|
|
||||||
|
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem) + header.dat_size * dat_.unit_size());
|
||||||
|
pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
|
||||||
|
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
|
||||||
|
dat_.set_array(dat_ptr, header.dat_size);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
||||||
std::sort(elements.begin(), elements.end());
|
std::sort(elements.begin(), elements.end());
|
||||||
|
@ -464,6 +536,56 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the double-array trie from `elements` and writes the pinyin cache
/// file: CacheFileHeader, then one PinYinMemElem per element, then the trie
/// array. The data is written to a temporary file that is renamed onto
/// `dat_cache_file` once complete.
///
/// @param elements       entries to index; element i is stored under trie
///                       value i. Unlike the DatElement overload, the
///                       entries are not sorted here.
/// @param dat_cache_file destination path of the cache file.
/// @param md5            dictionary digest stored in the header; must have
///                       the same length as header.md5_hex.
void BuildDatCache(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    vector<PinYinMemElem> mem_elem_vec;

    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());
    mem_elem_vec.reserve(elements.size());

    CacheFileHeader header;
    header.min_weight = min_weight_;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    for (size_t i = 0; i < elements.size(); ++i) {
        keys_ptr_vec.push_back(elements[i].word.data());
        values_vec.push_back(static_cast<int>(i));
        mem_elem_vec.push_back(PinYinMemElem());
        auto & mem_elem = mem_elem_vec.back();
        mem_elem.SetTag(elements[i].tag);
    }

    // data() instead of &vec[0]: indexing an empty vector is undefined.
    auto const ret = dat_.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    assert(0 == ret);
    header.elements_num = mem_elem_vec.size();
    header.dat_size = dat_.size();

    {
        string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
        // NOTE(review): this changes the process-wide umask and does not
        // restore it; kept as-is to preserve existing behaviour.
        ::umask(S_IWGRP | S_IWOTH);
        // mkstemp rewrites the XXXXXX suffix in place, so it needs the
        // string's mutable buffer rather than a const-cast of data().
        const int fd =::mkstemp(&tmp_filepath[0]);
        qDebug() << "mkstemp :" << errno << tmp_filepath.data();
        assert(fd >= 0);
        ::fchmod(fd, 0644);

        auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
        write_bytes += ::write(fd, (const char *)mem_elem_vec.data(), sizeof(PinYinMemElem) * mem_elem_vec.size());
        write_bytes += ::write(fd, dat_.array(), dat_.total_size());

        // A failed or short write shows up as a byte-count mismatch here.
        assert(static_cast<size_t>(write_bytes) == sizeof(header) + mem_elem_vec.size() * sizeof(PinYinMemElem) + dat_.total_size());
        ::close(fd);

        const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
        assert(0 == rename_ret);
    }
}
|
||||||
|
|
||||||
DatTrie(const DatTrie &);
|
DatTrie(const DatTrie &);
|
||||||
DatTrie &operator=(const DatTrie &);
|
DatTrie &operator=(const DatTrie &);
|
||||||
|
|
||||||
|
@ -471,6 +593,7 @@ private:
|
||||||
JiebaDAT dat_;
|
JiebaDAT dat_;
|
||||||
const DatMemElem * elements_ptr_ = nullptr;
|
const DatMemElem * elements_ptr_ = nullptr;
|
||||||
const double * idf_elements_ptr_ = nullptr;
|
const double * idf_elements_ptr_ = nullptr;
|
||||||
|
const PinYinMemElem * pinyin_elements_ptr_ = nullptr;
|
||||||
size_t elements_num_ = 0;
|
size_t elements_num_ = 0;
|
||||||
double min_weight_ = 0;
|
double min_weight_ = 0;
|
||||||
|
|
||||||
|
|
|
@ -131,6 +131,7 @@ private:
|
||||||
const auto dict_list = dict_path + "|" + user_dict_paths;
|
const auto dict_list = dict_path + "|" + user_dict_paths;
|
||||||
size_t file_size_sum = 0;
|
size_t file_size_sum = 0;
|
||||||
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
||||||
|
total_dict_size_ = file_size_sum;
|
||||||
|
|
||||||
if (dat_cache_path.empty()) {
|
if (dat_cache_path.empty()) {
|
||||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||||
|
@ -140,7 +141,6 @@ private:
|
||||||
qDebug() << "#########Dict path:" << path;
|
qDebug() << "#########Dict path:" << path;
|
||||||
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
||||||
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
||||||
total_dict_size_ = file_size_sum;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -154,7 +154,6 @@ private:
|
||||||
LoadUserDict(user_dict_paths);
|
LoadUserDict(user_dict_paths);
|
||||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||||
assert(build_ret);
|
assert(build_ret);
|
||||||
total_dict_size_ = file_size_sum;
|
|
||||||
vector<DatElement>().swap(static_node_infos_);
|
vector<DatElement>().swap(static_node_infos_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -39,21 +39,6 @@ public:
|
||||||
return dat_.Find(word, length, node_pos);
|
return dat_.Find(word, length, node_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Find(RuneStrArray::const_iterator begin,
|
|
||||||
RuneStrArray::const_iterator end,
|
|
||||||
vector<struct DatDag>&res,
|
|
||||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
|
||||||
dat_.Find(begin, end, res, max_word_len);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
|
||||||
return IsIn(user_dict_single_chinese_word_, word);
|
|
||||||
}
|
|
||||||
|
|
||||||
double GetMinWeight() const {
|
|
||||||
return dat_.GetMinWeight();
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t GetTotalDictSize() const {
|
size_t GetTotalDictSize() const {
|
||||||
return total_dict_size_;
|
return total_dict_size_;
|
||||||
}
|
}
|
||||||
|
@ -63,6 +48,7 @@ private:
|
||||||
UserWordWeightOption user_word_weight_opt) {
|
UserWordWeightOption user_word_weight_opt) {
|
||||||
size_t file_size_sum = 0;
|
size_t file_size_sum = 0;
|
||||||
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
||||||
|
total_dict_size_ = file_size_sum;
|
||||||
|
|
||||||
if (dat_cache_path.empty()) {
|
if (dat_cache_path.empty()) {
|
||||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||||
|
@ -71,7 +57,6 @@ private:
|
||||||
QString path = QString::fromStdString(dat_cache_path);
|
QString path = QString::fromStdString(dat_cache_path);
|
||||||
qDebug() << "#########Idf path:" << path;
|
qDebug() << "#########Idf path:" << path;
|
||||||
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
|
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
|
||||||
total_dict_size_ = file_size_sum;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -85,7 +70,6 @@ private:
|
||||||
|
|
||||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||||
assert(build_ret);
|
assert(build_ret);
|
||||||
total_dict_size_ = file_size_sum;
|
|
||||||
vector<IdfElement>().swap(static_node_infos_);
|
vector<IdfElement>().swap(static_node_infos_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,7 +112,6 @@ private:
|
||||||
vector<IdfElement> static_node_infos_;
|
vector<IdfElement> static_node_infos_;
|
||||||
size_t total_dict_size_ = 0;
|
size_t total_dict_size_ = 0;
|
||||||
DatTrie dat_;
|
DatTrie dat_;
|
||||||
unordered_set<Rune> user_dict_single_chinese_word_;
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -84,6 +84,7 @@ private:
|
||||||
MixSegment segment_;
|
MixSegment segment_;
|
||||||
IdfTrie idf_trie_;
|
IdfTrie idf_trie_;
|
||||||
|
|
||||||
|
|
||||||
unordered_set<Rune> symbols_;
|
unordered_set<Rune> symbols_;
|
||||||
}; // class KeywordExtractor
|
}; // class KeywordExtractor
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,154 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <cmath>
|
||||||
|
#include <limits>
|
||||||
|
#include "limonp/StringUtil.hpp"
|
||||||
|
#include "limonp/Logging.hpp"
|
||||||
|
#include "Unicode.hpp"
|
||||||
|
#include "DatTrie.hpp"
|
||||||
|
#include <QDebug>
|
||||||
|
namespace cppjieba {
|
||||||
|
|
||||||
|
using namespace limonp;
|
||||||
|
|
||||||
|
const size_t PINYIN_COLUMN_NUM = 2;
|
||||||
|
|
||||||
|
class PinYinTrie {
|
||||||
|
public:
|
||||||
|
enum UserWordWeightOption {
|
||||||
|
WordWeightMin,
|
||||||
|
WordWeightMedian,
|
||||||
|
WordWeightMax,
|
||||||
|
}; // enum UserWordWeightOption
|
||||||
|
|
||||||
|
PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
|
||||||
|
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||||
|
Init(dict_path, dat_cache_path, user_word_weight_opt);
|
||||||
|
}
|
||||||
|
|
||||||
|
~PinYinTrie() {}
|
||||||
|
|
||||||
|
int getMultiTonResults(string word, QStringList &results) {
|
||||||
|
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) {
|
||||||
|
for (auto i:qmap_chinese2pinyin[QString::fromStdString(word)])
|
||||||
|
results.push_back(i);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int getSingleTonResult(string word, QString &result) {
|
||||||
|
const PinYinMemElem * tmp = dat_.PinYinFind(word);
|
||||||
|
if (tmp) {
|
||||||
|
result = QString::fromStdString(tmp->GetTag());
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool contains(string &word) {
|
||||||
|
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))
|
||||||
|
or !dat_.PinYinFind(word))
|
||||||
|
return true;
|
||||||
|
// if (map_chinese2pinyin.contains(word)
|
||||||
|
// or !dat_.PinYinFind(word))
|
||||||
|
// return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isMultiTone(string &word) {
|
||||||
|
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
|
||||||
|
return true;
|
||||||
|
// if (map_chinese2pinyin.contains(word))
|
||||||
|
// return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GetTotalDictSize() const {
|
||||||
|
return total_dict_size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
void Init(const string& dict_path, string dat_cache_path,
|
||||||
|
UserWordWeightOption user_word_weight_opt) {
|
||||||
|
size_t file_size_sum = 0;
|
||||||
|
vector<PinYinElement> node_infos;
|
||||||
|
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
||||||
|
total_dict_size_ = file_size_sum;
|
||||||
|
|
||||||
|
if (dat_cache_path.empty()) {
|
||||||
|
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||||
|
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||||
|
}
|
||||||
|
QString path = QString::fromStdString(dat_cache_path);
|
||||||
|
qDebug() << "#########PinYin path:" << path << file_size_sum;
|
||||||
|
if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
|
||||||
|
//多音字仍需遍历文件信息
|
||||||
|
LoadDefaultPinYin(node_infos, dict_path, true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
LoadDefaultPinYin(node_infos, dict_path, false);
|
||||||
|
double min_weight = 0;
|
||||||
|
dat_.SetMinWeight(min_weight);
|
||||||
|
|
||||||
|
const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
|
||||||
|
assert(build_ret);
|
||||||
|
vector<PinYinElement>().swap(node_infos);
|
||||||
|
}
|
||||||
|
|
||||||
|
void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
|
||||||
|
ifstream ifs(filePath.c_str());
|
||||||
|
if(not ifs.is_open()){
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
||||||
|
string line;
|
||||||
|
vector<string> buf;
|
||||||
|
size_t lineno = 0;
|
||||||
|
|
||||||
|
for (; getline(ifs, line); lineno++) {
|
||||||
|
if (line.empty()) {
|
||||||
|
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
Split(line, buf, " ");
|
||||||
|
if (buf.size() == PINYIN_COLUMN_NUM) {
|
||||||
|
if (multiFlag) {//非多音字
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
PinYinElement node_info;
|
||||||
|
node_info.word = buf[1];
|
||||||
|
node_info.tag = buf[0];
|
||||||
|
node_infos.push_back(node_info);
|
||||||
|
} else {//多音字
|
||||||
|
QString content = QString::fromUtf8(line.c_str());
|
||||||
|
qmap_chinese2pinyin[content.split(" ").last().trimmed()] = content.split(" ");
|
||||||
|
qmap_chinese2pinyin[content.split(" ").last().trimmed()].pop_back();
|
||||||
|
/*
|
||||||
|
//std map string list
|
||||||
|
list<string> tmpList;
|
||||||
|
for(int i = 0; i < buf.size() - 1; ++i){
|
||||||
|
tmpList.push_back(buf[i]);
|
||||||
|
}
|
||||||
|
map[buf[buf.size() - 1]] = tmpList;
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
QMap<QString, QStringList> qmap_chinese2pinyin;
|
||||||
|
//map<string, list<string>> map_chinese2pinyin;
|
||||||
|
size_t total_dict_size_ = 0;
|
||||||
|
DatTrie dat_;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ INCLUDEPATH += $$PWD
|
||||||
HEADERS += \
|
HEADERS += \
|
||||||
$$PWD/DictTrie.hpp \
|
$$PWD/DictTrie.hpp \
|
||||||
$$PWD/IdfTrie.hpp \
|
$$PWD/IdfTrie.hpp \
|
||||||
|
$$PWD/PinYinTrie.hpp \
|
||||||
$$PWD/FullSegment.hpp \
|
$$PWD/FullSegment.hpp \
|
||||||
$$PWD/HMMModel.hpp \
|
$$PWD/HMMModel.hpp \
|
||||||
$$PWD/HMMSegment.hpp \
|
$$PWD/HMMSegment.hpp \
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -23,14 +23,17 @@ include(cppjieba/cppjieba.pri)
|
||||||
|
|
||||||
SOURCES += \
|
SOURCES += \
|
||||||
chinese-segmentation.cpp \
|
chinese-segmentation.cpp \
|
||||||
|
pinyinmanager.cpp
|
||||||
|
|
||||||
HEADERS += \
|
HEADERS += \
|
||||||
chinese-segmentation.h \
|
chinese-segmentation.h \
|
||||||
libchinese-segmentation_global.h
|
libchinese-segmentation_global.h \
|
||||||
|
pinyinmanager.h
|
||||||
|
|
||||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||||
dict_files.files = $$PWD/dict/*.utf8\
|
dict_files.files = $$PWD/dict/*.utf8\
|
||||||
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
|
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
|
||||||
|
dict_files.files += $$PWD/dict/*.txt\
|
||||||
|
|
||||||
INSTALLS += \
|
INSTALLS += \
|
||||||
dict_files \
|
dict_files \
|
||||||
|
@ -60,5 +63,6 @@ DISTFILES += \
|
||||||
dict/pos_dict/prob_start.utf8 \
|
dict/pos_dict/prob_start.utf8 \
|
||||||
dict/pos_dict/prob_trans.utf8 \
|
dict/pos_dict/prob_trans.utf8 \
|
||||||
dict/stop_words.utf8 \
|
dict/stop_words.utf8 \
|
||||||
dict/user.dict.utf8
|
dict/user.dict.utf8 \
|
||||||
|
dict/pinyinWithoutTone.txt
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
#include "pinyinmanager.h"
|
||||||
|
#include <mutex>
|
||||||
|
PinYinManager * PinYinManager::g_pinYinManager = nullptr;

// Guards the one-time creation of the singleton. Internal linkage: the flag
// is an implementation detail of this translation unit.
static std::once_flag g_singleFlag;

/// Returns the process-wide PinYinManager, creating it on first use.
/// Creation goes through std::call_once, so it runs at most once even when
/// several threads call this concurrently. The instance is never deleted.
PinYinManager * PinYinManager::getInstance()
{
    std::call_once(g_singleFlag, []() {
        g_pinYinManager = new PinYinManager;
    });
    return g_pinYinManager;
}
|
||||||
|
|
||||||
|
/// Forwards to PinYinTrie::contains for `word`.
bool PinYinManager::contains(string &word)
{
    const bool known = m_pinYinTrie->contains(word);
    return known;
}
|
||||||
|
|
||||||
|
/// Forwards to PinYinTrie::isMultiTone for `word` (reference overload).
bool PinYinManager::isMultiTon(string &word)
{
    const bool polyphonic = m_pinYinTrie->isMultiTone(word);
    return polyphonic;
}
|
||||||
|
|
||||||
|
/// Forwards to PinYinTrie::isMultiTone for `word` (by-value overload, usable
/// with temporaries).
bool PinYinManager::isMultiTon(string word)
{
    const bool polyphonic = m_pinYinTrie->isMultiTone(word);
    return polyphonic;
}
|
||||||
|
|
||||||
|
/// Collects the pinyin readings of `word` into `results`.
/// `results` is cleared first. The polyphonic table is consulted before the
/// single-reading table.
/// @return 0 when either lookup succeeded, -1 when both failed.
int PinYinManager::getResults(string word, QStringList &results)
{
    results.clear();

    const bool polyphonic = (m_pinYinTrie->getMultiTonResults(word, results) != -1);
    if (polyphonic) {
        return 0;
    }

    QString single;
    if (m_pinYinTrie->getSingleTonResult(word, single) == -1) {
        return -1;
    }
    results.append(single);
    return 0;
}
|
||||||
|
|
||||||
|
/// Creates the pinyin trie from the installed toneless pinyin dictionary.
PinYinManager::PinYinManager()
    : m_pinYinTrie(new cppjieba::PinYinTrie("/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt"))
{
}
|
||||||
|
|
||||||
|
/// Releases the trie owned by this manager.
PinYinManager::~PinYinManager()
{
    // Deleting a null pointer is a no-op, so no explicit null check is needed.
    delete m_pinYinTrie;
    m_pinYinTrie = nullptr;
}
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
#ifndef PINYINMANAGER_H
|
||||||
|
#define PINYINMANAGER_H
|
||||||
|
|
||||||
|
#include <QtCore/qglobal.h>
|
||||||
|
#include "cppjieba/PinYinTrie.hpp"
|
||||||
|
|
||||||
|
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
class PINYINMANAGER_EXPORT PinYinManager
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static PinYinManager * getInstance();
|
||||||
|
|
||||||
|
public:
|
||||||
|
bool contains(string &word);
|
||||||
|
bool isMultiTon(string &word);
|
||||||
|
bool isMultiTon(string word);
|
||||||
|
|
||||||
|
int getResults(string word, QStringList &results);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
PinYinManager();
|
||||||
|
~PinYinManager();
|
||||||
|
|
||||||
|
private:
|
||||||
|
static PinYinManager *g_pinYinManager;
|
||||||
|
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // PINYINMANAGER_H
|
|
@ -27,6 +27,7 @@
|
||||||
#include <QDBusConnection>
|
#include <QDBusConnection>
|
||||||
#include <QDomDocument>
|
#include <QDomDocument>
|
||||||
#include "gobject-template.h"
|
#include "gobject-template.h"
|
||||||
|
#include "pinyinmanager.h"
|
||||||
|
|
||||||
using namespace UkuiSearch;
|
using namespace UkuiSearch;
|
||||||
size_t FileUtils::_max_index_count = 0;
|
size_t FileUtils::_max_index_count = 0;
|
||||||
|
@ -405,25 +406,25 @@ void stitchMultiToneWordsBFSHeapLess3(const QString &hanzi, QStringList &resultL
|
||||||
|
|
||||||
//BFS+Stack+超过3个多音字只建一个索引,比较折中的方案
|
//BFS+Stack+超过3个多音字只建一个索引,比较折中的方案
|
||||||
void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &resultList) {
|
void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &resultList) {
|
||||||
QString tempHanzi, resultAllPinYin, resultFirst;
|
QString tempHanzi;
|
||||||
QQueue<QString> tempQueue;
|
QQueue<QString> tempQueue;
|
||||||
QQueue<QString> tempQueueFirst;
|
QQueue<QString> tempQueueFirst;
|
||||||
tempHanzi = hanzi;
|
tempHanzi = hanzi;
|
||||||
int tempQueueSize = 0;
|
int tempQueueSize = 0;
|
||||||
int multiToneWordNum = 0;
|
int multiToneWordNum = 0;
|
||||||
|
|
||||||
for (auto i:hanzi) {
|
for (auto i:hanzi) {
|
||||||
if(FileUtils::map_chinese2pinyin.contains(i)) {
|
if (PinYinManager::getInstance()->isMultiTon(QString(i).toStdString()))
|
||||||
if(FileUtils::map_chinese2pinyin[i].size() > 1) {
|
|
||||||
++multiToneWordNum;
|
++multiToneWordNum;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
if(multiToneWordNum > 3) {
|
if(multiToneWordNum > 3) {
|
||||||
QString oneResult, oneResultFirst;
|
QString oneResult, oneResultFirst;
|
||||||
for(auto i : hanzi) {
|
for(auto i : hanzi) {
|
||||||
if(FileUtils::map_chinese2pinyin.contains(i)) {
|
QStringList results;
|
||||||
oneResult += FileUtils::map_chinese2pinyin[i].first();
|
PinYinManager::getInstance()->getResults(QString(i).toStdString(), results);
|
||||||
oneResultFirst += FileUtils::map_chinese2pinyin[i].first().at(0);
|
if(results.size()) {
|
||||||
|
oneResult += results.first();
|
||||||
|
oneResultFirst += results.first().at(0);
|
||||||
} else {
|
} else {
|
||||||
oneResult += i;
|
oneResult += i;
|
||||||
oneResultFirst += i;
|
oneResultFirst += i;
|
||||||
|
@ -434,8 +435,10 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
|
QStringList results;
|
||||||
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
|
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||||
|
if(results.size()) {
|
||||||
|
for(auto i : results) {
|
||||||
tempQueue.enqueue(i);
|
tempQueue.enqueue(i);
|
||||||
tempQueueFirst.enqueue(i.at(0));
|
tempQueueFirst.enqueue(i.at(0));
|
||||||
}
|
}
|
||||||
|
@ -445,10 +448,11 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
||||||
}
|
}
|
||||||
tempHanzi = tempHanzi.right(tempHanzi.size() - 1);
|
tempHanzi = tempHanzi.right(tempHanzi.size() - 1);
|
||||||
while(tempHanzi.size() != 0) {
|
while(tempHanzi.size() != 0) {
|
||||||
|
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||||
tempQueueSize = tempQueue.size();
|
tempQueueSize = tempQueue.size();
|
||||||
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
|
if(results.size()) {
|
||||||
for(int j = 0; j < tempQueueSize; ++j) {
|
for(int j = 0; j < tempQueueSize; ++j) {
|
||||||
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
|
for(auto i : results) {
|
||||||
tempQueue.enqueue(tempQueue.head() + i);
|
tempQueue.enqueue(tempQueue.head() + i);
|
||||||
tempQueueFirst.enqueue(tempQueueFirst.head() + i.at(0));
|
tempQueueFirst.enqueue(tempQueueFirst.head() + i.at(0));
|
||||||
}
|
}
|
||||||
|
@ -469,22 +473,12 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
||||||
resultList.append(tempQueue.dequeue());
|
resultList.append(tempQueue.dequeue());
|
||||||
resultList.append(tempQueueFirst.dequeue());
|
resultList.append(tempQueueFirst.dequeue());
|
||||||
}
|
}
|
||||||
// delete tempQueue;
|
|
||||||
// delete tempQueueFirst;
|
|
||||||
// tempQueue = nullptr;
|
|
||||||
// tempQueueFirst = nullptr;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
QStringList FileUtils::findMultiToneWords(const QString &hanzi) {
|
QStringList FileUtils::findMultiToneWords(const QString &hanzi) {
|
||||||
// QStringList* output = new QStringList();
|
|
||||||
QStringList output;
|
QStringList output;
|
||||||
QString tempAllPinYin, tempFirst;
|
|
||||||
QStringList stringList = hanzi.split("");
|
|
||||||
|
|
||||||
// stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output);
|
|
||||||
stitchMultiToneWordsBFSStackLess3(hanzi, output);
|
stitchMultiToneWordsBFSStackLess3(hanzi, output);
|
||||||
// qDebug() << output;
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,6 @@
|
||||||
#include <uchardet/uchardet.h>
|
#include <uchardet/uchardet.h>
|
||||||
//#include <poppler-qt5.h>
|
//#include <poppler-qt5.h>
|
||||||
#include <poppler/qt5/poppler-qt5.h>
|
#include <poppler/qt5/poppler-qt5.h>
|
||||||
#include <common.h>
|
|
||||||
|
|
||||||
#include "libsearch_global.h"
|
#include "libsearch_global.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,5 @@
|
||||||
<RCC>
|
<RCC>
|
||||||
<qresource prefix="/">
|
<qresource prefix="/">
|
||||||
<file>index/pinyinWithTone.txt</file>
|
|
||||||
<file>index/pinyinWithoutTone.txt</file>
|
|
||||||
<file>res/icons/desktop.png</file>
|
<file>res/icons/desktop.png</file>
|
||||||
<file>res/icons/close.svg</file>
|
<file>res/icons/close.svg</file>
|
||||||
<file>res/icons/edit-find-symbolic.svg</file>
|
<file>res/icons/edit-find-symbolic.svg</file>
|
||||||
|
|
Loading…
Reference in New Issue