forked from openkylin/ukui-search
优化多音字字典存储数据结构;部分代码及注释整理;
This commit is contained in:
parent
fb7811e417
commit
47af66e682
|
@ -46,16 +46,17 @@ struct SKeyWord {
|
|||
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
|
||||
public:
|
||||
static ChineseSegmentation *getInstance();
|
||||
~ChineseSegmentation();
|
||||
QVector<SKeyWord> callSegement(std::string s);
|
||||
//新添加callSegementStd函数,修改返回值为std::vector<cppjieba::KeywordExtractor::Word>并简化内部处理流程--jxx20210517
|
||||
//修改函数入参形式为引用,去掉Qstring与std::string转换代码--jxx20210519
|
||||
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
|
||||
|
||||
private:
|
||||
explicit ChineseSegmentation();
|
||||
~ChineseSegmentation();
|
||||
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
|
||||
|
||||
private:
|
||||
static QMutex m_mutex;
|
||||
cppjieba::Jieba *m_jieba;
|
||||
explicit ChineseSegmentation();
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -46,6 +46,16 @@ struct IdfElement {
|
|||
}
|
||||
};
|
||||
|
||||
struct PinYinElement
|
||||
{
|
||||
string word;
|
||||
string tag;
|
||||
|
||||
bool operator < (const DatElement & b) const {
|
||||
return this->word < b.word;
|
||||
}
|
||||
};
|
||||
|
||||
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
||||
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
||||
}
|
||||
|
@ -64,6 +74,19 @@ struct DatMemElem {
|
|||
}
|
||||
};
|
||||
|
||||
// Fixed-size record stored in the pinyin cache file and read back through mmap.
// The record is exactly sizeof(tag) bytes, so its size must not change without
// invalidating existing cache files.
struct PinYinMemElem {
    // Zero-padded pinyin text. A value that fills all 6 bytes (e.g. "zhuang")
    // is stored WITHOUT a trailing NUL, so always read it through GetTag().
    char tag[6] = {};

    // Stores up to sizeof(tag) bytes of `str`; longer input is truncated.
    // The previous implementation copied at most sizeof(tag) - 1 bytes, which
    // silently cut six-letter syllables down to five characters.
    void SetTag(const std::string & str) {
        const std::size_t len = std::min(str.size(), sizeof(tag));
        memset(&tag[0], 0, sizeof(tag));
        memcpy(&tag[0], str.data(), len);
    }

    // Returns the stored text. The read is bounded by sizeof(tag) because the
    // buffer is not guaranteed to be NUL-terminated.
    std::string GetTag() const {
        const void * nul = memchr(&tag[0], '\0', sizeof(tag));
        const std::size_t len = (nul != nullptr)
                                ? static_cast<std::size_t>(static_cast<const char *>(nul) - &tag[0])
                                : sizeof(tag);
        return std::string(&tag[0], len);
    }
};
|
||||
|
||||
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
|
||||
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
|
||||
}
|
||||
|
@ -122,6 +145,17 @@ public:
|
|||
return idf_elements_ptr_[ find_result.value ];
|
||||
}
|
||||
|
||||
// Looks up `key` as an exact match in the double-array trie.
// Returns a pointer to the matching pinyin record, or nullptr when there is no
// match or the stored index falls outside [0, elements_num_).
const PinYinMemElem * PinYinFind(const string & key) const {
    JiebaDAT::result_pair_type match;
    dat_.exactMatchSearch(key.c_str(), match);

    if (0 == match.length) {
        return nullptr;
    }

    // Reject indexes that would read outside the mapped record array.
    const bool index_ok = (match.value >= 0) && ((size_t)match.value < elements_num_);
    if (!index_ok) {
        return nullptr;
    }

    return pinyin_elements_ptr_ + match.value;
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||
|
||||
|
@ -167,6 +201,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res, size_t max_word_len) const {
|
||||
|
||||
|
@ -208,7 +243,8 @@ public:
|
|||
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
|
||||
}
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words, size_t max_word_len) const {
|
||||
|
||||
|
@ -300,6 +336,11 @@ public:
|
|||
return InitIdfAttachDat(dat_cache_file, md5);
|
||||
}
|
||||
|
||||
// Builds the pinyin cache file from `elements`, then attaches to the file just written.
// Returns the result of InitPinYinAttachDat().
bool InitBuildDat(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
    BuildDatCache(elements, dat_cache_file, md5);
    const bool attached = InitPinYinAttachDat(dat_cache_file, md5);
    return attached;
}
|
||||
|
||||
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
|
||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
|
@ -362,6 +403,37 @@ public:
|
|||
return true;
|
||||
}
|
||||
|
||||
bool InitPinYinAttachDat(const string & dat_cache_file, const string & md5) {
|
||||
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (mmap_fd_ < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
|
||||
assert(seek_off >= 0);
|
||||
mmap_length_ = seek_off;
|
||||
|
||||
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
|
||||
assert(MAP_FAILED != mmap_addr_);
|
||||
|
||||
assert(mmap_length_ >= sizeof(CacheFileHeader));
|
||||
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
|
||||
elements_num_ = header.elements_num;
|
||||
min_weight_ = header.min_weight;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem) + header.dat_size * dat_.unit_size());
|
||||
pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
|
||||
std::sort(elements.begin(), elements.end());
|
||||
|
@ -464,13 +536,64 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// Builds the pinyin double-array trie from `elements` and writes the cache file.
//
// The file is written as: CacheFileHeader, then one PinYinMemElem per element,
// then the raw double-array. It is first written to a mkstemp() temporary next
// to `dat_cache_file` and then renamed over the target.
//
// `elements` is reordered in place (sorted by key).
void BuildDatCache(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
    // Sort by key, as the DatElement overload of BuildDatCache does.
    // NOTE(review): Darts-style builders require keys in ascending order - confirm
    // for JiebaDAT. stable_sort keeps the file order of entries with equal keys.
    std::stable_sort(elements.begin(), elements.end(),
                     [](const PinYinElement & lhs, const PinYinElement & rhs) {
                         return lhs.word < rhs.word;
                     });

    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    vector<PinYinMemElem> mem_elem_vec;

    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());
    mem_elem_vec.reserve(elements.size());

    CacheFileHeader header;
    header.min_weight = min_weight_;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    // The value stored for key i is the index i of its record in mem_elem_vec.
    for (size_t i = 0; i < elements.size(); ++i) {
        keys_ptr_vec.push_back(elements[i].word.data());
        values_vec.push_back(i);
        mem_elem_vec.push_back(PinYinMemElem());
        mem_elem_vec.back().SetTag(elements[i].tag);
    }

    // data() instead of &v[0]: well-defined even when `elements` is empty.
    auto const ret = dat_.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    assert(0 == ret);
    header.elements_num = mem_elem_vec.size();
    header.dat_size = dat_.size();

    {
        string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
        // The umask is process-wide; remember the old value so it can be restored.
        const mode_t old_mask = ::umask(S_IWGRP | S_IWOTH);
        // mkstemp() rewrites the XXXXXX suffix in place, so it needs a mutable buffer.
        const int fd = ::mkstemp(&tmp_filepath[0]);
        qDebug() << "mkstemp :" << errno << tmp_filepath.data();
        ::umask(old_mask);
        assert(fd >= 0);
        ::fchmod(fd, 0644);

        auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
        write_bytes += ::write(fd, (const char *)mem_elem_vec.data(), sizeof(PinYinMemElem) * mem_elem_vec.size());
        write_bytes += ::write(fd, dat_.array(), dat_.total_size());

        assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(PinYinMemElem) + dat_.total_size());
        ::close(fd);

        // Rename last so the cache path never points at a partially written file.
        const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
        assert(0 == rename_ret);
    }
}
|
||||
|
||||
DatTrie(const DatTrie &);
|
||||
DatTrie &operator=(const DatTrie &);
|
||||
|
||||
private:
|
||||
JiebaDAT dat_;
|
||||
const DatMemElem * elements_ptr_ = nullptr;
|
||||
const double * idf_elements_ptr_= nullptr;
|
||||
const double * idf_elements_ptr_ = nullptr;
|
||||
const PinYinMemElem * pinyin_elements_ptr_ = nullptr;
|
||||
size_t elements_num_ = 0;
|
||||
double min_weight_ = 0;
|
||||
|
||||
|
|
|
@ -131,6 +131,7 @@ private:
|
|||
const auto dict_list = dict_path + "|" + user_dict_paths;
|
||||
size_t file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
||||
total_dict_size_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
|
@ -140,7 +141,6 @@ private:
|
|||
qDebug() << "#########Dict path:" << path;
|
||||
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
||||
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
|
||||
total_dict_size_ = file_size_sum;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -154,7 +154,6 @@ private:
|
|||
LoadUserDict(user_dict_paths);
|
||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
total_dict_size_ = file_size_sum;
|
||||
vector<DatElement>().swap(static_node_infos_);
|
||||
}
|
||||
|
||||
|
|
|
@ -39,21 +39,6 @@ public:
|
|||
return dat_.Find(word, length, node_pos);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
dat_.Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
bool IsUserDictSingleChineseWord(const Rune& word) const {
|
||||
return IsIn(user_dict_single_chinese_word_, word);
|
||||
}
|
||||
|
||||
double GetMinWeight() const {
|
||||
return dat_.GetMinWeight();
|
||||
}
|
||||
|
||||
size_t GetTotalDictSize() const {
|
||||
return total_dict_size_;
|
||||
}
|
||||
|
@ -63,6 +48,7 @@ private:
|
|||
UserWordWeightOption user_word_weight_opt) {
|
||||
size_t file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
||||
total_dict_size_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
|
@ -71,7 +57,6 @@ private:
|
|||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########Idf path:" << path;
|
||||
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
|
||||
total_dict_size_ = file_size_sum;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -85,7 +70,6 @@ private:
|
|||
|
||||
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
total_dict_size_ = file_size_sum;
|
||||
vector<IdfElement>().swap(static_node_infos_);
|
||||
}
|
||||
|
||||
|
@ -128,7 +112,6 @@ private:
|
|||
vector<IdfElement> static_node_infos_;
|
||||
size_t total_dict_size_ = 0;
|
||||
DatTrie dat_;
|
||||
unordered_set<Rune> user_dict_single_chinese_word_;
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -84,6 +84,7 @@ private:
|
|||
MixSegment segment_;
|
||||
IdfTrie idf_trie_;
|
||||
|
||||
|
||||
unordered_set<Rune> symbols_;
|
||||
}; // class KeywordExtractor
|
||||
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <stdint.h>
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "DatTrie.hpp"
|
||||
#include <QDebug>
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
|
||||
const size_t PINYIN_COLUMN_NUM = 2;
|
||||
|
||||
class PinYinTrie {
|
||||
public:
|
||||
enum UserWordWeightOption {
|
||||
WordWeightMin,
|
||||
WordWeightMedian,
|
||||
WordWeightMax,
|
||||
}; // enum UserWordWeightOption
|
||||
|
||||
PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
|
||||
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
|
||||
Init(dict_path, dat_cache_path, user_word_weight_opt);
|
||||
}
|
||||
|
||||
~PinYinTrie() {}
|
||||
|
||||
int getMultiTonResults(string word, QStringList &results) {
|
||||
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) {
|
||||
for (auto i:qmap_chinese2pinyin[QString::fromStdString(word)])
|
||||
results.push_back(i);
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int getSingleTonResult(string word, QString &result) {
|
||||
const PinYinMemElem * tmp = dat_.PinYinFind(word);
|
||||
if (tmp) {
|
||||
result = QString::fromStdString(tmp->GetTag());
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool contains(string &word) {
|
||||
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))
|
||||
or !dat_.PinYinFind(word))
|
||||
return true;
|
||||
// if (map_chinese2pinyin.contains(word)
|
||||
// or !dat_.PinYinFind(word))
|
||||
// return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool isMultiTone(string &word) {
|
||||
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
|
||||
return true;
|
||||
// if (map_chinese2pinyin.contains(word))
|
||||
// return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t GetTotalDictSize() const {
|
||||
return total_dict_size_;
|
||||
}
|
||||
|
||||
private:
|
||||
void Init(const string& dict_path, string dat_cache_path,
|
||||
UserWordWeightOption user_word_weight_opt) {
|
||||
size_t file_size_sum = 0;
|
||||
vector<PinYinElement> node_infos;
|
||||
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
|
||||
total_dict_size_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
}
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########PinYin path:" << path << file_size_sum;
|
||||
if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
|
||||
//多音字仍需遍历文件信息
|
||||
LoadDefaultPinYin(node_infos, dict_path, true);
|
||||
return;
|
||||
}
|
||||
|
||||
LoadDefaultPinYin(node_infos, dict_path, false);
|
||||
double min_weight = 0;
|
||||
dat_.SetMinWeight(min_weight);
|
||||
|
||||
const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
vector<PinYinElement>().swap(node_infos);
|
||||
}
|
||||
|
||||
void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
if(not ifs.is_open()){
|
||||
return ;
|
||||
}
|
||||
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
|
||||
string line;
|
||||
vector<string> buf;
|
||||
size_t lineno = 0;
|
||||
|
||||
for (; getline(ifs, line); lineno++) {
|
||||
if (line.empty()) {
|
||||
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
|
||||
continue;
|
||||
}
|
||||
Split(line, buf, " ");
|
||||
if (buf.size() == PINYIN_COLUMN_NUM) {
|
||||
if (multiFlag) {//非多音字
|
||||
continue;
|
||||
}
|
||||
PinYinElement node_info;
|
||||
node_info.word = buf[1];
|
||||
node_info.tag = buf[0];
|
||||
node_infos.push_back(node_info);
|
||||
} else {//多音字
|
||||
QString content = QString::fromUtf8(line.c_str());
|
||||
qmap_chinese2pinyin[content.split(" ").last().trimmed()] = content.split(" ");
|
||||
qmap_chinese2pinyin[content.split(" ").last().trimmed()].pop_back();
|
||||
/*
|
||||
//std map string list
|
||||
list<string> tmpList;
|
||||
for(int i = 0; i < buf.size() - 1; ++i){
|
||||
tmpList.push_back(buf[i]);
|
||||
}
|
||||
map[buf[buf.size() - 1]] = tmpList;
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
QMap<QString, QStringList> qmap_chinese2pinyin;
|
||||
//map<string, list<string>> map_chinese2pinyin;
|
||||
size_t total_dict_size_ = 0;
|
||||
DatTrie dat_;
|
||||
};
|
||||
}
|
||||
|
|
@ -3,6 +3,7 @@ INCLUDEPATH += $$PWD
|
|||
HEADERS += \
|
||||
$$PWD/DictTrie.hpp \
|
||||
$$PWD/IdfTrie.hpp \
|
||||
$$PWD/PinYinTrie.hpp \
|
||||
$$PWD/FullSegment.hpp \
|
||||
$$PWD/HMMModel.hpp \
|
||||
$$PWD/HMMSegment.hpp \
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -23,14 +23,17 @@ include(cppjieba/cppjieba.pri)
|
|||
|
||||
SOURCES += \
|
||||
chinese-segmentation.cpp \
|
||||
pinyinmanager.cpp
|
||||
|
||||
HEADERS += \
|
||||
chinese-segmentation.h \
|
||||
libchinese-segmentation_global.h
|
||||
libchinese-segmentation_global.h \
|
||||
pinyinmanager.h
|
||||
|
||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||
dict_files.files = $$PWD/dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/*.txt\
|
||||
|
||||
INSTALLS += \
|
||||
dict_files \
|
||||
|
@ -60,5 +63,6 @@ DISTFILES += \
|
|||
dict/pos_dict/prob_start.utf8 \
|
||||
dict/pos_dict/prob_trans.utf8 \
|
||||
dict/stop_words.utf8 \
|
||||
dict/user.dict.utf8
|
||||
dict/user.dict.utf8 \
|
||||
dict/pinyinWithoutTone.txt
|
||||
|
||||
|
|
|
@ -0,0 +1,55 @@
|
|||
#include "pinyinmanager.h"
|
||||
#include <mutex>
|
||||
// Process-wide instance, created on the first getInstance() call.
PinYinManager * PinYinManager::g_pinYinManager = nullptr;
// Guards the one-time construction of g_pinYinManager.
std::once_flag g_singleFlag;

// Returns the shared PinYinManager, constructing it exactly once via std::call_once.
PinYinManager * PinYinManager::getInstance()
{
    const auto create = []() {
        g_pinYinManager = new PinYinManager;
    };
    std::call_once(g_singleFlag, create);
    return g_pinYinManager;
}
|
||||
|
||||
// Forwards to PinYinTrie::contains() for `word`.
bool PinYinManager::contains(string &word)
{
    const bool known = m_pinYinTrie->contains(word);
    return known;
}
|
||||
|
||||
// Forwards to PinYinTrie::isMultiTone() for `word` (reference overload).
bool PinYinManager::isMultiTon(string &word)
{
    const bool polyphonic = m_pinYinTrie->isMultiTone(word);
    return polyphonic;
}
|
||||
|
||||
// Forwards to PinYinTrie::isMultiTone() for `word` (by-value overload, accepts temporaries).
bool PinYinManager::isMultiTon(string word)
{
    const bool polyphonic = m_pinYinTrie->isMultiTone(word);
    return polyphonic;
}
|
||||
|
||||
// Fills `results` with the pinyin reading(s) of `word`.
// `results` is cleared first. The polyphonic table is tried before the
// single-reading lookup. Returns 0 when either lookup succeeds, -1 otherwise.
int PinYinManager::getResults(string word, QStringList &results)
{
    results.clear();

    const bool multiFound = (-1 != m_pinYinTrie->getMultiTonResults(word, results));
    if (multiFound) {
        return 0;
    }

    QString single;
    if (-1 == m_pinYinTrie->getSingleTonResult(word, single)) {
        return -1;
    }
    results.append(single);
    return 0;
}
|
||||
|
||||
// Creates the PinYinTrie from the installed dictionary file.
PinYinManager::PinYinManager()
{
    const char * const dictPath = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
    m_pinYinTrie = new cppjieba::PinYinTrie(dictPath);
}
|
||||
|
||||
// Destroys the owned PinYinTrie and clears the pointer.
PinYinManager::~PinYinManager()
{
    // Deleting a null pointer is a no-op, so no explicit null check is needed.
    delete m_pinYinTrie;
    m_pinYinTrie = nullptr;
}
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef PINYINMANAGER_H
|
||||
#define PINYINMANAGER_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include "cppjieba/PinYinTrie.hpp"
|
||||
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PINYINMANAGER_EXPORT PinYinManager
|
||||
{
|
||||
public:
|
||||
static PinYinManager * getInstance();
|
||||
|
||||
public:
|
||||
bool contains(string &word);
|
||||
bool isMultiTon(string &word);
|
||||
bool isMultiTon(string word);
|
||||
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
protected:
|
||||
PinYinManager();
|
||||
~PinYinManager();
|
||||
|
||||
private:
|
||||
static PinYinManager *g_pinYinManager;
|
||||
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
|
||||
|
||||
};
|
||||
|
||||
#endif // PINYINMANAGER_H
|
|
@ -27,6 +27,7 @@
|
|||
#include <QDBusConnection>
|
||||
#include <QDomDocument>
|
||||
#include "gobject-template.h"
|
||||
#include "pinyinmanager.h"
|
||||
|
||||
using namespace UkuiSearch;
|
||||
size_t FileUtils::_max_index_count = 0;
|
||||
|
@ -405,25 +406,25 @@ void stitchMultiToneWordsBFSHeapLess3(const QString &hanzi, QStringList &resultL
|
|||
|
||||
//BFS+Stack+超过3个多音字只建一个索引,比较折中的方案
|
||||
void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &resultList) {
|
||||
QString tempHanzi, resultAllPinYin, resultFirst;
|
||||
QString tempHanzi;
|
||||
QQueue<QString> tempQueue;
|
||||
QQueue<QString> tempQueueFirst;
|
||||
tempHanzi = hanzi;
|
||||
int tempQueueSize = 0;
|
||||
int multiToneWordNum = 0;
|
||||
for(auto i : hanzi) {
|
||||
if(FileUtils::map_chinese2pinyin.contains(i)) {
|
||||
if(FileUtils::map_chinese2pinyin[i].size() > 1) {
|
||||
++multiToneWordNum;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto i:hanzi) {
|
||||
if (PinYinManager::getInstance()->isMultiTon(QString(i).toStdString()))
|
||||
++multiToneWordNum;
|
||||
}
|
||||
if(multiToneWordNum > 3) {
|
||||
QString oneResult, oneResultFirst;
|
||||
for(auto i : hanzi) {
|
||||
if(FileUtils::map_chinese2pinyin.contains(i)) {
|
||||
oneResult += FileUtils::map_chinese2pinyin[i].first();
|
||||
oneResultFirst += FileUtils::map_chinese2pinyin[i].first().at(0);
|
||||
QStringList results;
|
||||
PinYinManager::getInstance()->getResults(QString(i).toStdString(), results);
|
||||
if(results.size()) {
|
||||
oneResult += results.first();
|
||||
oneResultFirst += results.first().at(0);
|
||||
} else {
|
||||
oneResult += i;
|
||||
oneResultFirst += i;
|
||||
|
@ -434,8 +435,10 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
return;
|
||||
}
|
||||
|
||||
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
|
||||
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
|
||||
QStringList results;
|
||||
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
if(results.size()) {
|
||||
for(auto i : results) {
|
||||
tempQueue.enqueue(i);
|
||||
tempQueueFirst.enqueue(i.at(0));
|
||||
}
|
||||
|
@ -445,10 +448,11 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
}
|
||||
tempHanzi = tempHanzi.right(tempHanzi.size() - 1);
|
||||
while(tempHanzi.size() != 0) {
|
||||
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
tempQueueSize = tempQueue.size();
|
||||
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
|
||||
if(results.size()) {
|
||||
for(int j = 0; j < tempQueueSize; ++j) {
|
||||
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
|
||||
for(auto i : results) {
|
||||
tempQueue.enqueue(tempQueue.head() + i);
|
||||
tempQueueFirst.enqueue(tempQueueFirst.head() + i.at(0));
|
||||
}
|
||||
|
@ -469,22 +473,12 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
resultList.append(tempQueue.dequeue());
|
||||
resultList.append(tempQueueFirst.dequeue());
|
||||
}
|
||||
// delete tempQueue;
|
||||
// delete tempQueueFirst;
|
||||
// tempQueue = nullptr;
|
||||
// tempQueueFirst = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
QStringList FileUtils::findMultiToneWords(const QString &hanzi) {
|
||||
// QStringList* output = new QStringList();
|
||||
QStringList output;
|
||||
QString tempAllPinYin, tempFirst;
|
||||
QStringList stringList = hanzi.split("");
|
||||
|
||||
// stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output);
|
||||
stitchMultiToneWordsBFSStackLess3(hanzi, output);
|
||||
// qDebug() << output;
|
||||
return output;
|
||||
}
|
||||
|
||||
|
|
|
@ -51,7 +51,6 @@
|
|||
#include <uchardet/uchardet.h>
|
||||
//#include <poppler-qt5.h>
|
||||
#include <poppler/qt5/poppler-qt5.h>
|
||||
#include <common.h>
|
||||
|
||||
#include "libsearch_global.h"
|
||||
#include "common.h"
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,5 @@
|
|||
<RCC>
|
||||
<qresource prefix="/">
|
||||
<file>index/pinyinWithTone.txt</file>
|
||||
<file>index/pinyinWithoutTone.txt</file>
|
||||
<file>res/icons/desktop.png</file>
|
||||
<file>res/icons/close.svg</file>
|
||||
<file>res/icons/edit-find-symbolic.svg</file>
|
||||
|
|
Loading…
Reference in New Issue