优化多音字字典存储数据结构;部分代码及注释整理;

This commit is contained in:
jixiaoxu 2022-03-02 09:27:40 +08:00 committed by iaom
parent fb7811e417
commit 47af66e682
15 changed files with 7343 additions and 48449 deletions

View File

@ -46,16 +46,17 @@ struct SKeyWord {
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
public:
static ChineseSegmentation *getInstance();
~ChineseSegmentation();
QVector<SKeyWord> callSegement(std::string s);
//新添加callSegementStd函数修改返回值为stdvector<cppjieba::KeywordExtractor::Word>并简化内部处理流程--jxx20210517
//修改函数入参形式为引用去掉Qstring与std::string转换代码--jxx20210519
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
private:
explicit ChineseSegmentation();
~ChineseSegmentation();
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
private:
static QMutex m_mutex;
cppjieba::Jieba *m_jieba;
explicit ChineseSegmentation();
};

View File

@ -46,6 +46,16 @@ struct IdfElement {
}
};
struct PinYinElement
{
string word;
string tag;
bool operator < (const DatElement & b) const {
return this->word < b.word;
}
};
inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
}
@ -64,6 +74,19 @@ struct DatMemElem {
}
};
struct PinYinMemElem {
char tag[6] = {};
void SetTag(const string & str) {
memset(&tag[0], 0, sizeof(tag));
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
}
string GetTag() const {
return &tag[0];
}
};
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
}
@ -122,6 +145,17 @@ public:
return idf_elements_ptr_[ find_result.value ];
}
const PinYinMemElem * PinYinFind(const string & key) const {
JiebaDAT::result_pair_type find_result;
dat_.exactMatchSearch(key.c_str(), find_result);
if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
return nullptr;
}
return &pinyin_elements_ptr_[ find_result.value ];
}
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<struct DatDag>&res, size_t max_word_len) const {
@ -167,6 +201,7 @@ public:
}
}
/*
void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<struct DatDag>&res, size_t max_word_len) const {
@ -208,7 +243,8 @@ public:
res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
}
}
}
}*/
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<WordRange>& words, size_t max_word_len) const {
@ -300,6 +336,11 @@ public:
return InitIdfAttachDat(dat_cache_file, md5);
}
bool InitBuildDat(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
BuildDatCache(elements, dat_cache_file, md5);
return InitPinYinAttachDat(dat_cache_file, md5);
}
bool InitAttachDat(const string & dat_cache_file, const string & md5) {
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
@ -362,6 +403,37 @@ public:
return true;
}
bool InitPinYinAttachDat(const string & dat_cache_file, const string & md5) {
mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
if (mmap_fd_ < 0) {
return false;
}
const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
assert(seek_off >= 0);
mmap_length_ = seek_off;
mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
assert(MAP_FAILED != mmap_addr_);
assert(mmap_length_ >= sizeof(CacheFileHeader));
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
elements_num_ = header.elements_num;
min_weight_ = header.min_weight;
assert(sizeof(header.md5_hex) == md5.size());
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
return false;
}
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem) + header.dat_size * dat_.unit_size());
pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
dat_.set_array(dat_ptr, header.dat_size);
return true;
}
private:
void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
std::sort(elements.begin(), elements.end());
@ -464,13 +536,64 @@ private:
}
}
void BuildDatCache(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
//std::sort(elements.begin(), elements.end());
vector<const char*> keys_ptr_vec;
vector<int> values_vec;
vector<PinYinMemElem> mem_elem_vec;
keys_ptr_vec.reserve(elements.size());
values_vec.reserve(elements.size());
mem_elem_vec.reserve(elements.size());
CacheFileHeader header;
header.min_weight = min_weight_;
assert(sizeof(header.md5_hex) == md5.size());
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
for (size_t i = 0; i < elements.size(); ++i) {
keys_ptr_vec.push_back(elements[i].word.data());
values_vec.push_back(i);
mem_elem_vec.push_back(PinYinMemElem());
auto & mem_elem = mem_elem_vec.back();
mem_elem.SetTag(elements[i].tag);
}
auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
assert(0 == ret);
header.elements_num = mem_elem_vec.size();
header.dat_size = dat_.size();
{
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
::umask(S_IWGRP | S_IWOTH);
//const int fd =::mkstemp(&tmp_filepath[0]);
const int fd =::mkstemp((char *)tmp_filepath.data());
qDebug() << "mkstemp :" << errno << tmp_filepath.data();
assert(fd >= 0);
::fchmod(fd, 0644);
auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(mem_elem_vec[0]) * mem_elem_vec.size());
write_bytes += ::write(fd, dat_.array(), dat_.total_size());
assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(mem_elem_vec[0]) + dat_.total_size());
::close(fd);
const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
assert(0 == rename_ret);
}
}
DatTrie(const DatTrie &);
DatTrie &operator=(const DatTrie &);
private:
JiebaDAT dat_;
const DatMemElem * elements_ptr_ = nullptr;
const double * idf_elements_ptr_= nullptr;
const double * idf_elements_ptr_ = nullptr;
const PinYinMemElem * pinyin_elements_ptr_ = nullptr;
size_t elements_num_ = 0;
double min_weight_ = 0;

View File

@ -131,6 +131,7 @@ private:
const auto dict_list = dict_path + "|" + user_dict_paths;
size_t file_size_sum = 0;
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
@ -140,7 +141,6 @@ private:
qDebug() << "#########Dict path:" << path;
if (dat_.InitAttachDat(dat_cache_path, md5)) {
LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
total_dict_size_ = file_size_sum;
return;
}
@ -154,7 +154,6 @@ private:
LoadUserDict(user_dict_paths);
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
assert(build_ret);
total_dict_size_ = file_size_sum;
vector<DatElement>().swap(static_node_infos_);
}

View File

@ -39,21 +39,6 @@ public:
return dat_.Find(word, length, node_pos);
}
void Find(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<struct DatDag>&res,
size_t max_word_len = MAX_WORD_LENGTH) const {
dat_.Find(begin, end, res, max_word_len);
}
bool IsUserDictSingleChineseWord(const Rune& word) const {
return IsIn(user_dict_single_chinese_word_, word);
}
double GetMinWeight() const {
return dat_.GetMinWeight();
}
size_t GetTotalDictSize() const {
return total_dict_size_;
}
@ -63,6 +48,7 @@ private:
UserWordWeightOption user_word_weight_opt) {
size_t file_size_sum = 0;
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
@ -71,7 +57,6 @@ private:
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########Idf path:" << path;
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
total_dict_size_ = file_size_sum;
return;
}
@ -85,7 +70,6 @@ private:
const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
assert(build_ret);
total_dict_size_ = file_size_sum;
vector<IdfElement>().swap(static_node_infos_);
}
@ -128,7 +112,6 @@ private:
vector<IdfElement> static_node_infos_;
size_t total_dict_size_ = 0;
DatTrie dat_;
unordered_set<Rune> user_dict_single_chinese_word_;
};
}

View File

@ -84,6 +84,7 @@ private:
MixSegment segment_;
IdfTrie idf_trie_;
unordered_set<Rune> symbols_;
}; // class KeywordExtractor

View File

@ -0,0 +1,154 @@
#pragma once
#include <iostream>
#include <fstream>
#include <map>
#include <string>
#include <cstring>
#include <cstdlib>
#include <stdint.h>
#include <cmath>
#include <limits>
#include "limonp/StringUtil.hpp"
#include "limonp/Logging.hpp"
#include "Unicode.hpp"
#include "DatTrie.hpp"
#include <QDebug>
namespace cppjieba {
using namespace limonp;
const size_t PINYIN_COLUMN_NUM = 2;
class PinYinTrie {
public:
enum UserWordWeightOption {
WordWeightMin,
WordWeightMedian,
WordWeightMax,
}; // enum UserWordWeightOption
PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
Init(dict_path, dat_cache_path, user_word_weight_opt);
}
~PinYinTrie() {}
int getMultiTonResults(string word, QStringList &results) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) {
for (auto i:qmap_chinese2pinyin[QString::fromStdString(word)])
results.push_back(i);
return 0;
}
return -1;
}
int getSingleTonResult(string word, QString &result) {
const PinYinMemElem * tmp = dat_.PinYinFind(word);
if (tmp) {
result = QString::fromStdString(tmp->GetTag());
return 0;
}
return -1;
}
bool contains(string &word) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word))
or !dat_.PinYinFind(word))
return true;
// if (map_chinese2pinyin.contains(word)
// or !dat_.PinYinFind(word))
// return true;
return false;
}
bool isMultiTone(string &word) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
return true;
// if (map_chinese2pinyin.contains(word))
// return true;
return false;
}
size_t GetTotalDictSize() const {
return total_dict_size_;
}
private:
void Init(const string& dict_path, string dat_cache_path,
UserWordWeightOption user_word_weight_opt) {
size_t file_size_sum = 0;
vector<PinYinElement> node_infos;
const string md5 = CalcFileListMD5(dict_path, file_size_sum);
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
}
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########PinYin path:" << path << file_size_sum;
if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
//多音字仍需遍历文件信息
LoadDefaultPinYin(node_infos, dict_path, true);
return;
}
LoadDefaultPinYin(node_infos, dict_path, false);
double min_weight = 0;
dat_.SetMinWeight(min_weight);
const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
assert(build_ret);
vector<PinYinElement>().swap(node_infos);
}
void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
ifstream ifs(filePath.c_str());
if(not ifs.is_open()){
return ;
}
XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
string line;
vector<string> buf;
size_t lineno = 0;
for (; getline(ifs, line); lineno++) {
if (line.empty()) {
XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
continue;
}
Split(line, buf, " ");
if (buf.size() == PINYIN_COLUMN_NUM) {
if (multiFlag) {//非多音字
continue;
}
PinYinElement node_info;
node_info.word = buf[1];
node_info.tag = buf[0];
node_infos.push_back(node_info);
} else {//多音字
QString content = QString::fromUtf8(line.c_str());
qmap_chinese2pinyin[content.split(" ").last().trimmed()] = content.split(" ");
qmap_chinese2pinyin[content.split(" ").last().trimmed()].pop_back();
/*
//std map string list
list<string> tmpList;
for(int i = 0; i < buf.size() - 1; ++i){
tmpList.push_back(buf[i]);
}
map[buf[buf.size() - 1]] = tmpList;
*/
}
}
}
private:
QMap<QString, QStringList> qmap_chinese2pinyin;
//map<string, list<string>> map_chinese2pinyin;
size_t total_dict_size_ = 0;
DatTrie dat_;
};
}

View File

@ -3,6 +3,7 @@ INCLUDEPATH += $$PWD
HEADERS += \
$$PWD/DictTrie.hpp \
$$PWD/IdfTrie.hpp \
$$PWD/PinYinTrie.hpp \
$$PWD/FullSegment.hpp \
$$PWD/HMMModel.hpp \
$$PWD/HMMSegment.hpp \

View File

@ -23,14 +23,17 @@ include(cppjieba/cppjieba.pri)
SOURCES += \
chinese-segmentation.cpp \
pinyinmanager.cpp
HEADERS += \
chinese-segmentation.h \
libchinese-segmentation_global.h
libchinese-segmentation_global.h \
pinyinmanager.h
dict_files.path = /usr/share/ukui-search/res/dict/
dict_files.files = $$PWD/dict/*.utf8\
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
dict_files.files += $$PWD/dict/*.txt\
INSTALLS += \
dict_files \
@ -60,5 +63,6 @@ DISTFILES += \
dict/pos_dict/prob_start.utf8 \
dict/pos_dict/prob_trans.utf8 \
dict/stop_words.utf8 \
dict/user.dict.utf8
dict/user.dict.utf8 \
dict/pinyinWithoutTone.txt

View File

@ -0,0 +1,55 @@
#include "pinyinmanager.h"
#include <mutex>
PinYinManager * PinYinManager::g_pinYinManager = nullptr;
std::once_flag g_singleFlag;
PinYinManager * PinYinManager::getInstance()
{
call_once(g_singleFlag, []() {
g_pinYinManager = new PinYinManager;
});
return g_pinYinManager;
}
bool PinYinManager::contains(string &word)
{
return m_pinYinTrie->contains(word);
}
bool PinYinManager::isMultiTon(string &word)
{
return m_pinYinTrie->isMultiTone(word);
}
bool PinYinManager::isMultiTon(string word)
{
return m_pinYinTrie->isMultiTone(word);
}
int PinYinManager::getResults(string word, QStringList &results)
{
results.clear();
if (-1 != m_pinYinTrie->getMultiTonResults(word, results)) {
return 0;
}
QString tmp;
if (-1 != m_pinYinTrie->getSingleTonResult(word, tmp)) {
results.append(tmp);
return 0;
}
return -1;
}
PinYinManager::PinYinManager()
{
const char * const PINYIN_PATH = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
m_pinYinTrie = new cppjieba::PinYinTrie(PINYIN_PATH);
}
PinYinManager::~PinYinManager()
{
if (m_pinYinTrie){
delete m_pinYinTrie;
m_pinYinTrie = nullptr;
}
}

View File

@ -0,0 +1,33 @@
#ifndef PINYINMANAGER_H
#define PINYINMANAGER_H
#include <QtCore/qglobal.h>
#include "cppjieba/PinYinTrie.hpp"
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
using namespace std;
class PINYINMANAGER_EXPORT PinYinManager
{
public:
static PinYinManager * getInstance();
public:
bool contains(string &word);
bool isMultiTon(string &word);
bool isMultiTon(string word);
int getResults(string word, QStringList &results);
protected:
PinYinManager();
~PinYinManager();
private:
static PinYinManager *g_pinYinManager;
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
};
#endif // PINYINMANAGER_H

View File

@ -27,6 +27,7 @@
#include <QDBusConnection>
#include <QDomDocument>
#include "gobject-template.h"
#include "pinyinmanager.h"
using namespace UkuiSearch;
size_t FileUtils::_max_index_count = 0;
@ -405,25 +406,25 @@ void stitchMultiToneWordsBFSHeapLess3(const QString &hanzi, QStringList &resultL
//BFS+Stack+超过3个多音字只建一个索引比较折中的方案
void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &resultList) {
QString tempHanzi, resultAllPinYin, resultFirst;
QString tempHanzi;
QQueue<QString> tempQueue;
QQueue<QString> tempQueueFirst;
tempHanzi = hanzi;
int tempQueueSize = 0;
int multiToneWordNum = 0;
for(auto i : hanzi) {
if(FileUtils::map_chinese2pinyin.contains(i)) {
if(FileUtils::map_chinese2pinyin[i].size() > 1) {
for (auto i:hanzi) {
if (PinYinManager::getInstance()->isMultiTon(QString(i).toStdString()))
++multiToneWordNum;
}
}
}
if(multiToneWordNum > 3) {
QString oneResult, oneResultFirst;
for(auto i : hanzi) {
if(FileUtils::map_chinese2pinyin.contains(i)) {
oneResult += FileUtils::map_chinese2pinyin[i].first();
oneResultFirst += FileUtils::map_chinese2pinyin[i].first().at(0);
QStringList results;
PinYinManager::getInstance()->getResults(QString(i).toStdString(), results);
if(results.size()) {
oneResult += results.first();
oneResultFirst += results.first().at(0);
} else {
oneResult += i;
oneResultFirst += i;
@ -434,8 +435,10 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
return;
}
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
QStringList results;
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
if(results.size()) {
for(auto i : results) {
tempQueue.enqueue(i);
tempQueueFirst.enqueue(i.at(0));
}
@ -445,10 +448,11 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
}
tempHanzi = tempHanzi.right(tempHanzi.size() - 1);
while(tempHanzi.size() != 0) {
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
tempQueueSize = tempQueue.size();
if(FileUtils::map_chinese2pinyin.contains(tempHanzi.at(0))) {
if(results.size()) {
for(int j = 0; j < tempQueueSize; ++j) {
for(auto i : FileUtils::map_chinese2pinyin[tempHanzi.at(0)]) {
for(auto i : results) {
tempQueue.enqueue(tempQueue.head() + i);
tempQueueFirst.enqueue(tempQueueFirst.head() + i.at(0));
}
@ -469,22 +473,12 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
resultList.append(tempQueue.dequeue());
resultList.append(tempQueueFirst.dequeue());
}
// delete tempQueue;
// delete tempQueueFirst;
// tempQueue = nullptr;
// tempQueueFirst = nullptr;
return;
}
QStringList FileUtils::findMultiToneWords(const QString &hanzi) {
// QStringList* output = new QStringList();
QStringList output;
QString tempAllPinYin, tempFirst;
QStringList stringList = hanzi.split("");
// stitchMultiToneWordsDFS(hanzi, tempAllPinYin, tempFirst, output);
stitchMultiToneWordsBFSStackLess3(hanzi, output);
// qDebug() << output;
return output;
}

View File

@ -51,7 +51,6 @@
#include <uchardet/uchardet.h>
//#include <poppler-qt5.h>
#include <poppler/qt5/poppler-qt5.h>
#include <common.h>
#include "libsearch_global.h"
#include "common.h"

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,5 @@
<RCC>
<qresource prefix="/">
<file>index/pinyinWithTone.txt</file>
<file>index/pinyinWithoutTone.txt</file>
<file>res/icons/desktop.png</file>
<file>res/icons/close.svg</file>
<file>res/icons/edit-find-symbolic.svg</file>