From b499293d82971f5158458b668082dc2ba4bffc66 Mon Sep 17 00:00:00 2001 From: iaom Date: Tue, 6 Jun 2023 15:23:27 +0800 Subject: [PATCH] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E5=AD=90=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=E6=94=B9=E5=8A=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libchinese-segmentation/README.md | 170 ++++++++++++++++++ .../Traditional2Simplified_trie.cpp | 3 +- .../cppjieba/idf-trie/idf-trie.cpp | 3 +- .../cppjieba/segment-trie/segment-trie.cpp | 3 +- .../pinyin4cpp/pinyin4cpp-trie.cpp | 3 +- .../storage-base/storage-base.hpp | 15 ++ 6 files changed, 189 insertions(+), 8 deletions(-) create mode 100644 libchinese-segmentation/README.md diff --git a/libchinese-segmentation/README.md b/libchinese-segmentation/README.md new file mode 100644 index 0000000..aaf1ba5 --- /dev/null +++ b/libchinese-segmentation/README.md @@ -0,0 +1,170 @@ +# chinese-segmentation + +#### 介绍 +libchinese-segmentation工程以单例的形式分别提供了中文分词、汉字转拼音和中文繁体简体转换功能。 + +接口文件分别为: +chinese-segmentation.h +libchinese-segmentation_global.h +common-struct.h + +hanzi-to-pinyin.h +pinyin4cpp-common.h + +Traditional-to-Simplified.h +安装路径:/usr/include/chinese-seg + +#### 使用说明 + +其中中文分词相关功能由chinese-segmentation.h提供接口,主要包括以下功能函数: + +``` + static ChineseSegmentation *getInstance();//全局单例 + /** + * @brief ChineseSegmentation::callSegment + * 调用extractor进行关键词提取,先使用Mix方式初步分词,再使用Idf词典进行关键词提取,只包含两字以上关键词 + * + * @param sentence 要提取关键词的句子 + * @return vector 存放提取后关键词的信息的容器 + */ + vector callSegment(const string &sentence); + vector callSegment(QString &sentence); + + /** + * @brief ChineseSegmentation::callMixSegmentCutStr + * 使用Mix方法进行分词,即先使用最大概率法MP初步分词,再用隐式马尔科夫模型HMM进一步分词,可以准确切出词典已有词和未登录词,结果比较准确 + * + * @param sentence 要分词的句子 + * @return vector 只存放分词后每个词的内容的容器 + */ + vector callMixSegmentCutStr(const string& sentence); + + /** + * @brief ChineseSegmentation::callMixSegmentCutWord + * 和callMixSegmentCutStr功能相同 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 
+ */ + vector callMixSegmentCutWord(const string& str); + + /** + * @brief ChineseSegmentation::lookUpTagOfWord + * 查询word的词性 + * @param word 要查询词性的词 + * @return string word的词性 + */ + string lookUpTagOfWord(const string& word); + + /** + * @brief ChineseSegmentation::getTagOfWordsInSentence + * 使用Mix分词后获取每个词的词性 + * @param sentence 要分词的句子 + * @return vector<pair<string, string>> 分词后的每个词的内容(first)和其对应的词性(second) + */ + vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence); + + /** + * @brief ChineseSegmentation::callFullSegment + * 使用Full进行分词,Full会切出字典里所有的词。 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callFullSegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callQuerySegment + * 使用Query进行分词,即先使用Mix,对于长词再用Full,结果最精确,但词的数量也最大 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callQuerySegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callHMMSegment + * 使用隐式马尔科夫模型HMM进行分词 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callHMMSegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callMPSegment + * 使用最大概率法MP进行分词 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callMPSegment(const string& sentence); + +``` + +汉字转拼音相关功能由hanzi-to-pinyin.h提供接口,主要包括以下功能函数: + +``` + static HanZiToPinYin * getInstance();//全局单例 + + /** + * @brief HanZiToPinYin::isMultiTone 判断是否为多音字/词/句 + * @param word 要判断的字/词/句 + * @return bool 不是返回false + */ + bool isMultiTone(string &word); + bool isMultiTone(string &&word); + bool isMultiTone(const string &word); + bool isMultiTone(const string &&word); + + /** + * @brief HanZiToPinYin::contains 查询某个字/词/句是否有拼音(是否在数据库包含) + * @param word 要查询的字/词/句 + * @return bool 数据库不包含返回false + */ + bool contains(string &word); + + /** + * @brief HanZiToPinYin::getResults 获取某个字/词/句的拼音 + * @param word 要获取拼音的字/词/句 + * @param results word的拼音列表(有可能多音字),每次调用results会被清空 + * @return int 获取到返回0,否则返回-1 + */ + int getResults(string 
word, QStringList &results); + + /** + * @brief setConfig 设置HanZiToPinYin的各项功能,详见pinyin4cpp-common.h + * @param dataStyle 返回数据风格,默认default + * @param segType 是否启用分词,默认启用 + * @param polyphoneType 是否启用多音字,默认不启用 + * @param processType 无拼音数据处理模式,默认default + */ + void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType); + +``` + +中文繁体转简体相关功能由Traditional-to-Simplified.h提供接口,主要包括以下功能函数: + +``` + static Traditional2Simplified * getInstance();//全局单例 + /** + * @brief Traditional2Simplified::isTraditional 判断是否为繁体字,是则返回true + * @param oneWord 要判断的字 + * @return bool 不是返回false + */ + bool isTraditional(string &oneWord); + + /** + * @brief Traditional2Simplified::getResults 转换某个字/词/句的繁体字 + * @param words 要转换为简体中文的字/词/句 + * @return words 的简体中文结果 + */ + string getResults(string words); + +``` + +除此之外工程中提供了测试程序位于chinese-segmentation/test,运行界面如下: +![输入图片说明](https://foruda.gitee.com/images/1682048388802220746/245a2ec3_8021248.png "image.png") + +#### 参与贡献 + +1. Fork 本仓库 +2. 新建分支 +3. 提交代码 +4. 
新建 Pull Request + diff --git a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp index 06de94a..a277c18 100644 --- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp +++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp @@ -66,8 +66,7 @@ void Traditional2SimplifiedTrie::LoadSourceFile(const string &dat_cache_file, co close(fd); assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize()); - const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str()); - assert(0 == rename_ret); + tryRename(tmp_filepath, dat_cache_file); } string Traditional2SimplifiedTrie::Find(const string &key) diff --git a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp index 17d141e..feac716 100644 --- a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp +++ b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp @@ -78,8 +78,7 @@ void IdfTrie::LoadSourceFile(const string &dat_cache_file, const string &md5) close(fd); assert((size_t)write_bytes == sizeof(IdfCacheFileHeader) + offset + this->GetDataTrieTotalSize()); - const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str()); - assert(0 == rename_ret); + tryRename(tmp_filepath, dat_cache_file); } double IdfTrie::Find(const string &key) const diff --git a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp index 5d9e1ff..894eaa9 100644 --- a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp +++ b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp @@ -63,8 +63,7 @@ void DictTrie::LoadSourceFile(const string &dat_cache_file, const string &md5) close(fd); 
assert((size_t)write_bytes == sizeof(DictCacheFileHeader) + offset + this->GetDataTrieTotalSize()); - const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str()); - assert(0 == rename_ret); + tryRename(tmp_filepath, dat_cache_file); } const DatMemElem * DictTrie::Find(const string &key) const diff --git a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp index 1746bf8..7a78534 100644 --- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp +++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp @@ -75,8 +75,7 @@ void Pinyin4cppTrie::LoadSourceFile(const string &dat_cache_file, const string & close(fd); assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize()); - const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str()); - assert(0 == rename_ret); + tryRename(tmp_filepath, dat_cache_file); } string Pinyin4cppTrie::Find(const string &key) diff --git a/libchinese-segmentation/storage-base/storage-base.hpp b/libchinese-segmentation/storage-base/storage-base.hpp index c85f5d6..764b6d3 100644 --- a/libchinese-segmentation/storage-base/storage-base.hpp +++ b/libchinese-segmentation/storage-base/storage-base.hpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "Md5.hpp" #include "StringUtil.hpp" //#define USE_DARTS @@ -63,6 +65,19 @@ inline string CalcFileListMD5(const vector &files_list, int & file_size_ return string(md5.digestChars); } +inline bool isFileExist(const string filePath) { + ifstream infile(filePath); + return infile.good(); +} + +inline void tryRename(string tmpName, string name) { + if (0 != rename(tmpName.c_str(), name.c_str())) { + if (isFileExist(name)) { + remove(tmpName.c_str()); + } + } +} + struct CacheFileHeaderBase { //todo 字节对齐 char md5_hex[32] = {}; uint32_t elements_num = 0;