From 7daa82e66fa488ea3b7f30247757c81310e2fc83 Mon Sep 17 00:00:00 2001
From: baijunjie
Date: Tue, 17 May 2022 15:33:19 +0800
Subject: [PATCH] Encapsulate the basic interface of cppjieba and hanzi-to-pinyin. Use perfect forwarding to optimize the 'isMultiTone' interface.

---
 debian | 2 +-
 .../chinese-segmentation-private.h | 33 ++++
 .../chinese-segmentation.cpp | 154 +++++++++++++-----
 .../chinese-segmentation.h | 107 +++++++++---
 libchinese-segmentation/common-struct.h | 52 ++++++
 .../cppjieba/PinYinTrie.hpp | 2 +-
 libchinese-segmentation/cppjieba/Unicode.hpp | 35 ++--
 .../header-files/ChineseSegmentation | 1 +
 .../header-files/HanZiToPinYin | 1 +
 .../hanzi-to-pinyin-private.h | 29 ++++
 libchinese-segmentation/hanzi-to-pinyin.cpp | 83 ++++++++++
 libchinese-segmentation/hanzi-to-pinyin.h | 53 ++++++
 .../libchinese-segmentation.pro | 40 +++--
 libchinese-segmentation/pinyinmanager.cpp | 55 -------
 libchinese-segmentation/pinyinmanager.h | 33 ----
 libsearch/file-utils.cpp | 10 +-
 libsearch/index/construct-document.cpp | 4 +-
 libsearch/index/index-generator.cpp | 6 +-
 libsearch/index/search-manager.cpp | 4 +-
 libsearch/notesearch/note-search-plugin.cpp | 4 +-
 .../searchtasks/file-content-search-task.cpp | 2 +-
 21 files changed, 508 insertions(+), 202 deletions(-)
 create mode 100644 libchinese-segmentation/chinese-segmentation-private.h
 create mode 100644 libchinese-segmentation/common-struct.h
 create mode 100644 libchinese-segmentation/development-files/header-files/ChineseSegmentation
 create mode 100644 libchinese-segmentation/development-files/header-files/HanZiToPinYin
 create mode 100644 libchinese-segmentation/hanzi-to-pinyin-private.h
 create mode 100644 libchinese-segmentation/hanzi-to-pinyin.cpp
 create mode 100644 libchinese-segmentation/hanzi-to-pinyin.h
 delete mode 100644 libchinese-segmentation/pinyinmanager.cpp
 delete mode 100644 libchinese-segmentation/pinyinmanager.h

diff --git a/debian b/debian
index 965e490..b20439f 160000
--- a/debian
+++ b/debian
@@ -1 +1 @@
-Subproject commit 965e490d6438d89d477afb1f9a2cdd93f3a5d401
+Subproject commit b20439faa51ad6179ea825befbcf4e99a5ebc002
diff --git a/libchinese-segmentation/chinese-segmentation-private.h b/libchinese-segmentation/chinese-segmentation-private.h
new file mode 100644
index 0000000..3214dff
--- /dev/null
+++ b/libchinese-segmentation/chinese-segmentation-private.h
@@ -0,0 +1,33 @@
+#ifndef CHINESESEGMENTATIONPRIVATE_H
+#define CHINESESEGMENTATIONPRIVATE_H
+
+#include "chinese-segmentation.h"
+#include "cppjieba/Jieba.hpp"
+#include "cppjieba/KeywordExtractor.hpp"
+
+class ChineseSegmentationPrivate
+{
+public:
+    explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
+    ~ChineseSegmentationPrivate();
+    vector callSegment(const string& sentence);
+
+    vector callMixSegmentCutStr(const string& sentence);
+    vector callMixSegmentCutWord(const string& sentence);
+    string lookUpTagOfWord(const string& word);
+    vector> getTagOfWordsInSentence(const string &sentence);
+
+    vector callFullSegment(const string& sentence);
+
+    vector callQuerySegment(const string& sentence);
+
+    vector callHMMSegment(const string& sentence);
+
+    vector callMPSegment(const string& sentence);
+
+private:
+    cppjieba::Jieba *m_jieba;
+    ChineseSegmentation *q = nullptr;
+};
+
+#endif // CHINESESEGMENTATIONPRIVATE_H
diff --git a/libchinese-segmentation/chinese-segmentation.cpp b/libchinese-segmentation/chinese-segmentation.cpp
index 3b6f04c..6a0ba23 100644
--- a/libchinese-segmentation/chinese-segmentation.cpp
+++ 
b/libchinese-segmentation/chinese-segmentation.cpp @@ -19,12 +19,10 @@ * */ #include "chinese-segmentation.h" -#include -#include -static ChineseSegmentation *global_instance_chinese_segmentation = nullptr; -QMutex ChineseSegmentation::m_mutex; +#include "chinese-segmentation-private.h" -ChineseSegmentation::ChineseSegmentation() { +ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent) +{ const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8"; const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8"; const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8"; @@ -38,53 +36,127 @@ ChineseSegmentation::ChineseSegmentation() { ""); } -ChineseSegmentation::~ChineseSegmentation() { +ChineseSegmentationPrivate::~ChineseSegmentationPrivate() { if(m_jieba) delete m_jieba; m_jieba = nullptr; } -ChineseSegmentation *ChineseSegmentation::getInstance() { - QMutexLocker locker(&m_mutex); - if(!global_instance_chinese_segmentation) { - global_instance_chinese_segmentation = new ChineseSegmentation; - } - return global_instance_chinese_segmentation; -} - -QVector ChineseSegmentation::callSegement(std::string s) { -// std::string s; -// s = str.toStdString(); -// str.squeeze(); - +vector ChineseSegmentationPrivate::callSegment(const string &sentence) { const size_t topk = -1; - std::vector keywordres; - ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk); - std::string().swap(s); - QVector vecNeeds; - convert(keywordres, vecNeeds); + vector keywordres; + ChineseSegmentationPrivate::m_jieba->extractor.Extract(sentence, keywordres, topk); - keywordres.clear(); -// keywordres.shrink_to_fit(); - return vecNeeds; + return keywordres; } -std::vector ChineseSegmentation::callSegementStd(const std::string &str) { - - const size_t topk = -1; - std::vector keywordres; - ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk); - +vector ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence) +{ + vector keywordres; + ChineseSegmentationPrivate::m_jieba->Cut(sentence, keywordres); return keywordres; } -void ChineseSegmentation::convert(std::vector &keywordres, QVector &kw) { - for(auto i : keywordres) { - SKeyWord temp; - temp.word = i.word; - temp.offsets = QVector::fromStdVector(i.offsets); - temp.weight = i.weight; - kw.append(temp); - } +vector ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence) +{ + vector keywordres; + ChineseSegmentationPrivate::m_jieba->Cut(sentence, keywordres); + return keywordres; +} + +string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word) +{ + return ChineseSegmentationPrivate::m_jieba->LookupTag(word); +} + +vector> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence) +{ + vector> words; + ChineseSegmentationPrivate::m_jieba->Tag(sentence, words); + return words; +} + +vector ChineseSegmentationPrivate::callFullSegment(const string &sentence) +{ + vector keywordres; + ChineseSegmentationPrivate::m_jieba->CutAll(sentence, keywordres); + return keywordres; +} + +vector ChineseSegmentationPrivate::callQuerySegment(const string &sentence) +{ + vector keywordres; + ChineseSegmentationPrivate::m_jieba->CutForSearch(sentence, keywordres); + return keywordres; +} + +vector ChineseSegmentationPrivate::callHMMSegment(const string &sentence) +{ + vector keywordres; + ChineseSegmentationPrivate::m_jieba->CutHMM(sentence, keywordres); + return keywordres; +} + +vector 
ChineseSegmentationPrivate::callMPSegment(const string &sentence) +{ + size_t maxWordLen = 512; + vector keywordres; + ChineseSegmentationPrivate::m_jieba->CutSmall(sentence, keywordres, maxWordLen); + return keywordres; +} + +ChineseSegmentation *ChineseSegmentation::getInstance() +{ + static ChineseSegmentation *global_instance_chinese_segmentation = new ChineseSegmentation; + return global_instance_chinese_segmentation; +} + +vector ChineseSegmentation::callSegment(const string &sentence) +{ + return d->callSegment(sentence); +} + +vector ChineseSegmentation::callMixSegmentCutStr(const string &sentence) +{ + return d->callMixSegmentCutStr(sentence); +} + +vector ChineseSegmentation::callMixSegmentCutWord(const string &str) +{ + return d->callMixSegmentCutWord(str); +} + +string ChineseSegmentation::lookUpTagOfWord(const string &word) +{ + return d->lookUpTagOfWord(word); +} + +vector > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence) +{ + return d->getTagOfWordsInSentence(sentence); +} + +vector ChineseSegmentation::callFullSegment(const string &sentence) +{ + return d->callFullSegment(sentence); +} + +vector ChineseSegmentation::callQuerySegment(const string &sentence) +{ + return d->callQuerySegment(sentence); +} + +vector ChineseSegmentation::callHMMSegment(const string &sentence) +{ + return d->callHMMSegment(sentence); +} + +vector ChineseSegmentation::callMPSegment(const string &sentence) +{ + return d->callMPSegment(sentence); +} + +ChineseSegmentation::ChineseSegmentation() : d(new ChineseSegmentationPrivate) +{ } diff --git a/libchinese-segmentation/chinese-segmentation.h b/libchinese-segmentation/chinese-segmentation.h index 63f3305..c8c7fb2 100644 --- a/libchinese-segmentation/chinese-segmentation.h +++ b/libchinese-segmentation/chinese-segmentation.h @@ -22,42 +22,95 @@ #define CHINESESEGMENTATION_H #include "libchinese-segmentation_global.h" -#include "cppjieba/Jieba.hpp" -//#include "Logging.hpp" -//#include "LocalVector.hpp" -//#include "cppjieba/QuerySegment.hpp" -#include "cppjieba/KeywordExtractor.hpp" -#include -#include -#include -#include - -struct SKeyWord { - std::string word; - QVector offsets; - double weight; - ~SKeyWord() { - word = std::move(""); - offsets.clear(); - offsets.shrink_to_fit(); - } -}; +#include "common-struct.h" +class ChineseSegmentationPrivate; class CHINESESEGMENTATION_EXPORT ChineseSegmentation { public: static ChineseSegmentation *getInstance(); - QVector callSegement(std::string s); - std::vector callSegementStd(const std::string& str); + + /** + * @brief ChineseSegmentation::callSegment + * 调用extractor进行关键词提取,先使用Mix方式初步分词,再使用Idf词典进行关键词提取,只包含两字以上关键词 + * + * @param sentence 要提取关键词的句子 + * @return vector 存放提取后关键词的信息的容器 + */ + vector callSegment(const string &sentence); + + /** + * @brief ChineseSegmentation::callMixSegmentCutStr + * 使用Mix方法进行分词,即先使用最大概率法MP初步分词,再用隐式马尔科夫模型HMM进一步分词,可以准确切出词典已有词和未登录词,结果比较准确 + * + * @param sentence 要分词的句子 + * @return vector 只存放分词后每个词的内容的容器 + */ + vector callMixSegmentCutStr(const string& sentence); + + /** + * @brief ChineseSegmentation::callMixSegmentCutWord + * 和callMixSegmentCutStr功能相同 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callMixSegmentCutWord(const string& str); + + /** + * @brief ChineseSegmentation::lookUpTagOfWord + * 查询word的词性 + * @param word 要查询词性的词 + * @return string word的词性 + */ + string lookUpTagOfWord(const string& word); + + /** + * @brief ChineseSegmentation::getTagOfWordsInSentence + * 使用Mix分词后获取每个词的词性 + * @param sentence 要分词的句子 + * @return 
vector> 分词后的每个词的内容(firsr)和其对应的词性(second) + */ + vector> getTagOfWordsInSentence(const string &sentence); + + /** + * @brief ChineseSegmentation::callFullSegment + * 使用Full进行分词,Full会切出字典里所有的词。 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callFullSegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callQuerySegment + * 使用Query进行分词,即先使用Mix,对于长词再用Full,结果最精确,但词的数量也最大 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callQuerySegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callHMMSegment + * 使用隐式马尔科夫模型HMM进行分词 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callHMMSegment(const string& sentence); + + /** + * @brief ChineseSegmentation::callMPSegment + * 使用最大概率法MP进行分词 + * @param sentence 要分词的句子 + * @return vector 存放分词后每个词所有信息的容器 + */ + vector callMPSegment(const string& sentence); private: explicit ChineseSegmentation(); - ~ChineseSegmentation(); - void convert(std::vector& keywordres, QVector& kw); + ~ChineseSegmentation() = default; + ChineseSegmentation(const ChineseSegmentation&) = delete; + ChineseSegmentation& operator =(const ChineseSegmentation&) = delete; private: - static QMutex m_mutex; - cppjieba::Jieba *m_jieba; - + ChineseSegmentationPrivate *d = nullptr; }; #endif // CHINESESEGMENTATION_H diff --git a/libchinese-segmentation/common-struct.h b/libchinese-segmentation/common-struct.h new file mode 100644 index 0000000..cf99c5a --- /dev/null +++ b/libchinese-segmentation/common-struct.h @@ -0,0 +1,52 @@ +#ifndef COMMONSTRUCT_H +#define COMMONSTRUCT_H + +#include +#include + +using namespace std; + +/** + * @brief The KeyWord struct + * + * @property word the content of keyword + * @property offsets the Unicode offsets, can be used to check the word pos in a sentence + * @property weight the weight of the keyword + */ + +struct KeyWord { + string word; + vector offsets; + double weight; + ~KeyWord() { + word = std::move(""); + offsets.clear(); + offsets.shrink_to_fit(); + } +}; + +/** + * @brief The Word struct + * + * @property word the content of word + * @property offset the offset of the word(absolute pos, Chinese 3 , English 1), can be used to check the word pos in a sentence + * @property unicode_offset the Unicode offset of the word + * @property unicode_length the Unicode length of the word + */ +struct Word { + string word; + uint32_t offset; + uint32_t unicode_offset; + uint32_t unicode_length; + Word(const string& w, uint32_t o) + : word(w), offset(o) { + } + Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) + : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { + } + ~Word() { + word = std::move(""); + } +}; // struct Word + +#endif // COMMONSTRUCT_H diff --git a/libchinese-segmentation/cppjieba/PinYinTrie.hpp b/libchinese-segmentation/cppjieba/PinYinTrie.hpp index 1779ca3..50700bd 100644 --- a/libchinese-segmentation/cppjieba/PinYinTrie.hpp +++ b/libchinese-segmentation/cppjieba/PinYinTrie.hpp @@ -63,7 +63,7 @@ public: return false; } - bool isMultiTone(string &word) { + bool isMultiTone(const string &word) { if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) return true; // if (map_chinese2pinyin.contains(word)) diff --git a/libchinese-segmentation/cppjieba/Unicode.hpp b/libchinese-segmentation/cppjieba/Unicode.hpp index 360b461..01f2e59 100644 --- a/libchinese-segmentation/cppjieba/Unicode.hpp +++ b/libchinese-segmentation/cppjieba/Unicode.hpp @@ -7,6 +7,7 @@ 
#include #include "limonp/LocalVector.hpp" #include "limonp/StringUtil.hpp" +#include "common-struct.h" namespace cppjieba { @@ -15,24 +16,24 @@ using std::vector; typedef uint32_t Rune; -struct KeyWord { - string word; - vector offsets; - double weight; -}; // struct Word +//struct KeyWord { +// string word; +// vector offsets; +// double weight; +//}; // struct Word -struct Word { - string word; - uint32_t offset; - uint32_t unicode_offset; - uint32_t unicode_length; - Word(const string& w, uint32_t o) - : word(w), offset(o) { - } - Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) - : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { - } -}; // struct Word +//struct Word { +// string word; +// uint32_t offset; +// uint32_t unicode_offset; +// uint32_t unicode_length; +// Word(const string& w, uint32_t o) +// : word(w), offset(o) { +// } +// Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) +// : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { +// } +//}; // struct Word inline std::ostream& operator << (std::ostream& os, const Word& w) { return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; diff --git a/libchinese-segmentation/development-files/header-files/ChineseSegmentation b/libchinese-segmentation/development-files/header-files/ChineseSegmentation new file mode 100644 index 0000000..bea1813 --- /dev/null +++ b/libchinese-segmentation/development-files/header-files/ChineseSegmentation @@ -0,0 +1 @@ +#include "chinese-segmentation.h" diff --git a/libchinese-segmentation/development-files/header-files/HanZiToPinYin b/libchinese-segmentation/development-files/header-files/HanZiToPinYin new file mode 100644 index 0000000..e6421d4 --- /dev/null +++ b/libchinese-segmentation/development-files/header-files/HanZiToPinYin @@ -0,0 +1 @@ +#include "hanzi-to-pinyin.h" diff --git a/libchinese-segmentation/hanzi-to-pinyin-private.h b/libchinese-segmentation/hanzi-to-pinyin-private.h new file mode 100644 index 0000000..fdfa6eb --- /dev/null +++ b/libchinese-segmentation/hanzi-to-pinyin-private.h @@ -0,0 +1,29 @@ +#ifndef HANZITOPINYINPRIVATE_H +#define HANZITOPINYINPRIVATE_H + +#include +#include "cppjieba/PinYinTrie.hpp" +#include "hanzi-to-pinyin.h" + +#define PINYINMANAGER_EXPORT Q_DECL_IMPORT + +using namespace std; + +class PINYINMANAGER_EXPORT HanZiToPinYinPrivate +{ +public: + HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr); + ~HanZiToPinYinPrivate(); + +public: + template + bool isMultiTone(T &&t) {return m_pinYinTrie->isMultiTone(std::forward(t));} + + bool contains(string &word); + int getResults(string word, QStringList &results); + +private: + cppjieba::PinYinTrie *m_pinYinTrie = nullptr; + HanZiToPinYin *q = nullptr; +}; +#endif // HANZITOPINYINPRIVATE_H diff --git a/libchinese-segmentation/hanzi-to-pinyin.cpp b/libchinese-segmentation/hanzi-to-pinyin.cpp new file mode 100644 index 0000000..c5be586 --- /dev/null +++ b/libchinese-segmentation/hanzi-to-pinyin.cpp @@ -0,0 +1,83 @@ +#include "hanzi-to-pinyin.h" +#include "hanzi-to-pinyin-private.h" +#include + +HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr; +std::once_flag g_singleFlag; + + + +bool HanZiToPinYinPrivate::contains(string &word) +{ + return m_pinYinTrie->contains(word); +} + +int HanZiToPinYinPrivate::getResults(string word, QStringList &results) +{ + results.clear(); + if (-1 != m_pinYinTrie->getMultiTonResults(word, results)) { + return 0; + } 
+ QString tmp; + if (-1 != m_pinYinTrie->getSingleTonResult(word, tmp)) { + results.append(tmp); + return 0; + } + return -1; +} + +HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent) +{ + const char * const PINYIN_PATH = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt"; + m_pinYinTrie = new cppjieba::PinYinTrie(PINYIN_PATH); +} + +HanZiToPinYinPrivate::~HanZiToPinYinPrivate() +{ + if (m_pinYinTrie){ + delete m_pinYinTrie; + m_pinYinTrie = nullptr; + } +} + +HanZiToPinYin * HanZiToPinYin::getInstance() +{ + call_once(g_singleFlag, []() { + g_pinYinManager = new HanZiToPinYin; + }); + return g_pinYinManager; +} + +bool HanZiToPinYin::contains(string &word) +{ + return d->contains(word); +} + +bool HanZiToPinYin::isMultiTone(string &word) +{ + return d->isMultiTone(word); +} + +bool HanZiToPinYin::isMultiTone(string &&word) +{ + return d->isMultiTone(word); +} + +bool HanZiToPinYin::isMultiTone(const string &word) +{ + return d->isMultiTone(word); +} + +bool HanZiToPinYin::isMultiTone(const string &&word) +{ + return d->isMultiTone(word); +} + +int HanZiToPinYin::getResults(string word, QStringList &results) +{ + return d->getResults(word, results); +} + +HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate) +{ +} diff --git a/libchinese-segmentation/hanzi-to-pinyin.h b/libchinese-segmentation/hanzi-to-pinyin.h new file mode 100644 index 0000000..8432afa --- /dev/null +++ b/libchinese-segmentation/hanzi-to-pinyin.h @@ -0,0 +1,53 @@ +#ifndef HANZITOPINYIN_H +#define HANZITOPINYIN_H + +#include +//#include "cppjieba/PinYinTrie.hpp" +#include +#define PINYINMANAGER_EXPORT Q_DECL_IMPORT + +using namespace std; + +class HanZiToPinYinPrivate; +class PINYINMANAGER_EXPORT HanZiToPinYin +{ +public: + static HanZiToPinYin * getInstance(); + +public: + /** + * @brief HanZiToPinYin::isMultiTone 判断是否为多音字(只支持单字) + * @param word 要判断的字 + * @return bool 不是多音字或不是单字返回false + */ + bool isMultiTone(string &word); + bool isMultiTone(string &&word); + bool isMultiTone(const string &word); + bool isMultiTone(const string &&word); + + /** + * @brief HanZiToPinYin::contains 查询某个字是否有拼音(是否在数据库包含,只支持单字) + * @param word 要查询的字 + * @return bool 数据库不包含或不是单字返回false + */ + bool contains(string &word); + + /** + * @brief HanZiToPinYin::getResults 获取某个字的拼音(只支持单字) + * @param word 要获取拼音的字 + * @param results word的拼音列表(有可能多音字),每次调用results会清空 + * @return int 获取到返回0,否则返回-1 + */ + int getResults(string word, QStringList &results); + +protected: + HanZiToPinYin(); + ~HanZiToPinYin(); + HanZiToPinYin(const HanZiToPinYin&) = delete; + HanZiToPinYin& operator =(const HanZiToPinYin&) = delete; +private: + static HanZiToPinYin *g_pinYinManager; + HanZiToPinYinPrivate *d = nullptr; +}; + +#endif // PINYINMANAGER_H diff --git a/libchinese-segmentation/libchinese-segmentation.pro b/libchinese-segmentation/libchinese-segmentation.pro index 477507f..2356a05 100644 --- a/libchinese-segmentation/libchinese-segmentation.pro +++ b/libchinese-segmentation/libchinese-segmentation.pro @@ -5,13 +5,16 @@ TARGET = chinese-segmentation TEMPLATE = lib DEFINES += LIBCHINESESEGMENTATION_LIBRARY -CONFIG += c++11 +CONFIG += c++11 create_pc create_prl no_install_prl # The following define makes your compiler emit warnings if you use # any Qt feature that has been marked deprecated (the exact warnings # depend on your compiler). Please consult the documentation of the # deprecated API in order to know how to port your code away from it. 
DEFINES += QT_DEPRECATED_WARNINGS +QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr +#QMAKE_CXXFLAGS += -Werror=uninitialized +QMAKE_CXXFLAGS += -execution-charset:utf-8 # You can also make your code fail to compile if it uses deprecated APIs. # In order to do so, uncomment the following line. @@ -23,12 +26,15 @@ include(cppjieba/cppjieba.pri) SOURCES += \ chinese-segmentation.cpp \ - pinyinmanager.cpp + hanzi-to-pinyin.cpp HEADERS += \ + chinese-segmentation-private.h \ chinese-segmentation.h \ - libchinese-segmentation_global.h \ - pinyinmanager.h + common-struct.h \ + hanzi-to-pinyin-private.h \ + hanzi-to-pinyin.h \ + libchinese-segmentation_global.h dict_files.path = /usr/share/ukui-search/res/dict/ dict_files.files = $$PWD/dict/*.utf8\ @@ -41,14 +47,24 @@ INSTALLS += \ # Default rules for deployment. unix { target.path = $$[QT_INSTALL_LIBS] -} + QMAKE_PKGCONFIG_NAME = chinese-segmentation + QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files + QMAKE_PKGCONFIG_VERSION = $$VERSION + QMAKE_PKGCONFIG_LIBDIR = $$target.path + QMAKE_PKGCONFIG_DESTDIR = pkgconfig + QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg + QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg + !isEmpty(target.path): INSTALLS += target - header.path = /usr/include/chinese-seg/ - header.files += *.h - headercppjieba.path = /usr/include/chinese-seg/cppjieba/ - headercppjieba.files = cppjieba/* - INSTALLS += header headercppjieba + header.path = /usr/include/chinese-seg + header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h + header.files += development-files/header-files/* +# headercppjieba.path = /usr/include/chinese-seg/cppjieba/ +# headercppjieba.files = cppjieba/* + INSTALLS += header +} + #DISTFILES += \ # jiaba/jieba.pri @@ -64,5 +80,5 @@ DISTFILES += \ dict/pos_dict/prob_trans.utf8 \ dict/stop_words.utf8 \ dict/user.dict.utf8 \ - dict/pinyinWithoutTone.txt - + dict/pinyinWithoutTone.txt \ + development-files/header-files/* \ diff --git a/libchinese-segmentation/pinyinmanager.cpp b/libchinese-segmentation/pinyinmanager.cpp deleted file mode 100644 index 3a65557..0000000 --- a/libchinese-segmentation/pinyinmanager.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "pinyinmanager.h" -#include -PinYinManager * PinYinManager::g_pinYinManager = nullptr; -std::once_flag g_singleFlag; -PinYinManager * PinYinManager::getInstance() -{ - call_once(g_singleFlag, []() { - g_pinYinManager = new PinYinManager; - }); - return g_pinYinManager; -} - -bool PinYinManager::contains(string &word) -{ - return m_pinYinTrie->contains(word); -} - -bool PinYinManager::isMultiTon(string &word) -{ - return m_pinYinTrie->isMultiTone(word); -} - -bool PinYinManager::isMultiTon(string word) -{ - return m_pinYinTrie->isMultiTone(word); -} - -int PinYinManager::getResults(string word, QStringList &results) -{ - results.clear(); - if (-1 != m_pinYinTrie->getMultiTonResults(word, results)) { - return 0; - } - QString tmp; - if (-1 != m_pinYinTrie->getSingleTonResult(word, tmp)) { - results.append(tmp); - return 0; - } - return -1; -} - -PinYinManager::PinYinManager() -{ - const char * const PINYIN_PATH = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt"; - m_pinYinTrie = new cppjieba::PinYinTrie(PINYIN_PATH); -} - -PinYinManager::~PinYinManager() -{ - if (m_pinYinTrie){ - delete m_pinYinTrie; - m_pinYinTrie = nullptr; - } -} - diff --git a/libchinese-segmentation/pinyinmanager.h b/libchinese-segmentation/pinyinmanager.h deleted file mode 100644 index 89db3ae..0000000 
--- a/libchinese-segmentation/pinyinmanager.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef PINYINMANAGER_H -#define PINYINMANAGER_H - -#include -#include "cppjieba/PinYinTrie.hpp" - -#define PINYINMANAGER_EXPORT Q_DECL_IMPORT - -using namespace std; - -class PINYINMANAGER_EXPORT PinYinManager -{ -public: - static PinYinManager * getInstance(); - -public: - bool contains(string &word); - bool isMultiTon(string &word); - bool isMultiTon(string word); - - int getResults(string word, QStringList &results); - -protected: - PinYinManager(); - ~PinYinManager(); - -private: - static PinYinManager *g_pinYinManager; - cppjieba::PinYinTrie *m_pinYinTrie = nullptr; - -}; - -#endif // PINYINMANAGER_H diff --git a/libsearch/file-utils.cpp b/libsearch/file-utils.cpp index ef6640d..5d84e2f 100644 --- a/libsearch/file-utils.cpp +++ b/libsearch/file-utils.cpp @@ -27,7 +27,7 @@ #include #include #include "gobject-template.h" -#include "pinyinmanager.h" +#include "hanzi-to-pinyin.h" using namespace UkuiSearch; size_t FileUtils::maxIndexCount = 0; @@ -413,14 +413,14 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result int multiToneWordNum = 0; for (auto i:hanzi) { - if (PinYinManager::getInstance()->isMultiTon(QString(i).toStdString())) + if (HanZiToPinYin::getInstance()->isMultiTone(QString(i).toStdString())) ++multiToneWordNum; } if(multiToneWordNum > 3) { QString oneResult, oneResultFirst; for(auto i : hanzi) { QStringList results; - PinYinManager::getInstance()->getResults(QString(i).toStdString(), results); + HanZiToPinYin::getInstance()->getResults(QString(i).toStdString(), results); if(results.size()) { oneResult += results.first(); oneResultFirst += results.first().at(0); @@ -435,7 +435,7 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result } QStringList results; - PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results); + HanZiToPinYin::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results); if(results.size()) { for(auto i : results) { tempQueue.enqueue(i); @@ -447,7 +447,7 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result } tempHanzi = tempHanzi.right(tempHanzi.size() - 1); while(tempHanzi.size() != 0) { - PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results); + HanZiToPinYin::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results); tempQueueSize = tempQueue.size(); if(results.size()) { for(int j = 0; j < tempQueueSize; ++j) { diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp index fc81e42..f67abe5 100644 --- a/libsearch/index/construct-document.cpp +++ b/libsearch/index/construct-document.cpp @@ -118,7 +118,7 @@ void ConstructDocumentForContent::run() { doc.setData(content); //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. 
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); - std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); + std::vector term = ChineseSegmentation::getInstance()->callSegment(content.left(20480000).toStdString()); for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); } @@ -158,7 +158,7 @@ void ConstructDocumentForOcr::run() doc.setData(content); //'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info. content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " "); - std::vector term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString()); + std::vector term = ChineseSegmentation::getInstance()->callSegment(content.toStdString()); for(size_t i = 0; i < term.size(); ++i) { doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast(term.at(i).weight)); } diff --git a/libsearch/index/index-generator.cpp b/libsearch/index/index-generator.cpp index 1fb3710..6d98da4 100644 --- a/libsearch/index/index-generator.cpp +++ b/libsearch/index/index-generator.cpp @@ -414,15 +414,15 @@ Document IndexGenerator::GenerateContentDocument(const QString &path) { // 构造文本索引的document QString content; QStringList tmp; - QVector term; - SKeyWord skw; + std::vector term; + KeyWord skw; Document doc; QString uniqueterm; QString upTerm; QString suffix; FileReader::getTextContent(path, content, suffix); - term = ChineseSegmentation::getInstance()->callSegement(content.toStdString()); + term = ChineseSegmentation::getInstance()->callSegment(content.toStdString()); // QStringList term = content.split(""); doc.setData(content); diff --git a/libsearch/index/search-manager.cpp b/libsearch/index/search-manager.cpp index c3385d2..41db777 100644 --- a/libsearch/index/search-manager.cpp +++ b/libsearch/index/search-manager.cpp @@ -272,7 +272,7 @@ int FileContentSearch::keywordSearchContent() { qp.set_default_op(Xapian::Query::OP_AND); qp.set_database(db); - QVector sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString()); + std::vector sKeyWord = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString()); //Creat a query std::string words; for(int i = 0; i < sKeyWord.size(); i++) { @@ -446,7 +446,7 @@ int OcrSearch::keywordSearchOcr() { Xapian::QueryParser qp; qp.set_default_op(Xapian::Query::OP_AND); qp.set_database(db); - QVector sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString()); + std::vector sKeyWord = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString()); //Creat a query std::string words; for(int i = 0; i < sKeyWord.size(); i++) { diff --git a/libsearch/notesearch/note-search-plugin.cpp b/libsearch/notesearch/note-search-plugin.cpp index caee694..88445a5 100644 --- a/libsearch/notesearch/note-search-plugin.cpp +++ b/libsearch/notesearch/note-search-plugin.cpp @@ -154,9 +154,9 @@ NoteSearch::NoteSearch(DataQueue *searchResult, c } void NoteSearch::run() { - QVector sKeyWordVec = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString()); + std::vector sKeyWordVec = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString()); QStringList keywordList; - for (SKeyWord sKeyWord : sKeyWordVec) { + for (KeyWord sKeyWord : sKeyWordVec) { keywordList.append(QString::fromStdString(sKeyWord.word)); } QDBusInterface qi("org.ukui.note", 
"/org/ukui/note", "org.ukui.note.interface", QDBusConnection::sessionBus()); diff --git a/libsearch/searchinterface/searchtasks/file-content-search-task.cpp b/libsearch/searchinterface/searchtasks/file-content-search-task.cpp index b855d18..221a4da 100644 --- a/libsearch/searchinterface/searchtasks/file-content-search-task.cpp +++ b/libsearch/searchinterface/searchtasks/file-content-search-task.cpp @@ -165,7 +165,7 @@ inline Xapian::Query FileContentSearchWorker::createQuery() std::vector v; for (const auto &keyword : m_searchController->getKeyword()) { - QVector sKeyWord = ChineseSegmentation::getInstance()->callSegement(keyword.toStdString()); + std::vector sKeyWord = ChineseSegmentation::getInstance()->callSegment(keyword.toStdString()); for(const auto & c : sKeyWord) { v.emplace_back(c.word);