Merge branch '0516ukss' into 'ukss-dev'
Encapsulate the basic interface of cppjieba. See merge request kylin-desktop/ukui-search!313
This commit is contained in:
commit
772458b8da
|
@ -0,0 +1,33 @@
|
|||
#ifndef CHINESESEGMENTATIONPRIVATE_H
|
||||
#define CHINESESEGMENTATIONPRIVATE_H
|
||||
|
||||
#include "chinese-segmentation.h"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
|
||||
class ChineseSegmentationPrivate
|
||||
{
|
||||
public:
|
||||
explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
|
||||
~ChineseSegmentationPrivate();
|
||||
vector<KeyWord> callSegment(const string& sentence);
|
||||
|
||||
vector<string> callMixSegmentCutStr(const string& sentence);
|
||||
vector<Word> callMixSegmentCutWord(const string& sentence);
|
||||
string lookUpTagOfWord(const string& word);
|
||||
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
|
||||
|
||||
vector<Word> callFullSegment(const string& sentence);
|
||||
|
||||
vector<Word> callQuerySegment(const string& sentence);
|
||||
|
||||
vector<Word> callHMMSegment(const string& sentence);
|
||||
|
||||
vector<Word> callMPSegment(const string& sentence);
|
||||
|
||||
private:
|
||||
cppjieba::Jieba *m_jieba;
|
||||
ChineseSegmentation *q = nullptr;
|
||||
};
|
||||
|
||||
#endif // CHINESESEGMENTATIONPRIVATE_H
|
|
@ -19,12 +19,10 @@
|
|||
*
|
||||
*/
|
||||
#include "chinese-segmentation.h"
|
||||
#include <QFileInfo>
|
||||
#include <QDebug>
|
||||
static ChineseSegmentation *global_instance_chinese_segmentation = nullptr;
|
||||
QMutex ChineseSegmentation::m_mutex;
|
||||
#include "chinese-segmentation-private.h"
|
||||
|
||||
ChineseSegmentation::ChineseSegmentation() {
|
||||
ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent)
|
||||
{
|
||||
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
|
||||
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
|
@ -38,53 +36,127 @@ ChineseSegmentation::ChineseSegmentation() {
|
|||
"");
|
||||
}
|
||||
|
||||
ChineseSegmentation::~ChineseSegmentation() {
|
||||
// Releases the Jieba instance held by this object.
ChineseSegmentationPrivate::~ChineseSegmentationPrivate()
{
    // delete on a null pointer is a no-op, so no explicit check is required.
    delete m_jieba;
    m_jieba = nullptr;
}
|
||||
|
||||
ChineseSegmentation *ChineseSegmentation::getInstance() {
|
||||
QMutexLocker locker(&m_mutex);
|
||||
if(!global_instance_chinese_segmentation) {
|
||||
global_instance_chinese_segmentation = new ChineseSegmentation;
|
||||
}
|
||||
return global_instance_chinese_segmentation;
|
||||
}
|
||||
|
||||
QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
|
||||
// std::string s;
|
||||
// s = str.toStdString();
|
||||
// str.squeeze();
|
||||
|
||||
vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence) {
|
||||
const size_t topk = -1;
|
||||
std::vector<cppjieba::KeyWord> keywordres;
|
||||
ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk);
|
||||
std::string().swap(s);
|
||||
QVector<SKeyWord> vecNeeds;
|
||||
convert(keywordres, vecNeeds);
|
||||
vector<KeyWord> keywordres;
|
||||
ChineseSegmentationPrivate::m_jieba->extractor.Extract(sentence, keywordres, topk);
|
||||
|
||||
keywordres.clear();
|
||||
// keywordres.shrink_to_fit();
|
||||
return vecNeeds;
|
||||
return keywordres;
|
||||
|
||||
}
|
||||
|
||||
std::vector<cppjieba::KeyWord> ChineseSegmentation::callSegementStd(const std::string &str) {
|
||||
|
||||
const size_t topk = -1;
|
||||
std::vector<cppjieba::KeyWord> keywordres;
|
||||
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
||||
|
||||
// Cuts `sentence` with Jieba::Cut and returns the text of every token.
vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
{
    vector<string> tokens;
    m_jieba->Cut(sentence, tokens);
    return tokens;
}
|
||||
|
||||
void ChineseSegmentation::convert(std::vector<cppjieba::KeyWord> &keywordres, QVector<SKeyWord> &kw) {
|
||||
for(auto i : keywordres) {
|
||||
SKeyWord temp;
|
||||
temp.word = i.word;
|
||||
temp.offsets = QVector<size_t>::fromStdVector(i.offsets);
|
||||
temp.weight = i.weight;
|
||||
kw.append(temp);
|
||||
}
|
||||
// Cuts `sentence` with Jieba::Cut and returns a Word record for every token.
vector<Word> ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence)
{
    vector<Word> tokens;
    m_jieba->Cut(sentence, tokens);
    return tokens;
}
|
||||
|
||||
// Returns whatever Jieba::LookupTag reports for `word`.
string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word)
{
    string tag = m_jieba->LookupTag(word);
    return tag;
}
|
||||
|
||||
// Runs Jieba::Tag on `sentence` and returns the (token, tag) pairs it fills in.
vector<pair<string, string>> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence)
{
    vector<pair<string, string>> taggedTokens;
    m_jieba->Tag(sentence, taggedTokens);
    return taggedTokens;
}
|
||||
|
||||
// Segments `sentence` with Jieba::CutAll.
vector<Word> ChineseSegmentationPrivate::callFullSegment(const string &sentence)
{
    vector<Word> tokens;
    m_jieba->CutAll(sentence, tokens);
    return tokens;
}
|
||||
|
||||
// Segments `sentence` with Jieba::CutForSearch.
vector<Word> ChineseSegmentationPrivate::callQuerySegment(const string &sentence)
{
    vector<Word> tokens;
    m_jieba->CutForSearch(sentence, tokens);
    return tokens;
}
|
||||
|
||||
// Segments `sentence` with Jieba::CutHMM.
vector<Word> ChineseSegmentationPrivate::callHMMSegment(const string &sentence)
{
    vector<Word> tokens;
    m_jieba->CutHMM(sentence, tokens);
    return tokens;
}
|
||||
|
||||
// Segments `sentence` with Jieba::CutSmall, passing 512 as the maximum word length.
vector<Word> ChineseSegmentationPrivate::callMPSegment(const string &sentence)
{
    vector<Word> tokens;
    size_t wordLengthLimit = 512;
    m_jieba->CutSmall(sentence, tokens, wordLengthLimit);
    return tokens;
}
|
||||
|
||||
// Returns the shared ChineseSegmentation object. It is allocated by the
// initializer of a function-local static the first time this is called;
// nothing in this function releases it.
ChineseSegmentation *ChineseSegmentation::getInstance()
{
    static ChineseSegmentation *instance = new ChineseSegmentation;
    return instance;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callSegment.
vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
{
    vector<KeyWord> keywords = d->callSegment(sentence);
    return keywords;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callMixSegmentCutStr.
vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
{
    vector<string> tokens = d->callMixSegmentCutStr(sentence);
    return tokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callMixSegmentCutWord.
vector<Word> ChineseSegmentation::callMixSegmentCutWord(const string &str)
{
    vector<Word> tokens = d->callMixSegmentCutWord(str);
    return tokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::lookUpTagOfWord.
string ChineseSegmentation::lookUpTagOfWord(const string &word)
{
    string tag = d->lookUpTagOfWord(word);
    return tag;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::getTagOfWordsInSentence.
vector<pair<string, string> > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence)
{
    vector<pair<string, string> > taggedTokens = d->getTagOfWordsInSentence(sentence);
    return taggedTokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callFullSegment.
vector<Word> ChineseSegmentation::callFullSegment(const string &sentence)
{
    vector<Word> tokens = d->callFullSegment(sentence);
    return tokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callQuerySegment.
vector<Word> ChineseSegmentation::callQuerySegment(const string &sentence)
{
    vector<Word> tokens = d->callQuerySegment(sentence);
    return tokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callHMMSegment.
vector<Word> ChineseSegmentation::callHMMSegment(const string &sentence)
{
    vector<Word> tokens = d->callHMMSegment(sentence);
    return tokens;
}
|
||||
|
||||
// Forwards to ChineseSegmentationPrivate::callMPSegment.
vector<Word> ChineseSegmentation::callMPSegment(const string &sentence)
{
    vector<Word> tokens = d->callMPSegment(sentence);
    return tokens;
}
|
||||
|
||||
// Creates the private implementation object.
// NOTE(review): no parent is passed, so the implementation's back-pointer
// keeps its default (nullptr) — confirm that is intended.
ChineseSegmentation::ChineseSegmentation()
    : d(new ChineseSegmentationPrivate())
{
}
|
||||
|
|
|
@ -22,42 +22,95 @@
|
|||
#define CHINESESEGMENTATION_H
|
||||
|
||||
#include "libchinese-segmentation_global.h"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
//#include "Logging.hpp"
|
||||
//#include "LocalVector.hpp"
|
||||
//#include "cppjieba/QuerySegment.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
#include <QVector>
|
||||
#include <QString>
|
||||
#include <QDebug>
|
||||
#include <QMutex>
|
||||
|
||||
struct SKeyWord {
|
||||
std::string word;
|
||||
QVector<size_t> offsets;
|
||||
double weight;
|
||||
~SKeyWord() {
|
||||
word = std::move("");
|
||||
offsets.clear();
|
||||
offsets.shrink_to_fit();
|
||||
}
|
||||
};
|
||||
#include "common-struct.h"
|
||||
|
||||
class ChineseSegmentationPrivate;
|
||||
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
|
||||
public:
|
||||
static ChineseSegmentation *getInstance();
|
||||
QVector<SKeyWord> callSegement(std::string s);
|
||||
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callSegment
|
||||
* 调用extractor进行关键词提取,先使用Mix方式初步分词,再使用Idf词典进行关键词提取,只包含两字以上关键词
|
||||
*
|
||||
* @param sentence 要提取关键词的句子
|
||||
* @return vector<KeyWord> 存放提取后关键词的信息的容器
|
||||
*/
|
||||
vector<KeyWord> callSegment(const string &sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMixSegmentCutStr
|
||||
* 使用Mix方法进行分词,即先使用最大概率法MP初步分词,再用隐式马尔科夫模型HMM进一步分词,可以准确切出词典已有词和未登录词,结果比较准确
|
||||
*
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<string> 只存放分词后每个词的内容的容器
|
||||
*/
|
||||
vector<string> callMixSegmentCutStr(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMixSegmentCutWord
|
||||
* 和callMixSegmentCutStr功能相同
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callMixSegmentCutWord(const string& str);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::lookUpTagOfWord
|
||||
* 查询word的词性
|
||||
* @param word 要查询词性的词
|
||||
* @return string word的词性
|
||||
*/
|
||||
string lookUpTagOfWord(const string& word);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::getTagOfWordsInSentence
|
||||
* 使用Mix分词后获取每个词的词性
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<pair<string, string>> 分词后的每个词的内容(first)和其对应的词性(second)
|
||||
*/
|
||||
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callFullSegment
|
||||
* 使用Full进行分词,Full会切出字典里所有的词。
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callFullSegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callQuerySegment
|
||||
* 使用Query进行分词,即先使用Mix,对于长词再用Full,结果最精确,但词的数量也最大
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callQuerySegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callHMMSegment
|
||||
* 使用隐式马尔科夫模型HMM进行分词
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callHMMSegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMPSegment
|
||||
* 使用最大概率法MP进行分词
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callMPSegment(const string& sentence);
|
||||
|
||||
private:
|
||||
explicit ChineseSegmentation();
|
||||
~ChineseSegmentation();
|
||||
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
|
||||
~ChineseSegmentation() = default;
|
||||
ChineseSegmentation(const ChineseSegmentation&) = delete;
|
||||
ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
|
||||
|
||||
private:
|
||||
static QMutex m_mutex;
|
||||
cppjieba::Jieba *m_jieba;
|
||||
|
||||
ChineseSegmentationPrivate *d = nullptr;
|
||||
};
|
||||
|
||||
#endif // CHINESESEGMENTATION_H
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
#ifndef COMMONSTRUCT_H
|
||||
#define COMMONSTRUCT_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
|
||||
* @brief The KeyWord struct
|
||||
*
|
||||
* @property word the content of keyword
|
||||
* @property offsets the Unicode offsets, can be used to check the word pos in a sentence
|
||||
* @property weight the weight of the keyword
|
||||
*/
|
||||
|
||||
struct KeyWord {
    std::string word;              // content of the keyword
    std::vector<size_t> offsets;   // Unicode offsets of the keyword in the sentence
    double weight;                 // weight of the keyword

    // Intentionally no user-declared destructor: std::string and std::vector
    // release their own storage, and declaring a destructor would suppress the
    // implicit move constructor/assignment, turning every move of a KeyWord
    // (e.g. when a vector<KeyWord> grows or is returned) into a deep copy.
};
|
||||
|
||||
/**
|
||||
* @brief The Word struct
|
||||
*
|
||||
* @property word the content of word
|
||||
* @property offset the offset of the word(absolute pos, Chinese 3 , English 1), can be used to check the word pos in a sentence
|
||||
* @property unicode_offset the Unicode offset of the word
|
||||
* @property unicode_length the Unicode length of the word
|
||||
*/
|
||||
struct Word {
    std::string word;          // content of the word
    uint32_t offset;           // offset of the word in the sentence
    uint32_t unicode_offset;   // Unicode offset of the word
    uint32_t unicode_length;   // Unicode length of the word

    // Builds a Word when only the offset is known; the Unicode fields are
    // zeroed so that reading them is always well-defined.
    Word(const std::string& w, uint32_t o)
        : word(w), offset(o), unicode_offset(0), unicode_length(0) {
    }

    // Builds a fully described Word.
    Word(const std::string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
        : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }

    // Intentionally no user-declared destructor: std::string cleans up after
    // itself, and declaring one would suppress the implicit move operations.
}; // struct Word
|
||||
|
||||
#endif // COMMONSTRUCT_H
|
|
@ -63,7 +63,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
bool isMultiTone(string &word) {
|
||||
bool isMultiTone(const string &word) {
|
||||
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
|
||||
return true;
|
||||
// if (map_chinese2pinyin.contains(word))
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <ostream>
|
||||
#include "limonp/LocalVector.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "common-struct.h"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -15,24 +16,24 @@ using std::vector;
|
|||
|
||||
typedef uint32_t Rune;
|
||||
|
||||
struct KeyWord {
|
||||
string word;
|
||||
vector<size_t> offsets;
|
||||
double weight;
|
||||
}; // struct Word
|
||||
//struct KeyWord {
|
||||
// string word;
|
||||
// vector<size_t> offsets;
|
||||
// double weight;
|
||||
//}; // struct Word
|
||||
|
||||
struct Word {
|
||||
string word;
|
||||
uint32_t offset;
|
||||
uint32_t unicode_offset;
|
||||
uint32_t unicode_length;
|
||||
Word(const string& w, uint32_t o)
|
||||
: word(w), offset(o) {
|
||||
}
|
||||
Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
: word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
||||
}
|
||||
}; // struct Word
|
||||
//struct Word {
|
||||
// string word;
|
||||
// uint32_t offset;
|
||||
// uint32_t unicode_offset;
|
||||
// uint32_t unicode_length;
|
||||
// Word(const string& w, uint32_t o)
|
||||
// : word(w), offset(o) {
|
||||
// }
|
||||
// Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
// : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
||||
// }
|
||||
//}; // struct Word
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
||||
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
#include "chinese-segmentation.h"
|
|
@ -0,0 +1 @@
|
|||
#include "hanzi-to-pinyin.h"
|
|
@ -0,0 +1,29 @@
|
|||
#ifndef HANZITOPINYINPRIVATE_H
|
||||
#define HANZITOPINYINPRIVATE_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include "cppjieba/PinYinTrie.hpp"
|
||||
#include "hanzi-to-pinyin.h"
|
||||
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PINYINMANAGER_EXPORT HanZiToPinYinPrivate
|
||||
{
|
||||
public:
|
||||
HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr);
|
||||
~HanZiToPinYinPrivate();
|
||||
|
||||
public:
|
||||
template <typename T>
|
||||
bool isMultiTone(T &&t) {return m_pinYinTrie->isMultiTone(std::forward<T>(t));}
|
||||
|
||||
bool contains(string &word);
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
private:
|
||||
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
|
||||
HanZiToPinYin *q = nullptr;
|
||||
};
|
||||
#endif // HANZITOPINYINPRIVATE_H
|
|
@ -0,0 +1,83 @@
|
|||
#include "hanzi-to-pinyin.h"
|
||||
#include "hanzi-to-pinyin-private.h"
|
||||
#include <mutex>
|
||||
|
||||
HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
|
||||
std::once_flag g_singleFlag;
|
||||
|
||||
|
||||
|
||||
// Asks the pinyin trie whether it has an entry for `word`.
bool HanZiToPinYinPrivate::contains(string &word)
{
    const bool known = m_pinYinTrie->contains(word);
    return known;
}
|
||||
|
||||
// Fills `results` with the pinyin of `word`.
// `results` is always cleared first. The multi-tone lookup is tried first;
// if it reports -1 the single-tone lookup is tried and its answer appended.
// Returns 0 when either lookup succeeds, -1 otherwise.
int HanZiToPinYinPrivate::getResults(string word, QStringList &results)
{
    results.clear();

    if (m_pinYinTrie->getMultiTonResults(word, results) != -1)
        return 0;

    QString singleTone;
    if (m_pinYinTrie->getSingleTonResult(word, singleTone) == -1)
        return -1;

    results.append(singleTone);
    return 0;
}
|
||||
|
||||
// Stores the back-pointer and builds the pinyin trie from the installed dictionary file.
HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent)
    : q(parent)
{
    const char *const dictionaryFile = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
    m_pinYinTrie = new cppjieba::PinYinTrie(dictionaryFile);
}
|
||||
|
||||
// Releases the pinyin trie held by this object.
HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
{
    // delete on a null pointer is a no-op, so no explicit check is required.
    delete m_pinYinTrie;
    m_pinYinTrie = nullptr;
}
|
||||
|
||||
// Returns the shared HanZiToPinYin object, allocating it through
// call_once/g_singleFlag the first time this is called.
HanZiToPinYin * HanZiToPinYin::getInstance()
{
    auto createInstance = []() {
        g_pinYinManager = new HanZiToPinYin;
    };
    call_once(g_singleFlag, createInstance);
    return g_pinYinManager;
}
|
||||
|
||||
// Forwards to HanZiToPinYinPrivate::contains.
bool HanZiToPinYin::contains(string &word)
{
    const bool known = d->contains(word);
    return known;
}
|
||||
|
||||
// Forwards to HanZiToPinYinPrivate::isMultiTone (mutable lvalue overload).
bool HanZiToPinYin::isMultiTone(string &word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
|
||||
|
||||
// Forwards to HanZiToPinYinPrivate::isMultiTone (rvalue overload).
// `word` is a named parameter, so it is passed on as an lvalue.
bool HanZiToPinYin::isMultiTone(string &&word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
|
||||
|
||||
bool HanZiToPinYin::isMultiTone(const string &word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool HanZiToPinYin::isMultiTone(const string &&word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
// Forwards to HanZiToPinYinPrivate::getResults and hands back its status code.
int HanZiToPinYin::getResults(string word, QStringList &results)
{
    const int status = d->getResults(word, results);
    return status;
}
|
||||
|
||||
// Creates the private implementation object.
// NOTE(review): no parent is passed, so the implementation's back-pointer
// keeps its default (nullptr) — confirm that is intended.
HanZiToPinYin::HanZiToPinYin()
    : d(new HanZiToPinYinPrivate())
{
}
|
|
@ -0,0 +1,53 @@
|
|||
#ifndef HANZITOPINYIN_H
|
||||
#define HANZITOPINYIN_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
//#include "cppjieba/PinYinTrie.hpp"
|
||||
#include <QStringList>
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class HanZiToPinYinPrivate;
|
||||
class PINYINMANAGER_EXPORT HanZiToPinYin
|
||||
{
|
||||
public:
|
||||
static HanZiToPinYin * getInstance();
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief HanZiToPinYin::isMultiTone 判断是否为多音字(只支持单字)
|
||||
* @param word 要判断的字
|
||||
* @return bool 不是多音字或不是单字返回false
|
||||
*/
|
||||
bool isMultiTone(string &word);
|
||||
bool isMultiTone(string &&word);
|
||||
bool isMultiTone(const string &word);
|
||||
bool isMultiTone(const string &&word);
|
||||
|
||||
/**
|
||||
* @brief HanZiToPinYin::contains 查询某个字是否有拼音(是否在数据库包含,只支持单字)
|
||||
* @param word 要查询的字
|
||||
* @return bool 数据库不包含或不是单字返回false
|
||||
*/
|
||||
bool contains(string &word);
|
||||
|
||||
/**
|
||||
* @brief HanZiToPinYin::getResults 获取某个字的拼音(只支持单字)
|
||||
* @param word 要获取拼音的字
|
||||
* @param results word的拼音列表(有可能多音字),每次调用results会清空
|
||||
* @return int 获取到返回0,否则返回-1
|
||||
*/
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
protected:
|
||||
HanZiToPinYin();
|
||||
~HanZiToPinYin();
|
||||
HanZiToPinYin(const HanZiToPinYin&) = delete;
|
||||
HanZiToPinYin& operator =(const HanZiToPinYin&) = delete;
|
||||
private:
|
||||
static HanZiToPinYin *g_pinYinManager;
|
||||
HanZiToPinYinPrivate *d = nullptr;
|
||||
};
|
||||
|
||||
#endif // HANZITOPINYIN_H
|
|
@ -5,13 +5,16 @@ TARGET = chinese-segmentation
|
|||
TEMPLATE = lib
|
||||
DEFINES += LIBCHINESESEGMENTATION_LIBRARY
|
||||
|
||||
CONFIG += c++11
|
||||
CONFIG += c++11 create_pc create_prl no_install_prl
|
||||
|
||||
# The following define makes your compiler emit warnings if you use
|
||||
# any Qt feature that has been marked deprecated (the exact warnings
|
||||
# depend on your compiler). Please consult the documentation of the
|
||||
# deprecated API in order to know how to port your code away from it.
|
||||
DEFINES += QT_DEPRECATED_WARNINGS
|
||||
QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr
|
||||
#QMAKE_CXXFLAGS += -Werror=uninitialized
|
||||
QMAKE_CXXFLAGS += -execution-charset:utf-8
|
||||
|
||||
# You can also make your code fail to compile if it uses deprecated APIs.
|
||||
# In order to do so, uncomment the following line.
|
||||
|
@ -23,12 +26,15 @@ include(cppjieba/cppjieba.pri)
|
|||
|
||||
SOURCES += \
|
||||
chinese-segmentation.cpp \
|
||||
pinyinmanager.cpp
|
||||
hanzi-to-pinyin.cpp
|
||||
|
||||
HEADERS += \
|
||||
chinese-segmentation-private.h \
|
||||
chinese-segmentation.h \
|
||||
libchinese-segmentation_global.h \
|
||||
pinyinmanager.h
|
||||
common-struct.h \
|
||||
hanzi-to-pinyin-private.h \
|
||||
hanzi-to-pinyin.h \
|
||||
libchinese-segmentation_global.h
|
||||
|
||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||
dict_files.files = $$PWD/dict/*.utf8\
|
||||
|
@ -41,14 +47,24 @@ INSTALLS += \
|
|||
# Default rules for deployment.
|
||||
unix {
|
||||
target.path = $$[QT_INSTALL_LIBS]
|
||||
}
|
||||
QMAKE_PKGCONFIG_NAME = chinese-segmentation
|
||||
QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files
|
||||
QMAKE_PKGCONFIG_VERSION = $$VERSION
|
||||
QMAKE_PKGCONFIG_LIBDIR = $$target.path
|
||||
QMAKE_PKGCONFIG_DESTDIR = pkgconfig
|
||||
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg
|
||||
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg
|
||||
|
||||
!isEmpty(target.path): INSTALLS += target
|
||||
|
||||
header.path = /usr/include/chinese-seg/
|
||||
header.files += *.h
|
||||
headercppjieba.path = /usr/include/chinese-seg/cppjieba/
|
||||
headercppjieba.files = cppjieba/*
|
||||
INSTALLS += header headercppjieba
|
||||
header.path = /usr/include/chinese-seg
|
||||
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h
|
||||
header.files += development-files/header-files/*
|
||||
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/
|
||||
# headercppjieba.files = cppjieba/*
|
||||
INSTALLS += header
|
||||
}
|
||||
|
||||
|
||||
#DISTFILES += \
|
||||
# jiaba/jieba.pri
|
||||
|
@ -64,5 +80,5 @@ DISTFILES += \
|
|||
dict/pos_dict/prob_trans.utf8 \
|
||||
dict/stop_words.utf8 \
|
||||
dict/user.dict.utf8 \
|
||||
dict/pinyinWithoutTone.txt
|
||||
|
||||
dict/pinyinWithoutTone.txt \
|
||||
development-files/header-files/* \
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
#include "pinyinmanager.h"
|
||||
#include <mutex>
|
||||
PinYinManager * PinYinManager::g_pinYinManager = nullptr;
|
||||
std::once_flag g_singleFlag;
|
||||
PinYinManager * PinYinManager::getInstance()
|
||||
{
|
||||
call_once(g_singleFlag, []() {
|
||||
g_pinYinManager = new PinYinManager;
|
||||
});
|
||||
return g_pinYinManager;
|
||||
}
|
||||
|
||||
bool PinYinManager::contains(string &word)
|
||||
{
|
||||
return m_pinYinTrie->contains(word);
|
||||
}
|
||||
|
||||
bool PinYinManager::isMultiTon(string &word)
|
||||
{
|
||||
return m_pinYinTrie->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool PinYinManager::isMultiTon(string word)
|
||||
{
|
||||
return m_pinYinTrie->isMultiTone(word);
|
||||
}
|
||||
|
||||
int PinYinManager::getResults(string word, QStringList &results)
|
||||
{
|
||||
results.clear();
|
||||
if (-1 != m_pinYinTrie->getMultiTonResults(word, results)) {
|
||||
return 0;
|
||||
}
|
||||
QString tmp;
|
||||
if (-1 != m_pinYinTrie->getSingleTonResult(word, tmp)) {
|
||||
results.append(tmp);
|
||||
return 0;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
PinYinManager::PinYinManager()
|
||||
{
|
||||
const char * const PINYIN_PATH = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
|
||||
m_pinYinTrie = new cppjieba::PinYinTrie(PINYIN_PATH);
|
||||
}
|
||||
|
||||
PinYinManager::~PinYinManager()
|
||||
{
|
||||
if (m_pinYinTrie){
|
||||
delete m_pinYinTrie;
|
||||
m_pinYinTrie = nullptr;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
#ifndef PINYINMANAGER_H
|
||||
#define PINYINMANAGER_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include "cppjieba/PinYinTrie.hpp"
|
||||
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PINYINMANAGER_EXPORT PinYinManager
|
||||
{
|
||||
public:
|
||||
static PinYinManager * getInstance();
|
||||
|
||||
public:
|
||||
bool contains(string &word);
|
||||
bool isMultiTon(string &word);
|
||||
bool isMultiTon(string word);
|
||||
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
protected:
|
||||
PinYinManager();
|
||||
~PinYinManager();
|
||||
|
||||
private:
|
||||
static PinYinManager *g_pinYinManager;
|
||||
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
|
||||
|
||||
};
|
||||
|
||||
#endif // PINYINMANAGER_H
|
|
@ -27,7 +27,7 @@
|
|||
#include <QDBusConnection>
|
||||
#include <QDomDocument>
|
||||
#include "gobject-template.h"
|
||||
#include "pinyinmanager.h"
|
||||
#include "hanzi-to-pinyin.h"
|
||||
|
||||
using namespace UkuiSearch;
|
||||
size_t FileUtils::maxIndexCount = 0;
|
||||
|
@ -413,14 +413,14 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
int multiToneWordNum = 0;
|
||||
|
||||
for (auto i:hanzi) {
|
||||
if (PinYinManager::getInstance()->isMultiTon(QString(i).toStdString()))
|
||||
if (HanZiToPinYin::getInstance()->isMultiTone(QString(i).toStdString()))
|
||||
++multiToneWordNum;
|
||||
}
|
||||
if(multiToneWordNum > 3) {
|
||||
QString oneResult, oneResultFirst;
|
||||
for(auto i : hanzi) {
|
||||
QStringList results;
|
||||
PinYinManager::getInstance()->getResults(QString(i).toStdString(), results);
|
||||
HanZiToPinYin::getInstance()->getResults(QString(i).toStdString(), results);
|
||||
if(results.size()) {
|
||||
oneResult += results.first();
|
||||
oneResultFirst += results.first().at(0);
|
||||
|
@ -435,7 +435,7 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
}
|
||||
|
||||
QStringList results;
|
||||
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
HanZiToPinYin::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
if(results.size()) {
|
||||
for(auto i : results) {
|
||||
tempQueue.enqueue(i);
|
||||
|
@ -447,7 +447,7 @@ void stitchMultiToneWordsBFSStackLess3(const QString &hanzi, QStringList &result
|
|||
}
|
||||
tempHanzi = tempHanzi.right(tempHanzi.size() - 1);
|
||||
while(tempHanzi.size() != 0) {
|
||||
PinYinManager::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
HanZiToPinYin::getInstance()->getResults(QString(tempHanzi.at(0)).toStdString(), results);
|
||||
tempQueueSize = tempQueue.size();
|
||||
if(results.size()) {
|
||||
for(int j = 0; j < tempQueueSize; ++j) {
|
||||
|
|
|
@ -118,7 +118,7 @@ void ConstructDocumentForContent::run() {
|
|||
doc.setData(content);
|
||||
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString());
|
||||
std::vector<KeyWord> term = ChineseSegmentation::getInstance()->callSegment(content.left(20480000).toStdString());
|
||||
for(size_t i = 0; i < term.size(); ++i) {
|
||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||
}
|
||||
|
@ -158,7 +158,7 @@ void ConstructDocumentForOcr::run()
|
|||
doc.setData(content);
|
||||
//'\xEF\xBC\x8C' is "," "\xE3\x80\x82" is "。" use three " " to replace ,to ensure the offset info.
|
||||
content = content.replace("\t", " ").replace("\xEF\xBC\x8C", " ").replace("\xE3\x80\x82", " ");
|
||||
std::vector<cppjieba::KeyWord> term = ChineseSegmentation::getInstance()->callSegementStd(content.toStdString());
|
||||
std::vector<KeyWord> term = ChineseSegmentation::getInstance()->callSegment(content.toStdString());
|
||||
for(size_t i = 0; i < term.size(); ++i) {
|
||||
doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));
|
||||
}
|
||||
|
|
|
@ -414,15 +414,15 @@ Document IndexGenerator::GenerateContentDocument(const QString &path) {
|
|||
// 构造文本索引的document
|
||||
QString content;
|
||||
QStringList tmp;
|
||||
QVector<SKeyWord> term;
|
||||
SKeyWord skw;
|
||||
std::vector<KeyWord> term;
|
||||
KeyWord skw;
|
||||
Document doc;
|
||||
QString uniqueterm;
|
||||
QString upTerm;
|
||||
QString suffix;
|
||||
FileReader::getTextContent(path, content, suffix);
|
||||
|
||||
term = ChineseSegmentation::getInstance()->callSegement(content.toStdString());
|
||||
term = ChineseSegmentation::getInstance()->callSegment(content.toStdString());
|
||||
// QStringList term = content.split("");
|
||||
|
||||
doc.setData(content);
|
||||
|
|
|
@ -272,7 +272,7 @@ int FileContentSearch::keywordSearchContent() {
|
|||
qp.set_default_op(Xapian::Query::OP_AND);
|
||||
qp.set_database(db);
|
||||
|
||||
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
|
||||
std::vector<KeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString());
|
||||
//Create a query
|
||||
std::string words;
|
||||
for(int i = 0; i < sKeyWord.size(); i++) {
|
||||
|
@ -446,7 +446,7 @@ int OcrSearch::keywordSearchOcr() {
|
|||
Xapian::QueryParser qp;
|
||||
qp.set_default_op(Xapian::Query::OP_AND);
|
||||
qp.set_database(db);
|
||||
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
|
||||
std::vector<KeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString());
|
||||
//Create a query
|
||||
std::string words;
|
||||
for(int i = 0; i < sKeyWord.size(); i++) {
|
||||
|
|
|
@ -154,9 +154,9 @@ NoteSearch::NoteSearch(DataQueue<SearchPluginIface::ResultInfo> *searchResult, c
|
|||
}
|
||||
|
||||
void NoteSearch::run() {
|
||||
QVector<SKeyWord> sKeyWordVec = ChineseSegmentation::getInstance()->callSegement(m_keyword.toStdString());
|
||||
std::vector<KeyWord> sKeyWordVec = ChineseSegmentation::getInstance()->callSegment(m_keyword.toStdString());
|
||||
QStringList keywordList;
|
||||
for (SKeyWord sKeyWord : sKeyWordVec) {
|
||||
for (KeyWord sKeyWord : sKeyWordVec) {
|
||||
keywordList.append(QString::fromStdString(sKeyWord.word));
|
||||
}
|
||||
QDBusInterface qi("org.ukui.note", "/org/ukui/note", "org.ukui.note.interface", QDBusConnection::sessionBus());
|
||||
|
|
|
@ -165,7 +165,7 @@ inline Xapian::Query FileContentSearchWorker::createQuery()
|
|||
std::vector<Xapian::Query> v;
|
||||
|
||||
for (const auto &keyword : m_searchController->getKeyword()) {
|
||||
QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(keyword.toStdString());
|
||||
std::vector<KeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegment(keyword.toStdString());
|
||||
|
||||
for(const auto & c : sKeyWord) {
|
||||
v.emplace_back(c.word);
|
||||
|
|
Loading…
Reference in New Issue