2020-12-30 14:42:04 +08:00
|
|
|
#include "chinese-segmentation.h"
|
2020-12-31 21:14:13 +08:00
|
|
|
#include <QFileInfo>
|
2021-01-19 10:44:28 +08:00
|
|
|
#include <QDebug>
|
|
|
|
// File-local pointer to the single shared ChineseSegmentation object.
// It starts out null and is allocated on the first call to getInstance().
static ChineseSegmentation *global_instance_chinese_segmentation = nullptr;
|
|
|
|
// Out-of-class definition of the static mutex member. getInstance() holds it
// while it checks for, and if necessary creates, the shared instance.
QMutex ChineseSegmentation::m_mutex;
|
2020-12-30 14:42:04 +08:00
|
|
|
|
|
|
|
/**
 * @brief Creates the cppjieba engine used by this object.
 *
 * The engine is built from five resource files located under
 * /usr/share/ukui-search/res/dict and stored in m_jieba; the destructor
 * releases it again.
 */
ChineseSegmentation::ChineseSegmentation()
{
    // Resource files, listed in the order they are passed to cppjieba::Jieba.
    static const char kDictFile[]     = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
    static const char kHmmFile[]      = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
    static const char kUserDictFile[] = "/usr/share/ukui-search/res/dict/user.dict.utf8";
    static const char kIdfFile[]      = "/usr/share/ukui-search/res/dict/idf.utf8";
    static const char kStopWordFile[] = "/usr/share/ukui-search/res/dict/stop_words.utf8";

    m_jieba = new cppjieba::Jieba(kDictFile, kHmmFile, kUserDictFile, kIdfFile, kStopWordFile);
}
|
2020-12-31 21:14:13 +08:00
|
|
|
|
2021-01-04 14:35:04 +08:00
|
|
|
/**
 * @brief Releases the cppjieba engine held in m_jieba.
 */
ChineseSegmentation::~ChineseSegmentation()
{
    // delete on a null pointer is a no-op, so no explicit guard is required.
    delete m_jieba;
    m_jieba = nullptr;
}
|
|
|
|
|
|
|
|
/**
 * @brief Returns the shared ChineseSegmentation object, creating it on the
 *        first call.
 *
 * m_mutex is held for the whole body, so the null check and the allocation
 * happen under the same lock.
 *
 * @return Pointer to the shared instance.
 */
ChineseSegmentation *ChineseSegmentation::getInstance()
{
    QMutexLocker guard(&m_mutex);

    if (global_instance_chinese_segmentation == nullptr)
        global_instance_chinese_segmentation = new ChineseSegmentation;

    return global_instance_chinese_segmentation;
}
|
|
|
|
|
2021-01-19 10:44:28 +08:00
|
|
|
/**
 * @brief Runs the cppjieba keyword extractor over @p str.
 *
 * The text is converted to a std::string, passed to
 * m_jieba->extractor.Extract(), and the extractor's results are copied into
 * SKeyWord entries via convert().
 *
 * @param str Text to analyse; it is only read here.
 * @return One SKeyWord per result produced by the extractor.
 */
QVector<SKeyWord> ChineseSegmentation::callSegement(QString& str)
{
    std::string text = str.toStdString();

    // Largest size_t value, written as an explicit conversion.
    // NOTE(review): presumably this means "no cap on the number of keywords";
    // confirm against the extractor's handling of its top-N argument.
    const size_t topk = static_cast<size_t>(-1);

    // keywordres is a local: its storage is released automatically when the
    // function returns, so no manual clear()/shrink_to_fit() is needed.
    std::vector<cppjieba::KeywordExtractor::Word> keywordres;
    m_jieba->extractor.Extract(text, keywordres, topk);

    QVector<SKeyWord> vecNeeds;
    convert(keywordres, vecNeeds);
    return vecNeeds;
}
|
|
|
|
|
|
|
|
/**
 * @brief Appends one SKeyWord to @p kw for every entry of @p keywordres.
 *
 * The word, offsets and weight fields are copied across; offsets are
 * converted from std::vector to QVector.
 *
 * @param keywordres Extractor results to convert; not modified.
 * @param kw         Destination vector; existing elements are kept and the
 *                   converted entries are appended after them.
 */
void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw)
{
    // Iterate by const reference: taking each element by value would copy its
    // word and offsets vector once per iteration for no benefit.
    for (const auto &entry : keywordres) {
        SKeyWord item;
        item.word = entry.word;
        item.offsets = QVector<size_t>::fromStdVector(entry.offsets);
        item.weight = entry.weight;
        kw.append(item);
    }
}
|