2021-01-29 11:43:07 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
*
|
|
|
|
* Authors: zhangzihao <zhangzihao@kylinos.cn>
|
|
|
|
* Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
|
|
|
|
*
|
|
|
|
*/
|
2020-12-30 14:42:04 +08:00
|
|
|
#include "chinese-segmentation.h"
|
2020-12-31 21:14:13 +08:00
|
|
|
#include <QFileInfo>
|
2021-01-19 10:44:28 +08:00
|
|
|
#include <QDebug>
|
|
|
|
// File-local pointer to the single shared ChineseSegmentation object.
// It starts out null and is assigned in ChineseSegmentation::getInstance().
static ChineseSegmentation *global_instance_chinese_segmentation = nullptr;
|
|
|
|
// Out-of-class definition of the static mutex member; getInstance() locks it
// while checking for and creating the shared instance.
QMutex ChineseSegmentation::m_mutex;
|
2020-12-30 14:42:04 +08:00
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
/**
 * Creates the cppjieba engine from the dictionary files installed under
 * /usr/share/ukui-search/res/dict and stores it in m_jieba.
 *
 * The engine is allocated with new; the destructor deletes it.
 */
ChineseSegmentation::ChineseSegmentation() {
    const char * const kDictFile     = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
    const char * const kHmmFile      = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
    const char * const kUserDictFile = "/usr/share/ukui-search/res/dict/user.dict.utf8";
    const char * const kIdfFile      = "/usr/share/ukui-search/res/dict/idf.utf8";
    const char * const kStopWordFile = "/usr/share/ukui-search/res/dict/stop_words.utf8";

    // NOTE(review): the meaning of the trailing empty-string argument is not
    // visible from this file -- confirm against the bundled cppjieba::Jieba
    // constructor.
    m_jieba = new cppjieba::Jieba(kDictFile, kHmmFile, kUserDictFile, kIdfFile, kStopWordFile, "");
}
|
2020-12-31 21:14:13 +08:00
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
/**
 * Destroys the cppjieba engine created by the constructor and resets the
 * pointer to null.
 */
ChineseSegmentation::~ChineseSegmentation() {
    // delete on a null pointer is a no-op, so no explicit guard is required.
    delete m_jieba;
    m_jieba = nullptr;
}
|
|
|
|
|
2021-04-26 15:06:47 +08:00
|
|
|
/**
 * Returns the shared ChineseSegmentation object, creating it on the first call.
 *
 * The whole check-and-create sequence runs while m_mutex is held.
 *
 * @return pointer to the shared instance.
 */
ChineseSegmentation *ChineseSegmentation::getInstance() {
    QMutexLocker guard(&m_mutex);

    // Fast exit once the instance exists; the lock is still held here.
    if(global_instance_chinese_segmentation != nullptr) {
        return global_instance_chinese_segmentation;
    }

    global_instance_chinese_segmentation = new ChineseSegmentation;
    return global_instance_chinese_segmentation;
}
|
|
|
|
|
2021-05-17 14:47:39 +08:00
|
|
|
/**
 * Runs the cppjieba keyword extractor over the given text and returns the
 * result converted to SKeyWord entries.
 *
 * @param s text to analyse; taken by value so its buffer can be released
 *          as soon as extraction is finished.
 * @return the extracted keywords as a QVector<SKeyWord>.
 */
QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
    // -1 converted to size_t is the largest representable value.
    // NOTE(review): presumably this means "no cap on the number of keywords"
    // -- confirm against the extractor's Extract() implementation.
    const size_t keepCount = static_cast<size_t>(-1);

    std::vector<cppjieba::KeyWord> extracted;
    m_jieba->extractor.Extract(s, extracted, keepCount);

    // Swap with an empty temporary so the input's storage is freed before
    // the conversion below allocates its own.
    std::string().swap(s);

    QVector<SKeyWord> result;
    convert(extracted, result);

    extracted.clear();
    return result;
}
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
std::vector<cppjieba::KeyWord> ChineseSegmentation::callSegementStd(const std::string &str) {
|
2021-05-22 09:18:35 +08:00
|
|
|
|
|
|
|
const size_t topk = -1;
|
2021-06-07 15:37:06 +08:00
|
|
|
std::vector<cppjieba::KeyWord> keywordres;
|
2021-05-22 09:18:35 +08:00
|
|
|
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
|
|
|
|
|
|
|
return keywordres;
|
|
|
|
}
|
|
|
|
|
2021-06-07 15:37:06 +08:00
|
|
|
/**
 * Appends one SKeyWord to kw for every cppjieba::KeyWord in keywordres,
 * copying the word, its offsets and its weight.
 *
 * @param keywordres source keywords; only read here.
 * @param kw         destination vector; existing entries are kept and the
 *                   converted entries are appended after them.
 */
void ChineseSegmentation::convert(std::vector<cppjieba::KeyWord> &keywordres, QVector<SKeyWord> &kw) {
    // Grow the destination once instead of on every append.
    kw.reserve(kw.size() + static_cast<int>(keywordres.size()));

    // Iterate by const reference: taking each element by value would deep-copy
    // its string and offsets vector only to copy them again into the SKeyWord.
    for(const auto &src : keywordres) {
        SKeyWord temp;
        temp.word = src.word;
        temp.offsets = QVector<size_t>::fromStdVector(src.offsets);
        temp.weight = src.weight;
        kw.append(temp);
    }
}
|