ukui-search/libchinese-segmentation/chinese-segmentation.h

/*
* Copyright (C) 2020, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: zhangzihao <zhangzihao@kylinos.cn>
* Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
*
*/
#ifndef CHINESESEGMENTATION_H
#define CHINESESEGMENTATION_H
#include <QString>
#include "libchinese-segmentation_global.h"
#include "common-struct.h"
class ChineseSegmentationPrivate;
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
public:
static ChineseSegmentation *getInstance();
/**
* @brief ChineseSegmentation::callSegment
* Calls the extractor to perform keyword extraction: the sentence is first
* roughly segmented with the Mix method, then keywords are extracted against
* the IDF dictionary. Only keywords of two or more characters are included.
*
* @param sentence the sentence to extract keywords from
* @return vector<KeyWord> container holding the information of each extracted keyword
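*
* A minimal usage sketch (illustrative only; assumes the dictionaries shipped
* with libchinese-segmentation have been loaded by the singleton):
* @code
* ChineseSegmentation *seg = ChineseSegmentation::getInstance();
* vector<KeyWord> keywords = seg->callSegment(string("今天天气真好，适合户外运动"));
* // KeyWord is defined in common-struct.h; iterate the container to read
* // each extracted keyword and its associated data.
* @endcode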
*/
vector<KeyWord> callSegment(const string &sentence);
vector<KeyWord> callSegment(QString &sentence);
/**
* @brief ChineseSegmentation::callMixSegmentCutStr
* Segments the sentence with the Mix method: the Max Probability (MP)
* algorithm produces an initial segmentation, which the Hidden Markov Model
* (HMM) then refines, so both in-dictionary and out-of-vocabulary words are
* cut accurately and the result is fairly precise.
*
* @param sentence the sentence to segment
* @return vector<string> container holding only the text of each segmented word
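*
* A minimal usage sketch (illustrative only):
* @code
* ChineseSegmentation *seg = ChineseSegmentation::getInstance();
* vector<string> words = seg->callMixSegmentCutStr("小明硕士毕业于中国科学院计算所");
* // words holds the UTF-8 text of every token, in sentence order.
* @endcode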
*/
vector<string> callMixSegmentCutStr(const string& sentence);
/**
* @brief ChineseSegmentation::callMixSegmentCutWord
* Same functionality as callMixSegmentCutStr, but returns the full
* information of each word.
* @param str the sentence to segment
* @return vector<Word> container holding the full information of each segmented word
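*
* A minimal usage sketch (illustrative only; see common-struct.h for the
* exact fields a Word carries):
* @code
* ChineseSegmentation *seg = ChineseSegmentation::getInstance();
* vector<Word> words = seg->callMixSegmentCutWord("我来到北京清华大学");
* @endcode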
*/
vector<Word> callMixSegmentCutWord(const string& str);
/**
* @brief ChineseSegmentation::lookUpTagOfWord
* Looks up the part-of-speech tag of word.
* @param word the word whose part-of-speech tag is queried
* @return string the part-of-speech tag of word
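*
* A minimal usage sketch (illustrative only; the returned tag follows the
* part-of-speech notation of the underlying dictionary, e.g. "n" for a noun):
* @code
* ChineseSegmentation *seg = ChineseSegmentation::getInstance();
* string tag = seg->lookUpTagOfWord("苹果");
* @endcode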
*/
string lookUpTagOfWord(const string& word);
/**
* @brief ChineseSegmentation::getTagOfWordsInSentence
* Segments the sentence with Mix, then gets the part-of-speech tag of each word.
* @param sentence the sentence to segment
* @return vector<pair<string, string>> the text of each segmented word (first) and its part-of-speech tag (second)
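*
* A minimal usage sketch (illustrative only):
* @code
* ChineseSegmentation *seg = ChineseSegmentation::getInstance();
* vector<pair<string, string>> tagged = seg->getTagOfWordsInSentence("我爱北京天安门");
* for (const pair<string, string> &p : tagged) {
*     // p.first is the word text, p.second is its part-of-speech tag
* }
* @endcode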
*/
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
/**
* @brief ChineseSegmentation::callFullSegment
* Segments the sentence with the Full method; Full cuts out every word
* contained in the dictionary.
* @param sentence the sentence to segment
* @return vector<Word> container holding the full information of each segmented word
*/
vector<Word> callFullSegment(const string& sentence);
/**
* @brief ChineseSegmentation::callQuerySegment
* Segments the sentence with the Query method: Mix is applied first, and long
* words are then re-cut with Full. The result is the most precise, but the
* number of words is also the largest.
* @param sentence the sentence to segment
* @return vector<Word> container holding the full information of each segmented word
*/
vector<Word> callQuerySegment(const string& sentence);
/**
* @brief ChineseSegmentation::callHMMSegment
* Segments the sentence with the Hidden Markov Model (HMM).
* @param sentence the sentence to segment
* @return vector<Word> container holding the full information of each segmented word
*/
vector<Word> callHMMSegment(const string& sentence);
/**
* @brief ChineseSegmentation::callMPSegment
* Segments the sentence with the Max Probability (MP) method.
* @param sentence the sentence to segment
* @return vector<Word> container holding the full information of each segmented word
*/
vector<Word> callMPSegment(const string& sentence);
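/*
 * A sketch comparing the Word-returning cut methods above (illustrative only;
 * the exact granularity of the output depends on the dictionaries loaded at
 * runtime):
 *
 *     ChineseSegmentation *seg = ChineseSegmentation::getInstance();
 *     const string s = "南京市长江大桥";
 *     vector<Word> mp    = seg->callMPSegment(s);    // dictionary words only, max-probability path
 *     vector<Word> hmm   = seg->callHMMSegment(s);   // HMM only, can cut out-of-vocabulary words
 *     vector<Word> full  = seg->callFullSegment(s);  // every dictionary word found in the sentence
 *     vector<Word> query = seg->callQuerySegment(s); // Mix first, long words re-cut with Full
 */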
private:
explicit ChineseSegmentation();
~ChineseSegmentation() = default;
ChineseSegmentation(const ChineseSegmentation&) = delete;
ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
private:
ChineseSegmentationPrivate *d = nullptr;
};
#endif // CHINESESEGMENTATION_H