361 lines
14 KiB
C++
361 lines
14 KiB
C++
/*
|
||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||
*
|
||
* This program is free software: you can redistribute it and/or modify
|
||
* it under the terms of the GNU General Public License as published by
|
||
* the Free Software Foundation, either version 3 of the License, or
|
||
* (at your option) any later version.
|
||
*
|
||
* This program is distributed in the hope that it will be useful,
|
||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
* GNU General Public License for more details.
|
||
*
|
||
* You should have received a copy of the GNU General Public License
|
||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
*
|
||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||
*
|
||
*/
|
||
|
||
#include <mutex>
|
||
#include <cctype>
|
||
#include "hanzi-to-pinyin.h"
|
||
#include "hanzi-to-pinyin-private.h"
|
||
#include "chinese-segmentation.h"
|
||
#include "cppjieba/Unicode.hpp"
|
||
|
||
HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
|
||
std::once_flag g_singleFlag;
|
||
|
||
bool HanZiToPinYinPrivate::contains(string &word)
|
||
{
|
||
return m_pinYinTrie.Contains(word);
|
||
}
|
||
|
||
int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
|
||
{
|
||
results.clear();
|
||
|
||
string directResult = m_pinYinTrie.Find(word);
|
||
|
||
if (directResult == string()) {
|
||
if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1
|
||
return -1;
|
||
} else {//无结果、启用分词
|
||
vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
|
||
string data;
|
||
for (string &info : segResults) {
|
||
if (info == string()) {
|
||
continue;
|
||
}
|
||
data = m_pinYinTrie.Find(info);
|
||
if (data == string()) {//分词后无结果
|
||
if (cppjieba::IsSingleWord(info)) {//单个字符
|
||
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
|
||
results.append(QString().fromStdString(info));
|
||
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
|
||
continue;
|
||
}
|
||
} else {//多个字符
|
||
string oneWord;
|
||
cppjieba::RuneStrArray runeArray;
|
||
cppjieba::DecodeRunesInString(info, runeArray);
|
||
for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
|
||
oneWord = cppjieba::GetStringFromRunes(info, i, i);
|
||
data = m_pinYinTrie.Find(oneWord);
|
||
if (data == string()) {//单字无结果则按设置返回
|
||
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
|
||
results.append(QString().fromStdString(oneWord));
|
||
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
|
||
continue;
|
||
}
|
||
}
|
||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||
results.append(QString().fromStdString(data));
|
||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||
if (limonp::IsInStr(data, ',')) {
|
||
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
|
||
} else {
|
||
results.append(QString().fromStdString(data));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
} else {//分词后有结果
|
||
if (cppjieba::IsSingleWord(info)) {//单个字符
|
||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||
results.append(QString().fromStdString(data));
|
||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||
if (limonp::IsInStr(data, ',')) {
|
||
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
|
||
} else {
|
||
results.append(QString().fromStdString(data));
|
||
}
|
||
}
|
||
} else {//多个字符
|
||
vector<string> dataVec = limonp::Split(data, "/");
|
||
if (dataVec.size() == 1) {//无多音词
|
||
vector<string> dataVec = limonp::Split(data, ",");
|
||
for (auto &oneResult : dataVec) {
|
||
results.append(QString().fromStdString(oneResult));
|
||
}
|
||
} else {
|
||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||
int wordSize = limonp::Split(dataVec[0], ",").size();
|
||
for (int i = 0; i < wordSize; ++i) {
|
||
QStringList oneResult;
|
||
for (size_t j = 0; j < dataVec.size(); ++j) {
|
||
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
|
||
}
|
||
results.append(oneResult.join('/'));
|
||
}
|
||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||
vector<string> tmp = limonp::Split(dataVec[0], ",");
|
||
for (auto &oneResult : tmp) {
|
||
results.append(QString().fromStdString(oneResult));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
} else {//可以直接查到结果
|
||
if (cppjieba::IsSingleWord(word)) {//单个字符
|
||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||
results.append(QString().fromStdString(directResult));
|
||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||
if (limonp::IsInStr(directResult, ',')) {
|
||
results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0))));
|
||
} else {
|
||
results.append(QString().fromStdString(directResult));
|
||
}
|
||
}
|
||
} else {//多个字符
|
||
vector<string> dataVec = limonp::Split(directResult, "/");
|
||
if (dataVec.size() == 1) {//无多音词
|
||
vector<string> dataVec = limonp::Split(directResult, ",");
|
||
for (auto &oneResult : dataVec) {
|
||
results.append(QString().fromStdString(oneResult));
|
||
}
|
||
} else {
|
||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||
int wordSize = limonp::Split(dataVec[0], ",").size();
|
||
for (int i = 0; i < wordSize; ++i) {
|
||
QStringList oneResult;
|
||
for (size_t j = 0; j < dataVec.size(); ++j) {
|
||
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
|
||
}
|
||
results.append(oneResult.join('/'));
|
||
}
|
||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||
vector<string> tmp = limonp::Split(dataVec[0], ",");
|
||
for (auto &oneResult : tmp) {
|
||
results.append(QString().fromStdString(oneResult));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
convertDataStyle(results);
|
||
return 0;//todo
|
||
}
|
||
|
||
void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
|
||
{
|
||
m_pinyinDataStyle = dataStyle;
|
||
m_segType = segType;
|
||
m_polyphoneType = polyphoneType;
|
||
m_exDataProcessType = processType;
|
||
}
|
||
|
||
void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
|
||
{
|
||
QString value;
|
||
if (m_pinyinDataStyle == PinyinDataStyle::Default) {
|
||
for (QString &info : results) {
|
||
if(info == ",") {
|
||
continue;
|
||
}
|
||
//if info's length was been changed, there's someting wrong while traverse the chars of info
|
||
for (const QChar &c : info) {
|
||
if (!isalpha(c.toLatin1())) {
|
||
value = PhoneticSymbol.value(c);
|
||
if (!value.isEmpty()) {
|
||
info.replace(c, value.at(0));
|
||
}
|
||
}
|
||
}
|
||
|
||
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
|
||
QStringList tmpValue;
|
||
for (auto &str : tmpList) {
|
||
if (!tmpValue.contains(str)) {
|
||
tmpValue.push_back(str);
|
||
}
|
||
}
|
||
info = tmpValue.join(",");
|
||
}
|
||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
|
||
//无需处理
|
||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
|
||
for (QString &info : results) {
|
||
for (int i = 0; i < info.size();) {
|
||
auto c = info.at(i);
|
||
if (!isalpha(c.toLatin1())) {
|
||
value = PhoneticSymbol.value(c);
|
||
if (!value.isEmpty()) {
|
||
info.replace(c, PhoneticSymbol.value(c));
|
||
i += PhoneticSymbol.value(c).size();
|
||
continue;
|
||
}
|
||
}
|
||
i++;
|
||
}
|
||
}
|
||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
|
||
for (QString &info : results) {
|
||
if(info == "/") {
|
||
continue;
|
||
}
|
||
bool isPolyphoneWords(false);
|
||
if (info.contains("/")) {
|
||
isPolyphoneWords = true;
|
||
info.replace("/", ",");
|
||
}
|
||
|
||
for (int i = 0; i < info.size();) {
|
||
auto c = info.at(i);
|
||
if (!isalpha(c.toLatin1())) {
|
||
value = PhoneticSymbol.value(c);
|
||
if (!value.isEmpty()) {
|
||
info.replace(i, 1, value.at(0));
|
||
//多音词模式
|
||
if (info.contains(",")) {
|
||
int pos = info.indexOf(',', i);
|
||
if (isPolyphoneWords) {
|
||
info.replace(",", "/");
|
||
}
|
||
//最后一个读音时
|
||
if (pos == -1) {
|
||
info.append(value.at(1));
|
||
break;
|
||
}
|
||
info.insert(pos, value.at(1));
|
||
i = pos + 1; //insert导致','的位置加一,将i行进到','的位置
|
||
i++;
|
||
continue;
|
||
} else {
|
||
info.append(value.at(1));
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
i++;
|
||
}
|
||
|
||
}
|
||
} else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
|
||
for (QString &info : results) {
|
||
if(info == "," or info == "/") {
|
||
continue;
|
||
}
|
||
|
||
bool isPolyphoneWords(false);
|
||
if (info.contains("/")) {
|
||
isPolyphoneWords = true;
|
||
info.replace("/", ",");
|
||
}
|
||
|
||
for (int i = 0; i < info.size();i++) {
|
||
auto c = info.at(i);
|
||
if (!isalpha(c.toLatin1())) {
|
||
value = PhoneticSymbol.value(c);
|
||
if (!value.isEmpty()) {
|
||
info.replace(c, value.at(0));
|
||
}
|
||
}
|
||
}
|
||
|
||
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
|
||
QStringList tmpValue;
|
||
for (auto &str : tmpList) {
|
||
if (!tmpValue.contains(str)) {
|
||
tmpValue.push_back(str.at(0));
|
||
}
|
||
}
|
||
if (isPolyphoneWords) {
|
||
info = tmpValue.join("/");
|
||
} else {
|
||
info = tmpValue.join(",");
|
||
}
|
||
}
|
||
} else if (m_pinyinDataStyle == PinyinDataStyle::English) {
|
||
//暂不支持
|
||
}
|
||
}
|
||
|
||
HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
|
||
{
|
||
//const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
|
||
//const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
|
||
//m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
|
||
//m_pinYinTrie = new Pinyin4cppTrie;
|
||
}
|
||
|
||
HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
|
||
{
|
||
// if (m_pinYinTrie){
|
||
// delete m_pinYinTrie;
|
||
// m_pinYinTrie = nullptr;
|
||
// }
|
||
}
|
||
|
||
HanZiToPinYin * HanZiToPinYin::getInstance()
|
||
{
|
||
call_once(g_singleFlag, []() {
|
||
g_pinYinManager = new HanZiToPinYin;
|
||
});
|
||
return g_pinYinManager;
|
||
}
|
||
|
||
bool HanZiToPinYin::contains(string &word)
|
||
{
|
||
return d->contains(word);
|
||
}
|
||
|
||
bool HanZiToPinYin::isMultiTone(string &word)
|
||
{
|
||
return d->isMultiTone(word);
|
||
}
|
||
|
||
bool HanZiToPinYin::isMultiTone(string &&word)
|
||
{
|
||
return d->isMultiTone(word);
|
||
}
|
||
|
||
bool HanZiToPinYin::isMultiTone(const string &word)
|
||
{
|
||
return d->isMultiTone(word);
|
||
}
|
||
|
||
bool HanZiToPinYin::isMultiTone(const string &&word)
|
||
{
|
||
return d->isMultiTone(word);
|
||
}
|
||
|
||
int HanZiToPinYin::getResults(string word, QStringList &results)
|
||
{
|
||
return d->getResults(word, results);
|
||
}
|
||
|
||
void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
|
||
{
|
||
d->setConfig(dataStyle, segType, polyphoneType, processType);
|
||
}
|
||
|
||
HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate)
|
||
{
|
||
}
|