ukui-search/libchinese-segmentation/hanzi-to-pinyin.cpp

361 lines
14 KiB
C++
Raw Permalink Normal View History

/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include <mutex>
#include <cctype>
#include "hanzi-to-pinyin.h"
#include "hanzi-to-pinyin-private.h"
#include "chinese-segmentation.h"
#include "cppjieba/Unicode.hpp"
HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
std::once_flag g_singleFlag;
bool HanZiToPinYinPrivate::contains(string &word)
{
return m_pinYinTrie.Contains(word);
}
int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
{
results.clear();
string directResult = m_pinYinTrie.Find(word);
if (directResult == string()) {
if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1
return -1;
} else {//无结果、启用分词
vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
string data;
for (string &info : segResults) {
if (info == string()) {
continue;
}
data = m_pinYinTrie.Find(info);
if (data == string()) {//分词后无结果
if (cppjieba::IsSingleWord(info)) {//单个字符
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
results.append(QString().fromStdString(info));
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
continue;
}
} else {//多个字符
string oneWord;
cppjieba::RuneStrArray runeArray;
cppjieba::DecodeRunesInString(info, runeArray);
for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
oneWord = cppjieba::GetStringFromRunes(info, i, i);
data = m_pinYinTrie.Find(oneWord);
if (data == string()) {//单字无结果则按设置返回
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
results.append(QString().fromStdString(oneWord));
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
continue;
}
}
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(data));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(data, ',')) {
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(data));
}
}
}
}
} else {//分词后有结果
if (cppjieba::IsSingleWord(info)) {//单个字符
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(data));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(data, ',')) {
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(data));
}
}
} else {//多个字符
vector<string> dataVec = limonp::Split(data, "/");
if (dataVec.size() == 1) {//无多音词
vector<string> dataVec = limonp::Split(data, ",");
for (auto &oneResult : dataVec) {
results.append(QString().fromStdString(oneResult));
}
} else {
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
int wordSize = limonp::Split(dataVec[0], ",").size();
for (int i = 0; i < wordSize; ++i) {
QStringList oneResult;
for (size_t j = 0; j < dataVec.size(); ++j) {
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
}
results.append(oneResult.join('/'));
}
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
vector<string> tmp = limonp::Split(dataVec[0], ",");
for (auto &oneResult : tmp) {
results.append(QString().fromStdString(oneResult));
}
}
}
}
}
}
}
} else {//可以直接查到结果
if (cppjieba::IsSingleWord(word)) {//单个字符
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(directResult));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(directResult, ',')) {
results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(directResult));
}
}
} else {//多个字符
vector<string> dataVec = limonp::Split(directResult, "/");
if (dataVec.size() == 1) {//无多音词
vector<string> dataVec = limonp::Split(directResult, ",");
for (auto &oneResult : dataVec) {
results.append(QString().fromStdString(oneResult));
}
} else {
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
int wordSize = limonp::Split(dataVec[0], ",").size();
for (int i = 0; i < wordSize; ++i) {
QStringList oneResult;
for (size_t j = 0; j < dataVec.size(); ++j) {
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
}
results.append(oneResult.join('/'));
}
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
vector<string> tmp = limonp::Split(dataVec[0], ",");
for (auto &oneResult : tmp) {
results.append(QString().fromStdString(oneResult));
}
}
}
}
}
convertDataStyle(results);
return 0;//todo
}
void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
{
m_pinyinDataStyle = dataStyle;
m_segType = segType;
m_polyphoneType = polyphoneType;
m_exDataProcessType = processType;
}
void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
{
QString value;
if (m_pinyinDataStyle == PinyinDataStyle::Default) {
for (QString &info : results) {
if(info == ",") {
continue;
}
//if info's length was been changed, there's someting wrong while traverse the chars of info
for (const QChar &c : info) {
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, value.at(0));
}
}
}
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
QStringList tmpValue;
for (auto &str : tmpList) {
if (!tmpValue.contains(str)) {
tmpValue.push_back(str);
}
}
info = tmpValue.join(",");
}
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
//无需处理
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
for (QString &info : results) {
for (int i = 0; i < info.size();) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, PhoneticSymbol.value(c));
i += PhoneticSymbol.value(c).size();
continue;
}
}
i++;
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
for (QString &info : results) {
if(info == "/") {
continue;
}
bool isPolyphoneWords(false);
if (info.contains("/")) {
isPolyphoneWords = true;
info.replace("/", ",");
}
for (int i = 0; i < info.size();) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(i, 1, value.at(0));
//多音词模式
if (info.contains(",")) {
int pos = info.indexOf(',', i);
if (isPolyphoneWords) {
info.replace(",", "/");
}
//最后一个读音时
if (pos == -1) {
info.append(value.at(1));
break;
}
info.insert(pos, value.at(1));
i = pos + 1; //insert导致','的位置加一将i行进到','的位置
i++;
continue;
} else {
info.append(value.at(1));
break;
}
}
}
i++;
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
for (QString &info : results) {
if(info == "," or info == "/") {
continue;
}
bool isPolyphoneWords(false);
if (info.contains("/")) {
isPolyphoneWords = true;
info.replace("/", ",");
}
for (int i = 0; i < info.size();i++) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, value.at(0));
}
}
}
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
QStringList tmpValue;
for (auto &str : tmpList) {
if (!tmpValue.contains(str)) {
tmpValue.push_back(str.at(0));
}
}
if (isPolyphoneWords) {
info = tmpValue.join("/");
} else {
info = tmpValue.join(",");
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::English) {
//暂不支持
}
}
HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
{
//const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
//const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
//m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
//m_pinYinTrie = new Pinyin4cppTrie;
}
HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
{
// if (m_pinYinTrie){
// delete m_pinYinTrie;
// m_pinYinTrie = nullptr;
// }
}
HanZiToPinYin * HanZiToPinYin::getInstance()
{
call_once(g_singleFlag, []() {
g_pinYinManager = new HanZiToPinYin;
});
return g_pinYinManager;
}
bool HanZiToPinYin::contains(string &word)
{
return d->contains(word);
}
bool HanZiToPinYin::isMultiTone(string &word)
{
return d->isMultiTone(word);
}
bool HanZiToPinYin::isMultiTone(string &&word)
{
return d->isMultiTone(word);
}
bool HanZiToPinYin::isMultiTone(const string &word)
{
return d->isMultiTone(word);
}
bool HanZiToPinYin::isMultiTone(const string &&word)
{
return d->isMultiTone(word);
}
int HanZiToPinYin::getResults(string word, QStringList &results)
{
return d->getResults(word, results);
}
void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
{
d->setConfig(dataStyle, segType, polyphoneType, processType);
}
HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate)
{
}