/* * Copyright (C) 2022, KylinSoft Co., Ltd. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . * * Authors: jixiaoxu * */ #include #include #include "hanzi-to-pinyin.h" #include "hanzi-to-pinyin-private.h" #include "chinese-segmentation.h" #include "cppjieba/Unicode.hpp" HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr; std::once_flag g_singleFlag; bool HanZiToPinYinPrivate::contains(string &word) { return m_pinYinTrie.Contains(word); } int HanZiToPinYinPrivate::getResults(string &word, QStringList &results) { results.clear(); string directResult = m_pinYinTrie.Find(word); if (directResult == string()) { if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1 return -1; } else {//无结果、启用分词 vector segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word); string data; for (string &info : segResults) { if (info == string()) { continue; } data = m_pinYinTrie.Find(info); if (data == string()) {//分词后无结果 if (cppjieba::IsSingleWord(info)) {//单个字符 if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回 results.append(QString().fromStdString(info)); } else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略 continue; } } else {//多个字符 string oneWord; cppjieba::RuneStrArray runeArray; cppjieba::DecodeRunesInString(info, runeArray); for (auto i = runeArray.begin(); i != runeArray.end(); ++i) { oneWord = cppjieba::GetStringFromRunes(info, i, i); data = m_pinYinTrie.Find(oneWord); if (data == string()) {//单字无结果则按设置返回 if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回 results.append(QString().fromStdString(oneWord)); } else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略 continue; } } if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字 results.append(QString().fromStdString(data)); } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字 if (limonp::IsInStr(data, ',')) { results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0)))); } else { results.append(QString().fromStdString(data)); } } } } } else {//分词后有结果 if (cppjieba::IsSingleWord(info)) {//单个字符 if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字 results.append(QString().fromStdString(data)); } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字 if (limonp::IsInStr(data, ',')) { results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0)))); } else { results.append(QString().fromStdString(data)); } } } else {//多个字符 vector dataVec = limonp::Split(data, "/"); if (dataVec.size() == 1) {//无多音词 vector dataVec = limonp::Split(data, ","); for (auto &oneResult : dataVec) { results.append(QString().fromStdString(oneResult)); } } else { if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字 int wordSize = limonp::Split(dataVec[0], ",").size(); for (int i = 0; i < wordSize; ++i) { QStringList oneResult; for (size_t j = 0; j < dataVec.size(); ++j) { oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i])); } results.append(oneResult.join('/')); } } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字 vector tmp = limonp::Split(dataVec[0], ","); for (auto &oneResult : tmp) { results.append(QString().fromStdString(oneResult)); } } } } } } } } else {//可以直接查到结果 if (cppjieba::IsSingleWord(word)) {//单个字符 if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字 results.append(QString().fromStdString(directResult)); } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字 if (limonp::IsInStr(directResult, ',')) { results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0)))); } else { results.append(QString().fromStdString(directResult)); } } } else {//多个字符 vector dataVec = limonp::Split(directResult, "/"); if (dataVec.size() == 1) {//无多音词 vector dataVec = limonp::Split(directResult, ","); for (auto &oneResult : dataVec) { results.append(QString().fromStdString(oneResult)); } } else { if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字 int wordSize = limonp::Split(dataVec[0], ",").size(); for (int i = 0; i < wordSize; ++i) { QStringList oneResult; for (size_t j = 0; j < dataVec.size(); ++j) { oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i])); } results.append(oneResult.join('/')); } } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字 vector tmp = limonp::Split(dataVec[0], ","); for (auto &oneResult : tmp) { results.append(QString().fromStdString(oneResult)); } } } } } convertDataStyle(results); return 0;//todo } void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType) { m_pinyinDataStyle = dataStyle; m_segType = segType; m_polyphoneType = polyphoneType; m_exDataProcessType = processType; } void HanZiToPinYinPrivate::convertDataStyle(QStringList &results) { QString value; if (m_pinyinDataStyle == PinyinDataStyle::Default) { for (QString &info : results) { if(info == ",") { continue; } //if info's length was been changed, there's someting wrong while traverse the chars of info for (const QChar &c : info) { if (!isalpha(c.toLatin1())) { value = PhoneticSymbol.value(c); if (!value.isEmpty()) { info.replace(c, value.at(0)); } } } QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序) QStringList tmpValue; for (auto &str : tmpList) { if (!tmpValue.contains(str)) { tmpValue.push_back(str); } } info = tmpValue.join(","); } } else if (m_pinyinDataStyle == PinyinDataStyle::Tone) { //无需处理 } else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) { for (QString &info : results) { for (int i = 0; i < info.size();) { auto c = info.at(i); if (!isalpha(c.toLatin1())) { value = PhoneticSymbol.value(c); if (!value.isEmpty()) { info.replace(c, PhoneticSymbol.value(c)); i += PhoneticSymbol.value(c).size(); continue; } } i++; } } } else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) { for (QString &info : results) { if(info == "/") { continue; } bool isPolyphoneWords(false); if (info.contains("/")) { isPolyphoneWords = true; info.replace("/", ","); } for (int i = 0; i < info.size();) { auto c = info.at(i); if (!isalpha(c.toLatin1())) { value = PhoneticSymbol.value(c); if (!value.isEmpty()) { info.replace(i, 1, value.at(0)); //多音词模式 if (info.contains(",")) { int pos = info.indexOf(',', i); if (isPolyphoneWords) { info.replace(",", "/"); } //最后一个读音时 if (pos == -1) { info.append(value.at(1)); break; } info.insert(pos, value.at(1)); i = pos + 1; //insert导致','的位置加一,将i行进到','的位置 i++; continue; } else { info.append(value.at(1)); break; } } } i++; } } } else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) { for (QString &info : results) { if(info == "," or info == "/") { continue; } bool isPolyphoneWords(false); if (info.contains("/")) { isPolyphoneWords = true; info.replace("/", ","); } for (int i = 0; i < info.size();i++) { auto c = info.at(i); if (!isalpha(c.toLatin1())) { value = PhoneticSymbol.value(c); if (!value.isEmpty()) { info.replace(c, value.at(0)); } } } QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序) QStringList tmpValue; for (auto &str : tmpList) { if (!tmpValue.contains(str)) { tmpValue.push_back(str.at(0)); } } if (isPolyphoneWords) { info = tmpValue.join("/"); } else { info = tmpValue.join(","); } } } else if (m_pinyinDataStyle == PinyinDataStyle::English) { //暂不支持 } } HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent) { //const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt"; //const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt"; //m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH); //m_pinYinTrie = new Pinyin4cppTrie; } HanZiToPinYinPrivate::~HanZiToPinYinPrivate() { // if (m_pinYinTrie){ // delete m_pinYinTrie; // m_pinYinTrie = nullptr; // } } HanZiToPinYin * HanZiToPinYin::getInstance() { call_once(g_singleFlag, []() { g_pinYinManager = new HanZiToPinYin; }); return g_pinYinManager; } bool HanZiToPinYin::contains(string &word) { return d->contains(word); } bool HanZiToPinYin::isMultiTone(string &word) { return d->isMultiTone(word); } bool HanZiToPinYin::isMultiTone(string &&word) { return d->isMultiTone(word); } bool HanZiToPinYin::isMultiTone(const string &word) { return d->isMultiTone(word); } bool HanZiToPinYin::isMultiTone(const string &&word) { return d->isMultiTone(word); } int HanZiToPinYin::getResults(string word, QStringList &results) { return d->getResults(word, results); } void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType) { d->setConfig(dataStyle, segType, polyphoneType, processType); } HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate) { }