同步分词库改动

This commit is contained in:
iaom 2023-03-24 11:33:22 +08:00
parent 684305e554
commit 73cccc4083
13 changed files with 3442 additions and 76 deletions

View File

@ -0,0 +1,10 @@
# qmake include file for the Traditional -> Simplified Chinese trie sources.
# Adds this directory to the include path so its headers can be included by name.
INCLUDEPATH += $$PWD
# Declaration of Traditional2SimplifiedTrie.
HEADERS += \
$$PWD/Traditional2Simplified_trie.h
# Implementation of Traditional2SimplifiedTrie.
SOURCES += \
$$PWD/Traditional2Simplified_trie.cpp
# Dictionary data file, listed as a distribution file of the project.
DISTFILES += \
Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt

View File

@ -0,0 +1,99 @@
/*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "Traditional2Simplified_trie.h"
/**
 * @brief Construct the trie from the default dictionary file.
 *
 * Hands TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH to the StorageBase constructor
 * as the only source file and then runs Init().
 *
 * @param dat_cache_path Cache path forwarded unchanged to StorageBase.
 */
Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(
          vector<string>{TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH},
          dat_cache_path)
{
    Init();
}
/**
 * @brief Construct the trie with a caller-supplied list of source files.
 *
 * Both arguments are forwarded unchanged to the StorageBase constructor,
 * after which Init() is run.
 *
 * @param file_paths     Source file paths handed to StorageBase.
 * @param dat_cache_path Cache path handed to StorageBase.
 */
Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
{
    Init();
}
bool Traditional2SimplifiedTrie::IsTraditional(const string &word) {
string result = this->Find(word);
if (!result.empty())
return true;
return false;
}
/**
 * @brief Build the trie from the dictionary and write it to the cache file.
 *
 * The cache is first written to a temporary file created next to
 * @p dat_cache_file ("<dat_cache_file>_XXXXXX") and then renamed over the
 * final name. Data is written in this order:
 *   1. a CacheFileHeaderBase whose md5_hex field holds @p md5,
 *   2. the dictionary values emitted by LoadDict(),
 *   3. the trie array (GetDataTrieArray() / GetDataTrieTotalSize()).
 * Afterwards the three int fields that follow md5_hex in the file
 * (element count, total value bytes, trie size) are overwritten with the
 * final numbers.
 *
 * @param dat_cache_file Final path of the cache file.
 * @param md5            MD5 string stored in the header; expected to be exactly
 *                       sizeof(header.md5_hex) bytes long (asserted).
 */
void Traditional2SimplifiedTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
{
    CacheFileHeaderBase header;
    assert(sizeof(header.md5_hex) == md5.size());
    // Never copy more than the header field can hold, even when asserts are compiled out.
    const size_t md5_bytes = md5.size() < sizeof(header.md5_hex) ? md5.size() : sizeof(header.md5_hex);
    memcpy(&header.md5_hex[0], md5.c_str(), md5_bytes);

    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // umask is process-wide state: restore the previous mask once the file exists
    // so that this call does not change how the rest of the process creates files.
    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]); // mkstemp rewrites the XXXXXX suffix in place
    umask(previous_umask);
    assert(fd >= 0);
    fchmod(fd, 0644);

    // Header first (its counters are still zero here), then values, then the trie.
    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
    this->LoadDict(fd, write_bytes, offset, elements_num);
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());

    // Go back to just after md5_hex and fill in the real counters.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    close(fd);

    assert(static_cast<size_t>(write_bytes) == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    if (0 != rename_ret) {
        // Do not leave the temporary file behind when it could not be moved into place.
        unlink(tmp_filepath.c_str());
    }
    assert(0 == rename_ret);
}
/**
 * @brief Look up @p key in the trie and return the value stored for it.
 *
 * ExactMatchSearch() is called with the key bytes; a negative result means
 * there is no value. Otherwise the result is used as an index into the
 * element storage returned by GetElementPtr(), and a string is built from
 * the characters starting there.
 *
 * @param key Exact key to search for.
 * @return The stored value, or an empty string when the search result is negative.
 */
string Traditional2SimplifiedTrie::Find(const string &key)
{
    const int element_offset = this->ExactMatchSearch(key.c_str(), key.size());
    if (element_offset >= 0) {
        return string(&this->GetElementPtr()[element_offset]);
    }
    return string();
}
void Traditional2SimplifiedTrie::LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
{
ifstream ifs(TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, ":");
if (buf.size() != 2)
continue;
this->Update(buf[0].c_str(), buf[0].size(), offset);
offset += (buf[1].size() + 1);
elements_num++;
write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
}
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef Traditional2SimplifiedTrie_H
#define Traditional2SimplifiedTrie_H
#include "storage-base.hpp"
/// Path of the dictionary text file used as the default source of the trie.
const char * const TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = "/usr/share/ukui-search/res/dict/TraditionalChineseSimplifiedDict.txt";
/**
 * @brief Trie-backed lookup table built from the Traditional/Simplified Chinese dictionary.
 *
 * Derives from StorageBase<char, false, CacheFileHeaderBase> and overrides
 * LoadSourceFile() to build the trie and write its cache file.
 */
class Traditional2SimplifiedTrie : public StorageBase<char, false, CacheFileHeaderBase>
{
public:
/// Builds the trie from TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH; @p dat_cache_path is forwarded to StorageBase.
Traditional2SimplifiedTrie(string dat_cache_path = "");
/// Same as above, but forwards the given @p file_paths to StorageBase instead of the default path.
Traditional2SimplifiedTrie(const vector<string> file_paths, string dat_cache_path = "");
/// Builds the trie and writes it, preceded by a header carrying @p md5, to @p dat_cache_file.
void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
/// Returns the value stored for @p key, or an empty string when there is none.
string Find(const string &key);
/// Returns true when Find(@p word) yields a non-empty value.
bool IsTraditional(const string &word);
private:
/// Reads the dictionary file, inserts every key into the trie and writes the values to @p fd,
/// updating the byte, offset and element counters passed by reference.
void LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
};
#endif // Traditional2SimplifiedTrie_H

View File

@ -0,0 +1,47 @@
/*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef Traditional2SimplifiedPRIVATE_H
#define Traditional2SimplifiedPRIVATE_H
#include <QtCore/qglobal.h>
#include <QHash>
#include "Traditional-to-Simplified.h"
#include "Traditional2Simplified_trie.h"
using namespace std;
/**
 * @brief Implementation object behind Traditional2Simplified.
 *
 * Owns the dictionary trie and performs the lookups that the public class forwards to it.
 */
class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2SimplifiedPrivate
{
public:
/// Stores @p parent in the back-pointer q; the trie member is default-constructed.
Traditional2SimplifiedPrivate(Traditional2Simplified *parent = nullptr);
~Traditional2SimplifiedPrivate();
public:
/// Returns the trie's IsTraditional() result for @p word.
bool isTraditional(string &word) {return m_Traditional2SimplifiedTrie.IsTraditional(word);}
/// Returns @p words with every character that has a dictionary entry replaced by its mapped value.
string getResults(string words);
private:
/// Back-pointer to the public object passed to the constructor (nullptr by default).
Traditional2Simplified *q = nullptr;
/// Dictionary trie used for all lookups.
Traditional2SimplifiedTrie m_Traditional2SimplifiedTrie;
};
#endif // Traditional2SimplifiedPRIVATE_H

View File

@ -0,0 +1,86 @@
/*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include <mutex>
#include <cctype>
#include "Traditional-to-Simplified.h"
#include "Traditional-to-Simplified-private.h"
#include "cppjieba/Unicode.hpp"
Traditional2Simplified * Traditional2Simplified::g_Traditional2SimplifiedManager = nullptr;
std::once_flag g_Traditional2SimplifiedSingleFlag;
/**
 * @brief Replace every character of @p words that has a dictionary entry by its mapped value.
 *
 * A single-character input is looked up directly. Longer input is decoded
 * into runes and converted one character at a time. Characters without a
 * dictionary entry are copied through unchanged.
 *
 * @param words UTF-8 text to convert.
 * @return The converted text. Empty input is returned as-is, and so is input
 *         for which decoding produced no runes.
 */
string Traditional2SimplifiedPrivate::getResults(string words)
{
    if (words.empty()) {
        return words;
    }

    if (cppjieba::IsSingleWord(words)) { // single character: one lookup is enough
        const string converted = m_Traditional2SimplifiedTrie.Find(words);
        return converted.empty() ? words : converted; // no entry: hand back the input
    }

    // Several characters: convert them one by one.
    cppjieba::RuneStrArray runeArray;
    cppjieba::DecodeRunesInString(words, runeArray);
    if (runeArray.begin() == runeArray.end()) {
        // Nothing was decoded from a non-empty input (presumably malformed UTF-8 —
        // TODO confirm against this cppjieba fork). Return the input untouched
        // instead of silently turning it into an empty string.
        return words;
    }

    string results;
    results.reserve(words.size());
    for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
        const string oneWord = cppjieba::GetStringFromRunes(words, i, i);
        const string converted = m_Traditional2SimplifiedTrie.Find(oneWord);
        // No entry for this character: keep the original character.
        results.append(converted.empty() ? oneWord : converted);
    }
    return results;
}
/**
 * @brief Remember the owning public object; no other work is done in the body.
 * @param parent Stored in the back-pointer q.
 */
Traditional2SimplifiedPrivate::Traditional2SimplifiedPrivate(Traditional2Simplified *parent)
    : q(parent)
{}
/** @brief Destructor with an empty body; members clean up through their own destructors. */
Traditional2SimplifiedPrivate::~Traditional2SimplifiedPrivate()
{}
/**
 * @brief Return the process-wide Traditional2Simplified instance.
 *
 * The instance is allocated on the first call; std::call_once with
 * g_Traditional2SimplifiedSingleFlag makes sure the allocation runs only once.
 *
 * @return Pointer stored in g_Traditional2SimplifiedManager.
 */
Traditional2Simplified *Traditional2Simplified::getInstance()
{
    const auto createInstance = []() {
        g_Traditional2SimplifiedManager = new Traditional2Simplified;
    };
    std::call_once(g_Traditional2SimplifiedSingleFlag, createInstance);
    return g_Traditional2SimplifiedManager;
}
/**
 * @brief Forward the query to the private implementation.
 * @param oneWord Word to check.
 * @return Whatever Traditional2SimplifiedPrivate::isTraditional() reports for @p oneWord.
 */
bool Traditional2Simplified::isTraditional(string &oneWord)
{
    const bool traditional = d->isTraditional(oneWord);
    return traditional;
}
/**
 * @brief Forward the conversion request to the private implementation.
 * @param words Text to convert.
 * @return The string produced by Traditional2SimplifiedPrivate::getResults().
 */
string Traditional2Simplified::getResults(string words)
{
    string converted = d->getResults(words);
    return converted;
}
/** @brief Allocate the private implementation object (constructed with its default parent argument). */
Traditional2Simplified::Traditional2Simplified()
    : d(new Traditional2SimplifiedPrivate)
{}

View File

@ -0,0 +1,61 @@
/*
* Copyright (C) 2023, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef Traditional2Simplified_H
#define Traditional2Simplified_H
#include <QtCore/qglobal.h>
#include <string>
#define TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Q_DECL_IMPORT
using namespace std;
class Traditional2SimplifiedPrivate;
/**
 * @brief Public entry point for Traditional -> Simplified Chinese conversion.
 *
 * Accessed through getInstance(); construction, destruction and copying are
 * not available to outside code. All work is forwarded to the private
 * implementation object d.
 */
class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2Simplified
{
public:
/// Returns the shared instance, creating it on first use.
static Traditional2Simplified * getInstance();
public:
/**
* @brief Traditional2Simplified::isTraditional Check whether a word has an entry in the conversion dictionary.
* @param oneWord Word to look up.
* @return bool true when the dictionary holds a mapping for the word, false otherwise.
*/
bool isTraditional(string &oneWord);
/**
* @brief Traditional2Simplified::getResults Convert text character by character using the dictionary.
* @param words Text to convert.
* @return The converted text; characters without a dictionary entry are kept as they are.
*/
string getResults(string words);
protected:
Traditional2Simplified();
~Traditional2Simplified();
Traditional2Simplified(const Traditional2Simplified&) = delete;
Traditional2Simplified& operator =(const Traditional2Simplified&) = delete;
private:
/// Pointer returned by getInstance().
static Traditional2Simplified *g_Traditional2SimplifiedManager;
/// Private implementation object.
Traditional2SimplifiedPrivate *d = nullptr;
};
#endif // Traditional2Simplified_H

View File

@ -11,6 +11,7 @@ public:
explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
~ChineseSegmentationPrivate();
vector<KeyWord> callSegment(const string& sentence);
vector<KeyWord> callSegment(QString& sentence);
vector<string> callMixSegmentCutStr(const string& sentence);
vector<Word> callMixSegmentCutWord(const string& sentence);

View File

@ -51,6 +51,17 @@ vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence)
}
/**
 * @brief Extract keywords from a QString sentence.
 *
 * The sentence is modified in place before extraction: tabs, full-width
 * commas and ideographic full stops are each replaced by the " " literal
 * below. At most the first 20480000 characters are then converted to a
 * std::string and handed to the jieba keyword extractor.
 *
 * NOTE(review): if keyword offsets have to line up with the caller's original
 * text, confirm that the replacement width used here is the intended one.
 *
 * @param sentence Text to analyse; altered by the replacements described above.
 * @return Keywords produced by the extractor (topk is the largest size_t value).
 */
vector<KeyWord> ChineseSegmentationPrivate::callSegment(QString &sentence)
{
    sentence.replace("\t", " ");
    sentence.replace("\xEF\xBC\x8C", " "); // "\xEF\xBC\x8C" is the UTF-8 form of the full-width comma
    sentence.replace("\xE3\x80\x82", " "); // "\xE3\x80\x82" is the UTF-8 form of "。"

    vector<KeyWord> keywordres;
    const size_t topk = -1;
    m_jieba->extractor.Extract(sentence.left(20480000).toStdString(), keywordres, topk);
    return keywordres;
}
vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
{
vector<string> keywordres;
@ -117,6 +128,11 @@ vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
return d->callSegment(sentence);
}
/**
 * @brief Forward a QString sentence to the private implementation for keyword extraction.
 * @param sentence Text to analyse; passed by reference to ChineseSegmentationPrivate::callSegment().
 * @return The keywords returned by the private implementation.
 */
vector<KeyWord> ChineseSegmentation::callSegment(QString &sentence)
{
    vector<KeyWord> keywords = d->callSegment(sentence);
    return keywords;
}
vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
{
return d->callMixSegmentCutStr(sentence);

View File

@ -21,6 +21,7 @@
#ifndef CHINESESEGMENTATION_H
#define CHINESESEGMENTATION_H
#include <QString>
#include "libchinese-segmentation_global.h"
#include "common-struct.h"
@ -37,6 +38,7 @@ public:
* @return vector<KeyWord>
*/
vector<KeyWord> callSegment(const string &sentence);
vector<KeyWord> callSegment(QString &sentence);
/**
* @brief ChineseSegmentation::callMixSegmentCutStr

View File

@ -23,13 +23,15 @@ QMAKE_CXXFLAGS += -execution-charset:utf-8
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
include(cppjieba/cppjieba.pri)
include(pinyin4cpp/pinyin4cpp.pri)
include(Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri)
include(storage-base/storage-base-cedar.pri)
#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
SOURCES += \
chinese-segmentation.cpp \
hanzi-to-pinyin.cpp
hanzi-to-pinyin.cpp \
Traditional-to-Simplified.cpp
HEADERS += \
chinese-segmentation-private.h \
@ -37,6 +39,8 @@ HEADERS += \
common-struct.h \
hanzi-to-pinyin-private.h \
hanzi-to-pinyin.h \
Traditional-to-Simplified-private.h \
Traditional-to-Simplified.h \
pinyin4cpp-common.h \
libchinese-segmentation_global.h
@ -44,7 +48,8 @@ dict_files.path = /usr/share/ukui-search/res/dict/
dict_files.files = $$PWD/dict/*.utf8\
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
dict_files.files += $$PWD/dict/*.txt\
dict_files.files += $$PWD/pinyin4cpp/dict/*.txt
dict_files.files += $$PWD/pinyin4cpp/dict/*.txt\
dict_files.files += $$PWD/Traditional-Chinese-Simplified-conversion/dict/*.txt
INSTALLS += \
dict_files \
@ -63,7 +68,7 @@ unix {
!isEmpty(target.path): INSTALLS += target
header.path = /usr/include/chinese-seg
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h Traditional-to-Simplified.h
header.files += development-files/header-files/*
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/
# headercppjieba.files = cppjieba/*

View File

@ -2,6 +2,7 @@
#include "ui_mainwindow.h"
#include <HanZiToPinYin>
#include <ChineseSegmentation>
#include <Traditional-to-Simplified.h>
#include <QMenu>
#include <QDebug>
#include <QStringList>
@ -79,7 +80,7 @@ void MainWindow::initconnections()
ui->lineEdit_4->setText(list.join(" "));
qDebug() << "result:" << list.join(" ");
vector<KeyWord> result = ChineseSegmentation::getInstance()->callSegment(ui->lineEdit_2->text().toStdString());
vector<KeyWord> result = ChineseSegmentation::getInstance()->callSegment(text.toStdString());
list.clear();
for (auto &info:result) {
@ -87,6 +88,9 @@ void MainWindow::initconnections()
}
ui->lineEdit_6->setText(list.join("/"));
string simplified = Traditional2Simplified::getInstance()->getResults(text.toStdString());
ui->lineEdit_7->setText(QString().fromStdString(simplified));
});
}

View File

@ -27,61 +27,6 @@
<string>点击开始</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit">
<property name="geometry">
<rect>
<x>40</x>
<y>20</y>
<width>91</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>输入文字:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_2">
<property name="geometry">
<rect>
<x>40</x>
<y>70</y>
<width>711</width>
<height>41</height>
</rect>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_3">
<property name="geometry">
<rect>
<x>40</x>
<y>310</y>
<width>121</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>拼音转换结果:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_4">
<property name="geometry">
<rect>
<x>40</x>
<y>360</y>
<width>711</width>
<height>41</height>
</rect>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QCheckBox" name="checkSegBox">
<property name="geometry">
<rect>
@ -140,28 +85,77 @@
<string>无拼音数据原数据返回</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_5">
<widget class="QWidget" name="">
<property name="geometry">
<rect>
<x>40</x>
<y>160</y>
<width>113</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>分词结果:</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_6">
<property name="geometry">
<rect>
<x>40</x>
<y>220</y>
<y>20</y>
<width>711</width>
<height>41</height>
<height>391</height>
</rect>
</property>
<layout class="QVBoxLayout" name="verticalLayout">
<item>
<widget class="QLineEdit" name="lineEdit">
<property name="text">
<string>输入文字:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_2"/>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_5">
<property name="text">
<string>分词结果:</string>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_6"/>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_3">
<property name="text">
<string>拼音转换结果:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_4">
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_8">
<property name="text">
<string>繁简转换结果:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
<item>
<widget class="QLineEdit" name="lineEdit_7">
<property name="text">
<string/>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
</widget>
<widget class="QMenuBar" name="menubar">
@ -170,7 +164,7 @@
<x>0</x>
<y>0</y>
<width>800</width>
<height>28</height>
<height>29</height>
</rect>
</property>
</widget>