同步分词子项目

This commit is contained in:
iaom 2023-08-21 11:28:02 +08:00
parent 098f9563b3
commit 17d1c8cac6
12 changed files with 228 additions and 17 deletions

View File

@ -0,0 +1,170 @@
# Build script for the chinese-segmentation shared library.
cmake_minimum_required(VERSION 3.14)
project(chinese-segmentation LANGUAGES CXX)

# Library version, assembled from its three numeric components.
set(VERSION_MAJOR 1)
set(VERSION_MINOR 1)
set(VERSION_MICRO 0)
set(CHINESE_SEGMENTATION_VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_MICRO}")

# Require C++11 and make the current source/binary directories searchable
# for includes while building targets of this directory.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_INCLUDE_CURRENT_DIR ON)

# The first lookup picks Qt6 or Qt5 and defines QT_VERSION_MAJOR;
# the second one loads the Core component of the selected major version.
find_package(QT NAMES Qt6 Qt5 REQUIRED COMPONENTS Core)
find_package(Qt${QT_VERSION_MAJOR} REQUIRED COMPONENTS Core)
# Headers that are compiled into the library target and also copied to
# HEADERS_INSTALL_DIR by the install(FILES ...) rule further down.
set(HEADERS
    chinese-segmentation.h
    common-struct.h
    hanzi-to-pinyin.h
    Traditional-to-Simplified.h
    pinyin4cpp-common.h
    libchinese-segmentation_global.h
)
# Implementation files and private headers of the library, listed explicitly
# and grouped by the directory they live in.
set(CHINESE_SEGMENTATION_SRC
    # Traditional <-> Simplified conversion
    Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
    Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
    Traditional-to-Simplified.cpp
    Traditional-to-Simplified-private.h

    # Segmentation front end
    chinese-segmentation.cpp
    chinese-segmentation-private.h

    # cppjieba
    cppjieba/DatTrie.hpp
    cppjieba/DictTrie.hpp
    cppjieba/FullSegment.hpp
    cppjieba/HMMModel.hpp
    cppjieba/HMMSegment.hpp
    cppjieba/IdfTrie.hpp
    cppjieba/Jieba.hpp
    cppjieba/KeywordExtractor.hpp
    cppjieba/MPSegment.hpp
    cppjieba/MixSegment.hpp
    cppjieba/PinYinTrie.hpp
    cppjieba/PosTagger.hpp
    cppjieba/PreFilter.hpp
    cppjieba/QuerySegment.hpp
    cppjieba/SegmentBase.hpp
    cppjieba/SegmentTagged.hpp
    cppjieba/TextRankExtractor.hpp
    cppjieba/Unicode.hpp
    cppjieba/idf-trie/idf-trie.cpp
    cppjieba/idf-trie/idf-trie.h

    # cppjieba/limonp
    cppjieba/limonp/ArgvContext.hpp
    cppjieba/limonp/BlockingQueue.hpp
    cppjieba/limonp/BoundedBlockingQueue.hpp
    cppjieba/limonp/BoundedQueue.hpp
    cppjieba/limonp/Closure.hpp
    cppjieba/limonp/Colors.hpp
    cppjieba/limonp/Condition.hpp
    cppjieba/limonp/Config.hpp
    cppjieba/limonp/FileLock.hpp
    cppjieba/limonp/ForcePublic.hpp
    cppjieba/limonp/LocalVector.hpp
    cppjieba/limonp/Logging.hpp
    cppjieba/limonp/Md5.hpp
    cppjieba/limonp/MutexLock.hpp
    cppjieba/limonp/NonCopyable.hpp
    cppjieba/limonp/StdExtension.hpp
    cppjieba/limonp/StringUtil.hpp
    cppjieba/limonp/Thread.hpp
    cppjieba/limonp/ThreadPool.hpp

    # cppjieba/segment-trie
    cppjieba/segment-trie/segment-trie.cpp
    cppjieba/segment-trie/segment-trie.h

    # Hanzi -> Pinyin
    hanzi-to-pinyin.cpp
    hanzi-to-pinyin-private.h
    pinyin4cpp/pinyin4cpp-trie.cpp
    pinyin4cpp/pinyin4cpp-trie.h
    pinyin4cpp/pinyin4cpp_dataTrie.cpp
    pinyin4cpp/pinyin4cpp_dataTrie.h
    pinyin4cpp/pinyin4cpp_dictTrie.cpp
    pinyin4cpp/pinyin4cpp_dictTrie.h

    # storage-base
    storage-base/cedar/cedar.h
    storage-base/cedar/cedarpp.h
    storage-base/darts-clone/darts.h
    storage-base/storage-base.cpp
    storage-base/storage-base.h
    storage-base/storage-base.hpp
)
# The shared library itself: all sources plus the public headers.
add_library(chinese-segmentation SHARED
    ${CHINESE_SEGMENTATION_SRC}
    ${HEADERS}
)

# Include paths for the bundled components. They are attached to the target
# (not to the directory), and wrapped in BUILD_INTERFACE so they apply to this
# target and to in-tree targets linking it, but never end up in the exported
# (installed) target. Paths inside a generator expression must be absolute.
target_include_directories(chinese-segmentation PUBLIC
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base/cedar>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba/limonp>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pinyin4cpp>"
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/Traditional-Chinese-Simplified-conversion>"
)

# Qt Core is linked PUBLIC, so it is propagated to consumers of the library.
target_link_libraries(chinese-segmentation PUBLIC
    Qt${QT_VERSION_MAJOR}::Core
)
include(CMakePackageConfigHelpers)

# Absolute install locations used by the rules below.
set(CMAKE_CONFIG_INSTALL_DIR "/usr/share/cmake/chinese-segmentation")
set(HEADERS_INSTALL_DIR "/usr/include/chinese-segmentation")
set(PC_INSTALL_DIR "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}/pkgconfig")
set(DICT_INSTALL_PATH "/usr/share/chinese-segmentation/res/dict")

# Macros consumed by the library sources: the library version and the
# directory the dictionaries are installed to. Both are string literals.
# They are scoped to the library target so that the generic VERSION macro
# does not leak into other targets of this directory or its subdirectories.
target_compile_definitions(chinese-segmentation PRIVATE
    VERSION="${CHINESE_SEGMENTATION_VERSION}"
    DICT_INSTALL_PATH="${DICT_INSTALL_PATH}"
)
# Consumers of the installed/exported target get the installed header directory.
target_include_directories(chinese-segmentation PUBLIC
    $<INSTALL_INTERFACE:${HEADERS_INSTALL_DIR}>
)

# CMake package files: <name>-config.cmake from its template, and a matching
# version file that accepts any request with the same major version.
configure_package_config_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation-config.cmake.in"
    "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
    INSTALL_DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
)
write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake"
    VERSION ${CHINESE_SEGMENTATION_VERSION}
    COMPATIBILITY SameMajorVersion
)

# pkg-config file. Its template substitutes a variable named VERSION, which
# was never defined here, so the generated "Version:" field came out empty.
# Define it right before configuring. The template is a plain text file, so
# configure_file with @ONLY is used: only @VAR@ references are replaced and
# the pkg-config ${prefix}-style references are left untouched.
set(VERSION "${CHINESE_SEGMENTATION_VERSION}")
configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation.pc.in"
    "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
    @ONLY
)
# Shared-object versioning and output file name.
set_property(TARGET chinese-segmentation PROPERTY VERSION "${CHINESE_SEGMENTATION_VERSION}")
set_property(TARGET chinese-segmentation PROPERTY SOVERSION "${VERSION_MAJOR}")
set_property(TARGET chinese-segmentation PROPERTY OUTPUT_NAME chinese-segmentation)

# Install the library and register it in the "chinese-segmentation" export set.
install(
    TARGETS chinese-segmentation
    EXPORT chinese-segmentation
    PUBLIC_HEADER DESTINATION "${HEADERS_INSTALL_DIR}"
    LIBRARY DESTINATION "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}"
)

# Write the export set as the targets file included by the package config.
install(
    EXPORT chinese-segmentation
    FILE chinese-segmentation-targets.cmake
    DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
)

# Public headers.
install(
    FILES ${HEADERS}
    DESTINATION "${HEADERS_INSTALL_DIR}"
)

# Generated pkg-config file.
install(
    FILES "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
    DESTINATION "${PC_INSTALL_DIR}"
)

# Generated CMake package config and version files.
install(
    FILES
        "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
        "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake"
    DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
)
# Dictionary and model data files. install(FILES) copies each one directly
# into DICT_INSTALL_PATH, without recreating the source sub-directories.
set(DICT_FILES
    dict/hmm_model.utf8
    dict/idf.utf8
    dict/jieba.dict.utf8
    dict/stop_words.utf8
    dict/user.dict.utf8
    dict/pinyinWithoutTone.txt
    dict/pos_dict/char_state_tab.utf8
    dict/pos_dict/prob_emit.utf8
    dict/pos_dict/prob_start.utf8
    dict/pos_dict/prob_trans.utf8
    pinyin4cpp/dict/singleWordPinyin.txt
    pinyin4cpp/dict/wordsPinyin.txt
    Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
)
install(
    FILES ${DICT_FILES}
    DESTINATION "${DICT_INSTALL_PATH}"
)
# Declare the switch so it shows up in the cache and in CMake GUIs. It is off
# by default, which matches the behaviour when the variable was left unset.
option(BUILD_TEST "Build the test program in the test/ subdirectory" OFF)
if(BUILD_TEST)
    add_subdirectory(test)
endif()

View File

@ -22,7 +22,7 @@
#include "storage-base.hpp"
const char * const TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = "/usr/share/ukui-search/res/dict/TraditionalChineseSimplifiedDict.txt";
const char * const TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = DICT_INSTALL_PATH"/TraditionalChineseSimplifiedDict.txt";
class Traditional2SimplifiedTrie : public StorageBase<char, false, CacheFileHeaderBase>
{

View File

@ -0,0 +1,9 @@
@PACKAGE_INIT@

include(CMakeFindDependencyMacro)

# The exported target links the Core module of the Qt major version the
# library was configured with, so consumers must have that module loaded
# before the targets file is included. No minimum Qt version is requested
# because the build script does not define one, and no other Qt module is
# required because the library links none.
find_dependency(Qt@QT_VERSION_MAJOR@ COMPONENTS Core)

include("${CMAKE_CURRENT_LIST_DIR}/chinese-segmentation-targets.cmake")

View File

@ -24,10 +24,10 @@
ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent)
{
//const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
const char * const HMM_PATH = DICT_INSTALL_PATH"/hmm_model.utf8";
//const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
//const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
const char * const STOP_WORD_PATH = DICT_INSTALL_PATH"/stop_words.utf8";
m_jieba = new cppjieba::Jieba(DICT_PATH,
HMM_PATH,
USER_DICT_PATH,

View File

@ -0,0 +1,11 @@
# pkg-config template for the chinese-segmentation library.
# Placeholders written between at-signs are substituted when the build
# system configures this file; the dollar-brace references are resolved
# by pkg-config itself.
prefix=/usr
exec_prefix=${prefix}
# Library directory, including the multiarch sub-directory reported by CMake.
libdir=${prefix}/lib/@CMAKE_LIBRARY_ARCHITECTURE@
includedir=${prefix}/include/chinese-segmentation
Name: chinese-segmentation
Description: Chinese-segmentation header files
URL: https://www.ukui.org/
# Filled in from the build-system variable named VERSION.
Version: @VERSION@
Cflags: -I${includedir}
Libs: -L${libdir} -lchinese-segmentation

View File

@ -22,7 +22,7 @@
#include "storage-base.hpp"
const char * const IDF_DICT_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
const char * const IDF_DICT_PATH = DICT_INSTALL_PATH"/idf.utf8";
struct IdfCacheFileHeader : CacheFileHeaderBase
{

View File

@ -25,8 +25,8 @@
using namespace cppjieba;
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
const char * const DICT_PATH = DICT_INSTALL_PATH"/jieba.dict.utf8";
const char * const USER_DICT_PATH = DICT_INSTALL_PATH"/user.dict.utf8";
struct DictCacheFileHeader : CacheFileHeaderBase
{

View File

@ -43,8 +43,10 @@ HEADERS += \
Traditional-to-Simplified.h \
pinyin4cpp-common.h \
libchinese-segmentation_global.h
DICT_INSTALL_PATH = /usr/share/chinese-segmentation/res/dict
DEFINES += DICT_INSTALL_PATH='\\"$${DICT_INSTALL_PATH}\\"'
dict_files.path = /usr/share/ukui-search/res/dict/
# Expand the qmake variable; a bare name would be taken as a literal path.
dict_files.path = $${DICT_INSTALL_PATH}
dict_files.files = $$PWD/dict/*.utf8\
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
dict_files.files += $$PWD/dict/*.txt\
@ -62,12 +64,12 @@ unix {
QMAKE_PKGCONFIG_VERSION = $$VERSION
QMAKE_PKGCONFIG_LIBDIR = $$target.path
QMAKE_PKGCONFIG_DESTDIR = pkgconfig
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-segmentation
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-segmentation
!isEmpty(target.path): INSTALLS += target
header.path = /usr/include/chinese-seg
header.path = /usr/include/chinese-segmentation
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h Traditional-to-Simplified.h
header.files += development-files/header-files/*
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/

View File

@ -22,8 +22,8 @@
#include "storage-base.hpp"
const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
const char * const SINGLE_WORD_PINYIN_PATH = DICT_INSTALL_PATH"/singleWordPinyin.txt";
const char * const WORDS_PINYIN_PATH = DICT_INSTALL_PATH"/wordsPinyin.txt";
class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
{

View File

@ -54,9 +54,9 @@ void StorageBase<ordered, cache_file_header>::Init()
template<const bool ordered, typename cache_file_header>
string StorageBase<ordered, cache_file_header>::Find(const string &key)
{
int result = m_double_array_data_trie->exactMatchSearch<int>(key.c_str(), key.size());
int result = m_double_array_data_trie->template exactMatchSearch<int>(key.c_str(), key.size());
if (result < 0)
return string();
return {};
return string(&m_elements_ptr[result]);
}

View File

@ -0,0 +1,19 @@
# Test program for the chinese-segmentation library (a Qt Widgets application).

# Let CMake run uic, moc and rcc automatically for this directory's targets.
set(CMAKE_AUTOUIC ON)
set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON)

find_package(QT NAMES Qt6 Qt5 COMPONENTS Core Gui Widgets REQUIRED)
find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core Gui Widgets REQUIRED)

# "test" is a target name that CMake reserves as soon as CTest testing is
# enabled anywhere in the build, so the target gets a project-specific name.
# OUTPUT_NAME keeps the produced executable named "test" as before.
add_executable(chinese-segmentation-test
    main.cpp
    mainwindow.cpp
    mainwindow.h
    mainwindow.ui
)
set_target_properties(chinese-segmentation-test PROPERTIES
    OUTPUT_NAME test
)

# The library's public headers live in the parent directory.
target_include_directories(chinese-segmentation-test PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/.."
)

target_link_libraries(chinese-segmentation-test PRIVATE
    Qt${QT_VERSION_MAJOR}::Core
    Qt${QT_VERSION_MAJOR}::Gui
    Qt${QT_VERSION_MAJOR}::Widgets
    chinese-segmentation
)

View File

@ -1,8 +1,8 @@
#include "mainwindow.h"
#include "ui_mainwindow.h"
#include <HanZiToPinYin>
#include <ChineseSegmentation>
#include <Traditional-to-Simplified.h>
#include "hanzi-to-pinyin.h"
#include "chinese-segmentation.h"
#include "Traditional-to-Simplified.h"
#include <QMenu>
#include <QDebug>
#include <QStringList>