同步分词子项目
This commit is contained in:
parent
098f9563b3
commit
17d1c8cac6
|
@ -0,0 +1,170 @@
|
|||
cmake_minimum_required(VERSION 3.14)
|
||||
project(chinese-segmentation LANGUAGES CXX)
|
||||
|
||||
set(VERSION_MAJOR 1)
|
||||
set(VERSION_MINOR 1)
|
||||
set(VERSION_MICRO 0)
|
||||
set(CHINESE_SEGMENTATION_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_MICRO})
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_INCLUDE_CURRENT_DIR ON)
|
||||
|
||||
find_package(QT NAMES Qt6 Qt5 COMPONENTS Core REQUIRED)
|
||||
find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core REQUIRED)
|
||||
|
||||
set(HEADERS
|
||||
chinese-segmentation.h
|
||||
common-struct.h
|
||||
hanzi-to-pinyin.h
|
||||
Traditional-to-Simplified.h
|
||||
pinyin4cpp-common.h
|
||||
libchinese-segmentation_global.h)
|
||||
|
||||
set(CHINESE_SEGMENTATION_SRC
|
||||
Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
|
||||
Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
|
||||
Traditional-to-Simplified.cpp
|
||||
Traditional-to-Simplified-private.h
|
||||
chinese-segmentation.cpp
|
||||
chinese-segmentation-private.h
|
||||
cppjieba/DatTrie.hpp
|
||||
cppjieba/DictTrie.hpp
|
||||
cppjieba/FullSegment.hpp
|
||||
cppjieba/HMMModel.hpp
|
||||
cppjieba/HMMSegment.hpp
|
||||
cppjieba/IdfTrie.hpp
|
||||
cppjieba/Jieba.hpp
|
||||
cppjieba/KeywordExtractor.hpp
|
||||
cppjieba/MPSegment.hpp
|
||||
cppjieba/MixSegment.hpp
|
||||
cppjieba/PinYinTrie.hpp
|
||||
cppjieba/PosTagger.hpp
|
||||
cppjieba/PreFilter.hpp
|
||||
cppjieba/QuerySegment.hpp
|
||||
cppjieba/SegmentBase.hpp
|
||||
cppjieba/SegmentTagged.hpp
|
||||
cppjieba/TextRankExtractor.hpp
|
||||
cppjieba/Unicode.hpp
|
||||
cppjieba/idf-trie/idf-trie.cpp cppjieba/idf-trie/idf-trie.h
|
||||
cppjieba/limonp/ArgvContext.hpp
|
||||
cppjieba/limonp/BlockingQueue.hpp
|
||||
cppjieba/limonp/BoundedBlockingQueue.hpp
|
||||
cppjieba/limonp/BoundedQueue.hpp
|
||||
cppjieba/limonp/Closure.hpp
|
||||
cppjieba/limonp/Colors.hpp
|
||||
cppjieba/limonp/Condition.hpp
|
||||
cppjieba/limonp/Config.hpp
|
||||
cppjieba/limonp/FileLock.hpp
|
||||
cppjieba/limonp/ForcePublic.hpp
|
||||
cppjieba/limonp/LocalVector.hpp
|
||||
cppjieba/limonp/Logging.hpp
|
||||
cppjieba/limonp/Md5.hpp
|
||||
cppjieba/limonp/MutexLock.hpp
|
||||
cppjieba/limonp/NonCopyable.hpp
|
||||
cppjieba/limonp/StdExtension.hpp
|
||||
cppjieba/limonp/StringUtil.hpp
|
||||
cppjieba/limonp/Thread.hpp
|
||||
cppjieba/limonp/ThreadPool.hpp
|
||||
cppjieba/segment-trie/segment-trie.cpp
|
||||
cppjieba/segment-trie/segment-trie.h
|
||||
hanzi-to-pinyin.cpp
|
||||
hanzi-to-pinyin-private.h
|
||||
pinyin4cpp/pinyin4cpp-trie.cpp
|
||||
pinyin4cpp/pinyin4cpp-trie.h
|
||||
pinyin4cpp/pinyin4cpp_dataTrie.cpp
|
||||
pinyin4cpp/pinyin4cpp_dataTrie.h
|
||||
pinyin4cpp/pinyin4cpp_dictTrie.cpp
|
||||
pinyin4cpp/pinyin4cpp_dictTrie.h
|
||||
storage-base/cedar/cedar.h
|
||||
storage-base/cedar/cedarpp.h
|
||||
storage-base/darts-clone/darts.h
|
||||
storage-base/storage-base.cpp
|
||||
storage-base/storage-base.h
|
||||
storage-base/storage-base.hpp)
|
||||
|
||||
add_library(chinese-segmentation SHARED
|
||||
${CHINESE_SEGMENTATION_SRC}
|
||||
${HEADERS}
|
||||
)
|
||||
|
||||
include_directories(chinese-segmentation
|
||||
storage-base/cedar
|
||||
storage-base
|
||||
cppjieba
|
||||
cppjieba/limonp
|
||||
pinyin4cpp
|
||||
Traditional-Chinese-Simplified-conversion
|
||||
)
|
||||
|
||||
target_link_libraries(chinese-segmentation PUBLIC
|
||||
Qt${QT_VERSION_MAJOR}::Core
|
||||
)
|
||||
|
||||
include(CMakePackageConfigHelpers)
|
||||
set(CMAKE_CONFIG_INSTALL_DIR "/usr/share/cmake/chinese-segmentation")
|
||||
set(HEADERS_INSTALL_DIR "/usr/include/chinese-segmentation")
|
||||
set(PC_INSTALL_DIR "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}/pkgconfig")
|
||||
set(DICT_INSTALL_PATH "/usr/share/chinese-segmentation/res/dict")
|
||||
add_compile_definitions(
|
||||
VERSION="${CHINESE_SEGMENTATION_VERSION}"
|
||||
DICT_INSTALL_PATH="${DICT_INSTALL_PATH}"
|
||||
)
|
||||
target_include_directories(chinese-segmentation PUBLIC $<INSTALL_INTERFACE:${HEADERS_INSTALL_DIR}>)
|
||||
|
||||
configure_package_config_file(
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation-config.cmake.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
|
||||
INSTALL_DESTINATION ${CMAKE_CONFIG_INSTALL_DIR})
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake
|
||||
VERSION ${CHINESE_SEGMENTATION_VERSION}
|
||||
COMPATIBILITY SameMajorVersion
|
||||
)
|
||||
|
||||
configure_package_config_file(
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation.pc.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
|
||||
INSTALL_DESTINATION ${PC_INSTALL_DIR})
|
||||
|
||||
set_target_properties(chinese-segmentation PROPERTIES
|
||||
VERSION ${CHINESE_SEGMENTATION_VERSION}
|
||||
SOVERSION ${VERSION_MAJOR}
|
||||
OUTPUT_NAME chinese-segmentation
|
||||
)
|
||||
install(TARGETS chinese-segmentation
|
||||
EXPORT chinese-segmentation
|
||||
PUBLIC_HEADER DESTINATION ${HEADERS_INSTALL_DIR}
|
||||
LIBRARY DESTINATION /usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}
|
||||
)
|
||||
install(EXPORT chinese-segmentation
|
||||
FILE chinese-segmentation-targets.cmake
|
||||
DESTINATION ${CMAKE_CONFIG_INSTALL_DIR})
|
||||
install(FILES ${HEADERS} DESTINATION ${HEADERS_INSTALL_DIR})
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc DESTINATION ${PC_INSTALL_DIR})
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake
|
||||
DESTINATION ${CMAKE_CONFIG_INSTALL_DIR})
|
||||
set(DICT_FILES
|
||||
dict/hmm_model.utf8
|
||||
dict/idf.utf8
|
||||
dict/jieba.dict.utf8
|
||||
dict/stop_words.utf8
|
||||
dict/user.dict.utf8
|
||||
dict/pinyinWithoutTone.txt
|
||||
dict/pos_dict/char_state_tab.utf8
|
||||
dict/pos_dict/prob_emit.utf8
|
||||
dict/pos_dict/prob_start.utf8
|
||||
dict/pos_dict/prob_trans.utf8
|
||||
pinyin4cpp/dict/singleWordPinyin.txt
|
||||
pinyin4cpp/dict/wordsPinyin.txt
|
||||
Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
|
||||
)
|
||||
install(FILES ${DICT_FILES} DESTINATION ${DICT_INSTALL_PATH})
|
||||
|
||||
if (BUILD_TEST)
|
||||
add_subdirectory(test)
|
||||
endif ()
|
||||
|
||||
|
||||
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#include "storage-base.hpp"
|
||||
|
||||
const char * const TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = "/usr/share/ukui-search/res/dict/TraditionalChineseSimplifiedDict.txt";
|
||||
const char * const TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = DICT_INSTALL_PATH"/TraditionalChineseSimplifiedDict.txt";
|
||||
|
||||
class Traditional2SimplifiedTrie : public StorageBase<char, false, CacheFileHeaderBase>
|
||||
{
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
@PACKAGE_INIT@
|
||||
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(Qt@QT_VERSION_MAJOR@Core "@REQUIRED_QT_VERSION@")
|
||||
if(TARGET Qt6::Core)
|
||||
find_dependency(Qt6Core5Compat @REQUIRED_QT_VERSION@)
|
||||
endif()
|
||||
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/chinese-segmentation-targets.cmake")
|
|
@ -24,10 +24,10 @@
|
|||
ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent)
|
||||
{
|
||||
//const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
|
||||
const char * const HMM_PATH = DICT_INSTALL_PATH"/hmm_model.utf8";
|
||||
//const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
//const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
|
||||
const char * const STOP_WORD_PATH = DICT_INSTALL_PATH"/stop_words.utf8";
|
||||
m_jieba = new cppjieba::Jieba(DICT_PATH,
|
||||
HMM_PATH,
|
||||
USER_DICT_PATH,
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
prefix=/usr
|
||||
exec_prefix=${prefix}
|
||||
libdir=${prefix}/lib/@CMAKE_LIBRARY_ARCHITECTURE@
|
||||
includedir=${prefix}/include/chinese-segmentation
|
||||
|
||||
Name: chinese-segmentation
|
||||
Description: Chinese-segmentation header files
|
||||
URL: https://www.ukui.org/
|
||||
Version: @VERSION@
|
||||
Cflags: -I${includedir}
|
||||
Libs: -L${libdir} -lchinese-segmentation
|
|
@ -22,7 +22,7 @@
|
|||
|
||||
#include "storage-base.hpp"
|
||||
|
||||
const char * const IDF_DICT_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
const char * const IDF_DICT_PATH = DICT_INSTALL_PATH"/idf.utf8";
|
||||
|
||||
struct IdfCacheFileHeader : CacheFileHeaderBase
|
||||
{
|
||||
|
|
|
@ -25,8 +25,8 @@
|
|||
|
||||
using namespace cppjieba;
|
||||
|
||||
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
const char * const DICT_PATH = DICT_INSTALL_PATH"/jieba.dict.utf8";
|
||||
const char * const USER_DICT_PATH = DICT_INSTALL_PATH"/user.dict.utf8";
|
||||
|
||||
struct DictCacheFileHeader : CacheFileHeaderBase
|
||||
{
|
||||
|
|
|
@ -43,8 +43,10 @@ HEADERS += \
|
|||
Traditional-to-Simplified.h \
|
||||
pinyin4cpp-common.h \
|
||||
libchinese-segmentation_global.h
|
||||
DICT_INSTALL_PATH = /usr/share/chinese-segmentation/res/dict
|
||||
DEFINES += DICT_INSTALL_PATH='\\"$${DICT_INSTALL_PATH}\\"'
|
||||
|
||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||
dict_files.path = DICT_INSTALL_PATH
|
||||
dict_files.files = $$PWD/dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/*.txt\
|
||||
|
@ -62,12 +64,12 @@ unix {
|
|||
QMAKE_PKGCONFIG_VERSION = $$VERSION
|
||||
QMAKE_PKGCONFIG_LIBDIR = $$target.path
|
||||
QMAKE_PKGCONFIG_DESTDIR = pkgconfig
|
||||
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg
|
||||
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg
|
||||
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-segmentation
|
||||
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-segmentation
|
||||
|
||||
!isEmpty(target.path): INSTALLS += target
|
||||
|
||||
header.path = /usr/include/chinese-seg
|
||||
header.path = /usr/include/chinese-segmentation
|
||||
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h Traditional-to-Simplified.h
|
||||
header.files += development-files/header-files/*
|
||||
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/
|
||||
|
|
|
@ -22,8 +22,8 @@
|
|||
|
||||
#include "storage-base.hpp"
|
||||
|
||||
const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
|
||||
const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
|
||||
const char * const SINGLE_WORD_PINYIN_PATH = DICT_INSTALL_PATH"/singleWordPinyin.txt";
|
||||
const char * const WORDS_PINYIN_PATH = DICT_INSTALL_PATH"/wordsPinyin.txt";
|
||||
|
||||
class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
|
||||
{
|
||||
|
|
|
@ -54,9 +54,9 @@ void StorageBase<ordered, cache_file_header>::Init()
|
|||
template<const bool ordered, typename cache_file_header>
|
||||
string StorageBase<ordered, cache_file_header>::Find(const string &key)
|
||||
{
|
||||
int result = m_double_array_data_trie->exactMatchSearch<int>(key.c_str(), key.size());
|
||||
int result = m_double_array_data_trie->template exactMatchSearch<int>(key.c_str(), key.size());
|
||||
if (result < 0)
|
||||
return string();
|
||||
return {};
|
||||
return string(&m_elements_ptr[result]);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
set(CMAKE_AUTOUIC ON)
|
||||
set(CMAKE_AUTOMOC ON)
|
||||
set(CMAKE_AUTORCC ON)
|
||||
find_package(QT NAMES Qt6 Qt5 COMPONENTS Core Gui Widgets REQUIRED)
|
||||
find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core Gui Widgets REQUIRED)
|
||||
add_executable(test
|
||||
main.cpp
|
||||
mainwindow.cpp
|
||||
mainwindow.h
|
||||
mainwindow.ui
|
||||
)
|
||||
target_include_directories( test PRIVATE
|
||||
../)
|
||||
target_link_libraries(test PRIVATE
|
||||
Qt${QT_VERSION_MAJOR}::Core
|
||||
Qt${QT_VERSION_MAJOR}::Gui
|
||||
Qt${QT_VERSION_MAJOR}::Widgets
|
||||
chinese-segmentation
|
||||
)
|
|
@ -1,8 +1,8 @@
|
|||
#include "mainwindow.h"
|
||||
#include "ui_mainwindow.h"
|
||||
#include <HanZiToPinYin>
|
||||
#include <ChineseSegmentation>
|
||||
#include <Traditional-to-Simplified.h>
|
||||
#include "hanzi-to-pinyin.h"
|
||||
#include "chinese-segmentation.h"
|
||||
#include "Traditional-to-Simplified.h"
|
||||
#include <QMenu>
|
||||
#include <QDebug>
|
||||
#include <QStringList>
|
||||
|
|
Loading…
Reference in New Issue