删除子项目，upstream分支将只作为打包时同步代码的分支

2023-12-28 10:27:40 +08:00 · 2023-12-28 10:27:40 +08:00 · 62b82585a9
parent 8f71e3e7c6
commit 62b82585a9
102 changed files with 1247385 additions and 4 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +0,0 @@
-[submodule "libchinese-segmentation"]
-	path = libchinese-segmentation
-	url = https://gitee.com/openkylin/chinese-segmentation.git
--- a/1
+++ b/1
@ -1 +0,0 @@
-Subproject commit f7aa56a30705c2635b0d4237efb635e8fee5022a
--- a/libchinese-segmentation/CMakeLists.txt
+++ b/libchinese-segmentation/CMakeLists.txt
@ -0,0 +1,169 @@
+cmake_minimum_required(VERSION 3.14)
+project(chinese-segmentation LANGUAGES CXX)
+
+# Build-wide defaults.
+# CMAKE_INCLUDE_CURRENT_DIR adds each directory's own source and binary
+# directory to the include path of the targets defined in it.
+set(CMAKE_INCLUDE_CURRENT_DIR ON)
+# NOTE(review): no CMAKE_CXX_STANDARD is set anywhere in this file, so
+# CMAKE_CXX_STANDARD_REQUIRED only matters if a standard is supplied from
+# elsewhere (command line, toolchain, parent project) - confirm.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Library version, assembled as MAJOR.MINOR.MICRO. It is reused below for
+# the VERSION compile definition, the package version file and the
+# VERSION/SOVERSION target properties.
+set(VERSION_MAJOR 1)
+set(VERSION_MINOR 1)
+set(VERSION_MICRO 0)
+set(CHINESE_SEGMENTATION_VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_MICRO}")
+
+# Locate Qt: the first call accepts either Qt6 or Qt5 (tried in that order)
+# and provides QT_VERSION_MAJOR; the second call loads the matching
+# versioned package so that Qt${QT_VERSION_MAJOR}::Core can be linked.
+find_package(QT NAMES Qt6 Qt5 REQUIRED COMPONENTS Core)
+find_package(Qt${QT_VERSION_MAJOR} REQUIRED COMPONENTS Core)
+
+# Public headers: compiled into the library target and also copied to
+# HEADERS_INSTALL_DIR by the install(FILES ${HEADERS} ...) rule.
+set(HEADERS
+    chinese-segmentation.h
+    common-struct.h
+    hanzi-to-pinyin.h
+    Traditional-to-Simplified.h
+    pinyin4cpp-common.h
+    libchinese-segmentation_global.h
+)
+
+# Implementation sources and private headers of the library.
+# Entries are listed explicitly (no globbing) and grouped by component;
+# the order of the entries is kept stable on purpose.
+set(CHINESE_SEGMENTATION_SRC
+    # Traditional <-> Simplified Chinese conversion
+    Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
+    Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
+    Traditional-to-Simplified.cpp
+    Traditional-to-Simplified-private.h
+
+    # Segmentation front end
+    chinese-segmentation.cpp
+    chinese-segmentation-private.h
+
+    # cppjieba
+    cppjieba/DatTrie.hpp
+    cppjieba/DictTrie.hpp
+    cppjieba/FullSegment.hpp
+    cppjieba/HMMModel.hpp
+    cppjieba/HMMSegment.hpp
+    cppjieba/IdfTrie.hpp
+    cppjieba/Jieba.hpp
+    cppjieba/KeywordExtractor.hpp
+    cppjieba/MPSegment.hpp
+    cppjieba/MixSegment.hpp
+    cppjieba/PinYinTrie.hpp
+    cppjieba/PosTagger.hpp
+    cppjieba/PreFilter.hpp
+    cppjieba/QuerySegment.hpp
+    cppjieba/SegmentBase.hpp
+    cppjieba/SegmentTagged.hpp
+    cppjieba/TextRankExtractor.hpp
+    cppjieba/Unicode.hpp
+    cppjieba/idf-trie/idf-trie.cpp
+    cppjieba/idf-trie/idf-trie.h
+
+    # cppjieba/limonp utility headers
+    cppjieba/limonp/ArgvContext.hpp
+    cppjieba/limonp/BlockingQueue.hpp
+    cppjieba/limonp/BoundedBlockingQueue.hpp
+    cppjieba/limonp/BoundedQueue.hpp
+    cppjieba/limonp/Closure.hpp
+    cppjieba/limonp/Colors.hpp
+    cppjieba/limonp/Condition.hpp
+    cppjieba/limonp/Config.hpp
+    cppjieba/limonp/FileLock.hpp
+    cppjieba/limonp/ForcePublic.hpp
+    cppjieba/limonp/LocalVector.hpp
+    cppjieba/limonp/Logging.hpp
+    cppjieba/limonp/Md5.hpp
+    cppjieba/limonp/MutexLock.hpp
+    cppjieba/limonp/NonCopyable.hpp
+    cppjieba/limonp/StdExtension.hpp
+    cppjieba/limonp/StringUtil.hpp
+    cppjieba/limonp/Thread.hpp
+    cppjieba/limonp/ThreadPool.hpp
+
+    # cppjieba segment trie
+    cppjieba/segment-trie/segment-trie.cpp
+    cppjieba/segment-trie/segment-trie.h
+
+    # Hanzi -> pinyin conversion
+    hanzi-to-pinyin.cpp
+    hanzi-to-pinyin-private.h
+    pinyin4cpp/pinyin4cpp-trie.cpp
+    pinyin4cpp/pinyin4cpp-trie.h
+    pinyin4cpp/pinyin4cpp_dataTrie.cpp
+    pinyin4cpp/pinyin4cpp_dataTrie.h
+    pinyin4cpp/pinyin4cpp_dictTrie.cpp
+    pinyin4cpp/pinyin4cpp_dictTrie.h
+
+    # Trie storage back ends
+    storage-base/cedar/cedar.h
+    storage-base/cedar/cedarpp.h
+    storage-base/darts-clone/darts.h
+    storage-base/storage-base.cpp
+    storage-base/storage-base.h
+    storage-base/storage-base.hpp
+)
+
+# The shared library target. Headers are listed next to the sources so that
+# they belong to the target (e.g. for IDE project views).
+add_library(chinese-segmentation SHARED
+        ${CHINESE_SEGMENTATION_SRC}
+        ${HEADERS}
+        )
+
+# Include paths for the bundled components.
+# This used to be include_directories(chinese-segmentation ...). That command
+# takes no target argument, so the target name was interpreted as one more
+# (relative) include directory and every path was applied to the whole
+# directory scope. The paths are now attached to the target itself.
+# They are wrapped in BUILD_INTERFACE with absolute paths so that:
+#   * in-tree consumers that link this target still see them, and
+#   * no source-tree path leaks into the exported/installed target, which
+#     install(EXPORT) would reject.
+target_include_directories(chinese-segmentation PUBLIC
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base/cedar>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba/limonp>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pinyin4cpp>
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/Traditional-Chinese-Simplified-conversion>
+        )
+
+# Qt Core is linked PUBLIC, so targets linking chinese-segmentation also
+# receive Qt Core's usage requirements.
+target_link_libraries(chinese-segmentation PUBLIC
+    Qt${QT_VERSION_MAJOR}::Core
+)
+
+include(CMakePackageConfigHelpers)
+
+# ---- Install locations ------------------------------------------------------
+# NOTE(review): these are absolute /usr paths, so CMAKE_INSTALL_PREFIX does not
+# influence them; CMAKE_LIBRARY_ARCHITECTURE may also be empty on toolchains
+# without a multiarch triplet - confirm this is intended for packaging.
+set(HEADERS_INSTALL_DIR "/usr/include/chinese-segmentation")
+set(CMAKE_CONFIG_INSTALL_DIR "/usr/share/cmake/chinese-segmentation")
+set(PC_INSTALL_DIR "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}/pkgconfig")
+set(DICT_INSTALL_PATH "/usr/share/chinese-segmentation/res/dict")
+
+# ---- Compile-time configuration --------------------------------------------
+# Directory-scoped definitions: the library version and the installed
+# dictionary location are passed to the compiler as string literals.
+add_compile_definitions(
+    VERSION="${CHINESE_SEGMENTATION_VERSION}"
+    DICT_INSTALL_PATH="${DICT_INSTALL_PATH}"
+)
+
+# ---- Target usage requirements and naming ----------------------------------
+# Consumers of the installed package get the installed header directory.
+target_include_directories(chinese-segmentation PUBLIC
+    $<INSTALL_INTERFACE:${HEADERS_INSTALL_DIR}>
+)
+set_target_properties(chinese-segmentation PROPERTIES
+    VERSION ${CHINESE_SEGMENTATION_VERSION}
+    SOVERSION ${VERSION_MAJOR}
+    OUTPUT_NAME chinese-segmentation
+)
+
+# ---- Generated package files (written to the build directory) --------------
+# CMake package config file.
+configure_package_config_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation-config.cmake.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
+    INSTALL_DESTINATION ${CMAKE_CONFIG_INSTALL_DIR}
+)
+
+# Version file: a requested version is compatible when its major matches.
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake
+    VERSION ${CHINESE_SEGMENTATION_VERSION}
+    COMPATIBILITY SameMajorVersion
+)
+
+# pkg-config file, produced with the same helper as the CMake config file.
+configure_package_config_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation.pc.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
+    INSTALL_DESTINATION ${PC_INSTALL_DIR}
+)
+
+# ---- Install rules ----------------------------------------------------------
+# NOTE(review): no PUBLIC_HEADER property is set on the target in this file,
+# so the PUBLIC_HEADER destination below appears to have nothing to install;
+# the headers are installed by the install(FILES ${HEADERS} ...) rule instead.
+install(
+    TARGETS chinese-segmentation
+    EXPORT chinese-segmentation
+    PUBLIC_HEADER DESTINATION ${HEADERS_INSTALL_DIR}
+    LIBRARY DESTINATION /usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}
+)
+install(
+    EXPORT chinese-segmentation
+    FILE chinese-segmentation-targets.cmake
+    DESTINATION ${CMAKE_CONFIG_INSTALL_DIR}
+)
+install(
+    FILES ${HEADERS}
+    DESTINATION ${HEADERS_INSTALL_DIR}
+)
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc
+    DESTINATION ${PC_INSTALL_DIR}
+)
+install(
+    FILES
+        ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake
+    DESTINATION ${CMAKE_CONFIG_INSTALL_DIR}
+)
+# Dictionary and model data shipped with the library.
+# NOTE(review): install(FILES) is given a single destination, so files taken
+# from dict/pos_dict/, pinyin4cpp/dict/ and
+# Traditional-Chinese-Simplified-conversion/dict/ all land directly in
+# DICT_INSTALL_PATH without their sub-directory - confirm the runtime
+# lookup expects this flat layout.
+set(DICT_FILES
+    # cppjieba segmentation data
+    dict/hmm_model.utf8
+    dict/idf.utf8
+    dict/jieba.dict.utf8
+    dict/stop_words.utf8
+    dict/user.dict.utf8
+    dict/pinyinWithoutTone.txt
+    # part-of-speech tables
+    dict/pos_dict/char_state_tab.utf8
+    dict/pos_dict/prob_emit.utf8
+    dict/pos_dict/prob_start.utf8
+    dict/pos_dict/prob_trans.utf8
+    # pinyin tables
+    pinyin4cpp/dict/singleWordPinyin.txt
+    pinyin4cpp/dict/wordsPinyin.txt
+    # Traditional/Simplified mapping
+    Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
+)
+install(
+    FILES ${DICT_FILES}
+    DESTINATION ${DICT_INSTALL_PATH}
+)
+
+# The test sub-project is only configured when BUILD_TEST evaluates to true
+# (e.g. -DBUILD_TEST=ON); no option() declaring it is visible in this file.
+if(BUILD_TEST)
+    add_subdirectory(test)
+endif()
+
+
+
--- a/libchinese-segmentation/LICENSE
+++ b/libchinese-segmentation/LICENSE
@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
--- a/libchinese-segmentation/README.md
+++ b/libchinese-segmentation/README.md
@ -0,0 +1,170 @@
+# chinese-segmentation
+
+#### 介绍
+libchinese-segmentation工程以单例的形式分别提供了中文分词、汉字转拼音和中文繁体简体转换功能。
+
+接口文件分别为:
+chinese-segmentation.h
+libchinese-segmentation_global.h
+common-struct.h
+
+hanzi-to-pinyin.h
+pinyin4cpp-common.h
+
+Traditional-to-Simplified.h
+安装路径:/usr/include/chinese-seg
+
+#### 使用说明
+
+其中中文分词相关功能由chinese-segmentation.h提供接口，主要包括以下功能函数：
+
+```
+   static ChineseSegmentation *getInstance();//全局单例
+     /**
+     * @brief ChineseSegmentation::callSegment
+     * 调用extractor进行关键词提取，先使用Mix方式初步分词，再使用Idf词典进行关键词提取，只包含两字以上关键词
+     *
+     * @param sentence 要提取关键词的句子
+     * @return vector<KeyWord> 存放提取后关键词的信息的容器
+     */
+    vector<KeyWord> callSegment(const string &sentence);
+    vector<KeyWord> callSegment(QString &sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMixSegmentCutStr
+     * 使用Mix方法进行分词，即先使用最大概率法MP初步分词，再用隐式马尔科夫模型HMM进一步分词，可以准确切出词典已有词和未登录词，结果比较准确
+     *
+     * @param sentence 要分词的句子
+     * @return vector<string> 只存放分词后每个词的内容的容器
+     */
+    vector<string> callMixSegmentCutStr(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMixSegmentCutWord
+     * 和callMixSegmentCutStr功能相同
+     * @param sentence 要分词的句子
+     * @return vector<Word> 存放分词后每个词所有信息的容器
+     */
+    vector<Word> callMixSegmentCutWord(const string& str);
+
+    /**
+     * @brief ChineseSegmentation::lookUpTagOfWord
+     * 查询word的词性
+     * @param word 要查询词性的词
+     * @return string word的词性
+     */
+    string lookUpTagOfWord(const string& word);
+
+    /**
+     * @brief ChineseSegmentation::getTagOfWordsInSentence
+     * 使用Mix分词后获取每个词的词性
+     * @param sentence 要分词的句子
+     * @return vector<pair<string, string>> 分词后的每个词的内容(first)和其对应的词性(second)
+     */
+    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
+
+    /**
+     * @brief ChineseSegmentation::callFullSegment
+     * 使用Full进行分词，Full会切出字典里所有的词。
+     * @param sentence 要分词的句子
+     * @return vector<Word> 存放分词后每个词所有信息的容器
+     */
+    vector<Word> callFullSegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callQuerySegment
+     * 使用Query进行分词，即先使用Mix，对于长词再用Full，结果最精确，但词的数量也最大
+     * @param sentence 要分词的句子
+     * @return vector<Word> 存放分词后每个词所有信息的容器
+     */
+    vector<Word> callQuerySegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callHMMSegment
+     * 使用隐式马尔科夫模型HMM进行分词
+     * @param sentence 要分词的句子
+     * @return vector<Word> 存放分词后每个词所有信息的容器
+     */
+    vector<Word> callHMMSegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMPSegment
+     * 使用最大概率法MP进行分词
+     * @param sentence 要分词的句子
+     * @return vector<Word> 存放分词后每个词所有信息的容器
+     */
+    vector<Word> callMPSegment(const string& sentence);
+
+```
+
+汉字转拼音相关功能由hanzi-to-pinyin.h提供接口，主要包括以下功能函数：
+
+```
+    static HanZiToPinYin * getInstance();//全局单例
+
+    /**
+     * @brief HanZiToPinYin::isMultiTone 判断是否为多音字/词/句
+     * @param word 要判断的字/词/句
+     * @return bool 不是返回false
+     */
+    bool isMultiTone(string &word);
+    bool isMultiTone(string &&word);
+    bool isMultiTone(const string &word);
+    bool isMultiTone(const string &&word);
+
+    /**
+     * @brief HanZiToPinYin::contains 查询某个字/词/句是否有拼音（是否在数据库包含）
+     * @param word 要查询的字/词/句
+     * @return bool 数据库不包含返回false
+     */
+    bool contains(string &word);
+
+    /**
+     * @brief HanZiToPinYin::getResults 获取某个字/词/句的拼音
+     * @param word 要获取拼音的字/词/句
+     * @param results word的拼音列表（有可能多音字），每次调用results会被清空
+     * @return int 获取到返回0，否则返回-1
+     */
+    int getResults(string word, QStringList &results);
+
+    /**
+     * @brief setConfig 设置HanZiToPinYin的各项功能，详见pinyin4cpp-common.h
+     * @param dataStyle 返回数据风格，默认default
+     * @param segType 是否启用分词，默认启用
+     * @param polyphoneType 是否启用多音字，默认不启用
+     * @param processType 无拼音数据处理模式，默认default
+     */
+    void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
+
+```
+
+中文繁体转简体相关功能由Traditional-to-Simplified.h提供接口，主要包括以下功能函数：
+
+```
+    static Traditional2Simplified * getInstance();//全局单例
+    /**
+     * @brief Traditional2Simplified::isTraditional 判断是否为繁体字，是则返回true
+     * @param oneWord 要判断的字
+     * @return bool 不是返回false
+     */
+    bool isTraditional(string &oneWord);
+
+    /**
+     * @brief Traditional2Simplified::getResults 转换某个字/词/句的繁体字
+     * @param words 要转换为简体中文的字/词/句
+     * @return words 的简体中文结果
+     */
+    string getResults(string words);
+
+```
+
+除此之外工程中提供了测试程序位于chinese-segmentation/test，运行界面如下：
+![输入图片说明](https://foruda.gitee.com/images/1682048388802220746/245a2ec3_8021248.png "image.png")
+
+#### 参与贡献
+
+1.  Fork 本仓库
+2.  新建分支
+3.  提交代码
+4.  新建 Pull Request
+
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri
@ -0,0 +1,10 @@
+# Make this directory's headers visible to whichever project includes this .pri.
+INCLUDEPATH += $$PWD
+
+HEADERS += \
+    $$PWD/Traditional2Simplified_trie.h
+
+SOURCES += \
+    $$PWD/Traditional2Simplified_trie.cpp
+
+# Anchored on $$PWD like the entries above, so the dictionary path resolves
+# no matter which directory the including .pro file lives in.
+DISTFILES += \
+    $$PWD/dict/TraditionalChineseSimplifiedDict.txt
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2023, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#include "Traditional2Simplified_trie.h"
+
+// Constructs the trie over the default installed dictionary file and runs Init().
+// dat_cache_path is handed to StorageBase unchanged.
+Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(string dat_cache_path)
+    : StorageBase<char, false, CacheFileHeaderBase>(
+          vector<string>{TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH},
+          dat_cache_path)
+{
+    Init();
+}
+
+// Same as the single-argument constructor, except that the caller supplies the
+// list of source file paths that is forwarded to StorageBase.
+Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(const vector<string> file_paths,
+                                                       string dat_cache_path)
+    : StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
+{
+    Init();
+}
+
+// Returns true when Find() yields a non-empty conversion for `word`,
+// i.e. when `word` is a key of the loaded dictionary.
+bool Traditional2SimplifiedTrie::IsTraditional(const string &word) {
+    return !Find(word).empty();
+}
+
+/**
+ * Builds the cache file for this trie.
+ *
+ * Writes a CacheFileHeaderBase, the dictionary values (via LoadDict) and the
+ * trie array into a temporary file next to dat_cache_file, patches the three
+ * integer fields that follow md5_hex in the header, and finally passes the
+ * temporary file to tryRename() to take the place of dat_cache_file.
+ *
+ * @param dat_cache_file path the finished cache file should end up at
+ * @param md5            digest copied into the header; its length must equal
+ *                       sizeof(CacheFileHeaderBase::md5_hex)
+ */
+void Traditional2SimplifiedTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
+{
+    CacheFileHeaderBase header;
+    assert(sizeof(header.md5_hex) == md5.size());
+    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
+
+    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
+    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+    // umask() is process-wide state: remember the caller's mask so it can be
+    // put back before returning instead of staying changed for the whole process.
+    const auto previous_umask = umask(S_IWGRP | S_IWOTH);
+    // mkstemp() overwrites the XXXXXX suffix in place, so it gets the string's
+    // own writable buffer rather than a cast-away-const pointer.
+    const int fd = mkstemp(&tmp_filepath[0]);
+    assert(fd >= 0);
+    fchmod(fd, 0644);
+
+    // Provisional header; the counters inside it are rewritten further down.
+    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
+
+    // Appends every dictionary value to fd and fills the in-memory trie.
+    this->LoadDict(fd, write_bytes, offset, elements_num);
+
+    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
+
+    // Seek to just past md5_hex and store the final element count, value-area
+    // size and trie size there.
+    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
+    write(fd, &elements_num, sizeof(int));
+    write(fd, &offset, sizeof(int));
+    data_trie_size = this->GetDataTrieSize();
+    write(fd, &data_trie_size, sizeof(int));
+
+    close(fd);
+    assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
+
+    tryRename(tmp_filepath, dat_cache_file);
+
+    // Restored only now so that everything above runs under the same mask as before.
+    umask(previous_umask);
+}
+
+// Exact-match lookup of `key`. Returns the stored value, or an empty string
+// when the search reports a negative result (no match).
+string Traditional2SimplifiedTrie::Find(const string &key)
+{
+    const int element_offset = ExactMatchSearch(key.c_str(), key.size());
+    return element_offset >= 0 ? string(&GetElementPtr()[element_offset])
+                               : string();
+}
+
+/**
+ * Reads the dictionary line by line. Each "key:value" line registers the key
+ * in the trie at the current offset and appends the NUL-terminated value to fd.
+ * Empty lines, lines starting with '#' and lines that do not split into exactly
+ * two fields are skipped.
+ *
+ * @param fd           descriptor the values are written to
+ * @param write_bytes  incremented by the result of every write() call
+ * @param offset       running byte offset of the next value; advanced per entry
+ * @param elements_num incremented once per accepted entry
+ *
+ * NOTE(review): this always opens TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH; the
+ * file_paths given to the two-argument constructor are not consulted here.
+ * Confirm that this is intended.
+ */
+void Traditional2SimplifiedTrie::LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
+{
+    ifstream dict_stream(TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH);
+    vector<string> fields;
+    string entry;
+
+    while (getline(dict_stream, entry)) {
+        const bool skippable = limonp::StartsWith(entry, "#") || entry.empty();
+        if (skippable) {
+            continue;
+        }
+        limonp::Split(entry, fields, ":");
+        if (fields.size() != 2) {
+            continue;
+        }
+        const string &key = fields[0];
+        const string &value = fields[1];
+        Update(key.c_str(), key.size(), offset);
+        offset += (value.size() + 1);
+        ++elements_num;
+        write_bytes += write(fd, value.c_str(), value.size() + 1);
+    }
+}
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2023, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef Traditional2SimplifiedTrie_H
+#define Traditional2SimplifiedTrie_H
+
+#include "storage-base.hpp"
+
+const char * const  TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = DICT_INSTALL_PATH"/TraditionalChineseSimplifiedDict.txt";
+
+// Trie-backed lookup table built from the Traditional/Simplified dictionary file.
+// Storage and cache handling are inherited from StorageBase.
+class Traditional2SimplifiedTrie : public StorageBase<char, false, CacheFileHeaderBase>
+{
+public:
+    // Uses TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH as the only source file.
+    Traditional2SimplifiedTrie(string dat_cache_path = "");
+    // Forwards a caller-supplied list of source file paths to StorageBase.
+    Traditional2SimplifiedTrie(const vector<string> file_paths, string dat_cache_path = "");
+    // Writes the cache file for this trie; md5 is stored in the cache header.
+    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
+    // Exact-match lookup; returns the stored value or an empty string if key is absent.
+    string Find(const string &key);
+    // True when Find(word) returns a non-empty value.
+    bool IsTraditional(const string &word);
+
+private:
+    // Parses the dictionary, fills the trie and appends the values to fd.
+    void LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
+};
+
+#endif // Traditional2SimplifiedTrie_H
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
--- a/libchinese-segmentation/Traditional-to-Simplified-private.h
+++ b/libchinese-segmentation/Traditional-to-Simplified-private.h
@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2023, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#ifndef Traditional2SimplifiedPRIVATE_H
+#define Traditional2SimplifiedPRIVATE_H
+
+#include <QtCore/qglobal.h>
+#include <QHash>
+#include "Traditional-to-Simplified.h"
+#include "Traditional2Simplified_trie.h"
+
+using namespace std;
+
+// Implementation object behind Traditional2Simplified; owns the lookup trie.
+class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2SimplifiedPrivate
+{
+public:
+    // parent is stored in q as a back pointer to the public object.
+    Traditional2SimplifiedPrivate(Traditional2Simplified *parent = nullptr);
+    ~Traditional2SimplifiedPrivate();
+
+public:
+    // True when the trie holds an entry for word.
+    bool isTraditional(string &word) {return m_Traditional2SimplifiedTrie.IsTraditional(word);}
+
+    // Returns words with every character that has a trie entry replaced by that entry.
+    string getResults(string words);
+
+private:
+
+    // Back pointer to the public object; may stay nullptr.
+    Traditional2Simplified *q = nullptr;
+    // Dictionary trie, default-constructed together with this object.
+    Traditional2SimplifiedTrie m_Traditional2SimplifiedTrie;
+};
+#endif // Traditional2SimplifiedPRIVATE_H
--- a/libchinese-segmentation/Traditional-to-Simplified.cpp
+++ b/libchinese-segmentation/Traditional-to-Simplified.cpp
@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2023, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#include <mutex>
+#include <cctype>
+#include "Traditional-to-Simplified.h"
+#include "Traditional-to-Simplified-private.h"
+#include "cppjieba/Unicode.hpp"
+
+Traditional2Simplified * Traditional2Simplified::g_Traditional2SimplifiedManager = nullptr;
+std::once_flag g_Traditional2SimplifiedSingleFlag;
+
+/**
+ * Converts `words` character by character through the trie.
+ *
+ * A character with a trie entry is replaced by that entry; a character without
+ * one is copied through unchanged. Empty input is returned as is.
+ *
+ * @param words text to convert
+ * @return the converted text
+ */
+string Traditional2SimplifiedPrivate::getResults(string words)
+{
+    if (words.empty()) {
+        return words;
+    }
+
+    // Single character: one lookup, falling back to the input itself.
+    if (cppjieba::IsSingleWord(words)) {
+        string converted = m_Traditional2SimplifiedTrie.Find(words);
+        return converted.empty() ? words : converted;
+    }
+
+    // Several characters: decode into runes and look each one up on its own.
+    string results;
+    cppjieba::RuneStrArray runes;
+    cppjieba::DecodeRunesInString(words, runes);
+    for (auto rune = runes.begin(); rune != runes.end(); ++rune) {
+        const string character = cppjieba::GetStringFromRunes(words, rune, rune);
+        const string simplified = m_Traditional2SimplifiedTrie.Find(character);
+        // No entry for this character: keep the original text.
+        results.append(simplified.empty() ? character : simplified);
+    }
+    return results;
+}
+
+// Stores the back pointer to the public object; the trie member is default-constructed.
+Traditional2SimplifiedPrivate::Traditional2SimplifiedPrivate(Traditional2Simplified *parent) : q(parent)
+{
+}
+
+// Nothing to release by hand: q is not owned and the trie is a value member.
+Traditional2SimplifiedPrivate::~Traditional2SimplifiedPrivate()
+{
+}
+
+// Returns the shared instance, creating it on the first call. std::call_once
+// guarantees the creation step runs a single time.
+Traditional2Simplified * Traditional2Simplified::getInstance()
+{
+    auto createInstance = []() {
+        g_Traditional2SimplifiedManager = new Traditional2Simplified;
+    };
+    std::call_once(g_Traditional2SimplifiedSingleFlag, createInstance);
+    return g_Traditional2SimplifiedManager;
+}
+
+// Forwards to the private implementation: true when oneWord has a dictionary entry.
+bool Traditional2Simplified::isTraditional(string &oneWord)
+{
+    return d->isTraditional(oneWord);
+}
+
+// Forwards to the private implementation and returns the converted text.
+string Traditional2Simplified::getResults(string words)
+{
+    return d->getResults(words);
+}
+
+// Creates the private implementation object, which in turn builds the lookup trie.
+Traditional2Simplified::Traditional2Simplified() : d(new Traditional2SimplifiedPrivate)
+{
+}
+
+// The header declares this destructor, so it needs a definition; it releases
+// the implementation object allocated by the constructor.
+Traditional2Simplified::~Traditional2Simplified()
+{
+    delete d;
+    d = nullptr;
+}
--- a/libchinese-segmentation/Traditional-to-Simplified.h
+++ b/libchinese-segmentation/Traditional-to-Simplified.h
@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2023, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#ifndef Traditional2Simplified_H
+#define Traditional2Simplified_H
+
+#include <QtCore/qglobal.h>
+#include <string>
+#define TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Q_DECL_IMPORT
+
+using namespace std;
+
+class Traditional2SimplifiedPrivate;
+// Public singleton facade for Traditional-to-Simplified Chinese conversion.
+// All work is delegated to Traditional2SimplifiedPrivate through d.
+class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2Simplified
+{
+public:
+    // Returns the process-wide instance.
+    static Traditional2Simplified * getInstance();
+
+public:
+    /**
+     * @brief Traditional2Simplified::isTraditional Checks whether a character is a Traditional Chinese character
+     * @param oneWord the character to check
+     * @return bool true if it is Traditional, false otherwise
+     */
+    bool isTraditional(string &oneWord);
+
+    /**
+     * @brief Traditional2Simplified::getResults Converts the Traditional characters of a character/word/sentence
+     * @param words the character/word/sentence to convert to Simplified Chinese
+     * @return the Simplified Chinese result for words
+     */
+    string getResults(string words);
+
+protected:
+    // Construction and destruction are restricted; copying is disabled.
+    Traditional2Simplified();
+    ~Traditional2Simplified();
+    Traditional2Simplified(const Traditional2Simplified&) = delete;
+    Traditional2Simplified& operator =(const Traditional2Simplified&) = delete;
+private:
+    // Instance handed out by getInstance().
+    static Traditional2Simplified *g_Traditional2SimplifiedManager;
+    // Private implementation object.
+    Traditional2SimplifiedPrivate *d = nullptr;
+};
+
+#endif // Traditional2Simplified_H
--- a/libchinese-segmentation/chinese-segmentation-config.cmake.in
+++ b/libchinese-segmentation/chinese-segmentation-config.cmake.in
@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+
+# find_dependency() is provided by the CMakeFindDependencyMacro module.
+include(CMakeFindDependencyMacro)
+# Locate the Core package of the Qt major version substituted in at configure time.
+# NOTE(review): the visible part of CMakeLists.txt does not set REQUIRED_QT_VERSION;
+# confirm it is defined, otherwise the version argument expands to nothing.
+find_dependency(Qt@QT_VERSION_MAJOR@Core "@REQUIRED_QT_VERSION@")
+# NOTE(review): with Qt 6 this makes Qt6Core5Compat a hard dependency for every
+# consumer; confirm the library really links against it.
+if(TARGET Qt6::Core)
+    find_dependency(Qt6Core5Compat @REQUIRED_QT_VERSION@)
+endif()
+
+# Pull in the targets file that sits next to this config file.
+include("${CMAKE_CURRENT_LIST_DIR}/chinese-segmentation-targets.cmake")
--- a/libchinese-segmentation/chinese-segmentation-private.h
+++ b/libchinese-segmentation/chinese-segmentation-private.h
@ -0,0 +1,34 @@
+#ifndef CHINESESEGMENTATIONPRIVATE_H
+#define CHINESESEGMENTATIONPRIVATE_H
+
+#include "chinese-segmentation.h"
+#include "cppjieba/Jieba.hpp"
+#include "cppjieba/KeywordExtractor.hpp"
+
+// Implementation object behind ChineseSegmentation; owns the cppjieba::Jieba instance.
+// The member functions carry the same names as the public API described in README.md.
+class ChineseSegmentationPrivate
+{
+public:
+    // parent is kept in q as a back pointer to the public object.
+    explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
+    // Deletes the Jieba instance.
+    ~ChineseSegmentationPrivate();
+    // Keyword extraction for a sentence, from a std::string or a QString.
+    vector<KeyWord> callSegment(const string& sentence);
+    vector<KeyWord> callSegment(QString& sentence);
+
+    // Mix segmentation returning only the text of each word.
+    vector<string> callMixSegmentCutStr(const string& sentence);
+    // Mix segmentation returning a Word record per word.
+    vector<Word> callMixSegmentCutWord(const string& sentence);
+    // Part-of-speech tag of a single word.
+    string lookUpTagOfWord(const string& word);
+    // (word, tag) pairs for every word of the segmented sentence.
+    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
+
+    // Full segmentation.
+    vector<Word> callFullSegment(const string& sentence);
+
+    // Query segmentation.
+    vector<Word> callQuerySegment(const string& sentence);
+
+    // HMM segmentation.
+    vector<Word> callHMMSegment(const string& sentence);
+
+    // MP (maximum probability) segmentation.
+    vector<Word> callMPSegment(const string& sentence);
+
+private:
+    // Created in the constructor, deleted in the destructor.
+    cppjieba::Jieba *m_jieba;
+    // Back pointer to the public object; may stay nullptr.
+    ChineseSegmentation *q = nullptr;
+};
+
+#endif // CHINESESEGMENTATIONPRIVATE_H
--- a/libchinese-segmentation/chinese-segmentation.cpp
+++ b/libchinese-segmentation/chinese-segmentation.cpp
@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2020, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: zhangzihao <zhangzihao@kylinos.cn>
+ * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
+ *
+ */
+#include "chinese-segmentation.h"
+#include "chinese-segmentation-private.h"
+
+/**
+ * Stores the back pointer and creates the Jieba engine.
+ *
+ * The HMM model and the stop-word list are addressed relative to
+ * DICT_INSTALL_PATH; the main, user and IDF dictionary locations come from the
+ * DICT_PATH, USER_DICT_PATH and IDF_DICT_PATH macros. The last argument is an
+ * empty string.
+ */
+ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent)
+    : q(parent)
+{
+    m_jieba = new cppjieba::Jieba(DICT_PATH,
+                                  DICT_INSTALL_PATH"/hmm_model.utf8",
+                                  USER_DICT_PATH,
+                                  IDF_DICT_PATH,
+                                  DICT_INSTALL_PATH"/stop_words.utf8",
+                                  "");
+}
+
+/** Releases the Jieba engine created by the constructor. */
+ChineseSegmentationPrivate::~ChineseSegmentationPrivate()
+{
+    // Deleting a null pointer is a no-op, so no explicit check is needed.
+    delete m_jieba;
+    m_jieba = nullptr;
+}
+
+/**
+ * Extracts keywords from sentence with the Jieba keyword extractor.
+ * The top-k argument is size_t(-1), i.e. the largest size_t value.
+ */
+vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence)
+{
+    vector<KeyWord> keywords;
+    m_jieba->extractor.Extract(sentence, keywords, static_cast<size_t>(-1));
+    return keywords;
+}
+
+/**
+ * Extracts keywords from a QString.
+ *
+ * The caller's string is modified in place: tabs become one space, and the
+ * full-width comma (UTF-8 EF BC 8C) and full-width period (UTF-8 E3 80 82) each
+ * become three spaces so that the offset information is preserved. At most the
+ * first 20480000 characters are passed to the extractor.
+ */
+vector<KeyWord> ChineseSegmentationPrivate::callSegment(QString &sentence)
+{
+    sentence.replace("\t", " ");
+    sentence.replace("\xEF\xBC\x8C", "   ");
+    sentence.replace("\xE3\x80\x82", "   ");
+
+    vector<KeyWord> keywords;
+    m_jieba->extractor.Extract(sentence.left(20480000).toStdString(),
+                               keywords,
+                               static_cast<size_t>(-1));
+    return keywords;
+}
+
+/** Segments sentence with Jieba::Cut and returns the pieces as plain strings. */
+vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
+{
+    vector<string> pieces;
+    m_jieba->Cut(sentence, pieces);
+    return pieces;
+}
+
+/** Segments sentence with Jieba::Cut and returns the pieces as Word records. */
+vector<Word> ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence)
+{
+    vector<Word> pieces;
+    m_jieba->Cut(sentence, pieces);
+    return pieces;
+}
+
+/** Returns the tag Jieba::LookupTag reports for word. */
+string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word)
+{
+    string tag = m_jieba->LookupTag(word);
+    return tag;
+}
+
+/** Runs Jieba::Tag on sentence and returns the resulting (word, tag) pairs. */
+vector<pair<string, string>> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence)
+{
+    vector<pair<string, string>> taggedWords;
+    m_jieba->Tag(sentence, taggedWords);
+    return taggedWords;
+}
+
+/** Segments sentence with Jieba::CutAll. */
+vector<Word> ChineseSegmentationPrivate::callFullSegment(const string &sentence)
+{
+    vector<Word> pieces;
+    m_jieba->CutAll(sentence, pieces);
+    return pieces;
+}
+
+/** Segments sentence with Jieba::CutForSearch. */
+vector<Word> ChineseSegmentationPrivate::callQuerySegment(const string &sentence)
+{
+    vector<Word> pieces;
+    m_jieba->CutForSearch(sentence, pieces);
+    return pieces;
+}
+
+/** Segments sentence with Jieba::CutHMM. */
+vector<Word> ChineseSegmentationPrivate::callHMMSegment(const string &sentence)
+{
+    vector<Word> pieces;
+    m_jieba->CutHMM(sentence, pieces);
+    return pieces;
+}
+
+/** Segments sentence with Jieba::CutSmall, using a maximum word length of 512. */
+vector<Word> ChineseSegmentationPrivate::callMPSegment(const string &sentence)
+{
+    vector<Word> pieces;
+    size_t wordLengthLimit = 512;
+    m_jieba->CutSmall(sentence, pieces, wordLengthLimit);
+    return pieces;
+}
+
+/**
+ * Returns the process-wide ChineseSegmentation object.
+ * It is created on first use and never deleted.
+ */
+ChineseSegmentation *ChineseSegmentation::getInstance()
+{
+    static ChineseSegmentation * const instance = new ChineseSegmentation;
+    return instance;
+}
+
+/** Forwards keyword extraction for a std::string to the private implementation. */
+vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
+{
+    vector<KeyWord> keywords = d->callSegment(sentence);
+    return keywords;
+}
+
+/**
+ * Forwards keyword extraction for a QString to the private implementation.
+ * The private implementation modifies sentence in place.
+ */
+vector<KeyWord> ChineseSegmentation::callSegment(QString &sentence)
+{
+    vector<KeyWord> keywords = d->callSegment(sentence);
+    return keywords;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callMixSegmentCutStr. */
+vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
+{
+    vector<string> pieces = d->callMixSegmentCutStr(sentence);
+    return pieces;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callMixSegmentCutWord. */
+vector<Word> ChineseSegmentation::callMixSegmentCutWord(const string &str)
+{
+    vector<Word> pieces = d->callMixSegmentCutWord(str);
+    return pieces;
+}
+
+/** Forwards to ChineseSegmentationPrivate::lookUpTagOfWord. */
+string ChineseSegmentation::lookUpTagOfWord(const string &word)
+{
+    string tag = d->lookUpTagOfWord(word);
+    return tag;
+}
+
+/** Forwards to ChineseSegmentationPrivate::getTagOfWordsInSentence. */
+vector<pair<string, string> > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence)
+{
+    vector<pair<string, string> > taggedWords = d->getTagOfWordsInSentence(sentence);
+    return taggedWords;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callFullSegment. */
+vector<Word> ChineseSegmentation::callFullSegment(const string &sentence)
+{
+    vector<Word> pieces = d->callFullSegment(sentence);
+    return pieces;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callQuerySegment. */
+vector<Word> ChineseSegmentation::callQuerySegment(const string &sentence)
+{
+    vector<Word> pieces = d->callQuerySegment(sentence);
+    return pieces;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callHMMSegment. */
+vector<Word> ChineseSegmentation::callHMMSegment(const string &sentence)
+{
+    vector<Word> pieces = d->callHMMSegment(sentence);
+    return pieces;
+}
+
+/** Forwards to ChineseSegmentationPrivate::callMPSegment. */
+vector<Word> ChineseSegmentation::callMPSegment(const string &sentence)
+{
+    vector<Word> pieces = d->callMPSegment(sentence);
+    return pieces;
+}
+
+/**
+ * Creates the private implementation. No parent is passed, so the private
+ * object's back pointer stays null.
+ */
+ChineseSegmentation::ChineseSegmentation()
+    : d(new ChineseSegmentationPrivate)
+{
+}
--- a/libchinese-segmentation/chinese-segmentation.h
+++ b/libchinese-segmentation/chinese-segmentation.h
@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2020, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: zhangzihao <zhangzihao@kylinos.cn>
+ * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
+ *
+ */
+#ifndef CHINESESEGMENTATION_H
+#define CHINESESEGMENTATION_H
+
+#include <QString>
+#include "libchinese-segmentation_global.h"
+#include "common-struct.h"
+
+class ChineseSegmentationPrivate;
+/**
+ * Public, singleton-style facade of the Chinese segmentation library.
+ * Obtain the object through getInstance(); every call is forwarded to the
+ * private implementation held in d.
+ */
+class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
+public:
+    static ChineseSegmentation *getInstance();
+
+    /**
+     * @brief ChineseSegmentation::callSegment
+     * Runs the extractor for keyword extraction: the text is first segmented with the Mix method,
+     * then keywords are extracted with the IDF dictionary; only keywords of two or more characters are included.
+     *
+     * @param sentence the sentence to extract keywords from
+     * @return vector<KeyWord> container holding the information of the extracted keywords
+     */
+    vector<KeyWord> callSegment(const string &sentence);
+    vector<KeyWord> callSegment(QString &sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMixSegmentCutStr
+     * Segments with the Mix method: a first pass with the maximum-probability method (MP), then a refinement
+     * with the hidden Markov model (HMM). Cuts both dictionary words and out-of-vocabulary words accurately.
+     *
+     * @param sentence the sentence to segment
+     * @return vector<string> container holding only the text of each segmented word
+     */
+    vector<string> callMixSegmentCutStr(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMixSegmentCutWord
+     * Same functionality as callMixSegmentCutStr.
+     * @param sentence the sentence to segment
+     * @return vector<Word> container holding all information of each segmented word
+     */
+    vector<Word> callMixSegmentCutWord(const string& str);
+
+    /**
+     * @brief ChineseSegmentation::lookUpTagOfWord
+     * Looks up the part of speech of a word.
+     * @param word the word whose part of speech is queried
+     * @return string the part of speech of word
+     */
+    string lookUpTagOfWord(const string& word);
+
+    /**
+     * @brief ChineseSegmentation::getTagOfWordsInSentence
+     * Segments with Mix and returns the part of speech of every word.
+     * @param sentence the sentence to segment
+     * @return vector<pair<string, string>> the text of each segmented word (first) and its part of speech (second)
+     */
+    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
+
+    /**
+     * @brief ChineseSegmentation::callFullSegment
+     * Segments with Full, which cuts out every word found in the dictionary.
+     * @param sentence the sentence to segment
+     * @return vector<Word> container holding all information of each segmented word
+     */
+    vector<Word> callFullSegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callQuerySegment
+     * Segments with Query: Mix first, then Full on long words. Most precise result, but also the largest number of words.
+     * @param sentence the sentence to segment
+     * @return vector<Word> container holding all information of each segmented word
+     */
+    vector<Word> callQuerySegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callHMMSegment
+     * Segments with the hidden Markov model (HMM).
+     * @param sentence the sentence to segment
+     * @return vector<Word> container holding all information of each segmented word
+     */
+    vector<Word> callHMMSegment(const string& sentence);
+
+    /**
+     * @brief ChineseSegmentation::callMPSegment
+     * Segments with the maximum-probability method (MP).
+     * @param sentence the sentence to segment
+     * @return vector<Word> container holding all information of each segmented word
+     */
+    vector<Word> callMPSegment(const string& sentence);
+
+private:
+    // Construction, destruction and copying are restricted; use getInstance().
+    explicit ChineseSegmentation();
+    ~ChineseSegmentation() = default;
+    ChineseSegmentation(const ChineseSegmentation&) = delete;
+    ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
+
+private:
+    // Private implementation; allocated in the constructor.
+    ChineseSegmentationPrivate *d = nullptr;
+};
+
+#endif // CHINESESEGMENTATION_H
--- a/libchinese-segmentation/chinese-segmentation.pc.in
+++ b/libchinese-segmentation/chinese-segmentation.pc.in
@ -0,0 +1,11 @@
+# pkg-config metadata template for chinese-segmentation. The @...@ placeholders
+# are substituted when this template is configured.
+# NOTE(review): the ${...} references below are pkg-config variables; confirm the
+# template is configured in a way that leaves them untouched (e.g. @ONLY).
+prefix=/usr
+exec_prefix=${prefix}
+libdir=${prefix}/lib/@CMAKE_LIBRARY_ARCHITECTURE@
+includedir=${prefix}/include/chinese-segmentation
+
+Name: chinese-segmentation
+Description: Chinese-segmentation header files
+URL: https://www.ukui.org/
+Version: @VERSION@
+Cflags: -I${includedir}
+Libs: -L${libdir} -lchinese-segmentation
--- a/libchinese-segmentation/common-struct.h
+++ b/libchinese-segmentation/common-struct.h
@ -0,0 +1,52 @@
+#ifndef COMMONSTRUCT_H
+#define COMMONSTRUCT_H
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+/**
+ * @brief The KeyWord struct
+ *
+ * @property word the content of keyword
+ * @property offsets the Unicode offsets, can be used to check the word pos in a sentence
+ * @property weight the weight of the keyword
+ */
+
+struct KeyWord {
+    string word;             // content of the keyword
+    vector<size_t> offsets;  // Unicode offsets of the keyword inside the sentence
+    double weight;           // weight of the keyword (not initialised by default)
+    // Deliberately no user-declared destructor: the members release their own
+    // storage, and declaring one would suppress the implicit move constructor
+    // and move assignment, forcing containers of KeyWord to copy instead.
+};
+
+/**
+ * @brief The Word struct
+ *
+ * @property word the content of word
+ * @property offset the offset of the word(absolute pos, Chinese 3 , English 1)， can be used to check the word pos in a sentence
+ * @property unicode_offset the Unicode offset of the word
+ * @property unicode_length the Unicode length of the word
+ */
+struct Word {
+    string word;              // content of the word
+    uint32_t offset;          // absolute offset of the word in the sentence
+    uint32_t unicode_offset;  // Unicode offset of the word
+    uint32_t unicode_length;  // Unicode length of the word
+    // Two-argument form: the Unicode fields are not supplied by the caller, so
+    // they are zero-initialised instead of being left indeterminate.
+    Word(const string& w, uint32_t o)
+        : word(w), offset(o), unicode_offset(0), unicode_length(0) {
+    }
+    Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
+        : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+    }
+    // Deliberately no user-declared destructor: it would suppress the implicit
+    // move operations and force containers of Word to copy their elements.
+}; // struct Word
+
+#endif // COMMONSTRUCT_H
--- a/libchinese-segmentation/cppjieba/DatTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DatTrie.hpp
@ -0,0 +1,641 @@
+#pragma once
+
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <QDebug>
+
+#include <algorithm>
+#include <utility>
+
+#include "limonp/Md5.hpp"
+#include "Unicode.hpp"
+//#define USE_DARTS_CLONE
+#ifdef USE_DARTS_CLONE
+#include "../storage-base/darts-clone/darts.h"
+#else
+#include "../storage-base/cedar/cedar.h"
+#endif
+
+namespace cppjieba {
+
+using std::pair;
+
+// One dictionary entry as loaded into memory before the trie is built.
+struct DatElement {
+    string word;
+    string tag;
+    double weight = 0;
+
+    // Orders by word ascending; entries with the same word are ordered by
+    // weight descending.
+    bool operator < (const DatElement & b) const {
+        return (word != b.word) ? (word < b.word) : (weight > b.weight);
+    }
+};
+
+// One IDF dictionary entry as loaded into memory before the trie is built.
+struct IdfElement {
+    string word;
+    double idf = 0;
+
+    // Orders by word ascending; entries with the same word are ordered by idf
+    // descending.
+    bool operator < (const IdfElement & b) const {
+        return (word != b.word) ? (word < b.word) : (idf > b.idf);
+    }
+};
+
+// One pinyin dictionary entry (word plus its tag string) as loaded into memory.
+struct PinYinElement
+{
+    string word;
+    string tag;
+
+    // Orders two pinyin entries by word. This overload was missing: the only
+    // comparison available took a DatElement, so PinYinElement could not be
+    // compared with itself (for example by std::sort).
+    bool operator < (const PinYinElement & b) const {
+        return this->word < b.word;
+    }
+
+    // Original overload, kept so that existing comparisons against a
+    // DatElement keep compiling.
+    bool operator < (const DatElement & b) const {
+        return this->word < b.word;
+    }
+};
+
+// Streams a DatElement as "word=<word>/tag=<tag>/weight=<weight>".
+inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
+    os << "word=" << elem.word;
+    os << "/tag=" << elem.tag;
+    os << "/weight=" << elem.weight;
+    return os;
+}
+
+// Fixed-size pinyin record: a NUL-terminated tag of at most 5 characters.
+struct PinYinMemElem {
+    char tag[6] = {};
+
+    // Stores str in tag, truncated to sizeof(tag) - 1 characters; the rest of
+    // the buffer is zero-filled, so the result is always NUL-terminated.
+    void SetTag(const string & str) {
+        const size_t capacity = sizeof(tag) - 1;
+        const size_t copy_len = (str.size() < capacity) ? str.size() : capacity;
+        memset(tag, 0, sizeof(tag));
+        strncpy(tag, str.c_str(), copy_len);
+    }
+
+    // Returns the stored tag as a string.
+    string GetTag() const {
+        return string(tag);
+    }
+};
+
+// Streams a DatMemElem as "/tag=<tag>/weight=<weight>".
+inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
+    os << "/tag=" << elem.GetTag();
+    os << "/weight=" << elem.weight;
+    return os;
+}
+// Selects the double-array trie backend used as JiebaDAT: Darts::DoubleArray
+// when USE_DARTS_CLONE is defined, otherwise the cedar implementation.
+#ifdef USE_DARTS_CLONE
+typedef Darts::DoubleArray JiebaDAT;
+#else
+typedef cedar::da<int, -1, -2, false> JiebaDAT;
+#endif
+
+
+// Header stored at the start of a trie cache file. It is written and read back
+// as raw bytes, so the field order and types define the on-disk layout.
+struct CacheFileHeader {
+    char md5_hex[32] = {};      // md5 string the cache was built for (compared on attach)
+    double min_weight = 0;      // minimum weight stored with the cache
+    uint32_t elements_num = 0;  // number of element records following the header
+    uint32_t dat_size = 0;      // size reported by the trie (passed to set_array on attach)
+};
+
+// Compile-time layout checks for the cache file format: a DatMemElem record must
+// be exactly 16 bytes, and the header size must be a multiple of the record size.
+static_assert(sizeof(DatMemElem) == 16, "DatMemElem length invalid");
+static_assert((sizeof(CacheFileHeader) % sizeof(DatMemElem)) == 0, "DatMemElem CacheFileHeader length equal");
+
+
+class DatTrie {
+public:
+    DatTrie() {}
+
+    // Releases the cache mapping and its file descriptor, if any. The calls are
+    // guarded so that an object that never attached a cache (or whose mmap
+    // failed) does not hand invalid arguments to munmap()/close().
+    ~DatTrie() {
+        if ((nullptr != mmap_addr_) && (MAP_FAILED != mmap_addr_)) {
+            ::munmap(mmap_addr_, mmap_length_);
+        }
+        mmap_addr_ = nullptr;
+        mmap_length_ = 0;
+
+        if (mmap_fd_ >= 0) {
+            ::close(mmap_fd_);
+        }
+        mmap_fd_ = -1;
+    }
+
+    /**
+     * Exact-match lookup of key.
+     * @return pointer to the DatMemElem record of key, or nullptr when key is
+     *         not in the trie or its stored index lies outside the record table.
+     */
+    const DatMemElem * Find(const string & key) const {
+#ifdef USE_DARTS_CLONE
+        JiebaDAT::result_pair_type find_result;
+        dat_.exactMatchSearch(key.c_str(), find_result);
+
+        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
+            return nullptr;
+        }
+
+        return &elements_ptr_[ find_result.value ];
+#else
+        const int result = dat_.exactMatchSearch<int>(key.c_str());
+        // A negative value means "not found". An index past the record table
+        // would read outside the mapped cache, so it is rejected exactly like
+        // the darts branch and the range lookups of this class do.
+        if ((result < 0) || ((size_t)result >= elements_num_)) {
+            return nullptr;
+        }
+        return &elements_ptr_[result];
+#endif
+    }
+
+    /**
+     * Exact-match lookup in an IDF trie.
+     * length and node_pos are passed through to the trie's exactMatchSearch.
+     * @return the stored double for key, or -1 when key is not in the trie or
+     *         its stored index lies outside the record table.
+     */
+    const double Find(const string & key, std::size_t length, std::size_t node_pos) const {
+#ifdef USE_DARTS_CLONE
+        JiebaDAT::result_pair_type find_result;
+        dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
+
+        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
+            return -1;
+        }
+
+        return idf_elements_ptr_[ find_result.value ];
+#else
+        const int result = dat_.exactMatchSearch<int>(key.c_str(), length, node_pos);
+        // Same validity rule as the darts branch: negative means "not found",
+        // and an index past the record table must not be dereferenced.
+        if ((result < 0) || ((size_t)result >= elements_num_)) {
+            return -1;
+        }
+        return idf_elements_ptr_[result];
+#endif
+    }
+
+    /**
+     * Exact-match lookup in a pinyin trie.
+     * @return pointer to the PinYinMemElem record of key, or nullptr when key
+     *         is not in the trie or its stored index lies outside the record table.
+     */
+    const PinYinMemElem * PinYinFind(const string & key) const {
+#ifdef USE_DARTS_CLONE
+        JiebaDAT::result_pair_type find_result;
+        dat_.exactMatchSearch(key.c_str(), find_result);
+
+        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
+            return nullptr;
+        }
+
+        return &pinyin_elements_ptr_[ find_result.value ];
+#else
+        const int result = dat_.exactMatchSearch<int>(key.c_str());
+        // Same validity rule as the darts branch: negative means "not found",
+        // and an index past the record table must not be dereferenced.
+        if ((result < 0) || ((size_t)result >= elements_num_)) {
+            return nullptr;
+        }
+        return &pinyin_elements_ptr_[result];
+#endif
+    }
+
+    /**
+     * Builds the word graph for the runes in [begin, end).
+     *
+     * res is cleared and resized to one entry per rune. For rune i,
+     * res[i].nexts[0] is always the single-rune step (i + 1), carrying the
+     * dictionary record of that rune when there is one and nullptr otherwise;
+     * every longer dictionary match of at most max_word_len runes is appended
+     * as (i + char_num, record).
+     */
+    void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res, size_t max_word_len) const {
+
+        const size_t rune_count = size_t(end - begin);
+        res.clear();
+        res.resize(rune_count);
+
+        string text_str;
+        EncodeRunesToString(begin, end, text_str);
+
+        static const size_t max_num = 128;
+        JiebaDAT::result_pair_type result_pairs[max_num] = {};
+
+        for (size_t i = 0, begin_pos = 0; i < rune_count; i++) {
+
+            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+            // The search reports the total number of matches, which may exceed
+            // the capacity of result_pairs; only the stored ones may be read.
+            num_results = std::min(num_results, max_num);
+
+            res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
+
+            for (std::size_t idx = 0; idx < num_results; ++idx) {
+                auto & match = result_pairs[idx];
+
+                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
+                    continue;
+                }
+
+                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+
+                if (char_num > max_word_len) {
+                    continue;
+                }
+
+                auto pValue = &elements_ptr_[match.value];
+
+                if (1 == char_num) {
+                    // Single-rune match: attach the record to the default edge.
+                    res[i].nexts[0].second = pValue;
+                    continue;
+                }
+
+                res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
+            }
+
+            // Advance to the byte position of the next rune.
+            begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
+        }
+    }
+
+    /*
+    void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res, size_t max_word_len) const {
+
+        res.clear();
+        res.resize(end - begin);
+
+        string text_str;
+        EncodeRunesToString(begin, end, text_str);
+
+        static const size_t max_num = 128;
+        JiebaDAT::result_pair_type result_pairs[max_num] = {};
+
+        size_t str_size = end - begin;
+        for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
+
+            begin_pos -= (end - i - 1)->len;
+            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+            res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - i, nullptr));
+
+            for (std::size_t idx = 0; idx < num_results; ++idx) {
+                auto & match = result_pairs[idx];
+                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
+                    continue;
+                }
+
+                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+
+                if (char_num > max_word_len) {
+                    continue;
+                }
+
+                auto pValue = &elements_ptr_[match.value];
+
+                if (1 == char_num) {
+                    res[str_size - i - 1].nexts[0].second = pValue;
+                    continue;
+                }
+
+                res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
+            }
+        }
+    }*/
+
+    /**
+     * Segments the runes in [begin, end) along the maximum-weight path and
+     * appends one WordRange per resulting word to words.
+     *
+     * Positions are processed from the last rune to the first. For every
+     * position the best continuation is chosen among the dictionary matches of
+     * at most max_word_len runes; a position without any usable match becomes a
+     * single-rune word weighted with min_weight_.
+     */
+    void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<WordRange>& words, size_t max_word_len) const {
+
+        const size_t str_size = size_t(end - begin);
+        if (0 == str_size) {
+            return;
+        }
+
+        string text_str;
+        EncodeRunesToString(begin, end, text_str);
+
+        static const size_t max_num = 128;
+        JiebaDAT::result_pair_type result_pairs[max_num] = {};  // dictionary lookup results
+
+        // Heap-allocated tables (instead of stack variable-length arrays) so that
+        // long inputs cannot overflow the stack.
+        vector<double> max_weight(str_size, -3.14e+100);  // best path weight starting at each position
+        vector<size_t> max_next(str_size, 0);             // start of the following word; 0 means "not decided yet"
+
+        for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
+            const size_t pos = str_size - i - 1;  // walk backwards
+            begin_pos -= (end - i - 1)->len;
+
+            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+            // The search reports the total number of matches, which may exceed
+            // the capacity of result_pairs; only the stored ones may be read.
+            num_results = std::min(num_results, max_num);
+
+            for (std::size_t idx = 0; idx < num_results; ++idx) {
+                auto & match = result_pairs[idx];
+                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
+                    continue;
+                }
+                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+                if ((0 == char_num) || (char_num > max_word_len)) {
+                    continue;
+                }
+                const size_t next = pos + char_num;
+                if (next > str_size) {
+                    continue;
+                }
+
+                double val = elements_ptr_[match.value].weight;
+                if (next < str_size) {
+                    val += max_weight[next];
+                }
+                if (val > max_weight[pos]) {
+                    max_weight[pos] = val;
+                    max_next[pos] = next;
+                }
+            }
+
+            // No usable dictionary match (none found, or all filtered out above):
+            // treat the rune as a word of its own so that every position has a
+            // valid successor.
+            if (0 == max_next[pos]) {
+                double val = min_weight_;
+                if (pos + 1 < str_size) {
+                    val += max_weight[pos + 1];
+                }
+                max_weight[pos] = val;
+                max_next[pos] = pos + 1;
+            }
+        }
+
+        // Follow the successor links from the first rune to emit the words.
+        for (size_t i = 0; i < str_size;) {
+            const size_t next = max_next[i];
+            assert(next > i);
+            assert(next <= str_size);
+            WordRange wr(begin + i, begin + next - 1);
+            words.push_back(wr);
+            i = next;
+        }
+    }
+    // Returns the minimum weight currently stored in this trie object.
+    double GetMinWeight() const {
+        return this->min_weight_;
+    }
+
+    // Stores d as the minimum weight; the value is written into the header of
+    // a cache built afterwards.
+    void SetMinWeight(double d) {
+        this->min_weight_ = d;
+    }
+
+    // Builds the cache file for a word dictionary and then attaches to it.
+    // Returns the result of the attach step.
+    bool InitBuildDat(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
+        this->BuildDatCache(elements, dat_cache_file, md5);
+        const bool attached = this->InitAttachDat(dat_cache_file, md5);
+        return attached;
+    }
+
+    // Builds the cache file for an IDF dictionary and then attaches to it.
+    // Returns the result of the attach step.
+    bool InitBuildDat(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
+        this->BuildDatCache(elements, dat_cache_file, md5);
+        const bool attached = this->InitIdfAttachDat(dat_cache_file, md5);
+        return attached;
+    }
+
+    // Builds the cache file for a pinyin dictionary and then attaches to it.
+    // Returns the result of the attach step.
+    bool InitBuildDat(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
+        this->BuildDatCache(elements, dat_cache_file, md5);
+        const bool attached = this->InitPinYinAttachDat(dat_cache_file, md5);
+        return attached;
+    }
+
+    /**
+     * Attaches to a word-dictionary cache file (records are DatMemElem).
+     * @return true when the file was mapped and validated; false otherwise, in
+     *         which case this object is left unchanged.
+     */
+    bool InitAttachDat(const string & dat_cache_file, const string & md5) {
+        const char * records = AttachCacheFile(dat_cache_file, md5, sizeof(DatMemElem));
+        if (nullptr == records) {
+            return false;
+        }
+        elements_ptr_ = reinterpret_cast<const DatMemElem *>(records);
+        return true;
+    }
+
+    /**
+     * Attaches to an IDF cache file (records are double).
+     * @return true when the file was mapped and validated; false otherwise, in
+     *         which case this object is left unchanged.
+     */
+    bool InitIdfAttachDat(const string & dat_cache_file, const string & md5) {
+        const char * records = AttachCacheFile(dat_cache_file, md5, sizeof(double));
+        if (nullptr == records) {
+            return false;
+        }
+        idf_elements_ptr_ = reinterpret_cast<const double *>(records);
+        return true;
+    }
+
+    /**
+     * Attaches to a pinyin cache file (records are PinYinMemElem).
+     * @return true when the file was mapped and validated; false otherwise, in
+     *         which case this object is left unchanged.
+     */
+    bool InitPinYinAttachDat(const string & dat_cache_file, const string & md5) {
+        const char * records = AttachCacheFile(dat_cache_file, md5, sizeof(PinYinMemElem));
+        if (nullptr == records) {
+            return false;
+        }
+        pinyin_elements_ptr_ = reinterpret_cast<const PinYinMemElem *>(records);
+        return true;
+    }
+
+    // Helper shared by the three attach functions above. The class section that
+    // follows this block is private as well.
+private:
+    /**
+     * Maps dat_cache_file read-only and validates it: the file must hold at
+     * least a header, the header md5 must equal md5, and the file size must be
+     * exactly header + elements_num * record_size + trie array.
+     *
+     * Every check is done at run time, so a missing, truncated or stale cache
+     * yields nullptr (with the descriptor and mapping released) instead of
+     * relying on assert(). On success a mapping held from an earlier attach is
+     * released, the new one is adopted, dat_ is pointed at the trie array and
+     * the start of the record table is returned.
+     */
+    const char * AttachCacheFile(const string & dat_cache_file, const string & md5, size_t record_size) {
+        const int fd = ::open(dat_cache_file.c_str(), O_RDONLY);
+        if (fd < 0) {
+            return nullptr;
+        }
+
+        const auto seek_off = ::lseek(fd, 0, SEEK_END);
+        if ((seek_off < 0) || (static_cast<size_t>(seek_off) < sizeof(CacheFileHeader))) {
+            ::close(fd);
+            return nullptr;
+        }
+        const size_t length = static_cast<size_t>(seek_off);
+
+        void * const addr = ::mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+        if (MAP_FAILED == addr) {
+            ::close(fd);
+            return nullptr;
+        }
+
+        char * const base = static_cast<char *>(addr);
+        const CacheFileHeader & header = *reinterpret_cast<const CacheFileHeader *>(base);
+        const size_t expected_length = sizeof(header) + header.elements_num * record_size
+                                       + header.dat_size * dat_.unit_size();
+        const bool valid = (sizeof(header.md5_hex) == md5.size())
+                           && (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()))
+                           && (length == expected_length);
+        if (!valid) {
+            ::munmap(addr, length);
+            ::close(fd);
+            return nullptr;
+        }
+
+        // Remember what was held before, so it can be released once the trie
+        // no longer refers to it.
+        char * const stale_addr = mmap_addr_;
+        const size_t stale_length = mmap_length_;
+        const int stale_fd = mmap_fd_;
+
+        mmap_addr_ = base;
+        mmap_length_ = length;
+        mmap_fd_ = fd;
+        elements_num_ = header.elements_num;
+        min_weight_ = header.min_weight;
+        // Record pointers into a previous mapping would dangle; the caller sets
+        // the one that matches the new file.
+        elements_ptr_ = nullptr;
+        idf_elements_ptr_ = nullptr;
+        pinyin_elements_ptr_ = nullptr;
+
+        char * dat_ptr = base + sizeof(header) + record_size * elements_num_;
+        dat_.set_array(dat_ptr, header.dat_size);
+
+        if ((nullptr != stale_addr) && (MAP_FAILED != stale_addr)) {
+            ::munmap(stale_addr, stale_length);
+        }
+        if (stale_fd >= 0) {
+            ::close(stale_fd);
+        }
+
+        return base + sizeof(header);
+    }
+
+private:
+    /**
+     * Sorts elements, builds the trie over their words and writes the cache
+     * file for a word dictionary (records are DatMemElem: weight plus tag).
+     */
+    void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
+        std::sort(elements.begin(), elements.end());
+
+        vector<const char*> keys_ptr_vec;
+        vector<int> values_vec;
+        vector<DatMemElem> mem_elem_vec;
+
+        keys_ptr_vec.reserve(elements.size());
+        values_vec.reserve(elements.size());
+        mem_elem_vec.reserve(elements.size());
+
+        for (size_t i = 0; i < elements.size(); ++i) {
+            keys_ptr_vec.push_back(elements[i].word.data());
+            values_vec.push_back(i);  // the trie value is the index of the record
+            mem_elem_vec.push_back(DatMemElem());
+            auto & mem_elem = mem_elem_vec.back();
+            mem_elem.weight = elements[i].weight;
+            mem_elem.SetTag(elements[i].tag);
+        }
+
+        StoreDatCache(keys_ptr_vec, values_vec, mem_elem_vec.data(), sizeof(DatMemElem), dat_cache_file, md5);
+    }
+
+    /**
+     * Sorts elements, builds the trie over their words and writes the cache
+     * file for an IDF dictionary (records are the idf values as double).
+     */
+    void BuildDatCache(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
+        std::sort(elements.begin(), elements.end());
+
+        vector<const char*> keys_ptr_vec;
+        vector<int> values_vec;
+        vector<double> mem_elem_vec;
+
+        keys_ptr_vec.reserve(elements.size());
+        values_vec.reserve(elements.size());
+        mem_elem_vec.reserve(elements.size());
+
+        for (size_t i = 0; i < elements.size(); ++i) {
+            keys_ptr_vec.push_back(elements[i].word.data());
+            values_vec.push_back(i);  // the trie value is the index of the record
+            mem_elem_vec.push_back(elements[i].idf);
+        }
+
+        StoreDatCache(keys_ptr_vec, values_vec, mem_elem_vec.data(), sizeof(double), dat_cache_file, md5);
+    }
+
+    /**
+     * Builds the trie over the words of elements and writes the cache file for
+     * a pinyin dictionary (records are PinYinMemElem).
+     * Unlike the other overloads, elements is used in the order given and is
+     * not sorted here.
+     * NOTE(review): presumably the caller supplies it already sorted - confirm.
+     */
+    void BuildDatCache(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
+        vector<const char*> keys_ptr_vec;
+        vector<int> values_vec;
+        vector<PinYinMemElem> mem_elem_vec;
+
+        keys_ptr_vec.reserve(elements.size());
+        values_vec.reserve(elements.size());
+        mem_elem_vec.reserve(elements.size());
+
+        for (size_t i = 0; i < elements.size(); ++i) {
+            keys_ptr_vec.push_back(elements[i].word.data());
+            values_vec.push_back(i);  // the trie value is the index of the record
+            mem_elem_vec.push_back(PinYinMemElem());
+            auto & mem_elem = mem_elem_vec.back();
+            mem_elem.SetTag(elements[i].tag);
+        }
+
+        StoreDatCache(keys_ptr_vec, values_vec, mem_elem_vec.data(), sizeof(PinYinMemElem), dat_cache_file, md5);
+    }
+
+    /**
+     * Common tail of the BuildDatCache overloads: builds dat_ from keys/values
+     * and writes header + record table + trie array to a temporary file next to
+     * dat_cache_file, which is then renamed onto dat_cache_file.
+     *
+     * records points to keys.size() records of record_size bytes each.
+     *
+     * Failures are handled at run time: they are logged, the temporary file is
+     * removed and the function returns without touching dat_cache_file. The
+     * process umask is not modified; the file mode is set explicitly with
+     * fchmod().
+     */
+    void StoreDatCache(vector<const char*>& keys, vector<int>& values,
+                       const void * records, size_t record_size,
+                       const string & dat_cache_file, const string & md5) {
+        CacheFileHeader header;
+        header.min_weight = min_weight_;
+        assert(sizeof(header.md5_hex) == md5.size());
+        if (sizeof(header.md5_hex) != md5.size()) {
+            // Copying a longer string would overflow md5_hex.
+            qDebug() << "dat cache: unexpected md5 length for" << dat_cache_file.c_str();
+            return;
+        }
+        memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
+
+        auto const ret = dat_.build(keys.size(), keys.data(), NULL, values.data());
+        assert(0 == ret);
+        if (0 != ret) {
+            qDebug() << "dat cache: trie build failed for" << dat_cache_file.c_str();
+            return;
+        }
+        header.elements_num = keys.size();
+        header.dat_size = dat_.size();
+
+        string tmp_filepath = dat_cache_file + "_XXXXXX";
+        const int fd = ::mkstemp(&tmp_filepath[0]);
+        if (fd < 0) {
+            qDebug() << "mkstemp error:" << errno << tmp_filepath.c_str();
+            return;
+        }
+        ::fchmod(fd, 0644);
+
+        // write() may store fewer bytes than requested; keep going until the
+        // whole buffer is written or a real error occurs.
+        const auto write_all = [fd](const void * data, size_t bytes) -> bool {
+            const char * cursor = static_cast<const char *>(data);
+            while (bytes > 0) {
+                const auto written = ::write(fd, cursor, bytes);
+                if (written < 0) {
+                    if (EINTR == errno) {
+                        continue;
+                    }
+                    return false;
+                }
+                if (0 == written) {
+                    return false;
+                }
+                cursor += written;
+                bytes -= static_cast<size_t>(written);
+            }
+            return true;
+        };
+
+        bool ok = write_all(&header, sizeof(header));
+        ok = ok && write_all(records, record_size * keys.size());
+        ok = ok && write_all(dat_.array(), dat_.total_size());
+        ok = (0 == ::close(fd)) && ok;
+        ok = ok && (0 == ::rename(tmp_filepath.c_str(), dat_cache_file.c_str()));
+
+        if (!ok) {
+            qDebug() << "dat cache: failed to write" << dat_cache_file.c_str() << "errno" << errno;
+            ::unlink(tmp_filepath.c_str());
+        }
+    }
+
+    // Not copyable: the object owns a file descriptor and a memory mapping.
+    // Deleted explicitly, so that an attempted copy is rejected at compile time
+    // rather than at link time.
+    DatTrie(const DatTrie &) = delete;
+    DatTrie &operator=(const DatTrie &) = delete;
+
+private:
+    JiebaDAT dat_;                                          // the double-array trie
+    // Start of the record table inside the mapped cache file; which of the three
+    // pointers is set depends on the attach function that was used.
+    const DatMemElem * elements_ptr_ = nullptr;             // set by InitAttachDat
+    const double * idf_elements_ptr_ = nullptr;             // set by InitIdfAttachDat
+    const PinYinMemElem * pinyin_elements_ptr_ = nullptr;   // set by InitPinYinAttachDat
+    size_t elements_num_ = 0;                               // number of records, read from the cache header
+    double min_weight_ = 0;                                 // minimum weight (SetMinWeight or cache header)
+
+    int mmap_fd_ = -1;                                      // descriptor of the mapped cache file, -1 when none
+    size_t mmap_length_ = 0;                                // length of the mapping in bytes
+    char * mmap_addr_ = nullptr;                            // base address of the mapping
+};
+
+
+/**
+ * Computes one MD5 over the contents of all files named in files_list.
+ *
+ * @param files_list    file paths separated by '|' or ';'
+ * @param file_size_sum receives the total number of bytes that were hashed
+ * @return the digest as returned in md5.digestChars
+ *
+ * Files that cannot be opened, are empty or cannot be mapped are skipped; they
+ * contribute neither to the digest nor to file_size_sum.
+ */
+inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum) {
+    limonp::MD5 md5;
+
+    const auto files = limonp::Split(files_list, "|;");
+    file_size_sum = 0;
+
+    for (auto const & local_path : files) {
+        const int fd = ::open(local_path.c_str(), O_RDONLY);
+        if (fd < 0) {
+            continue;
+        }
+
+        auto const len = ::lseek(fd, 0, SEEK_END);
+        if (len > 0) {
+            void * addr = ::mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+            if (MAP_FAILED != addr) {
+                md5.Update((unsigned char *) addr, len);
+                file_size_sum += len;
+                ::munmap(addr, len);
+            } else {
+                // Checked at run time: hashing MAP_FAILED would crash in builds
+                // where assert() is compiled out.
+                qDebug() << "mmap failed:" << errno << local_path.c_str();
+            }
+        }
+        ::close(fd);
+    }
+
+    md5.Final();
+    return string(md5.digestChars);
+}
+
+}
--- a/libchinese-segmentation/cppjieba/DictTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DictTrie.hpp
@ -0,0 +1,234 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "limonp/StringUtil.hpp"
+#include "limonp/Logging.hpp"
+#include "Unicode.hpp"
+#include "DatTrie.hpp"
+#include <QDebug>
+namespace cppjieba {
+
+using namespace limonp;
+
+// Very large double used as a sentinel value.
+// NOTE(review): not referenced in the visible part of this header -- confirm its users.
+const double MAX_DOUBLE = 3.14e+100;
+// Number of space-separated columns every line of the default dictionary
+// must have (enforced in DictTrie::LoadDefaultDict).
+const size_t DICT_COLUMN_NUM = 3;
+// Tag given to user-dictionary words that do not carry a tag of their own.
+const char* const UNKNOWN_TAG = "";
+
+/**
+ * Dictionary front-end used by the segmenters.
+ *
+ * On construction it either attaches to a previously written DAT cache file
+ * or loads the default and user dictionaries, computes word weights and asks
+ * the underlying DatTrie to build (and cache) the trie. Lookups are forwarded
+ * to the DatTrie member.
+ */
+class DictTrie {
+public:
+    /** Which static-dictionary weight is used as the default for user words. */
+    enum UserWordWeightOption {
+        WordWeightMin,
+        WordWeightMedian,
+        WordWeightMax,
+    }; // enum UserWordWeightOption
+
+    /**
+     * @param dict_path            Path of the default dictionary file.
+     * @param user_dict_paths      User dictionary paths, passed to limonp::Split
+     *                             with the pattern "|;"; may be empty.
+     * @param dat_cache_path       Where the DAT cache is stored; when empty a
+     *                             name under /tmp derived from the MD5 is used.
+     * @param user_word_weight_opt Default weight policy for user words.
+     */
+    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
+             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+        Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
+    }
+
+    ~DictTrie() {}
+
+    /** Look up a single word; forwards to DatTrie::Find. */
+    const DatMemElem* Find(const string & word) const {
+        return dat_.Find(word);
+    }
+
+    /** Fill `res` with DAG information for [begin, end); forwards to DatTrie::Find. */
+    void FindDatDag(RuneStrArray::const_iterator begin,
+              RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res,
+              size_t max_word_len = MAX_WORD_LENGTH) const {
+        dat_.Find(begin, end, res, max_word_len);
+    }
+
+    /** Fill `words` with word ranges for [begin, end); forwards to DatTrie::Find. */
+    void FindWordRange(RuneStrArray::const_iterator begin,
+              RuneStrArray::const_iterator end,
+              vector<WordRange>& words,
+              size_t max_word_len = MAX_WORD_LENGTH) const {
+        dat_.Find(begin, end, words, max_word_len);
+    }
+
+    /** True when `word` was registered as a one-character user-dictionary entry. */
+    bool IsUserDictSingleChineseWord(const Rune& word) const {
+        return IsIn(user_dict_single_chinese_word_, word);
+    }
+
+    double GetMinWeight() const {
+        return dat_.GetMinWeight();
+    }
+
+    /** Sum of the byte sizes of the dictionary files hashed during Init. */
+    size_t GetTotalDictSize() const {
+        return total_dict_size_;
+    }
+
+    /**
+     * Parse one user-dictionary line and register it.
+     *
+     * The line is split on spaces. Column 0 is the word. With exactly two
+     * columns the second is the tag. With exactly three columns, and only if
+     * freq_sum_ is positive, the second is a frequency (converted to
+     * log(freq / freq_sum_)) and the third is the tag. In every other case the
+     * default user weight and UNKNOWN_TAG are kept.
+     *
+     * @param line         One non-empty dictionary line.
+     * @param saveNodeInfo When true the parsed element is appended to
+     *                     static_node_infos_ (needed while building the trie).
+     *                     One-character words are recorded in
+     *                     user_dict_single_chinese_word_ either way.
+     */
+    void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
+        vector<string> buf;
+        DatElement node_info;
+        Split(line, buf, " ");
+
+        if (buf.size() == 0) {
+            return;
+        }
+
+        node_info.word = buf[0];
+        node_info.weight = user_word_default_weight_;
+        node_info.tag = UNKNOWN_TAG;
+
+        if (buf.size() == 2) {
+            node_info.tag = buf[1];
+        } else if (buf.size() == 3) {
+            if (freq_sum_ > 0.0) {
+                const int freq = atoi(buf[1].c_str());
+                node_info.weight = log(1.0 * freq / freq_sum_);
+                node_info.tag = buf[2];
+            }
+        }
+
+        if (saveNodeInfo) {
+            static_node_infos_.push_back(node_info);
+        }
+
+        if (Utf8CharNum(node_info.word) == 1) {
+            RuneArray word;
+
+            if (DecodeRunesInString(node_info.word, word)) {
+                user_dict_single_chinese_word_.insert(word[0]);
+            } else {
+                XLOG(ERROR) << "Decode " << node_info.word << " failed.";
+            }
+        }
+    }
+
+    /**
+     * Read every file named in `filePaths` and feed each non-empty line to
+     * InserUserDictNode. A file that cannot be opened trips the XCHECK.
+     */
+    void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
+        vector<string> files = limonp::Split(filePaths, "|;");
+
+        for (size_t i = 0; i < files.size(); i++) {
+            ifstream ifs(files[i].c_str());
+            XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
+            string line;
+
+            for (; getline(ifs, line);) {
+                if (line.size() == 0) {
+                    continue;
+                }
+
+                InserUserDictNode(line, saveNodeInfo);
+            }
+        }
+    }
+
+
+private:
+    /**
+     * Attach to an existing DAT cache or build a new one.
+     *
+     * The MD5 of all dictionary files keys the cache. On a cache hit only the
+     * set of one-character user words is rebuilt. Otherwise the default
+     * dictionary is loaded, weights are normalised, user dictionaries are
+     * appended and the trie is built and written to `dat_cache_path`.
+     */
+    void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
+              UserWordWeightOption user_word_weight_opt) {
+        const auto dict_list = dict_path + "|" + user_dict_paths;
+        size_t file_size_sum = 0;
+        const string md5 = CalcFileListMD5(dict_list, file_size_sum);
+        total_dict_size_ = file_size_sum;
+
+        if (dat_cache_path.empty()) {
+            // No cache location given: the dictionary data file goes under /tmp by default.
+            dat_cache_path = "/tmp/" + md5 + ".dat_";
+        }
+        dat_cache_path += VERSION;
+        QString path = QString::fromStdString(dat_cache_path);
+        qDebug() << "#########Dict path:" << path;
+        if (dat_.InitAttachDat(dat_cache_path, md5)) {
+            // Cache hit. freq_sum_ and user_word_default_weight_ still hold
+            // their 0.0 initial values here; the parsed elements are discarded
+            // (saveNodeInfo == false), only user_dict_single_chinese_word_ is filled.
+            LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
+            return;
+        }
+
+        LoadDefaultDict(dict_path);
+        freq_sum_ = CalcFreqSum(static_node_infos_);
+        CalculateWeight(static_node_infos_, freq_sum_);
+        double min_weight = 0;
+        SetStaticWordWeights(user_word_weight_opt, min_weight);
+        dat_.SetMinWeight(min_weight);
+
+        LoadUserDict(user_dict_paths);
+        const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
+        assert(build_ret);
+        // Swap with an empty vector so the element storage is actually released.
+        vector<DatElement>().swap(static_node_infos_);
+    }
+
+    /**
+     * Load the default dictionary: every line must split into exactly
+     * DICT_COLUMN_NUM space-separated columns (word, frequency, tag).
+     */
+    void LoadDefaultDict(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
+        string line;
+        vector<string> buf;
+
+        for (; getline(ifs, line);) {
+            Split(line, buf, " ");
+            XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
+            DatElement node_info;
+            node_info.word = buf[0];
+            node_info.weight = atof(buf[1].c_str());
+            node_info.tag = buf[2];
+            static_node_infos_.push_back(node_info);
+        }
+    }
+
+    /** Ascending order by weight. */
+    static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
+        return lhs.weight < rhs.weight;
+    }
+
+    /**
+     * Derive the minimum, median and maximum weight of the static dictionary.
+     *
+     * @param option     Selects which of the three becomes
+     *                   user_word_default_weight_.
+     * @param min_weight Output: the smallest weight found.
+     */
+    void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
+        XCHECK(!static_node_infos_.empty());
+        // Sort a copy so the element order used for building the trie is untouched.
+        vector<DatElement> x = static_node_infos_;
+        sort(x.begin(), x.end(), WeightCompare);
+        if(x.empty()){
+            return;
+        }
+        min_weight = x[0].weight;
+        const double max_weight_ = x[x.size() - 1].weight;
+        const double median_weight_ = x[x.size() / 2].weight;
+
+        switch (option) {
+            case WordWeightMin:
+                user_word_default_weight_ = min_weight;
+                break;
+
+            case WordWeightMedian:
+                user_word_default_weight_ = median_weight_;
+                break;
+
+            default:
+                user_word_default_weight_ = max_weight_;
+                break;
+        }
+    }
+
+    /** Sum of the (still raw) weights of all elements. */
+    double CalcFreqSum(const vector<DatElement>& node_infos) const {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < node_infos.size(); i++) {
+            sum += node_infos[i].weight;
+        }
+
+        return sum;
+    }
+
+    /** Replace every weight by log(weight / sum); weights must be positive. */
+    void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
+        for (size_t i = 0; i < node_infos.size(); i++) {
+            DatElement& node_info = node_infos[i];
+            assert(node_info.weight > 0.0);
+            node_info.weight = log(double(node_info.weight) / sum);
+        }
+    }
+
+private:
+    vector<DatElement> static_node_infos_;
+    size_t total_dict_size_ = 0;
+    DatTrie dat_;
+
+    // Both values are read by InserUserDictNode, which also runs on the
+    // cache-hit path of Init before anything assigns them, so they need a
+    // defined initial value.
+    double freq_sum_ = 0.0;
+    double user_word_default_weight_ = 0.0;
+    unordered_set<Rune> user_dict_single_chinese_word_;
+};
+}
+
--- a/libchinese-segmentation/cppjieba/FullSegment.hpp
+++ b/libchinese-segmentation/cppjieba/FullSegment.hpp
@ -0,0 +1,67 @@
+#pragma once
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "segment-trie/segment-trie.h"
+//#include "DictTrie.hpp"
+#include "SegmentBase.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+/**
+ * Segmenter that reports the candidate words the dictionary DAG offers for a
+ * range of runes (see Cut for the exact selection rule).
+ */
+class FullSegment: public SegmentBase {
+public:
+    /** @param dictTrie Dictionary to query; must not be null. */
+    FullSegment(const DictTrie* dictTrie)
+        : dictTrie_(dictTrie) {
+        assert(dictTrie_);
+    }
+    ~FullSegment() { }
+
+    /**
+     * Append word ranges found in [begin, end) to `res`.
+     *
+     * For every start position each DAG edge is examined. An edge is emitted
+     * when it is the only edge at a position not yet covered by an earlier
+     * edge, or when it carries a dictionary entry and spans at least two runes.
+     * The trailing bool and size_t parameters are unused.
+     */
+    virtual void Cut(RuneStrArray::const_iterator begin,
+                     RuneStrArray::const_iterator end,
+                     vector<WordRange>& res, bool, size_t) const override {
+        assert(dictTrie_);
+        vector<struct DatDag> dags;
+        dictTrie_->FindDatDag(begin, end, dags);
+
+        // One past the furthest rune index reached by any edge seen so far.
+        size_t covered_until = 0;
+
+        for (size_t start = 0; start < dags.size(); ++start) {
+            const auto & edges = dags[start].nexts;
+
+            for (const auto & edge : edges) {
+                const size_t nextoffset = edge.first - 1;
+                assert(nextoffset < dags.size());
+
+                const size_t span = nextoffset - start + 1;
+                const bool lone_uncovered = (edges.size() == 1) && (covered_until <= start);
+                const bool in_vocabulary = (edge.second != nullptr); // null means out-of-vocabulary
+
+                if (lone_uncovered || (in_vocabulary && (span >= 2))) {
+                    res.push_back(WordRange(begin + start, begin + nextoffset));
+                }
+
+                if (covered_until < nextoffset + 1) {
+                    covered_until = nextoffset + 1;
+                }
+            }
+        }
+    }
+
+    /** Intentionally a no-op for this segmenter; every argument is ignored. */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+    /** Intentionally a no-op for this segmenter; every argument is ignored. */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+private:
+    const DictTrie* dictTrie_;
+};
+}
+
--- a/libchinese-segmentation/cppjieba/HMMModel.hpp
+++ b/libchinese-segmentation/cppjieba/HMMModel.hpp
@ -0,0 +1,158 @@
+#pragma once
+
+#include "limonp/StringUtil.hpp"
+//#define USE_CEDAR_SEGMENT // preliminary tests with cedar: roughly 3%-5% slower, memory use lower by close to 1 MB
+#ifdef USE_CEDAR_SEGMENT
+#include "cedar/cedar.h"
+#endif
+namespace cppjieba {
+
+using namespace limonp;
+// Container type for one emission-probability table: a cedar double array
+// when USE_CEDAR_SEGMENT is defined, otherwise a hash map keyed by Rune.
+#ifdef USE_CEDAR_SEGMENT
+typedef cedar::da<float, -1, -2, false> EmitProbMap;
+#else
+typedef unordered_map<Rune, double> EmitProbMap;
+#endif
+/**
+ * Hidden Markov model parameters loaded from a text file: start
+ * probabilities, the state transition matrix and one emission table per state.
+ */
+struct HMMModel {
+    /*
+     * STATUS:
+     * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
+     * */
+    enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
+
+    /** Zero the numeric tables, wire up the per-state lookups and load `modelPath`. */
+    HMMModel(const string& modelPath) {
+        memset(startProb, 0, sizeof(startProb));
+        memset(transProb, 0, sizeof(transProb));
+
+        const char labels[STATUS_SUM] = {'B', 'E', 'M', 'S'};
+        for (size_t s = 0; s < STATUS_SUM; ++s) {
+            statMap[s] = labels[s];
+        }
+
+        // Index in emitProbVec equals the state value (B, E, M, S).
+        EmitProbMap* const tables[STATUS_SUM] = {&emitProbB, &emitProbE, &emitProbM, &emitProbS};
+        emitProbVec.assign(tables, tables + STATUS_SUM);
+
+        LoadModel(modelPath);
+    }
+    ~HMMModel() {
+    }
+
+    /**
+     * Parse the model file: one line of start probabilities, STATUS_SUM lines
+     * of transition probabilities, then one emission line each for B, E, M, S.
+     * Blank and '#' lines are skipped by GetLine. Any malformed part trips an XCHECK.
+     */
+    void LoadModel(const string& filePath) {
+        ifstream ifile(filePath.c_str());
+        XCHECK(ifile.is_open()) << "open " << filePath << " failed";
+        string line;
+        vector<string> tmp;
+
+        // start probabilities
+        XCHECK(GetLine(ifile, line));
+        Split(line, tmp, " ");
+        XCHECK(tmp.size() == STATUS_SUM);
+
+        for (size_t col = 0; col < tmp.size(); ++col) {
+            startProb[col] = atof(tmp[col].c_str());
+        }
+
+        // transition matrix, one row per line
+        for (size_t row = 0; row < STATUS_SUM; ++row) {
+            XCHECK(GetLine(ifile, line));
+            Split(line, tmp, " ");
+            XCHECK(tmp.size() == STATUS_SUM);
+
+            double* const dest = transProb[row];
+            for (size_t col = 0; col < tmp.size(); ++col) {
+                dest[col] = atof(tmp[col].c_str());
+            }
+        }
+
+        // emission table for state B
+        XCHECK(GetLine(ifile, line));
+        XCHECK(LoadEmitProb(line, emitProbB));
+
+        // emission table for state E
+        XCHECK(GetLine(ifile, line));
+        XCHECK(LoadEmitProb(line, emitProbE));
+
+        // emission table for state M
+        XCHECK(GetLine(ifile, line));
+        XCHECK(LoadEmitProb(line, emitProbM));
+
+        // emission table for state S
+        XCHECK(GetLine(ifile, line));
+        XCHECK(LoadEmitProb(line, emitProbS));
+    }
+
+    /** Emission value stored for `key` in `ptMp`, or `defVal` when there is none. */
+    double GetEmitProb(const EmitProbMap* ptMp, Rune key,
+                       double defVal)const {
+#ifdef USE_CEDAR_SEGMENT
+        char str_key[8];
+        snprintf(str_key, sizeof(str_key), "%d", key);
+        float result = ptMp->exactMatchSearch<float>(str_key);
+        return result < 0 ? defVal : result;
+#else
+        const EmitProbMap::const_iterator hit = ptMp->find(key);
+        return (hit != ptMp->end()) ? hit->second : defVal;
+#endif
+    }
+
+    /**
+     * Read the next meaningful line into `line` (trimmed).
+     * @return false once the stream is exhausted.
+     */
+    bool GetLine(ifstream& ifile, string& line) {
+        while (getline(ifile, line)) {
+            Trim(line);
+
+            // Empty lines and '#' comment lines carry no data.
+            const bool skip = line.empty() || StartsWith(line, "#");
+            if (!skip) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Parse a comma-separated list of "rune:value" pairs into `mp`.
+     * @return false on an empty line, a pair without exactly two fields, or a
+     *         key that does not decode to exactly one rune.
+     */
+    bool LoadEmitProb(const string& line, EmitProbMap& mp) {
+        if (line.empty()) {
+            return false;
+        }
+
+        vector<string> pairs, fields;
+        RuneArray unicode;
+        Split(line, pairs, ",");
+
+        for (size_t idx = 0; idx < pairs.size(); ++idx) {
+            Split(pairs[idx], fields, ":");
+
+            if (fields.size() != 2) {
+                XLOG(ERROR) << "emitProb illegal.";
+                return false;
+            }
+
+            const bool single_rune = DecodeRunesInString(fields[0], unicode) && unicode.size() == 1;
+            if (!single_rune) {
+                XLOG(ERROR) << "TransCode failed.";
+                return false;
+            }
+#ifdef USE_CEDAR_SEGMENT
+            char str_key[8];
+            snprintf(str_key, sizeof(str_key), "%d", unicode[0]);
+            mp.update(str_key, std::strlen(str_key), atof(fields[1].c_str()));
+#else
+            mp[unicode[0]] = atof(fields[1].c_str());
+#endif
+        }
+
+        return true;
+    }
+
+    char statMap[STATUS_SUM];                  // state index -> letter 'B','E','M','S'
+    double startProb[STATUS_SUM];              // per-state start values
+    double transProb[STATUS_SUM][STATUS_SUM];  // [from][to] transition values
+    EmitProbMap emitProbB;
+    EmitProbMap emitProbE;
+    EmitProbMap emitProbM;
+    EmitProbMap emitProbS;
+    vector<EmitProbMap* > emitProbVec;         // the four tables above, indexed by state
+}; // struct HMMModel
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/HMMSegment.hpp
+++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp
@ -0,0 +1,206 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <memory.h>
+#include <cassert>
+#include "HMMModel.hpp"
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+
+// Very negative double used by HMMSegment::Viterbi both as the initial score
+// of a path and as the emission value for runes missing from a table.
+const double MIN_DOUBLE = -3.14e+100;
+
+/**
+ * Segmenter driven purely by the hidden Markov model: runs of ASCII letters
+ * and digits are split by simple rules, everything else is labelled with the
+ * Viterbi algorithm and cut after every E or S state.
+ */
+class HMMSegment: public SegmentBase {
+public:
+    /** @param model Model to use; stored as-is and dereferenced during Cut. */
+    HMMSegment(const HMMModel* model)
+        : model_(model) {
+    }
+    ~HMMSegment() { }
+
+    /**
+     * Append the word ranges of [begin, end) to `res`.
+     *
+     * Runes below 0x80 interrupt the HMM run: the pending non-ASCII run is cut
+     * with InternalCut, then the ASCII part is emitted as a letter/digit word,
+     * a number, or a single rune. The trailing bool and size_t are unused.
+     */
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
+                     size_t) const override {
+        RuneStrArray::const_iterator left = begin;
+        RuneStrArray::const_iterator right = begin;
+
+        while (right != end) {
+            if (right->rune < 0x80) { // ASCII code point
+                if (left != right) {
+                    InternalCut(left, right, res);
+                }
+
+                left = right;
+
+                // Single-pass block: the first rule that consumes something wins.
+                do {
+                    // Returns left when left is not an English letter, otherwise
+                    // the position just past the letter/digit run.
+                    right = SequentialLetterRule(left, end);
+
+                    if (right != left) {
+                        break;
+                    }
+
+                    // Returns left when left is not a digit, otherwise the
+                    // position just past the number.
+                    right = NumbersRule(left, end);
+
+                    if (right != left) {
+                        break;
+                    }
+
+                    // Neither rule matched: the rune stands alone.
+                    right ++;
+                } while (false);
+
+                WordRange wr(left, right - 1);
+                res.push_back(wr);
+                left = right;
+            } else {
+                right++;
+            }
+        }
+
+        if (left != right) {
+            InternalCut(left, right, res);
+        }
+    }
+
+    /** Intentionally a no-op for this segmenter; every argument is ignored. */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        std::ignore = s;
+        std::ignore = begin;
+        std::ignore = end;
+        std::ignore = res;
+        std::ignore = hmm;
+    }
+    /** Intentionally a no-op for this segmenter; every argument is ignored. */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t) const override {
+        std::ignore = s;
+        std::ignore = begin;
+        std::ignore = end;
+        std::ignore = res;
+        std::ignore = hmm;
+    }
+private:
+    /**
+     * Sequential letters rule: if `begin` is an ASCII letter, consume it and
+     * every following ASCII letter or digit.
+     * @return `begin` unchanged when the first rune is not a letter, otherwise
+     *         the position after the consumed run. `begin` must not equal `end`.
+     */
+    RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
+                                                      RuneStrArray::const_iterator end) const {
+        Rune x = begin->rune;
+
+        if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
+            begin ++;
+        } else {
+            return begin;
+        }
+
+        while (begin != end) {
+            x = begin->rune;
+
+            if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
+                begin ++;
+            } else {
+                break;
+            }
+        }
+
+        return begin;
+    }
+
+    /**
+     * Numbers rule: if `begin` is an ASCII digit, consume it and every
+     * following digit or '.'.
+     * @return `begin` unchanged when the first rune is not a digit, otherwise
+     *         the position after the consumed run. `begin` must not equal `end`.
+     */
+    RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+        Rune x = begin->rune;
+
+        if ('0' <= x && x <= '9') {
+            begin ++;
+        } else {
+            return begin;
+        }
+
+        while (begin != end) {
+            x = begin->rune;
+
+            if (('0' <= x && x <= '9') || x == '.') {
+                begin++;
+            } else {
+                break;
+            }
+        }
+
+        return begin;
+    }
+
+    /** Label [begin, end) with Viterbi and emit one word per E/S-terminated run. */
+    void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+        vector<size_t> status;
+        Viterbi(begin, end, status);
+
+        RuneStrArray::const_iterator left = begin;
+        RuneStrArray::const_iterator right;
+
+        for (size_t i = 0; i < status.size(); i++) {
+            // Odd state values are HMMModel::E (1) and HMMModel::S (3): a word ends here.
+            if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
+                right = begin + i + 1;
+                WordRange wr(left, right - 1);
+                res.push_back(wr);
+                left = right;
+            }
+        }
+    }
+
+    /**
+     * Compute the best state sequence for [begin, end).
+     *
+     * @param status Output: resized to the number of runes and filled with one
+     *               state (B/E/M/S value) per rune; empty for an empty range.
+     *
+     * Scores are summed, and the final state is restricted to E or S.
+     * The tables are laid out as index = position + state * length.
+     */
+    void Viterbi(RuneStrArray::const_iterator begin,
+                 RuneStrArray::const_iterator end,
+                 vector<size_t>& status) const {
+        const size_t Y = HMMModel::STATUS_SUM;
+        const size_t X = end - begin;
+
+        // Nothing to label; also keeps begin from being dereferenced below.
+        if (X == 0) {
+            status.clear();
+            return;
+        }
+
+        const size_t XYSize = X * Y;
+        size_t now, old, stat;
+        double tmp, endE, endS;
+
+        // Heap-backed tables: variable-length stack arrays are not standard
+        // C++ and overflow the stack on long inputs.
+        vector<int> path(XYSize);
+        vector<double> weight(XYSize);
+
+        //start
+        for (size_t y = 0; y < Y; y++) {
+            weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
+            path[0 + y * X] = -1;
+        }
+
+        double emitProb;
+
+        for (size_t x = 1; x < X; x++) {
+            for (size_t y = 0; y < Y; y++) {
+                now = x + y * X;
+                weight[now] = MIN_DOUBLE;
+                path[now] = HMMModel::E; // warning
+                emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
+
+                for (size_t preY = 0; preY < Y; preY++) {
+                    old = x - 1 + preY * X;
+                    tmp = weight[old] + model_->transProb[preY][y] + emitProb;
+
+                    if (tmp > weight[now]) {
+                        weight[now] = tmp;
+                        path[now] = preY;
+                    }
+                }
+            }
+        }
+
+        endE = weight[X - 1 + HMMModel::E * X];
+        endS = weight[X - 1 + HMMModel::S * X];
+        stat = 0;
+
+        if (endE >= endS) {
+            stat = HMMModel::E;
+        } else {
+            stat = HMMModel::S;
+        }
+
+        status.resize(X);
+
+        // Walk the back-pointers from the last position to the first. Counting
+        // down with an unsigned index avoids narrowing X to int.
+        for (size_t x = X; x-- > 0;) {
+            status[x] = stat;
+            stat = path[x + stat * X];
+        }
+    }
+
+    const HMMModel* model_;
+}; // class HMMSegment
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/IdfTrie.hpp
+++ b/libchinese-segmentation/cppjieba/IdfTrie.hpp
@ -0,0 +1,117 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "limonp/StringUtil.hpp"
+#include "limonp/Logging.hpp"
+#include "Unicode.hpp"
+#include "DatTrie.hpp"
+#include <QDebug>
+namespace cppjieba {
+
+using namespace limonp;
+
+// Number of space-separated columns every non-empty line of the IDF
+// dictionary must have (enforced in IdfTrie::LoadDefaultIdf).
+const size_t IDF_COLUMN_NUM = 2;
+
+/**
+ * IDF dictionary front-end: attaches to a cached DAT file or loads the IDF
+ * text file and builds the trie, then answers IDF lookups through DatTrie.
+ */
+class IdfTrie {
+public:
+    /** Weight policy selector kept for signature parity; Init does not read it. */
+    enum UserWordWeightOption {
+        WordWeightMin,
+        WordWeightMedian,
+        WordWeightMax,
+    }; // enum UserWordWeightOption
+
+    /**
+     * @param dict_path            Path of the IDF dictionary file.
+     * @param dat_cache_path       Where the DAT cache is stored; when empty a
+     *                             name under /tmp derived from the MD5 is used.
+     * @param user_word_weight_opt Forwarded to Init, which does not use it.
+     */
+    IdfTrie(const string& dict_path, const string & dat_cache_path = "",
+             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+        Init(dict_path, dat_cache_path, user_word_weight_opt);
+    }
+
+    ~IdfTrie() {}
+
+    /** Look up the IDF of `word`; forwards all arguments to DatTrie::Find. */
+    double Find(const string & word, std::size_t length = 0, std::size_t node_pos = 0) const {
+        return dat_.Find(word, length, node_pos);
+    }
+
+    /** Byte size of the dictionary file(s) hashed during Init. */
+    size_t GetTotalDictSize() const {
+        return total_dict_size_;
+    }
+
+private:
+    /**
+     * Attach to an existing DAT cache or build a new one from `dict_path`.
+     *
+     * On a cache hit the function returns right after attaching, so
+     * idfAverage_ keeps its initial value; it is only computed when the
+     * dictionary text is actually loaded.
+     */
+    void Init(const string& dict_path, string dat_cache_path,
+              UserWordWeightOption user_word_weight_opt) {
+        size_t file_size_sum = 0;
+        const string md5 = CalcFileListMD5(dict_path, file_size_sum);
+        total_dict_size_ = file_size_sum;
+
+        if (dat_cache_path.empty()) {
+            // No cache location given: the dictionary data file goes under /tmp by default.
+            dat_cache_path = "/tmp/" + md5 + ".dat_";
+        }
+        dat_cache_path += VERSION;
+        QString path = QString::fromStdString(dat_cache_path);
+        qDebug() << "#########Idf path:" << path;
+        if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
+            return;
+        }
+
+        LoadDefaultIdf(dict_path);
+        double idf_sum_ = CalcIdfSum(static_node_infos_);
+        assert(static_node_infos_.size());
+        idfAverage_ = idf_sum_ / static_node_infos_.size();
+        assert(idfAverage_ > 0.0);
+        double min_weight = 0;
+        dat_.SetMinWeight(min_weight);
+
+        const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
+        assert(build_ret);
+        // Swap with an empty vector so the element storage is actually released.
+        vector<IdfElement>().swap(static_node_infos_);
+    }
+
+    /**
+     * Load the IDF dictionary into static_node_infos_.
+     *
+     * A file that cannot be opened is reported and leaves the list unchanged.
+     * Empty lines are logged and skipped; every other line must split into
+     * exactly IDF_COLUMN_NUM space-separated columns (word, idf).
+     */
+    void LoadDefaultIdf(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        if(not ifs.is_open()){
+            // Report the failure instead of returning silently; the caller
+            // notices the empty element list.
+            XLOG(ERROR) << "open " << filePath << " failed.";
+            return ;
+        }
+        string line;
+        vector<string> buf;
+        size_t lineno = 0;
+
+        for (; getline(ifs, line); lineno++) {
+            if (line.empty()) {
+                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
+                continue;
+            }
+            Split(line, buf, " ");
+            XCHECK(buf.size() == IDF_COLUMN_NUM) << "split result illegal, line:" << line;
+            IdfElement node_info;
+            node_info.word = buf[0];
+            node_info.idf = atof(buf[1].c_str());
+            static_node_infos_.push_back(node_info);
+        }
+    }
+
+    /** Sum of the idf values of all elements. */
+    double CalcIdfSum(const vector<IdfElement>& node_infos) const {
+        double sum = 0.0;
+
+        for (size_t i = 0; i < node_infos.size(); i++) {
+            sum += node_infos[i].idf;
+        }
+
+        return sum;
+    }
+public:
+    // Mean idf of the loaded dictionary. Only assigned when the dictionary is
+    // built from text; stays 0.0 when an existing cache file is attached.
+    double idfAverage_ = 0.0;
+private:
+    vector<IdfElement> static_node_infos_;
+    size_t total_dict_size_ = 0;
+    DatTrie dat_;
+};
+}
+
--- a/libchinese-segmentation/cppjieba/Jieba.hpp
+++ b/libchinese-segmentation/cppjieba/Jieba.hpp
@ -0,0 +1,99 @@
+#pragma once
+
+#include <memory>
+#include "QuerySegment.hpp"
+#include "KeywordExtractor.hpp"
+#include "segment-trie/segment-trie.h"
+
+namespace cppjieba {
+
+/**
+ * Facade bundling one dictionary, one HMM model and all segmenters built on
+ * them, plus a public keyword extractor. Every public method forwards to the
+ * matching member segmenter.
+ */
+class Jieba {
+public:
+    /**
+     * Build the dictionary, the model and every segmenter.
+     *
+     * @param dict_path      Default dictionary file.
+     * @param model_path     HMM model file.
+     * @param user_dict_path User dictionary path list (forwarded to DictTrie).
+     * @param idfPath        IDF dictionary for the keyword extractor.
+     * @param stopWordPath   Stop-word file forwarded to the mix/query segmenters
+     *                       and the keyword extractor.
+     * @param dat_cache_path Cache location forwarded to DictTrie and the extractor.
+     *
+     * The segmenters receive the addresses of dict_trie_ and model_, which are
+     * declared (and therefore constructed) before them.
+     */
+    Jieba(const string& dict_path,
+          const string& model_path,
+          const string& user_dict_path,
+          const string& idfPath = "",
+          const string& stopWordPath = "",
+          const string& dat_cache_path = "")
+        : dict_trie_(dict_path, user_dict_path, dat_cache_path),
+          model_(model_path),
+          mp_seg_(&dict_trie_),
+          hmm_seg_(&model_),
+          mix_seg_(&dict_trie_, &model_, stopWordPath),
+          full_seg_(&dict_trie_),
+          query_seg_(&dict_trie_, &model_, stopWordPath),
+          extractor(&dict_trie_, &model_, idfPath, dat_cache_path, stopWordPath){ }
+    ~Jieba() { }
+
+    // Cut: forwards to the mix segmenter (string and Word result flavours).
+    void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
+        mix_seg_.CutToStr(sentence, words, hmm);
+    }
+    void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+        mix_seg_.CutToWord(sentence, words, hmm);
+    }
+    // CutAll: forwards to the full segmenter.
+    void CutAll(const string& sentence, vector<string>& words) const {
+        full_seg_.CutToStr(sentence, words);
+    }
+    void CutAll(const string& sentence, vector<Word>& words) const {
+        full_seg_.CutToWord(sentence, words);
+    }
+    // CutForSearch: forwards to the query segmenter.
+    void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
+        query_seg_.CutToStr(sentence, words, hmm);
+    }
+    void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
+        query_seg_.CutToWord(sentence, words, hmm);
+    }
+    // CutHMM: forwards to the HMM-only segmenter.
+    void CutHMM(const string& sentence, vector<string>& words) const {
+        hmm_seg_.CutToStr(sentence, words);
+    }
+    void CutHMM(const string& sentence, vector<Word>& words) const {
+        hmm_seg_.CutToWord(sentence, words);
+    }
+    // CutSmall: forwards to the MP segmenter with hmm = false and the given
+    // maximum word length.
+    void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
+        mp_seg_.CutToStr(sentence, words, false, max_word_len);
+    }
+    void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
+        mp_seg_.CutToWord(sentence, words, false, max_word_len);
+    }
+
+    // Tagging is delegated to the mix segmenter.
+    void Tag(const string& sentence, vector<pair<string, string> >& words) const {
+        mix_seg_.Tag(sentence, words);
+    }
+    string LookupTag(const string &str) const {
+        return mix_seg_.LookupTag(str);
+    }
+
+    // Apply the same separator set to every member segmenter (the public
+    // `extractor` is not touched here).
+    void ResetSeparators(const string& s) {
+        //TODO
+        mp_seg_.ResetSeparators(s);
+        hmm_seg_.ResetSeparators(s);
+        mix_seg_.ResetSeparators(s);
+        full_seg_.ResetSeparators(s);
+        query_seg_.ResetSeparators(s);
+    }
+
+    // Non-owning access to the shared dictionary.
+    const DictTrie* GetDictTrie() const {
+        return &dict_trie_;
+    }
+
+    // Non-owning access to the shared HMM model.
+    const HMMModel* GetHMMModel() const {
+        return &model_;
+    }
+
+private:
+    // Declared first so they are constructed before the segmenters that store
+    // pointers to them.
+    DictTrie dict_trie_;
+    HMMModel model_;
+
+    // They share the same dict trie and model
+    MPSegment mp_seg_;
+    HMMSegment hmm_seg_;
+    MixSegment mix_seg_;
+    FullSegment full_seg_;
+    QuerySegment query_seg_;
+
+public:
+    // Keyword extractor built on the same dictionary and model; exposed directly.
+    KeywordExtractor extractor;
+}; // class Jieba
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
@ -0,0 +1,100 @@
+#pragma once
+
+#include <cmath>
+#include "MixSegment.hpp"
+//#include "IdfTrie.hpp"
+#include "idf-trie/idf-trie.h"
+
+namespace cppjieba {
+
+using namespace limonp;
+using namespace std;
+
+/*utf8*/
+/**
+ * Keyword extraction over UTF-8 text: the sentence is segmented, every
+ * distinct word gets a weight scaled by its IDF, and the top-N words by
+ * weight are returned.
+ */
+class KeywordExtractor {
+public:
+
+    /**
+     * @param dictTrie       Dictionary handed to the internal MixSegment.
+     * @param model          HMM model handed to the internal MixSegment.
+     * @param idfPath        IDF dictionary file for the internal IdfTrie.
+     * @param dat_cache_path Cache location for the internal IdfTrie.
+     * @param stopWordPath   Stop-word file handed to the internal MixSegment.
+     */
+    KeywordExtractor(const DictTrie* dictTrie,
+                     const HMMModel* model,
+                     const string& idfPath,
+                     const string& dat_cache_path,
+                     const string& stopWordPath)
+        : segment_(dictTrie, model, stopWordPath),
+          idf_trie_(idfPath, dat_cache_path){
+    }
+    ~KeywordExtractor() {
+    }
+
+    /** Append the words of the top-N keywords to `keywords`. */
+    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+        vector<KeyWord> ranked;
+        Extract(sentence, ranked, topN);
+
+        for (const KeyWord& entry : ranked) {
+            keywords.push_back(entry.word);
+        }
+    }
+
+    /** Append (word, weight) pairs of the top-N keywords to `keywords`. */
+    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+        vector<KeyWord> ranked;
+        Extract(sentence, ranked, topN);
+
+        for (const KeyWord& entry : ranked) {
+            keywords.push_back(pair<string, double>(entry.word, entry.weight));
+        }
+    }
+
+    /**
+     * Replace `keywords` with at most `topN` keywords, highest weight first.
+     *
+     * A word found in the IDF dictionary has its weight multiplied by that
+     * IDF; any other word uses the dictionary's average IDF instead.
+     */
+    void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
+
+        // Word string -> KeyWord. Presumably CutToStr accumulates frequency and
+        // weight for repeated strings -- confirm against MixSegment.
+        unordered_map<string, KeyWord> wordmap;
+        PreFilter pre_filter(symbols_, sentence);
+        RuneStrArray::const_iterator null_p;
+        WordRange range(null_p, null_p);
+        bool isNull = false;
+        while (pre_filter.Next(range, isNull)) {
+            if (!isNull) {
+                segment_.CutToStr(sentence, range,  wordmap);
+            }
+        }
+
+        keywords.clear();
+        keywords.reserve(wordmap.size());
+
+        for (auto& item : wordmap) {
+            KeyWord& kw = item.second;
+            const double idf = idf_trie_.Find(item.first); // IDF dictionary lookup
+
+            if (idf != -1) {
+                kw.weight *= idf;
+            } else {
+                kw.weight *= idf_trie_.GetIdfAverage();
+            }
+
+            kw.word = item.first;
+            keywords.push_back(kw);
+        }
+
+        if (keywords.size() < topN) {
+            topN = keywords.size();
+        }
+        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+        keywords.resize(topN);
+    }
+private:
+
+    /** Descending order by weight. */
+    static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
+        return rhs.weight < lhs.weight;
+    }
+
+    MixSegment segment_;
+    IdfTrie idf_trie_;
+
+
+    unordered_set<Rune> symbols_;
+}; // class KeywordExtractor
+
+/** Write `word` to `os` as a JSON-like object with word, offset and weight fields. */
+inline ostream& operator << (ostream& os, const KeyWord& word) {
+    os << "{\"word\": \"" << word.word;
+    os << "\", \"offset\": " << word.offsets;
+    os << ", \"weight\": " << word.weight;
+    os << "}";
+    return os;
+}
+
+} // namespace cppjieba
+
+
+
--- a/libchinese-segmentation/cppjieba/MPSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MPSegment.hpp
@ -0,0 +1,133 @@
+#pragma once
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "segment-trie/segment-trie.h"
+//#include "DictTrie.hpp"
+#include "SegmentTagged.hpp"
+#include "PosTagger.hpp"
+
+namespace cppjieba {
+
+// Dictionary-driven segmenter: every cut is delegated to the DictTrie it was
+// constructed with. The sentence-based CutWithSentence overloads are
+// intentionally empty in this class.
+class MPSegment: public SegmentTagged {
+public:
+    // Stores the (non-owning) dictionary pointer; it must not be null.
+    MPSegment(const DictTrie* dictTrie)
+        : dictTrie_(dictTrie) {
+        assert(dictTrie_);
+    }
+    ~MPSegment() { }
+
+    // Appends the word ranges found in [begin, end) to `words`.
+    // The hmm flag is ignored; max_word_len is forwarded to the dictionary.
+    virtual void Cut(RuneStrArray::const_iterator begin,
+                     RuneStrArray::const_iterator end,
+                     vector<WordRange>& words,
+                     bool, size_t max_word_len) const override {
+        dictTrie_->FindWordRange(begin, end, words, max_word_len);
+    }
+
+    // No-op: sentence-based cutting into strings is not provided here.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+    // No-op: sentence-based cutting into a keyword map is not provided here.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+    // Returns the dictionary this segmenter was built with.
+    const DictTrie* GetDictTrie() const override {
+        return dictTrie_;
+    }
+
+    // Segments `src` and pairs every word with its tag via PosTagger.
+    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
+        return tagger_.Tag(src, res, *this);
+    }
+
+    // Forwards to DictTrie::IsUserDictSingleChineseWord for `value`.
+    bool IsUserDictSingleChineseWord(const Rune& value) const {
+        return dictTrie_->IsUserDictSingleChineseWord(value);
+    }
+private:
+/*
+    void CalcDP(vector<DatDag>& dags) const {
+        double val(0);
+        for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) {
+            rit->max_next = -1;
+            rit->max_weight = MIN_DOUBLE;
+
+            for (const auto & it : rit->nexts) {
+                const auto nextPos = it.first;
+                val = dictTrie_->GetMinWeight();
+
+                if (nullptr != it.second) {
+                    val = it.second->weight;
+                }
+
+                if (nextPos  < dags.size()) {
+                    val += dags[nextPos].max_weight;
+                }
+
+                if ((nextPos <= dags.size()) && (val > rit->max_weight)) {
+                    rit->max_weight = val;
+                    rit->max_next = nextPos;
+                }
+            }
+        }
+    }
+*/
+/*  CalcDP rewritten to walk the array in reverse order; preliminary testing found no problems */
+/*
+    void CalcDP(vector<DatDag>& dags) const {
+        double val(0);
+        size_t size = dags.size();
+
+        for (size_t i = 0; i < size; i++) {
+            dags[size - 1 - i].max_next = -1;
+            dags[size - 1 - i].max_weight = MIN_DOUBLE;
+
+            for (const auto & it : dags[size - 1 - i].nexts) {
+                const auto nextPos = it.first;
+                if (nullptr != it.second) {
+                    val = it.second->weight;
+                }
+
+                if (nextPos  < dags.size()) {
+                    val += dags[nextPos].max_weight;
+                }
+
+                if ((nextPos <= dags.size()) && (val > dags[size - 1 - i].max_weight)) {
+                    dags[size - 1 - i].max_weight = val;
+                    dags[size - 1 - i].max_next = nextPos;
+                }
+            }
+        }
+    }
+
+    void CutByDag(RuneStrArray::const_iterator begin,
+                  RuneStrArray::const_iterator,
+                  const vector<DatDag>& dags,
+                  vector<WordRange>& words) const {
+
+        for (size_t i = 0; i < dags.size();) {
+            const auto next = dags[i].max_next;
+            assert(next > i);
+            assert(next <= dags.size());
+            WordRange wr(begin + i, begin + next - 1);
+            words.push_back(wr);
+            i = next;
+        }
+    }
+*/// The functionality above has been integrated into the Find function.
+    const DictTrie* dictTrie_;
+    PosTagger tagger_;
+
+}; // class MPSegment
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/MixSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MixSegment.hpp
@ -0,0 +1,276 @@
+#pragma once
+
+#include <cassert>
+#include "MPSegment.hpp"
+#include "HMMSegment.hpp"
+#include "limonp/StringUtil.hpp"
+#include "PosTagger.hpp"
+#define STOP_WORDS_USE_CEDAR_SEGMENT //使用cedar初步测试性能提升3%-5%左右，内存占用降低近不明显
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+#include "cedar/cedar.h"
+#endif
+
+namespace cppjieba {
+// Segmenter that first cuts with the dictionary-based MPSegment and then
+// re-cuts runs of single, non-user-dictionary characters with the HMMSegment.
+// The keyword-map overload of CutWithSentence additionally drops stop words
+// loaded from the file given to the constructor.
+class MixSegment: public SegmentTagged {
+public:
+    // dictTrie / model are forwarded to the MP and HMM segmenters;
+    // stopWordPath names the stop-word file (one word per line).
+    MixSegment(const DictTrie* dictTrie,
+               const HMMModel* model,
+               const string& stopWordPath)
+        : mpSeg_(dictTrie), hmmSeg_(model) {
+        LoadStopWordDict(stopWordPath);
+    }
+    ~MixSegment() {}
+
+    // Appends the word ranges of [begin, end) to `res`.
+    // With hmm == false only the dictionary cut is used.
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t) const override {
+        if (!hmm) {
+            mpSeg_.CutRuneArray(begin, end, res);
+            return;
+        }
+
+        vector<WordRange> words;
+        assert(end >= begin);
+        words.reserve(end - begin);
+        mpSeg_.CutRuneArray(begin, end, words);
+
+        vector<WordRange> hmmRes;
+        hmmRes.reserve(end - begin);
+
+        for (size_t i = 0; i < words.size(); i++) {
+            //if mp Get a word, it's ok, put it into result
+            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
+                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
+                res.push_back(words[i]);
+                continue;
+            }
+
+            // if mp Get a single one and it is not in userdict, collect it in sequence
+            size_t j = i;
+
+            while (j < words.size() && words[j].left == words[j].right &&
+                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+                j++;
+            }
+
+            // Cut the sequence with hmm
+            assert(j - 1 >= i);
+            // TODO
+            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
+
+            //put hmm result to result
+            for (size_t k = 0; k < hmmRes.size(); k++) {
+                res.push_back(hmmRes[k]);
+            }
+
+            //clear tmp vars
+            hmmRes.clear();
+
+            //let i jump over this piece
+            i = j - 1;
+        }
+    }
+
+    // Appends the words of [begin, end) to `res` as substrings of `s`.
+    // The hmm flag is ignored: HMM re-cutting is always applied.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        // HMM is currently always enabled; revisit if it ever has to be switched off --jxx20210519
+//        if (!hmm) {
+//            mpSeg_.CutRuneArray(begin, end, res);
+//            return;
+//        }
+        std::ignore = hmm;
+        vector<WordRange> words;
+        assert(end >= begin);
+        words.reserve(end - begin);
+        mpSeg_.CutRuneArray(begin, end, words);
+
+        vector<WordRange> hmmRes;
+        hmmRes.reserve(end - begin);
+
+        for (size_t i = 0; i < words.size(); i++) {
+            //if mp Get a word, it's ok, put it into result
+            if (words[i].left != words[i].right) {
+                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
+                continue;
+            }
+            if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
+                    || i == (words.size() - 1)) { // the last word is pushed directly
+                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
+                continue;
+            }
+
+            // if mp Get a single one and it is not in userdict, collect it in sequence
+            // Word i is a single character outside the user dict and not the last word, so start checking at j.
+            size_t j = i + 1;
+
+            while (j < (words.size() - 1) && words[j].left == words[j].right &&
+                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+                j++;
+            }
+
+            // Cut the sequence with hmm
+            assert(j - 1 >= i);
+            // TODO
+            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
+
+            //put hmm result to result
+            for (size_t k = 0; k < hmmRes.size(); k++) {
+                res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
+            }
+
+            //clear tmp vars
+            hmmRes.clear();
+
+            //let i jump over this piece
+            i = j - 1;
+        }
+    }
+
+    // Accumulates the words of [begin, end) into `res`: every occurrence adds
+    // 1.0 to the word's weight and records its byte offset. Stop words are
+    // skipped on the dictionary/HMM path. The dictionary cut is only run when
+    // the first rune is 3 or 4 bytes long; otherwise every rune is counted
+    // on its own. The hmm flag is ignored. An empty range adds nothing.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t) const override {
+        std::ignore = hmm;
+        vector<WordRange> words;
+        vector<WordRange> hmmRes;
+        assert(end >= begin);
+        if (begin == end) {
+            // Nothing to cut; also keeps begin->len below from reading past the range.
+            return;
+        }
+        if (3 == begin->len or 4 == begin->len) {
+            words.reserve(end - begin);
+            mpSeg_.CutRuneArray(begin, end, words);
+            hmmRes.reserve(words.size());
+        } else {
+            hmmRes.reserve(end - begin);
+        }
+
+        if (words.size() != 0) { // a dictionary segmentation result exists
+            for (size_t i = 0; i < words.size(); i++) {
+
+                string str = GetStringFromRunes(s, words[i].left, words[i].right);
+
+                if (words[i].left != words[i].right) {
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+                    if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
+                        continue;
+                    }
+#else
+                    if (stopWords_.find(str) != stopWords_.end()) {
+                        continue;
+                    }
+#endif
+                    res[str].offsets.push_back(words[i].left->offset);
+                    res[str].weight += 1.0;
+                    continue;
+                }
+
+                if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
+                        || i == (words.size() - 1)) { // the last word is recorded directly
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+                    if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
+                        continue;
+                    }
+#else
+                    if (stopWords_.find(str) != stopWords_.end()) {
+                        continue;
+                    }
+#endif
+                    res[str].offsets.push_back(words[i].left->offset);
+                    res[str].weight += 1.0;
+                    continue;
+                }
+                // if mp Get a single one and it is not in userdict, collect it in sequence
+                // Word i is a single character outside the user dict and not the last word, so start checking at j.
+                size_t j = i + 1;
+                bool isLastWordsSingle(false);
+                while (j <= (words.size() - 1)
+                       && words[j].left == words[j].right
+                       && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+                    if (j == (words.size() - 1)) { // the last segmentation result is a single character
+                        isLastWordsSingle = true;
+                        break;
+                    }
+                    j++;
+                }
+
+                // Cut the sequence with hmm
+                assert(j - 1 >= i);
+                // TODO
+                if (isLastWordsSingle) {
+                    hmmSeg_.CutRuneArray(words[i].left, words[j].left + 1, hmmRes);
+                } else {
+                    hmmSeg_.CutRuneArray(words[i].left, words[j].left, hmmRes);
+                }
+
+                //put hmm result to result
+                for (size_t k = 0; k < hmmRes.size(); k++) {
+                    string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+                    if (0 < stopWords_.exactMatchSearch<int>(hmmStr.c_str(), hmmStr.size())) {
+                        continue;
+                    }
+#else
+                    if (/*IsSingleWord(hmmStr) || */stopWords_.find(hmmStr) != stopWords_.end()) {
+                        continue;
+                    }
+#endif
+
+                    res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
+                    res[hmmStr].weight += 1.0;
+                }
+
+                //clear tmp vars
+                hmmRes.clear();
+
+                //let i jump over this piece
+                if (isLastWordsSingle) {
+                    break;
+                }
+                i = j - 1;
+            }
+        } else { // no dictionary segmentation result: count every rune on its own
+            for (size_t i = 0; i < (size_t)(end - begin); i++) {
+                string str = s.substr((begin+i)->offset, (begin+i)->len);
+                res[str].offsets.push_back((begin+i)->offset);
+                res[str].weight += 1.0;
+            }
+        }
+    }
+
+    // Returns the dictionary used by the underlying MP segmenter.
+    const DictTrie* GetDictTrie() const override {
+        return mpSeg_.GetDictTrie();
+    }
+
+    // Segments `src` and pairs every word with its tag via PosTagger.
+    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
+        return tagger_.Tag(src, res, *this);
+    }
+
+    // Returns the tag PosTagger assigns to the single word `str`.
+    string LookupTag(const string &str) const {
+        return tagger_.LookupTag(str, *this);
+    }
+
+    // Loads stop words from filePath, one word per line.
+    // A file that cannot be opened is reported and otherwise ignored, so the
+    // segmenter still works without stop-word filtering. Blank lines are
+    // skipped (a zero-length key is not a valid trie entry) and a trailing
+    // '\r' from CRLF files is stripped so the stored word can actually match.
+    void LoadStopWordDict(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        if(not ifs.is_open()){
+            XLOG(ERROR) << "open " << filePath << " failed";
+            return ;
+        }
+        string line ;
+
+        while (getline(ifs, line)) {
+            if (!line.empty() && line[line.size() - 1] == '\r') {
+                line.erase(line.size() - 1);
+            }
+            if (line.empty()) {
+                continue;
+            }
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+            stopWords_.update(line.c_str(), line.size(), 1);
+#else
+            stopWords_.insert(line);
+#endif
+        }
+
+        assert(stopWords_.size());
+    }
+private:
+#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
+    cedar::da<int, -1, -2, false> stopWords_;
+#else
+    unordered_set<string> stopWords_;
+#endif
+    MPSegment mpSeg_;
+    HMMSegment hmmSeg_;
+    PosTagger tagger_;
+
+}; // class MixSegment
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/PinYinTrie.hpp
+++ b/libchinese-segmentation/cppjieba/PinYinTrie.hpp
@ -0,0 +1,154 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "limonp/StringUtil.hpp"
+#include "limonp/Logging.hpp"
+#include "Unicode.hpp"
+#include "DatTrie.hpp"
+#include <QDebug>
+namespace cppjieba {
+
+using namespace limonp;
+
+const size_t PINYIN_COLUMN_NUM = 2;
+
+// Hanzi -> pinyin lookup table.
+// Entries whose dictionary line has exactly PINYIN_COLUMN_NUM columns are
+// stored in a DatTrie (backed by an on-disk cache); every other line is
+// treated as a multi-tone entry and kept in an in-memory QMap.
+class PinYinTrie {
+public:
+    enum UserWordWeightOption {
+        WordWeightMin,
+        WordWeightMedian,
+        WordWeightMax,
+    }; // enum UserWordWeightOption
+
+    // dict_path: pinyin dictionary file.
+    // dat_cache_path: where the trie cache is stored; when empty a file
+    // under /tmp derived from the dictionary MD5 is used.
+    PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
+             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+        Init(dict_path, dat_cache_path, user_word_weight_opt);
+    }
+
+    ~PinYinTrie() {}
+
+    // Appends all pinyin readings of the multi-tone `word` to `results`.
+    // Returns 0 on success, -1 when `word` is not a multi-tone entry.
+    int getMultiTonResults(string word, QStringList &results) {
+        const auto found = qmap_chinese2pinyin.constFind(QString::fromStdString(word));
+        if (found == qmap_chinese2pinyin.constEnd()) {
+            return -1;
+        }
+        for (const QString& reading : found.value()) {
+            results.push_back(reading);
+        }
+        return 0;
+    }
+
+    // Stores the pinyin of `word` from the trie in `result`.
+    // Returns 0 on success, -1 when the trie has no entry for `word`.
+    int getSingleTonResult(string word, QString &result) {
+        const PinYinMemElem * tmp = dat_.PinYinFind(word);
+        if (tmp) {
+            result = QString::fromStdString(tmp->GetTag());
+            return 0;
+        }
+        return -1;
+    }
+
+    // True when `word` is known either as a multi-tone entry or in the trie.
+    // (The previous test was negated and reported unknown words as contained.)
+    bool contains(string &word) {
+        return qmap_chinese2pinyin.contains(QString::fromStdString(word))
+               or dat_.PinYinFind(word) != nullptr;
+    }
+
+    // True when `word` has an entry in the multi-tone table.
+    bool isMultiTone(const string &word) {
+        return qmap_chinese2pinyin.contains(QString::fromStdString(word));
+    }
+
+    // Size reported by CalcFileListMD5 for the dictionary file(s).
+    size_t GetTotalDictSize() const {
+        return total_dict_size_;
+    }
+
+private:
+    // Attaches to an existing trie cache when it matches the dictionary MD5,
+    // otherwise builds the trie from the dictionary and writes the cache.
+    // The multi-tone table is always (re)loaded from the dictionary file.
+    void Init(const string& dict_path, string dat_cache_path,
+              UserWordWeightOption user_word_weight_opt) {
+        size_t file_size_sum = 0;
+        vector<PinYinElement> node_infos;
+        const string md5 = CalcFileListMD5(dict_path, file_size_sum);
+        total_dict_size_ = file_size_sum;
+
+        if (dat_cache_path.empty()) {
+            // No cache location given: default to the tmp directory --jxx20200519
+            // NOTE(review): predictable file name in a world-writable directory — confirm this is acceptable.
+            dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) +  ".dat_cache";
+        }
+        QString path = QString::fromStdString(dat_cache_path);
+        qDebug() << "#########PinYin path:" << path << file_size_sum;
+        if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
+            // Multi-tone entries are not in the cache; they still need a pass over the file.
+            LoadDefaultPinYin(node_infos, dict_path, true);
+            return;
+        }
+
+        LoadDefaultPinYin(node_infos, dict_path, false);
+        double min_weight = 0;
+        dat_.SetMinWeight(min_weight);
+
+        const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
+        if (!build_ret) {
+            // Keep the failure visible in builds where assert() is compiled out.
+            XLOG(ERROR) << "build pinyin dat cache " << dat_cache_path << " failed.";
+        }
+        assert(build_ret);
+        vector<PinYinElement>().swap(node_infos);
+    }
+
+    // Reads the dictionary at filePath line by line.
+    // Lines with exactly PINYIN_COLUMN_NUM space-separated columns are
+    // "pinyin word" pairs and are appended to node_infos unless multiFlag is
+    // set. Any other non-empty line is a multi-tone entry: its last column
+    // (trimmed) is the key and the preceding columns are the readings.
+    // A file that cannot be opened is reported and leaves everything untouched.
+    void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
+        ifstream ifs(filePath.c_str());
+        if(not ifs.is_open()){
+            XLOG(ERROR) << "open " << filePath << " failed.";
+            return ;
+        }
+        string line;
+        vector<string> buf;
+        size_t lineno = 0;
+
+        for (; getline(ifs, line); lineno++) {
+            if (line.empty()) {
+                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
+                continue;
+            }
+            Split(line, buf, " ");
+            if (buf.size() == PINYIN_COLUMN_NUM) {
+                if (multiFlag) { // single-tone entry, only multi-tone ones are wanted
+                    continue;
+                }
+                PinYinElement node_info;
+                node_info.word = buf[1];
+                node_info.tag = buf[0];
+                node_infos.push_back(node_info);
+            } else { // multi-tone entry
+                // Split once: the last column is the key, the rest are the readings.
+                QStringList columns = QString::fromUtf8(line.c_str()).split(" ");
+                const QString key = columns.last().trimmed();
+                columns.pop_back();
+                qmap_chinese2pinyin[key] = columns;
+            }
+        }
+    }
+
+private:
+    QMap<QString, QStringList> qmap_chinese2pinyin;
+    size_t total_dict_size_ = 0;
+    DatTrie dat_;
+};
+}
+
--- a/libchinese-segmentation/cppjieba/PosTagger.hpp
+++ b/libchinese-segmentation/cppjieba/PosTagger.hpp
@ -0,0 +1,84 @@
+#pragma once
+
+#include "limonp/StringUtil.hpp"
+#include "segment-trie/segment-trie.h"
+//#include "DictTrie.hpp"
+//#include "SegmentTagged.hpp"
+
+namespace cppjieba {
+using namespace limonp;
+
+static const char* const POS_M = "m";
+static const char* const POS_ENG = "eng";
+static const char* const POS_X = "x";
+
+// Assigns tags to words: the tag stored in the segmenter's dictionary when
+// there is one, otherwise a fallback derived from the word's characters
+// (POS_M, POS_ENG or POS_X).
+class PosTagger {
+public:
+    PosTagger() {
+    }
+    ~PosTagger() {
+    }
+
+    // Cuts `src` with `segment` and appends one (word, tag) pair per word to
+    // `res`. Returns whether `res` is non-empty afterwards.
+    bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
+        vector<string> pieces;
+        segment.CutToStr(src, pieces);
+
+        for (const string& piece : pieces) {
+            res.push_back(make_pair(piece, LookupTag(piece, segment)));
+        }
+
+        return !res.empty();
+    }
+
+    // Returns the dictionary tag of `str`, or the SpecialRule() fallback when
+    // the word is unknown or has an empty tag. POS_X is returned when `str`
+    // cannot be decoded.
+    string LookupTag(const string &str, const SegmentTagged& segment) const {
+        const DictTrie * dict = segment.GetDictTrie();
+        assert(dict != nullptr);
+        const auto entry = dict->Find(str);
+
+        if (entry != nullptr && !entry->GetTag().empty()) {
+            return entry->GetTag();
+        }
+
+        RuneStrArray runes;
+        if (DecodeRunesInString(str, runes)) {
+            return SpecialRule(runes);
+        }
+
+        XLOG(ERROR) << "Decode failed.";
+        return POS_X;
+    }
+
+private:
+    // Fallback tagging from the characters alone. Counts ASCII runes (and how
+    // many of them are digits); counting stops once the ASCII count reaches
+    // half of the word length.
+    const char* SpecialRule(const RuneStrArray& unicode) const {
+        size_t digits = 0;
+        size_t ascii = 0;
+        const size_t total = unicode.size();
+
+        size_t pos = 0;
+        while (pos < total && ascii < total / 2) {
+            const Rune current = unicode[pos].rune;
+            pos++;
+            if (!(current < 0x80)) {
+                continue;
+            }
+            ascii++;
+            if ('0' <= current && current <= '9') {
+                digits++;
+            }
+        }
+
+        if (ascii == 0) {
+            // ascii char is not found
+            return POS_X;
+        }
+
+        // every counted ascii char is a digit -> number, otherwise english
+        return (digits == ascii) ? POS_M : POS_ENG;
+    }
+
+}; // class PosTagger
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/PreFilter.hpp
+++ b/libchinese-segmentation/cppjieba/PreFilter.hpp
@ -0,0 +1,127 @@
+#pragma once
+
+#include "limonp/Logging.hpp"
+#include <unordered_set>
+#include "Unicode.hpp"
+
+namespace cppjieba {
+
+// Splits a UTF-8 sentence into successive rune ranges, using the space
+// character (0x20) as separator. The sentence is decoded once in the
+// constructor; each Next() variant advances an internal cursor.
+class PreFilter {
+public:
+    // Decodes `sentence` into runes; a decode failure is logged. The cursor
+    // starts at the first decoded rune. `symbols` is kept by reference.
+    PreFilter(const std::unordered_set<Rune>& symbols,
+              const string& sentence)
+        : symbols_(symbols) {
+        if (!DecodeRunesInString(sentence, sentence_)) {
+            XLOG(ERROR) << "decode failed. "<<sentence;
+        }
+
+        cursor_ = sentence_.begin();
+    }
+    ~PreFilter() {
+    }
+    // True while the cursor has not reached the end of the sentence.
+    bool HasNext() const {
+        return cursor_ != sentence_.end();
+    }
+    // Stores the next range in wordRange: it starts at the cursor (leading
+    // spaces included) and ends at the space following the next run of
+    // non-space runes, or at the end of the sentence.
+    // Returns false when the cursor is already at the end.
+    bool Next(WordRange& wordRange) {
+
+        if (cursor_ == sentence_.end()) {
+            return false;
+        }
+
+        wordRange.left = cursor_;
+
+        // Check for the end BEFORE dereferencing the cursor.
+        while (cursor_ != sentence_.end() && cursor_->rune == 0x20) {
+            cursor_++;
+        }
+
+        if (cursor_ == sentence_.end()) {
+            wordRange.right = cursor_;
+            return true;
+        }
+
+        while (++cursor_ != sentence_.end()) {
+            if (cursor_->rune == 0x20) {
+                wordRange.right = cursor_;
+                return true;
+            }
+        }
+
+        wordRange.right = sentence_.end();
+        return true;
+    }
+
+    // Stores the next range in wordRange and reports via isNull whether it
+    // consists of spaces only.
+    // A run of spaces followed by text is returned as one range with
+    // isNull == true; a run of spaces reaching the end of the sentence makes
+    // the call return false. A text range ends at the next space, after 1024
+    // runes, or where the UTF-8 byte length of the runes changes.
+    // Returns false when there is nothing left to return.
+    bool Next(WordRange& wordRange, bool& isNull) {
+        isNull = false;
+        if (cursor_ == sentence_.end()) {
+            return false;
+        }
+
+        wordRange.left = cursor_;
+        if (cursor_->rune == 0x20) {
+            while (cursor_ != sentence_.end()) {
+                if (cursor_->rune != 0x20) {
+                    if (wordRange.left == cursor_) {
+                        cursor_ ++;
+                    }
+                    wordRange.right = cursor_;
+                    isNull = true;
+                    return true;
+                }
+                cursor_ ++;
+            }
+            return false;
+        }
+
+        int max_num = 0;
+        uint32_t utf8_num = cursor_->len;
+
+        while (cursor_ != sentence_.end()) {
+            if (cursor_->rune == 0x20) {
+                if (wordRange.left == cursor_) {
+                    cursor_ ++;
+                }
+
+                wordRange.right = cursor_;
+                return true;
+            }
+
+            cursor_ ++;
+            max_num++;
+            // TODO: avoid handing over too much text at once; tentatively limited to 1024 runes.
+            // The cursor may have just reached the end, so it must not be dereferenced unchecked.
+            if (max_num >= 1024 or (cursor_ != sentence_.end() and cursor_->len != utf8_num)) {
+                wordRange.right = cursor_;
+                return true;
+            }
+        }
+
+        wordRange.right = sentence_.end();
+        return true;
+    }
+
+    // Returns the range from the cursor up to the next space or the end of
+    // the sentence. A space at the cursor is returned as a one-rune range.
+    WordRange Next() {
+        WordRange range(cursor_, cursor_);
+
+        while (cursor_ != sentence_.end()) {
+            //if (IsIn(symbols_, cursor_->rune)) {
+            if (cursor_->rune == 0x20) {
+                if (range.left == cursor_) {
+                    cursor_ ++;
+                }
+
+                range.right = cursor_;
+                return range;
+            }
+
+            cursor_ ++;
+        }
+
+        range.right = sentence_.end();
+        return range;
+    }
+private:
+    RuneStrArray::const_iterator cursor_;
+    RuneStrArray sentence_;
+    const std::unordered_set<Rune>& symbols_;
+}; // class PreFilter
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/QuerySegment.hpp
+++ b/libchinese-segmentation/cppjieba/QuerySegment.hpp
@ -0,0 +1,89 @@
+#pragma once
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "SegmentBase.hpp"
+#include "FullSegment.hpp"
+#include "MixSegment.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+// Search-oriented segmenter: cuts with MixSegment and, for longer words,
+// additionally emits the two- and three-rune sub-words that exist in the
+// dictionary. The sentence-based CutWithSentence overloads are empty here.
+class QuerySegment: public SegmentBase {
+public:
+    QuerySegment(const DictTrie* dictTrie,
+                 const HMMModel* model,
+                 const string& stopWordPath)
+        : mixSeg_(dictTrie, model, stopWordPath), trie_(dictTrie) {
+    }
+    ~QuerySegment() {
+    }
+
+    // For every word of the mixed cut appends, in this order: its known
+    // two-rune sub-words (words longer than 2), its known three-rune
+    // sub-words (words longer than 3), then the word itself.
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t) const override {
+        //use mix Cut first
+        vector<WordRange> mixed;
+        mixSeg_.CutRuneArray(begin, end, mixed, hmm);
+
+        for (const WordRange& word : mixed) {
+            const auto length = word.Length();
+
+            // Appends each `width`-rune window of `word` that the dictionary knows.
+            const auto emitKnown = [&](size_t width) {
+                if (!(length > width)) {
+                    return;
+                }
+                for (size_t start = 0; start + width - 1 < length; start++) {
+                    const string text = EncodeRunesToString(word.left + start, word.left + start + width);
+
+                    if (trie_->Find(text) != nullptr) {
+                        res.push_back(WordRange(word.left + start, word.left + start + width - 1));
+                    }
+                }
+            };
+
+            emitKnown(2);
+            emitKnown(3);
+            res.push_back(word);
+        }
+    }
+
+    // No-op: sentence-based cutting into strings is not provided here.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+    // No-op: sentence-based cutting into a keyword map is not provided here.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t) const override {
+        (void)s;
+        (void)begin;
+        (void)end;
+        (void)res;
+        (void)hmm;
+    }
+private:
+    // True when every rune of `s` is below 0x80.
+    bool IsAllAscii(const RuneArray& s) const {
+        size_t idx = 0;
+        while (idx < s.size()) {
+            if (s[idx] >= 0x80) {
+                return false;
+            }
+            idx++;
+        }
+
+        return true;
+    }
+    MixSegment mixSeg_;
+    const DictTrie* trie_;
+}; // QuerySegment
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/SegmentBase.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentBase.hpp
@ -0,0 +1,94 @@
+#pragma once
+
+#include "limonp/Logging.hpp"
+#include "PreFilter.hpp"
+#include <cassert>
+
+
+namespace cppjieba {
+
+const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
+
+using namespace limonp;
+
+// Common base of all segmenters. Derived classes implement Cut() and the two
+// CutWithSentence() overloads; this class offers the sentence-level entry
+// points that pre-split the input with PreFilter, and owns the separator set.
+class SegmentBase {
+public:
+    // Installs SPECIAL_SEPARATORS as the separator set.
+    SegmentBase() {
+        XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
+    }
+    virtual ~SegmentBase() { }
+
+    // Cuts the rune range [begin, end) into word ranges appended to `res`.
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t max_word_len) const = 0;
+    // Sentence-based cut methods, added to avoid storing intermediate
+    // results and converting between formats --jxx20210517
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t max_word_len) const = 0;
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
+                     size_t max_word_len) const = 0;
+    // Cuts the whole sentence into word strings. `words` is cleared first.
+    // CutToStr was rewritten to produce vector<string> directly and lower
+    // memory usage --jxx20210517
+    void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
+                  size_t max_word_len = MAX_WORD_LENGTH) const {
+        PreFilter pre_filter(symbols_, sentence);
+        words.clear();
+        words.reserve(sentence.size() / 2); // TODO: value taken from the upstream source; still to be tuned
+        // Value-initialise the placeholder: copying a default-initialised
+        // (indeterminate) iterator into WordRange is undefined behaviour.
+        RuneStrArray::const_iterator null_p = RuneStrArray::const_iterator();
+        WordRange range(null_p, null_p);
+        while (pre_filter.Next(range)) {
+            CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+        }
+    }
+    // Cuts one already-split range of `sentence` into word strings (appended to `words`).
+    void CutToStr(const string& sentence, WordRange range, vector<string>& words, bool hmm = true,
+                  size_t max_word_len = MAX_WORD_LENGTH) const {
+        CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+    }
+    // Cuts one already-split range of `sentence` into the keyword map `words`.
+    void CutToStr(const string& sentence, WordRange range, unordered_map<string, KeyWord>& words, bool hmm = true,
+                  size_t max_word_len = MAX_WORD_LENGTH) const {
+        CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+    }
+    // Cuts the whole sentence into Word objects. `words` is cleared first.
+    void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
+                   size_t max_word_len = MAX_WORD_LENGTH) const {
+        PreFilter pre_filter(symbols_, sentence);
+        vector<WordRange> wrs;
+        wrs.reserve(sentence.size() / 2);
+
+        while (pre_filter.HasNext()) {
+            auto range = pre_filter.Next();
+            Cut(range.left, range.right, wrs, hmm, max_word_len);
+        }
+
+        words.clear();
+        words.reserve(wrs.size());
+        GetWordsFromWordRanges(sentence, wrs, words);
+        wrs.clear();
+        vector<WordRange>().swap(wrs);
+    }
+
+    // Non-virtual convenience wrapper around Cut() with default arguments.
+    void CutRuneArray(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res,
+                      bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const {
+        Cut(begin, end, res, hmm, max_word_len);
+    }
+
+    // Replaces the separator set with the runes of `s`.
+    // Returns false (after logging) when `s` cannot be decoded or contains
+    // the same rune twice; in the latter case the runes inserted before the
+    // duplicate remain in the set.
+    bool ResetSeparators(const string& s) {
+        symbols_.clear();
+        RuneStrArray runes;
+
+        if (!DecodeRunesInString(s, runes)) {
+            XLOG(ERROR) << "decode " << s << " failed";
+            return false;
+        }
+
+        for (size_t i = 0; i < runes.size(); i++) {
+            if (!symbols_.insert(runes[i].rune).second) {
+                XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
+                return false;
+            }
+        }
+
+        return true;
+    }
+protected:
+    unordered_set<Rune> symbols_;
+}; // class SegmentBase
+
+} // cppjieba
+
--- a/libchinese-segmentation/cppjieba/SegmentTagged.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentTagged.hpp
@ -0,0 +1,21 @@
+#pragma once
+
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+
+// Abstract base for segmenters that can also attach a tag to every word.
+class SegmentTagged : public SegmentBase {
+public:
+    SegmentTagged() = default;
+    virtual ~SegmentTagged() = default;
+
+    // Segments `src` and appends (word, tag) pairs to `res`.
+    virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
+
+    // Gives access to the dictionary trie used by the concrete segmenter.
+    virtual const DictTrie* GetDictTrie() const = 0;
+
+}; // class SegmentTagged
+
+} // cppjieba
+
--- a/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
@ -0,0 +1,205 @@
+
+#include <cmath>
+#include "Jieba.hpp"
+
+namespace cppjieba {
+using namespace limonp;
+using namespace std;
+
+// Keyword extraction based on a TextRank-style ranking of a word
+// co-occurrence graph built from the segmented sentence.
+class TextRankExtractor {
+public:
+    // A keyword candidate: the word text, the byte offset of every occurrence
+    // in the analysed sentence, and its weight after ranking.
+    typedef struct _Word {
+        string word;
+        vector<size_t> offsets;
+        double weight;
+    }    Word; // struct Word
+private:
+    typedef std::map<string, Word> WordMap;
+
+    // Undirected weighted graph over words plus the iterative ranking pass.
+    class WordGraph {
+    private:
+        typedef double Score;
+        typedef string Node;
+        typedef std::set<Node> NodeSet;
+
+        typedef std::map<Node, double> Edges;
+        typedef std::map<Node, Edges> Graph;
+        typedef std::map<Node, Score> ScoreMap;
+
+        double d;        // damping factor applied in rank()
+        Graph graph;     // node -> (neighbour -> accumulated edge weight)
+        NodeSet nodeSet; // every node that takes part in at least one edge
+    public:
+        WordGraph(): d(0.85) {}
+        WordGraph(double in_d): d(in_d) {}
+
+        // Adds `weight` to the edge between `start` and `end` in both directions.
+        void addEdge(const Node& start, const Node& end, double weight) {
+            nodeSet.insert(start);
+            nodeSet.insert(end);
+            graph[start][end] += weight;
+            graph[end][start] += weight;
+        }
+
+        // Runs `rankTime` ranking iterations and stores the normalised score of
+        // every graph node in `ws` (entries are created on demand). Entries of
+        // `ws` that are not graph nodes keep their weight for the iterations
+        // but are still included in the final normalisation.
+        void rank(WordMap &ws, size_t rankTime = 10) {
+            if (graph.empty()) {
+                return;
+            }
+
+            const Score wsdef = 1.0 / graph.size();
+            ScoreMap outSum; // node -> sum of the weights of its edges
+
+            for (Graph::const_iterator edges = graph.begin(); edges != graph.end(); ++edges) {
+                // edges->first is the start node; each edge->first is an end
+                // node and edge->second the weight of that edge.
+                Word& entry = ws[edges->first];
+                entry.word = edges->first;
+                entry.weight = wsdef;
+
+                Score total = 0;
+
+                for (Edges::const_iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
+                    total += edge->second;
+                }
+
+                outSum[edges->first] = total;
+            }
+
+            for (size_t i = 0; i < rankTime; i++) {
+                for (NodeSet::const_iterator node = nodeSet.begin(); node != nodeSet.end(); ++node) {
+                    // Look the adjacency list up once per node instead of on
+                    // every step of the inner loop.
+                    const Edges& neighbours = graph[*node];
+                    Score s = 0;
+
+                    for (Edges::const_iterator edge = neighbours.begin(); edge != neighbours.end(); ++edge) {
+                        // edge->first is the neighbour, edge->second the edge weight.
+                        s += edge->second / outSum[edge->first] * ws[edge->first].weight;
+                    }
+
+                    ws[*node].weight = (1 - d) + d * s;
+                }
+            }
+
+            Score min_rank = ws.begin()->second.weight;
+            Score max_rank = min_rank;
+
+            for (WordMap::const_iterator it = ws.begin(); it != ws.end(); ++it) {
+                if (it->second.weight < min_rank) {
+                    min_rank = it->second.weight;
+                }
+
+                if (it->second.weight > max_rank) {
+                    max_rank = it->second.weight;
+                }
+            }
+
+            // Rescale every weight against the observed minimum and maximum.
+            for (WordMap::iterator it = ws.begin(); it != ws.end(); ++it) {
+                it->second.weight = (it->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
+            }
+        }
+    };
+
+public:
+    // Builds the extractor on top of an existing dictionary and HMM model and
+    // loads the stop-word list from `stopWordPath`.
+    TextRankExtractor(const DictTrie* dictTrie,
+                      const HMMModel* model,
+                      const string& stopWordPath)
+        : segment_(dictTrie, model) {
+        LoadStopWordDict(stopWordPath);
+    }
+    // Same as above, taking the dictionary and model from a Jieba instance.
+    TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
+        LoadStopWordDict(stopWordPath);
+    }
+    ~TextRankExtractor() {
+    }
+
+    // Appends the text of the top `topN` keywords of `sentence` to `keywords`.
+    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+        vector<Word> topWords;
+        Extract(sentence, topWords, topN);
+
+        for (size_t i = 0; i < topWords.size(); i++) {
+            keywords.push_back(topWords[i].word);
+        }
+    }
+
+    // Appends (word, weight) pairs of the top `topN` keywords to `keywords`.
+    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+        vector<Word> topWords;
+        Extract(sentence, topWords, topN);
+
+        for (size_t i = 0; i < topWords.size(); i++) {
+            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+        }
+    }
+
+    // Segments `sentence`, links every kept word to the kept words that follow
+    // it inside a window of `span` positions (skipped words extend the window),
+    // ranks the graph for `rankTime` iterations and stores the `topN` heaviest
+    // words in `keywords`, heaviest first. Single-character words and stop
+    // words are skipped. `keywords` is left untouched when the segmented words
+    // do not add up to the length of the sentence.
+    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
+        vector<string> words;
+        segment_.CutToStr(sentence, words);
+
+        TextRankExtractor::WordGraph graph;
+        WordMap wordmap;
+        size_t offset = 0; // byte offset of words[i] inside the sentence
+
+        for (size_t i = 0; i < words.size(); i++) {
+            const size_t t = offset;
+            offset += words[i].size();
+
+            if (IsIgnored(words[i])) {
+                continue;
+            }
+
+            for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
+                if (IsIgnored(words[j])) {
+                    skip++;
+                    continue;
+                }
+
+                graph.addEdge(words[i], words[j], 1);
+            }
+
+            wordmap[words[i]].offsets.push_back(t);
+        }
+
+        if (offset != sentence.size()) {
+            XLOG(ERROR) << "words illegal";
+            return;
+        }
+
+        graph.rank(wordmap, rankTime);
+
+        keywords.clear();
+        keywords.reserve(wordmap.size());
+
+        for (WordMap::const_iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+            keywords.push_back(itr->second);
+        }
+
+        topN = min(topN, keywords.size());
+        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+        keywords.resize(topN);
+    }
+private:
+    // True for words that never become graph nodes: single-character words
+    // and entries of the stop-word list.
+    bool IsIgnored(const string& word) const {
+        return IsSingleWord(word) || stopWords_.find(word) != stopWords_.end();
+    }
+
+    // Reads one stop word per line from `filePath` into stopWords_.
+    void LoadStopWordDict(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+        string line ;
+
+        while (getline(ifs, line)) {
+            stopWords_.insert(line);
+        }
+
+        assert(stopWords_.size());
+    }
+
+    // Orders words by descending weight.
+    static bool Compare(const Word &x, const Word &y) {
+        return x.weight > y.weight;
+    }
+
+    MixSegment segment_;
+    unordered_set<string> stopWords_;
+}; // class TextRankExtractor
+
+// Prints a keyword as a JSON-like object with its word, offsets and weight.
+inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
+    os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets;
+    os << ", \"weight\": " << word.weight << "}";
+    return os;
+}
+} // namespace cppjieba
+
+
+
--- a/libchinese-segmentation/cppjieba/Unicode.hpp
+++ b/libchinese-segmentation/cppjieba/Unicode.hpp
@ -0,0 +1,264 @@
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "limonp/LocalVector.hpp"
+#include "limonp/StringUtil.hpp"
+#include "common-struct.h"
+
+namespace cppjieba {
+
+using std::string;
+using std::vector;
+
+typedef uint32_t Rune;
+
+// Prints a Word as a JSON-like object with its text and offset.
+inline std::ostream& operator << (std::ostream& os, const Word& w) {
+    os << "{\"word\": \"" << w.word;
+    os << "\", \"offset\": " << w.offset << "}";
+    return os;
+}
+
+// Fixed-size record holding a weight and a short, NUL-terminated tag.
+struct DatMemElem {
+    double weight = 0.0;
+    char tag[8] = {};
+
+    // Stores at most sizeof(tag) - 1 characters of `str`; the rest of the
+    // buffer is zeroed, so the tag is always NUL-terminated.
+    void SetTag(const string & str) {
+        const size_t copy_len = std::min(str.size(), sizeof(tag) - 1);
+        memset(tag, 0, sizeof(tag));
+        strncpy(tag, str.c_str(), copy_len);
+    }
+
+    // Returns the stored tag as a string.
+    string GetTag() const {
+        return string(tag);
+    }
+};
+
+// One node of the segmentation DAG: the list of candidate continuations,
+// each stored as a (size_t, element pointer) pair.
+// NOTE(review): the size_t presumably is the index of the next rune — confirm
+// against the code that fills `nexts`.
+struct DatDag {
+    // std::pair is spelled out because only std::string and std::vector are
+    // imported into this namespace by this header.
+    limonp::LocalVector<std::pair<size_t, const DatMemElem *> > nexts;
+    //double max_weight;
+    //size_t max_next;
+};
+
+// A decoded code point together with its position in the source string.
+struct RuneInfo {
+    Rune rune = 0;               // decoded code point
+    uint32_t offset = 0;         // byte offset of the encoded rune
+    uint32_t len = 0;            // number of bytes of the encoded rune
+    uint32_t unicode_offset = 0;
+    uint32_t unicode_length = 0;
+
+    RuneInfo() {
+    }
+    RuneInfo(Rune r, uint32_t o, uint32_t l)
+        : rune(r), offset(o), len(l) {
+    }
+    RuneInfo(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
+        : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+    }
+}; // struct RuneInfo
+
+// Prints a RuneInfo as a JSON-like object with its rune, byte offset and length.
+inline std::ostream& operator << (std::ostream& os, const RuneInfo& r) {
+    os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset;
+    os << ", \"len\": " << r.len << "}";
+    return os;
+}
+
+typedef limonp::LocalVector<Rune> RuneArray;
+typedef limonp::LocalVector<struct RuneInfo> RuneStrArray;
+
+// [left, right]
+// A pair of rune iterators delimiting one word; IsAllAscii() treats `right`
+// as part of the range.
+struct WordRange {
+    RuneStrArray::const_iterator left;
+    RuneStrArray::const_iterator right;
+
+    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
+        : left(l), right(r) {
+    }
+
+    // Distance between the two iterators.
+    // NOTE(review): for a closed range this is one less than the number of
+    // runes covered — confirm that callers expect this.
+    size_t Length() const {
+        return right - left;
+    }
+
+    // True when every rune from `left` to `right` (inclusive) is below 0x80.
+    bool IsAllAscii() const {
+        RuneStrArray::const_iterator cursor = left;
+
+        while (cursor <= right) {
+            if (cursor->rune >= 0x80) {
+                return false;
+            }
+
+            ++cursor;
+        }
+
+        return true;
+    }
+}; // struct WordRange
+
+
+// Decodes the UTF-8 string `s` into `arr`, replacing its previous contents.
+// Returns whatever limonp::Utf8ToUnicode32 reports.
+inline bool DecodeRunesInString(const string& s, RuneArray& arr) {
+    arr.clear();
+    const bool decoded = limonp::Utf8ToUnicode32(s, arr);
+    return decoded;
+}
+
+// Convenience overload returning the decoded runes by value; the success flag
+// of the decoding step is ignored.
+inline RuneArray DecodeRunesInString(const string& s) {
+    RuneArray decoded;
+    (void)DecodeRunesInString(s, decoded);
+    return decoded;
+}
+
+// Decodes the UTF-8 string `s` into `runes`, recording for every code point
+// its byte offset, byte length and its index counted in code points.
+//
+// Returns false when a lead byte is above 0xf7 or when a multi-byte sequence
+// is cut off by the end of the string; `runes` then holds the runes decoded
+// before the error. Continuation bytes are not validated.
+//
+// The unicode offset passed to RuneInfo is the running rune index. It used to
+// be the byte index just past the rune, which made the unicode span computed
+// by GetWordFromRunes() wrong for multi-byte text.
+inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
+    runes.clear();
+
+    const size_t size = s.size();
+    uint32_t offset = 0;      // byte offset of the rune being decoded
+    uint32_t rune_index = 0;  // index of the rune being decoded
+
+    for (size_t i = 0; i < size;) {
+        const uint8_t lead = static_cast<uint8_t>(s[i]);
+        uint32_t len = 0;
+        uint32_t tmp = 0;
+
+        if (lead < 0x80) {          // 0xxxxxxx
+            len = 1;
+            tmp = lead & 0x7f;
+        } else if (lead <= 0xdf) {  // 110xxxxx
+            len = 2;
+            tmp = lead & 0x1f;
+        } else if (lead <= 0xef) {  // 1110xxxx
+            len = 3;
+            tmp = lead & 0x0f;
+        } else if (lead <= 0xf7) {  // 11110xxx
+            len = 4;
+            tmp = lead & 0x07;
+        } else {
+            return false;
+        }
+
+        // The whole sequence has to fit into the remaining bytes.
+        if (size - i < len) {
+            return false;
+        }
+
+        // Every continuation byte contributes its low six bits.
+        for (uint32_t k = 1; k < len; ++k) {
+            tmp <<= 6;
+            tmp |= static_cast<uint8_t>(s[i + k]) & 0x3f;
+        }
+
+        runes.push_back(RuneInfo(tmp, offset, len, rune_index, 1));
+        i += len;
+        offset += len;
+        ++rune_index;
+    }
+
+    return true;
+}
+
+// Minimal iterator adaptor over RuneInfo pointers that yields the rune value
+// on dereference; used by the EncodeRunesToString() helpers.
+class RunePtrWrapper {
+public:
+    const RuneInfo * m_ptr = nullptr;
+
+public:
+    explicit RunePtrWrapper(const RuneInfo * p) : m_ptr(p) {}
+
+    // Rune stored in the element currently pointed at.
+    uint32_t operator *() {
+        return m_ptr->rune;
+    }
+
+    // Advances to the next element.
+    // NOTE(review): unlike a conventional postfix ++ this returns the already
+    // advanced position.
+    RunePtrWrapper operator ++(int) {
+        ++m_ptr;
+        return *this;
+    }
+
+    bool operator !=(const RunePtrWrapper & b) const {
+        return !(m_ptr == b.m_ptr);
+    }
+};
+
+// Encodes the runes in [begin, end) as UTF-8 and returns the resulting string.
+inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) {
+    RunePtrWrapper first(begin);
+    RunePtrWrapper last(end);
+    string encoded;
+    limonp::Unicode32ToUtf8(first, last, encoded);
+    return encoded;
+}
+
+// Encodes the runes in [begin, end) as UTF-8 into the caller-supplied `str`.
+inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
+    RunePtrWrapper first(begin);
+    RunePtrWrapper last(end);
+    limonp::Unicode32ToUtf8(first, last, str);
+}
+
+// Container look-alike that only counts the values pushed into it; it lets a
+// decoder count code points without storing them.
+class Unicode32Counter {
+public :
+    size_t length = 0;
+
+    // Resets the counter.
+    void clear() {
+        length = 0;
+    }
+
+    // Counts one more code point; the value itself is discarded.
+    void push_back(uint32_t) {
+        length += 1;
+    }
+};
+
+// Number of code points in the first `length` bytes of `str`, or 0 when
+// limonp::Utf8ToUnicode32 rejects the input.
+inline size_t Utf8CharNum(const char * str, size_t length) {
+    Unicode32Counter counter;
+    const bool decoded = limonp::Utf8ToUnicode32(str, length, counter);
+    return decoded ? counter.length : 0;
+}
+
+// Number of code points in `str`; 0 when the string cannot be decoded.
+inline size_t Utf8CharNum(const string & str) {
+    const char * const bytes = str.data();
+    return Utf8CharNum(bytes, str.size());
+}
+
+// True when `str` decodes to exactly one code point.
+inline bool IsSingleWord(const string& str) {
+    const size_t char_count = Utf8CharNum(str);
+    return char_count == 1;
+}
+
+
+// [left, right]
+// Builds a Word for the runes from `left` to `right` (inclusive): the covered
+// bytes of `s`, the byte offset of `left`, and the unicode offset and span
+// derived from the runes' unicode_offset / unicode_length fields.
+inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+    assert(right->offset >= left->offset);
+    const uint32_t byte_len = right->offset - left->offset + right->len;
+    const uint32_t unicode_span = right->unicode_offset - left->unicode_offset + right->unicode_length;
+    return Word(s.substr(left->offset, byte_len), left->offset, left->unicode_offset, unicode_span);
+}
+
+// Returns the bytes of `s` covered by the runes from `left` to `right`
+// (inclusive).
+inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+    assert(right->offset >= left->offset);
+    const uint32_t first_byte = left->offset;
+    const uint32_t byte_len = right->offset - left->offset + right->len;
+    return s.substr(first_byte, byte_len);
+}
+
+// Appends one Word per range in `wrs` to `words`, in order.
+inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
+    for (vector<WordRange>::const_iterator it = wrs.begin(); it != wrs.end(); ++it) {
+        words.push_back(GetWordFromRunes(s, it->left, it->right));
+    }
+}
+
+// Appends the text of every range in `wrs` to `words`, in order.
+inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<string>& words) {
+    for (vector<WordRange>::const_iterator it = wrs.begin(); it != wrs.end(); ++it) {
+        words.push_back(GetStringFromRunes(s, it->left, it->right));
+    }
+}
+
+// Makes `strs` the same length as `words` and copies every word's text into
+// the matching slot.
+inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
+    const size_t count = words.size();
+    strs.resize(count);
+
+    size_t slot = 0;
+
+    while (slot < count) {
+        strs[slot] = words[slot].word;
+        ++slot;
+    }
+}
+
+// Default for the max_word_len parameter of the segmentation entry points.
+// NOTE(review): presumably counted in runes rather than bytes — confirm
+// against the Cut() implementations.
+const size_t MAX_WORD_LENGTH = 512;
+
+} // namespace cppjieba
+
--- a/libchinese-segmentation/cppjieba/cppjieba.pri
+++ b/libchinese-segmentation/cppjieba/cppjieba.pri
@ -0,0 +1,43 @@
+# qmake include file for the bundled cppjieba sources.
+# Makes this directory an include root for the project that includes it.
+INCLUDEPATH += $$PWD
+
+# Header-only segmentation components plus the two trie front ends.
+HEADERS += \
+    $$PWD/DictTrie.hpp \
+    $$PWD/IdfTrie.hpp \
+    $$PWD/PinYinTrie.hpp \
+    $$PWD/FullSegment.hpp \
+    $$PWD/HMMModel.hpp \
+    $$PWD/HMMSegment.hpp \
+    $$PWD/Jieba.hpp \
+    $$PWD/KeywordExtractor.hpp \
+    $$PWD/MPSegment.hpp \
+    $$PWD/MixSegment.hpp \
+    $$PWD/PosTagger.hpp \
+    $$PWD/PreFilter.hpp \
+    $$PWD/QuerySegment.hpp \
+    $$PWD/SegmentBase.hpp \
+    $$PWD/SegmentTagged.hpp \
+    $$PWD/TextRankExtractor.hpp \
+#    $$PWD/Trie.hpp \
+    $$PWD/Unicode.hpp \
+    $$PWD/DatTrie.hpp \
+    $$PWD/idf-trie/idf-trie.h \
+    $$PWD/segment-trie/segment-trie.h
+
+# Dictionary data files shipped with the sources.
+# NOTE(review): these paths are not prefixed with $$PWD, unlike the lists
+# above — confirm they resolve against the intended directory.
+DISTFILES += \
+    dict/README.md \
+    dict/hmm_model.utf8 \
+    dict/idf.utf8 \
+    dict/jieba.dict.utf8 \
+    dict/pos_dict/char_state_tab.utf8 \
+    dict/pos_dict/prob_emit.utf8 \
+    dict/pos_dict/prob_start.utf8 \
+    dict/pos_dict/prob_trans.utf8 \
+    dict/stop_words.utf8 \
+    dict/user.dict.utf8
+    #dict/pinyinWithoutTone.txt \
+
+# Pull in the limonp utility headers.
+include(limonp/limonp.pri)
+
+# Translation units that need to be compiled.
+SOURCES += \
+    $$PWD/idf-trie/idf-trie.cpp \
+    $$PWD/segment-trie/segment-trie.cpp
--- a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp
+++ b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp
@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#include "idf-trie.h"
+
+// Builds the trie from several source files, using `dat_cache_path` for the
+// cache, and runs the base-class initialisation.
+IdfTrie::IdfTrie(const vector<string> file_paths, string dat_cache_path)
+    : StorageBase<double, false, IdfCacheFileHeader>(file_paths, dat_cache_path)
+{
+    Init();
+}
+
+// Single-file convenience constructor: wraps `file_path` in a one-element
+// list and otherwise behaves like the multi-file constructor.
+IdfTrie::IdfTrie(string file_path, string dat_cache_path)
+    : StorageBase<double, false, IdfCacheFileHeader>(vector<string>(1, file_path), dat_cache_path)
+{
+    Init();
+}
+
+// Builds the IDF cache file `dat_cache_file` from the dictionary at
+// IDF_DICT_PATH.
+//
+// The data is written to a temporary file next to the target and handed to
+// tryRename() at the end. File layout: the cache header, one double per
+// dictionary entry, then the serialised trie array. The header fields that
+// are only known after reading the dictionary (element count, payload size,
+// trie size, average IDF) are patched in afterwards, right behind the md5.
+//
+// `md5` is copied into the header and is expected to be exactly as long as
+// the header's md5 field.
+//
+// NOTE(review): the dictionary is always read from IDF_DICT_PATH; the file
+// paths given to the constructor are not consulted here — confirm intended.
+void IdfTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
+{
+    IdfCacheFileHeader header;
+    assert(sizeof(header.md5_hex) == md5.size());
+    // Never copy more than the header field can hold, even when assertions
+    // are compiled out.
+    const size_t md5_len = md5.size() < sizeof(header.md5_hex) ? md5.size() : sizeof(header.md5_hex);
+    memcpy(&header.md5_hex[0], md5.c_str(), md5_len);
+
+    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
+    double idf_sum(0), idf_average(0), tmp(0);
+    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+    // Remember the caller's umask so it can be restored before returning;
+    // changing it permanently would affect every file the process creates.
+    const mode_t old_mask = umask(S_IWGRP | S_IWOTH);
+    const int fd = mkstemp(&tmp_filepath[0]);
+    assert(fd >= 0);
+
+    if (fd < 0) {
+        // No temporary file could be created: leave the cache untouched.
+        umask(old_mask);
+        return;
+    }
+
+    fchmod(fd, 0644);
+
+    write_bytes = write(fd, (const char *)&header, sizeof(IdfCacheFileHeader));
+
+    ifstream ifs(IDF_DICT_PATH);
+    string line;
+    vector<string> buf;
+
+    while (getline(ifs, line)) {
+        // Skip comments and empty lines.
+        if (limonp::StartsWith(line, "#") || line.empty()) {
+            continue;
+        }
+
+        limonp::Split(line, buf, " ");
+
+        // Every entry is "<word> <idf>"; anything else is ignored.
+        if (buf.size() != 2) {
+            continue;
+        }
+
+        this->Update(buf[0].c_str(), buf[0].size(), elements_num);
+        offset += sizeof(double);
+        elements_num++;
+        tmp = atof(buf[1].c_str());
+        write_bytes += write(fd, &tmp, sizeof(double));
+        idf_sum += tmp;
+    }
+
+    // An empty or unreadable dictionary keeps the average at 0 instead of
+    // dividing by zero.
+    if (elements_num > 0) {
+        idf_average = idf_sum / elements_num;
+    }
+
+    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
+
+    // Patch the header fields that follow the md5.
+    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
+    write(fd, &elements_num, sizeof(int));
+    write(fd, &offset, sizeof(int));
+    data_trie_size = this->GetDataTrieSize();
+    write(fd, &data_trie_size, sizeof(int));
+    write(fd, &idf_average, sizeof(double));
+
+    close(fd);
+    assert((size_t)write_bytes == sizeof(IdfCacheFileHeader) + offset + this->GetDataTrieTotalSize());
+
+    tryRename(tmp_filepath, dat_cache_file);
+    umask(old_mask);
+}
+
+// Returns the value stored for `key`, or -1 when the exact-match search does
+// not find the key.
+double IdfTrie::Find(const string &key) const
+{
+    const int index = this->ExactMatchSearch(key.c_str(), key.size());
+
+    if (index >= 0) {
+        return this->GetElementPtr()[index];
+    }
+
+    return -1;
+}
+
+// Returns the average IDF value recorded in the cache file header.
+double IdfTrie::GetIdfAverage() const
+{
+    const double average = this->GetCacheFileHeaderPtr()->idf_average;
+    return average;
+}
+
--- a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.h
+++ b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.h
@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef IdfTrie_H
+#define IdfTrie_H
+
+#include "storage-base.hpp"
+
+// Location of the IDF dictionary: the DICT_INSTALL_PATH string-literal macro
+// (defined outside this header) with "/idf.utf8" appended.
+const char * const  IDF_DICT_PATH = DICT_INSTALL_PATH"/idf.utf8";
+
+// Cache file header for the IDF trie: the common header fields plus the
+// average IDF of all entries.
+struct IdfCacheFileHeader : CacheFileHeaderBase
+{
+    double idf_average = 0.0;
+};
+
+// Trie-backed lookup table mapping words to their IDF value.
+class IdfTrie : public StorageBase<double, false, IdfCacheFileHeader>
+{
+public:
+    // Builds the table from several source files.
+    IdfTrie(const vector<string> file_paths, string dat_cache_path);
+    // Builds the table from a single source file.
+    IdfTrie(string file_path, string dat_cache_path);
+
+    // Regenerates the cache file `dat_cache_file`, stamping it with `md5`.
+    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
+
+    // Value stored for `key`, or -1 when the key is unknown.
+    double Find(const string &key) const;
+
+    // Average IDF recorded in the cache header.
+    double GetIdfAverage() const;
+};
+
+#endif // IdfTrie_H
--- a/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
@ -0,0 +1,70 @@
+/************************************
+ * file enc : ascii
+ * author   : wuyanyi09@gmail.com
+ ************************************/
+
+#ifndef LIMONP_ARGV_FUNCTS_H
+#define LIMONP_ARGV_FUNCTS_H
+
+#include <set>
+#include <sstream>
+#include "StringUtil.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+// Splits a command line into positional arguments, "-key value" options and
+// bare "-flag" switches.
+class ArgvContext {
+ public :
+  // An argument starting with "-" becomes an option when the next argument
+  // exists and does not start with "-" (that argument is consumed as its
+  // value); otherwise it becomes a switch. Everything else is positional.
+  ArgvContext(int argc, const char* const * argv) {
+    int i = 0;
+    while (i < argc) {
+      const char* const current = argv[i];
+      if (!StartsWith(current, "-")) {
+        args_.push_back(current);
+        ++i;
+        continue;
+      }
+      const bool has_value = (i + 1 < argc) && !StartsWith(argv[i + 1], "-");
+      if (has_value) {
+        mpss_[current] = argv[i + 1];
+        i += 2;
+      } else {
+        sset_.insert(current);
+        ++i;
+      }
+    }
+  }
+  ~ArgvContext() {
+  }
+
+  friend ostream& operator << (ostream& os, const ArgvContext& args);
+
+  // i-th positional argument, or "" when out of range.
+  string operator [](size_t i) const {
+    return i < args_.size() ? args_[i] : string("");
+  }
+
+  // Value of option `key`, or "" when the option was not given.
+  string operator [](const string& key) const {
+    const map<string, string>::const_iterator found = mpss_.find(key);
+    if (found == mpss_.end()) {
+      return "";
+    }
+    return found->second;
+  }
+
+  // True when `key` was given either as an option or as a switch.
+  bool HasKey(const string& key) const {
+    const bool is_option = mpss_.find(key) != mpss_.end();
+    return is_option || sset_.find(key) != sset_.end();
+  }
+
+ private:
+  vector<string> args_;
+  map<string, string> mpss_;
+  set<string> sset_;
+}; // class ArgvContext
+
+// Streams the positional arguments, the options and the switches, in that order.
+inline ostream& operator << (ostream& os, const ArgvContext& args) {
+  os << args.args_;
+  os << args.mpss_;
+  os << args.sset_;
+  return os;
+}
+
+} // namespace limonp
+
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
@ -0,0 +1,49 @@
+#ifndef LIMONP_BLOCKINGQUEUE_HPP
+#define LIMONP_BLOCKINGQUEUE_HPP
+
+#include <queue>
+#include "Condition.hpp"
+
+namespace limonp {
+// Unbounded FIFO queue guarded by a mutex; Pop() blocks while it is empty.
+template<class T>
+class BlockingQueue: NonCopyable {
+ public:
+  BlockingQueue()
+    : mutex_(), notEmpty_(mutex_), queue_() {
+  }
+
+  // Appends a copy of `x` and signals the not-empty condition.
+  void Push(const T& x) {
+    MutexLockGuard guard(mutex_);
+    queue_.push(x);
+    notEmpty_.Notify(); // Wait morphing saves us
+  }
+
+  // Removes and returns the oldest element, waiting until one is available.
+  T Pop() {
+    MutexLockGuard guard(mutex_);
+    // Re-check the predicate after every wake-up: wake-ups may be spurious.
+    for (; queue_.empty();) {
+      notEmpty_.Wait();
+    }
+    assert(!queue_.empty());
+    T oldest(queue_.front());
+    queue_.pop();
+    return oldest;
+  }
+
+  // Number of queued elements at the time of the call.
+  size_t Size() const {
+    MutexLockGuard guard(mutex_);
+    const size_t count = queue_.size();
+    return count;
+  }
+  bool Empty() const {
+    return 0 == Size();
+  }
+
+ private:
+  mutable MutexLock mutex_;
+  Condition         notEmpty_;
+  std::queue<T>     queue_;
+}; // class BlockingQueue
+
+} // namespace limonp
+
+#endif // LIMONP_BLOCKINGQUEUE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
@ -0,0 +1,67 @@
+#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
+#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
+
+#include "BoundedQueue.hpp"
+
+namespace limonp {
+
+// Fixed-capacity FIFO queue guarded by a mutex: Push() blocks while the queue
+// is full and Pop() blocks while it is empty.
+template<typename T>
+class BoundedBlockingQueue : NonCopyable {
+ public:
+  explicit BoundedBlockingQueue(size_t maxSize)
+    : mutex_(),
+      notEmpty_(mutex_),
+      notFull_(mutex_),
+      queue_(maxSize) {
+  }
+
+  // Appends a copy of `x`, waiting for a free slot first.
+  void Push(const T& x) {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Full()) {
+      notFull_.Wait();
+    }
+    assert(!queue_.Full());
+    queue_.Push(x);
+    notEmpty_.Notify();
+  }
+
+  // Removes and returns the oldest element, waiting until one is available.
+  T Pop() {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Empty()) {
+      notEmpty_.Wait();
+    }
+    assert(!queue_.Empty());
+    T res = queue_.Pop();
+    notFull_.Notify();
+    return res;
+  }
+
+  bool Empty() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Empty();
+  }
+
+  bool Full() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Full();
+  }
+
+  // Number of queued elements at the time of the call.
+  // BoundedQueue spells its accessors Size()/Capacity(); the lower-case names
+  // used here before did not exist on it.
+  size_t size() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Size();
+  }
+
+  // Maximum number of elements; fixed at construction, so no lock is taken.
+  size_t capacity() const {
+    return queue_.Capacity();
+  }
+
+ private:
+  mutable MutexLock          mutex_;
+  Condition                  notEmpty_;
+  Condition                  notFull_;
+  BoundedQueue<T>  queue_;
+}; // class BoundedBlockingQueue
+
+} // namespace limonp
+
+#endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
@ -0,0 +1,65 @@
+#ifndef LIMONP_BOUNDED_QUEUE_HPP
+#define LIMONP_BOUNDED_QUEUE_HPP
+
+#include <vector>
+#include <fstream>
+#include <cassert>
+
+namespace limonp {
+using namespace std;
+// Fixed-capacity FIFO queue stored in a circular buffer. Not thread-safe on
+// its own; Push() on a full queue and Pop() on an empty one are programming
+// errors guarded by assert().
+template<class T>
+class BoundedQueue {
+ public:
+  explicit BoundedQueue(size_t capacity)
+    : head_(0), tail_(0), size_(0), capacity_(capacity), circular_buffer_(capacity) {
+    assert(capacity_);
+  }
+  ~BoundedQueue() {
+  }
+
+  // Forgets all queued elements; the buffer slots themselves are not reset.
+  void Clear() {
+    head_ = tail_ = size_ = 0;
+  }
+  bool Empty() const {
+    return size_ == 0;
+  }
+  bool Full() const {
+    return size_ == capacity_;
+  }
+  size_t Size() const {
+    return size_;
+  }
+  size_t Capacity() const {
+    return capacity_;
+  }
+
+  // Stores a copy of `t` at the tail.
+  void Push(const T& t) {
+    assert(!Full());
+    circular_buffer_[tail_] = t;
+    tail_ = Advance(tail_);
+    ++size_;
+  }
+
+  // Removes the element at the head and returns a copy of it.
+  T Pop() {
+    assert(!Empty());
+    const size_t slot = head_;
+    head_ = Advance(head_);
+    --size_;
+    return circular_buffer_[slot];
+  }
+
+ private:
+  // Index following `pos`, wrapping around at the end of the buffer.
+  size_t Advance(size_t pos) const {
+    return (pos + 1) % capacity_;
+  }
+
+  size_t head_;
+  size_t tail_;
+  size_t size_;
+  const size_t capacity_;
+  std::vector<T> circular_buffer_;
+
+}; // class BoundedQueue
+} // namespace limonp
+
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/Closure.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Closure.hpp
@ -0,0 +1,206 @@
+#ifndef LIMONP_CLOSURE_HPP
+#define LIMONP_CLOSURE_HPP
+
+namespace limonp {
+
+// Type-erased unit of work: a bound call that Run() executes.
+class ClosureInterface {
+ public:
+  virtual ~ClosureInterface() {}
+
+  // Performs the bound call.
+  virtual void Run() = 0;
+};
+
+// Closure over a free function taking no arguments.
+// The member is set in the initializer list, so Funct no longer has to be
+// default-constructible and assignable.
+template <class Funct>
+class Closure0: public ClosureInterface {
+ public:
+  Closure0(Funct fun)
+    : fun_(fun) {
+  }
+  virtual ~Closure0() {
+  }
+  // Calls the stored function.
+  virtual void Run() {
+    (*fun_)();
+  }
+ private:
+  Funct fun_;
+};
+
+// Closure over a free function and one bound argument.
+// Members are set in the initializer list, so the argument is copied once
+// and does not have to be default-constructible or assignable.
+template <class Funct, class Arg1>
+class Closure1: public ClosureInterface {
+ public:
+  Closure1(Funct fun, Arg1 arg1)
+    : fun_(fun), arg1_(arg1) {
+  }
+  virtual ~Closure1() {
+  }
+  // Calls the stored function with the bound argument.
+  virtual void Run() {
+    (*fun_)(arg1_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+};
+
+// Closure over a free function and two bound arguments.
+// Members are set in the initializer list, so the arguments are copied once
+// and do not have to be default-constructible or assignable.
+template <class Funct, class Arg1, class Arg2>
+class Closure2: public ClosureInterface {
+ public:
+  Closure2(Funct fun, Arg1 arg1, Arg2 arg2)
+    : fun_(fun), arg1_(arg1), arg2_(arg2) {
+  }
+  virtual ~Closure2() {
+  }
+  // Calls the stored function with the bound arguments.
+  virtual void Run() {
+    (*fun_)(arg1_, arg2_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+};
+
+// Closure over a free function and three bound arguments.
+// Members are set in the initializer list, so the arguments are copied once
+// and do not have to be default-constructible or assignable.
+template <class Funct, class Arg1, class Arg2, class Arg3>
+class Closure3: public ClosureInterface {
+ public:
+  Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
+    : fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
+  }
+  virtual ~Closure3() {
+  }
+  // Calls the stored function with the bound arguments.
+  virtual void Run() {
+    (*fun_)(arg1_, arg2_, arg3_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+};
+
+// Closure over a member function taking no arguments. The object is referred
+// to by pointer and must outlive the closure.
+// Members are set in the initializer list rather than assigned afterwards.
+template <class Obj, class Funct>
+class ObjClosure0: public ClosureInterface {
+ public:
+  ObjClosure0(Obj* p, Funct fun)
+    : p_(p), fun_(fun) {
+  }
+  virtual ~ObjClosure0() {
+  }
+  // Calls the stored member function on the stored object.
+  virtual void Run() {
+    (p_->*fun_)();
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+};
+
+// Closure over a member function and one bound argument. The object is
+// referred to by pointer and must outlive the closure.
+// Members are set in the initializer list, so the argument is copied once
+// and does not have to be default-constructible or assignable.
+template <class Obj, class Funct, class Arg1>
+class ObjClosure1: public ClosureInterface {
+ public:
+  ObjClosure1(Obj* p, Funct fun, Arg1 arg1)
+    : p_(p), fun_(fun), arg1_(arg1) {
+  }
+  virtual ~ObjClosure1() {
+  }
+  // Calls the stored member function with the bound argument.
+  virtual void Run() {
+    (p_->*fun_)(arg1_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+};
+
+// Closure over a member function and two bound arguments. The object is
+// referred to by pointer and must outlive the closure.
+// Members are set in the initializer list, so the arguments are copied once
+// and do not have to be default-constructible or assignable.
+template <class Obj, class Funct, class Arg1, class Arg2>
+class ObjClosure2: public ClosureInterface {
+ public:
+  ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2)
+    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2) {
+  }
+  virtual ~ObjClosure2() {
+  }
+  // Calls the stored member function with the bound arguments.
+  virtual void Run() {
+    (p_->*fun_)(arg1_, arg2_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+};
+// Closure over a member function and three bound arguments. The object is
+// referred to by pointer and must outlive the closure.
+// Members are set in the initializer list, so the arguments are copied once
+// and do not have to be default-constructible or assignable.
+template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
+class ObjClosure3: public ClosureInterface {
+ public:
+  ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
+    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
+  }
+  virtual ~ObjClosure3() {
+  }
+  // Calls the stored member function with the bound arguments.
+  virtual void Run() {
+    (p_->*fun_)(arg1_, arg2_, arg3_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+};
+
+// NewClosure overload set: each overload heap-allocates the closure matching
+// its argument list and returns it through the ClosureInterface base. The
+// result comes from `new`, so the caller is responsible for deleting it.
+
+// Free function, no arguments.
+template<class R>
+ClosureInterface* NewClosure(R (*fun)()) {
+  typedef R (*Fn)();
+  return new Closure0<Fn>(fun);
+}
+
+// Free function, one bound argument.
+template<class R, class Arg1>
+ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
+  typedef R (*Fn)(Arg1);
+  return new Closure1<Fn, Arg1>(fun, arg1);
+}
+
+// Free function, two bound arguments.
+template<class R, class Arg1, class Arg2>
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
+  typedef R (*Fn)(Arg1, Arg2);
+  return new Closure2<Fn, Arg1, Arg2>(fun, arg1, arg2);
+}
+
+// Free function, three bound arguments.
+template<class R, class Arg1, class Arg2, class Arg3>
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
+  typedef R (*Fn)(Arg1, Arg2, Arg3);
+  return new Closure3<Fn, Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
+}
+
+// Member function, no arguments.
+template<class R, class Obj>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
+  typedef R (Obj::* Method)();
+  return new ObjClosure0<Obj, Method>(obj, fun);
+}
+
+// Member function, one bound argument.
+template<class R, class Obj, class Arg1>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
+  typedef R (Obj::* Method)(Arg1);
+  return new ObjClosure1<Obj, Method, Arg1>(obj, fun, arg1);
+}
+
+// Member function, two bound arguments.
+template<class R, class Obj, class Arg1, class Arg2>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
+  typedef R (Obj::* Method)(Arg1, Arg2);
+  return new ObjClosure2<Obj, Method, Arg1, Arg2>(obj, fun, arg1, arg2);
+}
+
+// Member function, three bound arguments.
+template<class R, class Obj, class Arg1, class Arg2, class Arg3>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
+  typedef R (Obj::* Method)(Arg1, Arg2, Arg3);
+  return new ObjClosure3<Obj, Method, Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
+}
+
+} // namespace limonp
+
+#endif // LIMONP_CLOSURE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Colors.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Colors.hpp
@ -0,0 +1,31 @@
+#ifndef LIMONP_COLOR_PRINT_HPP
+#define LIMONP_COLOR_PRINT_HPP
+
+#include <string>
+#include <stdarg.h>
+
+namespace limonp {
+
+using std::string;
+
+// Colour codes; ColorPrintln() inserts the numeric value into an escape
+// sequence.
+enum Color {
+  BLACK  = 30,
+  RED    = 31,
+  GREEN  = 32,
+  YELLOW = 33,
+  BLUE   = 34,
+  PURPLE = 35
+}; // enum Color
+
+// printf-style output of one line in the given colour: emits the colour
+// escape sequence, the formatted text, then the reset sequence and a newline.
+// NOTE(review): printf/vprintf are used but <stdio.h> is not included by this
+// header — it relies on the includer providing it.
+static void ColorPrintln(enum Color color, const char * fmt, ...) {
+  printf("\033[0;%dm", color);
+
+  va_list args;
+  va_start(args, fmt);
+  vprintf(fmt, args);
+  va_end(args);
+
+  // The trailing \n matters: without it the following lines may keep the colour.
+  printf("\033[0m\n");
+}
+
+} // namespace limonp
+
+#endif // LIMONP_COLOR_PRINT_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Condition.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Condition.hpp
@ -0,0 +1,38 @@
+#ifndef LIMONP_CONDITION_HPP
+#define LIMONP_CONDITION_HPP
+
+#include "MutexLock.hpp"
+
+namespace limonp {
+
+// Thin wrapper around a pthread condition variable that is bound to one
+// MutexLock for its whole lifetime. Every pthread call is wrapped in XCHECK,
+// so a non-zero return code is reported through the FATAL logging path.
+// Copying is disabled by inheriting from NonCopyable.
+class Condition : NonCopyable {
+ public:
+  // Initialises the condition variable with default attributes and remembers
+  // `mutex` by reference (the mutex must outlive this object).
+  explicit Condition(MutexLock& mutex)
+    : mutex_(mutex) {
+    XCHECK(!pthread_cond_init(&pcond_, NULL));
+  }
+
+  // Destroys the underlying pthread condition variable.
+  ~Condition() {
+    XCHECK(!pthread_cond_destroy(&pcond_));
+  }
+
+  // Waits on the condition using the bound mutex.
+  // NOTE(review): pthread_cond_wait expects the mutex to be locked by the
+  // caller - confirm that call sites hold a MutexLockGuard.
+  void Wait() {
+    XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
+  }
+
+  // Signals the condition (pthread_cond_signal).
+  void Notify() {
+    XCHECK(!pthread_cond_signal(&pcond_));
+  }
+
+  // Broadcasts the condition (pthread_cond_broadcast).
+  void NotifyAll() {
+    XCHECK(!pthread_cond_broadcast(&pcond_));
+  }
+
+ private:
+  MutexLock& mutex_;      // mutex handed to pthread_cond_wait
+  pthread_cond_t pcond_;  // the wrapped condition variable
+}; // class Condition
+
+} // namespace limonp
+
+#endif // LIMONP_CONDITION_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Config.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Config.hpp
@ -0,0 +1,103 @@
+/************************************
+ * file enc : utf8
+ * author   : wuyanyi09@gmail.com
+ ************************************/
+#ifndef LIMONP_CONFIG_H
+#define LIMONP_CONFIG_H
+
+#include <map>
+#include <fstream>
+#include <iostream>
+#include <assert.h>
+#include "StringUtil.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+// Reader for simple "key = value" configuration files.
+// The file is parsed once in the constructor; afterwards the object is a
+// read-only string-to-string map with a few typed accessors.
+class Config {
+ public:
+  // Loads `filePath` immediately (see LoadFile for the accepted syntax).
+  explicit Config(const string& filePath) {
+    LoadFile(filePath);
+  }
+
+  // True when at least one key/value pair was loaded.
+  // const-qualified so that const Config objects can be tested as well.
+  operator bool () const {
+    return !map_.empty();
+  }
+
+  // Returns the value stored for `key`, or `defaultvalue` if the key is absent.
+  string Get(const string& key, const string& defaultvalue) const {
+    map<string, string>::const_iterator it = map_.find(key);
+    if(map_.end() != it) {
+      return it->second;
+    }
+    return defaultvalue;
+  }
+  // Integer flavour of Get: an absent key or an empty value yields
+  // `defaultvalue`; anything else is converted with atoi().
+  int Get(const string& key, int defaultvalue) const {
+    string str = Get(key, "");
+    if("" == str) {
+      return defaultvalue;
+    }
+    return atoi(str.c_str());
+  }
+  // Returns a pointer to the stored value, or NULL when `key` is NULL or
+  // unknown. The pointer refers to storage owned by this object.
+  const char* operator [] (const char* key) const {
+    if(NULL == key) {
+      return NULL;
+    }
+    map<string, string>::const_iterator it = map_.find(key);
+    if(map_.end() != it) {
+      return it->second.c_str();
+    }
+    return NULL;
+  }
+
+  // Returns the textual form produced by streaming this object.
+  string GetConfigInfo() const {
+    string res;
+    res << *this;
+    return res;
+  }
+
+ private:
+  // Parses `filePath` line by line.
+  //  - blank lines and lines starting with '#' are skipped;
+  //  - every other line must split on "=" into exactly two parts, which are
+  //    trimmed and stored as key and value;
+  //  - malformed lines and duplicate keys are reported on stderr and trip an
+  //    assert (in builds without asserts they are skipped).
+  void LoadFile(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    assert(ifs);
+    string line;
+    vector<string> vecBuf;
+    while(getline(ifs, line)) {
+      Trim(line);
+      if(line.empty() || StartsWith(line, "#")) {
+        continue;
+      }
+      vecBuf.clear();
+      Split(line, vecBuf, "=");
+      if(2 != vecBuf.size()) {
+        fprintf(stderr, "line[%s] illegal.\n", line.c_str());
+        assert(false);
+        continue;
+      }
+      string& key = vecBuf[0];
+      string& value = vecBuf[1];
+      Trim(key);
+      Trim(value);
+      if(!map_.insert(make_pair(key, value)).second) {
+        fprintf(stderr, "key[%s] already exits.\n", key.c_str());
+        assert(false);
+        continue;
+      }
+    }
+    ifs.close();
+  }
+
+  friend ostream& operator << (ostream& os, const Config& config);
+
+  map<string, string> map_;  // parsed key -> value pairs
+}; // class Config
+
+// Streams the key/value map held by `config` to `os` and returns `os`.
+inline ostream& operator << (ostream& os, const Config& config) {
+  os << config.map_;
+  return os;
+}
+
+} // namespace limonp
+
+#endif // LIMONP_CONFIG_H
--- a/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
@ -0,0 +1,74 @@
+#ifndef LIMONP_FILELOCK_HPP
+#define LIMONP_FILELOCK_HPP
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string>
+#include <string.h>
+#include <assert.h>
+
+namespace limonp {
+
+using std::string;
+
+// Whole-file lock built on open(2) and fcntl(2) record locks.
+// Nothing is thrown: a failing call clears the Ok() flag and stores the
+// strerror() text, which can be read back through Error().
+class FileLock {
+ public:
+  FileLock() : fd_(-1), ok_(true) {
+  }
+  // Closes the descriptor if one is still open.
+  ~FileLock() {
+    // 0 is a valid descriptor; only a negative value means "not open".
+    if(fd_ >= 0) {
+      Close();
+    }
+  }
+  // Opens `fname` read/write, creating it with mode 0644 when missing.
+  // Must not be called while a file is already open.
+  void Open(const string& fname) {
+    assert(fd_ == -1);
+    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if(fd_ < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  // Closes the descriptor; a no-op when nothing is open. fd_ is reset so the
+  // same descriptor is never closed twice (explicit Close() followed by the
+  // destructor) and so that Open() can be called again afterwards.
+  void Close() {
+    if(fd_ >= 0) {
+      ::close(fd_);
+      fd_ = -1;
+    }
+  }
+  // Requests a write lock on the whole file (fcntl F_SETLK).
+  void Lock() {
+    if(LockOrUnlock(fd_, true) < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  // Releases the lock on the whole file.
+  void UnLock() {
+    if(LockOrUnlock(fd_, false) < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  // False once any operation has failed; the flag is never reset.
+  bool Ok() const {
+    return ok_;
+  }
+  // Text of the most recent failure (empty if none happened).
+  string Error() const {
+    return err_;
+  }
+ private:
+  // Applies (lock == true) or removes (lock == false) a write lock covering
+  // the entire file. Returns the fcntl() result: negative on failure, with
+  // errno describing the cause.
+  static int LockOrUnlock(int fd, bool lock) {
+    errno = 0;
+    struct flock f;
+    memset(&f, 0, sizeof(f));
+    f.l_type = (lock ? F_WRLCK : F_UNLCK);
+    f.l_whence = SEEK_SET;
+    f.l_start = 0;
+    f.l_len = 0;        // Lock/unlock entire file
+    return fcntl(fd, F_SETLK, &f);
+  }
+
+  int fd_;      // open descriptor, or -1 when no file is open
+  bool ok_;     // cleared by the first failing operation
+  string err_;  // strerror() text of the last failure
+}; // class FileLock
+
+}// namespace limonp
+
+#endif // LIMONP_FILELOCK_HPP
--- a/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
@ -0,0 +1,7 @@
+#ifndef LIMONP_FORCE_PUBLIC_H
+#define LIMONP_FORCE_PUBLIC_H
+
+// Redefines the access-specifier keywords so that every `private` and
+// `protected` token seen after this header is included is compiled as
+// `public`.
+// NOTE(review): presumably meant for white-box tests only - confirm that no
+// production translation unit includes this header.
+#define private public
+#define protected public
+
+#endif // LIMONP_FORCE_PUBLIC_H
--- a/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
@ -0,0 +1,142 @@
+#ifndef LIMONP_LOCAL_VECTOR_HPP
+#define LIMONP_LOCAL_VECTOR_HPP
+
+#include <iostream>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+namespace limonp {
+using namespace std;
+/*
+ * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
+ * LocalVector<T> is simple and not well-tested.
+ */
+// Number of elements stored inline before LocalVector moves to the heap.
+const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
+
+// Minimal vector with an inline buffer of LOCAL_VECTOR_BUFFER_SIZE elements.
+// Elements are moved around with memcpy and heap storage comes from
+// malloc/free, so T must be trivially copyable (no constructors or
+// destructors are ever run for elements beyond the inline buffer).
+template <class T>
+class LocalVector {
+ public:
+  typedef const T* const_iterator ;
+  typedef T value_type;
+  typedef size_t size_type;
+ private:
+  T buffer_[LOCAL_VECTOR_BUFFER_SIZE];  // inline storage
+  T * ptr_;                             // buffer_ or a malloc'ed block
+  size_t size_;                         // number of live elements
+  size_t capacity_;                     // element capacity of ptr_
+ public:
+  // Creates an empty vector that uses the inline buffer.
+  LocalVector() {
+    init_();
+  }
+  // Copy constructor; delegates to operator=.
+  LocalVector(const LocalVector<T>& vec) {
+    init_();
+    *this = vec;
+  }
+  // Copies the range [begin, end).
+  LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
+    init_();
+    while(begin != end) {
+      push_back(*begin++);
+    }
+  }
+  // Creates a vector holding `size` copies of `t`.
+  LocalVector(size_t size, const T& t) { // TODO: make it faster
+    init_();
+    while(size--) {
+      push_back(t);
+    }
+  }
+  // Frees the heap block, if one is in use.
+  ~LocalVector() {
+    if(ptr_ != buffer_) {
+      free(ptr_);
+    }
+  }
+ public:
+  // Replaces the contents with a copy of `vec` (self-assignment is a no-op).
+  // The copy takes over vec's capacity and stays inline when vec is inline.
+  LocalVector<T>& operator = (const LocalVector<T>& vec) {
+    if(this == &vec) {
+      return *this;
+    }
+    clear();
+    size_ = vec.size();
+    capacity_ = vec.capacity();
+    if(vec.buffer_ == vec.ptr_) {
+      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
+      ptr_ = buffer_;
+    } else {
+      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
+      assert(ptr_);
+      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
+    }
+    return *this;
+  }
+ private:
+  // Resets the bookkeeping to "empty, using the inline buffer".
+  // Does not free anything; callers release heap storage first.
+  void init_() {
+    ptr_ = buffer_;
+    size_ = 0;
+    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
+  }
+ public:
+  // Unchecked element access.
+  T& operator [] (size_t i) {
+    return ptr_[i];
+  }
+  const T& operator [] (size_t i) const {
+    return ptr_[i];
+  }
+  // Appends `t`, doubling the capacity when the storage is full.
+  void push_back(const T& t) {
+    // Take the copy first: `t` may refer to an element of this vector, and
+    // reserve() frees the old heap block before the element is stored.
+    const T value = t;
+    if(size_ == capacity_) {
+      assert(capacity_);
+      reserve(capacity_ * 2);
+    }
+    ptr_[size_ ++ ] = value;
+  }
+  // Grows the capacity to at least `size` elements; never shrinks.
+  void reserve(size_t size) {
+    if(size <= capacity_) {
+      return;
+    }
+    T * next =  (T*)malloc(sizeof(T) * size);
+    assert(next);
+    T * old = ptr_;
+    ptr_ = next;
+    // Only the live elements need to be carried over.
+    memcpy(ptr_, old, sizeof(T) * size_);
+    capacity_ = size;
+    if(old != buffer_) {
+      free(old);
+    }
+  }
+  bool empty() const {
+    return 0 == size();
+  }
+  size_t size() const {
+    return size_;
+  }
+  size_t capacity() const {
+    return capacity_;
+  }
+  const_iterator begin() const {
+    return ptr_;
+  }
+  const_iterator end() const {
+    return ptr_ + size_;
+  }
+  // Drops all elements, releases heap storage and returns to the inline
+  // buffer.
+  void clear() {
+    if(ptr_ != buffer_) {
+      free(ptr_);
+    }
+    init_();
+  }
+};
+
+// Prints a LocalVector as ["a", "b", ...]; an empty vector prints as [].
+// Every element is wrapped in double quotes regardless of its type.
+template <class T>
+ostream & operator << (ostream& os, const LocalVector<T>& vec) {
+  if(vec.empty()) {
+    return os << "[]";
+  }
+  typename LocalVector<T>::const_iterator it = vec.begin();
+  os << "[\"" << *it;
+  for(++it; it != vec.end(); ++it) {
+    os << "\", \"" << *it;
+  }
+  return os << "\"]";
+}
+
+}
+
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/Logging.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Logging.hpp
@ -0,0 +1,77 @@
+#ifndef LIMONP_LOGGING_HPP
+#define LIMONP_LOGGING_HPP
+
+#include <sstream>
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+#include <ctime>
+
+#ifdef XLOG
+#error "XLOG has been defined already"
+#endif // XLOG
+#ifdef XCHECK
+#error "XCHECK has been defined already"
+#endif // XCHECK
+
+#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
+#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. "
+
+namespace limonp {
+
+// Log levels, ordered by severity. The numeric value doubles as the index
+// into LOG_LEVEL_ARRAY.
+enum {
+  LL_DEBUG = 0,
+  LL_INFO = 1,
+  LL_WARNING = 2,
+  LL_ERROR = 3,
+  LL_FATAL = 4,
+}; // enum
+
+// Printable names of the levels above, indexed by level value.
+static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
+
+// One Logger object represents one log statement: the constructor writes the
+// "<local time> <file>:<line> <LEVEL> " prefix into an internal buffer, the
+// caller appends text through Stream(), and the destructor prints the buffer
+// to std::cerr. A LL_FATAL statement calls abort() after printing.
+// When LOGGING_LEVEL is defined, statements below that level print nothing.
+class Logger {
+ public:
+  // level    : one of the LL_* values (must be a valid LOG_LEVEL_ARRAY index)
+  // filename : source file reported in the prefix
+  // lineno   : source line reported in the prefix
+  Logger(size_t level, const char* filename, int lineno)
+   : level_(level) {
+#ifdef LOGGING_LEVEL
+     if (level_ < LOGGING_LEVEL) {
+       return;
+     }
+#endif
+    // Strictly less: level_ is used as an index into LOG_LEVEL_ARRAY below.
+    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
+    char buf[32];
+    time_t now;
+    time(&now);
+    struct tm result;
+    localtime_r(&now, &result);
+    strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &result);
+    stream_ << buf
+      << " " << filename
+      << ":" << lineno
+      << " " << LOG_LEVEL_ARRAY[level_]
+      << " ";
+  }
+  // Emits the buffered message, then aborts for LL_FATAL.
+  ~Logger() {
+#ifdef LOGGING_LEVEL
+     if (level_ < LOGGING_LEVEL) {
+       return;
+     }
+#endif
+    std::cerr << stream_.str() << std::endl;
+    if (level_ == LL_FATAL) {
+      abort();
+    }
+  }
+
+  // Stream that collects the message body.
+  std::ostream& Stream() {
+    return stream_;
+  }
+
+ private:
+  std::ostringstream stream_;  // prefix + message, flushed in the destructor
+  size_t level_;               // LL_* value of this statement
+}; // class Logger
+
+} // namespace limonp
+
+#endif // LIMONP_LOGGING_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Md5.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Md5.hpp
@ -0,0 +1,415 @@
+/****************************************************************************
+**Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991
+**              2020, KylinSoft Co., Ltd.
+**All rights reserved.
+**
+**License to copy and use this software is granted provided that it
+**is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+**Algorithm" in all material mentioning or referencing this software
+**or this function.
+**
+**License is also granted to make and use derivative works provided
+**that such works are identified as "derived from the RSA Data
+**Security, Inc. MD5 Message-Digest Algorithm" in all material
+**mentioning or referencing the derived work.
+**
+**RSA Data Security, Inc. makes no representations concerning either
+**the merchantability of this software or the suitability of this
+**software for any particular purpose. It is provided "as is"
+**without express or implied warranty of any kind.
+**
+**These notices must be retained in any copies of any part of this
+**documentation and/or software.
+**
+**
+**
+**The original md5 implementation avoids external libraries.
+**This version has dependency on stdio.h for file input and
+**string.h for memcpy.
+**
+****************************************************************************/
+
+#ifndef __MD5_H__
+#define __MD5_H__
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+
+namespace limonp {
+
+//#pragma region MD5 defines
+// Constants for MD5Transform routine.
+// Sxy is the left-rotation amount passed as `s` to the FF/GG/HH/II macros:
+// x is the round (1-4), y the position within the repeating group of four.
+// Round 1 (FF)
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+// Round 2 (GG)
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+// Round 3 (HH)
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+// Round 4 (II)
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+
+// F, G, H and I are basic MD5 functions.
+// Each combines three 32-bit words bitwise. Every macro argument is fully
+// parenthesized (including under ~) so that passing an expression such as
+// `a + b` yields the same result as passing its value.
+#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~(z))))
+
+// ROTATE_LEFT rotates x left n bits.
+// The `32-(n)` term assumes x is a 32-bit unsigned value (UINT4 here).
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+// Rotation is separate from addition to prevent recomputation.
+// Each macro updates `a` in place:
+//   a = b + ROTATE_LEFT(a + f(b, c, d) + x + ac, s)
+// where f is F, G, H or I, x is a message word, s a shift amount and ac an
+// additive constant.
+#define FF(a, b, c, d, x, s, ac) { \
+  (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+  (a) = ROTATE_LEFT ((a), (s)); \
+  (a) += (b); \
+  }
+#define GG(a, b, c, d, x, s, ac) { \
+  (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+  (a) = ROTATE_LEFT ((a), (s)); \
+  (a) += (b); \
+  }
+#define HH(a, b, c, d, x, s, ac) { \
+  (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+  (a) = ROTATE_LEFT ((a), (s)); \
+  (a) += (b); \
+  }
+#define II(a, b, c, d, x, s, ac) { \
+  (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+  (a) = ROTATE_LEFT ((a), (s)); \
+  (a) += (b); \
+  }
+//#pragma endregion
+
+
+// BYTE is the element type of the raw digest and of in-memory input
+typedef unsigned char BYTE ;
+
+// POINTER defines a generic pointer type
+typedef unsigned char *POINTER;
+
+// UINT2 defines a two byte word
+// NOTE(review): relies on `unsigned short` being 16 bits on the target.
+typedef unsigned short int UINT2;
+
+// UINT4 defines a four byte word
+// NOTE(review): relies on `unsigned int` being 32 bits on the target; the
+// rotation and length arithmetic in this file depend on it.
+typedef unsigned int UINT4;
+
+// Padding block fed to Update() by MD5::Final(): a single 0x80 byte followed
+// by 63 zero bytes.
+static unsigned char PADDING[64] = {
+    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+// convenient object that wraps
+// the C-functions for use in C++ only
+// MD5 message-digest calculator.
+// Typical use is one of the digest* helpers (digestFile, digestMemory,
+// digestString), each of which runs Init(), Update() and Final() and returns
+// a pointer to the hex text kept in digestChars. The raw 16 bytes are
+// available in digestRaw. Init()/Update()/Final() may also be driven
+// directly for incremental hashing.
+class MD5 {
+private:
+    struct __context_t {
+        UINT4 state[4];                                   /* state (ABCD) */
+        UINT4 count[2];        /* number of bits, modulo 2^64 (lsb first) */
+        unsigned char buffer[64];                         /* input buffer */
+    } context ;
+
+    //#pragma region static helper functions
+    // The core of the MD5 algorithm is here.
+    // MD5 basic transformation. Transforms state based on block.
+    // `block` is decoded into sixteen 32-bit words, mixed into copies of the
+    // four state words over four rounds of sixteen steps, and the results are
+    // added back into `state`.
+    static void MD5Transform(UINT4 state[4], unsigned char block[64]) {
+        UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+        Decode(x, block, 64);
+
+        /* Round 1 */
+        FF(a, b, c, d, x[ 0], S11, 0xd76aa478);  /* 1 */
+        FF(d, a, b, c, x[ 1], S12, 0xe8c7b756);  /* 2 */
+        FF(c, d, a, b, x[ 2], S13, 0x242070db);  /* 3 */
+        FF(b, c, d, a, x[ 3], S14, 0xc1bdceee);  /* 4 */
+        FF(a, b, c, d, x[ 4], S11, 0xf57c0faf);  /* 5 */
+        FF(d, a, b, c, x[ 5], S12, 0x4787c62a);  /* 6 */
+        FF(c, d, a, b, x[ 6], S13, 0xa8304613);  /* 7 */
+        FF(b, c, d, a, x[ 7], S14, 0xfd469501);  /* 8 */
+        FF(a, b, c, d, x[ 8], S11, 0x698098d8);  /* 9 */
+        FF(d, a, b, c, x[ 9], S12, 0x8b44f7af);  /* 10 */
+        FF(c, d, a, b, x[10], S13, 0xffff5bb1);  /* 11 */
+        FF(b, c, d, a, x[11], S14, 0x895cd7be);  /* 12 */
+        FF(a, b, c, d, x[12], S11, 0x6b901122);  /* 13 */
+        FF(d, a, b, c, x[13], S12, 0xfd987193);  /* 14 */
+        FF(c, d, a, b, x[14], S13, 0xa679438e);  /* 15 */
+        FF(b, c, d, a, x[15], S14, 0x49b40821);  /* 16 */
+
+        /* Round 2 */
+        GG(a, b, c, d, x[ 1], S21, 0xf61e2562);  /* 17 */
+        GG(d, a, b, c, x[ 6], S22, 0xc040b340);  /* 18 */
+        GG(c, d, a, b, x[11], S23, 0x265e5a51);  /* 19 */
+        GG(b, c, d, a, x[ 0], S24, 0xe9b6c7aa);  /* 20 */
+        GG(a, b, c, d, x[ 5], S21, 0xd62f105d);  /* 21 */
+        GG(d, a, b, c, x[10], S22,  0x2441453);  /* 22 */
+        GG(c, d, a, b, x[15], S23, 0xd8a1e681);  /* 23 */
+        GG(b, c, d, a, x[ 4], S24, 0xe7d3fbc8);  /* 24 */
+        GG(a, b, c, d, x[ 9], S21, 0x21e1cde6);  /* 25 */
+        GG(d, a, b, c, x[14], S22, 0xc33707d6);  /* 26 */
+        GG(c, d, a, b, x[ 3], S23, 0xf4d50d87);  /* 27 */
+        GG(b, c, d, a, x[ 8], S24, 0x455a14ed);  /* 28 */
+        GG(a, b, c, d, x[13], S21, 0xa9e3e905);  /* 29 */
+        GG(d, a, b, c, x[ 2], S22, 0xfcefa3f8);  /* 30 */
+        GG(c, d, a, b, x[ 7], S23, 0x676f02d9);  /* 31 */
+        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a);  /* 32 */
+
+        /* Round 3 */
+        HH(a, b, c, d, x[ 5], S31, 0xfffa3942);  /* 33 */
+        HH(d, a, b, c, x[ 8], S32, 0x8771f681);  /* 34 */
+        HH(c, d, a, b, x[11], S33, 0x6d9d6122);  /* 35 */
+        HH(b, c, d, a, x[14], S34, 0xfde5380c);  /* 36 */
+        HH(a, b, c, d, x[ 1], S31, 0xa4beea44);  /* 37 */
+        HH(d, a, b, c, x[ 4], S32, 0x4bdecfa9);  /* 38 */
+        HH(c, d, a, b, x[ 7], S33, 0xf6bb4b60);  /* 39 */
+        HH(b, c, d, a, x[10], S34, 0xbebfbc70);  /* 40 */
+        HH(a, b, c, d, x[13], S31, 0x289b7ec6);  /* 41 */
+        HH(d, a, b, c, x[ 0], S32, 0xeaa127fa);  /* 42 */
+        HH(c, d, a, b, x[ 3], S33, 0xd4ef3085);  /* 43 */
+        HH(b, c, d, a, x[ 6], S34,  0x4881d05);  /* 44 */
+        HH(a, b, c, d, x[ 9], S31, 0xd9d4d039);  /* 45 */
+        HH(d, a, b, c, x[12], S32, 0xe6db99e5);  /* 46 */
+        HH(c, d, a, b, x[15], S33, 0x1fa27cf8);  /* 47 */
+        HH(b, c, d, a, x[ 2], S34, 0xc4ac5665);  /* 48 */
+
+        /* Round 4 */
+        II(a, b, c, d, x[ 0], S41, 0xf4292244);  /* 49 */
+        II(d, a, b, c, x[ 7], S42, 0x432aff97);  /* 50 */
+        II(c, d, a, b, x[14], S43, 0xab9423a7);  /* 51 */
+        II(b, c, d, a, x[ 5], S44, 0xfc93a039);  /* 52 */
+        II(a, b, c, d, x[12], S41, 0x655b59c3);  /* 53 */
+        II(d, a, b, c, x[ 3], S42, 0x8f0ccc92);  /* 54 */
+        II(c, d, a, b, x[10], S43, 0xffeff47d);  /* 55 */
+        II(b, c, d, a, x[ 1], S44, 0x85845dd1);  /* 56 */
+        II(a, b, c, d, x[ 8], S41, 0x6fa87e4f);  /* 57 */
+        II(d, a, b, c, x[15], S42, 0xfe2ce6e0);  /* 58 */
+        II(c, d, a, b, x[ 6], S43, 0xa3014314);  /* 59 */
+        II(b, c, d, a, x[13], S44, 0x4e0811a1);  /* 60 */
+        II(a, b, c, d, x[ 4], S41, 0xf7537e82);  /* 61 */
+        II(d, a, b, c, x[11], S42, 0xbd3af235);  /* 62 */
+        II(c, d, a, b, x[ 2], S43, 0x2ad7d2bb);  /* 63 */
+        II(b, c, d, a, x[ 9], S44, 0xeb86d391);  /* 64 */
+
+        state[0] += a;
+        state[1] += b;
+        state[2] += c;
+        state[3] += d;
+
+        // Zeroize sensitive information.
+        memset((POINTER)x, 0, sizeof(x));
+    }
+
+    // Encodes input (UINT4) into output (unsigned char). Assumes len is
+    // a multiple of 4.
+    // `len` counts output bytes; each word is written least significant
+    // byte first.
+    static void Encode(unsigned char *output, UINT4 *input, unsigned int len) {
+        unsigned int i, j;
+
+        for(i = 0, j = 0; j < len; i++, j += 4) {
+            output[j] = (unsigned char)(input[i] & 0xff);
+            output[j + 1] = (unsigned char)((input[i] >> 8) & 0xff);
+            output[j + 2] = (unsigned char)((input[i] >> 16) & 0xff);
+            output[j + 3] = (unsigned char)((input[i] >> 24) & 0xff);
+        }
+    }
+
+    // Decodes input (unsigned char) into output (UINT4). Assumes len is
+    // a multiple of 4.
+    // `len` counts input bytes; each word is assembled least significant
+    // byte first.
+    static void Decode(UINT4 *output, unsigned char *input, unsigned int len) {
+        unsigned int i, j;
+
+        for(i = 0, j = 0; j < len; i++, j += 4)
+            output[i] = ((UINT4)input[j]) | (((UINT4)input[j + 1]) << 8) |
+                        (((UINT4)input[j + 2]) << 16) | (((UINT4)input[j + 3]) << 24);
+    }
+    //#pragma endregion
+
+
+public:
+    // MAIN FUNCTIONS
+    // Constructs a calculator that is ready for Update().
+    MD5() {
+        Init() ;
+    }
+
+    // MD5 initialization. Begins an MD5 operation, writing a new context.
+    // Must be called again before reusing the object after Final(), because
+    // Final() zeroizes the context.
+    void Init() {
+        context.count[0] = context.count[1] = 0;
+
+        // Load magic initialization constants.
+        context.state[0] = 0x67452301;
+        context.state[1] = 0xefcdab89;
+        context.state[2] = 0x98badcfe;
+        context.state[3] = 0x10325476;
+    }
+
+    // MD5 block update operation. Continues an MD5 message-digest
+    // operation, processing another message block, and updating the
+    // context.
+    // Input is processed in 64-byte blocks; bytes that do not fill a block
+    // are kept in context.buffer until more input (or Final) arrives.
+    void Update(
+        unsigned char *input,   // input block
+        unsigned int inputLen) {  // length of input block
+        unsigned int i, index, partLen;
+
+        // Compute number of bytes mod 64
+        index = (unsigned int)((context.count[0] >> 3) & 0x3F);
+
+        // Update number of bits
+        // (a wrapped low word carries into the high word)
+        if((context.count[0] += ((UINT4)inputLen << 3))
+                < ((UINT4)inputLen << 3))
+            context.count[1]++;
+        context.count[1] += ((UINT4)inputLen >> 29);
+
+        // Bytes still missing to complete the buffered block
+        partLen = 64 - index;
+
+        // Transform as many times as possible.
+        if(inputLen >= partLen) {
+            memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
+            MD5Transform(context.state, context.buffer);
+
+            for(i = partLen; i + 63 < inputLen; i += 64)
+                MD5Transform(context.state, &input[i]);
+
+            index = 0;
+        } else
+            i = 0;
+
+        /* Buffer remaining input */
+        memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen - i);
+    }
+
+    // MD5 finalization. Ends an MD5 message-digest operation, writing
+    // the message digest and zeroizing the context.
+    // Writes to digestRaw
+    // and, through writeToString(), to digestChars.
+    void Final() {
+        unsigned char bits[8];
+        unsigned int index, padLen;
+
+        // Save number of bits
+        Encode(bits, context.count, 8);
+
+        // Pad out to 56 mod 64.
+        index = (unsigned int)((context.count[0] >> 3) & 0x3f);
+        padLen = (index < 56) ? (56 - index) : (120 - index);
+        Update(PADDING, padLen);
+
+        // Append length (before padding)
+        Update(bits, 8);
+
+        // Store state in digest
+        Encode(digestRaw, context.state, 16);
+
+        // Zeroize sensitive information.
+        memset((POINTER)&context, 0, sizeof(context));
+
+        writeToString() ;
+    }
+
+    /// Buffer must be 32+1 (nul) = 33 chars long at least
+    // Formats digestRaw as 32 lowercase hex digits into digestChars.
+    void writeToString() {
+        int pos ;
+
+        for(pos = 0 ; pos < 16 ; pos++)
+            sprintf(digestChars + (pos * 2), "%02x", digestRaw[pos]) ;
+    }
+
+
+public:
+    // an MD5 digest is a 16-byte number (32 hex digits)
+    BYTE digestRaw[ 16 ] ;
+
+    // This version of the digest is actually
+    // a "printf'd" version of the digest.
+    char digestChars[ 33 ] ;
+
+    /// Load a file from disk and digest it
+    // Digests a file and returns the result.
+    // Returns NULL when `filename` is NULL, empty, or cannot be opened;
+    // otherwise a pointer to digestChars (valid until the next digest).
+    const char* digestFile(const char *filename) {
+        if(NULL == filename || strcmp(filename, "") == 0)
+            return NULL;
+
+        Init() ;
+
+        FILE *file;
+
+        unsigned char buffer[1024] ;
+
+        if((file = fopen(filename, "rb")) == NULL) {
+            return NULL;
+        }
+        int len;
+        // fread returns 0 at end of file (or on error), which ends the loop
+        while((len = fread(buffer, 1, 1024, file)))
+            Update(buffer, len) ;
+        Final();
+
+        fclose(file);
+
+        return digestChars ;
+    }
+
+    /// Digests a byte-array already in memory
+    // Returns NULL when `memchunk` is NULL, otherwise a pointer to
+    // digestChars. NOTE(review): `len` is passed on as unsigned, so a
+    // negative value is presumably a caller error - confirm.
+    const char* digestMemory(BYTE *memchunk, int len) {
+        if(NULL == memchunk)
+            return NULL;
+
+        Init() ;
+        Update(memchunk, len) ;
+        Final() ;
+
+        return digestChars ;
+    }
+
+    // Digests a NUL-terminated string (the terminator is not hashed).
+    // Returns NULL when `string` is NULL, otherwise a pointer to digestChars.
+    const char* digestString(const char *string) {
+        if(string == NULL)
+            return NULL;
+
+        Init() ;
+        Update((unsigned char*)string, strlen(string)) ;
+        Final() ;
+
+        return digestChars ;
+    }
+};
+
+// Computes the MD5 of the NUL-terminated string `str`.
+// On success stores the 32-digit hex digest in `res` and returns true;
+// on failure (NULL input) sets `res` to "" and returns false.
+inline bool md5String(const char* str, std::string& res) {
+    MD5 hasher;
+    const char* hex = (NULL != str) ? hasher.digestString(str) : NULL;
+    const bool ok = (NULL != hex);
+    // `res` is assigned only after hashing, so `str` may point into `res`.
+    if(ok) {
+        res = hex;
+    } else {
+        res = "";
+    }
+    return ok;
+}
+
+// Computes the MD5 of the file at `filepath`.
+// On success stores the 32-digit hex digest in `res` and returns true;
+// on failure (NULL/empty path, or digestFile reporting an error) sets `res`
+// to "" and returns false.
+inline bool md5File(const char* filepath, std::string& res) {
+    const char* hex = NULL;
+    MD5 hasher;
+    if(NULL != filepath && strcmp(filepath, "") != 0) {
+        hex = hasher.digestFile(filepath);
+    }
+
+    if(NULL != hex) {
+        res = hex;
+        return true;
+    }
+    res = "";
+    return false;
+}
+}
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
@ -0,0 +1,51 @@
+#ifndef LIMONP_MUTEX_LOCK_HPP
+#define LIMONP_MUTEX_LOCK_HPP
+
+#include <pthread.h>
+#include "NonCopyable.hpp"
+#include "Logging.hpp"
+
+namespace limonp {
+
+// Owns one pthread mutex. Locking and unlocking are private and reachable
+// only through the friend class MutexLockGuard. Every pthread call is wrapped
+// in XCHECK, so a non-zero return code is reported through the FATAL logging
+// path. Copying is disabled by inheriting from NonCopyable.
+class MutexLock: NonCopyable {
+ public:
+  // Initialises the mutex with default attributes.
+  MutexLock() {
+    XCHECK(!pthread_mutex_init(&mutex_, NULL));
+  }
+  // Destroys the mutex.
+  ~MutexLock() {
+    XCHECK(!pthread_mutex_destroy(&mutex_));
+  }
+  // Exposes the raw mutex, e.g. for pthread_cond_wait.
+  pthread_mutex_t* GetPthreadMutex() {
+    return &mutex_;
+  }
+
+ private:
+  void Lock() {
+    XCHECK(!pthread_mutex_lock(&mutex_));
+  }
+  void Unlock() {
+    XCHECK(!pthread_mutex_unlock(&mutex_));
+  }
+  // The guard is the only code allowed to call Lock()/Unlock().
+  friend class MutexLockGuard;
+
+  pthread_mutex_t mutex_;  // the wrapped mutex
+}; // class MutexLock
+
+// Scope guard for MutexLock: locks the mutex in the constructor and unlocks
+// it in the destructor. Copying is disabled by inheriting from NonCopyable.
+class MutexLockGuard: NonCopyable {
+ public:
+  // Locks `mutex`; the reference is kept, so the mutex must outlive the guard.
+  explicit MutexLockGuard(MutexLock & mutex)
+    : mutex_(mutex) {
+    mutex_.Lock();
+  }
+  // Unlocks the mutex taken in the constructor.
+  ~MutexLockGuard() {
+    mutex_.Unlock();
+  }
+ private:
+  MutexLock & mutex_;  // mutex held for the lifetime of the guard
+}; // class MutexLockGuard
+
+// Function-like macro: it only fires when the name is directly followed by
+// `(`, so `MutexLockGuard(mutex);` expands to a failing XCHECK, while the
+// normal declaration `MutexLockGuard guard(mutex);` is left untouched.
+// NOTE(review): presumably intended to catch unnamed temporaries that would
+// unlock immediately - confirm.
+#define MutexLockGuard(x) XCHECK(false);
+
+} // namespace limonp
+
+#endif // LIMONP_MUTEX_LOCK_HPP
--- a/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
@ -0,0 +1,21 @@
+/************************************
+ ************************************/
+#ifndef LIMONP_NONCOPYABLE_H
+#define LIMONP_NONCOPYABLE_H
+
+namespace limonp {
+
+// Base class that disables copying for everything derived from it: the copy
+// constructor and copy assignment operator are declared private and never
+// defined. Construction and destruction are protected, so the class can only
+// be used as a base.
+class NonCopyable {
+ private:
+  // Declared but intentionally left undefined.
+  NonCopyable(const NonCopyable& );
+  const NonCopyable& operator=(const NonCopyable& );
+ protected:
+  NonCopyable() {}
+  ~NonCopyable() {}
+}; // class NonCopyable
+
+} // namespace limonp
+
+#endif // LIMONP_NONCOPYABLE_H
--- a/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
@ -0,0 +1,157 @@
+#ifndef LIMONP_STD_EXTEMSION_HPP
+#define LIMONP_STD_EXTEMSION_HPP
+
+#include <map>
+
+#ifdef __APPLE__
+#include <unordered_map>
+#include <unordered_set>
+#elif(__cplusplus >= 201103L)
+#include <unordered_map>
+#include <unordered_set>
+#elif defined _MSC_VER
+#include <unordered_map>
+#include <unordered_set>
+#else
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+namespace std {
+using std::tr1::unordered_map;
+using std::tr1::unordered_set;
+}
+
+#endif
+
+#include <set>
+#include <string>
+#include <vector>
+#include <deque>
+#include <fstream>
+#include <sstream>
+
+namespace std {
+
+// Prints a vector as [a, b, c]; an empty vector prints as [].
+template<typename T>
+ostream& operator << (ostream& os, const vector<T>& v) {
+  typename vector<T>::const_iterator it = v.begin();
+  if(it == v.end()) {
+    return os << "[]";
+  }
+  os << "[" << *it;
+  for(++it; it != v.end(); ++it) {
+    os << ", " << *it;
+  }
+  return os << "]";
+}
+
+// Specialisation for vector<string>: every element is wrapped in double
+// quotes, e.g. ["a", "b"]; an empty vector prints as [].
+template<>
+inline ostream& operator << (ostream& os, const vector<string>& v) {
+  vector<string>::const_iterator it = v.begin();
+  if(it == v.end()) {
+    return os << "[]";
+  }
+  os << "[\"" << *it;
+  for(++it; it != v.end(); ++it) {
+    os << "\", \"" << *it;
+  }
+  return os << "\"]";
+}
+
+// Prints a deque as ["a", "b", ...] - every element is wrapped in double
+// quotes regardless of its type; an empty deque prints as [].
+template<typename T>
+ostream& operator << (ostream& os, const deque<T>& dq) {
+  typename deque<T>::const_iterator it = dq.begin();
+  if(it == dq.end()) {
+    return os << "[]";
+  }
+  os << "[\"" << *it;
+  for(++it; it != dq.end(); ++it) {
+    os << "\", \"" << *it;
+  }
+  return os << "\"]";
+}
+
+
+// Prints a pair as first:second.
+template<class T1, class T2>
+ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
+  os << pr.first;
+  os << ":";
+  os << pr.second;
+  return os;
+}
+
+
+// Replaces the contents of `str` with the streamed text form of `obj`
+// (whatever `ostream << obj` produces) and returns `str`.
+template<class T>
+string& operator << (string& str, const T& obj) {
+  stringstream formatter;
+  formatter << obj; // resolves to the ostream& operator << for T
+  str = formatter.str();
+  return str;
+}
+
+// Prints a map as {k1:v1, k2:v2} in iteration order; an empty map prints
+// as {}. Each entry is written with the pair operator <<.
+template<class T1, class T2>
+ostream& operator << (ostream& os, const map<T1, T2>& mp) {
+  typename map<T1, T2>::const_iterator cur = mp.begin();
+  if(cur == mp.end()) {
+    return os << "{}";
+  }
+  os << '{' << *cur;
+  for(++cur; cur != mp.end(); ++cur) {
+    os << ", " << *cur;
+  }
+  return os << '}';
+}
+// Prints an unordered_map as {k1:v1, k2:v2} in the container's iteration
+// order; an empty map prints as {}. Each entry is written with the pair
+// operator <<.
+template<class T1, class T2>
+ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
+  typename std::unordered_map<T1, T2>::const_iterator cur = mp.begin();
+  if(cur == mp.end()) {
+    return os << "{}";
+  }
+  os << '{' << *cur;
+  for(++cur; cur != mp.end(); ++cur) {
+    os << ", " << *cur;
+  }
+  os << '}';
+  return os;
+}
+
+// Prints a set as {a, b, c} in iteration order; an empty set prints as {}.
+template<class T>
+ostream& operator << (ostream& os, const set<T>& st) {
+  typename set<T>::const_iterator cur = st.begin();
+  if(cur == st.end()) {
+    return os << "{}";
+  }
+  os << '{' << *cur;
+  for(++cur; cur != st.end(); ++cur) {
+    os << ", " << *cur;
+  }
+  return os << '}';
+}
+
+// Returns true when `key` is found in `contain`. Works for any container
+// that provides find() and end() (set, map, unordered_* ...).
+template<class KeyType, class ContainType>
+bool IsIn(const ContainType& contain, const KeyType& key) {
+  const bool found = (contain.find(key) != contain.end());
+  return found;
+}
+
+// Replaces the contents of `s` with everything that can still be read from
+// `ifs` (read through stream-buffer iterators, so no whitespace is skipped)
+// and returns `s`.
+template<class T>
+basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
+  istreambuf_iterator<T> first(ifs);
+  istreambuf_iterator<T> last;
+  return s.assign(first, last);
+}
+
+// Writes the characters of `s` to `ofs` unformatted, through a
+// stream-buffer iterator, and returns `ofs`.
+template<class T>
+ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
+  copy(s.begin(), s.end(), ostreambuf_iterator<T>(ofs));
+  return ofs;
+}
+
+} // namespace std
+
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
@ -0,0 +1,382 @@
+/************************************
+ * file enc : ascii
+ * author   : wuyanyi09@gmail.com
+ ************************************/
+#ifndef LIMONP_STR_FUNCTS_H
+#define LIMONP_STR_FUNCTS_H
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <memory.h>
+#include <sys/types.h>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cctype>
+#include <map>
+#include <functional>
+#include <locale>
+#include <sstream>
+#include <iterator>
+#include <algorithm>
+#include "StdExtension.hpp"
+
+namespace limonp {
+using namespace std;
+/*
+ * printf-style formatting that returns the result as a string.
+ *
+ * Starts with a 256-byte buffer and retries with a larger one until the
+ * formatted text (plus its terminating NUL) fits.
+ *
+ * fmt : printf format string; the variadic arguments must match it.
+ * returns the formatted text without the trailing NUL.
+ */
+inline string StringFormat(const char* fmt, ...) {
+  size_t capacity = 256;
+  std::string str;
+  for (;;) {
+    str.resize(capacity);
+    va_list ap;
+    va_start(ap, fmt);
+    // Write through &str[0]: the pointer returned by c_str() is const and
+    // modifying the string through it is undefined behaviour.
+    const int n = vsnprintf(&str[0], capacity, fmt, ap);
+    va_end(ap);
+    if (n > -1 && static_cast<size_t>(n) < capacity) {
+      str.resize(static_cast<size_t>(n));
+      return str;
+    }
+    // A non-negative n is the length vsnprintf needs (excluding the NUL);
+    // a negative n gives no size hint, so simply double the buffer.
+    capacity = (n > -1) ? static_cast<size_t>(n) + 1 : capacity * 2;
+  }
+}
+
+// Streams every element of [begin, end) into `res`, separated by `connector`.
+// An empty range returns early and leaves `res` exactly as it was.
+template<class T>
+void Join(T begin, T end, string& res, const string& connector) {
+  if(begin == end) {
+    return;
+  }
+  stringstream joined;
+  for(bool first = true; begin != end; ++begin, first = false) {
+    if(!first) {
+      joined << connector;
+    }
+    joined << *begin;
+  }
+  res = joined.str();
+}
+
+// Value-returning form of Join: an empty range yields an empty string.
+template<class T>
+string Join(T begin, T end, const string& connector) {
+  string joined;
+  Join(begin, end, joined, connector);
+  return joined;
+}
+
+// Upper-cases `str` in place, byte by byte, using the C locale rules of
+// toupper(), and returns `str`.
+inline string& Upper(string& str) {
+  for (string::iterator it = str.begin(); it != str.end(); ++it) {
+    // toupper() is only defined for values representable as unsigned char
+    // (or EOF). Where plain char is signed, a UTF-8/GBK byte >= 0x80 would be
+    // passed as a negative int, so convert through unsigned char first.
+    const int as_int = static_cast<unsigned char>(*it);
+    *it = static_cast<char>(std::toupper(as_int));
+  }
+  return str;
+}
+
+// Lower-cases `str` in place, byte by byte, using the C locale rules of
+// tolower(), and returns `str`.
+inline string& Lower(string& str) {
+  for (string::iterator it = str.begin(); it != str.end(); ++it) {
+    // tolower() is only defined for values representable as unsigned char
+    // (or EOF). Where plain char is signed, a UTF-8/GBK byte >= 0x80 would be
+    // passed as a negative int, so convert through unsigned char first.
+    const int as_int = static_cast<unsigned char>(*it);
+    *it = static_cast<char>(std::tolower(as_int));
+  }
+  return str;
+}
+
+// Whitespace test that is safe for wide code points: anything above 0xff is
+// reported as non-space without ever reaching isspace().
+inline bool IsSpace(unsigned c) {
+  // isspace() misbehaves (core dump) when handed a large int, hence the range guard.
+  if (c > 0xff) {
+    return false;
+  }
+  return std::isspace(c & 0xff);
+}
+
+// Removes leading whitespace (as judged by IsSpace) from `s` in place and
+// returns `s`.
+// Written as a plain loop: std::ptr_fun / std::not1 were removed in C++17.
+inline std::string& LTrim(std::string &s) {
+  std::string::iterator first_kept = s.begin();
+  while (first_kept != s.end() && IsSpace(*first_kept)) {
+    ++first_kept;
+  }
+  s.erase(s.begin(), first_kept);
+  return s;
+}
+
+// Removes trailing whitespace (as judged by IsSpace) from `s` in place and
+// returns `s`.
+// Written as a plain loop: std::ptr_fun / std::not1 were removed in C++17.
+inline std::string& RTrim(std::string &s) {
+  std::string::size_type kept = s.size();
+  while (kept > 0 && IsSpace(s[kept - 1])) {
+    --kept;
+  }
+  s.erase(kept);
+  return s;
+}
+
+// Strips whitespace from both ends of `s` in place (right side first) and
+// returns `s`.
+inline std::string& Trim(std::string &s) {
+  RTrim(s);
+  return LTrim(s);
+}
+
+// Removes every leading occurrence of the character `x` from `s` in place and
+// returns `s`.
+// Uses string::find_first_not_of: std::bind2nd / std::not1 were removed in C++17.
+inline std::string& LTrim(std::string & s, char x) {
+  // npos (no other character found) makes erase() drop the whole string.
+  s.erase(0, s.find_first_not_of(x));
+  return s;
+}
+
+// Removes every trailing occurrence of the character `x` from `s` in place and
+// returns `s`.
+// Uses string::find_last_not_of: std::bind2nd / std::not1 were removed in C++17.
+inline std::string& RTrim(std::string & s, char x) {
+  const std::string::size_type last_kept = s.find_last_not_of(x);
+  if (std::string::npos == last_kept) {
+    // Nothing but `x` (or an empty string): everything goes.
+    s.clear();
+  } else {
+    s.erase(last_kept + 1);
+  }
+  return s;
+}
+
+// Strips the character `x` from both ends of `s` in place (right side first)
+// and returns `s`.
+inline std::string& Trim(std::string &s, char x) {
+  RTrim(s, x);
+  return LTrim(s, x);
+}
+
+/*
+ * Splits `src` into `res` (cleared first).
+ *
+ * pattern  : a SET of delimiter characters (find_first_of), not a substring.
+ * maxsplit : once `res` already holds this many pieces, the remainder of
+ *            `src` is appended as one final piece.
+ *
+ * Adjacent delimiters yield empty pieces; a delimiter at the very end of
+ * `src` does not yield a trailing empty piece.
+ */
+inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
+  res.clear();
+  size_t cursor = 0;
+  while(cursor < src.size()) {
+    const size_t hit = src.find_first_of(pattern, cursor);
+    const bool last_piece = (string::npos == hit) || (res.size() >= maxsplit);
+    if(last_piece) {
+      res.push_back(src.substr(cursor));
+      return;
+    }
+    res.push_back(src.substr(cursor, hit - cursor));
+    cursor = hit + 1;
+  }
+}
+
+// Value-returning form of Split; see the output-parameter overload for the
+// meaning of `pattern` and `maxsplit`.
+inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
+  vector<string> pieces;
+  Split(src, pieces, pattern, maxsplit);
+  return pieces;
+}
+
+// True when `str` begins with `prefix`; an empty prefix always matches.
+inline bool StartsWith(const string& str, const string& prefix) {
+  const size_t prefix_len = prefix.length();
+  return prefix_len <= str.length()
+      && 0 == str.compare(0, prefix_len, prefix);
+}
+
+// True when `str` ends with `suffix`; an empty suffix always matches.
+inline bool EndsWith(const string& str, const string& suffix) {
+  const size_t str_len = str.length();
+  const size_t suffix_len = suffix.length();
+  if(suffix_len > str_len) {
+    return false;
+  }
+  return 0 == str.compare(str_len - suffix_len, suffix_len, suffix);
+}
+
+// True when the character `ch` occurs anywhere in `str`.
+inline bool IsInStr(const string& str, char ch) {
+  return string::npos != str.find(ch);
+}
+
+// Packs two bytes into one 16-bit value: `high` becomes bits 15..8 and `low`
+// bits 7..0. Masking with 0xff discards any sign extension of the chars.
+inline uint16_t TwocharToUint16(char high, char low) {
+  const uint16_t upper = uint16_t(high) & 0x00ff;
+  const uint16_t lower = uint16_t(low) & 0x00ff;
+  return (upper << 8) | lower;
+}
+
+/*
+ * Decodes UTF-8 bytes into 16-bit values appended to `vec` (cleared first).
+ *
+ * Only 1-, 2- and 3-byte sequences are handled. Any other lead byte, or a
+ * multi-byte sequence that would run past `len`, makes the function return
+ * false; `vec` then holds whatever was decoded before that point.
+ *
+ * str : input bytes; a null pointer returns false before `vec` is cleared.
+ * len : number of bytes readable from `str`.
+ * vec : output container; needs clear() and push_back().
+ *
+ * NOTE(review): continuation bytes are never validated, and a lead byte in
+ * 0x80..0xBF also satisfies the `<= 0xdf` test below, so malformed input is
+ * decoded as if it were a two-byte sequence instead of being rejected.
+ */
+template <class Uint16Container>
+bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
+  if(!str) {
+    return false;
+  }
+  char ch1, ch2; // high and low byte of the 16-bit value being assembled
+  uint16_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < len;) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      vec.push_back(str[i]);
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxx
+      // 11 payload bits: the top 3 go to the high byte, the low 8 to the low byte
+      ch1 = (str[i] >> 2) & 0x07;
+      ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < len) { // 1110xxxx
+      // 16 payload bits: 4 from the lead byte, 6 from each continuation byte
+      ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+      ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 3;
+    } else {
+      // 4-byte lead bytes and truncated sequences end up here
+      return false;
+    }
+  }
+  return true;
+}
+
+// std::string convenience wrapper: decodes all str.size() bytes of `str`.
+template <class Uint16Container>
+bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
+  const char * const bytes = str.c_str();
+  const size_t byte_count = str.size();
+  return Utf8ToUnicode(bytes, byte_count, vec);
+}
+
+/*
+ * Decodes UTF-8 bytes into 32-bit code points appended to `vec` (cleared
+ * first). Sequences of 1 to 4 bytes are handled.
+ *
+ * Returns false on a lead byte above 0xf7 or on a multi-byte sequence that
+ * would run past `size`; `vec` then holds the code points decoded so far.
+ *
+ * str  : input bytes. Unlike Utf8ToUnicode there is no null check here, so
+ *        it must be valid whenever size > 0.
+ * size : number of bytes readable from `str`.
+ * vec  : output container; needs clear() and push_back().
+ *
+ * NOTE(review): continuation bytes are never validated, and a lead byte in
+ * 0x80..0xBF also satisfies the `<= 0xdf` test below, so malformed input is
+ * decoded as if it were a two-byte sequence instead of being rejected.
+ */
+template <class Uint32Container>
+bool Utf8ToUnicode32(const char * str, size_t size, Uint32Container& vec) {
+  uint32_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < size;) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      // 7bit, total 7bit
+      tmp = (uint8_t)(str[i]) & 0x7f;
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < size) { // 110xxxxx
+      // 5bit, total 5bit
+      tmp = (uint8_t)(str[i]) & 0x1f;
+
+      // 6bit, total 11bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < size) { // 1110xxxx
+      // 4bit, total 4bit
+      tmp = (uint8_t)(str[i]) & 0x0f;
+
+      // 6bit, total 10bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 16bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      i += 3;
+    } else if((uint8_t)str[i] <= 0xf7 && i + 3 < size) { // 11110xxx
+      // 3bit, total 3bit
+      tmp = (uint8_t)(str[i]) & 0x07;
+
+      // 6bit, total 9bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 15bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      // 6bit, total 21bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+3]) & 0x3f;
+
+      i += 4;
+    } else {
+      // unsupported lead byte, or the sequence is cut off by `size`
+      return false;
+    }
+    vec.push_back(tmp);
+  }
+  return true;
+}
+
+// std::string convenience wrapper: decodes all str.size() bytes of `str`.
+template <class Uint32Container>
+bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
+    const char * const bytes = str.data();
+    const size_t byte_count = str.size();
+    return Utf8ToUnicode32(bytes, byte_count, vec);
+}
+
+// Number of bytes (1 to 4) that Unicode32ToUtf8 emits for the code point `ui`.
+inline int UnicodeToUtf8Bytes(uint32_t ui){
+    if(ui > 0xffff) {
+        return 4;
+    }
+    if(ui > 0x7ff) {
+        return 3;
+    }
+    if(ui > 0x7f) {
+        return 2;
+    }
+    return 1;
+}
+
+/*
+ * Encodes the 32-bit code points in [begin, end) as UTF-8 into `res`
+ * (cleared first).
+ *
+ * Code points are emitted as 1, 2, 3 or 4 bytes depending on their value;
+ * no validation (surrogates, values above 0x10FFFF) is performed.
+ */
+template <class Uint32ContainerConIter>
+void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
+  res.clear();
+  uint32_t ui;
+  while(begin != end) {
+    ui = *begin;
+    if(ui <= 0x7f) {
+      res += char(ui);
+    } else if(ui <= 0x7ff) {
+      res += char(((ui >> 6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else if(ui <= 0xffff) {
+      res += char(((ui >> 12) & 0x0f) | 0xe0);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    } else {
+      // The 4-byte lead byte carries 3 payload bits (11110xxx). Masking with
+      // 0x03 dropped bit 20, so U+100000..U+10FFFF were encoded incorrectly.
+      res += char(((ui >> 18) & 0x07) | 0xf0);
+      res += char(((ui >> 12) & 0x3f) | 0x80);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    }
+    begin ++;
+  }
+}
+
+// Encodes the 16-bit values in [begin, end) as UTF-8 into `res` (cleared
+// first). Each value becomes 1, 2 or 3 bytes depending on its magnitude.
+template <class Uint16ContainerConIter>
+void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+  res.clear();
+  for(; begin != end; begin++) {
+    const uint16_t code = *begin;
+    if(code <= 0x7f) {
+      res += char(code);
+      continue;
+    }
+    if(code <= 0x7ff) {
+      res += char(((code >> 6) & 0x1f) | 0xc0);
+    } else {
+      res += char(((code >> 12) & 0x0f) | 0xe0);
+      res += char(((code >> 6) & 0x3f) | 0x80);
+    }
+    // Both multi-byte forms end with the same low-six-bits continuation byte.
+    res += char((code & 0x3f) | 0x80);
+  }
+}
+
+
+/*
+ * Splits GBK-style bytes into 16-bit units appended to `vec` (cleared first).
+ *
+ * A byte with the high bit clear becomes one unit on its own. A byte with the
+ * high bit set is combined with the following byte (lead in bits 15..8).
+ * Returns false when such a lead byte is the last byte of the input.
+ * A null `str` is treated as empty input and returns true.
+ */
+template <class Uint16Container>
+bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
+  vec.clear();
+  if(!str) {
+    return true;
+  }
+  for(size_t pos = 0; pos < len;) {
+    const bool single_byte = (0 == (str[pos] & 0x80));
+    if(single_byte) {
+      vec.push_back(uint16_t(str[pos]));
+      pos += 1;
+      continue;
+    }
+    if(pos + 1 >= len) {
+      // lead byte without its trail byte
+      return false;
+    }
+    const uint16_t lead = uint16_t(str[pos]) & 0x00ff;
+    const uint16_t trail = uint16_t(str[pos + 1]) & 0x00ff;
+    const uint16_t unit = (lead << 8) | trail;
+    vec.push_back(unit);
+    pos += 2;
+  }
+  return true;
+}
+
+// std::string convenience wrapper: converts all str.size() bytes of `str`.
+template <class Uint16Container>
+bool GBKTrans(const string& str, Uint16Container& vec) {
+  const char * const bytes = str.c_str();
+  const size_t byte_count = str.size();
+  return GBKTrans(bytes, byte_count, vec);
+}
+
+// Inverse of the byte-to-unit GBKTrans: writes each 16-bit unit in
+// [begin, end) back to `res` (cleared first). A unit whose high byte has its
+// top bit set is written as two bytes; any other unit as its low byte only.
+template <class Uint16ContainerConIter>
+void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+  res.clear();
+  for(; begin != end; begin++) {
+    const char lead = ((*begin) >> 8) & 0x00ff;
+    const char trail = (*begin) & 0x00ff;
+    if(lead & 0x80) {
+      res += lead;
+    }
+    res += trail;
+  }
+}
+
+/*
+ * format example: "%Y-%m-%d %H:%M:%S"
+ */
+// inline void GetTime(const string& format, string&  timeStr) {
+//   time_t timeNow;
+//   time(&timeNow);
+//   timeStr.resize(64);
+//   size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
+//   timeStr.resize(len);
+// }
+
+// Concatenates two path fragments, inserting a single "/" between them
+// unless `path1` already ends with one.
+inline string PathJoin(const string& path1, const string& path2) {
+  const bool has_trailing_slash = EndsWith(path1, "/");
+  return has_trailing_slash ? path1 + path2 : path1 + "/" + path2;
+}
+
+}
+#endif
--- a/libchinese-segmentation/cppjieba/limonp/Thread.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Thread.hpp
@ -0,0 +1,44 @@
+#ifndef LIMONP_THREAD_HPP
+#define LIMONP_THREAD_HPP
+
+#include "Logging.hpp"
+#include "NonCopyable.hpp"
+
+namespace limonp {
+
+/*
+ * Minimal pthread wrapper: derive from it, implement Run(), then call
+ * Start() and later Join().
+ *
+ * A thread that was started but never joined is detached by the destructor.
+ */
+class IThread: NonCopyable {
+ public:
+  IThread(): isStarted(false), isJoined(false) {
+  }
+  virtual ~IThread() {
+    if(isStarted && !isJoined) {
+      XCHECK(!pthread_detach(thread_));
+    }
+  }
+
+  // Body executed on the new thread.
+  virtual void Run() = 0;
+
+  // Creates the thread; may only be called once.
+  void Start() {
+    XCHECK(!isStarted);
+    XCHECK(!pthread_create(&thread_, NULL, Worker, this));
+    isStarted = true;
+  }
+
+  // Waits for the thread to finish; may only succeed once.
+  void Join() {
+    if(!isStarted) {
+      // thread_ is uninitialised until Start() has run; handing it to
+      // pthread_join would be undefined behaviour, so there is nothing to wait for.
+      return;
+    }
+    XCHECK(!isJoined);
+    XCHECK(!pthread_join(thread_, NULL));
+    isJoined = true;
+  }
+ private:
+  // pthread entry point: `data` is the IThread instance passed to pthread_create.
+  static void * Worker(void * data) {
+    IThread * ptr = (IThread* ) data;
+    ptr->Run();
+    return NULL;
+  }
+
+  pthread_t thread_;
+  bool isStarted;
+  bool isJoined;
+}; // class IThread
+
+} // namespace limonp
+
+#endif // LIMONP_THREAD_HPP
--- a/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
@ -0,0 +1,86 @@
+#ifndef LIMONP_THREAD_POOL_HPP
+#define LIMONP_THREAD_POOL_HPP
+
+#include "Thread.hpp"
+#include "BlockingQueue.hpp"
+#include "BoundedBlockingQueue.hpp"
+#include "Closure.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+//class ThreadPool;
+/*
+ * Fixed-size pool of worker threads that run ClosureInterface tasks taken
+ * from a shared bounded blocking queue.
+ *
+ * Usage: construct with the number of threads, call Start(), hand tasks to
+ * Add(), then call Stop() (the destructor calls it as well).
+ * The pool deletes each task after running it.
+ */
+class ThreadPool: NonCopyable {
+ public:
+  // One pool thread: pops tasks from the owning pool's queue until it
+  // receives a NULL task.
+  class Worker: public IThread {
+   public:
+    Worker(ThreadPool* pool): ptThreadPool_(pool) {
+      assert(ptThreadPool_);
+    }
+    virtual ~Worker() {
+    }
+
+    virtual void Run() {
+      while (true) {
+        ClosureInterface* closure = ptThreadPool_->queue_.Pop();
+        if (closure == NULL) {
+          // NULL is the shutdown signal pushed by ThreadPool::Stop()
+          break;
+        }
+        try {
+          closure->Run();
+        } catch(std::exception& e) {
+          XLOG(ERROR) << e.what();
+        } catch(...) {
+          XLOG(ERROR) << " unknown exception.";
+        }
+        // the pool owns the task once it has been queued
+        delete closure;
+      }
+    }
+   private:
+    ThreadPool * ptThreadPool_;
+  }; // class Worker
+
+  // Allocates `thread_num` workers without starting them; the same number is
+  // passed to the queue's constructor (presumably its capacity - confirm in
+  // BoundedBlockingQueue).
+  ThreadPool(size_t thread_num)
+    : threads_(thread_num), 
+      queue_(thread_num) {
+    assert(thread_num);
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      threads_[i] = new Worker(this);
+    }
+  }
+  ~ThreadPool() {
+    Stop();
+  }
+
+  // Starts every worker thread.
+  void Start() {
+    for(size_t i = 0; i < threads_.size(); i++) {
+      threads_[i]->Start();
+    }
+  }
+  // Pushes one NULL per worker so that each Run() loop exits, then joins and
+  // deletes the workers. A second call is a no-op because threads_ is empty.
+  // NOTE(review): Join() is called on every worker even if Start() was never
+  // called on this pool.
+  void Stop() {
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      queue_.Push(NULL);
+    }
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      threads_[i]->Join();
+      delete threads_[i];
+    }
+    threads_.clear();
+  }
+
+  // Queues `task` (must not be NULL); a worker runs and then deletes it.
+  void Add(ClosureInterface* task) {
+    assert(task);
+    queue_.Push(task);
+  }
+
+ private:
+  friend class Worker;
+
+  vector<IThread*> threads_;
+  BoundedBlockingQueue<ClosureInterface*> queue_;
+}; // class ThreadPool
+
+} // namespace limonp
+
+#endif // LIMONP_THREAD_POOL_HPP
--- a/libchinese-segmentation/cppjieba/limonp/limonp.pri
+++ b/libchinese-segmentation/cppjieba/limonp/limonp.pri
@ -0,0 +1,22 @@
+# qmake include for the limonp utility headers: adds this directory to the
+# include path and registers each header with the including project.
+INCLUDEPATH += $$PWD
+
+HEADERS += \
+    $$PWD/ArgvContext.hpp \
+    $$PWD/BlockingQueue.hpp \
+    $$PWD/BoundedBlockingQueue.hpp \
+    $$PWD/BoundedQueue.hpp \
+    $$PWD/Closure.hpp \
+    $$PWD/Colors.hpp \
+    $$PWD/Condition.hpp \
+    $$PWD/Config.hpp \
+    $$PWD/FileLock.hpp \
+    $$PWD/ForcePublic.hpp \
+    $$PWD/LocalVector.hpp \
+    $$PWD/Logging.hpp \
+    $$PWD/Md5.hpp \
+    $$PWD/MutexLock.hpp \
+    $$PWD/NonCopyable.hpp \
+    $$PWD/StdExtension.hpp \
+    $$PWD/StringUtil.hpp \
+    $$PWD/Thread.hpp \
+    $$PWD/ThreadPool.hpp
--- a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp
+++ b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp
@ -0,0 +1,275 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#include <cmath>
+#include "segment-trie.h"
+
+// Forwards the dictionary file list and the cache path to StorageBase, then
+// runs Init().
+DictTrie::DictTrie(const vector<string> file_paths, string dat_cache_path)
+    : StorageBase<DatMemElem, false, DictCacheFileHeader>(file_paths, dat_cache_path)
+{
+    Init();
+}
+
+// Convenience constructor: bundles the default and user dictionary paths into
+// a two-element list for StorageBase, then runs Init().
+DictTrie::DictTrie(const string &dict_path, const string &user_dict_paths, const string &dat_cache_path)
+    : StorageBase<DatMemElem, false, DictCacheFileHeader>(vector<string>{dict_path, user_dict_paths}, dat_cache_path)
+{
+    Init();
+}
+
+/*
+ * Builds the binary cache file from the text dictionaries.
+ *
+ * The cache is written to a temporary file next to `dat_cache_file`
+ * (header, one DatMemElem per word, then the trie array) and handed to
+ * tryRename() once complete.
+ *
+ * dat_cache_file : final path of the cache file.
+ * md5            : digest stored in the header; expected to be exactly
+ *                  sizeof(header.md5_hex) characters long.
+ */
+void DictTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
+{
+    DictCacheFileHeader header;
+    assert(sizeof(header.md5_hex) == md5.size());
+    // Never copy more than the header field holds, even when asserts are compiled out.
+    const size_t md5_len = md5.size() < sizeof(header.md5_hex) ? md5.size() : sizeof(header.md5_hex);
+    memcpy(&header.md5_hex[0], md5.c_str(), md5_len);
+
+    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
+    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+    // umask() is process-wide: tighten it only while the temp file is created,
+    // then restore the caller's value instead of leaking the change into the host program.
+    const mode_t previous_mask = umask(S_IWGRP | S_IWOTH);
+    // mkstemp() rewrites the XXXXXX suffix in place, so give it the string's
+    // own writable buffer rather than casting away the const of data().
+    const int fd = mkstemp(&tmp_filepath[0]);
+    umask(previous_mask);
+    assert(fd >= 0);
+    fchmod(fd, 0644);
+
+    // Placeholder header; the counters are patched in below once they are known.
+    write_bytes = write(fd, (const char *)&header, sizeof(DictCacheFileHeader));
+
+    this->PreLoad();
+    this->LoadDefaultDict(fd, write_bytes, offset, elements_num);
+    this->LoadUserDict(fd, write_bytes, offset, elements_num);
+
+    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
+
+    // Patch the header fields that follow md5_hex.
+    // NOTE(review): these are written back-to-back (int, int, int, double);
+    // confirm this matches the in-memory layout of DictCacheFileHeader,
+    // including any alignment padding before min_weight.
+    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
+    write(fd, &elements_num, sizeof(int));
+    write(fd, &offset, sizeof(int));
+    data_trie_size = this->GetDataTrieSize();
+    write(fd, &data_trie_size, sizeof(int));
+    write(fd, &m_min_weight, sizeof(double));
+
+    close(fd);
+    assert((size_t)write_bytes == sizeof(DictCacheFileHeader) + offset + this->GetDataTrieTotalSize());
+
+    tryRename(tmp_filepath, dat_cache_file);
+}
+
+// Exact-match lookup: returns the element stored for `key`, or nullptr when
+// the trie search reports a negative index.
+const DatMemElem * DictTrie::Find(const string &key) const
+{
+    const int index = this->ExactMatchSearch(key.c_str(), key.size());
+    return (index < 0) ? nullptr : &this->GetElementPtr()[index];
+}
+
+
+
+/*
+ * Builds the word graph (DAG) for the runes in [begin, end).
+ *
+ * `res` is resized to one DatDag per rune. For rune i, nexts[0] is always the
+ * single-rune step (i + 1); its element pointer stays null unless a
+ * one-character dictionary entry matches. Every longer dictionary word that
+ * starts at rune i adds a further (i + word_length, element) entry.
+ * Matches longer than `max_word_len` characters are ignored.
+ */
+void DictTrie::FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<DatDag> &res, size_t max_word_len) const {
+
+    res.clear();
+    res.resize(end - begin);
+
+    // The trie is searched on UTF-8 bytes, so re-encode the runes first.
+    string text_str;
+    EncodeRunesToString(begin, end, text_str);
+
+    // max_num is handed to CommonPrefixSearch together with the result buffer
+    // (presumably the maximum number of matches it may report - confirm).
+    static const size_t max_num = 128;
+    result_pair_type result_pairs[max_num] = {};
+
+    // i counts runes, begin_pos is the byte offset of rune i inside text_str
+    for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
+
+        std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+
+        res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
+
+        for (std::size_t idx = 0; idx < num_results; ++idx) {
+            auto & match = result_pairs[idx];
+
+            // skip values that do not index into the element table
+            if ((match.value < 0) || ((size_t)match.value >= this->GetCacheFileHeaderPtr()->elements_size)) {
+                continue;
+            }
+
+            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+
+            if (char_num > max_word_len) {
+                continue;
+            }
+
+            const DatMemElem * pValue = &this->GetElementPtr()[match.value];
+
+            if (1 == char_num) {
+                // a one-character word fills in the default single-rune edge
+                res[i].nexts[0].second = pValue;
+                continue;
+            }
+
+            res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
+        }
+
+        // advance by the encoded width of the rune just handled
+        begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
+    }
+}
+
+/*
+ * Segments the runes in [begin, end) into the word sequence with the highest
+ * total weight and appends one WordRange per word to `words`.
+ *
+ * Works backwards from the last rune: for each position it keeps the best
+ * achievable weight of the remaining text (max_weight) and the rune index at
+ * which the chosen word ends (max_next). Word weights are the log
+ * probabilities stored in the dictionary; a rune for which no usable
+ * dictionary word starts is emitted as a single-rune word with the minimum
+ * weight. Matches longer than `max_word_len` characters are ignored.
+ */
+void DictTrie::FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange> &words, size_t max_word_len) const {
+
+    const size_t str_size = end - begin;
+    if (0 == str_size) {
+        return;
+    }
+
+    string text_str;
+    EncodeRunesToString(begin, end, text_str);
+
+    static const size_t max_num = 128;
+    result_pair_type result_pairs[max_num] = {}; // dictionary matches for the current position
+
+    // Heap-backed instead of variable-length arrays: VLAs are not standard
+    // C++ and put an input-sized buffer on the stack.
+    vector<double> max_weight(str_size, -3.14e+100); // best weight of the suffix starting at each rune
+    vector<size_t> max_next(str_size, 0);            // rune index just past the word chosen at each rune
+
+    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
+        const size_t cur = str_size - i - 1; // rune handled in this round (right to left)
+        begin_pos -= (end - i - 1)->len;
+
+        const std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+
+        bool assigned = false;
+        for (std::size_t idx = 0; idx < num_results; ++idx) {
+            auto & match = result_pairs[idx];
+            if ((match.value < 0) || ((uint32_t)match.value >= this->GetCacheFileHeaderPtr()->elements_size)) {
+                continue;
+            }
+            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+            if ((0 == char_num) || (char_num > max_word_len)) {
+                continue;
+            }
+            const size_t next = cur + char_num;
+            if (next > str_size) {
+                continue;
+            }
+
+            double val = this->GetElementPtr()[match.value].weight;
+            if (next < str_size) {
+                val += max_weight[next];
+            }
+            if (val > max_weight[cur]) {
+                max_weight[cur] = val;
+                max_next[cur] = next;
+                assigned = true;
+            }
+        }
+
+        if (!assigned) {
+            // No dictionary word was accepted here (nothing matched, or every
+            // match was filtered out): fall back to a single-rune word so that
+            // max_next[cur] is always initialised and always moves forward.
+            double val = GetMinWeight();
+            if (cur + 1 < str_size) {
+                val += max_weight[cur + 1];
+            }
+            max_weight[cur] = val;
+            max_next[cur] = cur + 1;
+        }
+    }
+
+    // Walk the chosen path from the first rune and emit the words.
+    for (size_t i = 0; i < str_size;) {
+        assert(max_next[i] > i);
+        assert(max_next[i] <= str_size);
+        WordRange wr(begin + i, begin + max_next[i] - 1);
+        words.push_back(wr);
+        i = max_next[i];
+    }
+}
+
+// True when `word` was recorded by LoadUserDict as a single-character user
+// dictionary entry.
+bool DictTrie::IsUserDictSingleChineseWord(const Rune &word) const {
+    const bool known = IsIn(m_user_dict_single_chinese_word, word);
+    return known;
+}
+
+// First pass over the default dictionary (DICT_PATH): adds the frequency
+// column of every well-formed "word freq tag" line to m_freq_sum.
+// Empty lines, '#' comments and lines without exactly three fields are skipped.
+void DictTrie::PreLoad()
+{
+    ifstream dict_stream(DICT_PATH);
+    vector<string> fields;
+    string line;
+
+    while (getline(dict_stream, line)) {
+        if (line.empty() || limonp::StartsWith(line, "#")) {
+            continue;
+        }
+        limonp::Split(line, fields, " ");
+        if (3 == fields.size()) {
+            m_freq_sum += atof(fields[1].c_str());
+        }
+    }
+}
+
+/*
+ * Second pass over the default dictionary (DICT_PATH).
+ *
+ * Every well-formed "word freq tag" line is inserted into the trie under the
+ * running index `elements_num`, and its DatMemElem (weight = log(freq /
+ * m_freq_sum), plus the tag) is written to `fd`. m_min_weight tracks the
+ * smallest weight seen.
+ *
+ * write_bytes, offset and elements_num are running totals shared with the
+ * caller and are advanced for each accepted line.
+ */
+void DictTrie::LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
+{
+    ifstream dict_stream(DICT_PATH);
+    vector<string> fields;
+    string line;
+
+    while (getline(dict_stream, line)) {
+        if (line.empty() || limonp::StartsWith(line, "#")) {
+            continue;
+        }
+        limonp::Split(line, fields, " ");
+        if (3 != fields.size()) {
+            continue;
+        }
+
+        DatMemElem node_info;
+        node_info.weight = log(atof(fields[1].c_str()) / m_freq_sum);
+        node_info.SetTag(fields[2]);
+
+        this->Update(fields[0].c_str(), fields[0].size(), elements_num);
+        elements_num++;
+        offset += (sizeof(DatMemElem));
+
+        if (node_info.weight < m_min_weight) {
+            m_min_weight = node_info.weight;
+        }
+        write_bytes += write(fd, &node_info, sizeof(DatMemElem));
+    }
+}
+
+/*
+ * Loads the user dictionary (USER_DICT_PATH) after the default one.
+ *
+ * Only lines with exactly three space-separated fields ("word freq tag") are
+ * used; they are inserted into the trie and written to `fd` the same way as
+ * default entries, with weight = log(freq / m_freq_sum). Single-character
+ * words are additionally remembered in m_user_dict_single_chinese_word.
+ * Unlike the default dictionary, m_min_weight is not updated here.
+ */
+void DictTrie::LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
+{
+    ifstream user_stream(USER_DICT_PATH);
+    vector<string> fields;
+    string line;
+
+    while (getline(user_stream, line)) {
+        if (line.empty() || limonp::StartsWith(line, "#")) {
+            continue;
+        }
+        limonp::Split(line, fields, " ");
+        if (3 != fields.size()) {
+            continue;
+        }
+
+        DatMemElem node_info;
+        assert(m_freq_sum > 0.0);
+        const int freq = atoi(fields[1].c_str());
+        node_info.weight = log(1.0 * freq / m_freq_sum);
+        node_info.SetTag(fields[2]);
+
+        this->Update(fields[0].c_str(), fields[0].size(), elements_num);
+        elements_num++;
+        offset += (sizeof(DatMemElem));
+        write_bytes += write(fd, &node_info, sizeof(DatMemElem));
+
+        if (1 != Utf8CharNum(fields[0])) {
+            continue;
+        }
+        RuneArray runes;
+        if (DecodeRunesInString(fields[0], runes)) {
+            m_user_dict_single_chinese_word.insert(runes[0]);
+        }
+    }
+}
+
+// Minimum word weight as stored in the loaded cache-file header.
+// Not marked `inline`: the class declares this member as an ordinary function
+// and it is defined only in this translation unit.
+double DictTrie::GetMinWeight() const
+{
+    return this->GetCacheFileHeaderPtr()->min_weight;
+}
--- a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.h
+++ b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.h
@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef SegmentTrie_H
+#define SegmentTrie_H
+
+#include "storage-base.hpp"
+#include "cppjieba/Unicode.hpp"
+
+using namespace cppjieba;
+
+const char * const DICT_PATH = DICT_INSTALL_PATH"/jieba.dict.utf8";
+const char * const USER_DICT_PATH = DICT_INSTALL_PATH"/user.dict.utf8";
+
+// Cache-file header for DictTrie: the fields of CacheFileHeaderBase plus the
+// minimum word weight.
+struct DictCacheFileHeader : CacheFileHeaderBase
+{
+    // Written by DictTrie::LoadSourceFile, read back by DictTrie::GetMinWeight().
+    double min_weight = 0;
+};
+
+// Word-segmentation dictionary: a StorageBase specialisation whose elements
+// are DatMemElem records (weight + tag) and whose cache header also carries
+// the minimum word weight.
+class DictTrie : public StorageBase<DatMemElem, false, DictCacheFileHeader>
+{
+public:
+    // Both constructors pass their paths to StorageBase and then run Init().
+    DictTrie(const vector<string> file_paths, string dat_cache_path = "");
+    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "");
+    // Builds the binary cache file; the dictionaries are read from DICT_PATH and USER_DICT_PATH.
+    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
+
+    // Exact-match lookup; nullptr when `key` is not in the dictionary.
+    const DatMemElem *Find(const string &key) const;
+    // Fills `res` with, per rune, every dictionary word that starts there.
+    void FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res, size_t max_word_len = MAX_WORD_LENGTH) const;
+    // Appends the highest-weight segmentation of [begin, end) to `words`.
+    void FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<WordRange>& words, size_t max_word_len = MAX_WORD_LENGTH) const;
+    // True when `word` is a single-character entry of the user dictionary.
+    bool IsUserDictSingleChineseWord(const Rune& word) const;
+
+private:
+    DictTrie(); // private, so outside code cannot default-construct
+    // Sums the frequencies of the default dictionary into m_freq_sum.
+    void PreLoad();
+    // Insert dictionary entries into the trie and append their records to fd.
+    void LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
+    void LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
+    // Minimum weight as stored in the loaded cache header.
+    double GetMinWeight() const;
+
+    double m_freq_sum = 0.0;         // total frequency of the default dictionary
+    double m_min_weight = 3.14e+100; // smallest weight seen while loading the default dictionary
+    unordered_set<Rune> m_user_dict_single_chinese_word; // single-character user-dictionary words
+};
+
+#endif // SegmentTrie_H
--- a/libchinese-segmentation/development-files/header-files/ChineseSegmentation
+++ b/libchinese-segmentation/development-files/header-files/ChineseSegmentation
@ -0,0 +1 @@
+#include "chinese-segmentation.h"
--- a/libchinese-segmentation/development-files/header-files/HanZiToPinYin
+++ b/libchinese-segmentation/development-files/header-files/HanZiToPinYin
@ -0,0 +1 @@
+#include "hanzi-to-pinyin.h"
--- a/libchinese-segmentation/dict/README.md
+++ b/libchinese-segmentation/dict/README.md
@ -0,0 +1,31 @@
+# CppJieba字典
+
+文件后缀名代表的是词典的编码方式。
+比如filename.utf8 是 utf8编码，filename.gbk 是 gbk编码方式。
+
+
+## 分词
+
+### jieba.dict.utf8/gbk
+
+作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
+
+### hmm_model.utf8/gbk
+
+作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
+
+__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
+
+
+## 关键词抽取
+
+### idf.utf8
+
+IDF(Inverse Document Frequency)
+在KeywordExtractor中，使用的是经典的TF-IDF算法，所以需要这么一个词典提供IDF信息。
+
+### stop_words.utf8
+
+停用词词典
+
+
--- a/libchinese-segmentation/dict/hmm_model.utf8
+++ b/libchinese-segmentation/dict/hmm_model.utf8
--- a/libchinese-segmentation/dict/idf.utf8
+++ b/libchinese-segmentation/dict/idf.utf8
--- a/libchinese-segmentation/dict/jieba.dict.utf8
+++ b/libchinese-segmentation/dict/jieba.dict.utf8
--- a/libchinese-segmentation/dict/pinyinWithoutTone.txt
+++ b/libchinese-segmentation/dict/pinyinWithoutTone.txt
--- a/libchinese-segmentation/dict/pos_dict/char_state_tab.utf8
+++ b/libchinese-segmentation/dict/pos_dict/char_state_tab.utf8
--- a/libchinese-segmentation/dict/pos_dict/prob_emit.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_emit.utf8
--- a/libchinese-segmentation/dict/pos_dict/prob_start.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_start.utf8
@ -0,0 +1,259 @@
+#初始状态的概率
+#格式
+#状态:概率
+B,a:-4.7623052146
+B,ad:-6.68006603678
+B,ag:-3.14e+100
+B,an:-8.69708322302
+B,b:-5.01837436211
+B,bg:-3.14e+100
+B,c:-3.42388018495
+B,d:-3.97504752976
+B,df:-8.88897423083
+B,dg:-3.14e+100
+B,e:-8.56355183039
+B,en:-3.14e+100
+B,f:-5.49163041848
+B,g:-3.14e+100
+B,h:-13.53336513
+B,i:-6.11578472756
+B,in:-3.14e+100
+B,j:-5.05761912847
+B,jn:-3.14e+100
+B,k:-3.14e+100
+B,l:-4.90588358466
+B,ln:-3.14e+100
+B,m:-3.6524299819
+B,mg:-3.14e+100
+B,mq:-6.7869530014
+B,n:-1.69662577975
+B,ng:-3.14e+100
+B,nr:-2.23104959138
+B,nrfg:-5.87372217541
+B,nrt:-4.98564273352
+B,ns:-2.8228438315
+B,nt:-4.84609166818
+B,nz:-3.94698846058
+B,o:-8.43349870215
+B,p:-4.20098413209
+B,q:-6.99812385896
+B,qe:-3.14e+100
+B,qg:-3.14e+100
+B,r:-3.40981877908
+B,rg:-3.14e+100
+B,rr:-12.4347528413
+B,rz:-7.94611647157
+B,s:-5.52267359084
+B,t:-3.36474790945
+B,tg:-3.14e+100
+B,u:-9.1639172775
+B,ud:-3.14e+100
+B,ug:-3.14e+100
+B,uj:-3.14e+100
+B,ul:-3.14e+100
+B,uv:-3.14e+100
+B,uz:-3.14e+100
+B,v:-2.67405848743
+B,vd:-9.04472876024
+B,vg:-3.14e+100
+B,vi:-12.4347528413
+B,vn:-4.33156108902
+B,vq:-12.1470707689
+B,w:-3.14e+100
+B,x:-3.14e+100
+B,y:-9.84448567586
+B,yg:-3.14e+100
+B,z:-7.04568111149
+B,zg:-3.14e+100
+E,a:-3.14e+100
+E,ad:-3.14e+100
+E,ag:-3.14e+100
+E,an:-3.14e+100
+E,b:-3.14e+100
+E,bg:-3.14e+100
+E,c:-3.14e+100
+E,d:-3.14e+100
+E,df:-3.14e+100
+E,dg:-3.14e+100
+E,e:-3.14e+100
+E,en:-3.14e+100
+E,f:-3.14e+100
+E,g:-3.14e+100
+E,h:-3.14e+100
+E,i:-3.14e+100
+E,in:-3.14e+100
+E,j:-3.14e+100
+E,jn:-3.14e+100
+E,k:-3.14e+100
+E,l:-3.14e+100
+E,ln:-3.14e+100
+E,m:-3.14e+100
+E,mg:-3.14e+100
+E,mq:-3.14e+100
+E,n:-3.14e+100
+E,ng:-3.14e+100
+E,nr:-3.14e+100
+E,nrfg:-3.14e+100
+E,nrt:-3.14e+100
+E,ns:-3.14e+100
+E,nt:-3.14e+100
+E,nz:-3.14e+100
+E,o:-3.14e+100
+E,p:-3.14e+100
+E,q:-3.14e+100
+E,qe:-3.14e+100
+E,qg:-3.14e+100
+E,r:-3.14e+100
+E,rg:-3.14e+100
+E,rr:-3.14e+100
+E,rz:-3.14e+100
+E,s:-3.14e+100
+E,t:-3.14e+100
+E,tg:-3.14e+100
+E,u:-3.14e+100
+E,ud:-3.14e+100
+E,ug:-3.14e+100
+E,uj:-3.14e+100
+E,ul:-3.14e+100
+E,uv:-3.14e+100
+E,uz:-3.14e+100
+E,v:-3.14e+100
+E,vd:-3.14e+100
+E,vg:-3.14e+100
+E,vi:-3.14e+100
+E,vn:-3.14e+100
+E,vq:-3.14e+100
+E,w:-3.14e+100
+E,x:-3.14e+100
+E,y:-3.14e+100
+E,yg:-3.14e+100
+E,z:-3.14e+100
+E,zg:-3.14e+100
+M,a:-3.14e+100
+M,ad:-3.14e+100
+M,ag:-3.14e+100
+M,an:-3.14e+100
+M,b:-3.14e+100
+M,bg:-3.14e+100
+M,c:-3.14e+100
+M,d:-3.14e+100
+M,df:-3.14e+100
+M,dg:-3.14e+100
+M,e:-3.14e+100
+M,en:-3.14e+100
+M,f:-3.14e+100
+M,g:-3.14e+100
+M,h:-3.14e+100
+M,i:-3.14e+100
+M,in:-3.14e+100
+M,j:-3.14e+100
+M,jn:-3.14e+100
+M,k:-3.14e+100
+M,l:-3.14e+100
+M,ln:-3.14e+100
+M,m:-3.14e+100
+M,mg:-3.14e+100
+M,mq:-3.14e+100
+M,n:-3.14e+100
+M,ng:-3.14e+100
+M,nr:-3.14e+100
+M,nrfg:-3.14e+100
+M,nrt:-3.14e+100
+M,ns:-3.14e+100
+M,nt:-3.14e+100
+M,nz:-3.14e+100
+M,o:-3.14e+100
+M,p:-3.14e+100
+M,q:-3.14e+100
+M,qe:-3.14e+100
+M,qg:-3.14e+100
+M,r:-3.14e+100
+M,rg:-3.14e+100
+M,rr:-3.14e+100
+M,rz:-3.14e+100
+M,s:-3.14e+100
+M,t:-3.14e+100
+M,tg:-3.14e+100
+M,u:-3.14e+100
+M,ud:-3.14e+100
+M,ug:-3.14e+100
+M,uj:-3.14e+100
+M,ul:-3.14e+100
+M,uv:-3.14e+100
+M,uz:-3.14e+100
+M,v:-3.14e+100
+M,vd:-3.14e+100
+M,vg:-3.14e+100
+M,vi:-3.14e+100
+M,vn:-3.14e+100
+M,vq:-3.14e+100
+M,w:-3.14e+100
+M,x:-3.14e+100
+M,y:-3.14e+100
+M,yg:-3.14e+100
+M,z:-3.14e+100
+M,zg:-3.14e+100
+S,a:-3.90253968313
+S,ad:-11.0484584802
+S,ag:-6.95411391796
+S,an:-12.8402179494
+S,b:-6.47288876397
+S,bg:-3.14e+100
+S,c:-4.78696679586
+S,d:-3.90391976418
+S,df:-3.14e+100
+S,dg:-8.9483976513
+S,e:-5.94251300628
+S,en:-3.14e+100
+S,f:-5.19482024998
+S,g:-6.50782681533
+S,h:-8.65056320738
+S,i:-3.14e+100
+S,in:-3.14e+100
+S,j:-4.91199211964
+S,jn:-3.14e+100
+S,k:-6.94032059583
+S,l:-3.14e+100
+S,ln:-3.14e+100
+S,m:-3.26920065212
+S,mg:-10.8253149289
+S,mq:-3.14e+100
+S,n:-3.85514838976
+S,ng:-4.9134348611
+S,nr:-4.48366310396
+S,nrfg:-3.14e+100
+S,nrt:-3.14e+100
+S,ns:-3.14e+100
+S,nt:-12.1470707689
+S,nz:-3.14e+100
+S,o:-8.46446092775
+S,p:-2.98684018136
+S,q:-4.88865861826
+S,qe:-3.14e+100
+S,qg:-3.14e+100
+S,r:-2.76353367841
+S,rg:-10.2752685919
+S,rr:-3.14e+100
+S,rz:-3.14e+100
+S,s:-3.14e+100
+S,t:-3.14e+100
+S,tg:-6.27284253188
+S,u:-6.94032059583
+S,ud:-7.72823016105
+S,ug:-7.53940370266
+S,uj:-6.85251045118
+S,ul:-8.41537131755
+S,uv:-8.15808672229
+S,uz:-9.29925862537
+S,v:-3.05329230341
+S,vd:-3.14e+100
+S,vg:-5.94301818437
+S,vi:-3.14e+100
+S,vn:-11.4539235883
+S,vq:-3.14e+100
+S,w:-3.14e+100
+S,x:-8.42741965607
+S,y:-6.19707946995
+S,yg:-13.53336513
+S,z:-3.14e+100
+S,zg:-3.14e+100
--- a/libchinese-segmentation/dict/pos_dict/prob_trans.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_trans.utf8
--- a/libchinese-segmentation/dict/stop_words.utf8
+++ b/libchinese-segmentation/dict/stop_words.utf8
--- a/libchinese-segmentation/dict/user.dict.utf8
+++ b/libchinese-segmentation/dict/user.dict.utf8
@ -0,0 +1,4 @@
+云计算
+韩玉鉴赏
+蓝翔 nz
+区块链 10 nz
--- a/libchinese-segmentation/hanzi-to-pinyin-private.h
+++ b/libchinese-segmentation/hanzi-to-pinyin-private.h
@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#ifndef HANZITOPINYINPRIVATE_H
+#define HANZITOPINYINPRIVATE_H
+
+#include <QtCore/qglobal.h>
+#include <QHash>
+#include "pinyin4cpp_dictTrie.h"
+#include "hanzi-to-pinyin.h"
+#include "pinyin4cpp-trie.h"
+
+using namespace std;
+
+// Lookup table from a tone-marked pinyin letter to its plain spelling: the
+// base letter followed by the tone number. The bare "ü" entry maps to "v"
+// and carries no tone digit, so its value is a single character.
+// NOTE(review): this is a `static` object defined in a header, so each
+// translation unit that includes the header builds its own copy.
+// NOTE(review): a few keys (the m and ê variants) look like a base letter
+// plus a combining accent, i.e. more than one QChar; a lookup keyed by a
+// single QChar would never match those entries - confirm.
+static const QHash<QString, QString> PhoneticSymbol = {
+    {"ā", "a1"}, {"á", "a2"}, {"ǎ", "a3"}, {"à", "a4"},
+    {"ē", "e1"}, {"é", "e2"}, {"ě", "e3"}, {"è", "e4"},
+    {"ō", "o1"}, {"ó", "o2"}, {"ǒ", "o3"}, {"ò", "o4"},
+    {"ī", "i1"}, {"í", "i2"}, {"ǐ", "i3"}, {"ì", "i4"},
+    {"ū", "u1"}, {"ú", "u2"}, {"ǔ", "u3"}, {"ù", "u4"},
+    // ü without a tone mark (as in "üe") is written as "v"
+    {"ü", "v"},
+    {"ǖ", "v1"}, {"ǘ", "v2"}, {"ǚ", "v3"}, {"ǜ", "v4"},
+    {"ń", "n2"}, {"ň", "n3"}, {"ǹ", "n4"},
+    {"m̄", "m1"}, {"ḿ", "m2"}, {"m̀", "m4"},
+    {"ê̄", "ê1"}, {"ế", "ê2"}, {"ê̌", "ê3"}, {"ề", "ê4"}
+};
+
+#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
+
+/**
+ * Private implementation behind HanZiToPinYin.
+ * Owns the pinyin trie and the four configuration values that control how
+ * getResults() looks text up and how the results are formatted.
+ */
+class PINYINMANAGER_EXPORT HanZiToPinYinPrivate
+{
+public:
+    HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr);
+    ~HanZiToPinYinPrivate();
+
+public:
+    // Perfect-forwards the argument to Pinyin4cppTrie::IsMultiTone().
+    template <typename T>
+    bool isMultiTone(T &&t) {return m_pinYinTrie.IsMultiTone(std::forward<T>(t));}
+
+    // True when the trie has an entry for `word`.
+    bool contains(string &word);
+    // Clears `results`, then fills it with the pinyin of `word` according to the current configuration.
+    int getResults(string &word, QStringList &results);
+    // Stores the four configuration values used by getResults().
+    void setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType);
+
+private:
+    // Rewrites every entry of `results` in place to match m_pinyinDataStyle.
+    void convertDataStyle(QStringList &results);
+
+    // Back-pointer to the public object passed to the constructor.
+    HanZiToPinYin *q = nullptr;
+    //Pinyin4cppDictTrie *m_pinYinTrie = nullptr;
+    // Held by value, so it is constructed together with this object.
+    Pinyin4cppTrie m_pinYinTrie;
+
+    // Configuration defaults; overwritten by setConfig().
+    SegType m_segType = SegType::Segmentation;
+    PolyphoneType m_polyphoneType = PolyphoneType::Disable;
+    PinyinDataStyle m_pinyinDataStyle = PinyinDataStyle::Default;
+    ExDataProcessType m_exDataProcessType = ExDataProcessType::Default;
+};
+#endif // HANZITOPINYINPRIVATE_H
--- a/libchinese-segmentation/hanzi-to-pinyin.cpp
+++ b/libchinese-segmentation/hanzi-to-pinyin.cpp
@ -0,0 +1,360 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#include <mutex>
+#include <cctype>
+#include "hanzi-to-pinyin.h"
+#include "hanzi-to-pinyin-private.h"
+#include "chinese-segmentation.h"
+#include "cppjieba/Unicode.hpp"
+
+HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
+std::once_flag g_singleFlag;
+
+// Reports whether the pinyin trie holds an entry for `word`.
+bool HanZiToPinYinPrivate::contains(string &word)
+{
+    const bool known = m_pinYinTrie.Contains(word);
+    return known;
+}
+
+/**
+ * Appends the pinyin of a single character to @p results.
+ * @p data is the dictionary entry, with alternative readings separated by
+ * commas. With polyphones enabled the whole entry is appended; otherwise
+ * only the text before the first comma.
+ */
+static void appendCharacterPinyin(const string &data, PolyphoneType polyphoneType, QStringList &results)
+{
+    if (polyphoneType == PolyphoneType::Enable) {
+        results.append(QString::fromStdString(data));
+    } else if (polyphoneType == PolyphoneType::Disable) {
+        // substr(0, npos) returns the whole entry when it holds no comma.
+        results.append(QString::fromStdString(data.substr(0, data.find(','))));
+    }
+}
+
+/**
+ * Appends the pinyin of a multi-character word to @p results, one entry per
+ * character. In @p data, '/' separates alternative readings of the word and
+ * ',' separates the syllables inside one reading. With polyphones enabled
+ * the alternatives of each character position are joined with '/';
+ * otherwise only the first reading is used.
+ */
+static void appendWordPinyin(const string &data, PolyphoneType polyphoneType, QStringList &results)
+{
+    const vector<string> readings = limonp::Split(data, "/");
+    if (readings.size() < 2) {
+        // The word has a single reading.
+        for (const string &syllable : limonp::Split(data, ",")) {
+            results.append(QString::fromStdString(syllable));
+        }
+        return;
+    }
+
+    if (polyphoneType == PolyphoneType::Disable) {
+        for (const string &syllable : limonp::Split(readings[0], ",")) {
+            results.append(QString::fromStdString(syllable));
+        }
+        return;
+    }
+    if (polyphoneType != PolyphoneType::Enable) {
+        return;
+    }
+
+    // Split every reading once instead of once per character position.
+    vector<vector<string>> syllableTable;
+    syllableTable.reserve(readings.size());
+    for (const string &reading : readings) {
+        syllableTable.push_back(limonp::Split(reading, ","));
+    }
+
+    const size_t wordSize = syllableTable[0].size();
+    for (size_t position = 0; position < wordSize; ++position) {
+        QStringList alternatives;
+        for (const vector<string> &syllables : syllableTable) {
+            // A reading with fewer syllables than the first one is skipped
+            // for this position rather than indexed out of range.
+            if (position < syllables.size()) {
+                alternatives.append(QString::fromStdString(syllables[position]));
+            }
+        }
+        results.append(alternatives.join('/'));
+    }
+}
+
+/**
+ * Handles text that has no pinyin entry: it is appended unchanged in
+ * ExDataProcessType::Default mode and dropped in Delete mode.
+ */
+static void appendUnmatchedText(const string &text, ExDataProcessType processType, QStringList &results)
+{
+    if (processType == ExDataProcessType::Default) {
+        results.append(QString::fromStdString(text));
+    }
+}
+
+/**
+ * Looks up the pinyin of @p word.
+ *
+ * @param word    text to convert.
+ * @param results cleared first, then filled with one entry per character or
+ *                unmatched fragment, formatted by convertDataStyle().
+ * @return -1 when segmentation is disabled and @p word has no direct entry;
+ *         0 in every other case, including when @p results ends up empty.
+ */
+int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
+{
+    results.clear();
+
+    const string directResult = m_pinYinTrie.Find(word);
+    if (!directResult.empty()) {
+        // The whole input is a dictionary entry.
+        if (cppjieba::IsSingleWord(word)) {
+            appendCharacterPinyin(directResult, m_polyphoneType, results);
+        } else {
+            appendWordPinyin(directResult, m_polyphoneType, results);
+        }
+        convertDataStyle(results);
+        return 0;
+    }
+
+    if (m_segType == SegType::NoSegmentation) {
+        // No direct entry and segmentation is disabled.
+        return -1;
+    }
+
+    // No direct entry: segment the input and look each segment up.
+    vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
+    for (string &info : segResults) {
+        if (info.empty()) {
+            continue;
+        }
+
+        const string data = m_pinYinTrie.Find(info);
+        if (!data.empty()) {
+            if (cppjieba::IsSingleWord(info)) {
+                appendCharacterPinyin(data, m_polyphoneType, results);
+            } else {
+                appendWordPinyin(data, m_polyphoneType, results);
+            }
+            continue;
+        }
+
+        if (cppjieba::IsSingleWord(info)) {
+            appendUnmatchedText(info, m_exDataProcessType, results);
+            continue;
+        }
+
+        // The segment is unknown as a word: fall back to one lookup per character.
+        cppjieba::RuneStrArray runeArray;
+        cppjieba::DecodeRunesInString(info, runeArray);
+        for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
+            const string oneWord = cppjieba::GetStringFromRunes(info, i, i);
+            const string charData = m_pinYinTrie.Find(oneWord);
+            if (charData.empty()) {
+                // Always move on to the next character here; falling through
+                // would append the empty lookup result as an extra entry.
+                appendUnmatchedText(oneWord, m_exDataProcessType, results);
+                continue;
+            }
+            appendCharacterPinyin(charData, m_polyphoneType, results);
+        }
+    }
+
+    convertDataStyle(results);
+    return 0;//todo
+}
+
+// Stores the four configuration values; they take effect on the next getResults() call.
+void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
+{
+    // Lookup behaviour.
+    m_segType = segType;
+    m_polyphoneType = polyphoneType;
+    m_exDataProcessType = processType;
+    // Output formatting.
+    m_pinyinDataStyle = dataStyle;
+}
+
+void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
+{
+    QString value;
+    if (m_pinyinDataStyle == PinyinDataStyle::Default) {
+        for (QString &info : results) {
+            if(info == ",") {
+                continue;
+            }
+            //if info's length was been changed, there's someting wrong while traverse the chars of info
+            for (const QChar &c : info) {
+                if (!isalpha(c.toLatin1())) {
+                    value = PhoneticSymbol.value(c);
+                    if (!value.isEmpty()) {
+                        info.replace(c, value.at(0));
+                    }
+                }
+            }
+
+            QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
+            QStringList tmpValue;
+            for (auto &str : tmpList) {
+                if (!tmpValue.contains(str)) {
+                    tmpValue.push_back(str);
+                }
+            }
+            info = tmpValue.join(",");
+        }
+    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
+        //无需处理
+    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
+        for (QString &info : results) {
+            for (int i = 0; i < info.size();) {
+                auto c = info.at(i);
+                if (!isalpha(c.toLatin1())) {
+                    value = PhoneticSymbol.value(c);
+                    if (!value.isEmpty()) {
+                        info.replace(c, PhoneticSymbol.value(c));
+                        i += PhoneticSymbol.value(c).size();
+                        continue;
+                    }
+                }
+                i++;
+            }
+        }
+    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
+        for (QString &info : results) {
+            if(info == "/") {
+                continue;
+            }
+            bool isPolyphoneWords(false);
+            if (info.contains("/")) {
+                isPolyphoneWords = true;
+                info.replace("/", ",");
+            }
+
+            for (int i = 0; i < info.size();) {
+                auto c = info.at(i);
+                if (!isalpha(c.toLatin1())) {
+                    value = PhoneticSymbol.value(c);
+                    if (!value.isEmpty()) {
+                        info.replace(i, 1, value.at(0));
+                        //多音词模式
+                        if (info.contains(",")) {
+                            int pos = info.indexOf(',', i);
+                            if (isPolyphoneWords) {
+                                info.replace(",", "/");
+                            }
+                            //最后一个读音时
+                            if (pos == -1) {
+                                info.append(value.at(1));
+                                break;
+                            }
+                            info.insert(pos, value.at(1));
+                            i = pos + 1;    //insert导致','的位置加一，将i行进到','的位置
+                            i++;
+                            continue;
+                        } else {
+                            info.append(value.at(1));
+                            break;
+                        }
+                    }
+                }
+                i++;
+            }
+
+        }
+    } else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
+        for (QString &info : results) {
+            if(info == "," or info == "/") {
+                continue;
+            }
+
+            bool isPolyphoneWords(false);
+            if (info.contains("/")) {
+                isPolyphoneWords = true;
+                info.replace("/", ",");
+            }
+
+            for (int i = 0; i < info.size();i++) {
+                auto c = info.at(i);
+                if (!isalpha(c.toLatin1())) {
+                    value = PhoneticSymbol.value(c);
+                    if (!value.isEmpty()) {
+                        info.replace(c, value.at(0));
+                    }
+                }
+            }
+
+            QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
+            QStringList tmpValue;
+            for (auto &str : tmpList) {
+                if (!tmpValue.contains(str)) {
+                    tmpValue.push_back(str.at(0));
+                }
+            }
+            if (isPolyphoneWords) {
+                info = tmpValue.join("/");
+            } else {
+                info = tmpValue.join(",");
+            }
+        }
+    } else if (m_pinyinDataStyle == PinyinDataStyle::English) {
+        //暂不支持
+    }
+}
+
+// Stores the back-pointer to the public object. The body does nothing else;
+// the lines below are disabled code from when the trie was heap-allocated.
+HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
+{
+    //const char * const  SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
+    //const char * const  WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
+    //m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
+    //m_pinYinTrie = new Pinyin4cppTrie;
+}
+
+// Nothing to release explicitly: m_pinYinTrie is a by-value member. The
+// disabled lines below belong to the earlier pointer-based version.
+HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
+{
+//    if (m_pinYinTrie){
+//        delete m_pinYinTrie;
+//        m_pinYinTrie = nullptr;
+//    }
+}
+
+// Returns the process-wide instance, creating it on the first call.
+// std::call_once ensures the creation step runs only once.
+HanZiToPinYin * HanZiToPinYin::getInstance()
+{
+    const auto createInstance = []() {
+        g_pinYinManager = new HanZiToPinYin;
+    };
+    call_once(g_singleFlag, createInstance);
+    return g_pinYinManager;
+}
+
+// Forwards to the private implementation: true when `word` has a pinyin entry.
+bool HanZiToPinYin::contains(string &word)
+{
+    const bool known = d->contains(word);
+    return known;
+}
+
+// All four overloads answer the same question. The const-reference overload
+// talks to the private implementation and the others delegate to it.
+bool HanZiToPinYin::isMultiTone(string &word)
+{
+    return isMultiTone(static_cast<const string &>(word));
+}
+
+bool HanZiToPinYin::isMultiTone(string &&word)
+{
+    return isMultiTone(static_cast<const string &>(word));
+}
+
+bool HanZiToPinYin::isMultiTone(const string &word)
+{
+    const bool multiTone = d->isMultiTone(word);
+    return multiTone;
+}
+
+bool HanZiToPinYin::isMultiTone(const string &&word)
+{
+    return isMultiTone(static_cast<const string &>(word));
+}
+
+// Forwards to the private implementation. `word` is taken by value, so the
+// caller's string is never modified.
+int HanZiToPinYin::getResults(string word, QStringList &results)
+{
+    const int status = d->getResults(word, results);
+    return status;
+}
+
+// Passes the four configuration values on to the private implementation.
+void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle,
+                              SegType segType,
+                              PolyphoneType polyphoneType,
+                              ExDataProcessType processType)
+{
+    d->setConfig(dataStyle, segType, polyphoneType, processType);
+}
+
+// Creates the private implementation object. It is constructed without a
+// parent argument, so its back-pointer stays null.
+HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate)
+{
+}
--- a/libchinese-segmentation/hanzi-to-pinyin.h
+++ b/libchinese-segmentation/hanzi-to-pinyin.h
@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#ifndef HANZITOPINYIN_H
+#define HANZITOPINYIN_H
+
+#include <QtCore/qglobal.h>
+#include <QStringList>
+#include "pinyin4cpp-common.h"
+#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
+
+using namespace std;
+
+class HanZiToPinYinPrivate;
+/**
+ * Singleton entry point for converting Chinese text to pinyin.
+ * Obtain the instance with getInstance(); the constructor is protected and
+ * copying is disabled.
+ */
+class PINYINMANAGER_EXPORT HanZiToPinYin
+{
+public:
+    static HanZiToPinYin * getInstance();
+
+public:
+    /**
+     * @brief HanZiToPinYin::isMultiTone Checks whether a character/word/sentence is a polyphone
+     * @param word the character/word/sentence to check
+     * @return bool false when it is not a polyphone
+     */
+    bool isMultiTone(string &word);
+    bool isMultiTone(string &&word);
+    bool isMultiTone(const string &word);
+    bool isMultiTone(const string &&word);
+
+    /**
+     * @brief HanZiToPinYin::contains Checks whether a character/word/sentence has pinyin (is present in the database)
+     * @param word the character/word/sentence to look up
+     * @return bool false when the database does not contain it
+     */
+    bool contains(string &word);
+
+    /**
+     * @brief HanZiToPinYin::getResults Gets the pinyin of a character/word/sentence
+     * @param word the character/word/sentence to convert
+     * @param results pinyin list of word (may contain polyphones); cleared on every call
+     * @return int 0 on success, -1 otherwise
+     * NOTE(review): the implementation returns -1 only when segmentation is
+     * disabled and word has no direct entry; otherwise it returns 0.
+     */
+    int getResults(string word, QStringList &results);
+
+    /**
+     * @brief setConfig Configures the features of HanZiToPinYin, see pinyin4cpp-common.h
+     * @param dataStyle style of the returned data, Default by default
+     * @param segType whether segmentation is used, enabled by default
+     * @param polyphoneType whether polyphones are returned, disabled by default
+     * @param processType how text without pinyin data is handled, Default by default
+     */
+    void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
+
+protected:
+    HanZiToPinYin();
+    // NOTE(review): hanzi-to-pinyin.cpp does not define this destructor.
+    ~HanZiToPinYin();
+    HanZiToPinYin(const HanZiToPinYin&) = delete;
+    HanZiToPinYin& operator =(const HanZiToPinYin&) = delete;
+private:
+    // The single instance handed out by getInstance().
+    static HanZiToPinYin *g_pinYinManager;
+    // Private implementation, created in the constructor.
+    HanZiToPinYinPrivate *d = nullptr;
+};
+
+#endif // HANZITOPINYIN_H
--- a/libchinese-segmentation/libchinese-segmentation.pro
+++ b/libchinese-segmentation/libchinese-segmentation.pro
@ -0,0 +1,86 @@
+# Shared library "chinese-segmentation"; the Qt GUI module is removed from QT.
+QT -= gui
+
+VERSION = 1.1.0
+TARGET =  chinese-segmentation
+TEMPLATE = lib
+# NOTE(review): libchinese-segmentation_global.h tests CHINESESEGMENTATION_LIBRARY
+# (without the LIB prefix), so this define does not select Q_DECL_EXPORT
+# there - confirm which name is intended.
+DEFINES += LIBCHINESESEGMENTATION_LIBRARY
+# Passes the version to the code as a quoted string macro named VERSION.
+DEFINES += VERSION='\\"$${VERSION}\\"'
+
+CONFIG += c++11 create_pc create_prl no_install_prl
+
+# The following define makes your compiler emit warnings if you use
+# any Qt feature that has been marked deprecated (the exact warnings
+# depend on your compiler). Please consult the documentation of the
+# deprecated API in order to know how to port your code away from it.
+DEFINES += QT_DEPRECATED_WARNINGS
+QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr
+#QMAKE_CXXFLAGS += -Werror=uninitialized
+# NOTE(review): this spelling resembles the MSVC option /execution-charset;
+# the GCC counterpart is -fexec-charset=utf-8 - confirm the intent.
+QMAKE_CXXFLAGS += -execution-charset:utf-8
+
+# You can also make your code fail to compile if it uses deprecated APIs.
+# In order to do so, uncomment the following line.
+# You can also select to disable deprecated APIs only up to a certain version of Qt.
+#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
+# Sub-components, each pulled in through its own .pri file.
+include(cppjieba/cppjieba.pri)
+include(pinyin4cpp/pinyin4cpp.pri)
+include(Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri)
+include(storage-base/storage-base-cedar.pri)
+
+#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
+
+# Top-level sources; the sub-components add their own through the .pri files.
+SOURCES += \
+    chinese-segmentation.cpp \
+    hanzi-to-pinyin.cpp \
+    Traditional-to-Simplified.cpp
+
+# Top-level headers: the public API plus the matching *-private.h files.
+HEADERS += \
+    chinese-segmentation-private.h \
+    chinese-segmentation.h \
+    common-struct.h \
+    hanzi-to-pinyin-private.h \
+    hanzi-to-pinyin.h \
+    Traditional-to-Simplified-private.h \
+    Traditional-to-Simplified.h \
+    pinyin4cpp-common.h \
+    libchinese-segmentation_global.h
+# Directory the dictionaries are installed to. The same value is passed to
+# the code as the quoted string macro DICT_INSTALL_PATH.
+DICT_INSTALL_PATH = /usr/share/chinese-segmentation/res/dict
+DEFINES += DICT_INSTALL_PATH='\\"$${DICT_INSTALL_PATH}\\"'
+
+# "$$" expands the variable; without it the install path is the literal
+# word DICT_INSTALL_PATH.
+dict_files.path = $$DICT_INSTALL_PATH
+# One continued assignment. Repeating "dict_files.files +=" after a trailing
+# backslash would add those words to the file list as well.
+dict_files.files = \
+    $$PWD/dict/*.utf8 \
+    $$PWD/dict/pos_dict/*.utf8 \
+    $$PWD/dict/*.txt \
+    $$PWD/pinyin4cpp/dict/*.txt \
+    $$PWD/Traditional-Chinese-Simplified-conversion/dict/*.txt
+
+INSTALLS += dict_files
+
+# Default rules for deployment.
+# Unix deployment: installs the library and the public headers, and fills in
+# the pkg-config metadata used by CONFIG += create_pc.
+unix {
+    target.path = $$[QT_INSTALL_LIBS]
+    QMAKE_PKGCONFIG_NAME = chinese-segmentation
+    QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files
+    QMAKE_PKGCONFIG_VERSION = $$VERSION
+    QMAKE_PKGCONFIG_LIBDIR = $$target.path
+    QMAKE_PKGCONFIG_DESTDIR = pkgconfig
+    QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-segmentation
+    QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-segmentation
+
+# Install the library only when an install path is set.
+!isEmpty(target.path): INSTALLS += target
+
+    # Public headers, plus everything under development-files/header-files.
+    header.path = /usr/include/chinese-segmentation
+    header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h Traditional-to-Simplified.h
+    header.files += development-files/header-files/*
+#    headercppjieba.path = /usr/include/chinese-seg/cppjieba/
+#    headercppjieba.files = cppjieba/*
+    INSTALLS += header
+}
+
+
+#DISTFILES += \
+#    jiaba/jieba.pri
+
+# Extra files listed for distribution alongside the sources.
+DISTFILES += \
+    development-files/header-files/* \
+    pinyin4cpp/pinyin4cpp.pri
--- a/libchinese-segmentation/libchinese-segmentation_global.h
+++ b/libchinese-segmentation/libchinese-segmentation_global.h
@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2020, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: zhangzihao <zhangzihao@kylinos.cn>
+ * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
+ *
+ */
+#ifndef CHINESESEGMENTATION_GLOBAL_H
+#define CHINESESEGMENTATION_GLOBAL_H
+
+#include <QtCore/qglobal.h>
+
+// CHINESESEGMENTATION_EXPORT expands to Q_DECL_EXPORT when the build defines
+// CHINESESEGMENTATION_LIBRARY, and to Q_DECL_IMPORT otherwise.
+// NOTE(review): libchinese-segmentation.pro defines
+// LIBCHINESESEGMENTATION_LIBRARY (with a LIB prefix), which this check does
+// not see - confirm which macro name is intended.
+#if defined(CHINESESEGMENTATION_LIBRARY)
+#  define CHINESESEGMENTATION_EXPORT Q_DECL_EXPORT
+#else
+#  define CHINESESEGMENTATION_EXPORT Q_DECL_IMPORT
+#endif
+
+#endif // CHINESESEGMENTATION_GLOBAL_H
--- a/libchinese-segmentation/pinyin4cpp-common.h
+++ b/libchinese-segmentation/pinyin4cpp-common.h
@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#ifndef PINYIN4CPP_COMMON_H
+#define PINYIN4CPP_COMMON_H
+
+/**
+ * @brief The PinyinDataStyle enum
+ * Default      plain mode:               “中心” returns “zhong xin”
+ * Tone         tone-mark mode:           “中心” returns “zhōng xīn”
+ * Tone2        tone-number mode 2:       “中心” returns “zho1ng xi1n”
+ * Tone3        tone-number mode 3:       “中心” returns “zhong1 xin1”
+ * FirstLetter  first-letter mode:        “中心” returns “z x”
+ * English      English translation mode (not supported yet): “中心” returns “center,heart,core”
+ */
+enum class PinyinDataStyle {
+    Default       = 1u << 0,
+    Tone          = 1u << 1,
+    Tone2         = 1u << 2,
+    Tone3         = 1u << 3,
+    FirstLetter   = 1u << 4,
+    English       = 1u << 5
+};
+
+/**
+ * @brief The SegType enum
+ * Segmentation    default, with word segmentation: “银河麒麟” -> “银河” “麒麟”
+ * NoSegmentation  no segmentation: “银河麒麟” is looked up as a whole
+ */
+enum class SegType {
+    Segmentation    = 1u << 0,
+    NoSegmentation  = 1u << 1
+};
+
+/**
+ * @brief The PolyphoneType enum
+ * Disable  default, polyphones off: “奇安信” returns “qi an xin”; a polyphonic
+ *          character is returned with its common reading
+ * Enable   polyphones on: “奇安信” returns “qi,ji an xin”
+ * Note: a polyphonic word is returned in the form “朝阳” -> "zhao/chao yang/yang"
+ */
+enum class PolyphoneType {
+    Disable       = 1u << 0,
+    Enable        = 1u << 1
+};
+
+/**
+ * @brief The ExDataProcessType enum
+ * Default  default, text without pinyin data is returned as is:
+ *          “123木头人” returns "123 mu tou ren" (segmentation mode)
+ * Delete   text without pinyin data is removed:
+ *          “123木头人” returns "mu tou ren" (segmentation mode)
+ */
+enum class ExDataProcessType {
+    Default       = 1u << 0,
+    Delete        = 1u << 1
+};
+
+#endif //PINYIN4CPP_COMMON_H
--- a/libchinese-segmentation/pinyin4cpp/dict/singleWordPinyin.txt
+++ b/libchinese-segmentation/pinyin4cpp/dict/singleWordPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/dict/wordsPinyin.txt
+++ b/libchinese-segmentation/pinyin4cpp/dict/wordsPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp
@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#include "pinyin4cpp-trie.h"
+
+// Builds the trie from the two installed dictionary files: both paths and
+// dat_cache_path go to the StorageBase constructor, then Init() is called.
+Pinyin4cppTrie::Pinyin4cppTrie(string dat_cache_path)
+    : StorageBase<char, false, CacheFileHeaderBase>(vector<string>{SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH}, dat_cache_path)
+{
+    this->Init();
+}
+
+// Same as above, with a caller-supplied list of source files.
+// NOTE(review): LoadSingleWordDict() and LoadWordsDict() in this file always
+// read the fixed SINGLE_WORD_PINYIN_PATH / WORDS_PINYIN_PATH, so file_paths
+// appears to reach only the base class - confirm.
+Pinyin4cppTrie::Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path)
+    : StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
+{
+    this->Init();
+}
+
+
+
+// True when Find() returns a non-empty entry for `word`.
+bool Pinyin4cppTrie::Contains(string &word) {
+    return !this->Find(word).empty();
+}
+
+// Reports whether `word` has more than one reading in the dictionary.
+// A word without an entry is not a polyphone. For a single character the
+// alternative readings are separated by ','; for a longer word ',' separates
+// the syllables and '/' separates the alternative readings.
+bool Pinyin4cppTrie::IsMultiTone(const string &word) {
+    const string result = this->Find(word);
+    if (result.empty())
+        return false;
+
+    // Count characters by skipping UTF-8 continuation bytes (10xxxxxx).
+    // NOTE(review): assumes `word` is UTF-8 encoded - confirm against callers.
+    size_t character_count = 0;
+    for (const char byte : word) {
+        if ((static_cast<unsigned char>(byte) & 0xC0) != 0x80)
+            ++character_count;
+    }
+
+    const char separator = (character_count <= 1) ? ',' : '/';
+    return result.find(separator) != string::npos;
+}
+
+// Builds the trie from the two dictionary files and writes the cache file.
+// The data goes to a temporary file next to dat_cache_file, which is then
+// handed to tryRename().
+// File layout: header, then the element strings written by the two
+// Load*Dict() calls, then the trie array.
+//   dat_cache_file  final path of the cache file
+//   md5             digest stored in the header; must be exactly as long as
+//                   header.md5_hex (asserted)
+void Pinyin4cppTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
+{
+    CacheFileHeaderBase header;
+    assert(sizeof(header.md5_hex) == md5.size());
+    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
+
+    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
+    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+    // The umask is process-wide state, so remember the caller's value and
+    // put it back before returning.
+    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
+    // mkstemp() rewrites the XXXXXX suffix in place, so give it the string's
+    // own writable buffer.
+    const int fd = mkstemp(&tmp_filepath[0]);
+    assert(fd >= 0);
+    fchmod(fd, 0644);
+
+    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
+
+    this->LoadSingleWordDict(fd, write_bytes, offset, elements_num);
+    this->LoadWordsDict(fd, write_bytes, offset, elements_num);
+
+    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
+
+    // Go back and write the three counters that follow the md5 field.
+    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
+    write(fd, &elements_num, sizeof(int));
+    write(fd, &offset, sizeof(int));
+    data_trie_size = this->GetDataTrieSize();
+    write(fd, &data_trie_size, sizeof(int));
+
+    close(fd);
+    assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
+
+    tryRename(tmp_filepath, dat_cache_file);
+    umask(previous_umask);
+}
+
+// Exact-match lookup of `key`. Returns the stored C string at the matched
+// element offset, or an empty string when the search result is negative.
+string Pinyin4cppTrie::Find(const string &key)
+{
+    const int element_offset = this->ExactMatchSearch(key.c_str(), key.size());
+    if (element_offset >= 0) {
+        const auto *entry = &this->GetElementPtr()[element_offset];
+        return string(entry);
+    }
+    return string();
+}
+
+// Loads the single-character dictionary. Each data line has three
+// ':'-separated fields; field 2 is the trie key and field 1 is the pinyin
+// written to `fd`. Blank lines, '#' lines and lines with another field count
+// are skipped. offset, elements_num and write_bytes are advanced per record.
+void Pinyin4cppTrie::LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
+{
+    ifstream dict_stream(SINGLE_WORD_PINYIN_PATH);
+    vector<string> fields;
+    string row;
+
+    while (getline(dict_stream, row)) {
+        if (row.empty() or limonp::StartsWith(row, "#")) {
+            continue;
+        }
+        limonp::Split(row, fields, ":");
+        if (fields.size() == 3) {
+            const string &pinyin = fields[1];
+            const string &key = fields[2];
+            // The key maps to the current offset, where its pinyin is written.
+            this->Update(key.c_str(), key.size(), offset);
+            offset += (pinyin.size() + 1);
+            elements_num++;
+            // size() + 1 includes the terminating NUL.
+            write_bytes += write(fd, pinyin.c_str(), pinyin.size() + 1);
+        }
+    }
+}
+
+// Loads the multi-character word dictionary. Each data line has two
+// ':'-separated fields; field 0 is the trie key and field 1 is the pinyin
+// written to `fd`. Blank lines, '#' lines and lines with another field count
+// are skipped. offset, elements_num and write_bytes are advanced per record.
+void Pinyin4cppTrie::LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
+{
+    ifstream dict_stream(WORDS_PINYIN_PATH);
+    vector<string> fields;
+    string row;
+
+    while (getline(dict_stream, row)) {
+        if (row.empty() or limonp::StartsWith(row, "#")) {
+            continue;
+        }
+        limonp::Split(row, fields, ":");
+        if (fields.size() == 2) {
+            const string &key = fields[0];
+            const string &pinyin = fields[1];
+            // The key maps to the current offset, where its pinyin is written.
+            this->Update(key.c_str(), key.size(), offset);
+            offset += (pinyin.size() + 1);
+            elements_num++;
+            // size() + 1 includes the terminating NUL.
+            write_bytes += write(fd, pinyin.c_str(), pinyin.size() + 1);
+        }
+    }
+}
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.h
@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef PINYIN4CPPTRIE_H
+#define PINYIN4CPPTRIE_H
+
+#include "storage-base.hpp"
+
+// Locations of the two pinyin dictionary text files. Each path is formed by
+// string-literal concatenation onto DICT_INSTALL_PATH, which must expand to a
+// string literal (it is not defined in this header; presumably supplied by the
+// build system - TODO confirm).
+const char * const  SINGLE_WORD_PINYIN_PATH = DICT_INSTALL_PATH"/singleWordPinyin.txt";
+const char * const  WORDS_PINYIN_PATH = DICT_INSTALL_PATH"/wordsPinyin.txt";
+
+// Word -> pinyin lookup built on top of StorageBase. Keys are the words read
+// from SINGLE_WORD_PINYIN_PATH and WORDS_PINYIN_PATH; each key maps to the
+// byte offset of its pinyin string inside the element area.
+class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
+{
+public:
+    // dat_cache_path: where the cache file is kept; the meaning of the empty
+    // default is decided by the constructor definition (not visible here).
+    Pinyin4cppTrie(string dat_cache_path = "");
+    // NOTE(review): file_paths is taken by const value, so the vector is
+    // copied on every call - confirm whether a const reference was intended.
+    Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path = "");
+    // Overrides the StorageBase hook that (re)generates dat_cache_file from
+    // the dictionary sources; md5 identifies the sources the cache belongs to.
+    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
+    // Returns the pinyin string stored for key, or an empty string when the
+    // exact-match search fails.
+    string Find(const string &key);
+    // Presumably reports whether word is present in the trie - defined in the
+    // .cpp, verify there.
+    bool Contains(string &word);
+    // Presumably reports whether word has more than one reading - defined in
+    // the .cpp, verify there.
+    bool IsMultiTone(const string &word);
+
+private:
+    // Helpers used while generating the cache: each reads one dictionary
+    // file, registers the keys, writes the values to fd and advances the
+    // running counters passed by reference.
+    void LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
+    void LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
+};
+
+#endif // PINYIN4CPPTRIE_H
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp.pri
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp.pri
@ -0,0 +1,15 @@
+# qmake include file for the pinyin4cpp module; pulled into a parent project
+# with include(). All paths are anchored on $$PWD (the directory of this .pri)
+# so they resolve correctly no matter which .pro file includes it.
+
+# Make the module headers reachable from the including project.
+INCLUDEPATH += $$PWD
+
+HEADERS += \
+    $$PWD/pinyin4cpp-trie.h \
+    $$PWD/pinyin4cpp_dataTrie.h \
+    $$PWD/pinyin4cpp_dictTrie.h
+
+SOURCES += \
+    $$PWD/pinyin4cpp-trie.cpp \
+    $$PWD/pinyin4cpp_dataTrie.cpp \
+    $$PWD/pinyin4cpp_dictTrie.cpp
+
+# Dictionary data shipped alongside the sources. Previously written relative
+# to the including project's directory; now anchored on $$PWD like the
+# entries above.
+DISTFILES += \
+    $$PWD/dict/wordsPinyin.txt \
+    $$PWD/dict/singleWordPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.cpp
@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#include "pinyin4cpp_dataTrie.h"
+
+// Nothing to set up explicitly: the members are default-constructed or take
+// the in-class initializers given in the class definition.
+Pinyin4cppDataTrie::Pinyin4cppDataTrie() = default;
+
+/**
+ * Release the cache-file mapping and its descriptor.
+ *
+ * The object may be destroyed without ever having attached a file (or after a
+ * failed attach), so munmap()/close() are only issued for a real mapping and
+ * a valid descriptor instead of being called with nullptr / -1.
+ */
+Pinyin4cppDataTrie::~Pinyin4cppDataTrie()
+{
+    if (nullptr != m_mmapAddr && MAP_FAILED != m_mmapAddr) {
+        munmap(m_mmapAddr, m_mmapLength);
+    }
+    m_mmapAddr = nullptr;
+
+    if (m_mmapFd >= 0) {
+        close(m_mmapFd);
+    }
+    m_mmapFd = -1;
+}
+
+/**
+ * Exact-match lookup of @p key in the attached double-array trie.
+ *
+ * @param key  Word to look up.
+ * @return The string starting at the matched offset of the element area, or
+ *         an empty string when nothing matched or the stored offset lies
+ *         outside the element area.
+ */
+string Pinyin4cppDataTrie::Find(const string &key) const {
+    // darts-clone interface
+    Darts::DoubleArray::result_pair_type match;
+    m_DoubleArrayDataTrie.exactMatchSearch(key.c_str(), match);
+
+    const bool hit = (0 != match.length)
+            && (match.value >= 0)
+            && ((size_t)match.value < m_elementsSize);//todo
+    if (!hit) {
+        return string();
+    }
+    return string(&m_elementsPtr[match.value]);
+
+    // cedarpp interface (kept for reference)
+//    int result = m_DoubleArrayDataTrie.exactMatchSearch<int>(key.c_str(), key.size());
+//    if (result < 0)
+//        return string();
+//    return string(&m_elementsPtr[result]);
+
+}
+
+/**
+ * Generate the cache file from @p elements and then attach to it.
+ *
+ * @param elements        Key -> value pairs to store.
+ * @param dat_cache_file  Path of the cache file to create.
+ * @param md5             Digest recorded in the cache header.
+ * @return Result of InitAttachDat() on the freshly written file.
+ */
+bool Pinyin4cppDataTrie::InitBuildDat(map<string, string> &elements, const string &dat_cache_file, const string &md5)
+{
+    BuildDatCache(elements, dat_cache_file, md5);
+    const bool attached = InitAttachDat(dat_cache_file, md5);
+    return attached;
+}
+
+/**
+ * Attach this object to an existing cache file by memory-mapping it.
+ *
+ * Expected file layout: a CacheFileHeader, followed by header.elements_size
+ * bytes of NUL-terminated values, followed by header.dat_size trie units.
+ *
+ * The file is accepted only if it can be opened and mapped, is at least as
+ * large as the header, carries the expected @p md5 and has exactly the length
+ * announced by its header. On any failure every resource acquired by this
+ * call is released again and the object is left detached, so the caller can
+ * rebuild the cache and call this function again without leaking a
+ * descriptor or a mapping.
+ *
+ * @param dat_cache_file  Path of the cache file.
+ * @param md5             Digest the cache header must carry.
+ * @return true when the file was attached, false otherwise.
+ */
+bool Pinyin4cppDataTrie::InitAttachDat(const string &dat_cache_file, const string &md5) {
+    // Detach from whatever an earlier call attached before touching the members.
+    if (nullptr != m_mmapAddr && MAP_FAILED != m_mmapAddr) {
+        // The trie references memory inside the mapping that is about to go away.
+        m_DoubleArrayDataTrie.clear();
+        munmap(m_mmapAddr, m_mmapLength);
+    }
+    if (m_mmapFd >= 0) {
+        close(m_mmapFd);
+    }
+    m_mmapAddr = nullptr;
+    m_mmapFd = -1;
+    m_mmapLength = 0;
+    m_elementsPtr = nullptr;
+    m_elementsNum = 0;
+    m_elementsSize = 0;
+
+    const int fd = open(dat_cache_file.c_str(), O_RDONLY);
+    if (fd < 0) {
+        return false;
+    }
+
+    // A file shorter than the header cannot be a valid cache (this also
+    // covers the empty file, which mmap() would reject).
+    const auto seek_off = lseek(fd, 0, SEEK_END);
+    if (seek_off < 0 || static_cast<size_t>(seek_off) < sizeof(CacheFileHeader)) {
+        close(fd);
+        return false;
+    }
+    const size_t length = static_cast<size_t>(seek_off);
+
+    char *addr = reinterpret_cast<char *>(mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0));
+    if (MAP_FAILED == addr) {
+        close(fd);
+        return false;
+    }
+
+    const CacheFileHeader &header = *reinterpret_cast<const CacheFileHeader *>(addr);
+    const size_t expected_length = sizeof(CacheFileHeader) + header.elements_size
+            + header.dat_size * m_DoubleArrayDataTrie.unit_size();
+    const bool usable = (sizeof(header.md5_hex) == md5.size())
+            && (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()))
+            && (length == expected_length);
+    if (!usable) {
+        // Stale or damaged cache: give everything back and let the caller rebuild.
+        munmap(addr, length);
+        close(fd);
+        return false;
+    }
+
+    m_mmapFd = fd;
+    m_mmapAddr = addr;
+    m_mmapLength = length;
+    m_elementsNum = header.elements_num;
+    m_elementsSize = header.elements_size;
+    m_elementsPtr = (const char *)(addr + sizeof(CacheFileHeader));
+
+    const char * dat_ptr = addr + sizeof(CacheFileHeader) + header.elements_size;
+    m_DoubleArrayDataTrie.set_array((char *)dat_ptr, header.dat_size);
+    return true;
+}
+
+/**
+ * Build the double-array trie from @p elements and write the cache file.
+ *
+ * File layout: CacheFileHeader, then every value with its terminating NUL in
+ * map order, then the raw trie array. Each key is stored in the trie with the
+ * byte offset of its value inside the value area.
+ *
+ * The data is written to a temporary file next to @p dat_cache_file and
+ * renamed into place once complete. Failures still trip the asserts in debug
+ * builds; in builds where asserts are compiled out the function returns
+ * early and removes the temporary file instead of writing to an invalid
+ * descriptor or publishing a truncated cache.
+ *
+ * @param elements        Key -> value pairs; values are expected to be non-empty.
+ * @param dat_cache_file  Final path of the cache file.
+ * @param md5             Digest stored in the header; must have exactly
+ *                        sizeof(CacheFileHeader::md5_hex) characters.
+ */
+void Pinyin4cppDataTrie::BuildDatCache(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
+    vector<const char*> keys_ptr_vec;
+    vector<int> values_vec;
+
+    keys_ptr_vec.reserve(elements.size());
+    values_vec.reserve(elements.size());
+
+    CacheFileHeader header;
+    assert(sizeof(header.md5_hex) == md5.size());
+    if (sizeof(header.md5_hex) != md5.size()) {
+        return; // would overflow or under-fill the header field
+    }
+    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
+
+    int offset(0);
+    for (const auto &info : elements) {
+        assert(info.second.size() > 0);
+        keys_ptr_vec.push_back(info.first.c_str());
+        values_vec.push_back(offset);
+        offset += (info.second.size() + 1);// +1 accounts for the trailing '\0'
+    }
+
+    auto const ret = m_DoubleArrayDataTrie.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
+    assert(0 == ret);
+    if (0 != ret) {
+        return;
+    }
+    header.elements_num = elements.size();
+    header.elements_size = offset;
+    header.dat_size = m_DoubleArrayDataTrie.size();
+
+    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+    // umask() is process-wide state: put the previous mask back as soon as the
+    // temporary file exists so the rest of the process is not affected.
+    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
+    const int fd = mkstemp(&tmp_filepath[0]);
+    umask(previous_umask);
+    assert(fd >= 0);
+    if (fd < 0) {
+        return;
+    }
+    fchmod(fd, 0644);
+
+    // The map is iterated in the same order as above, so every value lands at
+    // the offset recorded for its key.
+    auto write_bytes = write(fd, (const char *)&header, sizeof(header));
+    for (const auto &info : elements) {
+        write_bytes += write(fd, info.second.c_str(), info.second.size() + 1);
+    }
+    write_bytes += write(fd, m_DoubleArrayDataTrie.array(), m_DoubleArrayDataTrie.total_size());
+    close(fd);
+
+    const size_t expected_bytes = sizeof(header) + offset + m_DoubleArrayDataTrie.total_size();
+    const bool complete = ((size_t)write_bytes == expected_bytes);
+    assert(complete);
+    if (!complete) {
+        unlink(tmp_filepath.c_str()); // never publish a short file
+        return;
+    }
+
+    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
+    assert(0 == rename_ret);
+    if (0 != rename_ret) {
+        unlink(tmp_filepath.c_str());
+    }
+}
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.h
@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef PINYIN4cpp_DATATRIE_H
+#define PINYIN4cpp_DATATRIE_H
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <QDebug>
+#include "Md5.hpp"
+#include "LocalVector.hpp"
+#include "StringUtil.hpp"
+//#define USE_REDUCED_TRIE
+#include "../storage-base/cedar/cedar.h"
+#include "../storage-base/darts-clone/darts.h"
+
+using namespace std;
+using std::pair;
+
+// Fixed-size header at the start of a cache file. It is written and read back
+// as raw bytes, so its in-memory layout is the on-disk layout.
+struct CacheFileHeader { //todo: byte alignment
+    char md5_hex[32] = {};        // digest of the dictionary sources (no terminating NUL stored)
+    uint32_t elements_num = 0;    // number of stored values
+    uint32_t elements_size = 0;   // total bytes of the value area, including each value's '\0'
+    uint32_t dat_size = 0;        // number of units in the trie array that follows the value area
+};
+
+// Read-only key -> string store backed by a memory-mapped cache file: a
+// darts-clone double array maps each key to the byte offset of its value
+// inside the mapped value area.
+class Pinyin4cppDataTrie {
+public:
+    Pinyin4cppDataTrie();
+    // Unmaps the cache file and closes its descriptor.
+    ~Pinyin4cppDataTrie();
+
+    // Returns the value stored for key, or an empty string when there is no
+    // exact match.
+    string Find(const string & key) const;
+
+    // Writes a new cache file from elements, then attaches to it.
+    bool InitBuildDat(map<string, string>& elements, const string & dat_cache_file, const string & md5);
+
+    // Maps an existing cache file; returns false when it cannot be opened or
+    // its header does not carry md5.
+    bool InitAttachDat(const string & dat_cache_file, const string & md5);
+
+private:
+    // Builds the trie from elements and writes header + values + trie array to
+    // dat_cache_file via a temporary file.
+    void BuildDatCache(map<string, string>& elements, const string & dat_cache_file, const string & md5);
+
+    // Declared but not defined: the class is not copyable.
+    Pinyin4cppDataTrie(const Pinyin4cppDataTrie &);
+    Pinyin4cppDataTrie &operator=(const Pinyin4cppDataTrie &);
+
+private:
+    Darts::DoubleArray m_DoubleArrayDataTrie;
+    //cedar::da<int, -1, -2, true> m_DoubleArrayDataTrie;
+    const char * m_elementsPtr = nullptr; // start of the value area inside the mapping
+    size_t m_elementsNum = 0;             // header.elements_num of the attached file
+    size_t m_elementsSize = 0;            // header.elements_size of the attached file
+    size_t m_mmapLength = 0;              // length of the mapping in bytes
+
+    int    m_mmapFd = -1;                 // descriptor of the mapped file, -1 when none
+    char * m_mmapAddr = nullptr;          // base address of the mapping
+};
+
+#endif //PINYIN4cpp_DATATRIE_H
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.cpp
@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+
+#include "pinyin4cpp_dictTrie.h"
+#include "malloc.h"
+
+// Constructs the dictionary and immediately loads it by delegating to Init().
+Pinyin4cppDictTrie::Pinyin4cppDictTrie(const string &single_word_dict_path,
+                                       const string &words_dict_paths,
+                                       const string &dat_cache_path)
+{
+    Init(single_word_dict_path, words_dict_paths, dat_cache_path);
+}
+
+// Forwards the lookup of word to the underlying data trie and returns whatever
+// it yields (an empty string when the word is unknown).
+string Pinyin4cppDictTrie::Find(const string &word) const
+{
+    string pinyin = m_DataTrie.Find(word);
+    return pinyin;
+}
+
+// True when the data trie returns a non-empty value for word.
+bool Pinyin4cppDictTrie::Contains(string &word)
+{
+    const string stored = m_DataTrie.Find(word);
+    return !stored.empty();
+}
+
+/**
+ * Tell whether @p word has more than one reading.
+ *
+ * LoadSingleWordDict() joins alternative readings of a character with ',',
+ * so a stored value containing ',' means several readings. The previous
+ * test was inverted: it answered true for values without ',' - including the
+ * empty value returned for unknown words.
+ *
+ * @param word  Word to check.
+ * @return true when the stored value contains ','; false otherwise, also for
+ *         words that are not in the dictionary.
+ */
+bool Pinyin4cppDictTrie::IsMultiTone(const string &word) {
+    const string result = m_DataTrie.Find(word);
+    return result.find(",") != result.npos;
+}
+
+// Returns the summed size of the dictionary files as recorded by Init().
+size_t Pinyin4cppDictTrie::GetTotalDictSize() const
+{
+    const size_t totalBytes = m_TotalDictSize_;
+    return totalBytes;
+}
+
+/**
+ * Load the dictionary, reusing a cache file when one matches.
+ *
+ * The MD5 of both dictionary files identifies the cache. If the cache file
+ * can be attached nothing else happens; otherwise both dictionaries are
+ * parsed, a new cache is built from them and the parsed data is discarded.
+ *
+ * @param single_word_dict_path  Path of the single-character dictionary.
+ * @param words_dict_paths       Path of the words dictionary.
+ * @param dat_cache_path         Cache file path; when empty, a file named
+ *                               after the MD5 under /tmp is used.
+ */
+void Pinyin4cppDictTrie::Init(const string &single_word_dict_path, const string &words_dict_paths, string dat_cache_path)
+{
+    size_t file_size_sum = 0;
+    const string md5 = CalcFileListMD5(single_word_dict_path + "|" + words_dict_paths, file_size_sum);
+    m_TotalDictSize_ = file_size_sum;
+
+    // No cache location given: fall back to a file in /tmp.
+    if (dat_cache_path.empty()) {
+        dat_cache_path = "/tmp/" + md5 + ".dat_cache";
+    }
+    qDebug() << "#####Pinyin Dict path:" << dat_cache_path.c_str();
+
+    const bool attached = m_DataTrie.InitAttachDat(dat_cache_path, md5);
+    if (!attached) {
+        LoadSingleWordDict(single_word_dict_path);
+        LoadWordsDict(words_dict_paths);
+        bool build_ret = m_DataTrie.InitBuildDat(m_StaticNodeInfos, dat_cache_path, md5);
+        assert(build_ret);
+        // The parsed entries are only needed to build the cache.
+        m_StaticNodeInfos.clear();
+        malloc_trim(0);
+    }
+}
+
+/**
+ * Parse the single-character dictionary into m_StaticNodeInfos.
+ *
+ * Every non-comment line is split on ':' into SINGLE_WORD_DICT_COLUMN_NUM
+ * fields; field 2 is the key and field 1 its pinyin. When a key occurs more
+ * than once, readings not seen before are appended to the stored value,
+ * separated by ','.
+ *
+ * Fixes over the previous version: a new reading is appended (the key itself
+ * used to be appended instead), and a line with the wrong number of fields is
+ * skipped when asserts are compiled out rather than indexed out of bounds.
+ *
+ * @param filePath  Path of the dictionary file.
+ */
+void Pinyin4cppDictTrie::LoadSingleWordDict(const string &filePath) {
+    ifstream ifs(filePath.c_str());
+    string line;
+    vector<string> buf;
+
+    for (; getline(ifs, line);) {
+        if (limonp::StartsWith(line, "#")) {
+            continue;
+        }
+        limonp::Split(line, buf, ":");
+        assert(buf.size() == SINGLE_WORD_DICT_COLUMN_NUM);
+        if (buf.size() != SINGLE_WORD_DICT_COLUMN_NUM) {
+            continue; // malformed line; only reachable when asserts are disabled
+        }
+
+        const string &pinyin = buf[1];
+        const string &word = buf[2];
+
+        auto known = m_StaticNodeInfos.find(word);
+        if (known == m_StaticNodeInfos.end()) {
+            m_StaticNodeInfos[word] = pinyin;
+            continue;
+        }
+
+        // The key already has readings: append this one only if it is new.
+        vector<string> tmp;
+        bool isfind(false);
+        limonp::Split(known->second, tmp, ",");
+        for (auto &onePinyin : tmp) {
+            if (onePinyin == pinyin) {
+                isfind = true;
+                break;
+            }
+        }
+        if (!isfind) {
+            known->second += ("," + pinyin);
+        }
+    }
+}
+
+/**
+ * Parse the words dictionary into m_StaticNodeInfos.
+ *
+ * Every non-comment line is split on ':' into WORDS_DICT_COLUMN_NUM fields;
+ * field 0 is the key and field 1 its pinyin. When a key occurs more than
+ * once, readings not seen before are appended to the stored value, separated
+ * by '/'.
+ *
+ * A line with the wrong number of fields still trips the assert in debug
+ * builds; when asserts are compiled out it is skipped instead of being
+ * indexed out of bounds.
+ *
+ * @param filePath  Path of the dictionary file.
+ */
+void Pinyin4cppDictTrie::LoadWordsDict(const string &filePath) {
+    ifstream ifs(filePath.c_str());
+    string line;
+    vector<string> buf;
+    for (; getline(ifs, line);) {
+        if (limonp::StartsWith(line, "#")) {
+            continue;
+        }
+        limonp::Split(line, buf, ":");
+        assert(buf.size() == WORDS_DICT_COLUMN_NUM);
+        if (buf.size() != WORDS_DICT_COLUMN_NUM) {
+            continue; // malformed line; only reachable when asserts are disabled
+        }
+
+        const string &word = buf[0];
+        const string &pinyin = buf[1];
+
+        auto known = m_StaticNodeInfos.find(word);
+        if (known == m_StaticNodeInfos.end()) {
+            m_StaticNodeInfos[word] = pinyin;
+            continue;
+        }
+
+        // The key already has readings: append this one only if it is new.
+        vector<string> tmp;
+        bool isfind(false);
+        limonp::Split(known->second, tmp, "/");
+        for (auto &onePinyin : tmp) {
+            if (onePinyin == pinyin) {
+                isfind = true;
+                break;
+            }
+        }
+        if (!isfind) {
+            known->second += ("/" + pinyin);
+        }
+    }
+}
+
+/**
+ * Compute one MD5 digest over the contents of several files.
+ *
+ * @p files_list is split with the separator set "|;". Files that cannot be
+ * opened and empty files contribute nothing. Each remaining file is mapped
+ * read-only, fed to the digest and unmapped again.
+ *
+ * A failed mmap() still trips the assert in debug builds; when asserts are
+ * compiled out the file is skipped instead of hashing an invalid address.
+ *
+ * @param files_list     Separator-joined list of file paths.
+ * @param file_size_sum  Receives the summed size of all hashed files.
+ * @return The digest as text, taken from md5.digestChars.
+ */
+string CalcFileListMD5(const string &files_list, size_t &file_size_sum) {
+    limonp::MD5 md5;
+
+    const auto files = limonp::Split(files_list, "|;");
+    file_size_sum = 0;
+
+    for (auto const & local_path : files) {
+        const int fd = open(local_path.c_str(), O_RDONLY);
+        if (fd < 0){
+            continue;
+        }
+        auto const len = lseek(fd, 0, SEEK_END);
+        if (len > 0) {
+            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+            assert(MAP_FAILED != addr);
+            if (MAP_FAILED != addr) {
+                md5.Update((unsigned char *) addr, len);
+                file_size_sum += len;
+
+                munmap(addr, len);
+            }
+        }
+        close(fd);
+    }
+
+    md5.Final();
+    return string(md5.digestChars);
+}
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.h
@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef PINYIN4cpp_DICTTRIE_H
+#define PINYIN4cpp_DICTTRIE_H
+
+#include "pinyin4cpp_dataTrie.h"
+using namespace std;
+
+// Number of ':'-separated fields a line must have in the single-character
+// dictionary and in the words dictionary, respectively.
+const size_t SINGLE_WORD_DICT_COLUMN_NUM = 3;
+const size_t WORDS_DICT_COLUMN_NUM = 2;
+
+// Word -> pinyin dictionary. The constructor loads both dictionary files (or
+// an existing cache built from them) into a Pinyin4cppDataTrie, which serves
+// all lookups.
+class Pinyin4cppDictTrie {
+public:
+    // Loads the dictionary immediately. An empty dat_cache_path selects a
+    // default cache file location (chosen in Init()).
+    Pinyin4cppDictTrie(const string& single_word_dict_path, const string& words_dict_paths, const string & dat_cache_path = "");
+
+    ~Pinyin4cppDictTrie() {}
+
+    // Returns the pinyin stored for word, or an empty string if unknown.
+    string Find(const string &word) const;
+
+    // True when a non-empty value is stored for word.
+    bool Contains(string &word);
+    // Multi-reading test based on the ',' separator in the stored value; see
+    // the definition for the exact rule.
+    bool IsMultiTone(const string &word);
+
+    // Summed size in bytes of the dictionary files, as measured during Init().
+    size_t GetTotalDictSize() const;
+
+private:
+    // Attaches to a matching cache file or, failing that, parses both
+    // dictionaries and builds a new cache.
+    void Init(const string& single_word_dict_path, const string& words_dict_paths, string dat_cache_path);
+
+    // Parses the single-character dictionary into m_StaticNodeInfos.
+    void LoadSingleWordDict(const string& filePath);
+
+    // Parses the words dictionary into m_StaticNodeInfos.
+    void LoadWordsDict(const string& filePath);
+
+private:
+    // Parsed word -> pinyin entries; only populated while a cache is being
+    // built and cleared afterwards.
+    map<string, string> m_StaticNodeInfos;
+
+    size_t m_TotalDictSize_ = 0;
+    Pinyin4cppDataTrie m_DataTrie;
+
+};
+
+// Computes one MD5 digest over the contents of the files named in files_list
+// (split with the separator set "|;") and stores their summed size in
+// file_size_sum.
+// NOTE(review): declared inline here but defined in pinyin4cpp_dictTrie.cpp,
+// so it can only be called from that translation unit - confirm this is intended.
+inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum);
+
+#endif //PINYIN4cpp_DICTTRIE_H
--- a/libchinese-segmentation/storage-base/cedar/cedar.h
+++ b/libchinese-segmentation/storage-base/cedar/cedar.h
@ -0,0 +1,682 @@
+// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
+//  $Id: cedar.h 1938 2022-03-17 16:22:30Z ynaga $
+// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
+#ifndef CEDAR_H
+#define CEDAR_H
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
+
+namespace cedar {
+  // typedefs
+  typedef unsigned char  uchar;
+  template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
+  template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
+  static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
+  // dynamic double array
+  template <typename value_type,
+            const int     NO_VALUE  = NaN <value_type>::N1,
+            const int     NO_PATH   = NaN <value_type>::N2,
+            const bool    ORDERED   = true,
+            const int     MAX_TRIAL = 1,
+            const size_t  NUM_TRACKING_NODES = 0>
+  class da {
+  public:
+    enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH, CEDAR_VALUE_LIMIT = 2147483647 };
+    typedef value_type result_type;
+    struct result_pair_type {
+      value_type  value;
+      size_t      length;  // prefix length
+    };
+    struct result_triple_type { // for predict ()
+      value_type  value;
+      size_t      length;  // suffix length
+      size_t      id;      // node id of value
+    };
+    struct node {
+      union { int base_; value_type value; }; // negative means prev empty index
+      int  check;                             // negative means next empty index
+      node (const int base__ = 0, const int check_ = 0)
+        : base_ (base__), check (check_) {}
+#ifdef USE_REDUCED_TRIE
+      int base () const { return - (base_ + 1); } // ~ in two's complement system
+#else
+      int base () const { return base_; }
+#endif
+    };
+    struct ninfo {  // x1.5 update speed; +.25 % memory (8n -> 10n)
+      uchar  sibling;   // right sibling (= 0 if not exist)
+      uchar  child;     // first child
+      ninfo () : sibling (0), child (0) {}
+    };
+    struct block { // a block w/ 256 elements
+      int   prev;   // prev block; 3 bytes
+      int   next;   // next block; 3 bytes
+      short num;    // # empty elements; 0 - 256
+      short reject; // minimum # branching failed to locate; soft limit
+      int   trial;  // # trial
+      int   ehead;  // first empty item
+      block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
+    };
+    da () : tracking_node (), _array (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _no_delete (false), _reject () {
+      STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
+                    value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index
+                    );
+      _initialize ();
+    }
+    ~da () { clear (false); }
+    size_t capacity   () const { return static_cast <size_t> (_capacity); }
+    size_t size       () const { return static_cast <size_t> (_size); }
+    size_t total_size () const { return sizeof (node) * _size; }
+    size_t unit_size  () const { return sizeof (node); }
+    size_t nonzero_size () const {
+      size_t i = 0;
+      for (int to = 0; to < _size; ++to)
+        if (_array[to].check >= 0) ++i;
+      return i;
+    }
+    size_t num_keys () const {
+      size_t i = 0;
+      for (int to = 0; to < _size; ++to)
+#ifdef USE_REDUCED_TRIE
+        if (_array[to].check >= 0 && _array[to].value >= 0) ++i;
+#else
+        if (_array[to].check >= 0 && _array[_array[to].check].base () == to) ++i;
+#endif
+      return i;
+    }
+    // interfance
+    template <typename T>
+    T exactMatchSearch (const char* key) const
+    { return exactMatchSearch <T> (key, std::strlen (key)); }
+    template <typename T>
+    T exactMatchSearch (const char* key, size_t len, size_t from = 0) const {
+      union { int i; value_type x; } b;
+      size_t pos = 0;
+      b.i = _find (key, from, pos, len);
+      if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
+      T result;
+      _set_result (&result, b.x, len, from);
+      return result;
+    }
+    template <typename T>
+    size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
+    { return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
+    template <typename T>
+    size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) const {
+      size_t num = 0;
+      for (size_t pos = 0; pos < len; ) {
+        union { int i; value_type x; } b;
+        b.i = _find (key, from, pos, pos + 1);
+        if (b.i == CEDAR_NO_VALUE) continue;
+        if (b.i == CEDAR_NO_PATH)  return num;
+        if (num < result_len) _set_result (&result[num], b.x, pos, from);
+        ++num;
+      }
+      return num;
+    }
+    // predict key from double array
+    template <typename T>
+    size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
+    { return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
+    template <typename T>
+    size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) {
+      size_t num (0), pos (0), p (0);
+      if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
+      union { int i; value_type x; } b;
+      size_t root = from;
+      for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
+        if (num < result_len) _set_result (&result[num], b.x, p, from);
+        ++num;
+      }
+      return num;
+    }
+    void suffix (char* key, size_t len, size_t to) const {
+      key[len] = '\0';
+      while (len--) {
+        const int from = _array[to].check;
+        key[len]
+          = static_cast <char> (_array[from].base () ^ static_cast <int> (to));
+        to = static_cast <size_t> (from);
+      }
+    }
+    value_type traverse (const char* key, size_t& from, size_t& pos) const
+    { return traverse (key, from, pos, std::strlen (key)); }
+    value_type traverse (const char* key, size_t& from, size_t& pos, size_t len) const {
+      union { int i; value_type x; } b;
+      b.i = _find (key, from, pos, len);
+      return b.x;
+    }
+    struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
+    value_type& update (const char* key)
+    { return update (key, std::strlen (key)); }
+    value_type& update (const char* key, size_t len, value_type val = value_type (0))
+    { size_t from (0), pos (0); return update (key, from, pos, len, val); }
+    value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val = value_type (0))
+    { empty_callback cf; return update (key, from, pos, len, val, cf); }
+    template <typename T>
+    value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val, T& cf) {
+      if (! len && ! from)
+        _err (__FILE__, __LINE__, "failed to insert zero-length key\n");
+#ifndef USE_FAST_LOAD
+      if (! _ninfo || ! _block) restore ();
+#endif
+      for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
+           pos < len; ++pos) {
+#ifdef USE_REDUCED_TRIE
+        const value_type val_ = _array[from].value;
+        if (val_ >= 0 && val_ != CEDAR_VALUE_LIMIT) // always new; correct this!
+          { const int to = _follow (from, 0, cf); _array[to].value = val_; }
+#endif
+        from = static_cast <size_t> (_follow (from, key_[pos], cf));
+      }
+#ifdef USE_REDUCED_TRIE
+      const int to = _array[from].value >= 0 ? static_cast <int> (from) : _follow (from, 0, cf);
+      if (_array[to].value == CEDAR_VALUE_LIMIT) _array[to].value = 0;
+#else
+      const int to = _follow (from, 0, cf);
+#endif
+      return _array[to].value += val;
+    }
+    // easy-going erase () without compression
+    int erase (const char* key) { return erase (key, std::strlen (key)); }
+    int erase (const char* key, size_t len, size_t from = 0) {
+      size_t pos = 0;
+      const int i = _find (key, from, pos, len);
+      if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
+      erase (from);
+      return 0;
+    }
+    void erase (size_t from) {
+      // _test ();
+#ifdef USE_REDUCED_TRIE
+      int e = _array[from].value >= 0 ? static_cast <int> (from) : _array[from].base () ^ 0;
+      from = static_cast <size_t> (_array[e].check);
+#else
+      int e = _array[from].base () ^ 0;
+#endif
+      bool flag = false; // have sibling
+      do {
+        const node& n = _array[from];
+        flag = _ninfo[n.base () ^ _ninfo[from].child].sibling;
+        if (flag) _pop_sibling (from, n.base (), static_cast <uchar> (n.base () ^ e));
+        _push_enode (e);
+         e = static_cast <int> (from);
+        from = static_cast <size_t> (_array[from].check);
+      } while (! flag);
+    }
+    int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
+      for (size_t i = 0; i < num; ++i)
+        update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
+      return 0;
+    }
+    template <typename T>
+    void dump (T* result, const size_t result_len) {
+      union { int i; value_type x; } b;
+      size_t num (0), from (0), p (0);
+      for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
+        if (num < result_len)
+          _set_result (&result[num++], b.x, p, from);
+        else
+          _err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
+    }
+    int save (const char* fn, const char* mode = "wb") const {
+      // _test ();
+      FILE* fp = std::fopen (fn, mode);
+      if (! fp) return -1;
+      std::fwrite (_array, sizeof (node), static_cast <size_t> (_size), fp);
+      std::fclose (fp);
+#ifdef USE_FAST_LOAD
+      const char* const info
+        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
+      fp = std::fopen (info, mode);
+      delete [] info; // resolve memory leak
+      if (! fp) return -1;
+      std::fwrite (&_bheadF, sizeof (int), 1, fp);
+      std::fwrite (&_bheadC, sizeof (int), 1, fp);
+      std::fwrite (&_bheadO, sizeof (int), 1, fp);
+      std::fwrite (_ninfo, sizeof (ninfo), static_cast <size_t> (_size), fp);
+      std::fwrite (_block, sizeof (block), static_cast <size_t> (_size >> 8), fp);
+      std::fclose (fp);
+#endif
+      return 0;
+    }
+    int open (const char* fn, const char* mode = "rb",
+              const size_t offset = 0, size_t size_ = 0) {
+      FILE* fp = std::fopen (fn, mode);
+      if (! fp) return -1;
+      // get size
+      if (! size_) {
+        if (std::fseek (fp, 0, SEEK_END) != 0) return -1;
+        size_ = static_cast <size_t> (std::ftell (fp));
+        if (std::fseek (fp, 0, SEEK_SET) != 0) return -1;
+      }
+      if (size_ <= offset) return -1;
+      // set array
+      clear (false);
+      size_ = (size_ - offset) / sizeof (node);
+      if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
+      _array = static_cast <node*>  (std::malloc (sizeof (node)  * size_));
+#ifdef USE_FAST_LOAD
+      _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
+      _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
+      if (! _array || ! _ninfo || ! _block)
+#else
+        if (! _array)
+#endif
+          _err (__FILE__, __LINE__, "memory allocation failed\n");
+      if (size_ != std::fread (_array, sizeof (node), size_, fp)) return -1;
+      std::fclose (fp);
+      _size = static_cast <int> (size_);
+#ifdef USE_FAST_LOAD
+      const char* const info
+        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
+      fp = std::fopen (info, mode);
+      delete [] info; // resolve memory leak
+      if (! fp) return -1;
+      std::fread (&_bheadF, sizeof (int), 1, fp);
+      std::fread (&_bheadC, sizeof (int), 1, fp);
+      std::fread (&_bheadO, sizeof (int), 1, fp);
+      if (size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
+          size_ != std::fread (_block, sizeof (block), size_ >> 8, fp) << 8)
+        return -1;
+      std::fclose (fp);
+      _capacity = _size;
+#endif
+      return 0;
+    }
+#ifndef USE_FAST_LOAD
+    void restore () { // restore information to update
+      if (! _block) _restore_block ();
+      if (! _ninfo) _restore_ninfo ();
+      _capacity = _size;
+    }
+#endif
+    void set_array (void* p, size_t size_ = 0) { // ad-hoc
+      clear (false);
+      _array = static_cast <node*> (p);
+      _size  = static_cast <int> (size_);
+      _no_delete = true;
+    }
+    const void* array () const { return _array; }
+    void clear (const bool reuse = true) {
+      if (_array && ! _no_delete) std::free (_array);
+      if (_ninfo) std::free (_ninfo);
+      if (_block) std::free (_block);
+      _array = 0; _ninfo = 0; _block = 0;
+      _bheadF = _bheadC = _bheadO = _capacity = _size = 0; // *
+      if (reuse) _initialize ();
+      _no_delete = false;
+    }
+    // return the first child for a tree rooted by a given node
+    int begin (size_t& from, size_t& len) {
+#ifndef USE_FAST_LOAD
+      if (! _ninfo) _restore_ninfo ();
+#endif
+      int   base = _array[from].base ();
+      uchar c    = _ninfo[from].child;
+      if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
+        return CEDAR_NO_PATH; // no entry
+      for (; c; ++len) {
+        from = static_cast <size_t> (_array[from].base ()) ^ c;
+        c    = _ninfo[from].child;
+      }
+#ifdef USE_REDUCED_TRIE
+      if (_array[from].value >= 0) return _array[from].value;
+#endif
+      return _array[_array[from].base () ^ c].base_;
+    }
+    // return the next child if any
+    int next (size_t& from, size_t& len, const size_t root = 0) {
+      uchar c = 0;
+#ifdef USE_REDUCED_TRIE
+      if (_array[from].value < 0)
+#endif
+        c = _ninfo[_array[from].base () ^ 0].sibling;
+      for (; ! c && from != root; --len) {
+        c = _ninfo[from].sibling;
+        from = static_cast <size_t> (_array[from].check);
+      }
+      return c ?
+        begin (from = static_cast <size_t> (_array[from].base ()) ^ c, ++len) :
+        CEDAR_NO_PATH;
+    }
+    // test the validity of double array for debug
+    void test (const size_t from = 0) const {
+      const int base = _array[from].base ();
+      uchar c = _ninfo[from].child;
+      do {
+        if (from) assert (_array[base ^ c].check == static_cast <int> (from));
+        if (c  && _array[base ^ c].value < 0) // correct this
+          test (static_cast <size_t> (base ^ c));
+      } while ((c = _ninfo[base ^ c].sibling));
+    }
+    // node ids kept up to date by _resolve () when nodes are relocated;
+    // scanned until the first 0 entry
+    size_t tracking_node[NUM_TRACKING_NODES + 1];
+  private:
+    // currently disabled; implement these if you need
+    da (const da&);
+    da& operator= (const da&);
+    node*   _array;   // double array (base/check pairs)
+    ninfo*  _ninfo;   // per-node first-child / right-sibling labels
+    block*  _block;   // per-256-slot block bookkeeping
+    int     _bheadF;  // first block of Full;   0
+    int     _bheadC;  // first block of Closed; 0 if no Closed
+    int     _bheadO;  // first block of Open;   0 if no Open
+    int     _capacity; // number of slots allocated in _array
+    int     _size;     // number of slots in use (grown by 256 in _add_block ())
+    int     _no_delete; // reset to false by clear (); presumably marks caller-owned buffers -- TODO confirm
+    short   _reject[257]; // indexed by a block's empty-slot count; see _find_place () / _push_enode ()
+    //
+    // print "cedar: <file> [<line>]: <message>" to stderr and exit with status 1
+    static void _err (const char* fn, const int ln, const char* msg) {
+      std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
+      std::exit (1);
+    }
+    // Resize `p` to hold size_n elements of T with realloc (); elements in
+    // [size_p, size_n) are overwritten with a default-constructed T.
+    // On allocation failure the old buffer is freed and _err () terminates.
+    template <typename T>
+    static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
+      const size_t bytes = sizeof (T) * static_cast <size_t> (size_n);
+      void* const resized = std::realloc (p, bytes);
+      if (! resized) {
+        std::free (p);
+        _err (__FILE__, __LINE__, "memory reallocation failed\n");
+      }
+      p = static_cast <T*> (resized);
+      static const T T0 = T ();
+      T*       cur  = p + size_p;
+      T* const stop = p + size_n;
+      while (cur != stop) *cur++ = T0;
+    }
+    // Allocates the first block (256 slots) and puts the trie in its empty
+    // state: slot 0 is the root, slots 1..255 form a circular doubly linked
+    // ring of empty nodes (links stored as negated indices in base/check).
+    void _initialize () { // initialize the first special block
+      _realloc_array (_array, 256, 256); // size_p == size_n: no slot is default-filled here
+      _realloc_array (_ninfo, 256);
+      _realloc_array (_block, 1);
+#ifdef USE_REDUCED_TRIE
+      _array[0] = node (-1, -1);
+#else
+      _array[0] = node (0, -1);
+#endif
+      // empty ring: prev of slot 1 is 255, next of slot 255 is 1
+      for (int i = 1; i < 256; ++i)
+        _array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
+      _block[0].ehead = 1; // bug fix for erase
+      _capacity = _size = 256;
+      for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
+      for (short  i = 0; i <= 256; ++i) _reject[i] = i + 1;
+    }
+    // follow/create edge
+    // Returns the index of the node reached from `from` through `label`,
+    // creating the edge when it does not exist.
+    //  - `from` has no base yet (base < 0) or the target slot is empty
+    //    (check < 0): take an empty node with _pop_enode () and link it into
+    //    the child list with _push_sibling ().
+    //  - the target slot belongs to another parent: _resolve () relocates one
+    //    of the two conflicting child sets (and may update `from`); `cf` is
+    //    forwarded to it.
+    template <typename T>
+    int _follow (size_t& from, const uchar& label, T& cf) {
+      int to = 0;
+      const int base = _array[from].base ();
+      if (base < 0 || _array[to = base ^ label].check < 0) {
+        to = _pop_enode (base, label, static_cast <int> (from));
+        _push_sibling (from, to ^ label, label, base >= 0);
+      } else if (_array[to].check != static_cast <int> (from))
+        to = _resolve (from, base, label, cf);
+      return to;
+    }
+    // find key from double array
+    // Follows key[pos .. len) from node `from`, advancing both `from` and
+    // `pos` for each byte consumed.
+    // Returns CEDAR_NO_PATH as soon as a transition is missing.  Once all
+    // bytes are consumed, looks at the label-0 child of `from`: returns
+    // CEDAR_NO_VALUE when that slot is not owned by `from`, otherwise the
+    // contents of its base_ field.
+    int _find (const char* key, size_t& from, size_t& pos, const size_t len) const {
+      for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
+           pos < len; ) { // follow link
+#ifdef USE_REDUCED_TRIE
+        if (_array[from].value >= 0) return CEDAR_NO_PATH;
+#endif
+        size_t to = static_cast <size_t> (_array[from].base ()); to ^= key_[pos];
+        if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
+        ++pos;
+        from = to;
+      }
+#ifdef USE_REDUCED_TRIE
+      if (_array[from].value >= 0) // get value from leaf; only allow integer key
+        return _array[from].value;
+#endif
+      const node n = _array[_array[from].base () ^ 0];
+      if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
+      return n.base_;
+    }
+#ifndef USE_FAST_LOAD
+    // Rebuilds the child/sibling table from _array alone: _ninfo is
+    // reallocated (all _size entries reset), then every occupied slot whose
+    // label relative to its parent is non-zero is re-inserted into the
+    // parent's child list.
+    void _restore_ninfo () {
+      _realloc_array (_ninfo, _size);
+      for (int to = 0; to < _size; ++to) {
+        const int from = _array[to].check;
+        if (from < 0) continue; // skip empty node
+        const int base = _array[from].base ();
+        // label is recovered as base ^ to; label 0 needs no explicit link
+        if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
+          _push_sibling (static_cast <size_t> (from), base, label,
+                         ! from || _ninfo[from].child || _array[base ^ 0].check == from);
+      }
+    }
+    // Rebuilds the block table from _array alone: for each 256-slot block,
+    // counts its empty slots (check < 0), remembers the first one as ehead,
+    // and pushes the block onto the Full (0 empty), Closed (1 empty) or
+    // Open (more than 1 empty) ring.
+    void _restore_block () {
+      _realloc_array (_block, _size >> 8);
+      _bheadF = _bheadC = _bheadO = 0;
+      for (int bi (0), e (0); e < _size; ++bi) { // register each block to its ring
+        block& b = _block[bi];
+        b.num = 0;
+        for (; e < (bi << 8) + 256; ++e)
+          if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
+        int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
+        _push_block (bi, head_out, ! head_out && b.num);
+      }
+    }
+#endif
+    // store a lookup result; each overload fills the fields its result type has
+    // value only
+    void _set_result (result_type* x, value_type r, size_t = 0, size_t = 0) const {
+      *x = r;
+    }
+    // value + length
+    void _set_result (result_pair_type* x, value_type r, size_t l, size_t = 0) const {
+      result_pair_type& out = *x;
+      out.value  = r;
+      out.length = l;
+    }
+    // value + length + node id
+    void _set_result (result_triple_type* x, value_type r, size_t l, size_t from) const {
+      result_triple_type& out = *x;
+      out.value  = r;
+      out.length = l;
+      out.id     = from;
+    }
+    // Unlink block `bi` from the ring whose head is `head_in`.  `last` tells
+    // that `bi` is the only member, in which case the ring becomes empty.
+    void _pop_block (const int bi, int& head_in, const bool last) {
+      if (last) { // last one popped; Closed or Open
+        head_in = 0;
+        return;
+      }
+      const block& cur = _block[bi];
+      _block[cur.prev].next = cur.next; // predecessor skips bi
+      _block[cur.next].prev = cur.prev; // successor skips bi
+      if (head_in == bi) head_in = cur.next;
+    }
+    // Insert block `bi` into the ring whose head is `head_out`.  `empty` tells
+    // that the ring has no member yet, in which case `bi` links to itself.
+    // Otherwise `bi` is placed between the current tail and head and becomes
+    // the new head.
+    void _push_block (const int bi, int& head_out, const bool empty) {
+      block& b = _block[bi];
+      if (empty) { // the destination is empty
+        head_out = b.prev = b.next = bi;
+      } else { // use most recently pushed
+        int& tail_out = _block[head_out].prev; // alias: updated by the chained assignment below
+        b.prev = tail_out;
+        b.next = head_out;
+        head_out = tail_out = _block[tail_out].next = bi;
+      }
+    }
+    // Appends one block of 256 empty slots at index _size, growing the three
+    // arrays first when _size has reached _capacity.  The new slots are linked
+    // into a circular ring of empty nodes and the block is pushed onto the
+    // Open ring.  Returns the index of the new block.
+    int _add_block () {
+      if (_size == _capacity) { // allocate memory if needed
+#ifdef USE_EXACT_FIT
+        _capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
+#else
+        _capacity += _capacity;
+#endif
+        _realloc_array (_array, _capacity, _capacity); // new slots are filled in below
+        _realloc_array (_ninfo, _capacity, _size);
+        _realloc_array (_block, _capacity >> 8, _size >> 8);
+      }
+      _block[_size >> 8].ehead = _size;
+      // first slot: prev is the last slot of the block, next is the second
+      _array[_size] = node (- (_size + 255),  - (_size + 1));
+      for (int i = _size + 1; i < _size + 255; ++i)
+        _array[i] = node (-(i - 1), -(i + 1));
+      // last slot wraps around to the first
+      _array[_size + 255] = node (- (_size + 254),  -_size);
+      _push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
+      _size += 256;
+      return (_size >> 8) - 1;
+    }
+    // transfer block from one start w/ head_in to one start w/ head_out
+    // move block bi out of the ring headed by head_in and into the ring
+    // headed by head_out
+    void _transfer_block (const int bi, int& head_in, int& head_out) {
+      const bool sole_member = (_block[bi].next == bi);
+      _pop_block (bi, head_in, sole_member);
+      const bool dest_empty = ! head_out && _block[bi].num;
+      _push_block (bi, head_out, dest_empty);
+    }
+    // pop empty node from block; never transfer the special block (bi = 0)
+    // Takes an empty slot and turns it into the child of `from` for `label`.
+    // The slot is base ^ label when `from` already has a base; otherwise one
+    // is chosen with _find_place () and the base of `from` is set from it.
+    // The slot is removed from its block's empty ring and the block is moved
+    // between the Open / Closed / Full rings as its empty count changes.
+    // Returns the index of the slot.
+    int _pop_enode (const int base, const uchar label, const int from) {
+      const int e  = base < 0 ? _find_place () : base ^ label;
+      const int bi = e >> 8;
+      node&  n = _array[e];
+      block& b = _block[bi];
+      if (--b.num == 0) {
+        if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
+      } else { // release empty node from empty ring
+        _array[-n.base_].check = n.check;
+        _array[-n.check].base_ = n.base_;
+        if (e == b.ehead) b.ehead = -n.check; // set ehead
+        if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
+          _transfer_block (bi, _bheadO, _bheadC);
+      }
+      // initialize the released node
+#ifdef USE_REDUCED_TRIE
+      n.value = CEDAR_VALUE_LIMIT; n.check = from;
+      if (base < 0) _array[from].base_ = - (e ^ label) - 1;
+#else
+      // note: `n.check = from` is a separate statement and always executes
+      if (label) n.base_ = -1; else n.value = value_type (0); n.check = from;
+      if (base < 0) _array[from].base_ = e ^ label;
+#endif
+      return e;
+    }
+    // push empty node into empty ring
+    // Returns slot `e` to its block's ring of empty nodes, moving the block
+    // between the Full / Closed / Open rings as its empty count changes, and
+    // clears the slot's child/sibling information.
+    void _push_enode (const int e) {
+      const int bi = e >> 8;
+      block& b = _block[bi];
+      if (++b.num == 1) { // Full to Closed
+        b.ehead = e;
+        _array[e] = node (-e, -e); // a ring of one: links to itself
+        if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
+      } else {
+        // splice e between the ring head and its successor
+        const int prev = b.ehead;
+        const int next = -_array[prev].check;
+        _array[e] = node (-prev, -next);
+        _array[prev].check = _array[next].base_ = -e;
+        if (b.num == 2 || b.trial == MAX_TRIAL) // Closed to Open
+          if (bi) _transfer_block (bi, _bheadC, _bheadO);
+        b.trial = 0;
+      }
+      if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
+      _ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
+    }
+    // push label to from's child
+    // Inserts `label` into the child list of `from` (children live at
+    // base ^ label).  When `flag` is false the label becomes the first child.
+    // When `flag` is true: with ORDERED the list is scanned so that labels
+    // stay in ascending order; without ORDERED, if the first child is 0 the
+    // label is inserted right after it, else it becomes the first child.
+    void _push_sibling (const size_t from, const int base, const uchar label, const bool flag = true) {
+      uchar* c = &_ninfo[from].child; // points at the link that will receive `label`
+      if (flag && (ORDERED ? label > *c : ! *c))
+        do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
+      _ninfo[base ^ label].sibling = *c, *c = label;
+    }
+    // pop label from from's child
+    // remove `label` from the child list of `from`: find the link that holds
+    // it and replace it by the label's own right sibling
+    void _pop_sibling (const size_t from, const int base, const uchar label) {
+      uchar* link = &_ninfo[from].child;
+      for (; *link != label; link = &_ninfo[base ^ *link].sibling) {}
+      *link = _ninfo[base ^ label].sibling;
+    }
+    // check whether to replace branching w/ the newly added node
+    // Steps through the two sibling chains (starting after c_n and c_p) in
+    // lockstep.  Returns false if the chain under base_p ends first or at the
+    // same step; returns true if the chain under base_n ends strictly earlier.
+    // _resolve () uses the result to decide which side's children to move.
+    bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
+      do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
+      while ((c_n = _ninfo[base_n ^ c_n].sibling));
+      return true;
+    }
+    // enumerate (equal to or more than one) child nodes
+    // Writes the child labels of a node into the buffer starting at `p`,
+    // beginning with first child `c` and following sibling links under `base`.
+    // If `label` is not -1 it is added to the output (in ascending position
+    // when ORDERED).  Returns a pointer to the LAST label written, not one
+    // past it.
+    uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
+      --p; // pre-decrement so that every store can use *++p
+      if (! c)  { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
+      if (ORDERED)
+        while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
+      if (label != -1) *++p = static_cast <uchar> (label);
+      while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
+      return p;
+    }
+    // explore new block to settle down
+    // pick a slot for a single node: the first empty slot of a Closed block,
+    // else of an Open block, else the first slot of a freshly added block
+    int _find_place () {
+      const int bi = _bheadC ? _bheadC : _bheadO;
+      return bi ? _block[bi].ehead : _add_block () << 8;
+    }
+    // Finds a slot e such that, with base = e ^ *first, every label in
+    // [first, last] maps to an empty slot.  Only blocks on the Open ring are
+    // examined; a block is tried when it has at least as many empty slots as
+    // labels and the label count is below the block's `reject` threshold.
+    // A block that fails has its reject threshold lowered and its trial count
+    // raised; on reaching MAX_TRIAL it is moved to the Closed ring.
+    // Falls back to the first slot of a newly added block.
+    int _find_place (const uchar* const first, const uchar* const last) {
+      if (int bi = _bheadO) {
+        const int   bz = _block[_bheadO].prev; // last block to visit
+        const short nc = static_cast <short> (last - first + 1); // number of labels
+        while (1) { // set candidate block
+          block& b = _block[bi];
+          if (b.num >= nc && nc < b.reject) // explore configuration
+            for (int e = b.ehead;;) { // iterate over the block's empty ring
+              const int base = e ^ *first;
+              for (const uchar* p = first; _array[base ^ *++p].check < 0; )
+                if (p == last) return b.ehead = e; // no conflict
+              if ((e = -_array[e].check) == b.ehead) break;
+            }
+          b.reject = nc;
+          if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
+          const int bi_ = b.next; // read before the block may be moved
+          if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
+          if (bi == bz) break;
+          bi = bi_;
+        };
+      }
+      return _add_block () << 8;
+    }
+    // resolve conflict on base_n ^ label_n = base_p ^ label_p
+    // Resolves the collision that occurs when node from_n wants the slot
+    // base_n ^ label_n but that slot is already a child of another node
+    // (from_p).  _consult () picks which side moves: when `flag` is true the
+    // children of from_n (plus the new label) are relocated, otherwise the
+    // children of from_p are.  A new base is found with _find_place (), each
+    // child is re-created at its new slot, grandchildren have their check
+    // fields redirected, and the vacated slots are returned to the empty
+    // ring (except the contested slot, which is handed to from_n directly
+    // when from_p's children were the ones moved).
+    // `cf (old, new)` is invoked for every moved node; from_n is updated if
+    // the node it names was itself moved.  Returns the slot now holding the
+    // child of from_n for label_n.
+    template <typename T>
+    int _resolve (size_t& from_n, const int base_n, const uchar label_n, T& cf) {
+      // examine siblings of conflicted nodes
+      const int to_pn  = base_n ^ label_n;
+      const int from_p = _array[to_pn].check;
+      const int base_p = _array[from_p].base ();
+      const bool flag // whether to replace siblings of newly added
+        = _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
+      uchar child[256];
+      uchar* const first = &child[0];
+      uchar* const last  =
+        flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
+        : _set_child (first, base_p, _ninfo[from_p].child);
+      const int base =
+        (first == last ? _find_place () : _find_place (first, last)) ^ *first;
+      // replace & modify empty list
+      const int from  = flag ? static_cast <int> (from_n) : from_p;
+      const int base_ = flag ? base_n : base_p;
+      if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
+#ifdef USE_REDUCED_TRIE
+      _array[from].base_ = -base - 1; // new base
+#else
+      _array[from].base_ = base; // new base
+#endif
+      for (const uchar* p = first; p <= last; ++p) { // to_ => to
+        const int to  = _pop_enode (base, *p, from);
+        const int to_ = base_ ^ *p;
+        _ninfo[to].sibling = (p == last ? 0 : *(p + 1));
+        if (flag && to_ == to_pn) continue; // skip newcomer (no child)
+        cf (to_, to); // user-defined callback function to handle moved nodes
+        node& n  = _array[to];
+        node& n_ = _array[to_];
+#ifdef USE_REDUCED_TRIE
+        if ((n.base_ = n_.base_) < 0 && *p) // copy base; bug fix
+#else
+        if ((n.base_ = n_.base_) > 0 && *p) // copy base; bug fix
+#endif
+          {
+            uchar c = _ninfo[to].child = _ninfo[to_].child;
+            do _array[n.base () ^ c].check = to; // adjust grand son's check
+            while ((c = _ninfo[n.base () ^ c].sibling));
+          }
+        if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
+          from_n = static_cast <size_t> (to); // bug fix
+        if (! flag && to_ == to_pn) { // the address is immediately used
+          _push_sibling (from_n, to_pn ^ label_n, label_n);
+          _ninfo[to_].child = 0; // remember to reset child
+#ifdef USE_REDUCED_TRIE
+          n_.value = CEDAR_VALUE_LIMIT;
+#else
+          if (label_n) n_.base_ = -1; else n_.value = value_type (0);
+#endif
+          n_.check = static_cast <int> (from_n);
+        } else
+          _push_enode (to_);
+        if (NUM_TRACKING_NODES) // keep the traversed node updated
+          for (size_t j = 0; tracking_node[j] != 0; ++j)
+            if (tracking_node[j] == static_cast <size_t> (to_))
+              { tracking_node[j] = static_cast <size_t> (to); break; }
+      }
+      return flag ? base ^ label_n : to_pn;
+    }
+  };
+}
+#endif
--- a/libchinese-segmentation/storage-base/cedar/cedarpp.h
+++ b/libchinese-segmentation/storage-base/cedar/cedarpp.h
@ -0,0 +1,834 @@
+// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
+//  $Id: cedarpp.h 1916 2017-07-12 07:30:56Z ynaga $
+// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
+#ifndef CEDAR_H
+#define CEDAR_H
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <climits>
+#include <cassert>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+// compile-time assertion: declares a char array typedef named `msg` whose
+// size is -1 (ill-formed) when `e` is false, so the name shows up in the error
+#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
+
+namespace cedar {
+  // typedefs
+  // npos_t: 64-bit node position.  Code in this class keeps a node index in
+  // the low 32 bits and a tail offset in the high 32 bits (`from >> 32`).
+#if LONG_BIT == 64
+  typedef unsigned long       npos_t; // possibly compatible with size_t
+#else
+  typedef unsigned long long  npos_t;
+#endif
+  typedef unsigned char       uchar;
+  // low 32 bits; `x &= TAIL_OFFSET_MASK` strips the tail offset and keeps the node index
+  static const npos_t TAIL_OFFSET_MASK = static_cast <npos_t> (0xffffffff);
+  // high 32 bits
+  static const npos_t NODE_INDEX_MASK  = static_cast <npos_t> (0xffffffff) << 32;
+  // default sentinel values for "no value" (N1) and "no path" (N2)
+  template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
+  // float specialization; presumably IEEE-754 NaN bit patterns -- TODO confirm
+  template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
+  static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
+  // dynamic double array
+  template <typename value_type,
+            const int     NO_VALUE  = NaN <value_type>::N1,
+            const int     NO_PATH   = NaN <value_type>::N2,
+            const bool    ORDERED   = true,
+            const int     MAX_TRIAL = 1,
+            const size_t  NUM_TRACKING_NODES = 0>
+  class da {
+  public:
+    // sentinel results, taken from the NO_VALUE / NO_PATH template arguments
+    enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH };
+    // the three result shapes accepted by the search functions
+    // (see the _set_result () overloads)
+    typedef value_type result_type;
+    struct result_pair_type {
+      value_type  value;
+      size_t      length;  // prefix length
+    };
+    struct result_triple_type { // for predict ()
+      value_type  value;
+      size_t      length;  // suffix length
+      npos_t      id;      // node id of value
+    };
+    // One slot of the double array.  `base` and `value` share storage: the
+    // slot holds either a base/link or a stored value.
+    struct node {
+      union { int base; value_type value; }; // negative means prev empty index
+      int  check;                            // negative means next empty index
+      node (const int base_ = 0, const int check_ = 0)
+        : base (base_), check (check_) {}
+    };
+    // Per-node child-list links, stored as edge labels rather than indices.
+    struct ninfo {  // x1.5 update speed; +25 % memory (8n -> 10n)
+      uchar  sibling;   // right sibling (= 0 if not exist)
+      uchar  child;     // first child
+      ninfo () : sibling (0), child (0) {}
+    };
+    // Bookkeeping for one group of 256 consecutive slots.  Blocks are chained
+    // through prev/next; a new block starts with all 256 slots empty.
+    struct block { // a block w/ 256 elements
+      int   prev;   // prev block; 3 bytes
+      int   next;   // next block; 3 bytes
+      short num;    // # empty elements; 0 - 256
+      short reject; // minimum # branching failed to locate; soft limit
+      int   trial;  // # trial
+      int   ehead;  // first empty item
+      block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
+    };
+    // Constructs an empty trie: every member is zero-initialized, value_type
+    // is checked at compile time to fit in an int (values share storage with
+    // node::base), and _initialize () sets up the initial state.
+    da () : tracking_node (), _array (0), _tail (0), _tail0 (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _quota (0), _quota0 (0), _no_delete (false), _reject () {
+      // The typedef produced by STATIC_ASSERT is never used; silence that
+      // warning for this statement only.  push/pop restores whatever setting
+      // the including translation unit had, instead of forcing it to "warning".
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
+      STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
+                    value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index_to_trie
+                    );
+#pragma GCC diagnostic pop
+      _initialize ();
+    }
+    // release all buffers without re-initializing (reuse = false)
+    ~da () { clear (false); }
+    // number of node slots allocated
+    size_t capacity () const {
+      return static_cast <size_t> (_capacity);
+    }
+    // number of node slots in use
+    size_t size () const {
+      return static_cast <size_t> (_size);
+    }
+    // value read through _length
+    size_t length () const {
+      return static_cast <size_t> (*_length);
+    }
+    // bytes occupied by the node slots in use
+    size_t total_size () const {
+      return sizeof (node) * _size;
+    }
+    // bytes per node slot
+    size_t unit_size () const {
+      return sizeof (node);
+    }
+    // count the slots that are occupied (check >= 0)
+    size_t nonzero_size () const {
+      size_t used = 0;
+      for (int slot = _size; slot-- > 0; )
+        if (! (_array[slot].check < 0)) ++used;
+      return used;
+    }
+    // bytes of tail storage referenced from the trie: for every occupied node
+    // that is not its parent's label-0 child and has a negative base, the
+    // length of the string at that tail offset plus 1 + sizeof (value_type)
+    size_t nonzero_length () const {
+      size_t chars = 0;
+      size_t tails = 0;
+      for (int to = 0; to < _size; ++to) {
+        const node& n = _array[to];
+        if (n.check < 0) continue;                 // empty slot
+        if (_array[n.check].base == to) continue;  // label-0 child of its parent
+        if (! (n.base < 0)) continue;              // no tail offset stored
+        ++tails;
+        chars += std::strlen (&_tail[-n.base]);
+      }
+      return chars + tails * (1 + sizeof (value_type));
+    }
+    // count the occupied nodes that are either their parent's label-0 child
+    // or carry a negative base
+    size_t num_keys () const {
+      size_t keys = 0;
+      for (int to = 0; to < _size; ++to) {
+        const node& n = _array[to];
+        if (n.check < 0) continue; // empty slot
+        if (_array[n.check].base == to || n.base < 0) ++keys;
+      }
+      return keys;
+    }
+    // interface
+    // Exact lookup of a NUL-terminated key; see the overload below.
+    template <typename T>
+    T exactMatchSearch (const char* key) const
+    { return exactMatchSearch <T> (key, std::strlen (key)); }
+    // Exact lookup of key[0 .. len) starting at node `from`.
+    // T is one of the result types accepted by _set_result ().  A missing
+    // path is reported as CEDAR_NO_VALUE, same as a path without a value.
+    template <typename T>
+    T exactMatchSearch (const char* key, size_t len, npos_t from = 0) const {
+      union { int i; value_type x; } b; // reinterpret the int from _find () as value_type
+      size_t pos = 0;
+      b.i = _find (key, from, pos, len);
+      if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
+      T result;
+      _set_result (&result, b.x, len, from);
+      return result;
+    }
+    // Prefix search on a NUL-terminated key; see the overload below.
+    template <typename T>
+    size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
+    { return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
+    // Reports every prefix of key[0 .. len) that has a value, advancing one
+    // byte at a time from node `from`.  At most result_len entries are written
+    // to `result`; the return value counts all matches, so it may exceed
+    // result_len.  The scan stops at the first missing transition.
+    template <typename T>
+    size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) const {
+      size_t num = 0;
+      for (size_t pos = 0; pos < len; ) { // pos is advanced inside _find ()
+        union { int i; value_type x; } b;
+        b.i = _find (key, from, pos, pos + 1);
+        if (b.i == CEDAR_NO_VALUE) continue;
+        if (b.i == CEDAR_NO_PATH)  return num;
+        if (num < result_len) _set_result (&result[num], b.x, pos, from);
+        ++num;
+      }
+      return num;
+    }
+    // predict key from double array
+    // Prediction on a NUL-terminated key; see the overload below.
+    template <typename T>
+    size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
+    { return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
+    // Enumerates the entries found below the node reached by key[0 .. len),
+    // using begin () / next ().  At most result_len entries are written; the
+    // return value counts all of them.  Returns 0 when the key has no path.
+    // The length passed to _set_result () is the depth below the key's node.
+    template <typename T>
+    size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) {
+      size_t num (0), pos (0), p (0);
+      if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
+      union { int i; value_type x; } b;
+      const npos_t root = from; // next () must not climb above this node
+      for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
+        if (num < result_len)
+          _set_result (&result[num], b.x, p, from);
+        ++num;
+      }
+      return num;
+    }
+    // Reconstructs into `key` the `len` bytes that lead to position `to`
+    // and NUL-terminates at key[len].  If `to` carries a tail offset in its
+    // high 32 bits, the trailing bytes are copied from _tail; the remaining
+    // bytes are recovered by walking check links towards the root, each label
+    // being parent.base ^ child index.
+    // `key` must have room for len + 1 bytes.
+    void suffix (char* key, size_t len, npos_t to) const {
+      key[len] = '\0';
+      if (const int offset = static_cast <int> (to >> 32)) {
+        to &= TAIL_OFFSET_MASK; // keep the node index only
+        size_t len_tail = std::strlen (&_tail[-_array[to].base]);
+        if (len > len_tail) len -= len_tail; else len_tail = len, len = 0;
+        std::memcpy (&key[len], &_tail[static_cast <size_t> (offset) - len_tail], len_tail);
+      }
+      while (len--) { // fill the trie part back to front
+        const int from = _array[to].check;
+        key[len] = static_cast <char> (_array[from].base ^ static_cast <int> (to));
+        to = static_cast <npos_t> (from);
+      }
+    }
+    // Resumable lookup on a NUL-terminated key; see the overload below.
+    value_type traverse (const char* key, npos_t& from, size_t& pos) const
+    { return traverse (key, from, pos, std::strlen (key)); }
+    // Follows key[pos .. len) from `from` via _find (); `from` and `pos` are
+    // passed by reference so that _find () can update them.  The int returned
+    // by _find () (including CEDAR_NO_VALUE / CEDAR_NO_PATH) is returned
+    // reinterpreted as value_type.
+    value_type traverse (const char* key, npos_t& from, size_t& pos, size_t len) const {
+      union { int i; value_type x; } b;
+      b.i = _find (key, from, pos, len);
+      return b.x;
+    }
+    // no-op relocation callback used when the caller does not supply one
+    struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
+    // convenience overloads; all forward to the six-argument update ()
+    // NUL-terminated key, val defaults to value_type (0)
+    value_type& update (const char* key)
+    { return update (key, std::strlen (key)); }
+    // key of explicit length, starting from the root
+    value_type& update (const char* key, size_t len, value_type val = value_type (0))
+    { npos_t from (0); size_t pos (0); return update (key, from, pos, len, val); }
+    // resumable form without a relocation callback
+    value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val = value_type (0))
+    { empty_callback cf; return update (key, from, pos, len, val, cf); }
+    // Inserts key[pos .. len) below position `from` if absent and adds `val`
+    // to its stored value; returns a reference to that value.
+    //   from : in/out position; the high 32 bits hold a tail offset when the
+    //          position lies inside a tail string
+    //   pos  : in/out number of key bytes already consumed
+    //   cf   : callback passed to _follow () for relocated nodes
+    // A zero-length key at the root is a fatal error (_err ()).
+    // Outline:
+    //   1. walk the trie while nodes have a non-negative base, creating
+    //      edges as needed;
+    //   2. on reaching a node that refers to a tail (negative base), compare
+    //      the rest of the key with the tail; on an exact match update the
+    //      value stored after the tail's terminator;
+    //   3. otherwise move the common prefix from the tail into the trie,
+    //      re-point the remainder of the old tail, and record tail offsets
+    //      in _tail0 (reused by step 4);
+    //   4. store what is left of the key as a new tail, either in a slot
+    //      recorded in _tail0 (when nothing is left to store) or appended to
+    //      _tail, which is grown on demand.
+    // NOTE(review): _length / _length0 are declared outside the visible
+    // chunk; presumably they alias the first int of _tail / _tail0 -- confirm.
+    template <typename T>
+    value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val, T& cf) {
+      if (! len && ! from)
+        _err (__FILE__, __LINE__, "failed to insert zero-length key\n");
+#ifndef USE_FAST_LOAD
+      if (! _ninfo || ! _block) restore ();
+#endif
+      npos_t offset = from >> 32;
+      if (! offset) { // node on trie
+        for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
+             _array[from].base >= 0; ++pos) {
+          if (pos == len) // could be reduced
+            { const int to = _follow (from, 0, cf); return _array[to].value += val; }
+          from = static_cast <size_t> (_follow (from, key_[pos], cf));
+        }
+        offset = static_cast <npos_t> (-_array[from].base);
+      }
+      if (offset >= sizeof (int)) { // go to _tail
+        const size_t pos_orig = pos;
+        char* const tail = &_tail[offset] - pos; // biased so that tail[pos] pairs with key[pos]
+        while (pos < len && key[pos] == tail[pos]) ++pos;
+        //
+        if (pos == len && tail[pos] == '\0') { // found exact key
+          if (const npos_t moved = pos - pos_orig) { // search end on tail
+            from &= TAIL_OFFSET_MASK;
+            from |= (offset + moved) << 32;
+          }
+          return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
+        }
+        // otherwise, insert the common prefix in tail if any
+        if (from >> 32) {
+          from &= TAIL_OFFSET_MASK; // reset to update tail offset
+          for (npos_t offset_ = static_cast <npos_t> (-_array[from].base);
+               offset_ < offset; ) {
+            from = static_cast <size_t>
+                   (_follow (from, static_cast <uchar> (_tail[offset_]), cf));
+            ++offset_;
+            // this shows intricacy in debugging updatable double array trie
+            if (NUM_TRACKING_NODES) // keep the traversed node (on tail) updated
+              for (size_t j = 0; tracking_node[j] != 0; ++j)
+                if (tracking_node[j] >> 32 == offset_)
+                  tracking_node[j] = static_cast <npos_t> (from);
+          }
+        }
+        for (size_t pos_ = pos_orig; pos_ < pos; ++pos_)
+          from = static_cast <size_t>
+                 (_follow (from, static_cast <uchar> (key[pos_]), cf));
+        npos_t moved = pos - pos_orig;
+        if (tail[pos]) { // remember to move offset to existing tail
+          const int to_ = _follow (from, static_cast <uchar> (tail[pos]), cf);
+          _array[to_].base = - static_cast <int> (offset + ++moved);
+          moved -= 1 + sizeof (value_type); // keep record
+        }
+        moved += offset;
+        // push tail offsets onto _tail0, growing it when full
+        for (npos_t i = offset; i <= moved; i += 1 + sizeof (value_type)) {
+          if (_quota0 == ++*_length0) {
+#ifdef USE_EXACT_FIT
+            _quota0 += *_length0 >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length0;
+#else
+            _quota0 += _quota0;
+#endif
+            _realloc_array (_tail0, _quota0, *_length0);
+          }
+          _tail0[*_length0] = static_cast <int> (i);
+        }
+        if (pos == len || tail[pos] == '\0') {
+          const int to = _follow (from, 0, cf); // could be reduced
+          if (pos == len) return _array[to].value += val; // set value on trie
+          _array[to].value += *reinterpret_cast <value_type*> (&tail[pos + 1]);
+        }
+        from = static_cast <size_t> (_follow (from, static_cast <uchar> (key[pos]), cf));
+        ++pos;
+      }
+      // bytes needed for the new tail: remaining key, terminator, value
+      const int needed = static_cast <int> (len - pos + 1 + sizeof (value_type));
+      if (pos == len && *_length0) { // reuse
+        const int offset0 = _tail0[*_length0];
+        _tail[offset0] = '\0';
+        _array[from].base = -offset0;
+        --*_length0;
+        return *reinterpret_cast <value_type*> (&_tail[offset0 + 1]) = val;
+      }
+      if (_quota < *_length + needed) {
+#ifdef USE_EXACT_FIT
+        _quota += needed > *_length || needed > MAX_ALLOC_SIZE ? needed :
+                  (*_length >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length);
+#else
+        _quota += _quota >= needed ? _quota : needed;
+#endif
+        _realloc_array (_tail, _quota, *_length);
+      }
+      _array[from].base = -*_length; // negative base: offset of the new tail
+      const size_t pos_orig = pos;
+      char* const tail = &_tail[*_length] - pos;
+      if (pos < len) {
+        do tail[pos] = key[pos]; while (++pos < len);
+        from |= (static_cast <npos_t> (*_length) + (len - pos_orig)) << 32;
+      }
+      *_length += needed;
+      return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
+    }
+    // easy-going erase () without compression
+    // Erase a NUL-terminated key; see the overload below.
+    int erase (const char* key) { return erase (key, std::strlen (key)); }
+    // Removes key[0 .. len) looked up from node `from`.
+    // Returns -1 when the key has no path or no value, 0 after removal.
+    // Starting at the node holding the key's value, nodes are returned to the
+    // empty ring while walking towards the root; the walk stops after the
+    // first ancestor that had more than one child.  Tail storage is not
+    // reclaimed.
+    int erase (const char* key, size_t len, npos_t from = 0) {
+      size_t pos = 0;
+      const int i = _find (key, from, pos, len);
+      if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
+      if (from >> 32) from &= TAIL_OFFSET_MASK; // leave tail as is
+      bool flag = _array[from].base < 0; // true: `from` refers to a tail (negative base)
+      int e = flag ? static_cast <int> (from) : _array[from].base ^ 0;
+      from  = _array[e].check;
+      do {
+        const node& n = _array[from];
+        flag = _ninfo[n.base ^ _ninfo[from].child].sibling; // parent has another child
+        if (flag) _pop_sibling (from, n.base, static_cast <uchar> (n.base ^ e));
+        _push_enode (e);
+        e = static_cast <int> (from);
+        from = static_cast <size_t> (_array[from].check);
+      } while (! flag);
+      return 0;
+    }
+    // Insert `num` keys.  len[i] gives the length of key[i] (strlen () is used
+    // when `len` is null); val[i] is the value added for key[i] (the index i
+    // is used when `val` is null).  Always returns 0.
+    int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
+      for (size_t i = 0; i != num; ++i) {
+        const char* const k = key[i];
+        const size_t      n = len ? len[i] : std::strlen (k);
+        const value_type  v = val ? val[i] : value_type (i);
+        update (k, n, v);
+      }
+      return 0;
+    }
+    // Enumerates every entry of the trie with begin () / next () and stores
+    // them in result[0 .. result_len).  Terminates through _err () if more
+    // than result_len entries exist.
+    template <typename T>
+    void dump (T* result, const size_t result_len) {
+      union { int i; value_type x; } b;
+      size_t num (0), p (0);
+      npos_t from = 0;
+      for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
+        if (num < result_len)
+          _set_result (&result[num++], b.x, p, from);
+        else
+          _err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
+    }
+    // Compacts the tail buffer: allocates a new buffer, copies every tail
+    // string still referenced from the trie (with its value) back to back,
+    // rewrites the referring nodes' bases, and replaces _tail with the result.
+    // The free-slot list _tail0 is reset to a single element.
+    // NOTE(review): relies on _length aliasing the first int of _tail (the
+    // declaration is outside the visible chunk) -- confirm.
+    void shrink_tail () {
+      union { char* tail; int* length; } t; // t.length views the first int of the new buffer
+      // current length minus the bytes accounted to the *_length0 recorded slots
+      const size_t length_
+        = static_cast <size_t> (*_length)
+        - static_cast <size_t> (*_length0) * (1 + sizeof (value_type));
+      t.tail = static_cast <char*> (std::malloc (length_));
+      if (! t.tail) _err (__FILE__, __LINE__, "memory allocation failed\n");
+      *t.length = static_cast <int> (sizeof (int)); // data starts right after the length field
+      for (int to = 0; to < _size; ++to) {
+        node& n = _array[to];
+        if (n.check >= 0 && _array[n.check].base != to && n.base < 0) {
+          char* const tail (&t.tail[*t.length]), * const tail_ (&_tail[-n.base]);
+          n.base = - *t.length;
+          int i = 0; do tail[i] = tail_[i]; while (tail[i++]); // copy including '\0'
+          *reinterpret_cast <value_type*> (&tail[i])
+            = *reinterpret_cast <const value_type*> (&tail_[i]);
+          *t.length += i + static_cast <int> (sizeof (value_type));
+        }
+      }
+      std::free (_tail);
+      _tail = t.tail;
+      _realloc_array (_tail,  *_length,  *_length);
+      _quota  = *_length;
+      _realloc_array (_tail0, 1);
+      _quota0 = 1;
+    }
+    // save (), optionally compacting the tail buffer first
+    int save (const char* fn, const char* mode, const bool shrink) {
+      if (shrink) {
+        shrink_tail ();
+      }
+      const int status = save (fn, mode);
+      return status;
+    }
+    // Writes the trie to file `fn`: first *_length bytes of the tail buffer,
+    // then _size node slots.  With USE_FAST_LOAD, the block-ring heads, the
+    // ninfo table and the block table are also written to "<fn>.sbl".
+    // Returns -1 if a file cannot be opened, 0 otherwise.
+    // NOTE(review): fwrite () results are not checked, so a short write still
+    // returns 0.
+    int save (const char* fn, const char* mode = "wb") const {
+      // _test ();
+      FILE* fp = std::fopen (fn, mode);
+      if (! fp) return -1;
+      std::fwrite (_tail,  sizeof (char), static_cast <size_t> (*_length), fp);
+      std::fwrite (_array, sizeof (node), static_cast <size_t> (_size), fp);
+      std::fclose (fp);
+#ifdef USE_FAST_LOAD
+      // 5 extra bytes: ".sbl" plus the terminating NUL
+      const char* const info
+        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
+      fp = std::fopen (info, mode);
+      delete [] info; // resolve memory leak
+      if (! fp) return -1;
+      std::fwrite (&_bheadF, sizeof (int), 1, fp);
+      std::fwrite (&_bheadC, sizeof (int), 1, fp);
+      std::fwrite (&_bheadO, sizeof (int), 1, fp);
+      std::fwrite (_ninfo, sizeof (ninfo), static_cast <size_t> (_size), fp);
+      std::fwrite (_block, sizeof (block), static_cast <size_t> (_size >> 8), fp);
+      std::fclose (fp);
+#endif
+      return 0;
+    }
+    // Loads a trie image from file `fn`.
+    //   fn     : path of a file written by save ()
+    //   mode   : fopen () mode used for reading
+    //   offset : byte position in the file where the image starts
+    //   size_  : byte position where the image ends; 0 means "end of file"
+    // The image is the tail buffer (whose first int holds its length in
+    // bytes) immediately followed by the node array.  With USE_FAST_LOAD the
+    // companion file "<fn>.sbl" is read as well.
+    // Returns 0 on success and -1 on any I/O or format error; the stream is
+    // closed on every path.  Allocation failure terminates through _err ().
+    int open (const char* fn, const char* mode = "rb",
+              const size_t offset = 0, size_t size_ = 0) {
+      FILE* fp = std::fopen (fn, mode);
+      if (! fp) return -1;
+      // get size
+      if (! size_) {
+        if (std::fseek (fp, 0, SEEK_END) != 0) { std::fclose (fp); return -1; }
+        const long end = std::ftell (fp);
+        if (end < 0) { std::fclose (fp); return -1; } // ftell () failed
+        size_ = static_cast <size_t> (end);
+        if (std::fseek (fp, 0, SEEK_SET) != 0) { std::fclose (fp); return -1; }
+      }
+      // read the tail length stored at the start of the image
+      int len = 0;
+      if (size_ <= offset ||
+          std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0 ||
+          std::fread (&len, sizeof (int), 1, fp) != 1 ||
+          len < 0) // a negative length would wrap to a huge size_t
+        { std::fclose (fp); return -1; }
+      const size_t length_ = static_cast <size_t> (len);
+      if (size_ <= offset + length_) { std::fclose (fp); return -1; }
+      // set array
+      clear (false);
+      size_ = (size_ - offset - length_) / sizeof (node); // now a node count
+      _array = static_cast <node*>  (std::malloc (sizeof (node)  * size_));
+      _tail  = static_cast <char*>  (std::malloc (length_));
+      _tail0 = static_cast <int*>   (std::malloc (sizeof (int)));
+#ifdef USE_FAST_LOAD
+      _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
+      _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
+      if (! _array || ! _tail || ! _tail0 || ! _ninfo || ! _block)
+#else
+      if (! _array || ! _tail || ! _tail0)
+#endif
+        _err (__FILE__, __LINE__, "memory allocation failed\n");
+      // rewind to the image start: the length field is part of the tail buffer
+      if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0 ||
+          length_ != std::fread (_tail,  sizeof (char), length_, fp) ||
+          size_   != std::fread (_array, sizeof (node), size_,   fp))
+        { std::fclose (fp); return -1; }
+      std::fclose (fp);
+      _size = static_cast <int> (size_);
+      *_length0 = 0;
+#ifdef USE_FAST_LOAD
+      // 5 extra bytes: ".sbl" plus the terminating NUL
+      const char* const info
+        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
+      fp = std::fopen (info, mode);
+      delete [] info; // resolve memory leak
+      if (! fp) return -1;
+      const bool loaded =
+        std::fread (&_bheadF, sizeof (int), 1, fp) == 1 &&
+        std::fread (&_bheadC, sizeof (int), 1, fp) == 1 &&
+        std::fread (&_bheadO, sizeof (int), 1, fp) == 1 &&
+        size_      == std::fread (_ninfo, sizeof (ninfo), size_, fp) &&
+        size_ >> 8 == std::fread (_block, sizeof (block), size_ >> 8, fp);
+      std::fclose (fp);
+      if (! loaded) return -1;
+      _capacity = _size;
+      _quota  = *_length;
+      _quota0 = 1;
+#endif
+      return 0;
+    }
+#ifndef USE_FAST_LOAD
+    // Rebuild whatever bookkeeping is missing so the trie can be updated
+    // again, then reset the capacity and tail quotas to the current sizes.
+    void restore () {
+      if (_block == 0) _restore_block ();
+      if (_ninfo == 0) _restore_ninfo ();
+      _quota0   = 1;
+      _quota    = *_length;
+      _capacity = _size;
+    }
+#endif
+    void set_array (void* p, size_t size_ = 0) { // ad-hoc: attach to a caller-owned image at p (tail first, node array after it)
+      clear (false); // drop current storage without re-initializing
+      if (size_) // non-zero size_ is counted in unit_size () units; convert to the byte size of the node array
+        size_ = size_ * unit_size () - static_cast <size_t> (*static_cast <int*> (p)); // the leading int at p is subtracted as the tail length
+      _tail  = static_cast <char*> (p);
+      _array = reinterpret_cast <node*> (_tail + *_length); // _length aliases _tail, so this skips the whole tail
+      _size  = static_cast <int> (size_ / unit_size () + (size_ % unit_size () ? 1 : 0)); // element count, rounded up
+      _no_delete = true; // tells clear () not to free _array/_tail
+    }
+    // read-only view of the raw node array
+    const void* array () const {
+      return _array;
+    }
+    // Release all storage and zero the bookkeeping.  With reuse == true the
+    // trie is re-initialized to the empty state afterwards.
+    void clear (const bool reuse = true) {
+      if (_no_delete) { // storage handed in through set_array (): forget it, never free it
+        _array = 0;
+        _tail  = 0;
+      }
+      // std::free accepts null pointers, so no per-pointer guard is needed
+      std::free (_array); _array = 0;
+      std::free (_tail);  _tail  = 0;
+      std::free (_tail0); _tail0 = 0;
+      std::free (_ninfo); _ninfo = 0;
+      std::free (_block); _block = 0;
+      _quota0 = 0; _quota = 0; _size = 0; _capacity = 0;
+      _bheadO = 0; _bheadC = 0; _bheadF = 0;
+      _no_delete = false;
+      if (reuse) _initialize ();
+    }
+    // return the first child for a tree rooted by a given node; from/len are advanced to that key (CEDAR_NO_PATH if there is none)
+    int begin (npos_t& from, size_t& len) {
+#ifndef USE_FAST_LOAD
+      if (! _ninfo) _restore_ninfo (); // child/sibling links are rebuilt on demand
+#endif
+      int base = from >> 32 ? - static_cast <int> (from >> 32) : _array[from].base; // upper 32 bits of from carry a tail offset, if any
+      if (base >= 0) { // on trie
+        uchar c = _ninfo[from].child;
+        if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
+          return CEDAR_NO_PATH; // no entry
+        for (; c && base >= 0; ++len) { // descend along first children until a terminal (c == 0) or a tail (base < 0)
+          from = static_cast <size_t> (base) ^ c;
+          base = _array[from].base;
+          c    = _ninfo[from].child;
+        }
+        if (base >= 0) return _array[base ^ c].base; // c == 0 here: read the label-0 child
+      }
+      const size_t len_ = std::strlen (&_tail[-base]); // length of the suffix stored in the tail
+      from &= TAIL_OFFSET_MASK;
+      from |= static_cast <npos_t> (static_cast <size_t> (-base) + len_) << 32; // record the tail position of the suffix end
+      len += len_;
+      return *reinterpret_cast <int*> (&_tail[-base] + len_ + 1); // the int right after the suffix's NUL terminator
+    }
+    // return the next child if any; from/len move to the next key under root, CEDAR_NO_PATH when there is none
+    int next (npos_t& from, size_t& len, const npos_t root = 0) {
+      uchar c = 0;
+      if (const int offset = static_cast <int> (from >> 32)) { // on tail
+        if (root >> 32) return CEDAR_NO_PATH; // root itself is on a tail
+        from &= TAIL_OFFSET_MASK; // keep only the trie node index
+        len -= static_cast <size_t> (offset - (-_array[from].base)); // remove the tail distance recorded in from
+      } else
+        c    = _ninfo[_array[from].base ^ 0].sibling; // sibling of the label-0 child
+      for (; ! c && from != root; --len) { // climb until some node on the way up has a sibling
+        c    = _ninfo[from].sibling;
+        from = static_cast <size_t> (_array[from].check); // check holds the parent index
+      }
+      if (! c) return CEDAR_NO_PATH;
+      return begin (from = static_cast <size_t> (_array[from].base) ^ c, ++len); // first key below that sibling
+    }
+    npos_t tracking_node[NUM_TRACKING_NODES + 1]; // positions rewritten by _resolve () when a tracked node is relocated
+  private:
+    // currently disabled; implement these if you need
+    da (const da&);
+    da& operator= (const da&);
+    node*   _array;   // double-array nodes (base/check)
+    union { char* _tail;  int* _length;  }; // tail buffer; its leading int is read as the tail length (*_length)
+    union { int*  _tail0; int* _length0; }; // second buffer whose leading int is accessed as *_length0
+    ninfo*  _ninfo;   // per-node child/sibling labels
+    block*  _block;   // one entry per 256 nodes
+    int     _bheadF;  // first block of Full;   0
+    int     _bheadC;  // first block of Closed; 0 if no Closed
+    int     _bheadO;  // first block of Open;   0 if no Open
+    int     _capacity; // node slots allocated
+    int     _size;     // node slots in use; grows by 256 per added block
+    int     _quota;
+    int     _quota0;
+    int     _no_delete; // set by set_array (); makes clear () skip freeing _array/_tail
+    short   _reject[257];
+    //
+    // print a fatal diagnostic (file, line, message) to stderr and terminate
+    static void _err (const char* fn, const int ln, const char* msg) {
+      std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
+      std::exit (1);
+    }
+    // Resize p to hold size_n elements and set elements [size_p, size_n) to a
+    // value-initialized T.  Terminates through _err if realloc fails.
+    template <typename T>
+    static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
+      void* const grown = std::realloc (p, sizeof (T) * static_cast <size_t> (size_n));
+      if (grown == 0) {
+        std::free (p);
+        _err (__FILE__, __LINE__, "memory reallocation failed\n");
+      }
+      p = static_cast <T*> (grown);
+      static const T T0 = T ();
+      T* cur = p + size_p;
+      T* const end = p + size_n;
+      while (cur != end) *cur++ = T0;
+    }
+    void _initialize () { // initialize the first special block
+      _realloc_array (_array, 256, 256); // size_p == size_n: nothing is zero-filled, the nodes are set explicitly below
+      _realloc_array (_tail,  sizeof (int)); // room for the leading length field only
+      _realloc_array (_tail0, 1);
+      _realloc_array (_ninfo, 256);
+      _realloc_array (_block, 1);
+      _array[0] = node (0, -1);
+      for (int i = 1; i < 256; ++i) // link slots 1..255 into a ring through negated base/check values
+        _array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
+      _capacity = _size = 256;
+      _block[0].ehead = 1; // bug fix for erase
+      _quota  = *_length  = static_cast <int> (sizeof (int)); // the tail starts out holding only its own length field
+      _quota0 = 1;
+      for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
+      for (short  i = 0; i <= 256; ++i) _reject[i] = i + 1;
+    }
+    // follow/create edge: return the node reached from `from` via label, allocating it or resolving a slot conflict if needed
+    template <typename T>
+    int _follow (npos_t& from, const uchar& label, T& cf) {
+      int to = 0;
+      const int base = _array[from].base;
+      if (base < 0 || _array[to = base ^ label].check < 0) { // parent has no base yet, or the target slot is empty
+        to = _pop_enode (base, label, static_cast <int> (from));
+        _push_sibling (from, to ^ label, label, base >= 0);
+      } else if (_array[to].check != static_cast <int> (from)) // slot already belongs to another parent
+        to = _resolve (from, base, label, cf);
+      return to;
+    }
+    // find key from double array: match key[pos..len) starting at from; yields the stored int, CEDAR_NO_VALUE or CEDAR_NO_PATH
+    int _find (const char* key, npos_t& from, size_t& pos, const size_t len) const {
+      npos_t offset = from >> 32; // non-zero when from already carries a tail offset
+      if (! offset) { // node on trie
+        for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
+             _array[from].base >= 0; ) {
+          if (pos == len) { // key exhausted: look at the label-0 child
+            const node& n = _array[_array[from].base ^ 0];
+            if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
+            return n.base;
+          }
+          size_t to = static_cast <size_t> (_array[from].base); to ^= key_[pos];
+          if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH; // no edge with this label
+          ++pos;
+          from = to;
+        }
+        offset = static_cast <npos_t> (-_array[from].base); // a negative base is the negated tail offset
+      }
+      // switch to _tail to match suffix
+      const size_t pos_orig = pos; // start position in reading _tail
+      const char* const tail = &_tail[offset] - pos; // biased so that tail[pos] lines up with key[pos]
+      if (pos < len) {
+        do if (key[pos] != tail[pos]) break; while (++pos < len);
+        if (const npos_t moved = pos - pos_orig) { // store how far into the tail the match advanced
+          from &= TAIL_OFFSET_MASK;
+          from |= (offset + moved) << 32;
+        }
+        if (pos < len) return CEDAR_NO_PATH; // input > tail, input != tail
+      }
+      if (tail[pos]) return CEDAR_NO_VALUE;  // input < tail
+      return *reinterpret_cast <const int*> (&tail[len + 1]); // the int right after the suffix's NUL terminator
+    }
+#ifndef USE_FAST_LOAD
+    void _restore_ninfo () { // rebuild the child/sibling links in _ninfo from the base/check array
+      _realloc_array (_ninfo, _size);
+      for (int to = 0; to < _size; ++to) {
+        const int from = _array[to].check; // parent index of node `to`
+        if (from < 0) continue; // skip empty node
+        const int base = _array[from].base;
+        if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
+          _push_sibling (static_cast <size_t> (from), base, label,
+                         ! from || _ninfo[from].child || _array[base ^ 0].check == from);
+      }
+    }
+    void _restore_block () { // rebuild _block and the Full/Closed/Open list heads from the node array
+      _realloc_array (_block, _size >> 8); // one block entry per 256 nodes
+      _bheadF = _bheadC = _bheadO = 0;
+      for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
+        block& b = _block[bi];
+        b.num = 0;
+        for (; e < (bi << 8) + 256; ++e)
+          if (_array[e].check < 0 && ++b.num == 1) b.ehead = e; // count empty slots; the first one found becomes ehead
+        int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO); // 0 empty -> Full, 1 -> Closed, more -> Open
+        _push_block (bi, head_out, ! head_out && b.num);
+      }
+    }
+#endif
+    // Store a lookup result; each overload fills exactly the fields its
+    // result type carries and ignores the remaining arguments.
+    void _set_result (result_type* x, value_type r, size_t = 0, npos_t = 0) const {
+      *x = r;
+    }
+    void _set_result (result_pair_type* x, value_type r, size_t l, npos_t = 0) const {
+      x->value  = r;
+      x->length = l;
+    }
+    void _set_result (result_triple_type* x, value_type r, size_t l, npos_t from) const {
+      x->value  = r;
+      x->length = l;
+      x->id     = from;
+    }
+    // Unlink block bi from the circular list whose head is head_in.  When bi
+    // is the last block of that list, the list simply becomes empty.
+    void _pop_block (const int bi, int& head_in, const bool last) {
+      if (last) { // last one popped; Closed or Open
+        head_in = 0;
+        return;
+      }
+      const block& blk = _block[bi];
+      _block[blk.prev].next = blk.next;
+      _block[blk.next].prev = blk.prev;
+      if (head_in == bi) head_in = blk.next;
+    }
+    void _push_block (const int bi, int& head_out, const bool empty) { // insert block bi as the new head of the circular list at head_out
+      block& b = _block[bi];
+      if (empty) { // the destination is empty
+        head_out = b.prev = b.next = bi; // single-element ring
+      } else { // use most recently pushed
+        int& tail_out = _block[head_out].prev; // reference: assigning it below rewrites the old head's prev
+        b.prev = tail_out;
+        b.next = head_out;
+        head_out = tail_out = _block[tail_out].next = bi; // evaluated right to left: old tail's next, old head's prev, then the head itself
+      }
+    }
+    int _add_block () { // append a block of 256 empty nodes and return its block index
+      if (_size == _capacity) { // allocate memory if needed
+#ifdef USE_EXACT_FIT
+        _capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
+#else
+        _capacity += _capacity;
+#endif
+        _realloc_array (_array, _capacity, _capacity); // size_p == size_n: no nodes are zero-filled here
+        _realloc_array (_ninfo, _capacity, _size);
+        _realloc_array (_block, _capacity >> 8, _size >> 8);
+      }
+      _block[_size >> 8].ehead = _size;
+      _array[_size] = node (- (_size + 255),  - (_size + 1)); // first slot of the ring: links to the last and the second slot
+      for (int i = _size + 1; i < _size + 255; ++i)
+        _array[i] = node (-(i - 1), -(i + 1));
+      _array[_size + 255] = node (- (_size + 254),  -_size); // last slot links back to the first
+      _push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
+      _size += 256;
+      return (_size >> 8) - 1;
+    }
+    // move block bi out of the list headed by head_in and into the list
+    // headed by head_out
+    void _transfer_block (const int bi, int& head_in, int& head_out) {
+      const bool only_one = (_block[bi].next == bi); // bi links to itself: it is alone in its list
+      _pop_block (bi, head_in, only_one);
+      const bool dest_empty = ! head_out && _block[bi].num;
+      _push_block (bi, head_out, dest_empty);
+    }
+    // pop empty node from block; never transfer the special block (bi = 0)
+    int _pop_enode (const int base, const uchar label, const int from) {
+      const int e  = base < 0 ? _find_place () : base ^ label; // parent has no base yet: take any free slot
+      const int bi = e >> 8;
+      node&  n = _array[e];
+      block& b = _block[bi];
+      if (--b.num == 0) {
+        if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
+      } else { // release empty node from empty ring
+        _array[-n.base].check = n.check;
+        _array[-n.check].base = n.base;
+        if (e == b.ehead) b.ehead = -n.check; // set ehead
+        if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
+          _transfer_block (bi, _bheadO, _bheadC);
+      }
+      // initialize the released node
+      if (label) n.base = -1; else n.value = value_type (0);
+      n.check = from; // record the parent
+      if (base < 0) _array[from].base = e ^ label; // derive the parent's base from the chosen slot
+      return e;
+    }
+    // push empty node into empty ring: slot e becomes free again and its block's bookkeeping is updated
+    void _push_enode (const int e) {
+      const int bi = e >> 8;
+      block& b = _block[bi];
+      if (++b.num == 1) { // Full to Closed
+        b.ehead = e;
+        _array[e] = node (-e, -e); // ring of one: the slot links to itself
+        if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
+      } else {
+        const int prev = b.ehead;
+        const int next = -_array[prev].check;
+        _array[e] = node (-prev, -next); // insert e between ehead and its successor
+        _array[prev].check = _array[next].base = -e;
+        if (b.num == 2 || b.trial == MAX_TRIAL) { // Closed to Open
+          if (bi) _transfer_block (bi, _bheadC, _bheadO);
+        }
+        b.trial = 0;
+      }
+      if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
+      _ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
+    }
+    // push label to from's child list (children of from live at base ^ label)
+    void _push_sibling (const npos_t from, const int base, const uchar label, const bool flag = true) {
+      uchar* c = &_ninfo[from].child; // c points at the link that will receive the new label
+      if (flag && (ORDERED ? label > *c : ! *c))
+        do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label); // ORDERED: stop at the first label >= the new one
+      _ninfo[base ^ label].sibling = *c, *c = label; // splice the new label in at c
+    }
+    // pop label from from's child list; the loop has no end check, so label must be present
+    void _pop_sibling (const npos_t from, const int base, const uchar label) {
+      uchar* c = &_ninfo[from].child;
+      while (*c != label) c = &_ninfo[base ^ *c].sibling; // walk to the link that holds label
+      *c = _ninfo[base ^ label].sibling; // bypass it
+    }
+    // check whether to replace branching w/ the newly added node; walks both sibling lists in lock-step
+    bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
+      do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false; // p's list ended first (or together with n's)
+      while ((c_n = _ninfo[base_n ^ c_n].sibling));
+      return true; // n's list ended while p still had siblings
+    }
+    // enumerate (equal to or more than one) child nodes into p, optionally inserting label; returns a pointer to the last label written
+    uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
+      --p; // pre-decrement so that every store below can use *++p
+      if (! c)  { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
+      if (ORDERED)
+        while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; } // labels smaller than the new one go first
+      if (label != -1) *++p = static_cast <uchar> (label);
+      while (c) { *++p = c; c = _ninfo[base ^ c].sibling; } // remaining siblings
+      return p;
+    }
+    // explore new block to settle down: choose a free slot for a single node
+    int _find_place () {
+      const int bi = _bheadC ? _bheadC : _bheadO; // a Closed block is preferred over an Open one
+      if (bi) return _block[bi].ehead;
+      return _add_block () << 8; // neither list has a block: start a fresh one
+    }
+    int _find_place (const uchar* const first, const uchar* const last) { // find a slot e such that e ^ *first ^ label is empty for every label in [first, last]
+      if (int bi = _bheadO) {
+        const int   bz = _block[_bheadO].prev; // last block of the Open ring: where the scan stops
+        const short nc = static_cast <short> (last - first + 1); // number of labels to place
+        while (1) { // set candidate block
+          block& b = _block[bi];
+          if (b.num >= nc && nc < b.reject) // explore configuration
+            for (int e = b.ehead;;) {
+              const int base = e ^ *first;
+              for (const uchar* p = first; _array[base ^ *++p].check < 0; )
+                if (p == last) return b.ehead = e; // no conflict
+              if ((e = -_array[e].check) == b.ehead) break; // walked the whole empty ring of this block
+            }
+          b.reject = nc;
+          if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
+          const int bi_ = b.next; // read the successor before the block may be moved to another list
+          if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
+          if (bi == bz) break;
+          bi = bi_;
+        }
+      }
+      return _add_block () << 8; // no Open block fits: allocate a new one
+    }
+    // resolve conflict on base_n ^ label_n = base_p ^ label_p by relocating the children of one of the two parents
+    template <typename T>
+    int _resolve (npos_t& from_n, const int base_n, const uchar label_n, T& cf) {
+      // examine siblings of conflicted nodes
+      const int to_pn  = base_n ^ label_n; // the contested slot
+      const int from_p = _array[to_pn].check; // parent that currently owns it
+      const int base_p = _array[from_p].base;
+      const bool flag // whether to replace siblings of newly added
+        = _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
+      uchar child[256];
+      uchar* const first = &child[0];
+      uchar* const last  =
+        flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
+        : _set_child (first, base_p, _ninfo[from_p].child);
+      const int base =
+        (first == last ? _find_place () : _find_place (first, last)) ^ *first; // new base under which all listed labels are free
+      // replace & modify empty list
+      const int from  = flag ? static_cast <int> (from_n) : from_p; // the parent whose children move
+      const int base_ = flag ? base_n : base_p; // its old base
+      if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
+      _array[from].base = base; // new base
+      for (const uchar* p = first; p <= last; ++p) { // to_ => to
+        const int to  = _pop_enode (base, *p, from);
+        const int to_ = base_ ^ *p;
+        _ninfo[to].sibling = (p == last ? 0 : *(p + 1));
+        if (flag && to_ == to_pn) continue; // skip newcomer (no child)
+        cf (to_, to); // invoke the caller's functor with the old and new node index
+        node& n  = _array[to];
+        node& n_ = _array[to_];
+        if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix
+          uchar c = _ninfo[to].child = _ninfo[to_].child;
+          do _array[n.base ^ c].check = to; // adjust grand son's check
+          while ((c = _ninfo[n.base ^ c].sibling));
+        }
+        if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
+          from_n = static_cast <size_t> (to); // bug fix
+        if (! flag && to_ == to_pn) { // the address is immediately used
+          _push_sibling (from_n, to_pn ^ label_n, label_n);
+          _ninfo[to_].child = 0; // remember to reset child
+          if (label_n) n_.base = -1; else n_.value = value_type (0);
+          n_.check = static_cast <int> (from_n);
+        } else
+          _push_enode (to_); // the old slot goes back to the empty ring
+        if (NUM_TRACKING_NODES) // keep the traversed node updated
+          for (size_t j = 0; tracking_node[j] != 0; ++j) {
+            if (static_cast <int> (tracking_node[j] & TAIL_OFFSET_MASK) == to_) {
+              tracking_node[j] &= NODE_INDEX_MASK;
+              tracking_node[j] |= static_cast <npos_t> (to);
+            }
+          }
+      }
+      return flag ? base ^ label_n : to_pn; // slot now holding the new edge
+    }
+    // test the validity of double array for debug; recursively asserts on the subtree below from
+    void _test (const npos_t from = 0) const {
+      const int base = _array[from].base;
+      if (base < 0) { // validate tail offset
+        assert (*_length >= static_cast <int> (-base + 1 + sizeof (value_type)));
+        return;
+      }
+      uchar c = _ninfo[from].child;
+      do {
+        if (from) assert (_array[base ^ c].check == static_cast <int> (from)); // every child must point back at its parent
+        if (c) _test (static_cast <npos_t> (base ^ c)); // label 0 is not descended into
+      } while ((c = _ninfo[base ^ c].sibling));
+    }
+  };
+}
+#endif
--- a/libchinese-segmentation/storage-base/darts-clone/darts.h
+++ b/libchinese-segmentation/storage-base/darts-clone/darts.h
--- a/libchinese-segmentation/storage-base/storage-base-cedar.pri
+++ b/libchinese-segmentation/storage-base/storage-base-cedar.pri
@ -0,0 +1,12 @@
+# Add this directory to the include search path of the including project.
+INCLUDEPATH += $$PWD
+
+# Headers of the storage layer and of the bundled darts-clone / cedar sources.
+HEADERS += \
+    $$PWD/darts-clone/darts.h \
+    $$PWD/cedar/cedarpp.h \
+    $$PWD/cedar/cedar.h \
+    $$PWD/storage-base.h \
+    $$PWD/storage-base.hpp
+
+SOURCES += \
+    $$PWD/storage-base.cpp
+
--- a/libchinese-segmentation/storage-base/storage-base.cpp
+++ b/libchinese-segmentation/storage-base/storage-base.cpp
@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef STORAGEBASE_CPP
+#define STORAGEBASE_CPP
+
+#include "storage-base.h"
+
+// Remember the source dictionary paths and the cache location and allocate an
+// empty trie.  Nothing is read from disk here; that happens in Init().
+template<const bool ordered, typename cache_file_header>
+StorageBase<ordered, cache_file_header>::StorageBase(const vector<string> file_paths, string dat_cache_path)
+    : m_file_paths(file_paths)
+    , m_dat_cache_path(dat_cache_path)
+    , m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)
+{
+    static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value,
+                  "CacheFileHeader class not derived from CacheFileHeaderBase!");
+}
+
+// Attach to the cached dat file for the configured dictionaries, building the
+// cache through LoadSourceFile() first when no matching cache can be attached.
+template<const bool ordered, typename cache_file_header>
+void StorageBase<ordered, cache_file_header>::Init()
+{
+    int total_bytes = 0;
+    const string digest = CalcFileListMD5(m_file_paths, total_bytes);
+    m_total_dict_size = total_bytes;
+
+    // When no cache location was given, default to a digest-named file under /tmp.
+    if (m_dat_cache_path.empty())
+        m_dat_cache_path = "/tmp/" + digest + ".dat_";
+    m_dat_cache_path += VERSION;
+
+    if (!InitAttachDat(m_dat_cache_path, digest)) {
+        // Build the double-array trie and write it to the dat file, then attach to it.
+        LoadSourceFile(m_dat_cache_path, digest);
+
+        bool build_ret = InitAttachDat(m_dat_cache_path, digest);
+
+        assert(build_ret);
+    }
+}
+
+// Exact-match lookup.  A non-negative search result is used as an offset into
+// the element area, where a NUL-terminated string is read; a negative result
+// yields an empty string.
+template<const bool ordered, typename cache_file_header>
+string StorageBase<ordered, cache_file_header>::Find(const string &key)
+{
+    const int offset = m_double_array_data_trie->template exactMatchSearch<int>(key.c_str(), key.size());
+    return offset < 0 ? string() : string(m_elements_ptr + offset);
+}
+
+// True when Find() returns a non-empty string for word.
+template<const bool ordered, typename cache_file_header>
+bool StorageBase<ordered, cache_file_header>::Contains(string &word)
+{
+    return !this->Find(word).empty();
+}
+
+// True when the entry stored for word contains a comma.
+// NOTE(review): this assumes alternative readings are stored comma-separated
+// in one entry -- confirm against the dictionary format.  The previous code
+// tested `== npos`, which returned true for entries WITHOUT a comma and also
+// for words that are not in the dictionary at all (empty lookup result).
+template<const bool ordered, typename cache_file_header>
+bool StorageBase<ordered, cache_file_header>::IsMultiTone(const string &word)
+{
+    const string result = this->Find(word);
+    return result.find(',') != string::npos;
+}
+
+// Total byte size of the source dictionaries, as computed during Init().
+template<const bool ordered, typename cache_file_header>
+int StorageBase<ordered, cache_file_header>::GetTotalDictSize() const
+{
+    return this->m_total_dict_size;
+}
+
+// Release the cache mapping, its file descriptor and the trie object.
+template<const bool ordered, typename cache_file_header>
+StorageBase<ordered, cache_file_header>::~StorageBase()
+{
+    // Only unmap a live mapping: the pointer is null when no cache was ever
+    // attached and may still hold MAP_FAILED after a failed mmap().
+    if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
+        munmap(m_mmap_addr, m_mmap_length);
+    }
+    m_mmap_addr = nullptr;
+
+    // -1 marks "no descriptor open"; do not hand it to close().
+    if (m_mmap_fd >= 0) {
+        close(m_mmap_fd);
+    }
+    m_mmap_fd = -1;
+
+    delete m_double_array_data_trie; // deleting a null pointer is a no-op
+    m_double_array_data_trie = nullptr;
+}
+
+// Direct access to the double-array trie owned by this object.
+template<const bool ordered, typename cache_file_header>
+cedar::da<int, -1, -2, ordered> *StorageBase<ordered, cache_file_header>::GetDoubleArrayDataTrie()
+{
+    return this->m_double_array_data_trie;
+}
+
+// Forwards to the trie's array() accessor.
+template<const bool ordered, typename cache_file_header>
+const void *StorageBase<ordered, cache_file_header>::GetDataTrieArray()
+{
+    const void * const nodes = m_double_array_data_trie->array();
+    return nodes;
+}
+
+// Forwards to the trie's size() accessor.
+template<const bool ordered, typename cache_file_header>
+int StorageBase<ordered, cache_file_header>::GetDataTrieSize()
+{
+    return this->m_double_array_data_trie->size();
+}
+
+// Forwards to the trie's total_size() accessor.
+template<const bool ordered, typename cache_file_header>
+int StorageBase<ordered, cache_file_header>::GetDataTrieTotalSize()
+{
+    return this->m_double_array_data_trie->total_size();
+}
+
+// Reinterprets the start of the mapped cache file as its header.
+template<const bool ordered, typename cache_file_header>
+cache_file_header *StorageBase<ordered, cache_file_header>::GetCacheFileHeaderPtr()
+{
+    header_type * const header = reinterpret_cast<header_type*>(m_mmap_addr);
+    return header;
+}
+
+
+// Try to attach the trie to the cache file dat_cache_file.
+// The file is mapped read-only and accepted only if its header carries the
+// expected md5 and the file length equals header + element area + trie units.
+// Returns true on success.  On any failure the descriptor is closed, the
+// mapping (if one was made) is released, m_mmap_fd is reset to -1 and
+// m_mmap_addr to nullptr, so no stale handle or MAP_FAILED value is kept.
+template<const bool ordered, typename cache_file_header>
+bool StorageBase<ordered, cache_file_header>::InitAttachDat(const string &dat_cache_file, const string &md5)
+{
+    m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
+
+    if (m_mmap_fd < 0) {
+        return false;
+    }
+
+    // Shared failure path; `mapped` says whether there is a mapping to release.
+    auto fail = [this](bool mapped) -> bool {
+        if (mapped) {
+            munmap(m_mmap_addr, m_mmap_length);
+        }
+        m_mmap_addr = nullptr;
+        close(m_mmap_fd);
+        m_mmap_fd = -1;
+        return false;
+    };
+
+    const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
+    if (seek_off < 0) {
+        return fail(false);
+    }
+
+    m_mmap_length = seek_off;
+    m_mmap_addr = reinterpret_cast<char *>(mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0));
+    if (m_mmap_addr == MAP_FAILED) {
+        return fail(false); // also clears the MAP_FAILED sentinel from m_mmap_addr
+    }
+    if (m_mmap_length < sizeof(header_type)) {
+        return fail(true); // too short to even hold a header
+    }
+    header_type & header = *reinterpret_cast<header_type*>(m_mmap_addr);
+
+    // Reject a cache built from different sources or with an inconsistent layout.
+    if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())
+            || m_mmap_length != sizeof(header_type) + header.elements_size  + header.dat_size * m_double_array_data_trie->unit_size()) {
+        return fail(true);
+    }
+
+    // Layout: header, then the element area, then the trie image.
+    m_elements_ptr = (const char *)(m_mmap_addr + sizeof(header_type));
+    const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
+    this->m_double_array_data_trie->set_array((char *)dat_ptr, header.dat_size);
+    return true;
+}
+
+// Compute one MD5 digest over the contents of all files in files_list, in
+// order, and store the sum of their sizes in file_size_sum.
+// Files that cannot be opened, are empty, or cannot be mapped are skipped and
+// contribute neither to the digest nor to the size sum.
+string CalcFileListMD5(const vector<string> &files_list, int &file_size_sum) {
+    limonp::MD5 md5;
+    file_size_sum = 0;
+
+    for (auto const & local_path : files_list) {
+        const int fd = open(local_path.c_str(), O_RDONLY);
+        if (fd < 0){
+            continue;
+        }
+        auto const len = lseek(fd, 0, SEEK_END);
+        if (len > 0) {
+            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+            assert(MAP_FAILED != addr);
+            // The assert vanishes under NDEBUG, so guard explicitly: hashing
+            // from MAP_FAILED would read through an invalid pointer.
+            if (MAP_FAILED != addr) {
+                md5.Update((unsigned char *) addr, len);
+                file_size_sum += len;
+
+                munmap(addr, len);
+            }
+        }
+        close(fd);
+    }
+
+    md5.Final();
+    return string(md5.digestChars);
+}
+#endif
--- a/libchinese-segmentation/storage-base/storage-base.h
+++ b/libchinese-segmentation/storage-base/storage-base.h
@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef STORAGEBASE_H
+#define STORAGEBASE_H
+#include <string>
+#include <vector>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include "Md5.hpp"
+#include "StringUtil.hpp"
+#include "cedar.h"
+using namespace std;
+
+struct CacheFileHeaderBase { // TODO: byte alignment
+    char     md5_hex[32] = {}; // compared against the source files' MD5 string when a cache file is attached
+    uint32_t elements_num = 0;
+    uint32_t elements_size = 0; // byte size of the element area that follows the header
+    uint32_t dat_size = 0; // trie size in units of the trie's unit_size()
+};
+
+template<const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
+class StorageBase
+{
+public:
+    typedef cache_file_header header_type;
+
+    StorageBase(const vector<string> file_paths, string dat_cache_path = ""); // stores the paths and allocates the trie; loads nothing
+
+    virtual void Init(); // attach to the cached dat file, calling LoadSourceFile() first when no matching cache attaches
+
+    virtual string Find(const string &key); // exact-match lookup; empty string when the search result is negative
+
+    virtual bool Contains(string &word); // true when Find(word) is non-empty
+
+    virtual bool IsMultiTone(const string &word);
+
+    virtual int GetTotalDictSize() const; // summed size of the source files, set by Init()
+
+    virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0; // implemented by subclasses; Init() calls it to produce the dat file
+
+    virtual ~StorageBase();
+
+    cedar::da<int, -1, -2, ordered> * GetDoubleArrayDataTrie();
+    const void * GetDataTrieArray();
+    int GetDataTrieSize();
+    int GetDataTrieTotalSize();
+
+    cache_file_header * GetCacheFileHeaderPtr(); // start of the mapped cache file viewed as its header
+
+private:
+    StorageBase(); // declared private: default construction and copying are not part of the public interface
+    StorageBase(const StorageBase&);
+    StorageBase& operator = (const StorageBase&);
+
+    bool InitAttachDat(const string &dat_cache_file, const string &md5); // map and validate a cache file; true on success
+
+    vector<string> m_file_paths; // source dictionary files
+    string m_dat_cache_path; // cache file path; Init() appends VERSION to it
+
+    cedar::da<int, -1, -2, ordered> * m_double_array_data_trie = nullptr; // allocated in the constructor, deleted in the destructor
+
+    const char * m_elements_ptr = nullptr; // element area inside the mapping, right after the header
+
+    int    m_mmap_fd = -1; // descriptor of the mapped cache file; -1 when none
+    int    m_mmap_length = 0; // length of the mapping in bytes
+    char * m_mmap_addr = nullptr; // base address of the mapping
+
+    int    m_total_dict_size = 0;
+
+};
+
+inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum);
+#include "storage-base.cpp"
+#endif // STORAGEBASE_H
--- a/libchinese-segmentation/storage-base/storage-base.hpp
+++ b/libchinese-segmentation/storage-base/storage-base.hpp
@ -0,0 +1,247 @@
+/*
+ * Copyright (C) 2022, KylinSoft Co., Ltd.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ *
+ * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
+ *
+ */
+#ifndef STORAGEBASE_H
+#define STORAGEBASE_H
+#include <string>
+#include <vector>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <iostream>
+#include "Md5.hpp"
+#include "StringUtil.hpp"
+//#define USE_DARTS
+#ifdef USE_DARTS
+#include "../storage-base/darts-clone/darts.h"
+#include <cassert>
+#else
+#include "../storage-base/cedar/cedar.h"
+#endif
+using namespace std;
+
+/**
+ * Computes a single MD5 digest over the contents of all listed files, fed to
+ * the hasher in list order, and reports the number of bytes hashed.
+ *
+ * Files that cannot be opened and files whose size is not positive are
+ * skipped. Each remaining file is memory-mapped read-only; if the mapping
+ * fails, the file is read through a buffer instead, so the digest does not
+ * depend on whether mmap succeeded. (The previous version only guarded the
+ * mapping with assert(), which is compiled out under NDEBUG and then passed
+ * MAP_FAILED to the hasher and to munmap.)
+ *
+ * @param files_list     paths of the files to hash.
+ * @param file_size_sum  output: reset to 0, then increased by the byte count
+ *                       of every file that was hashed.
+ * @return the digest as the hex string held in limonp::MD5::digestChars.
+ */
+inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum)
+{
+    limonp::MD5 md5;
+    file_size_sum = 0;
+
+    for (auto const & local_path : files_list) {
+        const int fd = open(local_path.c_str(), O_RDONLY);
+        if (fd < 0) {
+            continue;
+        }
+        auto const len = lseek(fd, 0, SEEK_END);
+        if (len > 0) {
+            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+            if (MAP_FAILED != addr) {
+                md5.Update((unsigned char *) addr, len);
+                file_size_sum += len;
+                munmap(addr, len);
+            } else if (0 == lseek(fd, 0, SEEK_SET)) {
+                // Mapping failed: hash the same bytes through a plain read loop.
+                vector<unsigned char> buffer(64 * 1024);
+                ssize_t got = 0;
+                while ((got = read(fd, buffer.data(), buffer.size())) > 0) {
+                    md5.Update(buffer.data(), got);
+                    file_size_sum += got;
+                }
+            }
+        }
+        close(fd);
+    }
+
+    md5.Final();
+    return string(md5.digestChars);
+}
+
+// Reports whether filePath can be opened through an input file stream.
+inline bool isFileExist(const string filePath) {
+    ifstream probe;
+    probe.open(filePath);
+    const bool opened = probe.good();
+    return opened;
+}
+
+// Renames tmpName to name. If the rename fails while a file called name is
+// already present, the temporary file is removed; otherwise it is left alone.
+inline void tryRename(string tmpName, string name) {
+    const bool renamed = (rename(tmpName.c_str(), name.c_str()) == 0);
+    if (renamed) {
+        return;
+    }
+    if (!isFileExist(name)) {
+        return;
+    }
+    remove(tmpName.c_str());
+}
+
+// Header record found at the start of a mapped cache file; StorageBase
+// validates it before attaching the trie.
+// TODO: byte alignment
+struct CacheFileHeaderBase {
+    char     md5_hex[32]{};    // compared against the MD5 hex string of the source files
+    uint32_t elements_num{0};
+    uint32_t elements_size{0}; // byte length of the element area that follows the header
+    uint32_t dat_size{0};      // trie unit count; multiplied by unit_size() to get bytes
+};
+
+/**
+ * Base class for dictionaries whose double-array trie lives in a read-only,
+ * memory-mapped cache file.
+ *
+ * Cache file layout, as validated by InitAttachDat():
+ *   [header_type][header.elements_size bytes of elements][header.dat_size trie units]
+ *
+ * Init() first tries to attach an existing cache file whose header carries the
+ * MD5 of the source files; when that fails it asks the subclass to build the
+ * cache through LoadSourceFile() and attaches again.
+ *
+ * @tparam element_ptr_type   type of the records stored in the element area.
+ * @tparam ordered            forwarded to cedar::da (unused with USE_DARTS).
+ * @tparam cache_file_header  header record type; must derive from CacheFileHeaderBase.
+ */
+template<typename element_ptr_type, const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
+class StorageBase
+{
+public:
+    typedef cache_file_header header_type;
+#ifdef USE_DARTS
+    typedef typename Darts::DoubleArray::result_pair_type result_pair_type;
+#else
+    typedef typename cedar::da<int, -1, -2, ordered>::result_pair_type result_pair_type;
+#endif
+
+    /**
+     * @param file_paths      source dictionary files used for the MD5 and for building.
+     * @param dat_cache_path  cache file location; empty selects a default under /tmp in Init().
+     *
+     * The trie member is a value and is default-constructed for both back
+     * ends. The former USE_DARTS constructor tried to initialize it from
+     * `new Darts::DoubleArray`, which cannot compile for a by-value member.
+     */
+    StorageBase(const vector<string> file_paths, string dat_cache_path = "")
+        : m_file_paths(file_paths), m_dat_cache_path(dat_cache_path)
+    {
+        static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
+    }
+
+    /**
+     * Attaches the cache file, building it through LoadSourceFile() first if
+     * no valid one exists. VERSION is appended to the cache path on every call.
+     */
+    virtual void Init()
+    {
+        int file_size_sum = 0;
+        const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
+        m_total_dict_size = file_size_sum;
+
+        if (m_dat_cache_path.empty()) {
+            // No cache location was specified: default to the tmp directory.
+            m_dat_cache_path = "/tmp/" + md5 + ".dat_";
+        }
+        m_dat_cache_path += VERSION;
+        if (InitAttachDat(m_dat_cache_path, md5)) {
+            return;
+        }
+
+        // Build the double-array trie and write it to the dat file.
+        LoadSourceFile(m_dat_cache_path, md5);
+
+        bool build_ret = InitAttachDat(m_dat_cache_path, md5);
+
+        assert(build_ret);
+        (void) build_ret; // keeps NDEBUG builds free of an unused-variable warning
+    }
+
+    /// Builds the cache file at dat_cache_file from the source files; md5 is the digest Init() computed.
+    virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
+
+    virtual ~StorageBase()
+    {
+        ReleaseMapping();
+    }
+#ifndef USE_DARTS
+    /// Forwards to cedar::da::update().
+    inline int Update(const char* key, size_t len, int val)
+    {
+        return m_double_array_data_trie.update(key, len, val);
+    }
+#endif
+    /// Forwards to the trie's commonPrefixSearch().
+    inline size_t CommonPrefixSearch(const char* key, result_pair_type* result, size_t result_len) const
+    {
+        return m_double_array_data_trie.commonPrefixSearch(key, result, result_len);
+    }
+
+    /// Forwards to the trie's exactMatchSearch<int>().
+    inline int ExactMatchSearch(const char* key, size_t len) const
+    {
+        return m_double_array_data_trie.template exactMatchSearch<int>(key, len);
+    }
+
+    inline const void * GetDataTrieArray()
+    {
+        return m_double_array_data_trie.array();
+    }
+
+    inline int GetDataTrieSize()
+    {
+        return m_double_array_data_trie.size();
+    }
+
+    inline int GetDataTrieTotalSize()
+    {
+        return m_double_array_data_trie.total_size();
+    }
+
+    /// Start of the mapping viewed as the header; nullptr while nothing is attached.
+    inline cache_file_header * GetCacheFileHeaderPtr() const
+    {
+        return reinterpret_cast<header_type*>(m_mmap_addr);
+    }
+
+    /// First element record inside the mapping; nullptr while nothing is attached.
+    inline const element_ptr_type * GetElementPtr() const
+    {
+        return m_elements_ptr;
+    }
+
+private:
+    StorageBase() = delete;
+    StorageBase(const StorageBase&) = delete;
+    StorageBase& operator = (const StorageBase&) = delete;
+
+    /// Drops the current attachment, if any, and resets the bookkeeping members.
+    void ReleaseMapping()
+    {
+        if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
+            munmap(m_mmap_addr, m_mmap_length);
+        }
+        m_mmap_addr = nullptr;
+        m_mmap_length = 0;
+        m_elements_ptr = nullptr;
+        if (m_mmap_fd >= 0) {
+            close(m_mmap_fd);
+        }
+        m_mmap_fd = -1;
+    }
+
+    /**
+     * Maps dat_cache_file read-only and points the trie at the array stored in it.
+     *
+     * The file is accepted only if it is at least one header long, its header
+     * MD5 equals md5, and its total length equals
+     * header + elements_size + dat_size * unit_size().
+     * All work happens on locals and is committed to the members only on
+     * success, so a failed attempt never leaves MAP_FAILED or a stale
+     * descriptor behind.
+     *
+     * @return true when the file was attached, false otherwise.
+     */
+    bool InitAttachDat(const string &dat_cache_file, const string &md5)
+    {
+        const int fd = open(dat_cache_file.c_str(), O_RDONLY);
+        if (fd < 0) {
+            return false;
+        }
+
+        const auto seek_off = lseek(fd, 0, SEEK_END);
+        // Checking the size before mapping also avoids a zero-length mmap.
+        if (seek_off < 0 || static_cast<size_t>(seek_off) < sizeof(header_type)) {
+            close(fd);
+            return false;
+        }
+
+        const size_t length = static_cast<size_t>(seek_off);
+        void * mapped = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+        if (mapped == MAP_FAILED) {
+            close(fd);
+            return false;
+        }
+
+        char * base = static_cast<char *>(mapped);
+        const header_type & header = *reinterpret_cast<const header_type*>(base);
+
+        const bool md5_matches = (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()));
+        const bool size_matches =
+            (length == sizeof(header_type) + header.elements_size + header.dat_size * m_double_array_data_trie.unit_size());
+        if (!md5_matches || !size_matches) {
+            munmap(mapped, length);
+            close(fd);
+            return false;
+        }
+
+        // Commit: release any earlier attachment, then adopt the new mapping.
+        ReleaseMapping();
+        m_mmap_fd = fd;
+        m_mmap_length = length;
+        m_mmap_addr = base;
+
+        m_elements_ptr = (const element_ptr_type *)(m_mmap_addr + sizeof(header_type));
+        const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
+        this->m_double_array_data_trie.set_array(const_cast<char *>(dat_ptr), header.dat_size);
+        return true;
+    }
+
+    vector<string> m_file_paths;
+    string m_dat_cache_path;
+
+#ifdef USE_DARTS
+    Darts::DoubleArray m_double_array_data_trie;
+#else
+    cedar::da<int, -1, -2, ordered> m_double_array_data_trie;
+#endif
+
+    const element_ptr_type * m_elements_ptr = nullptr;
+    int    m_mmap_fd = -1;
+    size_t    m_mmap_length = 0;
+    char * m_mmap_addr = nullptr;
+    int    m_total_dict_size = 0;
+
+};
+
+#endif // STORAGEBASE_H
--- a/libchinese-segmentation/test/CMakeLists.txt
+++ b/libchinese-segmentation/test/CMakeLists.txt
@ -0,0 +1,19 @@
+# Qt Widgets demo executable for the chinese-segmentation library.
+
+# Let CMake run Qt's moc, uic and rcc automatically for targets defined below.
+set(CMAKE_AUTOMOC ON)
+set(CMAKE_AUTOUIC ON)
+set(CMAKE_AUTORCC ON)
+
+# Resolve whichever Qt major version is installed, then load its components.
+find_package(QT NAMES Qt6 Qt5 COMPONENTS Core Gui Widgets REQUIRED)
+find_package(Qt${QT_VERSION_MAJOR} COMPONENTS Core Gui Widgets REQUIRED)
+
+# NOTE(review): CMake may treat the target name `test` as reserved once CTest
+# is enabled - confirm this project never enables testing before this point.
+add_executable(test)
+target_sources(test PRIVATE
+        main.cpp
+        mainwindow.cpp
+        mainwindow.h
+        mainwindow.ui
+        )
+
+# The library headers live one directory above this one.
+target_include_directories(test PRIVATE
+        "${CMAKE_CURRENT_SOURCE_DIR}/..")
+
+target_link_libraries(test PRIVATE
+        Qt${QT_VERSION_MAJOR}::Core
+        Qt${QT_VERSION_MAJOR}::Gui
+        Qt${QT_VERSION_MAJOR}::Widgets
+        chinese-segmentation
+        )
--- a/libchinese-segmentation/test/main.cpp
+++ b/libchinese-segmentation/test/main.cpp
@ -0,0 +1,11 @@
+#include "mainwindow.h"
+
+#include <QApplication>
+
+// Demo entry point: creates the application object, shows the main window and
+// returns the exit code of the event loop.
+int main(int argc, char *argv[])
+{
+    QApplication app(argc, argv);
+
+    MainWindow window;
+    window.show();
+
+    const int exitCode = app.exec();
+    return exitCode;
+}
--- a/libchinese-segmentation/test/mainwindow.cpp
+++ b/libchinese-segmentation/test/mainwindow.cpp
@ -0,0 +1,96 @@
+#include "mainwindow.h"
+#include "ui_mainwindow.h"
+#include "hanzi-to-pinyin.h"
+#include "chinese-segmentation.h"
+#include "Traditional-to-Simplified.h"
+#include <QMenu>
+#include <QDebug>
+#include <QStringList>
+
+// Sets up the generated UI, fills the tool button's menu with the five pinyin
+// style entries, wires the signal handlers and focuses the input line edit.
+MainWindow::MainWindow(QWidget *parent)
+    : QMainWindow(parent)
+    , ui(new Ui::MainWindow)
+{
+    ui->setupUi(this);
+
+    // Menu entries, in display order; initconnections() compares against these texts.
+    static const char *const kStyleNames[] = {
+        "Default", "Tone", "Tone2", "Tone3", "FirstLetter"
+    };
+    QMenu *styleMenu = new QMenu(this);
+    for (const char *styleName : kStyleNames) {
+        styleMenu->addAction(styleName);
+    }
+    ui->toolButton->setMenu(styleMenu);
+
+    initconnections();
+    ui->lineEdit_2->setFocus();
+}
+
+// Frees the Ui::MainWindow object allocated in the constructor.
+MainWindow::~MainWindow()
+{
+    delete this->ui;
+}
+
+// Wires the two interactive controls of the demo window:
+//  - picking a menu entry stores its text in m_action and shows it on the tool button;
+//  - pressing the push button configures HanZiToPinYin from the current
+//    selections, then fills the pinyin, segmentation and simplified-Chinese
+//    result fields from the input text.
+// Both lambdas are connected with `this` as context object so the connections
+// are dropped together with the window.
+void MainWindow::initconnections()
+{
+    connect(ui->toolButton->menu(), &QMenu::triggered, this, [this](QAction *action) {
+        qDebug() << "tool button:" << action->text();
+        m_action = action->text();
+        ui->toolButton->setText(action->text());
+    });
+
+    connect(ui->pushButton, &QPushButton::pressed, this, [this]() {
+        // m_action stays empty until a menu entry has been triggered, so start
+        // from Default; previously dataStyle was read uninitialized in that case.
+        PinyinDataStyle dataStyle = PinyinDataStyle::Default;
+        if (m_action == "Tone") {
+            dataStyle = PinyinDataStyle::Tone;
+        } else if (m_action == "Tone2") {
+            dataStyle = PinyinDataStyle::Tone2;
+        } else if (m_action == "Tone3") {
+            dataStyle = PinyinDataStyle::Tone3;
+        } else if (m_action == "FirstLetter") {
+            dataStyle = PinyinDataStyle::FirstLetter;
+        }
+
+        // A checked box selects NoSegmentation; unchecked selects Segmentation.
+        const SegType segType = ui->checkSegBox->isChecked()
+                ? SegType::NoSegmentation
+                : SegType::Segmentation;
+
+        const PolyphoneType polyType = ui->checkPolyBox_2->isChecked()
+                ? PolyphoneType::Enable
+                : PolyphoneType::Disable;
+
+        const ExDataProcessType exType = ui->checkExBox_3->isChecked()
+                ? ExDataProcessType::Default
+                : ExDataProcessType::Delete;
+
+        HanZiToPinYin::getInstance()->setConfig(dataStyle, segType, polyType, exType);
+
+        ui->lineEdit_4->clear();
+        const QString text = ui->lineEdit_2->text();
+        qDebug() << "input:" << text;
+
+        // Pinyin conversion result.
+        QStringList list;
+        HanZiToPinYin::getInstance()->getResults(text.toStdString(), list);
+
+        ui->lineEdit_4->setText(list.join(" "));
+        qDebug() << "result:" << list.join(" ");
+
+        // Word segmentation result, words separated by '/'.
+        const auto segments = ChineseSegmentation::getInstance()->callSegment(text.toStdString());
+
+        list.clear();
+        for (const auto &info : segments) {
+            list.append(QString::fromStdString(info.word));
+        }
+        ui->lineEdit_6->setText(list.join("/"));
+
+        // Traditional-to-simplified conversion result.
+        const auto simplified = Traditional2Simplified::getInstance()->getResults(text.toStdString());
+
+        ui->lineEdit_7->setText(QString::fromStdString(simplified));
+    });
+}
+
--- a/libchinese-segmentation/test/mainwindow.h
+++ b/libchinese-segmentation/test/mainwindow.h
@ -0,0 +1,23 @@
+#ifndef MAINWINDOW_H
+#define MAINWINDOW_H
+
+#include <QtWidgets>
+
+QT_BEGIN_NAMESPACE
+namespace Ui { class MainWindow; }
+QT_END_NAMESPACE
+
+// Main window of the demo application; its widgets come from the generated
+// Ui::MainWindow class.
+class MainWindow : public QMainWindow
+{
+    Q_OBJECT
+
+public:
+    MainWindow(QWidget *parent = nullptr);
+    ~MainWindow() override;
+
+private:
+    // Connects the menu and button signals to their handlers.
+    void initconnections();
+
+private:
+    Ui::MainWindow *ui;   // allocated in the constructor, deleted in the destructor
+    QString m_action;     // text of the most recently triggered menu entry
+};
+#endif // MAINWINDOW_H
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`Subproject commit f7aa56a30705c2635b0d4237efb635e8fee5022a`