删除子项目，upstream分支将只作为打包时同步代码的分支

2023-12-28 10:27:40 +08:00 · 2023-12-28 10:27:40 +08:00 · 62b82585a9
parent 8f71e3e7c6
commit 62b82585a9
102 changed files with 1247385 additions and 4 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +0,0 @@
 [submodule "libchinese-segmentation"]
 	path = libchinese-segmentation
 	url = https://gitee.com/openkylin/chinese-segmentation.git
--- a/1
+++ b/1
@ -1 +0,0 @@
 Subproject commit f7aa56a30705c2635b0d4237efb635e8fee5022a
--- a/libchinese-segmentation/CMakeLists.txt
+++ b/libchinese-segmentation/CMakeLists.txt
@ -0,0 +1,169 @@
 # Build script for the chinese-segmentation shared library (C++ / Qt Core).
 cmake_minimum_required(VERSION 3.14)
 project(chinese-segmentation LANGUAGES CXX)
 # Directory-wide build defaults.
 # NOTE(review): no CMAKE_CXX_STANDARD is set in the visible part of this file,
 # so the REQUIRED flag may have nothing to enforce here - confirm.
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 # Library version, kept as separate components so the major number can be
 # reused on its own (it is the SOVERSION of the target further down).
 set(VERSION_MAJOR 1)
 set(VERSION_MINOR 1)
 set(VERSION_MICRO 0)
 set(CHINESE_SEGMENTATION_VERSION "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_MICRO}")
 # Locate whichever Qt major version is available (Qt6 is tried first), then
 # load the Core component of exactly that version.
 find_package(QT NAMES Qt6 Qt5 REQUIRED COMPONENTS Core)
 find_package(Qt${QT_VERSION_MAJOR} REQUIRED COMPONENTS Core)
 # Headers that are both attached to the library target and copied to
 # HEADERS_INSTALL_DIR by the install(FILES ...) rule later in this file.
 set(HEADERS
     chinese-segmentation.h
     common-struct.h
     hanzi-to-pinyin.h
     Traditional-to-Simplified.h
     pinyin4cpp-common.h
     libchinese-segmentation_global.h
 )
 # Implementation sources and internal headers of the library.
 # The entries are kept in their original order; the comments only mark the
 # directory each group of files lives in.
 set(CHINESE_SEGMENTATION_SRC
     # Traditional <-> Simplified conversion
     Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
     Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
     Traditional-to-Simplified.cpp
     Traditional-to-Simplified-private.h
     # Segmentation front end
     chinese-segmentation.cpp
     chinese-segmentation-private.h
     # cppjieba
     cppjieba/DatTrie.hpp
     cppjieba/DictTrie.hpp
     cppjieba/FullSegment.hpp
     cppjieba/HMMModel.hpp
     cppjieba/HMMSegment.hpp
     cppjieba/IdfTrie.hpp
     cppjieba/Jieba.hpp
     cppjieba/KeywordExtractor.hpp
     cppjieba/MPSegment.hpp
     cppjieba/MixSegment.hpp
     cppjieba/PinYinTrie.hpp
     cppjieba/PosTagger.hpp
     cppjieba/PreFilter.hpp
     cppjieba/QuerySegment.hpp
     cppjieba/SegmentBase.hpp
     cppjieba/SegmentTagged.hpp
     cppjieba/TextRankExtractor.hpp
     cppjieba/Unicode.hpp
     cppjieba/idf-trie/idf-trie.cpp
     cppjieba/idf-trie/idf-trie.h
     # cppjieba/limonp
     cppjieba/limonp/ArgvContext.hpp
     cppjieba/limonp/BlockingQueue.hpp
     cppjieba/limonp/BoundedBlockingQueue.hpp
     cppjieba/limonp/BoundedQueue.hpp
     cppjieba/limonp/Closure.hpp
     cppjieba/limonp/Colors.hpp
     cppjieba/limonp/Condition.hpp
     cppjieba/limonp/Config.hpp
     cppjieba/limonp/FileLock.hpp
     cppjieba/limonp/ForcePublic.hpp
     cppjieba/limonp/LocalVector.hpp
     cppjieba/limonp/Logging.hpp
     cppjieba/limonp/Md5.hpp
     cppjieba/limonp/MutexLock.hpp
     cppjieba/limonp/NonCopyable.hpp
     cppjieba/limonp/StdExtension.hpp
     cppjieba/limonp/StringUtil.hpp
     cppjieba/limonp/Thread.hpp
     cppjieba/limonp/ThreadPool.hpp
     # cppjieba/segment-trie
     cppjieba/segment-trie/segment-trie.cpp
     cppjieba/segment-trie/segment-trie.h
     # Hanzi -> pinyin front end
     hanzi-to-pinyin.cpp
     hanzi-to-pinyin-private.h
     # pinyin4cpp
     pinyin4cpp/pinyin4cpp-trie.cpp
     pinyin4cpp/pinyin4cpp-trie.h
     pinyin4cpp/pinyin4cpp_dataTrie.cpp
     pinyin4cpp/pinyin4cpp_dataTrie.h
     pinyin4cpp/pinyin4cpp_dictTrie.cpp
     pinyin4cpp/pinyin4cpp_dictTrie.h
     # storage-base
     storage-base/cedar/cedar.h
     storage-base/cedar/cedarpp.h
     storage-base/darts-clone/darts.h
     storage-base/storage-base.cpp
     storage-base/storage-base.h
     storage-base/storage-base.hpp
 )
 # Shared library target, built from the implementation sources plus the
 # public headers.
 add_library(chinese-segmentation SHARED
     ${CHINESE_SEGMENTATION_SRC}
     ${HEADERS}
 )
 # Header search paths for the bundled components.
 # These used to be passed to include_directories() with the target name as
 # the first argument; that command has no target parameter, so the name was
 # taken as one more directory and the paths applied to the whole directory
 # scope. They are now attached to the target itself. BUILD_INTERFACE keeps the
 # source-tree paths out of the exported/installed target while still handing
 # them to build-tree targets that link against the library.
 # NOTE(review): a target that relied on these paths without linking to
 # chinese-segmentation must now add them itself - confirm none exists.
 target_include_directories(chinese-segmentation
     PUBLIC
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base/cedar>"
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/storage-base>"
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba>"
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/cppjieba/limonp>"
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/pinyin4cpp>"
         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/Traditional-Chinese-Simplified-conversion>"
 )
 # Qt Core is linked PUBLIC, so it is propagated to consumers of the library.
 target_link_libraries(chinese-segmentation PUBLIC
     Qt${QT_VERSION_MAJOR}::Core
 )
 # Provides configure_package_config_file() and
 # write_basic_package_version_file(), both used below.
 include(CMakePackageConfigHelpers)
 # Install locations. All of them are absolute paths under /usr, so they do
 # not follow CMAKE_INSTALL_PREFIX.
 # NOTE(review): presumably intentional for distro packaging - confirm before
 # switching to prefix-relative paths.
 set(CMAKE_CONFIG_INSTALL_DIR "/usr/share/cmake/chinese-segmentation")
 set(HEADERS_INSTALL_DIR "/usr/include/chinese-segmentation")
 set(PC_INSTALL_DIR "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}/pkgconfig")
 set(DICT_INSTALL_PATH "/usr/share/chinese-segmentation/res/dict")
 # Pass the library version and the dictionary install directory to the
 # compiler as string-literal macros. This is a directory-scoped command, so
 # the definitions also reach targets created in subdirectories added after
 # this point.
 add_compile_definitions(
        VERSION="${CHINESE_SEGMENTATION_VERSION}"
        DICT_INSTALL_PATH="${DICT_INSTALL_PATH}"
 )
 # Consumers of the installed/exported target get the installed header
 # directory on their include path.
 target_include_directories(chinese-segmentation PUBLIC  $<INSTALL_INTERFACE:${HEADERS_INSTALL_DIR}>)
 # Render the CMake package configuration file into the build directory.
 configure_package_config_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation-config.cmake.in"
     "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
     INSTALL_DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
 )
 # Companion version file: find_package() accepts any request with the same
 # major version.
 write_basic_package_version_file(
     "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake"
     VERSION ${CHINESE_SEGMENTATION_VERSION}
     COMPATIBILITY SameMajorVersion
 )
 # Render the pkg-config file. A .pc file is not a CMake package configuration
 # file, so it is produced with plain configure_file() rather than
 # configure_package_config_file(); @ONLY restricts substitution to @VAR@
 # references so that pkg-config's own ${var} syntax is left untouched.
 # NOTE(review): assumes the template does not use @PACKAGE_INIT@ or
 # @PACKAGE_...@ placeholders - confirm against chinese-segmentation.pc.in.
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/chinese-segmentation.pc.in"
     "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
     @ONLY
 )
 # Versioning and naming of the produced shared object: the full version goes
 # into VERSION, the major component alone into SOVERSION.
 set_property(TARGET chinese-segmentation PROPERTY VERSION "${CHINESE_SEGMENTATION_VERSION}")
 set_property(TARGET chinese-segmentation PROPERTY SOVERSION "${VERSION_MAJOR}")
 set_property(TARGET chinese-segmentation PROPERTY OUTPUT_NAME "chinese-segmentation")
 # Install the shared library and record it in the "chinese-segmentation"
 # export set.
 # NOTE(review): no PUBLIC_HEADER property is set on the target in the visible
 # part of this file, so the PUBLIC_HEADER destination presumably installs
 # nothing; the headers are installed by the install(FILES ...) rule below.
 install(
     TARGETS chinese-segmentation
     EXPORT chinese-segmentation
     PUBLIC_HEADER DESTINATION "${HEADERS_INSTALL_DIR}"
     LIBRARY DESTINATION "/usr/lib/${CMAKE_LIBRARY_ARCHITECTURE}"
 )
 # Write the export set as the targets file of the CMake package.
 install(
     EXPORT chinese-segmentation
     FILE chinese-segmentation-targets.cmake
     DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
 )
 # Public headers.
 install(
     FILES ${HEADERS}
     DESTINATION "${HEADERS_INSTALL_DIR}"
 )
 # Generated pkg-config file.
 install(
     FILES "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation.pc"
     DESTINATION "${PC_INSTALL_DIR}"
 )
 # Generated CMake package config and version files.
 install(
     FILES
         "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config.cmake"
         "${CMAKE_CURRENT_BINARY_DIR}/chinese-segmentation-config-version.cmake"
     DESTINATION "${CMAKE_CONFIG_INSTALL_DIR}"
 )
 # Dictionary and model data files. install(FILES ...) copies every entry
 # directly into DICT_INSTALL_PATH, i.e. the source subdirectories are not
 # reproduced at the destination.
 set(DICT_FILES
     dict/hmm_model.utf8
     dict/idf.utf8
     dict/jieba.dict.utf8
     dict/stop_words.utf8
     dict/user.dict.utf8
     dict/pinyinWithoutTone.txt
     dict/pos_dict/char_state_tab.utf8
     dict/pos_dict/prob_emit.utf8
     dict/pos_dict/prob_start.utf8
     dict/pos_dict/prob_trans.utf8
     pinyin4cpp/dict/singleWordPinyin.txt
     pinyin4cpp/dict/wordsPinyin.txt
     Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
 )
 install(
     FILES ${DICT_FILES}
     DESTINATION "${DICT_INSTALL_PATH}"
 )
 # Opt-in switch for the test suite. Declaring it as an option gives it an
 # explicit default (OFF, matching the previous behaviour when the variable
 # was undefined) and makes it visible in cmake-gui/ccmake; a value supplied
 # with -DBUILD_TEST=... is kept as-is.
 option(BUILD_TEST "Build the tests in the test/ subdirectory" OFF)
 if(BUILD_TEST)
     add_subdirectory(test)
 endif()
--- a/libchinese-segmentation/LICENSE
+++ b/libchinese-segmentation/LICENSE
@ -0,0 +1,674 @@
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007
 Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.
                            Preamble
  The GNU General Public License is a free, copyleft license for
 software and other kinds of works.
  The licenses for most software and other practical works are designed
 to take away your freedom to share and change the works.  By contrast,
 the GNU General Public License is intended to guarantee your freedom to
 share and change all versions of a program--to make sure it remains free
 software for all its users.  We, the Free Software Foundation, use the
 GNU General Public License for most of our software; it applies also to
 any other work released this way by its authors.  You can apply it to
 your programs, too.
  When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
 them if you wish), that you receive source code or can get it if you
 want it, that you can change the software or use pieces of it in new
 free programs, and that you know you can do these things.
  To protect your rights, we need to prevent others from denying you
 these rights or asking you to surrender the rights.  Therefore, you have
 certain responsibilities if you distribute copies of the software, or if
 you modify it: responsibilities to respect the freedom of others.
  For example, if you distribute copies of such a program, whether
 gratis or for a fee, you must pass on to the recipients the same
 freedoms that you received.  You must make sure that they, too, receive
 or can get the source code.  And you must show them these terms so they
 know their rights.
  Developers that use the GNU GPL protect your rights with two steps:
 (1) assert copyright on the software, and (2) offer you this License
 giving you legal permission to copy, distribute and/or modify it.
  For the developers' and authors' protection, the GPL clearly explains
 that there is no warranty for this free software.  For both users' and
 authors' sake, the GPL requires that modified versions be marked as
 changed, so that their problems will not be attributed erroneously to
 authors of previous versions.
  Some devices are designed to deny users access to install or run
 modified versions of the software inside them, although the manufacturer
 can do so.  This is fundamentally incompatible with the aim of
 protecting users' freedom to change the software.  The systematic
 pattern of such abuse occurs in the area of products for individuals to
 use, which is precisely where it is most unacceptable.  Therefore, we
 have designed this version of the GPL to prohibit the practice for those
 products.  If such problems arise substantially in other domains, we
 stand ready to extend this provision to those domains in future versions
 of the GPL, as needed to protect the freedom of users.
  Finally, every program is threatened constantly by software patents.
 States should not allow patents to restrict development and use of
 software on general-purpose computers, but in those that do, we wish to
 avoid the special danger that patents applied to a free program could
 make it effectively proprietary.  To prevent this, the GPL assures that
 patents cannot be used to render the program non-free.
  The precise terms and conditions for copying, distribution and
 modification follow.
                       TERMS AND CONDITIONS
  0. Definitions.
  "This License" refers to version 3 of the GNU General Public License.
  "Copyright" also means copyright-like laws that apply to other kinds of
 works, such as semiconductor masks.
  "The Program" refers to any copyrightable work licensed under this
 License.  Each licensee is addressed as "you".  "Licensees" and
 "recipients" may be individuals or organizations.
  To "modify" a work means to copy from or adapt all or part of the work
 in a fashion requiring copyright permission, other than the making of an
 exact copy.  The resulting work is called a "modified version" of the
 earlier work or a work "based on" the earlier work.
  A "covered work" means either the unmodified Program or a work based
 on the Program.
  To "propagate" a work means to do anything with it that, without
 permission, would make you directly or secondarily liable for
 infringement under applicable copyright law, except executing it on a
 computer or modifying a private copy.  Propagation includes copying,
 distribution (with or without modification), making available to the
 public, and in some countries other activities as well.
  To "convey" a work means any kind of propagation that enables other
 parties to make or receive copies.  Mere interaction with a user through
 a computer network, with no transfer of a copy, is not conveying.
  An interactive user interface displays "Appropriate Legal Notices"
 to the extent that it includes a convenient and prominently visible
 feature that (1) displays an appropriate copyright notice, and (2)
 tells the user that there is no warranty for the work (except to the
 extent that warranties are provided), that licensees may convey the
 work under this License, and how to view a copy of this License.  If
 the interface presents a list of user commands or options, such as a
 menu, a prominent item in the list meets this criterion.
  1. Source Code.
  The "source code" for a work means the preferred form of the work
 for making modifications to it.  "Object code" means any non-source
 form of a work.
  A "Standard Interface" means an interface that either is an official
 standard defined by a recognized standards body, or, in the case of
 interfaces specified for a particular programming language, one that
 is widely used among developers working in that language.
  The "System Libraries" of an executable work include anything, other
 than the work as a whole, that (a) is included in the normal form of
 packaging a Major Component, but which is not part of that Major
 Component, and (b) serves only to enable use of the work with that
 Major Component, or to implement a Standard Interface for which an
 implementation is available to the public in source code form.  A
 "Major Component", in this context, means a major essential component
 (kernel, window system, and so on) of the specific operating system
 (if any) on which the executable work runs, or a compiler used to
 produce the work, or an object code interpreter used to run it.
  The "Corresponding Source" for a work in object code form means all
 the source code needed to generate, install, and (for an executable
 work) run the object code and to modify the work, including scripts to
 control those activities.  However, it does not include the work's
 System Libraries, or general-purpose tools or generally available free
 programs which are used unmodified in performing those activities but
 which are not part of the work.  For example, Corresponding Source
 includes interface definition files associated with source files for
 the work, and the source code for shared libraries and dynamically
 linked subprograms that the work is specifically designed to require,
 such as by intimate data communication or control flow between those
 subprograms and other parts of the work.
  The Corresponding Source need not include anything that users
 can regenerate automatically from other parts of the Corresponding
 Source.
  The Corresponding Source for a work in source code form is that
 same work.
  2. Basic Permissions.
  All rights granted under this License are granted for the term of
 copyright on the Program, and are irrevocable provided the stated
 conditions are met.  This License explicitly affirms your unlimited
 permission to run the unmodified Program.  The output from running a
 covered work is covered by this License only if the output, given its
 content, constitutes a covered work.  This License acknowledges your
 rights of fair use or other equivalent, as provided by copyright law.
  You may make, run and propagate covered works that you do not
 convey, without conditions so long as your license otherwise remains
 in force.  You may convey covered works to others for the sole purpose
 of having them make modifications exclusively for you, or provide you
 with facilities for running those works, provided that you comply with
 the terms of this License in conveying all material for which you do
 not control copyright.  Those thus making or running the covered works
 for you must do so exclusively on your behalf, under your direction
 and control, on terms that prohibit them from making any copies of
 your copyrighted material outside their relationship with you.
  Conveying under any other circumstances is permitted solely under
 the conditions stated below.  Sublicensing is not allowed; section 10
 makes it unnecessary.
  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
  No covered work shall be deemed part of an effective technological
 measure under any applicable law fulfilling obligations under article
 11 of the WIPO copyright treaty adopted on 20 December 1996, or
 similar laws prohibiting or restricting circumvention of such
 measures.
  When you convey a covered work, you waive any legal power to forbid
 circumvention of technological measures to the extent such circumvention
 is effected by exercising rights under this License with respect to
 the covered work, and you disclaim any intention to limit operation or
 modification of the work as a means of enforcing, against the work's
 users, your or third parties' legal rights to forbid circumvention of
 technological measures.
  4. Conveying Verbatim Copies.
  You may convey verbatim copies of the Program's source code as you
 receive it, in any medium, provided that you conspicuously and
 appropriately publish on each copy an appropriate copyright notice;
 keep intact all notices stating that this License and any
 non-permissive terms added in accord with section 7 apply to the code;
 keep intact all notices of the absence of any warranty; and give all
 recipients a copy of this License along with the Program.
  You may charge any price or no price for each copy that you convey,
 and you may offer support or warranty protection for a fee.
  5. Conveying Modified Source Versions.
  You may convey a work based on the Program, or the modifications to
 produce it from the Program, in the form of source code under the
 terms of section 4, provided that you also meet all of these conditions:
    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.
    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".
    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.
    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.
  A compilation of a covered work with other separate and independent
 works, which are not by their nature extensions of the covered work,
 and which are not combined with it such as to form a larger program,
 in or on a volume of a storage or distribution medium, is called an
 "aggregate" if the compilation and its resulting copyright are not
 used to limit the access or legal rights of the compilation's users
 beyond what the individual works permit.  Inclusion of a covered work
 in an aggregate does not cause this License to apply to the other
 parts of the aggregate.
  6. Conveying Non-Source Forms.
  You may convey a covered work in object code form under the terms
 of sections 4 and 5, provided that you also convey the
 machine-readable Corresponding Source under the terms of this License,
 in one of these ways:
    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.
    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.
    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.
    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.
    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.
  A separable portion of the object code, whose source code is excluded
 from the Corresponding Source as a System Library, need not be
 included in conveying the object code work.
  A "User Product" is either (1) a "consumer product", which means any
 tangible personal property which is normally used for personal, family,
 or household purposes, or (2) anything designed or sold for incorporation
 into a dwelling.  In determining whether a product is a consumer product,
 doubtful cases shall be resolved in favor of coverage.  For a particular
 product received by a particular user, "normally used" refers to a
 typical or common use of that class of product, regardless of the status
 of the particular user or of the way in which the particular user
 actually uses, or expects or is expected to use, the product.  A product
 is a consumer product regardless of whether the product has substantial
 commercial, industrial or non-consumer uses, unless such uses represent
 the only significant mode of use of the product.
  "Installation Information" for a User Product means any methods,
 procedures, authorization keys, or other information required to install
 and execute modified versions of a covered work in that User Product from
 a modified version of its Corresponding Source.  The information must
 suffice to ensure that the continued functioning of the modified object
 code is in no case prevented or interfered with solely because
 modification has been made.
  If you convey an object code work under this section in, or with, or
 specifically for use in, a User Product, and the conveying occurs as
 part of a transaction in which the right of possession and use of the
 User Product is transferred to the recipient in perpetuity or for a
 fixed term (regardless of how the transaction is characterized), the
 Corresponding Source conveyed under this section must be accompanied
 by the Installation Information.  But this requirement does not apply
 if neither you nor any third party retains the ability to install
 modified object code on the User Product (for example, the work has
 been installed in ROM).
  The requirement to provide Installation Information does not include a
 requirement to continue to provide support service, warranty, or updates
 for a work that has been modified or installed by the recipient, or for
 the User Product in which it has been modified or installed.  Access to a
 network may be denied when the modification itself materially and
 adversely affects the operation of the network or violates the rules and
 protocols for communication across the network.
  Corresponding Source conveyed, and Installation Information provided,
 in accord with this section must be in a format that is publicly
 documented (and with an implementation available to the public in
 source code form), and must require no special password or key for
 unpacking, reading or copying.
  7. Additional Terms.
  "Additional permissions" are terms that supplement the terms of this
 License by making exceptions from one or more of its conditions.
 Additional permissions that are applicable to the entire Program shall
 be treated as though they were included in this License, to the extent
 that they are valid under applicable law.  If additional permissions
 apply only to part of the Program, that part may be used separately
 under those permissions, but the entire Program remains governed by
 this License without regard to the additional permissions.
  When you convey a copy of a covered work, you may at your option
 remove any additional permissions from that copy, or from any part of
 it.  (Additional permissions may be written to require their own
 removal in certain cases when you modify the work.)  You may place
 additional permissions on material, added by you to a covered work,
 for which you have or can give appropriate copyright permission.
  Notwithstanding any other provision of this License, for material you
 add to a covered work, you may (if authorized by the copyright holders of
 that material) supplement the terms of this License with terms:
    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or
    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or
    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or
    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or
    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or
    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.
  All other non-permissive additional terms are considered "further
 restrictions" within the meaning of section 10.  If the Program as you
 received it, or any part of it, contains a notice stating that it is
 governed by this License along with a term that is a further
 restriction, you may remove that term.  If a license document contains
 a further restriction but permits relicensing or conveying under this
 License, you may add to a covered work material governed by the terms
 of that license document, provided that the further restriction does
 not survive such relicensing or conveying.
  If you add terms to a covered work in accord with this section, you
 must place, in the relevant source files, a statement of the
 additional terms that apply to those files, or a notice indicating
 where to find the applicable terms.
  Additional terms, permissive or non-permissive, may be stated in the
 form of a separately written license, or stated as exceptions;
 the above requirements apply either way.
  8. Termination.
  You may not propagate or modify a covered work except as expressly
 provided under this License.  Any attempt otherwise to propagate or
 modify it is void, and will automatically terminate your rights under
 this License (including any patent licenses granted under the third
 paragraph of section 11).
  However, if you cease all violation of this License, then your
 license from a particular copyright holder is reinstated (a)
 provisionally, unless and until the copyright holder explicitly and
 finally terminates your license, and (b) permanently, if the copyright
 holder fails to notify you of the violation by some reasonable means
 prior to 60 days after the cessation.
  Moreover, your license from a particular copyright holder is
 reinstated permanently if the copyright holder notifies you of the
 violation by some reasonable means, this is the first time you have
 received notice of violation of this License (for any work) from that
 copyright holder, and you cure the violation prior to 30 days after
 your receipt of the notice.
  Termination of your rights under this section does not terminate the
 licenses of parties who have received copies or rights from you under
 this License.  If your rights have been terminated and not permanently
 reinstated, you do not qualify to receive new licenses for the same
 material under section 10.
  9. Acceptance Not Required for Having Copies.
  You are not required to accept this License in order to receive or
 run a copy of the Program.  Ancillary propagation of a covered work
 occurring solely as a consequence of using peer-to-peer transmission
 to receive a copy likewise does not require acceptance.  However,
 nothing other than this License grants you permission to propagate or
 modify any covered work.  These actions infringe copyright if you do
 not accept this License.  Therefore, by modifying or propagating a
 covered work, you indicate your acceptance of this License to do so.
  10. Automatic Licensing of Downstream Recipients.
  Each time you convey a covered work, the recipient automatically
 receives a license from the original licensors, to run, modify and
 propagate that work, subject to this License.  You are not responsible
 for enforcing compliance by third parties with this License.
  An "entity transaction" is a transaction transferring control of an
 organization, or substantially all assets of one, or subdividing an
 organization, or merging organizations.  If propagation of a covered
 work results from an entity transaction, each party to that
 transaction who receives a copy of the work also receives whatever
 licenses to the work the party's predecessor in interest had or could
 give under the previous paragraph, plus a right to possession of the
 Corresponding Source of the work from the predecessor in interest, if
 the predecessor has it or can get it with reasonable efforts.
  You may not impose any further restrictions on the exercise of the
 rights granted or affirmed under this License.  For example, you may
 not impose a license fee, royalty, or other charge for exercise of
 rights granted under this License, and you may not initiate litigation
 (including a cross-claim or counterclaim in a lawsuit) alleging that
 any patent claim is infringed by making, using, selling, offering for
 sale, or importing the Program or any portion of it.
  11. Patents.
  A "contributor" is a copyright holder who authorizes use under this
 License of the Program or a work on which the Program is based.  The
 work thus licensed is called the contributor's "contributor version".
  A contributor's "essential patent claims" are all patent claims
 owned or controlled by the contributor, whether already acquired or
 hereafter acquired, that would be infringed by some manner, permitted
 by this License, of making, using, or selling its contributor version,
 but do not include claims that would be infringed only as a
 consequence of further modification of the contributor version.  For
 purposes of this definition, "control" includes the right to grant
 patent sublicenses in a manner consistent with the requirements of
 this License.
  Each contributor grants you a non-exclusive, worldwide, royalty-free
 patent license under the contributor's essential patent claims, to
 make, use, sell, offer for sale, import and otherwise run, modify and
 propagate the contents of its contributor version.
  In the following three paragraphs, a "patent license" is any express
 agreement or commitment, however denominated, not to enforce a patent
 (such as an express permission to practice a patent or covenant not to
 sue for patent infringement).  To "grant" such a patent license to a
 party means to make such an agreement or commitment not to enforce a
 patent against the party.
  If you convey a covered work, knowingly relying on a patent license,
 and the Corresponding Source of the work is not available for anyone
 to copy, free of charge and under the terms of this License, through a
 publicly available network server or other readily accessible means,
 then you must either (1) cause the Corresponding Source to be so
 available, or (2) arrange to deprive yourself of the benefit of the
 patent license for this particular work, or (3) arrange, in a manner
 consistent with the requirements of this License, to extend the patent
 license to downstream recipients.  "Knowingly relying" means you have
 actual knowledge that, but for the patent license, your conveying the
 covered work in a country, or your recipient's use of the covered work
 in a country, would infringe one or more identifiable patents in that
 country that you have reason to believe are valid.
  If, pursuant to or in connection with a single transaction or
 arrangement, you convey, or propagate by procuring conveyance of, a
 covered work, and grant a patent license to some of the parties
 receiving the covered work authorizing them to use, propagate, modify
 or convey a specific copy of the covered work, then the patent license
 you grant is automatically extended to all recipients of the covered
 work and works based on it.
  A patent license is "discriminatory" if it does not include within
 the scope of its coverage, prohibits the exercise of, or is
 conditioned on the non-exercise of one or more of the rights that are
 specifically granted under this License.  You may not convey a covered
 work if you are a party to an arrangement with a third party that is
 in the business of distributing software, under which you make payment
 to the third party based on the extent of your activity of conveying
 the work, and under which the third party grants, to any of the
 parties who would receive the covered work from you, a discriminatory
 patent license (a) in connection with copies of the covered work
 conveyed by you (or copies made from those copies), or (b) primarily
 for and in connection with specific products or compilations that
 contain the covered work, unless you entered into that arrangement,
 or that patent license was granted, prior to 28 March 2007.
  Nothing in this License shall be construed as excluding or limiting
 any implied license or other defenses to infringement that may
 otherwise be available to you under applicable patent law.
  12. No Surrender of Others' Freedom.
  If conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
 excuse you from the conditions of this License.  If you cannot convey a
 covered work so as to satisfy simultaneously your obligations under this
 License and any other pertinent obligations, then as a consequence you may
 not convey it at all.  For example, if you agree to terms that obligate you
 to collect a royalty for further conveying from those to whom you convey
 the Program, the only way you could satisfy both those terms and this
 License would be to refrain entirely from conveying the Program.
  13. Use with the GNU Affero General Public License.
  Notwithstanding any other provision of this License, you have
 permission to link or combine any covered work with a work licensed
 under version 3 of the GNU Affero General Public License into a single
 combined work, and to convey the resulting work.  The terms of this
 License will continue to apply to the part which is the covered work,
 but the special requirements of the GNU Affero General Public License,
 section 13, concerning interaction through a network will apply to the
 combination as such.
  14. Revised Versions of this License.
  The Free Software Foundation may publish revised and/or new versions of
 the GNU General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
  Each version is given a distinguishing version number.  If the
 Program specifies that a certain numbered version of the GNU General
 Public License "or any later version" applies to it, you have the
 option of following the terms and conditions either of that numbered
 version or of any later version published by the Free Software
 Foundation.  If the Program does not specify a version number of the
 GNU General Public License, you may choose any version ever published
 by the Free Software Foundation.
  If the Program specifies that a proxy can decide which future
 versions of the GNU General Public License can be used, that proxy's
 public statement of acceptance of a version permanently authorizes you
 to choose that version for the Program.
  Later license versions may give you additional or different
 permissions.  However, no additional obligations are imposed on any
 author or copyright holder as a result of your choosing to follow a
 later version.
  15. Disclaimer of Warranty.
  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
 APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
 HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
 OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
 IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
 ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
  16. Limitation of Liability.
  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
 WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
 THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
 GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
 USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
 DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
 PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
 EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
 SUCH DAMAGES.
  17. Interpretation of Sections 15 and 16.
  If the disclaimer of warranty and limitation of liability provided
 above cannot be given local legal effect according to their terms,
 reviewing courts shall apply local law that most closely approximates
 an absolute waiver of all civil liability in connection with the
 Program, unless a warranty or assumption of liability accompanies a
 copy of the Program in return for a fee.
                     END OF TERMS AND CONDITIONS
            How to Apply These Terms to Your New Programs
  If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
 free software which everyone can redistribute and change under these terms.
  To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
 state the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 Also add information on how to contact you by electronic and paper mail.
  If the program does terminal interaction, make it output a short
 notice like this when it starts in an interactive mode:
    <program>  Copyright (C) <year>  <name of author>
    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
    This is free software, and you are welcome to redistribute it
    under certain conditions; type `show c' for details.
 The hypothetical commands `show w' and `show c' should show the appropriate
 parts of the General Public License.  Of course, your program's commands
 might be different; for a GUI interface, you would use an "about box".
  You should also get your employer (if you work as a programmer) or school,
 if any, to sign a "copyright disclaimer" for the program, if necessary.
 For more information on this, and how to apply and follow the GNU GPL, see
 <http://www.gnu.org/licenses/>.
  The GNU General Public License does not permit incorporating your program
 into proprietary programs.  If your program is a subroutine library, you
 may consider it more useful to permit linking proprietary applications with
 the library.  If this is what you want to do, use the GNU Lesser General
 Public License instead of this License.  But first, please read
 <http://www.gnu.org/philosophy/why-not-lgpl.html>.
--- a/libchinese-segmentation/README.md
+++ b/libchinese-segmentation/README.md
@ -0,0 +1,170 @@
 # chinese-segmentation
 #### 介绍
 libchinese-segmentation工程以单例的形式分别提供了中文分词、汉字转拼音和中文繁体简体转换功能。
 接口文件分别为:
 chinese-segmentation.h
 libchinese-segmentation_global.h
 common-struct.h
 hanzi-to-pinyin.h
 pinyin4cpp-common.h
 Traditional-to-Simplified.h
 安装路径:/usr/include/chinese-seg
 #### 使用说明
 其中中文分词相关功能由chinese-segmentation.h提供接口，主要包括以下功能函数：
 ```
   static ChineseSegmentation *getInstance();//全局单例
     /**
     * @brief ChineseSegmentation::callSegment
     * 调用extractor进行关键词提取，先使用Mix方式初步分词，再使用Idf词典进行关键词提取，只包含两字以上关键词
     *
     * @param sentence 要提取关键词的句子
     * @return vector<KeyWord> 存放提取后关键词的信息的容器
     */
    vector<KeyWord> callSegment(const string &sentence);
    vector<KeyWord> callSegment(QString &sentence);
    /**
     * @brief ChineseSegmentation::callMixSegmentCutStr
     * 使用Mix方法进行分词，即先使用最大概率法MP初步分词，再用隐式马尔科夫模型HMM进一步分词，可以准确切出词典已有词和未登录词，结果比较准确
     *
     * @param sentence 要分词的句子
     * @return vector<string> 只存放分词后每个词的内容的容器
     */
    vector<string> callMixSegmentCutStr(const string& sentence);
    /**
     * @brief ChineseSegmentation::callMixSegmentCutWord
     * 和callMixSegmentCutStr功能相同
     * @param sentence 要分词的句子
     * @return vector<Word> 存放分词后每个词所有信息的容器
     */
    vector<Word> callMixSegmentCutWord(const string& str);
    /**
     * @brief ChineseSegmentation::lookUpTagOfWord
     * 查询word的词性
     * @param word 要查询词性的词
     * @return string word的词性
     */
    string lookUpTagOfWord(const string& word);
    /**
     * @brief ChineseSegmentation::getTagOfWordsInSentence
     * 使用Mix分词后获取每个词的词性
     * @param sentence 要分词的句子
     * @return vector<pair<string, string>> 分词后的每个词的内容(first)和其对应的词性(second)
     */
    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
    /**
     * @brief ChineseSegmentation::callFullSegment
     * 使用Full进行分词，Full会切出字典里所有的词。
     * @param sentence 要分词的句子
     * @return vector<Word> 存放分词后每个词所有信息的容器
     */
    vector<Word> callFullSegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callQuerySegment
     * 使用Query进行分词，即先使用Mix，对于长词再用Full，结果最精确，但词的数量也最大
     * @param sentence 要分词的句子
     * @return vector<Word> 存放分词后每个词所有信息的容器
     */
    vector<Word> callQuerySegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callHMMSegment
     * 使用隐式马尔科夫模型HMM进行分词
     * @param sentence 要分词的句子
     * @return vector<Word> 存放分词后每个词所有信息的容器
     */
    vector<Word> callHMMSegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callMPSegment
     * 使用最大概率法MP进行分词
     * @param sentence 要分词的句子
     * @return vector<Word> 存放分词后每个词所有信息的容器
     */
    vector<Word> callMPSegment(const string& sentence);
 ```
 汉字转拼音相关功能由hanzi-to-pinyin.h提供接口，主要包括以下功能函数：
 ```
    static HanZiToPinYin * getInstance();//全局单例
    /**
     * @brief HanZiToPinYin::isMultiTone 判断是否为多音字/词/句
     * @param word 要判断的字/词/句
     * @return bool 不是返回false
     */
    bool isMultiTone(string &word);
    bool isMultiTone(string &&word);
    bool isMultiTone(const string &word);
    bool isMultiTone(const string &&word);
    /**
     * @brief HanZiToPinYin::contains 查询某个字/词/句是否有拼音（是否在数据库包含）
     * @param word 要查询的字/词/句
     * @return bool 数据库不包含返回false
     */
    bool contains(string &word);
    /**
     * @brief HanZiToPinYin::getResults 获取某个字/词/句的拼音
     * @param word 要获取拼音的字/词/句
     * @param results word的拼音列表（有可能多音字），每次调用results会被清空
     * @return int 获取到返回0，否则返回-1
     */
    int getResults(string word, QStringList &results);
    /**
     * @brief setConfig 设置HanZiToPinYin的各项功能，详见pinyin4cpp-common.h
     * @param dataStyle 返回数据风格，默认default
     * @param segType 是否启用分词，默认启用
     * @param polyphoneType 是否启用多音字，默认不启用
     * @param processType 无拼音数据处理模式，默认default
     */
    void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
 ```
 中文繁体转简体相关功能由Traditional-to-Simplified.h提供接口，主要包括以下功能函数：
 ```
    static Traditional2Simplified * getInstance();//全局单例
    /**
     * @brief Traditional2Simplified::isTraditional 判断是否为繁体字，是则返回true
     * @param oneWord 要判断的字
     * @return bool 不是返回false
     */
    bool isTraditional(string &oneWord);
    /**
     * @brief Traditional2Simplified::getResults 转换某个字/词/句的繁体字
     * @param words 要转换为简体中文的字/词/句
     * @return words 的简体中文结果
     */
    string getResults(string words);
 ```
 除此之外工程中提供了测试程序位于chinese-segmentation/test，运行界面如下：
 ![输入图片说明](https://foruda.gitee.com/images/1682048388802220746/245a2ec3_8021248.png "image.png")
 #### 参与贡献
 1.  Fork 本仓库
 2.  新建分支
 3.  提交代码
 4.  新建 Pull Request
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri
@ -0,0 +1,10 @@
 # qmake include file for the Traditional -> Simplified Chinese trie.
 # Every path is anchored to $$PWD (the directory of this .pri) so that the
 # entries resolve correctly no matter which .pro file includes it.
 INCLUDEPATH += $$PWD
 HEADERS += \
    $$PWD/Traditional2Simplified_trie.h
 SOURCES += \
    $$PWD/Traditional2Simplified_trie.cpp
 # Dictionary shipped next to this file; listed so it is part of the project tree.
 DISTFILES += \
    $$PWD/dict/TraditionalChineseSimplifiedDict.txt
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.cpp
@ -0,0 +1,98 @@
 /*
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include "Traditional2Simplified_trie.h"
 /**
  * @brief Builds the trie from the default dictionary file.
  * Passes a one-element source list containing
  * TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH plus the cache path to StorageBase,
  * then calls Init().
  * @param dat_cache_path cache file location forwarded to StorageBase
  */
 Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(vector<string>{TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH}, dat_cache_path)
 {
    // Init() comes from StorageBase; presumably it loads or (re)builds the cache — confirm in storage-base.hpp.
    this->Init();
 }
 /**
  * @brief Builds the trie with a caller-supplied source file list.
  * @param file_paths source file list forwarded to StorageBase
  * @param dat_cache_path cache file location forwarded to StorageBase
  *
  * NOTE(review): LoadDict() in this file always opens
  * TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH, so file_paths does not change which
  * file is parsed there — confirm how StorageBase uses the list.
  */
 Traditional2SimplifiedTrie::Traditional2SimplifiedTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
 {
    // Init() comes from StorageBase; presumably it loads or (re)builds the cache — confirm in storage-base.hpp.
    this->Init();
 }
 bool Traditional2SimplifiedTrie::IsTraditional(const string &word) {
    string result = this->Find(word);
    if (!result.empty())
        return true;
    return false;
 }
 /**
  * @brief Traditional2SimplifiedTrie::LoadSourceFile
  * Writes a fresh cache file: a CacheFileHeaderBase, the values emitted by
  * LoadDict(), then the trie array. Afterwards it seeks back to the position
  * right after md5_hex and stores elements_num, offset and the trie size there
  * (presumably the header fields that follow md5_hex — confirm against
  * CacheFileHeaderBase). Everything is written to a mkstemp() temporary named
  * "<dat_cache_file>_XXXXXX", which is finally handed to tryRename().
  *
  * @param dat_cache_file destination path of the cache file
  * @param md5 string copied into header.md5_hex; its size must equal sizeof(header.md5_hex)
  */
 void Traditional2SimplifiedTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
 {
    CacheFileHeaderBase header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // umask() is process-wide state: remember the caller's mask and restore it
    // as soon as the temporary file exists, so this library call does not
    // permanently change the permissions of files the host application creates.
    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
    // mkstemp() rewrites the XXXXXX suffix in place, so hand it the string's
    // writable buffer instead of casting away the constness of data().
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_umask);
    assert(fd >= 0);
    fchmod(fd, 0644);
    // Header first (counters still zero), then the dictionary values, then the trie array.
    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
    this->LoadDict(fd, write_bytes, offset, elements_num);
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
    // Jump back to just after md5_hex and store the now-known counters.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    close(fd);
    // write_bytes only counts the header, the values and the trie array (not the patch-up writes).
    assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
    tryRename(tmp_filepath, dat_cache_file);
 }
 /**
  * @brief Looks up the value stored for an exactly matching key.
  * @param key key to search for
  * @return the NUL-terminated string found at the matched element position,
  *         or an empty string when ExactMatchSearch() reports a negative result
  */
 string Traditional2SimplifiedTrie::Find(const string &key)
 {
    const int element_index = this->ExactMatchSearch(key.c_str(), key.size());
    if (element_index >= 0) {
        return string(&this->GetElementPtr()[element_index]);
    }
    return string();
 }
 void Traditional2SimplifiedTrie::LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
 {
    ifstream ifs(TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, ":");
        if (buf.size() != 2)
            continue;
        this->Update(buf[0].c_str(), buf[0].size(), offset);
        offset += (buf[1].size() + 1);
        elements_num++;
        write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
    }
 }
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/Traditional2Simplified_trie.h
@ -0,0 +1,40 @@
 /*
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef Traditional2SimplifiedTrie_H
 #define Traditional2SimplifiedTrie_H
 #include "storage-base.hpp"
 const char * const  TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH = DICT_INSTALL_PATH"/TraditionalChineseSimplifiedDict.txt";
 /**
  * Key/value lookup built on StorageBase<char, false, CacheFileHeaderBase>.
  * The .cpp reads "key:value" lines from the dictionary file and answers
  * exact-match queries with the stored value string.
  */
 class Traditional2SimplifiedTrie : public StorageBase<char, false, CacheFileHeaderBase>
 {
 public:
    // Uses TRADITIONAL_CHINESE_SIMPLIFIED_DICT_PATH as the only source file.
    Traditional2SimplifiedTrie(string dat_cache_path = "");
    // Forwards a caller-supplied source file list to StorageBase.
    Traditional2SimplifiedTrie(const vector<string> file_paths, string dat_cache_path = "");
    // Writes the cache file (header, values, trie array) via a temporary file.
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
    // Returns the value stored for an exactly matching key, or an empty string.
    string Find(const string &key);
    // True when Find(word) returns a non-empty value.
    bool IsTraditional(const string &word);
 private:
    // Parses the dictionary file, updates the trie and appends the values to fd.
    void LoadDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
 };
 #endif // Traditional2SimplifiedTrie_H
--- a/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
+++ b/libchinese-segmentation/Traditional-Chinese-Simplified-conversion/dict/TraditionalChineseSimplifiedDict.txt
--- a/libchinese-segmentation/Traditional-to-Simplified-private.h
+++ b/libchinese-segmentation/Traditional-to-Simplified-private.h
@ -0,0 +1,47 @@
 /*
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef Traditional2SimplifiedPRIVATE_H
 #define Traditional2SimplifiedPRIVATE_H
 #include <QtCore/qglobal.h>
 #include <QHash>
 #include "Traditional-to-Simplified.h"
 #include "Traditional2Simplified_trie.h"
 using namespace std;
 /**
  * Implementation object behind Traditional2Simplified. It owns the
  * Traditional2SimplifiedTrie and performs the actual lookups.
  */
 class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2SimplifiedPrivate
 {
 public:
    // parent is stored in q; the trie member is default-constructed.
    Traditional2SimplifiedPrivate(Traditional2Simplified *parent = nullptr);
    ~Traditional2SimplifiedPrivate();
 public:
    // True when the trie has an entry for word.
    bool isTraditional(string &word) {return m_Traditional2SimplifiedTrie.IsTraditional(word);}
    // Converts words by trie lookup; text without an entry is passed through unchanged.
    string getResults(string words);
 private:
    // Back pointer to the public object (may be nullptr).
    Traditional2Simplified *q = nullptr;
    // Dictionary lookup structure, built with its default arguments.
    Traditional2SimplifiedTrie m_Traditional2SimplifiedTrie;
 };
 #endif // Traditional2SimplifiedPRIVATE_H
--- a/libchinese-segmentation/Traditional-to-Simplified.cpp
+++ b/libchinese-segmentation/Traditional-to-Simplified.cpp
@ -0,0 +1,86 @@
 /*
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include <mutex>
 #include <cctype>
 #include "Traditional-to-Simplified.h"
 #include "Traditional-to-Simplified-private.h"
 #include "cppjieba/Unicode.hpp"
 Traditional2Simplified * Traditional2Simplified::g_Traditional2SimplifiedManager = nullptr;
 std::once_flag g_Traditional2SimplifiedSingleFlag;
 /**
  * @brief Converts a character, word or sentence through the trie.
  * An empty input is returned as is. A single character is looked up directly
  * and returned unchanged when the trie has no entry. Longer input is decoded
  * into runes and converted rune by rune, keeping every rune without an entry.
  *
  * @param words text to convert
  * @return the converted text
  */
 string Traditional2SimplifiedPrivate::getResults(string words)
 {
    if (words.empty()) {
        return words;
    }
    if (cppjieba::IsSingleWord(words)) {
        // Single character: one direct lookup, falling back to the input itself.
        const string mapped = m_Traditional2SimplifiedTrie.Find(words);
        return mapped.empty() ? words : mapped;
    }
    // Several characters: split into runes and convert each one separately.
    string converted;
    cppjieba::RuneStrArray runes;
    cppjieba::DecodeRunesInString(words, runes);
    for (auto rune = runes.begin(); rune != runes.end(); ++rune) {
        const string character = cppjieba::GetStringFromRunes(words, rune, rune);
        const string mapped = m_Traditional2SimplifiedTrie.Find(character);
        converted.append(mapped.empty() ? character : mapped);
    }
    return converted;
 }
 // Stores the owning public object in q; the trie member is default-constructed.
 Traditional2SimplifiedPrivate::Traditional2SimplifiedPrivate(Traditional2Simplified *parent) : q(parent)
 {
 }
 // Nothing to release explicitly: q is not deleted here and the trie is a value member.
 Traditional2SimplifiedPrivate::~Traditional2SimplifiedPrivate()
 {
 }
 /**
  * @brief Returns the process-wide Traditional2Simplified object.
  * The object is allocated on the first call, guarded by
  * g_Traditional2SimplifiedSingleFlag so the allocation runs only once.
  */
 Traditional2Simplified * Traditional2Simplified::getInstance()
 {
    auto createInstance = []() {
        g_Traditional2SimplifiedManager = new Traditional2Simplified;
    };
    std::call_once(g_Traditional2SimplifiedSingleFlag, createInstance);
    return g_Traditional2SimplifiedManager;
 }
 // Forwards to the private implementation, which checks the trie for oneWord.
 bool Traditional2Simplified::isTraditional(string &oneWord)
 {
    return d->isTraditional(oneWord);
 }
 // Forwards to the private implementation, which converts words via the trie.
 string Traditional2Simplified::getResults(string words)
 {
    return d->getResults(words);
 }
 // Allocates the private implementation; it is created with its default parent (nullptr).
 Traditional2Simplified::Traditional2Simplified() : d(new Traditional2SimplifiedPrivate)
 {
 }
--- a/libchinese-segmentation/Traditional-to-Simplified.h
+++ b/libchinese-segmentation/Traditional-to-Simplified.h
@ -0,0 +1,61 @@
 /*
 * Copyright (C) 2023, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef Traditional2Simplified_H
 #define Traditional2Simplified_H
 #include <QtCore/qglobal.h>
 #include <string>
 #define TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Q_DECL_IMPORT
 using namespace std;
 class Traditional2SimplifiedPrivate;
 /**
  * Public singleton interface for Traditional -> Simplified Chinese conversion.
  * All work is delegated to Traditional2SimplifiedPrivate.
  */
 class TRADITIONAL_CHINESE_SIMPLIFIED_EXPORT Traditional2Simplified
 {
 public:
    // Returns the single shared instance, creating it on first use.
    static Traditional2Simplified * getInstance();
 public:
    /**
     * @brief Traditional2Simplified::isTraditional Checks whether a character is a traditional Chinese character
     * @param oneWord the character to check
     * @return bool true if it is traditional, false otherwise
     */
    bool isTraditional(string &oneWord);
    /**
     * @brief Traditional2Simplified::getResults Converts the traditional characters of a character/word/sentence
     * @param words the character/word/sentence to convert to Simplified Chinese
     * @return the Simplified Chinese form of words
     */
    string getResults(string words);
 protected:
    // Construction is restricted so that instances are obtained through getInstance().
    Traditional2Simplified();
    // NOTE(review): no definition of this destructor is visible in Traditional-to-Simplified.cpp — confirm before deleting an instance.
    ~Traditional2Simplified();
    Traditional2Simplified(const Traditional2Simplified&) = delete;
    Traditional2Simplified& operator =(const Traditional2Simplified&) = delete;
 private:
    // The shared instance handed out by getInstance().
    static Traditional2Simplified *g_Traditional2SimplifiedManager;
    // Private implementation, allocated in the constructor.
    Traditional2SimplifiedPrivate *d = nullptr;
 };
 #endif // Traditional2Simplified_H
--- a/libchinese-segmentation/chinese-segmentation-config.cmake.in
+++ b/libchinese-segmentation/chinese-segmentation-config.cmake.in
@ -0,0 +1,9 @@
@PACKAGE_INIT@
 # Package configuration template for chinese-segmentation.
 # @PACKAGE_INIT@ above and the @VAR@ placeholders below are expected to be
 # substituted at configure time (configure_package_config_file()).
 include(CMakeFindDependencyMacro)
 # Re-find the Qt Core module of the Qt major version this package was configured with.
 # NOTE(review): confirm REQUIRED_QT_VERSION is set by the CMakeLists.txt that
 # configures this template; if it is not, the version placeholder expands to nothing.
 find_dependency(Qt@QT_VERSION_MAJOR@Core "@REQUIRED_QT_VERSION@")
 # Qt 6 consumers additionally need the Core5Compat module.
 if(TARGET Qt6::Core)
    find_dependency(Qt6Core5Compat @REQUIRED_QT_VERSION@)
 endif()
 # Import the targets exported for this package.
 include("${CMAKE_CURRENT_LIST_DIR}/chinese-segmentation-targets.cmake")
--- a/libchinese-segmentation/chinese-segmentation-private.h
+++ b/libchinese-segmentation/chinese-segmentation-private.h
@ -0,0 +1,34 @@
 #ifndef CHINESESEGMENTATIONPRIVATE_H
 #define CHINESESEGMENTATIONPRIVATE_H
 #include "chinese-segmentation.h"
 #include "cppjieba/Jieba.hpp"
 #include "cppjieba/KeywordExtractor.hpp"
 /**
  * Implementation object behind ChineseSegmentation; wraps a cppjieba::Jieba
  * instance. Method summaries below follow the project README; the
  * implementations live in chinese-segmentation.cpp.
  */
 class ChineseSegmentationPrivate
 {
 public:
    // Creates the Jieba instance; parent is stored in q.
    explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
    // Deletes the Jieba instance.
    ~ChineseSegmentationPrivate();
    // Keyword extraction from a sentence.
    vector<KeyWord> callSegment(const string& sentence);
    vector<KeyWord> callSegment(QString& sentence);
    // Mix segmentation; returns only the text of each word.
    vector<string> callMixSegmentCutStr(const string& sentence);
    // Mix segmentation; returns the full Word records.
    vector<Word> callMixSegmentCutWord(const string& sentence);
    // Part-of-speech tag of a single word.
    string lookUpTagOfWord(const string& word);
    // Mix segmentation followed by tagging: (word, tag) pairs.
    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
    // Full segmentation.
    vector<Word> callFullSegment(const string& sentence);
    // Query segmentation.
    vector<Word> callQuerySegment(const string& sentence);
    // HMM segmentation.
    vector<Word> callHMMSegment(const string& sentence);
    // MP (maximum probability) segmentation.
    vector<Word> callMPSegment(const string& sentence);
 private:
    // Allocated with new in the constructor and deleted in the destructor.
    cppjieba::Jieba *m_jieba;
    // Back pointer to the public object (may be nullptr).
    ChineseSegmentation *q = nullptr;
 };
 #endif // CHINESESEGMENTATIONPRIVATE_H
--- a/libchinese-segmentation/chinese-segmentation.cpp
+++ b/libchinese-segmentation/chinese-segmentation.cpp
@ -0,0 +1,178 @@
 /*
 * Copyright (C) 2020, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: zhangzihao <zhangzihao@kylinos.cn>
 * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
 *
 */
 #include "chinese-segmentation.h"
 #include "chinese-segmentation-private.h"
 // Creates the Jieba engine. The HMM model and the stop-word list are located
 // below DICT_INSTALL_PATH; DICT_PATH, USER_DICT_PATH and IDF_DICT_PATH are
 // supplied from outside this file.
 ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent)
    : q(parent)
 {
    m_jieba = new cppjieba::Jieba(DICT_PATH,
                                  DICT_INSTALL_PATH"/hmm_model.utf8",
                                  USER_DICT_PATH,
                                  IDF_DICT_PATH,
                                  DICT_INSTALL_PATH"/stop_words.utf8",
                                  "");
 }
 // Releases the Jieba engine; deleting a null pointer is a no-op, so no guard
 // is needed.
 ChineseSegmentationPrivate::~ChineseSegmentationPrivate() {
    delete m_jieba;
    m_jieba = nullptr;
 }
 // Runs the keyword extractor on a UTF-8 sentence. The limit passed to
 // Extract() is the largest size_t value (-1 converted to size_t).
 vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence) {
    vector<KeyWord> keywords;
    m_jieba->extractor.Extract(sentence, keywords, static_cast<size_t>(-1));
    return keywords;
 }
 // Runs the keyword extractor on a QString.
 // The caller's string is rewritten in place first: a tab becomes one space,
 // and the byte sequences \xEF\xBC\x8C (full-width comma) and \xE3\x80\x82
 // (ideographic full stop) each become three spaces, which per the original
 // author's note keeps the offset information intact.
 // Only the first 20480000 characters are handed to the extractor.
 vector<KeyWord> ChineseSegmentationPrivate::callSegment(QString &sentence) {
    sentence.replace("\t", " ");
    sentence.replace("\xEF\xBC\x8C", "   ");
    sentence.replace("\xE3\x80\x82", "   ");
    vector<KeyWord> keywords;
    const string utf8Text = sentence.left(20480000).toStdString();
    m_jieba->extractor.Extract(utf8Text, keywords, static_cast<size_t>(-1));
    return keywords;
 }
 // Cuts the sentence with Jieba::Cut and returns the words as plain strings.
 vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
 {
    vector<string> pieces;
    m_jieba->Cut(sentence, pieces);
    return pieces;
 }
 // Cuts the sentence with Jieba::Cut and returns Word records.
 vector<Word> ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence)
 {
    vector<Word> pieces;
    m_jieba->Cut(sentence, pieces);
    return pieces;
 }
 // Returns whatever Jieba::LookupTag reports for the given word.
 string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word)
 {
    string tag = m_jieba->LookupTag(word);
    return tag;
 }
 // Tags the sentence with Jieba::Tag and returns the resulting (word, tag) pairs.
 vector<pair<string, string>> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence)
 {
    vector<pair<string, string>> taggedWords;
    m_jieba->Tag(sentence, taggedWords);
    return taggedWords;
 }
 // Segments the sentence with Jieba::CutAll.
 vector<Word> ChineseSegmentationPrivate::callFullSegment(const string &sentence)
 {
    vector<Word> pieces;
    m_jieba->CutAll(sentence, pieces);
    return pieces;
 }
 // Segments the sentence with Jieba::CutForSearch.
 vector<Word> ChineseSegmentationPrivate::callQuerySegment(const string &sentence)
 {
    vector<Word> pieces;
    m_jieba->CutForSearch(sentence, pieces);
    return pieces;
 }
 // Segments the sentence with Jieba::CutHMM.
 vector<Word> ChineseSegmentationPrivate::callHMMSegment(const string &sentence)
 {
    vector<Word> pieces;
    m_jieba->CutHMM(sentence, pieces);
    return pieces;
 }
 // Segments the sentence with Jieba::CutSmall, allowing words of up to 512
 // characters.
 vector<Word> ChineseSegmentationPrivate::callMPSegment(const string &sentence)
 {
    const size_t wordLengthLimit = 512;
    vector<Word> pieces;
    m_jieba->CutSmall(sentence, pieces, wordLengthLimit);
    return pieces;
 }
 // Returns the process-wide instance. It is allocated on first use through a
 // function-local static and is never deleted.
 ChineseSegmentation *ChineseSegmentation::getInstance()
 {
    static ChineseSegmentation *instance = new ChineseSegmentation;
    return instance;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
 {
    vector<KeyWord> keywords = d->callSegment(sentence);
    return keywords;
 }
 // Public entry point for QString input; the private implementation rewrites
 // the passed string in place before extracting keywords.
 vector<KeyWord> ChineseSegmentation::callSegment(QString &sentence)
 {
    vector<KeyWord> keywords = d->callSegment(sentence);
    return keywords;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
 {
    vector<string> pieces = d->callMixSegmentCutStr(sentence);
    return pieces;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<Word> ChineseSegmentation::callMixSegmentCutWord(const string &str)
 {
    vector<Word> pieces = d->callMixSegmentCutWord(str);
    return pieces;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 string ChineseSegmentation::lookUpTagOfWord(const string &word)
 {
    string tag = d->lookUpTagOfWord(word);
    return tag;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<pair<string, string> > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence)
 {
    vector<pair<string, string> > taggedWords = d->getTagOfWordsInSentence(sentence);
    return taggedWords;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<Word> ChineseSegmentation::callFullSegment(const string &sentence)
 {
    vector<Word> pieces = d->callFullSegment(sentence);
    return pieces;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<Word> ChineseSegmentation::callQuerySegment(const string &sentence)
 {
    vector<Word> pieces = d->callQuerySegment(sentence);
    return pieces;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<Word> ChineseSegmentation::callHMMSegment(const string &sentence)
 {
    vector<Word> pieces = d->callHMMSegment(sentence);
    return pieces;
 }
 // Public entry point; the work is done by ChineseSegmentationPrivate.
 vector<Word> ChineseSegmentation::callMPSegment(const string &sentence)
 {
    vector<Word> pieces = d->callMPSegment(sentence);
    return pieces;
 }
 // Allocates the private implementation; no parent pointer is handed to it.
 ChineseSegmentation::ChineseSegmentation()
    : d(new ChineseSegmentationPrivate)
 {
 }
--- a/libchinese-segmentation/chinese-segmentation.h
+++ b/libchinese-segmentation/chinese-segmentation.h
@ -0,0 +1,118 @@
 /*
 * Copyright (C) 2020, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: zhangzihao <zhangzihao@kylinos.cn>
 * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
 *
 */
 #ifndef CHINESESEGMENTATION_H
 #define CHINESESEGMENTATION_H
 #include <QString>
 #include "libchinese-segmentation_global.h"
 #include "common-struct.h"
 class ChineseSegmentationPrivate;
 /**
  * @brief Public facade of the Chinese word-segmentation library.
  *
  * The single instance is obtained through getInstance(); construction,
  * destruction and copying are not available to callers. Every call is
  * forwarded to ChineseSegmentationPrivate.
  */
 class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
 public:
    /// Returns the shared instance.
    static ChineseSegmentation *getInstance();
    /**
     * @brief ChineseSegmentation::callSegment
     * Extracts keywords with the extractor: the sentence is first segmented with
     * the Mix method, then keywords are extracted using the IDF dictionary; only
     * keywords of two or more characters are included.
     *
     * The QString overload rewrites @p sentence in place (tabs, full-width commas
     * and ideographic full stops are replaced by spaces) before extraction.
     *
     * @param sentence the sentence to extract keywords from
     * @return vector<KeyWord> container holding the information of the extracted keywords
     */
    vector<KeyWord> callSegment(const string &sentence);
    vector<KeyWord> callSegment(QString &sentence);
    /**
     * @brief ChineseSegmentation::callMixSegmentCutStr
     * Segments with the Mix method: a first pass with the maximum-probability
     * method (MP), refined with the hidden Markov model (HMM). It can cut both
     * words present in the dictionary and out-of-vocabulary words, and the
     * result is fairly accurate.
     *
     * @param sentence the sentence to segment
     * @return vector<string> container holding only the text of each word
     */
    vector<string> callMixSegmentCutStr(const string& sentence);
    /**
     * @brief ChineseSegmentation::callMixSegmentCutWord
     * Same functionality as callMixSegmentCutStr.
     * @param str the sentence to segment
     * @return vector<Word> container holding the full information of each word
     */
    vector<Word> callMixSegmentCutWord(const string& str);
    /**
     * @brief ChineseSegmentation::lookUpTagOfWord
     * Looks up the part-of-speech tag of a word.
     * @param word the word whose tag is requested
     * @return string the part-of-speech tag of the word
     */
    string lookUpTagOfWord(const string& word);
    /**
     * @brief ChineseSegmentation::getTagOfWordsInSentence
     * Segments with the Mix method, then obtains the part-of-speech tag of each word.
     * @param sentence the sentence to segment
     * @return vector<pair<string, string>> the text of each word (first) and its tag (second)
     */
    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
    /**
     * @brief ChineseSegmentation::callFullSegment
     * Segments with the Full method, which cuts out every word found in the dictionary.
     * @param sentence the sentence to segment
     * @return vector<Word> container holding the full information of each word
     */
    vector<Word> callFullSegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callQuerySegment
     * Segments with the Query method: Mix first, then Full for long words. The
     * result is the most precise, but also yields the largest number of words.
     * @param sentence the sentence to segment
     * @return vector<Word> container holding the full information of each word
     */
    vector<Word> callQuerySegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callHMMSegment
     * Segments with the hidden Markov model (HMM).
     * @param sentence the sentence to segment
     * @return vector<Word> container holding the full information of each word
     */
    vector<Word> callHMMSegment(const string& sentence);
    /**
     * @brief ChineseSegmentation::callMPSegment
     * Segments with the maximum-probability method (MP).
     * @param sentence the sentence to segment
     * @return vector<Word> container holding the full information of each word
     */
    vector<Word> callMPSegment(const string& sentence);
 private:
    // Construction and destruction are private and copying is deleted, so the
    // only way to reach an instance is getInstance().
    explicit ChineseSegmentation();
    ~ChineseSegmentation() = default;
    ChineseSegmentation(const ChineseSegmentation&) = delete;
    ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
 private:
    // Private implementation, allocated in the constructor.
    ChineseSegmentationPrivate *d = nullptr;
 };
 #endif // CHINESESEGMENTATION_H
--- a/libchinese-segmentation/chinese-segmentation.pc.in
+++ b/libchinese-segmentation/chinese-segmentation.pc.in
@ -0,0 +1,11 @@
 # pkg-config template for chinese-segmentation; the at-sign delimited
 # placeholders are filled in when the build system configures this file.
 # NOTE(review): prefix is hard-coded to /usr rather than taken from the
 # install prefix -- confirm this is intended for non-/usr installs.
 prefix=/usr
 exec_prefix=${prefix}
 libdir=${prefix}/lib/@CMAKE_LIBRARY_ARCHITECTURE@
 includedir=${prefix}/include/chinese-segmentation
 Name: chinese-segmentation
 Description: Chinese-segmentation header files
 URL: https://www.ukui.org/
 Version: @VERSION@
 Cflags: -I${includedir}
 Libs: -L${libdir} -lchinese-segmentation
--- a/libchinese-segmentation/common-struct.h
+++ b/libchinese-segmentation/common-struct.h
@ -0,0 +1,52 @@
 #ifndef COMMONSTRUCT_H
 #define COMMONSTRUCT_H
 #include <string>
 #include <vector>
 using namespace std;
 /**
 * @brief The KeyWord struct
 *
 * @property word the content of keyword
 * @property offsets the Unicode offsets, can be used to check the word pos in a sentence
 * @property weight the weight of the keyword
 */
 // One extracted keyword: its text, the offsets at which it occurs and its
 // weight.
 //
 // No destructor is declared on purpose: `word` and `offsets` release their
 // own storage, and a user-declared destructor would suppress the implicit
 // move constructor/assignment, forcing deep copies whenever a KeyWord is
 // moved (for example when a vector<KeyWord> reallocates).
 // `weight` has no default value; it must be assigned before it is read.
 struct KeyWord {
    string word;
    vector<size_t> offsets;
    double weight;
 };
 /**
 * @brief The Word struct
 *
 * @property word the content of word
 * @property offset the offset of the word(absolute pos, Chinese 3 , English 1)， can be used to check the word pos in a sentence
 * @property unicode_offset the Unicode offset of the word
 * @property unicode_length the Unicode length of the word
 */
 // One segmented word together with its position in the sentence.
 //
 // No destructor is declared: `word` releases its own storage, and a
 // user-declared destructor would suppress the implicit move operations.
 struct Word {
    string word;
    uint32_t offset;
    uint32_t unicode_offset;
    uint32_t unicode_length;
    // Word known only by its offset; the Unicode position fields are zeroed so
    // that reading them is always well defined.
    Word(const string& w, uint32_t o)
        : word(w), offset(o), unicode_offset(0), unicode_length(0) {
    }
    // Word with complete position information.
    Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
        : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }
 }; // struct Word
 #endif // COMMONSTRUCT_H
--- a/libchinese-segmentation/cppjieba/DatTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DatTrie.hpp
@ -0,0 +1,641 @@
 #pragma once
 #include <stdint.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <QDebug>
 #include <algorithm>
 #include <utility>
 #include "limonp/Md5.hpp"
 #include "Unicode.hpp"
 //#define USE_DARTS_CLONE
 #ifdef USE_DARTS_CLONE
 #include "../storage-base/darts-clone/darts.h"
 #else
 #include "../storage-base/cedar/cedar.h"
 #endif
 namespace cppjieba {
 using std::pair;
 // Dictionary entry: word, tag and weight.
 // Ordering: ascending by word; entries with the same word are ordered by
 // descending weight.
 struct DatElement {
    string word;
    string tag;
    double weight = 0;
    bool operator < (const DatElement & b) const {
        return (word == b.word) ? (weight > b.weight) : (word < b.word);
    }
 };
 // IDF dictionary entry: word and its idf value.
 // Ordering: ascending by word; entries with the same word are ordered by
 // descending idf.
 struct IdfElement {
    string word;
    double idf = 0;
    bool operator < (const IdfElement & b) const {
        return (word == b.word) ? (idf > b.idf) : (word < b.word);
    }
 };
 // Pinyin dictionary entry: word and tag.
 // Ordering: ascending by word. The comparison takes another PinYinElement so
 // that the type can be compared with itself (e.g. by std::sort).
 struct PinYinElement
 {
    string word;
    string tag;
    bool operator < (const PinYinElement & b) const {
        return this->word < b.word;
    }
 };
 // Streams a DatElement as "word=<word>/tag=<tag>/weight=<weight>".
 inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
    os << "word=" << elem.word;
    os << "/tag=" << elem.tag;
    os << "/weight=" << elem.weight;
    return os;
 }
 // Fixed-size pinyin record as stored in the cache file: a 6-byte,
 // zero-terminated tag.
 struct PinYinMemElem {
    char tag[6] = {};
    // Stores at most sizeof(tag) - 1 characters of str; the rest of the buffer
    // stays zero, so the tag is always terminated.
    void SetTag(const string & str) {
        const size_t copy_len = std::min(str.size(), sizeof(tag) - 1);
        memset(tag, 0, sizeof(tag));
        strncpy(tag, str.c_str(), copy_len);
    }
    // Returns the stored tag as a string.
    string GetTag() const {
        return string(tag);
    }
 };
 // Streams a DatMemElem as "/tag=<tag>/weight=<weight>".
 inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
    os << "/tag=" << elem.GetTag();
    os << "/weight=" << elem.weight;
    return os;
 }
 // Double-array trie backend: darts-clone when USE_DARTS_CLONE is defined,
 // cedar otherwise.
 #ifdef USE_DARTS_CLONE
 typedef Darts::DoubleArray JiebaDAT;
 #else
 typedef cedar::da<int, -1, -2, false> JiebaDAT;
 #endif
 // Header written at the start of every cache file. It is followed by
 // elements_num element records and then by the serialized trie array.
 struct CacheFileHeader {
    char md5_hex[32] = {};      // hex MD5 the cache was built for; compared on attach
    double min_weight = 0;      // min_weight_ of the trie at build time
    uint32_t elements_num = 0;  // number of element records following the header
    uint32_t dat_size = 0;      // dat_.size() at build time
 };
 // The on-disk layout depends on these sizes, so they are checked at compile time.
 static_assert(sizeof(DatMemElem) == 16, "DatMemElem length invalid");
 static_assert((sizeof(CacheFileHeader) % sizeof(DatMemElem)) == 0, "DatMemElem CacheFileHeader length equal");
 class DatTrie {
 public:
    // Creates an empty trie; nothing is mapped until one of the Init* methods
    // succeeds.
    DatTrie() {}
    // Releases the cache-file mapping and its descriptor, if any.
    // munmap()/close() are only issued for a real mapping / a valid descriptor,
    // so destroying a trie that was never attached performs no invalid syscalls.
    ~DatTrie() {
        if ((nullptr != mmap_addr_) && (MAP_FAILED != mmap_addr_)) {
            ::munmap(mmap_addr_, mmap_length_);
        }
        mmap_addr_ = nullptr;
        mmap_length_ = 0;
        if (mmap_fd_ >= 0) {
            ::close(mmap_fd_);
        }
        mmap_fd_ = -1;
    }
    // Exact-match lookup of `key`.
    // Returns the element record for the key, or nullptr when the key is not in
    // the trie or the stored index does not address a valid element (both
    // backends apply the same range check against elements_num_).
    const DatMemElem * Find(const string & key) const {
 #ifdef USE_DARTS_CLONE
        JiebaDAT::result_pair_type find_result;
        dat_.exactMatchSearch(key.c_str(), find_result);
        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
            return nullptr;
        }
        return &elements_ptr_[ find_result.value ];
 #else
        const int result = dat_.exactMatchSearch<int>(key.c_str());
        if ((result < 0) || ((size_t)result >= elements_num_)) {
            return nullptr;
        }
        return &elements_ptr_[result];
 #endif
    }
    const double Find(const string & key, std::size_t length, std::size_t node_pos) const {
 #ifdef USE_DARTS_CLONE
        JiebaDAT::result_pair_type find_result;
        dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
            return -1;
        }
        return idf_elements_ptr_[ find_result.value ];
 #else
        int result = dat_.exactMatchSearch<int>(key.c_str(), length, node_pos);
        if (result < 0)
            return -1;
        return idf_elements_ptr_[result];
 #endif
    }
    // Exact-match lookup of `key` in a pinyin trie.
    // Returns the pinyin record for the key, or nullptr when the key is not in
    // the trie or the stored index does not address a valid element (both
    // backends apply the same range check against elements_num_).
    const PinYinMemElem * PinYinFind(const string & key) const {
 #ifdef USE_DARTS_CLONE
        JiebaDAT::result_pair_type find_result;
        dat_.exactMatchSearch(key.c_str(), find_result);
        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
            return nullptr;
        }
        return &pinyin_elements_ptr_[ find_result.value ];
 #else
        const int result = dat_.exactMatchSearch<int>(key.c_str());
        if ((result < 0) || ((size_t)result >= elements_num_)) {
            return nullptr;
        }
        return &pinyin_elements_ptr_[result];
 #endif
    }
    // Builds the word graph for the runes in [begin, end).
    // `res` is cleared and resized to one entry per rune. For rune i,
    // res[i].nexts lists (next rune index, dictionary element) pairs for every
    // dictionary word that starts at i and has at most max_word_len characters.
    void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
              vector<struct DatDag>&res, size_t max_word_len) const {
        res.clear();
        res.resize(end - begin);
        string text_str;
        EncodeRunesToString(begin, end, text_str);
        // Buffer for the prefix matches found at one position.
        // NOTE(review): the loop below iterates up to num_results; confirm the
        // backend never reports more than max_num matches.
        static const size_t max_num = 128;
        JiebaDAT::result_pair_type result_pairs[max_num] = {};
        // i counts runes, begin_pos is the byte offset of rune i in text_str.
        for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
            // nexts[0] always stands for the single-character step; its element
            // stays nullptr unless the character itself is in the dictionary.
            res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
            for (std::size_t idx = 0; idx < num_results; ++idx) {
                auto & match = result_pairs[idx];
                // Skip values that do not address a valid element.
                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                    continue;
                }
                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
                if (char_num > max_word_len) {
                    continue;
                }
                auto pValue = &elements_ptr_[match.value];
                if (1 == char_num) {
                    // Single-character word: fill in the pre-created entry.
                    res[i].nexts[0].second = pValue;
                    continue;
                }
                res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
            }
            // Advance by the UTF-8 byte length of rune i.
            begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
        }
    }
    /*
    void Find_Reverse(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
              vector<struct DatDag>&res, size_t max_word_len) const {
        res.clear();
        res.resize(end - begin);
        string text_str;
        EncodeRunesToString(begin, end, text_str);
        static const size_t max_num = 128;
        JiebaDAT::result_pair_type result_pairs[max_num] = {};
        size_t str_size = end - begin;
        for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
            begin_pos -= (end - i - 1)->len;
            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
            res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - i, nullptr));
            for (std::size_t idx = 0; idx < num_results; ++idx) {
                auto & match = result_pairs[idx];
                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                    continue;
                }
                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
                if (char_num > max_word_len) {
                    continue;
                }
                auto pValue = &elements_ptr_[match.value];
                if (1 == char_num) {
                    res[str_size - i - 1].nexts[0].second = pValue;
                    continue;
                }
                res[str_size - i - 1].nexts.push_back(pair<size_t, const DatMemElem *>(str_size - 1 - i + char_num, pValue));
            }
        }
    }*/
    // Segments the runes in [begin, end) into the highest-weight sequence of
    // words and appends one WordRange per word to `words`.
    //
    // The text is scanned from the last rune to the first. For every start
    // position the best continuation is chosen among the dictionary words that
    // start there (at most max_word_len characters long); when no usable word
    // starts at a position, the single rune is taken with weight min_weight_.
    //
    // The working tables live on the heap (std::vector), so the size of the
    // input is not limited by the stack, and every position always receives a
    // valid successor.
    void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
              vector<WordRange>& words, size_t max_word_len) const {
        const size_t str_size = end - begin;
        if (0 == str_size) {
            return;
        }
        string text_str;
        EncodeRunesToString(begin, end, text_str);
        static const size_t max_num = 128;
        JiebaDAT::result_pair_type result_pairs[max_num] = {}; // prefix matches at one position
        // Best total weight of the suffix starting at each rune.
        vector<double> max_weight(str_size, -3.14e+100);
        // Rune index following the best word starting at each rune; 0 means
        // "not decided yet" (a real successor is always >= 1).
        vector<size_t> max_next(str_size, 0);
        size_t begin_pos = text_str.size();
        for (size_t i = 0; i < str_size; i++) {
            const size_t cur = str_size - i - 1; // processed back to front
            begin_pos -= (end - i - 1)->len;
            std::size_t num_results = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
            // Never read past the result buffer, whatever count the backend reports.
            if (num_results > max_num) {
                num_results = max_num;
            }
            for (std::size_t idx = 0; idx < num_results; ++idx) {
                const auto & match = result_pairs[idx];
                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                    continue;
                }
                const size_t char_num = Utf8CharNum(&text_str[begin_pos], match.length);
                if ((0 == char_num) || (char_num > max_word_len)) {
                    continue;
                }
                const size_t next = cur + char_num;
                if (next > str_size) {
                    continue;
                }
                double val = elements_ptr_[match.value].weight;
                if (next < str_size) {
                    val += max_weight[next];
                }
                if (val > max_weight[cur]) {
                    max_weight[cur] = val;
                    max_next[cur] = next;
                }
            }
            if (0 == max_next[cur]) {
                // No usable dictionary word starts here: emit the rune on its own.
                double val = min_weight_;
                if (cur + 1 < str_size) {
                    val += max_weight[cur + 1];
                }
                max_weight[cur] = val;
                max_next[cur] = cur + 1;
            }
        }
        // Walk the chosen successors from the first rune and emit the words.
        for (size_t i = 0; i < str_size;) {
            const size_t next = max_next[i];
            assert(next > i);
            assert(next <= str_size);
            WordRange wr(begin + i, begin + next - 1);
            words.push_back(wr);
            i = next;
        }
    }
    // Current minimum weight of the trie.
    double GetMinWeight() const { return min_weight_; }
    // Sets the minimum weight of the trie.
    void SetMinWeight(double d) { min_weight_ = d; }
    // Builds the cache file for a word dictionary, then attaches to it.
    // Returns the result of the attach step.
    bool InitBuildDat(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
        BuildDatCache(elements, dat_cache_file, md5);
        const bool attached = InitAttachDat(dat_cache_file, md5);
        return attached;
    }
    // Builds the cache file for an IDF dictionary, then attaches to it.
    // Returns the result of the attach step.
    bool InitBuildDat(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
        BuildDatCache(elements, dat_cache_file, md5);
        const bool attached = InitIdfAttachDat(dat_cache_file, md5);
        return attached;
    }
    // Builds the cache file for a pinyin dictionary, then attaches to it.
    // Returns the result of the attach step.
    bool InitBuildDat(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
        BuildDatCache(elements, dat_cache_file, md5);
        const bool attached = InitPinYinAttachDat(dat_cache_file, md5);
        return attached;
    }
    bool InitAttachDat(const string & dat_cache_file, const string & md5) {
        mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
        if (mmap_fd_ < 0) {
            return false;
        }
        const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
        assert(seek_off >= 0);
        mmap_length_ = seek_off;
        mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
        assert(MAP_FAILED != mmap_addr_);
        assert(mmap_length_ >= sizeof(CacheFileHeader));
        CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
        elements_num_ = header.elements_num;
        min_weight_ = header.min_weight;
        assert(sizeof(header.md5_hex) == md5.size());
        if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
            return false;
        }
        assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(DatMemElem)  + header.dat_size * dat_.unit_size());
        elements_ptr_ = (const DatMemElem *)(mmap_addr_ + sizeof(header));
        char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
        dat_.set_array(dat_ptr, header.dat_size);
        return true;
    }
    bool InitIdfAttachDat(const string & dat_cache_file, const string & md5) {
        mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
        if (mmap_fd_ < 0) {
            return false;
        }
        const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
        assert(seek_off >= 0);
        mmap_length_ = seek_off;
        mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
        assert(MAP_FAILED != mmap_addr_);
        assert(mmap_length_ >= sizeof(CacheFileHeader));
        CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
        elements_num_ = header.elements_num;
        min_weight_ = header.min_weight;
        assert(sizeof(header.md5_hex) == md5.size());
        if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
            return false;
        }
        assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(double)  + header.dat_size * dat_.unit_size());
        idf_elements_ptr_ = (const double *)(mmap_addr_ + sizeof(header));
        char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
        dat_.set_array(dat_ptr, header.dat_size);
        return true;
    }
    bool InitPinYinAttachDat(const string & dat_cache_file, const string & md5) {
        mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
        if (mmap_fd_ < 0) {
            return false;
        }
        const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
        assert(seek_off >= 0);
        mmap_length_ = seek_off;
        mmap_addr_ = reinterpret_cast<char *>(mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0));
        assert(MAP_FAILED != mmap_addr_);
        assert(mmap_length_ >= sizeof(CacheFileHeader));
        CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(mmap_addr_);
        elements_num_ = header.elements_num;
        min_weight_ = header.min_weight;
        assert(sizeof(header.md5_hex) == md5.size());
        if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
            return false;
        }
        assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem)  + header.dat_size * dat_.unit_size());
        pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
        char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
        dat_.set_array(dat_ptr, header.dat_size);
        return true;
    }
 private:
    void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
        std::sort(elements.begin(), elements.end());
        vector<const char*> keys_ptr_vec;
        vector<int> values_vec;
        vector<DatMemElem> mem_elem_vec;
        keys_ptr_vec.reserve(elements.size());
        values_vec.reserve(elements.size());
        mem_elem_vec.reserve(elements.size());
        CacheFileHeader header;
        header.min_weight = min_weight_;
        assert(sizeof(header.md5_hex) == md5.size());
        memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
        for (size_t i = 0; i < elements.size(); ++i) {
            keys_ptr_vec.push_back(elements[i].word.data());
            values_vec.push_back(i);
            mem_elem_vec.push_back(DatMemElem());
            auto & mem_elem = mem_elem_vec.back();
            mem_elem.weight = elements[i].weight;
            mem_elem.SetTag(elements[i].tag);
        }
        auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
        assert(0 == ret);
        header.elements_num = mem_elem_vec.size();
        header.dat_size = dat_.size();
        {
            string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
            ::umask(S_IWGRP | S_IWOTH);
            //const int fd =::mkstemp(&tmp_filepath[0]);
            const int fd =::mkstemp((char *)tmp_filepath.data());
            qDebug() << "mkstemp :" << errno << tmp_filepath.data();
            assert(fd >= 0);
            ::fchmod(fd, 0644);
            auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
            write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(mem_elem_vec[0]) * mem_elem_vec.size());
            write_bytes += ::write(fd, dat_.array(), dat_.total_size());
            assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(mem_elem_vec[0]) + dat_.total_size());
            ::close(fd);
            const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
            assert(0 == rename_ret);
        }
    }
    void BuildDatCache(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
        std::sort(elements.begin(), elements.end());
        vector<const char*> keys_ptr_vec;
        vector<int> values_vec;
        vector<double> mem_elem_vec;
        keys_ptr_vec.reserve(elements.size());
        values_vec.reserve(elements.size());
        mem_elem_vec.reserve(elements.size());
        CacheFileHeader header;
        header.min_weight = min_weight_;
        assert(sizeof(header.md5_hex) == md5.size());
        memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
        for (size_t i = 0; i < elements.size(); ++i) {
            keys_ptr_vec.push_back(elements[i].word.data());
            values_vec.push_back(i);
            mem_elem_vec.push_back(elements[i].idf);
        }
        auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
        assert(0 == ret);
        header.elements_num = mem_elem_vec.size();
        header.dat_size = dat_.size();
        {
            string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
            ::umask(S_IWGRP | S_IWOTH);
            //const int fd =::mkstemp(&tmp_filepath[0]);
            const int fd =::mkstemp((char *)tmp_filepath.data());
            qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
            assert(fd >= 0);
            ::fchmod(fd, 0644);
            auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
            write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(double) * mem_elem_vec.size());
            write_bytes += ::write(fd, dat_.array(), dat_.total_size());
            assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(double) + dat_.total_size());
            ::close(fd);
            const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
            assert(0 == rename_ret);
        }
    }
    void BuildDatCache(vector<PinYinElement>& elements, const string & dat_cache_file, const string & md5) {
        //std::sort(elements.begin(), elements.end());
        vector<const char*> keys_ptr_vec;
        vector<int> values_vec;
        vector<PinYinMemElem> mem_elem_vec;
        keys_ptr_vec.reserve(elements.size());
        values_vec.reserve(elements.size());
        mem_elem_vec.reserve(elements.size());
        CacheFileHeader header;
        header.min_weight = min_weight_;
        assert(sizeof(header.md5_hex) == md5.size());
        memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
        for (size_t i = 0; i < elements.size(); ++i) {
            keys_ptr_vec.push_back(elements[i].word.data());
            values_vec.push_back(i);
            mem_elem_vec.push_back(PinYinMemElem());
            auto & mem_elem = mem_elem_vec.back();
            mem_elem.SetTag(elements[i].tag);
        }
        auto const ret = dat_.build(keys_ptr_vec.size(), &keys_ptr_vec[0], NULL, &values_vec[0]);
        assert(0 == ret);
        header.elements_num = mem_elem_vec.size();
        header.dat_size = dat_.size();
        {
            string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
            ::umask(S_IWGRP | S_IWOTH);
            //const int fd =::mkstemp(&tmp_filepath[0]);
            const int fd =::mkstemp((char *)tmp_filepath.data());
            qDebug() << "mkstemp :" << errno << tmp_filepath.data();
            assert(fd >= 0);
            ::fchmod(fd, 0644);
            auto write_bytes = ::write(fd, (const char *)&header, sizeof(header));
            write_bytes += ::write(fd, (const char *)&mem_elem_vec[0], sizeof(mem_elem_vec[0]) * mem_elem_vec.size());
            write_bytes += ::write(fd, dat_.array(), dat_.total_size());
            assert(write_bytes == sizeof(header) + mem_elem_vec.size() * sizeof(mem_elem_vec[0]) + dat_.total_size());
            ::close(fd);
            const auto rename_ret = ::rename(tmp_filepath.c_str(), dat_cache_file.c_str());
            assert(0 == rename_ret);
        }
    }
    DatTrie(const DatTrie &);
    DatTrie &operator=(const DatTrie &);
 private:
    JiebaDAT dat_;
    const DatMemElem * elements_ptr_ = nullptr;
    const double * idf_elements_ptr_ = nullptr;
    const PinYinMemElem * pinyin_elements_ptr_ = nullptr;
    size_t elements_num_ = 0;
    double min_weight_ = 0;
    int mmap_fd_ = -1;
    size_t mmap_length_ = 0;
    char * mmap_addr_ = nullptr;
 };
 // Computes one MD5 over the contents of all files named in `files_list`
 // (paths separated by '|' or ';'), in list order, and returns its hex digest.
 // `file_size_sum` receives the total number of bytes hashed. Files that cannot
 // be opened and files without content are skipped.
 inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum) {
    limonp::MD5 hasher;
    const auto paths = limonp::Split(files_list, "|;");
    file_size_sum = 0;
    for (auto const & path : paths) {
        const int fd = ::open(path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }
        auto const len = ::lseek(fd, 0, SEEK_END);
        if (len <= 0) {
            ::close(fd);
            continue;
        }
        // Map the file read-only just long enough to feed it to the hasher.
        void * addr = ::mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        assert(MAP_FAILED != addr);
        hasher.Update((unsigned char *) addr, len);
        file_size_sum += len;
        ::munmap(addr, len);
        ::close(fd);
    }
    hasher.Final();
    return string(hasher.digestChars);
 }
 }
--- a/libchinese-segmentation/cppjieba/DictTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DictTrie.hpp
@ -0,0 +1,234 @@
 #pragma once
 #include <iostream>
 #include <fstream>
 #include <map>
 #include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <limits>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"
 #include "DatTrie.hpp"
 #include <QDebug>
 namespace cppjieba {
 using namespace limonp;
 // NOTE(review): not referenced in the visible part of this header; presumably a
 // "very large weight" sentinel used elsewhere — confirm before removing.
 const double MAX_DOUBLE = 3.14e+100;
 // Number of space-separated fields LoadDefaultDict requires on each dictionary line.
 const size_t DICT_COLUMN_NUM = 3;
 // Tag assigned to a user-dictionary word until its line supplies one.
 const char* const UNKNOWN_TAG = "";
 /**
  * Word dictionary that loads a default dictionary plus optional user
  * dictionaries and answers lookups through an embedded DatTrie.
  *
  * Construction first tries to attach an existing cache file (keyed by the
  * MD5 of the dictionary files). When that fails, the dictionaries are parsed,
  * frequencies are turned into log weights, and a new cache file is built.
  */
 class DictTrie {
 public:
     /// Selects which weight of the default dictionary is given to user words
     /// that do not carry their own frequency.
     enum UserWordWeightOption {
         WordWeightMin,
         WordWeightMedian,
         WordWeightMax,
     }; // enum UserWordWeightOption

     /**
      * @param dict_path            Path of the default dictionary file.
      * @param user_dict_paths      User dictionary paths (split with "|;").
      * @param dat_cache_path       Cache file location; when empty a path
      *                             under /tmp derived from the MD5 is used.
      * @param user_word_weight_opt Default weight policy for user words.
      */
     DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
              UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
         Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
     }
     ~DictTrie() {}

     /// Looks up a single word; forwards to DatTrie::Find.
     const DatMemElem* Find(const string & word) const {
         return dat_.Find(word);
     }

     /// Fills `res` with the DAG entries for the rune range; forwards to DatTrie::Find.
     void FindDatDag(RuneStrArray::const_iterator begin,
               RuneStrArray::const_iterator end,
               vector<struct DatDag>&res,
               size_t max_word_len = MAX_WORD_LENGTH) const {
         dat_.Find(begin, end, res, max_word_len);
     }

     /// Fills `words` with word ranges for the rune range; forwards to DatTrie::Find.
     void FindWordRange(RuneStrArray::const_iterator begin,
               RuneStrArray::const_iterator end,
               vector<WordRange>& words,
               size_t max_word_len = MAX_WORD_LENGTH) const {
         dat_.Find(begin, end, words, max_word_len);
     }

     /// True when `word` was seen as a one-character entry of a user dictionary.
     bool IsUserDictSingleChineseWord(const Rune& word) const {
         return IsIn(user_dict_single_chinese_word_, word);
     }

     double GetMinWeight() const {
         return dat_.GetMinWeight();
     }

     /// Total number of bytes of the dictionary files hashed during Init.
     size_t GetTotalDictSize() const {
         return total_dict_size_;
     }

     /**
      * Parses one user-dictionary line and records it.
      *
      * Accepted layouts (fields separated by a space): "word", "word tag",
      * "word freq tag". With three fields the weight is log(freq / freq_sum_),
      * but only when freq_sum_ is positive; otherwise weight and tag keep
      * their defaults.
      *
      * @param line         The raw line.
      * @param saveNodeInfo When true the parsed node is appended to the list
      *                     used for building the trie; when false only the
      *                     single-character bookkeeping is updated.
      */
     void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
         vector<string> buf;
         DatElement node_info;
         Split(line, buf, " ");
         if (buf.size() == 0) {
             return;
         }
         node_info.word = buf[0];
         node_info.weight = user_word_default_weight_;
         node_info.tag = UNKNOWN_TAG;
         if (buf.size() == 2) {
             node_info.tag = buf[1];
         } else if (buf.size() == 3) {
             if (freq_sum_ > 0.0) {
                 const int freq = atoi(buf[1].c_str());
                 node_info.weight = log(1.0 * freq / freq_sum_);
                 node_info.tag = buf[2];
             }
         }
         if (saveNodeInfo) {
             static_node_infos_.push_back(node_info);
         }
         // Remember one-character user words so callers can query them later.
         if (Utf8CharNum(node_info.word) == 1) {
             RuneArray word;
             if (DecodeRunesInString(node_info.word, word)) {
                 user_dict_single_chinese_word_.insert(word[0]);
             } else {
                 XLOG(ERROR) << "Decode " << node_info.word << " failed.";
             }
         }
     }

     /**
      * Reads every user dictionary listed in `filePaths` (split with "|;")
      * and feeds each non-empty line to InserUserDictNode.
      */
     void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
         vector<string> files = limonp::Split(filePaths, "|;");
         for (size_t i = 0; i < files.size(); i++) {
             ifstream ifs(files[i].c_str());
             XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
             string line;
             for (; getline(ifs, line);) {
                 if (line.size() == 0) {
                     continue;
                 }
                 InserUserDictNode(line, saveNodeInfo);
             }
         }
     }
 private:
     /// Attaches the cache file when possible, otherwise builds it from the dictionaries.
     void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
               UserWordWeightOption user_word_weight_opt) {
         const auto dict_list = dict_path + "|" + user_dict_paths;
         size_t file_size_sum = 0;
         const string md5 = CalcFileListMD5(dict_list, file_size_sum);
         total_dict_size_ = file_size_sum;
         if (dat_cache_path.empty()) {
             // No cache location given: default to a file under /tmp.
             dat_cache_path = "/tmp/" + md5 + ".dat_";
         }
         dat_cache_path += VERSION;
         QString path = QString::fromStdString(dat_cache_path);
         qDebug() << "#########Dict path:" << path;
         if (dat_.InitAttachDat(dat_cache_path, md5)) {
             // Cache hit: user dictionaries are re-read only to rebuild
             // user_dict_single_chinese_word_; parsed nodes are discarded.
             LoadUserDict(user_dict_paths, false);
             return;
         }
         LoadDefaultDict(dict_path);
         freq_sum_ = CalcFreqSum(static_node_infos_);
         CalculateWeight(static_node_infos_, freq_sum_);
         double min_weight = 0;
         SetStaticWordWeights(user_word_weight_opt, min_weight);
         dat_.SetMinWeight(min_weight);
         LoadUserDict(user_dict_paths);
         const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
         assert(build_ret);
         (void)build_ret; // only consumed by the assert above
         // Swap with an empty vector to release the parsed nodes' memory.
         vector<DatElement>().swap(static_node_infos_);
     }

     /// Parses the default dictionary: every line must have DICT_COLUMN_NUM fields (word, frequency, tag).
     void LoadDefaultDict(const string& filePath) {
         ifstream ifs(filePath.c_str());
         XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
         string line;
         vector<string> buf;
         for (; getline(ifs, line);) {
             Split(line, buf, " ");
             XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
             DatElement node_info;
             node_info.word = buf[0];
             node_info.weight = atof(buf[1].c_str());
             node_info.tag = buf[2];
             static_node_infos_.push_back(node_info);
         }
     }

     static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
         return lhs.weight < rhs.weight;
     }

     /**
      * Derives min/median/max weight from the loaded nodes, stores the one
      * selected by `option` as the default user-word weight and reports the
      * minimum through `min_weight`.
      */
     void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
         XCHECK(!static_node_infos_.empty());
         vector<DatElement> x = static_node_infos_;
         sort(x.begin(), x.end(), WeightCompare);
         if(x.empty()){
             return;
         }
         min_weight = x[0].weight;
         const double max_weight_ = x[x.size() - 1].weight;
         const double median_weight_ = x[x.size() / 2].weight;
         switch (option) {
             case WordWeightMin:
                 user_word_default_weight_ = min_weight;
                 break;
             case WordWeightMedian:
                 user_word_default_weight_ = median_weight_;
                 break;
             default:
                 user_word_default_weight_ = max_weight_;
                 break;
         }
     }

     /// Sum of the raw frequencies stored in the nodes' weight field.
     double CalcFreqSum(const vector<DatElement>& node_infos) const {
         double sum = 0.0;
         for (size_t i = 0; i < node_infos.size(); i++) {
             sum += node_infos[i].weight;
         }
         return sum;
     }

     /// Replaces each raw frequency with log(frequency / sum).
     void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
         for (size_t i = 0; i < node_infos.size(); i++) {
             DatElement& node_info = node_infos[i];
             assert(node_info.weight > 0.0);
             node_info.weight = log(double(node_info.weight) / sum);
         }
     }
 private:
     vector<DatElement> static_node_infos_;
     size_t total_dict_size_ = 0;
     DatTrie dat_;
     // Both values are read by InserUserDictNode on the cache-attach path,
     // where nothing assigns them first, so they need defined defaults.
     double freq_sum_ = 0.0;
     double user_word_default_weight_ = 0.0;
     unordered_set<Rune> user_dict_single_chinese_word_;
 };
 }
--- a/libchinese-segmentation/cppjieba/FullSegment.hpp
+++ b/libchinese-segmentation/cppjieba/FullSegment.hpp
@ -0,0 +1,67 @@
 #pragma once
 #include <algorithm>
 #include <set>
 #include <cassert>
 #include "limonp/Logging.hpp"
 #include "segment-trie/segment-trie.h"
 //#include "DictTrie.hpp"
 #include "SegmentBase.hpp"
 #include "Unicode.hpp"
 namespace cppjieba {
 class FullSegment: public SegmentBase {
 public:
    FullSegment(const DictTrie* dictTrie)
        : dictTrie_(dictTrie) {
        assert(dictTrie_);
    }
    ~FullSegment() { }
    virtual void Cut(RuneStrArray::const_iterator begin,
                     RuneStrArray::const_iterator end,
                     vector<WordRange>& res, bool, size_t) const override {
        assert(dictTrie_);
        vector<struct DatDag> dags;
        dictTrie_->FindDatDag(begin, end, dags);
        size_t max_word_end_pos = 0;
        for (size_t i = 0; i < dags.size(); i++) {
            for (const auto & kv : dags[i].nexts) {
                const size_t nextoffset = kv.first - 1;
                assert(nextoffset < dags.size());
                const auto wordLen = nextoffset - i + 1;
                const bool is_not_covered_single_word = ((dags[i].nexts.size() == 1) && (max_word_end_pos <= i));
                const bool is_oov = (nullptr == kv.second); //Out-of-Vocabulary
                if ((is_not_covered_single_word) || ((not is_oov) && (wordLen >= 2))) {
                    WordRange wr(begin + i, begin + nextoffset);
                    res.push_back(wr);
                }
                max_word_end_pos = max(max_word_end_pos, nextoffset + 1);
            }
        }
    }
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                     size_t) const override {
        std::ignore = s;
        std::ignore = begin;
        std::ignore = end;
        std::ignore = res;
        std::ignore = hmm;
    }
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                     size_t) const override {
        std::ignore = s;
        std::ignore = begin;
        std::ignore = end;
        std::ignore = res;
        std::ignore = hmm;
    }
 private:
    const DictTrie* dictTrie_;
 };
 }
--- a/libchinese-segmentation/cppjieba/HMMModel.hpp
+++ b/libchinese-segmentation/cppjieba/HMMModel.hpp
@ -0,0 +1,158 @@
 #pragma once
 #include "limonp/StringUtil.hpp"
 //#define USE_CEDAR_SEGMENT //使用cedar初步测试性能损失3%-5%左右，内存占用降低近1M
 #ifdef USE_CEDAR_SEGMENT
 #include "cedar/cedar.h"
 #endif
 namespace cppjieba {
 using namespace limonp;
 // Emission table type used by HMMModel: a cedar::da keyed by the rune's
 // decimal string when USE_CEDAR_SEGMENT is defined, otherwise an
 // unordered_map from Rune to double.
 #ifdef USE_CEDAR_SEGMENT
 typedef cedar::da<float, -1, -2, false> EmitProbMap;
 #else
 typedef unordered_map<Rune, double> EmitProbMap;
 #endif
 struct HMMModel {
    /*
     * STATUS:
     * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
     * */
    enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
    HMMModel(const string& modelPath) {
        memset(startProb, 0, sizeof(startProb));
        memset(transProb, 0, sizeof(transProb));
        statMap[0] = 'B';
        statMap[1] = 'E';
        statMap[2] = 'M';
        statMap[3] = 'S';
        emitProbVec.push_back(&emitProbB);
        emitProbVec.push_back(&emitProbE);
        emitProbVec.push_back(&emitProbM);
        emitProbVec.push_back(&emitProbS);
        LoadModel(modelPath);
    }
    ~HMMModel() {
    }
    void LoadModel(const string& filePath) {
        ifstream ifile(filePath.c_str());
        XCHECK(ifile.is_open()) << "open " << filePath << " failed";
        string line;
        vector<string> tmp;
        vector<string> tmp2;
        //Load startProb
        XCHECK(GetLine(ifile, line));
        Split(line, tmp, " ");
        XCHECK(tmp.size() == STATUS_SUM);
        for (size_t j = 0; j < tmp.size(); j++) {
            startProb[j] = atof(tmp[j].c_str());
        }
        //Load transProb
        for (size_t i = 0; i < STATUS_SUM; i++) {
            XCHECK(GetLine(ifile, line));
            Split(line, tmp, " ");
            XCHECK(tmp.size() == STATUS_SUM);
            for (size_t j = 0; j < tmp.size(); j++) {
                transProb[i][j] = atof(tmp[j].c_str());
            }
        }
        //Load emitProbB
        XCHECK(GetLine(ifile, line));
        XCHECK(LoadEmitProb(line, emitProbB));
        //Load emitProbE
        XCHECK(GetLine(ifile, line));
        XCHECK(LoadEmitProb(line, emitProbE));
        //Load emitProbM
        XCHECK(GetLine(ifile, line));
        XCHECK(LoadEmitProb(line, emitProbM));
        //Load emitProbS
        XCHECK(GetLine(ifile, line));
        XCHECK(LoadEmitProb(line, emitProbS));
    }
    double GetEmitProb(const EmitProbMap* ptMp, Rune key,
                       double defVal)const {
 #ifdef USE_CEDAR_SEGMENT
        char str_key[8];
        snprintf(str_key, sizeof(str_key), "%d", key);
        float result = ptMp->exactMatchSearch<float>(str_key);
        return result < 0 ? defVal : result;
 #else
        EmitProbMap::const_iterator cit = ptMp->find(key);
        if (cit == ptMp->end()) {
            return defVal;
        }
        return cit->second;
 #endif
    }
    bool GetLine(ifstream& ifile, string& line) {
        while (getline(ifile, line)) {
            Trim(line);
            if (line.empty()) {
                continue;
            }
            if (StartsWith(line, "#")) {
                continue;
            }
            return true;
        }
        return false;
    }
    bool LoadEmitProb(const string& line, EmitProbMap& mp) {
        if (line.empty()) {
            return false;
        }
        vector<string> tmp, tmp2;
        RuneArray unicode;
        Split(line, tmp, ",");
        for (size_t i = 0; i < tmp.size(); i++) {
            Split(tmp[i], tmp2, ":");
            if (2 != tmp2.size()) {
                XLOG(ERROR) << "emitProb illegal.";
                return false;
            }
            if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
                XLOG(ERROR) << "TransCode failed.";
                return false;
            }
 #ifdef USE_CEDAR_SEGMENT
            char str_key[8];
            snprintf(str_key, sizeof(str_key), "%d", unicode[0]);
            mp.update(str_key, std::strlen(str_key), atof(tmp2[1].c_str()));
 #else
            mp[unicode[0]] = atof(tmp2[1].c_str());
 #endif
        }
        return true;
    }
    char statMap[STATUS_SUM];
    double startProb[STATUS_SUM];
    double transProb[STATUS_SUM][STATUS_SUM];
    EmitProbMap emitProbB;
    EmitProbMap emitProbE;
    EmitProbMap emitProbM;
    EmitProbMap emitProbS;
    vector<EmitProbMap* > emitProbVec;
 }; // struct HMMModel
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/HMMSegment.hpp
+++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp
@ -0,0 +1,206 @@
 #pragma once
 #include <iostream>
 #include <fstream>
 #include <memory.h>
 #include <cassert>
 #include "HMMModel.hpp"
 #include "SegmentBase.hpp"
 namespace cppjieba {
 // Very small score used by HMMSegment::Viterbi both as the initial path weight
 // and as the emission value for runes missing from the model.
 const double MIN_DOUBLE = -3.14e+100;
 /**
  * Segmenter driven by an HMMModel. Runs of runes >= 0x80 are split with the
  * Viterbi algorithm; ASCII runes are grouped by simple letter/number rules.
  */
 class HMMSegment: public SegmentBase {
 public:
     /// @param model Model providing start, transition and emission values.
     HMMSegment(const HMMModel* model)
         : model_(model) {
     }
     ~HMMSegment() { }

     /**
      * Appends the word ranges of [begin, end) to `res`.
      * The trailing bool and size_t arguments are ignored.
      */
     virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
                      size_t) const override {
         RuneStrArray::const_iterator left = begin;
         RuneStrArray::const_iterator right = begin;
         while (right != end) {
             if (right->rune < 0x80) { // ASCII
                 // Flush the pending non-ASCII run through the HMM first.
                 if (left != right) {
                     InternalCut(left, right, res);
                 }
                 left = right;
                 do {
                     // Returns `left` when it is not a letter, otherwise the
                     // position just past the letter/digit run.
                     right = SequentialLetterRule(left, end);
                     if (right != left) {
                         break;
                     }
                     // Returns `left` when it is not a digit, otherwise the
                     // position just past the number.
                     right = NumbersRule(left, end);
                     if (right != left) {
                         break;
                     }
                     // Any other ASCII rune forms a one-rune word.
                     right ++;
                 } while (false);
                 WordRange wr(left, right - 1);
                 res.push_back(wr);
                 left = right;
             } else {
                 right++;
             }
         }
         if (left != right) {
             InternalCut(left, right, res);
         }
     }

     /// Intentionally a no-op for this segmenter; all arguments are ignored.
     virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                      size_t) const override {
         std::ignore = s;
         std::ignore = begin;
         std::ignore = end;
         std::ignore = res;
         std::ignore = hmm;
     }

     /// Intentionally a no-op for this segmenter; all arguments are ignored.
     virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                      size_t) const override {
         std::ignore = s;
         std::ignore = begin;
         std::ignore = end;
         std::ignore = res;
         std::ignore = hmm;
     }
 private:
     /// A run starting with an ASCII letter and continuing with letters or
     /// digits. Returns `begin` unchanged when the first rune is not a letter.
     RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
                                                       RuneStrArray::const_iterator end) const {
         Rune x = begin->rune;
         if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
             begin ++;
         } else {
             return begin;
         }
         while (begin != end) {
             x = begin->rune;
             if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
                 begin ++;
             } else {
                 break;
             }
         }
         return begin;
     }

     /// A run starting with a digit and continuing with digits or '.'.
     /// Returns `begin` unchanged when the first rune is not a digit.
     RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
         Rune x = begin->rune;
         if ('0' <= x && x <= '9') {
             begin ++;
         } else {
             return begin;
         }
         while (begin != end) {
             x = begin->rune;
             if (('0' <= x && x <= '9') || x == '.') {
                 begin++;
             } else {
                 break;
             }
         }
         return begin;
     }

     /// Labels [begin, end) with Viterbi and closes a word after every E or S state.
     void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
         vector<size_t> status;
         Viterbi(begin, end, status);
         RuneStrArray::const_iterator left = begin;
         RuneStrArray::const_iterator right;
         for (size_t i = 0; i < status.size(); i++) {
             if (HMMModel::E == status[i] || HMMModel::S == status[i]) {
                 right = begin + i + 1;
                 WordRange wr(left, right - 1);
                 res.push_back(wr);
                 left = right;
             }
         }
     }

     /**
      * Computes the most likely state sequence for [begin, end).
      *
      * @param status Output; resized to the number of runes and filled with
      *               one HMMModel state per rune. Left empty for an empty range.
      *
      * The score tables live on the heap: their size grows with the input
      * length, so stack arrays could overflow the stack on long input.
      */
     void Viterbi(RuneStrArray::const_iterator begin,
                  RuneStrArray::const_iterator end,
                  vector<size_t>& status) const {
         const size_t Y = HMMModel::STATUS_SUM;
         const size_t X = end - begin;
         if (X == 0) {
             // Nothing to label; avoids dereferencing `begin` below.
             status.clear();
             return;
         }
         const size_t XYSize = X * Y;
         size_t now, old, stat;
         double tmp, endE, endS;
         // Both tables are laid out state-major: index = x + y * X.
         vector<int> path(XYSize);
         vector<double> weight(XYSize);
         // First column: start probability plus emission of the first rune.
         for (size_t y = 0; y < Y; y++) {
             weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
             path[0 + y * X] = -1;
         }
         double emitProb;
         for (size_t x = 1; x < X; x++) {
             for (size_t y = 0; y < Y; y++) {
                 now = x + y * X;
                 weight[now] = MIN_DOUBLE;
                 path[now] = HMMModel::E; // fallback when no predecessor beats MIN_DOUBLE
                 emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
                 for (size_t preY = 0; preY < Y; preY++) {
                     old = x - 1 + preY * X;
                     tmp = weight[old] + model_->transProb[preY][y] + emitProb;
                     if (tmp > weight[now]) {
                         weight[now] = tmp;
                         path[now] = preY;
                     }
                 }
             }
         }
         // A sequence can only finish in E or S; pick the better of the two.
         endE = weight[X - 1 + HMMModel::E * X];
         endS = weight[X - 1 + HMMModel::S * X];
         if (endE >= endS) {
             stat = HMMModel::E;
         } else {
             stat = HMMModel::S;
         }
         status.resize(X);
         // Walk the back-pointers from the last rune to the first.
         for (size_t x = X; x-- > 0;) {
             status[x] = stat;
             stat = path[x + stat * X];
         }
     }
     const HMMModel* model_;
 }; // class HMMSegment
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/IdfTrie.hpp
+++ b/libchinese-segmentation/cppjieba/IdfTrie.hpp
@ -0,0 +1,117 @@
 #pragma once
 #include <iostream>
 #include <fstream>
 #include <map>
 #include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <limits>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"
 #include "DatTrie.hpp"
 #include <QDebug>
 namespace cppjieba {
 using namespace limonp;
 // Number of space-separated fields LoadDefaultIdf requires on each line (word, idf).
 const size_t IDF_COLUMN_NUM = 2;
 /**
  * IDF dictionary answering lookups through an embedded DatTrie.
  *
  * Construction first tries to attach an existing cache file (keyed by the
  * MD5 of the IDF file); when that fails the IDF file is parsed, the average
  * IDF is computed and a new cache file is built.
  */
 class IdfTrie {
 public:
     enum UserWordWeightOption {
         WordWeightMin,
         WordWeightMedian,
         WordWeightMax,
     }; // enum UserWordWeightOption

     /**
      * @param dict_path            Path of the IDF file.
      * @param dat_cache_path       Cache file location; when empty a path
      *                             under /tmp derived from the MD5 is used.
      * @param user_word_weight_opt Accepted but not used by this class.
      */
     IdfTrie(const string& dict_path, const string & dat_cache_path = "",
              UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
         Init(dict_path, dat_cache_path, user_word_weight_opt);
     }
     ~IdfTrie() {}

     /// Looks up the IDF value of `word`; forwards to DatTrie::Find.
     double Find(const string & word, std::size_t length = 0, std::size_t node_pos = 0) const {
         return dat_.Find(word, length, node_pos);
     }

     /// Number of bytes of the IDF file hashed during Init.
     size_t GetTotalDictSize() const {
         return total_dict_size_;
     }
 private:
     /// Attaches the cache file when possible, otherwise builds it from the IDF file.
     void Init(const string& dict_path, string dat_cache_path,
               UserWordWeightOption user_word_weight_opt) {
         (void)user_word_weight_opt; // not consulted by this class
         size_t file_size_sum = 0;
         const string md5 = CalcFileListMD5(dict_path, file_size_sum);
         total_dict_size_ = file_size_sum;
         if (dat_cache_path.empty()) {
             // No cache location given: default to a file under /tmp.
             dat_cache_path = "/tmp/" + md5 + ".dat_";
         }
         dat_cache_path += VERSION;
         QString path = QString::fromStdString(dat_cache_path);
         qDebug() << "#########Idf path:" << path;
         if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
             // Cache hit: idfAverage_ is not recomputed here and keeps its
             // zero default.
             return;
         }
         LoadDefaultIdf(dict_path);
         const double idf_sum = CalcIdfSum(static_node_infos_);
         assert(static_node_infos_.size());
         idfAverage_ = idf_sum / static_node_infos_.size();
         assert(idfAverage_ > 0.0);
         double min_weight = 0;
         dat_.SetMinWeight(min_weight);
         const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
         assert(build_ret);
         (void)build_ret; // only consumed by the assert above
         // Swap with an empty vector to release the parsed nodes' memory.
         vector<IdfElement>().swap(static_node_infos_);
     }

     /**
      * Parses the IDF file into static_node_infos_. A file that cannot be
      * opened is silently ignored; empty lines are logged and skipped; every
      * other line must have IDF_COLUMN_NUM fields.
      */
     void LoadDefaultIdf(const string& filePath) {
         ifstream ifs(filePath.c_str());
         if(not ifs.is_open()){
             return ;
         }
         string line;
         vector<string> buf;
         size_t lineno = 0;
         for (; getline(ifs, line); lineno++) {
             if (line.empty()) {
                 XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
                 continue;
             }
             Split(line, buf, " ");
             XCHECK(buf.size() == IDF_COLUMN_NUM) << "split result illegal, line:" << line;
             IdfElement node_info;
             node_info.word = buf[0];
             node_info.idf = atof(buf[1].c_str());
             static_node_infos_.push_back(node_info);
         }
     }

     /// Sum of the idf field over all nodes.
     double CalcIdfSum(const vector<IdfElement>& node_infos) const {
         double sum = 0.0;
         for (size_t i = 0; i < node_infos.size(); i++) {
             sum += node_infos[i].idf;
         }
         return sum;
     }
 public:
     // Average IDF of the parsed file. Only computed when the trie is built
     // from the IDF file; stays 0.0 when an existing cache is attached.
     double idfAverage_ = 0.0;
 private:
     vector<IdfElement> static_node_infos_;
     size_t total_dict_size_ = 0;
     DatTrie dat_;
 };
 }
--- a/libchinese-segmentation/cppjieba/Jieba.hpp
+++ b/libchinese-segmentation/cppjieba/Jieba.hpp
@ -0,0 +1,99 @@
 #pragma once
 #include <memory>
 #include "QuerySegment.hpp"
 #include "KeywordExtractor.hpp"
 #include "segment-trie/segment-trie.h"
 namespace cppjieba {
 /**
  * Facade bundling one dictionary trie, one HMM model and the segmenters and
  * keyword extractor that share them.
  *
  * The segmenters receive pointers to dict_trie_ and model_, so those two
  * members must stay declared (and therefore constructed) before them.
  */
 class Jieba {
 public:
     /**
      * @param dict_path       Default dictionary, passed to the DictTrie.
      * @param model_path      HMM model file, passed to the HMMModel.
      * @param user_dict_path  User dictionary path(s), passed to the DictTrie.
      * @param idfPath         IDF file, passed to the keyword extractor.
      * @param stopWordPath    Stop-word file, passed to the mix/query
      *                        segmenters and the keyword extractor.
      * @param dat_cache_path  Cache location, passed to the DictTrie and the
      *                        keyword extractor.
      */
     Jieba(const string& dict_path,
           const string& model_path,
           const string& user_dict_path,
           const string& idfPath = "",
           const string& stopWordPath = "",
           const string& dat_cache_path = "")
         : dict_trie_(dict_path, user_dict_path, dat_cache_path),
           model_(model_path),
           mp_seg_(&dict_trie_),
           hmm_seg_(&model_),
           mix_seg_(&dict_trie_, &model_, stopWordPath),
           full_seg_(&dict_trie_),
           query_seg_(&dict_trie_, &model_, stopWordPath),
           extractor(&dict_trie_, &model_, idfPath, dat_cache_path, stopWordPath){ }
     ~Jieba() { }
     // Cut: segmentation through the mix segmenter; `hmm` is forwarded to it.
     void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
         mix_seg_.CutToStr(sentence, words, hmm);
     }
     void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
         mix_seg_.CutToWord(sentence, words, hmm);
     }
     // CutAll: segmentation through the full segmenter.
     void CutAll(const string& sentence, vector<string>& words) const {
         full_seg_.CutToStr(sentence, words);
     }
     void CutAll(const string& sentence, vector<Word>& words) const {
         full_seg_.CutToWord(sentence, words);
     }
     // CutForSearch: segmentation through the query segmenter; `hmm` is forwarded to it.
     void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
         query_seg_.CutToStr(sentence, words, hmm);
     }
     void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
         query_seg_.CutToWord(sentence, words, hmm);
     }
     // CutHMM: segmentation through the HMM-only segmenter.
     void CutHMM(const string& sentence, vector<string>& words) const {
         hmm_seg_.CutToStr(sentence, words);
     }
     void CutHMM(const string& sentence, vector<Word>& words) const {
         hmm_seg_.CutToWord(sentence, words);
     }
     // CutSmall: segmentation through the MP segmenter with hmm=false and the
     // caller-supplied maximum word length.
     void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
         mp_seg_.CutToStr(sentence, words, false, max_word_len);
     }
     void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
         mp_seg_.CutToWord(sentence, words, false, max_word_len);
     }
     // Tags the words of `sentence`; forwards to the mix segmenter.
     void Tag(const string& sentence, vector<pair<string, string> >& words) const {
         mix_seg_.Tag(sentence, words);
     }
     // Returns the tag of a single word; forwards to the mix segmenter.
     string LookupTag(const string &str) const {
         return mix_seg_.LookupTag(str);
     }
     // Applies the separator set `s` to the five segmenters below; the
     // keyword extractor is not touched.
     void ResetSeparators(const string& s) {
         //TODO
         mp_seg_.ResetSeparators(s);
         hmm_seg_.ResetSeparators(s);
         mix_seg_.ResetSeparators(s);
         full_seg_.ResetSeparators(s);
         query_seg_.ResetSeparators(s);
     }
     // Non-owning access to the shared dictionary trie.
     const DictTrie* GetDictTrie() const {
         return &dict_trie_;
     }
     // Non-owning access to the shared HMM model.
     const HMMModel* GetHMMModel() const {
         return &model_;
     }
 private:
     DictTrie dict_trie_;
     HMMModel model_;
     // They share the same dict trie and model
     MPSegment mp_seg_;
     HMMSegment hmm_seg_;
     MixSegment mix_seg_;
     FullSegment full_seg_;
     QuerySegment query_seg_;
 public:
     // Keyword extractor built on the same dictionary trie and model.
     KeywordExtractor extractor;
 }; // class Jieba
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
@ -0,0 +1,100 @@
 #pragma once
 #include <cmath>
 #include "MixSegment.hpp"
 //#include "IdfTrie.hpp"
 #include "idf-trie/idf-trie.h"
 namespace cppjieba {
 using namespace limonp;
 using namespace std;
 /*utf8*/
 class KeywordExtractor {
 public:
    KeywordExtractor(const DictTrie* dictTrie,
                     const HMMModel* model,
                     const string& idfPath,
                     const string& dat_cache_path,
                     const string& stopWordPath)
        : segment_(dictTrie, model, stopWordPath),
          idf_trie_(idfPath, dat_cache_path){
    }
    ~KeywordExtractor() {
    }
    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);
        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(topWords[i].word);
        }
    }
    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
        vector<KeyWord> topWords;
        Extract(sentence, topWords, topN);
        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
        }
    }
    void Extract(const string& sentence, vector<KeyWord>& keywords, size_t topN) const {
        unordered_map<string, KeyWord> wordmap;//插入字符串与Word的map，相同string统计词频叠加权重
        PreFilter pre_filter(symbols_, sentence);
        RuneStrArray::const_iterator null_p;
        WordRange range(null_p, null_p);
        bool isNull(false);
        while (pre_filter.Next(range, isNull)) {
            if (isNull) {
                continue;
            }
            segment_.CutToStr(sentence, range,  wordmap);
        }
        keywords.clear();
        keywords.reserve(wordmap.size());
        for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
            double idf = idf_trie_.Find(itr->first);
            if (-1 != idf) {//IDF词典查找
                itr->second.weight *= idf;
            } else {
                itr->second.weight *= idf_trie_.GetIdfAverage();
            }
            itr->second.word = itr->first;
            keywords.push_back(itr->second);
        }
        topN = min(topN, keywords.size());
        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
        keywords.resize(topN);
    }
 private:
    static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
        return lhs.weight > rhs.weight;
    }
    MixSegment segment_;
    IdfTrie idf_trie_;
    unordered_set<Rune> symbols_;
 }; // class KeywordExtractor
 /// Streams a KeyWord as a JSON-like object with "word", "offset" and "weight" fields.
 inline ostream& operator << (ostream& os, const KeyWord& word) {
     os << "{\"word\": \"" << word.word;
     os << "\", \"offset\": " << word.offsets;
     os << ", \"weight\": " << word.weight << "}";
     return os;
 }
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/MPSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MPSegment.hpp
@ -0,0 +1,133 @@
 #pragma once
 #include <algorithm>
 #include <set>
 #include <cassert>
 #include "limonp/Logging.hpp"
 #include "segment-trie/segment-trie.h"
 //#include "DictTrie.hpp"
 #include "SegmentTagged.hpp"
 #include "PosTagger.hpp"
 namespace cppjieba {
 /**
  * Segmenter that delegates cutting to the dictionary trie and tagging to a
  * PosTagger.
  */
 class MPSegment: public SegmentTagged {
 public:
     /// @param dictTrie Dictionary used for cutting; must not be null.
     MPSegment(const DictTrie* dictTrie)
         : dictTrie_(dictTrie) {
         assert(dictTrie_);
     }
     ~MPSegment() { }

     /// Appends the word ranges of [begin, end) to `words`, limited to
     /// `max_word_len`; the bool argument is ignored.
     virtual void Cut(RuneStrArray::const_iterator begin,
                      RuneStrArray::const_iterator end,
                      vector<WordRange>& words,
                      bool, size_t max_word_len) const override {
         dictTrie_->FindWordRange(begin, end, words, max_word_len);
     }

     /// Intentionally a no-op for this segmenter; all arguments are ignored.
     virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                      size_t) const override {
         (void)s;
         (void)begin;
         (void)end;
         (void)res;
         (void)hmm;
     }

     /// Intentionally a no-op for this segmenter; all arguments are ignored.
     virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                      size_t) const override {
         (void)s;
         (void)begin;
         (void)end;
         (void)res;
         (void)hmm;
     }

     const DictTrie* GetDictTrie() const override {
         return dictTrie_;
     }

     /// Tags the words of `src` using this segmenter; forwards to the PosTagger.
     bool Tag(const string& src, vector<pair<string, string> >& res) const override {
         return tagger_.Tag(src, res, *this);
     }

     /// True when `value` is a one-character word from a user dictionary.
     bool IsUserDictSingleChineseWord(const Rune& value) const {
         return dictTrie_->IsUserDictSingleChineseWord(value);
     }
 private:
     // The former CalcDP / CutByDag helpers were removed from this class:
     // their functionality has been integrated into the trie's Find function.
     const DictTrie* dictTrie_;
     PosTagger tagger_;
 }; // class MPSegment
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/MixSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MixSegment.hpp
@ -0,0 +1,276 @@
 #pragma once
 #include <cassert>
 #include "MPSegment.hpp"
 #include "HMMSegment.hpp"
 #include "limonp/StringUtil.hpp"
 #include "PosTagger.hpp"
 #define STOP_WORDS_USE_CEDAR_SEGMENT // With cedar, preliminary tests show roughly a 3%-5% speedup; the reduction in memory usage is not noticeable
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
 #include "cedar/cedar.h"
 #endif
 namespace cppjieba {
 class MixSegment: public SegmentTagged {
 public:
    // Builds a mixed segmenter: mpSeg_ is constructed from dictTrie, hmmSeg_ from
    // model, and the stop-word list is then loaded from stopWordPath via
    // LoadStopWordDict (which returns without error when the file cannot be opened).
    MixSegment(const DictTrie* dictTrie,
               const HMMModel* model,
               const string& stopWordPath)
        : mpSeg_(dictTrie), hmmSeg_(model) {
        LoadStopWordDict(stopWordPath);
    }
    // Nothing to release explicitly; members clean up after themselves.
    ~MixSegment() {}
    // Segments the runes from begin to end and appends the resulting word ranges to res.
    // With hmm == false the output is exactly what mpSeg_ produces. Otherwise every
    // maximal run of single-rune words that are not user-dictionary single characters
    // is re-cut by hmmSeg_, and all other words are copied through unchanged.
    // The trailing size_t parameter is unused.
    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
                     size_t) const override {
        if (!hmm) {
            mpSeg_.CutRuneArray(begin, end, res);
            return;
        }
        assert(end >= begin);
        vector<WordRange> mpWords;
        mpWords.reserve(end - begin);
        mpSeg_.CutRuneArray(begin, end, mpWords);
        vector<WordRange> hmmWords;
        hmmWords.reserve(end - begin);
        size_t idx = 0;
        while (idx < mpWords.size()) {
            const WordRange& cur = mpWords[idx];
            const bool isSingleRune = (cur.left == cur.right);
            // Multi-rune words and user-dictionary single characters are final.
            if (!isSingleRune || mpSeg_.IsUserDictSingleChineseWord(cur.left->rune)) {
                res.push_back(cur);
                ++idx;
                continue;
            }
            // Find the end of the run of single runes that are not in the user dictionary.
            size_t runEnd = idx;
            while (runEnd < mpWords.size() && mpWords[runEnd].left == mpWords[runEnd].right &&
                   !mpSeg_.IsUserDictSingleChineseWord(mpWords[runEnd].left->rune)) {
                ++runEnd;
            }
            assert(runEnd - 1 >= idx);
            // Let the HMM segmenter re-cut the whole run, then copy its output over.
            hmmSeg_.CutRuneArray(mpWords[idx].left, mpWords[runEnd - 1].left + 1, hmmWords);
            res.insert(res.end(), hmmWords.begin(), hmmWords.end());
            hmmWords.clear();
            // Continue with the first word after the run.
            idx = runEnd;
        }
    }
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                     size_t) const override {
        //目前hmm默认开启，后期如有需要关闭再修改--jxx20210519
 //        if (!hmm) {
 //            mpSeg_.CutRuneArray(begin, end, res);
 //            return;
 //        }
        std::ignore = hmm;
        vector<WordRange> words;
        assert(end >= begin);
        words.reserve(end - begin);
        mpSeg_.CutRuneArray(begin, end, words);
        vector<WordRange> hmmRes;
        hmmRes.reserve(end - begin);
        for (size_t i = 0; i < words.size(); i++) {
            //if mp Get a word, it's ok, put it into result
            if (words[i].left != words[i].right) {
                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
                continue;
            }
            if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
                    || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
                continue;
            }
            // if mp Get a single one and it is not in userdict, collect it in sequence
            size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里（i字符不是最后一个字符），直接判定j字符
            while (j < (words.size() - 1) && words[j].left == words[j].right &&
                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                j++;
            }
            // Cut the sequence with hmm
            assert(j - 1 >= i);
            // TODO
            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
            //put hmm result to result
            for (size_t k = 0; k < hmmRes.size(); k++) {
                res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
            }
            //clear tmp vars
            hmmRes.clear();
            //let i jump over this piece
            i = j - 1;
        }
    }
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                     size_t) const override {
        std::ignore = hmm;
        vector<WordRange> words;
        vector<WordRange> hmmRes;
        assert(end >= begin);
        if (3 == begin->len or 4 == begin->len) {
            words.reserve(end - begin);
            mpSeg_.CutRuneArray(begin, end, words);
            hmmRes.reserve(words.size());
        } else {
            hmmRes.reserve(end - begin);
        }
        if (words.size() != 0) {//存在中文分词结果
            for (size_t i = 0; i < words.size(); i++) {
                string str = GetStringFromRunes(s, words[i].left, words[i].right);
                if (words[i].left != words[i].right) {
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
                    if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
                        continue;
                    }
 #else
                    if (stopWords_.find(str) != stopWords_.end()) {
                        continue;
                    }
 #endif
                    res[str].offsets.push_back(words[i].left->offset);
                    res[str].weight += 1.0;
                    continue;
                }
                if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
                        || i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
                    if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
                        continue;
                    }
 #else
                    if (stopWords_.find(str) != stopWords_.end()) {
                        continue;
                    }
 #endif
                    res[str].offsets.push_back(words[i].left->offset);
                    res[str].weight += 1.0;
                    continue;
                }
                // if mp Get a single one and it is not in userdict, collect it in sequence
                size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里（i字符不是最后一个字符），直接判定j字符
                bool isLastWordsSingle(false);
                while (j <= (words.size() - 1)
                       && words[j].left == words[j].right
                       && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                    if (j == (words.size() - 1)) {//最后一个分词结果是单字
                        isLastWordsSingle = true;
                        break;
                    }
                    j++;
                }
                // Cut the sequence with hmm
                assert(j - 1 >= i);
                // TODO
                if (isLastWordsSingle) {
                    hmmSeg_.CutRuneArray(words[i].left, words[j].left + 1, hmmRes);
                } else {
                    hmmSeg_.CutRuneArray(words[i].left, words[j].left, hmmRes);
                }
                //put hmm result to result
                for (size_t k = 0; k < hmmRes.size(); k++) {
                    string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
                    if (0 < stopWords_.exactMatchSearch<int>(hmmStr.c_str(), hmmStr.size())) {
                        continue;
                    }
 #else
                    if (/*IsSingleWord(hmmStr) || */stopWords_.find(hmmStr) != stopWords_.end()) {
                        continue;
                    }
 #endif
                    res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
                    res[hmmStr].weight += 1.0;
                }
                //clear tmp vars
                hmmRes.clear();
                //let i jump over this piece
                if (isLastWordsSingle) {
                    break;
                }
                i = j - 1;
            }
        } else {//不存在中文分词结果
            for (size_t i = 0; i < (size_t)(end - begin); i++) {
                string str = s.substr((begin+i)->offset, (begin+i)->len);
                res[str].offsets.push_back((begin+i)->offset);
                res[str].weight += 1.0;
            }
        }
    }
    // Returns the dictionary trie held by the max-probability segmenter mpSeg_.
    const DictTrie* GetDictTrie() const override {
        return mpSeg_.GetDictTrie();
    }
    // Segments src and appends (word, tag) pairs to res; delegates to tagger_,
    // passing this segmenter for cutting and dictionary lookup.
    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
        return tagger_.Tag(src, res, *this);
    }
    // Returns the tag tagger_ assigns to the single word str.
    string LookupTag(const string &str) const {
        return tagger_.LookupTag(str, *this);
    }
    // Loads stop words from filePath, one word per line, into stopWords_.
    // Loading is best-effort: if the file cannot be opened the function returns and
    // leaves stopWords_ as it was.
    // Blank lines are skipped: an empty string can never equal a segmented word.
    // NOTE(review): upstream cedar reports an error and terminates the process when
    // update() receives a zero-length key -- confirm against the bundled cedar.h.
    void LoadStopWordDict(const string& filePath) {
        ifstream ifs(filePath.c_str());
        if (!ifs.is_open()) {
            return;
        }
        string line;
        while (getline(ifs, line)) {
            if (line.empty()) {
                continue;
            }
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
            stopWords_.update(line.c_str(), line.size(), 1);
 #else
            stopWords_.insert(line);
 #endif
        }
        assert(stopWords_.size());
    }
 private:
 #ifdef STOP_WORDS_USE_CEDAR_SEGMENT
    cedar::da<int, -1, -2, false> stopWords_;
 #else
    unordered_set<string> stopWords_;
 #endif
    MPSegment mpSeg_;
    HMMSegment hmmSeg_;
    PosTagger tagger_;
 }; // class MixSegment
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/PinYinTrie.hpp
+++ b/libchinese-segmentation/cppjieba/PinYinTrie.hpp
@ -0,0 +1,154 @@
 #pragma once
 #include <iostream>
 #include <fstream>
 #include <map>
 #include <string>
 #include <cstring>
 #include <cstdlib>
 #include <stdint.h>
 #include <cmath>
 #include <limits>
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"
 #include "DatTrie.hpp"
 #include <QDebug>
 namespace cppjieba {
 using namespace limonp;
 const size_t PINYIN_COLUMN_NUM = 2;
 class PinYinTrie {
 public:
    enum UserWordWeightOption {
        WordWeightMin,
        WordWeightMedian,
        WordWeightMax,
    }; // enum UserWordWeightOption
    // Builds the pinyin lookup from the dictionary file dict_path.
    // dat_cache_path names the on-disk cache file; when it is empty Init derives a
    // default location. All work is delegated to Init.
    PinYinTrie(const string& dict_path, const string & dat_cache_path = "",
             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
        Init(dict_path, dat_cache_path, user_word_weight_opt);
    }
    ~PinYinTrie() {}
    // Appends every pronunciation stored for word in the multi-tone table to results.
    // Returns 0 when word is in that table, otherwise -1 with results left untouched.
    int getMultiTonResults(string word, QStringList &results) {
        const QString key = QString::fromStdString(word);
        const auto entry = qmap_chinese2pinyin.constFind(key);
        if (entry == qmap_chinese2pinyin.constEnd()) {
            return -1;
        }
        for (const QString &pinyin : entry.value()) {
            results.push_back(pinyin);
        }
        return 0;
    }
    // Looks word up in the trie dat_ and, when found, stores the tag of the matching
    // element in result. Returns 0 on success, -1 when word is not in the trie
    // (result is then left unchanged).
    int getSingleTonResult(string word, QString &result) {
        const PinYinMemElem * found = dat_.PinYinFind(word);
        if (found == nullptr) {
            return -1;
        }
        result = QString::fromStdString(found->GetTag());
        return 0;
    }
    // Returns true when word is known to this dictionary, i.e. it is present either
    // in the multi-tone table or in the trie dat_.
    // PinYinFind returns a null pointer for unknown words, so the trie test must
    // check for a non-null result (the previous `!dat_.PinYinFind(word)` reported
    // unknown words as contained).
    bool contains(string &word) {
        if (qmap_chinese2pinyin.contains(QString::fromStdString(word))) {
            return true;
        }
        return dat_.PinYinFind(word) != nullptr;
    }
    bool isMultiTone(const string &word) {
        if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
            return true;
 //        if (map_chinese2pinyin.contains(word))
 //            return true;
        return false;
    }
    // Returns the file-size sum that CalcFileListMD5 reported for the dictionary
    // during Init.
    size_t GetTotalDictSize() const {
        return total_dict_size_;
    }
 private:
    // Initializes the lookup structures from dict_path.
    // The MD5 of the dictionary is computed first and handed to the cache routines
    // (presumably so a stale cache is detected -- confirm in DatTrie).
    // If InitPinYinAttachDat succeeds, only the multi-tone table is re-read from the
    // dictionary; otherwise all entries are loaded and the cache is rebuilt with
    // InitBuildDat.
    void Init(const string& dict_path, string dat_cache_path,
              UserWordWeightOption user_word_weight_opt) {
        size_t file_size_sum = 0;
        vector<PinYinElement> node_infos;
        const string md5 = CalcFileListMD5(dict_path, file_size_sum);
        total_dict_size_ = file_size_sum;
        if (dat_cache_path.empty()) {
            // No cache location given: default to a file under /tmp --jxx20200519
            // NOTE(review): a predictable file name in the world-writable /tmp can be
            // pre-created by another local user; consider a per-user cache directory.
            dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) +  ".dat_cache";
        }
        QString path = QString::fromStdString(dat_cache_path);
        qDebug() << "#########PinYin path:" << path << file_size_sum;
        if (dat_.InitPinYinAttachDat(dat_cache_path, md5)) {
            // The multi-tone table is kept only in memory, so the dictionary file
            // still has to be scanned for it (multiFlag = true skips all other lines).
            LoadDefaultPinYin(node_infos, dict_path, true);
            return;
        }
        LoadDefaultPinYin(node_infos, dict_path, false);
        double min_weight = 0;
        dat_.SetMinWeight(min_weight);
        const auto build_ret = dat_.InitBuildDat(node_infos, dat_cache_path, md5);
        // Checked only in builds where assert is active.
        assert(build_ret);
        // Swap with an empty vector to release the memory held by node_infos.
        vector<PinYinElement>().swap(node_infos);
    }
    // Reads the pinyin dictionary at filePath line by line.
    //  - A line with exactly PINYIN_COLUMN_NUM space-separated columns
    //    ("<pinyin> <word>") is appended to node_infos, unless multiFlag is true,
    //    in which case such lines are skipped.
    //  - Any other non-empty line is treated as a multi-tone entry: its last column
    //    (trimmed) is the key and all preceding columns are stored as the
    //    pronunciations in qmap_chinese2pinyin.
    // Returns without doing anything when the file cannot be opened. Empty lines are
    // logged and skipped.
    void LoadDefaultPinYin(vector<PinYinElement> &node_infos, const string& filePath, bool multiFlag) {
        ifstream ifs(filePath.c_str());
        if (!ifs.is_open()) {
            return;
        }
        string line;
        vector<string> buf;
        size_t lineno = 0;
        for (; getline(ifs, line); lineno++) {
            if (line.empty()) {
                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
                continue;
            }
            Split(line, buf, " ");
            if (buf.size() == PINYIN_COLUMN_NUM) {
                // Single-pronunciation entry; not needed when only the multi-tone
                // table is being rebuilt.
                if (multiFlag) {
                    continue;
                }
                PinYinElement node_info;
                node_info.word = buf[1];
                node_info.tag = buf[0];
                node_infos.push_back(node_info);
            } else {
                // Multi-tone entry: split once, use the last column as the key and
                // keep the remaining columns as the pronunciation list.
                QStringList parts = QString::fromUtf8(line.c_str()).split(" ");
                const QString hanzi = parts.last().trimmed();
                parts.pop_back();
                qmap_chinese2pinyin[hanzi] = parts;
            }
        }
    }
 private:
    QMap<QString, QStringList> qmap_chinese2pinyin;
    //map<string, list<string>> map_chinese2pinyin;
    size_t total_dict_size_ = 0;
    DatTrie dat_;
 };
 }
--- a/libchinese-segmentation/cppjieba/PosTagger.hpp
+++ b/libchinese-segmentation/cppjieba/PosTagger.hpp
@ -0,0 +1,84 @@
 #pragma once
 #include "limonp/StringUtil.hpp"
 #include "segment-trie/segment-trie.h"
 //#include "DictTrie.hpp"
 //#include "SegmentTagged.hpp"
 namespace cppjieba {
 using namespace limonp;
 static const char* const POS_M = "m";
 static const char* const POS_ENG = "eng";
 static const char* const POS_X = "x";
 class PosTagger {
 public:
    PosTagger() {
    }
    ~PosTagger() {
    }
    bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
        vector<string> CutRes;
        segment.CutToStr(src, CutRes);
        for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
            res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
        }
        return !res.empty();
    }
    string LookupTag(const string &str, const SegmentTagged& segment) const {
        const DictTrie * dict = segment.GetDictTrie();
        assert(dict != nullptr);
        const auto tmp = dict->Find(str);
        if (tmp == nullptr || tmp->GetTag().empty()) {
            RuneStrArray runes;
            if (!DecodeRunesInString(str, runes)) {
                XLOG(ERROR) << "Decode failed.";
                return POS_X;
            }
            return SpecialRule(runes);
        } else {
            return tmp->GetTag();
        }
    }
 private:
    const char* SpecialRule(const RuneStrArray& unicode) const {
        size_t m = 0;
        size_t eng = 0;
        for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
            if (unicode[i].rune < 0x80) {
                eng ++;
                if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
                    m++;
                }
            }
        }
        // ascii char is not found
        if (eng == 0) {
            return POS_X;
        }
        // all the ascii is number char
        if (m == eng) {
            return POS_M;
        }
        // the ascii chars contain english letter
        return POS_ENG;
    }
 }; // class PosTagger
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/PreFilter.hpp
+++ b/libchinese-segmentation/cppjieba/PreFilter.hpp
@ -0,0 +1,127 @@
 #pragma once
 #include "limonp/Logging.hpp"
 #include <unordered_set>
 #include "Unicode.hpp"
 namespace cppjieba {
 class PreFilter {
 public:
    // Decodes sentence into runes (a decode failure is logged, not raised) and
    // positions the cursor on the first rune.
    // symbols is stored by reference, so it must outlive this object.
    PreFilter(const std::unordered_set<Rune>& symbols,
              const string& sentence)
        : symbols_(symbols) {
        if (!DecodeRunesInString(sentence, sentence_)) {
            XLOG(ERROR) << "decode failed. "<<sentence;
        }
        cursor_ = sentence_.begin();
    }
    ~PreFilter() {
    }
    // True while the cursor has not reached the end of the decoded sentence.
    bool HasNext() const {
        return cursor_ != sentence_.end();
    }
    // Stores the next space-delimited piece in wordRange and advances the cursor
    // past it. Returns false, leaving wordRange untouched, when the cursor is already
    // at the end.
    // wordRange.left is the cursor position on entry, so any leading spaces are part
    // of the piece; wordRange.right is the first space after the following non-space
    // runes, or the end of the sentence.
    bool Next(WordRange& wordRange) {
        if (cursor_ == sentence_.end()) {
            return false;
        }
        wordRange.left = cursor_;
        // Skip leading spaces. The end check must come first so that the cursor is
        // never dereferenced at end().
        while (cursor_ != sentence_.end() && cursor_->rune == 0x20) {
            cursor_++;
        }
        if (cursor_ == sentence_.end()) {
            wordRange.right = cursor_;
            return true;
        }
        // cursor_ is on a non-space rune: advance to the next space or the end.
        while (++cursor_ != sentence_.end()) {
            if (cursor_->rune == 0x20) {
                wordRange.right = cursor_;
                return true;
            }
        }
        wordRange.right = sentence_.end();
        return true;
    }
    // Stores the next piece of the sentence in wordRange and advances the cursor.
    // Returns false when nothing is left: the cursor is at the end, or only spaces
    // remain (in that case wordRange.left has already been overwritten).
    //  - If the cursor is on a space, the piece is the whole run of spaces and isNull
    //    is set to true.
    //  - Otherwise the piece extends until the next space, until a rune whose UTF-8
    //    byte length differs from that of the first rune, or until 1024 runes have
    //    been collected, whichever comes first; isNull stays false.
    bool Next(WordRange& wordRange, bool& isNull) {
        isNull = false;
        if (cursor_ == sentence_.end()) {
            return false;
        }
        wordRange.left = cursor_;
        if (cursor_->rune == 0x20) {
            while (cursor_ != sentence_.end() && cursor_->rune == 0x20) {
                cursor_++;
            }
            if (cursor_ == sentence_.end()) {
                // Trailing spaces only: nothing further to report.
                return false;
            }
            wordRange.right = cursor_;
            isNull = true;
            return true;
        }
        int max_num = 0;
        const uint32_t utf8_num = cursor_->len;
        while (cursor_ != sentence_.end()) {
            if (cursor_->rune == 0x20) {
                wordRange.right = cursor_;
                return true;
            }
            cursor_ ++;
            max_num++;
            // TODO: the 1024-rune cap guards against overly long pieces; the value is
            // provisional. The cursor may have reached end() here, so check that
            // before reading len.
            if (max_num >= 1024
                    or (cursor_ != sentence_.end() and cursor_->len != utf8_num)) {
                wordRange.right = cursor_;
                return true;
            }
        }
        wordRange.right = sentence_.end();
        return true;
    }
    // Returns the next piece starting at the cursor and advances the cursor to its end.
    // A piece is either a single space, or the runes up to (not including) the next
    // space or the end of the sentence. Only the ASCII space is treated as a
    // separator here; symbols_ is not consulted.
    WordRange Next() {
        const RuneStrArray::const_iterator start = cursor_;
        for (; cursor_ != sentence_.end(); cursor_++) {
            if (cursor_->rune != 0x20) {
                continue;
            }
            // A space right at the start forms a one-rune piece of its own.
            if (cursor_ == start) {
                cursor_ ++;
            }
            return WordRange(start, cursor_);
        }
        return WordRange(start, sentence_.end());
    }
 private:
    RuneStrArray::const_iterator cursor_;
    RuneStrArray sentence_;
    const std::unordered_set<Rune>& symbols_;
 }; // class PreFilter
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/QuerySegment.hpp
+++ b/libchinese-segmentation/cppjieba/QuerySegment.hpp
@ -0,0 +1,89 @@
 #pragma once
 #include <algorithm>
 #include <set>
 #include <cassert>
 #include "limonp/Logging.hpp"
 #include "SegmentBase.hpp"
 #include "FullSegment.hpp"
 #include "MixSegment.hpp"
 #include "Unicode.hpp"
 namespace cppjieba {
 class QuerySegment: public SegmentBase {
 public:
    // Builds the embedded MixSegment from all three arguments and keeps dictTrie
    // (not owned) for the sub-word lookups done in Cut.
    QuerySegment(const DictTrie* dictTrie,
                 const HMMModel* model,
                 const string& stopWordPath)
        : mixSeg_(dictTrie, model, stopWordPath), trie_(dictTrie) {
    }
    ~QuerySegment() {
    }
    // Cuts the runes from begin to end with mixSeg_, then, for each resulting word,
    // appends to res every 2-rune and 3-rune sub-word found in the dictionary
    // (words with Length() > 2 and > 3 respectively), followed by the word itself.
    // The trailing size_t parameter is unused.
    // NOTE(review): WordRange::Length() in Unicode.hpp returns right - left, which is
    // one less than the rune count of the inclusive range. With that definition the
    // last n-gram of each word is never probed -- confirm this is intended.
    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
                     size_t) const override {
        //use mix Cut first
        vector<WordRange> mixRes;
        mixSeg_.CutRuneArray(begin, end, mixRes, hmm);
        // Appends every sub-word of `runeCount` runes inside `word` that the
        // dictionary knows.
        const auto appendSubWords = [this, &res](const WordRange& word, size_t runeCount) {
            for (size_t i = 0; i + (runeCount - 1) < word.Length(); i++) {
                const string text = EncodeRunesToString(word.left + i, word.left + i + runeCount);
                if (trie_->Find(text) != nullptr) {
                    // WordRange is inclusive, hence the -1 on the right bound.
                    res.push_back(WordRange(word.left + i, word.left + i + (runeCount - 1)));
                }
            }
        };
        for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
            if (mixResItr->Length() > 2) {
                appendSubWords(*mixResItr, 2);
            }
            if (mixResItr->Length() > 3) {
                appendSubWords(*mixResItr, 3);
            }
            res.push_back(*mixResItr);
        }
    }
    // Deliberately a no-op: QuerySegment does not provide the sentence-based cut.
    // The override exists only to satisfy the SegmentBase interface; res is left
    // untouched and every argument is ignored.
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                     size_t) const override {
        (void)s;
        (void)begin;
        (void)end;
        (void)res;
        (void)hmm;
    }
    // Deliberately a no-op: QuerySegment does not provide the keyword-counting cut.
    // The override exists only to satisfy the SegmentBase interface; res is left
    // untouched and every argument is ignored.
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                     size_t) const override {
        (void)s;
        (void)begin;
        (void)end;
        (void)res;
        (void)hmm;
    }
 private:
    bool IsAllAscii(const RuneArray& s) const {
        for (size_t i = 0; i < s.size(); i++) {
            if (s[i] >= 0x80) {
                return false;
            }
        }
        return true;
    }
    MixSegment mixSeg_;
    const DictTrie* trie_;
 }; // QuerySegment
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/SegmentBase.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentBase.hpp
@ -0,0 +1,94 @@
 #pragma once
 #include "limonp/Logging.hpp"
 #include "PreFilter.hpp"
 #include <cassert>
 namespace cppjieba {
 // Default separator set: space, tab, newline, plus the UTF-8 byte sequences of
 // U+FF0C (fullwidth comma) and U+3002 (ideographic full stop).
 const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
 using namespace limonp;
 class SegmentBase {
 public:
    // Installs the default separators; XCHECK flags a failure of ResetSeparators.
    SegmentBase() {
        XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
    }
    // Virtual so that derived segmenters can be destroyed through a base pointer.
    virtual ~SegmentBase() { }
    // Segments the runes from begin to end and appends the word ranges to res.
    // hmm and max_word_len are interpreted by the concrete segmenter.
    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
                     size_t max_word_len) const = 0;
    // Sentence-based cut methods: they produce their output directly from the
    // sentence s, to avoid storing intermediate variables and converting between
    // formats --jxx20210517
    // This overload appends the words as strings to res.
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                     size_t max_word_len) const = 0;
    // This overload accumulates per-word keyword data in res, keyed by the word.
    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                     size_t max_word_len) const = 0;
    //重写CutToStr函数，简化获取vector<string>& words的流程，降低内存占用--jxx20210517
    void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
                  size_t max_word_len = MAX_WORD_LENGTH) const {
        PreFilter pre_filter(symbols_, sentence);
        words.clear();
        words.reserve(sentence.size() / 2);//todo 参考源码，参数待定
        RuneStrArray::const_iterator null_p;
        WordRange range(null_p, null_p);
        while (pre_filter.Next(range)) {
            CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
        }
    }
    // Cuts one pre-computed piece of sentence: range.left and range.right are passed
    // to CutWithSentence as begin and end, and the words are appended to words.
    void CutToStr(const string& sentence, WordRange range, vector<string>& words, bool hmm = true,
                  size_t max_word_len = MAX_WORD_LENGTH) const {
        CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
    }
    // Same as above for the keyword-counting overload of CutWithSentence.
    void CutToStr(const string& sentence, WordRange range, unordered_map<string, KeyWord>& words, bool hmm = true,
                  size_t max_word_len = MAX_WORD_LENGTH) const {
        CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
    }
    void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
                   size_t max_word_len = MAX_WORD_LENGTH) const {
        PreFilter pre_filter(symbols_, sentence);
        vector<WordRange> wrs;
        wrs.reserve(sentence.size() / 2);
        while (pre_filter.HasNext()) {
            auto range = pre_filter.Next();
            Cut(range.left, range.right, wrs, hmm, max_word_len);
        }
        words.clear();
        words.reserve(wrs.size());
        GetWordsFromWordRanges(sentence, wrs, words);
        wrs.clear();
        vector<WordRange>().swap(wrs);
    }
    // Convenience wrapper around the virtual Cut that supplies default values for
    // hmm and max_word_len.
    void CutRuneArray(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res,
                      bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const {
        Cut(begin, end, res, hmm, max_word_len);
    }
    bool ResetSeparators(const string& s) {
        symbols_.clear();
        RuneStrArray runes;
        if (!DecodeRunesInString(s, runes)) {
            XLOG(ERROR) << "decode " << s << " failed";
            return false;
        }
        for (size_t i = 0; i < runes.size(); i++) {
            if (!symbols_.insert(runes[i].rune).second) {
                XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
                return false;
            }
        }
        return true;
    }
 protected:
    unordered_set<Rune> symbols_;
 }; // class SegmentBase
 } // cppjieba
--- a/libchinese-segmentation/cppjieba/SegmentTagged.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentTagged.hpp
@ -0,0 +1,21 @@
 #pragma once
 #include "SegmentBase.hpp"
 namespace cppjieba {
 // Interface for segmenters that can also tag words: adds tagging and access to the
 // underlying dictionary on top of SegmentBase.
 class SegmentTagged : public SegmentBase {
 public:
    SegmentTagged() {
    }
    virtual ~SegmentTagged() {
    }
    // Segments src and appends (word, tag) pairs to res.
    virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
    // Returns the dictionary trie used by the segmenter.
    virtual const DictTrie* GetDictTrie() const = 0;
 }; // class SegmentTagged
 } // cppjieba
--- a/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
@ -0,0 +1,205 @@
 #include <cmath>
 #include "Jieba.hpp"
 namespace cppjieba {
 using namespace limonp;
 using namespace std;
 class TextRankExtractor {
 public:
    // One extracted keyword.
    typedef struct _Word {
        string word;            // keyword text
        vector<size_t> offsets; // byte offsets of its occurrences in the sentence
        double weight;          // TextRank score
    }    Word; // struct Word
 private:
    typedef std::map<string, Word> WordMap;
    class WordGraph {
    private:
        typedef double Score;
        typedef string Node;
        typedef std::set<Node> NodeSet;
        typedef std::map<Node, double> Edges;
        typedef std::map<Node, Edges> Graph;
        //typedef std::unordered_map<Node,double> Edges;
        //typedef std::unordered_map<Node,Edges> Graph;
        double d;
        Graph graph;
        NodeSet nodeSet;
    public:
        WordGraph(): d(0.85) {};
        WordGraph(double in_d): d(in_d) {};
        void addEdge(Node start, Node end, double weight) {
            Edges temp;
            Edges::iterator gotEdges;
            nodeSet.insert(start);
            nodeSet.insert(end);
            graph[start][end] += weight;
            graph[end][start] += weight;
        }
        void rank(WordMap &ws, size_t rankTime = 10) {
            WordMap outSum;
            Score wsdef, min_rank, max_rank;
            if (graph.size() == 0) {
                return;
            }
            wsdef = 1.0 / graph.size();
            for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
                // edges->first start节点；edge->first end节点；edge->second 权重
                ws[edges->first].word = edges->first;
                ws[edges->first].weight = wsdef;
                outSum[edges->first].weight = 0;
                for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
                    outSum[edges->first].weight += edge->second;
                }
            }
            //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
            for (size_t i = 0; i < rankTime; i++) {
                for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
                    double s = 0;
                    for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
                        // edge->first end节点；edge->second 权重
                    {
                        s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
                    }
                    ws[*node].weight = (1 - d) + d * s;
                }
            }
            min_rank = max_rank = ws.begin()->second.weight;
            for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
                if (i->second.weight < min_rank) {
                    min_rank = i->second.weight;
                }
                if (i->second.weight > max_rank) {
                    max_rank = i->second.weight;
                }
            }
            for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
                ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
            }
        }
    };
 public:
    // Builds the extractor from a dictionary and an HMM model, loading the stop
    // words from stopWordPath.
    // MixSegment only offers a (dictTrie, model, stopWordPath) constructor, so the
    // stop-word path is forwarded to it as well.
    TextRankExtractor(const DictTrie* dictTrie,
                      const HMMModel* model,
                      const string& stopWordPath)
        : segment_(dictTrie, model, stopWordPath) {
        LoadStopWordDict(stopWordPath);
    }
    // Same, taking the dictionary and the model from an existing Jieba instance.
    TextRankExtractor(const Jieba& jieba, const string& stopWordPath)
        : segment_(jieba.GetDictTrie(), jieba.GetHMMModel(), stopWordPath) {
        LoadStopWordDict(stopWordPath);
    }
    ~TextRankExtractor() {
    }
    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
        vector<Word> topWords;
        Extract(sentence, topWords, topN);
        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(topWords[i].word);
        }
    }
    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
        vector<Word> topWords;
        Extract(sentence, topWords, topN);
        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
        }
    }
    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
        vector<string> words;
        segment_.CutToStr(sentence, words);
        TextRankExtractor::WordGraph graph;
        WordMap wordmap;
        size_t offset = 0;
        for (size_t i = 0; i < words.size(); i++) {
            size_t t = offset;
            offset += words[i].size();
            if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
                continue;
            }
            for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
                if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
                    skip++;
                    continue;
                }
                graph.addEdge(words[i], words[j], 1);
            }
            wordmap[words[i]].offsets.push_back(t);
        }
        if (offset != sentence.size()) {
            XLOG(ERROR) << "words illegal";
            return;
        }
        graph.rank(wordmap, rankTime);
        keywords.clear();
        keywords.reserve(wordmap.size());
        for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
            keywords.push_back(itr->second);
        }
        topN = min(topN, keywords.size());
        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
        keywords.resize(topN);
    }
 private:
    // Loads stop words from filePath, one per line, into stopWords_.
    // XCHECK flags a file that cannot be opened; the assert requires at least one
    // stop word afterwards.
    void LoadStopWordDict(const string& filePath) {
        ifstream ifs(filePath.c_str());
        XCHECK(ifs.is_open()) << "open " << filePath << " failed";
        string line ;
        while (getline(ifs, line)) {
            stopWords_.insert(line);
        }
        assert(stopWords_.size());
    }
    // Ordering predicate for partial_sort: higher weight first.
    static bool Compare(const Word &x, const Word &y) {
        return x.weight > y.weight;
    }
    MixSegment segment_;
    unordered_set<string> stopWords_;
 }; // class TextRankExtractor
 // Writes word as a JSON-like object with the keys "word", "offset" and "weight".
 // The word text is written as-is, without escaping.
 inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
           "}";
 }
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/Unicode.hpp
+++ b/libchinese-segmentation/cppjieba/Unicode.hpp
@ -0,0 +1,264 @@
 #pragma once
 #include <stdint.h>
 #include <stdlib.h>
 #include <string>
 #include <vector>
 #include <ostream>
 #include "limonp/LocalVector.hpp"
 #include "limonp/StringUtil.hpp"
 #include "common-struct.h"
 namespace cppjieba {
 using std::string;
 using std::vector;
 typedef uint32_t Rune;
 // Streams a Word as a JSON-like object holding its text and offset.
 inline std::ostream& operator << (std::ostream& os, const Word& w) {
    os << "{\"word\": \"" << w.word;
    os << "\", \"offset\": " << w.offset << "}";
    return os;
 }
 // Value record made of a weight and a short, NUL-terminated tag.
 struct DatMemElem {
    double weight = 0.0;
    char tag[8] = {};
    // Stores str in tag, keeping at most sizeof(tag) - 1 characters so that the
    // buffer always ends with a NUL byte.
    void SetTag(const string & str) {
        const size_t copy_len = std::min(str.size(), sizeof(tag) - 1);
        memset(tag, 0, sizeof(tag));
        strncpy(tag, str.c_str(), copy_len);
    }
    // Returns the stored tag (everything up to the first NUL byte).
    string GetTag() const {
        return string(tag);
    }
 };
 // Holds a list of (size_t, const DatMemElem*) pairs in `nexts`.
 // NOTE(review): presumably the outgoing edges of one DAG node (next position
 // plus the matched dictionary element) — confirm against the code that fills it.
 struct DatDag {
    limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
    //double max_weight;
    //size_t max_next;
 };
 struct RuneInfo {
    Rune rune;
    uint32_t offset;
    uint32_t len;
    uint32_t unicode_offset = 0;
    uint32_t unicode_length = 0;
    RuneInfo(): rune(0), offset(0), len(0) {
    }
    RuneInfo(Rune r, uint32_t o, uint32_t l)
        : rune(r), offset(o), len(l) {
    }
    RuneInfo(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
        : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }
 }; // struct RuneInfo
 // Streams a RuneInfo as a JSON-like object with its rune, offset and len.
 inline std::ostream& operator << (std::ostream& os, const RuneInfo& r) {
    os << "{\"rune\": \"" << r.rune;
    os << "\", \"offset\": " << r.offset;
    os << ", \"len\": " << r.len << "}";
    return os;
 }
 typedef limonp::LocalVector<Rune> RuneArray;
 typedef limonp::LocalVector<struct RuneInfo> RuneStrArray;
 // [left, right]
 struct WordRange {
    RuneStrArray::const_iterator left;
    RuneStrArray::const_iterator right;
    WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
        : left(l), right(r) {
    }
    size_t Length() const {
        return right - left;
    }
    bool IsAllAscii() const {
        for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
            if (iter->rune >= 0x80) {
                return false;
            }
        }
        return true;
    }
 }; // struct WordRange
 // Empties arr, then decodes s into it via limonp::Utf8ToUnicode32 and
 // forwards that call's result.
 inline bool DecodeRunesInString(const string& s, RuneArray& arr) {
    arr.clear();
    const bool decoded = limonp::Utf8ToUnicode32(s, arr);
    return decoded;
 }
 // Convenience overload returning the decoded runes by value; the success flag
 // of the underlying decode is deliberately not reported.
 inline RuneArray DecodeRunesInString(const string& s) {
    RuneArray runes;
    (void)DecodeRunesInString(s, runes);
    return runes;
 }
 // Decodes the UTF-8 string s into runes. For every code point the byte offset
 // and byte length are recorded, together with its position counted in code
 // points (unicode_offset) and a unicode_length of 1.
 // Returns false as soon as a lead byte cannot be decoded (value above 0xf7, or
 // a multi-byte sequence cut short by the end of the string); runes then holds
 // the code points decoded before the failure.
 inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
    uint32_t tmp;
    uint32_t offset = 0;      // byte offset of the code point being decoded
    uint32_t rune_index = 0;  // index of the code point, counted in code points
    runes.clear();
    uint32_t len(0);
    for (size_t i = 0; i < s.size();) {
      if (!(s.data()[i] & 0x80)) { // 0xxxxxxx
        // 7bit, total 7bit
        tmp = (uint8_t)(s.data()[i]) & 0x7f;
        i++;
        len = 1;
      } else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx
        // 5bit, total 5bit
        tmp = (uint8_t)(s.data()[i]) & 0x1f;
        // 6bit, total 11bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
        i += 2;
        len = 2;
      } else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx
        // 4bit, total 4bit
        tmp = (uint8_t)(s.data()[i]) & 0x0f;
        // 6bit, total 10bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
        // 6bit, total 16bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
        i += 3;
        len = 3;
      } else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx
        // 3bit, total 3bit
        tmp = (uint8_t)(s.data()[i]) & 0x07;
        // 6bit, total 9bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
        // 6bit, total 15bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
        // 6bit, total 21bit
        tmp <<= 6;
        tmp |= (uint8_t)(s.data()[i+3]) & 0x3f;
        i += 4;
        len = 4;
      } else {
        return false;
      }
      // unicode_offset must be the code-point index, not the byte cursor i
      // (which already points past this code point): word lengths are later
      // derived from differences of unicode_offset values.
      RuneInfo x(tmp, offset, len, rune_index, 1);
      runes.push_back(x);
      offset += len;
      ++rune_index;
    }
    return true;
 }
 // Iterator-like view over a RuneInfo array whose dereference yields the rune value.
 class RunePtrWrapper {
 public:
    const RuneInfo * m_ptr = nullptr;
 public:
    explicit RunePtrWrapper(const RuneInfo * p) : m_ptr(p) {}
    // Rune of the element currently pointed at.
    uint32_t operator *() {
        return (*m_ptr).rune;
    }
    // Moves to the next element. Note that the returned wrapper refers to the
    // new position, not to the position before the increment.
    RunePtrWrapper operator ++(int) {
        ++m_ptr;
        return RunePtrWrapper(m_ptr);
    }
    // Two wrappers differ when they point at different elements.
    bool operator !=(const RunePtrWrapper & b) const {
        return !(m_ptr == b.m_ptr);
    }
 };
 // Encodes the runes in [begin, end) with limonp::Unicode32ToUtf8 and returns
 // the resulting string.
 inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) {
    RunePtrWrapper first(begin);
    RunePtrWrapper last(end);
    string encoded;
    limonp::Unicode32ToUtf8(first, last, encoded);
    return encoded;
 }
 // Same as the value-returning overload, but hands the caller-supplied str to
 // limonp::Unicode32ToUtf8 as the output.
 inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
    RunePtrWrapper first(begin);
    RunePtrWrapper last(end);
    limonp::Unicode32ToUtf8(first, last, str);
 }
 // Minimal push_back sink: it discards the pushed values and only counts them.
 class Unicode32Counter {
 public :
    size_t length = 0;
    // Resets the count to zero.
    void clear() {
        length = 0;
    }
    // Counts one more value; the value itself is ignored.
    void push_back(uint32_t) {
        length += 1;
    }
 };
 inline size_t Utf8CharNum(const char * str, size_t length) {
    Unicode32Counter c;
    if (limonp::Utf8ToUnicode32(str, length, c)) {
        return c.length;
    }
    return 0;
 }
 // String overload: forwards the string's bytes and size to the buffer overload.
 inline size_t Utf8CharNum(const string & str) {
    const char * bytes = str.data();
    return Utf8CharNum(bytes, str.size());
 }
 // True when Utf8CharNum reports exactly one character for str.
 inline bool IsSingleWord(const string& str) {
    return 1 == Utf8CharNum(str);
 }
 // Builds the Word covering the inclusive rune range [left, right] of s.
 // The text is the substring from left's offset through the end of right; the
 // unicode length is derived from the two unicode_offset values.
 inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
    assert(right->offset >= left->offset);
    const uint32_t byte_begin = left->offset;
    const uint32_t byte_len = right->offset - byte_begin + right->len;
    const uint32_t unicode_begin = left->unicode_offset;
    const uint32_t unicode_length = right->unicode_offset - unicode_begin + right->unicode_length;
    return Word(s.substr(byte_begin, byte_len), byte_begin, unicode_begin, unicode_length);
 }
 // Returns the substring of s covered by the inclusive rune range [left, right].
 inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
    assert(right->offset >= left->offset);
    const uint32_t byte_len = right->offset - left->offset + right->len;
    return s.substr(left->offset, byte_len);
 }
 inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
    for (size_t i = 0; i < wrs.size(); i++) {
        words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
    }
 }
 inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<string>& words) {
    for (size_t i = 0; i < wrs.size(); i++) {
        words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right));
    }
 }
 // Resizes strs to words.size() and copies each word's text into the matching slot.
 inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
    const size_t count = words.size();
    strs.resize(count);
    for (size_t idx = 0; idx != count; ++idx) {
        strs[idx] = words[idx].word;
    }
 }
 const size_t MAX_WORD_LENGTH = 512;
 } // namespace cppjieba
--- a/libchinese-segmentation/cppjieba/cppjieba.pri
+++ b/libchinese-segmentation/cppjieba/cppjieba.pri
@ -0,0 +1,43 @@
 # qmake include file for the cppjieba sources.
 # Adds this directory to the include path of the including project.
 INCLUDEPATH += $$PWD
 # Headers contributed by this directory.
 HEADERS += \
    $$PWD/DictTrie.hpp \
    $$PWD/IdfTrie.hpp \
    $$PWD/PinYinTrie.hpp \
    $$PWD/FullSegment.hpp \
    $$PWD/HMMModel.hpp \
    $$PWD/HMMSegment.hpp \
    $$PWD/Jieba.hpp \
    $$PWD/KeywordExtractor.hpp \
    $$PWD/MPSegment.hpp \
    $$PWD/MixSegment.hpp \
    $$PWD/PosTagger.hpp \
    $$PWD/PreFilter.hpp \
    $$PWD/QuerySegment.hpp \
    $$PWD/SegmentBase.hpp \
    $$PWD/SegmentTagged.hpp \
    $$PWD/TextRankExtractor.hpp \
 #    $$PWD/Trie.hpp \
    $$PWD/Unicode.hpp \
    $$PWD/DatTrie.hpp \
    $$PWD/idf-trie/idf-trie.h \
    $$PWD/segment-trie/segment-trie.h
 # Dictionary data files listed as distribution files.
 DISTFILES += \
    dict/README.md \
    dict/hmm_model.utf8 \
    dict/idf.utf8 \
    dict/jieba.dict.utf8 \
    dict/pos_dict/char_state_tab.utf8 \
    dict/pos_dict/prob_emit.utf8 \
    dict/pos_dict/prob_start.utf8 \
    dict/pos_dict/prob_trans.utf8 \
    dict/stop_words.utf8 \
    dict/user.dict.utf8
    #dict/pinyinWithoutTone.txt \
 include(limonp/limonp.pri)
 # Translation units compiled from this directory.
 SOURCES += \
    $$PWD/idf-trie/idf-trie.cpp \
    $$PWD/segment-trie/segment-trie.cpp
--- a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp
+++ b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.cpp
@ -0,0 +1,96 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include "idf-trie.h"
 // Constructs the trie from a list of source files; both arguments are handed
 // to StorageBase before Init() runs.
 IdfTrie::IdfTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(file_paths, dat_cache_path)
 {
    Init();
 }
 // Single-file convenience constructor: wraps file_path in a one-element list
 // for StorageBase, then runs Init().
 IdfTrie::IdfTrie(string file_path, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(vector<string>(1, file_path), dat_cache_path)
 {
    Init();
 }
 // Builds the cache file dat_cache_file from the IDF dictionary.
 //
 // The data is first written to a temporary file created next to
 // dat_cache_file ("<dat_cache_file>_XXXXXX") and handed to tryRename at the end.
 // Order of the written data: the IdfCacheFileHeader, one double per accepted
 // dictionary line, then the trie array returned by GetDataTrieArray(). The
 // header fields after md5_hex are written last, by seeking back into the file.
 //
 // md5 is copied into the header; its length must equal sizeof(header.md5_hex).
 // Note that the dictionary is always read from IDF_DICT_PATH.
 void IdfTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
 {
    IdfCacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    double idf_sum(0), idf_average(0), tmp(0);
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // NOTE(review): umask() changes the mask of the whole process and the
    // previous value is not restored here — confirm this is intended.
    umask(S_IWGRP | S_IWOTH);
    // mkstemp rewrites the trailing XXXXXX of tmp_filepath in place.
    const int fd =mkstemp((char *)tmp_filepath.data());
    // NOTE(review): the descriptor is validated by assert only; the write()
    // calls below do not check for errors individually.
    assert(fd >= 0);
    fchmod(fd, 0644);
    // Placeholder header; its fields after md5_hex are overwritten further down.
    write_bytes = write(fd, (const char *)&header, sizeof(IdfCacheFileHeader));
    ifstream ifs(IDF_DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        // Skip comment lines and empty lines.
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        // Only lines that split into exactly two space-separated fields are used.
        limonp::Split(line, buf, " ");
        if (buf.size() != 2)
            continue;
        // The first field is registered in the trie with the running element
        // index; the second field is parsed as a double and appended to the file.
        this->Update(buf[0].c_str(), buf[0].size(), elements_num);
        offset += sizeof(double);
        elements_num++;
        tmp = atof(buf[1].c_str());
        write_bytes += write(fd, &tmp, sizeof(double));
        idf_sum += tmp;
    }
    // NOTE(review): elements_num is 0 when no line was accepted, which makes
    // this a floating-point division by zero.
    idf_average = idf_sum / elements_num;
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
    // Go back to just after md5_hex and fill in the remaining header fields.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    write(fd, &idf_average, sizeof(double));
    close(fd);
    // Header + element doubles + trie array must account for every byte counted.
    assert((size_t)write_bytes == sizeof(IdfCacheFileHeader) + offset + this->GetDataTrieTotalSize());
    tryRename(tmp_filepath, dat_cache_file);
 }
 // Looks key up with ExactMatchSearch and returns the element stored at the
 // resulting index, or -1 when the search result is negative.
 double IdfTrie::Find(const string &key) const
 {
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    if (index >= 0) {
        return this->GetElementPtr()[index];
    }
    return -1;
 }
 // Returns the idf_average field of the cache file header.
 double IdfTrie::GetIdfAverage() const
 {
    const auto & header = this->GetCacheFileHeaderPtr();
    return header->idf_average;
 }
--- a/libchinese-segmentation/cppjieba/idf-trie/idf-trie.h
+++ b/libchinese-segmentation/cppjieba/idf-trie/idf-trie.h
@ -0,0 +1,45 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef IdfTrie_H
 #define IdfTrie_H
 #include "storage-base.hpp"
 const char * const  IDF_DICT_PATH = DICT_INSTALL_PATH"/idf.utf8";
 // Cache file header for the IDF trie: the base header extended by one
 // idf_average value (0 until set).
 struct IdfCacheFileHeader : CacheFileHeaderBase
 {
    double idf_average = 0;
 };
 // Storage of double values keyed by string, built on
 // StorageBase<double, false, IdfCacheFileHeader>.
 class IdfTrie : public StorageBase<double, false, IdfCacheFileHeader>
 {
 public:
    // Construct from several source files, or from a single one.
    IdfTrie(const vector<string> file_paths, string dat_cache_path);
    IdfTrie(string file_path, string dat_cache_path);
    // Overrides the StorageBase hook of the same name; md5 is stored in the
    // cache header written to dat_cache_file.
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
    // Value stored for key, or -1 when key is not found.
    double Find(const string &key) const;
    // The idf_average field of the cache file header.
    double GetIdfAverage() const;
 private:
 };
 #endif // IdfTrie_H
--- a/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
@ -0,0 +1,70 @@
 /************************************
 * file enc : ascii
 * author   : wuyanyi09@gmail.com
 ************************************/
 #ifndef LIMONP_ARGV_FUNCTS_H
 #define LIMONP_ARGV_FUNCTS_H
 #include <set>
 #include <sstream>
 #include "StringUtil.hpp"
 namespace limonp {
 using namespace std;
 // Splits a command line into three groups:
 //   - positional arguments (anything not starting with "-"),
 //   - "-key value" pairs (a "-" argument followed by a non-"-" argument),
 //   - bare switches (a "-" argument not followed by a value).
 class ArgvContext {
 public :
  ArgvContext(int argc, const char* const * argv) {
    int i = 0;
    while(i < argc) {
      const char* const cur = argv[i];
      if(!StartsWith(cur, "-")) {
        args_.push_back(cur);
      } else if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
        // The following argument is the value of this key; consume it as well.
        mpss_[cur] = argv[i + 1];
        ++i;
      } else {
        sset_.insert(cur);
      }
      ++i;
    }
  }
  ~ArgvContext() {
  }
  friend ostream& operator << (ostream& os, const ArgvContext& args);
  // i-th positional argument, or an empty string when i is out of range.
  string operator [](size_t i) const {
    if(i >= args_.size()) {
      return string();
    }
    return args_[i];
  }
  // Value stored for key, or an empty string when key has no value.
  string operator [](const string& key) const {
    const map<string, string>::const_iterator found = mpss_.find(key);
    if(found == mpss_.end()) {
      return string();
    }
    return found->second;
  }
  // True when key was given either with a value or as a bare switch.
  bool HasKey(const string& key) const {
    return mpss_.count(key) != 0 || sset_.count(key) != 0;
  }
 private:
  vector<string> args_;
  map<string, string> mpss_;
  set<string> sset_;
 }; // class ArgvContext
 // Streams the positional arguments, the key/value pairs and the switches, in
 // that order.
 inline ostream& operator << (ostream& os, const ArgvContext& args) {
  os << args.args_;
  os << args.mpss_;
  os << args.sset_;
  return os;
 }
 } // namespace limonp
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
@ -0,0 +1,49 @@
 #ifndef LIMONP_BLOCKINGQUEUE_HPP
 #define LIMONP_BLOCKINGQUEUE_HPP
 #include <queue>
 #include "Condition.hpp"
 namespace limonp {
 // Unbounded FIFO queue guarded by mutex_; Pop() waits on notEmpty_ while the
 // queue is empty.
 template<class T>
 class BlockingQueue: NonCopyable {
 public:
  BlockingQueue()
    : mutex_(), notEmpty_(mutex_), queue_() {
  }
  // Appends x and signals one waiter.
  void Push(const T& x) {
    MutexLockGuard guard(mutex_);
    queue_.push(x);
    notEmpty_.Notify(); // Wait morphing saves us
  }
  // Removes and returns the oldest element, waiting until one is available.
  T Pop() {
    MutexLockGuard guard(mutex_);
    // re-test the condition after every wakeup, due to spurious wakeup
    for(; queue_.empty(); ) {
      notEmpty_.Wait();
    }
    assert(!queue_.empty());
    T front(queue_.front());
    queue_.pop();
    return front;
  }
  // Number of queued elements, read under the lock.
  size_t Size() const {
    MutexLockGuard guard(mutex_);
    return queue_.size();
  }
  bool Empty() const {
    return 0 == Size();
  }
 private:
  mutable MutexLock mutex_;
  Condition         notEmpty_;
  std::queue<T>     queue_;
 }; // class BlockingQueue
 } // namespace limonp
 #endif // LIMONP_BLOCKINGQUEUE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
@ -0,0 +1,67 @@
 #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
 #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
 #include "BoundedQueue.hpp"
 namespace limonp {
 // Fixed-capacity FIFO queue guarded by mutex_: Push() waits on notFull_ while
 // the queue is full, Pop() waits on notEmpty_ while it is empty.
 template<typename T>
 class BoundedBlockingQueue : NonCopyable {
 public:
  // maxSize is the capacity of the underlying BoundedQueue.
  explicit BoundedBlockingQueue(size_t maxSize)
    : mutex_(),
      notEmpty_(mutex_),
      notFull_(mutex_),
      queue_(maxSize) {
  }
  // Appends x, waiting until there is room, then signals one consumer.
  void Push(const T& x) {
    MutexLockGuard lock(mutex_);
    while (queue_.Full()) {
      notFull_.Wait();
    }
    assert(!queue_.Full());
    queue_.Push(x);
    notEmpty_.Notify();
  }
  // Removes and returns the oldest element, waiting until one is available,
  // then signals one producer.
  T Pop() {
    MutexLockGuard lock(mutex_);
    while (queue_.Empty()) {
      notEmpty_.Wait();
    }
    assert(!queue_.Empty());
    T res = queue_.Pop();
    notFull_.Notify();
    return res;
  }
  bool Empty() const {
    MutexLockGuard lock(mutex_);
    return queue_.Empty();
  }
  bool Full() const {
    MutexLockGuard lock(mutex_);
    return queue_.Full();
  }
  // Number of queued elements, read under the lock.
  // BoundedQueue spells its accessors Size()/Capacity(); calling the
  // lower-case names made these two members fail to compile once instantiated.
  size_t size() const {
    MutexLockGuard lock(mutex_);
    return queue_.Size();
  }
  // Capacity fixed at construction (a const member of the underlying queue,
  // hence read without the lock).
  size_t capacity() const {
    return queue_.Capacity();
  }
 private:
  mutable MutexLock          mutex_;
  Condition                  notEmpty_;
  Condition                  notFull_;
  BoundedQueue<T>  queue_;
 }; // class BoundedBlockingQueue
 } // namespace limonp
 #endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
@ -0,0 +1,65 @@
 #ifndef LIMONP_BOUNDED_QUEUE_HPP
 #define LIMONP_BOUNDED_QUEUE_HPP
 #include <vector>
 #include <fstream>
 #include <cassert>
 namespace limonp {
 using namespace std;
 // Fixed-capacity FIFO ring buffer. Pushing into a full queue and popping from
 // an empty one are guarded by assert only.
 template<class T>
 class BoundedQueue {
 public:
  // capacity is the maximum number of stored elements; it must be non-zero.
  explicit BoundedQueue(size_t capacity)
    : head_(0), tail_(0), size_(0), capacity_(capacity), circular_buffer_(capacity) {
    assert(capacity_);
  }
  ~BoundedQueue() {
  }
  // Forgets all stored elements by resetting the indexes and the size.
  void Clear() {
    head_ = tail_ = size_ = 0;
  }
  bool Empty() const {
    return size_ == 0;
  }
  bool Full() const {
    return size_ == capacity_;
  }
  size_t Size() const {
    return size_;
  }
  size_t Capacity() const {
    return capacity_;
  }
  // Stores t at the tail and advances the tail, wrapping at capacity_.
  void Push(const T& t) {
    assert(!Full());
    circular_buffer_[tail_] = t;
    ++size_;
    tail_ = (tail_ + 1) % capacity_;
  }
  // Returns a copy of the oldest element and advances the head, wrapping at
  // capacity_.
  T Pop() {
    assert(!Empty());
    const size_t front = head_;
    --size_;
    head_ = (front + 1) % capacity_;
    return circular_buffer_[front];
  }
 private:
  size_t head_;
  size_t tail_;
  size_t size_;
  const size_t capacity_;
  vector<T> circular_buffer_;
 }; // class BoundedQueue
 } // namespace limonp
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/Closure.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Closure.hpp
@ -0,0 +1,206 @@
 #ifndef LIMONP_CLOSURE_HPP
 #define LIMONP_CLOSURE_HPP
 namespace limonp {
 // Abstract unit of deferred work: Run() performs the bound call.
 class ClosureInterface {
 public:
  virtual ~ClosureInterface() {
  }
  virtual void Run() = 0;
 };
 // The ClosureN templates bind a callable that is invoked as (*fun)(args...),
 // e.g. a function pointer, together with N argument values stored by copy.
 // Members are set through initializer lists, so the bound types only need to
 // be copy-constructible (no default constructor or assignment required).
 template <class Funct>
 class Closure0: public ClosureInterface {
 public:
  Closure0(Funct fun)
    : fun_(fun) {
  }
  virtual ~Closure0() {
  }
  virtual void Run() {
    (*fun_)();
  }
 private:
  Funct fun_;
 };
 template <class Funct, class Arg1>
 class Closure1: public ClosureInterface {
 public:
  Closure1(Funct fun, Arg1 arg1)
    : fun_(fun), arg1_(arg1) {
  }
  virtual ~Closure1() {
  }
  virtual void Run() {
    (*fun_)(arg1_);
  }
 private:
  Funct fun_;
  Arg1 arg1_;
 };
 template <class Funct, class Arg1, class Arg2>
 class Closure2: public ClosureInterface {
 public:
  Closure2(Funct fun, Arg1 arg1, Arg2 arg2)
    : fun_(fun), arg1_(arg1), arg2_(arg2) {
  }
  virtual ~Closure2() {
  }
  virtual void Run() {
    (*fun_)(arg1_, arg2_);
  }
 private:
  Funct fun_;
  Arg1 arg1_;
  Arg2 arg2_;
 };
 template <class Funct, class Arg1, class Arg2, class Arg3>
 class Closure3: public ClosureInterface {
 public:
  Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
    : fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
  }
  virtual ~Closure3() {
  }
  virtual void Run() {
    (*fun_)(arg1_, arg2_, arg3_);
  }
 private:
  Funct fun_;
  Arg1 arg1_;
  Arg2 arg2_;
  Arg3 arg3_;
 };
 // The ObjClosureN templates bind a member function pointer, the object it is
 // called on (stored as a raw pointer, not owned) and N argument values.
 template <class Obj, class Funct>
 class ObjClosure0: public ClosureInterface {
 public:
  ObjClosure0(Obj* p, Funct fun)
    : p_(p), fun_(fun) {
  }
  virtual ~ObjClosure0() {
  }
  virtual void Run() {
    (p_->*fun_)();
  }
 private:
  Obj* p_;
  Funct fun_;
 };
 template <class Obj, class Funct, class Arg1>
 class ObjClosure1: public ClosureInterface {
 public:
  ObjClosure1(Obj* p, Funct fun, Arg1 arg1)
    : p_(p), fun_(fun), arg1_(arg1) {
  }
  virtual ~ObjClosure1() {
  }
  virtual void Run() {
    (p_->*fun_)(arg1_);
  }
 private:
  Obj* p_;
  Funct fun_;
  Arg1 arg1_;
 };
 template <class Obj, class Funct, class Arg1, class Arg2>
 class ObjClosure2: public ClosureInterface {
 public:
  ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2)
    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2) {
  }
  virtual ~ObjClosure2() {
  }
  virtual void Run() {
    (p_->*fun_)(arg1_, arg2_);
  }
 private:
  Obj* p_;
  Funct fun_;
  Arg1 arg1_;
  Arg2 arg2_;
 };
 template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
 class ObjClosure3: public ClosureInterface {
 public:
  ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
  }
  virtual ~ObjClosure3() {
  }
  virtual void Run() {
    (p_->*fun_)(arg1_, arg2_, arg3_);
  }
 private:
  Obj* p_;
  Funct fun_;
  Arg1 arg1_;
  Arg2 arg2_;
  Arg3 arg3_;
 };
 // NewClosure overloads: allocate the matching closure with new and return it
 // as a ClosureInterface*. Nothing in this header deletes the returned object;
 // releasing it is left to the code that receives the pointer.
 template<class R>
 ClosureInterface* NewClosure(R (*fun)()) {
  return new Closure0<R (*)()>(fun);
 }
 template<class R, class Arg1>
 ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
  return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
 }
 template<class R, class Arg1, class Arg2>
 ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
  return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
 }
 template<class R, class Arg1, class Arg2, class Arg3>
 ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
  return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
 }
 template<class R, class Obj>
 ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
  return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
 }
 template<class R, class Obj, class Arg1>
 ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
  return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
 }
 template<class R, class Obj, class Arg1, class Arg2>
 ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
  return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
 }
 template<class R, class Obj, class Arg1, class Arg2, class Arg3>
 ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
  return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
 }
 } // namespace limonp
 #endif // LIMONP_CLOSURE_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Colors.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Colors.hpp
@ -0,0 +1,31 @@
 #ifndef LIMONP_COLOR_PRINT_HPP
 #define LIMONP_COLOR_PRINT_HPP
 #include <string>
 #include <stdarg.h>
 namespace limonp {
 using std::string;
 // Color codes accepted by ColorPrintln, which formats the numeric value into
 // the "\033[0;%dm" escape sequence. Values run consecutively from BLACK = 30
 // to PURPLE = 35.
 enum Color {
  BLACK = 30,
  RED,
  GREEN,
  YELLOW,
  BLUE,
  PURPLE
 }; // enum Color
 // printf-style output wrapped in escape sequences: first "\033[0;<color>m",
 // then the formatted message, then "\033[0m" and a newline.
 static void ColorPrintln(enum Color color, const char * fmt, ...) {
  printf("\033[0;%dm", color);
  va_list args;
  va_start(args, fmt);
  vprintf(fmt, args);
  va_end(args);
  // the trailing \n is needed: without it, in some situations the following
  // lines are unexpectedly printed in the same color
  printf("\033[0m\n");
 }
 } // namespace limonp
 #endif // LIMONP_COLOR_PRINT_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Condition.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Condition.hpp
@ -0,0 +1,38 @@
 #ifndef LIMONP_CONDITION_HPP
 #define LIMONP_CONDITION_HPP
 #include "MutexLock.hpp"
 namespace limonp {
 // Wrapper around a pthread condition variable that is tied to one MutexLock.
 // The result of every pthread call is passed through XCHECK.
 class Condition : NonCopyable {
 public:
  // Keeps a reference to mutex (it must outlive this object) and initializes
  // the condition variable with default attributes.
  explicit Condition(MutexLock& mutex)
    : mutex_(mutex) {
    XCHECK(!pthread_cond_init(&pcond_, NULL));
  }
  ~Condition() {
    XCHECK(!pthread_cond_destroy(&pcond_));
  }
  // Waits on the condition variable using the pthread mutex of the associated
  // MutexLock.
  void Wait() {
    XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
  }
  // Signals the condition variable (pthread_cond_signal).
  void Notify() {
    XCHECK(!pthread_cond_signal(&pcond_));
  }
  // Broadcasts on the condition variable (pthread_cond_broadcast).
  void NotifyAll() {
    XCHECK(!pthread_cond_broadcast(&pcond_));
  }
 private:
  MutexLock& mutex_;
  pthread_cond_t pcond_;
 }; // class Condition
 } // namespace limonp
 #endif // LIMONP_CONDITION_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Config.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Config.hpp
@ -0,0 +1,103 @@
 /************************************
 * file enc : utf8
 * author   : wuyanyi09@gmail.com
 ************************************/
 #ifndef LIMONP_CONFIG_H
 #define LIMONP_CONFIG_H
 #include <map>
 #include <fstream>
 #include <iostream>
 #include <assert.h>
 #include "StringUtil.hpp"
 namespace limonp {
 using namespace std;
 class Config {
 public:
  explicit Config(const string& filePath) {
    LoadFile(filePath);
  }
  operator bool () {
    return !map_.empty();
  }
  string Get(const string& key, const string& defaultvalue) const {
    map<string, string>::const_iterator it = map_.find(key);
    if(map_.end() != it) {
      return it->second;
    }
    return defaultvalue;
  }
  int Get(const string& key, int defaultvalue) const {
    string str = Get(key, "");
    if("" == str) {
      return defaultvalue;
    }
    return atoi(str.c_str());
  }
  const char* operator [] (const char* key) const {
    if(NULL == key) {
      return NULL;
    }
    map<string, string>::const_iterator it = map_.find(key);
    if(map_.end() != it) {
      return it->second.c_str();
    }
    return NULL;
  }
  string GetConfigInfo() const {
    string res;
    res << *this;
    return res;
  }
 private:
  void LoadFile(const string& filePath) {
    ifstream ifs(filePath.c_str());
    assert(ifs);
    string line;
    vector<string> vecBuf;
    size_t lineno = 0;
    while(getline(ifs, line)) {
      lineno ++;
      Trim(line);
      if(line.empty() || StartsWith(line, "#")) {
        continue;
      }
      vecBuf.clear();
      Split(line, vecBuf, "=");
      if(2 != vecBuf.size()) {
        fprintf(stderr, "line[%s] illegal.\n", line.c_str());
        assert(false);
        continue;
      }
      string& key = vecBuf[0];
      string& value = vecBuf[1];
      Trim(key);
      Trim(value);
      if(!map_.insert(make_pair(key, value)).second) {
        fprintf(stderr, "key[%s] already exits.\n", key.c_str());
        assert(false);
        continue;
      }
    }
    ifs.close();
  }
  friend ostream& operator << (ostream& os, const Config& config);
  map<string, string> map_;
 }; // class Config
 // Streams the configuration by streaming its key/value map.
 inline ostream& operator << (ostream& os, const Config& config) {
  os << config.map_;
  return os;
 }
 } // namespace limonp
 #endif // LIMONP_CONFIG_H
--- a/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
@ -0,0 +1,74 @@
 #ifndef LIMONP_FILELOCK_HPP
 #define LIMONP_FILELOCK_HPP
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <string>
 #include <string.h>
 #include <assert.h>
 namespace limonp {
 using std::string;
 // Whole-file write lock on a file descriptor, taken with fcntl(F_SETLK).
 // Failures are not thrown: they clear Ok() and store the strerror text, which
 // Error() returns.
 class FileLock {
 public:
  FileLock() : fd_(-1), ok_(true) {
  }
  // Closes the descriptor if it is still open.
  ~FileLock() {
    Close();
  }
  // Opens (creating it if needed, mode 0644) the file used as the lock.
  // Must not be called while a descriptor is already open.
  void Open(const string& fname) {
    assert(fd_ == -1);
    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
    if(fd_ < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // Closes the descriptor once. fd_ is reset afterwards so that a repeated
  // Close() or the destructor cannot close the same number again; by then it
  // may have been reused for an unrelated file. Descriptor 0 is a valid
  // descriptor and is closed as well.
  void Close() {
    if(fd_ < 0) {
      return;
    }
    ::close(fd_);
    fd_ = -1;
  }
  // Tries to take the write lock; F_SETLK does not wait, so a lock held
  // elsewhere shows up as a failure in Ok()/Error().
  void Lock() {
    if(LockOrUnlock(fd_, true) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // Releases the lock.
  void UnLock() {
    if(LockOrUnlock(fd_, false) < 0) {
      ok_ = false;
      err_ = strerror(errno);
    }
  }
  // False once any operation has failed.
  bool Ok() const {
    return ok_;
  }
  // strerror text of the most recent failure (empty when none occurred).
  string Error() const {
    return err_;
  }
 private:
  // Applies F_WRLCK (lock == true) or F_UNLCK to the entire file and returns
  // the fcntl result.
  static int LockOrUnlock(int fd, bool lock) {
    errno = 0;
    struct flock f;
    memset(&f, 0, sizeof(f));
    f.l_type = (lock ? F_WRLCK : F_UNLCK);
    f.l_whence = SEEK_SET;
    f.l_start = 0;
    f.l_len = 0;        // Lock/unlock entire file
    return fcntl(fd, F_SETLK, &f);
  }
  int fd_;
  bool ok_;
  string err_;
 }; // class FileLock
 }// namespace limonp
 #endif // LIMONP_FILELOCK_HPP
--- a/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
@ -0,0 +1,7 @@
 #ifndef LIMONP_FORCE_PUBLIC_H
 #define LIMONP_FORCE_PUBLIC_H
 #define private public
 #define protected public
 #endif // LIMONP_FORCE_PUBLIC_H
--- a/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
@ -0,0 +1,142 @@
 #ifndef LIMONP_LOCAL_VECTOR_HPP
 #define LIMONP_LOCAL_VECTOR_HPP
 #include <iostream>
 #include <stdlib.h>
 #include <assert.h>
 #include <string.h>
 namespace limonp {
 using namespace std;
 /*
 * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
 * LocalVector<T> is simple and not well-tested.
 */
 const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
 // Vector with inline storage for LOCAL_VECTOR_BUFFER_SIZE elements that moves
 // to malloc'ed storage once it grows beyond that. Elements are copied with
 // memcpy and never constructed or destroyed individually.
 template <class T>
 class LocalVector {
 public:
  typedef const T* const_iterator ;
  typedef T value_type;
  typedef size_t size_type;
 private:
  T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; // inline storage
  T * ptr_;                            // current storage: buffer_ or a malloc'ed block
  size_t size_;
  size_t capacity_;
 public:
  LocalVector() {
    init_();
  };
  // Copy construction is implemented through the assignment operator.
  LocalVector(const LocalVector<T>& vec) {
    init_();
    *this = vec;
  }
  LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
    init_();
    while(begin != end) {
      push_back(*begin++);
    }
  }
  // Creates a vector holding `size` copies of t.
  LocalVector(size_t size, const T& t) { // TODO: make it faster
    init_();
    while(size--) {
      push_back(t);
    }
  }
  // Frees the heap block, if one is in use.
  ~LocalVector() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
  };
 public:
  // Replaces the content with a copy of vec. When vec lives in its inline
  // buffer the elements are copied into this object's inline buffer;
  // otherwise a heap block of vec.capacity() elements is allocated.
  LocalVector<T>& operator = (const LocalVector<T>& vec) {
      if(this == &vec){
          return *this;
      }
    clear();
    size_ = vec.size();
    capacity_ = vec.capacity();
    if(vec.buffer_ == vec.ptr_) {
      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
      ptr_ = buffer_;
    } else {
      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
      assert(ptr_);
      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
    }
    return *this;
  }
 private:
  // Resets to the empty state that uses the inline buffer. Does not free
  // anything; callers release a heap block beforehand.
  void init_() {
    ptr_ = buffer_;
    size_ = 0;
    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
  }
 public:
  // Unchecked element access.
  T& operator [] (size_t i) {
    return ptr_[i];
  }
  const T& operator [] (size_t i) const {
    return ptr_[i];
  }
  // Appends t, doubling the capacity when the storage is full.
  void push_back(const T& t) {
    if(size_ == capacity_) {
      assert(capacity_);
      reserve(capacity_ * 2);
    }
    ptr_[size_ ++ ] = t;
  }
  // Grows the storage to hold `size` elements; a request that does not exceed
  // the current capacity is ignored. The data always moves to a new malloc'ed
  // block.
  void reserve(size_t size) {
    if(size <= capacity_) {
      return;
    }
    T * next =  (T*)malloc(sizeof(T) * size);
    assert(next);
    T * old = ptr_;
    ptr_ = next;
    // NOTE(review): copies capacity_ elements rather than size_ elements.
    memcpy(ptr_, old, sizeof(T) * capacity_);
    capacity_ = size;
    if(old != buffer_) {
      free(old);
    }
  }
  bool empty() const {
    return 0 == size();
  }
  size_t size() const {
    return size_;
  }
  size_t capacity() const {
    return capacity_;
  }
  const_iterator begin() const {
    return ptr_;
  }
  const_iterator end() const {
    return ptr_ + size_;
  }
  // Releases a heap block if one is in use and returns to the empty inline state.
  void clear() {
    if(ptr_ != buffer_) {
      free(ptr_);
    }
    init_();
  }
 };
 // Streams the vector as a list of quoted elements, e.g. ["a", "b"]; an empty
 // vector is written as [].
 template <class T>
 ostream & operator << (ostream& os, const LocalVector<T>& vec) {
  const size_t count = vec.size();
  if(count == 0) {
    os << "[]";
    return os;
  }
  os << "[\"";
  for(size_t idx = 0; idx < count; ++idx) {
    if(idx != 0) {
      os << "\", \"";
    }
    os << vec[idx];
  }
  os << "\"]";
  return os;
 }
 }
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/Logging.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Logging.hpp
@ -0,0 +1,77 @@
 #ifndef LIMONP_LOGGING_HPP
 #define LIMONP_LOGGING_HPP
 #include <sstream>
 #include <iostream>
 #include <cassert>
 #include <cstdlib>
 #include <ctime>
 #ifdef XLOG
 #error "XLOG has been defined already"
 #endif // XLOG
 #ifdef XCHECK
 #error "XCHECK has been defined already"
 #endif // XCHECK
 // XLOG(LEVEL) builds a temporary limonp::Logger tagged with the call site and
 // yields its stream; the collected text is written to std::cerr when the
 // temporary is destroyed at the end of the full expression.
 #define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
 // XCHECK(exp) logs at FATAL level (which aborts) when exp is false. It is
 // written as if/else so that an `else` following the macro in caller code
 // cannot silently bind to the macro's own `if`.
 #define XCHECK(exp) if(exp) {} else XLOG(FATAL) << "exp: ["#exp << "] false. "
 namespace limonp {
 // Severity levels; the numeric value is also the index into LOG_LEVEL_ARRAY.
 enum {
  LL_DEBUG = 0,
  LL_INFO = 1,
  LL_WARNING = 2,
  LL_ERROR = 3,
  LL_FATAL = 4,
 }; // enum
 // Printable names, indexed by the LL_* values above.
 static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"};
 // One log record. The constructor writes the "time file:line LEVEL " prefix
 // into an internal stream, callers append the message through Stream(), and
 // the destructor flushes the record to std::cerr. A record at LL_FATAL calls
 // abort() after it has been written. When LOGGING_LEVEL is defined, records
 // below that level are neither prefixed nor printed.
 class Logger {
 public:
  // level:    one of the LL_* values (must be a valid LOG_LEVEL_ARRAY index).
  // filename: source file of the call site, printed verbatim.
  // lineno:   source line of the call site.
  Logger(size_t level, const char* filename, int lineno)
   : level_(level) {
 #ifdef LOGGING_LEVEL
     if (level_ < LOGGING_LEVEL) {
       return;
     }
 #endif
    // level_ is used as an index below, so it has to be strictly smaller than
    // the number of names in the table.
    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
    char buf[32];
    time_t now;
    time(&now);
    struct tm result;
    localtime_r(&now, &result);
    strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &result);
    stream_ << buf
      << " " << filename
      << ":" << lineno
      << " " << LOG_LEVEL_ARRAY[level_]
      << " ";
  }
  // Emits the record and terminates the process for FATAL records.
  ~Logger() {
 #ifdef LOGGING_LEVEL
     if (level_ < LOGGING_LEVEL) {
       return;
     }
 #endif
    std::cerr << stream_.str() << std::endl;
    if (level_ == LL_FATAL) {
      abort();
    }
  }
  // Stream that receives the message text of this record.
  std::ostream& Stream() {
    return stream_;
  }
 private:
  std::ostringstream stream_;
  size_t level_;
 }; // class Logger
 } // namespace limonp
 #endif // LIMONP_LOGGING_HPP
--- a/libchinese-segmentation/cppjieba/limonp/Md5.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Md5.hpp
@ -0,0 +1,415 @@
 /****************************************************************************
 **Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991
 **              2020, KylinSoft Co., Ltd.
 **All rights reserved.
 **
 **License to copy and use this software is granted provided that it
 **is identified as the "RSA Data Security, Inc. MD5 Message-Digest
 **Algorithm" in all material mentioning or referencing this software
 **or this function.
 **
 **License is also granted to make and use derivative works provided
 **that such works are identified as "derived from the RSA Data
 **Security, Inc. MD5 Message-Digest Algorithm" in all material
 **mentioning or referencing the derived work.
 **
 **RSA Data Security, Inc. makes no representations concerning either
 **the merchantability of this software or the suitability of this
 **software for any particular purpose. It is provided "as is"
 **without express or implied warranty of any kind.
 **
 **These notices must be retained in any copies of any part of this
 **documentation and/or software.
 **
 **
 **
 **The original md5 implementation avoids external libraries.
 **This version has dependency on stdio.h for file input and
 **string.h for memcpy.
 **
 ****************************************************************************/
 #ifndef __MD5_H__
 #define __MD5_H__
 #include <cstdio>
 #include <cstring>
 #include <iostream>
 namespace limonp {
 //#pragma region MD5 defines
 // Constants for MD5Transform routine.
 #define S11 7
 #define S12 12
 #define S13 17
 #define S14 22
 #define S21 5
 #define S22 9
 #define S23 14
 #define S24 20
 #define S31 4
 #define S32 11
 #define S33 16
 #define S34 23
 #define S41 6
 #define S42 10
 #define S43 15
 #define S44 21
 // F, G, H and I are basic MD5 functions.
 #define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
 #define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
 #define H(x, y, z) ((x) ^ (y) ^ (z))
 #define I(x, y, z) ((y) ^ ((x) | (~z)))
 // ROTATE_LEFT rotates x left n bits.
 #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
 // FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
 // Rotation is separate from addition to prevent recomputation.
 #define FF(a, b, c, d, x, s, ac) { \
  (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
 #define GG(a, b, c, d, x, s, ac) { \
  (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
 #define HH(a, b, c, d, x, s, ac) { \
  (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
 #define II(a, b, c, d, x, s, ac) { \
  (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
  (a) = ROTATE_LEFT ((a), (s)); \
  (a) += (b); \
  }
 //#pragma endregion
 typedef unsigned char BYTE ;
 // POINTER defines a generic pointer type
 typedef unsigned char *POINTER;
 // UINT2 defines a two byte word
 typedef unsigned short int UINT2;
 // UINT4 defines a four byte word
 typedef unsigned int UINT4;
 static unsigned char PADDING[64] = {
    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 // convenient object that wraps
 // the C-functions for use in C++ only
 // MD5 hasher. Typical use is one of the digest* helpers, which run
 // Init/Update/Final and return digestChars (the hex text of the digest).
 // The returned pointer refers to a member of this object, so it is only
 // valid while the object lives and is overwritten by the next digest call.
 class MD5 {
 private:
    struct __context_t {
        UINT4 state[4];                                   /* state (ABCD) */
        UINT4 count[2];        /* number of bits, modulo 2^64 (lsb first) */
        unsigned char buffer[64];                         /* input buffer */
    } context ;
    //#pragma region static helper functions
    // The core of the MD5 algorithm is here.
    // MD5 basic transformation. Transforms state based on block.
    // The 64-byte block is decoded into sixteen 32-bit words and mixed into
    // state through four rounds of sixteen steps each.
    static void MD5Transform(UINT4 state[4], unsigned char block[64]) {
        UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
        Decode(x, block, 64);
        /* Round 1 */
        FF(a, b, c, d, x[ 0], S11, 0xd76aa478);  /* 1 */
        FF(d, a, b, c, x[ 1], S12, 0xe8c7b756);  /* 2 */
        FF(c, d, a, b, x[ 2], S13, 0x242070db);  /* 3 */
        FF(b, c, d, a, x[ 3], S14, 0xc1bdceee);  /* 4 */
        FF(a, b, c, d, x[ 4], S11, 0xf57c0faf);  /* 5 */
        FF(d, a, b, c, x[ 5], S12, 0x4787c62a);  /* 6 */
        FF(c, d, a, b, x[ 6], S13, 0xa8304613);  /* 7 */
        FF(b, c, d, a, x[ 7], S14, 0xfd469501);  /* 8 */
        FF(a, b, c, d, x[ 8], S11, 0x698098d8);  /* 9 */
        FF(d, a, b, c, x[ 9], S12, 0x8b44f7af);  /* 10 */
        FF(c, d, a, b, x[10], S13, 0xffff5bb1);  /* 11 */
        FF(b, c, d, a, x[11], S14, 0x895cd7be);  /* 12 */
        FF(a, b, c, d, x[12], S11, 0x6b901122);  /* 13 */
        FF(d, a, b, c, x[13], S12, 0xfd987193);  /* 14 */
        FF(c, d, a, b, x[14], S13, 0xa679438e);  /* 15 */
        FF(b, c, d, a, x[15], S14, 0x49b40821);  /* 16 */
        /* Round 2 */
        GG(a, b, c, d, x[ 1], S21, 0xf61e2562);  /* 17 */
        GG(d, a, b, c, x[ 6], S22, 0xc040b340);  /* 18 */
        GG(c, d, a, b, x[11], S23, 0x265e5a51);  /* 19 */
        GG(b, c, d, a, x[ 0], S24, 0xe9b6c7aa);  /* 20 */
        GG(a, b, c, d, x[ 5], S21, 0xd62f105d);  /* 21 */
        GG(d, a, b, c, x[10], S22,  0x2441453);  /* 22 */
        GG(c, d, a, b, x[15], S23, 0xd8a1e681);  /* 23 */
        GG(b, c, d, a, x[ 4], S24, 0xe7d3fbc8);  /* 24 */
        GG(a, b, c, d, x[ 9], S21, 0x21e1cde6);  /* 25 */
        GG(d, a, b, c, x[14], S22, 0xc33707d6);  /* 26 */
        GG(c, d, a, b, x[ 3], S23, 0xf4d50d87);  /* 27 */
        GG(b, c, d, a, x[ 8], S24, 0x455a14ed);  /* 28 */
        GG(a, b, c, d, x[13], S21, 0xa9e3e905);  /* 29 */
        GG(d, a, b, c, x[ 2], S22, 0xfcefa3f8);  /* 30 */
        GG(c, d, a, b, x[ 7], S23, 0x676f02d9);  /* 31 */
        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a);  /* 32 */
        /* Round 3 */
        HH(a, b, c, d, x[ 5], S31, 0xfffa3942);  /* 33 */
        HH(d, a, b, c, x[ 8], S32, 0x8771f681);  /* 34 */
        HH(c, d, a, b, x[11], S33, 0x6d9d6122);  /* 35 */
        HH(b, c, d, a, x[14], S34, 0xfde5380c);  /* 36 */
        HH(a, b, c, d, x[ 1], S31, 0xa4beea44);  /* 37 */
        HH(d, a, b, c, x[ 4], S32, 0x4bdecfa9);  /* 38 */
        HH(c, d, a, b, x[ 7], S33, 0xf6bb4b60);  /* 39 */
        HH(b, c, d, a, x[10], S34, 0xbebfbc70);  /* 40 */
        HH(a, b, c, d, x[13], S31, 0x289b7ec6);  /* 41 */
        HH(d, a, b, c, x[ 0], S32, 0xeaa127fa);  /* 42 */
        HH(c, d, a, b, x[ 3], S33, 0xd4ef3085);  /* 43 */
        HH(b, c, d, a, x[ 6], S34,  0x4881d05);  /* 44 */
        HH(a, b, c, d, x[ 9], S31, 0xd9d4d039);  /* 45 */
        HH(d, a, b, c, x[12], S32, 0xe6db99e5);  /* 46 */
        HH(c, d, a, b, x[15], S33, 0x1fa27cf8);  /* 47 */
        HH(b, c, d, a, x[ 2], S34, 0xc4ac5665);  /* 48 */
        /* Round 4 */
        II(a, b, c, d, x[ 0], S41, 0xf4292244);  /* 49 */
        II(d, a, b, c, x[ 7], S42, 0x432aff97);  /* 50 */
        II(c, d, a, b, x[14], S43, 0xab9423a7);  /* 51 */
        II(b, c, d, a, x[ 5], S44, 0xfc93a039);  /* 52 */
        II(a, b, c, d, x[12], S41, 0x655b59c3);  /* 53 */
        II(d, a, b, c, x[ 3], S42, 0x8f0ccc92);  /* 54 */
        II(c, d, a, b, x[10], S43, 0xffeff47d);  /* 55 */
        II(b, c, d, a, x[ 1], S44, 0x85845dd1);  /* 56 */
        II(a, b, c, d, x[ 8], S41, 0x6fa87e4f);  /* 57 */
        II(d, a, b, c, x[15], S42, 0xfe2ce6e0);  /* 58 */
        II(c, d, a, b, x[ 6], S43, 0xa3014314);  /* 59 */
        II(b, c, d, a, x[13], S44, 0x4e0811a1);  /* 60 */
        II(a, b, c, d, x[ 4], S41, 0xf7537e82);  /* 61 */
        II(d, a, b, c, x[11], S42, 0xbd3af235);  /* 62 */
        II(c, d, a, b, x[ 2], S43, 0x2ad7d2bb);  /* 63 */
        II(b, c, d, a, x[ 9], S44, 0xeb86d391);  /* 64 */
        state[0] += a;
        state[1] += b;
        state[2] += c;
        state[3] += d;
        // Zeroize sensitive information.
        memset((POINTER)x, 0, sizeof(x));
    }
    // Encodes input (UINT4) into output (unsigned char). Assumes len is
    // a multiple of 4. Each word is written least-significant byte first.
    static void Encode(unsigned char *output, UINT4 *input, unsigned int len) {
        unsigned int i, j;
        for(i = 0, j = 0; j < len; i++, j += 4) {
            output[j] = (unsigned char)(input[i] & 0xff);
            output[j + 1] = (unsigned char)((input[i] >> 8) & 0xff);
            output[j + 2] = (unsigned char)((input[i] >> 16) & 0xff);
            output[j + 3] = (unsigned char)((input[i] >> 24) & 0xff);
        }
    }
    // Decodes input (unsigned char) into output (UINT4). Assumes len is
    // a multiple of 4. Each word is read least-significant byte first.
    static void Decode(UINT4 *output, unsigned char *input, unsigned int len) {
        unsigned int i, j;
        for(i = 0, j = 0; j < len; i++, j += 4)
            output[i] = ((UINT4)input[j]) | (((UINT4)input[j + 1]) << 8) |
                        (((UINT4)input[j + 2]) << 16) | (((UINT4)input[j + 3]) << 24);
    }
    //#pragma endregion
 public:
    // MAIN FUNCTIONS
    MD5() {
        Init() ;
    }
    // MD5 initialization. Begins an MD5 operation, writing a new context.
    // Only count and state are reset here; buffer is left as it was.
    void Init() {
        context.count[0] = context.count[1] = 0;
        // Load magic initialization constants.
        context.state[0] = 0x67452301;
        context.state[1] = 0xefcdab89;
        context.state[2] = 0x98badcfe;
        context.state[3] = 0x10325476;
    }
    // MD5 block update operation. Continues an MD5 message-digest
    // operation, processing another message block, and updating the
    // context.
    void Update(
        unsigned char *input,   // input block
        unsigned int inputLen) {  // length of input block
        unsigned int i, index, partLen;
        // Compute number of bytes mod 64
        index = (unsigned int)((context.count[0] >> 3) & 0x3F);
        // Update number of bits
        // (the comparison detects wrap-around of the low word and carries into the high word)
        if((context.count[0] += ((UINT4)inputLen << 3))
                < ((UINT4)inputLen << 3))
            context.count[1]++;
        context.count[1] += ((UINT4)inputLen >> 29);
        partLen = 64 - index;
        // Transform as many times as possible.
        if(inputLen >= partLen) {
            // Complete the partially filled buffer first, then hash whole
            // 64-byte blocks straight from the input.
            memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
            MD5Transform(context.state, context.buffer);
            for(i = partLen; i + 63 < inputLen; i += 64)
                MD5Transform(context.state, &input[i]);
            index = 0;
        } else
            i = 0;
        /* Buffer remaining input */
        memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen - i);
    }
    // MD5 finalization. Ends an MD5 message-digest operation, writing
    // the message digest and zeroizing the context.
    // Writes to digestRaw and, through writeToString(), to digestChars.
    void Final() {
        unsigned char bits[8];
        unsigned int index, padLen;
        // Save number of bits
        Encode(bits, context.count, 8);
        // Pad out to 56 mod 64.
        index = (unsigned int)((context.count[0] >> 3) & 0x3f);
        padLen = (index < 56) ? (56 - index) : (120 - index);
        Update(PADDING, padLen);
        // Append length (before padding)
        Update(bits, 8);
        // Store state in digest
        Encode(digestRaw, context.state, 16);
        // Zeroize sensitive information.
        memset((POINTER)&context, 0, sizeof(context));
        writeToString() ;
    }
    /// Buffer must be 32+1 (nul) = 33 chars long at least
    // Formats digestRaw as 32 lowercase hex characters into digestChars.
    void writeToString() {
        int pos ;
        for(pos = 0 ; pos < 16 ; pos++)
            sprintf(digestChars + (pos * 2), "%02x", digestRaw[pos]) ;
    }
 public:
    // an MD5 digest is a 16-byte number (32 hex digits)
    BYTE digestRaw[ 16 ] ;
    // This version of the digest is actually
    // a "printf'd" version of the digest.
    char digestChars[ 33 ] ;
    /// Load a file from disk and digest it
    // Digests a file and returns the result.
    // Returns NULL when filename is NULL or empty, or when the file cannot
    // be opened; otherwise returns digestChars.
    const char* digestFile(const char *filename) {
        if(NULL == filename || strcmp(filename, "") == 0)
            return NULL;
        Init() ;
        FILE *file;
        unsigned char buffer[1024] ;
        if((file = fopen(filename, "rb")) == NULL) {
            return NULL;
        }
        int len;
        // fread returns 0 at end of file and on a read error; both end the loop.
        while((len = fread(buffer, 1, 1024, file)))
            Update(buffer, len) ;
        Final();
        fclose(file);
        return digestChars ;
    }
    /// Digests a byte-array already in memory
    // Returns NULL when memchunk is NULL; otherwise returns digestChars.
    // NOTE(review): len is passed on as unsigned int, so a negative len would
    // be treated as a very large length - confirm callers never pass one.
    const char* digestMemory(BYTE *memchunk, int len) {
        if(NULL == memchunk)
            return NULL;
        Init() ;
        Update(memchunk, len) ;
        Final() ;
        return digestChars ;
    }
    // Digests a NUL-terminated string and returns digestChars (nothing is
    // printed); returns NULL when string is NULL.
    const char* digestString(const char *string) {
        if(string == NULL)
            return NULL;
        Init() ;
        Update((unsigned char*)string, strlen(string)) ;
        Final() ;
        return digestChars ;
    }
 };
 inline bool md5String(const char* str, std::string& res) {
    if(NULL == str) {
        res = "";
        return false;
    }
    MD5 md5;
    const char *pRes = md5.digestString(str);
    if(NULL == pRes) {
        res = "";
        return false;
    }
    res = pRes;
    return true;
 }
 inline bool md5File(const char* filepath, std::string& res) {
    if(NULL == filepath || strcmp(filepath, "") == 0) {
        res = "";
        return false;
    }
    MD5 md5;
    const char *pRes = md5.digestFile(filepath);
    if(NULL == pRes) {
        res = "";
        return false;
    }
    res = pRes;
    return true;
 }
 }
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
@ -0,0 +1,51 @@
 #ifndef LIMONP_MUTEX_LOCK_HPP
 #define LIMONP_MUTEX_LOCK_HPP
 #include <pthread.h>
 #include "NonCopyable.hpp"
 #include "Logging.hpp"
 namespace limonp {
 // Owns one pthread mutex. Every pthread call is wrapped in XCHECK, so a
 // non-zero return code is reported through the FATAL logging path.
 // Lock() and Unlock() are private: locking is only possible through the
 // friend class MutexLockGuard.
 class MutexLock: NonCopyable {
 public:
  // Initializes the mutex with default attributes (NULL attribute pointer).
  MutexLock() {
    XCHECK(!pthread_mutex_init(&mutex_, NULL));
  }
  // Destroys the mutex.
  ~MutexLock() {
    XCHECK(!pthread_mutex_destroy(&mutex_));
  }
  // Exposes the raw mutex for pthread APIs that need it.
  // presumably for pthread_cond_wait-style calls - verify against callers
  pthread_mutex_t* GetPthreadMutex() {
    return &mutex_;
  }
 private:
  void Lock() {
    XCHECK(!pthread_mutex_lock(&mutex_));
  }
  void Unlock() {
    XCHECK(!pthread_mutex_unlock(&mutex_));
  }
  friend class MutexLockGuard;
  pthread_mutex_t mutex_;
 }; // class MutexLock
 // Scoped lock: locks the given MutexLock on construction and unlocks it on
 // destruction. Only a reference is stored, so the MutexLock must outlive
 // the guard.
 class MutexLockGuard: NonCopyable {
 public:
  explicit MutexLockGuard(MutexLock & mutex)
    : mutex_(mutex) {
    mutex_.Lock();
  }
  ~MutexLockGuard() {
    mutex_.Unlock();
  }
 private:
  MutexLock & mutex_;
 }; // class MutexLockGuard
 // From here on, any `MutexLockGuard(expr)` written in function-call form
 // expands to XCHECK(false); a named declaration such as
 // `MutexLockGuard guard(m);` is not affected.
 // presumably meant to catch unnamed temporaries that would unlock at once - TODO confirm
 #define MutexLockGuard(x) XCHECK(false);
 } // namespace limonp
 #endif // LIMONP_MUTEX_LOCK_HPP
--- a/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
@ -0,0 +1,21 @@
 /************************************
 ************************************/
 #ifndef LIMONP_NONCOPYABLE_H
 #define LIMONP_NONCOPYABLE_H
 namespace limonp {
 // Base class that disables copying for everything derived from it: the copy
 // constructor and copy assignment operator are private and only declared
 // here. Construction and destruction are protected, so the class is usable
 // only as a base.
 class NonCopyable {
 private:
  NonCopyable(const NonCopyable& other);
  const NonCopyable& operator=(const NonCopyable& other);
 protected:
  NonCopyable() {}
  ~NonCopyable() {}
 }; // class NonCopyable
 } // namespace limonp
 #endif // LIMONP_NONCOPYABLE_H
--- a/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
@ -0,0 +1,157 @@
 #ifndef LIMONP_STD_EXTEMSION_HPP
 #define LIMONP_STD_EXTEMSION_HPP
 #include <map>
 #ifdef __APPLE__
 #include <unordered_map>
 #include <unordered_set>
 #elif(__cplusplus >= 201103L)
 #include <unordered_map>
 #include <unordered_set>
 #elif defined _MSC_VER
 #include <unordered_map>
 #include <unordered_set>
 #else
 #include <tr1/unordered_map>
 #include <tr1/unordered_set>
 namespace std {
 using std::tr1::unordered_map;
 using std::tr1::unordered_set;
 }
 #endif
 #include <set>
 #include <string>
 #include <vector>
 #include <deque>
 #include <fstream>
 #include <sstream>
 namespace std {
 // vector<T> is written as [a, b, c]; an empty vector as [].
 template<typename T>
 ostream& operator << (ostream& os, const vector<T>& v) {
  const size_t count = v.size();
  if(count == 0) {
    os << "[]";
    return os;
  }
  os << "[" << v[0];
  size_t idx = 1;
  while(idx < count) {
    os << ", " << v[idx];
    ++idx;
  }
  return os << "]";
 }
 // vector<string> additionally wraps every element in double quotes: ["a", "b"].
 template<>
 inline ostream& operator << (ostream& os, const vector<string>& v) {
  const size_t count = v.size();
  if(count == 0) {
    os << "[]";
    return os;
  }
  os << "[\"" << v[0];
  size_t idx = 1;
  while(idx < count) {
    os << "\", \"" << v[idx];
    ++idx;
  }
  return os << "\"]";
 }
 // deque<T> is written with every element in double quotes, whatever T is.
 template<typename T>
 ostream& operator << (ostream& os, const deque<T>& dq) {
  const size_t count = dq.size();
  if(count == 0) {
    os << "[]";
    return os;
  }
  os << "[\"" << dq[0];
  size_t idx = 1;
  while(idx < count) {
    os << "\", \"" << dq[idx];
    ++idx;
  }
  return os << "\"]";
 }
 // pair is written as first:second.
 template<class T1, class T2>
 ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
  return os << pr.first << ":" << pr.second;
 }
 // Replaces the content of str with the streamed text form of obj.
 template<class T>
 string& operator << (string& str, const T& obj) {
  stringstream formatter;
  formatter << obj; // resolves to the matching ostream << overload
  str = formatter.str();
  return str;
 }
 // map is written as {k1:v1, k2:v2}; an empty map as {}.
 template<class T1, class T2>
 ostream& operator << (ostream& os, const map<T1, T2>& mp) {
  if(mp.empty()) {
    return os << "{}";
  }
  os << '{';
  for(typename map<T1, T2>::const_iterator it = mp.begin(); it != mp.end(); ++it) {
    if(it != mp.begin()) {
      os << ", ";
    }
    os << *it;
  }
  return os << '}';
 }
 // unordered_map is written like map, in the container's iteration order.
 template<class T1, class T2>
 ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
  if(mp.empty()) {
    os << "{}";
    return os;
  }
  os << '{';
  for(typename std::unordered_map<T1, T2>::const_iterator it = mp.begin(); it != mp.end(); ++it) {
    if(it != mp.begin()) {
      os << ", ";
    }
    os << *it;
  }
  os << '}';
  return os;
 }
 // set is written as {a, b, c}; an empty set as {}.
 template<class T>
 ostream& operator << (ostream& os, const set<T>& st) {
  if(st.empty()) {
    return os << "{}";
  }
  os << '{';
  for(typename set<T>::const_iterator it = st.begin(); it != st.end(); ++it) {
    if(it != st.begin()) {
      os << ", ";
    }
    os << *it;
  }
  return os << '}';
 }
 // True when the container's find(key) locates an element.
 template<class KeyType, class ContainType>
 bool IsIn(const ContainType& contain, const KeyType& key) {
  return contain.find(key) != contain.end();
 }
 // Replaces s with everything that can still be read from ifs.
 template<class T>
 basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
  istreambuf_iterator<T> first(ifs);
  istreambuf_iterator<T> last;
  return s.assign(first, last);
 }
 // Writes the characters of s to ofs through its stream buffer.
 template<class T>
 ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
  copy(s.begin(), s.end(), ostreambuf_iterator<T>(ofs));
  return ofs;
 }
 } // namespace std
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
@ -0,0 +1,382 @@
 /************************************
 * file enc : ascii
 * author   : wuyanyi09@gmail.com
 ************************************/
 #ifndef LIMONP_STR_FUNCTS_H
 #define LIMONP_STR_FUNCTS_H
 #include <stdint.h>
 #include <stdio.h>
 #include <stdarg.h>
 #include <memory.h>
 #include <sys/types.h>
 #include <fstream>
 #include <iostream>
 #include <string>
 #include <vector>
 #include <algorithm>
 #include <cctype>
 #include <map>
 #include <functional>
 #include <locale>
 #include <sstream>
 #include <iterator>
 #include <algorithm>
 #include "StdExtension.hpp"
 namespace limonp {
 using namespace std;
 // printf-style formatting into a std::string.
 // Starts with a 256-byte buffer and retries with a larger one until the
 // formatted text fits. Returns an empty string if vsnprintf keeps reporting
 // an error and the buffer has already grown past 1 GiB, instead of doubling
 // the size forever.
 inline string StringFormat(const char* fmt, ...) {
  const int kMaxBufferSize = 1 << 30;
  int size = 256;
  std::string str;
  va_list ap;
  for (;;) {
    str.resize(size);
    va_start(ap, fmt);
    // Write through &str[0]: the pointer returned by c_str() is const and
    // must not be written to.
    int n = vsnprintf(&str[0], size, fmt, ap);
    va_end(ap);
    if (n > -1 && n < size) {
      str.resize(n);
      return str;
    }
    if (n > -1) {
      // vsnprintf reported the exact length needed (excluding the NUL).
      size = n + 1;
    } else {
      // Error or legacy "did not fit" result: grow and retry, within limits.
      if (size > kMaxBufferSize / 2) {
        return std::string();
      }
      size *= 2;
    }
  }
 }
 // Streams the elements of [begin, end) into res, separated by connector.
 // An empty range leaves res untouched.
 template<class T>
 void Join(T begin, T end, string& res, const string& connector) {
  if(begin == end) {
    return;
  }
  stringstream joined;
  joined << *begin;
  for(++begin; begin != end; ++begin) {
    joined << connector << *begin;
  }
  res = joined.str();
 }
 // Convenience overload returning the joined text ("" for an empty range).
 template<class T>
 string Join(T begin, T end, const string& connector) {
  string joined;
  Join(begin, end, joined, connector);
  return joined;
 }
 // Upper-cases str in place, byte by byte, and returns it.
 inline string& Upper(string& str) {
  for(string::size_type i = 0; i < str.size(); ++i) {
    // toupper() is only defined for values representable as unsigned char
    // (or EOF), so bytes >= 0x80 must not reach it as negative chars.
    str[i] = static_cast<char>(toupper(static_cast<unsigned char>(str[i])));
  }
  return str;
 }
 // Lower-cases str in place, byte by byte, and returns it.
 inline string& Lower(string& str) {
  for(string::size_type i = 0; i < str.size(); ++i) {
    // tolower() is only defined for values representable as unsigned char
    // (or EOF), so bytes >= 0x80 must not reach it as negative chars.
    str[i] = static_cast<char>(tolower(static_cast<unsigned char>(str[i])));
  }
  return str;
 }
 // True when c is a whitespace character in the range 0..0xff.
 inline bool IsSpace(unsigned c) {
  // when passing large int as the argument of isspace, it core dump, so here need a type cast.
  return c > 0xff ? false : std::isspace(c & 0xff);
 }
 // Removes leading whitespace (as judged by IsSpace) from s in place.
 // Written as a plain loop: std::ptr_fun / std::not1 were removed in C++17.
 inline std::string& LTrim(std::string &s) {
  std::string::size_type skip = 0;
  while(skip < s.size() && IsSpace(s[skip])) {
    ++skip;
  }
  s.erase(0, skip);
  return s;
 }
 // Removes trailing whitespace (as judged by IsSpace) from s in place.
 inline std::string& RTrim(std::string &s) {
  std::string::size_type keep = s.size();
  while(keep > 0 && IsSpace(s[keep - 1])) {
    --keep;
  }
  s.erase(keep);
  return s;
 }
 // Removes whitespace from both ends of s in place.
 inline std::string& Trim(std::string &s) {
  return LTrim(RTrim(s));
 }
 // Removes every leading occurrence of the character x from s in place.
 inline std::string& LTrim(std::string & s, char x) {
  // find_first_not_of returns npos when s consists only of x; erase(0, npos)
  // then clears the whole string.
  s.erase(0, s.find_first_not_of(x));
  return s;
 }
 // Removes every trailing occurrence of the character x from s in place.
 inline std::string& RTrim(std::string & s, char x) {
  const std::string::size_type last = s.find_last_not_of(x);
  s.erase(last == std::string::npos ? 0 : last + 1);
  return s;
 }
 // Removes the character x from both ends of s in place.
 inline std::string& Trim(std::string &s, char x) {
  return LTrim(RTrim(s, x), x);
 }
 // Splits src at any character contained in pattern and stores the pieces in
 // res (cleared first). Once res holds maxsplit pieces, the rest of src is
 // appended as one final piece. A delimiter at the very end of src does not
 // produce a trailing empty piece.
 inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
  res.clear();
  const size_t total = src.size();
  for(size_t from = 0; from < total; ) {
    const size_t hit = src.find_first_of(pattern, from);
    const bool lastPiece = (hit == string::npos) || (res.size() >= maxsplit);
    if(lastPiece) {
      res.push_back(src.substr(from));
      break;
    }
    res.push_back(src.substr(from, hit - from));
    from = hit + 1;
  }
 }
 // Convenience overload returning the pieces by value.
 inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
  vector<string> pieces;
  Split(src, pieces, pattern, maxsplit);
  return pieces;
 }
 // True when str begins with prefix (an empty prefix always matches).
 inline bool StartsWith(const string& str, const string& prefix) {
  const size_t need = prefix.length();
  return need <= str.length() && str.compare(0, need, prefix) == 0;
 }
 // True when str ends with suffix (an empty suffix always matches).
 inline bool EndsWith(const string& str, const string& suffix) {
  const size_t need = suffix.length();
  const size_t have = str.length();
  return need <= have && str.compare(have - need, need, suffix) == 0;
 }
 // True when the character ch occurs anywhere in str.
 inline bool IsInStr(const string& str, char ch) {
  const size_t where = str.find(ch);
  return string::npos != where;
 }
 // Packs two bytes into one 16-bit value: `high` becomes bits 8..15 and
 // `low` bits 0..7.
 inline uint16_t TwocharToUint16(char high, char low) {
  const uint16_t upper = uint16_t(high) & 0x00ff;
  const uint16_t lower = uint16_t(low) & 0x00ff;
  return uint16_t((upper << 8) | lower);
 }
 // Decodes the first len bytes of str as UTF-8 into 16-bit code units
 // appended to vec (cleared first). Only one-, two- and three-byte sequences
 // are handled. Returns false for a NULL str, for a lead byte above 0xef and
 // for a sequence cut off by the end of the input; vec then keeps whatever
 // was decoded up to that point. Continuation bytes are not validated.
 template <class Uint16Container>
 bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
  if(!str) {
    return false;
  }
  vec.clear();
  size_t pos = 0;
  while(pos < len) {
    const uint8_t lead = (uint8_t)str[pos];
    if((lead & 0x80) == 0) { // 0xxxxxxx
      vec.push_back(str[pos]);
      pos += 1;
    } else if(lead <= 0xdf && pos + 1 < len) { // 110xxxxx 10xxxxxx
      const uint8_t next = (uint8_t)str[pos + 1];
      const uint16_t unit = uint16_t(((lead & 0x1f) << 6) | (next & 0x3f));
      vec.push_back(unit);
      pos += 2;
    } else if(lead <= 0xef && pos + 2 < len) { // 1110xxxx 10xxxxxx 10xxxxxx
      const uint8_t mid = (uint8_t)str[pos + 1];
      const uint8_t tail = (uint8_t)str[pos + 2];
      const uint16_t unit = uint16_t(((lead & 0x0f) << 12) | ((mid & 0x3f) << 6) | (tail & 0x3f));
      vec.push_back(unit);
      pos += 3;
    } else {
      return false;
    }
  }
  return true;
 }
 // Same conversion for a whole std::string.
 template <class Uint16Container>
 bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
  return Utf8ToUnicode(str.c_str(), str.size(), vec);
 }
 // Decodes the first `size` bytes of str as UTF-8 into 32-bit code points
 // appended to vec (cleared first). Sequences of one to four bytes are
 // handled. Returns false for a lead byte above 0xf7 and for a sequence cut
 // off by the end of the input; vec then keeps whatever was decoded up to
 // that point. Continuation bytes are not validated.
 template <class Uint32Container>
 bool Utf8ToUnicode32(const char * str, size_t size, Uint32Container& vec) {
  vec.clear();
  size_t pos = 0;
  while(pos < size) {
    const uint8_t lead = (uint8_t)str[pos];
    size_t width;
    uint32_t point;
    // The lead byte decides the sequence length and contributes the top bits.
    if(!(lead & 0x80)) { // 0xxxxxxx
      width = 1;
      point = lead & 0x7f;
    } else if(lead <= 0xdf && pos + 1 < size) { // 110xxxxx
      width = 2;
      point = lead & 0x1f;
    } else if(lead <= 0xef && pos + 2 < size) { // 1110xxxx
      width = 3;
      point = lead & 0x0f;
    } else if(lead <= 0xf7 && pos + 3 < size) { // 11110xxx
      width = 4;
      point = lead & 0x07;
    } else {
      return false;
    }
    // Every following byte contributes its low six bits.
    for(size_t k = 1; k < width; ++k) {
      point <<= 6;
      point |= (uint8_t)(str[pos + k]) & 0x3f;
    }
    vec.push_back(point);
    pos += width;
  }
  return true;
 }
 // Same conversion for a whole std::string.
 template <class Uint32Container>
 bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
    return Utf8ToUnicode32(str.data(), str.size(), vec);
 }
 // Number of bytes Unicode32ToUtf8 emits for the code point ui: 1 up to 0x7f,
 // 2 up to 0x7ff, 3 up to 0xffff and 4 for everything larger.
 inline int UnicodeToUtf8Bytes(uint32_t ui){
    int bytes = 4;
    if(ui <= 0x7f) {
        bytes = 1;
    } else if(ui <= 0x7ff) {
        bytes = 2;
    } else if(ui <= 0xffff) {
        bytes = 3;
    }
    return bytes;
 }
 // Encodes the 32-bit code points in [begin, end) as UTF-8 into res (cleared
 // first). Code points above 0xffff are written as four-byte sequences.
 template <class Uint32ContainerConIter>
 void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
  res.clear();
  uint32_t ui;
  while(begin != end) {
    ui = *begin;
    if(ui <= 0x7f) {
      res += char(ui);
    } else if(ui <= 0x7ff) {
      res += char(((ui >> 6) & 0x1f) | 0xc0);
      res += char((ui & 0x3f) | 0x80);
    } else if(ui <= 0xffff) {
      res += char(((ui >> 12) & 0x0f) | 0xe0);
      res += char(((ui >> 6) & 0x3f) | 0x80);
      res += char((ui & 0x3f) | 0x80);
    } else {
      // The four-byte lead byte 11110xxx carries three payload bits (18..20);
      // masking with 0x03 would drop bit 20 and corrupt U+100000..U+10FFFF.
      res += char(((ui >> 18) & 0x07) | 0xf0);
      res += char(((ui >> 12) & 0x3f) | 0x80);
      res += char(((ui >> 6) & 0x3f) | 0x80);
      res += char((ui & 0x3f) | 0x80);
    }
    begin ++;
  }
 }
 // Encodes the 16-bit code units in [begin, end) as UTF-8 into res (cleared
 // first), using one, two or three bytes per unit.
 template <class Uint16ContainerConIter>
 void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
  res.clear();
  for(; begin != end; begin++) {
    const uint16_t unit = *begin;
    if(unit <= 0x7f) {
      res += char(unit);
      continue;
    }
    if(unit <= 0x7ff) {
      res += char(((unit >> 6) & 0x1f) | 0xc0);
    } else {
      res += char(((unit >> 12) & 0x0f) | 0xe0);
      res += char(((unit >> 6) & 0x3f) | 0x80);
    }
    // Both multi-byte forms end with the low six bits.
    res += char((unit & 0x3f) | 0x80);
  }
 }
 // Splits the first len bytes of str into 16-bit units appended to vec
 // (cleared first): a byte with the high bit clear becomes one unit, any
 // other byte is combined with the byte after it (first byte in the high
 // half). A NULL str yields an empty vec and true. Returns false when a
 // high-bit byte is the last byte of the input.
 template <class Uint16Container>
 bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
  vec.clear();
  if(!str) {
    return true;
  }
  for(size_t pos = 0; pos < len; ) {
    const bool singleByte = (0 == (str[pos] & 0x80));
    if(singleByte) {
      vec.push_back(uint16_t(str[pos]));
      ++pos;
      continue;
    }
    if(pos + 1 >= len) {
      return false;
    }
    const uint16_t upper = uint16_t(str[pos]) & 0x00ff;
    const uint16_t lower = uint16_t(str[pos + 1]) & 0x00ff;
    vec.push_back(uint16_t((upper << 8) | lower));
    pos += 2;
  }
  return true;
 }
 // Same conversion for a whole std::string.
 template <class Uint16Container>
 bool GBKTrans(const string& str, Uint16Container& vec) {
  return GBKTrans(str.c_str(), str.size(), vec);
 }
 // Inverse of the decoding overloads: writes each 16-bit unit in [begin, end)
 // to res (cleared first) as two bytes when the high byte has its top bit
 // set, otherwise as the low byte alone.
 template <class Uint16ContainerConIter>
 void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
  res.clear();
  for(; begin != end; begin++) {
    const char upper = ((*begin) >> 8) & 0x00ff;
    const char lower = (*begin) & 0x00ff;
    const bool doubleByte = (upper & 0x80) != 0;
    if(doubleByte) {
      res += upper;
    }
    res += lower;
  }
 }
 /*
 * format example: "%Y-%m-%d %H:%M:%S"
 */
 // inline void GetTime(const string& format, string&  timeStr) {
 //   time_t timeNow;
 //   time(&timeNow);
 //   timeStr.resize(64);
 //   size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
 //   timeStr.resize(len);
 // }
 // Concatenates two path fragments, inserting a "/" between them unless
 // path1 already ends with one.
 inline string PathJoin(const string& path1, const string& path2) {
  const string separator = EndsWith(path1, "/") ? "" : "/";
  return path1 + separator + path2;
 }
 }
 #endif
--- a/libchinese-segmentation/cppjieba/limonp/Thread.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Thread.hpp
@ -0,0 +1,44 @@
 #ifndef LIMONP_THREAD_HPP
 #define LIMONP_THREAD_HPP
 #include "Logging.hpp"
 #include "NonCopyable.hpp"
 namespace limonp {
 // Base class for a pthread-backed thread. Subclasses implement Run();
 // Start() launches it on a new thread and Join() waits for it to finish.
 // A thread that was started but never joined is detached on destruction.
 class IThread: NonCopyable {
 public:
  IThread(): isStarted(false), isJoined(false) {
  }
  virtual ~IThread() {
    if(isStarted && !isJoined) {
      XCHECK(!pthread_detach(thread_));
    }
  };
  // Body of the thread; executed once by Worker() on the new thread.
  virtual void Run() = 0;
  // Creates the thread. Must be called at most once per object.
  void Start() {
    XCHECK(!isStarted);
    XCHECK(!pthread_create(&thread_, NULL, Worker, this));
    isStarted = true;
  }
  // Waits for the thread to finish. Must be called at most once after
  // Start(). Calling it on a thread that was never started is a no-op:
  // thread_ is uninitialized in that case and must not reach pthread_join.
  void Join() {
    if(!isStarted) {
      return;
    }
    XCHECK(!isJoined);
    XCHECK(!pthread_join(thread_, NULL));
    isJoined = true;
  }
 private:
  // pthread entry point: forwards to Run() of the IThread passed as data.
  static void * Worker(void * data) {
    IThread * ptr = (IThread* ) data;
    ptr->Run();
    return NULL;
  }
  pthread_t thread_;
  bool isStarted;
  bool isJoined;
 }; // class IThread
 } // namespace limonp
 #endif // LIMONP_THREAD_HPP
--- a/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
@ -0,0 +1,86 @@
 #ifndef LIMONP_THREAD_POOL_HPP
 #define LIMONP_THREAD_POOL_HPP
 #include "Thread.hpp"
 #include "BlockingQueue.hpp"
 #include "BoundedBlockingQueue.hpp"
 #include "Closure.hpp"
 namespace limonp {
 using namespace std;
 //class ThreadPool;
 class ThreadPool: NonCopyable {
 public:
  class Worker: public IThread {
   public:
    Worker(ThreadPool* pool): ptThreadPool_(pool) {
      assert(ptThreadPool_);
    }
    virtual ~Worker() {
    }
    virtual void Run() {
      while (true) {
        ClosureInterface* closure = ptThreadPool_->queue_.Pop();
        if (closure == NULL) {
          break;
        }
        try {
          closure->Run();
        } catch(std::exception& e) {
          XLOG(ERROR) << e.what();
        } catch(...) {
          XLOG(ERROR) << " unknown exception.";
        }
        delete closure;
      }
    }
   private:
    ThreadPool * ptThreadPool_;
  }; // class Worker
  ThreadPool(size_t thread_num)
    : threads_(thread_num), 
      queue_(thread_num) {
    assert(thread_num);
    for(size_t i = 0; i < threads_.size(); i ++) {
      threads_[i] = new Worker(this);
    }
  }
  ~ThreadPool() {
    Stop();
  }
  void Start() {
    for(size_t i = 0; i < threads_.size(); i++) {
      threads_[i]->Start();
    }
  }
  void Stop() {
    for(size_t i = 0; i < threads_.size(); i ++) {
      queue_.Push(NULL);
    }
    for(size_t i = 0; i < threads_.size(); i ++) {
      threads_[i]->Join();
      delete threads_[i];
    }
    threads_.clear();
  }
  void Add(ClosureInterface* task) {
    assert(task);
    queue_.Push(task);
  }
 private:
  friend class Worker;
  vector<IThread*> threads_;
  BoundedBlockingQueue<ClosureInterface*> queue_;
 }; // class ThreadPool
 } // namespace limonp
 #endif // LIMONP_THREAD_POOL_HPP
--- a/libchinese-segmentation/cppjieba/limonp/limonp.pri
+++ b/libchinese-segmentation/cppjieba/limonp/limonp.pri
@ -0,0 +1,22 @@
 # qmake include file for the bundled limonp headers.
 # Adds this directory to the include path and registers every limonp header
 # with the including project.
 INCLUDEPATH += $$PWD
 # Header-only utility sources, listed explicitly.
 HEADERS += \
    $$PWD/ArgvContext.hpp \
    $$PWD/BlockingQueue.hpp \
    $$PWD/BoundedBlockingQueue.hpp \
    $$PWD/BoundedQueue.hpp \
    $$PWD/Closure.hpp \
    $$PWD/Colors.hpp \
    $$PWD/Condition.hpp \
    $$PWD/Config.hpp \
    $$PWD/FileLock.hpp \
    $$PWD/ForcePublic.hpp \
    $$PWD/LocalVector.hpp \
    $$PWD/Logging.hpp \
    $$PWD/Md5.hpp \
    $$PWD/MutexLock.hpp \
    $$PWD/NonCopyable.hpp \
    $$PWD/StdExtension.hpp \
    $$PWD/StringUtil.hpp \
    $$PWD/Thread.hpp \
    $$PWD/ThreadPool.hpp
--- a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp
+++ b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.cpp
@ -0,0 +1,275 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include <cmath>
 #include "segment-trie.h"
 // Builds the trie from a list of source files; both arguments are forwarded
 // to the storage base, after which Init() is run.
 DictTrie::DictTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(
          file_paths,
          dat_cache_path)
 {
    Init();
 }
 // Convenience overload: packs the main and user dictionary paths into a
 // two-element list for the storage base, then runs Init().
 DictTrie::DictTrie(const string &dict_path, const string &user_dict_paths, const string &dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(
          vector<string>{dict_path, user_dict_paths},
          dat_cache_path)
 {
    Init();
 }
 // Builds the dictionary cache file for the given md5.
 //
 // The data is written to a temporary file created next to dat_cache_file and
 // renamed into place at the end. Layout written here: the header, one
 // DatMemElem per dictionary word, then the raw trie array. The header fields
 // after md5_hex are patched once their final values are known.
 //
 // dat_cache_file: destination path of the cache file.
 // md5:            hex digest stored in the header; must be exactly
 //                 sizeof(header.md5_hex) bytes long.
 void DictTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
 {
    DictCacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // umask() is process-wide: remember the caller's mask and put it back right
    // after the file is created so the host application is not affected.
    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
    // mkstemp rewrites the XXXXXX suffix in place, so hand it the writable buffer.
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_umask);
    assert(fd >= 0);
    if (fd < 0) {
        // Without a temporary file nothing can be written or renamed.
        return;
    }
    fchmod(fd, 0644);
    write_bytes = write(fd, (const char *)&header, sizeof(DictCacheFileHeader));
    this->PreLoad();
    this->LoadDefaultDict(fd, write_bytes, offset, elements_num);
    this->LoadUserDict(fd, write_bytes, offset, elements_num);
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
    // Go back and fill in the header fields that follow md5_hex.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    write(fd, &m_min_weight, sizeof(double));
    close(fd);
    assert((size_t)write_bytes == sizeof(DictCacheFileHeader) + offset + this->GetDataTrieTotalSize());
    tryRename(tmp_filepath, dat_cache_file);
 }
 // Exact lookup of key; returns the stored element, or nullptr when the trie
 // search reports a negative index.
 const DatMemElem * DictTrie::Find(const string &key) const
 {
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    return (index < 0) ? nullptr : &this->GetElementPtr()[index];
 }
 // Fills res with one DatDag per rune in [begin, end). Entry 0 of each nexts
 // list is the single-rune step (i + 1); its element pointer stays nullptr
 // unless the dictionary has that rune. Longer dictionary words starting at
 // the same rune are appended, up to max_word_len characters.
 void DictTrie::FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<DatDag> &res, size_t max_word_len) const {
    const size_t rune_count = size_t(end - begin);
    res.clear();
    res.resize(rune_count);
    string utf8_text;
    EncodeRunesToString(begin, end, utf8_text);
    static const size_t max_num = 128;
    result_pair_type hits[max_num] = {};
    size_t byte_pos = 0;
    size_t rune_idx = 0;
    while (rune_idx < rune_count) {
        const std::size_t hit_count = this->CommonPrefixSearch(&utf8_text[byte_pos], &hits[0], max_num);
        res[rune_idx].nexts.push_back(pair<size_t, const DatMemElem *>(rune_idx + 1, nullptr));
        for (std::size_t h = 0; h != hit_count; ++h) {
            auto & hit = hits[h];
            const bool in_range = (hit.value >= 0) && ((size_t)hit.value < this->GetCacheFileHeaderPtr()->elements_size);
            if (!in_range) {
                continue;
            }
            auto const word_chars = Utf8CharNum(&utf8_text[byte_pos], hit.length);
            if (word_chars > max_word_len) {
                continue;
            }
            const DatMemElem * elem = &this->GetElementPtr()[hit.value];
            if (1 == word_chars) {
                // The single-rune slot already exists; just attach the element.
                res[rune_idx].nexts[0].second = elem;
            } else {
                res[rune_idx].nexts.push_back(pair<size_t, const DatMemElem *>(rune_idx + word_chars, elem));
            }
        }
        byte_pos += limonp::UnicodeToUtf8Bytes((begin + rune_idx)->rune);
        ++rune_idx;
    }
 }
 // Segments [begin, end) into the most probable word sequence and appends the
 // resulting ranges to words.
 //
 // Works backwards over the runes: for each start position it keeps the best
 // accumulated weight (max_weight) and where the chosen word ends (max_next).
 // Dictionary words longer than max_word_len characters are ignored. A position
 // with no usable dictionary word becomes a single-rune word weighted with
 // GetMinWeight(). An empty range appends nothing.
 void DictTrie::FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange> &words, size_t max_word_len) const {
    const size_t str_size = end - begin;
    if (0 == str_size) {
        return;
    }
    string text_str;
    EncodeRunesToString(begin, end, text_str);
    static const size_t max_num = 128;
    result_pair_type result_pairs[max_num] = {}; // dictionary lookup results
    // Heap-backed tables instead of variable-length arrays: standard C++ and
    // safe for long inputs.
    vector<double> max_weight(str_size, -3.14e+100); // best weight from each position to the end
    vector<size_t> max_next(str_size, 0);            // end of the chosen word; 0 means "not decided yet"
    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
        const size_t cur = str_size - i - 1; // walk from the last rune to the first
        begin_pos -= (end - i - 1)->len;
        const std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
        for (std::size_t idx = 0; idx < num_results; ++idx) {
            auto & match = result_pairs[idx];
            if ((match.value < 0) || ((uint32_t)match.value >= this->GetCacheFileHeaderPtr()->elements_size)) {
                continue;
            }
            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
            if ((0 == char_num) || (char_num > max_word_len)) {
                continue;
            }
            const size_t next = cur + char_num;
            if (next > str_size) {
                continue;
            }
            double val = this->GetElementPtr()[match.value].weight;
            if (next < str_size) {
                val += max_weight[next];
            }
            if (val > max_weight[cur]) {
                max_weight[cur] = val;
                max_next[cur] = next;
            }
        }
        if (0 == max_next[cur]) {
            // No dictionary word was usable here (none found, or all filtered
            // out): emit the rune on its own so every position has a successor.
            double val = GetMinWeight();
            if (cur + 1 < str_size) {
                val += max_weight[cur + 1];
            }
            max_weight[cur] = val;
            max_next[cur] = cur + 1;
        }
    }
    for (size_t i = 0; i < str_size;) { // follow the chosen path front to back
        assert(max_next[i] > i);
        assert(max_next[i] <= str_size);
        WordRange wr(begin + i, begin + max_next[i] - 1);
        words.push_back(wr);
        i = max_next[i];
    }
 }
 // True when word was recorded as a one-character entry of the user dictionary.
 bool DictTrie::IsUserDictSingleChineseWord(const Rune &word) const {
    const bool is_user_single = IsIn(m_user_dict_single_chinese_word, word);
    return is_user_single;
 }
 void DictTrie::PreLoad()
 {
    ifstream ifs(DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, " ");
        if (buf.size() != 3)
            continue;
        m_freq_sum += atof(buf[1].c_str());
    }
 }
 void DictTrie::LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
 {
    ifstream ifs(DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, " ");
        if (buf.size() != 3)
            continue;
        DatMemElem node_info;
        node_info.weight = log(atof(buf[1].c_str()) / m_freq_sum);
        node_info.SetTag(buf[2]);
        this->Update(buf[0].c_str(), buf[0].size(), elements_num);
        offset += (sizeof(DatMemElem));
        elements_num++;
        if (m_min_weight > node_info.weight) {
            m_min_weight = node_info.weight;
        }
        write_bytes += write(fd, &node_info, sizeof(DatMemElem));
    }
 }
 void DictTrie::LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
 {
    ifstream ifs(USER_DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, " ");
        if (buf.size() != 3)
            continue;
        DatMemElem node_info;
        assert(m_freq_sum > 0.0);
        const int freq = atoi(buf[1].c_str());
        node_info.weight = log(1.0 * freq / m_freq_sum);
        node_info.SetTag(buf[2]);
        this->Update(buf[0].c_str(), buf[0].size(), elements_num);
        offset += (sizeof(DatMemElem));
        elements_num++;
        write_bytes += write(fd, &node_info, sizeof(DatMemElem));
        if (Utf8CharNum(buf[0]) == 1) {
            RuneArray word;
            if (DecodeRunesInString(buf[0], word)) {
                m_user_dict_single_chinese_word.insert(word[0]);
            }
        }
    }
 }
 // Returns the min_weight field of the cache file header.
 inline double DictTrie::GetMinWeight() const
 {
    const auto *cache_header = this->GetCacheFileHeaderPtr();
    return cache_header->min_weight;
 }
--- a/libchinese-segmentation/cppjieba/segment-trie/segment-trie.h
+++ b/libchinese-segmentation/cppjieba/segment-trie/segment-trie.h
@ -0,0 +1,62 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef SegmentTrie_H
 #define SegmentTrie_H
 #include "storage-base.hpp"
 #include "cppjieba/Unicode.hpp"
 using namespace cppjieba;
 // Locations of the default and user dictionaries read when the cache is built.
 // DICT_INSTALL_PATH is not defined in this header; presumably it is supplied
 // by the build system - confirm.
 const char * const DICT_PATH = DICT_INSTALL_PATH"/jieba.dict.utf8";
 const char * const USER_DICT_PATH = DICT_INSTALL_PATH"/user.dict.utf8";
 // Cache file header for the segmentation dictionary: the common header plus
 // the smallest word weight.
 struct DictCacheFileHeader : CacheFileHeaderBase
 {
    // Written by DictTrie::LoadSourceFile from the smallest weight seen in the
    // default dictionary; read back through DictTrie::GetMinWeight().
    double min_weight = 0;
 };
 // Segmentation dictionary backed by a cached trie file.
 // Each word maps to a DatMemElem carrying its weight (log of relative
 // frequency) and tag.
 class DictTrie : public StorageBase<DatMemElem, false, DictCacheFileHeader>
 {
 public:
    // Both constructors forward their paths to StorageBase and then call Init().
    // NOTE(review): the loaders in segment-trie.cpp read DICT_PATH and
    // USER_DICT_PATH directly rather than these arguments - confirm how the
    // paths are used by StorageBase.
    DictTrie(const vector<string> file_paths, string dat_cache_path = "");
    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "");
    // Writes a fresh cache file (header, elements, trie array) to a temporary
    // file and renames it to dat_cache_file.
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
    // Exact lookup; returns nullptr when key is not in the trie.
    const DatMemElem *Find(const string &key) const;
    // Fills res with one DatDag per rune: the single-rune step plus every
    // dictionary word of at most max_word_len characters starting there.
    void FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
              vector<struct DatDag>&res, size_t max_word_len = MAX_WORD_LENGTH) const;
    // Appends to words the highest-weight segmentation of [begin, end).
    void FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
              vector<WordRange>& words, size_t max_word_len = MAX_WORD_LENGTH) const;
    // True when word is a one-character entry loaded from the user dictionary.
    bool IsUserDictSingleChineseWord(const Rune& word) const;
 private:
    // Declared private; no definition is visible in segment-trie.cpp.
    DictTrie();
    // Sums the frequency column of DICT_PATH into m_freq_sum.
    void PreLoad();
    // Insert dictionary entries into the trie and append their elements to fd,
    // updating the running byte, offset and element counters.
    void LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
    void LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
    // Reads min_weight from the cache file header.
    double GetMinWeight() const;
    // Total of all frequencies in the default dictionary.
    double m_freq_sum = 0.0;
    // Smallest weight seen while loading the default dictionary; starts at a
    // large positive sentinel.
    double m_min_weight = 3.14e+100;
    // One-character words that came from the user dictionary.
    unordered_set<Rune> m_user_dict_single_chinese_word;
 };
 #endif // SegmentTrie_H
--- a/libchinese-segmentation/development-files/header-files/ChineseSegmentation
+++ b/libchinese-segmentation/development-files/header-files/ChineseSegmentation
@ -0,0 +1 @@
 #include "chinese-segmentation.h"
--- a/libchinese-segmentation/development-files/header-files/HanZiToPinYin
+++ b/libchinese-segmentation/development-files/header-files/HanZiToPinYin
@ -0,0 +1 @@
 #include "hanzi-to-pinyin.h"
--- a/libchinese-segmentation/dict/README.md
+++ b/libchinese-segmentation/dict/README.md
@ -0,0 +1,31 @@
 # CppJieba字典
 文件后缀名代表的是词典的编码方式。
 比如filename.utf8 是 utf8编码，filename.gbk 是 gbk编码方式。
 ## 分词
 ### jieba.dict.utf8/gbk
 作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
 ### hmm_model.utf8/gbk
 作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
 __对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
 ## 关键词抽取
 ### idf.utf8
 IDF(Inverse Document Frequency)
 在KeywordExtractor中，使用的是经典的TF-IDF算法，所以需要这么一个词典提供IDF信息。
 ### stop_words.utf8
 停用词词典
--- a/libchinese-segmentation/dict/hmm_model.utf8
+++ b/libchinese-segmentation/dict/hmm_model.utf8
--- a/libchinese-segmentation/dict/idf.utf8
+++ b/libchinese-segmentation/dict/idf.utf8
--- a/libchinese-segmentation/dict/jieba.dict.utf8
+++ b/libchinese-segmentation/dict/jieba.dict.utf8
--- a/libchinese-segmentation/dict/pinyinWithoutTone.txt
+++ b/libchinese-segmentation/dict/pinyinWithoutTone.txt
--- a/libchinese-segmentation/dict/pos_dict/char_state_tab.utf8
+++ b/libchinese-segmentation/dict/pos_dict/char_state_tab.utf8
--- a/libchinese-segmentation/dict/pos_dict/prob_emit.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_emit.utf8
--- a/libchinese-segmentation/dict/pos_dict/prob_start.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_start.utf8
@ -0,0 +1,259 @@
 #初始状态的概率
 #格式
 #状态:概率
 B,a:-4.7623052146
 B,ad:-6.68006603678
 B,ag:-3.14e+100
 B,an:-8.69708322302
 B,b:-5.01837436211
 B,bg:-3.14e+100
 B,c:-3.42388018495
 B,d:-3.97504752976
 B,df:-8.88897423083
 B,dg:-3.14e+100
 B,e:-8.56355183039
 B,en:-3.14e+100
 B,f:-5.49163041848
 B,g:-3.14e+100
 B,h:-13.53336513
 B,i:-6.11578472756
 B,in:-3.14e+100
 B,j:-5.05761912847
 B,jn:-3.14e+100
 B,k:-3.14e+100
 B,l:-4.90588358466
 B,ln:-3.14e+100
 B,m:-3.6524299819
 B,mg:-3.14e+100
 B,mq:-6.7869530014
 B,n:-1.69662577975
 B,ng:-3.14e+100
 B,nr:-2.23104959138
 B,nrfg:-5.87372217541
 B,nrt:-4.98564273352
 B,ns:-2.8228438315
 B,nt:-4.84609166818
 B,nz:-3.94698846058
 B,o:-8.43349870215
 B,p:-4.20098413209
 B,q:-6.99812385896
 B,qe:-3.14e+100
 B,qg:-3.14e+100
 B,r:-3.40981877908
 B,rg:-3.14e+100
 B,rr:-12.4347528413
 B,rz:-7.94611647157
 B,s:-5.52267359084
 B,t:-3.36474790945
 B,tg:-3.14e+100
 B,u:-9.1639172775
 B,ud:-3.14e+100
 B,ug:-3.14e+100
 B,uj:-3.14e+100
 B,ul:-3.14e+100
 B,uv:-3.14e+100
 B,uz:-3.14e+100
 B,v:-2.67405848743
 B,vd:-9.04472876024
 B,vg:-3.14e+100
 B,vi:-12.4347528413
 B,vn:-4.33156108902
 B,vq:-12.1470707689
 B,w:-3.14e+100
 B,x:-3.14e+100
 B,y:-9.84448567586
 B,yg:-3.14e+100
 B,z:-7.04568111149
 B,zg:-3.14e+100
 E,a:-3.14e+100
 E,ad:-3.14e+100
 E,ag:-3.14e+100
 E,an:-3.14e+100
 E,b:-3.14e+100
 E,bg:-3.14e+100
 E,c:-3.14e+100
 E,d:-3.14e+100
 E,df:-3.14e+100
 E,dg:-3.14e+100
 E,e:-3.14e+100
 E,en:-3.14e+100
 E,f:-3.14e+100
 E,g:-3.14e+100
 E,h:-3.14e+100
 E,i:-3.14e+100
 E,in:-3.14e+100
 E,j:-3.14e+100
 E,jn:-3.14e+100
 E,k:-3.14e+100
 E,l:-3.14e+100
 E,ln:-3.14e+100
 E,m:-3.14e+100
 E,mg:-3.14e+100
 E,mq:-3.14e+100
 E,n:-3.14e+100
 E,ng:-3.14e+100
 E,nr:-3.14e+100
 E,nrfg:-3.14e+100
 E,nrt:-3.14e+100
 E,ns:-3.14e+100
 E,nt:-3.14e+100
 E,nz:-3.14e+100
 E,o:-3.14e+100
 E,p:-3.14e+100
 E,q:-3.14e+100
 E,qe:-3.14e+100
 E,qg:-3.14e+100
 E,r:-3.14e+100
 E,rg:-3.14e+100
 E,rr:-3.14e+100
 E,rz:-3.14e+100
 E,s:-3.14e+100
 E,t:-3.14e+100
 E,tg:-3.14e+100
 E,u:-3.14e+100
 E,ud:-3.14e+100
 E,ug:-3.14e+100
 E,uj:-3.14e+100
 E,ul:-3.14e+100
 E,uv:-3.14e+100
 E,uz:-3.14e+100
 E,v:-3.14e+100
 E,vd:-3.14e+100
 E,vg:-3.14e+100
 E,vi:-3.14e+100
 E,vn:-3.14e+100
 E,vq:-3.14e+100
 E,w:-3.14e+100
 E,x:-3.14e+100
 E,y:-3.14e+100
 E,yg:-3.14e+100
 E,z:-3.14e+100
 E,zg:-3.14e+100
 M,a:-3.14e+100
 M,ad:-3.14e+100
 M,ag:-3.14e+100
 M,an:-3.14e+100
 M,b:-3.14e+100
 M,bg:-3.14e+100
 M,c:-3.14e+100
 M,d:-3.14e+100
 M,df:-3.14e+100
 M,dg:-3.14e+100
 M,e:-3.14e+100
 M,en:-3.14e+100
 M,f:-3.14e+100
 M,g:-3.14e+100
 M,h:-3.14e+100
 M,i:-3.14e+100
 M,in:-3.14e+100
 M,j:-3.14e+100
 M,jn:-3.14e+100
 M,k:-3.14e+100
 M,l:-3.14e+100
 M,ln:-3.14e+100
 M,m:-3.14e+100
 M,mg:-3.14e+100
 M,mq:-3.14e+100
 M,n:-3.14e+100
 M,ng:-3.14e+100
 M,nr:-3.14e+100
 M,nrfg:-3.14e+100
 M,nrt:-3.14e+100
 M,ns:-3.14e+100
 M,nt:-3.14e+100
 M,nz:-3.14e+100
 M,o:-3.14e+100
 M,p:-3.14e+100
 M,q:-3.14e+100
 M,qe:-3.14e+100
 M,qg:-3.14e+100
 M,r:-3.14e+100
 M,rg:-3.14e+100
 M,rr:-3.14e+100
 M,rz:-3.14e+100
 M,s:-3.14e+100
 M,t:-3.14e+100
 M,tg:-3.14e+100
 M,u:-3.14e+100
 M,ud:-3.14e+100
 M,ug:-3.14e+100
 M,uj:-3.14e+100
 M,ul:-3.14e+100
 M,uv:-3.14e+100
 M,uz:-3.14e+100
 M,v:-3.14e+100
 M,vd:-3.14e+100
 M,vg:-3.14e+100
 M,vi:-3.14e+100
 M,vn:-3.14e+100
 M,vq:-3.14e+100
 M,w:-3.14e+100
 M,x:-3.14e+100
 M,y:-3.14e+100
 M,yg:-3.14e+100
 M,z:-3.14e+100
 M,zg:-3.14e+100
 S,a:-3.90253968313
 S,ad:-11.0484584802
 S,ag:-6.95411391796
 S,an:-12.8402179494
 S,b:-6.47288876397
 S,bg:-3.14e+100
 S,c:-4.78696679586
 S,d:-3.90391976418
 S,df:-3.14e+100
 S,dg:-8.9483976513
 S,e:-5.94251300628
 S,en:-3.14e+100
 S,f:-5.19482024998
 S,g:-6.50782681533
 S,h:-8.65056320738
 S,i:-3.14e+100
 S,in:-3.14e+100
 S,j:-4.91199211964
 S,jn:-3.14e+100
 S,k:-6.94032059583
 S,l:-3.14e+100
 S,ln:-3.14e+100
 S,m:-3.26920065212
 S,mg:-10.8253149289
 S,mq:-3.14e+100
 S,n:-3.85514838976
 S,ng:-4.9134348611
 S,nr:-4.48366310396
 S,nrfg:-3.14e+100
 S,nrt:-3.14e+100
 S,ns:-3.14e+100
 S,nt:-12.1470707689
 S,nz:-3.14e+100
 S,o:-8.46446092775
 S,p:-2.98684018136
 S,q:-4.88865861826
 S,qe:-3.14e+100
 S,qg:-3.14e+100
 S,r:-2.76353367841
 S,rg:-10.2752685919
 S,rr:-3.14e+100
 S,rz:-3.14e+100
 S,s:-3.14e+100
 S,t:-3.14e+100
 S,tg:-6.27284253188
 S,u:-6.94032059583
 S,ud:-7.72823016105
 S,ug:-7.53940370266
 S,uj:-6.85251045118
 S,ul:-8.41537131755
 S,uv:-8.15808672229
 S,uz:-9.29925862537
 S,v:-3.05329230341
 S,vd:-3.14e+100
 S,vg:-5.94301818437
 S,vi:-3.14e+100
 S,vn:-11.4539235883
 S,vq:-3.14e+100
 S,w:-3.14e+100
 S,x:-8.42741965607
 S,y:-6.19707946995
 S,yg:-13.53336513
 S,z:-3.14e+100
 S,zg:-3.14e+100
--- a/libchinese-segmentation/dict/pos_dict/prob_trans.utf8
+++ b/libchinese-segmentation/dict/pos_dict/prob_trans.utf8
--- a/libchinese-segmentation/dict/stop_words.utf8
+++ b/libchinese-segmentation/dict/stop_words.utf8
--- a/libchinese-segmentation/dict/user.dict.utf8
+++ b/libchinese-segmentation/dict/user.dict.utf8
@ -0,0 +1,4 @@
 云计算
 韩玉鉴赏
 蓝翔 nz
 区块链 10 nz
--- a/libchinese-segmentation/hanzi-to-pinyin-private.h
+++ b/libchinese-segmentation/hanzi-to-pinyin-private.h
@ -0,0 +1,74 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef HANZITOPINYINPRIVATE_H
 #define HANZITOPINYINPRIVATE_H
 #include <QtCore/qglobal.h>
 #include <QHash>
 #include "pinyin4cpp_dictTrie.h"
 #include "hanzi-to-pinyin.h"
 #include "pinyin4cpp-trie.h"
 using namespace std;
 // Maps a tone-marked pinyin letter to its plain form: the base letter followed
 // by the tone digit (for example "a1"); the unmarked u-umlaut maps to "v".
 // Being static in a header, each including translation unit gets its own copy.
 // NOTE(review): some keys are written as a base letter plus a combining mark
 // and therefore span more than one QChar; a lookup made with a single QChar
 // would presumably never match them - confirm against the callers.
 static const QHash<QString, QString> PhoneticSymbol = {
    {"ā", "a1"}, {"á", "a2"}, {"ǎ", "a3"}, {"à", "a4"},
    {"ē", "e1"}, {"é", "e2"}, {"ě", "e3"}, {"è", "e4"},
    {"ō", "o1"}, {"ó", "o2"}, {"ǒ", "o3"}, {"ò", "o4"},
    {"ī", "i1"}, {"í", "i2"}, {"ǐ", "i3"}, {"ì", "i4"},
    {"ū", "u1"}, {"ú", "u2"}, {"ǔ", "u3"}, {"ù", "u4"},
    // üe
    {"ü", "v"},
    {"ǖ", "v1"}, {"ǘ", "v2"}, {"ǚ", "v3"}, {"ǜ", "v4"},
    {"ń", "n2"}, {"ň", "n3"}, {"ǹ", "n4"},
    {"m̄", "m1"}, {"ḿ", "m2"}, {"m̀", "m4"},
    {"ê̄", "ê1"}, {"ế", "ê2"}, {"ê̌", "ê3"}, {"ề", "ê4"}
 };
 // NOTE(review): this private class is marked with the import macro although it
 // is implemented in this library - confirm that this is intended.
 #define PINYINMANAGER_EXPORT Q_DECL_IMPORT
 // Implementation object behind HanZiToPinYin: owns the pinyin trie and the
 // conversion settings.
 class PINYINMANAGER_EXPORT HanZiToPinYinPrivate
 {
 public:
    HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr);
    ~HanZiToPinYinPrivate();
 public:
    // Forwards t to the trie's IsMultiTone().
    template <typename T>
    bool isMultiTone(T &&t) {return m_pinYinTrie.IsMultiTone(std::forward<T>(t));}
    // True when the trie contains word.
    bool contains(string &word);
    // Converts word to pinyin according to the current settings. results is
    // cleared first. Returns -1 when word has no direct entry and segmentation
    // is disabled, otherwise 0.
    int getResults(string &word, QStringList &results);
    // Replaces all four conversion settings.
    void setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType);
 private:
    // Rewrites results in place to the configured PinyinDataStyle.
    void convertDataStyle(QStringList &results);
    HanZiToPinYin *q = nullptr;
    //Pinyin4cppDictTrie *m_pinYinTrie = nullptr;
    Pinyin4cppTrie m_pinYinTrie;
    // Whether input without a direct entry is segmented before lookup.
    SegType m_segType = SegType::Segmentation;
    // Whether all readings of a polyphone are returned or only the first.
    PolyphoneType m_polyphoneType = PolyphoneType::Disable;
    // Output notation applied by convertDataStyle().
    PinyinDataStyle m_pinyinDataStyle = PinyinDataStyle::Default;
    // What to do with text that has no pinyin: keep it (Default) or drop it (Delete).
    ExDataProcessType m_exDataProcessType = ExDataProcessType::Default;
 };
 #endif // HANZITOPINYINPRIVATE_H
--- a/libchinese-segmentation/hanzi-to-pinyin.cpp
+++ b/libchinese-segmentation/hanzi-to-pinyin.cpp
@ -0,0 +1,360 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include <mutex>
 #include <cctype>
 #include "hanzi-to-pinyin.h"
 #include "hanzi-to-pinyin-private.h"
 #include "chinese-segmentation.h"
 #include "cppjieba/Unicode.hpp"
 // Storage for the HanZiToPinYin static instance pointer; starts out null.
 HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
 // presumably guards one-time creation of the instance above - its use is not
 // visible in this part of the file; confirm.
 std::once_flag g_singleFlag;
 // Reports whether the pinyin trie has an entry for word.
 bool HanZiToPinYinPrivate::contains(string &word)
 {
    const bool known = m_pinYinTrie.Contains(word);
    return known;
 }
 int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
 {
    results.clear();
    string directResult = m_pinYinTrie.Find(word);
    if (directResult == string()) {
        if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1
            return -1;
        } else {//无结果、启用分词
            vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
            string data;
            for (string &info : segResults) {
                if (info == string()) {
                    continue;
                }
                data = m_pinYinTrie.Find(info);
                if (data == string()) {//分词后无结果
                    if (cppjieba::IsSingleWord(info)) {//单个字符
                        if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
                            results.append(QString().fromStdString(info));
                        } else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
                            continue;
                        }
                    } else {//多个字符
                        string oneWord;
                        cppjieba::RuneStrArray runeArray;
                        cppjieba::DecodeRunesInString(info, runeArray);
                        for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
                            oneWord = cppjieba::GetStringFromRunes(info, i, i);
                            data = m_pinYinTrie.Find(oneWord);
                            if (data == string()) {//单字无结果则按设置返回
                                if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
                                    results.append(QString().fromStdString(oneWord));
                                } else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
                                    continue;
                                }
                            }
                            if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
                                results.append(QString().fromStdString(data));
                            } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
                                if (limonp::IsInStr(data, ',')) {
                                    results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
                                } else {
                                    results.append(QString().fromStdString(data));
                                }
                            }
                        }
                    }
                } else {//分词后有结果
                    if (cppjieba::IsSingleWord(info)) {//单个字符
                        if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
                            results.append(QString().fromStdString(data));
                        } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
                            if (limonp::IsInStr(data, ',')) {
                                results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
                            } else {
                                results.append(QString().fromStdString(data));
                            }
                        }
                    } else {//多个字符
                        vector<string> dataVec = limonp::Split(data, "/");
                        if (dataVec.size() == 1) {//无多音词
                            vector<string> dataVec = limonp::Split(data, ",");
                            for (auto &oneResult : dataVec) {
                                results.append(QString().fromStdString(oneResult));
                            }
                        } else {
                            if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
                                int wordSize = limonp::Split(dataVec[0], ",").size();
                                for (int i = 0; i < wordSize; ++i) {
                                    QStringList oneResult;
                                    for (size_t j = 0; j < dataVec.size(); ++j) {
                                        oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
                                    }
                                    results.append(oneResult.join('/'));
                                }
                            } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
                                vector<string> tmp = limonp::Split(dataVec[0], ",");
                                for (auto &oneResult : tmp) {
                                    results.append(QString().fromStdString(oneResult));
                                }
                            }
                        }
                    }
                }
            }
        }
    } else {//可以直接查到结果
        if (cppjieba::IsSingleWord(word)) {//单个字符
            if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
                results.append(QString().fromStdString(directResult));
            } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
                if (limonp::IsInStr(directResult, ',')) {
                    results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0))));
                } else {
                    results.append(QString().fromStdString(directResult));
                }
            }
        } else {//多个字符
            vector<string> dataVec = limonp::Split(directResult, "/");
            if (dataVec.size() == 1) {//无多音词
                vector<string> dataVec = limonp::Split(directResult, ",");
                for (auto &oneResult : dataVec) {
                    results.append(QString().fromStdString(oneResult));
                }
            } else {
                if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
                    int wordSize = limonp::Split(dataVec[0], ",").size();
                    for (int i = 0; i < wordSize; ++i) {
                        QStringList oneResult;
                        for (size_t j = 0; j < dataVec.size(); ++j) {
                            oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
                        }
                        results.append(oneResult.join('/'));
                    }
                } else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
                    vector<string> tmp = limonp::Split(dataVec[0], ",");
                    for (auto &oneResult : tmp) {
                        results.append(QString().fromStdString(oneResult));
                    }
                }
            }
        }
    }
    convertDataStyle(results);
    return 0;//todo
 }
 // Stores the four conversion settings used by later getResults() calls.
 void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
 {
    this->m_segType = segType;
    this->m_polyphoneType = polyphoneType;
    this->m_exDataProcessType = processType;
    this->m_pinyinDataStyle = dataStyle;
 }
 /**
  * @brief Rewrites every entry of @p results in place according to m_pinyinDataStyle.
  *
  * Entries are expected to arrive in the tone-mark form (the Tone style is a no-op).
  * Inside an entry ',' separates alternative readings and '/' separates the
  * readings of a polyphonic word.
  * PhoneticSymbol maps a tone-marked character to its replacement text; the code
  * uses character 0 as the plain letter and character 1 as the tone digit
  * (assumes every mapped value has at least two characters — TODO confirm).
  *
  * @param results entries to convert; modified in place
  */
 void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
 {
    // ASCII-only letter test. isalpha() must not be fed a plain char that may be
    // negative: QChar::toLatin1() yields such values for Latin-1 tone vowels
    // (e.g. 'à', 'é'), which is undefined behaviour and locale dependent.
    auto isAsciiLetter = [](const QChar &ch) {
        const auto code = ch.unicode();
        return (code >= 'a' && code <= 'z') || (code >= 'A' && code <= 'Z');
    };
    QString value;
    if (m_pinyinDataStyle == PinyinDataStyle::Default) {
        for (QString &info : results) {
            if(info == ",") {
                continue;
            }
            // replace(QChar, QChar) never changes the length of info, so walking it
            // by index while replacing is safe.
            for (int i = 0; i < info.size(); i++) {
                const QChar c = info.at(i);
                if (!isAsciiLetter(c)) {
                    value = PhoneticSymbol.value(c);
                    if (!value.isEmpty()) {
                        info.replace(c, value.at(0));
                    }
                }
            }
            // Remove duplicate readings while keeping their original order.
            QStringList tmpList = info.split(',', QString::SkipEmptyParts);
            QStringList tmpValue;
            for (auto &str : tmpList) {
                if (!tmpValue.contains(str)) {
                    tmpValue.push_back(str);
                }
            }
            info = tmpValue.join(",");
        }
    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
        // Nothing to do: the entries are left untouched for this style.
    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
        for (QString &info : results) {
            for (int i = 0; i < info.size();) {
                const QChar c = info.at(i);
                if (!isAsciiLetter(c)) {
                    value = PhoneticSymbol.value(c);
                    if (!value.isEmpty()) {
                        // Replaces every occurrence of c, then skips the inserted text.
                        info.replace(c, value);
                        i += value.size();
                        continue;
                    }
                }
                i++;
            }
        }
    } else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
        for (QString &info : results) {
            if(info == "/") {
                continue;
            }
            // Polyphonic words use '/' between readings; work on ',' and restore
            // the '/' separators once the whole entry has been processed.
            const bool isPolyphoneWords = info.contains("/");
            if (isPolyphoneWords) {
                info.replace("/", ",");
            }
            for (int i = 0; i < info.size();) {
                const QChar c = info.at(i);
                if (!isAsciiLetter(c)) {
                    value = PhoneticSymbol.value(c);
                    if (!value.isEmpty()) {
                        info.replace(i, 1, value.at(0));
                        // The tone digit goes to the end of the current reading,
                        // i.e. right before the next ',' or at the end of the entry.
                        const int pos = info.indexOf(',', i);
                        if (pos == -1) {
                            // Last (or only) reading.
                            info.append(value.at(1));
                            break;
                        }
                        info.insert(pos, value.at(1));
                        // The insert moved ',' to pos + 1; continue right after it.
                        i = pos + 2;
                        continue;
                    }
                }
                i++;
            }
            // Restoring here (instead of inside the loop) keeps entries with three or
            // more readings, and entries without any tone mark, correctly separated.
            if (isPolyphoneWords) {
                info.replace(",", "/");
            }
        }
    } else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
        for (QString &info : results) {
            if(info == "," or info == "/") {
                continue;
            }
            const bool isPolyphoneWords = info.contains("/");
            if (isPolyphoneWords) {
                info.replace("/", ",");
            }
            for (int i = 0; i < info.size(); i++) {
                const QChar c = info.at(i);
                if (!isAsciiLetter(c)) {
                    value = PhoneticSymbol.value(c);
                    if (!value.isEmpty()) {
                        info.replace(c, value.at(0));
                    }
                }
            }
            QStringList tmpList = info.split(',', QString::SkipEmptyParts);
            QStringList tmpValue;
            for (auto &str : tmpList) {
                // NOTE(review): the check compares the full reading against the stored
                // initials, so equal initials are normally kept (e.g. "y/y"). This looks
                // like it preserves the reading count of polyphonic words — confirm
                // before turning it into a real de-duplication.
                if (!tmpValue.contains(str)) {
                    tmpValue.push_back(str.at(0));
                }
            }
            if (isPolyphoneWords) {
                info = tmpValue.join("/");
            } else {
                info = tmpValue.join(",");
            }
        }
    } else if (m_pinyinDataStyle == PinyinDataStyle::English) {
        // Not supported yet.
    }
 }
 /**
  * @brief Creates the private implementation and remembers its public owner in q.
  * @param parent the HanZiToPinYin object this private part belongs to
  *
  * The body only contains disabled code that used to create m_pinYinTrie here.
  */
 HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
 {
    //const char * const  SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
    //const char * const  WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
    //m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
    //m_pinYinTrie = new Pinyin4cppTrie;
 }
 /**
  * @brief Destructor; currently performs no work.
  *
  * The disabled code below used to delete m_pinYinTrie.
  */
 HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
 {
 //    if (m_pinYinTrie){
 //        delete m_pinYinTrie;
 //        m_pinYinTrie = nullptr;
 //    }
 }
 /**
  * @brief Returns the shared HanZiToPinYin object, creating it on first use.
  *
  * The allocation is routed through call_once on g_singleFlag, so it is
  * executed a single time; the object is never deleted here.
  */
 HanZiToPinYin * HanZiToPinYin::getInstance()
 {
    const auto createManager = []() {
        g_pinYinManager = new HanZiToPinYin;
    };
    call_once(g_singleFlag, createManager);
    return g_pinYinManager;
 }
 /**
  * @brief Forwards the lookup to the private implementation.
  * @param word text to look up
  * @return whatever HanZiToPinYinPrivate::contains reports for @p word
  */
 bool HanZiToPinYin::contains(string &word)
 {
    const bool known = d->contains(word);
    return known;
 }
 /**
  * @brief isMultiTone overloads; each one forwards @p word to the private
  *        implementation and returns its answer unchanged.
  *
  * Inside every overload @p word is a named variable, so it is passed on as an
  * lvalue (const for the two const overloads).
  */
 bool HanZiToPinYin::isMultiTone(string &word)
 {
    const bool multi = d->isMultiTone(word);
    return multi;
 }
 // Overload for temporaries.
 bool HanZiToPinYin::isMultiTone(string &&word)
 {
    const bool multi = d->isMultiTone(word);
    return multi;
 }
 // Overload for read-only strings.
 bool HanZiToPinYin::isMultiTone(const string &word)
 {
    const bool multi = d->isMultiTone(word);
    return multi;
 }
 // Overload for const temporaries.
 bool HanZiToPinYin::isMultiTone(const string &&word)
 {
    const bool multi = d->isMultiTone(word);
    return multi;
 }
 /**
  * @brief Forwards the pinyin lookup to the private implementation.
  * @param word    text to convert (taken by value)
  * @param results list handed to HanZiToPinYinPrivate::getResults to be filled
  * @return the status code returned by the private implementation
  */
 int HanZiToPinYin::getResults(string word, QStringList &results)
 {
    const int status = d->getResults(word, results);
    return status;
 }
 /**
  * @brief Passes the four configuration values on to the private implementation.
  * @param dataStyle     output style
  * @param segType       segmentation switch
  * @param polyphoneType polyphone switch
  * @param processType   handling of text without pinyin data
  */
 void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle,
                               SegType segType,
                               PolyphoneType polyphoneType,
                               ExDataProcessType processType)
 {
    d->setConfig(dataStyle,
                 segType,
                 polyphoneType,
                 processType);
 }
 /**
  * @brief Creates the object together with its private implementation.
  *
  * NOTE(review): the private object is created without passing `this`; this
  * presumably relies on a default argument of the private constructor — confirm.
  */
 HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate)
 {
 }
--- a/libchinese-segmentation/hanzi-to-pinyin.h
+++ b/libchinese-segmentation/hanzi-to-pinyin.h
@ -0,0 +1,82 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef HANZITOPINYIN_H
 #define HANZITOPINYIN_H
 #include <QtCore/qglobal.h>
 #include <QStringList>
 #include "pinyin4cpp-common.h"
 #define PINYINMANAGER_EXPORT Q_DECL_IMPORT
 using namespace std;
 class HanZiToPinYinPrivate;
 /**
  * @brief Public entry point for converting Chinese text to pinyin.
  *
  * Instances are obtained through getInstance(); construction, destruction and
  * copying are not available to callers. The work is delegated to
  * HanZiToPinYinPrivate via the d pointer.
  */
 class PINYINMANAGER_EXPORT HanZiToPinYin
 {
 public:
    /// Returns the shared instance.
    static HanZiToPinYin * getInstance();
 public:
    /**
     * @brief HanZiToPinYin::isMultiTone Tells whether a character/word/sentence is polyphonic
     * @param word the character/word/sentence to check
     * @return bool false when it is not polyphonic
     */
    bool isMultiTone(string &word);
    bool isMultiTone(string &&word);
    bool isMultiTone(const string &word);
    bool isMultiTone(const string &&word);
    /**
     * @brief HanZiToPinYin::contains Checks whether pinyin exists for a character/word/sentence (i.e. whether the database contains it)
     * @param word the character/word/sentence to look up
     * @return bool false when the database does not contain it
     */
    bool contains(string &word);
    /**
     * @brief HanZiToPinYin::getResults Fetches the pinyin of a character/word/sentence
     * @param word the character/word/sentence whose pinyin is wanted
     * @param results pinyin list of word (may hold several readings); cleared on every call
     * @return int 0 when pinyin was found, -1 otherwise
     */
    int getResults(string word, QStringList &results);
    /**
     * @brief setConfig Configures the features of HanZiToPinYin, see pinyin4cpp-common.h
     * @param dataStyle style of the returned data, Default by default
     * @param segType whether segmentation is enabled, enabled by default
     * @param polyphoneType whether polyphones are enabled, disabled by default
     * @param processType how text without pinyin data is handled, Default by default
     */
    void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
 protected:
    // Construction and destruction are restricted; copying is disabled.
    HanZiToPinYin();
    ~HanZiToPinYin();
    HanZiToPinYin(const HanZiToPinYin&) = delete;
    HanZiToPinYin& operator =(const HanZiToPinYin&) = delete;
 private:
    static HanZiToPinYin *g_pinYinManager; // shared instance returned by getInstance()
    HanZiToPinYinPrivate *d = nullptr;     // private implementation
 };
 #endif // HANZITOPINYIN_H
--- a/libchinese-segmentation/libchinese-segmentation.pro
+++ b/libchinese-segmentation/libchinese-segmentation.pro
@ -0,0 +1,86 @@
 # qmake project for the chinese-segmentation library (built without QtGui).
 QT -= gui
 VERSION = 1.1.0
 TARGET =  chinese-segmentation
 TEMPLATE = lib
 DEFINES += LIBCHINESESEGMENTATION_LIBRARY
 # Makes the version available to the sources as a string literal named VERSION.
 DEFINES += VERSION='\\"$${VERSION}\\"'
 # create_pc/create_prl generate pkg-config and .prl files; the .prl file is not installed.
 CONFIG += c++11 create_pc create_prl no_install_prl
 # The following define makes your compiler emit warnings if you use
 # any Qt feature that has been marked deprecated (the exact warnings
 # depend on your compiler). Please consult the documentation of the
 # deprecated API in order to know how to port your code away from it.
 DEFINES += QT_DEPRECATED_WARNINGS
 # Treat missing return statements and returning addresses of locals as errors.
 QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr
 #QMAKE_CXXFLAGS += -Werror=uninitialized
 # NOTE(review): this spelling looks like the MSVC execution-charset switch;
 # GCC's equivalent is -fexec-charset=utf-8 — confirm it has the intended effect.
 QMAKE_CXXFLAGS += -execution-charset:utf-8
 # You can also make your code fail to compile if it uses deprecated APIs.
 # In order to do so, uncomment the following line.
 # You can also select to disable deprecated APIs only up to a certain version of Qt.
 #DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
 # Sub-components, each contributing its own sources and headers.
 include(cppjieba/cppjieba.pri)
 include(pinyin4cpp/pinyin4cpp.pri)
 include(Traditional-Chinese-Simplified-conversion/Traditional2Simplified.pri)
 include(storage-base/storage-base-cedar.pri)
 #LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
 # Sources and headers of the library's own front-end classes.
 SOURCES += \
    chinese-segmentation.cpp \
    hanzi-to-pinyin.cpp \
    Traditional-to-Simplified.cpp
 HEADERS += \
    chinese-segmentation-private.h \
    chinese-segmentation.h \
    common-struct.h \
    hanzi-to-pinyin-private.h \
    hanzi-to-pinyin.h \
    Traditional-to-Simplified-private.h \
    Traditional-to-Simplified.h \
    pinyin4cpp-common.h \
    libchinese-segmentation_global.h
 # Directory the dictionaries are installed into. The same value is compiled
 # into the library as the string macro DICT_INSTALL_PATH.
 DICT_INSTALL_PATH = /usr/share/chinese-segmentation/res/dict
 DEFINES += DICT_INSTALL_PATH='\\"$${DICT_INSTALL_PATH}\\"'
 # "$$" is required to expand the variable; without it the install path would be
 # the literal text "DICT_INSTALL_PATH".
 dict_files.path = $$DICT_INSTALL_PATH
 # One continued list: repeating "dict_files.files +=" after a trailing backslash
 # would add those words themselves to the list of files.
 dict_files.files = \
    $$PWD/dict/*.utf8 \
    $$PWD/dict/pos_dict/*.utf8 \
    $$PWD/dict/*.txt \
    $$PWD/pinyin4cpp/dict/*.txt \
    $$PWD/Traditional-Chinese-Simplified-conversion/dict/*.txt
 # No trailing backslash, so the statement cannot run into the following lines.
 INSTALLS += dict_files
 # Default rules for deployment.
 unix {
    # The library goes into Qt's library directory; the pkg-config file
    # describes that location and the public include directory.
    target.path = $$[QT_INSTALL_LIBS]
    QMAKE_PKGCONFIG_NAME = chinese-segmentation
    QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files
    QMAKE_PKGCONFIG_VERSION = $$VERSION
    QMAKE_PKGCONFIG_LIBDIR = $$target.path
    QMAKE_PKGCONFIG_DESTDIR = pkgconfig
    QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-segmentation
    QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-segmentation
 !isEmpty(target.path): INSTALLS += target
    # Public headers installed for users of the library.
    header.path = /usr/include/chinese-segmentation
    header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h Traditional-to-Simplified.h
    header.files += development-files/header-files/*
 #    headercppjieba.path = /usr/include/chinese-seg/cppjieba/
 #    headercppjieba.files = cppjieba/*
    INSTALLS += header
 }
 #DISTFILES += \
 #    jiaba/jieba.pri
 # Extra files listed for distribution.
 DISTFILES += \
    development-files/header-files/* \
    pinyin4cpp/pinyin4cpp.pri
--- a/libchinese-segmentation/libchinese-segmentation_global.h
+++ b/libchinese-segmentation/libchinese-segmentation_global.h
@ -0,0 +1,32 @@
 /*
 * Copyright (C) 2020, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: zhangzihao <zhangzihao@kylinos.cn>
 * Modified by: zhangpengfei <zhangpengfei@kylinos.cn>
 *
 */
 #ifndef CHINESESEGMENTATION_GLOBAL_H
 #define CHINESESEGMENTATION_GLOBAL_H
 #include <QtCore/qglobal.h>
 // CHINESESEGMENTATION_EXPORT marks the public symbols of the library: exported
 // while the library itself is being built, imported by its users.
 // The qmake project defines LIBCHINESESEGMENTATION_LIBRARY, so both spellings of
 // the "building the library" macro are accepted here.
 #if defined(CHINESESEGMENTATION_LIBRARY) || defined(LIBCHINESESEGMENTATION_LIBRARY)
 #  define CHINESESEGMENTATION_EXPORT Q_DECL_EXPORT
 #else
 #  define CHINESESEGMENTATION_EXPORT Q_DECL_IMPORT
 #endif
 #endif // CHINESESEGMENTATION_GLOBAL_H
--- a/libchinese-segmentation/pinyin4cpp-common.h
+++ b/libchinese-segmentation/pinyin4cpp-common.h
@ -0,0 +1,73 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef PINYIN4CPP_COMMON_H
 #define PINYIN4CPP_COMMON_H
 /**
  * @brief The PinyinDataStyle enum
  * Output style of the generated pinyin. The examples show the result for “中心”.
  * Each enumerator occupies its own bit.
  */
 enum class PinyinDataStyle {
    Default     = 0x01, ///< default style: “zhong xin”
    Tone        = 0x02, ///< with tone marks: “zhōng xīn”
    Tone2       = 0x04, ///< tone style 2: “zho1ng xi1n”
    Tone3       = 0x08, ///< tone style 3: “zhong1 xin1”
    FirstLetter = 0x10, ///< first letters only: “z x”
    English     = 0x20  ///< English translation (not supported yet): “center,heart,core”
 };
 /**
  * @brief The SegType enum
  * Selects whether the text is segmented into words before conversion.
  */
 enum class SegType {
    Segmentation   = 0x01, ///< default, with segmentation: “银河麒麟” -> “银河” “麒麟”
    NoSegmentation = 0x02  ///< no segmentation: “银河麒麟”
 };
 /**
  * @brief The PolyphoneType enum
  * Selects whether all readings of polyphonic characters are returned.
  * Note: polyphonic words are returned in the form “朝阳” -> "zhao/chao yang/yang".
  */
 enum class PolyphoneType {
    Disable = 0x01, ///< default: “奇安信” -> “qi an xin”, polyphones use their common reading
    Enable  = 0x02  ///< polyphones enabled: “奇安信” -> “qi,ji an xin”
 };
 /**
  * @brief The ExDataProcessType enum
  * Selects what happens to text that has no pinyin data.
  * The examples use “123木头人” in segmentation mode.
  */
 enum class ExDataProcessType {
    Default = 0x01, ///< default, such text is returned as is: "123 mu tou ren"
    Delete  = 0x02  ///< such text is removed: "mu tou ren"
 };
 #endif //PINYIN4CPP_COMMON_H
--- a/libchinese-segmentation/pinyin4cpp/dict/singleWordPinyin.txt
+++ b/libchinese-segmentation/pinyin4cpp/dict/singleWordPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/dict/wordsPinyin.txt
+++ b/libchinese-segmentation/pinyin4cpp/dict/wordsPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.cpp
@ -0,0 +1,126 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include "pinyin4cpp-trie.h"
 /**
  * @brief Builds the trie from the two installed pinyin dictionaries.
  * @param dat_cache_path cache location handed to the StorageBase base class
  *
  * Init() is invoked from the constructor body once the base class is set up.
  */
 Pinyin4cppTrie::Pinyin4cppTrie(string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(
          vector<string>{SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH},
          dat_cache_path)
 {
    Init();
 }
 /**
  * @brief Same as above, but with a caller-supplied list of source files.
  * @param file_paths     files handed to the StorageBase base class
  * @param dat_cache_path cache location handed to the StorageBase base class
  */
 Pinyin4cppTrie::Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(
          file_paths,
          dat_cache_path)
 {
    Init();
 }
 /**
  * @brief Reports whether Find() yields a non-empty value for @p word.
  * @param word key to look up
  * @return true when a non-empty value is stored for the key
  */
 bool Pinyin4cppTrie::Contains(string &word) {
    const string stored = Find(word);
    return !stored.empty();
 }
 /**
  * @brief Reports whether the value stored for @p word lists several readings.
  * @param word key to look up
  * @return true when the stored value contains a ',' separator; false when it
  *         does not, which includes keys that are not found (empty value)
  *
  * The previous implementation returned the opposite of this test.
  */
 bool Pinyin4cppTrie::IsMultiTone(const string &word) {
    const string result = this->Find(word);
    return result.find(",") != result.npos;
 }
 /**
  * @brief Builds the cache file from the two dictionaries.
  *
  * The data is written to a temporary file next to @p dat_cache_file:
  * header, the values of both dictionaries, then the trie array. Afterwards the
  * three integers following the md5 field are overwritten with the final counts
  * (assumes they directly follow md5_hex in CacheFileHeaderBase — TODO confirm),
  * and the temporary file is handed to tryRename().
  *
  * @param dat_cache_file final path of the cache file
  * @param md5            checksum stored in the header; must have the size of md5_hex
  */
 void Pinyin4cppTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
 {
    CacheFileHeaderBase header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // The umask is process-wide state: change it only around mkstemp and put the
    // previous value back right away.
    const mode_t previous_mask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_mask);
    assert(fd >= 0);
    if (fd < 0) {
        // Release builds: do not write to or rename a file that was never created.
        return;
    }
    fchmod(fd, 0644);
    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
    this->LoadSingleWordDict(fd, write_bytes, offset, elements_num);
    this->LoadWordsDict(fd, write_bytes, offset, elements_num);
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
    // Go back to just behind the md5 field and store the final numbers.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    close(fd);
    assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
    tryRename(tmp_filepath, dat_cache_file);
 }
 /**
  * @brief Looks up @p key and returns the value stored for it.
  * @param key exact key to search for
  * @return the NUL-terminated text found at the matched position of the element
  *         buffer, or an empty string when the search returns a negative result
  */
 string Pinyin4cppTrie::Find(const string &key)
 {
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    return (index < 0) ? string() : string(&this->GetElementPtr()[index]);
 }
 void Pinyin4cppTrie::LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
 {
    ifstream ifs(SINGLE_WORD_PINYIN_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, ":");
        if (buf.size() != 3)
            continue;
        this->Update(buf[2].c_str(), buf[2].size(), offset);
        offset += (buf[1].size() + 1);
        elements_num++;
        write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
    }
 }
 void Pinyin4cppTrie::LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
 {
    ifstream ifs(WORDS_PINYIN_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, ":");
        if (buf.size() != 2)
            continue;
        this->Update(buf[0].c_str(), buf[0].size(), offset);
        offset += (buf[1].size() + 1);
        elements_num++;
        write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
    }
 }
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp-trie.h
@ -0,0 +1,43 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef PINYIN4CPPTRIE_H
 #define PINYIN4CPPTRIE_H
 #include "storage-base.hpp"
 // Installed locations of the two source dictionaries; DICT_INSTALL_PATH is a
 // string macro supplied by the build.
 const char * const  SINGLE_WORD_PINYIN_PATH = DICT_INSTALL_PATH"/singleWordPinyin.txt";
 const char * const  WORDS_PINYIN_PATH = DICT_INSTALL_PATH"/wordsPinyin.txt";
 /**
  * @brief Trie that maps characters and words to their pinyin text, built on StorageBase.
  */
 class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
 {
 public:
    /// Uses the two installed dictionaries as source files.
    Pinyin4cppTrie(string dat_cache_path = "");
    /// Uses the given source files instead.
    Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path = "");
    /// Writes the cache file for the dictionaries (overrides the base class hook).
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
    /// Returns the value stored for key, or an empty string when there is none.
    string Find(const string &key);
    /// True when a non-empty value is stored for word.
    bool Contains(string &word);
    /// Tests the stored value of word for the ',' separator.
    bool IsMultiTone(const string &word);
 private:
    // Both helpers append dictionary values to fd and update the running counters.
    void LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
    void LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
 };
 #endif // PINYIN4CPPTRIE_H
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp.pri
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp.pri
@ -0,0 +1,15 @@
 # qmake include for the pinyin4cpp component; $$PWD is the directory of this file.
 INCLUDEPATH += $$PWD
 # Headers and sources of the trie implementations.
 HEADERS += \
    $$PWD/pinyin4cpp-trie.h \
    $$PWD/pinyin4cpp_dataTrie.h \
    $$PWD/pinyin4cpp_dictTrie.h
 SOURCES += \
    $$PWD/pinyin4cpp-trie.cpp \
    $$PWD/pinyin4cpp_dataTrie.cpp \
    $$PWD/pinyin4cpp_dictTrie.cpp
 # Dictionary data listed for distribution (paths are not prefixed with $$PWD).
 DISTFILES += \
    pinyin4cpp/dict/wordsPinyin.txt \
    pinyin4cpp/dict/singleWordPinyin.txt
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.cpp
@ -0,0 +1,135 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include "pinyin4cpp_dataTrie.h"
 /**
  * @brief Creates an empty, unattached trie; the constructor body does nothing.
  */
 Pinyin4cppDataTrie::Pinyin4cppDataTrie()
 {
 }
 /**
  * @brief Releases the memory mapping and the file descriptor, if any are held.
  *
  * Both handles are checked first so that an object that was never attached
  * (or whose mapping failed) does not call munmap()/close() on invalid values.
  */
 Pinyin4cppDataTrie::~Pinyin4cppDataTrie()
 {
    const bool mapped = (m_mmapAddr != nullptr)
                        && (m_mmapAddr != reinterpret_cast<char *>(MAP_FAILED));
    if (mapped) {
        munmap(m_mmapAddr, m_mmapLength);
    }
    m_mmapAddr = nullptr;
    if (m_mmapFd >= 0) {
        close(m_mmapFd);
    }
    m_mmapFd = -1;
 }
 /**
  * @brief Looks up @p key in the double-array trie (darts-clone interface).
  * @param key exact key to search for
  * @return the NUL-terminated text at the matched position of the element area,
  *         or an empty string when nothing matched or the position is outside
  *         the element area
  *
  * With the cedar backend the equivalent call would be
  * exactMatchSearch<int>(key.c_str(), key.size()), a negative result meaning
  * "not found".
  */
 string Pinyin4cppDataTrie::Find(const string &key) const {
    Darts::DoubleArray::result_pair_type match;
    m_DoubleArrayDataTrie.exactMatchSearch(key.c_str(), match);
    // todo (kept from the original): review these validity checks.
    const bool noMatch = (0 == match.length);
    if (noMatch) {
        return string();
    }
    const bool outOfRange = (match.value < 0) || ((size_t)match.value >= m_elementsSize);
    if (outOfRange) {
        return string();
    }
    return string(&m_elementsPtr[match.value]);
 }
 /**
  * @brief Writes the cache file for @p elements and then attaches to it.
  * @param elements       key/value pairs to store
  * @param dat_cache_file path of the cache file
  * @param md5            checksum stored in, and later compared with, the header
  * @return the result of InitAttachDat() on the freshly written file
  */
 bool Pinyin4cppDataTrie::InitBuildDat(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    this->BuildDatCache(elements, dat_cache_file, md5);
    const bool attached = this->InitAttachDat(dat_cache_file, md5);
    return attached;
 }
 bool Pinyin4cppDataTrie::InitAttachDat(const string &dat_cache_file, const string &md5) {
    m_mmapFd = open(dat_cache_file.c_str(), O_RDONLY);
    if (m_mmapFd < 0) {
        return false;
    }
    const auto seek_off = lseek(m_mmapFd, 0, SEEK_END);
    assert(seek_off >= 0);
    m_mmapLength = static_cast<size_t>(seek_off);
    m_mmapAddr = reinterpret_cast<char *>(mmap(NULL, m_mmapLength, PROT_READ, MAP_SHARED, m_mmapFd, 0));
    assert(MAP_FAILED != m_mmapAddr);
    assert(m_mmapLength >= sizeof(CacheFileHeader));
    CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(m_mmapAddr);
    m_elementsNum = header.elements_num;
    m_elementsSize = header.elements_size;
    assert(sizeof(header.md5_hex) == md5.size());
    if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
        return false;
    }
    assert(m_mmapLength == sizeof(CacheFileHeader) + header.elements_size  + header.dat_size * m_DoubleArrayDataTrie.unit_size());
    m_elementsPtr = (const char *)(m_mmapAddr + sizeof(CacheFileHeader));
    const char * dat_ptr = m_mmapAddr + sizeof(CacheFileHeader) + header.elements_size;
    m_DoubleArrayDataTrie.set_array((char *)dat_ptr, header.dat_size);
    return true;
 }
 /**
  * @brief Builds the double array for @p elements and writes the cache file.
  *
  * File layout: CacheFileHeader, all values (each followed by a NUL), then the
  * double array. The data goes to a temporary file that is renamed to
  * @p dat_cache_file at the end.
  *
  * @param elements       key/value pairs; every value must be non-empty
  * @param dat_cache_file final path of the cache file
  * @param md5            checksum stored in the header; must have the size of md5_hex
  */
 void Pinyin4cppDataTrie::BuildDatCache(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    vector<string> mem_elem_vec;
    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());
    mem_elem_vec.reserve(elements.size());
    CacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
    int offset(0);
    for (auto &info:elements) {
        keys_ptr_vec.push_back(info.first.c_str());
        // The value of a key is the position of its text inside the element area.
        values_vec.push_back(offset);
        offset += (info.second.size() + 1);// +1 for the NUL that follows each string
        assert(info.second.size() > 0);
        mem_elem_vec.push_back(info.second);
    }
    // data() is valid for empty vectors as well, unlike &vec[0].
    auto const ret = m_DoubleArrayDataTrie.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    assert(0 == ret);
    (void)ret;
    header.elements_num = mem_elem_vec.size();
    header.elements_size = offset;
    header.dat_size = m_DoubleArrayDataTrie.size();
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // The umask is process-wide state: change it only around mkstemp and put the
    // previous value back right away.
    const mode_t previous_mask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_mask);
    assert(fd >= 0);
    if (fd < 0) {
        // Release builds: nothing was created, so there is nothing to write or rename.
        return;
    }
    fchmod(fd, 0644);
    auto write_bytes = write(fd, (const char *)&header, sizeof(header));
    for (size_t i = 0; i < elements.size(); ++i) {
        write_bytes += write(fd, mem_elem_vec[i].c_str(), mem_elem_vec[i].size() + 1);
    }
    write_bytes += write(fd, m_DoubleArrayDataTrie.array(), m_DoubleArrayDataTrie.total_size());
    assert((size_t)write_bytes == sizeof(header) + offset + m_DoubleArrayDataTrie.total_size());
    close(fd);
    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    (void)rename_ret;
 }
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dataTrie.h
@ -0,0 +1,74 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef PINYIN4cpp_DATATRIE_H
 #define PINYIN4cpp_DATATRIE_H
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <QDebug>
 #include "Md5.hpp"
 #include "LocalVector.hpp"
 #include "StringUtil.hpp"
 //#define USE_REDUCED_TRIE
 #include "../storage-base/cedar/cedar.h"
 #include "../storage-base/darts-clone/darts.h"
 using namespace std;
 using std::pair;
 /**
  * @brief Header stored at the beginning of a cache file.
  */
 struct CacheFileHeader { //todo: byte alignment
    char md5_hex[32] = {};        // checksum text of the source dictionaries
    uint32_t elements_num = 0;    // number of stored values
    uint32_t elements_size = 0;   // size in bytes of the value area
    uint32_t dat_size = 0;        // size() of the double array
 };
 /**
  * @brief Read-only lookup structure backed by a memory-mapped cache file.
  *
  * The copy constructor and copy assignment are declared private.
  */
 class Pinyin4cppDataTrie {
 public:
    Pinyin4cppDataTrie();
    ~Pinyin4cppDataTrie();
    /// Returns the value stored for key, or an empty string.
    string Find(const string & key) const;
    /// Writes the cache file for elements, then attaches to it.
    bool InitBuildDat(map<string, string>& elements, const string & dat_cache_file, const string & md5);
    /// Attaches to an existing cache file whose header carries md5.
    bool InitAttachDat(const string & dat_cache_file, const string & md5);
 private:
    void BuildDatCache(map<string, string>& elements, const string & dat_cache_file, const string & md5);
    Pinyin4cppDataTrie(const Pinyin4cppDataTrie &);
    Pinyin4cppDataTrie &operator=(const Pinyin4cppDataTrie &);
 private:
    Darts::DoubleArray m_DoubleArrayDataTrie;
    //cedar::da<int, -1, -2, true> m_DoubleArrayDataTrie;
    const char * m_elementsPtr = nullptr;   // start of the value area inside the mapping
    size_t m_elementsNum = 0;               // elements_num read from the header
    size_t m_elementsSize = 0;              // elements_size read from the header
    size_t m_mmapLength = 0;                // length of the mapping in bytes
    int    m_mmapFd = -1;                   // descriptor of the mapped cache file
    char * m_mmapAddr = nullptr;            // start of the mapping
 };
 #endif //PINYIN4cpp_DATATRIE_H
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.cpp
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.cpp
@ -0,0 +1,156 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #include "pinyin4cpp_dictTrie.h"
 #include "malloc.h"
 /**
  * @brief Creates the dictionary trie; all work is done by Init().
  * @param single_word_dict_path path of the single-character dictionary
  * @param words_dict_paths      path of the word dictionary
  * @param dat_cache_path        cache file location, may be empty
  */
 Pinyin4cppDictTrie::Pinyin4cppDictTrie(const string &single_word_dict_path,
                                        const string &words_dict_paths,
                                        const string &dat_cache_path)
 {
    this->Init(single_word_dict_path, words_dict_paths, dat_cache_path);
 }
 /**
  * @brief Returns what the underlying data trie stores for @p word.
  * @param word key to look up
  * @return the stored text, or an empty string when the data trie finds nothing
  */
 string Pinyin4cppDictTrie::Find(const string &word) const {
    string stored = m_DataTrie.Find(word);
    return stored;
 }
 /**
  * @brief Reports whether the data trie stores a non-empty value for @p word.
  * @param word key to look up
  * @return true when a non-empty value is found
  */
 bool Pinyin4cppDictTrie::Contains(string &word) {
    const string stored = m_DataTrie.Find(word);
    return !stored.empty();
 }
 /**
  * @brief Reports whether the value stored for @p word lists several readings.
  * @param word key to look up
  * @return true when the stored value contains a ',' separator; false when it
  *         does not, which includes keys that are not found (empty value)
  *
  * The previous implementation returned the opposite of this test.
  * NOTE(review): only ',' is tested; values merged with '/' are not detected —
  * confirm whether they should be.
  */
 bool Pinyin4cppDictTrie::IsMultiTone(const string &word) {
    const string result = m_DataTrie.Find(word);
    return result.find(",") != result.npos;
 }
 /**
  * @brief Returns the value stored in m_TotalDictSize_ (set by Init()).
  */
 size_t Pinyin4cppDictTrie::GetTotalDictSize() const {
    const size_t total = m_TotalDictSize_;
    return total;
 }
 /**
  * @brief Attaches to a matching cache file or builds a new one.
  *
  * The checksum of both dictionaries decides whether an existing cache can be
  * used. When attaching fails, both dictionaries are loaded into
  * m_StaticNodeInfos, the cache is built from them, and the temporary map is
  * cleared again.
  *
  * @param single_word_dict_path path of the single-character dictionary
  * @param words_dict_paths      path of the word dictionary
  * @param dat_cache_path        cache file location; when empty, a file named
  *                              after the checksum under /tmp/ is used
  */
 void Pinyin4cppDictTrie::Init(const string &single_word_dict_path, const string &words_dict_paths, string dat_cache_path) {
    size_t total_bytes = 0;
    const string all_dicts = single_word_dict_path + "|" + words_dict_paths;
    const string md5 = CalcFileListMD5(all_dicts, total_bytes);
    m_TotalDictSize_ = total_bytes;
    // Without an explicit location the cache lives in the tmp directory.
    if (dat_cache_path.empty()) {
        dat_cache_path = "/tmp/" + md5 + ".dat_cache";
    }
    qDebug() << "#####Pinyin Dict path:" << dat_cache_path.c_str();
    const bool attached = m_DataTrie.InitAttachDat(dat_cache_path, md5);
    if (!attached) {
        LoadSingleWordDict(single_word_dict_path);
        LoadWordsDict(words_dict_paths);
        bool build_ret = m_DataTrie.InitBuildDat(m_StaticNodeInfos, dat_cache_path, md5);
        assert(build_ret);
        m_StaticNodeInfos.clear();
        malloc_trim(0);
    }
 }
 void Pinyin4cppDictTrie::LoadSingleWordDict(const string &filePath) {
    ifstream ifs(filePath.c_str());
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#")) {
            continue;
        }
        limonp::Split(line, buf, ":");
        assert(buf.size() == SINGLE_WORD_DICT_COLUMN_NUM);
        if (m_StaticNodeInfos.find(buf[2]) != m_StaticNodeInfos.end()) {
            vector<string> tmp;
            bool isfind(false);
            limonp::Split(m_StaticNodeInfos[buf[2]], tmp, ",");
            for (auto &onePinyin:tmp) {
                if (onePinyin == buf[1]) {
                    isfind = true;
                    break;
                }
            }
            if (!isfind) {
                m_StaticNodeInfos[buf[2]] += ("," + buf[2]);
            }
        } else {
            m_StaticNodeInfos[buf[2]] = buf[1];
        }
    }
 }
 void Pinyin4cppDictTrie::LoadWordsDict(const string &filePath) {
    ifstream ifs(filePath.c_str());
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#")) {
            continue;
        }
        limonp::Split(line, buf, ":");
        assert(buf.size() == WORDS_DICT_COLUMN_NUM);
        if (m_StaticNodeInfos.find(buf[0]) != m_StaticNodeInfos.end()) {
            vector<string> tmp;
            bool isfind(false);
            limonp::Split(m_StaticNodeInfos[buf[0]], tmp, "/");
            for (auto &onePinyin:tmp) {
                if (onePinyin == buf[1]) {
                    isfind = true;
                    break;
                }
            }
            if (!isfind) {
                m_StaticNodeInfos[buf[0]] += ("/" + buf[1]);
            }
        } else {
            m_StaticNodeInfos[buf[0]] = buf[1];
        }
    }
 }
 /**
  * @brief Computes one MD5 over the contents of several files.
  * @param files_list    file paths separated by '|' or ';'
  * @param file_size_sum receives the summed size of the files that were hashed
  * @return the digest text produced by limonp::MD5
  *
  * Files that cannot be opened, are empty, or cannot be mapped are skipped.
  */
 string CalcFileListMD5(const string &files_list, size_t &file_size_sum) {
    limonp::MD5 md5;
    const auto files = limonp::Split(files_list, "|;");
    file_size_sum = 0;
    for (auto const & local_path : files) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0){
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            assert(MAP_FAILED != addr);
            // Release builds: never hash or unmap a failed mapping.
            if (MAP_FAILED != addr) {
                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }
    md5.Final();
    return string(md5.digestChars);
 }
--- a/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.h
+++ b/libchinese-segmentation/pinyin4cpp/pinyin4cpp_dictTrie.h
@ -0,0 +1,59 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef PINYIN4cpp_DICTTRIE_H
 #define PINYIN4cpp_DICTTRIE_H
 #include "pinyin4cpp_dataTrie.h"
 using namespace std;
 // Expected number of fields per line of the single-character dictionary.
 // NOTE(review): presumably checked by LoadSingleWordDict -- confirm in the .cpp.
 const size_t SINGLE_WORD_DICT_COLUMN_NUM = 3;
 // Expected number of ':'-separated fields per line of the words dictionary
 // (word and pinyin); LoadWordsDict asserts this count for every parsed line.
 const size_t WORDS_DICT_COLUMN_NUM = 2;
 // Pinyin dictionary built from a single-character dictionary file and one or more
 // word dictionary files, with lookups served through a Pinyin4cppDataTrie member.
 // NOTE(review): only the declarations are visible here; behaviour notes below
 // that are not derived from LoadWordsDict are assumptions to confirm in the .cpp.
 class Pinyin4cppDictTrie {
 public:
    // single_word_dict_path: path of the single-character dictionary.
    // words_dict_paths:      path(s) of the word dictionaries (plural name suggests a list -- confirm separator).
    // dat_cache_path:        optional cache location; defaults to the empty string.
    Pinyin4cppDictTrie(const string& single_word_dict_path, const string& words_dict_paths, const string & dat_cache_path = "");
    // Nothing to release explicitly; members clean up after themselves.
    ~Pinyin4cppDictTrie() {}
    // Presumably returns the pinyin stored for `word` -- confirm result for unknown words.
    string Find(const string &word) const;
    // Presumably reports whether `word` is in the dictionary.
    // NOTE(review): takes a non-const reference -- check whether `word` is modified.
    bool Contains(string &word);
    // Presumably reports whether `word` has more than one reading -- confirm.
    bool IsMultiTone(const string &word);
    // Presumably returns m_TotalDictSize_ -- confirm.
    size_t GetTotalDictSize() const;
 private:
    // Shared set-up routine taking the same three paths as the constructor.
    void Init(const string& single_word_dict_path, const string& words_dict_paths, string dat_cache_path);
    // Parses the single-character dictionary file at filePath.
    void LoadSingleWordDict(const string& filePath);
    // Parses a word dictionary file: skips '#' lines, splits each line on ':' into
    // word and pinyin, and merges the pinyin into m_StaticNodeInfos.
    void LoadWordsDict(const string& filePath);
 private:
    // Word -> pinyin text collected by the loaders; LoadWordsDict joins several
    // readings of the same word with '/'.
    map<string, string> m_StaticNodeInfos;
    // Starts at 0; presumably the accumulated size of the dictionary files -- confirm.
    size_t m_TotalDictSize_ = 0;
    // Underlying data trie (declared in pinyin4cpp_dataTrie.h).
    Pinyin4cppDataTrie m_DataTrie;
 };
 // Computes one MD5 digest over the contents of the files named in files_list and
 // stores the summed byte size of the hashed files in file_size_sum.
 // Deliberately NOT declared `inline`: the function is defined in a single .cpp
 // file, and an inline function must be defined in every translation unit that
 // uses it.
 string CalcFileListMD5(const string & files_list, size_t & file_size_sum);
 #endif //PINYIN4cpp_DICTTRIE_H
--- a/libchinese-segmentation/storage-base/cedar/cedar.h
+++ b/libchinese-segmentation/storage-base/cedar/cedar.h
@ -0,0 +1,682 @@
 // cedar -- C++ implementation of Efficiently-updatable Double ARray trie
 //  $Id: cedar.h 1938 2022-03-17 16:22:30Z ynaga $
 // Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
 #ifndef CEDAR_H
 #define CEDAR_H
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <cassert>
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
 namespace cedar {
  // typedefs
  // Byte-sized unsigned type used for edge labels and child/sibling links.
  typedef unsigned char  uchar;
  // Sentinel constants: N1 and N2 are the defaults of da's NO_VALUE and NO_PATH
  // template parameters.
  template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
  // Sentinels for float values, given as raw 32-bit patterns.
  // NOTE(review): these look like IEEE-754 single-precision NaN encodings -- confirm.
  template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
  // Upper bound on the growth step used by _add_block when USE_EXACT_FIT is defined.
  static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
  // dynamic double array
  template <typename value_type,
            const int     NO_VALUE  = NaN <value_type>::N1,
            const int     NO_PATH   = NaN <value_type>::N2,
            const bool    ORDERED   = true,
            const int     MAX_TRIAL = 1,
            const size_t  NUM_TRACKING_NODES = 0>
  class da {
  public:
    // Sentinels returned by the search routines, taken from the NO_VALUE / NO_PATH
    // template parameters; CEDAR_VALUE_LIMIT marks a freshly created node under
    // USE_REDUCED_TRIE (see _pop_enode).
    enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH, CEDAR_VALUE_LIMIT = 2147483647 };
    // Plain result: just the stored value.
    typedef value_type result_type;
    // Result carrying the value plus a length filled in by _set_result.
    struct result_pair_type {
      value_type  value;
      size_t      length;  // prefix length
    };
    // Result carrying value, length and the node index passed to _set_result.
    struct result_triple_type { // for predict ()
      value_type  value;
      size_t      length;  // suffix length
      size_t      id;      // node id of value
    };
    // One slot of the double array. The first word is either a base offset or a
    // stored value (they share storage); `check` holds the parent index for used
    // slots. For unused slots both fields are negative and link the slot into a
    // ring of empty slots.
    struct node {
      union { int base_; value_type value; }; // negative means prev empty index
      int  check;                             // negative means next empty index
      node (const int base__ = 0, const int check_ = 0)
        : base_ (base__), check (check_) {}
      // Decoded base offset; under USE_REDUCED_TRIE the stored base_ is kept as
      // -(base + 1), otherwise it is stored as-is.
 #ifdef USE_REDUCED_TRIE
      int base () const { return - (base_ + 1); } // ~ in two's complement system
 #else
      int base () const { return base_; }
 #endif
    };
    // Per-node link labels kept alongside the array: the label of the node's first
    // child and of its right sibling; 0 means "none". Both start at 0.
    struct ninfo {  // x1.5 update speed; +.25 % memory (8n -> 10n)
      uchar  sibling;   // right sibling (= 0 if not exist)
      uchar  child;     // first child
      ninfo () : sibling (0), child (0) {}
    };
    // Bookkeeping for one run of 256 consecutive array slots. Blocks are chained
    // through prev/next into the lists headed by _bheadF / _bheadC / _bheadO.
    // A new block starts with all 256 slots empty and reject = 257.
    struct block { // a block w/ 256 elements
      int   prev;   // prev block; 3 bytes
      int   next;   // next block; 3 bytes
      short num;    // # empty elements; 0 - 256
      short reject; // minimum # branching failed to locate; soft limit
      int   trial;  // # trial
      int   ehead;  // first empty item
      block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
    };
    // Constructs an empty trie: zero-initialises every member, checks at compile
    // time that value_type fits into an int (it shares storage with node::base_),
    // then allocates the first block through _initialize().
    da () : tracking_node (), _array (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _no_delete (false), _reject () {
      STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
                    value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index
                    );
      _initialize ();
    }
    // Releases all buffers without re-initialising (reuse = false).
    ~da () { clear (false); }
    // Number of node slots currently allocated.
    size_t capacity   () const { return static_cast <size_t> (_capacity); }
    // Number of node slots currently in use by the array (including empty ones).
    size_t size       () const { return static_cast <size_t> (_size); }
    // Size of the node array in bytes (_size slots).
    size_t total_size () const { return sizeof (node) * _size; }
    // Size of a single node in bytes.
    size_t unit_size  () const { return sizeof (node); }
    // Counts the slots whose check field is non-negative, i.e. slots that are not
    // linked into an empty ring.
    size_t nonzero_size () const {
      size_t used = 0;
      int slot = 0;
      while (slot < _size) {
        if (! (_array[slot].check < 0)) ++used;
        ++slot;
      }
      return used;
    }
    size_t num_keys () const {
      size_t i = 0;
      for (int to = 0; to < _size; ++to)
 #ifdef USE_REDUCED_TRIE
        if (_array[to].check >= 0 && _array[to].value >= 0) ++i;
 #else
        if (_array[to].check >= 0 && _array[_array[to].check].base () == to) ++i;
 #endif
      return i;
    }
    // interface
    // Exact lookup of a NUL-terminated key; forwards with its strlen().
    template <typename T>
    T exactMatchSearch (const char* key) const {
      const size_t key_len = std::strlen (key);
      return exactMatchSearch <T> (key, key_len);
    }
    // Exact lookup of the first `len` bytes of `key`, starting from node `from`.
    // A missing path is reported the same way as a missing value
    // (CEDAR_NO_VALUE). The result is packed into T via _set_result.
    template <typename T>
    T exactMatchSearch (const char* key, size_t len, size_t from = 0) const {
      size_t consumed = 0;
      const int raw = _find (key, from, consumed, len);
      union { int i; value_type x; } found;
      found.i = raw;
      if (raw == CEDAR_NO_PATH) found.i = CEDAR_NO_VALUE;
      T answer;
      _set_result (&answer, found.x, len, from);
      return answer;
    }
    // Prefix search over a NUL-terminated key; forwards with its strlen().
    template <typename T>
    size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const {
      const size_t key_len = std::strlen (key);
      return commonPrefixSearch (key, result, result_len, key_len);
    }
    // Walks `key` one byte at a time from node `from` and records every prefix
    // that carries a value. At most result_len entries are written to `result`,
    // but the return value counts all matches found.
    template <typename T>
    size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) const {
      size_t hits = 0;
      size_t pos  = 0;
      while (pos < len) {
        const size_t stop = pos + 1; // advance by exactly one byte
        union { int i; value_type x; } cur;
        cur.i = _find (key, from, pos, stop);
        if (cur.i == CEDAR_NO_VALUE) continue; // prefix exists but has no value
        if (cur.i == CEDAR_NO_PATH)  break;    // no longer prefix can match
        if (hits < result_len) _set_result (&result[hits], cur.x, pos, from);
        ++hits;
      }
      return hits;
    }
    // predict key from double array
    // Predictive search over a NUL-terminated key; forwards with its strlen().
    template <typename T>
    size_t commonPrefixPredict (const char* key, T* result, size_t result_len) {
      const size_t key_len = std::strlen (key);
      return commonPrefixPredict (key, result, result_len, key_len);
    }
    // Locates the node reached by `key` and enumerates the entries below it with
    // begin()/next(). At most result_len entries are written to `result`; the
    // return value counts all entries enumerated. Returns 0 if `key` has no path.
    template <typename T>
    size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) {
      size_t pos = 0;
      if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
      const size_t root = from; // enumeration never climbs above this node
      size_t count = 0;
      size_t depth = 0;
      union { int i; value_type x; } cur;
      cur.i = begin (from, depth);
      while (cur.i != CEDAR_NO_PATH) {
        if (count < result_len) _set_result (&result[count], cur.x, depth, from);
        ++count;
        cur.i = next (from, depth, root);
      }
      return count;
    }
    // Reconstructs `len` label bytes ending at node `to` by climbing the check
    // links towards the root, filling `key` back to front and NUL-terminating it.
    // `key` must have room for len + 1 bytes.
    void suffix (char* key, size_t len, size_t to) const {
      key[len] = '\0';
      for (size_t remaining = len; remaining != 0; --remaining) {
        const int parent = _array[to].check;
        const int label  = _array[parent].base () ^ static_cast <int> (to);
        key[remaining - 1] = static_cast <char> (label);
        to = static_cast <size_t> (parent);
      }
    }
    // Resumable lookup of a NUL-terminated key; forwards with its strlen().
    value_type traverse (const char* key, size_t& from, size_t& pos) const {
      const size_t key_len = std::strlen (key);
      return traverse (key, from, pos, key_len);
    }
    // Resumable lookup: continues from node `from` at offset `pos`; both are
    // updated by _find so the caller can carry on later. Returns the raw _find
    // result reinterpreted as value_type.
    value_type traverse (const char* key, size_t& from, size_t& pos, size_t len) const {
      union { int i; value_type x; } found;
      found.i = _find (key, from, pos, len);
      return found.x;
    }
    // No-op callback used when the caller does not care about relocated nodes.
    struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
    // Inserts/looks up a NUL-terminated key and returns a reference to its value.
    value_type& update (const char* key)
    { return update (key, std::strlen (key)); }
    // Inserts/looks up the first `len` bytes of `key` from the root and adds `val`
    // to the stored value.
    value_type& update (const char* key, size_t len, value_type val = value_type (0))
    { size_t from (0), pos (0); return update (key, from, pos, len, val); }
    // Same, but resumes from node `from` at offset `pos` (both are updated).
    value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val = value_type (0))
    { empty_callback cf; return update (key, from, pos, len, val, cf); }
    // Core insertion routine. Follows (creating where needed) one edge per key byte
    // in [pos, len), then the terminal edge, and adds `val` to the value stored
    // there. `cf` is handed to _follow; _resolve invokes it as cf(old, new) for
    // every node it relocates. A zero-length key at the root is a fatal error.
    template <typename T>
    value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val, T& cf) {
      if (! len && ! from)
        _err (__FILE__, __LINE__, "failed to insert zero-length key\n");
      // Rebuild the auxiliary tables lazily (they are absent after open ()).
 #ifndef USE_FAST_LOAD
      if (! _ninfo || ! _block) restore ();
 #endif
      for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
           pos < len; ++pos) {
 #ifdef USE_REDUCED_TRIE
        const value_type val_ = _array[from].value;
        if (val_ >= 0 && val_ != CEDAR_VALUE_LIMIT) // always new; correct this!
          { const int to = _follow (from, 0, cf); _array[to].value = val_; }
 #endif
        from = static_cast <size_t> (_follow (from, key_[pos], cf));
      }
 #ifdef USE_REDUCED_TRIE
      const int to = _array[from].value >= 0 ? static_cast <int> (from) : _follow (from, 0, cf);
      if (_array[to].value == CEDAR_VALUE_LIMIT) _array[to].value = 0;
 #else
      const int to = _follow (from, 0, cf);
 #endif
      return _array[to].value += val;
    }
    // easy-going erase () without compression
    // Erases a NUL-terminated key; forwards with its strlen().
    int erase (const char* key) {
      const size_t key_len = std::strlen (key);
      return erase (key, key_len);
    }
    // Erases the first `len` bytes of `key`, searched from node `from`.
    // Returns 0 on success, -1 if the key has no path or no value.
    int erase (const char* key, size_t len, size_t from = 0) {
      size_t pos = 0;
      const int found = _find (key, from, pos, len);
      const bool absent = (found == CEDAR_NO_PATH) || (found == CEDAR_NO_VALUE);
      if (absent) return -1;
      erase (from);
      return 0;
    }
    void erase (size_t from) {
      // _test ();
 #ifdef USE_REDUCED_TRIE
      int e = _array[from].value >= 0 ? static_cast <int> (from) : _array[from].base () ^ 0;
      from = static_cast <size_t> (_array[e].check);
 #else
      int e = _array[from].base () ^ 0;
 #endif
      bool flag = false; // have sibling
      do {
        const node& n = _array[from];
        flag = _ninfo[n.base () ^ _ninfo[from].child].sibling;
        if (flag) _pop_sibling (from, n.base (), static_cast <uchar> (n.base () ^ e));
        _push_enode (e);
         e = static_cast <int> (from);
        from = static_cast <size_t> (_array[from].check);
      } while (! flag);
    }
    int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
      for (size_t i = 0; i < num; ++i)
        update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
      return 0;
    }
    // Writes every entry reachable from the root into `result`, in begin()/next()
    // order. Aborts through _err if `result_len` is too small to hold them all.
    template <typename T>
    void dump (T* result, const size_t result_len) {
      size_t written = 0;
      size_t from    = 0;
      size_t depth   = 0;
      union { int i; value_type x; } cur;
      cur.i = begin (from, depth);
      while (cur.i != CEDAR_NO_PATH) {
        if (written < result_len) {
          _set_result (&result[written], cur.x, depth, from);
          ++written;
        } else {
          _err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
        }
        cur.i = next (from, depth);
      }
    }
    // Writes the node array to file `fn` (opened with `mode`). With USE_FAST_LOAD
    // the block list heads, the ninfo table and the block table are additionally
    // written to "<fn>.sbl".
    // Returns 0 on success and -1 if a file cannot be opened or if any write or
    // close fails, so a truncated file is never reported as a successful save.
    int save (const char* fn, const char* mode = "wb") const {
      // _test ();
      FILE* fp = std::fopen (fn, mode);
      if (! fp) return -1;
      const size_t n_node = static_cast <size_t> (_size);
      bool ok = std::fwrite (_array, sizeof (node), n_node, fp) == n_node;
      if (std::fclose (fp) != 0) ok = false; // buffered data may fail on close
      if (! ok) return -1;
 #ifdef USE_FAST_LOAD
      const char* const info
        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
      fp = std::fopen (info, mode);
      delete [] info; // resolve memory leak
      if (! fp) return -1;
      const size_t n_block = static_cast <size_t> (_size >> 8);
      ok = std::fwrite (&_bheadF, sizeof (int), 1, fp) == 1
        && std::fwrite (&_bheadC, sizeof (int), 1, fp) == 1
        && std::fwrite (&_bheadO, sizeof (int), 1, fp) == 1
        && std::fwrite (_ninfo, sizeof (ninfo), n_node, fp) == n_node
        && std::fwrite (_block, sizeof (block), n_block, fp) == n_block;
      if (std::fclose (fp) != 0) ok = false;
      if (! ok) return -1;
 #endif
      return 0;
    }
    // Loads a node array previously written by save ().
    //   fn     - file to read
    //   mode   - fopen mode
    //   offset - byte offset of the array inside the file
    //   size_  - byte position where the array ends; 0 means "end of file"
    // Returns 0 on success, -1 on any I/O failure. The file handle is closed on
    // every path. The previous contents are discarded (clear (false)) once the
    // size has been validated, so after a later failure the trie is left empty.
    // Allocation failure is fatal (_err). With USE_FAST_LOAD the auxiliary tables
    // are read from "<fn>.sbl"; otherwise they stay null until restore () runs.
    int open (const char* fn, const char* mode = "rb",
              const size_t offset = 0, size_t size_ = 0) {
      FILE* fp = std::fopen (fn, mode);
      if (! fp) return -1;
      // get size
      if (! size_) {
        if (std::fseek (fp, 0, SEEK_END) != 0) { std::fclose (fp); return -1; }
        const long end = std::ftell (fp);
        // ftell reports failure as -1, which must not be turned into a huge size.
        if (end < 0) { std::fclose (fp); return -1; }
        size_ = static_cast <size_t> (end);
        if (std::fseek (fp, 0, SEEK_SET) != 0) { std::fclose (fp); return -1; }
      }
      if (size_ <= offset) { std::fclose (fp); return -1; }
      // set array
      clear (false);
      size_ = (size_ - offset) / sizeof (node);
      if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0)
        { std::fclose (fp); return -1; }
      _array = static_cast <node*>  (std::malloc (sizeof (node)  * size_));
 #ifdef USE_FAST_LOAD
      _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
      _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
      if (! _array || ! _ninfo || ! _block)
 #else
        if (! _array)
 #endif
          _err (__FILE__, __LINE__, "memory allocation failed\n");
      if (size_ != std::fread (_array, sizeof (node), size_, fp))
        { std::fclose (fp); return -1; }
      std::fclose (fp);
      _size = static_cast <int> (size_);
 #ifdef USE_FAST_LOAD
      const char* const info
        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
      fp = std::fopen (info, mode);
      delete [] info; // resolve memory leak
      if (! fp) return -1;
      if (1 != std::fread (&_bheadF, sizeof (int), 1, fp) ||
          1 != std::fread (&_bheadC, sizeof (int), 1, fp) ||
          1 != std::fread (&_bheadO, sizeof (int), 1, fp) ||
          size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
          size_ != std::fread (_block, sizeof (block), size_ >> 8, fp) << 8)
        { std::fclose (fp); return -1; }
      std::fclose (fp);
      _capacity = _size;
 #endif
      return 0;
    }
 #ifndef USE_FAST_LOAD
    // Rebuilds whichever auxiliary table (block list, then node links) is missing
    // so the trie can be updated again, and marks the array as exactly full.
    void restore () {
      if (_block == 0) _restore_block ();
      if (_ninfo == 0) _restore_ninfo ();
      _capacity = _size;
    }
 #endif
    // Adopts an externally owned node array of `size_` nodes. Existing buffers are
    // released first; the adopted array is flagged so clear () will not free it.
    void set_array (void* p, size_t size_ = 0) {
      clear (false);
      _no_delete = true;
      _size  = static_cast <int> (size_);
      _array = static_cast <node*> (p);
    }
    // Read-only access to the raw node array.
    const void* array () const {
      return _array;
    }
    // Releases all buffers and resets the bookkeeping. The node array is only
    // freed when it is owned (not adopted through set_array). With reuse = true
    // the trie is re-initialised to an empty, usable state.
    void clear (const bool reuse = true) {
      if (! _no_delete) std::free (_array); // free (0) is a no-op
      std::free (_ninfo);
      std::free (_block);
      _array = 0;
      _ninfo = 0;
      _block = 0;
      _bheadF = 0;
      _bheadC = 0;
      _bheadO = 0;
      _capacity = 0;
      _size = 0;
      if (reuse) _initialize ();
      _no_delete = false;
    }
    // return the first child for a tree rooted by a given node
    // Descends from `from` along first-child links until a node without a
    // non-zero child label is reached, adding the number of steps to `len` and
    // leaving `from` on that node. Returns the value found there, or
    // CEDAR_NO_PATH when called on the root of a trie that has no entry.
    int begin (size_t& from, size_t& len) {
      // The child/sibling table may be absent after open (); rebuild it lazily.
 #ifndef USE_FAST_LOAD
      if (! _ninfo) _restore_ninfo ();
 #endif
      int   base = _array[from].base ();
      uchar c    = _ninfo[from].child;
      if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
        return CEDAR_NO_PATH; // no entry
      for (; c; ++len) {
        from = static_cast <size_t> (_array[from].base ()) ^ c;
        c    = _ninfo[from].child;
      }
 #ifdef USE_REDUCED_TRIE
      if (_array[from].value >= 0) return _array[from].value;
 #endif
      // c is 0 here, so this reads the child reached by label 0.
      return _array[_array[from].base () ^ c].base_;
    }
    // return the next child if any
    // Advances an enumeration started with begin (): looks for a right sibling,
    // climbing towards `root` (and decrementing `len`) while none exists, then
    // descends into that sibling's subtree via begin (). Returns the next value,
    // or CEDAR_NO_PATH once `root` is reached without finding a sibling.
    int next (size_t& from, size_t& len, const size_t root = 0) {
      uchar c = 0;
 #ifdef USE_REDUCED_TRIE
      if (_array[from].value < 0)
 #endif
        c = _ninfo[_array[from].base () ^ 0].sibling;
      for (; ! c && from != root; --len) {
        c = _ninfo[from].sibling;
        from = static_cast <size_t> (_array[from].check);
      }
      return c ?
        begin (from = static_cast <size_t> (_array[from].base ()) ^ c, ++len) :
        CEDAR_NO_PATH;
    }
    // test the validity of double array for debug
    // Debug helper: recursively walks the children of `from` and asserts that each
    // child's check field points back to `from`. Compiled-out asserts (NDEBUG)
    // turn this into a plain traversal.
    void test (const size_t from = 0) const {
      const int base = _array[from].base ();
      uchar c = _ninfo[from].child;
      do {
        if (from) assert (_array[base ^ c].check == static_cast <int> (from));
        if (c  && _array[base ^ c].value < 0) // correct this
          test (static_cast <size_t> (base ^ c));
      } while ((c = _ninfo[base ^ c].sibling));
    }
    size_t tracking_node[NUM_TRACKING_NODES + 1];
  private:
    // currently disabled; implement these if you need
    da (const da&);
    da& operator= (const da&);
    node*   _array;   // node slots; not freed by clear () while _no_delete is set
    ninfo*  _ninfo;   // child/sibling labels, one entry per node slot
    block*  _block;   // bookkeeping, one entry per 256 node slots
    int     _bheadF;  // first block of Full;   0
    int     _bheadC;  // first block of Closed; 0 if no Closed
    int     _bheadO;  // first block of Open;   0 if no Open
    int     _capacity; // number of node slots allocated
    int     _size;     // number of node slots in use (grows by 256 per block)
    int     _no_delete; // non-zero when _array was adopted through set_array ()
    short   _reject[257]; // per empty-slot-count reject threshold; see _find_place
    //
    // Fatal error: prints "cedar: <file> [<line>]: <msg>" to stderr and terminates
    // the process with exit status 1.
    static void _err (const char* fn, const int ln, const char* msg) {
      std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
      std::exit (1);
    }
    // Resizes `p` to hold size_n elements and default-initialises the elements in
    // [size_p, size_n). On allocation failure the old buffer is freed and the
    // process is terminated through _err.
    template <typename T>
    static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
      const size_t bytes = sizeof (T) * static_cast <size_t> (size_n);
      void* grown = std::realloc (p, bytes);
      if (! grown) {
        std::free (p);
        _err (__FILE__, __LINE__, "memory reallocation failed\n");
      }
      p = static_cast <T*> (grown);
      static const T T0 = T ();
      T* const stop = p + size_n;
      T* cur = p + size_p;
      while (cur != stop) {
        *cur = T0;
        ++cur;
      }
    }
    // Allocates the first 256-slot block and resets the bookkeeping: slot 0 becomes
    // the root, slots 1..255 are chained into one ring of empty slots, and the
    // tracking nodes and reject table receive their initial values.
    void _initialize () { // initialize the first special block
      // size_p == size_n for _array: its slots are written explicitly below.
      _realloc_array (_array, 256, 256);
      _realloc_array (_ninfo, 256);
      _realloc_array (_block, 1);
 #ifdef USE_REDUCED_TRIE
      _array[0] = node (-1, -1);
 #else
      _array[0] = node (0, -1);
 #endif
      // Ring of empty slots: slot 1's prev is 255 and slot 255's next is 1.
      for (int i = 1; i < 256; ++i)
        _array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
      _block[0].ehead = 1; // bug fix for erase
      _capacity = _size = 256;
      for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
      for (short  i = 0; i <= 256; ++i) _reject[i] = i + 1;
    }
    // follow/create edge
    // Returns the node reached from `from` by `label`, creating the edge if it does
    // not exist. When the target slot is free (or `from` has no base yet) a slot is
    // taken with _pop_enode and linked as a child; when the slot belongs to another
    // parent the conflict is settled by _resolve, which may relocate nodes and
    // update `from`.
    template <typename T>
    int _follow (size_t& from, const uchar& label, T& cf) {
      int to = 0;
      const int base = _array[from].base ();
      if (base < 0 || _array[to = base ^ label].check < 0) {
        to = _pop_enode (base, label, static_cast <int> (from));
        _push_sibling (from, to ^ label, label, base >= 0);
      } else if (_array[to].check != static_cast <int> (from))
        to = _resolve (from, base, label, cf);
      return to;
    }
    // find key from double array
    // Follows key[pos .. len) from node `from`, advancing `pos` and `from` for every
    // byte matched. Returns CEDAR_NO_PATH when an edge is missing (pos/from then
    // describe how far the walk got), CEDAR_NO_VALUE when the whole range matched
    // but no value is stored at that node, and the stored value otherwise.
    int _find (const char* key, size_t& from, size_t& pos, const size_t len) const {
      for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
           pos < len; ) { // follow link
 #ifdef USE_REDUCED_TRIE
        if (_array[from].value >= 0) return CEDAR_NO_PATH;
 #endif
        size_t to = static_cast <size_t> (_array[from].base ()); to ^= key_[pos];
        // An edge exists only if the target slot names `from` as its parent.
        if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
        ++pos;
        from = to;
      }
 #ifdef USE_REDUCED_TRIE
      if (_array[from].value >= 0) // get value from leaf; only allow integer key
        return _array[from].value;
 #endif
      // The value lives in the child reached by label 0.
      const node n = _array[_array[from].base () ^ 0];
      if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
      return n.base_;
    }
 #ifndef USE_FAST_LOAD
    // Rebuilds the child/sibling table from the node array alone: every used slot
    // with a non-zero label is re-registered as a child of its parent.
    void _restore_ninfo () {
      _realloc_array (_ninfo, _size);
      for (int to = 0; to < _size; ++to) {
        const int from = _array[to].check;
        if (from < 0) continue; // skip empty node
        const int base = _array[from].base ();
        // label 0 (terminal child) yields a false condition and is not pushed here
        if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
          _push_sibling (static_cast <size_t> (from), base, label,
                         ! from || _ninfo[from].child || _array[base ^ 0].check == from);
      }
    }
    // Rebuilds the block table from the node array alone: counts the empty slots of
    // every 256-slot block, remembers the first one as ehead, and files the block
    // under Full (0 empty), Closed (1 empty) or Open (more than 1 empty).
    void _restore_block () {
      _realloc_array (_block, _size >> 8);
      _bheadF = _bheadC = _bheadO = 0;
      for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
        block& b = _block[bi];
        b.num = 0;
        for (; e < (bi << 8) + 256; ++e)
          if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
        int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
        _push_block (bi, head_out, ! head_out && b.num);
      }
    }
 #endif
    // Stores a search hit into the caller's result record; the overload chosen by
    // the record type decides which of value / length / node id are kept.
    void _set_result (result_type* x, value_type r, size_t = 0, size_t = 0) const {
      *x = r;
    }
    void _set_result (result_pair_type* x, value_type r, size_t l, size_t = 0) const {
      x->value  = r;
      x->length = l;
    }
    void _set_result (result_triple_type* x, value_type r, size_t l, size_t from) const {
      x->value  = r;
      x->length = l;
      x->id     = from;
    }
    // Unlinks block `bi` from the circular list headed by `head_in`. When `last` is
    // set the block was the only member and the list simply becomes empty.
    void _pop_block (const int bi, int& head_in, const bool last) {
      if (last) { // last one popped; Closed or Open
        head_in = 0;
        return;
      }
      const int before = _block[bi].prev;
      const int after  = _block[bi].next;
      _block[before].next = after;
      _block[after].prev  = before;
      if (bi == head_in) head_in = after;
    }
    // Links block `bi` into the circular list headed by `head_out` and makes it the
    // new head. `empty` tells that the list currently has no member.
    void _push_block (const int bi, int& head_out, const bool empty) {
      block& b = _block[bi];
      if (empty) { // the destination is empty
        head_out = b.prev = b.next = bi;
      } else { // use most recently pushed
        // tail_out aliases the current head's prev link, i.e. the list tail.
        int& tail_out = _block[head_out].prev;
        b.prev = tail_out;
        b.next = head_out;
        // Order matters: the old tail's next, the alias and the head all become bi.
        head_out = tail_out = _block[tail_out].next = bi;
      }
    }
    // Appends a fresh block of 256 empty slots at the end of the array, growing the
    // buffers first when they are full, and files it under the Open list.
    // Returns the index of the new block.
    int _add_block () {
      if (_size == _capacity) { // allocate memory if needed
 #ifdef USE_EXACT_FIT
        _capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
 #else
        _capacity += _capacity;
 #endif
        // _array's new slots are written explicitly below, hence size_p == size_n.
        _realloc_array (_array, _capacity, _capacity);
        _realloc_array (_ninfo, _capacity, _size);
        _realloc_array (_block, _capacity >> 8, _size >> 8);
      }
      _block[_size >> 8].ehead = _size;
      // Chain the 256 new slots into one ring of empty slots.
      _array[_size] = node (- (_size + 255),  - (_size + 1));
      for (int i = _size + 1; i < _size + 255; ++i)
        _array[i] = node (-(i - 1), -(i + 1));
      _array[_size + 255] = node (- (_size + 254),  -_size);
      _push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
      _size += 256;
      return (_size >> 8) - 1;
    }
    // transfer block from one start w/ head_in to one start w/ head_out
    // Moves block `bi` out of the list headed by `head_in` and into the list headed
    // by `head_out`.
    void _transfer_block (const int bi, int& head_in, int& head_out) {
      const bool sole_member = (bi == _block[bi].next);
      _pop_block (bi, head_in, sole_member);
      // Evaluated after the pop, exactly as in a direct argument expression.
      const bool dest_empty = ! head_out && _block[bi].num;
      _push_block (bi, head_out, dest_empty);
    }
    // pop empty node from block; never transfer the special block (bi = 0)
    // Takes an empty slot into use as the child of `from` for `label`. With a
    // negative `base` the parent has no base yet: a slot is chosen by
    // _find_place () and the parent's base is derived from it. The slot is removed
    // from its block's ring of empty slots and the block is moved between the
    // Open / Closed / Full lists as its empty count changes. Returns the slot index.
    int _pop_enode (const int base, const uchar label, const int from) {
      const int e  = base < 0 ? _find_place () : base ^ label;
      const int bi = e >> 8;
      node&  n = _array[e];
      block& b = _block[bi];
      if (--b.num == 0) {
        if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
      } else { // release empty node from empty ring
        _array[-n.base_].check = n.check;
        _array[-n.check].base_ = n.base_;
        if (e == b.ehead) b.ehead = -n.check; // set ehead
        if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
          _transfer_block (bi, _bheadO, _bheadC);
      }
      // initialize the released node
 #ifdef USE_REDUCED_TRIE
      n.value = CEDAR_VALUE_LIMIT; n.check = from;
      if (base < 0) _array[from].base_ = - (e ^ label) - 1;
 #else
      // Note: `n.check = from` below is unconditional; only the first two
      // statements belong to the if/else.
      if (label) n.base_ = -1; else n.value = value_type (0); n.check = from;
      if (base < 0) _array[from].base_ = e ^ label;
 #endif
      return e;
    }
    // push empty node into empty ring
    // Returns slot `e` to its block's ring of empty slots, moves the block between
    // the Full / Closed / Open lists as its empty count changes, and clears the
    // slot's child/sibling links.
    void _push_enode (const int e) {
      const int bi = e >> 8;
      block& b = _block[bi];
      if (++b.num == 1) { // Full to Closed
        b.ehead = e;
        _array[e] = node (-e, -e); // ring of one: the slot links to itself
        if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
      } else {
        // Insert e into the ring right after the current head.
        const int prev = b.ehead;
        const int next = -_array[prev].check;
        _array[e] = node (-prev, -next);
        _array[prev].check = _array[next].base_ = -e;
        if (b.num == 2 || b.trial == MAX_TRIAL) // Closed to Open
          if (bi) _transfer_block (bi, _bheadC, _bheadO);
        b.trial = 0;
      }
      if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
      _ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
    }
    // push label to from's child
    // Registers `label` as a child of `from` in the child/sibling chain. With
    // ORDERED the chain is kept sorted by label; otherwise the label is inserted
    // near the front. When `flag` is false the chain is not walked and the label
    // simply becomes the first child.
    void _push_sibling (const size_t from, const int base, const uchar label, const bool flag = true) {
      uchar* c = &_ninfo[from].child;
      if (flag && (ORDERED ? label > *c : ! *c))
        do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
      // Splice: the new node inherits the link found at *c, then *c points to it.
      _ninfo[base ^ label].sibling = *c, *c = label;
    }
    // pop label from from's child
    void _pop_sibling (const size_t from, const int base, const uchar label) {
      uchar* c = &_ninfo[from].child;
      while (*c != label) c = &_ninfo[base ^ *c].sibling;
      *c = _ninfo[base ^ label].sibling;
    }
    // check whether to replace branching w/ the newly added node
    // Walks the sibling chains of the two conflicting parents in lock step.
    // Returns false as soon as the `p` chain ends first, and true when the `n`
    // chain ends first; _resolve uses the result to decide which family to move.
    bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
      do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
      while ((c_n = _ninfo[base_n ^ c_n].sibling));
      return true;
    }
    // enumerate (equal to or more than one) child nodes
    // Writes the labels of a sibling chain, starting at child label `c`, into the
    // buffer at `p`. When `label` is not -1 it is added to the list as well (at its
    // sorted position when ORDERED). Returns a pointer to the LAST label written,
    // not one past it.
    uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
      --p; // pre-decrement so that every store below can use *++p
      if (! c)  { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
      if (ORDERED)
        while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
      if (label != -1) *++p = static_cast <uchar> (label);
      while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
      return p;
    }
    // explore new block to settle down
    // Picks an empty slot for a single node: the first empty slot of the first
    // Closed block if there is one, else of the first Open block, else the first
    // slot of a newly added block.
    int _find_place () {
      const int bi = _bheadC ? _bheadC : _bheadO;
      if (bi) return _block[bi].ehead;
      return _add_block () << 8;
    }
    // Picks an empty slot `e` such that every label in [first, last] maps to an
    // empty slot when based at e ^ *first. Open blocks are scanned in list order;
    // a block that fails gets its reject threshold lowered and, after MAX_TRIAL
    // failures, is moved to the Closed list. Falls back to a newly added block.
    int _find_place (const uchar* const first, const uchar* const last) {
      if (int bi = _bheadO) {
        const int   bz = _block[_bheadO].prev; // last block of the Open list
        const short nc = static_cast <short> (last - first + 1); // # labels to place
        while (1) { // set candidate block
          block& b = _block[bi];
          if (b.num >= nc && nc < b.reject) // explore configuration
            for (int e = b.ehead;;) {
              const int base = e ^ *first;
              for (const uchar* p = first; _array[base ^ *++p].check < 0; )
                if (p == last) return b.ehead = e; // no conflict
              if ((e = -_array[e].check) == b.ehead) break;
            }
          b.reject = nc;
          if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
          // Remember the successor first: the transfer below relinks the block.
          const int bi_ = b.next;
          if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
          if (bi == bz) break;
          bi = bi_;
        };
      }
      return _add_block () << 8;
    }
    // resolve conflict on base_n ^ label_n = base_p ^ label_p
    // Settles a collision: the slot base_n ^ label_n wanted by parent `from_n` is
    // already owned by another parent `from_p`. One of the two families of children
    // (chosen via _consult) is moved to a free region found by _find_place, the
    // moved parent's base is rewritten, grandchildren are re-pointed, and the old
    // slots are released. `cf (old, new)` is called for every relocated node.
    // `from_n` is updated if the inserting parent itself gets moved.
    // Returns the slot now serving as the child of `from_n` for `label_n`.
    template <typename T>
    int _resolve (size_t& from_n, const int base_n, const uchar label_n, T& cf) {
      // examine siblings of conflicted nodes
      const int to_pn  = base_n ^ label_n;
      const int from_p = _array[to_pn].check;
      const int base_p = _array[from_p].base ();
      const bool flag // whether to replace siblings of newly added
        = _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
      uchar child[256];
      uchar* const first = &child[0];
      // _set_child returns a pointer to the last label, so [first, last] is inclusive.
      uchar* const last  =
        flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
        : _set_child (first, base_p, _ninfo[from_p].child);
      const int base =
        (first == last ? _find_place () : _find_place (first, last)) ^ *first;
      // replace & modify empty list
      const int from  = flag ? static_cast <int> (from_n) : from_p;
      const int base_ = flag ? base_n : base_p;
      if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
 #ifdef USE_REDUCED_TRIE
      _array[from].base_ = -base - 1; // new base
 #else
      _array[from].base_ = base; // new base
 #endif
      for (const uchar* p = first; p <= last; ++p) { // to_ => to
        const int to  = _pop_enode (base, *p, from);
        const int to_ = base_ ^ *p;
        _ninfo[to].sibling = (p == last ? 0 : *(p + 1));
        if (flag && to_ == to_pn) continue; // skip newcomer (no child)
        cf (to_, to); // user-defined callback function to handle moved nodes
        node& n  = _array[to];
        node& n_ = _array[to_];
 #ifdef USE_REDUCED_TRIE
        if ((n.base_ = n_.base_) < 0 && *p) // copy base; bug fix
 #else
        if ((n.base_ = n_.base_) > 0 && *p) // copy base; bug fix
 #endif
          {
            uchar c = _ninfo[to].child = _ninfo[to_].child;
            do _array[n.base () ^ c].check = to; // adjust grand son's check
            while ((c = _ninfo[n.base () ^ c].sibling));
          }
        if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
          from_n = static_cast <size_t> (to); // bug fix
        if (! flag && to_ == to_pn) { // the address is immediately used
          _push_sibling (from_n, to_pn ^ label_n, label_n);
          _ninfo[to_].child = 0; // remember to reset child
 #ifdef USE_REDUCED_TRIE
          n_.value = CEDAR_VALUE_LIMIT;
 #else
          if (label_n) n_.base_ = -1; else n_.value = value_type (0);
 #endif
          n_.check = static_cast <int> (from_n);
        } else
          _push_enode (to_);
        if (NUM_TRACKING_NODES) // keep the traversed node updated
          for (size_t j = 0; tracking_node[j] != 0; ++j)
            if (tracking_node[j] == static_cast <size_t> (to_))
              { tracking_node[j] = static_cast <size_t> (to); break; }
      }
      return flag ? base ^ label_n : to_pn;
    }
  };
 }
 #endif
--- a/libchinese-segmentation/storage-base/cedar/cedarpp.h
+++ b/libchinese-segmentation/storage-base/cedar/cedarpp.h
@ -0,0 +1,834 @@
 // cedar -- C++ implementation of Efficiently-updatable Double ARray trie
 //  $Id: cedarpp.h 1916 2017-07-12 07:30:56Z ynaga $
 // Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
 #ifndef CEDAR_H
 #define CEDAR_H
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <climits>
 #include <cassert>
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
 namespace cedar {
  // typedefs
  // npos_t is an unsigned integer of at least 64 bits. Code in this file keeps
  // a node index in its low 32 bits and a tail offset in its high 32 bits
  // (see the `>> 32` / `<< 32` uses in update (), _find () and begin ()).
 #if LONG_BIT == 64
  typedef unsigned long       npos_t; // possibly compatible with size_t
 #else
  typedef unsigned long long  npos_t;
 #endif
  typedef unsigned char       uchar;
  // `x & TAIL_OFFSET_MASK` keeps the low 32 bits (drops the tail offset).
  static const npos_t TAIL_OFFSET_MASK = static_cast <npos_t> (0xffffffff);
  // `x & NODE_INDEX_MASK` keeps the high 32 bits (drops the node index).
  static const npos_t NODE_INDEX_MASK  = static_cast <npos_t> (0xffffffff) << 32;
  // Default sentinel values used for the NO_VALUE / NO_PATH template arguments
  // of da: -1 / -2 in general, and for float two integer constants whose bit
  // patterns are IEEE-754 single-precision NaNs.
  template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
  template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
  // Upper bound on a single growth step when USE_EXACT_FIT is defined.
  static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
  // dynamic double array
  template <typename value_type,
            const int     NO_VALUE  = NaN <value_type>::N1,
            const int     NO_PATH   = NaN <value_type>::N2,
            const bool    ORDERED   = true,
            const int     MAX_TRIAL = 1,
            const size_t  NUM_TRACKING_NODES = 0>
  class da {
  public:
    enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH };
    typedef value_type result_type;
    // Result record carrying the stored value and the matched key length;
    // filled by the matching _set_result () overload.
    struct result_pair_type {
      value_type  value;
      size_t      length;  // prefix length
    };
    // Result record that additionally carries the position of the match so a
    // caller can later recover the key with suffix ().
    struct result_triple_type { // for predict ()
      value_type  value;
      size_t      length;  // suffix length
      npos_t      id;      // node id of value
    };
    // One element of the double array. `base` and `value` share storage:
    // a slot reached through label 0 holds a value, other used slots hold
    // either a non-negative base or a negated offset into the tail buffer.
    // For an unused slot both fields are negated links of the empty ring.
    struct node {
      union { int base; value_type value; }; // negative means prev empty index
      int  check;                            // negative means next empty index
      node (const int base_ = 0, const int check_ = 0)
        : base (base_), check (check_) {}
    };
    // Per-node child/sibling labels; lets the code enumerate the children of
    // a node without scanning all 256 possible labels.
    struct ninfo {  // x1.5 update speed; +.25 % memory (8n -> 10n)
      uchar  sibling;   // right sibling (= 0 if not exist)
      uchar  child;     // first child
      ninfo () : sibling (0), child (0) {}
    };
    // Bookkeeping for one group of 256 consecutive array slots. Blocks are
    // linked (prev/next) into one of three circular lists whose heads are
    // _bheadF (Full), _bheadC (Closed) and _bheadO (Open).
    struct block { // a block w/ 256 elements
      int   prev;   // prev block; 3 bytes
      int   next;   // next block; 3 bytes
      short num;    // # empty elements; 0 - 256
      short reject; // minimum # branching failed to locate; soft limit
      int   trial;  // # trial
      int   ehead;  // first empty item
      block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
    };
    // Construct an empty trie: zero every member, check at compile time that
    // value_type fits into an int (values share storage with `base`), then
    // allocate the first 256-slot block via _initialize ().
    da () : tracking_node (), _array (0), _tail (0), _tail0 (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _quota (0), _quota0 (0), _no_delete (false), _reject () {
      // STATIC_ASSERT expands to a local typedef that is never used. Silence
      // the warning only around it; push/pop restores whatever diagnostic
      // state the including translation unit had, instead of forcing the
      // warning back on for all code that follows this header.
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-local-typedefs"
      STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
                    value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index_to_trie
                    );
 #pragma GCC diagnostic pop
      _initialize ();
    }
    // release all buffers without re-creating the initial block
    ~da () { clear (false); }
    // number of node slots allocated (_capacity)
    size_t capacity   () const { return static_cast <size_t> (_capacity); }
    // number of node slots in use by blocks (_size)
    size_t size       () const { return static_cast <size_t> (_size); }
    // byte length of the tail buffer; read from the int stored at its start
    size_t length     () const { return static_cast <size_t> (*_length); }
    // bytes occupied by the node array
    size_t total_size () const { return sizeof (node) * _size; }
    // bytes occupied by a single node
    size_t unit_size  () const { return sizeof (node); }
    // Count the array slots that are in use, i.e. whose check is not negative.
    size_t nonzero_size () const {
      size_t used = 0;
      int idx = 0;
      while (idx < _size) {
        if (! (_array[idx].check < 0)) ++used;
        ++idx;
      }
      return used;
    }
    size_t nonzero_length () const {
      size_t i (0), j (0);
      for (int to = 0; to < _size; ++to) {
        const node& n = _array[to];
        if (n.check >= 0 && _array[n.check].base != to && n.base < 0)
          { ++j; for (const char* p = &_tail[-n.base]; *p; ++p) ++i; }
      }
      return i + j * (1 + sizeof (value_type));
    }
    size_t num_keys () const {
      size_t i = 0;
      for (int to = 0; to < _size; ++to) {
        const node& n = _array[to];
        if (n.check >= 0 && (_array[n.check].base == to || n.base < 0)) ++i;
      }
      return i;
    }
    // interface
    // Look up a NUL-terminated key; see the length-taking overload.
    template <typename T>
    T exactMatchSearch (const char* key) const {
      const size_t key_len = std::strlen (key);
      return exactMatchSearch <T> (key, key_len);
    }
    // Look up key[0..len) starting at position `from` and return the result
    // as T (via _set_result). A missing path is reported the same way as a
    // missing value: the returned value is CEDAR_NO_VALUE.
    template <typename T>
    T exactMatchSearch (const char* key, size_t len, npos_t from = 0) const {
      size_t consumed = 0;
      const int found = _find (key, from, consumed, len);
      union { int i; value_type x; } conv;
      if (found == CEDAR_NO_PATH)
        conv.i = CEDAR_NO_VALUE;
      else
        conv.i = found;
      T answer;
      _set_result (&answer, conv.x, len, from);
      return answer;
    }
    // Find every stored key that is a prefix of a NUL-terminated key.
    template <typename T>
    size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const {
      const size_t key_len = std::strlen (key);
      return commonPrefixSearch (key, result, result_len, key_len);
    }
    // Find every stored key that is a prefix of key[0..len), walking one byte
    // at a time from `from`. At most result_len hits are written to result;
    // the return value is the total number of hits found.
    template <typename T>
    size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) const {
      size_t hits = 0;
      size_t pos  = 0;
      while (pos < len) {
        union { int i; value_type x; } conv;
        conv.i = _find (key, from, pos, pos + 1);
        if (conv.i == CEDAR_NO_VALUE) continue; // no key ends here
        if (conv.i == CEDAR_NO_PATH)  break;    // nothing longer can match
        if (hits < result_len)
          _set_result (result + hits, conv.x, pos, from);
        ++hits;
      }
      return hits;
    }
    // predict key from double array
    // Enumerate stored keys that start with a NUL-terminated prefix.
    template <typename T>
    size_t commonPrefixPredict (const char* key, T* result, size_t result_len) {
      const size_t key_len = std::strlen (key);
      return commonPrefixPredict (key, result, result_len, key_len);
    }
    // Enumerate stored keys that start with key[0..len). Returns 0 when the
    // prefix is not a path in the trie; otherwise writes at most result_len
    // entries and returns the total number of keys found below the prefix.
    template <typename T>
    size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) {
      size_t pos = 0;
      if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
      const npos_t root = from; // subtree whose keys are enumerated
      size_t found = 0;
      size_t depth = 0;
      union { int i; value_type x; } conv;
      conv.i = begin (from, depth);
      while (conv.i != CEDAR_NO_PATH) {
        if (found < result_len)
          _set_result (&result[found], conv.x, depth, from);
        ++found;
        conv.i = next (from, depth, root);
      }
      return found;
    }
    // Write into `key` the last `len` bytes of the key that ends at position
    // `to`, followed by a terminating '\0' (so key must hold len + 1 bytes).
    void suffix (char* key, size_t len, npos_t to) const {
      key[len] = '\0';
      // Position lies in the tail: copy the bytes stored there first.
      if (const int offset = static_cast <int> (to >> 32)) {
        to &= TAIL_OFFSET_MASK;
        size_t len_tail = std::strlen (&_tail[-_array[to].base]);
        // Copy at most `len` bytes from the tail; the rest comes from the trie.
        if (len > len_tail) len -= len_tail; else len_tail = len, len = 0;
        std::memcpy (&key[len], &_tail[static_cast <size_t> (offset) - len_tail], len_tail);
      }
      // Walk parent links; each label is recovered as parent.base ^ child index.
      while (len--) {
        const int from = _array[to].check;
        key[len] = static_cast <char> (_array[from].base ^ static_cast <int> (to));
        to = static_cast <npos_t> (from);
      }
    }
    // Continue a search with a NUL-terminated key; see the overload below.
    value_type traverse (const char* key, npos_t& from, size_t& pos) const {
      const size_t key_len = std::strlen (key);
      return traverse (key, from, pos, key_len);
    }
    // Continue a search for key[pos..len) from position `from`. Both `from`
    // and `pos` are advanced by _find (); its int result is returned
    // reinterpreted as value_type (including the NO_VALUE / NO_PATH codes).
    value_type traverse (const char* key, npos_t& from, size_t& pos, size_t len) const {
      union { int i; value_type x; } conv;
      conv.i = _find (key, from, pos, len);
      return conv.x;
    }
    // Callback type used when the caller does not care about relocated nodes.
    struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
    // Insert a NUL-terminated key (or find it) and return a reference to its value.
    value_type& update (const char* key)
    { return update (key, std::strlen (key)); }
    // Insert key[0..len) starting from the root and add `val` to its value.
    value_type& update (const char* key, size_t len, value_type val = value_type (0))
    { npos_t from (0); size_t pos (0); return update (key, from, pos, len, val); }
    // Same as the callback-taking overload, with a callback that does nothing.
    value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val = value_type (0))
    { empty_callback cf; return update (key, from, pos, len, val, cf); }
    // Insert key[pos..len) below position `from` (or locate it if present),
    // add `val` to the stored value and return a reference to that value.
    // `from` and `pos` are updated as the key is consumed. `cf (old, new)` is
    // invoked by _resolve () whenever an existing node is relocated.
    // A zero-length key at the root is rejected through _err ().
    template <typename T>
    value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val, T& cf) {
      if (! len && ! from)
        _err (__FILE__, __LINE__, "failed to insert zero-length key\n");
 #ifndef USE_FAST_LOAD
      // _ninfo / _block are not loaded by open (); rebuild them before editing
      if (! _ninfo || ! _block) restore ();
 #endif
      npos_t offset = from >> 32;
      if (! offset) { // node on trie
        // follow (creating as needed) one edge per key byte while on the trie
        for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
             _array[from].base >= 0; ++pos) {
          if (pos == len) // could be reduced
            { const int to = _follow (from, 0, cf); return _array[to].value += val; }
          from = static_cast <size_t> (_follow (from, key_[pos], cf));
        }
        // negative base: the remainder of this path is stored in the tail
        offset = static_cast <npos_t> (-_array[from].base);
      }
      if (offset >= sizeof (int)) { // go to _tail
        const size_t pos_orig = pos;
        // biased pointer so that tail[pos] lines up with key[pos]
        char* const tail = &_tail[offset] - pos;
        while (pos < len && key[pos] == tail[pos]) ++pos;
        //
        if (pos == len && tail[pos] == '\0') { // found exact key
          if (const npos_t moved = pos - pos_orig) { // search end on tail
            from &= TAIL_OFFSET_MASK;
            from |= (offset + moved) << 32;
          }
          // the value is stored right after the suffix terminator
          return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
        }
        // otherwise, insert the common prefix in tail if any
        if (from >> 32) {
          from &= TAIL_OFFSET_MASK; // reset to update tail offset
          // re-create as trie nodes the tail bytes that precede `offset`
          for (npos_t offset_ = static_cast <npos_t> (-_array[from].base);
               offset_ < offset; ) {
            from = static_cast <size_t>
                   (_follow (from, static_cast <uchar> (_tail[offset_]), cf));
            ++offset_;
            // this shows intricacy in debugging updatable double array trie
            if (NUM_TRACKING_NODES) // keep the traversed node (on tail) updated
              for (size_t j = 0; tracking_node[j] != 0; ++j)
                if (tracking_node[j] >> 32 == offset_)
                  tracking_node[j] = static_cast <npos_t> (from);
          }
        }
        // move the bytes shared by the key and the old suffix onto the trie
        for (size_t pos_ = pos_orig; pos_ < pos; ++pos_)
          from = static_cast <size_t>
                 (_follow (from, static_cast <uchar> (key[pos_]), cf));
        npos_t moved = pos - pos_orig;
        if (tail[pos]) { // remember to move offset to existing tail
          const int to_ = _follow (from, static_cast <uchar> (tail[pos]), cf);
          _array[to_].base = - static_cast <int> (offset + ++moved);
          moved -= 1 + sizeof (value_type); // keep record
        }
        moved += offset;
        // record the tail offsets freed above in _tail0 for later reuse;
        // _tail0[0] (aliased as *_length0) is the number of recorded offsets
        for (npos_t i = offset; i <= moved; i += 1 + sizeof (value_type)) {
          if (_quota0 == ++*_length0) {
 #ifdef USE_EXACT_FIT
            _quota0 += *_length0 >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length0;
 #else
            _quota0 += _quota0;
 #endif
            _realloc_array (_tail0, _quota0, *_length0);
          }
          _tail0[*_length0] = static_cast <int> (i);
        }
        if (pos == len || tail[pos] == '\0') {
          const int to = _follow (from, 0, cf); // could be reduced
          if (pos == len) return _array[to].value += val; // set value on trie
          // the old key ended here: carry its value over to the trie node
          _array[to].value += *reinterpret_cast <value_type*> (&tail[pos + 1]);
        }
        from = static_cast <size_t> (_follow (from, static_cast <uchar> (key[pos]), cf));
        ++pos;
      }
      // bytes needed in the tail: remaining key, terminator and the value
      const int needed = static_cast <int> (len - pos + 1 + sizeof (value_type));
      if (pos == len && *_length0) { // reuse
        // empty suffix: take the most recently recorded free tail slot
        const int offset0 = _tail0[*_length0];
        _tail[offset0] = '\0';
        _array[from].base = -offset0;
        --*_length0;
        return *reinterpret_cast <value_type*> (&_tail[offset0 + 1]) = val;
      }
      if (_quota < *_length + needed) {
 #ifdef USE_EXACT_FIT
        _quota += needed > *_length || needed > MAX_ALLOC_SIZE ? needed :
                  (*_length >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length);
 #else
        _quota += _quota >= needed ? _quota : needed;
 #endif
        // grows the buffer; bytes from *_length onwards are zero-filled
        _realloc_array (_tail, _quota, *_length);
      }
      // append the remaining key bytes at the current end of the tail
      _array[from].base = -*_length;
      const size_t pos_orig = pos;
      char* const tail = &_tail[*_length] - pos;
      if (pos < len) {
        do tail[pos] = key[pos]; while (++pos < len);
        from |= (static_cast <npos_t> (*_length) + (len - pos_orig)) << 32;
      }
      *_length += needed;
      return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
    }
    // easy-going erase () without compression
    int erase (const char* key) { return erase (key, std::strlen (key)); }
    int erase (const char* key, size_t len, npos_t from = 0) {
      size_t pos = 0;
      const int i = _find (key, from, pos, len);
      if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
      if (from >> 32) from &= TAIL_OFFSET_MASK; // leave tail as is
      bool flag = _array[from].base < 0; // have sibling
      int e = flag ? static_cast <int> (from) : _array[from].base ^ 0;
      from  = _array[e].check;
      do {
        const node& n = _array[from];
        flag = _ninfo[n.base ^ _ninfo[from].child].sibling;
        if (flag) _pop_sibling (from, n.base, static_cast <uchar> (n.base ^ e));
        _push_enode (e);
        e = static_cast <int> (from);
        from = static_cast <size_t> (_array[from].check);
      } while (! flag);
      return 0;
    }
    int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
      for (size_t i = 0; i < num; ++i)
        update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
      return 0;
    }
    template <typename T>
    void dump (T* result, const size_t result_len) {
      union { int i; value_type x; } b;
      size_t num (0), p (0);
      npos_t from = 0;
      for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
        if (num < result_len)
          _set_result (&result[num++], b.x, p, from);
        else
          _err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
    }
    // Rebuild the tail buffer so that it contains only the suffixes still
    // referenced from the trie, rewriting each referencing node's base to the
    // suffix's new offset. The list of reusable tail slots is reset.
    void shrink_tail () {
      // same tail/length aliasing as the _tail/_length members
      union { char* tail; int* length; } t;
      // current length minus one (terminator + value) record per free slot
      const size_t length_
        = static_cast <size_t> (*_length)
        - static_cast <size_t> (*_length0) * (1 + sizeof (value_type));
      t.tail = static_cast <char*> (std::malloc (length_));
      if (! t.tail) _err (__FILE__, __LINE__, "memory allocation failed\n");
      // the first sizeof (int) bytes of the buffer hold its own length
      *t.length = static_cast <int> (sizeof (int));
      for (int to = 0; to < _size; ++to) {
        node& n = _array[to];
        // used node, not reached via label 0, pointing into the tail
        if (n.check >= 0 && _array[n.check].base != to && n.base < 0) {
          char* const tail (&t.tail[*t.length]), * const tail_ (&_tail[-n.base]);
          n.base = - *t.length;
          // copy the suffix including its terminator, then the value after it
          int i = 0; do tail[i] = tail_[i]; while (tail[i++]);
          *reinterpret_cast <value_type*> (&tail[i])
            = *reinterpret_cast <const value_type*> (&tail_[i]);
          *t.length += i + static_cast <int> (sizeof (value_type));
        }
      }
      std::free (_tail);
      _tail = t.tail;
      // *_length now reads the new length (it aliases the new buffer)
      _realloc_array (_tail,  *_length,  *_length);
      _quota  = *_length;
      _realloc_array (_tail0, 1);
      _quota0 = 1;
    }
    // Save to `fn`, first compacting the tail with shrink_tail () when
    // `shrink` is true. Returns the result of save (fn, mode).
    int save (const char* fn, const char* mode, const bool shrink) {
      if (shrink) shrink_tail ();
      return save (fn, mode);
    }
    // Write the trie to file `fn`: the tail buffer (*_length bytes, starting
    // with its own length) followed by the node array (_size nodes). With
    // USE_FAST_LOAD the block heads, _ninfo and _block are additionally
    // written to "<fn>.sbl".
    // Returns 0 on success and -1 if a file cannot be opened or if any write
    // or close fails (previously short writes were silently ignored).
    int save (const char* fn, const char* mode = "wb") const {
      // _test ();
      FILE* fp = std::fopen (fn, mode);
      if (! fp) return -1;
      const size_t tail_len = static_cast <size_t> (*_length);
      const size_t num_node = static_cast <size_t> (_size);
      bool ok = std::fwrite (_tail,  sizeof (char), tail_len, fp) == tail_len;
      ok = ok && std::fwrite (_array, sizeof (node), num_node, fp) == num_node;
      // fclose flushes buffered data, so its result matters as well
      if (std::fclose (fp) != 0) ok = false;
      if (! ok) return -1;
 #ifdef USE_FAST_LOAD
      const char* const info
        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
      fp = std::fopen (info, mode);
      delete [] info; // resolve memory leak
      if (! fp) return -1;
      const size_t num_block = num_node >> 8;
      ok =       std::fwrite (&_bheadF, sizeof (int), 1, fp) == 1;
      ok = ok && std::fwrite (&_bheadC, sizeof (int), 1, fp) == 1;
      ok = ok && std::fwrite (&_bheadO, sizeof (int), 1, fp) == 1;
      ok = ok && std::fwrite (_ninfo, sizeof (ninfo), num_node,  fp) == num_node;
      ok = ok && std::fwrite (_block, sizeof (block), num_block, fp) == num_block;
      if (std::fclose (fp) != 0) ok = false;
      if (! ok) return -1;
 #endif
      return 0;
    }
    // Load a trie from file `fn`, reading from byte `offset`. `size_` is the
    // end position of the data within the file; 0 means "use the file size".
    // The data layout is the one written by save (): the tail buffer, whose
    // first int is its byte length, followed by the node array.
    // Returns 0 on success and -1 on any I/O or format error. The file handle
    // is closed on every path. The current contents are discarded once the
    // header has been validated. Allocation failure goes through _err ().
    int open (const char* fn, const char* mode = "rb",
              const size_t offset = 0, size_t size_ = 0) {
      FILE* fp = std::fopen (fn, mode);
      if (! fp) return -1;
      // get size
      if (! size_) {
        if (std::fseek (fp, 0, SEEK_END) != 0) { std::fclose (fp); return -1; }
        const long end_ = std::ftell (fp);
        if (end_ < 0 || std::fseek (fp, 0, SEEK_SET) != 0)
          { std::fclose (fp); return -1; }
        size_ = static_cast <size_t> (end_);
      }
      if (size_ <= offset ||
          std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0)
        { std::fclose (fp); return -1; }
      // the tail length; a valid tail is never shorter than its own header
      int len = 0;
      if (std::fread (&len, sizeof (int), 1, fp) != 1 ||
          len < static_cast <int> (sizeof (int)))
        { std::fclose (fp); return -1; }
      const size_t length_ = static_cast <size_t> (len);
      if (size_ <= offset + length_) { std::fclose (fp); return -1; }
      // whatever follows the tail is the node array; reject a file that
      // holds no complete node rather than attempting a zero-byte allocation
      const size_t num_ = (size_ - offset - length_) / sizeof (node);
      if (! num_) { std::fclose (fp); return -1; }
      // set array
      clear (false);
      size_ = num_;
      _array = static_cast <node*>  (std::malloc (sizeof (node)  * size_));
      _tail  = static_cast <char*>  (std::malloc (length_));
      _tail0 = static_cast <int*>   (std::malloc (sizeof (int)));
 #ifdef USE_FAST_LOAD
      _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
      _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
      if (! _array || ! _tail || ! _tail0 || ! _ninfo || ! _block)
 #else
      if (! _array || ! _tail || ! _tail0)
 #endif
        _err (__FILE__, __LINE__, "memory allocation failed\n");
      // rewind to the start of the tail (its length header is part of it)
      if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0 ||
          length_ != std::fread (_tail,  sizeof (char), length_, fp) ||
          size_   != std::fread (_array, sizeof (node), size_,   fp))
        { std::fclose (fp); return -1; }
      std::fclose (fp);
      _size = static_cast <int> (size_);
      *_length0 = 0; // no reusable tail slots are known after loading
 #ifdef USE_FAST_LOAD
      const char* const info
        = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
      fp = std::fopen (info, mode);
      delete [] info; // resolve memory leak
      if (! fp) return -1;
      if (1          != std::fread (&_bheadF, sizeof (int), 1, fp) ||
          1          != std::fread (&_bheadC, sizeof (int), 1, fp) ||
          1          != std::fread (&_bheadO, sizeof (int), 1, fp) ||
          size_      != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
          size_ >> 8 != std::fread (_block, sizeof (block), size_ >> 8, fp))
        { std::fclose (fp); return -1; }
      std::fclose (fp);
      _capacity = _size;
      _quota  = *_length;
      _quota0 = 1;
 #endif
      return 0;
    }
 #ifndef USE_FAST_LOAD
    // Rebuild whichever of _block / _ninfo is missing from the node array and
    // set the capacity and quota fields to match the loaded sizes, so that
    // the trie can be modified after open () or set_array ().
    void restore () { // restore information to update
      if (! _block) _restore_block ();
      if (! _ninfo) _restore_ninfo ();
      _capacity = _size;
      _quota  = *_length;
      _quota0 = 1;
    }
 #endif
    // Adopt an externally owned buffer `p` laid out as written by save ():
    // the tail (starting with its int length) followed by the node array.
    // `size_` is given in units of unit_size (); the tail length read from
    // `p` is subtracted to obtain the node count. The buffer is not copied
    // and, because _no_delete is set, not freed by clear ().
    // NOTE(review): with the default size_ == 0, _size becomes 0 - confirm
    // that callers always pass the real size.
    void set_array (void* p, size_t size_ = 0) { // ad-hoc
      clear (false);
      if (size_)
        size_ = size_ * unit_size () - static_cast <size_t> (*static_cast <int*> (p));
      _tail  = static_cast <char*> (p);
      // *_length reads the int at the start of p (it aliases _tail)
      _array = reinterpret_cast <node*> (_tail + *_length);
      _size  = static_cast <int> (size_ / unit_size () + (size_ % unit_size () ? 1 : 0));
      _no_delete = true;
    }
    // raw pointer to the node array
    const void* array () const { return _array; }
    // Release all buffers and reset every size field. The node array and the
    // tail are not freed when they were supplied through set_array ().
    // With reuse == true the trie is re-initialized to the empty state.
    void clear (const bool reuse = true) {
      if (! _no_delete) { // buffers owned by this object
        std::free (_array);
        std::free (_tail);
      }
      // free (0) does nothing, so no null checks are required
      std::free (_tail0);
      std::free (_ninfo);
      std::free (_block);
      _array = 0;
      _tail  = 0;
      _tail0 = 0;
      _ninfo = 0;
      _block = 0;
      _bheadF = 0; _bheadC = 0; _bheadO = 0;
      _capacity = 0; _size = 0;
      _quota = 0; _quota0 = 0;
      if (reuse) _initialize ();
      _no_delete = false;
    }
    // return the first child for a tree rooted by a given node
    // Descends along first children from `from` until a key end is reached.
    // `from` is moved to that position and `len` is increased by the number
    // of bytes descended. Returns the key's value as an int, or
    // CEDAR_NO_PATH when called on the root of a trie with no entry.
    int begin (npos_t& from, size_t& len) {
 #ifndef USE_FAST_LOAD
      if (! _ninfo) _restore_ninfo ();
 #endif
      // a position inside the tail is treated like a negative base
      int base = from >> 32 ? - static_cast <int> (from >> 32) : _array[from].base;
      if (base >= 0) { // on trie
        uchar c = _ninfo[from].child;
        if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
          return CEDAR_NO_PATH; // no entry
        // follow first children while they carry a label and stay on the trie
        for (; c && base >= 0; ++len) {
          from = static_cast <size_t> (base) ^ c;
          base = _array[from].base;
          c    = _ninfo[from].child;
        }
        // stopped at a label-0 child: its node holds the value
        if (base >= 0) return _array[base ^ c].base;
      }
      // the rest of the key is in the tail; the value follows its terminator
      const size_t len_ = std::strlen (&_tail[-base]);
      from &= TAIL_OFFSET_MASK;
      from |= static_cast <npos_t> (static_cast <size_t> (-base) + len_) << 32;
      len += len_;
      return *reinterpret_cast <int*> (&_tail[-base] + len_ + 1);
    }
    // return the next child if any
    // Advances from the key position produced by begin () / next () to the
    // following key below `root`, updating `from` and `len`. Returns that
    // key's value as an int, or CEDAR_NO_PATH when there is no further key.
    int next (npos_t& from, size_t& len, const npos_t root = 0) {
      uchar c = 0;
      if (const int offset = static_cast <int> (from >> 32)) { // on tail
        // a root inside the tail has exactly one key below it
        if (root >> 32) return CEDAR_NO_PATH;
        from &= TAIL_OFFSET_MASK;
        // drop the bytes that were consumed from the tail
        len -= static_cast <size_t> (offset - (-_array[from].base));
      } else
        c    = _ninfo[_array[from].base ^ 0].sibling;
      // climb towards root until a node with a right sibling is found
      for (; ! c && from != root; --len) {
        c    = _ninfo[from].sibling;
        from = static_cast <size_t> (_array[from].check);
      }
      if (! c) return CEDAR_NO_PATH;
      // step into the sibling and descend to its first key
      return begin (from = static_cast <size_t> (_array[from].base) ^ c, ++len);
    }
    // Positions the caller wants kept valid across node relocation; the list
    // is scanned until the first 0 entry (see update () and _resolve ()).
    npos_t tracking_node[NUM_TRACKING_NODES + 1];
  private:
    // currently disabled; implement these if you need
    da (const da&);
    da& operator= (const da&);
    node*   _array;   // the double array
    // tail buffer; its first int is the buffer's used length in bytes
    union { char* _tail;  int* _length;  };
    // reusable tail offsets; element 0 is the number of recorded offsets
    union { int*  _tail0; int* _length0; };
    ninfo*  _ninfo;   // child/sibling labels, one entry per node
    block*  _block;   // one entry per 256 nodes
    int     _bheadF;  // first block of Full;   0
    int     _bheadC;  // first block of Closed; 0 if no Closed
    int     _bheadO;  // first block of Open;   0 if no Open
    int     _capacity; // allocated number of nodes
    int     _size;     // number of nodes covered by blocks
    int     _quota;    // allocated bytes of _tail
    int     _quota0;   // allocated ints of _tail0
    int     _no_delete; // set by set_array (): _array / _tail are not owned
    short   _reject[257]; // indexed by a block's `num`; see _find_place ()
    // Report a fatal error on stderr (source file, line and message) and
    // terminate the process with exit status 1. Does not return.
    static void _err (const char* fn, const int ln, const char* msg)
    { std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg); std::exit (1); }
    // Resize `p` to hold size_n elements of T and fill the elements from
    // index size_p up to size_n with a default-constructed T. On allocation
    // failure the old buffer is freed and _err () terminates the process.
    template <typename T>
    static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
      const size_t bytes = sizeof (T) * static_cast <size_t> (size_n);
      void* const grown = std::realloc (p, bytes);
      if (! grown) {
        std::free (p);
        _err (__FILE__, __LINE__, "memory reallocation failed\n");
      }
      p = static_cast <T*> (grown);
      static const T T0 = T ();
      T* const stop = p + size_n;
      T* cur = p + size_p;
      while (cur != stop) {
        *cur = T0;
        ++cur;
      }
    }
    // Allocate the minimal buffers and set up block 0: slot 0 is the root
    // and slots 1..255 are linked into a circular ring of empty nodes.
    void _initialize () { // initialize the first special block
      _realloc_array (_array, 256, 256);
      _realloc_array (_tail,  sizeof (int));
      _realloc_array (_tail0, 1);
      _realloc_array (_ninfo, 256);
      _realloc_array (_block, 1);
      _array[0] = node (0, -1);
      // empty ring: base = -(previous empty slot), check = -(next empty slot)
      for (int i = 1; i < 256; ++i)
        _array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
      _capacity = _size = 256;
      _block[0].ehead = 1; // bug fix for erase
      // the tail initially holds only its own length header
      _quota  = *_length  = static_cast <int> (sizeof (int));
      _quota0 = 1;
      for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
      for (short  i = 0; i <= 256; ++i) _reject[i] = i + 1;
    }
    // follow/create edge
    // Return the child of `from` for `label`, creating it when it does not
    // exist. If the target slot is occupied by a node of another parent the
    // conflict is handed to _resolve (), which may relocate nodes (and may
    // change `from`).
    template <typename T>
    int _follow (npos_t& from, const uchar& label, T& cf) {
      int to = 0;
      const int base = _array[from].base;
      // no base yet, or the target slot is empty: take an empty node
      if (base < 0 || _array[to = base ^ label].check < 0) {
        to = _pop_enode (base, label, static_cast <int> (from));
        _push_sibling (from, to ^ label, label, base >= 0);
      } else if (_array[to].check != static_cast <int> (from))
        to = _resolve (from, base, label, cf);
      return to;
    }
    // find key from double array
    // Match key[pos..len) starting at position `from`, advancing `from` and
    // `pos` as bytes are consumed. Returns the stored value (as int) when a
    // key ends exactly at `len`, CEDAR_NO_VALUE when the path exists but no
    // key ends there, and CEDAR_NO_PATH when the input leaves the trie.
    int _find (const char* key, npos_t& from, size_t& pos, const size_t len) const {
      npos_t offset = from >> 32;
      if (! offset) { // node on trie
        for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
             _array[from].base >= 0; ) {
          if (pos == len) {
            // input exhausted: a key ends here iff the label-0 child exists
            const node& n = _array[_array[from].base ^ 0];
            if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
            return n.base;
          }
          size_t to = static_cast <size_t> (_array[from].base); to ^= key_[pos];
          if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
          ++pos;
          from = to;
        }
        // negative base: the remaining suffix is stored in the tail
        offset = static_cast <npos_t> (-_array[from].base);
      }
      // switch to _tail to match suffix
      const size_t pos_orig = pos; // start position in reading _tail
      // biased pointer so that tail[pos] lines up with key[pos]
      const char* const tail = &_tail[offset] - pos;
      if (pos < len) {
        do if (key[pos] != tail[pos]) break; while (++pos < len);
        if (const npos_t moved = pos - pos_orig) {
          // remember how far into the tail the match went
          from &= TAIL_OFFSET_MASK;
          from |= (offset + moved) << 32;
        }
        if (pos < len) return CEDAR_NO_PATH; // input > tail, input != tail
      }
      if (tail[pos]) return CEDAR_NO_VALUE;  // input < tail
      // the value is stored right after the suffix terminator
      return *reinterpret_cast <const int*> (&tail[len + 1]);
    }
 #ifndef USE_FAST_LOAD
    // Rebuild the child/sibling labels from the base/check links of the
    // node array (they are not stored by save () unless USE_FAST_LOAD).
    void _restore_ninfo () {
      _realloc_array (_ninfo, _size);
      for (int to = 0; to < _size; ++to) {
        const int from = _array[to].check;
        if (from < 0) continue; // skip empty node
        const int base = _array[from].base;
        // the label of the edge from -> to is base ^ to
        if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
          _push_sibling (static_cast <size_t> (from), base, label,
                         ! from || _ninfo[from].child || _array[base ^ 0].check == from);
      }
    }
    // Rebuild the per-block bookkeeping from the node array: count the empty
    // slots of each 256-slot block, remember the first one, and put the block
    // on the Full (0 empty), Closed (1 empty) or Open (otherwise) list.
    void _restore_block () {
      _realloc_array (_block, _size >> 8);
      _bheadF = _bheadC = _bheadO = 0;
      for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
        block& b = _block[bi];
        b.num = 0;
        for (; e < (bi << 8) + 256; ++e)
          if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
        int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
        _push_block (bi, head_out, ! head_out && b.num);
      }
    }
 #endif
    // Store a search result; the overload is selected by the result type.
    // plain value: length and position are ignored
    void _set_result (result_type* x, value_type r, size_t = 0, npos_t = 0) const
    { *x = r; }
    // value and length
    void _set_result (result_pair_type* x, value_type r, size_t l, npos_t = 0) const
    { x->value = r; x->length = l; }
    // value, length and the position at which the value was found
    void _set_result (result_triple_type* x, value_type r, size_t l, npos_t from) const
    { x->value = r; x->length = l; x->id = from; }
    void _pop_block (const int bi, int& head_in, const bool last) {
      if (last) { // last one poped; Closed or Open
        head_in = 0;
      } else {
        const block& b = _block[bi];
        _block[b.prev].next = b.next;
        _block[b.next].prev = b.prev;
        if (bi == head_in) head_in = b.next;
      }
    }
    // Insert block `bi` into the circular list whose head is `head_out` and
    // make it the new head. `empty` tells that the list has no block yet.
    void _push_block (const int bi, int& head_out, const bool empty) {
      block& b = _block[bi];
      if (empty) { // the destination is empty
        // single-element ring: the block links to itself
        head_out = b.prev = b.next = bi;
      } else { // use most recently pushed
        // splice bi between the current tail (head.prev) and the head
        int& tail_out = _block[head_out].prev;
        b.prev = tail_out;
        b.next = head_out;
        head_out = tail_out = _block[tail_out].next = bi;
      }
    }
    // Append a fresh block of 256 empty nodes at the end of the array,
    // growing the buffers when capacity is exhausted, and register it on the
    // Open list. Returns the index of the new block.
    int _add_block () {
      if (_size == _capacity) { // allocate memory if needed
 #ifdef USE_EXACT_FIT
        _capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
 #else
        _capacity += _capacity;
 #endif
        _realloc_array (_array, _capacity, _capacity);
        _realloc_array (_ninfo, _capacity, _size);
        _realloc_array (_block, _capacity >> 8, _size >> 8);
      }
      _block[_size >> 8].ehead = _size;
      // link the 256 new slots into a circular ring of empty nodes
      _array[_size] = node (- (_size + 255),  - (_size + 1));
      for (int i = _size + 1; i < _size + 255; ++i)
        _array[i] = node (-(i - 1), -(i + 1));
      _array[_size + 255] = node (- (_size + 254),  -_size);
      _push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
      _size += 256;
      return (_size >> 8) - 1;
    }
    // transfer block from one start w/ head_in to one start w/ head_out
    // The block is the last one on its source list when it links to itself.
    void _transfer_block (const int bi, int& head_in, int& head_out) {
      _pop_block  (bi, head_in, bi == _block[bi].next);
      _push_block (bi, head_out, ! head_out && _block[bi].num);
    }
    // pop empty node from block; never transfer the special block (bi = 0)
    // Take the empty slot for child `label` of `from` out of its block's
    // empty ring and initialize it. With base < 0 the parent has no base
    // yet: a slot is chosen by _find_place () and the parent's base is set
    // accordingly. Returns the index of the slot.
    int _pop_enode (const int base, const uchar label, const int from) {
      const int e  = base < 0 ? _find_place () : base ^ label;
      const int bi = e >> 8;
      node&  n = _array[e];
      block& b = _block[bi];
      if (--b.num == 0) {
        if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
      } else { // release empty node from empty ring
        // unlink e: connect its previous and next empty slots to each other
        _array[-n.base].check = n.check;
        _array[-n.check].base = n.base;
        if (e == b.ehead) b.ehead = -n.check; // set ehead
        if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
          _transfer_block (bi, _bheadO, _bheadC);
      }
      // initialize the released node
      if (label) n.base = -1; else n.value = value_type (0);
      n.check = from;
      if (base < 0) _array[from].base = e ^ label;
      return e;
    }
    // push empty node into empty ring
    // Return slot `e` to its block: link it into the block's ring of empty
    // slots, move the block between the Full/Closed/Open lists as its number
    // of empty slots changes, and clear the slot's child/sibling labels.
    void _push_enode (const int e) {
      const int bi = e >> 8;
      block& b = _block[bi];
      if (++b.num == 1) { // Full to Closed
        // the only empty slot: a ring of one that links to itself
        b.ehead = e;
        _array[e] = node (-e, -e);
        if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
      } else {
        // insert e right after the ring head
        const int prev = b.ehead;
        const int next = -_array[prev].check;
        _array[e] = node (-prev, -next);
        _array[prev].check = _array[next].base = -e;
        if (b.num == 2 || b.trial == MAX_TRIAL) { // Closed to Open
          if (bi) _transfer_block (bi, _bheadC, _bheadO);
        }
        b.trial = 0;
      }
      if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
      _ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
    }
    // push label to from's child
    // Insert `label` into the child list of `from`, whose children live at
    // base ^ label. With flag == false the label is put at the front without
    // looking at the existing list. With ORDERED the list is kept sorted by
    // label; otherwise the list is only walked when its first label is 0.
    void _push_sibling (const npos_t from, const int base, const uchar label, const bool flag = true) {
      uchar* c = &_ninfo[from].child;
      if (flag && (ORDERED ? label > *c : ! *c))
        do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
      // splice: the new label takes over *c as its right sibling
      _ninfo[base ^ label].sibling = *c, *c = label;
    }
    // pop label from from's child
    void _pop_sibling (const npos_t from, const int base, const uchar label) {
      uchar* c = &_ninfo[from].child;
      while (*c != label) c = &_ninfo[base ^ *c].sibling;
      *c = _ninfo[base ^ label].sibling;
    }
    // check whether to replace branching w/ the newly added node
    // Walks the two sibling lists in lock-step. Returns false when the list
    // of the existing parent (base_p) ends first or at the same time, and
    // true when the list of the new parent (base_n) ends first, i.e. when
    // the new parent has fewer children to move.
    bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
      do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
      while ((c_n = _ninfo[base_n ^ c_n].sibling));
      return true;
    }
    // enumerate (equal to or more than one) child nodes
    // Write the child labels of a node (first child `c`, children stored at
    // base ^ label) into the buffer starting at `p`. When `label` is not -1
    // it is added as well (in sorted position if ORDERED). Returns a pointer
    // to the LAST label written, not one past it.
    uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
      --p;
      if (! c)  { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
      if (ORDERED)
        while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
      if (label != -1) *++p = static_cast <uchar> (label);
      while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
      return p;
    }
    // explore new block to settle down
    // Pick an empty slot for a single node: the first empty slot of the head
    // Closed block if there is one, else of the head Open block, else the
    // first slot of a newly added block.
    int _find_place () {
      const int bi = _bheadC ? _bheadC : _bheadO;
      if (bi) return _block[bi].ehead;
      return _add_block () << 8;
    }
    // Pick an empty slot `e` such that, with base = e ^ *first, the slots
    // for all labels in [first, last] are empty. Open blocks are searched;
    // if none fits, a new block is added and its first slot is returned.
    int _find_place (const uchar* const first, const uchar* const last) {
      if (int bi = _bheadO) {
        const int   bz = _block[_bheadO].prev; // last block on the Open list
        const short nc = static_cast <short> (last - first + 1); // # labels
        while (1) { // set candidate block
          block& b = _block[bi];
          if (b.num >= nc && nc < b.reject) // explore configuration
            // try every empty slot of the block as the place for *first
            for (int e = b.ehead;;) {
              const int base = e ^ *first;
              for (const uchar* p = first; _array[base ^ *++p].check < 0; )
                if (p == last) return b.ehead = e; // no conflict
              if ((e = -_array[e].check) == b.ehead) break;
            }
          // remember that nc labels did not fit into this block
          b.reject = nc;
          if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
          const int bi_ = b.next;
          if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
          if (bi == bz) break;
          bi = bi_;
        }
      }
      return _add_block () << 8;
    }
    // resolve conflict on base_n ^ label_n = base_p ^ label_p
    // The slot wanted for child `label_n` of `from_n` is occupied by a child
    // of another parent. Relocate the children of one of the two parents to
    // a conflict-free base, fix the links of their own children, and return
    // the slot for the new child. `cf (old, new)` is called for every moved
    // node; `from_n` is updated if the new parent itself is moved.
    template <typename T>
    int _resolve (npos_t& from_n, const int base_n, const uchar label_n, T& cf) {
      // examine siblings of conflicted nodes
      const int to_pn  = base_n ^ label_n;
      const int from_p = _array[to_pn].check;
      const int base_p = _array[from_p].base;
      const bool flag // whether to replace siblings of newly added
        = _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
      // labels of the children to place: those of the new parent plus
      // label_n (flag), or those of the existing parent (! flag)
      uchar child[256];
      uchar* const first = &child[0];
      uchar* const last  =
        flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
        : _set_child (first, base_p, _ninfo[from_p].child);
      const int base =
        (first == last ? _find_place () : _find_place (first, last)) ^ *first;
      // replace & modify empty list
      const int from  = flag ? static_cast <int> (from_n) : from_p;
      const int base_ = flag ? base_n : base_p;
      if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
      _array[from].base = base; // new base
      for (const uchar* p = first; p <= last; ++p) { // to_ => to
        const int to  = _pop_enode (base, *p, from);
        const int to_ = base_ ^ *p;
        _ninfo[to].sibling = (p == last ? 0 : *(p + 1));
        if (flag && to_ == to_pn) continue; // skip newcomer (no child)
        cf (to_, to);
        node& n  = _array[to];
        node& n_ = _array[to_];
        if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix
          uchar c = _ninfo[to].child = _ninfo[to_].child;
          do _array[n.base ^ c].check = to; // adjust grand son's check
          while ((c = _ninfo[n.base ^ c].sibling));
        }
        if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
          from_n = static_cast <size_t> (to); // bug fix
        if (! flag && to_ == to_pn) { // the address is immediately used
          _push_sibling (from_n, to_pn ^ label_n, label_n);
          _ninfo[to_].child = 0; // remember to reset child
          if (label_n) n_.base = -1; else n_.value = value_type (0);
          n_.check = static_cast <int> (from_n);
        } else
          _push_enode (to_);
        if (NUM_TRACKING_NODES) // keep the traversed node updated
          for (size_t j = 0; tracking_node[j] != 0; ++j) {
            // compare the node-index part only; keep the tail-offset part
            if (static_cast <int> (tracking_node[j] & TAIL_OFFSET_MASK) == to_) {
              tracking_node[j] &= NODE_INDEX_MASK;
              tracking_node[j] |= static_cast <npos_t> (to);
            }
          }
      }
      return flag ? base ^ label_n : to_pn;
    }
    // test the validity of double array for debug
    // Recursively asserts, for the subtree below `from`, that every child's
    // check points back to its parent and that every tail reference leaves
    // room for a terminator and a value inside the tail buffer.
    void _test (const npos_t from = 0) const {
      const int base = _array[from].base;
      if (base < 0) { // validate tail offset
        assert (*_length >= static_cast <int> (-base + 1 + sizeof (value_type)));
        return;
      }
      uchar c = _ninfo[from].child;
      do {
        if (from) assert (_array[base ^ c].check == static_cast <int> (from));
        if (c) _test (static_cast <npos_t> (base ^ c)); // label 0 holds a value
      } while ((c = _ninfo[base ^ c].sibling));
    }
  };
 }
 #endif
--- a/libchinese-segmentation/storage-base/darts-clone/darts.h
+++ b/libchinese-segmentation/storage-base/darts-clone/darts.h
--- a/libchinese-segmentation/storage-base/storage-base-cedar.pri
+++ b/libchinese-segmentation/storage-base/storage-base-cedar.pri
@ -0,0 +1,12 @@
 # qmake include fragment for the storage-base module.
 # Makes this directory an include root so its headers resolve by bare name.
 INCLUDEPATH += $$PWD
 # Headers of the module: both bundled trie back-ends (darts-clone, cedar)
 # and the two StorageBase variants (.h and .hpp).
 HEADERS += \
    $$PWD/darts-clone/darts.h \
    $$PWD/cedar/cedarpp.h \
    $$PWD/cedar/cedar.h \
    $$PWD/storage-base.h \
    $$PWD/storage-base.hpp
 # NOTE(review): storage-base.cpp is also #included at the end of
 # storage-base.h; confirm it is meant to be compiled as its own unit too.
 SOURCES += \
    $$PWD/storage-base.cpp
--- a/libchinese-segmentation/storage-base/storage-base.cpp
+++ b/libchinese-segmentation/storage-base/storage-base.cpp
@ -0,0 +1,202 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef STORAGEBASE_CPP
 #define STORAGEBASE_CPP
 #include "storage-base.h"
 // Stores the dictionary source paths and the (possibly empty) cache path and
 // allocates the cedar trie that a later attach points at the cache data.
 template<const bool ordered, typename cache_file_header>
 StorageBase<ordered, cache_file_header>::StorageBase(const vector<string> file_paths, string dat_cache_path)
    : m_file_paths(file_paths)
    , m_dat_cache_path(dat_cache_path)
    , m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)
 {
    // The cache file layout always starts with the base header fields.
    static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value,
                  "CacheFileHeader class not derived from CacheFileHeaderBase!");
 }
 // Attaches to the cache file for the configured dictionaries, building it
 // first through LoadSourceFile() when no matching cache can be attached.
 template<const bool ordered, typename cache_file_header>
 void StorageBase<ordered, cache_file_header>::Init()
 {
    int dict_bytes = 0;
    const string md5 = CalcFileListMD5(m_file_paths, dict_bytes);
    m_total_dict_size = dict_bytes;

    // Without an explicit cache location the data file defaults to /tmp,
    // named after the digest of the source files.
    if (m_dat_cache_path.empty()) {
        m_dat_cache_path = "/tmp/" + md5 + ".dat_";
    }
    m_dat_cache_path += VERSION;

    if (!InitAttachDat(m_dat_cache_path, md5)) {
        // No usable cache: build the double-array trie, write the dat file,
        // then attach to the file that was just written.
        LoadSourceFile(m_dat_cache_path, md5);
        bool attached = InitAttachDat(m_dat_cache_path, md5);
        assert(attached);
    }
 }
 // Exact-match lookup of `key` in the trie. A negative trie result yields an
 // empty string; otherwise the result is used as an offset into the element
 // area and the NUL-terminated string stored there is returned.
 template<const bool ordered, typename cache_file_header>
 string StorageBase<ordered, cache_file_header>::Find(const string &key)
 {
    const int offset = m_double_array_data_trie->template exactMatchSearch<int>(key.c_str(), key.size());
    return offset < 0 ? string() : string(m_elements_ptr + offset);
 }
 // True when Find() returns a non-empty string for `word`.
 template<const bool ordered, typename cache_file_header>
 bool StorageBase<ordered, cache_file_header>::Contains(string &word)
 {
    return !this->Find(word).empty();
 }
 // Returns true when the entry found for `word` contains no ',' character
 // (which includes the case where nothing is found and the entry is empty).
 // NOTE(review): given the name, a comma-separated entry would be expected to
 // mean "multi-tone", so this condition looks inverted - confirm against callers.
 template<const bool ordered, typename cache_file_header>
 bool StorageBase<ordered, cache_file_header>::IsMultiTone(const string &word)
 {
    const string entry = this->Find(word);
    return entry.find(',') == string::npos;
 }
 // Returns the summed size of the dictionary source files as recorded by
 // Init() (0 until Init() has run).
 template<const bool ordered, typename cache_file_header>
 int StorageBase<ordered, cache_file_header>::GetTotalDictSize() const
 {
    return m_total_dict_size;
 }
 // Releases the cache-file mapping, its descriptor and the trie object.
 // Each resource is released only if it is actually held, so destroying an
 // object that never attached (or whose attach failed) makes no invalid
 // munmap()/close() calls.
 template<const bool ordered, typename cache_file_header>
 StorageBase<ordered, cache_file_header>::~StorageBase()
 {
    // m_mmap_addr can be nullptr (never attached) or MAP_FAILED (failed mmap).
    if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
        munmap(m_mmap_addr, m_mmap_length);
    }
    m_mmap_addr = nullptr;
    if (m_mmap_fd >= 0) {
        close(m_mmap_fd);
    }
    m_mmap_fd = -1;
    // delete on a null pointer is a no-op, so no guard is needed.
    delete m_double_array_data_trie;
    m_double_array_data_trie = nullptr;
 }
 // Returns the trie object held by this store. The pointer stays owned by the
 // store (it is deleted in the destructor); callers must not delete it.
 template<const bool ordered, typename cache_file_header>
 cedar::da<int, -1, -2, ordered> *StorageBase<ordered, cache_file_header>::GetDoubleArrayDataTrie()
 {
    return m_double_array_data_trie;
 }
 // Returns whatever the trie's array() reports.
 // NOTE(review): presumably the raw unit array used when writing the cache - confirm.
 template<const bool ordered, typename cache_file_header>
 const void *StorageBase<ordered, cache_file_header>::GetDataTrieArray()
 {
    return m_double_array_data_trie->array();
 }
 // Returns the trie's size(), converted to int.
 // NOTE(review): presumably a unit count rather than bytes - confirm in cedar.h.
 template<const bool ordered, typename cache_file_header>
 int StorageBase<ordered, cache_file_header>::GetDataTrieSize()
 {
    return m_double_array_data_trie->size();
 }
 // Returns the trie's total_size(), converted to int.
 // NOTE(review): presumably the array size in bytes - confirm in cedar.h.
 template<const bool ordered, typename cache_file_header>
 int StorageBase<ordered, cache_file_header>::GetDataTrieTotalSize()
 {
    return m_double_array_data_trie->total_size();
 }
 // Reinterprets the start of the mapped cache file as its header.
 // The result is only meaningful after a successful attach; before that
 // m_mmap_addr does not point at a mapping.
 template<const bool ordered, typename cache_file_header>
 cache_file_header *StorageBase<ordered, cache_file_header>::GetCacheFileHeaderPtr()
 {
    return reinterpret_cast<header_type*>(m_mmap_addr);
 }
 // Maps `dat_cache_file` read-only and, if it is a valid cache for `md5`,
 // points the element area and the trie at the mapped data.
 //
 // The file is accepted only when it is at least one header long, the header's
 // md5 field matches `md5`, and the file size equals
 // header + elements_size + dat_size * unit_size().
 //
 // Returns true on success. On any failure every resource acquired here is
 // released, m_mmap_addr is nullptr and m_mmap_fd is -1, and false is returned.
 template<const bool ordered, typename cache_file_header>
 bool StorageBase<ordered, cache_file_header>::InitAttachDat(const string &dat_cache_file, const string &md5)
 {
    // Shared failure path: drop the descriptor opened below and report failure.
    const auto fail_close = [this]() {
        close(m_mmap_fd);
        m_mmap_fd = -1;
        return false;
    };

    m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
    if (m_mmap_fd < 0) {
        return false;
    }
    const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
    // Reject unreadable and too-short files before mapping; a zero-length
    // mmap() would fail anyway.
    if (seek_off < 0 || static_cast<size_t>(seek_off) < sizeof(header_type)) {
        return fail_close();
    }
    const size_t file_size = static_cast<size_t>(seek_off);
    m_mmap_length = file_size;

    void *mapped = mmap(NULL, file_size, PROT_READ, MAP_SHARED, m_mmap_fd, 0);
    if (mapped == MAP_FAILED) {
        // Never leave MAP_FAILED in the member: the destructor would unmap it.
        m_mmap_addr = nullptr;
        return fail_close();
    }
    m_mmap_addr = reinterpret_cast<char *>(mapped);

    const header_type &header = *reinterpret_cast<header_type*>(m_mmap_addr);
    const bool md5_matches = (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()));
    const bool size_matches =
        file_size == sizeof(header_type) + header.elements_size
                     + header.dat_size * m_double_array_data_trie->unit_size();
    if (!md5_matches || !size_matches) {
        // Stale or corrupt cache: unmap and let the caller rebuild it.
        munmap(m_mmap_addr, file_size);
        m_mmap_addr = nullptr;
        return fail_close();
    }

    // Layout: [header][elements_size bytes of elements][dat_size trie units].
    m_elements_ptr = (const char *)(m_mmap_addr + sizeof(header_type));
    const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
    this->m_double_array_data_trie->set_array((char *)dat_ptr, header.dat_size);
    return true;
 }
 // Computes one MD5 digest over the concatenated contents of `files_list`
 // (in list order) and stores the summed byte size of the hashed files in
 // `file_size_sum`.
 //
 // Files that cannot be opened, are empty, or cannot be mapped contribute
 // nothing to either the digest or the size. Returns the digest as text
 // (md5.digestChars).
 string CalcFileListMD5(const vector<string> &files_list, int &file_size_sum) {
    limonp::MD5 md5;
    file_size_sum = 0;
    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0){
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            // Debug builds still stop here; release builds (NDEBUG) must not
            // go on to hash and unmap MAP_FAILED.
            assert(MAP_FAILED != addr);
            if (MAP_FAILED != addr) {
                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }
    md5.Final();
    return string(md5.digestChars);
 }
 #endif
--- a/libchinese-segmentation/storage-base/storage-base.h
+++ b/libchinese-segmentation/storage-base/storage-base.h
@ -0,0 +1,93 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef STORAGEBASE_H
 #define STORAGEBASE_H
 #include <string>
 #include <vector>
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include "Md5.hpp"
 #include "StringUtil.hpp"
 #include "cedar.h"
 using namespace std;
 // Fixed header at the start of a cache (.dat) file. The attach code maps the
 // file and reads these fields in place, so the member order and types are
 // part of the on-disk format.
 struct CacheFileHeaderBase { // TODO: byte alignment
    char     md5_hex[32] = {};   // compared against the source-file MD5 text when attaching
    uint32_t elements_num = 0;   // NOTE(review): presumably the number of stored elements - confirm with the writer
    uint32_t elements_size = 0;  // byte size of the element area that follows the header
    uint32_t dat_size = 0;       // trie length in units (multiplied by unit_size() when validating the file size)
 };
 // Base class for dictionary stores whose lookup data lives in a memory-mapped
 // cache file: a header, an element area and a cedar double-array trie.
 //
 // Template parameters:
 //   ordered            forwarded to cedar::da.
 //   cache_file_header  header type of the cache file; must derive from
 //                      CacheFileHeaderBase.
 //
 // Subclasses implement LoadSourceFile() to build the cache file when Init()
 // cannot attach to an existing one.
 template<const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
 class StorageBase
 {
 public:
    typedef cache_file_header header_type;

    // file_paths: dictionary source files; dat_cache_path: cache file location
    // (empty selects the default chosen by Init()).
    StorageBase(const vector<string> file_paths, string dat_cache_path = "");

    // Attaches to the cache file, building it first if necessary.
    virtual void Init();
    // Exact-match lookup; returns an empty string when `key` is not present.
    virtual string Find(const string &key);
    // True when Find(word) is non-empty.
    virtual bool Contains(string &word);
    virtual bool IsMultiTone(const string &word);
    // Summed size of the dictionary source files as recorded by Init().
    virtual int GetTotalDictSize() const;
    // Builds the trie from the source files and writes `dat_cache_file`.
    virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
    virtual ~StorageBase();

    // Accessors for the underlying trie and the mapped header.
    cedar::da<int, -1, -2, ordered> * GetDoubleArrayDataTrie();
    const void * GetDataTrieArray();
    int GetDataTrieSize();
    int GetDataTrieTotalSize();
    cache_file_header * GetCacheFileHeaderPtr();

 private:
    // Not default-constructible and not copyable: declared private and left
    // without definitions.
    StorageBase();
    StorageBase(const StorageBase&);
    StorageBase& operator = (const StorageBase&);

    // Maps and validates the cache file; returns false if it is missing or stale.
    bool InitAttachDat(const string &dat_cache_file, const string &md5);

    vector<string> m_file_paths;
    string m_dat_cache_path;
    cedar::da<int, -1, -2, ordered> * m_double_array_data_trie = nullptr;
    const char * m_elements_ptr = nullptr;   // start of the element area inside the mapping
    int    m_mmap_fd = -1;
    // Byte length of the mapping. size_t (not int) because it is compared with
    // sizeof expressions and passed to mmap()/munmap(), which take size_t.
    size_t m_mmap_length = 0;
    char * m_mmap_addr = nullptr;
    int    m_total_dict_size = 0;
 };
 inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum);
 #include "storage-base.cpp"
 #endif // STORAGEBASE_H
--- a/libchinese-segmentation/storage-base/storage-base.hpp
+++ b/libchinese-segmentation/storage-base/storage-base.hpp
@ -0,0 +1,247 @@
 /*
 * Copyright (C) 2022, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 * Authors: jixiaoxu <jixiaoxu@kylinos.cn>
 *
 */
 #ifndef STORAGEBASE_H
 #define STORAGEBASE_H
 #include <string>
 #include <vector>
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <fstream>
 #include <iostream>
 #include "Md5.hpp"
 #include "StringUtil.hpp"
 //#define USE_DARTS
 #ifdef USE_DARTS
 #include "../storage-base/darts-clone/darts.h"
 #include <cassert>
 #else
 #include "../storage-base/cedar/cedar.h"
 #endif
 using namespace std;
 // Computes one MD5 digest over the concatenated contents of `files_list`
 // (in list order) and stores the summed byte size of the hashed files in
 // `file_size_sum`.
 //
 // Files that cannot be opened, are empty, or cannot be mapped contribute
 // nothing to either the digest or the size. Returns the digest as text
 // (md5.digestChars).
 inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum)
 {
    limonp::MD5 md5;
    file_size_sum = 0;
    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0){
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            // Debug builds still stop here; release builds (NDEBUG) must not
            // go on to hash and unmap MAP_FAILED.
            assert(MAP_FAILED != addr);
            if (MAP_FAILED != addr) {
                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }
    md5.Final();
    return string(md5.digestChars);
 }
 // Reports whether `filePath` can be opened for reading: an input stream is
 // opened on it and its good() state is returned.
 inline bool isFileExist(const string filePath) {
    return ifstream(filePath).good();
 }
 // Moves `tmpName` to `name`. If the rename fails but `name` can already be
 // opened, the temporary file is removed instead. If the rename fails and
 // `name` cannot be opened, nothing further is done.
 inline void tryRename(string tmpName, string name) {
    const bool renamed = (rename(tmpName.c_str(), name.c_str()) == 0);
    if (renamed) {
        return;
    }
    if (isFileExist(name)) {
        remove(tmpName.c_str());
    }
 }
 // Fixed header at the start of a cache (.dat) file. The attach code maps the
 // file and reads these fields in place, so the member order and types are
 // part of the on-disk format.
 struct CacheFileHeaderBase { // TODO: byte alignment
    char     md5_hex[32] = {};   // compared against the source-file MD5 text when attaching
    uint32_t elements_num = 0;   // NOTE(review): presumably the number of stored elements - confirm with the writer
    uint32_t elements_size = 0;  // byte size of the element area that follows the header
    uint32_t dat_size = 0;       // trie length in units (multiplied by unit_size() when validating the file size)
 };
 // Base class for dictionary stores whose lookup data lives in a memory-mapped
 // cache file laid out as [header][element area][double-array trie units].
 //
 // Template parameters:
 //   element_ptr_type   element type the element area is exposed as.
 //   ordered            forwarded to cedar::da (unused with USE_DARTS).
 //   cache_file_header  header type of the cache file; must derive from
 //                      CacheFileHeaderBase.
 //
 // Subclasses implement LoadSourceFile() to build the cache file when Init()
 // cannot attach to an existing one.
 template<typename element_ptr_type, const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
 class StorageBase
 {
 public:
    typedef cache_file_header header_type;
 #ifdef USE_DARTS
    typedef typename Darts::DoubleArray::result_pair_type result_pair_type;
 #else
    typedef typename cedar::da<int, -1, -2, ordered>::result_pair_type result_pair_type;
 #endif

    // file_paths: dictionary source files; dat_cache_path: cache file location
    // (empty selects the default chosen by Init()).
    // The trie is a by-value member in both back-ends and is default
    // constructed, so it does not appear in the initializer list.
    StorageBase(const vector<string> file_paths, string dat_cache_path = "")
        : m_file_paths(file_paths), m_dat_cache_path(dat_cache_path)
    {
        static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
    }

    // Attaches to the cache file for the configured dictionaries, building it
    // first through LoadSourceFile() when no matching cache can be attached.
    virtual void Init()
    {
        int file_size_sum = 0;
        const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
        m_total_dict_size = file_size_sum;
        if (m_dat_cache_path.empty()) {
            // Without an explicit cache location the data file defaults to /tmp.
            m_dat_cache_path = "/tmp/" + md5 + ".dat_";
        }
        m_dat_cache_path += VERSION;
        if (InitAttachDat(m_dat_cache_path, md5)) {
            return;
        }
        // Build the double-array trie and write the dat file, then attach to it.
        LoadSourceFile(m_dat_cache_path, md5);
        bool build_ret = InitAttachDat(m_dat_cache_path, md5);
        assert(build_ret);
    }

    // Builds the trie from the source files and writes `dat_cache_file`.
    virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;

    // Releases the mapping and descriptor if (and only if) they are held.
    virtual ~StorageBase()
    {
        ReleaseMapping();
    }

 #ifndef USE_DARTS
    // Forwards to the trie's update(); only available with the cedar back-end.
    inline int Update(const char* key, size_t len, int val)
    {
        return m_double_array_data_trie.update(key, len, val);
    }
 #endif
    // Forwards to the trie's commonPrefixSearch().
    inline size_t CommonPrefixSearch(const char* key, result_pair_type* result, size_t result_len) const
    {
        return m_double_array_data_trie.commonPrefixSearch(key, result, result_len);
    }
    // Forwards to the trie's exactMatchSearch<int>().
    inline int ExactMatchSearch(const char* key, size_t len) const
    {
        return m_double_array_data_trie.template exactMatchSearch<int>(key, len);
    }
    inline const void * GetDataTrieArray()
    {
        return m_double_array_data_trie.array();
    }
    inline int GetDataTrieSize()
    {
        return m_double_array_data_trie.size();
    }
    inline int GetDataTrieTotalSize()
    {
        return m_double_array_data_trie.total_size();
    }
    // Start of the mapping viewed as the header; meaningful only while attached.
    inline cache_file_header * GetCacheFileHeaderPtr() const
    {
        return reinterpret_cast<header_type*>(m_mmap_addr);
    }
    // Start of the element area inside the mapping (nullptr before any attach).
    inline const element_ptr_type * GetElementPtr() const
    {
        return m_elements_ptr;
    }
 private:
    // Not default-constructible and not copyable: declared private and left
    // without definitions.
    StorageBase();
    StorageBase(const StorageBase&);
    StorageBase& operator = (const StorageBase&);

    // Unmaps the cache file and closes its descriptor when they are held, and
    // resets the members so a repeated call is a no-op.
    void ReleaseMapping()
    {
        if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
            munmap(m_mmap_addr, m_mmap_length);
        }
        m_mmap_addr = nullptr;
        if (m_mmap_fd >= 0) {
            close(m_mmap_fd);
        }
        m_mmap_fd = -1;
    }

    // Maps `dat_cache_file` read-only and, if it is a valid cache for `md5`,
    // points the element area and the trie at the mapped data.
    // The file is accepted only when it is at least one header long, its md5
    // field matches, and its size equals
    // header + elements_size + dat_size * unit_size().
    // Returns true on success; on failure everything acquired here is
    // released and m_mmap_addr / m_mmap_fd are left as nullptr / -1.
    bool InitAttachDat(const string &dat_cache_file, const string &md5)
    {
        m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
        if (m_mmap_fd < 0) {
            return false;
        }
        const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
        // Reject unreadable and too-short files before mapping; a zero-length
        // mmap() would fail anyway.
        if (seek_off < 0 || static_cast<size_t>(seek_off) < sizeof(header_type)) {
            close(m_mmap_fd);
            m_mmap_fd = -1;
            return false;
        }
        m_mmap_length = static_cast<size_t>(seek_off);
        void *mapped = mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0);
        if (mapped == MAP_FAILED) {
            // Never store MAP_FAILED in the member: the destructor would unmap it.
            m_mmap_addr = nullptr;
            close(m_mmap_fd);
            m_mmap_fd = -1;
            return false;
        }
        m_mmap_addr = reinterpret_cast<char *>(mapped);

        const header_type & header = *reinterpret_cast<header_type*>(m_mmap_addr);
        const bool md5_matches = (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()));
        const bool size_matches =
            m_mmap_length == sizeof(header_type) + header.elements_size
                             + header.dat_size * m_double_array_data_trie.unit_size();
        if (!md5_matches || !size_matches) {
            // Stale or corrupt cache: release it and let the caller rebuild.
            ReleaseMapping();
            return false;
        }
        m_elements_ptr = (const element_ptr_type *)(m_mmap_addr + sizeof(header_type));
        const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
        this->m_double_array_data_trie.set_array((char *)dat_ptr, header.dat_size);
        return true;
    }

    vector<string> m_file_paths;
    string m_dat_cache_path;
 #ifdef USE_DARTS
    Darts::DoubleArray m_double_array_data_trie;
 #else
    cedar::da<int, -1, -2, ordered> m_double_array_data_trie;
 #endif
    const element_ptr_type * m_elements_ptr = nullptr;
    int    m_mmap_fd = -1;
    size_t    m_mmap_length = 0;
    char * m_mmap_addr = nullptr;
    int    m_total_dict_size = 0;
 };
 #endif // STORAGEBASE_H
--- a/libchinese-segmentation/test/CMakeLists.txt
+++ b/libchinese-segmentation/test/CMakeLists.txt
@ -0,0 +1,19 @@
 # Manual test GUI for the chinese-segmentation library.

 # Let CMake run uic, moc and rcc automatically for the targets defined below.
 foreach(qt_tool UIC MOC RCC)
        set(CMAKE_AUTO${qt_tool} ON)
 endforeach()

 # First call detects the available Qt major version, second call loads it.
 find_package(QT NAMES Qt6 Qt5 REQUIRED COMPONENTS Core Gui Widgets)
 find_package(Qt${QT_VERSION_MAJOR} REQUIRED COMPONENTS Core Gui Widgets)

 add_executable(test main.cpp mainwindow.cpp mainwindow.h mainwindow.ui)

 # The library's public headers live one directory up.
 target_include_directories(test PRIVATE ../)

 target_link_libraries(test
        PRIVATE
        Qt${QT_VERSION_MAJOR}::Core
        Qt${QT_VERSION_MAJOR}::Gui
        Qt${QT_VERSION_MAJOR}::Widgets
        chinese-segmentation)
--- a/libchinese-segmentation/test/main.cpp
+++ b/libchinese-segmentation/test/main.cpp
@ -0,0 +1,11 @@
 #include "mainwindow.h"
 #include <QApplication>
 int main(int argc, char *argv[])
 {
    QApplication a(argc, argv);
    MainWindow w;
    w.show();
    return a.exec();
 }
--- a/libchinese-segmentation/test/mainwindow.cpp
+++ b/libchinese-segmentation/test/mainwindow.cpp
@ -0,0 +1,96 @@
 #include "mainwindow.h"
 #include "ui_mainwindow.h"
 #include "hanzi-to-pinyin.h"
 #include "chinese-segmentation.h"
 #include "Traditional-to-Simplified.h"
 #include <QMenu>
 #include <QDebug>
 #include <QStringList>
 // Builds the UI, fills the tool button's menu with the selectable pinyin
 // output styles, wires up the signal handlers and focuses the input field.
 MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
 {
    ui->setupUi(this);

    // One menu entry per style name; initconnections() matches on these texts.
    QMenu *styleMenu = new QMenu(this);
    for (const char *styleName : {"Default", "Tone", "Tone2", "Tone3", "FirstLetter"}) {
        styleMenu->addAction(styleName);
    }
    ui->toolButton->setMenu(styleMenu);

    initconnections();
    ui->lineEdit_2->setFocus();
 }
 // Frees the generated UI object allocated in the constructor.
 MainWindow::~MainWindow()
 {
    delete ui;
 }
 void MainWindow::initconnections()
 {
    connect(ui->toolButton->menu(), &QMenu::triggered, [&](QAction *action){
        qDebug() << "tool button:" << action->text();
        m_action = action->text();
        ui->toolButton->setText(action->text());
    });
    connect(ui->pushButton, &QPushButton::pressed, [&]() {
        PinyinDataStyle dataStyle;
        SegType segType;
        PolyphoneType polyType;
        ExDataProcessType exType;
        if (m_action == "Default") {
            dataStyle = PinyinDataStyle::Default;
        } else if (m_action == "Tone") {
            dataStyle = PinyinDataStyle::Tone;
        } else if (m_action == "Tone2") {
            dataStyle = PinyinDataStyle::Tone2;
        } else if (m_action == "Tone3") {
            dataStyle = PinyinDataStyle::Tone3;
        } else if (m_action == "FirstLetter") {
            dataStyle = PinyinDataStyle::FirstLetter;
        }
        if(!ui->checkSegBox->isChecked())
            segType = SegType::Segmentation;
        else
            segType = SegType::NoSegmentation;
        if(ui->checkPolyBox_2->isChecked())
            polyType = PolyphoneType::Enable;
        else
            polyType = PolyphoneType::Disable;
        if (ui->checkExBox_3->isChecked())
            exType = ExDataProcessType::Default;
        else
            exType = ExDataProcessType::Delete;
        HanZiToPinYin::getInstance()->setConfig(dataStyle, segType, polyType, exType);
        ui->lineEdit_4->clear();
        QString text = ui->lineEdit_2->text();
        qDebug() << "input:" << text;
        QStringList list;
        HanZiToPinYin::getInstance()->getResults(text.toStdString(), list);
        ui->lineEdit_4->setText(list.join(" "));
        qDebug() << "result:" << list.join(" ");
        vector<KeyWord> result = ChineseSegmentation::getInstance()->callSegment(text.toStdString());
        list.clear();
        for (auto &info:result) {
            list.append(QString().fromStdString(info.word));
        }
        ui->lineEdit_6->setText(list.join("/"));
        string simplified = Traditional2Simplified::getInstance()->getResults(text.toStdString());
        ui->lineEdit_7->setText(QString().fromStdString(simplified));
    });
 }
--- a/libchinese-segmentation/test/mainwindow.h
+++ b/libchinese-segmentation/test/mainwindow.h
@ -0,0 +1,23 @@
 #ifndef MAINWINDOW_H
 #define MAINWINDOW_H
 #include <QtWidgets>
 QT_BEGIN_NAMESPACE
 namespace Ui { class MainWindow; }
 QT_END_NAMESPACE
 // Main window of the manual test GUI for the chinese-segmentation library.
 class MainWindow : public QMainWindow
 {
    Q_OBJECT
 public:
    MainWindow(QWidget *parent = nullptr);
    ~MainWindow();
 private:
    // Connects the menu and button signals to their handlers.
    void initconnections();
    Ui::MainWindow *ui;   // generated UI object; created in the constructor, deleted in the destructor
    QString m_action;     // text of the menu entry chosen last (empty until one is chosen)
 };
 #endif // MAINWINDOW_H
--- a/Show More
+++ b/Show More
		`@ -1 +0,0 @@`
			`Subproject commit f7aa56a30705c2635b0d4237efb635e8fee5022a`