Merge pull request #250 from ukui/main

merge from main
2021-05-31 18:47:43 +08:00 · 2021-05-31 18:47:43 +08:00 · 1117d75025
parent 691981bcc0 83ff5325f6
commit 1117d75025
49 changed files with 4347 additions and 2773 deletions
--- a/libchinese-segmentation/chinese-segmentation.cpp
+++ b/libchinese-segmentation/chinese-segmentation.cpp
@ -30,12 +30,12 @@ ChineseSegmentation::ChineseSegmentation() {
    const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
    const char * const  IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
    const char * const  STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
-
    m_jieba = new cppjieba::Jieba(DICT_PATH,
                                  HMM_PATH,
                                  USER_DICT_PATH,
                                  IDF_PATH,
-                                  STOP_WORD_PATH);
+                                  STOP_WORD_PATH,
+                                  "");
 }

 ChineseSegmentation::~ChineseSegmentation() {
@ -72,6 +72,15 @@ QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {

 }

+/**
+ * Runs the jieba keyword extractor on `str` and returns the extracted words
+ * as cppjieba values (no conversion to SKeyWord / QVector).
+ *
+ * @param str text to analyse
+ * @return the words written by the extractor
+ */
+std::vector<cppjieba::KeywordExtractor::Word> ChineseSegmentation::callSegementStd(const std::string &str) {
+    // -1 converted to size_t is the largest representable value; it is passed as the extractor's topk argument.
+    const size_t unlimited_topk = static_cast<size_t>(-1);
+
+    std::vector<cppjieba::KeywordExtractor::Word> extracted;
+    m_jieba->extractor.Extract(str, extracted, unlimited_topk);
+    return extracted;
+}
+
 void ChineseSegmentation::convert(std::vector<cppjieba::KeywordExtractor::Word> &keywordres, QVector<SKeyWord> &kw) {
    for(auto i : keywordres) {
        SKeyWord temp;
--- a/libchinese-segmentation/chinese-segmentation.h
+++ b/libchinese-segmentation/chinese-segmentation.h
@ -48,6 +48,9 @@ public:
    static ChineseSegmentation *getInstance();
    ~ChineseSegmentation();
    QVector<SKeyWord> callSegement(std::string s);
+    // Added callSegementStd: returns std::vector<cppjieba::KeywordExtractor::Word> instead of QVector<SKeyWord> and simplifies the internal processing. --jxx20210517
+    // The argument is now taken by reference and the QString <-> std::string conversion code was removed. --jxx20210519
+    std::vector<cppjieba::KeywordExtractor::Word> callSegementStd(const std::string& str);
    void convert(std::vector<cppjieba::KeywordExtractor::Word>& keywordres, QVector<SKeyWord>& kw);
 private:
    static QMutex m_mutex;
--- a/libchinese-segmentation/cppjieba/DatTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DatTrie.hpp
@ -0,0 +1,286 @@
+#pragma once
+
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <QDebug>
+
+#include <algorithm>
+#include <utility>
+
+#include "limonp/Md5.hpp"
+#include "Unicode.hpp"
+#include "darts.h"
+
+namespace cppjieba {
+
+using std::pair;
+
+/// Dictionary entry in its in-memory, string-based form (word, tag and weight).
+struct DatElement {
+    string word;
+    string tag;
+    double weight = 0;
+
+    /// Sorts by word ascending; entries sharing a word are ordered by weight descending.
+    bool operator < (const DatElement & b) const {
+        if (word != b.word) {
+            return word < b.word;
+        }
+
+        return weight > b.weight;
+    }
+};
+
+/// Writes a DatElement as "word=<word>/tag=<tag>/weight=<weight>".
+inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
+    os << "word=" << elem.word;
+    os << "/tag=" << elem.tag;
+    os << "/weight=" << elem.weight;
+    return os;
+}
+
+/// Fixed-size record (weight + short tag) stored for every dictionary entry in the cache file.
+struct DatMemElem {
+    double weight = 0.0;
+    char tag[8] = {};
+
+    /// Copies at most sizeof(tag) - 1 bytes of `str`; the buffer is zeroed first, so the tag stays NUL-terminated.
+    void SetTag(const string & str) {
+        const size_t copy_len = std::min(str.size(), sizeof(tag) - 1);
+        memset(tag, 0, sizeof(tag));
+        strncpy(tag, str.c_str(), copy_len);
+    }
+
+    /// Returns the stored tag, read up to its first NUL byte.
+    string GetTag() const {
+        return string(tag);
+    }
+};
+
+/// Writes a DatMemElem as "/tag=<tag>/weight=<weight>".
+inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
+    os << "/tag=" << elem.GetTag();
+    os << "/weight=" << elem.weight;
+    return os;
+}
+
+/// One node of the word graph produced by DatTrie::Find, one per rune of the input.
+struct DatDag {
+    /// Outgoing edges as (index one past the word's last rune, dictionary record).
+    /// The record is nullptr when the single-rune edge has no dictionary entry.
+    limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
+    // Not written by DatTrie::Find; presumably filled in by the segmenters — TODO confirm.
+    // Initialized like the members of the sibling structs so a default-constructed DatDag is never indeterminate.
+    double max_weight = 0.0;
+    int max_next = 0;
+};
+
+typedef Darts::DoubleArray JiebaDAT;
+
+
+// Header written at the start of the cache file. In the file it is followed by
+// `elements_num` DatMemElem records and then `dat_size` double-array units
+// (see the length check in DatTrie::InitAttachDat).
+struct CacheFileHeader {
+    // Exactly 32 characters copied from the MD5 string; no terminating NUL is stored.
+    char md5_hex[32] = {};
+    double min_weight = 0;
+    // Number of DatMemElem records that follow the header.
+    uint32_t elements_num = 0;
+    // Value of Darts size() for the double array, i.e. a unit count, not a byte count.
+    uint32_t dat_size = 0;
+};
+
+// The cache file is read by overlaying these structs on mapped memory, so their sizes are pinned.
+static_assert(sizeof(DatMemElem) == 16, "DatMemElem length invalid");
+// The header size must be a whole multiple of the record size.
+static_assert((sizeof(CacheFileHeader) % sizeof(DatMemElem)) == 0, "DatMemElem CacheFileHeader length equal");
+
+
+class DatTrie {
+public:
+    DatTrie() {}
+    /// Releases the cache mapping and its file descriptor when they were acquired.
+    ~DatTrie() {
+        // A trie that never attached has no mapping; skip munmap()/close()
+        // instead of issuing calls that can only fail with EINVAL / EBADF.
+        if ((nullptr != mmap_addr_) && (MAP_FAILED != mmap_addr_)) {
+            ::munmap(mmap_addr_, mmap_length_);
+        }
+
+        mmap_addr_ = nullptr;
+        mmap_length_ = 0;
+
+        if (mmap_fd_ >= 0) {
+            ::close(mmap_fd_);
+        }
+
+        mmap_fd_ = -1;
+    }
+
+    /// Exact-match lookup of `key`.
+    /// Returns the record for the key, or nullptr when there is no match or the
+    /// stored index lies outside the attached record array.
+    const DatMemElem * Find(const string & key) const {
+        JiebaDAT::result_pair_type hit;
+        dat_.exactMatchSearch(key.c_str(), hit);
+
+        const bool matched = (0 != hit.length);
+        const bool index_valid = (hit.value >= 0) && (hit.value < elements_num_);
+
+        if (matched && index_valid) {
+            return &elements_ptr_[ hit.value ];
+        }
+
+        return nullptr;
+    }
+
+    /**
+     * Builds the word graph for the rune range [begin, end).
+     *
+     * `res` is cleared and resized to one DatDag per rune. For rune i,
+     * nexts[0] is always the single-rune edge (i + 1, record-or-nullptr);
+     * every dictionary word of 2..max_word_len runes starting at i adds an
+     * edge (i + word_length, record).
+     *
+     * @param begin        first rune of the text
+     * @param end          one past the last rune
+     * @param res          output graph
+     * @param max_word_len words longer than this many runes are ignored
+     */
+    void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res, size_t max_word_len) const {
+
+        res.clear();
+        res.resize(end - begin);
+
+        string text_str;
+        EncodeRunesToString(begin, end, text_str);
+
+        static const size_t max_num = 128;
+        JiebaDAT::result_pair_type result_pairs[max_num] = {};
+
+        for (size_t i = 0, begin_pos = 0; i < size_t(end - begin); i++) {
+
+            // commonPrefixSearch() reports the total number of matches, which may
+            // exceed the capacity it was given; only the first max_num entries of
+            // result_pairs are filled, so never iterate past them.
+            const std::size_t reported = dat_.commonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
+            const std::size_t num_results = std::min(reported, max_num);
+
+            res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));
+
+            for (std::size_t idx = 0; idx < num_results; ++idx) {
+                auto & match = result_pairs[idx];
+
+                if ((match.value < 0) || (match.value >= elements_num_)) {
+                    continue;
+                }
+
+                auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
+
+                if (char_num > max_word_len) {
+                    continue;
+                }
+
+                auto pValue = &elements_ptr_[match.value];
+
+                // A single-rune word fills in the edge created above instead of adding a second one.
+                if (1 == char_num) {
+                    res[i].nexts[0].second = pValue;
+                    continue;
+                }
+
+                res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
+            }
+
+            // Advance the byte offset by the encoded size of rune i.
+            begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
+        }
+    }
+
+    /// Returns the minimum weight currently stored in this trie.
+    double GetMinWeight() const {
+        const double current = min_weight_;
+        return current;
+    }
+
+    /// Stores `d` as the minimum weight; BuildDatCache writes it into the cache header.
+    void SetMinWeight(double d) {
+        this->min_weight_ = d;
+    }
+
+    /// Writes a fresh cache file for `elements`, then attaches to it.
+    /// @return the result of attaching to the file that was just written
+    bool InitBuildDat(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
+        BuildDatCache(elements, dat_cache_file, md5);
+
+        const bool attached = InitAttachDat(dat_cache_file, md5);
+        return attached;
+    }
+
+    /**
+     * Maps an existing cache file read-only and points the trie at its contents.
+     *
+     * The file is accepted only when it is at least one header long, its stored
+     * MD5 equals `md5`, and its length matches the sizes recorded in the header.
+     * All checks run in release builds too, and every failure path releases the
+     * descriptor and the mapping, so a rejected file (for example a stale cache
+     * with a different MD5) no longer leaks them. The object is modified only on
+     * success; a mapping held from an earlier successful call is released then.
+     *
+     * @param dat_cache_file path of the cache file
+     * @param md5            expected MD5 hex string of the source dictionaries
+     * @return true when the cache was attached, false otherwise
+     */
+    bool InitAttachDat(const string & dat_cache_file, const string & md5) {
+        const int fd = ::open(dat_cache_file.c_str(), O_RDONLY);
+
+        if (fd < 0) {
+            return false;
+        }
+
+        const auto seek_off = ::lseek(fd, 0, SEEK_END);
+
+        // mmap() rejects a zero length, and a file shorter than the header cannot be a cache.
+        if ((seek_off < 0) || (static_cast<size_t>(seek_off) < sizeof(CacheFileHeader))) {
+            ::close(fd);
+            return false;
+        }
+
+        const size_t length = static_cast<size_t>(seek_off);
+        void * const mapped = ::mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
+
+        if (MAP_FAILED == mapped) {
+            ::close(fd);
+            return false;
+        }
+
+        char * const addr = reinterpret_cast<char *>(mapped);
+        const CacheFileHeader & header = *reinterpret_cast<const CacheFileHeader *>(addr);
+
+        const bool md5_matches = (sizeof(header.md5_hex) == md5.size())
+                                 && (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()));
+        const size_t expected_length = sizeof(header)
+                                       + header.elements_num * sizeof(DatMemElem)
+                                       + header.dat_size * dat_.unit_size();
+
+        if ((!md5_matches) || (length != expected_length)) {
+            ::munmap(mapped, length);
+            ::close(fd);
+            return false;
+        }
+
+        // Keep the previous mapping alive until dat_ has been re-pointed at the new one.
+        char * const old_addr = mmap_addr_;
+        const size_t old_length = mmap_length_;
+        const int old_fd = mmap_fd_;
+
+        mmap_fd_ = fd;
+        mmap_addr_ = addr;
+        mmap_length_ = length;
+        elements_num_ = header.elements_num;
+        min_weight_ = header.min_weight;
+        elements_ptr_ = reinterpret_cast<const DatMemElem *>(addr + sizeof(header));
+        const char * dat_ptr = addr + sizeof(header) + sizeof(DatMemElem) * elements_num_;
+        dat_.set_array(dat_ptr, header.dat_size);
+
+        if ((nullptr != old_addr) && (MAP_FAILED != old_addr)) {
+            ::munmap(old_addr, old_length);
+        }
+
+        if (old_fd >= 0) {
+            ::close(old_fd);
+        }
+
+        return true;
+    }
+
+private:
+    /**
+     * Sorts `elements`, builds the double array from their words and writes
+     * header + records + array to a temporary file that is then renamed to
+     * `dat_cache_file`.
+     *
+     * On any I/O failure the temporary file is removed and `dat_cache_file` is
+     * left as it was, so the InitAttachDat call that follows reports the problem.
+     *
+     * @param elements       dictionary entries; sorted in place
+     * @param dat_cache_file destination path of the cache file
+     * @param md5            MD5 hex string stored in the header (expected to be 32 characters)
+     */
+    void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
+        std::sort(elements.begin(), elements.end());
+
+        vector<const char*> keys_ptr_vec;
+        vector<int> values_vec;
+        vector<DatMemElem> mem_elem_vec;
+
+        keys_ptr_vec.reserve(elements.size());
+        values_vec.reserve(elements.size());
+        mem_elem_vec.reserve(elements.size());
+
+        CacheFileHeader header;
+        header.min_weight = min_weight_;
+        assert(sizeof(header.md5_hex) == md5.size());
+        // Bounded copy: the assert above is compiled out in release builds.
+        memcpy(&header.md5_hex[0], md5.c_str(), std::min(md5.size(), sizeof(header.md5_hex)));
+
+        for (size_t i = 0; i < elements.size(); ++i) {
+            keys_ptr_vec.push_back(elements[i].word.data());
+            values_vec.push_back(i);
+            mem_elem_vec.push_back(DatMemElem());
+            auto & mem_elem = mem_elem_vec.back();
+            mem_elem.weight = elements[i].weight;
+            mem_elem.SetTag(elements[i].tag);
+        }
+
+        auto const ret = dat_.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
+        assert(0 == ret);
+
+        if (0 != ret) {
+            return;
+        }
+
+        header.elements_num = mem_elem_vec.size();
+        header.dat_size = dat_.size();
+
+        string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+        // The umask is process-wide: tighten it only around mkstemp() and restore it afterwards.
+        const mode_t old_mask = ::umask(S_IWGRP | S_IWOTH);
+        // Previously written as ::mkstemp(&tmp_filepath[0]); changed by --jxx20210519.
+        const int fd = ::mkstemp(const_cast<char *>(tmp_filepath.data()));
+        const int mkstemp_errno = errno;
+        ::umask(old_mask);
+
+        if (fd < 0) {
+            // Report only real failures; the message used to be printed on success as well.
+            qDebug() << "mkstemp error:" << mkstemp_errno << tmp_filepath.data();
+            return;
+        }
+
+        ::fchmod(fd, 0644);
+
+        // write() may be interrupted or write fewer bytes than asked; loop until done.
+        const auto write_all = [fd](const void * data, size_t len) -> bool {
+            const char * cursor = static_cast<const char *>(data);
+
+            while (len > 0) {
+                const ssize_t written = ::write(fd, cursor, len);
+
+                if (written < 0) {
+                    if (EINTR == errno) {
+                        continue;
+                    }
+
+                    return false;
+                }
+
+                if (0 == written) {
+                    return false;
+                }
+
+                cursor += written;
+                len -= static_cast<size_t>(written);
+            }
+
+            return true;
+        };
+
+        const bool payload_ok = write_all(&header, sizeof(header))
+                                && write_all(mem_elem_vec.data(), sizeof(DatMemElem) * mem_elem_vec.size())
+                                && write_all(dat_.array(), dat_.total_size());
+        const bool close_ok = (0 == ::close(fd));
+
+        // Publish the file only when it is complete; otherwise drop the temporary.
+        if ((!payload_ok) || (!close_ok) || (0 != ::rename(tmp_filepath.c_str(), dat_cache_file.c_str()))) {
+            ::unlink(tmp_filepath.c_str());
+        }
+    }
+
+    // Copying is disabled: each DatTrie holds its own file descriptor and mapping.
+    DatTrie(const DatTrie &) = delete;
+    DatTrie &operator=(const DatTrie &) = delete;
+
+private:
+    JiebaDAT dat_;
+    const DatMemElem * elements_ptr_ = nullptr;
+    size_t elements_num_ = 0;
+    double min_weight_ = 0;
+
+    int mmap_fd_ = -1;
+    size_t mmap_length_ = 0;
+    char * mmap_addr_ = nullptr;
+};
+
+
+/**
+ * Computes one MD5 over the contents of every readable file in `files_list`.
+ *
+ * @param files_list    file paths separated by '|' or ';'
+ * @param file_size_sum receives the total number of bytes that were hashed
+ * @return the digest as produced in md5.digestChars
+ *
+ * Files that cannot be opened or mapped, and empty files, contribute nothing
+ * to the digest or to the size sum.
+ */
+inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum) {
+    limonp::MD5 md5;
+
+    const auto files = limonp::Split(files_list, "|;");
+    file_size_sum = 0;
+
+    for (auto const & local_path : files) {
+        const int fd = ::open(local_path.c_str(), O_RDONLY);
+
+        if (fd < 0) {
+            continue;
+        }
+
+        auto const len = ::lseek(fd, 0, SEEK_END);
+
+        if (len > 0) {
+            void * addr = ::mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+
+            // Checked at run time: an assert would vanish in release builds and
+            // MAP_FAILED would then be handed to md5.Update().
+            if (MAP_FAILED != addr) {
+                md5.Update((unsigned char *) addr, len);
+                file_size_sum += len;
+
+                ::munmap(addr, len);
+            }
+        }
+
+        ::close(fd);
+    }
+
+    md5.Final();
+    return string(md5.digestChars);
+}
+
+}
--- a/libchinese-segmentation/cppjieba/DictTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DictTrie.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_DICT_TRIE_HPP
-#define CPPJIEBA_DICT_TRIE_HPP
+#pragma once

 #include <iostream>
 #include <fstream>
@ -31,8 +12,8 @@
 #include "limonp/StringUtil.hpp"
 #include "limonp/Logging.hpp"
 #include "Unicode.hpp"
-#include "Trie.hpp"
-
+#include "DatTrie.hpp"
+#include <QDebug>
 namespace cppjieba {

 using namespace limonp;
@ -50,58 +31,22 @@ public:
        WordWeightMax,
    }; // enum UserWordWeightOption

-    DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
-        Init(dict_path, user_dict_paths, user_word_weight_opt);
+    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "",
+             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+        Init(dict_path, user_dict_paths, dat_cache_path, user_word_weight_opt);
    }

-    ~DictTrie() {
-        delete trie_;
-    }
+    ~DictTrie() {}

-    bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
-        DictUnit node_info;
-        if(!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
-            return false;
-        }
-        active_node_infos_.push_back(node_info);
-        trie_->InsertNode(node_info.word, &active_node_infos_.back());
-        return true;
-    }
-
-    bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
-        DictUnit node_info;
-        double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
-        if(!MakeNodeInfo(node_info, word, weight, tag)) {
-            return false;
-        }
-        active_node_infos_.push_back(node_info);
-        trie_->InsertNode(node_info.word, &active_node_infos_.back());
-        return true;
-    }
-
-    const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
-        return trie_->Find(begin, end);
+    const DatMemElem* Find(const string & word) const {
+        return dat_.Find(word);
    }

    void Find(RuneStrArray::const_iterator begin,
              RuneStrArray::const_iterator end,
-              vector<struct Dag>&res,
+              vector<struct DatDag>&res,
              size_t max_word_len = MAX_WORD_LENGTH) const {
-        trie_->Find(begin, end, res, max_word_len);
-    }
-
-    bool Find(const string& word) {
-        const DictUnit *tmp = NULL;
-        RuneStrArray runes;
-        if(!DecodeRunesInString(word, runes)) {
-            XLOG(ERROR) << "Decode failed.";
-        }
-        tmp = Find(runes.begin(), runes.end());
-        if(tmp == NULL) {
-            return false;
-        } else {
-            return true;
-        }
+        dat_.Find(begin, end, res, max_word_len);
    }

    bool IsUserDictSingleChineseWord(const Rune& word) const {
@ -109,182 +54,176 @@ public:
    }

    double GetMinWeight() const {
-        return min_weight_;
+        return dat_.GetMinWeight();
    }

-    void InserUserDictNode(const string& line) {
+    size_t GetTotalDictSize() const {
+        return total_dict_size_;
+    }
+
+    /**
+     * Parses one user-dictionary line: "word", "word tag" or "word freq tag"
+     * (fields separated by a single space).
+     *
+     * The weight defaults to user_word_default_weight_ and the tag to
+     * UNKNOWN_TAG. For the three-field form the tag is always taken from the
+     * line; the weight becomes log(freq / freq_sum_) only when both values are
+     * positive, otherwise the default weight is kept (the removed
+     * InsertUserWord treated freq == 0 the same way).
+     *
+     * @param line         raw dictionary line
+     * @param saveNodeInfo when true the entry is appended to static_node_infos_;
+     *                     single-rune words are recorded in
+     *                     user_dict_single_chinese_word_ either way
+     */
+    void InserUserDictNode(const string& line, bool saveNodeInfo = true) {
         vector<string> buf;
-        DictUnit node_info;
+        DatElement node_info;
         Split(line, buf, " ");
-        if(buf.size() == 1) {
-            MakeNodeInfo(node_info,
-                         buf[0],
-                         user_word_default_weight_,
-                         UNKNOWN_TAG);
-        } else if(buf.size() == 2) {
-            MakeNodeInfo(node_info,
-                         buf[0],
-                         user_word_default_weight_,
-                         buf[1]);
-        } else if(buf.size() == 3) {
-            int freq = atoi(buf[1].c_str());
-            assert(freq_sum_ > 0.0);
-            double weight = log(1.0 * freq / freq_sum_);
-            MakeNodeInfo(node_info, buf[0], weight, buf[2]);
+
+        if (buf.size() == 0) {
+            return;
         }
-        static_node_infos_.push_back(node_info);
-        if(node_info.word.size() == 1) {
-            user_dict_single_chinese_word_.insert(node_info.word[0]);
+
+        node_info.word = buf[0];
+        node_info.weight = user_word_default_weight_;
+        node_info.tag = UNKNOWN_TAG;
+
+        if (buf.size() == 2) {
+            node_info.tag = buf[1];
+        } else if (buf.size() == 3) {
+            // The tag does not depend on whether a weight can be computed.
+            node_info.tag = buf[2];
+            const int freq = atoi(buf[1].c_str());
+
+            // log() of zero or a negative ratio yields -inf / NaN; keep the default weight instead.
+            if ((freq_sum_ > 0.0) && (freq > 0)) {
+                node_info.weight = log(1.0 * freq / freq_sum_);
+            }
+        }
+
+        if (saveNodeInfo) {
+            static_node_infos_.push_back(node_info);
+        }
+
+        if (Utf8CharNum(node_info.word) == 1) {
+            RuneArray word;
+
+            if (DecodeRunesInString(node_info.word, word)) {
+                user_dict_single_chinese_word_.insert(word[0]);
+            } else {
+                XLOG(ERROR) << "Decode " << node_info.word << " failed.";
+            }
         }
     }

-    void LoadUserDict(const vector<string>& buf) {
-        for(size_t i = 0; i < buf.size(); i++) {
-            InserUserDictNode(buf[i]);
-        }
-    }
-
-    void LoadUserDict(const set<string>& buf) {
-        std::set<string>::const_iterator iter;
-        for(iter = buf.begin(); iter != buf.end(); iter++) {
-            InserUserDictNode(*iter);
-        }
-    }
-
-    void LoadUserDict(const string& filePaths) {
+    void LoadUserDict(const string& filePaths, bool saveNodeInfo = true) {
        vector<string> files = limonp::Split(filePaths, "|;");
-        size_t lineno = 0;
-        for(size_t i = 0; i < files.size(); i++) {
+
+        for (size_t i = 0; i < files.size(); i++) {
            ifstream ifs(files[i].c_str());
            XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
            string line;

-            for(; getline(ifs, line); lineno++) {
-                if(line.size() == 0) {
+            for (; getline(ifs, line);) {
+                if (line.size() == 0) {
                    continue;
                }
-                InserUserDictNode(line);
+
+                InserUserDictNode(line, saveNodeInfo);
            }
        }
    }


 private:
-    void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
-        LoadDict(dict_path);
+    /**
+     * Loads the dictionary, preferring an existing cache file.
+     *
+     * An MD5 over the dictionary and user-dictionary files identifies the
+     * cache. When a cache with that MD5 attaches, only the user dictionaries
+     * are re-read (to rebuild user_dict_single_chinese_word_); otherwise the
+     * dictionaries are parsed, weighted and written to a new cache file.
+     *
+     * @param dict_path            main dictionary file
+     * @param user_dict_paths      user dictionary files separated by '|' or ';'
+     * @param dat_cache_path       cache file path; when empty a name under /tmp is derived from the MD5
+     * @param user_word_weight_opt which weight user words without a frequency receive
+     */
+    void Init(const string& dict_path, const string& user_dict_paths, string dat_cache_path,
+              UserWordWeightOption user_word_weight_opt) {
+        // The cache-hit path below never computes these, yet LoadUserDict reads
+        // both through InserUserDictNode; give them defined values first.
+        freq_sum_ = 0.0;
+        user_word_default_weight_ = 0.0;
+
+        const auto dict_list = dict_path + "|" + user_dict_paths;
+        size_t file_size_sum = 0;
+        const string md5 = CalcFileListMD5(dict_list, file_size_sum);
+
+        if (dat_cache_path.empty()) {
+            // When no cache location is given, the cache file is stored under /tmp by default. --jxx20200519
+            // NOTE(review): /tmp is world-writable and this name is predictable, so another
+            // local user could pre-create a file that passes the MD5 check. Consider a
+            // per-user cache directory.
+            dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) +  ".dat_cache";
+        }
+        QString path = QString::fromStdString(dat_cache_path);
+        qDebug() << "#########path:" << path;
+        if (dat_.InitAttachDat(dat_cache_path, md5)) {
+            LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
+            total_dict_size_ = file_size_sum;
+            return;
+        }
+
+        LoadDefaultDict(dict_path);
         freq_sum_ = CalcFreqSum(static_node_infos_);
         CalculateWeight(static_node_infos_, freq_sum_);
-        SetStaticWordWeights(user_word_weight_opt);
+        double min_weight = 0;
+        SetStaticWordWeights(user_word_weight_opt, min_weight);
+        dat_.SetMinWeight(min_weight);

-        if(user_dict_paths.size()) {
-            LoadUserDict(user_dict_paths);
-        }
-        Shrink(static_node_infos_);
-        CreateTrie(static_node_infos_);
+        LoadUserDict(user_dict_paths);
+        const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
+        assert(build_ret);
+        total_dict_size_ = file_size_sum;
+        // Swap with an empty vector to release the parsed entries; the cache is their home now.
+        vector<DatElement>().swap(static_node_infos_);
     }

-    void CreateTrie(const vector<DictUnit>& dictUnits) {
-        assert(dictUnits.size());
-        vector<Unicode> words;
-        vector<const DictUnit*> valuePointers;
-        for(size_t i = 0 ; i < dictUnits.size(); i ++) {
-            words.push_back(dictUnits[i].word);
-            valuePointers.push_back(&dictUnits[i]);
-        }
-
-        trie_ = new Trie(words, valuePointers);
-    }
-
-
-
-
-    bool MakeNodeInfo(DictUnit& node_info,
-                      const string& word,
-                      double weight,
-                      const string& tag) {
-        if(!DecodeRunesInString(word, node_info.word)) {
-            XLOG(ERROR) << "Decode " << word << " failed.";
-            return false;
-        }
-        node_info.weight = weight;
-        node_info.tag = tag;
-        return true;
-    }
-
-    void LoadDict(const string& filePath) {
+    void LoadDefaultDict(const string& filePath) {
        ifstream ifs(filePath.c_str());
        XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
        string line;
        vector<string> buf;

-        DictUnit node_info;
-        for(size_t lineno = 0; getline(ifs, line); lineno++) {
+        for (; getline(ifs, line);) {
            Split(line, buf, " ");
            XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
-            MakeNodeInfo(node_info,
-                         buf[0],
-                         atof(buf[1].c_str()),
-                         buf[2]);
+            DatElement node_info;
+            node_info.word = buf[0];
+            node_info.weight = atof(buf[1].c_str());
+            node_info.tag = buf[2];
            static_node_infos_.push_back(node_info);
        }
    }

-    static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
+    static bool WeightCompare(const DatElement& lhs, const DatElement& rhs) {
        return lhs.weight < rhs.weight;
    }

-    void SetStaticWordWeights(UserWordWeightOption option) {
+    void SetStaticWordWeights(UserWordWeightOption option, double & min_weight) {
        XCHECK(!static_node_infos_.empty());
-        vector<DictUnit> x = static_node_infos_;
+        vector<DatElement> x = static_node_infos_;
        sort(x.begin(), x.end(), WeightCompare);
-        min_weight_ = x[0].weight;
-        max_weight_ = x[x.size() - 1].weight;
-        median_weight_ = x[x.size() / 2].weight;
-        switch(option) {
-        case WordWeightMin:
-            user_word_default_weight_ = min_weight_;
-            break;
-        case WordWeightMedian:
-            user_word_default_weight_ = median_weight_;
-            break;
-        default:
-            user_word_default_weight_ = max_weight_;
-            break;
+        if(x.empty()){
+            return;
+        }
+        min_weight = x[0].weight;
+        const double max_weight_ = x[x.size() - 1].weight;
+        const double median_weight_ = x[x.size() / 2].weight;
+
+        switch (option) {
+            case WordWeightMin:
+                user_word_default_weight_ = min_weight;
+                break;
+
+            case WordWeightMedian:
+                user_word_default_weight_ = median_weight_;
+                break;
+
+            default:
+                user_word_default_weight_ = max_weight_;
+                break;
        }
    }

-    double CalcFreqSum(const vector<DictUnit>& node_infos) const {
+    double CalcFreqSum(const vector<DatElement>& node_infos) const {
        double sum = 0.0;
-        for(size_t i = 0; i < node_infos.size(); i++) {
+
+        for (size_t i = 0; i < node_infos.size(); i++) {
            sum += node_infos[i].weight;
        }
+
        return sum;
    }

-    void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
-        assert(sum > 0.0);
-        for(size_t i = 0; i < node_infos.size(); i++) {
-            DictUnit& node_info = node_infos[i];
+    void CalculateWeight(vector<DatElement>& node_infos, double sum) const {
+        for (size_t i = 0; i < node_infos.size(); i++) {
+            DatElement& node_info = node_infos[i];
            assert(node_info.weight > 0.0);
            node_info.weight = log(double(node_info.weight) / sum);
        }
    }

-    void Shrink(vector<DictUnit>& units) const {
-        vector<DictUnit>(units.begin(), units.end()).swap(units);
-    }
-
-    vector<DictUnit> static_node_infos_;
-    deque<DictUnit> active_node_infos_; // must not be vector
-    Trie * trie_;
+private:
+    vector<DatElement> static_node_infos_;
+    size_t total_dict_size_ = 0;
+    DatTrie dat_;

    double freq_sum_;
-    double min_weight_;
-    double max_weight_;
-    double median_weight_;
    double user_word_default_weight_;
    unordered_set<Rune> user_dict_single_chinese_word_;
 };
 }

-#endif
--- a/libchinese-segmentation/cppjieba/FullSegment.hpp
+++ b/libchinese-segmentation/cppjieba/FullSegment.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_FULLSEGMENT_H
-#define CPPJIEBA_FULLSEGMENT_H
+#pragma once

 #include <algorithm>
 #include <set>
@ -30,82 +11,45 @@
 namespace cppjieba {
 class FullSegment: public SegmentBase {
 public:
-    FullSegment(const string& dictPath) {
-        dictTrie_ = new DictTrie(dictPath);
-        isNeedDestroy_ = true;
-    }
    FullSegment(const DictTrie* dictTrie)
-        : dictTrie_(dictTrie), isNeedDestroy_(false) {
+        : dictTrie_(dictTrie) {
        assert(dictTrie_);
    }
-    ~FullSegment() {
-        if(isNeedDestroy_) {
-            delete dictTrie_;
-        }
-    }
-    void Cut(const string& sentence,
-             vector<string>& words) const {
-        vector<Word> tmp;
-        Cut(sentence, tmp);
-        GetStringsFromWords(tmp, words);
-    }
-    void Cut(const string& sentence,
-             vector<Word>& words) const {
-        PreFilter pre_filter(symbols_, sentence);
-        PreFilter::Range range;
-        vector<WordRange> wrs;
-        wrs.reserve(sentence.size() / 2);
-        while(pre_filter.HasNext()) {
-            range = pre_filter.Next();
-            Cut(range.begin, range.end, wrs);
-        }
-        words.clear();
-        words.reserve(wrs.size());
-        GetWordsFromWordRanges(sentence, wrs, words);
-    }
-    void Cut(RuneStrArray::const_iterator begin,
-             RuneStrArray::const_iterator end,
-             vector<WordRange>& res) const {
-        // result of searching in trie tree
-        LocalVector<pair<size_t, const DictUnit*> > tRes;
+    ~FullSegment() { }

-        // max index of res's words
-        size_t maxIdx = 0;
-
-        // always equals to (uItr - begin)
-        size_t uIdx = 0;
-
-        // tmp variables
-        size_t wordLen = 0;
+    virtual void Cut(RuneStrArray::const_iterator begin,
+                     RuneStrArray::const_iterator end,
+                     vector<WordRange>& res, bool, size_t) const override {
        assert(dictTrie_);
-        vector<struct Dag> dags;
+        vector<struct DatDag> dags;
        dictTrie_->Find(begin, end, dags);
-        for(size_t i = 0; i < dags.size(); i++) {
-            for(size_t j = 0; j < dags[i].nexts.size(); j++) {
-                size_t nextoffset = dags[i].nexts[j].first;
+        size_t max_word_end_pos = 0;
+
+        for (size_t i = 0; i < dags.size(); i++) {
+            for (const auto & kv : dags[i].nexts) {
+                const size_t nextoffset = kv.first - 1;
                assert(nextoffset < dags.size());
-                const DictUnit* du = dags[i].nexts[j].second;
-                if(du == NULL) {
-                    if(dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
-                        WordRange wr(begin + i, begin + nextoffset);
-                        res.push_back(wr);
-                    }
-                } else {
-                    wordLen = du->word.size();
-                    if(wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
-                        WordRange wr(begin + i, begin + nextoffset);
-                        res.push_back(wr);
-                    }
+                const auto wordLen = nextoffset - i + 1;
+                const bool is_not_covered_single_word = ((dags[i].nexts.size() == 1) && (max_word_end_pos <= i));
+                const bool is_oov = (nullptr == kv.second); //Out-of-Vocabulary
+
+                if ((is_not_covered_single_word) || ((not is_oov) && (wordLen >= 2))) {
+                    WordRange wr(begin + i, begin + nextoffset);
+                    res.push_back(wr);
                }
-                maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
+
+                max_word_end_pos = max(max_word_end_pos, nextoffset + 1);
            }
-            uIdx++;
        }
    }
+
+    // Intentionally empty override: every argument is ignored and `res` is left
+    // untouched, so FullSegment produces no output through this entry point.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+
+    }
+
 private:
    const DictTrie* dictTrie_;
-    bool isNeedDestroy_;
 };
 }

-#endif
--- a/libchinese-segmentation/cppjieba/HMMModel.hpp
+++ b/libchinese-segmentation/cppjieba/HMMModel.hpp
@ -1,26 +1,6 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_HMMMODEL_H
-#define CPPJIEBA_HMMMODEL_H
+#pragma once

 #include "limonp/StringUtil.hpp"
-#include "Trie.hpp"

 namespace cppjieba {

@ -59,16 +39,18 @@ struct HMMModel {
        XCHECK(GetLine(ifile, line));
        Split(line, tmp, " ");
        XCHECK(tmp.size() == STATUS_SUM);
-        for(size_t j = 0; j < tmp.size(); j++) {
+
+        for (size_t j = 0; j < tmp.size(); j++) {
            startProb[j] = atof(tmp[j].c_str());
        }

        //Load transProb
-        for(size_t i = 0; i < STATUS_SUM; i++) {
+        for (size_t i = 0; i < STATUS_SUM; i++) {
            XCHECK(GetLine(ifile, line));
            Split(line, tmp, " ");
            XCHECK(tmp.size() == STATUS_SUM);
-            for(size_t j = 0; j < STATUS_SUM; j++) {
+
+            for (size_t j = 0; j < tmp.size(); j++) {
                transProb[i][j] = atof(tmp[j].c_str());
            }
        }
@ -92,43 +74,55 @@ struct HMMModel {
    double GetEmitProb(const EmitProbMap* ptMp, Rune key,
                       double defVal)const {
        EmitProbMap::const_iterator cit = ptMp->find(key);
-        if(cit == ptMp->end()) {
+        // Look key up in *ptMp (ptMp is dereferenced unchecked); a rune without an entry yields the caller-supplied defVal.
+        if (cit == ptMp->end()) {
            return defVal;
        }
+        // Entry found: return the value stored for key.
        return cit->second;
    }
    bool GetLine(ifstream& ifile, string& line) {
-        while(getline(ifile, line)) {
+        while (getline(ifile, line)) {
            Trim(line);
-            if(line.empty()) {
+            // Skip lines that are empty once Trim has been applied.
+            if (line.empty()) {
                continue;
            }
-            if(StartsWith(line, "#")) {
+            // Skip lines that begin with '#'.
+            if (StartsWith(line, "#")) {
                continue;
            }
+
            return true;
        }
+        // getline failed before a line passing both filters was found.
        return false;
    }
    bool LoadEmitProb(const string& line, EmitProbMap& mp) {
-        if(line.empty()) {
+        if (line.empty()) {
            return false;
        }
+        // line is split on ',' into items, and each item on ':' into a key field and a value field.
        vector<string> tmp, tmp2;
-        Unicode unicode;
+        RuneArray unicode;
        Split(line, tmp, ",");
-        for(size_t i = 0; i < tmp.size(); i++) {
+
+        for (size_t i = 0; i < tmp.size(); i++) {
            Split(tmp[i], tmp2, ":");
-            if(2 != tmp2.size()) {
+            // Each item must split into exactly two fields; otherwise loading is abandoned.
+            if (2 != tmp2.size()) {
                XLOG(ERROR) << "emitProb illegal.";
                return false;
            }
-            if(!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
+            // The key field must decode to exactly one rune.
+            if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
                XLOG(ERROR) << "TransCode failed.";
                return false;
            }
+            // The value is parsed with atof; a repeated key overwrites the earlier entry in mp.
            mp[unicode[0]] = atof(tmp2[1].c_str());
        }
+
        return true;
    }

@ -144,4 +138,3 @@ struct HMMModel {

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/HMMSegment.hpp
+++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIBEA_HMMSEGMENT_H
-#define CPPJIBEA_HMMSEGMENT_H
+#pragma once

 #include <iostream>
 #include <fstream>
@ -29,58 +10,40 @@
 namespace cppjieba {
 class HMMSegment: public SegmentBase {
 public:
-    HMMSegment(const string& filePath)
-        : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
-    }
    HMMSegment(const HMMModel* model)
-        : model_(model), isNeedDestroy_(false) {
-    }
-    ~HMMSegment() {
-        if(isNeedDestroy_) {
-            delete model_;
-        }
+        : model_(model) {
    }
+    ~HMMSegment() { }

-    void Cut(const string& sentence,
-             vector<string>& words) const {
-        vector<Word> tmp;
-        Cut(sentence, tmp);
-        GetStringsFromWords(tmp, words);
-    }
-    void Cut(const string& sentence,
-             vector<Word>& words) const {
-        PreFilter pre_filter(symbols_, sentence);
-        PreFilter::Range range;
-        vector<WordRange> wrs;
-        wrs.reserve(sentence.size() / 2);
-        while(pre_filter.HasNext()) {
-            range = pre_filter.Next();
-            Cut(range.begin, range.end, wrs);
-        }
-        words.clear();
-        words.reserve(wrs.size());
-        GetWordsFromWordRanges(sentence, wrs, words);
-    }
-    void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
+                     size_t) const override {
        RuneStrArray::const_iterator left = begin;
        RuneStrArray::const_iterator right = begin;
-        while(right != end) {
-            if(right->rune < 0x80) {
-                if(left != right) {
+
+        while (right != end) {
+            if (right->rune < 0x80) {
+                if (left != right) {
                    InternalCut(left, right, res);
                }
+
                left = right;
+
                do {
                    right = SequentialLetterRule(left, end);
-                    if(right != left) {
+
+                    if (right != left) {
                        break;
                    }
+
                    right = NumbersRule(left, end);
-                    if(right != left) {
+
+                    if (right != left) {
                        break;
                    }
+
                    right ++;
-                } while(false);
+                } while (false);
+
                WordRange wr(left, right - 1);
                res.push_back(wr);
                left = right;
@ -88,45 +51,61 @@ public:
                right++;
            }
        }
-        if(left != right) {
+
+        if (left != right) {
            InternalCut(left, right, res);
        }
    }
+
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        // Empty override: no argument is read and res is left unchanged. NOTE(review): confirm HMM-only string output is intentionally unsupported.
+    }
+
 private:
    // sequential letters rule
-    RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
+                                                      RuneStrArray::const_iterator end) const {
        Rune x = begin->rune;
-        if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
+        // A run must start with an ASCII letter, else begin is returned unchanged. begin is dereferenced above without an end check.
+        if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
            begin ++;
        } else {
            return begin;
        }
-        while(begin != end) {
+        // Extend the run over following ASCII letters and digits.
+        while (begin != end) {
            x = begin->rune;
-            if(('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
+
+            if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
                begin ++;
            } else {
                break;
            }
        }
+        // Iterator one past the last rune of the run.
        return begin;
    }
    //
    RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
        Rune x = begin->rune;
-        if('0' <= x && x <= '9') {
+        // A run must start with an ASCII digit, else begin is returned unchanged. begin is dereferenced above without an end check.
+        if ('0' <= x && x <= '9') {
            begin ++;
        } else {
            return begin;
        }
-        while(begin != end) {
+        // Extend the run over following ASCII digits and '.' characters.
+        while (begin != end) {
            x = begin->rune;
-            if(('0' <= x && x <= '9') || x == '.') {
+
+            if (('0' <= x && x <= '9') || x == '.') {
                begin++;
            } else {
                break;
            }
        }
+        // Iterator one past the last rune of the run.
        return begin;
    }
    void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
@ -135,8 +114,9 @@ private:

        RuneStrArray::const_iterator left = begin;
        RuneStrArray::const_iterator right;
-        for(size_t i = 0; i < status.size(); i++) {
-            if(status[i] % 2) {  //if (HMMModel::E == status[i] || HMMModel::S == status[i])
+
+        for (size_t i = 0; i < status.size(); i++) {
+            if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
                right = begin + i + 1;
                WordRange wr(left, right - 1);
                res.push_back(wr);
@ -159,23 +139,25 @@ private:
        vector<double> weight(XYSize);

        //start
-        for(size_t y = 0; y < Y; y++) {
+        for (size_t y = 0; y < Y; y++) {
            weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
            path[0 + y * X] = -1;
        }

        double emitProb;

-        for(size_t x = 1; x < X; x++) {
-            for(size_t y = 0; y < Y; y++) {
+        for (size_t x = 1; x < X; x++) {
+            for (size_t y = 0; y < Y; y++) {
                now = x + y * X;
                weight[now] = MIN_DOUBLE;
                path[now] = HMMModel::E; // warning
                emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
-                for(size_t preY = 0; preY < Y; preY++) {
+
+                for (size_t preY = 0; preY < Y; preY++) {
                    old = x - 1 + preY * X;
                    tmp = weight[old] + model_->transProb[preY][y] + emitProb;
-                    if(tmp > weight[now]) {
+
+                    if (tmp > weight[now]) {
                        weight[now] = tmp;
                        path[now] = preY;
                    }
@ -186,23 +168,23 @@ private:
        endE = weight[X - 1 + HMMModel::E * X];
        endS = weight[X - 1 + HMMModel::S * X];
        stat = 0;
-        if(endE >= endS) {
+
+        if (endE >= endS) {
            stat = HMMModel::E;
        } else {
            stat = HMMModel::S;
        }

        status.resize(X);
-        for(int x = X - 1 ; x >= 0; x--) {
+
+        for (int x = X - 1 ; x >= 0; x--) {
            status[x] = stat;
            stat = path[x + stat * X];
        }
    }

    const HMMModel* model_;
-    bool isNeedDestroy_;
 }; // class HMMSegment

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/Jieba.hpp
+++ b/libchinese-segmentation/cppjieba/Jieba.hpp
@ -1,24 +1,6 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEAB_JIEBA_H
-#define CPPJIEAB_JIEBA_H
+#pragma once

+#include <memory>
 #include "QuerySegment.hpp"
 #include "KeywordExtractor.hpp"

@ -29,56 +11,48 @@ public:
    Jieba(const string& dict_path,
          const string& model_path,
          const string& user_dict_path,
-          const string& idfPath,
-          const string& stopWordPath)
-        : dict_trie_(dict_path, user_dict_path),
+          const string& idfPath = "", // now optional; forwarded to extractor
+          const string& stopWordPath = "", // now optional; forwarded to extractor
+          const string& dat_cache_path = "") // new parameter; forwarded to dict_trie_
+        : dict_trie_(dict_path, user_dict_path, dat_cache_path), // every segmenter below receives a pointer to this trie and/or model_
          model_(model_path),
          mp_seg_(&dict_trie_),
          hmm_seg_(&model_),
          mix_seg_(&dict_trie_, &model_),
          full_seg_(&dict_trie_),
          query_seg_(&dict_trie_, &model_),
-          extractor(&dict_trie_, &model_, idfPath, stopWordPath) {
-
-    }
-    ~Jieba() {
-    }
-
-    struct LocWord {
-        string word;
-        size_t begin;
-        size_t end;
-    }; // struct LocWord
+          extractor(&dict_trie_, &model_, idfPath, stopWordPath){ } // constructor body is empty; all work happens in the initializers
+    ~Jieba() { } // nothing to release explicitly; members clean up themselves

    void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
-        mix_seg_.Cut(sentence, words, hmm);
+        mix_seg_.CutToStr(sentence, words, hmm); // string output from the mixed segmenter
    }
    void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
-        mix_seg_.Cut(sentence, words, hmm);
+        mix_seg_.CutToWord(sentence, words, hmm); // Word output from the mixed segmenter
    }
    void CutAll(const string& sentence, vector<string>& words) const {
-        full_seg_.Cut(sentence, words);
+        full_seg_.CutToStr(sentence, words); // NOTE(review): CutToStr is not visible here; verify it does not route through an empty CutWithSentence override
    }
    void CutAll(const string& sentence, vector<Word>& words) const {
-        full_seg_.Cut(sentence, words);
+        full_seg_.CutToWord(sentence, words); // Word output from the full segmenter
    }
    void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
-        query_seg_.Cut(sentence, words, hmm);
+        query_seg_.CutToStr(sentence, words, hmm); // string output from the query segmenter
    }
    void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
-        query_seg_.Cut(sentence, words, hmm);
+        query_seg_.CutToWord(sentence, words, hmm); // Word output from the query segmenter
    }
    void CutHMM(const string& sentence, vector<string>& words) const {
-        hmm_seg_.Cut(sentence, words);
+        hmm_seg_.CutToStr(sentence, words); // NOTE(review): HMMSegment::CutWithSentence is an empty override; verify CutToStr does not depend on it
    }
    void CutHMM(const string& sentence, vector<Word>& words) const {
-        hmm_seg_.Cut(sentence, words);
+        hmm_seg_.CutToWord(sentence, words); // Word output from the HMM segmenter
    }
    void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
-        mp_seg_.Cut(sentence, words, max_word_len);
+        mp_seg_.CutToStr(sentence, words, false, max_word_len); // the hmm flag is passed as false; max_word_len is forwarded
    }
    void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
-        mp_seg_.Cut(sentence, words, max_word_len);
+        mp_seg_.CutToWord(sentence, words, false, max_word_len); // the hmm flag is passed as false; max_word_len is forwarded
    }

    void Tag(const string& sentence, vector<pair<string, string> >& words) const {
@ -87,16 +61,8 @@ public:
    string LookupTag(const string &str) const {
        return mix_seg_.LookupTag(str);
    }
-    bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
-        return dict_trie_.InsertUserWord(word, tag);
-    }
-
-    bool InsertUserWord(const string& word, int freq, const string& tag = UNKNOWN_TAG) {
-        return dict_trie_.InsertUserWord(word, freq, tag);
-    }
-
    bool Find(const string& word) {
-        return dict_trie_.Find(word);
+        return nullptr != dict_trie_.Find(word); // true when the trie lookup returns a non-null result
    }

    void ResetSeparators(const string& s) {
@ -116,18 +82,6 @@ public:
        return &model_;
    }

-    void LoadUserDict(const vector<string>& buf)  {
-        dict_trie_.LoadUserDict(buf);
-    }
-
-    void LoadUserDict(const set<string>& buf)  {
-        dict_trie_.LoadUserDict(buf);
-    }
-
-    void LoadUserDict(const string& path)  {
-        dict_trie_.LoadUserDict(path);
-    }
-
 private:
    DictTrie dict_trie_;
    HMMModel model_;
@ -145,4 +99,3 @@ public:

 } // namespace cppjieba

-#endif // CPPJIEAB_JIEBA_H
--- a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
-#define CPPJIEBA_KEYWORD_EXTRACTOR_H
+#pragma once

 #include <cmath>
 #include <set>
@ -37,15 +18,6 @@ public:
        double weight;
    }; // struct Word

-    KeywordExtractor(const string& dictPath,
-                     const string& hmmFilePath,
-                     const string& idfPath,
-                     const string& stopWordPath,
-                     const string& userDict = "")
-        : segment_(dictPath, hmmFilePath, userDict) {
-        LoadIdfDict(idfPath);
-        LoadStopWordDict(stopWordPath);
-    }
    KeywordExtractor(const DictTrie* dictTrie,
                     const HMMModel* model,
                     const string& idfPath,
@ -60,7 +32,8 @@ public:
    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
        vector<Word> topWords;
        Extract(sentence, topWords, topN);
-        for(size_t i = 0; i < topWords.size(); i++) {
+        // Append only the word text of each result; keywords is not cleared first.
+        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(topWords[i].word);
        }
    }
@ -68,43 +41,52 @@ public:
    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
        vector<Word> topWords;
        Extract(sentence, topWords, topN);
-        for(size_t i = 0; i < topWords.size(); i++) {
+        // Append a (word, weight) pair for each result; keywords is not cleared first.
+        for (size_t i = 0; i < topWords.size(); i++) {
            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
        }
    }

    void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
        vector<string> words;
-        segment_.Cut(sentence, words);
+        segment_.CutToStr(sentence, words);//将字符串string分解为words放入vector

-        map<string, Word> wordmap;
+        map<string, Word> wordmap;//插入字符串与Word的map，相同string统计词频叠加权重
        size_t offset = 0;
-        for(size_t i = 0; i < words.size(); ++i) {
+
+        for (size_t i = 0; i < words.size(); ++i) {
            size_t t = offset;
            offset += words[i].size();
-            if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+
+            if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
                continue;
            }
+
            wordmap[words[i]].offsets.push_back(t);
            wordmap[words[i]].weight += 1.0;
        }
-        if(offset != sentence.size()) {
+
+        if (offset != sentence.size()) {
            XLOG(ERROR) << "words illegal";
            return;
        }

        keywords.clear();
        keywords.reserve(wordmap.size());
-        for(map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-            unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
-            if(cit != idfMap_.end()) {
+
+        for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+            unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找
+
+            if (cit != idfMap_.end()) {
                itr->second.weight *= cit->second;
            } else {
                itr->second.weight *= idfAverage_;
            }
+
            itr->second.word = itr->first;
            keywords.push_back(itr->second);
        }
+
        topN = min(topN, keywords.size());
        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
        keywords.resize(topN);
@ -112,23 +94,31 @@ public:
 private:
    void LoadIdfDict(const string& idfPath) {
        ifstream ifs(idfPath.c_str());
+        if(not ifs.is_open()){
+            return ;
+        }
        XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
        string line ;
        vector<string> buf;
        double idf = 0.0;
        double idfSum = 0.0;
        size_t lineno = 0;
-        for(; getline(ifs, line); lineno++) {
+
+        for (; getline(ifs, line); lineno++) {
            buf.clear();
-            if(line.empty()) {
+
+            if (line.empty()) {
                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
                continue;
            }
+
            Split(line, buf, " ");
-            if(buf.size() != 2) {
+
+            if (buf.size() != 2) {
                XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
                continue;
            }
+
            idf = atof(buf[1].c_str());
            idfMap_[buf[0]] = idf;
            idfSum += idf;
@ -141,11 +131,16 @@ private:
    }
    void LoadStopWordDict(const string& filePath) {
        ifstream ifs(filePath.c_str());
+        if(not ifs.is_open()){ // file could not be opened: return with stopWords_ untouched
+            return ;
+        } // NOTE(review): after this early return the XCHECK below looks unreachable on failure - confirm and consider dropping one of them
        XCHECK(ifs.is_open()) << "open " << filePath << " failed";
        string line ;
-        while(getline(ifs, line)) {
+        // One stop word per line; each line is inserted as read, without trimming.
+        while (getline(ifs, line)) {
            stopWords_.insert(line);
        }
+        // NOTE(review): a readable file with no lines leaves stopWords_ empty and trips this assert unless NDEBUG is defined.
        assert(stopWords_.size());
    }

@ -161,11 +156,11 @@ private:
 }; // class KeywordExtractor

 inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
-    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
+    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
+           "}"; // writes a JSON-style object with word, offset and weight fields; word.word is streamed without escaping
 }

 } // namespace cppjieba

-#endif


--- a/libchinese-segmentation/cppjieba/MPSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MPSegment.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_MPSEGMENT_H
-#define CPPJIEBA_MPSEGMENT_H
+#pragma once

 #include <algorithm>
 #include <set>
@ -31,63 +12,32 @@ namespace cppjieba {

 class MPSegment: public SegmentTagged {
 public:
-    MPSegment(const string& dictPath, const string& userDictPath = "")
-        : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
-    }
    MPSegment(const DictTrie* dictTrie)
-        : dictTrie_(dictTrie), isNeedDestroy_(false) {
+        : dictTrie_(dictTrie) {
        assert(dictTrie_);
    }
-    ~MPSegment() {
-        if(isNeedDestroy_) {
-            delete dictTrie_;
-        }
+    ~MPSegment() { }
+
+    virtual void Cut(RuneStrArray::const_iterator begin,
+                     RuneStrArray::const_iterator end,
+                     vector<WordRange>& words,
+                     bool, size_t max_word_len) const override {
+        vector<DatDag> dags;
+        dictTrie_->Find(begin, end, dags, max_word_len); // build the DAG from the dictionary --jxx
+        CalcDP(dags); // dynamic programming (DP): compute the best path through the DAG --jxx
+        CutByDag(begin, end, dags, words); // segment along the best DAG path --jxx
    }

-    void Cut(const string& sentence, vector<string>& words) const {
-        Cut(sentence, words, MAX_WORD_LENGTH);
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        // Empty override: no argument is read and res is left unchanged. NOTE(review): confirm a no-op is intended for MPSegment.
    }

-    void Cut(const string& sentence,
-             vector<string>& words,
-             size_t max_word_len) const {
-        vector<Word> tmp;
-        Cut(sentence, tmp, max_word_len);
-        GetStringsFromWords(tmp, words);
-    }
-    void Cut(const string& sentence,
-             vector<Word>& words,
-             size_t max_word_len = MAX_WORD_LENGTH) const {
-        PreFilter pre_filter(symbols_, sentence);
-        PreFilter::Range range;
-        vector<WordRange> wrs;
-        wrs.reserve(sentence.size() / 2);
-        while(pre_filter.HasNext()) {
-            range = pre_filter.Next();
-            Cut(range.begin, range.end, wrs, max_word_len);
-        }
-        words.clear();
-        words.reserve(wrs.size());
-        GetWordsFromWordRanges(sentence, wrs, words);
-    }
-    void Cut(RuneStrArray::const_iterator begin,
-             RuneStrArray::const_iterator end,
-             vector<WordRange>& words,
-             size_t max_word_len = MAX_WORD_LENGTH) const {
-        vector<Dag> dags;
-        dictTrie_->Find(begin,
-                        end,
-                        dags,
-                        max_word_len);
-        CalcDP(dags);
-        CutByDag(begin, end, dags, words);
-    }
-
-    const DictTrie* GetDictTrie() const {
+    const DictTrie* GetDictTrie() const override {
        return dictTrie_;
    }

-    bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
        return tagger_.Tag(src, res, *this);
    }

@ -95,61 +45,50 @@ public:
        return dictTrie_->IsUserDictSingleChineseWord(value);
    }
 private:
-    void CalcDP(vector<Dag>& dags) const {
-        size_t nextPos;
-        const DictUnit* p;
-        double val;
+    void CalcDP(vector<DatDag>& dags) const {
+        for (auto rit = dags.rbegin(); rit != dags.rend(); rit++) { // back to front; presumably so successors' max_weight is already final - confirm
+            rit->max_next = -1; // -1 marks that no successor has been chosen yet
+            rit->max_weight = MIN_DOUBLE;

-        for(vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
-            rit->pInfo = NULL;
-            rit->weight = MIN_DOUBLE;
-            assert(!rit->nexts.empty());
-            for(LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
-                nextPos = it->first;
-                p = it->second;
-                val = 0.0;
-                if(nextPos + 1 < dags.size()) {
-                    val += dags[nextPos + 1].weight;
+            for (const auto & it : rit->nexts) {
+                const auto nextPos = it.first;
+                double val = dictTrie_->GetMinWeight();
+                // An edge with a dictionary entry uses that entry's weight; otherwise the trie's minimum weight is kept.
+                if (nullptr != it.second) {
+                    val = it.second->weight;
                }

-                if(p) {
-                    val += p->weight;
-                } else {
-                    val += dictTrie_->GetMinWeight();
+                if (nextPos  < dags.size()) { // add the best score reachable from nextPos when it is inside the sentence
+                    val += dags[nextPos].max_weight;
                }
-                if(val > rit->weight) {
-                    rit->pInfo = p;
-                    rit->weight = val;
+                // Keep the best-scoring edge; edges with nextPos greater than dags.size() are ignored.
+                if ((nextPos <= dags.size()) && (val > rit->max_weight)) {
+                    rit->max_weight = val;
+                    rit->max_next = nextPos;
                }
            }
        }
    }
+
    void CutByDag(RuneStrArray::const_iterator begin,
-                  RuneStrArray::const_iterator end,
-                  const vector<Dag>& dags,
+                  RuneStrArray::const_iterator,
+                  const vector<DatDag>& dags,
                  vector<WordRange>& words) const {
-        size_t i = 0;
-        while(i < dags.size()) {
-            const DictUnit* p = dags[i].pInfo;
-            if(p) {
-                assert(p->word.size() >= 1);
-                WordRange wr(begin + i, begin + i + p->word.size() - 1);
-                words.push_back(wr);
-                i += p->word.size();
-            } else { //single chinese word
-                WordRange wr(begin + i, begin + i);
-                words.push_back(wr);
-                i++;
-            }
+        // Follow max_next from position 0, appending one WordRange [i, next - 1] per hop.
+        for (size_t i = 0; i < dags.size();) {
+            const auto next = dags[i].max_next;
+            assert(next > i); // NOTE(review): forward progress is checked only here; with NDEBUG a next <= i would not terminate the loop
+            assert(next <= dags.size());
+            WordRange wr(begin + i, begin + next - 1);
+            words.push_back(wr);
+            i = next;
        }
    }

    const DictTrie* dictTrie_;
-    bool isNeedDestroy_;
    PosTagger tagger_;

 }; // class MPSegment

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/MixSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MixSegment.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_MIXSEGMENT_H
-#define CPPJIEBA_MIXSEGMENT_H
+#pragma once

 #include <cassert>
 #include "MPSegment.hpp"
@ -28,70 +9,49 @@
 namespace cppjieba {
 class MixSegment: public SegmentTagged {
 public:
-    MixSegment(const string& mpSegDict, const string& hmmSegDict,
-               const string& userDict = "")
-        : mpSeg_(mpSegDict, userDict),
-          hmmSeg_(hmmSegDict) {
-    }
    MixSegment(const DictTrie* dictTrie, const HMMModel* model)
        : mpSeg_(dictTrie), hmmSeg_(model) {
    }
-    ~MixSegment() {
-    }
+    ~MixSegment() {}

-    void Cut(const string& sentence, vector<string>& words) const {
-        Cut(sentence, words, true);
-    }
-    void Cut(const string& sentence, vector<string>& words, bool hmm) const {
-        vector<Word> tmp;
-        Cut(sentence, tmp, hmm);
-        GetStringsFromWords(tmp, words);
-    }
-    void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
-        PreFilter pre_filter(symbols_, sentence);
-        PreFilter::Range range;
-        vector<WordRange> wrs;
-        wrs.reserve(sentence.size() / 2);
-        while(pre_filter.HasNext()) {
-            range = pre_filter.Next();
-            Cut(range.begin, range.end, wrs, hmm);
-        }
-        words.clear();
-        words.reserve(wrs.size());
-        GetWordsFromWordRanges(sentence, wrs, words);
-    }
-
-    void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
-        if(!hmm) {
-            mpSeg_.Cut(begin, end, res);
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t) const override {
+        if (!hmm) {
+            mpSeg_.CutRuneArray(begin, end, res);
            return;
        }
+
        vector<WordRange> words;
        assert(end >= begin);
        words.reserve(end - begin);
-        mpSeg_.Cut(begin, end, words);
+        mpSeg_.CutRuneArray(begin, end, words);

        vector<WordRange> hmmRes;
        hmmRes.reserve(end - begin);
-        for(size_t i = 0; i < words.size(); i++) {
+
+        for (size_t i = 0; i < words.size(); i++) {
            //if mp Get a word, it's ok, put it into result
-            if(words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
+            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
+                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
                res.push_back(words[i]);
                continue;
            }

            // if mp Get a single one and it is not in userdict, collect it in sequence
            size_t j = i;
-            while(j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+
+            while (j < words.size() && words[j].left == words[j].right &&
+                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                j++;
            }

            // Cut the sequence with hmm
            assert(j - 1 >= i);
            // TODO
-            hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes);
+            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
+
            //put hmm result to result
-            for(size_t k = 0; k < hmmRes.size(); k++) {
+            for (size_t k = 0; k < hmmRes.size(); k++) {
                res.push_back(hmmRes[k]);
            }

@ -103,11 +63,61 @@ public:
        }
    }

-    const DictTrie* GetDictTrie() const {
+    /**
+     * Segment the rune range [begin, end) and append each resulting word to
+     * `res` as a string built from sentence `s` via GetStringFromRunes().
+     *
+     * The range is always cut with mpSeg_ first. With `hmm` set, every run of
+     * consecutive single-rune words that are not single-character user-dict
+     * words is re-cut with hmmSeg_. With `hmm` cleared, the mpSeg_ result is
+     * emitted unchanged, matching what Cut() does for the same flag.
+     *
+     * @param s      sentence passed to GetStringFromRunes() for every word
+     * @param begin  first rune of the range to segment
+     * @param end    one past the last rune of the range; must not precede begin
+     * @param res    output vector; words are appended, existing items are kept
+     * @param hmm    true to run the HMM pass, false for mpSeg_ output only
+     *
+     * The trailing unnamed size_t (max word length in the base interface) is
+     * not used by this segmenter.
+     */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t) const override {
+        vector<WordRange> words;
+        assert(end >= begin);
+        words.reserve(end - begin);
+        mpSeg_.CutRuneArray(begin, end, words);
+
+        if (!hmm) {
+            // HMM disabled: emit the mpSeg_ segmentation as-is.
+            for (size_t i = 0; i < words.size(); i++) {
+                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
+            }
+
+            return;
+        }
+
+        vector<WordRange> hmmRes;
+        hmmRes.reserve(end - begin);
+
+        for (size_t i = 0; i < words.size(); i++) {
+            // A multi-rune word, or a single rune that is a user-dict word, is final.
+            if (words[i].left != words[i].right ||
+                mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)) {
+                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
+                continue;
+            }
+
+            // Collect the run of consecutive single runes that are not user-dict words.
+            size_t j = i;
+
+            while (j < words.size() && words[j].left == words[j].right &&
+                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+                j++;
+            }
+
+            // words[i] itself satisfies the loop condition above, so j > i here.
+            assert(j - 1 >= i);
+            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
+
+            for (size_t k = 0; k < hmmRes.size(); k++) {
+                res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
+            }
+
+            // Reuse the buffer for the next run.
+            hmmRes.clear();
+
+            // Resume after the run; the loop's i++ then moves to index j.
+            i = j - 1;
+        }
+    }
+
+    const DictTrie* GetDictTrie() const override {
        return mpSeg_.GetDictTrie();
    }

-    bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
        return tagger_.Tag(src, res, *this);
    }

@ -124,4 +134,3 @@ private:

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/PosTagger.hpp
+++ b/libchinese-segmentation/cppjieba/PosTagger.hpp
@ -1,27 +1,8 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_POS_TAGGING_H
-#define CPPJIEBA_POS_TAGGING_H
+#pragma once

 #include "limonp/StringUtil.hpp"
-#include "SegmentTagged.hpp"
 #include "DictTrie.hpp"
+#include "SegmentTagged.hpp"

 namespace cppjieba {
 using namespace limonp;
@ -39,28 +20,31 @@ public:

    bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
        vector<string> CutRes;
-        segment.Cut(src, CutRes);
+        segment.CutToStr(src, CutRes);

-        for(vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
+        for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) {
            res.push_back(make_pair(*itr, LookupTag(*itr, segment)));
        }
+
        return !res.empty();
    }

    string LookupTag(const string &str, const SegmentTagged& segment) const {
-        const DictUnit *tmp = NULL;
-        RuneStrArray runes;
        const DictTrie * dict = segment.GetDictTrie();
        assert(dict != NULL);
-        if(!DecodeRunesInString(str, runes)) {
-            XLOG(ERROR) << "Decode failed.";
-            return POS_X;
-        }
-        tmp = dict->Find(runes.begin(), runes.end());
-        if(tmp == NULL || tmp->tag.empty()) {
+        const auto tmp = dict->Find(str);
+
+        if (tmp == NULL || tmp->GetTag().empty()) {
+            RuneStrArray runes;
+
+            if (!DecodeRunesInString(str, runes)) {
+                XLOG(ERROR) << "Decode failed.";
+                return POS_X;
+            }
+
            return SpecialRule(runes);
        } else {
-            return tmp->tag;
+            return tmp->GetTag();
        }
    }

@ -68,22 +52,27 @@ private:
    const char* SpecialRule(const RuneStrArray& unicode) const {
        size_t m = 0;
        size_t eng = 0;
-        for(size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
-            if(unicode[i].rune < 0x80) {
+
+        for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) {
+            if (unicode[i].rune < 0x80) {
                eng ++;
-                if('0' <= unicode[i].rune && unicode[i].rune <= '9') {
+
+                if ('0' <= unicode[i].rune && unicode[i].rune <= '9') {
                    m++;
                }
            }
        }
+
        // ascii char is not found
-        if(eng == 0) {
+        if (eng == 0) {
            return POS_X;
        }
+
        // all the ascii is number char
-        if(m == eng) {
+        if (m == eng) {
            return POS_M;
        }
+
        // the ascii chars contain english letter
        return POS_ENG;
    }
@ -92,4 +81,3 @@ private:

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/PreFilter.hpp
+++ b/libchinese-segmentation/cppjieba/PreFilter.hpp
@ -1,43 +1,20 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_PRE_FILTER_H
-#define CPPJIEBA_PRE_FILTER_H
+#pragma once

-#include "Trie.hpp"
 #include "limonp/Logging.hpp"
+#include <unordered_set>
+#include "Unicode.hpp"

 namespace cppjieba {

 class PreFilter {
 public:
-    //TODO use WordRange instead of Range
-    struct Range {
-        RuneStrArray::const_iterator begin;
-        RuneStrArray::const_iterator end;
-    }; // struct Range
-
-    PreFilter(const unordered_set<Rune>& symbols,
+    PreFilter(const std::unordered_set<Rune>& symbols,
              const string& sentence)
        : symbols_(symbols) {
-        if(!DecodeRunesInString(sentence, sentence_)) {
-            XLOG(ERROR) << "decode failed. ";
+        if (!DecodeRunesInString(sentence, sentence_)) {
+            XLOG(ERROR) << "decode failed. "<<sentence;
        }
+
        cursor_ = sentence_.begin();
    }
    ~PreFilter() {
@ -45,28 +22,31 @@ public:
    bool HasNext() const {
        return cursor_ != sentence_.end();
    }
-    Range Next() {
-        Range range;
-        range.begin = cursor_;
-        while(cursor_ != sentence_.end()) {
-            if(IsIn(symbols_, cursor_->rune)) {
-                if(range.begin == cursor_) {
+    // Returns the next piece of sentence_ starting at cursor_ and advances
+    // cursor_ past it. `left` is the first rune of the piece and `right` is one
+    // past its last rune (SegmentBase passes them on as begin/end iterators).
+    // A piece is either a single separator rune, or the run of runes up to
+    // (not including) the next separator or the end of the sentence.
+    WordRange Next() {
+        WordRange range(cursor_, cursor_);
+
+        while (cursor_ != sentence_.end()) {
+            // NOTE(review): only U+0020 (space) acts as a separator here; the
+            // symbols_ set is not consulted by this method. The previous check
+            // is kept below for reference.
+            //if (IsIn(symbols_, cursor_->rune)) {
+            if (cursor_->rune == 0x20) {
+                // Separator at the very start of the piece: return it on its own.
+                if (range.left == cursor_) {
                    cursor_ ++;
                }
-                range.end = cursor_;
+
+                range.right = cursor_;
                return range;
            }
+
            cursor_ ++;
        }
-        range.end = sentence_.end();
+
+        range.right = sentence_.end();
        return range;
    }
 private:
    RuneStrArray::const_iterator cursor_;
    RuneStrArray sentence_;
-    const unordered_set<Rune>& symbols_;
+    const std::unordered_set<Rune>& symbols_;
 }; // class PreFilter

 } // namespace cppjieba

-#endif // CPPJIEBA_PRE_FILTER_H
--- a/libchinese-segmentation/cppjieba/QuerySegment.hpp
+++ b/libchinese-segmentation/cppjieba/QuerySegment.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_QUERYSEGMENT_H
-#define CPPJIEBA_QUERYSEGMENT_H
+#pragma once

 #include <algorithm>
 #include <set>
@ -28,74 +9,65 @@
 #include "FullSegment.hpp"
 #include "MixSegment.hpp"
 #include "Unicode.hpp"
+#include "DictTrie.hpp"

 namespace cppjieba {
 class QuerySegment: public SegmentBase {
 public:
-    QuerySegment(const string& dict, const string& model, const string& userDict = "")
-        : mixSeg_(dict, model, userDict),
-          trie_(mixSeg_.GetDictTrie()) {
-    }
    QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
        : mixSeg_(dictTrie, model), trie_(dictTrie) {
    }
    ~QuerySegment() {
    }

-    void Cut(const string& sentence, vector<string>& words) const {
-        Cut(sentence, words, true);
-    }
-    void Cut(const string& sentence, vector<string>& words, bool hmm) const {
-        vector<Word> tmp;
-        Cut(sentence, tmp, hmm);
-        GetStringsFromWords(tmp, words);
-    }
-    void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
-        PreFilter pre_filter(symbols_, sentence);
-        PreFilter::Range range;
-        vector<WordRange> wrs;
-        wrs.reserve(sentence.size() / 2);
-        while(pre_filter.HasNext()) {
-            range = pre_filter.Next();
-            Cut(range.begin, range.end, wrs, hmm);
-        }
-        words.clear();
-        words.reserve(wrs.size());
-        GetWordsFromWordRanges(sentence, wrs, words);
-    }
-    void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t) const override {
        //use mix Cut first
        vector<WordRange> mixRes;
-        mixSeg_.Cut(begin, end, mixRes, hmm);
+        mixSeg_.CutRuneArray(begin, end, mixRes, hmm);

        vector<WordRange> fullRes;
-        for(vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
-            if(mixResItr->Length() > 2) {
-                for(size_t i = 0; i + 1 < mixResItr->Length(); i++) {
-                    WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
-                    if(trie_->Find(wr.left, wr.right + 1) != NULL) {
+
+        for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
+            if (mixResItr->Length() > 2) {
+                for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
+                    string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 2);
+
+                    if (trie_->Find(text) != NULL) {
+                        WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
                        res.push_back(wr);
                    }
                }
            }
-            if(mixResItr->Length() > 3) {
-                for(size_t i = 0; i + 2 < mixResItr->Length(); i++) {
-                    WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
-                    if(trie_->Find(wr.left, wr.right + 1) != NULL) {
+
+            if (mixResItr->Length() > 3) {
+                for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
+                    string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 3);
+
+                    if (trie_->Find(text) != NULL) {
+                        WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
                        res.push_back(wr);
                    }
                }
            }
+
            res.push_back(*mixResItr);
        }
    }
+
+    /**
+     * Segment the rune range [begin, end) with Cut() and append every resulting
+     * word to `res` as a string built from sentence `s`.
+     *
+     * An empty body here would make SegmentBase::CutToStr() return no words for
+     * a QuerySegment, so the string form is derived from the WordRange form.
+     *
+     * @param s             sentence passed to GetStringFromRunes() for every word
+     * @param begin         first rune of the range to segment
+     * @param end           one past the last rune of the range to segment
+     * @param res           output vector; words are appended, existing items are kept
+     * @param hmm           forwarded to Cut()
+     * @param max_word_len  forwarded to Cut()
+     */
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t max_word_len) const override {
+        vector<WordRange> ranges;
+        Cut(begin, end, ranges, hmm, max_word_len);
+
+        for (size_t i = 0; i < ranges.size(); i++) {
+            res.push_back(GetStringFromRunes(s, ranges[i].left, ranges[i].right));
+        }
+    }
+
+
 private:
-    bool IsAllAscii(const Unicode& s) const {
-        for(size_t i = 0; i < s.size(); i++) {
-            if(s[i] >= 0x80) {
+    bool IsAllAscii(const RuneArray& s) const {
+        for (size_t i = 0; i < s.size(); i++) {
+            if (s[i] >= 0x80) {
                return false;
            }
        }
+
        return true;
    }
    MixSegment mixSeg_;
@ -104,4 +76,3 @@ private:

 } // namespace cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/SegmentBase.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentBase.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_SEGMENTBASE_H
-#define CPPJIEBA_SEGMENTBASE_H
+#pragma once

 #include "limonp/Logging.hpp"
 #include "PreFilter.hpp"
@ -35,24 +16,69 @@ public:
    SegmentBase() {
        XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
    }
-    virtual ~SegmentBase() {
+    virtual ~SegmentBase() { }
+
+    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
+                     size_t max_word_len) const = 0;
+    // Sentence-based cut, added to reduce the storage of intermediate variables
+    // and format conversions --jxx20210517
+    // Implementations segment the rune range [begin, end) and append the words
+    // to `res` as strings; `s` is the sentence CutToStr() decoded the runes from.
+    // NOTE(review): max_word_len is presumably an upper bound on word length;
+    // confirm against the implementing segmenters.
+    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
+                     size_t max_word_len) const = 0;
+    // Rewrote CutToStr to simplify how vector<string>& words is obtained and to
+    // reduce memory usage --jxx20210517
+    //
+    // Splits `sentence` into pieces with PreFilter and lets CutWithSentence()
+    // append the words of each piece to `words`. `words` is cleared first.
+    // `hmm` and `max_word_len` are forwarded unchanged to CutWithSentence().
+    void CutToStr(const string& sentence, vector<string>& words, bool hmm = true,
+                  size_t max_word_len = MAX_WORD_LENGTH) const {
+/*
+        Previous implementation, which went through an intermediate vector<Word>:
+        vector<Word> tmp;
+        CutToWord(sentence, tmp, hmm, max_word_len);
+        GetStringsFromWords(tmp, words);
+*/
+        PreFilter pre_filter(symbols_, sentence);
+        words.clear();
+        words.reserve(sentence.size() / 2);//todo: taken from the upstream source; the value is still to be decided
+        while (pre_filter.HasNext()) {
+            auto range = pre_filter.Next();
+            CutWithSentence(sentence, range.left, range.right, words, hmm, max_word_len);
+        }
    }

-    virtual void Cut(const string& sentence, vector<string>& words) const = 0;
+    /**
+     * Segment `sentence` into Word objects.
+     *
+     * The sentence is split into pieces with PreFilter, every piece is cut with
+     * Cut() into one shared list of WordRange values, and that list is converted
+     * with GetWordsFromWordRanges().
+     *
+     * @param sentence      text to segment
+     * @param words         output; cleared before the result is written
+     * @param hmm           forwarded to Cut()
+     * @param max_word_len  forwarded to Cut()
+     */
+    void CutToWord(const string& sentence, vector<Word>& words, bool hmm = true,
+                   size_t max_word_len = MAX_WORD_LENGTH) const {
+        PreFilter pre_filter(symbols_, sentence);
+        vector<WordRange> wrs;
+        wrs.reserve(sentence.size() / 2);
+
+        while (pre_filter.HasNext()) {
+            auto range = pre_filter.Next();
+            Cut(range.left, range.right, wrs, hmm, max_word_len);
+        }
+
+        words.clear();
+        words.reserve(wrs.size());
+        GetWordsFromWordRanges(sentence, wrs, words);
+        // wrs is a local: its storage is released when it goes out of scope
+        // here, so no explicit clear()/swap is needed.
+    }
+
+    // Convenience wrapper around the virtual Cut(): same arguments, but `hmm`
+    // defaults to true and `max_word_len` to MAX_WORD_LENGTH. Results are
+    // written into `res` by Cut().
+    void CutRuneArray(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res,
+                      bool hmm = true, size_t max_word_len = MAX_WORD_LENGTH) const {
+        Cut(begin, end, res, hmm, max_word_len);
+    }

    bool ResetSeparators(const string& s) {
        symbols_.clear();
        RuneStrArray runes;
-        if(!DecodeRunesInString(s, runes)) {
+
+        if (!DecodeRunesInString(s, runes)) {
            XLOG(ERROR) << "decode " << s << " failed";
            return false;
        }
-        for(size_t i = 0; i < runes.size(); i++) {
-            if(!symbols_.insert(runes[i].rune).second) {
+
+        for (size_t i = 0; i < runes.size(); i++) {
+            if (!symbols_.insert(runes[i].rune).second) {
                XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
                return false;
            }
        }
+
        return true;
    }
 protected:
@ -61,4 +87,3 @@ protected:

 } // cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/SegmentTagged.hpp
+++ b/libchinese-segmentation/cppjieba/SegmentTagged.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_SEGMENTTAGGED_H
-#define CPPJIEBA_SEGMENTTAGGED_H
+#pragma once

 #include "SegmentBase.hpp"

@ -38,4 +19,3 @@ public:

 } // cppjieba

-#endif
--- a/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/TextRankExtractor.hpp
@ -1,212 +1,205 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
-#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
-
-#include <cmath>
-#include "Jieba.hpp"
-
-namespace cppjieba {
-using namespace limonp;
-using namespace std;
-
-class TextRankExtractor {
-public:
-    typedef struct _Word {
-        string word;
-        vector<size_t> offsets;
-        double weight;
-    }    Word; // struct Word
-private:
-    typedef std::map<string, Word> WordMap;
-
-    class WordGraph {
-    private:
-        typedef double Score;
-        typedef string Node;
-        typedef std::set<Node> NodeSet;
-
-        typedef std::map<Node, double> Edges;
-        typedef std::map<Node, Edges> Graph;
-        //typedef std::unordered_map<Node,double> Edges;
-        //typedef std::unordered_map<Node,Edges> Graph;
-
-        double d;
-        Graph graph;
-        NodeSet nodeSet;
-    public:
-        WordGraph(): d(0.85) {};
-        WordGraph(double in_d): d(in_d) {};
-
-        void addEdge(Node start, Node end, double weight) {
-            Edges temp;
-            Edges::iterator gotEdges;
-            nodeSet.insert(start);
-            nodeSet.insert(end);
-            graph[start][end] += weight;
-            graph[end][start] += weight;
-        }
-
-        void rank(WordMap &ws, size_t rankTime = 10) {
-            WordMap outSum;
-            Score wsdef, min_rank, max_rank;
-
-            if(graph.size() == 0)
-                return;
-
-            wsdef = 1.0 / graph.size();
-
-            for(Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
-                // edges->first start节点；edge->first end节点；edge->second 权重
-                ws[edges->first].word = edges->first;
-                ws[edges->first].weight = wsdef;
-                outSum[edges->first].weight = 0;
-                for(Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
-                    outSum[edges->first].weight += edge->second;
-                }
-            }
-            //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
-            for(size_t i = 0; i < rankTime; i++) {
-                for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
-                    double s = 0;
-                    for(Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
-                        // edge->first end节点；edge->second 权重
-                        s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
-                    ws[*node].weight = (1 - d) + d * s;
-                }
-            }
-
-            min_rank = max_rank = ws.begin()->second.weight;
-            for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
-                if(i->second.weight < min_rank) {
-                    min_rank = i->second.weight;
-                }
-                if(i->second.weight > max_rank) {
-                    max_rank = i->second.weight;
-                }
-            }
-            for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
-                ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
-            }
-        }
-    };
-
-public:
-    TextRankExtractor(const string& dictPath,
-                      const string& hmmFilePath,
-                      const string& stopWordPath,
-                      const string& userDict = "")
-        : segment_(dictPath, hmmFilePath, userDict) {
-        LoadStopWordDict(stopWordPath);
-    }
-    TextRankExtractor(const DictTrie* dictTrie,
-                      const HMMModel* model,
-                      const string& stopWordPath)
-        : segment_(dictTrie, model) {
-        LoadStopWordDict(stopWordPath);
-    }
-    TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
-        LoadStopWordDict(stopWordPath);
-    }
-    ~TextRankExtractor() {
-    }
-
-    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
-        vector<Word> topWords;
-        Extract(sentence, topWords, topN);
-        for(size_t i = 0; i < topWords.size(); i++) {
-            keywords.push_back(topWords[i].word);
-        }
-    }
-
-    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
-        vector<Word> topWords;
-        Extract(sentence, topWords, topN);
-        for(size_t i = 0; i < topWords.size(); i++) {
-            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
-        }
-    }
-
-    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
-        vector<string> words;
-        segment_.Cut(sentence, words);
-
-        TextRankExtractor::WordGraph graph;
-        WordMap wordmap;
-        size_t offset = 0;
-
-        for(size_t i = 0; i < words.size(); i++) {
-            size_t t = offset;
-            offset += words[i].size();
-            if(IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
-                continue;
-            }
-            for(size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
-                if(IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
-                    skip++;
-                    continue;
-                }
-                graph.addEdge(words[i], words[j], 1);
-            }
-            wordmap[words[i]].offsets.push_back(t);
-        }
-        if(offset != sentence.size()) {
-            XLOG(ERROR) << "words illegal";
-            return;
-        }
-
-        graph.rank(wordmap, rankTime);
-
-        keywords.clear();
-        keywords.reserve(wordmap.size());
-        for(WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-            keywords.push_back(itr->second);
-        }
-
-        topN = min(topN, keywords.size());
-        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
-        keywords.resize(topN);
-    }
-private:
-    void LoadStopWordDict(const string& filePath) {
-        ifstream ifs(filePath.c_str());
-        XCHECK(ifs.is_open()) << "open " << filePath << " failed";
-        string line ;
-        while(getline(ifs, line)) {
-            stopWords_.insert(line);
-        }
-        assert(stopWords_.size());
-    }
-
-    static bool Compare(const Word &x, const Word &y) {
-        return x.weight > y.weight;
-    }
-
-    MixSegment segment_;
-    unordered_set<string> stopWords_;
-}; // class TextRankExtractor
-
-inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
-    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
-}
-} // namespace cppjieba
-
-#endif
-
-
+// Include guard: this header defines a class and an inline operator<<, so it
+// must not be processed twice in one translation unit.
+#pragma once
+
+#include <cmath>
+#include "Jieba.hpp"
+
+namespace cppjieba {
+using namespace limonp;
+using namespace std;
+
+class TextRankExtractor {
+public:
+    typedef struct _Word {
+        string word;
+        vector<size_t> offsets;
+        double weight;
+    }    Word; // struct Word
+private:
+    typedef std::map<string, Word> WordMap;
+
+    class WordGraph {
+    private:
+        typedef double Score;
+        typedef string Node;
+        typedef std::set<Node> NodeSet;
+
+        typedef std::map<Node, double> Edges;
+        typedef std::map<Node, Edges> Graph;
+        //typedef std::unordered_map<Node,double> Edges;
+        //typedef std::unordered_map<Node,Edges> Graph;
+
+        double d;
+        Graph graph;
+        NodeSet nodeSet;
+    public:
+        WordGraph(): d(0.85) {};
+        WordGraph(double in_d): d(in_d) {};
+
+        // Adds an undirected edge: both endpoints are recorded in nodeSet and
+        // `weight` is accumulated in both directions of the adjacency map.
+        // When start == end both updates hit the same cell, so the self-loop
+        // gains 2 * weight.
+        void addEdge(Node start, Node end, double weight) {
+            nodeSet.insert(start);
+            nodeSet.insert(end);
+            graph[start][end] += weight;
+            graph[end][start] += weight;
+        }
+
+        void rank(WordMap &ws, size_t rankTime = 10) {
+            WordMap outSum;
+            Score wsdef, min_rank, max_rank;
+
+            if (graph.size() == 0) {
+                return;
+            }
+
+            wsdef = 1.0 / graph.size();
+
+            for (Graph::iterator edges = graph.begin(); edges != graph.end(); ++edges) {
+                // edges->first start节点；edge->first end节点；edge->second 权重
+                ws[edges->first].word = edges->first;
+                ws[edges->first].weight = wsdef;
+                outSum[edges->first].weight = 0;
+
+                for (Edges::iterator edge = edges->second.begin(); edge != edges->second.end(); ++edge) {
+                    outSum[edges->first].weight += edge->second;
+                }
+            }
+
+            //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序?
+            for (size_t i = 0; i < rankTime; i++) {
+                for (NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); node++) {
+                    double s = 0;
+
+                    for (Edges::iterator edge = graph[*node].begin(); edge != graph[*node].end(); edge++)
+                        // edge->first end节点；edge->second 权重
+                    {
+                        s += edge->second / outSum[edge->first].weight * ws[edge->first].weight;
+                    }
+
+                    ws[*node].weight = (1 - d) + d * s;
+                }
+            }
+
+            min_rank = max_rank = ws.begin()->second.weight;
+
+            for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
+                if (i->second.weight < min_rank) {
+                    min_rank = i->second.weight;
+                }
+
+                if (i->second.weight > max_rank) {
+                    max_rank = i->second.weight;
+                }
+            }
+
+            for (WordMap::iterator i = ws.begin(); i != ws.end(); i ++) {
+                ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
+            }
+        }
+    };
+
+public:
+    TextRankExtractor(const DictTrie* dictTrie,
+                      const HMMModel* model,
+                      const string& stopWordPath)
+        : segment_(dictTrie, model) {
+        LoadStopWordDict(stopWordPath);
+    }
+    TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
+        LoadStopWordDict(stopWordPath);
+    }
+    ~TextRankExtractor() {
+    }
+
+    void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+        vector<Word> topWords;
+        Extract(sentence, topWords, topN);
+
+        for (size_t i = 0; i < topWords.size(); i++) {
+            keywords.push_back(topWords[i].word);
+        }
+    }
+
+    void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+        vector<Word> topWords;
+        Extract(sentence, topWords, topN);
+
+        for (size_t i = 0; i < topWords.size(); i++) {
+            keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+        }
+    }
+
+    /**
+     * Extract the top keywords of `sentence` together with their byte offsets
+     * and TextRank weights.
+     *
+     * The sentence is segmented with segment_.CutToStr(). Every word that is
+     * neither a single word (IsSingleWord) nor a stop word becomes a candidate
+     * and is linked in the word graph to the following candidates inside a
+     * window of `span` positions (skipped words do not consume the window).
+     * The graph is ranked and the `topN` heaviest candidates are returned in
+     * descending weight order.
+     *
+     * @param sentence  text to analyse
+     * @param keywords  output; replaced by the result. Left untouched when the
+     *                  segmented words do not add up to sentence.size() bytes
+     * @param topN      maximum number of keywords to return
+     * @param span      co-occurrence window size
+     * @param rankTime  number of ranking iterations passed to WordGraph::rank()
+     */
+    void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span = 5, size_t rankTime = 10) const {
+        vector<string> words;
+        segment_.CutToStr(sentence, words);
+
+        TextRankExtractor::WordGraph graph;
+        WordMap wordmap;
+        size_t offset = 0;
+
+        for (size_t i = 0; i < words.size(); i++) {
+            size_t t = offset;
+            offset += words[i].size();
+
+            if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+                continue;
+            }
+
+            for (size_t j = i + 1, skip = 0; j < i + span + skip && j < words.size(); j++) {
+                if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
+                    skip++;
+                    continue;
+                }
+
+                graph.addEdge(words[i], words[j], 1);
+            }
+
+            // rank() only fills in `word` for nodes that have an edge; set it
+            // here so a candidate without any edge (e.g. the only candidate in
+            // the text) is not returned with an empty word.
+            Word& entry = wordmap[words[i]];
+            entry.word = words[i];
+            entry.offsets.push_back(t);
+        }
+
+        if (offset != sentence.size()) {
+            XLOG(ERROR) << "words illegal";
+            return;
+        }
+
+        graph.rank(wordmap, rankTime);
+
+        keywords.clear();
+        keywords.reserve(wordmap.size());
+
+        for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+            keywords.push_back(itr->second);
+        }
+
+        // Only the first topN entries need to be ordered.
+        topN = min(topN, keywords.size());
+        partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+        keywords.resize(topN);
+    }
+private:
+    void LoadStopWordDict(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+        string line ;
+
+        while (getline(ifs, line)) {
+            stopWords_.insert(line);
+        }
+
+        assert(stopWords_.size());
+    }
+
+    static bool Compare(const Word &x, const Word &y) {
+        return x.weight > y.weight;
+    }
+
+    MixSegment segment_;
+    unordered_set<string> stopWords_;
+}; // class TextRankExtractor
+
+inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
+    return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight <<
+           "}";
+}
+} // namespace cppjieba
+
+
+
--- a/libchinese-segmentation/cppjieba/Trie.hpp
+++ b/libchinese-segmentation/cppjieba/Trie.hpp
@ -1,192 +0,0 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_TRIE_HPP
-#define CPPJIEBA_TRIE_HPP
-
-#include <vector>
-#include <queue>
-#include "limonp/StdExtension.hpp"
-#include "Unicode.hpp"
-
-namespace cppjieba {
-
-using namespace std;
-
-const size_t MAX_WORD_LENGTH = 512;
-
-struct DictUnit {
-    Unicode word;
-    double weight;
-    string tag;
-}; // struct DictUnit
-
-// for debugging
-// inline ostream & operator << (ostream& os, const DictUnit& unit) {
-//   string s;
-//   s << unit.word;
-//   return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
-// }
-
-struct Dag {
-    RuneStr runestr;
-    // [offset, nexts.first]
-    limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
-    const DictUnit * pInfo;
-    double weight;
-    size_t nextPos; // TODO
-    Dag(): runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
-    }
-}; // struct Dag
-
-typedef Rune TrieKey;
-
-class TrieNode {
-public :
-    TrieNode(): next(NULL), ptValue(NULL) {
-    }
-public:
-    typedef unordered_map<TrieKey, TrieNode*> NextMap;
-    NextMap *next;
-    const DictUnit *ptValue;
-};
-
-class Trie {
-public:
-    Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
-        : root_(new TrieNode) {
-        CreateTrie(keys, valuePointers);
-    }
-    ~Trie() {
-        DeleteNode(root_);
-    }
-
-    const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
-        if(begin == end) {
-            return NULL;
-        }
-
-        const TrieNode* ptNode = root_;
-        TrieNode::NextMap::const_iterator citer;
-        for(RuneStrArray::const_iterator it = begin; it != end; it++) {
-            if(NULL == ptNode->next) {
-                return NULL;
-            }
-            citer = ptNode->next->find(it->rune);
-            if(ptNode->next->end() == citer) {
-                return NULL;
-            }
-            ptNode = citer->second;
-        }
-        return ptNode->ptValue;
-    }
-
-    void Find(RuneStrArray::const_iterator begin,
-              RuneStrArray::const_iterator end,
-              vector<struct Dag>&res,
-              size_t max_word_len = MAX_WORD_LENGTH) const {
-        assert(root_ != NULL);
-        res.resize(end - begin);
-
-        const TrieNode *ptNode = NULL;
-        TrieNode::NextMap::const_iterator citer;
-        for(size_t i = 0; i < size_t(end - begin); i++) {
-            res[i].runestr = *(begin + i);
-
-            if(root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
-                ptNode = citer->second;
-            } else {
-                ptNode = NULL;
-            }
-            if(ptNode != NULL) {
-                res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
-            } else {
-                res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
-            }
-
-            for(size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
-                if(ptNode == NULL || ptNode->next == NULL) {
-                    break;
-                }
-                citer = ptNode->next->find((begin + j)->rune);
-                if(ptNode->next->end() == citer) {
-                    break;
-                }
-                ptNode = citer->second;
-                if(NULL != ptNode->ptValue) {
-                    res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
-                }
-            }
-        }
-    }
-
-    void InsertNode(const Unicode& key, const DictUnit* ptValue) {
-        if(key.begin() == key.end()) {
-            return;
-        }
-
-        TrieNode::NextMap::const_iterator kmIter;
-        TrieNode *ptNode = root_;
-        for(Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
-            if(NULL == ptNode->next) {
-                ptNode->next = new TrieNode::NextMap;
-            }
-            kmIter = ptNode->next->find(*citer);
-            if(ptNode->next->end() == kmIter) {
-                TrieNode *nextNode = new TrieNode;
-
-                ptNode->next->insert(make_pair(*citer, nextNode));
-                ptNode = nextNode;
-            } else {
-                ptNode = kmIter->second;
-            }
-        }
-        assert(ptNode != NULL);
-        ptNode->ptValue = ptValue;
-    }
-
-private:
-    void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
-        if(valuePointers.empty() || keys.empty()) {
-            return;
-        }
-        assert(keys.size() == valuePointers.size());
-
-        for(size_t i = 0; i < keys.size(); i++) {
-            InsertNode(keys[i], valuePointers[i]);
-        }
-    }
-
-    void DeleteNode(TrieNode* node) {
-        if(NULL == node) {
-            return;
-        }
-        if(NULL != node->next) {
-            for(TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
-                DeleteNode(it->second);
-            }
-            delete node->next;
-        }
-        delete node;
-    }
-
-    TrieNode* root_;
-}; // class Trie
-} // namespace cppjieba
-
-#endif // CPPJIEBA_TRIE_HPP
--- a/libchinese-segmentation/cppjieba/Unicode.hpp
+++ b/libchinese-segmentation/cppjieba/Unicode.hpp
@ -1,23 +1,4 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
-#ifndef CPPJIEBA_UNICODE_H
-#define CPPJIEBA_UNICODE_H
+#pragma once

 #include <stdint.h>
 #include <stdlib.h>
@ -25,6 +6,7 @@
 #include <vector>
 #include <ostream>
 #include "limonp/LocalVector.hpp"
+#include "limonp/StringUtil.hpp"

 namespace cppjieba {

@ -50,28 +32,28 @@ inline std::ostream& operator << (std::ostream& os, const Word& w) {
    return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
 }

-struct RuneStr {
+struct RuneInfo {
    Rune rune;
    uint32_t offset;
    uint32_t len;
-    uint32_t unicode_offset;
-    uint32_t unicode_length;
-    RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
+    uint32_t unicode_offset = 0;
+    uint32_t unicode_length = 0;
+    RuneInfo(): rune(0), offset(0), len(0) {
    }
-    RuneStr(Rune r, uint32_t o, uint32_t l)
-        : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
+    RuneInfo(Rune r, uint32_t o, uint32_t l)
+        : rune(r), offset(o), len(l) {
    }
-    RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
+    RuneInfo(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
        : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }
-}; // struct RuneStr
+}; // struct RuneInfo

-inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
+inline std::ostream& operator << (std::ostream& os, const RuneInfo& r) {
    return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
 }

-typedef limonp::LocalVector<Rune> Unicode;
-typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
+typedef limonp::LocalVector<Rune> RuneArray;
+typedef limonp::LocalVector<struct RuneInfo> RuneStrArray;

 // [left, right]
 struct WordRange {
@ -83,127 +65,169 @@ struct WordRange {
    size_t Length() const {
        return right - left + 1;
    }
+
    bool IsAllAscii() const {
-        for(RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
-            if(iter->rune >= 0x80) {
+        for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
+            if (iter->rune >= 0x80) {
                return false;
            }
        }
+
        return true;
    }
 }; // struct WordRange

-struct RuneStrLite {
-    uint32_t rune;
-    uint32_t len;
-    RuneStrLite(): rune(0), len(0) {
-    }
-    RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
-    }
-}; // struct RuneStrLite

-inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
-    RuneStrLite rp(0, 0);
-    if(str == NULL || len == 0) {
-        return rp;
-    }
-    if(!(str[0] & 0x80)) {  // 0xxxxxxx
-        // 7bit, total 7bit
-        rp.rune = (uint8_t)(str[0]) & 0x7f;
-        rp.len = 1;
-    } else if((uint8_t)str[0] <= 0xdf &&  1 < len) {
-        // 110xxxxxx
-        // 5bit, total 5bit
-        rp.rune = (uint8_t)(str[0]) & 0x1f;
-
-        // 6bit, total 11bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[1]) & 0x3f;
-        rp.len = 2;
-    } else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx
-        // 4bit, total 4bit
-        rp.rune = (uint8_t)(str[0]) & 0x0f;
-
-        // 6bit, total 10bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[1]) & 0x3f;
-
-        // 6bit, total 16bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[2]) & 0x3f;
-
-        rp.len = 3;
-    } else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx
-        // 3bit, total 3bit
-        rp.rune = (uint8_t)(str[0]) & 0x07;
-
-        // 6bit, total 9bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[1]) & 0x3f;
-
-        // 6bit, total 15bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[2]) & 0x3f;
-
-        // 6bit, total 21bit
-        rp.rune <<= 6;
-        rp.rune |= (uint8_t)(str[3]) & 0x3f;
-
-        rp.len = 4;
-    } else {
-        rp.rune = 0;
-        rp.len = 0;
-    }
-    return rp;
+// Empties arr, then lets limonp::Utf8ToUnicode32 decode s into it; returns that call's result.
+inline bool DecodeRunesInString(const string& s, RuneArray& arr) {
+    arr.clear();
+    const bool decoded = limonp::Utf8ToUnicode32(s, arr);
+    return decoded;
+}

-inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
-    runes.clear();
-    runes.reserve(len / 2);
-    for(uint32_t i = 0, j = 0; i < len;) {
-        RuneStrLite rp = DecodeRuneInString(s + i, len - i);
-        if(rp.len == 0) {
-            runes.clear();
-            return false;
-        }
-        RuneStr x(rp.rune, i, rp.len, j, 1);
-        runes.push_back(x);
-        i += rp.len;
-        ++j;
-    }
-    return true;
+// Convenience overload returning the decoded runes by value. The bool result of the
+// two-argument overload is not inspected; whatever it left in the array is returned.
+inline RuneArray DecodeRunesInString(const string& s) {
+    RuneArray decoded;
+    DecodeRunesInString(s, decoded);
+    return decoded;
+}

+// Rewrote DecodeRunesInString with the decoding done inline in the function, to lower
+// memory use and speed up processing --jxx20210518
+//
+// Decodes the UTF-8 bytes of s into runes, one RuneInfo per code point. Each entry holds
+// the decoded rune, the byte offset and byte length it occupies in s, its index in the
+// rune sequence (unicode_offset) and a unicode_length of 1.
+// Returns false, leaving runes empty, when a lead byte is not handled or a multi-byte
+// sequence is cut off by the end of s.
 inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
-    return DecodeRunesInString(s.c_str(), s.size(), runes);
-}
+/*
+    RuneArray arr;

-inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
-    unicode.clear();
-    RuneStrArray runes;
-    if(!DecodeRunesInString(s, len, runes)) {
+    if (not DecodeRunesInString(s, arr)) {
        return false;
    }
-    unicode.reserve(runes.size());
-    for(size_t i = 0; i < runes.size(); i++) {
-        unicode.push_back(runes[i].rune);
+
+    runes.clear();
+
+    uint32_t offset = 0;
+
+    for (uint32_t i = 0; i < arr.size(); ++i) {
+        const uint32_t len = limonp::UnicodeToUtf8Bytes(arr[i]);
+        RuneInfo x(arr[i], offset, len, i, 1);
+        runes.push_back(x);
+        offset += len;
+    }
+*/
+
+    uint32_t tmp;
+    uint32_t rune_index = 0; // index of the next rune within runes
+    runes.clear();
+    for(size_t i = 0; i < s.size();) {
+      const size_t start = i; // byte offset in s where this rune begins
+      // NOTE(review): bytes 0x80-0xbf (continuation bytes) also satisfy the "<= 0xdf"
+      // test below and are decoded as two-byte leads; left unchanged here.
+      if(!(s.data()[i] & 0x80)) { // 0xxxxxxx
+        // 7bit, total 7bit
+        tmp = (uint8_t)(s.data()[i]) & 0x7f;
+        i++;
+      } else if ((uint8_t)s.data()[i] <= 0xdf && i + 1 < s.size()) { // 110xxxxxx
+        // 5bit, total 5bit
+        tmp = (uint8_t)(s.data()[i]) & 0x1f;
+
+        // 6bit, total 11bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
+        i += 2;
+      } else if((uint8_t)s.data()[i] <= 0xef && i + 2 < s.size()) { // 1110xxxxxx
+        // 4bit, total 4bit
+        tmp = (uint8_t)(s.data()[i]) & 0x0f;
+
+        // 6bit, total 10bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
+
+        // 6bit, total 16bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
+
+        i += 3;
+      } else if((uint8_t)s.data()[i] <= 0xf7 && i + 3 < s.size()) { // 11110xxxx
+        // 3bit, total 3bit
+        tmp = (uint8_t)(s.data()[i]) & 0x07;
+
+        // 6bit, total 9bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+1]) & 0x3f;
+
+        // 6bit, total 15bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+2]) & 0x3f;
+
+        // 6bit, total 21bit
+        tmp <<= 6;
+        tmp |= (uint8_t)(s.data()[i+3]) & 0x3f;
+
+        i += 4;
+      } else {
+        // Unhandled lead byte or truncated sequence: do not hand back a partial result.
+        runes.clear();
+        return false;
+      }
+      // Offset and length are taken from the bytes actually consumed, so that
+      // s.substr(offset, len) always covers exactly this rune. The fourth argument is
+      // the rune's index in the sequence, not a byte position.
+      RuneInfo x(tmp, static_cast<uint32_t>(start), static_cast<uint32_t>(i - start), rune_index, 1);
+      runes.push_back(x);
+      ++rune_index;
    }
    return true;
 }

+// Thin adapter around a const RuneInfo pointer offering dereference, post-increment and
+// inequality. Dereferencing yields the rune value of the element pointed at, so a range of
+// RuneInfo can be handed to code that iterates over plain uint32_t code points
+// (EncodeRunesToString passes a pair of these to limonp::Unicode32ToUtf8).
+class RunePtrWrapper {
+public:
+    const RuneInfo * m_ptr = nullptr;
+
+public:
+    explicit RunePtrWrapper(const RuneInfo * p) : m_ptr(p) {}
+
+    // Code point of the RuneInfo currently pointed at; does not modify the wrapper.
+    uint32_t operator *() const {
+        return m_ptr->rune;
+    }
+
+    // Post-increment: steps to the next RuneInfo and returns the position held before
+    // the step, as the postfix form is expected to.
+    RunePtrWrapper operator ++(int) {
+        RunePtrWrapper previous(m_ptr);
+        ++m_ptr;
+        return previous;
+    }
+
+    bool operator !=(const RunePtrWrapper & b) const {
+        return this->m_ptr != b.m_ptr;
+    }
+};
+
+// Encodes the runes in [begin, end) via limonp::Unicode32ToUtf8 and returns the resulting string.
+inline string EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) {
+    RunePtrWrapper first(begin);
+    RunePtrWrapper last(end);
+    string encoded;
+    limonp::Unicode32ToUtf8(first, last, encoded);
+    return encoded;
+}
+
+// Out-parameter variant: passes the runes in [begin, end) and str to limonp::Unicode32ToUtf8.
+inline void EncodeRunesToString(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, string& str) {
+    RunePtrWrapper first(begin);
+    RunePtrWrapper last(end);
+    limonp::Unicode32ToUtf8(first, last, str);
+}
+
+// Counting stand-in for a container of code points: push_back discards the value and only
+// bumps length, clear resets it to zero. Utf8CharNum passes one to limonp::Utf8ToUnicode32.
+class Unicode32Counter {
+public :
+    size_t length = 0;
+
+    // Forget everything counted so far.
+    void clear() {
+        length = 0;
+    }
+
+    // Count one more code point; the value itself is not stored.
+    void push_back(uint32_t) {
+        length += 1;
+    }
+};
+
+// Returns the count accumulated by a Unicode32Counter while limonp::Utf8ToUnicode32
+// processes the given bytes, or 0 when that call reports failure.
+inline size_t Utf8CharNum(const char * str, size_t length) {
+    Unicode32Counter counter;
+    const bool ok = limonp::Utf8ToUnicode32(str, length, counter);
+    return ok ? counter.length : 0;
+}
+
+// std::string convenience overload; forwards the string's bytes and size.
+inline size_t Utf8CharNum(const string & str) {
+    const char * bytes = str.data();
+    return Utf8CharNum(bytes, str.size());
+}
+
 inline bool IsSingleWord(const string& str) {
-    RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
-    return rp.len == str.size();
-}
-
-inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
-    return DecodeRunesInString(s.c_str(), s.size(), unicode);
-}
-
-inline Unicode DecodeRunesInString(const string& s) {
-    Unicode result;
-    DecodeRunesInString(s, result);
-    return result;
+    return Utf8CharNum(str) == 1;
 }


@ -218,28 +242,31 @@ inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left,
 inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
    assert(right->offset >= left->offset);
    uint32_t len = right->offset - left->offset + right->len;
-    return s.substr(left->offset, len);
+    uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length;
+    return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length).word;
 }

 inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
-    for(size_t i = 0; i < wrs.size(); i++) {
+    for (size_t i = 0; i < wrs.size(); i++) {
        words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
    }
 }

-inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
-    vector<Word> result;
-    GetWordsFromWordRanges(s, wrs, result);
-    return result;
+inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<string>& words) {
+    for (size_t i = 0; i < wrs.size(); i++) {
+        words.push_back(GetStringFromRunes(s, wrs[i].left, wrs[i].right));
+    }
 }

 inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
    strs.resize(words.size());
-    for(size_t i = 0; i < words.size(); ++i) {
+
+    for (size_t i = 0; i < words.size(); ++i) {
        strs[i] = words[i].word;
    }
 }

+const size_t MAX_WORD_LENGTH = 512;
+
 } // namespace cppjieba

-#endif // CPPJIEBA_UNICODE_H
--- a/libchinese-segmentation/cppjieba/darts.h
+++ b/libchinese-segmentation/cppjieba/darts.h
--- a/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ArgvContext.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 /************************************
 * file enc : ascii
 * author   : wuyanyi09@gmail.com
@ -33,54 +15,54 @@ namespace limonp {
 using namespace std;

 class ArgvContext {
-public :
-    ArgvContext(int argc, const char* const * argv) {
-        for(int i = 0; i < argc; i++) {
-            if(StartsWith(argv[i], "-")) {
-                if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
-                    mpss_[argv[i]] = argv[i + 1];
-                    i++;
-                } else {
-                    sset_.insert(argv[i]);
-                }
-            } else {
-                args_.push_back(argv[i]);
-            }
+ public :
+  ArgvContext(int argc, const char* const * argv) {
+    for(int i = 0; i < argc; i++) {
+      if(StartsWith(argv[i], "-")) {
+        if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
+          mpss_[argv[i]] = argv[i+1];
+          i++;
+        } else {
+          sset_.insert(argv[i]);
        }
+      } else {
+        args_.push_back(argv[i]);
+      }
    }
-    ~ArgvContext() {
-    }
+  }
+  ~ArgvContext() {
+  }

-    friend ostream& operator << (ostream& os, const ArgvContext& args);
-    string operator [](size_t i) const {
-        if(i < args_.size()) {
-            return args_[i];
-        }
-        return "";
+  friend ostream& operator << (ostream& os, const ArgvContext& args);
+  string operator [](size_t i) const {
+    if(i < args_.size()) {
+      return args_[i];
    }
-    string operator [](const string& key) const {
-        map<string, string>::const_iterator it = mpss_.find(key);
-        if(it != mpss_.end()) {
-            return it->second;
-        }
-        return "";
+    return "";
+  }
+  string operator [](const string& key) const {
+    map<string, string>::const_iterator it = mpss_.find(key);
+    if(it != mpss_.end()) {
+      return it->second;
    }
+    return "";
+  }

-    bool HasKey(const string& key) const {
-        if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
-            return true;
-        }
-        return false;
+  bool HasKey(const string& key) const {
+    if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) {
+      return true;
    }
+    return false;
+  }

-private:
-    vector<string> args_;
-    map<string, string> mpss_;
-    set<string> sset_;
+ private:
+  vector<string> args_;
+  map<string, string> mpss_;
+  set<string> sset_;
 }; // class ArgvContext

 inline ostream& operator << (ostream& os, const ArgvContext& args) {
-    return os << args.args_ << args.mpss_ << args.sset_;
+  return os<<args.args_<<args.mpss_<<args.sset_;
 }

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BlockingQueue.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_BLOCKINGQUEUE_HPP
 #define LIMONP_BLOCKINGQUEUE_HPP

@ -25,41 +7,41 @@
 namespace limonp {
 template<class T>
 class BlockingQueue: NonCopyable {
-public:
-    BlockingQueue()
-        : mutex_(), notEmpty_(mutex_), queue_() {
-    }
+ public:
+  BlockingQueue()
+    : mutex_(), notEmpty_(mutex_), queue_() {
+  }

-    void Push(const T& x) {
-        MutexLockGuard lock(mutex_);
-        queue_.push(x);
-        notEmpty_.Notify(); // Wait morphing saves us
-    }
+  void Push(const T& x) {
+    MutexLockGuard lock(mutex_);
+    queue_.push(x);
+    notEmpty_.Notify(); // Wait morphing saves us
+  }

-    T Pop() {
-        MutexLockGuard lock(mutex_);
-        // always use a while-loop, due to spurious wakeup
-        while(queue_.empty()) {
-            notEmpty_.Wait();
-        }
-        assert(!queue_.empty());
-        T front(queue_.front());
-        queue_.pop();
-        return front;
+  T Pop() {
+    MutexLockGuard lock(mutex_);
+    // always use a while-loop, due to spurious wakeup
+    while (queue_.empty()) {
+      notEmpty_.Wait();
    }
+    assert(!queue_.empty());
+    T front(queue_.front());
+    queue_.pop();
+    return front;
+  }

-    size_t Size() const {
-        MutexLockGuard lock(mutex_);
-        return queue_.size();
-    }
-    bool Empty() const {
-        return Size() == 0;
-    }
+  size_t Size() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.size();
+  }
+  bool Empty() const {
+    return Size() == 0;
+  }

-private:
-    mutable MutexLock mutex_;
-    Condition         notEmpty_;
-    std::queue<T>     queue_;
+ private:
+  mutable MutexLock mutex_;
+  Condition         notEmpty_;
+  std::queue<T>     queue_;
 }; // class BlockingQueue

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedBlockingQueue.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
 #define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP

@ -25,59 +7,59 @@ namespace limonp {

 template<typename T>
 class BoundedBlockingQueue : NonCopyable {
-public:
-    explicit BoundedBlockingQueue(size_t maxSize)
-        : mutex_(),
-          notEmpty_(mutex_),
-          notFull_(mutex_),
-          queue_(maxSize) {
-    }
+ public:
+  explicit BoundedBlockingQueue(size_t maxSize)
+    : mutex_(),
+      notEmpty_(mutex_),
+      notFull_(mutex_),
+      queue_(maxSize) {
+  }

-    void Push(const T& x) {
-        MutexLockGuard lock(mutex_);
-        while(queue_.Full()) {
-            notFull_.Wait();
-        }
-        assert(!queue_.Full());
-        queue_.Push(x);
-        notEmpty_.Notify();
+  void Push(const T& x) {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Full()) {
+      notFull_.Wait();
    }
+    assert(!queue_.Full());
+    queue_.Push(x);
+    notEmpty_.Notify();
+  }

-    T Pop() {
-        MutexLockGuard lock(mutex_);
-        while(queue_.Empty()) {
-            notEmpty_.Wait();
-        }
-        assert(!queue_.Empty());
-        T res = queue_.Pop();
-        notFull_.Notify();
-        return res;
+  T Pop() {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Empty()) {
+      notEmpty_.Wait();
    }
+    assert(!queue_.Empty());
+    T res = queue_.Pop();
+    notFull_.Notify();
+    return res;
+  }

-    bool Empty() const {
-        MutexLockGuard lock(mutex_);
-        return queue_.Empty();
-    }
+  bool Empty() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Empty();
+  }

-    bool Full() const {
-        MutexLockGuard lock(mutex_);
-        return queue_.Full();
-    }
+  bool Full() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Full();
+  }

-    size_t size() const {
-        MutexLockGuard lock(mutex_);
-        return queue_.size();
-    }
+  size_t size() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.size();
+  }

-    size_t capacity() const {
-        return queue_.capacity();
-    }
+  size_t capacity() const {
+    return queue_.capacity();
+  }

-private:
-    mutable MutexLock          mutex_;
-    Condition                  notEmpty_;
-    Condition                  notFull_;
-    BoundedQueue<T>  queue_;
+ private:
+  mutable MutexLock          mutex_;
+  Condition                  notEmpty_;
+  Condition                  notFull_;
+  BoundedQueue<T>  queue_;
 }; // class BoundedBlockingQueue

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/BoundedQueue.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_BOUNDED_QUEUE_HPP
 #define LIMONP_BOUNDED_QUEUE_HPP

@ -27,55 +9,55 @@ namespace limonp {
 using namespace std;
 template<class T>
 class BoundedQueue {
-public:
-    explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) {
-        head_ = 0;
-        tail_ = 0;
-        size_ = 0;
-        assert(capacity_);
-    }
-    ~BoundedQueue() {
-    }
+ public:
+  explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) {
+    head_ = 0;
+    tail_ = 0;
+    size_ = 0;
+    assert(capacity_);
+  }
+  ~BoundedQueue() {
+  }

-    void Clear() {
-        head_ = 0;
-        tail_ = 0;
-        size_ = 0;
-    }
-    bool Empty() const {
-        return !size_;
-    }
-    bool Full() const {
-        return capacity_ == size_;
-    }
-    size_t Size() const {
-        return size_;
-    }
-    size_t Capacity() const {
-        return capacity_;
-    }
+  void Clear() {
+    head_ = 0;
+    tail_ = 0;
+    size_ = 0;
+  }
+  bool Empty() const {
+    return !size_;
+  }
+  bool Full() const {
+    return capacity_ == size_;
+  }
+  size_t Size() const {
+    return size_;
+  }
+  size_t Capacity() const {
+    return capacity_;
+  }

-    void Push(const T& t) {
-        assert(!Full());
-        circular_buffer_[tail_] = t;
-        tail_ = (tail_ + 1) % capacity_;
-        size_ ++;
-    }
+  void Push(const T& t) {
+    assert(!Full());
+    circular_buffer_[tail_] = t;
+    tail_ = (tail_ + 1) % capacity_;
+    size_ ++;
+  }

-    T Pop() {
-        assert(!Empty());
-        size_t oldPos = head_;
-        head_ = (head_ + 1) % capacity_;
-        size_ --;
-        return circular_buffer_[oldPos];
-    }
+  T Pop() {
+    assert(!Empty());
+    size_t oldPos = head_;
+    head_ = (head_ + 1) % capacity_;
+    size_ --;
+    return circular_buffer_[oldPos];
+  }

-private:
-    size_t head_;
-    size_t tail_;
-    size_t size_;
-    const size_t capacity_;
-    vector<T> circular_buffer_;
+ private:
+  size_t head_;
+  size_t tail_;
+  size_t size_;
+  const size_t capacity_;
+  vector<T> circular_buffer_;

 }; // class BoundedQueue
 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/Closure.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Closure.hpp
@ -1,222 +1,204 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_CLOSURE_HPP
 #define LIMONP_CLOSURE_HPP

 namespace limonp {

 class ClosureInterface {
-public:
-    virtual ~ClosureInterface() {
-    }
-    virtual void Run() = 0;
+ public:
+  virtual ~ClosureInterface() { // virtual so a closure can be deleted through a ClosureInterface pointer
+  }
+  virtual void Run() = 0; // performs the bound call; implemented by each ClosureN and ObjClosureN
 };

 template <class Funct>
 class Closure0: public ClosureInterface {
-public:
-    Closure0(Funct fun) {
-        fun_ = fun;
-    }
-    virtual ~Closure0() {
-    }
-    virtual void Run() {
-        (*fun_)();
-    }
-private:
-    Funct fun_;
-};
+ public:
+  Closure0(Funct fun) { // fun_ is default-constructed and then assigned
+    fun_ = fun;
+  }
+  virtual ~Closure0() {
+  }
+  virtual void Run() { // calls the stored function with no arguments; any return value is discarded
+    (*fun_)(); // fun_ is dereferenced first, so Funct must support unary * (NewClosure passes a function pointer)
+  }
+ private:
+  Funct fun_; // callable invoked by Run()
+}; // class Closure0

 template <class Funct, class Arg1>
 class Closure1: public ClosureInterface {
-public:
-    Closure1(Funct fun, Arg1 arg1) {
-        fun_ = fun;
-        arg1_ = arg1;
-    }
-    virtual ~Closure1() {
-    }
-    virtual void Run() {
-        (*fun_)(arg1_);
-    }
-private:
-    Funct fun_;
-    Arg1 arg1_;
-};
+ public:
+  Closure1(Funct fun, Arg1 arg1) { // members are default-constructed and then assigned
+    fun_ = fun;
+    arg1_ = arg1; // the argument is copied and kept until Run()
+  }
+  virtual ~Closure1() {
+  }
+  virtual void Run() { // calls the stored function with the stored argument; any return value is discarded
+    (*fun_)(arg1_);
+  }
+ private:
+  Funct fun_; // callable invoked by Run()
+  Arg1 arg1_; // argument captured at construction
+}; // class Closure1

 template <class Funct, class Arg1, class Arg2>
 class Closure2: public ClosureInterface {
-public:
-    Closure2(Funct fun, Arg1 arg1, Arg2 arg2) {
-        fun_ = fun;
-        arg1_ = arg1;
-        arg2_ = arg2;
-    }
-    virtual ~Closure2() {
-    }
-    virtual void Run() {
-        (*fun_)(arg1_, arg2_);
-    }
-private:
-    Funct fun_;
-    Arg1 arg1_;
-    Arg2 arg2_;
-};
+ public:
+  Closure2(Funct fun, Arg1 arg1, Arg2 arg2) { // members are default-constructed and then assigned
+    fun_ = fun;
+    arg1_ = arg1; // both arguments are copied and kept until Run()
+    arg2_ = arg2;
+  }
+  virtual ~Closure2() {
+  }
+  virtual void Run() { // calls the stored function with the two stored arguments; any return value is discarded
+    (*fun_)(arg1_, arg2_);
+  }
+ private:
+  Funct fun_; // callable invoked by Run()
+  Arg1 arg1_; // first captured argument
+  Arg2 arg2_; // second captured argument
+}; // class Closure2

 template <class Funct, class Arg1, class Arg2, class Arg3>
 class Closure3: public ClosureInterface {
-public:
-    Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
-        fun_ = fun;
-        arg1_ = arg1;
-        arg2_ = arg2;
-        arg3_ = arg3;
-    }
-    virtual ~Closure3() {
-    }
-    virtual void Run() {
-        (*fun_)(arg1_, arg2_, arg3_);
-    }
-private:
-    Funct fun_;
-    Arg1 arg1_;
-    Arg2 arg2_;
-    Arg3 arg3_;
-};
+ public:
+  Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { // members are default-constructed and then assigned
+    fun_ = fun;
+    arg1_ = arg1; // all three arguments are copied and kept until Run()
+    arg2_ = arg2;
+    arg3_ = arg3;
+  }
+  virtual ~Closure3() {
+  }
+  virtual void Run() { // calls the stored function with the three stored arguments; any return value is discarded
+    (*fun_)(arg1_, arg2_, arg3_);
+  }
+ private:
+  Funct fun_; // callable invoked by Run()
+  Arg1 arg1_; // first captured argument
+  Arg2 arg2_; // second captured argument
+  Arg3 arg3_; // third captured argument
+}; // class Closure3

-template <class Obj, class Funct>
+template <class Obj, class Funct> 
 class ObjClosure0: public ClosureInterface {
-public:
-    ObjClosure0(Obj* p, Funct fun) {
-        p_ = p;
-        fun_ = fun;
-    }
-    virtual ~ObjClosure0() {
-    }
-    virtual void Run() {
-        (p_->*fun_)();
-    }
-private:
-    Obj* p_;
-    Funct fun_;
-};
+ public:
+  ObjClosure0(Obj* p, Funct fun) { // binds a member function to an object; members are assigned, not initialized
+   p_ = p; // stored as-is; this class never deletes it, so the object must outlive Run()
+   fun_ = fun;
+  }
+  virtual ~ObjClosure0() {
+  }
+  virtual void Run() { // calls the member function on the stored object with no arguments
+    (p_->*fun_)(); // p_ is not checked for null
+  }
+ private:
+  Obj* p_; // target object, not owned
+  Funct fun_; // pointer to member function invoked by Run()
+}; // class ObjClosure0

-template <class Obj, class Funct, class Arg1>
+template <class Obj, class Funct, class Arg1> 
 class ObjClosure1: public ClosureInterface {
-public:
-    ObjClosure1(Obj* p, Funct fun, Arg1 arg1) {
-        p_ = p;
-        fun_ = fun;
-        arg1_ = arg1;
-    }
-    virtual ~ObjClosure1() {
-    }
-    virtual void Run() {
-        (p_->*fun_)(arg1_);
-    }
-private:
-    Obj* p_;
-    Funct fun_;
-    Arg1 arg1_;
-};
+ public:
+  ObjClosure1(Obj* p, Funct fun, Arg1 arg1) { // binds a member function and one argument to an object
+   p_ = p; // stored as-is; this class never deletes it, so the object must outlive Run()
+   fun_ = fun;
+   arg1_ = arg1; // the argument is copied and kept until Run()
+  }
+  virtual ~ObjClosure1() {
+  }
+  virtual void Run() { // calls the member function on the stored object with the stored argument
+    (p_->*fun_)(arg1_); // p_ is not checked for null
+  }
+ private:
+  Obj* p_; // target object, not owned
+  Funct fun_; // pointer to member function invoked by Run()
+  Arg1 arg1_; // argument captured at construction
+}; // class ObjClosure1

-template <class Obj, class Funct, class Arg1, class Arg2>
+template <class Obj, class Funct, class Arg1, class Arg2> 
 class ObjClosure2: public ClosureInterface {
-public:
-    ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) {
-        p_ = p;
-        fun_ = fun;
-        arg1_ = arg1;
-        arg2_ = arg2;
-    }
-    virtual ~ObjClosure2() {
-    }
-    virtual void Run() {
-        (p_->*fun_)(arg1_, arg2_);
-    }
-private:
-    Obj* p_;
-    Funct fun_;
-    Arg1 arg1_;
-    Arg2 arg2_;
-};
-template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
+ public:
+  ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) { // binds a member function and two arguments to an object
+   p_ = p; // stored as-is; this class never deletes it, so the object must outlive Run()
+   fun_ = fun;
+   arg1_ = arg1; // both arguments are copied and kept until Run()
+   arg2_ = arg2;
+  }
+  virtual ~ObjClosure2() {
+  }
+  virtual void Run() { // calls the member function on the stored object with the two stored arguments
+    (p_->*fun_)(arg1_, arg2_); // p_ is not checked for null
+  }
+ private:
+  Obj* p_; // target object, not owned
+  Funct fun_; // pointer to member function invoked by Run()
+  Arg1 arg1_; // first captured argument
+  Arg2 arg2_; // second captured argument
+}; // class ObjClosure2
+template <class Obj, class Funct, class Arg1, class Arg2, class Arg3> 
 class ObjClosure3: public ClosureInterface {
-public:
-    ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) {
-        p_ = p;
-        fun_ = fun;
-        arg1_ = arg1;
-        arg2_ = arg2;
-        arg3_ = arg3;
-    }
-    virtual ~ObjClosure3() {
-    }
-    virtual void Run() {
-        (p_->*fun_)(arg1_, arg2_, arg3_);
-    }
-private:
-    Obj* p_;
-    Funct fun_;
-    Arg1 arg1_;
-    Arg2 arg2_;
-    Arg3 arg3_;
-};
+ public:
+  ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { // binds a member function and three arguments to an object
+   p_ = p; // stored as-is; this class never deletes it, so the object must outlive Run()
+   fun_ = fun;
+   arg1_ = arg1; // all three arguments are copied and kept until Run()
+   arg2_ = arg2;
+   arg3_ = arg3;
+  }
+  virtual ~ObjClosure3() {
+  }
+  virtual void Run() { // calls the member function on the stored object with the three stored arguments
+    (p_->*fun_)(arg1_, arg2_, arg3_); // p_ is not checked for null
+  }
+ private:
+  Obj* p_; // target object, not owned
+  Funct fun_; // pointer to member function invoked by Run()
+  Arg1 arg1_; // first captured argument
+  Arg2 arg2_; // second captured argument
+  Arg3 arg3_; // third captured argument
+}; // class ObjClosure3

 template<class R>
-ClosureInterface* NewClosure(R(*fun)()) {
-    return new Closure0<R(*)()>(fun);
+ClosureInterface* NewClosure(R (*fun)()) { // wraps a free function that takes no arguments
+  return new Closure0<R (*)()>(fun); // allocated with new and never deleted in this header, so the caller must arrange deletion
 }

 template<class R, class Arg1>
-ClosureInterface* NewClosure(R(*fun)(Arg1), Arg1 arg1) {
-    return new Closure1<R(*)(Arg1), Arg1>(fun, arg1);
+ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) { // wraps a one-argument free function; Arg1 is deduced from both parameters and the deductions must agree
+  return new Closure1<R (*)(Arg1), Arg1>(fun, arg1); // allocated with new and never deleted in this header, so the caller must arrange deletion
 }

 template<class R, class Arg1, class Arg2>
-ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
-    return new Closure2<R(*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { // wraps a two-argument free function together with its arguments
+  return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2); // allocated with new and never deleted in this header, so the caller must arrange deletion
 }

 template<class R, class Arg1, class Arg2, class Arg3>
-ClosureInterface* NewClosure(R(*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
-    return new Closure3<R(*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { // wraps a three-argument free function together with its arguments
+  return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3); // allocated with new and never deleted in this header, so the caller must arrange deletion
 }

 template<class R, class Obj>
-ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)()) {
-    return new ObjClosure0<Obj, R(Obj::*)()>(obj, fun);
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) { // wraps a non-const member function with no arguments, bound to obj
+  return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun); // obj is stored but not owned; the closure itself is allocated with new and must be deleted by the caller
 }

 template<class R, class Obj, class Arg1>
-ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1), Arg1 arg1) {
-    return new ObjClosure1<Obj, R(Obj::*)(Arg1), Arg1>(obj, fun, arg1);
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) { // wraps a one-argument non-const member function bound to obj
+  return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1); // obj is stored but not owned; the closure itself is allocated with new and must be deleted by the caller
 }

 template<class R, class Obj, class Arg1, class Arg2>
-ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
-    return new ObjClosure2<Obj, R(Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { // wraps a two-argument non-const member function bound to obj
+  return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2); // obj is stored but not owned; the closure itself is allocated with new and must be deleted by the caller
 }

 template<class R, class Obj, class Arg1, class Arg2, class Arg3>
-ClosureInterface* NewClosure(Obj* obj, R(Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
-    return new ObjClosure3<Obj, R(Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { // wraps a three-argument non-const member function bound to obj
+  return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3); // obj is stored but not owned; the closure itself is allocated with new and must be deleted by the caller
 }

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/Colors.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Colors.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_COLOR_PRINT_HPP
 #define LIMONP_COLOR_PRINT_HPP

@ -27,21 +9,21 @@ namespace limonp {
 using std::string;

 enum Color {
-    BLACK = 30,
-    RED,
-    GREEN,
-    YELLOW,
-    BLUE,
-    PURPLE
+  BLACK = 30, // each value is the number ColorPrintln inserts into its ESC[0;Nm escape sequence
+  RED, // 31
+  GREEN, // 32
+  YELLOW, // 33
+  BLUE, // 34
+  PURPLE // 35
 }; // enum Color

 static void ColorPrintln(enum Color color, const char * fmt, ...) { // printf-style: prints one colored line to stdout
-    va_list ap;
-    printf("\033[0;%dm", color);
-    va_start(ap, fmt);
-    vprintf(fmt, ap);
-    va_end(ap);
-    printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly
+  va_list ap;
+  printf("\033[0;%dm", color); // switch the terminal to the requested color
+  va_start(ap, fmt);
+  vprintf(fmt, ap); // fmt is used as the format string, so it must not come from untrusted input
+  va_end(ap);
+  printf("\033[0m\n"); // reset the attributes and end the line; without the \n, following lines can unexpectedly keep the same color
 }

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/Condition.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Condition.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_CONDITION_HPP
 #define LIMONP_CONDITION_HPP

@ -24,31 +6,31 @@
 namespace limonp {

 class Condition : NonCopyable {
-public:
-    explicit Condition(MutexLock& mutex)
-        : mutex_(mutex) {
-        XCHECK(!pthread_cond_init(&pcond_, NULL));
-    }
+ public:
+  explicit Condition(MutexLock& mutex) // wraps a pthread condition variable tied to the given mutex
+    : mutex_(mutex) { // only a reference is kept, so the MutexLock must outlive this object
+    XCHECK(!pthread_cond_init(&pcond_, NULL)); // NULL selects default attributes; XCHECK is defined outside this view, presumably it fails hard on a nonzero return (confirm)
+  }

-    ~Condition() {
-        XCHECK(!pthread_cond_destroy(&pcond_));
-    }
+  ~Condition() {
+    XCHECK(!pthread_cond_destroy(&pcond_)); // releases the condition variable
+  }

-    void Wait() {
-        XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
-    }
+  void Wait() { // blocks until notified; per POSIX the caller must already hold mutex_ and should re-check its predicate because wakeups can be spurious
+    XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
+  }

-    void Notify() {
-        XCHECK(!pthread_cond_signal(&pcond_));
-    }
+  void Notify() { // wakes at least one waiter, if any
+    XCHECK(!pthread_cond_signal(&pcond_));
+  }

-    void NotifyAll() {
-        XCHECK(!pthread_cond_broadcast(&pcond_));
-    }
+  void NotifyAll() { // wakes every current waiter
+    XCHECK(!pthread_cond_broadcast(&pcond_));
+  }

-private:
-    MutexLock& mutex_;
-    pthread_cond_t pcond_;
+ private:
+  MutexLock& mutex_; // mutex passed to pthread_cond_wait; not owned
+  pthread_cond_t pcond_; // underlying condition variable
 }; // class Condition

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/Config.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Config.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 /************************************
 * file enc : utf8
 * author   : wuyanyi09@gmail.com
@ -34,86 +16,86 @@ namespace limonp {
 using namespace std;

 class Config {
-public:
-    explicit Config(const string& filePath) {
-        LoadFile(filePath);
-    }
+ public:
+  explicit Config(const string& filePath) { // loads key=value pairs from the given file at construction
+    LoadFile(filePath);
+  }

-    operator bool () {
-        return !map_.empty();
-    }
+  operator bool () { // true when at least one key was loaded; non-explicit and non-const
+    return !map_.empty();
+  }

-    string Get(const string& key, const string& defaultvalue) const {
-        map<string, string>::const_iterator it = map_.find(key);
-        if(map_.end() != it) {
-            return it->second;
-        }
-        return defaultvalue;
+  string Get(const string& key, const string& defaultvalue) const { // returns the stored value, or defaultvalue when the key is absent
+    map<string, string>::const_iterator it = map_.find(key);
+    if(map_.end() != it) {
+      return it->second;
     }
-    int Get(const string& key, int defaultvalue) const {
-        string str = Get(key, "");
-        if("" == str) {
-            return defaultvalue;
-        }
-        return atoi(str.c_str());
+    return defaultvalue;
+  }
+  int Get(const string& key, int defaultvalue) const { // integer variant: a missing or empty value yields defaultvalue
+    string str = Get(key, "");
+    if("" == str) {
+      return defaultvalue;
     }
-    const char* operator [](const char* key) const {
-        if(NULL == key) {
-            return NULL;
-        }
-        map<string, string>::const_iterator it = map_.find(key);
-        if(map_.end() != it) {
-            return it->second.c_str();
-        }
-        return NULL;
+    return atoi(str.c_str()); // atoi returns 0 for non-numeric text, so a malformed value is indistinguishable from 0
+  }
+  const char* operator [] (const char* key) const { // C-string lookup; returns NULL for a NULL key or a missing key
+    if(NULL == key) {
+      return NULL;
     }
+    map<string, string>::const_iterator it = map_.find(key);
+    if(map_.end() != it) {
+      return it->second.c_str(); // points into map_, so it is valid only while this Config is alive
+    }
+    return NULL;
+  }

-    string GetConfigInfo() const {
-        string res;
-        res << *this;
-        return res;
+  string GetConfigInfo() const { // renders the whole map as text
+    string res;
+    res << *this; // relies on a string << overload defined outside this view (presumably in StringUtil, confirm)
+    return res;
+  }
+
+ private:
+  void LoadFile(const string& filePath) { // parses the file line by line into map_
+    ifstream ifs(filePath.c_str());
+    assert(ifs); // with asserts disabled an unreadable file silently leaves map_ empty
+    string line;
+    vector<string> vecBuf;
+    size_t lineno = 0; // NOTE(review): incremented below but never read; the error messages do not report it
+    while(getline(ifs, line)) {
+      lineno ++;
+      Trim(line);
+      if(line.empty() || StartsWith(line, "#")) { // skip blank lines and # comments
+        continue;
+      }
+      vecBuf.clear();
+      Split(line, vecBuf, "="); // Split is defined outside this view; a value that itself contains = presumably yields more than two parts and is rejected (confirm)
+      if(2 != vecBuf.size()) {
+        fprintf(stderr, "line[%s] illegal.\n", line.c_str());
+        assert(false); // stops here when asserts are enabled; otherwise the line is skipped
+        continue;
+      }
+      string& key = vecBuf[0];
+      string& value = vecBuf[1];
+      Trim(key);
+      Trim(value);
+      if(!map_.insert(make_pair(key, value)).second) { // insert refuses duplicates, so the first occurrence of a key wins
+        fprintf(stderr, "key[%s] already exits.\n", key.c_str());
+        assert(false); // stops here when asserts are enabled; otherwise the duplicate is skipped
+        continue;
+      }
     }
+    ifs.close(); // explicit close; the ifstream destructor would also close it
+  }

-private:
-    void LoadFile(const string& filePath) {
-        ifstream ifs(filePath.c_str());
-        assert(ifs);
-        string line;
-        vector<string> vecBuf;
-        size_t lineno = 0;
-        while(getline(ifs, line)) {
-            lineno ++;
-            Trim(line);
-            if(line.empty() || StartsWith(line, "#")) {
-                continue;
-            }
-            vecBuf.clear();
-            Split(line, vecBuf, "=");
-            if(2 != vecBuf.size()) {
-                fprintf(stderr, "line[%s] illegal.\n", line.c_str());
-                assert(false);
-                continue;
-            }
-            string& key = vecBuf[0];
-            string& value = vecBuf[1];
-            Trim(key);
-            Trim(value);
-            if(!map_.insert(make_pair(key, value)).second) {
-                fprintf(stderr, "key[%s] already exits.\n", key.c_str());
-                assert(false);
-                continue;
-            }
-        }
-        ifs.close();
-    }
+  friend ostream& operator << (ostream& os, const Config& config); // the stream operator needs access to map_

-    friend ostream& operator << (ostream& os, const Config& config);
-
-    map<string, string> map_;
+  map<string, string> map_; // trimmed key to trimmed value
 }; // class Config

 inline ostream& operator << (ostream& os, const Config& config) { // streams the key/value map of a Config
-    return os << config.map_;
+  return os << config.map_; // relies on a map << overload defined outside this view
 }

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/FileLock.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_FILELOCK_HPP
 #define LIMONP_FILELOCK_HPP

@ -33,58 +15,58 @@ namespace limonp {
 using std::string;

 class FileLock {
-public:
-    FileLock() : fd_(-1), ok_(true) {
+ public:
+  FileLock() : fd_(-1), ok_(true) { // starts with no file open and no error recorded
+  }
+  ~FileLock() {
+    if(fd_ > 0) { // NOTE(review): 0 is a valid descriptor from open(), so fd_ >= 0 looks intended; as written descriptor 0 would never be closed here
+      Close();
     }
-    ~FileLock() {
-        if(fd_ > 0) {
-            Close();
-        }
+  }
+  void Open(const string& fname) { // opens the lock file read-write, creating it with mode 0644 if missing
+    assert(fd_ == -1); // Open is expected to be called once per object
+    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if(fd_ < 0) {
+      ok_ = false;
+      err_ = strerror(errno); // keep the reason for Error()
     }
-    void Open(const string& fname) {
-        assert(fd_ == -1);
-        fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
-        if(fd_ < 0) {
-            ok_ = false;
-            err_ = strerror(errno);
-        }
+  }
+  void Close() {
+    ::close(fd_); // NOTE(review): fd_ is not reset to -1, so an explicit Close() followed by the destructor closes the same descriptor twice
+  }
+  void Lock() { // tries to take an exclusive lock on the whole file
+    if(LockOrUnlock(fd_, true) < 0) { // F_SETLK does not wait: a lock held elsewhere makes this fail immediately
+      ok_ = false;
+      err_ = strerror(errno);
     }
-    void Close() {
-        ::close(fd_);
-    }
-    void Lock() {
-        if(LockOrUnlock(fd_, true) < 0) {
-            ok_ = false;
-            err_ = strerror(errno);
-        }
-    }
-    void UnLock() {
-        if(LockOrUnlock(fd_, false) < 0) {
-            ok_ = false;
-            err_ = strerror(errno);
-        }
-    }
-    bool Ok() const {
-        return ok_;
-    }
-    string Error() const {
-        return err_;
-    }
-private:
-    static int LockOrUnlock(int fd, bool lock) {
-        errno = 0;
-        struct flock f;
-        memset(&f, 0, sizeof(f));
-        f.l_type = (lock ? F_WRLCK : F_UNLCK);
-        f.l_whence = SEEK_SET;
-        f.l_start = 0;
-        f.l_len = 0;        // Lock/unlock entire file
-        return fcntl(fd, F_SETLK, &f);
+  }
+  void UnLock() { // releases the lock on the whole file
+    if(LockOrUnlock(fd_, false) < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
     }
+  }
+  bool Ok() const { // false once any Open/Lock/UnLock has failed; nothing in this class sets it back to true
+    return ok_;
+  }
+  string Error() const { // strerror text of the most recent failure, empty if none
+    return err_;
+  }
+ private:
+  static int LockOrUnlock(int fd, bool lock) { // returns the fcntl result: negative on failure with errno set
+    errno = 0;
+    struct flock f;
+    memset(&f, 0, sizeof(f));
+    f.l_type = (lock ? F_WRLCK : F_UNLCK); // write lock to acquire, unlock to release
+    f.l_whence = SEEK_SET;
+    f.l_start = 0;
+    f.l_len = 0;        // Lock/unlock entire file
+    return fcntl(fd, F_SETLK, &f);
+  }

-    int fd_;
-    bool ok_;
-    string err_;
+  int fd_; // descriptor of the lock file, -1 until Open succeeds
+  bool ok_; // sticky success flag reported by Ok()
+  string err_; // last error message reported by Error()
 }; // class FileLock

 }// namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ForcePublic.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_FORCE_PUBLIC_H
 #define LIMONP_FORCE_PUBLIC_H

--- a/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/LocalVector.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_LOCAL_VECTOR_HPP
 #define LIMONP_LOCAL_VECTOR_HPP

@ -33,123 +15,126 @@ using namespace std;
 const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
 template <class T>
 class LocalVector {
-public:
-    typedef const T* const_iterator ;
-    typedef T value_type;
-    typedef size_t size_type;
-private:
-    T buffer_[LOCAL_VECTOR_BUFFER_SIZE];
-    T * ptr_;
-    size_t size_;
-    size_t capacity_;
-public:
-    LocalVector() {
-        init_();
-    };
-    LocalVector(const LocalVector<T>& vec) {
-        init_();
-        *this = vec;
+ public:
+  typedef const T* const_iterator ; // only read-only iteration is offered
+  typedef T value_type;
+  typedef size_t size_type;
+ private:
+  T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; // inline storage used until more than LOCAL_VECTOR_BUFFER_SIZE elements are needed
+  T * ptr_; // current storage: either buffer_ or a malloc block
+  size_t size_; // number of elements in use
+  size_t capacity_; // number of elements ptr_ can hold
+ public:
+  LocalVector() { // empty vector backed by the inline buffer
+    init_();
+  };
+  LocalVector(const LocalVector<T>& vec) { // copy construction is init_ followed by assignment
+    init_();
+    *this = vec;
+  }
+  LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
+    init_();
+    while(begin != end) {
+      push_back(*begin++);
     }
-    LocalVector(const_iterator  begin, const_iterator end) { // TODO: make it faster
-        init_();
-        while(begin != end) {
-            push_back(*begin++);
-        }
+  }
+  LocalVector(size_t size, const T& t) { // TODO: make it faster
+    init_();
+    while(size--) {
+      push_back(t);
     }
-    LocalVector(size_t size, const T& t) { // TODO: make it faster
-        init_();
-        while(size--) {
-            push_back(t);
-        }
+  }
+  ~LocalVector() {
+    if(ptr_ != buffer_) { // only heap storage needs releasing
+      free(ptr_); // NOTE(review): storage is handled with malloc/memcpy/free and no element destructors run, so T must be trivially copyable
     }
-    ~LocalVector() {
-        if(ptr_ != buffer_) {
-            free(ptr_);
-        }
-    };
-public:
-    LocalVector<T>& operator = (const LocalVector<T>& vec) {
-        clear();
-        size_ = vec.size();
-        capacity_ = vec.capacity();
-        if(vec.buffer_ == vec.ptr_) {
-            memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
-            ptr_ = buffer_;
-        } else {
-            ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
-            assert(ptr_);
-            memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
-        }
-        return *this;
+  };
+ public:
+  LocalVector<T>& operator = (const LocalVector<T>& vec) {
+      if(this == &vec){ // self-assignment guard: without it the clear() below would discard the data being copied
+          return *this;
+      }
+    clear(); // drop any heap block and fall back to the inline buffer
+    size_ = vec.size();
+    capacity_ = vec.capacity();
+    if(vec.buffer_ == vec.ptr_) { // source still uses its inline buffer
+      memcpy(buffer_, vec.buffer_, sizeof(T) * size_);
+      ptr_ = buffer_;
+    } else { // source is on the heap: allocate the same capacity and copy the used part
+      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
+      assert(ptr_); // allocation failure is only caught when asserts are enabled
+      memcpy(ptr_, vec.ptr_, vec.size() * sizeof(T));
     }
-private:
-    void init_() {
-        ptr_ = buffer_;
-        size_ = 0;
-        capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
+    return *this;
+  }
+ private:
+  void init_() { // resets to the empty state on the inline buffer; does not free anything
+    ptr_ = buffer_;
+    size_ = 0;
+    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
+  }
+ public:
+  T& operator [] (size_t i) { // unchecked element access
+    return ptr_[i];
+  }
+  const T& operator [] (size_t i) const { // unchecked element access
+    return ptr_[i];
+  }
+  void push_back(const T& t) { // appends a copy of t, doubling the capacity when full
+    if(size_ == capacity_) {
+      assert(capacity_); // doubling a zero capacity would not grow
+      reserve(capacity_ * 2);
     }
-public:
-    T& operator [](size_t i) {
-        return ptr_[i];
+    ptr_[size_ ++ ] = t;
+  }
+  void reserve(size_t size) { // grows the storage to hold at least `size` elements; never shrinks
+    if(size <= capacity_) {
+      return;
     }
-    const T& operator [](size_t i) const {
-        return ptr_[i];
+    T * next =  (T*)malloc(sizeof(T) * size);
+    assert(next); // allocation failure is only caught when asserts are enabled
+    T * old = ptr_;
+    ptr_ = next;
+    memcpy(ptr_, old, sizeof(T) * capacity_); // copies the whole old storage, not just the size_ used elements
+    capacity_ = size;
+    if(old != buffer_) { // the inline buffer must never be passed to free
+      free(old);
     }
-    void push_back(const T& t) {
-        if(size_ == capacity_) {
-            assert(capacity_);
-            reserve(capacity_ * 2);
-        }
-        ptr_[size_ ++ ] = t;
-    }
-    void reserve(size_t size) {
-        if(size <= capacity_) {
-            return;
-        }
-        T * next = (T*)malloc(sizeof(T) * size);
-        assert(next);
-        T * old = ptr_;
-        ptr_ = next;
-        memcpy(ptr_, old, sizeof(T) * capacity_);
-        capacity_ = size;
-        if(old != buffer_) {
-            free(old);
-        }
-    }
-    bool empty() const {
-        return 0 == size();
-    }
-    size_t size() const {
-        return size_;
-    }
-    size_t capacity() const {
-        return capacity_;
-    }
-    const_iterator begin() const {
-        return ptr_;
-    }
-    const_iterator end() const {
-        return ptr_ + size_;
-    }
-    void clear() {
-        if(ptr_ != buffer_) {
-            free(ptr_);
-        }
-        init_();
+  }
+  bool empty() const { // true when no element is stored
+    return 0 == size();
+  }
+  size_t size() const { // number of stored elements
+    return size_;
+  }
+  size_t capacity() const { // number of elements the current storage can hold
+    return capacity_;
+  }
+  const_iterator begin() const { // pointer to the first element
+    return ptr_;
+  }
+  const_iterator end() const { // pointer one past the last element
+    return ptr_ + size_;
+  }
+  void clear() { // removes all elements and releases any heap block, returning to the inline buffer
+    if(ptr_ != buffer_) {
+      free(ptr_);
     }
+    init_();
+  }
 };

 template <class T>
 ostream & operator << (ostream& os, const LocalVector<T>& vec) { // prints the elements as a bracketed, comma-separated list of quoted items
-    if(vec.empty()) {
-        return os << "[]";
-    }
-    os << "[\"" << vec[0];
-    for(size_t i = 1; i < vec.size(); i++) {
-        os << "\", \"" << vec[i];
-    }
-    os << "\"]";
-    return os;
+  if(vec.empty()) { // an empty vector prints as a bare pair of brackets
+    return os << "[]";
+  }
+  os<<"[\""<<vec[0]; // open the list and the first quoted element
+  for(size_t i = 1; i < vec.size(); i++) {
+    os<<"\", \""<<vec[i]; // close the previous quote, add the separator, open the next quote
+  }
+  os<<"\"]"; // close the last quote and the list; quotes inside elements are not escaped
+  return os;
 }

 }
--- a/libchinese-segmentation/cppjieba/limonp/Logging.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Logging.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_LOGGING_HPP
 #define LIMONP_LOGGING_HPP

@ -38,55 +20,56 @@
 namespace limonp {

 enum {
-    LL_DEBUG = 0,
-    LL_INFO = 1,
-    LL_WARNING = 2,
-    LL_ERROR = 3,
-    LL_FATAL = 4,
+  LL_DEBUG = 0, // each value doubles as the index into LOG_LEVEL_ARRAY
+  LL_INFO = 1,
+  LL_WARNING = 2,
+  LL_ERROR = 3,
+  LL_FATAL = 4, // the Logger destructor calls abort() for this level
 }; // enum

-static const char * LOG_LEVEL_ARRAY[] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"};
-static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S";
+static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; // level names indexed by the LL_* values; five entries, valid indices 0 to 4

 class Logger {
-public:
-    Logger(size_t level, const char* filename, int lineno)
-        : level_(level) {
+ public:
+  Logger(size_t level, const char* filename, int lineno) // builds the prefix of one log message: timestamp, file, line and level name
+   : level_(level) {
 #ifdef LOGGING_LEVEL
-        if(level_ < LOGGING_LEVEL) {
-            return;
-        }
+     if (level_ < LOGGING_LEVEL) { // below the compile-time threshold: skip building the prefix
+       return;
+     }
 #endif
-        assert(level_ <= sizeof(LOG_LEVEL_ARRAY) / sizeof(*LOG_LEVEL_ARRAY));
-        char buf[32];
-        time_t now;
-        time(&now);
-        strftime(buf, sizeof(buf), LOG_TIME_FORMAT, localtime(&now));
-        stream_ << buf
-                << " " << filename
-                << ":" << lineno
-                << " " << LOG_LEVEL_ARRAY[level_]
-                << " ";
-    }
-    ~Logger() {
+    assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); // NOTE(review): <= admits level_ equal to the array length, which indexes one past the last entry below; < looks intended
+    char buf[32]; // the 19-character timestamp plus terminator fits
+    time_t now;
+    time(&now);
+    struct tm result;
+    localtime_r(&now, &result); // writes into a caller-provided struct instead of the shared static buffer localtime uses
+    strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &result);
+    stream_ << buf
+      << " " << filename
+      << ":" << lineno
+      << " " << LOG_LEVEL_ARRAY[level_]
+      << " ";
+  }
+  ~Logger() { // emits the accumulated message when the Logger object is destroyed
 #ifdef LOGGING_LEVEL
-        if(level_ < LOGGING_LEVEL) {
-            return;
-        }
+     if (level_ < LOGGING_LEVEL) { // below the compile-time threshold: print nothing
+       return;
+     }
 #endif
-        std::cerr << stream_.str() << std::endl;
-        if(level_ == LL_FATAL) {
-            abort();
-        }
+    std::cerr << stream_.str() << std::endl; // one line per message, flushed immediately
+    if (level_ == LL_FATAL) { // fatal messages terminate the process after being printed
+      abort();
     }
+  }

-    std::ostream& Stream() {
-        return stream_;
-    }
+  std::ostream& Stream() { // callers append the message body through this stream
+    return stream_;
+  }

-private:
-    std::ostringstream stream_;
-    size_t level_;
+ private:
+  std::ostringstream stream_; // prefix plus message body, written out by the destructor
+  size_t level_; // one of the LL_* values
 }; // class Logger

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/MutexLock.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_MUTEX_LOCK_HPP
 #define LIMONP_MUTEX_LOCK_HPP

@ -26,40 +8,40 @@
 namespace limonp {

 class MutexLock: NonCopyable {
-public:
-    MutexLock() {
-        XCHECK(!pthread_mutex_init(&mutex_, NULL));
-    }
-    ~MutexLock() {
-        XCHECK(!pthread_mutex_destroy(&mutex_));
-    }
-    pthread_mutex_t* GetPthreadMutex() {
-        return &mutex_;
-    }
+ public: // Wrapper that owns one pthread_mutex_t: initialised in the constructor, destroyed in the destructor.
+  MutexLock() {
+    XCHECK(!pthread_mutex_init(&mutex_, NULL)); // NULL attribute pointer = default mutex attributes; the pthread calls return 0 on success, hence the negation
+  }
+  ~MutexLock() {
+    XCHECK(!pthread_mutex_destroy(&mutex_)); // XCHECK is defined outside this header; presumably it reports a failed expression - confirm
+  }
+  pthread_mutex_t* GetPthreadMutex() { // exposes the raw handle to code that needs the pthread type directly
+    return &mutex_;
+  }

-private:
-    void Lock() {
-        XCHECK(!pthread_mutex_lock(&mutex_));
-    }
-    void Unlock() {
-        XCHECK(!pthread_mutex_unlock(&mutex_));
-    }
-    friend class MutexLockGuard;
+ private: // Lock/Unlock are private: only the friend MutexLockGuard can call them
+  void Lock() {
+    XCHECK(!pthread_mutex_lock(&mutex_));
+  }
+  void Unlock() {
+    XCHECK(!pthread_mutex_unlock(&mutex_));
+  }
+  friend class MutexLockGuard;

-    pthread_mutex_t mutex_;
+  pthread_mutex_t mutex_; // the wrapped mutex
 }; // class MutexLock

 class MutexLockGuard: NonCopyable {
-public:
-    explicit MutexLockGuard(MutexLock & mutex)
-        : mutex_(mutex) {
-        mutex_.Lock();
-    }
-    ~MutexLockGuard() {
-        mutex_.Unlock();
-    }
-private:
-    MutexLock & mutex_;
+ public: // Scope guard: locks the given MutexLock on construction and unlocks it on destruction.
+  explicit MutexLockGuard(MutexLock & mutex)
+    : mutex_(mutex) {
+    mutex_.Lock(); // allowed because MutexLock declares this class a friend
+  }
+  ~MutexLockGuard() {
+    mutex_.Unlock();
+  }
+ private:
+  MutexLock & mutex_; // held by reference: the MutexLock must outlive the guard
 }; // class MutexLockGuard

 #define MutexLockGuard(x) XCHECK(false);
--- a/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/NonCopyable.hpp
@ -1,35 +1,19 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
+/************************************
+ ************************************/
 #ifndef LIMONP_NONCOPYABLE_H
 #define LIMONP_NONCOPYABLE_H

 namespace limonp {

 class NonCopyable {
-protected:
-    NonCopyable() {
-    }
-    ~NonCopyable() {
-    }
-private:
-    NonCopyable(const NonCopyable&);
-    const NonCopyable& operator=(const NonCopyable&);
+ protected: // Base class used to forbid copying; construction and destruction are open to derived classes only.
+  NonCopyable() {
+  }
+  ~NonCopyable() {
+  }
+ private: // copy operations are declared private, so code outside this class cannot copy or assign a derived object
+  NonCopyable(const NonCopyable& );
+  const NonCopyable& operator=(const NonCopyable& );
 }; // class NonCopyable

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StdExtension.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_STD_EXTEMSION_HPP
 #define LIMONP_STD_EXTEMSION_HPP

@ -51,123 +33,123 @@ namespace std {

 template<typename T>
 ostream& operator << (ostream& os, const vector<T>& v) {
-    if(v.empty()) {
-        return os << "[]";
-    }
-    os << "[" << v[0];
-    for(size_t i = 1; i < v.size(); i++) {
-        os << ", " << v[i];
-    }
-    os << "]";
-    return os;
+  if(v.empty()) {
+    return os << "[]";
+  }
+  os<<"["<<v[0];
+  for(size_t i = 1; i < v.size(); i++) {
+    os<<", "<<v[i];
+  }
+  os<<"]";
+  return os;
 }

 template<>
 inline ostream& operator << (ostream& os, const vector<string>& v) {
-    if(v.empty()) {
-        return os << "[]";
-    }
-    os << "[\"" << v[0];
-    for(size_t i = 1; i < v.size(); i++) {
-        os << "\", \"" << v[i];
-    }
-    os << "\"]";
-    return os;
+  if(v.empty()) {
+    return os << "[]";
+  }
+  os<<"[\""<<v[0];
+  for(size_t i = 1; i < v.size(); i++) {
+    os<<"\", \""<<v[i];
+  }
+  os<<"\"]";
+  return os;
 }

 template<typename T>
 ostream& operator << (ostream& os, const deque<T>& dq) {
-    if(dq.empty()) {
-        return os << "[]";
-    }
-    os << "[\"" << dq[0];
-    for(size_t i = 1; i < dq.size(); i++) {
-        os << "\", \"" << dq[i];
-    }
-    os << "\"]";
-    return os;
+  if(dq.empty()) {
+    return os << "[]";
+  }
+  os<<"[\""<<dq[0];
+  for(size_t i = 1; i < dq.size(); i++) {
+    os<<"\", \""<<dq[i];
+  }
+  os<<"\"]";
+  return os;
 }


 template<class T1, class T2>
 ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
-    os << pr.first << ":" << pr.second ;
-    return os;
+  os << pr.first << ":" << pr.second ;
+  return os;
 }


 template<class T>
 string& operator << (string& str, const T& obj) {
-    stringstream ss;
-    ss << obj; // call ostream& operator << (ostream& os,
-    return str = ss.str();
+  stringstream ss;
+  ss << obj; // call ostream& operator << (ostream& os,
+  return str = ss.str();
 }

 template<class T1, class T2>
 ostream& operator << (ostream& os, const map<T1, T2>& mp) {
-    if(mp.empty()) {
-        os << "{}";
-        return os;
-    }
-    os << '{';
-    typename map<T1, T2>::const_iterator it = mp.begin();
-    os << *it;
-    it++;
-    while(it != mp.end()) {
-        os << ", " << *it;
-        it++;
-    }
-    os << '}';
+  if(mp.empty()) {
+    os<<"{}";
    return os;
+  }
+  os<<'{';
+  typename map<T1, T2>::const_iterator it = mp.begin();
+  os<<*it;
+  it++;
+  while(it != mp.end()) {
+    os<<", "<<*it;
+    it++;
+  }
+  os<<'}';
+  return os;
 }
 template<class T1, class T2>
 ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
-    if(mp.empty()) {
-        return os << "{}";
-    }
-    os << '{';
-    typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
-    os << *it;
-    it++;
-    while(it != mp.end()) {
-        os << ", " << *it++;
-    }
-    return os << '}';
+  if(mp.empty()) {
+    return os << "{}";
+  }
+  os<<'{';
+  typename std::unordered_map<T1, T2>::const_iterator it = mp.begin();
+  os<<*it;
+  it++;
+  while(it != mp.end()) {
+    os<<", "<<*it++;
+  }
+  return os<<'}';
 }

 template<class T>
 ostream& operator << (ostream& os, const set<T>& st) {
-    if(st.empty()) {
-        os << "{}";
-        return os;
-    }
-    os << '{';
-    typename set<T>::const_iterator it = st.begin();
-    os << *it;
-    it++;
-    while(it != st.end()) {
-        os << ", " << *it;
-        it++;
-    }
-    os << '}';
+  if(st.empty()) {
+    os << "{}";
    return os;
+  }
+  os<<'{';
+  typename set<T>::const_iterator it = st.begin();
+  os<<*it;
+  it++;
+  while(it != st.end()) {
+    os<<", "<<*it;
+    it++;
+  }
+  os<<'}';
+  return os;
 }

 template<class KeyType, class ContainType>
 bool IsIn(const ContainType& contain, const KeyType& key) {
-    return contain.end() != contain.find(key);
+  return contain.end() != contain.find(key);
 }

 template<class T>
 basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) {
-    return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
+  return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>());
 }

 template<class T>
 ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
-    ostreambuf_iterator<T> itr(ofs);
-    copy(s.begin(), s.end(), itr);
-    return ofs;
+  ostreambuf_iterator<T> itr (ofs);
+  copy(s.begin(), s.end(), itr);
+  return ofs;
 }

 } // namespace std
--- a/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/StringUtil.hpp
@ -1,27 +1,14 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 /************************************
 * file enc : ascii
 * author   : wuyanyi09@gmail.com
 ************************************/
 #ifndef LIMONP_STR_FUNCTS_H
 #define LIMONP_STR_FUNCTS_H
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <memory.h>
+#include <sys/types.h>
 #include <fstream>
 #include <iostream>
 #include <string>
@ -29,14 +16,9 @@
 #include <algorithm>
 #include <cctype>
 #include <map>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <memory.h>
 #include <functional>
 #include <locale>
 #include <sstream>
-#include <sys/types.h>
 #include <iterator>
 #include <algorithm>
 #include "StdExtension.hpp"
@ -44,339 +26,356 @@
 namespace limonp {
 using namespace std;
 inline string StringFormat(const char* fmt, ...) {
-    int size = 256;
-    std::string str;
-    va_list ap;
-    while(1) {
-        str.resize(size);
-        va_start(ap, fmt);
-        int n = vsnprintf((char *)str.c_str(), size, fmt, ap);
-        va_end(ap);
-        if(n > -1 && n < size) {
-            str.resize(n);
-            return str;
-        }
-        if(n > -1)
-            size = n + 1;
-        else
-            size *= 2;
+  // printf-style formatting into a std::string. Starts with a 256-byte
+  // buffer and retries with a larger one until vsnprintf reports that the
+  // whole output fit; the result is then trimmed to the formatted length.
+  int size = 256;
+  std::string str;
+  va_list ap;
+  while (1) {
+    str.resize(size);
+    va_start(ap, fmt); // restarted on every attempt: a va_list cannot be reused after vsnprintf consumed it
+    // Write into the string's own mutable storage; the buffer returned by
+    // c_str() is const and must not be modified.
+    int n = vsnprintf(&str[0], size, fmt, ap);
+    va_end(ap);
+    if (n > -1 && n < size) {
+      str.resize(n);
+      return str;
    }
-    return str;
+    if (n > -1)
+      size = n + 1; // n is the length needed, excluding the terminating NUL
+    else
+      size *= 2;    // negative return: needed length unknown, so grow geometrically
+  }
+  return str;
 }

 template<class T>
 void Join(T begin, T end, string& res, const string& connector) {
-    if(begin == end) {
-        return;
-    }
-    stringstream ss;
-    ss << *begin;
-    begin++;
-    while(begin != end) {
-        ss << connector << *begin;
-        begin ++;
-    }
-    res = ss.str();
+  if(begin == end) {
+    return;
+  }
+  stringstream ss;
+  ss<<*begin;
+  begin++;
+  while(begin != end) {
+    ss << connector << *begin;
+    begin ++;
+  }
+  res = ss.str();
 }

 template<class T>
 string Join(T begin, T end, const string& connector) {
-    string res;
-    Join(begin, end, res, connector);
-    return res;
+  string res;
+  Join(begin ,end, res, connector);
+  return res;
 }

 inline string& Upper(string& str) {
-    transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
-    return str;
+  transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
+  return str;
 }

 inline string& Lower(string& str) {
-    transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
-    return str;
+  transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
+  return str;
 }

 inline bool IsSpace(unsigned c) {
-    // when passing large int as the argument of isspace, it core dump, so here need a type cast.
-    return c > 0xff ? false : std::isspace(c & 0xff) != 0;
+  // Passing a large integer to isspace can crash (core dump), so values above 0xff are rejected before the call.
+  return c > 0xff ? false : std::isspace(c & 0xff); // isspace returns int; it converts to bool on return
 }

 inline std::string& LTrim(std::string &s) {
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
-    return s;
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))));
+  return s;
 }

 inline std::string& RTrim(std::string &s) {
-    s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
-    return s;
+  s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun<unsigned, bool>(IsSpace))).base(), s.end());
+  return s;
 }

 inline std::string& Trim(std::string &s) {
-    return LTrim(RTrim(s));
+  return LTrim(RTrim(s));
 }

 inline std::string& LTrim(std::string & s, char x) {
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
-    return s;
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
+  return s;
 }

 inline std::string& RTrim(std::string & s, char x) {
-    s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
-    return s;
+  s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
+  return s;
 }

 inline std::string& Trim(std::string &s, char x) {
-    return LTrim(RTrim(s, x), x);
+  return LTrim(RTrim(s, x), x);
 }

 inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
-    res.clear();
-    size_t Start = 0;
-    size_t end = 0;
-    string sub;
-    while(Start < src.size()) {
-        end = src.find_first_of(pattern, Start);
-        if(string::npos == end || res.size() >= maxsplit) {
-            sub = src.substr(Start);
-            res.push_back(sub);
-            return;
-        }
-        sub = src.substr(Start, end - Start);
-        res.push_back(sub);
-        Start = end + 1;
+  res.clear(); // Splits src into res (previous contents discarded); an empty src leaves res empty.
+  size_t Start = 0;
+  size_t end = 0;
+  string sub;
+  while(Start < src.size()) { // a delimiter at the very end therefore yields no trailing empty piece
+    end = src.find_first_of(pattern, Start); // pattern is a set of delimiter characters, not a substring to match
+    if(string::npos == end || res.size() >= maxsplit) { // no delimiter left, or maxsplit pieces already taken: the rest is the last piece
+      sub = src.substr(Start);
+      res.push_back(sub);
+      return;
    }
-    return;
+    sub = src.substr(Start, end - Start); // may be empty when two delimiters are adjacent
+    res.push_back(sub);
+    Start = end + 1; // skip the single delimiter character
+  }
+  return;
 }

 inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
-    vector<string> res;
-    Split(src, res, pattern, maxsplit);
-    return res;
+  vector<string> res;
+  Split(src, res, pattern, maxsplit);
+  return res;
 }

 inline bool StartsWith(const string& str, const string& prefix) {
-    if(prefix.length() > str.length()) {
-        return false;
-    }
-    return 0 == str.compare(0, prefix.length(), prefix);
+  if(prefix.length() > str.length()) {
+    return false;
+  }
+  return 0 == str.compare(0, prefix.length(), prefix);
 }

 inline bool EndsWith(const string& str, const string& suffix) {
-    if(suffix.length() > str.length()) {
-        return false;
-    }
-    return 0 == str.compare(str.length() -  suffix.length(), suffix.length(), suffix);
+  if(suffix.length() > str.length()) {
+    return false;
+  }
+  return 0 == str.compare(str.length() -  suffix.length(), suffix.length(), suffix);
 }

 inline bool IsInStr(const string& str, char ch) {
-    return str.find(ch) != string::npos;
+  return str.find(ch) != string::npos;
 }

 inline uint16_t TwocharToUint16(char high, char low) {
-    return (((uint16_t(high) & 0x00ff) << 8) | (uint16_t(low) & 0x00ff));
+  return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
 }

 template <class Uint16Container>
 bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
-    if(!str) {
-        return false;
+  if(!str) {
+    return false;
+  }
+  char ch1, ch2;
+  uint16_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < len;) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      vec.push_back(str[i]);
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
+      ch1 = (str[i] >> 2) & 0x07;
+      ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
+      ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+      ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 3;
+    } else {
+      return false;
    }
-    char ch1, ch2;
-    uint16_t tmp;
-    vec.clear();
-    for(size_t i = 0; i < len;) {
-        if(!(str[i] & 0x80)) { // 0xxxxxxx
-            vec.push_back(str[i]);
-            i++;
-        } else if((uint8_t)str[i] <= 0xdf && i + 1 < len) {  // 110xxxxxx
-            ch1 = (str[i] >> 2) & 0x07;
-            ch2 = (str[i + 1] & 0x3f) | ((str[i] & 0x03) << 6);
-            tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
-            vec.push_back(tmp);
-            i += 2;
-        } else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
-            ch1 = ((uint8_t)str[i] << 4) | ((str[i + 1] >> 2) & 0x0f);
-            ch2 = (((uint8_t)str[i + 1] << 6) & 0xc0) | (str[i + 2] & 0x3f);
-            tmp = (((uint16_t(ch1) & 0x00ff) << 8) | (uint16_t(ch2) & 0x00ff));
-            vec.push_back(tmp);
-            i += 3;
-        } else {
-            return false;
-        }
-    }
-    return true;
+  }
+  return true;
 }

 template <class Uint16Container>
 bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
-    return Utf8ToUnicode(str.c_str(), str.size(), vec);
+  return Utf8ToUnicode(str.c_str(), str.size(), vec);
+}
+
+template <class Uint32Container> // Decodes the UTF-8 bytes str[0..size) into 32-bit code points pushed onto vec (cleared first).
+bool Utf8ToUnicode32(const char * str, size_t size, Uint32Container& vec) { // Returns false when a lead byte fits no branch or its sequence would run past size.
+  uint32_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < size;) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      // 7bit, total 7bit
+      tmp = (uint8_t)(str[i]) & 0x7f;
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < size) { // 110xxxxx; NOTE(review): bytes 0x80-0xbf also reach this branch because only the upper bound is tested
+      // 5bit, total 5bit
+      tmp = (uint8_t)(str[i]) & 0x1f;
+
+      // 6bit, total 11bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f; // low 6 bits are taken as-is; the 10xxxxxx marker of the trailing byte is not verified
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < size) { // 1110xxxx
+      // 4bit, total 4bit
+      tmp = (uint8_t)(str[i]) & 0x0f;
+
+      // 6bit, total 10bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 16bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      i += 3;
+    } else if((uint8_t)str[i] <= 0xf7 && i + 3 < size) { // 11110xxx
+      // 3bit, total 3bit
+      tmp = (uint8_t)(str[i]) & 0x07;
+
+      // 6bit, total 9bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 15bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      // 6bit, total 21bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+3]) & 0x3f;
+
+      i += 4;
+    } else {
+      return false; // vec keeps the code points decoded before the failure
+    }
+    vec.push_back(tmp);
+  }
+  return true;
 }

 template <class Uint32Container>
 bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
-    uint32_t tmp;
-    vec.clear();
-    for(size_t i = 0; i < str.size();) {
-        if(!(str[i] & 0x80)) { // 0xxxxxxx
-            // 7bit, total 7bit
-            tmp = (uint8_t)(str[i]) & 0x7f;
-            i++;
-        } else if((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) {  // 110xxxxxx
-            // 5bit, total 5bit
-            tmp = (uint8_t)(str[i]) & 0x1f;
+    return Utf8ToUnicode32(str.data(), str.size(), vec);
+}

-            // 6bit, total 11bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 1]) & 0x3f;
-            i += 2;
-        } else if((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxxxx
-            // 4bit, total 4bit
-            tmp = (uint8_t)(str[i]) & 0x0f;
-
-            // 6bit, total 10bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 1]) & 0x3f;
-
-            // 6bit, total 16bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 2]) & 0x3f;
-
-            i += 3;
-        } else if((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxxx
-            // 3bit, total 3bit
-            tmp = (uint8_t)(str[i]) & 0x07;
-
-            // 6bit, total 9bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 1]) & 0x3f;
-
-            // 6bit, total 15bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 2]) & 0x3f;
-
-            // 6bit, total 21bit
-            tmp <<= 6;
-            tmp |= (uint8_t)(str[i + 3]) & 0x3f;
-
-            i += 4;
-        } else {
-            return false;
-        }
-        vec.push_back(tmp);
+inline int UnicodeToUtf8Bytes(uint32_t ui){ // Returns how many bytes (1-4) a code point occupies under the thresholds below.
+    if(ui <= 0x7f) { // up to 7 bits
+        return 1;
+    } else if(ui <= 0x7ff) { // up to 11 bits
+        return 2;
+    } else if(ui <= 0xffff) { // up to 16 bits
+        return 3;
+    } else { // everything larger is counted as 4 bytes; no upper bound is checked
+        return 4;
    }
-    return true;
 }

 template <class Uint32ContainerConIter>
 void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
-    res.clear();
-    uint32_t ui;
-    while(begin != end) {
-        ui = *begin;
-        if(ui <= 0x7f) {
-            res += char(ui);
-        } else if(ui <= 0x7ff) {
-            res += char(((ui >> 6) & 0x1f) | 0xc0);
-            res += char((ui & 0x3f) | 0x80);
-        } else if(ui <= 0xffff) {
-            res += char(((ui >> 12) & 0x0f) | 0xe0);
-            res += char(((ui >> 6) & 0x3f) | 0x80);
-            res += char((ui & 0x3f) | 0x80);
-        } else {
-            res += char(((ui >> 18) & 0x03) | 0xf0);
-            res += char(((ui >> 12) & 0x3f) | 0x80);
-            res += char(((ui >> 6) & 0x3f) | 0x80);
-            res += char((ui & 0x3f) | 0x80);
-        }
-        begin ++;
+  // Encodes the 32-bit code points in [begin, end) as UTF-8 into res, which
+  // is cleared first. Values up to 0x7f / 0x7ff / 0xffff take 1 / 2 / 3
+  // bytes; anything larger is written as a 4-byte sequence. No range or
+  // surrogate validation is performed.
+  res.clear();
+  uint32_t ui;
+  while(begin != end) {
+    ui = *begin;
+    if(ui <= 0x7f) {
+      res += char(ui);
+    } else if(ui <= 0x7ff) {
+      res += char(((ui >> 6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else if(ui <= 0xffff) {
+      res += char(((ui >> 12) & 0x0f) | 0xe0);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    } else {
+      // The 4-byte lead byte 11110xxx carries three payload bits (bits
+      // 18-20). Masking with 0x03 would drop bit 20 and mis-encode every
+      // code point at or above U+100000 (e.g. U+10FFFF must start with 0xF4).
+      res += char(((ui >> 18) & 0x07) | 0xf0);
+      res += char(((ui >> 12) & 0x3f) | 0x80);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
    }
+    begin ++;
+  }
 }

 template <class Uint16ContainerConIter>
 void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
-    res.clear();
-    uint16_t ui;
-    while(begin != end) {
-        ui = *begin;
-        if(ui <= 0x7f) {
-            res += char(ui);
-        } else if(ui <= 0x7ff) {
-            res += char(((ui >> 6) & 0x1f) | 0xc0);
-            res += char((ui & 0x3f) | 0x80);
-        } else {
-            res += char(((ui >> 12) & 0x0f) | 0xe0);
-            res += char(((ui >> 6) & 0x3f) | 0x80);
-            res += char((ui & 0x3f) | 0x80);
-        }
-        begin ++;
+  res.clear(); // Encodes the 16-bit units in [begin, end) as UTF-8 into res; previous contents are discarded.
+  uint16_t ui; // each element is narrowed to 16 bits, so at most a 3-byte sequence is ever produced
+  while(begin != end) {
+    ui = *begin;
+    if(ui <= 0x7f) { // 1 byte: 0xxxxxxx
+      res += char(ui);
+    } else if(ui <= 0x7ff) { // 2 bytes: 110xxxxx 10xxxxxx
+      res += char(((ui>>6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+      res += char(((ui >> 12) & 0x0f )| 0xe0);
+      res += char(((ui>>6) & 0x3f )| 0x80 );
+      res += char((ui & 0x3f) | 0x80);
    }
+    begin ++;
+  }
 }


 template <class Uint16Container>
 bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
-    vec.clear();
-    if(!str) {
-        return true;
-    }
-    size_t i = 0;
-    while(i < len) {
-        if(0 == (str[i] & 0x80)) {
-            vec.push_back(uint16_t(str[i]));
-            i++;
-        } else {
-            if(i + 1 < len) { //&& (str[i+1] & 0x80))
-                uint16_t tmp = (((uint16_t(str[i]) & 0x00ff) << 8) | (uint16_t(str[i + 1]) & 0x00ff));
-                vec.push_back(tmp);
-                i += 2;
-            } else {
-                return false;
-            }
-        }
-    }
+  vec.clear();
+  if(!str) {
    return true;
+  }
+  size_t i = 0;
+  while(i < len) {
+    if(0 == (str[i] & 0x80)) {
+      vec.push_back(uint16_t(str[i]));
+      i++;
+    } else {
+      if(i + 1 < len) { //&& (str[i+1] & 0x80))
+        uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
+        vec.push_back(tmp);
+        i += 2;
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
 }

 template <class Uint16Container>
 bool GBKTrans(const string& str, Uint16Container& vec) {
-    return GBKTrans(str.c_str(), str.size(), vec);
+  return GBKTrans(str.c_str(), str.size(), vec);
 }

 template <class Uint16ContainerConIter>
 void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
-    res.clear();
-    //pair<char, char> pa;
-    char first, second;
-    while(begin != end) {
-        //pa = uint16ToChar2(*begin);
-        first = ((*begin) >> 8) & 0x00ff;
-        second = (*begin) & 0x00ff;
-        if(first & 0x80) {
-            res += first;
-            res += second;
-        } else {
-            res += second;
-        }
-        begin++;
+  res.clear();
+  //pair<char, char> pa;
+  char first, second;
+  while(begin != end) {
+    //pa = uint16ToChar2(*begin);
+    first = ((*begin)>>8) & 0x00ff;
+    second = (*begin) & 0x00ff;
+    if(first & 0x80) {
+      res += first;
+      res += second;
+    } else {
+      res += second;
    }
+    begin++;
+  }
 }

 /*
 * format example: "%Y-%m-%d %H:%M:%S"
 */
-inline void GetTime(const string& format, string&  timeStr) {
-    time_t timeNow;
-    time(&timeNow);
-    timeStr.resize(64);
-    size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
-    timeStr.resize(len);
-}
+// inline void GetTime(const string& format, string&  timeStr) {
+//   time_t timeNow;
+//   time(&timeNow);
+//   timeStr.resize(64);
+//   size_t len = strftime((char*)timeStr.c_str(), timeStr.size(), format.c_str(), localtime(&timeNow));
+//   timeStr.resize(len);
+// }

 inline string PathJoin(const string& path1, const string& path2) {
-    if(EndsWith(path1, "/")) {
-        return path1 + path2;
-    }
-    return path1 + "/" + path2;
+  if(EndsWith(path1, "/")) { // Joins two path pieces; path1 already ends with '/', so no separator is added.
+    return path1 + path2;
+  }
+  return path1 + "/" + path2; // otherwise insert exactly one '/'; a leading '/' in path2 is not stripped
 }

 }
--- a/libchinese-segmentation/cppjieba/limonp/Thread.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/Thread.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_THREAD_HPP
 #define LIMONP_THREAD_HPP

@ -25,36 +7,36 @@
 namespace limonp {

 class IThread: NonCopyable {
-public:
-    IThread(): isStarted(false), isJoined(false) {
+ public: // Base class for a pthread-backed thread: subclasses implement Run(); callers use Start() and Join().
+  IThread(): isStarted(false), isJoined(false) { // thread_ is left unset until Start() succeeds
+  }
+  virtual ~IThread() {
+    if(isStarted && !isJoined) { // started but never joined: detach rather than join
+      XCHECK(!pthread_detach(thread_));
    }
-    virtual ~IThread() {
-        if(isStarted && !isJoined) {
-            XCHECK(!pthread_detach(thread_));
-        }
-    };
+  };

-    virtual void Run() = 0;
-    void Start() {
-        XCHECK(!isStarted);
-        XCHECK(!pthread_create(&thread_, NULL, Worker, this));
-        isStarted = true;
-    }
-    void Join() {
-        XCHECK(!isJoined);
-        XCHECK(!pthread_join(thread_, NULL));
-        isJoined = true;
-    }
-private:
-    static void * Worker(void * data) {
-        IThread * ptr = (IThread*) data;
-        ptr->Run();
-        return NULL;
-    }
+  virtual void Run() = 0; // thread body supplied by the subclass
+  void Start() {
+    XCHECK(!isStarted); // a thread object may be started only once
+    XCHECK(!pthread_create(&thread_, NULL, Worker, this)); // the new thread enters Worker with this object as its argument
+    isStarted = true;
+  }
+  void Join() {
+    XCHECK(!isJoined); // NOTE(review): isStarted is not checked, so Join() before Start() would hand an unset thread_ to pthread_join
+    XCHECK(!pthread_join(thread_, NULL)); // NULL: the thread's return value is discarded
+    isJoined = true;
+  }
+ private:
+  static void * Worker(void * data) { // pthread entry point: forwards to the object's Run()
+    IThread * ptr = (IThread* ) data;
+    ptr->Run();
+    return NULL;
+  }

-    pthread_t thread_;
-    bool isStarted;
-    bool isJoined;
+  pthread_t thread_; // handle written by pthread_create in Start()
+  bool isStarted;    // set after a successful Start()
+  bool isJoined;     // set after a successful Join()
 }; // class IThread

 } // namespace limonp
--- a/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
+++ b/libchinese-segmentation/cppjieba/limonp/ThreadPool.hpp
@ -1,21 +1,3 @@
-/*
- * Copyright (C) 2020, KylinSoft Co., Ltd.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- *
- *
- */
 #ifndef LIMONP_THREAD_POOL_HPP
 #define LIMONP_THREAD_POOL_HPP

@ -30,73 +12,73 @@ using namespace std;

 //class ThreadPool;
 class ThreadPool: NonCopyable {
-public:
-    class Worker: public IThread {
-    public:
-        Worker(ThreadPool* pool): ptThreadPool_(pool) {
-            assert(ptThreadPool_);
-        }
-        virtual ~Worker() {
-        }
-
-        virtual void Run() {
-            while(true) {
-                ClosureInterface* closure = ptThreadPool_->queue_.Pop();
-                if(closure == NULL) {
-                    break;
-                }
-                try {
-                    closure->Run();
-                } catch(std::exception& e) {
-                    XLOG(ERROR) << e.what();
-                } catch(...) {
-                    XLOG(ERROR) << " unknown exception.";
-                }
-                delete closure;
-            }
-        }
-    private:
-        ThreadPool * ptThreadPool_;
-    }; // class Worker
-
-    ThreadPool(size_t thread_num)
-        : threads_(thread_num),
-          queue_(thread_num) {
-        assert(thread_num);
-        for(size_t i = 0; i < threads_.size(); i ++) {
-            threads_[i] = new Worker(this);
-        }
+ public:
+  class Worker: public IThread {
+   public:
+    Worker(ThreadPool* pool): ptThreadPool_(pool) { // Remembers the owning pool; the pointer must be non-null (asserted).
+      assert(ptThreadPool_);
    }
-    ~ThreadPool() {
-        Stop();
+    virtual ~Worker() { // Nothing to release: the worker does not own the pool it points to.
    }

-    void Start() {
-        for(size_t i = 0; i < threads_.size(); i++) {
-            threads_[i]->Start();
+    virtual void Run() { // Worker loop: pops closures from the pool's queue and runs them until a NULL entry is popped.
+      while (true) {
+        ClosureInterface* closure = ptThreadPool_->queue_.Pop();
+        if (closure == NULL) { // NULL is the stop signal that ThreadPool::Stop() pushes, one per worker
+          break;
        }
-    }
-    void Stop() {
-        for(size_t i = 0; i < threads_.size(); i ++) {
-            queue_.Push(NULL);
+        try {
+          closure->Run();
+        } catch(std::exception& e) { // exceptions are logged here so that a failing task does not leave the loop
+          XLOG(ERROR) << e.what();
+        } catch(...) {
+          XLOG(ERROR) << " unknown exception.";
        }
-        for(size_t i = 0; i < threads_.size(); i ++) {
-            threads_[i]->Join();
-            delete threads_[i];
-        }
-        threads_.clear();
+        delete closure; // the worker deletes every closure after running it, whether or not it threw
+      }
    }
+   private:
+    ThreadPool * ptThreadPool_;
+  }; // class Worker

-    void Add(ClosureInterface* task) {
-        assert(task);
-        queue_.Push(task);
+  ThreadPool(size_t thread_num) // Allocates thread_num workers; they do not run until Start() is called.
+    : threads_(thread_num), // one slot per worker, filled in the loop below
+      queue_(thread_num) { // queue is constructed with thread_num as well - presumably its capacity; confirm in BoundedBlockingQueue
+    assert(thread_num);
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      threads_[i] = new Worker(this); // owned by the pool; deleted in Stop()
    }
+  }
+  ~ThreadPool() { // Shuts the pool down through Stop(); after an earlier Stop() the worker list is empty, so this does nothing more.
+    Stop();
+  }

-private:
-    friend class Worker;
+  void Start() { // Starts every worker allocated by the constructor.
+    for(size_t i = 0; i < threads_.size(); i++) {
+      threads_[i]->Start();
+    }
+  }
+  void Stop() { // Signals every worker to exit, joins them and frees them.
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      queue_.Push(NULL); // one NULL per worker: Worker::Run leaves its loop when it pops NULL
+    }
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      threads_[i]->Join(); // NOTE(review): Join() is reached even if Start() was never called on this pool - confirm that is handled
+      delete threads_[i];
+    }
+    threads_.clear(); // leaves the list empty, so a later Stop() (e.g. from the destructor) iterates over nothing
+  }

-    vector<IThread*> threads_;
-    BoundedBlockingQueue<ClosureInterface*> queue_;
+  void Add(ClosureInterface* task) { // Queues a task; the worker that runs it deletes it afterwards (see Worker::Run).
+    assert(task); // NULL is reserved as the stop signal, so it must not be queued as a task
+    queue_.Push(task);
+  }
+
+ private:
+  friend class Worker;
+
+  vector<IThread*> threads_;
+  BoundedBlockingQueue<ClosureInterface*> queue_;
 }; // class ThreadPool

 } // namespace limonp
--- a/libsearch/file-utils.cpp
+++ b/libsearch/file-utils.cpp
@ -20,6 +20,7 @@
 *
 */
 #include "file-utils.h"
+#include <QXmlStreamReader>

 using namespace Zeeker;
 size_t FileUtils::_max_index_count = 0;
@ -488,6 +489,22 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {

    fileR.open(QIODevice::ReadOnly);        //读取方式打开

+    QXmlStreamReader reader(&fileR); //stream the XML from the zip entry instead of loading it all at once
+
+    while (!reader.atEnd()){
+       if(reader.readNextStartElement() and reader.name().toString() == "t"){ //only elements named "t" contribute text
+           textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); //"\n" is removed, "\r" becomes a space
+           if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ //stop reading once the collected text reaches the limit
+               break;
+           }
+       }
+    }
+
+    fileR.close();
+    file.close();
+    return; //returns here; the DOM-based code that follows is commented out
+
+/*    //Original approach: loading the document into a DOM;
    QDomDocument doc;
    doc.setContent(fileR.readAll());
    fileR.close();
@ -512,6 +529,7 @@ void FileUtils::getDocxTextContent(QString &path, QString &textcontent) {
    }
    file.close();
    return;
+*/
 }

 void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
@ -529,6 +547,31 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
    }
    if(fileList.isEmpty())
        return;
+
+    for(int i = 0; i < fileList.size() and textcontent.length() < MAX_CONTENT_LENGTH/3; ++i){ //visit slides in order; stop visiting further slides once the content limit is reached
+        QString name = prefix + QString::number(i + 1) + ".xml"; //slides are looked up as <prefix>1.xml, <prefix>2.xml, ...
+        if(!file.setCurrentFile(name)) { //no entry with that name in the archive: skip this index
+            continue;
+        }
+        QuaZipFile fileR(&file);
+        fileR.open(QIODevice::ReadOnly);
+
+        QXmlStreamReader reader(&fileR); //stream the slide XML instead of loading it all at once
+
+        while (!reader.atEnd()){
+           if(reader.readNextStartElement() and reader.name().toString() == "t"){ //only elements named "t" contribute text
+               textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); //"\n" is removed, "\r" becomes a space
+               if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ //limit reached: leave this slide; the outer condition then ends the slide loop too
+                   break;
+               }
+           }
+        }
+        fileR.close();
+    }
+    file.close();
+    return; //returns here; the DOM-based code that follows is commented out
+
+/*
    QDomElement sptree;
    QDomElement sp;
    QDomElement txbody;
@ -596,6 +639,7 @@ void FileUtils::getPptxTextContent(QString &path, QString &textcontent) {
    }
    file.close();
    return;
+*/
 }

 void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
@ -610,8 +654,24 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
        return;
    QuaZipFile fileR(&file);

-    fileR.open(QIODevice::ReadOnly);        //读取方式打开
+    fileR.open(QIODevice::ReadOnly);

+    QXmlStreamReader reader(&fileR); //stream the XML from the zip entry instead of loading it all at once
+
+    while (!reader.atEnd()){
+       if(reader.readNextStartElement() and reader.name().toString() == "t"){ //only elements named "t" contribute text
+           textcontent.append(reader.readElementText().replace("\n", "").replace("\r", " ")); //"\n" is removed, "\r" becomes a space
+           if(textcontent.length() >= MAX_CONTENT_LENGTH/3){ //stop reading once the collected text reaches the limit
+               break;
+           }
+       }
+    }
+
+    fileR.close();
+    file.close();
+    return; //returns here; the DOM-based code that follows is commented out
+
+/*
    QDomDocument doc;
    doc.setContent(fileR.readAll());
    fileR.close();
@ -641,6 +701,7 @@ void FileUtils::getXlsxTextContent(QString &path, QString &textcontent) {
    }
    file.close();
    return;
+*/
 }

 void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
@ -650,7 +711,7 @@ void FileUtils::getPdfTextContent(QString &path, QString &textcontent) {
    const QRectF qf;
    int pageNum = doc->numPages();
    for(int i = 0; i < pageNum; ++i) {
-        textcontent.append(doc->page(i)->text(qf).replace("\n", ""));
+        textcontent.append(doc->page(i)->text(qf).replace("\n", "").replace("\r", " "));
        if(textcontent.length() >= MAX_CONTENT_LENGTH / 3)
            break;
    }
@ -679,7 +740,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent) {
    stream.setCodec(codec);
    uchardet_delete(chardet);

-    textcontent = stream.readAll().replace("\n", "");
+    textcontent = stream.readAll().replace("\n", "").replace("\r", " ");

    file.close();
    encodedString.clear();
--- a/libsearch/index/construct-document.cpp
+++ b/libsearch/index/construct-document.cpp
@ -110,17 +110,21 @@ void ConstructDocumentForContent::run() {
        return;
    QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
    QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
-
-    QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000).toStdString());
-
    Document doc;
    doc.setData(content);
    doc.setUniqueTerm(uniqueterm);
    doc.addTerm(upTerm);
    doc.addValue(m_path);
-    for(int i = 0; i < term.size(); ++i) {
-        doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight));

+    //"\xEF\xBC\x8C" is the full-width comma "，" and "\xE3\x80\x82" is the full-width full stop "。"; each one (three bytes in UTF-8) is replaced by three spaces so that the offset information is kept.
+    content = content.replace("\t", " ").replace("\xEF\xBC\x8C", "   ").replace("\xE3\x80\x82", "   ");
+
+//    QVector<SKeyWord> term = ChineseSegmentation::getInstance()->callSegement(content.left(20480000));
+    //Changed the return type of the function and changed its parameter to a std::string reference--jxx20210519
+    std::vector<cppjieba::KeywordExtractor::Word> term = ChineseSegmentation::getInstance()->callSegementStd(content.left(20480000).toStdString()); //only the first 20480000 characters of the content are segmented
+
+    for(size_t i = 0; i < term.size(); ++i) {
+        doc.addPosting(term.at(i).word, term.at(i).offsets, static_cast<int>(term.at(i).weight)); //one posting per offset of the keyword; the weight is truncated to int
    }

    Zeeker::_mutex_doc_list_content.lock();
--- a/libsearch/index/document.cpp
+++ b/libsearch/index/document.cpp
@ -37,6 +37,17 @@ void Document::addPosting(std::string term, QVector<size_t> offset, int weight)
    }
 }

+void Document::addPosting(std::string term, std::vector<size_t> offset, int weight) { // std::vector overload: adds one posting of `term` per entry in `offset`, each with `weight`.
+    if(term == "") // empty terms are ignored
+        return;
+    if(term.length() > 240) // over-long terms are cut down - presumably to respect the index's term-length limit; TODO confirm
+        term = QString::fromStdString(term).left(30).toStdString(); // keeps the first 30 QString characters
+
+    for(size_t i : offset) {
+        m_document.add_posting(term, i, weight);
+    }
+}
+
 void Document::addPosting(std::string term, unsigned int offset, int weight) {
    if(term == "")
        return;
--- a/libsearch/index/document.h
+++ b/libsearch/index/document.h
@ -41,6 +41,7 @@ public:
    }
    void setData(QString &data);
    void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
+    void addPosting(std::string term, std::vector<size_t> offset, int weight = 1);
    void addPosting(std::string term, unsigned int offset, int weight = 1);
    void addTerm(QString term);
    void addValue(QString value);
--- a/libsearch/index/file-reader.cpp
+++ b/libsearch/index/file-reader.cpp
@ -31,8 +31,9 @@ void FileReader::getTextContent(QString path, QString &textContent) {
    QFileInfo file(path);
    QString strsfx =  file.suffix();
    if(name == "application/zip") {
-        if(strsfx.endsWith("docx"))
+        if(strsfx.endsWith("docx")){
            FileUtils::getDocxTextContent(path, textContent);
+        }
        if(strsfx.endsWith("pptx"))
            FileUtils::getPptxTextContent(path, textContent);
        if(strsfx.endsWith("xlsx"))
--- a/libsearch/index/first-index.cpp
+++ b/libsearch/index/first-index.cpp
@ -46,7 +46,54 @@ void FirstIndex::DoSomething(const QFileInfo& fileInfo) {
 //    qDebug() << "there are some shit here"<<fileInfo.fileName() << fileInfo.absoluteFilePath() << QString(fileInfo.isDir() ? "1" : "0");
    this->q_index->enqueue(QVector<QString>() << fileInfo.fileName() << fileInfo.absoluteFilePath() << QString((fileInfo.isDir() && (!fileInfo.isSymLink())) ? "1" : "0"));
    if((fileInfo.fileName().split(".", QString::SkipEmptyParts).length() > 1) && (true == targetFileTypeMap[fileInfo.fileName().split(".").last()])) {
-        this->q_content_index->enqueue(fileInfo.absoluteFilePath());
+        //this->q_content_index->enqueue(fileInfo.absoluteFilePath());
+        if(fileInfo.fileName().split(".").last() == "docx"){ //each entry is queued as (path, size); for zipped office formats the size is that of the XML to be parsed
+            QuaZip file(fileInfo.absoluteFilePath());
+            if(!file.open(QuaZip::mdUnzip)) //archive cannot be opened: the file is not queued for content indexing
+                return;
+            if(!file.setCurrentFile("word/document.xml", QuaZip::csSensitive)) //no main document part: the file is not queued either
+                return;
+            QuaZipFile fileR(&file); //NOTE(review): usize() is called below without opening fileR - confirm against the QuaZip documentation
+            this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//for docx, the size of the decompressed xml file is the size that actually has to be parsed
+            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
+            qDebug() << "文件大小:" << fileR.usize();
+            file.close();
+        }else if(fileInfo.fileName().split(".").last() == "pptx"){
+            QuaZip file(fileInfo.absoluteFilePath());
+            if(!file.open(QuaZip::mdUnzip)) //archive cannot be opened: the file is not queued for content indexing
+                return;
+            QString prefix("ppt/slides/slide");
+            qint64 fileSize(0); //running total of the uncompressed slide sizes
+            qint64 fileIndex(0); //counts the entries seen under the slide prefix
+            for(QString i : file.getFileNameList()) {
+                if(i.startsWith(prefix)){
+                    QString name = prefix + QString::number(fileIndex + 1) + ".xml"; //slide name is rebuilt from the counter, not taken from the listed entry
+                    fileIndex++;
+                    if(!file.setCurrentFile(name)) {
+                        continue;
+                    }
+                    QuaZipFile fileR(&file);
+                    fileSize += fileR.usize();
+                }
+            }
+            file.close();
+            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
+            qDebug() << "文件大小:" << fileSize;
+            this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileSize));//for pptx, the size of the decompressed xml files is the size that actually has to be parsed
+        }else if(fileInfo.fileName().split(".").last() == "xlsx"){
+            QuaZip file(fileInfo.absoluteFilePath());
+            if(!file.open(QuaZip::mdUnzip)) //archive cannot be opened: the file is not queued for content indexing
+                return;
+            if(!file.setCurrentFile("xl/sharedStrings.xml", QuaZip::csSensitive)) //no shared-strings part: the file is not queued either
+                return;
+            QuaZipFile fileR(&file);
+            this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileR.usize()));//for xlsx, the size of the decompressed xml file is the size that is actually parsed
+            qDebug() << "文件路径:" <<fileInfo.absoluteFilePath();
+            qDebug() << "文件大小:" << fileR.usize();
+            file.close();
+        }else{
+            this->q_content_index->enqueue(qMakePair(fileInfo.absoluteFilePath(),fileInfo.size())); //every other target type is queued with its on-disk size
+        }
    }
 }

@ -90,8 +137,9 @@ void FirstIndex::run() {

    this->q_index = new QQueue<QVector<QString>>();
    //this->q_content_index = new QQueue<QString>();
-    NEW_QUEUE(this->q_content_index);
+    //NEW_QUEUE(this->q_content_index);
 //    this->mlm = new MessageListManager();
+    this->q_content_index = new QQueue<QPair<QString,qint64>>();

    int fifo_fd;
    char buffer[2];
@ -168,9 +216,14 @@ void FirstIndex::run() {
            qDebug() << "q_content_index:" << q_content_index->size();
            while(!this->q_content_index->empty()) {
 //                for (size_t i = 0; (i < this->u_send_length) && (!this->q_content_index->empty()); ++i){
-                for(size_t i = 0; (i < 30) && (!this->q_content_index->empty()); ++i) {
-                    tmp->enqueue(this->q_content_index->dequeue());
+                qint64 fileSize = 0; //total size of the files collected for the current batch
+                //Changed the amount of data handled per batch: instead of 30 files, files are collected until their total size reaches 50M; 50M is a provisional value--jxx20210519
+                for(size_t i = 0;/* (i < 30) && */(fileSize < 50*1024*1024) && (!this->q_content_index->empty()); ++i) { //NOTE(review): i no longer takes part in the loop condition
+                    QPair<QString,qint64> tempPair = this->q_content_index->dequeue();
+                    fileSize += tempPair.second; //second is the size stored when the file was queued
+                    tmp->enqueue(tempPair.first); //only the path is passed on to the index generator
                }
+//                qDebug() << ">>>>>>>>all fileSize:" << fileSize << "file num:" << tmp->size() << "<<<<<<<<<<<<<<<<<<<";
                this->p_indexGenerator->creatAllIndex(tmp);
                tmp->clear();
            }
--- a/libsearch/index/first-index.h
+++ b/libsearch/index/first-index.h
@ -62,7 +62,9 @@ private:

    //test
    QQueue<QVector<QString>>* q_index;
-    QQueue<QString>* q_content_index;
+//    QQueue<QString>* q_content_index;
+    //Changed the QQueue element type to QPair<QString,qint64>: the file size is stored with the path so that sizes can be totalled during processing--jxx20210519
+    QQueue<QPair<QString,qint64>>* q_content_index;

    const QMap<QString, bool> targetFileTypeMap = {
        std::map<QString, bool>::value_type("doc", true),
--- a/libsearch/index/search-manager.cpp
+++ b/libsearch/index/search-manager.cpp
@ -27,7 +27,7 @@ QMutex  SearchManager::m_mutex1;
 QMutex  SearchManager::m_mutex2;
 QMutex  SearchManager::m_mutex3;
 SearchManager::SearchManager(QObject *parent) : QObject(parent) {
-    m_pool.setMaxThreadCount(2);
+    m_pool.setMaxThreadCount(3);
    m_pool.setExpiryTimeout(1000);
 }

@ -280,29 +280,15 @@ int FileContentSearch::keywordSearchContent() {
            words.append(sKeyWord.at(i).word).append(" ");
        }

-        Xapian::Query query = qp.parse_query(words);
-//        Xapian::Query query = qp.parse_query(keyword.toStdString());
-
-
-
-//        QVector<SKeyWord> sKeyWord = ChineseSegmentation::getInstance()->callSegement(keyword);
-//        //Creat a query
-//        std::string words;
-//        for(int i=0;i<sKeyWord.size();i++)
-//        {
-//            words.append(sKeyWord.at(i).word).append(" ");
-//        }
-
-
 //        Xapian::Query query = qp.parse_query(words);

-        //        std::vector<Xapian::Query> v;
-        //        for(int i=0;i<sKeyWord.size();i++)
-        //        {
-        //            v.push_back(Xapian::Query(sKeyWord.at(i).word));
-        //            qDebug()<<QString::fromStdString(sKeyWord.at(i).word);
-        //        }
-        //        Xapian::Query queryPhrase =Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end());
+        std::vector<Xapian::Query> v; //one single-term query per segmented keyword
+        for(int i=0; i<sKeyWord.size(); i++) {
+            v.push_back(Xapian::Query(sKeyWord.at(i).word));
+            qDebug() << QString::fromStdString(sKeyWord.at(i).word);
+        }
+        Xapian::Query query = Xapian::Query(Xapian::Query::OP_AND, v.begin(), v.end()); //the sub-queries are combined with OP_AND; this replaces the former qp.parse_query(words) call
+
        qDebug() << "keywordSearchContent:" << QString::fromStdString(query.get_description());

        enquire.set_query(query);
--- a/libsearch/libsearch.pro
+++ b/libsearch/libsearch.pro
@ -67,7 +67,7 @@ unix {
    INSTALLS += target

    header.path = /usr/include/ukui-search
-    header.files += *.h index/*.h appsearch/*.h settingsearch/*.h
+    header.files += *.h index/*.h appsearch/*.h settingsearch/*.h plugininterface/*.h
    INSTALLS += header
 }

--- a/libsearch/parser/binary-parser.cpp
+++ b/libsearch/parser/binary-parser.cpp
@ -4963,7 +4963,7 @@ bool KBinaryParser::read8DocText(FILE *pFile, const ppsInfoType *pPPS,

                if(bUsesUnicode) {
                    ushort* usAucData = (ushort*)ptaucBytes;
-                    content.append(QString::fromUtf16(usAucData).replace("\r", ""));
+                    content.append(QString::fromUtf16(usAucData).replace("\n", "").replace("\r", " "));
                    usAucData = (ushort*)xfree((void*)usAucData);
                    ptaucBytes = NULL;
                    if(content.length() >= 682666) //20480000/3
@ -5066,7 +5066,7 @@ int KBinaryParser:: readSSTRecord(readDataParam &rdParam, ppsInfoType PPS_info,
        } else {
            ushort* usData = (ushort*)chData;

-            content.append(QString::fromUtf16(usData).replace("\r", ""));
+            content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));
            usData = (ushort*)xfree((void*)usData);
            chData = NULL;
            if(content.length() >= 682666) //20480000/3
@ -5131,7 +5131,7 @@ ULONG KBinaryParser::readPPtRecord(FILE* pFile, ppsInfoType* PPS_info, ULONG* au
                return -1;
            ushort* usData = (ushort*)chData;

-            content.append(QString::fromUtf16(usData).replace("\r", ""));
+            content.append(QString::fromUtf16(usData).replace("\n", "").replace("\r", " "));

            usData = (ushort*)xfree((void*)usData);
            chData = NULL;
--- a/src/create-index-ask-dialog.h
+++ b/src/create-index-ask-dialog.h
@ -32,6 +32,7 @@
 #include <QStyleOption>
 #include <QApplication>
 #include <QPainter>
+#include <QPainterPath>

 namespace Zeeker {
 class CreateIndexAskDialog : public QDialog {