From bbf27b9d5fbe740cb7b46e4b9bb4d5cdca141e43 Mon Sep 17 00:00:00 2001
From: rookie-J <jixiaoxu@kylinos.cn>
Date: Wed, 23 Jun 2021 15:50:19 +0800
Subject: [PATCH] =?UTF-8?q?Optimization=20of=20IDF=20dictionary=20loading?=
 =?UTF-8?q?=20mode=EF=BC=9B=20Limit=20the=20maximum=20number=20of=20words?=
 =?UTF-8?q?=20segmentation;=20Other=20optimization;?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 libchinese-segmentation/cppjieba/DatTrie.hpp  | 114 ++++++++++++++-
 libchinese-segmentation/cppjieba/DictTrie.hpp |   2 +-
 .../cppjieba/HMMSegment.hpp                   |   8 +-
 libchinese-segmentation/cppjieba/IdfTrie.hpp  | 134 ++++++++++++++++++
 libchinese-segmentation/cppjieba/Jieba.hpp    |   2 +-
 .../cppjieba/KeywordExtractor.hpp             |  60 ++------
 .../cppjieba/MixSegment.hpp                   |   5 +-
 .../cppjieba/PreFilter.hpp                    |   7 +-
 libchinese-segmentation/cppjieba/Unicode.hpp  |  18 ---
 libchinese-segmentation/cppjieba/cppjieba.pri |   2 +-
 .../libchinese-segmentation.pro               |   2 +
 libsearch/index/construct-document.cpp        |  11 +-
 libsearch/index/document.cpp                  |  20 ++-
 libsearch/index/document.h                    |   7 +-
 libsearch/index/first-index.cpp               |   2 +-
 libsearch/index/index-generator.cpp           |   6 +-
 libsearch/libsearch.pro                       |   2 +-
 src/src.pro                                   |   2 +-
 ukui-search.pro                               |   1 +
 19 files changed, 309 insertions(+), 96 deletions(-)
 create mode 100644 libchinese-segmentation/cppjieba/IdfTrie.hpp
diff --git a/libchinese-segmentation/cppjieba/DatTrie.hpp b/libchinese-segmentation/cppjieba/DatTrie.hpp
index d4e64d1..a4967a3 100644
--- a/libchinese-segmentation/cppjieba/DatTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DatTrie.hpp
@@ -33,6 +33,19 @@ struct DatElement {
     }
 };
 
+// One IDF dictionary entry: a word together with its IDF value.
+struct IdfElement {
+    string word;
+    double idf = 0;
+    // Orders by word ascending; entries with the same word put the larger idf first.
+    bool operator < (const IdfElement & other) const {
+        if (word != other.word) {
+            return word < other.word;
+        }
+        return idf > other.idf;
+    }
+};
+
 inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
     return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
 }
@@ -91,13 +104,24 @@ public:
         JiebaDAT::result_pair_type find_result;
         dat_.exactMatchSearch(key.c_str(), find_result);
 
-        if ((0 == find_result.length) || (find_result.value < 0) || (find_result.value >= elements_num_)) {
+        if ((0 == find_result.length) || (find_result.value < 0) || ((size_t)find_result.value >= elements_num_)) {
             return nullptr;
         }
 
         return &elements_ptr_[ find_result.value ];
     }
 
+    // Exact-match lookup in the IDF array; returns -1 when there is no valid match for key.
+    double Find(const string & key, std::size_t length, std::size_t node_pos) const {
+        JiebaDAT::result_pair_type find_result;
+        dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
+        const bool no_match = (0 == find_result.length) || (find_result.value < 0);
+        if (no_match || (static_cast<size_t>(find_result.value) >= elements_num_)) {
+            return -1;
+        }
+        return idf_elements_ptr_[ find_result.value ];
+    }
+
     void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
               vector<struct DatDag>&res, size_t max_word_len) const {
 
@@ -119,7 +143,7 @@ public:
             for (std::size_t idx = 0; idx < num_results; ++idx) {
                 auto & match = result_pairs[idx];
 
-                if ((match.value < 0) || (match.value >= elements_num_)) {
+                if ((match.value < 0) || ((size_t)match.value >= elements_num_)) {
                     continue;
                 }
 
@@ -156,6 +180,11 @@ public:
         return InitAttachDat(dat_cache_file, md5);
     }
 
+    bool InitBuildDat(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) { // IDF overload: write the cache file, then attach to it
+        BuildDatCache(elements, dat_cache_file, md5);
+        return InitIdfAttachDat(dat_cache_file, md5);
+    }
+
     bool InitAttachDat(const string & dat_cache_file, const string & md5) {
         mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
 
@@ -187,6 +216,37 @@ public:
         return true;
     }
 
+    // Attach to an IDF cache file (header | double idf[] | DAT units); on any mismatch release the fd/mapping and return false.
+    bool InitIdfAttachDat(const string & dat_cache_file, const string & md5) {
+        mmap_fd_ = ::open(dat_cache_file.c_str(), O_RDONLY);
+        if (mmap_fd_ < 0) {
+            return false;
+        }
+        const auto seek_off = ::lseek(mmap_fd_, 0, SEEK_END);
+        bool ok = seek_off >= 0 && static_cast<size_t>(seek_off) >= sizeof(CacheFileHeader);
+        mmap_length_ = ok ? static_cast<size_t>(seek_off) : 0;
+        void * const addr = ok ? ::mmap(NULL, mmap_length_, PROT_READ, MAP_SHARED, mmap_fd_, 0) : MAP_FAILED;
+        ok = (MAP_FAILED != addr);
+        mmap_addr_ = ok ? static_cast<char *>(addr) : nullptr;
+        if (ok) {
+            const CacheFileHeader & header = *reinterpret_cast<const CacheFileHeader*>(mmap_addr_);
+            ok = sizeof(header.md5_hex) == md5.size()
+                 && 0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size())
+                 && mmap_length_ == sizeof(header) + header.elements_num * sizeof(double) + header.dat_size * dat_.unit_size();
+            if (ok) {
+                elements_num_ = header.elements_num;
+                min_weight_ = header.min_weight;
+                idf_elements_ptr_ = reinterpret_cast<const double *>(mmap_addr_ + sizeof(header));
+                dat_.set_array(mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_, header.dat_size);
+                return true;
+            }
+            ::munmap(mmap_addr_, mmap_length_); // stale or malformed cache: drop the mapping so a rebuild can re-attach cleanly
+        }
+        ::close(mmap_fd_);
+        mmap_fd_ = -1; mmap_addr_ = nullptr; mmap_length_ = 0; // assumed to match the members' unattached state - TODO confirm
+        return false;
+    }
+
 private:
     void BuildDatCache(vector<DatElement>& elements, const string & dat_cache_file, const string & md5) {
         std::sort(elements.begin(), elements.end());
@@ -240,12 +300,62 @@ private:
         }
     }
 
+    // IDF overload: sorts elements, builds the double array over the words and writes
+    // header | idf values | double-array units to a temp file that is renamed onto dat_cache_file.
+    void BuildDatCache(vector<IdfElement>& elements, const string & dat_cache_file, const string & md5) {
+        std::sort(elements.begin(), elements.end());
+        vector<const char*> keys_ptr_vec;
+        vector<int> values_vec;
+        vector<double> mem_elem_vec;
+        keys_ptr_vec.reserve(elements.size());
+        values_vec.reserve(elements.size());
+        mem_elem_vec.reserve(elements.size());
+
+        CacheFileHeader header;
+        header.min_weight = min_weight_;
+        assert(sizeof(header.md5_hex) == md5.size());
+        memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
+
+        for (size_t i = 0; i < elements.size(); ++i) {
+            keys_ptr_vec.push_back(elements[i].word.data());
+            values_vec.push_back(static_cast<int>(i)); // value = index into the idf array
+            mem_elem_vec.push_back(elements[i].idf);
+        }
+
+        auto const ret = dat_.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data()); // data(): no indexing of an empty vector
+        assert(0 == ret);
+        header.elements_num = mem_elem_vec.size();
+        header.dat_size = dat_.size();
+
+        string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
+        const mode_t old_mask = ::umask(S_IWGRP | S_IWOTH);
+        const int fd = ::mkstemp(&tmp_filepath[0]); // mkstemp rewrites the XXXXXX suffix in place
+        ::umask(old_mask); // do not leave the process-wide umask changed
+        const size_t idf_bytes = sizeof(double) * mem_elem_vec.size();
+        bool ok = fd >= 0;
+        if (ok) {
+            ::fchmod(fd, 0644);
+            // Check every write on its own so a -1 is never folded into a byte total.
+            ok = ::write(fd, &header, sizeof(header)) == static_cast<ssize_t>(sizeof(header))
+                 && ::write(fd, mem_elem_vec.data(), idf_bytes) == static_cast<ssize_t>(idf_bytes)
+                 && ::write(fd, dat_.array(), dat_.total_size()) == static_cast<ssize_t>(dat_.total_size());
+            ::close(fd);
+            ok = ok && (0 == ::rename(tmp_filepath.c_str(), dat_cache_file.c_str()));
+        }
+        if (!ok) {
+            qDebug() << "write idf dat cache failed:" << errno << tmp_filepath.data();
+            if (fd >= 0) { ::unlink(tmp_filepath.c_str()); } // never leave a partial temp file behind
+        }
+        assert(ok);
+    }
+
     DatTrie(const DatTrie &);
     DatTrie &operator=(const DatTrie &);
 
 private:
     JiebaDAT dat_;
     const DatMemElem * elements_ptr_ = nullptr;
+    const double * idf_elements_ptr_= nullptr;
     size_t elements_num_ = 0;
     double min_weight_ = 0;
 
diff --git a/libchinese-segmentation/cppjieba/DictTrie.hpp b/libchinese-segmentation/cppjieba/DictTrie.hpp
index 698f5d8..5ecee54 100644
--- a/libchinese-segmentation/cppjieba/DictTrie.hpp
+++ b/libchinese-segmentation/cppjieba/DictTrie.hpp
@@ -130,7 +130,7 @@ private:
             dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) +  ".dat_cache";
         }
         QString path = QString::fromStdString(dat_cache_path);
-        qDebug() << "#########path:" << path;
+        qDebug() << "#########Dict path:" << path;
         if (dat_.InitAttachDat(dat_cache_path, md5)) {
             LoadUserDict(user_dict_paths, false); // for load user_dict_single_chinese_word_;
             total_dict_size_ = file_size_sum;
diff --git a/libchinese-segmentation/cppjieba/HMMSegment.hpp b/libchinese-segmentation/cppjieba/HMMSegment.hpp
index 1a9937b..30af449 100644
--- a/libchinese-segmentation/cppjieba/HMMSegment.hpp
+++ b/libchinese-segmentation/cppjieba/HMMSegment.hpp
@@ -138,10 +138,10 @@ private:
         size_t now, old, stat;
         double tmp, endE, endS;
 
-        //vector<int> path(XYSize);
-        //vector<double> weight(XYSize);
-        int path[XYSize];
-        double weight[XYSize];
+        vector<int> path(XYSize);
+        vector<double> weight(XYSize);
+        //int path[XYSize];
+        //double weight[XYSize];
 
         //start
         for (size_t y = 0; y < Y; y++) {
diff --git a/libchinese-segmentation/cppjieba/IdfTrie.hpp b/libchinese-segmentation/cppjieba/IdfTrie.hpp
new file mode 100644
index 0000000..b26decf
--- /dev/null
+++ b/libchinese-segmentation/cppjieba/IdfTrie.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "limonp/StringUtil.hpp"
+#include "limonp/Logging.hpp"
+#include "Unicode.hpp"
+#include "DatTrie.hpp"
+#include <QDebug>
+namespace cppjieba {
+
+using namespace limonp;
+
+const size_t IDF_COLUMN_NUM = 2;
+
+class IdfTrie { // IDF dictionary lookup backed by a DatTrie cache file
+public:
+    enum UserWordWeightOption {
+        WordWeightMin,
+        WordWeightMedian,
+        WordWeightMax,
+    }; // enum UserWordWeightOption
+
+    IdfTrie(const string& dict_path, const string & dat_cache_path = "",
+             UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+        Init(dict_path, dat_cache_path, user_word_weight_opt);
+    }
+    ~IdfTrie() {}
+    // Forwards to DatTrie::Find: the IDF stored for word, or -1 when there is no valid match.
+    double Find(const string & word, std::size_t length = 0, std::size_t node_pos = 0) const {
+        return dat_.Find(word, length, node_pos);
+    }
+
+    void Find(RuneStrArray::const_iterator begin,
+              RuneStrArray::const_iterator end,
+              vector<struct DatDag>&res,
+              size_t max_word_len = MAX_WORD_LENGTH) const {
+        dat_.Find(begin, end, res, max_word_len);
+    }
+
+    bool IsUserDictSingleChineseWord(const Rune& word) const {
+        return IsIn(user_dict_single_chinese_word_, word);
+    }
+    // NOTE: for an IDF trie the min-weight slot carries the average IDF persisted by Init().
+    double GetMinWeight() const {
+        return dat_.GetMinWeight();
+    }
+
+    size_t GetTotalDictSize() const {
+        return total_dict_size_;
+    }
+
+private:
+    // Attaches to an existing cache for dict_path or builds one; idfAverage_ is set on both paths.
+    void Init(const string& dict_path, string dat_cache_path,
+              UserWordWeightOption user_word_weight_opt) {
+        size_t file_size_sum = 0;
+        const string md5 = CalcFileListMD5(dict_path, file_size_sum);
+        if (dat_cache_path.empty()) {
+            // With no explicit cache location, keep the cache file under /tmp.
+            dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) +  ".dat_cache";
+        }
+        qDebug() << "#########Idf path:" << QString::fromStdString(dat_cache_path);
+        total_dict_size_ = file_size_sum;
+        if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
+            // The average rides in the header's min_weight; a cache holding 0 there predates that, so recompute it.
+            idfAverage_ = dat_.GetMinWeight();
+            if (!(idfAverage_ > 0.0)) {
+                LoadDefaultIdf(dict_path);
+                idfAverage_ = static_node_infos_.empty() ? 0.0 : CalcIdfSum(static_node_infos_) / static_node_infos_.size();
+                vector<IdfElement>().swap(static_node_infos_);
+            }
+            return;
+        }
+        LoadDefaultIdf(dict_path);
+        assert(static_node_infos_.size());
+        idfAverage_ = CalcIdfSum(static_node_infos_) / static_node_infos_.size();
+        assert(idfAverage_ > 0.0);
+        dat_.SetMinWeight(idfAverage_); // goes into the cache header so a later attach can restore the average
+        const auto build_ret = dat_.InitBuildDat(static_node_infos_, dat_cache_path, md5);
+        assert(build_ret);
+        vector<IdfElement>().swap(static_node_infos_);
+    }
+
+    void LoadDefaultIdf(const string& filePath) {
+        ifstream ifs(filePath.c_str());
+        if (not ifs.is_open()) {
+            XLOG(ERROR) << "open " << filePath << " failed.";
+            return;
+        }
+        string line;
+        vector<string> buf;
+        size_t lineno = 0;
+        for (; getline(ifs, line); lineno++) {
+            if (line.empty()) {
+                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
+                continue;
+            }
+            Split(line, buf, " ");
+            if (buf.size() != IDF_COLUMN_NUM) { // skip a malformed row rather than failing an XCHECK
+                XLOG(ERROR) << "split result illegal, line:" << line;
+                continue;
+            }
+            IdfElement node_info;
+            node_info.word = buf[0];
+            node_info.idf = atof(buf[1].c_str());
+            static_node_infos_.push_back(node_info);
+        }
+    }
+
+    double CalcIdfSum(const vector<IdfElement>& node_infos) const {
+        double sum = 0.0;
+        for (const IdfElement & node : node_infos) {
+            sum += node.idf;
+        }
+        return sum;
+    }
+public:
+    double idfAverage_ = 0.0;
+private:
+    vector<IdfElement> static_node_infos_;
+    size_t total_dict_size_ = 0;
+    DatTrie dat_;
+    unordered_set<Rune> user_dict_single_chinese_word_;
+};
+}
+
diff --git a/libchinese-segmentation/cppjieba/Jieba.hpp b/libchinese-segmentation/cppjieba/Jieba.hpp
index c017bd6..a7b11b3 100644
--- a/libchinese-segmentation/cppjieba/Jieba.hpp
+++ b/libchinese-segmentation/cppjieba/Jieba.hpp
@@ -21,7 +21,7 @@ public:
           mix_seg_(&dict_trie_, &model_, stopWordPath),
           full_seg_(&dict_trie_),
           query_seg_(&dict_trie_, &model_, stopWordPath),
-          extractor(&dict_trie_, &model_, idfPath, stopWordPath){ }
+          extractor(&dict_trie_, &model_, idfPath, dat_cache_path,stopWordPath){ }
     ~Jieba() { }
 
     void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
diff --git a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
index f87ad5f..0011e93 100644
--- a/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
+++ b/libchinese-segmentation/cppjieba/KeywordExtractor.hpp
@@ -2,6 +2,7 @@
 
 #include <cmath>
 #include "MixSegment.hpp"
+#include "IdfTrie.hpp"
 
 namespace cppjieba {
 
@@ -11,18 +12,14 @@ using namespace std;
 /*utf8*/
 class KeywordExtractor {
 public:
-//    struct Word {
-//        string word;
-//        vector<size_t> offsets;
-//        double weight;
-//    }; // struct Word
 
     KeywordExtractor(const DictTrie* dictTrie,
                      const HMMModel* model,
                      const string& idfPath,
+                     const string& dat_cache_path,
                      const string& stopWordPath)
-        : segment_(dictTrie, model, stopWordPath) {
-        LoadIdfDict(idfPath);
+        : segment_(dictTrie, model, stopWordPath),
+        idf_trie_(idfPath,dat_cache_path){
     }
     ~KeywordExtractor() {
     }
@@ -63,12 +60,11 @@ public:
         keywords.reserve(wordmap.size());
 
         for (unordered_map<string, KeyWord>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
-            unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);//IDF词典查找
-
-            if (cit != idfMap_.end()) {
-                itr->second.weight *= cit->second;
+            double idf = idf_trie_.Find(itr->first);
+            if (-1 != idf) {//IDF词典查找
+                itr->second.weight *= idf;
             } else {
-                itr->second.weight *= idfAverage_;
+                itr->second.weight *= idf_trie_.idfAverage_;
             }
 
             itr->second.word = itr->first;
@@ -80,51 +76,13 @@ public:
         keywords.resize(topN);
     }
 private:
-    void LoadIdfDict(const string& idfPath) {
-        ifstream ifs(idfPath.c_str());
-        if(not ifs.is_open()){
-            return ;
-        }
-        XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
-        string line ;
-        vector<string> buf;
-        double idf = 0.0;
-        double idfSum = 0.0;
-        size_t lineno = 0;
-
-        for (; getline(ifs, line); lineno++) {
-            buf.clear();
-
-            if (line.empty()) {
-                XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
-                continue;
-            }
-
-            Split(line, buf, " ");
-
-            if (buf.size() != 2) {
-                XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. skipped.";
-                continue;
-            }
-
-            idf = atof(buf[1].c_str());
-            idfMap_[buf[0]] = idf;
-            idfSum += idf;
-
-        }
-
-        assert(lineno);
-        idfAverage_ = idfSum / lineno;
-        assert(idfAverage_ > 0.0);
-    }
 
     static bool Compare(const KeyWord& lhs, const KeyWord& rhs) {
         return lhs.weight > rhs.weight;
     }
 
     MixSegment segment_;
-    unordered_map<string, double> idfMap_;
-    double idfAverage_;
+    IdfTrie idf_trie_;
 
     unordered_set<Rune> symbols_;
 }; // class KeywordExtractor
diff --git a/libchinese-segmentation/cppjieba/MixSegment.hpp b/libchinese-segmentation/cppjieba/MixSegment.hpp
index 4c93748..a539039 100644
--- a/libchinese-segmentation/cppjieba/MixSegment.hpp
+++ b/libchinese-segmentation/cppjieba/MixSegment.hpp
@@ -156,8 +156,9 @@ public:
             // if mp Get a single one and it is not in userdict, collect it in sequence
             size_t j = i + 1; //当前i字符为单独的字符并且不在用户字典里（i字符不是最后一个字符），直接判定j字符
 
-            while (j < (words.size() - 1) && words[j].left == words[j].right &&
-                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
+            while (j < (words.size() - 1)
+                   && words[j].left == words[j].right
+                   && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                 j++;
             }
 
diff --git a/libchinese-segmentation/cppjieba/PreFilter.hpp b/libchinese-segmentation/cppjieba/PreFilter.hpp
index 1a75a57..3f04dcf 100644
--- a/libchinese-segmentation/cppjieba/PreFilter.hpp
+++ b/libchinese-segmentation/cppjieba/PreFilter.hpp
@@ -71,7 +71,7 @@ public:
                 cursor_ ++;
             }
         }
-
+        int num = 0;
         while (cursor_ != sentence_.end()) {
             if (cursor_->rune == 0x20) {
                 if (wordRange.left == cursor_) {
@@ -83,6 +83,11 @@ public:
             }
 
             cursor_ ++;
+            num++;
+            if (num >= 1024) { //todo 防止一次性传入过多字节，暂定限制为1024个字
+                wordRange.right = cursor_;
+                return true;
+            }
         }
 
         wordRange.right = sentence_.end();
diff --git a/libchinese-segmentation/cppjieba/Unicode.hpp b/libchinese-segmentation/cppjieba/Unicode.hpp
index d77b5dd..360b461 100644
--- a/libchinese-segmentation/cppjieba/Unicode.hpp
+++ b/libchinese-segmentation/cppjieba/Unicode.hpp
@@ -97,24 +97,6 @@ inline RuneArray DecodeRunesInString(const string& s) {
 
 //重写DecodeRunesInString函数，将实现放入函数中降低内存占用加快处理流程--jxx20210518
 inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
-/*
-    RuneArray arr;
-
-    if (not DecodeRunesInString(s, arr)) {
-        return false;
-    }
-
-    runes.clear();
-
-    uint32_t offset = 0;
-
-    for (uint32_t i = 0; i < arr.size(); ++i) {
-        const uint32_t len = limonp::UnicodeToUtf8Bytes(arr[i]);
-        RuneInfo x(arr[i], offset, len, i, 1);
-        runes.push_back(x);
-        offset += len;
-    }
-*/
 
     uint32_t tmp;
     uint32_t offset = 0;
diff --git a/libchinese-segmentation/cppjieba/cppjieba.pri b/libchinese-segmentation/cppjieba/cppjieba.pri
index fd783c4..cec0ba9 100644
--- a/libchinese-segmentation/cppjieba/cppjieba.pri
+++ b/libchinese-segmentation/cppjieba/cppjieba.pri
@@ -2,6 +2,7 @@ INCLUDEPATH += $$PWD
 
 HEADERS += \
     $$PWD/DictTrie.hpp \
+    $$PWD/IdfTrie.hpp \
     $$PWD/FullSegment.hpp \
     $$PWD/HMMModel.hpp \
     $$PWD/HMMSegment.hpp \
@@ -17,5 +18,4 @@ HEADERS += \
     $$PWD/TextRankExtractor.hpp \
     $$PWD/Trie.hpp \
     $$PWD/Unicode.hpp
-
 include(limonp/limonp.pri)
diff --git a/libchinese-segmentation/libchinese-segmentation.pro b/libchinese-segmentation/libchinese-segmentation.pro
index 583f794..28fb1a1 100644
--- a/libchinese-segmentation/libchinese-segmentation.pro
+++ b/libchinese-segmentation/libchinese-segmentation.pro
@@ -19,6 +19,8 @@ DEFINES += QT_DEPRECATED_WARNINGS
 #DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
 include(cppjieba/cppjieba.pri)
 
+#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
+
 SOURCES += \
     chinese-segmentation.cpp \
 
diff --git a/libsearch/index/construct-document.cpp b/libsearch/index/construct-document.cpp
index b96ff2f..445bdb5 100644
--- a/libsearch/index/construct-document.cpp
+++ b/libsearch/index/construct-document.cpp
@@ -108,12 +108,14 @@ void ConstructDocumentForContent::run() {
     FileReader::getTextContent(m_path, content);
     if(content.isEmpty())
         return;
-    QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
-    QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
+    //QString uniqueterm = QString::fromStdString(FileUtils::makeDocUterm(m_path));
+    //QString upTerm = QString::fromStdString(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
     Document doc;
     doc.setData(content);
-    doc.setUniqueTerm(uniqueterm);
-    doc.addTerm(upTerm);
+    //doc.setUniqueTerm(uniqueterm);
+    doc.setUniqueTerm(FileUtils::makeDocUterm(m_path));
+    //doc.addTerm(upTerm);
+    doc.addTerm(FileUtils::makeDocUterm(m_path.section("/", 0, -2, QString::SectionIncludeLeadingSep)));
     doc.addValue(m_path);
 
     //'\xEF\xBC\x8C' is "，" "\xE3\x80\x82" is "。"  use three " " to replace ,to ensure the offset info.
@@ -131,6 +133,7 @@ void ConstructDocumentForContent::run() {
     IndexGenerator::_mutex_doc_list_content.unlock();
     content.clear();
     content.squeeze();
+
     term.clear();
     term.shrink_to_fit();
     return;
diff --git a/libsearch/index/document.cpp b/libsearch/index/document.cpp
index 5f71336..57f907a 100644
--- a/libsearch/index/document.cpp
+++ b/libsearch/index/document.cpp
@@ -37,7 +37,7 @@ void Document::addPosting(std::string term, QVector<size_t> offset, int weight)
     }
 }
 
-void Document::addPosting(std::string term, std::vector<size_t> offset, int weight) {
+void Document::addPosting(std::string &term, std::vector<size_t> &offset, int weight) {
     if(term == "")
         return;
     if(term.length() > 240)
@@ -63,6 +63,12 @@ void Document::addTerm(QString term) {
     m_document.add_term(term.toStdString());
 }
 
+// std::string overload: adds term to the Xapian document, ignoring empty input.
+void Document::addTerm(std::string term) {
+    if(!term.empty())
+        m_document.add_term(term);
+}
+
 void Document::addValue(QString value) {
     m_document.add_value(1, value.toStdString());
 }
@@ -73,12 +79,20 @@ void Document::setUniqueTerm(QString term) {
     m_document.add_term(term.toStdString());
 
 //    m_unique_term = new QString(term);
-    m_unique_term = std::move(term);
+    m_unique_term = std::move(term.toStdString());
 }
+
+void Document::setUniqueTerm(std::string term) { // std::string overload; empty input is ignored
+    if(term.empty())
+        return;
+    m_document.add_term(term);
+    m_unique_term = std::move(term); // by-value parameter: move it rather than copy it again
+}
+
 std::string Document::getUniqueTerm() {
 //    qDebug()<<"m_unique_term!"<<*m_unique_term;
 //    qDebug() << QString::fromStdString(m_unique_term.toStdString());
-    return m_unique_term.toStdString();
+    return m_unique_term;//.toStdString();
 }
 
 void Document::setIndexText(QStringList indexText) {
diff --git a/libsearch/index/document.h b/libsearch/index/document.h
index 6025bc8..84e6262 100644
--- a/libsearch/index/document.h
+++ b/libsearch/index/document.h
@@ -41,11 +41,13 @@ public:
     }
     void setData(QString &data);
     void addPosting(std::string term, QVector<size_t> offset, int weight = 1);
-    void addPosting(std::string term, std::vector<size_t> offset, int weight = 1);
+    void addPosting(std::string &term, std::vector<size_t> &offset, int weight = 1);
     void addPosting(std::string term, unsigned int offset, int weight = 1);
     void addTerm(QString term);
+    void addTerm(std::string term);
     void addValue(QString value);
     void setUniqueTerm(QString term);
+    void setUniqueTerm(std::string term);
     std::string getUniqueTerm();
     void setIndexText(QStringList indexText);
     QStringList getIndexText();
@@ -53,7 +55,8 @@ public:
 private:
     Xapian::Document m_document;
     QStringList m_index_text;
-    QString m_unique_term;
+    //QString m_unique_term;
+    std::string m_unique_term;
 
 };
 }
diff --git a/libsearch/index/first-index.cpp b/libsearch/index/first-index.cpp
index d23ba88..fb3ef49 100644
--- a/libsearch/index/first-index.cpp
+++ b/libsearch/index/first-index.cpp
@@ -154,7 +154,6 @@ void FirstIndex::run() {
 
 
     ++FileUtils::_index_status;
-
     pid_t pid;
     pid = fork();
     if(pid  == 0) {
@@ -235,6 +234,7 @@ void FirstIndex::run() {
             qDebug() << "content index end;";
             sem.release(2);
         });
+
         mutex1.lock();
         mutex2.lock();
         mutex3.lock();
diff --git a/libsearch/index/index-generator.cpp b/libsearch/index/index-generator.cpp
index 8f57d4b..06efc6e 100644
--- a/libsearch/index/index-generator.cpp
+++ b/libsearch/index/index-generator.cpp
@@ -29,7 +29,7 @@
 #include "index-generator.h"
 #include "chinese-segmentation.h"
 #include <QStandardPaths>
-
+#include <malloc.h>
 
 #define INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/index_data").toStdString()
 #define CONTENT_INDEX_PATH (QStandardPaths::writableLocation(QStandardPaths::HomeLocation)+"/.config/org.ukui/ukui-search/content_index_data").toStdString()
@@ -127,11 +127,11 @@ bool IndexGenerator::creatAllIndex(QQueue<QString> *messageList) {
 //        GlobalSettings::getInstance()->setValue(CONTENT_INDEX_DATABASE_STATE, "2");
 //        FileUtils::_index_status &= ~0x2;
         qDebug() << "finish creatAllIndex for content";
+
         IndexGenerator::_doc_list_content.clear();
         IndexGenerator::_doc_list_content.squeeze();
         QVector<Document>().swap(IndexGenerator::_doc_list_content);
-//        delete _doc_list_content;
-//        _doc_list_content = nullptr;
+        malloc_trim(0);
     }
     Q_EMIT this->transactionFinished();
     return true;
diff --git a/libsearch/libsearch.pro b/libsearch/libsearch.pro
index 8ffed23..38cbb89 100644
--- a/libsearch/libsearch.pro
+++ b/libsearch/libsearch.pro
@@ -33,7 +33,7 @@ include(plugininterface/plugin-interface.pri)
 include(pluginmanage/plugin-manager.pri)
 
 LIBS += -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
-LIBS += -lxapian -lquazip5 -luchardet
+LIBS += -lxapian -lquazip5 -luchardet #-L/usr/local/lib/libjemalloc -ljemalloc
 
 SOURCES += \
     file-utils.cpp \
diff --git a/src/src.pro b/src/src.pro
index e207640..ff309e5 100644
--- a/src/src.pro
+++ b/src/src.pro
@@ -9,7 +9,7 @@ TEMPLATE = app
 PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
 CONFIG += c++11 link_pkgconfig no_keywords lrelease
 LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
-LIBS += -lukui-log4qt
+#LIBS += -lukui-log4qt -L/usr/local/lib/libjemalloc -ljemalloc
 # The following define makes your compiler emit warnings if you use
 # any Qt feature that has been marked deprecated (the exact warnings
 # depend on your compiler). Please consult the documentation of the
diff --git a/ukui-search.pro b/ukui-search.pro
index 47447cd..7fd587d 100644
--- a/ukui-search.pro
+++ b/ukui-search.pro
@@ -19,3 +19,4 @@ src.depends = libsearch
 
 CONFIG += ordered
 
+