ukui-search/libchinese-segmentation/cppjieba/MixSegment.hpp

#pragma once

#include <cassert>
#include "MPSegment.hpp"
#include "HMMSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "PosTagger.hpp"

namespace cppjieba {
class MixSegment: public SegmentTagged {
public:
    MixSegment(const DictTrie* dictTrie, const HMMModel* model)
        : mpSeg_(dictTrie), hmmSeg_(model) {
    }
    ~MixSegment() {}

    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
                     size_t) const override {
        if (!hmm) {
            mpSeg_.CutRuneArray(begin, end, res);
            return;
        }

        vector<WordRange> words;
        assert(end >= begin);
        words.reserve(end - begin);
        mpSeg_.CutRuneArray(begin, end, words);

        vector<WordRange> hmmRes;
        hmmRes.reserve(end - begin);

        for (size_t i = 0; i < words.size(); i++) {
            //if mp Get a word, it's ok, put it into result
            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
                res.push_back(words[i]);
                continue;
            }

            // if mp Get a single one and it is not in userdict, collect it in sequence
            size_t j = i;

            while (j < words.size() && words[j].left == words[j].right &&
                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                j++;
            }

            // Cut the sequence with hmm
            assert(j - 1 >= i);
            // TODO
            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);

            //put hmm result to result
            for (size_t k = 0; k < hmmRes.size(); k++) {
                res.push_back(hmmRes[k]);
            }

            //clear tmp vars
            hmmRes.clear();

            //let i jump over this piece
            i = j - 1;
        }
    }

    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                     size_t) const override {
        //目前hmm默认开启，后期如有需要关闭再修改--jxx20210519
//        if (!hmm) {
//            mpSeg_.CutRuneArray(begin, end, res);
//            return;
//        }

        vector<WordRange> words;
        assert(end >= begin);
        words.reserve(end - begin);
        mpSeg_.CutRuneArray(begin, end, words);

        vector<WordRange> hmmRes;
        hmmRes.reserve(end - begin);

        for (size_t i = 0; i < words.size(); i++) {
            //if mp Get a word, it's ok, put it into result
            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
                continue;
            }

            // if mp Get a single one and it is not in userdict, collect it in sequence
            size_t j = i;

            while (j < words.size() && words[j].left == words[j].right &&
                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
                j++;
            }

            // Cut the sequence with hmm
            assert(j - 1 >= i);
            // TODO
            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);

            //put hmm result to result
            for (size_t k = 0; k < hmmRes.size(); k++) {
                res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
            }

            //clear tmp vars
            hmmRes.clear();

            //let i jump over this piece
            i = j - 1;
        }
    }

    const DictTrie* GetDictTrie() const override {
        return mpSeg_.GetDictTrie();
    }

    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
        return tagger_.Tag(src, res, *this);
    }

    string LookupTag(const string &str) const {
        return tagger_.LookupTag(str, *this);
    }

private:
    MPSegment mpSeg_;
    HMMSegment hmmSeg_;
    PosTagger tagger_;

}; // class MixSegment

} // namespace cppjieba
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								#pragma once
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
 								#include <cassert>
 								#include "MPSegment.hpp"
 								#include "HMMSegment.hpp"
 								#include "limonp/StringUtil.hpp"
 								#include "PosTagger.hpp"
 								namespace cppjieba {
 								class MixSegment: public SegmentTagged {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								public:
 								    MixSegment(const DictTrie* dictTrie, const HMMModel* model)
 								        : mpSeg_(dictTrie), hmmSeg_(model) {
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
+								    }
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								    ~MixSegment() {}
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								    virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm,
 								                     size_t) const override {
 								        if (!hmm) {
 								            mpSeg_.CutRuneArray(begin, end, res);
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            return;
 								        }
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        vector<WordRange> words;
 								        assert(end >= begin);
 								        words.reserve(end - begin);
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								        mpSeg_.CutRuneArray(begin, end, words);
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
 								        vector<WordRange> hmmRes;
 								        hmmRes.reserve(end - begin);
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
 								        for (size_t i = 0; i < words.size(); i++) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            //if mp Get a word, it's ok, put it into result
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
 								                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								                res.push_back(words[i]);
 								                continue;
 								            }
 								            // if mp Get a single one and it is not in userdict, collect it in sequence
 								            size_t j = i;
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
 								            while (j < words.size() && words[j].left == words[j].right &&
 								                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								                j++;
 								            }
 								            // Cut the sequence with hmm
 								            assert(j - 1 >= i);
 								            // TODO
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								            //put hmm result to result
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								            for (size_t k = 0; k < hmmRes.size(); k++) {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								                res.push_back(hmmRes[k]);
 								            }
 								            //clear tmp vars
 								            hmmRes.clear();
 								            //let i jump over this piece
 								            i = j - 1;
 								        }
 								    }
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								    virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
 								                     size_t) const override {
 								        //目前hmm默认开启，后期如有需要关闭再修改--jxx20210519
 								//        if (!hmm) {
 								//            mpSeg_.CutRuneArray(begin, end, res);
 								//            return;
 								//        }
 								        vector<WordRange> words;
 								        assert(end >= begin);
 								        words.reserve(end - begin);
 								        mpSeg_.CutRuneArray(begin, end, words);
 								        vector<WordRange> hmmRes;
 								        hmmRes.reserve(end - begin);
 								        for (size_t i = 0; i < words.size(); i++) {
 								            //if mp Get a word, it's ok, put it into result
 								            if (words[i].left != words[i].right || (words[i].left == words[i].right &&
 								                                                    mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) {
 								                res.push_back(GetStringFromRunes(s, words[i].left, words[i].right));
 								                continue;
 								            }
 								            // if mp Get a single one and it is not in userdict, collect it in sequence
 								            size_t j = i;
 								            while (j < words.size() && words[j].left == words[j].right &&
 								                   !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) {
 								                j++;
 								            }
 								            // Cut the sequence with hmm
 								            assert(j - 1 >= i);
 								            // TODO
 								            hmmSeg_.CutRuneArray(words[i].left, words[j - 1].left + 1, hmmRes);
 								            //put hmm result to result
 								            for (size_t k = 0; k < hmmRes.size(); k++) {
 								                res.push_back(GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right));
 								            }
 								            //clear tmp vars
 								            hmmRes.clear();
 								            //let i jump over this piece
 								            i = j - 1;
 								        }
 								    }
 								    const DictTrie* GetDictTrie() const override {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        return mpSeg_.GetDictTrie();
 								    }
-												Add file size judgment; Optimize the reference of Jieba function; Fix the default storage path of Jieba temporary files;

											
										
										
											2021-05-22 09:18:35 +08:00
+								    bool Tag(const string& src, vector<pair<string, string> >& res) const override {
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								        return tagger_.Tag(src, res, *this);
 								    }
 								    string LookupTag(const string &str) const {
 								        return tagger_.LookupTag(str, *this);
 								    }
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
-												Format code style.

											
										
										
											2021-04-26 15:06:47 +08:00
+								private:
 								    MPSegment mpSeg_;
 								    HMMSegment hmmSeg_;
 								    PosTagger tagger_;
-												Add file content index and search function(lib).

											
										
										
											2020-12-31 21:14:13 +08:00
 								}; // class MixSegment
 								} // namespace cppjieba