ukui-search/libchinese-segmentation/cppjieba/FullSegment.hpp

/*
 * Copyright (C) 2020, KylinSoft Co., Ltd.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 */
#ifndef CPPJIEBA_FULLSEGMENT_H
#define CPPJIEBA_FULLSEGMENT_H

#include <algorithm>
#include <set>
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "Unicode.hpp"

namespace cppjieba {
class FullSegment: public SegmentBase {
 public:
  FullSegment(const string& dictPath) {
    dictTrie_ = new DictTrie(dictPath);
    isNeedDestroy_ = true;
  }
  FullSegment(const DictTrie* dictTrie)
    : dictTrie_(dictTrie), isNeedDestroy_(false) {
    assert(dictTrie_);
  }
  ~FullSegment() {
    if (isNeedDestroy_) {
      delete dictTrie_;
    }
  }
  void Cut(const string& sentence, 
        vector<string>& words) const {
    vector<Word> tmp;
    Cut(sentence, tmp);
    GetStringsFromWords(tmp, words);
  }
  void Cut(const string& sentence, 
        vector<Word>& words) const {
    PreFilter pre_filter(symbols_, sentence);
    PreFilter::Range range;
    vector<WordRange> wrs;
    wrs.reserve(sentence.size()/2);
    while (pre_filter.HasNext()) {
      range = pre_filter.Next();
      Cut(range.begin, range.end, wrs);
    }
    words.clear();
    words.reserve(wrs.size());
    GetWordsFromWordRanges(sentence, wrs, words);
  }
  void Cut(RuneStrArray::const_iterator begin, 
        RuneStrArray::const_iterator end, 
        vector<WordRange>& res) const {
    // result of searching in trie tree
    LocalVector<pair<size_t, const DictUnit*> > tRes;

    // max index of res's words
    size_t maxIdx = 0;

    // always equals to (uItr - begin)
    size_t uIdx = 0;

    // tmp variables
    size_t wordLen = 0;
    assert(dictTrie_);
    vector<struct Dag> dags;
    dictTrie_->Find(begin, end, dags);
    for (size_t i = 0; i < dags.size(); i++) {
      for (size_t j = 0; j < dags[i].nexts.size(); j++) {
        size_t nextoffset = dags[i].nexts[j].first;
        assert(nextoffset < dags.size());
        const DictUnit* du = dags[i].nexts[j].second;
        if (du == NULL) {
          if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
            WordRange wr(begin + i, begin + nextoffset);
            res.push_back(wr);
          }
        } else {
          wordLen = du->word.size();
          if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
            WordRange wr(begin + i, begin + nextoffset);
            res.push_back(wr);
          }
        }
        maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
      }
      uIdx++;
    }
  }
 private:
  const DictTrie* dictTrie_;
  bool isNeedDestroy_;
};
}

#endif
Update copyright. 2021-01-29 11:43:07 +08:00			`/*`
			`* Copyright (C) 2020, KylinSoft Co., Ltd.`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <https://www.gnu.org/licenses/>.`
			`*`
			`*`
			`*/`
Add file content index and search funtion(lib). 2020-12-31 21:14:13 +08:00			`#ifndef CPPJIEBA_FULLSEGMENT_H`
			`#define CPPJIEBA_FULLSEGMENT_H`

			`#include <algorithm>`
			`#include <set>`
			`#include <cassert>`
			`#include "limonp/Logging.hpp"`
			`#include "DictTrie.hpp"`
			`#include "SegmentBase.hpp"`
			`#include "Unicode.hpp"`

			`namespace cppjieba {`
			`class FullSegment: public SegmentBase {`
			`public:`
			`FullSegment(const string& dictPath) {`
			`dictTrie_ = new DictTrie(dictPath);`
			`isNeedDestroy_ = true;`
			`}`
			`FullSegment(const DictTrie* dictTrie)`
			`: dictTrie_(dictTrie), isNeedDestroy_(false) {`
			`assert(dictTrie_);`
			`}`
			`~FullSegment() {`
			`if (isNeedDestroy_) {`
			`delete dictTrie_;`
			`}`
			`}`
			`void Cut(const string& sentence,`
			`vector<string>& words) const {`
			`vector<Word> tmp;`
			`Cut(sentence, tmp);`
			`GetStringsFromWords(tmp, words);`
			`}`
			`void Cut(const string& sentence,`
			`vector<Word>& words) const {`
			`PreFilter pre_filter(symbols_, sentence);`
			`PreFilter::Range range;`
			`vector<WordRange> wrs;`
			`wrs.reserve(sentence.size()/2);`
			`while (pre_filter.HasNext()) {`
			`range = pre_filter.Next();`
			`Cut(range.begin, range.end, wrs);`
			`}`
			`words.clear();`
			`words.reserve(wrs.size());`
			`GetWordsFromWordRanges(sentence, wrs, words);`
			`}`
			`void Cut(RuneStrArray::const_iterator begin,`
			`RuneStrArray::const_iterator end,`
			`vector<WordRange>& res) const {`
			`// result of searching in trie tree`
			`LocalVector<pair<size_t, const DictUnit*> > tRes;`

			`// max index of res's words`
			`size_t maxIdx = 0;`

			`// always equals to (uItr - begin)`
			`size_t uIdx = 0;`

			`// tmp variables`
			`size_t wordLen = 0;`
			`assert(dictTrie_);`
			`vector<struct Dag> dags;`
			`dictTrie_->Find(begin, end, dags);`
			`for (size_t i = 0; i < dags.size(); i++) {`
			`for (size_t j = 0; j < dags[i].nexts.size(); j++) {`
			`size_t nextoffset = dags[i].nexts[j].first;`
			`assert(nextoffset < dags.size());`
			`const DictUnit* du = dags[i].nexts[j].second;`
			`if (du == NULL) {`
			`if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {`
			`WordRange wr(begin + i, begin + nextoffset);`
			`res.push_back(wr);`
			`}`
			`} else {`
			`wordLen = du->word.size();`
			`if (wordLen >= 2 \|\| (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {`
			`WordRange wr(begin + i, begin + nextoffset);`
			`res.push_back(wr);`
			`}`
			`}`
			`maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;`
			`}`
			`uIdx++;`
			`}`
			`}`
			`private:`
			`const DictTrie* dictTrie_;`
			`bool isNeedDestroy_;`
			`};`
			`}`

			`#endif`