ukui-search/libchinese-segmentation/cppjieba/HMMSegment.hpp

196 lines
5.6 KiB
C++
Raw Normal View History

#pragma once
#include <iostream>
#include <fstream>
#include <memory.h>
#include <cassert>
#include "HMMModel.hpp"
#include "SegmentBase.hpp"
namespace cppjieba {
class HMMSegment: public SegmentBase {
2021-04-26 15:06:47 +08:00
public:
HMMSegment(const HMMModel* model)
: model_(model) {
}
~HMMSegment() { }
2021-04-26 15:06:47 +08:00
virtual void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool,
size_t) const override {
2021-04-26 15:06:47 +08:00
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right = begin;
while (right != end) {
if (right->rune < 0x80) { //asc码
if (left != right) {
2021-04-26 15:06:47 +08:00
InternalCut(left, right, res);
}
2021-04-26 15:06:47 +08:00
left = right;
2021-04-26 15:06:47 +08:00
do {
right = SequentialLetterRule(left, end);//非英文字符则返回left否则返回left后非英文字母的位置
if (right != left) {
2021-04-26 15:06:47 +08:00
break;
}
right = NumbersRule(left, end);//非数字则返回left否则返回left后非数字的位置
if (right != left) {
2021-04-26 15:06:47 +08:00
break;
}
2021-04-26 15:06:47 +08:00
right ++;
} while (false);
2021-04-26 15:06:47 +08:00
WordRange wr(left, right - 1);
res.push_back(wr);
left = right;
} else {
right++;
}
}
if (left != right) {
2021-04-26 15:06:47 +08:00
InternalCut(left, right, res);
}
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
}
2021-04-26 15:06:47 +08:00
private:
// sequential letters rule
RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end) const {
2021-04-26 15:06:47 +08:00
Rune x = begin->rune;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
2021-04-26 15:06:47 +08:00
begin ++;
} else {
return begin;
}
while (begin != end) {
2021-04-26 15:06:47 +08:00
x = begin->rune;
if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
2021-04-26 15:06:47 +08:00
begin ++;
} else {
break;
}
}
2021-04-26 15:06:47 +08:00
return begin;
}
2021-04-26 15:06:47 +08:00
//
RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
Rune x = begin->rune;
if ('0' <= x && x <= '9') {
2021-04-26 15:06:47 +08:00
begin ++;
} else {
return begin;
}
while (begin != end) {
2021-04-26 15:06:47 +08:00
x = begin->rune;
if (('0' <= x && x <= '9') || x == '.') {
2021-04-26 15:06:47 +08:00
begin++;
} else {
break;
}
}
2021-04-26 15:06:47 +08:00
return begin;
}
2021-04-26 15:06:47 +08:00
void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
vector<size_t> status;
Viterbi(begin, end, status);
2021-04-26 15:06:47 +08:00
RuneStrArray::const_iterator left = begin;
RuneStrArray::const_iterator right;
for (size_t i = 0; i < status.size(); i++) {
if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
2021-04-26 15:06:47 +08:00
right = begin + i + 1;
WordRange wr(left, right - 1);
res.push_back(wr);
left = right;
}
}
}
2021-04-26 15:06:47 +08:00
void Viterbi(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<size_t>& status) const {
size_t Y = HMMModel::STATUS_SUM;
size_t X = end - begin;
2021-04-26 15:06:47 +08:00
size_t XYSize = X * Y;
size_t now, old, stat;
double tmp, endE, endS;
2021-07-07 11:37:00 +08:00
//vector<int> path(XYSize);
//vector<double> weight(XYSize);
int path[XYSize];
double weight[XYSize];
2021-04-26 15:06:47 +08:00
//start
for (size_t y = 0; y < Y; y++) {
2021-04-26 15:06:47 +08:00
weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
path[0 + y * X] = -1;
}
2021-04-26 15:06:47 +08:00
double emitProb;
for (size_t x = 1; x < X; x++) {
for (size_t y = 0; y < Y; y++) {
2021-04-26 15:06:47 +08:00
now = x + y * X;
weight[now] = MIN_DOUBLE;
path[now] = HMMModel::E; // warning
emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin + x)->rune, MIN_DOUBLE);
for (size_t preY = 0; preY < Y; preY++) {
2021-04-26 15:06:47 +08:00
old = x - 1 + preY * X;
tmp = weight[old] + model_->transProb[preY][y] + emitProb;
if (tmp > weight[now]) {
2021-04-26 15:06:47 +08:00
weight[now] = tmp;
path[now] = preY;
}
}
}
}
2021-04-26 15:06:47 +08:00
endE = weight[X - 1 + HMMModel::E * X];
endS = weight[X - 1 + HMMModel::S * X];
stat = 0;
if (endE >= endS) {
2021-04-26 15:06:47 +08:00
stat = HMMModel::E;
} else {
stat = HMMModel::S;
}
2021-04-26 15:06:47 +08:00
status.resize(X);
for (int x = X - 1 ; x >= 0; x--) {
2021-04-26 15:06:47 +08:00
status[x] = stat;
stat = path[x + stat * X];
}
}
2021-04-26 15:06:47 +08:00
const HMMModel* model_;
}; // class HMMSegment
} // namespace cppjieba