2021-01-29 11:43:07 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2020, KylinSoft Co., Ltd.
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
|
|
* (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
*
|
|
|
|
*
|
|
|
|
*/
|
2020-12-31 21:14:13 +08:00
|
|
|
#ifndef CPPJIEBA_UNICODE_H
|
|
|
|
#define CPPJIEBA_UNICODE_H
|
|
|
|
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include <ostream>
|
|
|
|
#include "limonp/LocalVector.hpp"
|
|
|
|
|
|
|
|
namespace cppjieba {
|
|
|
|
|
|
|
|
using std::string;
|
|
|
|
using std::vector;
|
|
|
|
|
|
|
|
// A single decoded character value (code point), held as an unsigned 32-bit integer.
typedef uint32_t Rune;
|
|
|
|
|
|
|
|
// A piece of text cut out of a source string, together with its position.
//
//   word            the extracted text
//   offset          position of the word in the source string, counted in bytes
//                   (GetWordFromRunes passes the byte offset of the first rune)
//   unicode_offset  position of the word counted in runes
//   unicode_length  length of the word counted in runes
struct Word {
    string word;
    uint32_t offset;
    uint32_t unicode_offset;
    uint32_t unicode_length;

    // Builds a word that only carries byte-position information.
    // The rune-based fields are zeroed so that they never hold indeterminate
    // values (reading them used to be undefined behaviour).
    Word(const string& w, uint32_t o)
        : word(w),
          offset(o),
          unicode_offset(0),
          unicode_length(0) {
    }

    // Builds a word carrying both byte-based and rune-based position information.
    Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
        : word(w),
          offset(o),
          unicode_offset(unicode_offset),
          unicode_length(unicode_length) {
    }
}; // struct Word
|
|
|
|
|
|
|
|
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
2021-04-26 15:06:47 +08:00
|
|
|
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
struct RuneStr {
|
2021-04-26 15:06:47 +08:00
|
|
|
Rune rune;
|
|
|
|
uint32_t offset;
|
|
|
|
uint32_t len;
|
|
|
|
uint32_t unicode_offset;
|
|
|
|
uint32_t unicode_length;
|
|
|
|
RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {
|
|
|
|
}
|
|
|
|
RuneStr(Rune r, uint32_t o, uint32_t l)
|
|
|
|
: rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {
|
|
|
|
}
|
|
|
|
RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
|
|
|
|
: rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
}; // struct RuneStr
|
|
|
|
|
|
|
|
inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
|
2021-04-26 15:06:47 +08:00
|
|
|
return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}";
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// A sequence of decoded runes.
typedef limonp::LocalVector<Rune> Unicode;
|
|
|
|
// A sequence of decoded runes, each annotated with its position (see RuneStr).
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
|
|
|
|
|
|
|
|
// [left, right]
|
|
|
|
struct WordRange {
|
2021-04-26 15:06:47 +08:00
|
|
|
RuneStrArray::const_iterator left;
|
|
|
|
RuneStrArray::const_iterator right;
|
|
|
|
WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
|
|
|
|
: left(l), right(r) {
|
|
|
|
}
|
|
|
|
size_t Length() const {
|
|
|
|
return right - left + 1;
|
|
|
|
}
|
|
|
|
bool IsAllAscii() const {
|
|
|
|
for(RuneStrArray::const_iterator iter = left; iter <= right; ++iter) {
|
|
|
|
if(iter->rune >= 0x80) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
}; // struct WordRange
|
|
|
|
|
|
|
|
// Result of decoding a single rune: the decoded value and the number of bytes
// it occupied. DecodeRuneInString reports failure with len == 0.
struct RuneStrLite {
    uint32_t rune;
    uint32_t len;

    // Zero value, zero length.
    RuneStrLite()
        : rune(0),
          len(0) {
    }

    RuneStrLite(uint32_t r, uint32_t l)
        : rune(r),
          len(l) {
    }
}; // struct RuneStrLite
|
|
|
|
|
|
|
|
// Decodes the first UTF-8 encoded rune of the buffer `str` of `len` bytes.
//
// Returns the decoded value and the number of bytes it occupies (1 to 4).
// Returns {0, 0} when nothing can be decoded:
//   - `str` is NULL or `len` is 0,
//   - the first byte is not a valid lead byte (a stray continuation byte
//     0x80-0xBF, or 0xF8 and above),
//   - the buffer ends before the sequence is complete,
//   - a trailing byte does not have the 10xxxxxx continuation form.
//
// Overlong encodings, surrogate values and values above 0x10FFFF are not
// rejected here.
inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
    RuneStrLite rp(0, 0);
    if (str == NULL || len == 0) {
        return rp;
    }

    const uint8_t lead = static_cast<uint8_t>(str[0]);
    uint32_t value = 0;
    uint32_t total = 0; // bytes in the whole sequence, lead byte included

    if (lead < 0x80) {                         // 0xxxxxxx: 7 payload bits
        value = lead;
        total = 1;
    } else if (lead >= 0xc0 && lead <= 0xdf) { // 110xxxxx: 5 + 6 = 11 payload bits
        value = lead & 0x1f;
        total = 2;
    } else if (lead >= 0xe0 && lead <= 0xef) { // 1110xxxx: 4 + 6 + 6 = 16 payload bits
        value = lead & 0x0f;
        total = 3;
    } else if (lead >= 0xf0 && lead <= 0xf7) { // 11110xxx: 3 + 6 + 6 + 6 = 21 payload bits
        value = lead & 0x07;
        total = 4;
    } else {
        // 10xxxxxx can only appear after a lead byte; 11111xxx is never valid.
        return rp;
    }

    if (len < total) {
        // Sequence is cut off by the end of the buffer.
        return rp;
    }

    for (uint32_t k = 1; k < total; ++k) {
        const uint8_t trail = static_cast<uint8_t>(str[k]);
        if ((trail & 0xc0) != 0x80) {
            // Not a continuation byte: refuse instead of swallowing the byte.
            return rp;
        }
        value = (value << 6) | (trail & 0x3f);
    }

    rp.rune = value;
    rp.len = total;
    return rp;
}
|
|
|
|
|
|
|
|
inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
|
2021-04-26 15:06:47 +08:00
|
|
|
runes.clear();
|
|
|
|
runes.reserve(len / 2);
|
|
|
|
for(uint32_t i = 0, j = 0; i < len;) {
|
|
|
|
RuneStrLite rp = DecodeRuneInString(s + i, len - i);
|
|
|
|
if(rp.len == 0) {
|
|
|
|
runes.clear();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
RuneStr x(rp.rune, i, rp.len, j, 1);
|
|
|
|
runes.push_back(x);
|
|
|
|
i += rp.len;
|
|
|
|
++j;
|
|
|
|
}
|
|
|
|
return true;
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
2021-04-26 15:06:47 +08:00
|
|
|
return DecodeRunesInString(s.c_str(), s.size(), runes);
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
|
2021-04-26 15:06:47 +08:00
|
|
|
unicode.clear();
|
|
|
|
RuneStrArray runes;
|
|
|
|
if(!DecodeRunesInString(s, len, runes)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
unicode.reserve(runes.size());
|
|
|
|
for(size_t i = 0; i < runes.size(); i++) {
|
|
|
|
unicode.push_back(runes[i].rune);
|
|
|
|
}
|
|
|
|
return true;
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline bool IsSingleWord(const string& str) {
|
2021-04-26 15:06:47 +08:00
|
|
|
RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
|
|
|
|
return rp.len == str.size();
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline bool DecodeRunesInString(const string& s, Unicode& unicode) {
|
2021-04-26 15:06:47 +08:00
|
|
|
return DecodeRunesInString(s.c_str(), s.size(), unicode);
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Decodes `s` and returns the rune values by value.
// The success flag of the underlying call is not reported to the caller; on
// failure the returned sequence is empty.
inline Unicode DecodeRunesInString(const string& s) {
    Unicode decoded;
    (void)DecodeRunesInString(s, decoded);
    return decoded;
}
|
|
|
|
|
|
|
|
|
|
|
|
// [left, right]
|
|
|
|
// Builds the Word covered by the runes from `left` to `right`, both inclusive.
// The text is taken from `s` using the byte offsets stored in the runes; the
// Word also receives the byte offset, the rune offset and the rune count.
// Asserts that `right` does not start before `left`.
inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
    assert(right->offset >= left->offset);

    const uint32_t byte_begin = left->offset;
    // Distance between the two start offsets plus the size of the last rune.
    const uint32_t byte_count = right->offset - byte_begin + right->len;

    const uint32_t rune_begin = left->unicode_offset;
    const uint32_t rune_count = right->unicode_offset - rune_begin + right->unicode_length;

    return Word(s.substr(byte_begin, byte_count), byte_begin, rune_begin, rune_count);
}
|
|
|
|
|
|
|
|
inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
|
2021-04-26 15:06:47 +08:00
|
|
|
assert(right->offset >= left->offset);
|
|
|
|
uint32_t len = right->offset - left->offset + right->len;
|
|
|
|
return s.substr(left->offset, len);
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) {
|
2021-04-26 15:06:47 +08:00
|
|
|
for(size_t i = 0; i < wrs.size(); i++) {
|
|
|
|
words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right));
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) {
|
2021-04-26 15:06:47 +08:00
|
|
|
vector<Word> result;
|
|
|
|
GetWordsFromWordRanges(s, wrs, result);
|
|
|
|
return result;
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
|
2021-04-26 15:06:47 +08:00
|
|
|
strs.resize(words.size());
|
|
|
|
for(size_t i = 0; i < words.size(); ++i) {
|
|
|
|
strs[i] = words[i].word;
|
|
|
|
}
|
2020-12-31 21:14:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace cppjieba
|
|
|
|
|
|
|
|
#endif // CPPJIEBA_UNICODE_H
|