Merge remote-tracking branch 'upstream-gitee/openkylin/yangtze' into packaging/openkylin/yangtze
This commit is contained in:
commit
1832b01721
|
@ -1,9 +1,9 @@
|
|||
<schemalist gettext-domain="ukui-search">
|
||||
<schema id="org.ukui.search.settings" path="/org/ukui/ukui-search/settings/">
|
||||
<key name="index-search" type="b">
|
||||
<key name="file-index-enable" type="b">
|
||||
<default>false</default>
|
||||
<summary>search method</summary>
|
||||
<description>Is current search-method index-search.</description>
|
||||
<summary>file index switch</summary>
|
||||
<description>Enable or disable file index service.</description>
|
||||
</key>
|
||||
<key name="web-engine" type="s">
|
||||
<default>"baidu"</default>
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
[Desktop Entry]
|
||||
Name=ukui-search-app-data-service
|
||||
Name[zh_CN]=应用数据搜索服务
|
||||
GenericName=ukui-search-app-data-service
|
||||
GenericName[zh_CN]=应用数据搜索服务
|
||||
Comment=ukui-search-app-data-service
|
||||
Comment[zh_CN]=应用数据搜索服务
|
||||
Exec=/usr/bin/ukui-search-app-data-service %U
|
||||
Type=Application
|
||||
Icon=kylin-search
|
||||
X-UKUI-AutoRestart=true
|
||||
OnlyShowIn=UKUI
|
||||
NoDisplay=true
|
||||
X-UKUI-Autostart-Phase=Application
|
||||
Terminal=false
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
[Desktop Entry]
|
||||
Name=ukui-search-service-dir-manager
|
||||
Name[zh_CN]=搜索服务目录管理
|
||||
GenericName=ukui-search-service-dir-manager
|
||||
GenericName[zh_CN]=搜索服务目录管理
|
||||
Comment=ukui-search-service-dir-manager
|
||||
Comment[zh_CN]=搜索服务目录管理
|
||||
Exec=/usr/bin/ukui-search-service-dir-manager %U
|
||||
Type=Application
|
||||
Icon=kylin-search
|
||||
X-UKUI-AutoRestart=true
|
||||
OnlyShowIn=UKUI
|
||||
NoDisplay=true
|
||||
X-UKUI-Autostart-Phase=Application
|
||||
Terminal=false
|
|
@ -1 +1 @@
|
|||
3.0 (quilt)
|
||||
3.0 (native)
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
|
||||
#include "create-index-ask-dialog.h"
|
||||
#include <QPainterPath>
|
||||
#include "kwindowsystem.h"
|
||||
#include <KWindowSystem>
|
||||
|
||||
#define MAIN_SIZE QSize(380, 202)
|
||||
#define MAIN_SPACING 0
|
||||
|
|
|
@ -45,7 +45,7 @@ SearchLineEdit::SearchLineEdit(QWidget *parent) : QLineEdit(parent) {
|
|||
pixmap = QPixmap(QIcon(":/res/icons/system-search.symbolic.png").pixmap(QSize(18, 18)));
|
||||
}
|
||||
m_queryIcon->setProperty("useIconHighlightEffect", 0x10);
|
||||
m_queryIcon->setFixedSize(pixmap.size());
|
||||
m_queryIcon->setFixedSize(pixmap.size() / pixmap.devicePixelRatio());
|
||||
m_queryIcon->setPixmap(pixmap);
|
||||
|
||||
m_ly = new QHBoxLayout(this);
|
||||
|
@ -75,6 +75,11 @@ SearchLineEdit::SearchLineEdit(QWidget *parent) : QLineEdit(parent) {
|
|||
m_timer->start(0.1 * 1000);
|
||||
}
|
||||
});
|
||||
|
||||
//跟随主题透明度变化
|
||||
connect(qApp, &QApplication::paletteChanged, this, [=]() {
|
||||
update();
|
||||
});
|
||||
}
|
||||
|
||||
SearchLineEdit::~SearchLineEdit() {
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
#include <QPushButton>
|
||||
#include <QScrollArea>
|
||||
#include <QTimer>
|
||||
#include <libsearch.h>
|
||||
#include "libsearch.h"
|
||||
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
|
||||
#include "xatom-helper.h"
|
||||
#endif
|
||||
|
|
|
@ -341,24 +341,25 @@ void ResultArea::mouseReleaseEvent(QMouseEvent *event)
|
|||
|
||||
bool ResultArea::viewportEvent(QEvent *event)
|
||||
{
|
||||
if(event->type() == QEvent::TouchBegin) {
|
||||
QTouchEvent *e = dynamic_cast<QTouchEvent *>(event);
|
||||
if(e->touchPoints().size() == 1) {
|
||||
m_pressPoint = m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint());
|
||||
if (event->type() == QEvent::MouseButtonPress) {
|
||||
QMouseEvent *e = dynamic_cast<QMouseEvent *>(event);
|
||||
if (e->source() == Qt::MouseEventSynthesizedByApplication) {
|
||||
qDebug() << "MouseButtonPress MouseEventSynthesizedByApplication";
|
||||
m_pressPoint = m_widget->mapFrom(this, e->pos());
|
||||
event->accept();
|
||||
return true;
|
||||
}
|
||||
} else if (event->type() == QEvent::TouchUpdate) {
|
||||
QTouchEvent *e = dynamic_cast<QTouchEvent *>(event);
|
||||
// qDebug() << "touchpoint===========" << e->touchPoints().size();
|
||||
if(e->touchPoints().size() == 1) {
|
||||
int delta = m_pressPoint.y() - m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint()).y();
|
||||
// qDebug() << "last pos:" << m_pressPoint.y();
|
||||
// qDebug() << "new pos:" << m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint()).y();
|
||||
// qDebug() << "delta" << delta;
|
||||
// qDebug() << "height" << m_widget->height() << "--" << verticalScrollBar()->maximum();
|
||||
} else if (event->type() == QEvent::MouseMove) {
|
||||
QMouseEvent *e = dynamic_cast<QMouseEvent *>(event);
|
||||
if (e->source() == Qt::MouseEventSynthesizedByApplication) {
|
||||
qDebug() << "MouseMove MouseEventSynthesizedByApplication";
|
||||
int delta = m_pressPoint.y() - m_widget->mapFrom(this, e->pos()).y();
|
||||
// qDebug() << "last pos:" << m_pressPoint.y();
|
||||
// qDebug() << "new pos:" << m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint()).y();
|
||||
// qDebug() << "delta" << delta;
|
||||
// qDebug() << "value" << verticalScrollBar()->value() << "--" << verticalScrollBar()->value() + delta;
|
||||
this->verticalScrollBar()->setValue(verticalScrollBar()->value() + delta);
|
||||
m_pressPoint = m_widget->mapFrom(this,e->touchPoints().at(0).pos().toPoint());
|
||||
m_pressPoint = m_widget->mapFrom(this,e->pos());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -458,12 +459,13 @@ void ResultArea::initConnections()
|
|||
connect(this->m_titleLabel, &TitleLabel::retractClicked, this, [=] () {
|
||||
Q_FOREACH(auto widget, m_widget_list) {
|
||||
if (widget->pluginName() == m_titleLabel->text()) {
|
||||
widget->reduceListSlot();
|
||||
widget->resetTitleLabel();
|
||||
if (!m_titleLabel->isHidden()) {
|
||||
m_titleLabel->hide();
|
||||
this->setViewportMargins(0,0,0,0);
|
||||
}
|
||||
widget->reduceListSlot();
|
||||
this->verticalScrollBar()->setValue(widget->pos().ry());
|
||||
widget->resetTitleLabel();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -510,8 +512,8 @@ void ResultArea::setupConnectionsForWidget(ResultWidget *widget)
|
|||
});
|
||||
connect(widget, &ResultWidget::retractClicked, this, [=] () {//点击收起搜索结果后
|
||||
if (!m_titleLabel->isHidden()) {
|
||||
m_titleLabel->hide();
|
||||
this->setViewportMargins(0,0,0,0);
|
||||
m_titleLabel->hide();
|
||||
}
|
||||
});
|
||||
connect(widget, &ResultWidget::sendBestListData, m_bestListWidget, &BestListWidget::sendBestListData);
|
||||
|
|
|
@ -173,6 +173,10 @@ void SearchResultPage::initConnections()
|
|||
sendResizeWidthSignal(280);
|
||||
});
|
||||
connect(this, &SearchResultPage::setSelectionInfo, m_resultArea, &ResultArea::setSelectionInfo);
|
||||
//跟随主题透明度变化
|
||||
connect(qApp, &QApplication::paletteChanged, this, [=]() {
|
||||
update();
|
||||
});
|
||||
}
|
||||
|
||||
void SearchResultPage::setupConnectionsForWidget(ResultWidget *widget)
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
QT += core gui dbus KWindowSystem xml x11extras
|
||||
QT += core gui dbus KWindowSystem xml x11extras sql
|
||||
|
||||
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
|
||||
|
||||
VERSION = 1.0.0
|
||||
VERSION = 2.2.3
|
||||
DEFINES += VERSION='\\"$${VERSION}\\"'
|
||||
TARGET = ukui-search
|
||||
TEMPLATE = app
|
||||
|
||||
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
|
||||
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 kysdk-waylandhelper
|
||||
CONFIG += c++11 link_pkgconfig no_keywords lrelease
|
||||
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
|
||||
#LIBS += -lukui-log4qt
|
||||
|
@ -59,7 +59,7 @@ RESOURCES += \
|
|||
TRANSLATIONS += \
|
||||
../translations/ukui-search/zh_CN.ts \
|
||||
../translations/ukui-search/tr.ts \
|
||||
../translations/ukui-search/bo.ts
|
||||
../translations/ukui-search/bo_CN.ts
|
||||
|
||||
qm_files.path = /usr/share/ukui-search/translations/
|
||||
qm_files.files = $$OUT_PWD/.qm/*.qm
|
||||
|
|
|
@ -31,15 +31,18 @@
|
|||
#include <QPixmap>
|
||||
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
|
||||
#include <KWindowEffects>
|
||||
#include "kwindowsystem.h"
|
||||
#include <KWindowSystem>
|
||||
|
||||
#endif
|
||||
#include "global-settings.h"
|
||||
#include <QtX11Extras/QX11Info>
|
||||
#include "ukuistylehelper/ukuistylehelper.h"
|
||||
#include "windowmanager/windowmanager.h"
|
||||
#include "global-settings.h"
|
||||
|
||||
#define MAIN_MARGINS 0, 0, 0, 0
|
||||
#define TITLE_MARGINS 0,0,0,0
|
||||
#define UKUI_SEARCH_SCHEMAS "org.ukui.search.settings"
|
||||
#define SEARCH_METHOD_KEY "indexSearch"
|
||||
#define SEARCH_METHOD_KEY "fileIndexEnable"
|
||||
#define WEB_ENGINE_KEY "webEngine"
|
||||
#define WINDOW_WIDTH 700
|
||||
#define WINDOW_HEIGHT 610
|
||||
|
@ -79,12 +82,19 @@ MainWindow::MainWindow(QWidget *parent) :
|
|||
initConnections();
|
||||
initGsettings();
|
||||
|
||||
connect(KWindowSystem::self(), &KWindowSystem::activeWindowChanged, this,[&](WId activeWindowId){
|
||||
if (activeWindowId != this->winId()) {
|
||||
tryHideMainwindow();
|
||||
}
|
||||
});
|
||||
// connect(KWindowSystem::self(), &KWindowSystem::activeWindowChanged, this,[&](WId activeWindowId){
|
||||
// qDebug() << "activeWindowChanged!!!" << activeWindowId;
|
||||
// if (activeWindowId != this->winId()) {
|
||||
// tryHideMainwindow();
|
||||
// }
|
||||
// });
|
||||
|
||||
m_appWidgetPlugin = new AppWidgetPlugin;
|
||||
|
||||
connect(m_appWidgetPlugin, &AppWidgetPlugin::startSearch, this, [ & ] (QString keyword){
|
||||
this->bootOptionsFilter("-s");
|
||||
this->setText(keyword);
|
||||
});
|
||||
//NEW_TODO, register plugins
|
||||
// SearchPluginManager::getInstance()->registerPlugin(\\);
|
||||
// m_stackedWidget->setPlugins(SearchPluginManager::getInstance()->getPluginIds());
|
||||
|
@ -150,11 +160,11 @@ void MainWindow::initUi() {
|
|||
//创建索引询问弹窗
|
||||
m_askDialog = new CreateIndexAskDialog(this);
|
||||
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
|
||||
MotifWmHints ask_dialog_hints;
|
||||
ask_dialog_hints.flags = MWM_HINTS_FUNCTIONS | MWM_HINTS_DECORATIONS;
|
||||
ask_dialog_hints.functions = MWM_FUNC_ALL;
|
||||
ask_dialog_hints.decorations = MWM_DECOR_BORDER;
|
||||
XAtomHelper::getInstance()->setWindowMotifHint(m_askDialog->winId(), ask_dialog_hints);
|
||||
// MotifWmHints ask_dialog_hints;
|
||||
// ask_dialog_hints.flags = MWM_HINTS_FUNCTIONS | MWM_HINTS_DECORATIONS;
|
||||
// ask_dialog_hints.functions = MWM_FUNC_ALL;
|
||||
// ask_dialog_hints.decorations = MWM_DECOR_BORDER;
|
||||
// XAtomHelper::getInstance()->setWindowMotifHint(m_askDialog->winId(), ask_dialog_hints);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -192,7 +202,6 @@ void MainWindow::bootOptionsFilter(QString opt) {
|
|||
if (this->isHidden()) {
|
||||
clearSearchResult();
|
||||
centerToScreen(this);
|
||||
this->show();
|
||||
this->m_searchBarWidget->setFocus();
|
||||
this->activateWindow();
|
||||
}
|
||||
|
@ -217,7 +226,6 @@ void MainWindow::trayIconActivatedSlot(QSystemTrayIcon::ActivationReason reason)
|
|||
if(!this->isVisible()) {
|
||||
clearSearchResult();
|
||||
centerToScreen(this);
|
||||
this->show();
|
||||
// this->m_searchLineEdit->focusIn(); //打开主界面时输入框夺焦,可直接输入
|
||||
this->raise();
|
||||
this->activateWindow();
|
||||
|
@ -301,10 +309,12 @@ void MainWindow::searchKeywordSlot(const QString &keyword)
|
|||
//NEW_TODO
|
||||
if(keyword == "") {
|
||||
// m_stackedWidget->setPage(int(StackedPage::HomePage));
|
||||
QTimer::singleShot(10, this, [ = ]() {
|
||||
m_askTimer->stop();
|
||||
Q_EMIT m_searchResultPage->stopSearch();
|
||||
// Q_EMIT m_searchResultPage->stopSearch();
|
||||
m_searchResultPage->hide();
|
||||
this->resizeHeight(68);
|
||||
});
|
||||
|
||||
} else {
|
||||
// m_stackedWidget->setPage(int(StackedPage::SearchPage));
|
||||
|
@ -327,6 +337,11 @@ void MainWindow::resizeHeight(int height)
|
|||
this->setFixedHeight(height);
|
||||
}
|
||||
|
||||
void MainWindow::tryHide()
|
||||
{
|
||||
this->tryHideMainwindow();
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief monitorResolutionChange 监听屏幕改变
|
||||
* @param rect
|
||||
|
@ -421,7 +436,16 @@ void MainWindow::centerToScreen(QWidget* widget) {
|
|||
// desk_x = width;
|
||||
// desk_y = height;
|
||||
// }
|
||||
widget->move(desk_x / 2 - x / 2 + desk_rect.left(), desk_y / 3 + desk_rect.top());
|
||||
widget->show();
|
||||
kdk::WindowManager::setGeometry(this->windowHandle(),QRect(desk_x / 2 - x / 2 + desk_rect.left(),
|
||||
desk_y / 3 + desk_rect.top(),
|
||||
this->width(),
|
||||
this->height()));
|
||||
//设置跳过多任务视图
|
||||
kdk::WindowManager::setSkipSwitcher(this->windowHandle(),true);
|
||||
//设置跳过任务栏
|
||||
kdk::WindowManager::setSkipTaskBar(this->windowHandle(),true);
|
||||
// widget->move(desk_x / 2 - x / 2 + desk_rect.left(), desk_y / 3 + desk_rect.top());
|
||||
}
|
||||
|
||||
void MainWindow::initGsettings() {
|
||||
|
@ -460,7 +484,12 @@ void MainWindow::initTimer() {
|
|||
connect(m_askTimer, &QTimer::timeout, this, [ = ]() {
|
||||
if(this->isVisible()) {
|
||||
m_isAskDialogVisible = true;
|
||||
kdk::UkuiStyleHelper::self()->removeHeader(m_askDialog);
|
||||
m_askDialog->show();
|
||||
//设置跳过多任务视图
|
||||
kdk::WindowManager::setSkipSwitcher(m_askDialog->windowHandle(),true);
|
||||
//设置跳过任务栏
|
||||
kdk::WindowManager::setSkipTaskBar(m_askDialog->windowHandle(),true);
|
||||
m_currentSearchAsked = true;
|
||||
}
|
||||
m_askTimer->stop();
|
||||
|
@ -558,11 +587,21 @@ void MainWindow::paintEvent(QPaintEvent *event) {
|
|||
|
||||
bool MainWindow::eventFilter(QObject *watched, QEvent *event)
|
||||
{
|
||||
if (watched == this) {
|
||||
//失焦退出
|
||||
if (event->type() == QEvent::ActivationChange) {
|
||||
if (QApplication::activeWindow() != this) {
|
||||
tryHideMainwindow();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
//kwin alt+f4发出close事件, 需要在存在子窗口时屏蔽该事件
|
||||
if ((watched == this) && (event->type() == QEvent::Close)) {
|
||||
if (event->type() == QEvent::Close) {
|
||||
event->ignore();
|
||||
tryHideMainwindow();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return QObject::eventFilter(watched, event);
|
||||
}
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
#include <QSystemTrayIcon>
|
||||
#include <QTimer>
|
||||
|
||||
#include "search-app-widget-plugin/search.h"
|
||||
#include "index-generator.h"
|
||||
#include "libsearch.h"
|
||||
#include "create-index-ask-dialog.h"
|
||||
|
@ -110,6 +111,7 @@ public Q_SLOTS:
|
|||
void settingsBtnClickedSlot();
|
||||
void searchKeywordSlot(const QString&);
|
||||
void resizeHeight(int height);
|
||||
void tryHide();
|
||||
|
||||
private:
|
||||
|
||||
|
@ -142,6 +144,7 @@ private:
|
|||
QTimer * m_researchTimer = nullptr; //创建索引后重新执行一次搜索的计时器
|
||||
bool m_currentSearchAsked = false; //本次搜索是否已经询问过是否创建索引了
|
||||
QGSettings * m_search_gsettings = nullptr;
|
||||
AppWidgetPlugin *m_appWidgetPlugin = nullptr;
|
||||
|
||||
void setSearchMethod(const bool&);
|
||||
double getTransparentData();
|
||||
|
|
|
@ -67,6 +67,7 @@ ReceiveResultThread::ReceiveResultThread(DataQueue<SearchPluginIface::ResultInfo
|
|||
void ReceiveResultThread::stop()
|
||||
{
|
||||
this->requestInterruption();
|
||||
this->wait();
|
||||
this->quit();
|
||||
}
|
||||
|
||||
|
|
Binary file not shown.
|
@ -3,7 +3,6 @@
|
|||
<file>res/icons/edit-find-symbolic.svg</file>
|
||||
<file>res/icons/desktop.png</file>
|
||||
<file>res/icons/close.svg</file>
|
||||
<file>res/qt-translations/qt_zh_CN.qm</file>
|
||||
<file>res/icons/net-disconnected.svg</file>
|
||||
<file>res/icons/system-search.symbolic.png</file>
|
||||
<file>res/icons/ukui-up-symbolic.svg</file>
|
||||
|
|
|
@ -12,6 +12,15 @@ void UkuiSearchDbusServices::searchKeyword(QString keyword)
|
|||
m_mainWindow->setText(keyword);
|
||||
}
|
||||
|
||||
void UkuiSearchDbusServices::mainWindowSwitch()
|
||||
{
|
||||
if (m_mainWindow->isActiveWindow()) {
|
||||
m_mainWindow->tryHide();
|
||||
} else {
|
||||
m_mainWindow->bootOptionsFilter("-s");
|
||||
}
|
||||
}
|
||||
|
||||
UkuiSearchDbusServices::UkuiSearchDbusServices(MainWindow *m)
|
||||
{
|
||||
m_mainWindow = m;
|
||||
|
|
|
@ -20,9 +20,10 @@ public:
|
|||
public Q_SLOTS:
|
||||
void showWindow();
|
||||
void searchKeyword(QString keyword);
|
||||
void mainWindowSwitch();
|
||||
|
||||
private:
|
||||
MainWindow *m_mainWindow;
|
||||
MainWindow *m_mainWindow = nullptr;
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ UkuiSearchGui::UkuiSearchGui(int &argc, char *argv[], const QString &application
|
|||
|
||||
QTranslator *qt_translator = new QTranslator(this);
|
||||
try {
|
||||
if(! qt_translator->load(":/res/qt-translations/qt_zh_CN.qm")) throw - 1;
|
||||
if(! qt_translator->load("/usr/share/qt5/translations/qt_" + QLocale::system().name())) throw - 1;
|
||||
this->installTranslator(qt_translator);
|
||||
} catch(...) {
|
||||
qDebug() << "Load translations file" << QLocale() << "failed!";
|
||||
|
|
|
@ -31,13 +31,24 @@ bool BestListView::isSelected()
|
|||
|
||||
int BestListView::showHeight()
|
||||
{
|
||||
int height;
|
||||
int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
|
||||
int height(0);
|
||||
// int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
|
||||
// if (this->isExpanded()) {
|
||||
// height = m_count * rowheight;
|
||||
// } else {
|
||||
// int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
|
||||
// height = show_count * rowheight;
|
||||
// }
|
||||
|
||||
if (this->isExpanded()) {
|
||||
height = m_count * rowheight;
|
||||
for (int i = 0; i<m_count; ++i) {
|
||||
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
|
||||
}
|
||||
} else {
|
||||
int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
|
||||
height = show_count * rowheight;
|
||||
for (int i = 0; i<show_count; ++i) {
|
||||
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
|
||||
}
|
||||
}
|
||||
return height;
|
||||
}
|
||||
|
|
|
@ -3,15 +3,15 @@
|
|||
using namespace UkuiSearch;
|
||||
static ResultItemStyle *global_instance_of_item_style = nullptr;
|
||||
|
||||
ResultViewDelegate::ResultViewDelegate(QObject *parent) : QStyledItemDelegate(parent)
|
||||
ResultViewDelegate::ResultViewDelegate(QObject *parent) : QStyledItemDelegate(parent),
|
||||
m_textDoc(new QTextDocument(this)),
|
||||
m_hightLightEffectHelper(new HightLightEffectHelper(this))
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
void ResultViewDelegate::setSearchKeyword(const QString ®FindKeyWords)
|
||||
{
|
||||
m_regFindKeyWords.clear();
|
||||
m_regFindKeyWords = regFindKeyWords;
|
||||
m_hightLightEffectHelper->setExpression(regFindKeyWords);
|
||||
}
|
||||
|
||||
QSize ResultViewDelegate::sizeHint(const QStyleOptionViewItem &option, const QModelIndex &index) const
|
||||
|
@ -21,90 +21,36 @@ QSize ResultViewDelegate::sizeHint(const QStyleOptionViewItem &option, const QMo
|
|||
return size;
|
||||
}
|
||||
|
||||
void ResultViewDelegate::paint(QPainter * painter, const QStyleOptionViewItem & option, const QModelIndex & index) const {
|
||||
void ResultViewDelegate::paint(QPainter *painter, const QStyleOptionViewItem &option, const QModelIndex &index) const
|
||||
{
|
||||
QStyleOptionViewItem opt = option;
|
||||
initStyleOption(&opt, index);
|
||||
QStyle *style = opt.widget->style();
|
||||
opt.displayAlignment = Qt::Alignment(Qt::AlignLeft|Qt::AlignVCenter);
|
||||
|
||||
QString text = opt.text;
|
||||
if(text.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
opt.text = QString();
|
||||
|
||||
QStyle *style = opt.widget->style();
|
||||
style->proxy()->drawControl(QStyle::CE_ItemViewItem, &opt, painter, opt.widget); //绘制非文本区域内容
|
||||
|
||||
opt.text = text;
|
||||
QTextDocument doc;
|
||||
doc.setHtml(getHtmlText(painter, opt, index)); //提取富文本
|
||||
QAbstractTextDocumentLayout* layout = doc.documentLayout();
|
||||
const double height = layout->documentSize().height();
|
||||
|
||||
|
||||
QRect textRect = style->subElementRect(QStyle::SE_ItemViewItemText, &opt, opt.widget);
|
||||
//使图标和文本间隔与原来保持一致,故文本区域右移4
|
||||
// textRect.adjust(4, 0, 0, 0);
|
||||
double y = textRect.y();
|
||||
y += (textRect.height() - height) / 2;
|
||||
QFontMetrics fontMetrics(opt.font);
|
||||
text = fontMetrics.elidedText(text, Qt::ElideRight, textRect.width() - 5); //富余5px的宽度
|
||||
opt.text = text;
|
||||
|
||||
QAbstractTextDocumentLayout::PaintContext context;
|
||||
|
||||
QPalette::ColorGroup cg = opt.state & QStyle::State_Enabled
|
||||
? QPalette::Normal : QPalette::Disabled;
|
||||
if (cg == QPalette::Normal && !(opt.state & QStyle::State_Active))
|
||||
cg = QPalette::Inactive;
|
||||
|
||||
if(opt.state & QStyle::State_Selected) {
|
||||
painter->setPen(opt.palette.color(cg, QPalette::HighlightedText));
|
||||
} else {
|
||||
painter->setPen(opt.palette.color(cg, QPalette::Text));
|
||||
}
|
||||
painter->save();
|
||||
painter->translate(QPointF(textRect.x(), y));
|
||||
layout->draw(painter, context); //绘制文本区域内容
|
||||
if(opt.state & QStyle::State_Selected) {
|
||||
m_hightLightEffectHelper->setTextColor(QBrush(opt.palette.highlightedText().color()));
|
||||
} else {
|
||||
m_hightLightEffectHelper->setTextColor(QBrush(opt.palette.text().color()));
|
||||
}
|
||||
painter->translate(textRect.topLeft());
|
||||
|
||||
m_textDoc->setPlainText(text);
|
||||
m_hightLightEffectHelper->setDocument(m_textDoc);
|
||||
m_hightLightEffectHelper->rehighlight();
|
||||
m_textDoc->drawContents(painter);
|
||||
painter->restore();
|
||||
|
||||
}
|
||||
|
||||
QString ResultViewDelegate::getHtmlText(QPainter *painter, const QStyleOptionViewItem &itemOption, const QModelIndex &index) const
|
||||
{
|
||||
int indexFindLeft = 0;
|
||||
QString indexString = index.model()->data(index, Qt::DisplayRole).toString();
|
||||
QFont ft(painter->font().family(), GlobalSettings::getInstance()->getValue(FONT_SIZE_KEY).toInt());
|
||||
QFontMetrics fm(ft);
|
||||
QString indexColString = fm.elidedText(indexString, Qt::ElideRight, itemOption.rect.width() - 30 - 10); //当字体超过Item的长度时显示为省略号
|
||||
QString htmlString;
|
||||
if((indexColString.toUpper()).contains((m_regFindKeyWords.toUpper()))) {
|
||||
indexFindLeft = indexColString.toUpper().indexOf(m_regFindKeyWords.toUpper()); //得到查找字体在当前整个Item字体中的位置
|
||||
htmlString = escapeHtml(indexColString.left(indexFindLeft)) + "<b>" + escapeHtml(indexColString.mid(indexFindLeft, m_regFindKeyWords.length())) + "</b>" + escapeHtml(indexColString.right(indexColString.length() - indexFindLeft - m_regFindKeyWords.length()));
|
||||
} else {
|
||||
bool boldOpenned = false;
|
||||
for(int i = 0; i < indexColString.length(); i++) {
|
||||
if((m_regFindKeyWords.toUpper()).contains(QString(indexColString.at(i)).toUpper())) {
|
||||
if(! boldOpenned) {
|
||||
boldOpenned = true;
|
||||
htmlString.append(QString("<b>"));
|
||||
}
|
||||
htmlString.append(escapeHtml(QString(indexColString.at(i))));
|
||||
} else {
|
||||
if(boldOpenned) {
|
||||
boldOpenned = false;
|
||||
htmlString.append(QString("</b>"));
|
||||
}
|
||||
htmlString.append(escapeHtml(QString(indexColString.at(i))));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// qDebug()<<indexColString<<"---->"<<htmlString;
|
||||
return "<pre>" + htmlString + "</pre>";
|
||||
}
|
||||
|
||||
QString ResultViewDelegate::escapeHtml(const QString &str) const
|
||||
{
|
||||
QString temp = str;
|
||||
temp.replace("<", "<");
|
||||
temp.replace(">", ">");
|
||||
return temp;
|
||||
}
|
||||
|
||||
ResultItemStyle *ResultItemStyle::getStyle()
|
||||
|
@ -259,3 +205,32 @@ void ResultItemStyle::drawControl(QStyle::ControlElement element, const QStyleOp
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Creates the highlighter and configures its matching expression.
 *
 * The expression is matched as a literal string (no regular-expression
 * metacharacters) and without regard to case.
 *
 * @param parent Owning QObject, forwarded to QSyntaxHighlighter.
 */
HightLightEffectHelper::HightLightEffectHelper(QObject *parent)
    : QSyntaxHighlighter(parent)
{
    m_expression.setPatternSyntax(QRegExp::FixedString);
    m_expression.setCaseSensitivity(Qt::CaseInsensitive);
}
|
||||
|
||||
void HightLightEffectHelper::setExpression(const QString &text)
|
||||
{
|
||||
m_expression.setPattern(text);
|
||||
}
|
||||
|
||||
void HightLightEffectHelper::setTextColor(const QBrush &brush)
|
||||
{
|
||||
m_textCharFormat.setForeground(brush);
|
||||
}
|
||||
|
||||
void HightLightEffectHelper::highlightBlock(const QString &text)
|
||||
{
|
||||
setFormat(0, text.length(), m_textCharFormat);
|
||||
m_textCharFormat.setFontWeight(QFont::Bold);
|
||||
int index = text.indexOf(m_expression);
|
||||
while(index >= 0){
|
||||
int length = m_expression.matchedLength();
|
||||
setFormat(index, length, m_textCharFormat);
|
||||
index = text.indexOf(m_expression, index+length);
|
||||
}
|
||||
m_textCharFormat.setFontWeight(QFont::Normal);
|
||||
}
|
||||
|
|
|
@ -27,10 +27,29 @@
|
|||
#include <QTextDocument>
|
||||
#include <QAbstractTextDocumentLayout>
|
||||
#include <QProxyStyle>
|
||||
#include <QSyntaxHighlighter>
|
||||
#include <QTextCharFormat>
|
||||
#include <QRegExp>
|
||||
#include "global-settings.h"
|
||||
|
||||
namespace UkuiSearch {
|
||||
class ResultViewDelegate : public QStyledItemDelegate {
|
||||
class HightLightEffectHelper : public QSyntaxHighlighter
|
||||
{
|
||||
public:
|
||||
explicit HightLightEffectHelper(QObject *parent = nullptr);
|
||||
void setExpression(const QString &text);
|
||||
void setTextColor(const QBrush &brush);
|
||||
|
||||
protected:
|
||||
void highlightBlock(const QString &text);
|
||||
|
||||
private:
|
||||
QRegExp m_expression;
|
||||
QTextCharFormat m_textCharFormat;
|
||||
};
|
||||
|
||||
class ResultViewDelegate : public QStyledItemDelegate
|
||||
{
|
||||
Q_OBJECT
|
||||
public:
|
||||
explicit ResultViewDelegate(QObject *parent = nullptr);
|
||||
|
@ -38,11 +57,12 @@ public:
|
|||
void setSearchKeyword(const QString &);
|
||||
protected:
|
||||
QSize sizeHint(const QStyleOptionViewItem &option, const QModelIndex &index) const;
|
||||
private:
|
||||
QString m_regFindKeyWords = 0;
|
||||
void paint(QPainter *, const QStyleOptionViewItem &, const QModelIndex &) const override;
|
||||
QString getHtmlText(QPainter *, const QStyleOptionViewItem &, const QModelIndex &) const;
|
||||
QString escapeHtml(const QString&) const;
|
||||
|
||||
private:
|
||||
QTextDocument *m_textDoc = nullptr;
|
||||
HightLightEffectHelper *m_hightLightEffectHelper = nullptr;
|
||||
|
||||
};
|
||||
|
||||
class ResultItemStyle : public QProxyStyle
|
||||
|
|
|
@ -168,6 +168,7 @@ ResultView::ResultView(const QString &plugin_id, QWidget *parent) : QTreeView(pa
|
|||
{
|
||||
// setStyle(ResultItemStyle::getStyle());
|
||||
this->setFrameShape(QFrame::NoFrame);
|
||||
this->viewport()->setAttribute(Qt::WA_AcceptTouchEvents);
|
||||
this->viewport()->setAutoFillBackground(false);
|
||||
this->setIconSize(QSize(VIEW_ICON_SIZE, VIEW_ICON_SIZE));
|
||||
this->setRootIsDecorated(false);
|
||||
|
@ -181,6 +182,9 @@ ResultView::ResultView(const QString &plugin_id, QWidget *parent) : QTreeView(pa
|
|||
m_plugin_id = plugin_id;
|
||||
m_styleDelegate = new ResultViewDelegate(this);
|
||||
this->setItemDelegate(m_styleDelegate);
|
||||
m_touchTimer = new QTimer(this);
|
||||
m_touchTimer->setSingleShot(true);
|
||||
m_touchTimer->setInterval(100);
|
||||
}
|
||||
|
||||
bool ResultView::isSelected()
|
||||
|
@ -190,13 +194,23 @@ bool ResultView::isSelected()
|
|||
|
||||
int ResultView::showHeight()
|
||||
{
|
||||
int height;
|
||||
int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
|
||||
int height(0);
|
||||
// int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
|
||||
// if (this->isExpanded()) {
|
||||
// height = m_count * rowheight;
|
||||
// } else {
|
||||
// int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
|
||||
// height = show_count * rowheight;
|
||||
// }
|
||||
if (this->isExpanded()) {
|
||||
height = m_count * rowheight;
|
||||
for (int i = 0; i<m_count; ++i) {
|
||||
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
|
||||
}
|
||||
} else {
|
||||
int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
|
||||
height = show_count * rowheight;
|
||||
for (int i = 0; i<show_count; ++i) {
|
||||
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
|
||||
}
|
||||
}
|
||||
return height;
|
||||
}
|
||||
|
@ -268,10 +282,10 @@ void ResultView::onRowSelectedSlot(const QModelIndex &index)
|
|||
void ResultView::onItemListChanged(const int &count)
|
||||
{
|
||||
m_count = count;
|
||||
Q_EMIT this->listLengthChanged(count);
|
||||
QModelIndex index = this->currentIndex();
|
||||
m_model->refresh();
|
||||
this->setCurrentIndex(index);
|
||||
Q_EMIT this->listLengthChanged(count);
|
||||
}
|
||||
|
||||
void ResultView::setExpanded(const bool &is_expanded)
|
||||
|
@ -334,6 +348,53 @@ void ResultView::mouseMoveEvent(QMouseEvent *event)
|
|||
return QTreeView::mouseMoveEvent(event);
|
||||
}
|
||||
|
||||
/**
 * @brief Translates touch events on the viewport into application-synthesized
 *        mouse events.
 *
 * - TouchBegin:  sends a MouseButtonPress to parent() and starts m_touchTimer.
 * - TouchUpdate: sends a MouseMove to parent().
 * - TouchEnd:    if m_touchTimer is still running, sends a press followed by
 *                a release to this view's viewport.
 *
 * All synthesized events use the first touch point and carry
 * Qt::MouseEventSynthesizedByApplication as their source. Any other event
 * type is passed to QTreeView::viewportEvent().
 *
 * Compared with the previous version, the touch-point list is checked before
 * it is indexed, and the event is cast with static_cast after its type has
 * been tested, instead of dereferencing an unchecked dynamic_cast result.
 *
 * @param event Event delivered to the viewport.
 * @return true when the touch event was consumed here, otherwise the result
 *         of the base-class handler.
 */
bool ResultView::viewportEvent(QEvent *event)
{
    const QEvent::Type type = event->type();
    if (type != QEvent::TouchBegin && type != QEvent::TouchUpdate && type != QEvent::TouchEnd) {
        return QTreeView::viewportEvent(event);
    }

    // The type check above guarantees this is a QTouchEvent.
    QTouchEvent *touch = static_cast<QTouchEvent *>(event);
    const bool hasPoint = !touch->touchPoints().isEmpty();
    const QPointF localPos = hasPoint ? touch->touchPoints().at(0).pos() : QPointF();

    // Builds a left-button mouse event at the first touch point and delivers it.
    // NOTE(review): the release event also reports Qt::LeftButton in buttons(),
    // as in the original code — confirm that is intended.
    auto sendMouse = [this, &localPos](QEvent::Type mouseType, QObject *receiver) {
        const QPoint point = localPos.toPoint();
        QMouseEvent me(mouseType,
                       localPos,
                       this->mapTo(this->window(), point),
                       this->mapToGlobal(point),
                       Qt::LeftButton, Qt::LeftButton, Qt::NoModifier,
                       Qt::MouseEventSynthesizedByApplication);
        QApplication::sendEvent(receiver, &me);
    };

    switch (type) {
    case QEvent::TouchBegin:
        qDebug() << "TouchBegin==============";
        if (!hasPoint) {
            return QTreeView::viewportEvent(event);
        }
        sendMouse(QEvent::MouseButtonPress, parent());
        m_touchTimer->start();
        event->accept();
        return true;

    case QEvent::TouchEnd:
        qDebug() << "touchend==============" << m_touchTimer->remainingTime();
        // remainingTime() is an int; a positive value means the timer has not fired yet.
        if (hasPoint && m_touchTimer->remainingTime() > 0) {
            sendMouse(QEvent::MouseButtonPress, this->viewport());
            sendMouse(QEvent::MouseButtonRelease, this->viewport());
        }
        return true;

    default: // QEvent::TouchUpdate
        qDebug() << "touchupdate==============";
        if (!hasPoint) {
            return QTreeView::viewportEvent(event);
        }
        sendMouse(QEvent::MouseMove, parent());
        return true;
    }
}
|
||||
|
||||
void ResultView::initConnections()
|
||||
{
|
||||
connect(this, &ResultView::startSearch, [ = ](const QString &keyword) {
|
||||
|
|
|
@ -37,6 +37,7 @@ protected:
|
|||
void mousePressEvent(QMouseEvent *event);
|
||||
void mouseReleaseEvent(QMouseEvent *event);
|
||||
void mouseMoveEvent(QMouseEvent *event);
|
||||
bool viewportEvent(QEvent *event);
|
||||
|
||||
private:
|
||||
void initConnections();
|
||||
|
@ -47,6 +48,7 @@ private:
|
|||
int m_count = 0;
|
||||
QModelIndex m_tmpCurrentIndex;
|
||||
QModelIndex m_tmpMousePressIndex;
|
||||
QTimer *m_touchTimer;
|
||||
|
||||
Q_SIGNALS:
|
||||
void startSearch(const QString &);
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#include <QDBusReply>
|
||||
#include "web-search-view.h"
|
||||
#define MAIN_MARGINS 0,0,0,0
|
||||
#define MAIN_SPACING 0
|
||||
|
@ -97,6 +98,30 @@ void WebSearchView::LaunchBrowser()
|
|||
} else { //默认值
|
||||
address = "http://baidu.com/s?word=" + m_keyWord ; //百度
|
||||
}
|
||||
bool res(false);
|
||||
QDBusInterface * appLaunchInterface = new QDBusInterface("com.kylin.AppManager",
|
||||
"/com/kylin/AppManager",
|
||||
"com.kylin.AppManager",
|
||||
QDBusConnection::sessionBus());
|
||||
if(!appLaunchInterface->isValid()) {
|
||||
qWarning() << qPrintable(QDBusConnection::sessionBus().lastError().message());
|
||||
res = false;
|
||||
} else {
|
||||
appLaunchInterface->setTimeout(10000);
|
||||
QDBusReply<bool> reply = appLaunchInterface->call("LaunchDefaultAppWithUrl", address);
|
||||
if(reply.isValid()) {
|
||||
res = reply;
|
||||
} else {
|
||||
qWarning() << "SoftWareCenter dbus called failed!";
|
||||
res = false;
|
||||
}
|
||||
}
|
||||
if(appLaunchInterface) {
|
||||
delete appLaunchInterface;
|
||||
}
|
||||
appLaunchInterface = NULL;
|
||||
if (res)
|
||||
return;
|
||||
QDesktopServices::openUrl(address);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,674 @@
|
|||
GNU GENERAL PUBLIC LICENSE
|
||||
Version 3, 29 June 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU General Public License is a free, copyleft license for
|
||||
software and other kinds of works.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
the GNU General Public License is intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users. We, the Free Software Foundation, use the
|
||||
GNU General Public License for most of our software; it applies also to
|
||||
any other work released this way by its authors. You can apply it to
|
||||
your programs, too.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
To protect your rights, we need to prevent others from denying you
|
||||
these rights or asking you to surrender the rights. Therefore, you have
|
||||
certain responsibilities if you distribute copies of the software, or if
|
||||
you modify it: responsibilities to respect the freedom of others.
|
||||
|
||||
For example, if you distribute copies of such a program, whether
|
||||
gratis or for a fee, you must pass on to the recipients the same
|
||||
freedoms that you received. You must make sure that they, too, receive
|
||||
or can get the source code. And you must show them these terms so they
|
||||
know their rights.
|
||||
|
||||
Developers that use the GNU GPL protect your rights with two steps:
|
||||
(1) assert copyright on the software, and (2) offer you this License
|
||||
giving you legal permission to copy, distribute and/or modify it.
|
||||
|
||||
For the developers' and authors' protection, the GPL clearly explains
|
||||
that there is no warranty for this free software. For both users' and
|
||||
authors' sake, the GPL requires that modified versions be marked as
|
||||
changed, so that their problems will not be attributed erroneously to
|
||||
authors of previous versions.
|
||||
|
||||
Some devices are designed to deny users access to install or run
|
||||
modified versions of the software inside them, although the manufacturer
|
||||
can do so. This is fundamentally incompatible with the aim of
|
||||
protecting users' freedom to change the software. The systematic
|
||||
pattern of such abuse occurs in the area of products for individuals to
|
||||
use, which is precisely where it is most unacceptable. Therefore, we
|
||||
have designed this version of the GPL to prohibit the practice for those
|
||||
products. If such problems arise substantially in other domains, we
|
||||
stand ready to extend this provision to those domains in future versions
|
||||
of the GPL, as needed to protect the freedom of users.
|
||||
|
||||
Finally, every program is threatened constantly by software patents.
|
||||
States should not allow patents to restrict development and use of
|
||||
software on general-purpose computers, but in those that do, we wish to
|
||||
avoid the special danger that patents applied to a free program could
|
||||
make it effectively proprietary. To prevent this, the GPL assures that
|
||||
patents cannot be used to render the program non-free.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Use with the GNU Affero General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU Affero General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the special requirements of the GNU Affero General Public License,
|
||||
section 13, concerning interaction through a network will apply to the
|
||||
combination as such.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU General Public License from time to time. Such new versions will
|
||||
be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If the program does terminal interaction, make it output a short
|
||||
notice like this when it starts in an interactive mode:
|
||||
|
||||
<program> Copyright (C) <year> <name of author>
|
||||
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions; type `show c' for details.
|
||||
|
||||
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||
parts of the General Public License. Of course, your program's commands
|
||||
might be different; for a GUI interface, you would use an "about box".
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU GPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
The GNU General Public License does not permit incorporating your program
|
||||
into proprietary programs. If your program is a subroutine library, you
|
||||
may consider it more useful to permit linking proprietary applications with
|
||||
the library. If this is what you want to do, use the GNU Lesser General
|
||||
Public License instead of this License. But first, please read
|
||||
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef CHINESESEGMENTATIONPRIVATE_H
|
||||
#define CHINESESEGMENTATIONPRIVATE_H
|
||||
|
||||
#include "chinese-segmentation.h"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
|
||||
class ChineseSegmentationPrivate
|
||||
{
|
||||
public:
|
||||
explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
|
||||
~ChineseSegmentationPrivate();
|
||||
vector<KeyWord> callSegment(const string& sentence);
|
||||
|
||||
vector<string> callMixSegmentCutStr(const string& sentence);
|
||||
vector<Word> callMixSegmentCutWord(const string& sentence);
|
||||
string lookUpTagOfWord(const string& word);
|
||||
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
|
||||
|
||||
vector<Word> callFullSegment(const string& sentence);
|
||||
|
||||
vector<Word> callQuerySegment(const string& sentence);
|
||||
|
||||
vector<Word> callHMMSegment(const string& sentence);
|
||||
|
||||
vector<Word> callMPSegment(const string& sentence);
|
||||
|
||||
private:
|
||||
cppjieba::Jieba *m_jieba;
|
||||
ChineseSegmentation *q = nullptr;
|
||||
};
|
||||
|
||||
#endif // CHINESESEGMENTATIONPRIVATE_H
|
|
@ -19,72 +19,144 @@
|
|||
*
|
||||
*/
|
||||
#include "chinese-segmentation.h"
|
||||
#include <QFileInfo>
|
||||
#include <QDebug>
|
||||
static ChineseSegmentation *global_instance_chinese_segmentation = nullptr;
|
||||
QMutex ChineseSegmentation::m_mutex;
|
||||
#include "chinese-segmentation-private.h"
|
||||
|
||||
ChineseSegmentation::ChineseSegmentation() {
|
||||
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent)
|
||||
{
|
||||
//const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
|
||||
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
//const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
//const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
|
||||
m_jieba = new cppjieba::Jieba(DICT_PATH,
|
||||
HMM_PATH,
|
||||
USER_DICT_PATH,
|
||||
IDF_PATH,
|
||||
IDF_DICT_PATH,
|
||||
STOP_WORD_PATH,
|
||||
"");
|
||||
}
|
||||
|
||||
ChineseSegmentation::~ChineseSegmentation() {
|
||||
ChineseSegmentationPrivate::~ChineseSegmentationPrivate() {
|
||||
if(m_jieba)
|
||||
delete m_jieba;
|
||||
m_jieba = nullptr;
|
||||
}
|
||||
|
||||
ChineseSegmentation *ChineseSegmentation::getInstance() {
|
||||
QMutexLocker locker(&m_mutex);
|
||||
if(!global_instance_chinese_segmentation) {
|
||||
global_instance_chinese_segmentation = new ChineseSegmentation;
|
||||
}
|
||||
return global_instance_chinese_segmentation;
|
||||
}
|
||||
|
||||
QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
|
||||
// std::string s;
|
||||
// s = str.toStdString();
|
||||
// str.squeeze();
|
||||
|
||||
vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence) {
|
||||
const size_t topk = -1;
|
||||
std::vector<cppjieba::KeyWord> keywordres;
|
||||
ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk);
|
||||
std::string().swap(s);
|
||||
QVector<SKeyWord> vecNeeds;
|
||||
convert(keywordres, vecNeeds);
|
||||
vector<KeyWord> keywordres;
|
||||
ChineseSegmentationPrivate::m_jieba->extractor.Extract(sentence, keywordres, topk);
|
||||
|
||||
keywordres.clear();
|
||||
// keywordres.shrink_to_fit();
|
||||
return vecNeeds;
|
||||
return keywordres;
|
||||
|
||||
}
|
||||
|
||||
std::vector<cppjieba::KeyWord> ChineseSegmentation::callSegementStd(const std::string &str) {
|
||||
|
||||
const size_t topk = -1;
|
||||
std::vector<cppjieba::KeyWord> keywordres;
|
||||
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
|
||||
|
||||
/**
 * Cuts @p sentence with Jieba::Cut and returns the text of every piece.
 */
vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
{
    vector<string> pieces;
    m_jieba->Cut(sentence, pieces);
    return pieces;
}
|
||||
|
||||
void ChineseSegmentation::convert(std::vector<cppjieba::KeyWord> &keywordres, QVector<SKeyWord> &kw) {
|
||||
for(auto i : keywordres) {
|
||||
SKeyWord temp;
|
||||
temp.word = i.word;
|
||||
temp.offsets = QVector<size_t>::fromStdVector(i.offsets);
|
||||
temp.weight = i.weight;
|
||||
kw.append(temp);
|
||||
}
|
||||
/**
 * Cuts @p sentence with Jieba::Cut and returns a Word record for every piece.
 */
vector<Word> ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence)
{
    vector<Word> pieces;
    m_jieba->Cut(sentence, pieces);
    return pieces;
}
|
||||
|
||||
/**
 * Returns whatever Jieba::LookupTag reports for @p word.
 */
string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word)
{
    string tag = m_jieba->LookupTag(word);
    return tag;
}
|
||||
|
||||
/**
 * Runs Jieba::Tag on @p sentence and returns the (word, tag) pairs it fills in.
 */
vector<pair<string, string>> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence)
{
    vector<pair<string, string>> taggedWords;
    m_jieba->Tag(sentence, taggedWords);
    return taggedWords;
}
|
||||
|
||||
/**
 * Cuts @p sentence with Jieba::CutAll and returns the resulting Word records.
 */
vector<Word> ChineseSegmentationPrivate::callFullSegment(const string &sentence)
{
    vector<Word> pieces;
    m_jieba->CutAll(sentence, pieces);
    return pieces;
}
|
||||
|
||||
/**
 * Cuts @p sentence with Jieba::CutForSearch and returns the resulting Word records.
 */
vector<Word> ChineseSegmentationPrivate::callQuerySegment(const string &sentence)
{
    vector<Word> pieces;
    m_jieba->CutForSearch(sentence, pieces);
    return pieces;
}
|
||||
|
||||
/**
 * Cuts @p sentence with Jieba::CutHMM and returns the resulting Word records.
 */
vector<Word> ChineseSegmentationPrivate::callHMMSegment(const string &sentence)
{
    vector<Word> pieces;
    m_jieba->CutHMM(sentence, pieces);
    return pieces;
}
|
||||
|
||||
/**
 * Cuts @p sentence with Jieba::CutSmall, passing 512 as the maximum word
 * length, and returns the resulting Word records.
 */
vector<Word> ChineseSegmentationPrivate::callMPSegment(const string &sentence)
{
    vector<Word> pieces;
    size_t wordLengthLimit = 512;
    m_jieba->CutSmall(sentence, pieces, wordLengthLimit);
    return pieces;
}
|
||||
|
||||
/**
 * Returns the process-wide ChineseSegmentation object.
 *
 * The object is allocated the first time this function runs (function-local
 * static initialisation) and the same pointer is handed out afterwards; this
 * function never deletes it.
 */
ChineseSegmentation *ChineseSegmentation::getInstance()
{
    static ChineseSegmentation *const instance = new ChineseSegmentation;
    return instance;
}
|
||||
|
||||
/**
 * Public entry point for keyword extraction; the work is done by the
 * private implementation object.
 */
vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
{
    vector<KeyWord> keywords = d->callSegment(sentence);
    return keywords;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * callMixSegmentCutStr and returns its result.
 */
vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
{
    vector<string> pieces = d->callMixSegmentCutStr(sentence);
    return pieces;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p str to the private implementation's
 * callMixSegmentCutWord and returns its result.
 */
vector<Word> ChineseSegmentation::callMixSegmentCutWord(const string &str)
{
    vector<Word> pieces = d->callMixSegmentCutWord(str);
    return pieces;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p word to the private implementation's
 * lookUpTagOfWord and returns its result.
 */
string ChineseSegmentation::lookUpTagOfWord(const string &word)
{
    string tag = d->lookUpTagOfWord(word);
    return tag;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * getTagOfWordsInSentence and returns its (word, tag) pairs.
 */
vector<pair<string, string> > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence)
{
    vector<pair<string, string> > taggedWords = d->getTagOfWordsInSentence(sentence);
    return taggedWords;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * callFullSegment and returns its result.
 */
vector<Word> ChineseSegmentation::callFullSegment(const string &sentence)
{
    vector<Word> pieces = d->callFullSegment(sentence);
    return pieces;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * callQuerySegment and returns its result.
 */
vector<Word> ChineseSegmentation::callQuerySegment(const string &sentence)
{
    vector<Word> pieces = d->callQuerySegment(sentence);
    return pieces;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * callHMMSegment and returns its result.
 */
vector<Word> ChineseSegmentation::callHMMSegment(const string &sentence)
{
    vector<Word> pieces = d->callHMMSegment(sentence);
    return pieces;
}
|
||||
|
||||
/**
 * Public entry point; forwards @p sentence to the private implementation's
 * callMPSegment and returns its result.
 */
vector<Word> ChineseSegmentation::callMPSegment(const string &sentence)
{
    vector<Word> pieces = d->callMPSegment(sentence);
    return pieces;
}
|
||||
|
||||
/**
 * Creates the private implementation object.
 *
 * `this` is handed to the implementation so that its back-pointer (q) refers
 * to the owning object; constructing it without an argument left q null.
 * Only the pointer value is stored, so passing `this` from the initializer
 * list is safe.
 */
ChineseSegmentation::ChineseSegmentation() : d(new ChineseSegmentationPrivate(this))
{
}
|
||||
|
|
|
@ -22,42 +22,95 @@
|
|||
#define CHINESESEGMENTATION_H
|
||||
|
||||
#include "libchinese-segmentation_global.h"
|
||||
#include "cppjieba/Jieba.hpp"
|
||||
//#include "Logging.hpp"
|
||||
//#include "LocalVector.hpp"
|
||||
//#include "cppjieba/QuerySegment.hpp"
|
||||
#include "cppjieba/KeywordExtractor.hpp"
|
||||
#include <QVector>
|
||||
#include <QString>
|
||||
#include <QDebug>
|
||||
#include <QMutex>
|
||||
|
||||
struct SKeyWord {
|
||||
std::string word;
|
||||
QVector<size_t> offsets;
|
||||
double weight;
|
||||
~SKeyWord() {
|
||||
word = std::move("");
|
||||
offsets.clear();
|
||||
offsets.shrink_to_fit();
|
||||
}
|
||||
};
|
||||
#include "common-struct.h"
|
||||
|
||||
class ChineseSegmentationPrivate;
|
||||
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
|
||||
public:
|
||||
static ChineseSegmentation *getInstance();
|
||||
QVector<SKeyWord> callSegement(std::string s);
|
||||
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callSegment
|
||||
* 调用extractor进行关键词提取,先使用Mix方式初步分词,再使用Idf词典进行关键词提取,只包含两字以上关键词
|
||||
*
|
||||
* @param sentence 要提取关键词的句子
|
||||
* @return vector<KeyWord> 存放提取后关键词的信息的容器
|
||||
*/
|
||||
vector<KeyWord> callSegment(const string &sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMixSegmentCutStr
|
||||
* 使用Mix方法进行分词,即先使用最大概率法MP初步分词,再用隐式马尔科夫模型HMM进一步分词,可以准确切出词典已有词和未登录词,结果比较准确
|
||||
*
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<string> 只存放分词后每个词的内容的容器
|
||||
*/
|
||||
vector<string> callMixSegmentCutStr(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMixSegmentCutWord
|
||||
* 和callMixSegmentCutStr功能相同
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callMixSegmentCutWord(const string& str);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::lookUpTagOfWord
|
||||
* 查询word的词性
|
||||
* @param word 要查询词性的词
|
||||
* @return string word的词性
|
||||
*/
|
||||
string lookUpTagOfWord(const string& word);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::getTagOfWordsInSentence
|
||||
* 使用Mix分词后获取每个词的词性
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<pair<string, string>> 分词后的每个词的内容(first)和其对应的词性(second)
|
||||
*/
|
||||
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callFullSegment
|
||||
* 使用Full进行分词,Full会切出字典里所有的词。
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callFullSegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callQuerySegment
|
||||
* 使用Query进行分词,即先使用Mix,对于长词再用Full,结果最精确,但词的数量也最大
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callQuerySegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callHMMSegment
|
||||
* 使用隐式马尔科夫模型HMM进行分词
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callHMMSegment(const string& sentence);
|
||||
|
||||
/**
|
||||
* @brief ChineseSegmentation::callMPSegment
|
||||
* 使用最大概率法MP进行分词
|
||||
* @param sentence 要分词的句子
|
||||
* @return vector<Word> 存放分词后每个词所有信息的容器
|
||||
*/
|
||||
vector<Word> callMPSegment(const string& sentence);
|
||||
|
||||
private:
|
||||
explicit ChineseSegmentation();
|
||||
~ChineseSegmentation();
|
||||
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
|
||||
~ChineseSegmentation() = default;
|
||||
ChineseSegmentation(const ChineseSegmentation&) = delete;
|
||||
ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
|
||||
|
||||
private:
|
||||
static QMutex m_mutex;
|
||||
cppjieba::Jieba *m_jieba;
|
||||
|
||||
ChineseSegmentationPrivate *d = nullptr;
|
||||
};
|
||||
|
||||
#endif // CHINESESEGMENTATION_H
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
#ifndef COMMONSTRUCT_H
|
||||
#define COMMONSTRUCT_H
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/**
|
||||
* @brief The KeyWord struct
|
||||
*
|
||||
* @property word the content of keyword
|
||||
* @property offsets the Unicode offsets, can be used to check the word pos in a sentence
|
||||
* @property weight the weight of the keyword
|
||||
*/
|
||||
|
||||
struct KeyWord {
    std::string word;             // text of the keyword
    std::vector<size_t> offsets;  // offsets of the keyword within the sentence
    double weight = 0.0;          // keyword weight; zero until a value is assigned

    // Deliberately no user-declared destructor: std::string and std::vector
    // release their own storage, and declaring one would suppress the implicit
    // move constructor/assignment, forcing a deep copy whenever a container of
    // KeyWord reallocates.
};
|
||||
|
||||
/**
|
||||
* @brief The Word struct
|
||||
*
|
||||
* @property word the content of word
|
||||
* @property offset the offset of the word(absolute pos, Chinese 3 , English 1), can be used to check the word pos in a sentence
|
||||
* @property unicode_offset the Unicode offset of the word
|
||||
* @property unicode_length the Unicode length of the word
|
||||
*/
|
||||
struct Word {
    std::string word;             // text of the word
    uint32_t offset = 0;          // absolute offset of the word in the sentence
    uint32_t unicode_offset = 0;  // offset counted in Unicode characters
    uint32_t unicode_length = 0;  // length counted in Unicode characters

    // Builds a word from its text and absolute offset only; the Unicode
    // offset and length keep their zero defaults instead of being left
    // uninitialized.
    Word(const std::string& w, uint32_t o)
        : word(w), offset(o) {
    }

    // Builds a fully described word.
    Word(const std::string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
        : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }

    // Deliberately no user-declared destructor: std::string cleans up after
    // itself, and declaring one would suppress the implicit move operations.
}; // struct Word
|
||||
|
||||
#endif // COMMONSTRUCT_H
|
|
@ -13,7 +13,12 @@
|
|||
|
||||
#include "limonp/Md5.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "darts.h"
|
||||
//#define USE_DARTS_CLONE
|
||||
#ifdef USE_DARTS_CLONE
|
||||
#include "../storage-base/darts-clone/darts.h"
|
||||
#else
|
||||
#include "../storage-base/cedar/cedar.h"
|
||||
#endif
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -60,20 +65,6 @@ inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
|
|||
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
|
||||
}
|
||||
|
||||
struct DatMemElem {
|
||||
double weight = 0.0;
|
||||
char tag[8] = {};
|
||||
|
||||
void SetTag(const string & str) {
|
||||
memset(&tag[0], 0, sizeof(tag));
|
||||
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
|
||||
}
|
||||
|
||||
string GetTag() const {
|
||||
return &tag[0];
|
||||
}
|
||||
};
|
||||
|
||||
struct PinYinMemElem {
|
||||
char tag[6] = {};
|
||||
|
||||
|
@ -90,14 +81,11 @@ struct PinYinMemElem {
|
|||
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
|
||||
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
|
||||
}
|
||||
|
||||
struct DatDag {
|
||||
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
|
||||
double max_weight;
|
||||
int max_next;
|
||||
};
|
||||
|
||||
#ifdef USE_DARTS_CLONE
|
||||
typedef Darts::DoubleArray JiebaDAT;
|
||||
#else
|
||||
typedef cedar::da<int, -1, -2, false> JiebaDAT;
|
||||
#endif
|
||||
|
||||
|
||||
struct CacheFileHeader {
|
||||
|
@ -124,6 +112,7 @@ public:
|
|||
}
|
||||
|
||||
const DatMemElem * Find(const string & key) const {
|
||||
#ifdef USE_DARTS_CLONE
|
||||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result);
|
||||
|
||||
|
@ -132,9 +121,16 @@ public:
|
|||
}
|
||||
|
||||
return &elements_ptr_[ find_result.value ];
|
||||
#else
|
||||
int result = dat_.exactMatchSearch<int>(key.c_str());
|
||||
if (result < 0)
|
||||
return nullptr;
|
||||
return &elements_ptr_[result];
|
||||
#endif
|
||||
}
|
||||
|
||||
const double Find(const string & key, std::size_t length, std::size_t node_pos) const {
|
||||
#ifdef USE_DARTS_CLONE
|
||||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
|
||||
|
||||
|
@ -143,9 +139,16 @@ public:
|
|||
}
|
||||
|
||||
return idf_elements_ptr_[ find_result.value ];
|
||||
#else
|
||||
int result = dat_.exactMatchSearch<int>(key.c_str(), length, node_pos);
|
||||
if (result < 0)
|
||||
return -1;
|
||||
return idf_elements_ptr_[result];
|
||||
#endif
|
||||
}
|
||||
|
||||
const PinYinMemElem * PinYinFind(const string & key) const {
|
||||
#ifdef USE_DARTS_CLONE
|
||||
JiebaDAT::result_pair_type find_result;
|
||||
dat_.exactMatchSearch(key.c_str(), find_result);
|
||||
|
||||
|
@ -154,6 +157,12 @@ public:
|
|||
}
|
||||
|
||||
return &pinyin_elements_ptr_[ find_result.value ];
|
||||
#else
|
||||
int result = dat_.exactMatchSearch<int>(key.c_str());
|
||||
if (result < 0)
|
||||
return nullptr;
|
||||
return &pinyin_elements_ptr_[result];
|
||||
#endif
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
|
||||
|
@ -259,7 +268,7 @@ public:
|
|||
max_weight[i] = -3.14e+100;
|
||||
}
|
||||
int max_next[str_size];//存放动态规划后的分词结果
|
||||
memset(max_next,-1,str_size);
|
||||
//memset(max_next,-1,str_size);
|
||||
|
||||
double val(0);
|
||||
for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
|
||||
|
@ -367,7 +376,7 @@ public:
|
|||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(DatMemElem) + header.dat_size * dat_.unit_size());
|
||||
elements_ptr_ = (const DatMemElem *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
|
||||
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
@ -398,7 +407,7 @@ public:
|
|||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(double) + header.dat_size * dat_.unit_size());
|
||||
idf_elements_ptr_ = (const double *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
|
||||
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
@ -429,7 +438,7 @@ public:
|
|||
|
||||
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem) + header.dat_size * dat_.unit_size());
|
||||
pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
|
||||
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
|
||||
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
|
||||
dat_.set_array(dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
@ -469,7 +478,6 @@ private:
|
|||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
::umask(S_IWGRP | S_IWOTH);
|
||||
//const int fd =::mkstemp(&tmp_filepath[0]);
|
||||
//原mkstemp用法有误,已修复--jxx20210519
|
||||
const int fd =::mkstemp((char *)tmp_filepath.data());
|
||||
qDebug() << "mkstemp :" << errno << tmp_filepath.data();
|
||||
assert(fd >= 0);
|
||||
|
@ -518,7 +526,6 @@ private:
|
|||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
::umask(S_IWGRP | S_IWOTH);
|
||||
//const int fd =::mkstemp(&tmp_filepath[0]);
|
||||
//原mkstemp用法有误,已修复--jxx20210519
|
||||
const int fd =::mkstemp((char *)tmp_filepath.data());
|
||||
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
|
||||
assert(fd >= 0);
|
||||
|
|
|
@ -18,7 +18,6 @@ namespace cppjieba {
|
|||
|
||||
using namespace limonp;
|
||||
|
||||
const double MIN_DOUBLE = -3.14e+100;
|
||||
const double MAX_DOUBLE = 3.14e+100;
|
||||
const size_t DICT_COLUMN_NUM = 3;
|
||||
const char* const UNKNOWN_TAG = "";
|
||||
|
@ -42,14 +41,14 @@ public:
|
|||
return dat_.Find(word);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
void FindDatDag(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<struct DatDag>&res,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
dat_.Find(begin, end, res, max_word_len);
|
||||
}
|
||||
|
||||
void Find(RuneStrArray::const_iterator begin,
|
||||
void FindWordRange(RuneStrArray::const_iterator begin,
|
||||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words,
|
||||
size_t max_word_len = MAX_WORD_LENGTH) const {
|
||||
|
@ -134,9 +133,9 @@ private:
|
|||
total_dict_size_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
||||
}
|
||||
dat_cache_path += VERSION;
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########Dict path:" << path;
|
||||
if (dat_.InitAttachDat(dat_cache_path, md5)) {
|
||||
|
|
|
@ -4,7 +4,8 @@
|
|||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "segment-trie/segment-trie.h"
|
||||
//#include "DictTrie.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "Unicode.hpp"
|
||||
|
||||
|
@ -22,7 +23,7 @@ public:
|
|||
vector<WordRange>& res, bool, size_t) const override {
|
||||
assert(dictTrie_);
|
||||
vector<struct DatDag> dags;
|
||||
dictTrie_->Find(begin, end, dags);
|
||||
dictTrie_->FindDatDag(begin, end, dags);
|
||||
size_t max_word_end_pos = 0;
|
||||
|
||||
for (size_t i = 0; i < dags.size(); i++) {
|
||||
|
@ -45,11 +46,19 @@ public:
|
|||
|
||||
// Intentionally a no-op for this segmenter: every argument is discarded and
// res is left untouched. The casts only silence unused-parameter warnings.
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                             size_t) const override {
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
// Intentionally a no-op for this segmenter: every argument is discarded and
// res is left untouched. The casts only silence unused-parameter warnings.
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                             size_t) const override {
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
private:
|
||||
const DictTrie* dictTrie_;
|
||||
|
|
|
@ -1,12 +1,18 @@
|
|||
#pragma once
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
|
||||
//#define USE_CEDAR_SEGMENT //使用cedar初步测试性能损失3%-5%左右,内存占用降低近1M
|
||||
#ifdef USE_CEDAR_SEGMENT
|
||||
#include "cedar/cedar.h"
|
||||
#endif
|
||||
namespace cppjieba {
|
||||
|
||||
using namespace limonp;
|
||||
#ifdef USE_CEDAR_SEGMENT
|
||||
typedef cedar::da<float, -1, -2, false> EmitProbMap;
|
||||
#else
|
||||
typedef unordered_map<Rune, double> EmitProbMap;
|
||||
|
||||
#endif
|
||||
struct HMMModel {
|
||||
/*
|
||||
* STATUS:
|
||||
|
@ -73,6 +79,12 @@ struct HMMModel {
|
|||
}
|
||||
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
|
||||
double defVal)const {
|
||||
#ifdef USE_CEDAR_SEGMENT
|
||||
char str_key[8];
|
||||
snprintf(str_key, sizeof(str_key), "%d", key);
|
||||
float result = ptMp->exactMatchSearch<float>(str_key);
|
||||
return result < 0 ? defVal : result;
|
||||
#else
|
||||
EmitProbMap::const_iterator cit = ptMp->find(key);
|
||||
|
||||
if (cit == ptMp->end()) {
|
||||
|
@ -80,6 +92,7 @@ struct HMMModel {
|
|||
}
|
||||
|
||||
return cit->second;
|
||||
#endif
|
||||
}
|
||||
bool GetLine(ifstream& ifile, string& line) {
|
||||
while (getline(ifile, line)) {
|
||||
|
@ -119,8 +132,13 @@ struct HMMModel {
|
|||
XLOG(ERROR) << "TransCode failed.";
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef USE_CEDAR_SEGMENT
|
||||
char str_key[8];
|
||||
snprintf(str_key, sizeof(str_key), "%d", unicode[0]);
|
||||
mp.update(str_key, std::strlen(str_key), atof(tmp2[1].c_str()));
|
||||
#else
|
||||
mp[unicode[0]] = atof(tmp2[1].c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -8,6 +8,9 @@
|
|||
#include "SegmentBase.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
const double MIN_DOUBLE = -3.14e+100;
|
||||
|
||||
class HMMSegment: public SegmentBase {
|
||||
public:
|
||||
HMMSegment(const HMMModel* model)
|
||||
|
@ -59,11 +62,19 @@ public:
|
|||
|
||||
// Intentionally a no-op for this segmenter: every argument is discarded and
// res is left untouched. The casts only silence unused-parameter warnings.
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                             size_t) const override {
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
// Intentionally a no-op for this segmenter: every argument is discarded and
// res is left untouched. The casts only silence unused-parameter warnings.
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                             size_t) const override {
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
private:
|
||||
// sequential letters rule
|
||||
|
|
|
@ -51,9 +51,9 @@ private:
|
|||
total_dict_size_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
|
||||
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
|
||||
dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
||||
}
|
||||
dat_cache_path += VERSION;
|
||||
QString path = QString::fromStdString(dat_cache_path);
|
||||
qDebug() << "#########Idf path:" << path;
|
||||
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
#include <memory>
|
||||
#include "QuerySegment.hpp"
|
||||
#include "KeywordExtractor.hpp"
|
||||
#include "segment-trie/segment-trie.h"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -21,7 +22,7 @@ public:
|
|||
mix_seg_(&dict_trie_, &model_, stopWordPath),
|
||||
full_seg_(&dict_trie_),
|
||||
query_seg_(&dict_trie_, &model_, stopWordPath),
|
||||
extractor(&dict_trie_, &model_, idfPath, dat_cache_path,stopWordPath){ }
|
||||
extractor(&dict_trie_, &model_, idfPath, dat_cache_path, stopWordPath){ }
|
||||
~Jieba() { }
|
||||
|
||||
void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
|
||||
|
@ -61,9 +62,6 @@ public:
|
|||
string LookupTag(const string &str) const {
|
||||
return mix_seg_.LookupTag(str);
|
||||
}
|
||||
bool Find(const string& word) {
|
||||
return nullptr != dict_trie_.Find(word);
|
||||
}
|
||||
|
||||
void ResetSeparators(const string& s) {
|
||||
//TODO
|
||||
|
|
|
@ -2,7 +2,8 @@
|
|||
|
||||
#include <cmath>
|
||||
#include "MixSegment.hpp"
|
||||
#include "IdfTrie.hpp"
|
||||
//#include "IdfTrie.hpp"
|
||||
#include "idf-trie/idf-trie.h"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -19,7 +20,7 @@ public:
|
|||
const string& dat_cache_path,
|
||||
const string& stopWordPath)
|
||||
: segment_(dictTrie, model, stopWordPath),
|
||||
idf_trie_(idfPath,dat_cache_path){
|
||||
idf_trie_(idfPath, dat_cache_path){
|
||||
}
|
||||
~KeywordExtractor() {
|
||||
}
|
||||
|
@ -64,7 +65,7 @@ public:
|
|||
if (-1 != idf) {//IDF词典查找
|
||||
itr->second.weight *= idf;
|
||||
} else {
|
||||
itr->second.weight *= idf_trie_.idfAverage_;
|
||||
itr->second.weight *= idf_trie_.GetIdfAverage();
|
||||
}
|
||||
|
||||
itr->second.word = itr->first;
|
||||
|
|
|
@ -4,7 +4,8 @@
|
|||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "segment-trie/segment-trie.h"
|
||||
//#include "DictTrie.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
|
||||
|
@ -22,20 +23,24 @@ public:
|
|||
RuneStrArray::const_iterator end,
|
||||
vector<WordRange>& words,
|
||||
bool, size_t max_word_len) const override {
|
||||
// vector<DatDag> dags;
|
||||
// dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
|
||||
// CalcDP(dags);//动态规划(Dynamic Programming,DP),根据DAG计算最优动态规划路径--jxx
|
||||
// CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
|
||||
dictTrie_->Find(begin, end, words, max_word_len);
|
||||
dictTrie_->FindWordRange(begin, end, words, max_word_len);
|
||||
}
|
||||
|
||||
/**
 * String-vector overload of CutWithSentence.
 *
 * This overload performs no work in this class: `res` is never modified and
 * the remaining arguments are ignored.
 */
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                             size_t) const override {
    // Intentional no-op; reference each named parameter so that
    // unused-parameter warnings stay quiet.
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
/**
 * Keyword-map overload of CutWithSentence.
 *
 * This overload performs no work in this class: `res` is never modified and
 * the remaining arguments are ignored.
 */
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                             size_t) const override {
    // Intentional no-op; reference each named parameter so that
    // unused-parameter warnings stay quiet.
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
const DictTrie* GetDictTrie() const override {
|
||||
return dictTrie_;
|
||||
|
@ -77,6 +82,7 @@ private:
|
|||
}
|
||||
*/
|
||||
/* 倒叙方式重写CalcDP函数,初步测试未发现问题*/
|
||||
/*
|
||||
void CalcDP(vector<DatDag>& dags) const {
|
||||
double val(0);
|
||||
size_t size = dags.size();
|
||||
|
@ -87,8 +93,6 @@ private:
|
|||
|
||||
for (const auto & it : dags[size - 1 - i].nexts) {
|
||||
const auto nextPos = it.first;
|
||||
val = dictTrie_->GetMinWeight();
|
||||
|
||||
if (nullptr != it.second) {
|
||||
val = it.second->weight;
|
||||
}
|
||||
|
@ -119,7 +123,7 @@ private:
|
|||
i = next;
|
||||
}
|
||||
}
|
||||
|
||||
*///相关功能已集成到Find函数中
|
||||
const DictTrie* dictTrie_;
|
||||
PosTagger tagger_;
|
||||
|
||||
|
|
|
@ -5,6 +5,10 @@
|
|||
#include "HMMSegment.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "PosTagger.hpp"
|
||||
#define STOP_WORDS_USE_CEDAR_SEGMENT //使用cedar初步测试性能提升3%-5%左右,内存占用降低近不明显
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
#include "cedar/cedar.h"
|
||||
#endif
|
||||
|
||||
namespace cppjieba {
|
||||
class MixSegment: public SegmentTagged {
|
||||
|
@ -73,7 +77,7 @@ public:
|
|||
// mpSeg_.CutRuneArray(begin, end, res);
|
||||
// return;
|
||||
// }
|
||||
|
||||
std::ignore = hmm;
|
||||
vector<WordRange> words;
|
||||
assert(end >= begin);
|
||||
words.reserve(end - begin);
|
||||
|
@ -122,6 +126,7 @@ public:
|
|||
|
||||
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
|
||||
size_t) const override {
|
||||
std::ignore = hmm;
|
||||
vector<WordRange> words;
|
||||
vector<WordRange> hmmRes;
|
||||
assert(end >= begin);
|
||||
|
@ -139,9 +144,15 @@ public:
|
|||
string str = GetStringFromRunes(s, words[i].left, words[i].right);
|
||||
|
||||
if (words[i].left != words[i].right) {
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
if (stopWords_.find(str) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
res[str].offsets.push_back(words[i].left->offset);
|
||||
res[str].weight += 1.0;
|
||||
continue;
|
||||
|
@ -149,9 +160,15 @@ public:
|
|||
|
||||
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|
||||
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
if (stopWords_.find(str) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
res[str].offsets.push_back(words[i].left->offset);
|
||||
res[str].weight += 1.0;
|
||||
continue;
|
||||
|
@ -181,9 +198,16 @@ public:
|
|||
//put hmm result to result
|
||||
for (size_t k = 0; k < hmmRes.size(); k++) {
|
||||
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
|
||||
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
if (0 < stopWords_.exactMatchSearch<int>(hmmStr.c_str(), hmmStr.size())) {
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
if (/*IsSingleWord(hmmStr) || */stopWords_.find(hmmStr) != stopWords_.end()) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
|
||||
res[hmmStr].weight += 1.0;
|
||||
}
|
||||
|
@ -227,14 +251,21 @@ public:
|
|||
string line ;
|
||||
|
||||
while (getline(ifs, line)) {
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
stopWords_.update(line.c_str(), line.size(), 1);
|
||||
#else
|
||||
stopWords_.insert(line);
|
||||
#endif
|
||||
}
|
||||
|
||||
assert(stopWords_.size());
|
||||
}
|
||||
private:
|
||||
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
|
||||
cedar::da<int, -1, -2, false> stopWords_;
|
||||
#else
|
||||
unordered_set<string> stopWords_;
|
||||
|
||||
#endif
|
||||
MPSegment mpSeg_;
|
||||
HMMSegment hmmSeg_;
|
||||
PosTagger tagger_;
|
||||
|
|
|
@ -63,7 +63,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
bool isMultiTone(string &word) {
|
||||
bool isMultiTone(const string &word) {
|
||||
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
|
||||
return true;
|
||||
// if (map_chinese2pinyin.contains(word))
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
#pragma once
|
||||
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentTagged.hpp"
|
||||
#include "segment-trie/segment-trie.h"
|
||||
//#include "DictTrie.hpp"
|
||||
//#include "SegmentTagged.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
using namespace limonp;
|
||||
|
@ -31,10 +32,10 @@ public:
|
|||
|
||||
string LookupTag(const string &str, const SegmentTagged& segment) const {
|
||||
const DictTrie * dict = segment.GetDictTrie();
|
||||
assert(dict != NULL);
|
||||
assert(dict != nullptr);
|
||||
const auto tmp = dict->Find(str);
|
||||
|
||||
if (tmp == NULL || tmp->GetTag().empty()) {
|
||||
if (tmp == nullptr || tmp->GetTag().empty()) {
|
||||
RuneStrArray runes;
|
||||
|
||||
if (!DecodeRunesInString(str, runes)) {
|
||||
|
|
|
@ -69,6 +69,7 @@ public:
|
|||
}
|
||||
cursor_ ++;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
int max_num = 0;
|
||||
|
|
|
@ -4,12 +4,10 @@
|
|||
#include <set>
|
||||
#include <cassert>
|
||||
#include "limonp/Logging.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
#include "SegmentBase.hpp"
|
||||
#include "FullSegment.hpp"
|
||||
#include "MixSegment.hpp"
|
||||
#include "Unicode.hpp"
|
||||
#include "DictTrie.hpp"
|
||||
|
||||
namespace cppjieba {
|
||||
class QuerySegment: public SegmentBase {
|
||||
|
@ -35,7 +33,7 @@ public:
|
|||
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
|
||||
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
|
||||
if (trie_->Find(text) != NULL) {
|
||||
if (trie_->Find(text) != nullptr) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
|
||||
res.push_back(wr);
|
||||
}
|
||||
|
@ -46,7 +44,7 @@ public:
|
|||
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
|
||||
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 3);
|
||||
|
||||
if (trie_->Find(text) != NULL) {
|
||||
if (trie_->Find(text) != nullptr) {
|
||||
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
|
||||
res.push_back(wr);
|
||||
}
|
||||
|
@ -59,11 +57,19 @@ public:
|
|||
|
||||
/**
 * String-vector overload of CutWithSentence.
 *
 * This overload performs no work in this class: `res` is never modified and
 * the remaining arguments are ignored.
 */
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
                             size_t) const override {
    // Intentional no-op; reference each named parameter so that
    // unused-parameter warnings stay quiet.
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
/**
 * Keyword-map overload of CutWithSentence.
 *
 * This overload performs no work in this class: `res` is never modified and
 * the remaining arguments are ignored.
 */
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
                             size_t) const override {
    // Intentional no-op; reference each named parameter so that
    // unused-parameter warnings stay quiet.
    static_cast<void>(s);
    static_cast<void>(begin);
    static_cast<void>(end);
    static_cast<void>(res);
    static_cast<void>(hmm);
}
|
||||
private:
|
||||
bool IsAllAscii(const RuneArray& s) const {
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <ostream>
|
||||
#include "limonp/LocalVector.hpp"
|
||||
#include "limonp/StringUtil.hpp"
|
||||
#include "common-struct.h"
|
||||
|
||||
namespace cppjieba {
|
||||
|
||||
|
@ -15,29 +16,30 @@ using std::vector;
|
|||
|
||||
typedef uint32_t Rune;
|
||||
|
||||
struct KeyWord {
|
||||
string word;
|
||||
vector<size_t> offsets;
|
||||
double weight;
|
||||
}; // struct Word
|
||||
|
||||
struct Word {
|
||||
string word;
|
||||
uint32_t offset;
|
||||
uint32_t unicode_offset;
|
||||
uint32_t unicode_length;
|
||||
Word(const string& w, uint32_t o)
|
||||
: word(w), offset(o) {
|
||||
}
|
||||
Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
|
||||
: word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
|
||||
}
|
||||
}; // struct Word
|
||||
|
||||
inline std::ostream& operator << (std::ostream& os, const Word& w) {
|
||||
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
|
||||
}
|
||||
|
||||
struct DatMemElem {
|
||||
double weight = 0.0;
|
||||
char tag[8] = {};
|
||||
|
||||
void SetTag(const string & str) {
|
||||
memset(&tag[0], 0, sizeof(tag));
|
||||
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
|
||||
}
|
||||
|
||||
string GetTag() const {
|
||||
return &tag[0];
|
||||
}
|
||||
};
|
||||
|
||||
struct DatDag {
|
||||
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
|
||||
//double max_weight;
|
||||
//size_t max_next;
|
||||
};
|
||||
|
||||
struct RuneInfo {
|
||||
Rune rune;
|
||||
uint32_t offset;
|
||||
|
@ -95,7 +97,6 @@ inline RuneArray DecodeRunesInString(const string& s) {
|
|||
return result;
|
||||
}
|
||||
|
||||
//重写DecodeRunesInString函数,将实现放入函数中降低内存占用加快处理流程--jxx20210518
|
||||
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
|
||||
|
||||
uint32_t tmp;
|
||||
|
|
|
@ -17,6 +17,27 @@ HEADERS += \
|
|||
$$PWD/SegmentBase.hpp \
|
||||
$$PWD/SegmentTagged.hpp \
|
||||
$$PWD/TextRankExtractor.hpp \
|
||||
$$PWD/Trie.hpp \
|
||||
$$PWD/Unicode.hpp
|
||||
# $$PWD/Trie.hpp \
|
||||
$$PWD/Unicode.hpp \
|
||||
$$PWD/DatTrie.hpp \
|
||||
$$PWD/idf-trie/idf-trie.h \
|
||||
$$PWD/segment-trie/segment-trie.h
|
||||
|
||||
DISTFILES += \
|
||||
dict/README.md \
|
||||
dict/hmm_model.utf8 \
|
||||
dict/idf.utf8 \
|
||||
dict/jieba.dict.utf8 \
|
||||
dict/pos_dict/char_state_tab.utf8 \
|
||||
dict/pos_dict/prob_emit.utf8 \
|
||||
dict/pos_dict/prob_start.utf8 \
|
||||
dict/pos_dict/prob_trans.utf8 \
|
||||
dict/stop_words.utf8 \
|
||||
dict/user.dict.utf8
|
||||
#dict/pinyinWithoutTone.txt \
|
||||
|
||||
include(limonp/limonp.pri)
|
||||
|
||||
SOURCES += \
|
||||
$$PWD/idf-trie/idf-trie.cpp \
|
||||
$$PWD/segment-trie/segment-trie.cpp
|
||||
|
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#include "idf-trie.h"
|
||||
|
||||
/**
 * Constructs the trie from a list of source files and a cache-file path,
 * forwards both to the storage base and then runs Init().
 */
IdfTrie::IdfTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(file_paths, dat_cache_path)
{
    Init();
}

/**
 * Single-file convenience constructor: wraps `file_path` in a one-element
 * list before forwarding to the storage base, then runs Init().
 */
IdfTrie::IdfTrie(string file_path, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(vector<string>(1, file_path), dat_cache_path)
{
    Init();
}
|
||||
|
||||
void IdfTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
|
||||
{
|
||||
IdfCacheFileHeader header;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
|
||||
|
||||
int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
|
||||
double idf_sum(0), idf_average(0), tmp(0);
|
||||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
umask(S_IWGRP | S_IWOTH);
|
||||
const int fd =mkstemp((char *)tmp_filepath.data());
|
||||
assert(fd >= 0);
|
||||
fchmod(fd, 0644);
|
||||
|
||||
write_bytes = write(fd, (const char *)&header, sizeof(IdfCacheFileHeader));
|
||||
|
||||
ifstream ifs(IDF_DICT_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, " ");
|
||||
if (buf.size() != 2)
|
||||
continue;
|
||||
this->Update(buf[0].c_str(), buf[0].size(), elements_num);
|
||||
offset += sizeof(double);
|
||||
elements_num++;
|
||||
tmp = atof(buf[1].c_str());
|
||||
write_bytes += write(fd, &tmp, sizeof(double));
|
||||
idf_sum += tmp;
|
||||
}
|
||||
idf_average = idf_sum / elements_num;
|
||||
write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
|
||||
|
||||
lseek(fd, sizeof(header.md5_hex), SEEK_SET);
|
||||
write(fd, &elements_num, sizeof(int));
|
||||
write(fd, &offset, sizeof(int));
|
||||
data_trie_size = this->GetDataTrieSize();
|
||||
write(fd, &data_trie_size, sizeof(int));
|
||||
write(fd, &idf_average, sizeof(double));
|
||||
|
||||
close(fd);
|
||||
assert((size_t)write_bytes == sizeof(IdfCacheFileHeader) + offset + this->GetDataTrieTotalSize());
|
||||
|
||||
const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
|
||||
assert(0 == rename_ret);
|
||||
}
|
||||
|
||||
double IdfTrie::Find(const string &key) const
|
||||
{
|
||||
int result = this->ExactMatchSearch(key.c_str(), key.size());
|
||||
if (result < 0)
|
||||
return -1;
|
||||
return this->GetElementPtr()[result];
|
||||
}
|
||||
|
||||
double IdfTrie::GetIdfAverage() const
|
||||
{
|
||||
return this->GetCacheFileHeaderPtr()->idf_average;
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef IdfTrie_H
|
||||
#define IdfTrie_H
|
||||
|
||||
#include "storage-base.hpp"
|
||||
|
||||
const char * const IDF_DICT_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
|
||||
|
||||
struct IdfCacheFileHeader : CacheFileHeaderBase
|
||||
{
|
||||
double idf_average = 0;
|
||||
};
|
||||
|
||||
/**
 * Trie-backed store of per-word IDF values (one double per word), built on
 * StorageBase with IdfCacheFileHeader as its cache-file header.
 */
class IdfTrie : public StorageBase<double, false, IdfCacheFileHeader>
{
public:
    /// Builds the store from several source files and a cache-file path.
    IdfTrie(const vector<string> file_paths, string dat_cache_path);

    /// Builds the store from a single source file and a cache-file path.
    IdfTrie(string file_path, string dat_cache_path);

    /// Generates the cache file `dat_cache_file`, tagging it with `md5`.
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;

    /// Returns the value stored for `key`.
    double Find(const string &key) const;

    /// Returns the average IDF value.
    double GetIdfAverage() const;
};
|
||||
|
||||
#endif // IdfTrie_H
|
|
@ -0,0 +1,276 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#include <cmath>
|
||||
#include "segment-trie.h"
|
||||
|
||||
/**
 * Constructs the dictionary trie from a list of source files and a cache-file
 * path, forwards both to the storage base and then runs Init().
 */
DictTrie::DictTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(file_paths, dat_cache_path)
{
    Init();
}

/**
 * Two-path convenience constructor: the main and the user dictionary paths are
 * packed, in that order, into the file list handed to the storage base before
 * Init() runs.
 */
DictTrie::DictTrie(const string &dict_path, const string &user_dict_paths, const string &dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(vector<string>{dict_path, user_dict_paths}, dat_cache_path)
{
    Init();
}
|
||||
|
||||
void DictTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
|
||||
{
|
||||
DictCacheFileHeader header;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
|
||||
|
||||
int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
|
||||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
umask(S_IWGRP | S_IWOTH);
|
||||
const int fd =mkstemp((char *)tmp_filepath.data());
|
||||
assert(fd >= 0);
|
||||
fchmod(fd, 0644);
|
||||
|
||||
write_bytes = write(fd, (const char *)&header, sizeof(DictCacheFileHeader));
|
||||
|
||||
this->PreLoad();
|
||||
this->LoadDefaultDict(fd, write_bytes, offset, elements_num);
|
||||
this->LoadUserDict(fd, write_bytes, offset, elements_num);
|
||||
|
||||
write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
|
||||
|
||||
lseek(fd, sizeof(header.md5_hex), SEEK_SET);
|
||||
write(fd, &elements_num, sizeof(int));
|
||||
write(fd, &offset, sizeof(int));
|
||||
data_trie_size = this->GetDataTrieSize();
|
||||
write(fd, &data_trie_size, sizeof(int));
|
||||
write(fd, &m_min_weight, sizeof(double));
|
||||
|
||||
close(fd);
|
||||
assert((size_t)write_bytes == sizeof(DictCacheFileHeader) + offset + this->GetDataTrieTotalSize());
|
||||
|
||||
const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
|
||||
assert(0 == rename_ret);
|
||||
}
|
||||
|
||||
/**
 * Looks up the dictionary entry for `key`.
 *
 * @return pointer to the element at the index reported by the exact-match
 *         search, or nullptr when the search reports a negative index.
 */
const DatMemElem * DictTrie::Find(const string &key) const
{
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    return (index < 0) ? nullptr : &this->GetElementPtr()[index];
}
|
||||
|
||||
|
||||
|
||||
/**
 * Fills `res` with one DatDag per rune of [begin, end).
 *
 * For rune i the first edge always points to i + 1; its element pointer is set
 * when the dictionary contains that single character. Every longer dictionary
 * word starting at rune i (at most `max_word_len` characters) adds a further
 * edge (i + word length, element).
 */
void DictTrie::FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<DatDag> &res, size_t max_word_len) const {
    const size_t rune_count = size_t(end - begin);

    res.clear();
    res.resize(rune_count);

    string utf8_text;
    EncodeRunesToString(begin, end, utf8_text);

    static const size_t max_num = 128;
    result_pair_type hits[max_num] = {};

    size_t byte_pos = 0; // byte offset of rune i inside utf8_text
    for (size_t i = 0; i < rune_count; ++i) {
        const std::size_t hit_count = this->CommonPrefixSearch(&utf8_text[byte_pos], &hits[0], max_num);

        auto &edges = res[i].nexts;
        // Default edge: advance by a single rune with no dictionary entry.
        edges.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));

        for (std::size_t k = 0; k < hit_count; ++k) {
            auto &hit = hits[k];

            // Ignore values that do not index into the element array.
            const bool value_in_range =
                (hit.value >= 0) && ((size_t)hit.value < this->GetCacheFileHeaderPtr()->elements_size);
            if (!value_in_range) {
                continue;
            }

            auto const char_num = Utf8CharNum(&utf8_text[byte_pos], hit.length);
            if (char_num > max_word_len) {
                continue;
            }

            const DatMemElem * element = &this->GetElementPtr()[hit.value];
            if (1 == char_num) {
                // A single-character word replaces the default edge's payload.
                edges[0].second = element;
            } else {
                edges.push_back(pair<size_t, const DatMemElem *>(i + char_num, element));
            }
        }

        byte_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
    }
}
|
||||
|
||||
/**
 * Segments the runes in [begin, end) into words and appends them to `words`.
 *
 * Positions are processed from the last rune to the first. For each position
 * every dictionary word starting there (at most `max_word_len` characters) is
 * scored as its weight plus the best score of the position following the word,
 * and the best-scoring successor is remembered. Positions for which no
 * dictionary word is usable fall back to a single-rune word scored with the
 * minimum dictionary weight. The remembered successors are then followed from
 * the first rune to emit the WordRange list.
 *
 * @param begin         First rune of the text.
 * @param end           One past the last rune of the text.
 * @param words         Output; the segmented ranges are appended.
 * @param max_word_len  Longest dictionary word (in characters) to consider.
 */
void DictTrie::FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange> &words, size_t max_word_len) const {
    if (begin >= end) {
        return; // nothing to segment
    }

    string text_str;
    EncodeRunesToString(begin, end, text_str);

    static const size_t max_num = 128;
    result_pair_type result_pairs[max_num] = {}; // dictionary lookup results

    const size_t str_size = end - begin;
    // Heap-backed buffers instead of variable-length arrays: standard C++ and
    // safe for long inputs.
    vector<double> max_weight(str_size, -3.14e+100); // best score from each position to the end
    vector<size_t> max_next(str_size, 0);            // chosen successor; 0 means "not decided yet"

    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
        const size_t pos = str_size - 1 - i; // walk backwards through the runes
        begin_pos -= (begin + pos)->len;

        std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
        // NOTE(review): some trie implementations report the total number of
        // matches even when it exceeds the buffer size; never read past it.
        if (num_results > max_num) {
            num_results = max_num;
        }

        for (std::size_t idx = 0; idx < num_results; ++idx) {
            const auto & match = result_pairs[idx];
            if ((match.value < 0) || ((uint32_t)match.value >= this->GetCacheFileHeaderPtr()->elements_size)) {
                continue;
            }
            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
            if ((0 == char_num) || (char_num > max_word_len)) {
                continue;
            }
            const size_t next = pos + char_num;
            if (next > str_size) {
                continue; // the word would run past the end of the text
            }

            double val = this->GetElementPtr()[match.value].weight;
            if (next < str_size) {
                val += max_weight[next];
            }
            if (val > max_weight[pos]) {
                max_weight[pos] = val;
                max_next[pos] = next;
            }
        }

        if (0 == max_next[pos]) {
            // No usable dictionary word starts here (no match at all, or every
            // match was filtered out): emit the rune on its own so that the
            // path is always complete.
            double val = GetMinWeight();
            if (pos + 1 < str_size) {
                val += max_weight[pos + 1];
            }
            max_weight[pos] = val;
            max_next[pos] = pos + 1;
        }
    }

    // Follow the chosen successors from the first rune to collect the words.
    for (size_t i = 0; i < str_size; i = max_next[i]) {
        assert(max_next[i] > i);
        assert(max_next[i] <= str_size);
        WordRange wr(begin + i, begin + max_next[i] - 1);
        words.push_back(wr);
    }
}
|
||||
|
||||
bool DictTrie::IsUserDictSingleChineseWord(const Rune &word) const {
|
||||
return IsIn(m_user_dict_single_chinese_word, word);
|
||||
}
|
||||
|
||||
void DictTrie::PreLoad()
|
||||
{
|
||||
ifstream ifs(DICT_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, " ");
|
||||
if (buf.size() != 3)
|
||||
continue;
|
||||
m_freq_sum += atof(buf[1].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void DictTrie::LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
|
||||
{
|
||||
ifstream ifs(DICT_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, " ");
|
||||
if (buf.size() != 3)
|
||||
continue;
|
||||
DatMemElem node_info;
|
||||
node_info.weight = log(atof(buf[1].c_str()) / m_freq_sum);
|
||||
node_info.SetTag(buf[2]);
|
||||
this->Update(buf[0].c_str(), buf[0].size(), elements_num);
|
||||
offset += (sizeof(DatMemElem));
|
||||
elements_num++;
|
||||
if (m_min_weight > node_info.weight) {
|
||||
m_min_weight = node_info.weight;
|
||||
}
|
||||
write_bytes += write(fd, &node_info, sizeof(DatMemElem));
|
||||
}
|
||||
}
|
||||
|
||||
void DictTrie::LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
|
||||
{
|
||||
ifstream ifs(USER_DICT_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, " ");
|
||||
if (buf.size() != 3)
|
||||
continue;
|
||||
DatMemElem node_info;
|
||||
assert(m_freq_sum > 0.0);
|
||||
const int freq = atoi(buf[1].c_str());
|
||||
node_info.weight = log(1.0 * freq / m_freq_sum);
|
||||
node_info.SetTag(buf[2]);
|
||||
this->Update(buf[0].c_str(), buf[0].size(), elements_num);
|
||||
offset += (sizeof(DatMemElem));
|
||||
elements_num++;
|
||||
write_bytes += write(fd, &node_info, sizeof(DatMemElem));
|
||||
if (Utf8CharNum(buf[0]) == 1) {
|
||||
RuneArray word;
|
||||
if (DecodeRunesInString(buf[0], word)) {
|
||||
m_user_dict_single_chinese_word.insert(word[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline double DictTrie::GetMinWeight() const
|
||||
{
|
||||
return this->GetCacheFileHeaderPtr()->min_weight;
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef SegmentTrie_H
|
||||
#define SegmentTrie_H
|
||||
|
||||
#include "storage-base.hpp"
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
|
||||
using namespace cppjieba;
|
||||
|
||||
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
|
||||
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
|
||||
|
||||
struct DictCacheFileHeader : CacheFileHeaderBase
|
||||
{
|
||||
double min_weight = 0;
|
||||
};
|
||||
|
||||
/**
 * Trie-backed segmentation dictionary (one DatMemElem per word), built on
 * StorageBase with DictCacheFileHeader as its cache-file header.
 */
class DictTrie : public StorageBase<DatMemElem, false, DictCacheFileHeader>
{
public:
    /// Builds the dictionary from several source files and a cache-file path.
    DictTrie(const vector<string> file_paths, string dat_cache_path = "");

    /// Builds the dictionary from a main and a user dictionary path.
    DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "");

    /// Generates the cache file `dat_cache_file`, tagging it with `md5`.
    void LoadSourceFile(const string &dat_cache_file, const string &md5) override;

    /// Returns the entry stored for `key`, or nullptr when it is absent.
    const DatMemElem *Find(const string &key) const;

    /// Builds, for every rune of [begin, end), the list of dictionary words
    /// (up to `max_word_len` characters) that start at that rune.
    void FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
                    vector<struct DatDag>&res, size_t max_word_len = MAX_WORD_LENGTH) const;

    /// Segments [begin, end) into words and appends the ranges to `words`.
    void FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
                       vector<WordRange>& words, size_t max_word_len = MAX_WORD_LENGTH) const;

    /// Reports whether `word` is a single-character user-dictionary word.
    bool IsUserDictSingleChineseWord(const Rune& word) const;

private:
    // Default construction is not supported; deleting the constructor turns
    // accidental use into a compile-time error instead of a link-time one.
    DictTrie() = delete;

    /// Sums the dictionary frequencies into m_freq_sum.
    void PreLoad();
    /// Writes the default dictionary's entries to `fd`, updating the counters.
    void LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
    /// Writes the user dictionary's entries to `fd`, updating the counters.
    void LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
    /// Returns the minimum weight recorded in the cache-file header.
    double GetMinWeight() const;

    double m_freq_sum = 0.0;          ///< Sum of dictionary frequencies (filled by PreLoad).
    double m_min_weight = 3.14e+100;  ///< Smallest weight seen while loading; starts at a huge sentinel.
    unordered_set<Rune> m_user_dict_single_chinese_word; ///< Runes of single-character user words.
};
|
||||
|
||||
#endif // SegmentTrie_H
|
|
@ -0,0 +1 @@
|
|||
#include "chinese-segmentation.h"
|
|
@ -0,0 +1 @@
|
|||
#include "hanzi-to-pinyin.h"
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef HANZITOPINYINPRIVATE_H
|
||||
#define HANZITOPINYINPRIVATE_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include <QHash>
|
||||
#include "pinyin4cpp_dictTrie.h"
|
||||
#include "hanzi-to-pinyin.h"
|
||||
#include "pinyin4cpp-trie.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static const QHash<QString, QString> PhoneticSymbol = {
|
||||
{"ā", "a1"}, {"á", "a2"}, {"ǎ", "a3"}, {"à", "a4"},
|
||||
{"ē", "e1"}, {"é", "e2"}, {"ě", "e3"}, {"è", "e4"},
|
||||
{"ō", "o1"}, {"ó", "o2"}, {"ǒ", "o3"}, {"ò", "o4"},
|
||||
{"ī", "i1"}, {"í", "i2"}, {"ǐ", "i3"}, {"ì", "i4"},
|
||||
{"ū", "u1"}, {"ú", "u2"}, {"ǔ", "u3"}, {"ù", "u4"},
|
||||
// üe
|
||||
{"ü", "v"},
|
||||
{"ǖ", "v1"}, {"ǘ", "v2"}, {"ǚ", "v3"}, {"ǜ", "v4"},
|
||||
{"ń", "n2"}, {"ň", "n3"}, {"ǹ", "n4"},
|
||||
{"m̄", "m1"}, {"ḿ", "m2"}, {"m̀", "m4"},
|
||||
{"ê̄", "ê1"}, {"ế", "ê2"}, {"ê̌", "ê3"}, {"ề", "ê4"}
|
||||
};
|
||||
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
class PINYINMANAGER_EXPORT HanZiToPinYinPrivate
|
||||
{
|
||||
public:
|
||||
HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr);
|
||||
~HanZiToPinYinPrivate();
|
||||
|
||||
public:
|
||||
template <typename T>
|
||||
bool isMultiTone(T &&t) {return m_pinYinTrie.IsMultiTone(std::forward<T>(t));}
|
||||
|
||||
bool contains(string &word);
|
||||
int getResults(string &word, QStringList &results);
|
||||
void setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType);
|
||||
|
||||
private:
|
||||
void convertDataStyle(QStringList &results);
|
||||
|
||||
HanZiToPinYin *q = nullptr;
|
||||
//Pinyin4cppDictTrie *m_pinYinTrie = nullptr;
|
||||
Pinyin4cppTrie m_pinYinTrie;
|
||||
|
||||
SegType m_segType = SegType::Segmentation;
|
||||
PolyphoneType m_polyphoneType = PolyphoneType::Disable;
|
||||
PinyinDataStyle m_pinyinDataStyle = PinyinDataStyle::Default;
|
||||
ExDataProcessType m_exDataProcessType = ExDataProcessType::Default;
|
||||
};
|
||||
#endif // HANZITOPINYINPRIVATE_H
|
|
@ -0,0 +1,360 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#include <mutex>
|
||||
#include <cctype>
|
||||
#include "hanzi-to-pinyin.h"
|
||||
#include "hanzi-to-pinyin-private.h"
|
||||
#include "chinese-segmentation.h"
|
||||
#include "cppjieba/Unicode.hpp"
|
||||
|
||||
HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
|
||||
std::once_flag g_singleFlag;
|
||||
|
||||
// Reports whether the pinyin trie has an entry for `word`
// (forwards to Pinyin4cppTrie::Contains).
bool HanZiToPinYinPrivate::contains(string &word)
|
||||
{
|
||||
return m_pinYinTrie.Contains(word);
|
||||
}
|
||||
|
||||
int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
|
||||
{
|
||||
results.clear();
|
||||
|
||||
string directResult = m_pinYinTrie.Find(word);
|
||||
|
||||
if (directResult == string()) {
|
||||
if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1
|
||||
return -1;
|
||||
} else {//无结果、启用分词
|
||||
vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
|
||||
string data;
|
||||
for (string &info : segResults) {
|
||||
if (info == string()) {
|
||||
continue;
|
||||
}
|
||||
data = m_pinYinTrie.Find(info);
|
||||
if (data == string()) {//分词后无结果
|
||||
if (cppjieba::IsSingleWord(info)) {//单个字符
|
||||
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
|
||||
results.append(QString().fromStdString(info));
|
||||
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
|
||||
continue;
|
||||
}
|
||||
} else {//多个字符
|
||||
string oneWord;
|
||||
cppjieba::RuneStrArray runeArray;
|
||||
cppjieba::DecodeRunesInString(info, runeArray);
|
||||
for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
|
||||
oneWord = cppjieba::GetStringFromRunes(info, i, i);
|
||||
data = m_pinYinTrie.Find(oneWord);
|
||||
if (data == string()) {//单字无结果则按设置返回
|
||||
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
|
||||
results.append(QString().fromStdString(oneWord));
|
||||
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||||
results.append(QString().fromStdString(data));
|
||||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||||
if (limonp::IsInStr(data, ',')) {
|
||||
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
|
||||
} else {
|
||||
results.append(QString().fromStdString(data));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {//分词后有结果
|
||||
if (cppjieba::IsSingleWord(info)) {//单个字符
|
||||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||||
results.append(QString().fromStdString(data));
|
||||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||||
if (limonp::IsInStr(data, ',')) {
|
||||
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
|
||||
} else {
|
||||
results.append(QString().fromStdString(data));
|
||||
}
|
||||
}
|
||||
} else {//多个字符
|
||||
vector<string> dataVec = limonp::Split(data, "/");
|
||||
if (dataVec.size() == 1) {//无多音词
|
||||
vector<string> dataVec = limonp::Split(data, ",");
|
||||
for (auto &oneResult : dataVec) {
|
||||
results.append(QString().fromStdString(oneResult));
|
||||
}
|
||||
} else {
|
||||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||||
int wordSize = limonp::Split(dataVec[0], ",").size();
|
||||
for (int i = 0; i < wordSize; ++i) {
|
||||
QStringList oneResult;
|
||||
for (size_t j = 0; j < dataVec.size(); ++j) {
|
||||
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
|
||||
}
|
||||
results.append(oneResult.join('/'));
|
||||
}
|
||||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||||
vector<string> tmp = limonp::Split(dataVec[0], ",");
|
||||
for (auto &oneResult : tmp) {
|
||||
results.append(QString().fromStdString(oneResult));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {//可以直接查到结果
|
||||
if (cppjieba::IsSingleWord(word)) {//单个字符
|
||||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||||
results.append(QString().fromStdString(directResult));
|
||||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||||
if (limonp::IsInStr(directResult, ',')) {
|
||||
results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0))));
|
||||
} else {
|
||||
results.append(QString().fromStdString(directResult));
|
||||
}
|
||||
}
|
||||
} else {//多个字符
|
||||
vector<string> dataVec = limonp::Split(directResult, "/");
|
||||
if (dataVec.size() == 1) {//无多音词
|
||||
vector<string> dataVec = limonp::Split(directResult, ",");
|
||||
for (auto &oneResult : dataVec) {
|
||||
results.append(QString().fromStdString(oneResult));
|
||||
}
|
||||
} else {
|
||||
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
|
||||
int wordSize = limonp::Split(dataVec[0], ",").size();
|
||||
for (int i = 0; i < wordSize; ++i) {
|
||||
QStringList oneResult;
|
||||
for (size_t j = 0; j < dataVec.size(); ++j) {
|
||||
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
|
||||
}
|
||||
results.append(oneResult.join('/'));
|
||||
}
|
||||
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
|
||||
vector<string> tmp = limonp::Split(dataVec[0], ",");
|
||||
for (auto &oneResult : tmp) {
|
||||
results.append(QString().fromStdString(oneResult));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
convertDataStyle(results);
|
||||
return 0;//todo
|
||||
}
|
||||
|
||||
// Stores the four conversion options; getResults() and convertDataStyle()
// read these members on every call.
void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
|
||||
{
|
||||
m_pinyinDataStyle = dataStyle;
|
||||
m_segType = segType;
|
||||
m_polyphoneType = polyphoneType;
|
||||
m_exDataProcessType = processType;
|
||||
}
|
||||
|
||||
void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
|
||||
{
|
||||
QString value;
|
||||
if (m_pinyinDataStyle == PinyinDataStyle::Default) {
|
||||
for (QString &info : results) {
|
||||
if(info == ",") {
|
||||
continue;
|
||||
}
|
||||
//if info's length was been changed, there's someting wrong while traverse the chars of info
|
||||
for (const QChar &c : info) {
|
||||
if (!isalpha(c.toLatin1())) {
|
||||
value = PhoneticSymbol.value(c);
|
||||
if (!value.isEmpty()) {
|
||||
info.replace(c, value.at(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
|
||||
QStringList tmpValue;
|
||||
for (auto &str : tmpList) {
|
||||
if (!tmpValue.contains(str)) {
|
||||
tmpValue.push_back(str);
|
||||
}
|
||||
}
|
||||
info = tmpValue.join(",");
|
||||
}
|
||||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
|
||||
//无需处理
|
||||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
|
||||
for (QString &info : results) {
|
||||
for (int i = 0; i < info.size();) {
|
||||
auto c = info.at(i);
|
||||
if (!isalpha(c.toLatin1())) {
|
||||
value = PhoneticSymbol.value(c);
|
||||
if (!value.isEmpty()) {
|
||||
info.replace(c, PhoneticSymbol.value(c));
|
||||
i += PhoneticSymbol.value(c).size();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
|
||||
for (QString &info : results) {
|
||||
if(info == "/") {
|
||||
continue;
|
||||
}
|
||||
bool isPolyphoneWords(false);
|
||||
if (info.contains("/")) {
|
||||
isPolyphoneWords = true;
|
||||
info.replace("/", ",");
|
||||
}
|
||||
|
||||
for (int i = 0; i < info.size();) {
|
||||
auto c = info.at(i);
|
||||
if (!isalpha(c.toLatin1())) {
|
||||
value = PhoneticSymbol.value(c);
|
||||
if (!value.isEmpty()) {
|
||||
info.replace(i, 1, value.at(0));
|
||||
//多音词模式
|
||||
if (info.contains(",")) {
|
||||
int pos = info.indexOf(',', i);
|
||||
if (isPolyphoneWords) {
|
||||
info.replace(",", "/");
|
||||
}
|
||||
//最后一个读音时
|
||||
if (pos == -1) {
|
||||
info.append(value.at(1));
|
||||
break;
|
||||
}
|
||||
info.insert(pos, value.at(1));
|
||||
i = pos + 1; //insert导致','的位置加一,将i行进到','的位置
|
||||
i++;
|
||||
continue;
|
||||
} else {
|
||||
info.append(value.at(1));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
}
|
||||
} else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
|
||||
for (QString &info : results) {
|
||||
if(info == "," or info == "/") {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool isPolyphoneWords(false);
|
||||
if (info.contains("/")) {
|
||||
isPolyphoneWords = true;
|
||||
info.replace("/", ",");
|
||||
}
|
||||
|
||||
for (int i = 0; i < info.size();i++) {
|
||||
auto c = info.at(i);
|
||||
if (!isalpha(c.toLatin1())) {
|
||||
value = PhoneticSymbol.value(c);
|
||||
if (!value.isEmpty()) {
|
||||
info.replace(c, value.at(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
QStringList tmpList = info.split(',', QString::SkipEmptyParts); //去重(保持原顺序)
|
||||
QStringList tmpValue;
|
||||
for (auto &str : tmpList) {
|
||||
if (!tmpValue.contains(str)) {
|
||||
tmpValue.push_back(str.at(0));
|
||||
}
|
||||
}
|
||||
if (isPolyphoneWords) {
|
||||
info = tmpValue.join("/");
|
||||
} else {
|
||||
info = tmpValue.join(",");
|
||||
}
|
||||
}
|
||||
} else if (m_pinyinDataStyle == PinyinDataStyle::English) {
|
||||
//暂不支持
|
||||
}
|
||||
}
|
||||
|
||||
// Stores `parent` in q; the body is empty. The commented-out lines below are
// an earlier variant that heap-allocated the trie.
HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
|
||||
{
|
||||
//const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
|
||||
//const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
|
||||
//m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
|
||||
//m_pinYinTrie = new Pinyin4cppTrie;
|
||||
}
|
||||
|
||||
// Empty destructor body; the commented-out lines below would delete a
// pointer-typed m_pinYinTrie.
HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
|
||||
{
|
||||
// if (m_pinYinTrie){
|
||||
// delete m_pinYinTrie;
|
||||
// m_pinYinTrie = nullptr;
|
||||
// }
|
||||
}
|
||||
|
||||
// Returns the shared HanZiToPinYin instance. The object is created on the
// first call; call_once on g_singleFlag ensures the creation runs only once.
HanZiToPinYin * HanZiToPinYin::getInstance()
|
||||
{
|
||||
call_once(g_singleFlag, []() {
|
||||
g_pinYinManager = new HanZiToPinYin;
|
||||
});
|
||||
return g_pinYinManager;
|
||||
}
|
||||
|
||||
// Public entry point: forwards to HanZiToPinYinPrivate::contains.
bool HanZiToPinYin::contains(string &word)
|
||||
{
|
||||
return d->contains(word);
|
||||
}
|
||||
|
||||
// The four isMultiTone overloads differ only in the reference category of
// `word`; each forwards the named parameter (an lvalue) to
// HanZiToPinYinPrivate::isMultiTone.
bool HanZiToPinYin::isMultiTone(string &word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool HanZiToPinYin::isMultiTone(string &&word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool HanZiToPinYin::isMultiTone(const string &word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool HanZiToPinYin::isMultiTone(const string &&word)
|
||||
{
|
||||
return d->isMultiTone(word);
|
||||
}
|
||||
|
||||
// Public entry point: `word` is taken by value and the local copy is passed
// on to HanZiToPinYinPrivate::getResults, whose result is returned unchanged.
int HanZiToPinYin::getResults(string word, QStringList &results)
|
||||
{
|
||||
return d->getResults(word, results);
|
||||
}
|
||||
|
||||
// Public entry point: forwards all four options to HanZiToPinYinPrivate::setConfig.
void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
|
||||
{
|
||||
d->setConfig(dataStyle, segType, polyphoneType, processType);
|
||||
}
|
||||
|
||||
// Allocates the private implementation object (constructed with its default
// nullptr parent).
// NOTE(review): no matching delete of `d` is visible in this file — confirm
// where, if anywhere, it is released.
HanZiToPinYin::HanZiToPinYin() : d(new HanZiToPinYinPrivate)
|
||||
{
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef HANZITOPINYIN_H
|
||||
#define HANZITOPINYIN_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include <QStringList>
|
||||
#include "pinyin4cpp-common.h"
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class HanZiToPinYinPrivate;
|
||||
class PINYINMANAGER_EXPORT HanZiToPinYin
|
||||
{
|
||||
public:
|
||||
static HanZiToPinYin * getInstance();
|
||||
|
||||
public:
|
||||
/**
|
||||
* @brief HanZiToPinYin::isMultiTone 判断是否为多音字/词/句
|
||||
* @param word 要判断的字/词/句
|
||||
* @return bool 不是返回false
|
||||
*/
|
||||
bool isMultiTone(string &word);
|
||||
bool isMultiTone(string &&word);
|
||||
bool isMultiTone(const string &word);
|
||||
bool isMultiTone(const string &&word);
|
||||
|
||||
/**
|
||||
* @brief HanZiToPinYin::contains 查询某个字/词/句是否有拼音(是否在数据库包含)
|
||||
* @param word 要查询的字/词/句
|
||||
* @return bool 数据库不包含返回false
|
||||
*/
|
||||
bool contains(string &word);
|
||||
|
||||
/**
|
||||
* @brief HanZiToPinYin::getResults 获取某个字/词/句的拼音
|
||||
* @param word 要获取拼音的字/词/句
|
||||
* @param results word的拼音列表(有可能多音字),每次调用results会被清空
|
||||
* @return int 获取到返回0,否则返回-1
|
||||
*/
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
/**
|
||||
* @brief setConfig 设置HanZiToPinYin的各项功能,详见pinyin4cpp-common.h
|
||||
* @param dataStyle 返回数据风格,默认defult
|
||||
* @param segType 是否启用分词,默认启用
|
||||
* @param polyphoneType 是否启用多音字,默认不启用
|
||||
* @param processType 无拼音数据处理模式,默认defult
|
||||
*/
|
||||
void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
|
||||
|
||||
protected:
|
||||
HanZiToPinYin();
|
||||
~HanZiToPinYin();
|
||||
HanZiToPinYin(const HanZiToPinYin&) = delete;
|
||||
HanZiToPinYin& operator =(const HanZiToPinYin&) = delete;
|
||||
private:
|
||||
static HanZiToPinYin *g_pinYinManager;
|
||||
HanZiToPinYinPrivate *d = nullptr;
|
||||
};
|
||||
|
||||
#endif // HANZITOPINYIN_H
|
|
@ -1,39 +1,50 @@
|
|||
QT -= gui
|
||||
|
||||
VERSION = 0.0.1
|
||||
VERSION = 1.1.0
|
||||
TARGET = chinese-segmentation
|
||||
TEMPLATE = lib
|
||||
DEFINES += LIBCHINESESEGMENTATION_LIBRARY
|
||||
DEFINES += VERSION='\\"$${VERSION}\\"'
|
||||
|
||||
CONFIG += c++11
|
||||
CONFIG += c++11 create_pc create_prl no_install_prl
|
||||
|
||||
# The following define makes your compiler emit warnings if you use
|
||||
# any Qt feature that has been marked deprecated (the exact warnings
|
||||
# depend on your compiler). Please consult the documentation of the
|
||||
# deprecated API in order to know how to port your code away from it.
|
||||
DEFINES += QT_DEPRECATED_WARNINGS
|
||||
QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr
|
||||
#QMAKE_CXXFLAGS += -Werror=uninitialized
|
||||
QMAKE_CXXFLAGS += -execution-charset:utf-8
|
||||
|
||||
# You can also make your code fail to compile if it uses deprecated APIs.
|
||||
# In order to do so, uncomment the following line.
|
||||
# You can also select to disable deprecated APIs only up to a certain version of Qt.
|
||||
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
|
||||
include(cppjieba/cppjieba.pri)
|
||||
include(pinyin4cpp/pinyin4cpp.pri)
|
||||
include(storage-base/storage-base-cedar.pri)
|
||||
|
||||
#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
|
||||
|
||||
SOURCES += \
|
||||
chinese-segmentation.cpp \
|
||||
pinyinmanager.cpp
|
||||
hanzi-to-pinyin.cpp
|
||||
|
||||
HEADERS += \
|
||||
chinese-segmentation-private.h \
|
||||
chinese-segmentation.h \
|
||||
libchinese-segmentation_global.h \
|
||||
pinyinmanager.h
|
||||
common-struct.h \
|
||||
hanzi-to-pinyin-private.h \
|
||||
hanzi-to-pinyin.h \
|
||||
pinyin4cpp-common.h \
|
||||
libchinese-segmentation_global.h
|
||||
|
||||
dict_files.path = /usr/share/ukui-search/res/dict/
|
||||
dict_files.files = $$PWD/dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
|
||||
dict_files.files += $$PWD/dict/*.txt\
|
||||
dict_files.files += $$PWD/pinyin4cpp/dict/*.txt
|
||||
|
||||
INSTALLS += \
|
||||
dict_files \
|
||||
|
@ -41,28 +52,28 @@ INSTALLS += \
|
|||
# Default rules for deployment.
|
||||
unix {
|
||||
target.path = $$[QT_INSTALL_LIBS]
|
||||
}
|
||||
QMAKE_PKGCONFIG_NAME = chinese-segmentation
|
||||
QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files
|
||||
QMAKE_PKGCONFIG_VERSION = $$VERSION
|
||||
QMAKE_PKGCONFIG_LIBDIR = $$target.path
|
||||
QMAKE_PKGCONFIG_DESTDIR = pkgconfig
|
||||
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg
|
||||
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg
|
||||
|
||||
!isEmpty(target.path): INSTALLS += target
|
||||
|
||||
header.path = /usr/include/chinese-seg/
|
||||
header.files += *.h
|
||||
headercppjieba.path = /usr/include/chinese-seg/cppjieba/
|
||||
headercppjieba.files = cppjieba/*
|
||||
INSTALLS += header headercppjieba
|
||||
header.path = /usr/include/chinese-seg
|
||||
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h
|
||||
header.files += development-files/header-files/*
|
||||
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/
|
||||
# headercppjieba.files = cppjieba/*
|
||||
INSTALLS += header
|
||||
}
|
||||
|
||||
|
||||
#DISTFILES += \
|
||||
# jiaba/jieba.pri
|
||||
|
||||
DISTFILES += \
|
||||
dict/README.md \
|
||||
dict/hmm_model.utf8 \
|
||||
dict/idf.utf8 \
|
||||
dict/jieba.dict.utf8 \
|
||||
dict/pos_dict/char_state_tab.utf8 \
|
||||
dict/pos_dict/prob_emit.utf8 \
|
||||
dict/pos_dict/prob_start.utf8 \
|
||||
dict/pos_dict/prob_trans.utf8 \
|
||||
dict/stop_words.utf8 \
|
||||
dict/user.dict.utf8 \
|
||||
dict/pinyinWithoutTone.txt
|
||||
|
||||
development-files/header-files/* \
|
||||
pinyin4cpp/pinyin4cpp.pri
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef PINYIN4CPP_COMMON_H
|
||||
#define PINYIN4CPP_COMMON_H
|
||||
|
||||
/**
|
||||
* @brief The PinyinDataStyle enum
|
||||
* Default 默认模式,“中心” return “zhong xin”
|
||||
* Tone 带读音模式 #“中心” return “zhōng xīn”
|
||||
* Tone2 带读音模式2 #“中心” return “zho1ng xi1n”
|
||||
* Tone3 带读音模式3 #“中心” return “zhong1 xin1”
|
||||
* FirstLetter 首字母模式 #“中心” return “z x”
|
||||
* English 英文翻译模式(暂不支持) #“中心” return “center,heart,core”
|
||||
*/
|
||||
enum class PinyinDataStyle {
|
||||
Default = 1u << 0,
|
||||
Tone = 1u << 1,
|
||||
Tone2 = 1u << 2,
|
||||
Tone3 = 1u << 3,
|
||||
FirstLetter = 1u << 4,
|
||||
English = 1u << 5
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The SegType enum
|
||||
* Segmentation 默认带分词 #“银河麒麟”->“银河”“麒麟”
|
||||
* NoSegmentation 无分词模式 #“银河麒麟”
|
||||
*/
|
||||
enum class SegType {
|
||||
Segmentation = 1u << 0,
|
||||
NoSegmentation = 1u << 1
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The PolyphoneType enum
|
||||
* Disable 默认不启用多音字,“奇安信”return “qi an xin”多音字按照常用读音返回
|
||||
* Enable 启用多音字 “奇安信” return“qi,ji an xin”
|
||||
* 注意:多音词返回格式为 “朝阳” return "zhao/chao yang/yang"
|
||||
*/
|
||||
enum class PolyphoneType {
|
||||
Disable = 1u << 0,
|
||||
Enable = 1u << 1
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief The ExDataProcessType enum
|
||||
* Default 默认无拼音数据直接返回,“123木头人” return "123 mu tou ren"(分词模式)
|
||||
* Delete 删除多余数据,#“123木头人” return "mu tou ren"(分词模式)
|
||||
*/
|
||||
enum class ExDataProcessType {
|
||||
Default = 1u << 0,
|
||||
Delete = 1u << 1
|
||||
};
|
||||
|
||||
#endif //PINYIN4CPP_COMMON_H
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,127 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#include "pinyin4cpp-trie.h"
|
||||
|
||||
// Passes the two default dictionary paths (SINGLE_WORD_PINYIN_PATH,
// WORDS_PINYIN_PATH) and `dat_cache_path` to StorageBase, then calls Init().
Pinyin4cppTrie::Pinyin4cppTrie(string dat_cache_path)
|
||||
: StorageBase<char, false, CacheFileHeaderBase>(vector<string>{SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH}, dat_cache_path)
|
||||
{
|
||||
this->Init();
|
||||
}
|
||||
|
||||
// Same as the single-argument constructor, but passes caller-supplied
// `file_paths` to StorageBase before calling Init().
// NOTE(review): LoadSingleWordDict/LoadWordsDict in this file open the fixed
// SINGLE_WORD_PINYIN_PATH / WORDS_PINYIN_PATH — confirm how `file_paths` is used.
Pinyin4cppTrie::Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path)
|
||||
: StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
|
||||
{
|
||||
this->Init();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Tells whether `word` is a key of the trie, i.e. whether Find() returns a
 * non-empty pinyin entry for it.
 */
bool Pinyin4cppTrie::Contains(string &word) {
    return !this->Find(word).empty();
}
|
||||
|
||||
bool Pinyin4cppTrie::IsMultiTone(const string &word) {
|
||||
string result = this->Find(word);
|
||||
if (result.find(",") == result.npos)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void Pinyin4cppTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
|
||||
{
|
||||
CacheFileHeaderBase header;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
memcpy(&header.md5_hex[0], md5.c_str(), md5.size());
|
||||
|
||||
int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
|
||||
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
|
||||
umask(S_IWGRP | S_IWOTH);
|
||||
const int fd =mkstemp((char *)tmp_filepath.data());
|
||||
assert(fd >= 0);
|
||||
fchmod(fd, 0644);
|
||||
|
||||
write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
|
||||
|
||||
this->LoadSingleWordDict(fd, write_bytes, offset, elements_num);
|
||||
this->LoadWordsDict(fd, write_bytes, offset, elements_num);
|
||||
|
||||
write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());
|
||||
|
||||
lseek(fd, sizeof(header.md5_hex), SEEK_SET);
|
||||
write(fd, &elements_num, sizeof(int));
|
||||
write(fd, &offset, sizeof(int));
|
||||
data_trie_size = this->GetDataTrieSize();
|
||||
write(fd, &data_trie_size, sizeof(int));
|
||||
|
||||
close(fd);
|
||||
assert((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
|
||||
|
||||
const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
|
||||
assert(0 == rename_ret);
|
||||
}
|
||||
|
||||
/**
 * Looks up the pinyin entry stored for `key`.
 *
 * @return the NUL-terminated string found at the matched position of the
 *         element buffer, or an empty string when ExactMatchSearch reports
 *         a negative result
 */
string Pinyin4cppTrie::Find(const string &key)
{
    const int element_offset = this->ExactMatchSearch(key.c_str(), key.size());
    return element_offset < 0 ? string() : string(&this->GetElementPtr()[element_offset]);
}
|
||||
|
||||
void Pinyin4cppTrie::LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
|
||||
{
|
||||
ifstream ifs(SINGLE_WORD_PINYIN_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, ":");
|
||||
if (buf.size() != 3)
|
||||
continue;
|
||||
this->Update(buf[2].c_str(), buf[2].size(), offset);
|
||||
offset += (buf[1].size() + 1);
|
||||
elements_num++;
|
||||
write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
|
||||
}
|
||||
}
|
||||
|
||||
void Pinyin4cppTrie::LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
|
||||
{
|
||||
ifstream ifs(WORDS_PINYIN_PATH);
|
||||
string line;
|
||||
vector<string> buf;
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#") or line.empty()) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, ":");
|
||||
if (buf.size() != 2)
|
||||
continue;
|
||||
this->Update(buf[0].c_str(), buf[0].size(), offset);
|
||||
offset += (buf[1].size() + 1);
|
||||
elements_num++;
|
||||
write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef PINYIN4CPPTRIE_H
|
||||
#define PINYIN4CPPTRIE_H
|
||||
|
||||
#include "storage-base.hpp"
|
||||
|
||||
const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
|
||||
const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
|
||||
|
||||
class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
|
||||
{
|
||||
public:
|
||||
Pinyin4cppTrie(string dat_cache_path = "");
|
||||
Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path = "");
|
||||
void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
|
||||
string Find(const string &key);
|
||||
bool Contains(string &word);
|
||||
bool IsMultiTone(const string &word);
|
||||
|
||||
private:
|
||||
void LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
|
||||
void LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
|
||||
};
|
||||
|
||||
#endif // PINYIN4CPPTRIE_H
|
|
@ -0,0 +1,15 @@
|
|||
INCLUDEPATH += $$PWD
|
||||
|
||||
HEADERS += \
|
||||
$$PWD/pinyin4cpp-trie.h \
|
||||
$$PWD/pinyin4cpp_dataTrie.h \
|
||||
$$PWD/pinyin4cpp_dictTrie.h
|
||||
|
||||
SOURCES += \
|
||||
$$PWD/pinyin4cpp-trie.cpp \
|
||||
$$PWD/pinyin4cpp_dataTrie.cpp \
|
||||
$$PWD/pinyin4cpp_dictTrie.cpp
|
||||
|
||||
DISTFILES += \
|
||||
pinyin4cpp/dict/wordsPinyin.txt \
|
||||
pinyin4cpp/dict/singleWordPinyin.txt
|
|
@ -0,0 +1,135 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#include "pinyin4cpp_dataTrie.h"
|
||||
|
||||
// Default constructor with an empty body; the cache file is attached later
// through InitAttachDat / InitBuildDat.
Pinyin4cppDataTrie::Pinyin4cppDataTrie()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
/**
 * Releases the cache-file mapping and its descriptor.
 *
 * munmap()/close() are only issued for a mapping or descriptor that looks
 * valid, so an object that never attached a file makes no failing system
 * calls. NOTE(review): this relies on the members starting out as
 * nullptr / -1 — confirm the initialisers in the class declaration.
 */
Pinyin4cppDataTrie::~Pinyin4cppDataTrie()
{
    if (m_mmapAddr != nullptr && MAP_FAILED != m_mmapAddr) {
        munmap(m_mmapAddr, m_mmapLength);
    }
    m_mmapAddr = nullptr;

    if (m_mmapFd >= 0) {
        close(m_mmapFd);
    }
    m_mmapFd = -1;
}
|
||||
|
||||
/**
 * Looks up the pinyin entry stored for `key` (darts-clone interface).
 *
 * @return the NUL-terminated string at the matched offset of the element
 *         area, or an empty string when there is no match or the matched
 *         value is outside [0, m_elementsSize)
 *
 * With cedarpp the lookup would instead be
 * exactMatchSearch<int>(key.c_str(), key.size()), a negative result meaning
 * "not found".
 */
string Pinyin4cppDataTrie::Find(const string &key) const {
    Darts::DoubleArray::result_pair_type match;
    m_DoubleArrayDataTrie.exactMatchSearch(key.c_str(), match);

    const bool hit = (match.length != 0)
                     && (match.value >= 0)
                     && ((size_t)match.value < m_elementsSize); // todo
    return hit ? string(&m_elementsPtr[match.value]) : string();
}
|
||||
|
||||
// Writes a cache file for `elements` via BuildDatCache, then attaches it with
// InitAttachDat and returns that call's result.
bool Pinyin4cppDataTrie::InitBuildDat(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
|
||||
BuildDatCache(elements, dat_cache_file, md5);
|
||||
return InitAttachDat(dat_cache_file, md5);
|
||||
}
|
||||
|
||||
bool Pinyin4cppDataTrie::InitAttachDat(const string &dat_cache_file, const string &md5) {
|
||||
m_mmapFd = open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (m_mmapFd < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = lseek(m_mmapFd, 0, SEEK_END);
|
||||
assert(seek_off >= 0);
|
||||
|
||||
m_mmapLength = seek_off;
|
||||
m_mmapAddr = reinterpret_cast<char *>(mmap(NULL, m_mmapLength, PROT_READ, MAP_SHARED, m_mmapFd, 0));
|
||||
assert(MAP_FAILED != m_mmapAddr);
|
||||
assert(m_mmapLength >= sizeof(CacheFileHeader));
|
||||
|
||||
CacheFileHeader & header = *reinterpret_cast<CacheFileHeader*>(m_mmapAddr);
|
||||
m_elementsNum = header.elements_num;
|
||||
m_elementsSize = header.elements_size;
|
||||
assert(sizeof(header.md5_hex) == md5.size());
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(m_mmapLength == sizeof(CacheFileHeader) + header.elements_size + header.dat_size * m_DoubleArrayDataTrie.unit_size());
|
||||
|
||||
m_elementsPtr = (const char *)(m_mmapAddr + sizeof(CacheFileHeader));
|
||||
const char * dat_ptr = m_mmapAddr + sizeof(CacheFileHeader) + header.elements_size;
|
||||
m_DoubleArrayDataTrie.set_array((char *)dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds the double-array trie from `elements` and writes the cache file.
//
// Each key maps to the byte offset of its value inside the element table.
// The file is written to a temporary sibling created by mkstemp() and then
// renamed over `dat_cache_file`.
//
// `md5` must be exactly sizeof(CacheFileHeader::md5_hex) bytes long.
// On an I/O failure the temporary file is removed and the function returns
// without producing a cache; the following attach then reports the failure.
void Pinyin4cppDataTrie::BuildDatCache(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    vector<string> mem_elem_vec;

    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());
    mem_elem_vec.reserve(elements.size());

    CacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    int offset(0);
    for (auto &info:elements) {
        keys_ptr_vec.push_back(info.first.c_str());
        values_vec.push_back(offset);
        offset += (info.second.size() + 1);// +1 accounts for the trailing '\0'
        assert(info.second.size() > 0);
        mem_elem_vec.push_back(info.second);
    }

    // data() is well defined on an empty vector, unlike &v[0].
    auto const ret = m_DoubleArrayDataTrie.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    assert(0 == ret);
    (void)ret;
    header.elements_num = mem_elem_vec.size();
    header.elements_size = offset;
    header.dat_size = m_DoubleArrayDataTrie.size();

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // The umask is process-wide state: change it only around mkstemp().
    const mode_t previous_mask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_mask);
    assert(fd >= 0);
    if (fd < 0) {
        return;
    }
    fchmod(fd, 0644);

    auto write_bytes = write(fd, (const char *)&header, sizeof(header));
    for (size_t i = 0; i < mem_elem_vec.size(); ++i) {
        write_bytes += write(fd, mem_elem_vec[i].c_str(), mem_elem_vec[i].size() + 1);
    }
    write_bytes += write(fd, m_DoubleArrayDataTrie.array(), m_DoubleArrayDataTrie.total_size());

    const bool complete =
            ((size_t)write_bytes == sizeof(header) + offset + m_DoubleArrayDataTrie.total_size());
    close(fd);
    assert(complete);
    if (!complete) {
        // Never publish a short or failed write as the cache.
        unlink(tmp_filepath.c_str());
        return;
    }

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    if (0 != rename_ret) {
        unlink(tmp_filepath.c_str());
    }
}
|
|
@ -0,0 +1,74 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef PINYIN4cpp_DATATRIE_H
|
||||
#define PINYIN4cpp_DATATRIE_H
|
||||
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <QDebug>
|
||||
#include "Md5.hpp"
|
||||
#include "LocalVector.hpp"
|
||||
#include "StringUtil.hpp"
|
||||
//#define USE_REDUCED_TRIE
|
||||
#include "../storage-base/cedar/cedar.h"
|
||||
#include "../storage-base/darts-clone/darts.h"
|
||||
|
||||
using namespace std;
|
||||
using std::pair;
|
||||
|
||||
// Fixed-size header stored at the very start of a cache file.
// TODO: byte alignment.
struct CacheFileHeader {
    char     md5_hex[32]{};    // md5 identifying the dictionaries the cache was built from
    uint32_t elements_num{0};  // number of value strings stored after the header
    uint32_t elements_size{0}; // total bytes of those strings, including terminators
    uint32_t dat_size{0};      // number of double-array units following the strings
};
|
||||
|
||||
class Pinyin4cppDataTrie {
|
||||
public:
|
||||
Pinyin4cppDataTrie();
|
||||
~Pinyin4cppDataTrie();
|
||||
|
||||
string Find(const string & key) const;
|
||||
|
||||
bool InitBuildDat(map<string, string>& elements, const string & dat_cache_file, const string & md5);
|
||||
|
||||
bool InitAttachDat(const string & dat_cache_file, const string & md5);
|
||||
|
||||
private:
|
||||
void BuildDatCache(map<string, string>& elements, const string & dat_cache_file, const string & md5);
|
||||
|
||||
Pinyin4cppDataTrie(const Pinyin4cppDataTrie &);
|
||||
Pinyin4cppDataTrie &operator=(const Pinyin4cppDataTrie &);
|
||||
|
||||
private:
|
||||
Darts::DoubleArray m_DoubleArrayDataTrie;
|
||||
//cedar::da<int, -1, -2, true> m_DoubleArrayDataTrie;
|
||||
const char * m_elementsPtr = nullptr;
|
||||
size_t m_elementsNum = 0;
|
||||
size_t m_elementsSize = 0;
|
||||
size_t m_mmapLength = 0;
|
||||
|
||||
int m_mmapFd = -1;
|
||||
char * m_mmapAddr = nullptr;
|
||||
};
|
||||
|
||||
#endif //PINYIN4cpp_DATATRIE_H
|
|
@ -0,0 +1,156 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
|
||||
#include "pinyin4cpp_dictTrie.h"
|
||||
#include "malloc.h"
|
||||
|
||||
// Loads (or builds and then loads) the pinyin lookup data; all work is done by Init().
Pinyin4cppDictTrie::Pinyin4cppDictTrie(const string &single_word_dict_path,
                                       const string &words_dict_paths,
                                       const string &dat_cache_path)
{
    Init(single_word_dict_path, words_dict_paths, dat_cache_path);
}
|
||||
|
||||
// Returns the pinyin string stored for `word`; empty when the word is unknown.
string Pinyin4cppDictTrie::Find(const string &word) const {
    string pinyin = m_DataTrie.Find(word);
    return pinyin;
}
|
||||
|
||||
// True when the lookup for `word` yields a non-empty pinyin string.
bool Pinyin4cppDictTrie::Contains(string &word) {
    return !m_DataTrie.Find(word).empty();
}
|
||||
|
||||
bool Pinyin4cppDictTrie::IsMultiTone(const string &word) {
|
||||
string result = m_DataTrie.Find(word);
|
||||
if (result.find(",") == result.npos)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t Pinyin4cppDictTrie::GetTotalDictSize() const {
|
||||
return m_TotalDictSize_;
|
||||
}
|
||||
|
||||
void Pinyin4cppDictTrie::Init(const string &single_word_dict_path, const string &words_dict_paths, string dat_cache_path) {
|
||||
const auto dict_list = single_word_dict_path + "|" + words_dict_paths;
|
||||
size_t file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(dict_list, file_size_sum);
|
||||
m_TotalDictSize_ = file_size_sum;
|
||||
|
||||
if (dat_cache_path.empty()) {
|
||||
dat_cache_path = "/tmp/" + md5 + ".dat_cache";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
||||
}
|
||||
qDebug() << "#####Pinyin Dict path:" << dat_cache_path.c_str();
|
||||
if (m_DataTrie.InitAttachDat(dat_cache_path, md5)) {
|
||||
return;
|
||||
}
|
||||
|
||||
LoadSingleWordDict(single_word_dict_path);
|
||||
LoadWordsDict(words_dict_paths);
|
||||
bool build_ret = m_DataTrie.InitBuildDat(m_StaticNodeInfos, dat_cache_path, md5);
|
||||
assert(build_ret);
|
||||
m_StaticNodeInfos.clear();
|
||||
malloc_trim(0);
|
||||
}
|
||||
|
||||
void Pinyin4cppDictTrie::LoadSingleWordDict(const string &filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
string line;
|
||||
vector<string> buf;
|
||||
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, ":");
|
||||
assert(buf.size() == SINGLE_WORD_DICT_COLUMN_NUM);
|
||||
if (m_StaticNodeInfos.find(buf[2]) != m_StaticNodeInfos.end()) {
|
||||
vector<string> tmp;
|
||||
bool isfind(false);
|
||||
limonp::Split(m_StaticNodeInfos[buf[2]], tmp, ",");
|
||||
for (auto &onePinyin:tmp) {
|
||||
if (onePinyin == buf[1]) {
|
||||
isfind = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!isfind) {
|
||||
m_StaticNodeInfos[buf[2]] += ("," + buf[2]);
|
||||
}
|
||||
} else {
|
||||
m_StaticNodeInfos[buf[2]] = buf[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Pinyin4cppDictTrie::LoadWordsDict(const string &filePath) {
|
||||
ifstream ifs(filePath.c_str());
|
||||
string line;
|
||||
vector<string> buf;
|
||||
for (; getline(ifs, line);) {
|
||||
if (limonp::StartsWith(line, "#")) {
|
||||
continue;
|
||||
}
|
||||
limonp::Split(line, buf, ":");
|
||||
assert(buf.size() == WORDS_DICT_COLUMN_NUM);
|
||||
if (m_StaticNodeInfos.find(buf[0]) != m_StaticNodeInfos.end()) {
|
||||
vector<string> tmp;
|
||||
bool isfind(false);
|
||||
limonp::Split(m_StaticNodeInfos[buf[0]], tmp, "/");
|
||||
for (auto &onePinyin:tmp) {
|
||||
if (onePinyin == buf[1]) {
|
||||
isfind = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!isfind) {
|
||||
m_StaticNodeInfos[buf[0]] += ("/" + buf[1]);
|
||||
}
|
||||
} else {
|
||||
m_StaticNodeInfos[buf[0]] = buf[1];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Computes one md5 over the contents of every readable file in `files_list`
// (paths separated by '|' or ';') and stores the summed byte count in
// `file_size_sum`. Files that cannot be opened, and empty files, are skipped.
string CalcFileListMD5(const string &files_list, size_t &file_size_sum) {
    limonp::MD5 digest;
    file_size_sum = 0;

    const auto paths = limonp::Split(files_list, "|;");
    for (auto const &path : paths) {
        const int fd = open(path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }

        auto const bytes = lseek(fd, 0, SEEK_END);
        if (bytes > 0) {
            // Map the file so its contents can be hashed in a single call.
            void *mapped = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, 0);
            assert(MAP_FAILED != mapped);

            digest.Update((unsigned char *) mapped, bytes);
            file_size_sum += bytes;

            munmap(mapped, bytes);
        }
        close(fd);
    }

    digest.Final();
    return string(digest.digestChars);
}
|
|
@ -0,0 +1,59 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef PINYIN4cpp_DICTTRIE_H
|
||||
#define PINYIN4cpp_DICTTRIE_H
|
||||
|
||||
#include "pinyin4cpp_dataTrie.h"
|
||||
using namespace std;
|
||||
|
||||
const size_t SINGLE_WORD_DICT_COLUMN_NUM = 3;
|
||||
const size_t WORDS_DICT_COLUMN_NUM = 2;
|
||||
|
||||
class Pinyin4cppDictTrie {
|
||||
public:
|
||||
Pinyin4cppDictTrie(const string& single_word_dict_path, const string& words_dict_paths, const string & dat_cache_path = "");
|
||||
|
||||
~Pinyin4cppDictTrie() {}
|
||||
|
||||
string Find(const string &word) const;
|
||||
|
||||
bool Contains(string &word);
|
||||
bool IsMultiTone(const string &word);
|
||||
|
||||
size_t GetTotalDictSize() const;
|
||||
|
||||
private:
|
||||
void Init(const string& single_word_dict_path, const string& words_dict_paths, string dat_cache_path);
|
||||
|
||||
void LoadSingleWordDict(const string& filePath);
|
||||
|
||||
void LoadWordsDict(const string& filePath);
|
||||
|
||||
private:
|
||||
map<string, string> m_StaticNodeInfos;
|
||||
|
||||
size_t m_TotalDictSize_ = 0;
|
||||
Pinyin4cppDataTrie m_DataTrie;
|
||||
|
||||
};
|
||||
|
||||
// Computes one md5 over all files in `files_list` and reports their total size.
// Deliberately not `inline`: the definition lives in the .cpp file, and an
// inline function must be defined in every translation unit that uses it.
string CalcFileListMD5(const string & files_list, size_t & file_size_sum);
|
||||
|
||||
#endif //PINYIN4cpp_DICTTRIE_H
|
|
@ -1,55 +0,0 @@
|
|||
#include "pinyinmanager.h"
|
||||
#include <mutex>
|
||||
PinYinManager * PinYinManager::g_pinYinManager = nullptr;
|
||||
std::once_flag g_singleFlag;
|
||||
// Returns the process-wide instance, creating it on first use.
// call_once() runs the creation step a single time.
PinYinManager * PinYinManager::getInstance()
{
    auto createInstance = []() {
        g_pinYinManager = new PinYinManager;
    };
    call_once(g_singleFlag, createInstance);
    return g_pinYinManager;
}
|
||||
|
||||
// Forwards to the underlying pinyin trie.
bool PinYinManager::contains(string &word)
{
    const bool known = m_pinYinTrie->contains(word);
    return known;
}
|
||||
|
||||
bool PinYinManager::isMultiTon(string &word)
|
||||
{
|
||||
return m_pinYinTrie->isMultiTone(word);
|
||||
}
|
||||
|
||||
bool PinYinManager::isMultiTon(string word)
|
||||
{
|
||||
return m_pinYinTrie->isMultiTone(word);
|
||||
}
|
||||
|
||||
// Fills `results` with the pinyin of `word`.
// The multi-tone lookup is tried first; if it reports -1 the single-tone
// lookup is used and its result appended. Returns 0 on success and -1 when
// both lookups report -1.
int PinYinManager::getResults(string word, QStringList &results)
{
    results.clear();

    if (m_pinYinTrie->getMultiTonResults(word, results) != -1) {
        return 0;
    }

    QString single;
    if (m_pinYinTrie->getSingleTonResult(word, single) == -1) {
        return -1;
    }
    results.append(single);
    return 0;
}
|
||||
|
||||
// Creates the trie from the installed tone-less pinyin dictionary.
PinYinManager::PinYinManager()
{
    static const char * const dictPath = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
    m_pinYinTrie = new cppjieba::PinYinTrie(dictPath);
}
|
||||
|
||||
// Destroys the owned trie. Deleting a null pointer is a no-op, so no explicit
// null check is needed.
PinYinManager::~PinYinManager()
{
    delete m_pinYinTrie;
    m_pinYinTrie = nullptr;
}
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
#ifndef PINYINMANAGER_H
|
||||
#define PINYINMANAGER_H
|
||||
|
||||
#include <QtCore/qglobal.h>
|
||||
#include "cppjieba/PinYinTrie.hpp"
|
||||
|
||||
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
|
||||
|
||||
using namespace std;
|
||||
|
||||
class PINYINMANAGER_EXPORT PinYinManager
|
||||
{
|
||||
public:
|
||||
static PinYinManager * getInstance();
|
||||
|
||||
public:
|
||||
bool contains(string &word);
|
||||
bool isMultiTon(string &word);
|
||||
bool isMultiTon(string word);
|
||||
|
||||
int getResults(string word, QStringList &results);
|
||||
|
||||
protected:
|
||||
PinYinManager();
|
||||
~PinYinManager();
|
||||
|
||||
private:
|
||||
static PinYinManager *g_pinYinManager;
|
||||
cppjieba::PinYinTrie *m_pinYinTrie = nullptr;
|
||||
|
||||
};
|
||||
|
||||
#endif // PINYINMANAGER_H
|
|
@ -0,0 +1,682 @@
|
|||
// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
|
||||
// $Id: cedar.h 1938 2022-03-17 16:22:30Z ynaga $
|
||||
// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
|
||||
#ifndef CEDAR_H
|
||||
#define CEDAR_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <cassert>
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
|
||||
|
||||
namespace cedar {
|
||||
// typedefs
|
||||
typedef unsigned char uchar;
|
||||
// Sentinel constants used as the default "no value" (N1) and "no path" (N2)
// template arguments of da. The float specialization uses the two bit
// patterns 0x7f800001 and 0x7f800002.
template <typename T>
struct NaN {
  enum { N1 = -1, N2 = -2 };
};

template <>
struct NaN <float> {
  enum { N1 = 0x7f800001, N2 = 0x7f800002 };
};
|
||||
static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
|
||||
// dynamic double array
|
||||
template <typename value_type,
|
||||
const int NO_VALUE = NaN <value_type>::N1,
|
||||
const int NO_PATH = NaN <value_type>::N2,
|
||||
const bool ORDERED = true,
|
||||
const int MAX_TRIAL = 1,
|
||||
const size_t NUM_TRACKING_NODES = 0>
|
||||
class da {
|
||||
public:
|
||||
enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH, CEDAR_VALUE_LIMIT = 2147483647 };
|
||||
typedef value_type result_type;
|
||||
struct result_pair_type {
|
||||
value_type value;
|
||||
size_t length; // prefix length
|
||||
};
|
||||
struct result_triple_type { // for predict ()
|
||||
value_type value;
|
||||
size_t length; // suffix length
|
||||
size_t id; // node id of value
|
||||
};
|
||||
struct node {
|
||||
union { int base_; value_type value; }; // negative means prev empty index
|
||||
int check; // negative means next empty index
|
||||
node (const int base__ = 0, const int check_ = 0)
|
||||
: base_ (base__), check (check_) {}
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
int base () const { return - (base_ + 1); } // ~ in two's complement system
|
||||
#else
|
||||
int base () const { return base_; }
|
||||
#endif
|
||||
};
|
||||
struct ninfo { // x1.5 update speed; +.25 % memory (8n -> 10n)
|
||||
uchar sibling; // right sibling (= 0 if not exist)
|
||||
uchar child; // first child
|
||||
ninfo () : sibling (0), child (0) {}
|
||||
};
|
||||
// Bookkeeping for one block of 256 array elements.
struct block {
  int   prev;    // prev block; 3 bytes
  int   next;    // next block; 3 bytes
  short num;     // # empty elements; 0 - 256
  short reject;  // minimum # branching failed to locate; soft limit
  int   trial;   // # trial
  int   ehead;   // first empty item

  // A new block starts fully empty (256 free elements).
  block ()
    : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
};
|
||||
da () : tracking_node (), _array (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _no_delete (false), _reject () {
|
||||
STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
|
||||
value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index
|
||||
);
|
||||
_initialize ();
|
||||
}
|
||||
~da () { clear (false); }
|
||||
size_t capacity () const { return static_cast <size_t> (_capacity); }
|
||||
size_t size () const { return static_cast <size_t> (_size); }
|
||||
size_t total_size () const { return sizeof (node) * _size; }
|
||||
size_t unit_size () const { return sizeof (node); }
|
||||
size_t nonzero_size () const {
|
||||
size_t i = 0;
|
||||
for (int to = 0; to < _size; ++to)
|
||||
if (_array[to].check >= 0) ++i;
|
||||
return i;
|
||||
}
|
||||
size_t num_keys () const {
|
||||
size_t i = 0;
|
||||
for (int to = 0; to < _size; ++to)
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if (_array[to].check >= 0 && _array[to].value >= 0) ++i;
|
||||
#else
|
||||
if (_array[to].check >= 0 && _array[_array[to].check].base () == to) ++i;
|
||||
#endif
|
||||
return i;
|
||||
}
|
||||
// interfance
|
||||
template <typename T>
|
||||
T exactMatchSearch (const char* key) const
|
||||
{ return exactMatchSearch <T> (key, std::strlen (key)); }
|
||||
template <typename T>
|
||||
T exactMatchSearch (const char* key, size_t len, size_t from = 0) const {
|
||||
union { int i; value_type x; } b;
|
||||
size_t pos = 0;
|
||||
b.i = _find (key, from, pos, len);
|
||||
if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
|
||||
T result;
|
||||
_set_result (&result, b.x, len, from);
|
||||
return result;
|
||||
}
|
||||
template <typename T>
|
||||
size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
|
||||
{ return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
|
||||
template <typename T>
|
||||
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) const {
|
||||
size_t num = 0;
|
||||
for (size_t pos = 0; pos < len; ) {
|
||||
union { int i; value_type x; } b;
|
||||
b.i = _find (key, from, pos, pos + 1);
|
||||
if (b.i == CEDAR_NO_VALUE) continue;
|
||||
if (b.i == CEDAR_NO_PATH) return num;
|
||||
if (num < result_len) _set_result (&result[num], b.x, pos, from);
|
||||
++num;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
// predict key from double array
|
||||
template <typename T>
|
||||
size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
|
||||
{ return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
|
||||
template <typename T>
|
||||
size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) {
|
||||
size_t num (0), pos (0), p (0);
|
||||
if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
|
||||
union { int i; value_type x; } b;
|
||||
size_t root = from;
|
||||
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
|
||||
if (num < result_len) _set_result (&result[num], b.x, p, from);
|
||||
++num;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
void suffix (char* key, size_t len, size_t to) const {
|
||||
key[len] = '\0';
|
||||
while (len--) {
|
||||
const int from = _array[to].check;
|
||||
key[len]
|
||||
= static_cast <char> (_array[from].base () ^ static_cast <int> (to));
|
||||
to = static_cast <size_t> (from);
|
||||
}
|
||||
}
|
||||
// Resumable lookup over a NUL-terminated key.
value_type traverse (const char* key, size_t& from, size_t& pos) const {
  const size_t key_len = std::strlen (key);
  return traverse (key, from, pos, key_len);
}

// Resumable lookup: continues from node `from` at byte `pos`, both updated by
// _find (), and returns the raw result reinterpreted as value_type.
value_type traverse (const char* key, size_t& from, size_t& pos, size_t len) const {
  union { int i; value_type x; } found;
  found.i = _find (key, from, pos, len);
  return found.x;
}
|
||||
struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
|
||||
value_type& update (const char* key)
|
||||
{ return update (key, std::strlen (key)); }
|
||||
value_type& update (const char* key, size_t len, value_type val = value_type (0))
|
||||
{ size_t from (0), pos (0); return update (key, from, pos, len, val); }
|
||||
value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val = value_type (0))
|
||||
{ empty_callback cf; return update (key, from, pos, len, val, cf); }
|
||||
template <typename T>
|
||||
value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val, T& cf) {
|
||||
if (! len && ! from)
|
||||
_err (__FILE__, __LINE__, "failed to insert zero-length key\n");
|
||||
#ifndef USE_FAST_LOAD
|
||||
if (! _ninfo || ! _block) restore ();
|
||||
#endif
|
||||
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
|
||||
pos < len; ++pos) {
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
const value_type val_ = _array[from].value;
|
||||
if (val_ >= 0 && val_ != CEDAR_VALUE_LIMIT) // always new; correct this!
|
||||
{ const int to = _follow (from, 0, cf); _array[to].value = val_; }
|
||||
#endif
|
||||
from = static_cast <size_t> (_follow (from, key_[pos], cf));
|
||||
}
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
const int to = _array[from].value >= 0 ? static_cast <int> (from) : _follow (from, 0, cf);
|
||||
if (_array[to].value == CEDAR_VALUE_LIMIT) _array[to].value = 0;
|
||||
#else
|
||||
const int to = _follow (from, 0, cf);
|
||||
#endif
|
||||
return _array[to].value += val;
|
||||
}
|
||||
// easy-going erase () without compression
|
||||
int erase (const char* key) { return erase (key, std::strlen (key)); }
|
||||
int erase (const char* key, size_t len, size_t from = 0) {
|
||||
size_t pos = 0;
|
||||
const int i = _find (key, from, pos, len);
|
||||
if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
|
||||
erase (from);
|
||||
return 0;
|
||||
}
|
||||
void erase (size_t from) {
|
||||
// _test ();
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
int e = _array[from].value >= 0 ? static_cast <int> (from) : _array[from].base () ^ 0;
|
||||
from = static_cast <size_t> (_array[e].check);
|
||||
#else
|
||||
int e = _array[from].base () ^ 0;
|
||||
#endif
|
||||
bool flag = false; // have sibling
|
||||
do {
|
||||
const node& n = _array[from];
|
||||
flag = _ninfo[n.base () ^ _ninfo[from].child].sibling;
|
||||
if (flag) _pop_sibling (from, n.base (), static_cast <uchar> (n.base () ^ e));
|
||||
_push_enode (e);
|
||||
e = static_cast <int> (from);
|
||||
from = static_cast <size_t> (_array[from].check);
|
||||
} while (! flag);
|
||||
}
|
||||
int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
|
||||
for (size_t i = 0; i < num; ++i)
|
||||
update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
|
||||
return 0;
|
||||
}
|
||||
template <typename T>
|
||||
void dump (T* result, const size_t result_len) {
|
||||
union { int i; value_type x; } b;
|
||||
size_t num (0), from (0), p (0);
|
||||
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
|
||||
if (num < result_len)
|
||||
_set_result (&result[num++], b.x, p, from);
|
||||
else
|
||||
_err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
|
||||
}
|
||||
// Writes the node array to `fn`. With USE_FAST_LOAD the block heads, node info
// and block table are also written to "<fn>.sbl". Returns 0 on success and -1
// when a file cannot be opened.
int save (const char* fn, const char* mode = "wb") const {
  // _test ();
  FILE* out = std::fopen (fn, mode);
  if (! out) return -1;
  std::fwrite (_array, sizeof (node), static_cast <size_t> (_size), out);
  std::fclose (out);
#ifdef USE_FAST_LOAD
  // Build "<fn>.sbl": +5 covers the four suffix characters and the terminator.
  char* const sbl = new char[std::strlen (fn) + 5];
  std::strcpy (sbl, fn);
  std::strcat (sbl, ".sbl");
  out = std::fopen (sbl, mode);
  delete [] sbl; // resolve memory leak
  if (! out) return -1;
  std::fwrite (&_bheadF, sizeof (int), 1, out);
  std::fwrite (&_bheadC, sizeof (int), 1, out);
  std::fwrite (&_bheadO, sizeof (int), 1, out);
  std::fwrite (_ninfo, sizeof (ninfo), static_cast <size_t> (_size), out);
  std::fwrite (_block, sizeof (block), static_cast <size_t> (_size >> 8), out);
  std::fclose (out);
#endif
  return 0;
}
|
||||
int open (const char* fn, const char* mode = "rb",
|
||||
const size_t offset = 0, size_t size_ = 0) {
|
||||
FILE* fp = std::fopen (fn, mode);
|
||||
if (! fp) return -1;
|
||||
// get size
|
||||
if (! size_) {
|
||||
if (std::fseek (fp, 0, SEEK_END) != 0) return -1;
|
||||
size_ = static_cast <size_t> (std::ftell (fp));
|
||||
if (std::fseek (fp, 0, SEEK_SET) != 0) return -1;
|
||||
}
|
||||
if (size_ <= offset) return -1;
|
||||
// set array
|
||||
clear (false);
|
||||
size_ = (size_ - offset) / sizeof (node);
|
||||
if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
|
||||
_array = static_cast <node*> (std::malloc (sizeof (node) * size_));
|
||||
#ifdef USE_FAST_LOAD
|
||||
_ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
|
||||
_block = static_cast <block*> (std::malloc (sizeof (block) * size_));
|
||||
if (! _array || ! _ninfo || ! _block)
|
||||
#else
|
||||
if (! _array)
|
||||
#endif
|
||||
_err (__FILE__, __LINE__, "memory allocation failed\n");
|
||||
if (size_ != std::fread (_array, sizeof (node), size_, fp)) return -1;
|
||||
std::fclose (fp);
|
||||
_size = static_cast <int> (size_);
|
||||
#ifdef USE_FAST_LOAD
|
||||
const char* const info
|
||||
= std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
|
||||
fp = std::fopen (info, mode);
|
||||
delete [] info; // resolve memory leak
|
||||
if (! fp) return -1;
|
||||
std::fread (&_bheadF, sizeof (int), 1, fp);
|
||||
std::fread (&_bheadC, sizeof (int), 1, fp);
|
||||
std::fread (&_bheadO, sizeof (int), 1, fp);
|
||||
if (size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
|
||||
size_ != std::fread (_block, sizeof (block), size_ >> 8, fp) << 8)
|
||||
return -1;
|
||||
std::fclose (fp);
|
||||
_capacity = _size;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#ifndef USE_FAST_LOAD
|
||||
void restore () { // restore information to update
|
||||
if (! _block) _restore_block ();
|
||||
if (! _ninfo) _restore_ninfo ();
|
||||
_capacity = _size;
|
||||
}
|
||||
#endif
|
||||
void set_array (void* p, size_t size_ = 0) { // ad-hoc
|
||||
clear (false);
|
||||
_array = static_cast <node*> (p);
|
||||
_size = static_cast <int> (size_);
|
||||
_no_delete = true;
|
||||
}
|
||||
const void* array () const { return _array; }
|
||||
void clear (const bool reuse = true) {
|
||||
if (_array && ! _no_delete) std::free (_array);
|
||||
if (_ninfo) std::free (_ninfo);
|
||||
if (_block) std::free (_block);
|
||||
_array = 0; _ninfo = 0; _block = 0;
|
||||
_bheadF = _bheadC = _bheadO = _capacity = _size = 0; // *
|
||||
if (reuse) _initialize ();
|
||||
_no_delete = false;
|
||||
}
|
||||
// return the first child for a tree rooted by a given node
|
||||
int begin (size_t& from, size_t& len) {
|
||||
#ifndef USE_FAST_LOAD
|
||||
if (! _ninfo) _restore_ninfo ();
|
||||
#endif
|
||||
int base = _array[from].base ();
|
||||
uchar c = _ninfo[from].child;
|
||||
if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
|
||||
return CEDAR_NO_PATH; // no entry
|
||||
for (; c; ++len) {
|
||||
from = static_cast <size_t> (_array[from].base ()) ^ c;
|
||||
c = _ninfo[from].child;
|
||||
}
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if (_array[from].value >= 0) return _array[from].value;
|
||||
#endif
|
||||
return _array[_array[from].base () ^ c].base_;
|
||||
}
|
||||
// return the next child if any
|
||||
int next (size_t& from, size_t& len, const size_t root = 0) {
|
||||
uchar c = 0;
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if (_array[from].value < 0)
|
||||
#endif
|
||||
c = _ninfo[_array[from].base () ^ 0].sibling;
|
||||
for (; ! c && from != root; --len) {
|
||||
c = _ninfo[from].sibling;
|
||||
from = static_cast <size_t> (_array[from].check);
|
||||
}
|
||||
return c ?
|
||||
begin (from = static_cast <size_t> (_array[from].base ()) ^ c, ++len) :
|
||||
CEDAR_NO_PATH;
|
||||
}
|
||||
// test the validity of double array for debug
|
||||
void test (const size_t from = 0) const {
|
||||
const int base = _array[from].base ();
|
||||
uchar c = _ninfo[from].child;
|
||||
do {
|
||||
if (from) assert (_array[base ^ c].check == static_cast <int> (from));
|
||||
if (c && _array[base ^ c].value < 0) // correct this
|
||||
test (static_cast <size_t> (base ^ c));
|
||||
} while ((c = _ninfo[base ^ c].sibling));
|
||||
}
|
||||
size_t tracking_node[NUM_TRACKING_NODES + 1];
|
||||
private:
|
||||
// currently disabled; implement these if you need
|
||||
da (const da&);
|
||||
da& operator= (const da&);
|
||||
node* _array;
|
||||
ninfo* _ninfo;
|
||||
block* _block;
|
||||
int _bheadF; // first block of Full; 0
|
||||
int _bheadC; // first block of Closed; 0 if no Closed
|
||||
int _bheadO; // first block of Open; 0 if no Open
|
||||
int _capacity;
|
||||
int _size;
|
||||
int _no_delete;
|
||||
short _reject[257];
|
||||
//
|
||||
// Prints a diagnostic with file name and line number to stderr and terminates
// the process with exit status 1.
static void _err (const char* fn, const int ln, const char* msg) {
  std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
  std::exit (1);
}
|
||||
template <typename T>
|
||||
static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
|
||||
void* tmp = std::realloc (p, sizeof (T) * static_cast <size_t> (size_n));
|
||||
if (! tmp)
|
||||
std::free (p), _err (__FILE__, __LINE__, "memory reallocation failed\n");
|
||||
p = static_cast <T*> (tmp);
|
||||
static const T T0 = T ();
|
||||
for (T* q (p + size_p), * const r (p + size_n); q != r; ++q) *q = T0;
|
||||
}
|
||||
void _initialize () { // initilize the first special block
|
||||
_realloc_array (_array, 256, 256);
|
||||
_realloc_array (_ninfo, 256);
|
||||
_realloc_array (_block, 1);
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
_array[0] = node (-1, -1);
|
||||
#else
|
||||
_array[0] = node (0, -1);
|
||||
#endif
|
||||
for (int i = 1; i < 256; ++i)
|
||||
_array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
|
||||
_block[0].ehead = 1; // bug fix for erase
|
||||
_capacity = _size = 256;
|
||||
for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
|
||||
for (short i = 0; i <= 256; ++i) _reject[i] = i + 1;
|
||||
}
|
||||
// follow/create edge
|
||||
template <typename T>
|
||||
int _follow (size_t& from, const uchar& label, T& cf) {
|
||||
int to = 0;
|
||||
const int base = _array[from].base ();
|
||||
if (base < 0 || _array[to = base ^ label].check < 0) {
|
||||
to = _pop_enode (base, label, static_cast <int> (from));
|
||||
_push_sibling (from, to ^ label, label, base >= 0);
|
||||
} else if (_array[to].check != static_cast <int> (from))
|
||||
to = _resolve (from, base, label, cf);
|
||||
return to;
|
||||
}
|
||||
// find key from double array
|
||||
int _find (const char* key, size_t& from, size_t& pos, const size_t len) const {
|
||||
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
|
||||
pos < len; ) { // follow link
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if (_array[from].value >= 0) return CEDAR_NO_PATH;
|
||||
#endif
|
||||
size_t to = static_cast <size_t> (_array[from].base ()); to ^= key_[pos];
|
||||
if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
|
||||
++pos;
|
||||
from = to;
|
||||
}
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if (_array[from].value >= 0) // get value from leaf; only allow integer key
|
||||
return _array[from].value;
|
||||
#endif
|
||||
const node n = _array[_array[from].base () ^ 0];
|
||||
if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
|
||||
return n.base_;
|
||||
}
|
||||
#ifndef USE_FAST_LOAD
|
||||
void _restore_ninfo () {
|
||||
_realloc_array (_ninfo, _size);
|
||||
for (int to = 0; to < _size; ++to) {
|
||||
const int from = _array[to].check;
|
||||
if (from < 0) continue; // skip empty node
|
||||
const int base = _array[from].base ();
|
||||
if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
|
||||
_push_sibling (static_cast <size_t> (from), base, label,
|
||||
! from || _ninfo[from].child || _array[base ^ 0].check == from);
|
||||
}
|
||||
}
|
||||
void _restore_block () {
|
||||
_realloc_array (_block, _size >> 8);
|
||||
_bheadF = _bheadC = _bheadO = 0;
|
||||
for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
|
||||
block& b = _block[bi];
|
||||
b.num = 0;
|
||||
for (; e < (bi << 8) + 256; ++e)
|
||||
if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
|
||||
int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
|
||||
_push_block (bi, head_out, ! head_out && b.num);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
void _set_result (result_type* x, value_type r, size_t = 0, size_t = 0) const
|
||||
{ *x = r; }
|
||||
void _set_result (result_pair_type* x, value_type r, size_t l, size_t = 0) const
|
||||
{ x->value = r; x->length = l; }
|
||||
void _set_result (result_triple_type* x, value_type r, size_t l, size_t from) const
|
||||
{ x->value = r; x->length = l; x->id = from; }
|
||||
void _pop_block (const int bi, int& head_in, const bool last) {
|
||||
if (last) { // last one poped; Closed or Open
|
||||
head_in = 0;
|
||||
} else {
|
||||
const block& b = _block[bi];
|
||||
_block[b.prev].next = b.next;
|
||||
_block[b.next].prev = b.prev;
|
||||
if (bi == head_in) head_in = b.next;
|
||||
}
|
||||
}
|
||||
void _push_block (const int bi, int& head_out, const bool empty) {
|
||||
block& b = _block[bi];
|
||||
if (empty) { // the destination is empty
|
||||
head_out = b.prev = b.next = bi;
|
||||
} else { // use most recently pushed
|
||||
int& tail_out = _block[head_out].prev;
|
||||
b.prev = tail_out;
|
||||
b.next = head_out;
|
||||
head_out = tail_out = _block[tail_out].next = bi;
|
||||
}
|
||||
}
|
||||
int _add_block () {
|
||||
if (_size == _capacity) { // allocate memory if needed
|
||||
#ifdef USE_EXACT_FIT
|
||||
_capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
|
||||
#else
|
||||
_capacity += _capacity;
|
||||
#endif
|
||||
_realloc_array (_array, _capacity, _capacity);
|
||||
_realloc_array (_ninfo, _capacity, _size);
|
||||
_realloc_array (_block, _capacity >> 8, _size >> 8);
|
||||
}
|
||||
_block[_size >> 8].ehead = _size;
|
||||
_array[_size] = node (- (_size + 255), - (_size + 1));
|
||||
for (int i = _size + 1; i < _size + 255; ++i)
|
||||
_array[i] = node (-(i - 1), -(i + 1));
|
||||
_array[_size + 255] = node (- (_size + 254), -_size);
|
||||
_push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
|
||||
_size += 256;
|
||||
return (_size >> 8) - 1;
|
||||
}
|
||||
// transfer block from one start w/ head_in to one start w/ head_out
|
||||
// Move block `bi` from the list headed by `head_in` to the one headed by `head_out`.
void _transfer_block (const int bi, int& head_in, int& head_out) {
  const bool only_member = _block[bi].next == bi;
  _pop_block (bi, head_in, only_member);
  const bool dest_empty = head_out == 0 && _block[bi].num != 0;
  _push_block (bi, head_out, dest_empty);
}
|
||||
// pop empty node from block; never transfer the special block (bi = 0)
|
||||
int _pop_enode (const int base, const uchar label, const int from) {
|
||||
const int e = base < 0 ? _find_place () : base ^ label;
|
||||
const int bi = e >> 8;
|
||||
node& n = _array[e];
|
||||
block& b = _block[bi];
|
||||
if (--b.num == 0) {
|
||||
if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
|
||||
} else { // release empty node from empty ring
|
||||
_array[-n.base_].check = n.check;
|
||||
_array[-n.check].base_ = n.base_;
|
||||
if (e == b.ehead) b.ehead = -n.check; // set ehead
|
||||
if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
|
||||
_transfer_block (bi, _bheadO, _bheadC);
|
||||
}
|
||||
// initialize the released node
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
n.value = CEDAR_VALUE_LIMIT; n.check = from;
|
||||
if (base < 0) _array[from].base_ = - (e ^ label) - 1;
|
||||
#else
|
||||
if (label) n.base_ = -1; else n.value = value_type (0); n.check = from;
|
||||
if (base < 0) _array[from].base_ = e ^ label;
|
||||
#endif
|
||||
return e;
|
||||
}
|
||||
// push empty node into empty ring
|
||||
void _push_enode (const int e) {
|
||||
const int bi = e >> 8;
|
||||
block& b = _block[bi];
|
||||
if (++b.num == 1) { // Full to Closed
|
||||
b.ehead = e;
|
||||
_array[e] = node (-e, -e);
|
||||
if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
|
||||
} else {
|
||||
const int prev = b.ehead;
|
||||
const int next = -_array[prev].check;
|
||||
_array[e] = node (-prev, -next);
|
||||
_array[prev].check = _array[next].base_ = -e;
|
||||
if (b.num == 2 || b.trial == MAX_TRIAL) // Closed to Open
|
||||
if (bi) _transfer_block (bi, _bheadC, _bheadO);
|
||||
b.trial = 0;
|
||||
}
|
||||
if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
|
||||
_ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
|
||||
}
|
||||
// push label to from's child
|
||||
void _push_sibling (const size_t from, const int base, const uchar label, const bool flag = true) {
|
||||
uchar* c = &_ninfo[from].child;
|
||||
if (flag && (ORDERED ? label > *c : ! *c))
|
||||
do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
|
||||
_ninfo[base ^ label].sibling = *c, *c = label;
|
||||
}
|
||||
// pop label from from's child
|
||||
void _pop_sibling (const size_t from, const int base, const uchar label) {
|
||||
uchar* c = &_ninfo[from].child;
|
||||
while (*c != label) c = &_ninfo[base ^ *c].sibling;
|
||||
*c = _ninfo[base ^ label].sibling;
|
||||
}
|
||||
// check whether to replace branching w/ the newly added node
|
||||
bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
|
||||
do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
|
||||
while ((c_n = _ninfo[base_n ^ c_n].sibling));
|
||||
return true;
|
||||
}
|
||||
// enumerate (equal to or more than one) child nodes
|
||||
// Write the child labels of a node (starting from first child `c`) into the
// buffer at `p`, inserting `label` at its place when it is not -1.
// Returns a pointer to the LAST label written.
uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
  uchar* out = p; // next free position
  if (c == 0) { // 0: terminal
    *out++ = c;
    c = _ninfo[base ^ c].sibling;
  }
  if (ORDERED) {
    while (c != 0 && c < label) {
      *out++ = c;
      c = _ninfo[base ^ c].sibling;
    }
  }
  if (label != -1) *out++ = static_cast <uchar> (label);
  for (; c != 0; c = _ninfo[base ^ c].sibling)
    *out++ = c;
  return out - 1;
}
|
||||
// explore new block to settle down
|
||||
int _find_place () {
|
||||
if (_bheadC) return _block[_bheadC].ehead;
|
||||
if (_bheadO) return _block[_bheadO].ehead;
|
||||
return _add_block () << 8;
|
||||
}
|
||||
// Find an empty slot e such that, with base = e ^ *first, every label in
// [first, last] maps to an empty node. Only blocks on the Open list are
// searched; a new block is added when none fits. The caller recovers the
// base as (returned slot) ^ *first.
int _find_place (const uchar* const first, const uchar* const last) {
|
||||
if (int bi = _bheadO) {
|
||||
// the list is circular: bz is the last block to visit
const int bz = _block[_bheadO].prev;
|
||||
// number of labels to place
const short nc = static_cast <short> (last - first + 1);
|
||||
while (1) { // set candidate block
|
||||
block& b = _block[bi];
|
||||
// skip blocks with too few empty slots or that already rejected a set this small
if (b.num >= nc && nc < b.reject) // explore configuration
|
||||
// try every slot of the block's empty ring as the home of *first
for (int e = b.ehead;;) {
|
||||
const int base = e ^ *first;
|
||||
// the remaining labels must all land on empty (check < 0) slots
for (const uchar* p = first; _array[base ^ *++p].check < 0; )
|
||||
if (p == last) return b.ehead = e; // no conflict
|
||||
// advance along the ring; stop after a full turn
if ((e = -_array[e].check) == b.ehead) break;
|
||||
}
|
||||
// remember the failure so that later searches can skip this block
b.reject = nc;
|
||||
if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
|
||||
// read next before a possible transfer relinks the block
const int bi_ = b.next;
|
||||
if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
|
||||
if (bi == bz) break;
|
||||
bi = bi_;
|
||||
};
|
||||
}
|
||||
return _add_block () << 8;
|
||||
}
|
||||
// resolve conflict on base_n ^ label_n = base_p ^ label_p
|
||||
// Resolve a collision: the slot base_n ^ label_n wanted by from_n is already
// owned by another parent (from_p). One of the two sibling sets is moved to a
// conflict-free base chosen by _find_place (); cf (old, new) is invoked for
// every relocated node. from_n is updated when the new node's parent itself
// moves. Returns the slot that now holds the (from_n, label_n) child.
template <typename T>
|
||||
int _resolve (size_t& from_n, const int base_n, const uchar label_n, T& cf) {
|
||||
// examine siblings of conflicted nodes
|
||||
const int to_pn = base_n ^ label_n;
|
||||
const int from_p = _array[to_pn].check;
|
||||
const int base_p = _array[from_p].base ();
|
||||
const bool flag // whether to replace siblings of newly added
|
||||
= _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
|
||||
// labels of the set being moved; [first, last] is inclusive
uchar child[256];
|
||||
uchar* const first = &child[0];
|
||||
uchar* const last =
|
||||
flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
|
||||
: _set_child (first, base_p, _ninfo[from_p].child);
|
||||
// a single label can go anywhere; several need a jointly free configuration
const int base =
|
||||
(first == last ? _find_place () : _find_place (first, last)) ^ *first;
|
||||
// replace & modify empty list
|
||||
// from / base_ describe the parent whose children are relocated
const int from = flag ? static_cast <int> (from_n) : from_p;
|
||||
const int base_ = flag ? base_n : base_p;
|
||||
if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
_array[from].base_ = -base - 1; // new base
|
||||
#else
|
||||
_array[from].base_ = base; // new base
|
||||
#endif
|
||||
for (const uchar* p = first; p <= last; ++p) { // to_ => to
|
||||
const int to = _pop_enode (base, *p, from);
|
||||
const int to_ = base_ ^ *p;
|
||||
_ninfo[to].sibling = (p == last ? 0 : *(p + 1));
|
||||
if (flag && to_ == to_pn) continue; // skip newcomer (no child)
|
||||
cf (to_, to); // user-defined callback function to handle moved nodes
|
||||
node& n = _array[to];
|
||||
node& n_ = _array[to_];
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
if ((n.base_ = n_.base_) < 0 && *p) // copy base; bug fix
|
||||
#else
|
||||
if ((n.base_ = n_.base_) > 0 && *p) // copy base; bug fix
|
||||
#endif
|
||||
{
|
||||
uchar c = _ninfo[to].child = _ninfo[to_].child;
|
||||
do _array[n.base () ^ c].check = to; // adjust grand son's check
|
||||
while ((c = _ninfo[n.base () ^ c].sibling));
|
||||
}
|
||||
if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
|
||||
from_n = static_cast <size_t> (to); // bug fix
|
||||
if (! flag && to_ == to_pn) { // the address is immediately used
|
||||
_push_sibling (from_n, to_pn ^ label_n, label_n);
|
||||
_ninfo[to_].child = 0; // remember to reset child
|
||||
#ifdef USE_REDUCED_TRIE
|
||||
n_.value = CEDAR_VALUE_LIMIT;
|
||||
#else
|
||||
if (label_n) n_.base_ = -1; else n_.value = value_type (0);
|
||||
#endif
|
||||
n_.check = static_cast <int> (from_n);
|
||||
} else
|
||||
// the vacated slot goes back to the empty ring
_push_enode (to_);
|
||||
if (NUM_TRACKING_NODES) // keep the traversed node updated
|
||||
for (size_t j = 0; tracking_node[j] != 0; ++j)
|
||||
if (tracking_node[j] == static_cast <size_t> (to_))
|
||||
{ tracking_node[j] = static_cast <size_t> (to); break; }
|
||||
}
|
||||
return flag ? base ^ label_n : to_pn;
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,834 @@
|
|||
// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
|
||||
// $Id: cedarpp.h 1916 2017-07-12 07:30:56Z ynaga $
|
||||
// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
|
||||
#ifndef CEDAR_H
|
||||
#define CEDAR_H
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <climits>
|
||||
#include <cassert>
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "config.h"
|
||||
#endif
|
||||
|
||||
// Compile-time assertion usable without C++11 static_assert: declares a char
// array typedef named `msg` whose size is -1 (ill-formed) when `e` is false.
#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
|
||||
|
||||
namespace cedar {
|
||||
// typedefs
|
||||
// npos_t packs a position: the low 32 bits are masked with TAIL_OFFSET_MASK
// to get a node index, the high 32 bits (value >> 32) carry a tail offset.
#if LONG_BIT == 64
|
||||
typedef unsigned long npos_t; // possibly compatible with size_t
|
||||
#else
|
||||
typedef unsigned long long npos_t;
|
||||
#endif
|
||||
typedef unsigned char uchar;
|
||||
// low 32 bits set; and-ing an npos_t with it drops the tail offset
static const npos_t TAIL_OFFSET_MASK = static_cast <npos_t> (0xffffffff);
|
||||
// high 32 bits set
static const npos_t NODE_INDEX_MASK = static_cast <npos_t> (0xffffffff) << 32;
|
||||
// Default sentinels for the NO_VALUE (N1) / NO_PATH (N2) template parameters of da.
template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
|
||||
// float specialisation; presumably IEEE-754 single-precision NaN bit patterns -- confirm
template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
|
||||
static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
|
||||
// dynamic double array
|
||||
template <typename value_type,
|
||||
const int NO_VALUE = NaN <value_type>::N1,
|
||||
const int NO_PATH = NaN <value_type>::N2,
|
||||
const bool ORDERED = true,
|
||||
const int MAX_TRIAL = 1,
|
||||
const size_t NUM_TRACKING_NODES = 0>
|
||||
class da {
|
||||
public:
|
||||
// Sentinel results: CEDAR_NO_VALUE / CEDAR_NO_PATH mirror the NO_VALUE /
// NO_PATH template parameters of this class.
enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH };
|
||||
// result record carrying only the stored value
typedef value_type result_type;
|
||||
struct result_pair_type {
|
||||
value_type value;
|
||||
size_t length; // prefix length
|
||||
};
|
||||
struct result_triple_type { // for predict ()
|
||||
value_type value;
|
||||
size_t length; // suffix length
|
||||
npos_t id; // node id of value
|
||||
};
|
||||
struct node {
|
||||
union { int base; value_type value; }; // negative means prev empty index
|
||||
int check; // negative means next empty index
|
||||
node (const int base_ = 0, const int check_ = 0)
|
||||
: base (base_), check (check_) {}
|
||||
};
|
||||
struct ninfo { // x1.5 update speed; +.25 % memory (8n -> 10n)
|
||||
uchar sibling; // right sibling (= 0 if not exist)
|
||||
uchar child; // first child
|
||||
ninfo () : sibling (0), child (0) {}
|
||||
};
|
||||
// Bookkeeping for one block of 256 slots. Blocks are chained through
// prev/next into circular lists; a fresh block starts with all 256 slots empty.
struct block {
  int   prev;   // prev block; 3 bytes
  int   next;   // next block; 3 bytes
  short num;    // # empty elements; 0 - 256
  short reject; // minimum # branching failed to locate; soft limit
  int   trial;  // # trial
  int   ehead;  // first empty item
  block ()
    : prev (0), next (0),
      num (256), reject (257),
      trial (0), ehead (0)
  {}
};
|
||||
// Build an empty trie: zero every member, check at compile time that
// value_type fits in an int, then allocate the initial storage.
da ()
  : tracking_node (),
    _array (0), _tail (0), _tail0 (0), _ninfo (0), _block (0),
    _bheadF (0), _bheadC (0), _bheadO (0),
    _capacity (0), _size (0), _quota (0), _quota0 (0),
    _no_delete (false), _reject ()
{
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
  STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
                value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index_to_trie
                );
#pragma GCC diagnostic warning "-Wunused-local-typedefs"
  _initialize ();
}
|
||||
// Release all storage without re-initialising (clear () with reuse = false).
~da () { clear (false); }
|
||||
size_t capacity () const { return static_cast <size_t> (_capacity); }
|
||||
size_t size () const { return static_cast <size_t> (_size); }
|
||||
size_t length () const { return static_cast <size_t> (*_length); }
|
||||
size_t total_size () const { return sizeof (node) * _size; }
|
||||
size_t unit_size () const { return sizeof (node); }
|
||||
size_t nonzero_size () const {
|
||||
size_t i = 0;
|
||||
for (int to = 0; to < _size; ++to)
|
||||
if (_array[to].check >= 0) ++i;
|
||||
return i;
|
||||
}
|
||||
size_t nonzero_length () const {
|
||||
size_t i (0), j (0);
|
||||
for (int to = 0; to < _size; ++to) {
|
||||
const node& n = _array[to];
|
||||
if (n.check >= 0 && _array[n.check].base != to && n.base < 0)
|
||||
{ ++j; for (const char* p = &_tail[-n.base]; *p; ++p) ++i; }
|
||||
}
|
||||
return i + j * (1 + sizeof (value_type));
|
||||
}
|
||||
size_t num_keys () const {
|
||||
size_t i = 0;
|
||||
for (int to = 0; to < _size; ++to) {
|
||||
const node& n = _array[to];
|
||||
if (n.check >= 0 && (_array[n.check].base == to || n.base < 0)) ++i;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
// interface
|
||||
template <typename T>
|
||||
T exactMatchSearch (const char* key) const
|
||||
{ return exactMatchSearch <T> (key, std::strlen (key)); }
|
||||
template <typename T>
|
||||
T exactMatchSearch (const char* key, size_t len, npos_t from = 0) const {
|
||||
union { int i; value_type x; } b;
|
||||
size_t pos = 0;
|
||||
b.i = _find (key, from, pos, len);
|
||||
if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
|
||||
T result;
|
||||
_set_result (&result, b.x, len, from);
|
||||
return result;
|
||||
}
|
||||
template <typename T>
|
||||
size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
|
||||
{ return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
|
||||
template <typename T>
|
||||
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) const {
|
||||
size_t num = 0;
|
||||
for (size_t pos = 0; pos < len; ) {
|
||||
union { int i; value_type x; } b;
|
||||
b.i = _find (key, from, pos, pos + 1);
|
||||
if (b.i == CEDAR_NO_VALUE) continue;
|
||||
if (b.i == CEDAR_NO_PATH) return num;
|
||||
if (num < result_len) _set_result (&result[num], b.x, pos, from);
|
||||
++num;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
// predict key from double array
|
||||
template <typename T>
|
||||
size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
|
||||
{ return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
|
||||
template <typename T>
|
||||
size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) {
|
||||
size_t num (0), pos (0), p (0);
|
||||
if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
|
||||
union { int i; value_type x; } b;
|
||||
const npos_t root = from;
|
||||
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
|
||||
if (num < result_len)
|
||||
_set_result (&result[num], b.x, p, from);
|
||||
++num;
|
||||
}
|
||||
return num;
|
||||
}
|
||||
void suffix (char* key, size_t len, npos_t to) const {
|
||||
key[len] = '\0';
|
||||
if (const int offset = static_cast <int> (to >> 32)) {
|
||||
to &= TAIL_OFFSET_MASK;
|
||||
size_t len_tail = std::strlen (&_tail[-_array[to].base]);
|
||||
if (len > len_tail) len -= len_tail; else len_tail = len, len = 0;
|
||||
std::memcpy (&key[len], &_tail[static_cast <size_t> (offset) - len_tail], len_tail);
|
||||
}
|
||||
while (len--) {
|
||||
const int from = _array[to].check;
|
||||
key[len] = static_cast <char> (_array[from].base ^ static_cast <int> (to));
|
||||
to = static_cast <npos_t> (from);
|
||||
}
|
||||
}
|
||||
// Resume a search with a NUL-terminated key; see the overload below.
value_type traverse (const char* key, npos_t& from, size_t& pos) const {
  const size_t len = std::strlen (key);
  return traverse (key, from, pos, len);
}
// Continue matching key[pos..len) from `from`, leaving both updated so that
// the search can be resumed. The raw result of _find () is returned
// reinterpreted as value_type.
value_type traverse (const char* key, npos_t& from, size_t& pos, size_t len) const {
  union { int i; value_type x; } found;
  found.i = _find (key, from, pos, len);
  return found.x;
}
|
||||
// dummy empty function: relocation callback that ignores the moved node ids
struct empty_callback {
  void operator () (const int, const int) {}
};
|
||||
// Insert a NUL-terminated key, or find it when present, adding value_type (0)
// to its value. Returns a reference to the stored value.
value_type& update (const char* key) {
  return update (key, std::strlen (key));
}
// Insert / update key[0..len) from the root, adding `val` to its value.
value_type& update (const char* key, size_t len, value_type val = value_type (0)) {
  size_t pos = 0;
  npos_t from = 0;
  return update (key, from, pos, len, val);
}
// Insert / update starting at (`from`, `pos`) without a relocation callback.
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val = value_type (0)) {
  empty_callback ignore_moves;
  return update (key, from, pos, len, val, ignore_moves);
}
|
||||
// Insert key[pos..len) below `from` (or locate it when present) and add `val`
// to its value; returns a reference to the stored value. `from` and `pos` are
// advanced as the key is consumed; a position on the tail is encoded in the
// high 32 bits of `from`. cf (old, new) is called for every node relocated
// while edges are created. A zero-length key at the root is a fatal error.
template <typename T>
|
||||
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val, T& cf) {
|
||||
if (! len && ! from)
|
||||
_err (__FILE__, __LINE__, "failed to insert zero-length key\n");
|
||||
#ifndef USE_FAST_LOAD
|
||||
// rebuild the update-only tables when they are missing
if (! _ninfo || ! _block) restore ();
|
||||
#endif
|
||||
// non-zero when `from` already points into the tail
npos_t offset = from >> 32;
|
||||
if (! offset) { // node on trie
|
||||
// descend while the node has a non-negative base; a negative base is -(tail offset)
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
|
||||
_array[from].base >= 0; ++pos) {
|
||||
if (pos == len) // could be reduced
|
||||
{ const int to = _follow (from, 0, cf); return _array[to].value += val; }
|
||||
from = static_cast <size_t> (_follow (from, key_[pos], cf));
|
||||
}
|
||||
offset = static_cast <npos_t> (-_array[from].base);
|
||||
}
|
||||
if (offset >= sizeof (int)) { // go to _tail
|
||||
const size_t pos_orig = pos;
|
||||
// biased so that tail[pos] lines up with key[pos]
char* const tail = &_tail[offset] - pos;
|
||||
while (pos < len && key[pos] == tail[pos]) ++pos;
|
||||
//
|
||||
if (pos == len && tail[pos] == '\0') { // found exact key
|
||||
if (const npos_t moved = pos - pos_orig) { // search end on tail
|
||||
from &= TAIL_OFFSET_MASK;
|
||||
from |= (offset + moved) << 32;
|
||||
}
|
||||
// the value is stored right after the string terminator
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
|
||||
}
|
||||
// otherwise, insert the common prefix in tail if any
|
||||
if (from >> 32) {
|
||||
from &= TAIL_OFFSET_MASK; // reset to update tail offset
|
||||
// turn the tail characters already consumed into trie edges
for (npos_t offset_ = static_cast <npos_t> (-_array[from].base);
|
||||
offset_ < offset; ) {
|
||||
from = static_cast <size_t>
|
||||
(_follow (from, static_cast <uchar> (_tail[offset_]), cf));
|
||||
++offset_;
|
||||
// this shows intricacy in debugging updatable double array trie
|
||||
if (NUM_TRACKING_NODES) // keep the traversed node (on tail) updated
|
||||
for (size_t j = 0; tracking_node[j] != 0; ++j)
|
||||
if (tracking_node[j] >> 32 == offset_)
|
||||
tracking_node[j] = static_cast <npos_t> (from);
|
||||
}
|
||||
}
|
||||
// turn the prefix shared by the key and the tail into trie edges
for (size_t pos_ = pos_orig; pos_ < pos; ++pos_)
|
||||
from = static_cast <size_t>
|
||||
(_follow (from, static_cast <uchar> (key[pos_]), cf));
|
||||
npos_t moved = pos - pos_orig;
|
||||
if (tail[pos]) { // remember to move offset to existing tail
|
||||
const int to_ = _follow (from, static_cast <uchar> (tail[pos]), cf);
|
||||
_array[to_].base = - static_cast <int> (offset + ++moved);
|
||||
moved -= 1 + sizeof (value_type); // keep record
|
||||
}
|
||||
moved += offset;
|
||||
// record the tail offsets freed by the split in _tail0 (count kept in *_length0)
for (npos_t i = offset; i <= moved; i += 1 + sizeof (value_type)) {
|
||||
if (_quota0 == ++*_length0) {
|
||||
#ifdef USE_EXACT_FIT
|
||||
_quota0 += *_length0 >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length0;
|
||||
#else
|
||||
_quota0 += _quota0;
|
||||
#endif
|
||||
_realloc_array (_tail0, _quota0, *_length0);
|
||||
}
|
||||
_tail0[*_length0] = static_cast <int> (i);
|
||||
}
|
||||
if (pos == len || tail[pos] == '\0') {
|
||||
const int to = _follow (from, 0, cf); // could be reduced
|
||||
if (pos == len) return _array[to].value += val; // set value on trie
|
||||
// the old tail ended here: carry its value over to the trie
_array[to].value += *reinterpret_cast <value_type*> (&tail[pos + 1]);
|
||||
}
|
||||
from = static_cast <size_t> (_follow (from, static_cast <uchar> (key[pos]), cf));
|
||||
++pos;
|
||||
}
|
||||
// bytes for the remaining suffix, its terminator and the value
const int needed = static_cast <int> (len - pos + 1 + sizeof (value_type));
|
||||
if (pos == len && *_length0) { // reuse
|
||||
// an empty suffix fits into a recorded free tail slot
const int offset0 = _tail0[*_length0];
|
||||
_tail[offset0] = '\0';
|
||||
_array[from].base = -offset0;
|
||||
--*_length0;
|
||||
return *reinterpret_cast <value_type*> (&_tail[offset0 + 1]) = val;
|
||||
}
|
||||
// grow the tail storage when the new suffix does not fit
if (_quota < *_length + needed) {
|
||||
#ifdef USE_EXACT_FIT
|
||||
_quota += needed > *_length || needed > MAX_ALLOC_SIZE ? needed :
|
||||
(*_length >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length);
|
||||
#else
|
||||
_quota += _quota >= needed ? _quota : needed;
|
||||
#endif
|
||||
_realloc_array (_tail, _quota, *_length);
|
||||
}
|
||||
// append the suffix at the current end of the tail
_array[from].base = -*_length;
|
||||
const size_t pos_orig = pos;
|
||||
char* const tail = &_tail[*_length] - pos;
|
||||
if (pos < len) {
|
||||
do tail[pos] = key[pos]; while (++pos < len);
|
||||
from |= (static_cast <npos_t> (*_length) + (len - pos_orig)) << 32;
|
||||
}
|
||||
*_length += needed;
|
||||
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
|
||||
}
|
||||
// easy-going erase () without compression
|
||||
int erase (const char* key) { return erase (key, std::strlen (key)); }
|
||||
int erase (const char* key, size_t len, npos_t from = 0) {
|
||||
size_t pos = 0;
|
||||
const int i = _find (key, from, pos, len);
|
||||
if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
|
||||
if (from >> 32) from &= TAIL_OFFSET_MASK; // leave tail as is
|
||||
bool flag = _array[from].base < 0; // have sibling
|
||||
int e = flag ? static_cast <int> (from) : _array[from].base ^ 0;
|
||||
from = _array[e].check;
|
||||
do {
|
||||
const node& n = _array[from];
|
||||
flag = _ninfo[n.base ^ _ninfo[from].child].sibling;
|
||||
if (flag) _pop_sibling (from, n.base, static_cast <uchar> (n.base ^ e));
|
||||
_push_enode (e);
|
||||
e = static_cast <int> (from);
|
||||
from = static_cast <size_t> (_array[from].check);
|
||||
} while (! flag);
|
||||
return 0;
|
||||
}
|
||||
int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
|
||||
for (size_t i = 0; i < num; ++i)
|
||||
update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
|
||||
return 0;
|
||||
}
|
||||
template <typename T>
|
||||
void dump (T* result, const size_t result_len) {
|
||||
union { int i; value_type x; } b;
|
||||
size_t num (0), p (0);
|
||||
npos_t from = 0;
|
||||
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
|
||||
if (num < result_len)
|
||||
_set_result (&result[num++], b.x, p, from);
|
||||
else
|
||||
_err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
|
||||
}
|
||||
// Compact the tail storage: copy every string + value still referenced from
// the trie into a fresh buffer, repoint the nodes at the copies, and drop the
// list of reusable tail slots.
void shrink_tail () {
|
||||
// the first int of the new buffer doubles as its running length
union { char* tail; int* length; } t;
|
||||
// current length minus one (terminator + value) record per reusable slot
const size_t length_
|
||||
= static_cast <size_t> (*_length)
|
||||
- static_cast <size_t> (*_length0) * (1 + sizeof (value_type));
|
||||
t.tail = static_cast <char*> (std::malloc (length_));
|
||||
if (! t.tail) _err (__FILE__, __LINE__, "memory allocation failed\n");
|
||||
// payload starts right after the length field
*t.length = static_cast <int> (sizeof (int));
|
||||
for (int to = 0; to < _size; ++to) {
|
||||
node& n = _array[to];
|
||||
// occupied node that is not a label-0 child and points into the tail
if (n.check >= 0 && _array[n.check].base != to && n.base < 0) {
|
||||
char* const tail (&t.tail[*t.length]), * const tail_ (&_tail[-n.base]);
|
||||
n.base = - *t.length;
|
||||
// copy the string including its terminator; i ends one past the terminator
int i = 0; do tail[i] = tail_[i]; while (tail[i++]);
|
||||
*reinterpret_cast <value_type*> (&tail[i])
|
||||
= *reinterpret_cast <const value_type*> (&tail_[i]);
|
||||
*t.length += i + static_cast <int> (sizeof (value_type));
|
||||
}
|
||||
}
|
||||
std::free (_tail);
|
||||
_tail = t.tail;
|
||||
// trim the buffer to the bytes actually used
_realloc_array (_tail, *_length, *_length);
|
||||
_quota = *_length;
|
||||
// reset the reusable-slot list to a single zeroed entry
_realloc_array (_tail0, 1);
|
||||
_quota0 = 1;
|
||||
}
|
||||
int save (const char* fn, const char* mode, const bool shrink) {
|
||||
if (shrink) shrink_tail ();
|
||||
return save (fn, mode);
|
||||
}
|
||||
// Write the trie to `fn`: the tail storage followed by the node array. With
// USE_FAST_LOAD the block heads, node infos and block table go to "<fn>.sbl".
// Returns 0 on success and -1 when a file cannot be opened, a write is short,
// or closing (flushing) a file fails.
int save (const char* fn, const char* mode = "wb") const {
  // _test ();
  FILE* fp = std::fopen (fn, mode);
  if (! fp) return -1;
  const size_t tail_len = static_cast <size_t> (*_length);
  const size_t num_node = static_cast <size_t> (_size);
  bool ok = std::fwrite (_tail, sizeof (char), tail_len, fp) == tail_len
         && std::fwrite (_array, sizeof (node), num_node, fp) == num_node;
  if (std::fclose (fp) != 0) ok = false; // buffered data may fail to flush
  if (! ok) return -1;
#ifdef USE_FAST_LOAD
  const char* const info
    = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
  fp = std::fopen (info, mode);
  delete [] info; // resolve memory leak
  if (! fp) return -1;
  const size_t num_block = static_cast <size_t> (_size >> 8);
  ok = std::fwrite (&_bheadF, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadC, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadO, sizeof (int), 1, fp) == 1
    && std::fwrite (_ninfo, sizeof (ninfo), num_node, fp) == num_node
    && std::fwrite (_block, sizeof (block), num_block, fp) == num_block;
  if (std::fclose (fp) != 0) ok = false;
  if (! ok) return -1;
#endif
  return 0;
}
|
||||
int open (const char* fn, const char* mode = "rb",
|
||||
const size_t offset = 0, size_t size_ = 0) {
|
||||
FILE* fp = std::fopen (fn, mode);
|
||||
if (! fp) return -1;
|
||||
// get size
|
||||
if (! size_) {
|
||||
if (std::fseek (fp, 0, SEEK_END) != 0) return -1;
|
||||
size_ = static_cast <size_t> (std::ftell (fp));
|
||||
if (std::fseek (fp, 0, SEEK_SET) != 0) return -1;
|
||||
}
|
||||
if (size_ <= offset) return -1;
|
||||
if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
|
||||
int len = 0;
|
||||
if (std::fread (&len, sizeof (int), 1, fp) != 1) return -1;
|
||||
const size_t length_ = static_cast <size_t> (len);
|
||||
if (size_ <= offset + length_) return -1;
|
||||
// set array
|
||||
clear (false);
|
||||
size_ = (size_ - offset - length_) / sizeof (node);
|
||||
_array = static_cast <node*> (std::malloc (sizeof (node) * size_));
|
||||
_tail = static_cast <char*> (std::malloc (length_));
|
||||
_tail0 = static_cast <int*> (std::malloc (sizeof (int)));
|
||||
#ifdef USE_FAST_LOAD
|
||||
_ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
|
||||
_block = static_cast <block*> (std::malloc (sizeof (block) * size_));
|
||||
if (! _array || ! _tail || ! _tail0 || ! _ninfo || ! _block)
|
||||
#else
|
||||
if (! _array || ! _tail || ! _tail0)
|
||||
#endif
|
||||
_err (__FILE__, __LINE__, "memory allocation failed\n");
|
||||
if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
|
||||
if (length_ != std::fread (_tail, sizeof (char), length_, fp) ||
|
||||
size_ != std::fread (_array, sizeof (node), size_, fp))
|
||||
return -1;
|
||||
std::fclose (fp);
|
||||
_size = static_cast <int> (size_);
|
||||
*_length0 = 0;
|
||||
#ifdef USE_FAST_LOAD
|
||||
const char* const info
|
||||
= std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
|
||||
fp = std::fopen (info, mode);
|
||||
delete [] info; // resolve memory leak
|
||||
if (! fp) return -1;
|
||||
std::fread (&_bheadF, sizeof (int), 1, fp);
|
||||
std::fread (&_bheadC, sizeof (int), 1, fp);
|
||||
std::fread (&_bheadO, sizeof (int), 1, fp);
|
||||
if (size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
|
||||
size_ >> 8 != std::fread (_block, sizeof (block), size_ >> 8, fp))
|
||||
return -1;
|
||||
std::fclose (fp);
|
||||
_capacity = _size;
|
||||
_quota = *_length;
|
||||
_quota0 = 1;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
#ifndef USE_FAST_LOAD
|
||||
void restore () { // restore information to update
|
||||
if (! _block) _restore_block ();
|
||||
if (! _ninfo) _restore_ninfo ();
|
||||
_capacity = _size;
|
||||
_quota = *_length;
|
||||
_quota0 = 1;
|
||||
}
|
||||
#endif
|
||||
// ad-hoc: adopt an externally owned buffer laid out as tail storage followed
// by the node array. `size_` is given in units of unit_size (); the buffer is
// flagged so that clear () will not free it.
void set_array (void* p, size_t size_ = 0) {
  clear (false);
  const size_t unit = unit_size ();
  if (size_ != 0) {
    // the first int of the buffer is the tail length in bytes
    const int tail_bytes = *static_cast <int*> (p);
    size_ = size_ * unit - static_cast <size_t> (tail_bytes);
  }
  _tail = static_cast <char*> (p);
  _array = reinterpret_cast <node*> (_tail + *_length);
  const size_t whole = size_ / unit;
  const size_t partial = size_ % unit ? 1 : 0; // round up
  _size = static_cast <int> (whole + partial);
  _no_delete = true;
}
|
||||
// Read-only pointer to the node array.
const void* array () const { return _array; }
|
||||
void clear (const bool reuse = true) {
|
||||
if (_no_delete) _array = 0, _tail = 0;
|
||||
if (_array) std::free (_array);
|
||||
if (_tail) std::free (_tail);
|
||||
if (_tail0) std::free (_tail0);
|
||||
if (_ninfo) std::free (_ninfo);
|
||||
if (_block) std::free (_block);
|
||||
_array = 0; _tail = 0; _tail0 = 0; _ninfo = 0; _block = 0;
|
||||
_bheadF = _bheadC = _bheadO = _capacity = _size = _quota = _quota0 = 0;
|
||||
if (reuse) _initialize ();
|
||||
_no_delete = false;
|
||||
}
|
||||
// return the first child for a tree rooted by a given node
|
||||
// Descend from `from` along first children to the first key of the subtree.
// `from` and `len` are updated to the position and depth reached; returns the
// key's value as an int, or CEDAR_NO_PATH when the root has no entry.
int begin (npos_t& from, size_t& len) {
#ifndef USE_FAST_LOAD
  if (! _ninfo) _restore_ninfo ();
#endif
  int base;
  if (from >> 32) base = - static_cast <int> (from >> 32); // already on tail
  else            base = _array[from].base;
  if (base >= 0) { // on trie
    uchar c = _ninfo[from].child;
    if (from == 0) { // bug fix
      c = _ninfo[base ^ c].sibling;
      if (c == 0) return CEDAR_NO_PATH; // no entry
    }
    while (c != 0 && base >= 0) {
      from = static_cast <size_t> (base) ^ c;
      base = _array[from].base;
      c = _ninfo[from].child;
      ++len;
    }
    if (base >= 0) return _array[base ^ c].base; // value held on the trie
  }
  // the rest of the key sits in the tail; its value follows the terminator
  const size_t len_ = std::strlen (&_tail[-base]);
  from &= TAIL_OFFSET_MASK;
  from |= static_cast <npos_t> (static_cast <size_t> (-base) + len_) << 32;
  len += len_;
  return *reinterpret_cast <int*> (&_tail[-base] + len_ + 1);
}
|
||||
// return the next child if any
|
||||
// Advance from the current position to the next one below `root`.
// Climbs towards `root` until a node with a sibling is found, then descends
// into that sibling with begin (). Returns CEDAR_NO_PATH when nothing is left.
int next (npos_t& from, size_t& len, const npos_t root = 0) {
  uchar sib = 0;
  const int offset = static_cast <int> (from >> 32);
  if (offset != 0) { // currently inside the tail
    if (root >> 32) return CEDAR_NO_PATH;
    from &= TAIL_OFFSET_MASK;
    len -= static_cast <size_t> (offset - (-_array[from].base));
  } else {
    sib = _ninfo[_array[from].base ^ 0].sibling;
  }
  while (! sib && from != root) { // climb until some sibling exists
    sib  = _ninfo[from].sibling;
    from = static_cast <size_t> (_array[from].check);
    --len;
  }
  if (! sib) return CEDAR_NO_PATH;
  from = static_cast <size_t> (_array[from].base) ^ sib;
  ++len;
  return begin (from, len);
}
|
||||
npos_t tracking_node[NUM_TRACKING_NODES + 1];
|
||||
private:
|
||||
// currently disabled; implement these if you need
|
||||
da (const da&);
|
||||
da& operator= (const da&);
|
||||
node* _array;
|
||||
union { char* _tail; int* _length; };
|
||||
union { int* _tail0; int* _length0; };
|
||||
ninfo* _ninfo;
|
||||
block* _block;
|
||||
int _bheadF; // first block of Full; 0
|
||||
int _bheadC; // first block of Closed; 0 if no Closed
|
||||
int _bheadO; // first block of Open; 0 if no Open
|
||||
int _capacity;
|
||||
int _size;
|
||||
int _quota;
|
||||
int _quota0;
|
||||
int _no_delete;
|
||||
short _reject[257];
|
||||
//
|
||||
// Write "cedar: <fn> [<ln>]: <msg>" to stderr and terminate with exit status 1.
static void _err (const char* fn, const int ln, const char* msg) {
  std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
  std::exit (1);
}
|
||||
template <typename T>
|
||||
static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
|
||||
void* tmp = std::realloc (p, sizeof (T) * static_cast <size_t> (size_n));
|
||||
if (! tmp)
|
||||
std::free (p), _err (__FILE__, __LINE__, "memory reallocation failed\n");
|
||||
p = static_cast <T*> (tmp);
|
||||
static const T T0 = T ();
|
||||
for (T* q (p + size_p), * const r (p + size_n); q != r; ++q) *q = T0;
|
||||
}
|
||||
void _initialize () { // initilize the first special block
|
||||
_realloc_array (_array, 256, 256);
|
||||
_realloc_array (_tail, sizeof (int));
|
||||
_realloc_array (_tail0, 1);
|
||||
_realloc_array (_ninfo, 256);
|
||||
_realloc_array (_block, 1);
|
||||
_array[0] = node (0, -1);
|
||||
for (int i = 1; i < 256; ++i)
|
||||
_array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
|
||||
_capacity = _size = 256;
|
||||
_block[0].ehead = 1; // bug fix for erase
|
||||
_quota = *_length = static_cast <int> (sizeof (int));
|
||||
_quota0 = 1;
|
||||
for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
|
||||
for (short i = 0; i <= 256; ++i) _reject[i] = i + 1;
|
||||
}
|
||||
// follow/create edge
|
||||
template <typename T>
|
||||
int _follow (npos_t& from, const uchar& label, T& cf) {
|
||||
int to = 0;
|
||||
const int base = _array[from].base;
|
||||
if (base < 0 || _array[to = base ^ label].check < 0) {
|
||||
to = _pop_enode (base, label, static_cast <int> (from));
|
||||
_push_sibling (from, to ^ label, label, base >= 0);
|
||||
} else if (_array[to].check != static_cast <int> (from))
|
||||
to = _resolve (from, base, label, cf);
|
||||
return to;
|
||||
}
|
||||
// find key from double array
|
||||
int _find (const char* key, npos_t& from, size_t& pos, const size_t len) const {
|
||||
npos_t offset = from >> 32;
|
||||
if (! offset) { // node on trie
|
||||
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
|
||||
_array[from].base >= 0; ) {
|
||||
if (pos == len) {
|
||||
const node& n = _array[_array[from].base ^ 0];
|
||||
if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
|
||||
return n.base;
|
||||
}
|
||||
size_t to = static_cast <size_t> (_array[from].base); to ^= key_[pos];
|
||||
if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
|
||||
++pos;
|
||||
from = to;
|
||||
}
|
||||
offset = static_cast <npos_t> (-_array[from].base);
|
||||
}
|
||||
// switch to _tail to match suffix
|
||||
const size_t pos_orig = pos; // start position in reading _tail
|
||||
const char* const tail = &_tail[offset] - pos;
|
||||
if (pos < len) {
|
||||
do if (key[pos] != tail[pos]) break; while (++pos < len);
|
||||
if (const npos_t moved = pos - pos_orig) {
|
||||
from &= TAIL_OFFSET_MASK;
|
||||
from |= (offset + moved) << 32;
|
||||
}
|
||||
if (pos < len) return CEDAR_NO_PATH; // input > tail, input != tail
|
||||
}
|
||||
if (tail[pos]) return CEDAR_NO_VALUE; // input < tail
|
||||
return *reinterpret_cast <const int*> (&tail[len + 1]);
|
||||
}
|
||||
#ifndef USE_FAST_LOAD
|
||||
void _restore_ninfo () {
|
||||
_realloc_array (_ninfo, _size);
|
||||
for (int to = 0; to < _size; ++to) {
|
||||
const int from = _array[to].check;
|
||||
if (from < 0) continue; // skip empty node
|
||||
const int base = _array[from].base;
|
||||
if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
|
||||
_push_sibling (static_cast <size_t> (from), base, label,
|
||||
! from || _ninfo[from].child || _array[base ^ 0].check == from);
|
||||
}
|
||||
}
|
||||
void _restore_block () {
|
||||
_realloc_array (_block, _size >> 8);
|
||||
_bheadF = _bheadC = _bheadO = 0;
|
||||
for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
|
||||
block& b = _block[bi];
|
||||
b.num = 0;
|
||||
for (; e < (bi << 8) + 256; ++e)
|
||||
if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
|
||||
int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
|
||||
_push_block (bi, head_out, ! head_out && b.num);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
void _set_result (result_type* x, value_type r, size_t = 0, npos_t = 0) const
|
||||
{ *x = r; }
|
||||
void _set_result (result_pair_type* x, value_type r, size_t l, npos_t = 0) const
|
||||
{ x->value = r; x->length = l; }
|
||||
void _set_result (result_triple_type* x, value_type r, size_t l, npos_t from) const
|
||||
{ x->value = r; x->length = l; x->id = from; }
|
||||
void _pop_block (const int bi, int& head_in, const bool last) {
|
||||
if (last) { // last one poped; Closed or Open
|
||||
head_in = 0;
|
||||
} else {
|
||||
const block& b = _block[bi];
|
||||
_block[b.prev].next = b.next;
|
||||
_block[b.next].prev = b.prev;
|
||||
if (bi == head_in) head_in = b.next;
|
||||
}
|
||||
}
|
||||
// Link block `bi` into the circular list whose head is `head_out`; the new
// block becomes the head.
//   empty : true when the destination list currently has no blocks.
void _push_block (const int bi, int& head_out, const bool empty) {
  block& b = _block[bi];
  if (empty) { // the destination is empty: bi forms a one-element ring
    b.next   = bi;
    b.prev   = bi;
    head_out = bi;
  } else { // use most recently pushed
    int& tail_out = _block[head_out].prev;
    b.prev = tail_out;
    b.next = head_out;
    _block[tail_out].next = bi;
    tail_out = bi;
    head_out = bi;
  }
}
|
||||
int _add_block () {
|
||||
if (_size == _capacity) { // allocate memory if needed
|
||||
#ifdef USE_EXACT_FIT
|
||||
_capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
|
||||
#else
|
||||
_capacity += _capacity;
|
||||
#endif
|
||||
_realloc_array (_array, _capacity, _capacity);
|
||||
_realloc_array (_ninfo, _capacity, _size);
|
||||
_realloc_array (_block, _capacity >> 8, _size >> 8);
|
||||
}
|
||||
_block[_size >> 8].ehead = _size;
|
||||
_array[_size] = node (- (_size + 255), - (_size + 1));
|
||||
for (int i = _size + 1; i < _size + 255; ++i)
|
||||
_array[i] = node (-(i - 1), -(i + 1));
|
||||
_array[_size + 255] = node (- (_size + 254), -_size);
|
||||
_push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
|
||||
_size += 256;
|
||||
return (_size >> 8) - 1;
|
||||
}
|
||||
// transfer block from one start w/ head_in to one start w/ head_out
|
||||
// Move block `bi` from the list headed by `head_in` to the one headed by
// `head_out`.
void _transfer_block (const int bi, int& head_in, int& head_out) {
  const bool only_one = (bi == _block[bi].next);
  _pop_block (bi, head_in, only_one);
  // evaluated after the pop, as in the original call sequence
  const bool dest_empty = ! head_out && _block[bi].num;
  _push_block (bi, head_out, dest_empty);
}
|
||||
// pop empty node from block; never transfer the special block (bi = 0)
|
||||
int _pop_enode (const int base, const uchar label, const int from) {
|
||||
const int e = base < 0 ? _find_place () : base ^ label;
|
||||
const int bi = e >> 8;
|
||||
node& n = _array[e];
|
||||
block& b = _block[bi];
|
||||
if (--b.num == 0) {
|
||||
if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
|
||||
} else { // release empty node from empty ring
|
||||
_array[-n.base].check = n.check;
|
||||
_array[-n.check].base = n.base;
|
||||
if (e == b.ehead) b.ehead = -n.check; // set ehead
|
||||
if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
|
||||
_transfer_block (bi, _bheadO, _bheadC);
|
||||
}
|
||||
// initialize the released node
|
||||
if (label) n.base = -1; else n.value = value_type (0);
|
||||
n.check = from;
|
||||
if (base < 0) _array[from].base = e ^ label;
|
||||
return e;
|
||||
}
|
||||
// push empty node into empty ring
|
||||
void _push_enode (const int e) {
|
||||
const int bi = e >> 8;
|
||||
block& b = _block[bi];
|
||||
if (++b.num == 1) { // Full to Closed
|
||||
b.ehead = e;
|
||||
_array[e] = node (-e, -e);
|
||||
if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
|
||||
} else {
|
||||
const int prev = b.ehead;
|
||||
const int next = -_array[prev].check;
|
||||
_array[e] = node (-prev, -next);
|
||||
_array[prev].check = _array[next].base = -e;
|
||||
if (b.num == 2 || b.trial == MAX_TRIAL) { // Closed to Open
|
||||
if (bi) _transfer_block (bi, _bheadC, _bheadO);
|
||||
}
|
||||
b.trial = 0;
|
||||
}
|
||||
if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
|
||||
_ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
|
||||
}
|
||||
// push label to from's child
|
||||
void _push_sibling (const npos_t from, const int base, const uchar label, const bool flag = true) {
|
||||
uchar* c = &_ninfo[from].child;
|
||||
if (flag && (ORDERED ? label > *c : ! *c))
|
||||
do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
|
||||
_ninfo[base ^ label].sibling = *c, *c = label;
|
||||
}
|
||||
// pop label from from's child
|
||||
void _pop_sibling (const npos_t from, const int base, const uchar label) {
|
||||
uchar* c = &_ninfo[from].child;
|
||||
while (*c != label) c = &_ninfo[base ^ *c].sibling;
|
||||
*c = _ninfo[base ^ label].sibling;
|
||||
}
|
||||
// check whether to replace branching w/ the newly added node
|
||||
bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
|
||||
do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
|
||||
while ((c_n = _ninfo[base_n ^ c_n].sibling));
|
||||
return true;
|
||||
}
|
||||
// enumerate (equal to or more than one) child nodes
|
||||
// Write the labels of a child list (starting at `c`, children at base ^ c)
// into the buffer at `p`, optionally merging in one extra `label`
// (-1 = none). Returns a pointer to the LAST label written.
uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
  --p; // so that every store below can pre-increment
  if (! c) { // 0: terminal
    *++p = c;
    c = _ninfo[base ^ c].sibling;
  }
  if (ORDERED) {
    while (c && c < label) { // labels that sort before the extra one
      *++p = c;
      c = _ninfo[base ^ c].sibling;
    }
  }
  if (label != -1) {
    *++p = static_cast <uchar> (label);
  }
  while (c) { // remaining labels
    *++p = c;
    c = _ninfo[base ^ c].sibling;
  }
  return p;
}
|
||||
// explore new block to settle down
|
||||
int _find_place () {
|
||||
if (_bheadC) return _block[_bheadC].ehead;
|
||||
if (_bheadO) return _block[_bheadO].ehead;
|
||||
return _add_block () << 8;
|
||||
}
|
||||
int _find_place (const uchar* const first, const uchar* const last) {
|
||||
if (int bi = _bheadO) {
|
||||
const int bz = _block[_bheadO].prev;
|
||||
const short nc = static_cast <short> (last - first + 1);
|
||||
while (1) { // set candidate block
|
||||
block& b = _block[bi];
|
||||
if (b.num >= nc && nc < b.reject) // explore configuration
|
||||
for (int e = b.ehead;;) {
|
||||
const int base = e ^ *first;
|
||||
for (const uchar* p = first; _array[base ^ *++p].check < 0; )
|
||||
if (p == last) return b.ehead = e; // no conflict
|
||||
if ((e = -_array[e].check) == b.ehead) break;
|
||||
}
|
||||
b.reject = nc;
|
||||
if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
|
||||
const int bi_ = b.next;
|
||||
if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
|
||||
if (bi == bz) break;
|
||||
bi = bi_;
|
||||
}
|
||||
}
|
||||
return _add_block () << 8;
|
||||
}
|
||||
// resolve conflict on base_n ^ label_n = base_p ^ label_p
|
||||
template <typename T>
|
||||
int _resolve (npos_t& from_n, const int base_n, const uchar label_n, T& cf) {
|
||||
// examine siblings of conflicted nodes
|
||||
const int to_pn = base_n ^ label_n;
|
||||
const int from_p = _array[to_pn].check;
|
||||
const int base_p = _array[from_p].base;
|
||||
const bool flag // whether to replace siblings of newly added
|
||||
= _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
|
||||
uchar child[256];
|
||||
uchar* const first = &child[0];
|
||||
uchar* const last =
|
||||
flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
|
||||
: _set_child (first, base_p, _ninfo[from_p].child);
|
||||
const int base =
|
||||
(first == last ? _find_place () : _find_place (first, last)) ^ *first;
|
||||
// replace & modify empty list
|
||||
const int from = flag ? static_cast <int> (from_n) : from_p;
|
||||
const int base_ = flag ? base_n : base_p;
|
||||
if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
|
||||
_array[from].base = base; // new base
|
||||
for (const uchar* p = first; p <= last; ++p) { // to_ => to
|
||||
const int to = _pop_enode (base, *p, from);
|
||||
const int to_ = base_ ^ *p;
|
||||
_ninfo[to].sibling = (p == last ? 0 : *(p + 1));
|
||||
if (flag && to_ == to_pn) continue; // skip newcomer (no child)
|
||||
cf (to_, to);
|
||||
node& n = _array[to];
|
||||
node& n_ = _array[to_];
|
||||
if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix
|
||||
uchar c = _ninfo[to].child = _ninfo[to_].child;
|
||||
do _array[n.base ^ c].check = to; // adjust grand son's check
|
||||
while ((c = _ninfo[n.base ^ c].sibling));
|
||||
}
|
||||
if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
|
||||
from_n = static_cast <size_t> (to); // bug fix
|
||||
if (! flag && to_ == to_pn) { // the address is immediately used
|
||||
_push_sibling (from_n, to_pn ^ label_n, label_n);
|
||||
_ninfo[to_].child = 0; // remember to reset child
|
||||
if (label_n) n_.base = -1; else n_.value = value_type (0);
|
||||
n_.check = static_cast <int> (from_n);
|
||||
} else
|
||||
_push_enode (to_);
|
||||
if (NUM_TRACKING_NODES) // keep the traversed node updated
|
||||
for (size_t j = 0; tracking_node[j] != 0; ++j) {
|
||||
if (static_cast <int> (tracking_node[j] & TAIL_OFFSET_MASK) == to_) {
|
||||
tracking_node[j] &= NODE_INDEX_MASK;
|
||||
tracking_node[j] |= static_cast <npos_t> (to);
|
||||
}
|
||||
}
|
||||
}
|
||||
return flag ? base ^ label_n : to_pn;
|
||||
}
|
||||
// test the validity of double array for debug
|
||||
void _test (const npos_t from = 0) const {
|
||||
const int base = _array[from].base;
|
||||
if (base < 0) { // validate tail offset
|
||||
assert (*_length >= static_cast <int> (-base + 1 + sizeof (value_type)));
|
||||
return;
|
||||
}
|
||||
uchar c = _ninfo[from].child;
|
||||
do {
|
||||
if (from) assert (_array[base ^ c].check == static_cast <int> (from));
|
||||
if (c) _test (static_cast <npos_t> (base ^ c));
|
||||
} while ((c = _ninfo[base ^ c].sibling));
|
||||
}
|
||||
};
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,12 @@
|
|||
# Make this directory's headers reachable from including projects.
INCLUDEPATH += $$PWD

# Bundled trie implementations and both storage-base header variants.
HEADERS += $$PWD/darts-clone/darts.h
HEADERS += $$PWD/cedar/cedarpp.h
HEADERS += $$PWD/cedar/cedar.h
HEADERS += $$PWD/storage-base.h
HEADERS += $$PWD/storage-base.hpp

# NOTE(review): storage-base.cpp is also #include'd at the end of
# storage-base.h; confirm it really needs to be compiled on its own as well.
SOURCES += $$PWD/storage-base.cpp
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef STORAGEBASE_CPP
|
||||
#define STORAGEBASE_CPP
|
||||
|
||||
#include "storage-base.h"
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
StorageBase<ordered, cache_file_header>::StorageBase(const vector<string> file_paths, string dat_cache_path)
|
||||
:m_file_paths(file_paths), m_dat_cache_path(dat_cache_path), m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)
|
||||
{
|
||||
static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
void StorageBase<ordered, cache_file_header>::Init()
|
||||
{
|
||||
int file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
|
||||
m_total_dict_size = file_size_sum;
|
||||
|
||||
if (m_dat_cache_path.empty()) {
|
||||
m_dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
||||
}
|
||||
m_dat_cache_path += VERSION;
|
||||
if (InitAttachDat(m_dat_cache_path, md5)) {
|
||||
return;
|
||||
}
|
||||
|
||||
LoadSourceFile(m_dat_cache_path, md5);//构建DATrie,写入dat文件
|
||||
|
||||
bool build_ret = InitAttachDat(m_dat_cache_path, md5);
|
||||
|
||||
assert(build_ret);
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
string StorageBase<ordered, cache_file_header>::Find(const string &key)
|
||||
{
|
||||
int result = m_double_array_data_trie->exactMatchSearch<int>(key.c_str(), key.size());
|
||||
if (result < 0)
|
||||
return string();
|
||||
return string(&m_elements_ptr[result]);
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
bool StorageBase<ordered, cache_file_header>::Contains(string &word)
|
||||
{
|
||||
if (this->Find(word) != string())
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
bool StorageBase<ordered, cache_file_header>::IsMultiTone(const string &word)
|
||||
{
|
||||
string result = this->Find(word);
|
||||
if (result.find(",") == result.npos)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
int StorageBase<ordered, cache_file_header>::GetTotalDictSize() const
|
||||
{
|
||||
return m_total_dict_size;
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
StorageBase<ordered, cache_file_header>::~StorageBase()
|
||||
{
|
||||
munmap(m_mmap_addr, m_mmap_length);
|
||||
m_mmap_addr = nullptr;
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
|
||||
if (m_double_array_data_trie)
|
||||
delete m_double_array_data_trie;
|
||||
m_double_array_data_trie = nullptr;
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
cedar::da<int, -1, -2, ordered> *StorageBase<ordered, cache_file_header>::GetDoubleArrayDataTrie()
|
||||
{
|
||||
return m_double_array_data_trie;
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
const void *StorageBase<ordered, cache_file_header>::GetDataTrieArray()
|
||||
{
|
||||
return m_double_array_data_trie->array();
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
int StorageBase<ordered, cache_file_header>::GetDataTrieSize()
|
||||
{
|
||||
return m_double_array_data_trie->size();
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
int StorageBase<ordered, cache_file_header>::GetDataTrieTotalSize()
|
||||
{
|
||||
return m_double_array_data_trie->total_size();
|
||||
}
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
cache_file_header *StorageBase<ordered, cache_file_header>::GetCacheFileHeaderPtr()
|
||||
{
|
||||
return reinterpret_cast<header_type*>(m_mmap_addr);
|
||||
}
|
||||
|
||||
|
||||
template<const bool ordered, typename cache_file_header>
|
||||
bool StorageBase<ordered, cache_file_header>::InitAttachDat(const string &dat_cache_file, const string &md5)
|
||||
{
|
||||
m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (m_mmap_fd < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
|
||||
if (seek_off < 0){
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
};
|
||||
|
||||
m_mmap_length = seek_off;
|
||||
m_mmap_addr = reinterpret_cast<char *>(mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0));
|
||||
if (m_mmap_addr == MAP_FAILED) {
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
if (m_mmap_length < sizeof(header_type)) {
|
||||
munmap(m_mmap_addr, m_mmap_length);
|
||||
m_mmap_addr = nullptr;
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
header_type & header = *reinterpret_cast<header_type*>(m_mmap_addr);
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())
|
||||
or m_mmap_length != sizeof(header_type) + header.elements_size + header.dat_size * m_double_array_data_trie->unit_size()) {
|
||||
munmap(m_mmap_addr, m_mmap_length);
|
||||
m_mmap_addr = nullptr;
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
|
||||
m_elements_ptr = (const char *)(m_mmap_addr + sizeof(header_type));
|
||||
const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
|
||||
this->m_double_array_data_trie->set_array((char *)dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compute one MD5 digest over the concatenated contents of `files_list`.
//   files_list    : paths to hash, in order
//   file_size_sum : out; total number of bytes that were hashed
// Files that cannot be opened, are empty, or cannot be mapped are skipped.
// Returns the digest as produced by limonp::MD5::digestChars.
string CalcFileListMD5(const vector<string> &files_list, int &file_size_sum) {
    limonp::MD5 md5;
    file_size_sum = 0;

    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            // A failed mapping used to be caught only by assert(); with NDEBUG
            // the MAP_FAILED pointer was then hashed. Skip the file instead,
            // exactly like a file that cannot be opened.
            if (addr != MAP_FAILED) {
                md5.Update(static_cast<unsigned char *>(addr), len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }

    md5.Final();
    return string(md5.digestChars);
}
|
||||
#endif
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef STORAGEBASE_H
|
||||
#define STORAGEBASE_H
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include "Md5.hpp"
|
||||
#include "StringUtil.hpp"
|
||||
#include "cedar.h"
|
||||
using namespace std;
|
||||
|
||||
// Header located at the very start of a dat cache file.
// TODO: byte alignment
struct CacheFileHeaderBase {
    char md5_hex[32]{};        // compared against the md5 of the source files
    uint32_t elements_num{0};  // presumably the number of stored elements — confirm with the writer
    uint32_t elements_size{0}; // byte size of the element region that follows the header
    uint32_t dat_size{0};      // trie size in units of the trie's unit_size()
};
|
||||
|
||||
template<const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
|
||||
class StorageBase
|
||||
{
|
||||
public:
|
||||
typedef cache_file_header header_type;
|
||||
|
||||
StorageBase(const vector<string> file_paths, string dat_cache_path = "");
|
||||
|
||||
virtual void Init();
|
||||
|
||||
virtual string Find(const string &key);
|
||||
|
||||
virtual bool Contains(string &word);
|
||||
|
||||
virtual bool IsMultiTone(const string &word);
|
||||
|
||||
virtual int GetTotalDictSize() const;
|
||||
|
||||
virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
|
||||
|
||||
virtual ~StorageBase();
|
||||
|
||||
cedar::da<int, -1, -2, ordered> * GetDoubleArrayDataTrie();
|
||||
const void * GetDataTrieArray();
|
||||
int GetDataTrieSize();
|
||||
int GetDataTrieTotalSize();
|
||||
|
||||
cache_file_header * GetCacheFileHeaderPtr();
|
||||
|
||||
private:
|
||||
StorageBase();
|
||||
StorageBase(const StorageBase&);
|
||||
StorageBase& operator = (const StorageBase&);
|
||||
|
||||
bool InitAttachDat(const string &dat_cache_file, const string &md5);
|
||||
|
||||
vector<string> m_file_paths;
|
||||
string m_dat_cache_path;
|
||||
|
||||
cedar::da<int, -1, -2, ordered> * m_double_array_data_trie = nullptr;
|
||||
|
||||
const char * m_elements_ptr = nullptr;
|
||||
|
||||
int m_mmap_fd = -1;
|
||||
int m_mmap_length = 0;
|
||||
char * m_mmap_addr = nullptr;
|
||||
|
||||
int m_total_dict_size = 0;
|
||||
|
||||
};
|
||||
|
||||
inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum);
|
||||
#include "storage-base.cpp"
|
||||
#endif // STORAGEBASE_H
|
|
@ -0,0 +1,232 @@
|
|||
/*
|
||||
* Copyright (C) 2022, KylinSoft Co., Ltd.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*
|
||||
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
|
||||
*
|
||||
*/
|
||||
#ifndef STORAGEBASE_H
|
||||
#define STORAGEBASE_H
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include "Md5.hpp"
|
||||
#include "StringUtil.hpp"
|
||||
//#define USE_DARTS
|
||||
#ifdef USE_DARTS
|
||||
#include "../storage-base/darts-clone/darts.h"
|
||||
#include <cassert>
|
||||
#else
|
||||
#include "../storage-base/cedar/cedar.h"
|
||||
#endif
|
||||
using namespace std;
|
||||
|
||||
// Compute one MD5 digest over the concatenated contents of `files_list`.
//   files_list    : paths to hash, in order
//   file_size_sum : out; total number of bytes that were hashed
// Files that cannot be opened, are empty, or cannot be mapped are skipped.
// Returns the digest as produced by limonp::MD5::digestChars.
inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum)
{
    limonp::MD5 md5;
    file_size_sum = 0;

    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            // A failed mapping used to be caught only by assert(); with NDEBUG
            // the MAP_FAILED pointer was then hashed. Skip the file instead,
            // exactly like a file that cannot be opened.
            if (addr != MAP_FAILED) {
                md5.Update(static_cast<unsigned char *>(addr), len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }

    md5.Final();
    return string(md5.digestChars);
}
|
||||
|
||||
// Header located at the very start of a dat cache file.
// TODO: byte alignment
struct CacheFileHeaderBase {
    char md5_hex[32]{};        // compared against the md5 of the source files
    uint32_t elements_num{0};  // presumably the number of stored elements — confirm with the writer
    uint32_t elements_size{0}; // byte size of the element region that follows the header
    uint32_t dat_size{0};      // trie size in units of the trie's unit_size()
};
|
||||
|
||||
template<typename element_ptr_type, const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
|
||||
class StorageBase
|
||||
{
|
||||
public:
|
||||
typedef cache_file_header header_type;
|
||||
#ifdef USE_DARTS
|
||||
typedef typename Darts::DoubleArray::result_pair_type result_pair_type;
|
||||
StorageBase(const vector<string> file_paths, string dat_cache_path = "")
|
||||
:m_file_paths(file_paths), m_dat_cache_path(dat_cache_path), m_double_array_data_trie(new Darts::DoubleArray)
|
||||
{
|
||||
static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
|
||||
}
|
||||
#else
|
||||
typedef typename cedar::da<int, -1, -2, ordered>::result_pair_type result_pair_type;
|
||||
StorageBase(const vector<string> file_paths, string dat_cache_path = "")
|
||||
:m_file_paths(file_paths), m_dat_cache_path(dat_cache_path)/*, m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)*/
|
||||
{
|
||||
static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
|
||||
}
|
||||
#endif
|
||||
virtual void Init()
|
||||
{
|
||||
int file_size_sum = 0;
|
||||
const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
|
||||
m_total_dict_size = file_size_sum;
|
||||
|
||||
if (m_dat_cache_path.empty()) {
|
||||
m_dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
|
||||
}
|
||||
m_dat_cache_path += VERSION;
|
||||
if (InitAttachDat(m_dat_cache_path, md5)) {
|
||||
return;
|
||||
}
|
||||
|
||||
LoadSourceFile(m_dat_cache_path, md5);//构建DATrie,写入dat文件
|
||||
|
||||
bool build_ret = InitAttachDat(m_dat_cache_path, md5);
|
||||
|
||||
assert(build_ret);
|
||||
}
|
||||
|
||||
virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
|
||||
|
||||
// Release the file mapping and the descriptor.
// Each is released only if it was actually acquired: the object may be
// destroyed without Init() ever having run, or after a failed attach.
virtual ~StorageBase()
{
    if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
        munmap(m_mmap_addr, m_mmap_length);
    }
    m_mmap_addr = nullptr;

    if (m_mmap_fd >= 0) {
        close(m_mmap_fd);
    }
    m_mmap_fd = -1;
}
|
||||
#ifndef USE_DARTS
|
||||
inline int Update(const char* key, size_t len, int val)
|
||||
{
|
||||
return m_double_array_data_trie.update(key, len, val);
|
||||
}
|
||||
#endif
|
||||
inline size_t CommonPrefixSearch(const char* key, result_pair_type* result, size_t result_len) const
|
||||
{
|
||||
return m_double_array_data_trie.commonPrefixSearch(key, result, result_len);
|
||||
}
|
||||
|
||||
inline int ExactMatchSearch(const char* key, size_t len) const
|
||||
{
|
||||
return m_double_array_data_trie.template exactMatchSearch<int>(key, len);
|
||||
}
|
||||
|
||||
inline const void * GetDataTrieArray()
|
||||
{
|
||||
return m_double_array_data_trie.array();
|
||||
}
|
||||
|
||||
inline int GetDataTrieSize()
|
||||
{
|
||||
return m_double_array_data_trie.size();
|
||||
}
|
||||
|
||||
inline int GetDataTrieTotalSize()
|
||||
{
|
||||
return m_double_array_data_trie.total_size();
|
||||
}
|
||||
|
||||
// Reinterpret the start of the mapped cache file as its header.
// Yields a null pointer while no file is attached.
inline cache_file_header * GetCacheFileHeaderPtr() const
{
    header_type *header = reinterpret_cast<header_type*>(m_mmap_addr);
    return header;
}
|
||||
|
||||
// Start of the element region inside the mapped cache file (null until attached).
inline const element_ptr_type * GetElementPtr() const
{
    const element_ptr_type *elements = m_elements_ptr;
    return elements;
}
|
||||
|
||||
private:
|
||||
StorageBase();
|
||||
StorageBase(const StorageBase&);
|
||||
StorageBase& operator = (const StorageBase&);
|
||||
|
||||
bool InitAttachDat(const string &dat_cache_file, const string &md5)
|
||||
{
|
||||
m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
|
||||
|
||||
if (m_mmap_fd < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
|
||||
if (seek_off < 0){
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
};
|
||||
|
||||
m_mmap_length = seek_off;
|
||||
m_mmap_addr = reinterpret_cast<char *>(mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0));
|
||||
if (m_mmap_addr == MAP_FAILED) {
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
if (m_mmap_length < sizeof(header_type)) {
|
||||
munmap(m_mmap_addr, m_mmap_length);
|
||||
m_mmap_addr = nullptr;
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
header_type & header = *reinterpret_cast<header_type*>(m_mmap_addr);
|
||||
|
||||
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())
|
||||
or m_mmap_length != sizeof(header_type) + header.elements_size + header.dat_size * m_double_array_data_trie.unit_size()) {
|
||||
munmap(m_mmap_addr, m_mmap_length);
|
||||
m_mmap_addr = nullptr;
|
||||
close(m_mmap_fd);
|
||||
m_mmap_fd = -1;
|
||||
return false;
|
||||
}
|
||||
|
||||
m_elements_ptr = (const element_ptr_type *)(m_mmap_addr + sizeof(header_type));
|
||||
const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
|
||||
this->m_double_array_data_trie.set_array((char *)dat_ptr, header.dat_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
vector<string> m_file_paths;
|
||||
string m_dat_cache_path;
|
||||
|
||||
#ifdef USE_DARTS
|
||||
Darts::DoubleArray m_double_array_data_trie;
|
||||
#else
|
||||
cedar::da<int, -1, -2, ordered> m_double_array_data_trie;
|
||||
#endif
|
||||
|
||||
const element_ptr_type * m_elements_ptr = nullptr;
|
||||
int m_mmap_fd = -1;
|
||||
size_t m_mmap_length = 0;
|
||||
char * m_mmap_addr = nullptr;
|
||||
int m_total_dict_size = 0;
|
||||
|
||||
};
|
||||
|
||||
#endif // STORAGEBASE_H
|
|
@ -0,0 +1,11 @@
|
|||
#include "mainwindow.h"
|
||||
|
||||
#include <QApplication>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
QApplication a(argc, argv);
|
||||
MainWindow w;
|
||||
w.show();
|
||||
return a.exec();
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
#include "mainwindow.h"
|
||||
#include "ui_mainwindow.h"
|
||||
#include <HanZiToPinYin>
|
||||
#include <ChineseSegmentation>
|
||||
#include <QMenu>
|
||||
#include <QDebug>
|
||||
#include <QStringList>
|
||||
|
||||
// Builds the designer UI, attaches the output-style menu to the tool button,
// wires the signal handlers and puts the keyboard focus on the input field.
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    // Menu entries; initconnections() matches on exactly these texts.
    static const char *const kStyleNames[] = {"Default", "Tone", "Tone2", "Tone3", "FirstLetter"};

    QMenu *styleMenu = new QMenu(this);
    for (const char *styleName : kStyleNames) {
        styleMenu->addAction(styleName);
    }
    ui->toolButton->setMenu(styleMenu);

    initconnections();
    ui->lineEdit_2->setFocus();
}
|
||||
|
||||
// Frees the Ui::MainWindow helper object allocated in the constructor.
MainWindow::~MainWindow()
|
||||
{
|
||||
delete ui;
|
||||
}
|
||||
|
||||
void MainWindow::initconnections()
|
||||
{
|
||||
connect(ui->toolButton->menu(), &QMenu::triggered, [&](QAction *action){
|
||||
qDebug() << "tool button:" << action->text();
|
||||
m_action = action->text();
|
||||
ui->toolButton->setText(action->text());
|
||||
});
|
||||
connect(ui->pushButton, &QPushButton::pressed, [&]() {
|
||||
PinyinDataStyle dataStyle;
|
||||
SegType segType;
|
||||
PolyphoneType polyType;
|
||||
ExDataProcessType exType;
|
||||
|
||||
if (m_action == "Default") {
|
||||
dataStyle = PinyinDataStyle::Default;
|
||||
} else if (m_action == "Tone") {
|
||||
dataStyle = PinyinDataStyle::Tone;
|
||||
} else if (m_action == "Tone2") {
|
||||
dataStyle = PinyinDataStyle::Tone2;
|
||||
} else if (m_action == "Tone3") {
|
||||
dataStyle = PinyinDataStyle::Tone3;
|
||||
} else if (m_action == "FirstLetter") {
|
||||
dataStyle = PinyinDataStyle::FirstLetter;
|
||||
}
|
||||
|
||||
if(!ui->checkSegBox->isChecked())
|
||||
segType = SegType::Segmentation;
|
||||
else
|
||||
segType = SegType::NoSegmentation;
|
||||
|
||||
if(ui->checkPolyBox_2->isChecked())
|
||||
polyType = PolyphoneType::Enable;
|
||||
else
|
||||
polyType = PolyphoneType::Disable;
|
||||
|
||||
if (ui->checkExBox_3->isChecked())
|
||||
exType = ExDataProcessType::Default;
|
||||
else
|
||||
exType = ExDataProcessType::Delete;
|
||||
|
||||
HanZiToPinYin::getInstance()->setConfig(dataStyle, segType, polyType, exType);
|
||||
|
||||
ui->lineEdit_4->clear();
|
||||
QString text = ui->lineEdit_2->text();
|
||||
qDebug() << "input:" << text;
|
||||
|
||||
QStringList list;
|
||||
HanZiToPinYin::getInstance()->getResults(text.toStdString(), list);
|
||||
|
||||
ui->lineEdit_4->setText(list.join(" "));
|
||||
qDebug() << "result:" << list.join(" ");
|
||||
|
||||
vector<KeyWord> result = ChineseSegmentation::getInstance()->callSegment(ui->lineEdit_2->text().toStdString());
|
||||
|
||||
list.clear();
|
||||
for (auto &info:result) {
|
||||
list.append(QString().fromStdString(info.word));
|
||||
}
|
||||
ui->lineEdit_6->setText(list.join("/"));
|
||||
|
||||
});
|
||||
}
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
#ifndef MAINWINDOW_H
|
||||
#define MAINWINDOW_H
|
||||
|
||||
#include <QtWidgets>
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
namespace Ui { class MainWindow; }
|
||||
QT_END_NAMESPACE
|
||||
|
||||
// Main window of the pinyin-conversion / word-segmentation demo application.
class MainWindow : public QMainWindow
|
||||
{
|
||||
Q_OBJECT
|
||||
|
||||
public:
|
||||
// Creates the window; `parent` is forwarded to QMainWindow.
MainWindow(QWidget *parent = nullptr);
|
||||
~MainWindow();
|
||||
|
||||
private:
|
||||
// Connects the menu and push-button signals to their handlers.
void initconnections();
|
||||
// Designer-generated UI object; allocated in the constructor, deleted in the destructor.
Ui::MainWindow *ui;
|
||||
// Text of the most recently triggered tool-button menu action (empty until one is chosen).
QString m_action;
|
||||
};
|
||||
#endif // MAINWINDOW_H
|
|
@ -0,0 +1,181 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<ui version="4.0">
|
||||
<class>MainWindow</class>
|
||||
<widget class="QMainWindow" name="MainWindow">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>800</width>
|
||||
<height>600</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="windowTitle">
|
||||
<string>MainWindow</string>
|
||||
</property>
|
||||
<widget class="QWidget" name="centralwidget">
|
||||
<widget class="QPushButton" name="pushButton">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>440</y>
|
||||
<width>191</width>
|
||||
<height>81</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>点击开始</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>20</y>
|
||||
<width>91</width>
|
||||
<height>31</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>输入文字:</string>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>70</y>
|
||||
<width>711</width>
|
||||
<height>41</height>
|
||||
</rect>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>310</y>
|
||||
<width>121</width>
|
||||
<height>31</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>拼音转换结果:</string>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit_4">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>360</y>
|
||||
<width>711</width>
|
||||
<height>41</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="readOnly">
|
||||
<bool>true</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QCheckBox" name="checkSegBox">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>280</x>
|
||||
<y>430</y>
|
||||
<width>111</width>
|
||||
<height>29</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>不启用分词</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QCheckBox" name="checkPolyBox_2">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>280</x>
|
||||
<y>470</y>
|
||||
<width>131</width>
|
||||
<height>29</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>启用多音字</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QToolButton" name="toolButton">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>530</x>
|
||||
<y>460</y>
|
||||
<width>181</width>
|
||||
<height>30</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>数据形式...</string>
|
||||
</property>
|
||||
<property name="popupMode">
|
||||
<enum>QToolButton::MenuButtonPopup</enum>
|
||||
</property>
|
||||
<property name="autoRaise">
|
||||
<bool>false</bool>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QCheckBox" name="checkExBox_3">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>280</x>
|
||||
<y>510</y>
|
||||
<width>181</width>
|
||||
<height>29</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>无拼音数据原数据返回</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit_5">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>160</y>
|
||||
<width>113</width>
|
||||
<height>31</height>
|
||||
</rect>
|
||||
</property>
|
||||
<property name="text">
|
||||
<string>分词结果:</string>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QLineEdit" name="lineEdit_6">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>40</x>
|
||||
<y>220</y>
|
||||
<width>711</width>
|
||||
<height>41</height>
|
||||
</rect>
|
||||
</property>
|
||||
</widget>
|
||||
</widget>
|
||||
<widget class="QMenuBar" name="menubar">
|
||||
<property name="geometry">
|
||||
<rect>
|
||||
<x>0</x>
|
||||
<y>0</y>
|
||||
<width>800</width>
|
||||
<height>28</height>
|
||||
</rect>
|
||||
</property>
|
||||
</widget>
|
||||
<widget class="QStatusBar" name="statusbar"/>
|
||||
</widget>
|
||||
<resources/>
|
||||
<connections/>
|
||||
</ui>
|
|
@ -0,0 +1,26 @@
|
|||
QT += core gui
|
||||
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
|
||||
CONFIG += c++11 link_pkgconfig
|
||||
|
||||
PKGCONFIG += chinese-segmentation
|
||||
|
||||
# The following define makes your compiler emit warnings if you use
|
||||
# any Qt feature that has been marked deprecated (the exact warnings
|
||||
# depend on your compiler). Please consult the documentation of the
|
||||
# deprecated API in order to know how to port your code away from it.
|
||||
DEFINES += QT_DEPRECATED_WARNINGS
|
||||
|
||||
# You can also make your code fail to compile if it uses deprecated APIs.
|
||||
# In order to do so, uncomment the following line.
|
||||
# You can also select to disable deprecated APIs only up to a certain version of Qt.
|
||||
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
|
||||
|
||||
HEADERS += \
|
||||
mainwindow.h
|
||||
|
||||
SOURCES += \
|
||||
main.cpp \
|
||||
mainwindow.cpp
|
||||
|
||||
FORMS += \
|
||||
mainwindow.ui
|
|
@ -1,161 +0,0 @@
|
|||
/*
|
||||
* Friso test program.
|
||||
* Of course, you can turn it into a full demo for friso.
|
||||
* all threads or processes share the same friso_t,
|
||||
* different threads/processes use different friso_task_t.
|
||||
* and you could share the friso_config_t if you wish...
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso-interface.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define __INPUT_LENGTH__ 20480
|
||||
#define ___EXIT_INFO___ \
|
||||
println("Thanks for trying friso."); \
|
||||
break;
|
||||
|
||||
#define ___ABOUT___ \
|
||||
println("+---------------------------------------------------------------+"); \
|
||||
println("| Friso - a Chinese word segmentation writen by c. |"); \
|
||||
println("| bug report email - chenxin619315@gmail.com. |"); \
|
||||
println("| or: visit https://github.com/lionsoul2014/friso. |"); \
|
||||
println("| java version for https://github.com/lionsoul2014/jcseg |"); \
|
||||
println("| type 'quit' to exit the program. |"); \
|
||||
println("+---------------------------------------------------------------+");
|
||||
|
||||
//read a line from a command line.
|
||||
/*
 * Reads one line from fp into __dst, without the trailing '\n', and
 * NUL-terminates it.
 *
 * __dst must provide at least __INPUT_LENGTH__ bytes. Characters beyond
 * __INPUT_LENGTH__ - 1 are read and discarded, so an over-long line can no
 * longer overflow the buffer and the rest of it does not leak into the next
 * call.
 *
 * Returns __dst, or NULL when EOF is reached before any character was stored.
 */
static fstring getLine(FILE *fp, fstring __dst) {
    int c;
    fstring cs = __dst;
    /* last position that may hold a character; one byte is kept for '\0' */
    const fstring limit = __dst + __INPUT_LENGTH__ - 1;

    while((c = getc(fp)) != EOF) {
        if(c == '\n') break;
        if(cs < limit) {
            *cs++ = c;
        }
    }
    *cs = '\0';

    return (c == EOF && cs == __dst) ? NULL : __dst;
}
|
||||
|
||||
/*static void printcode( fstring str ) {
|
||||
int i,length;
|
||||
length = strlen( str );
|
||||
printf("str:length=%d\n", length );
|
||||
for ( i = 0; i < length; i++ ) {
|
||||
printf("%d ", str[i] );
|
||||
}
|
||||
putchar('\n');
|
||||
}*/
|
||||
|
||||
//int friso_test(int argc, char **argv)
|
||||
int friso_test() {
|
||||
|
||||
clock_t s_time, e_time;
|
||||
char line[__INPUT_LENGTH__] = {0};
|
||||
int i;
|
||||
fstring __path__ = NULL, mode = NULL;
|
||||
|
||||
friso_t friso;
|
||||
friso_config_t config;
|
||||
friso_task_t task;
|
||||
|
||||
// get the lexicon directory from command line arguments
|
||||
// for ( i = 0; i < argc; i++ ) {
|
||||
// if ( strcasecmp( "-init", argv[i] ) == 0 ) {
|
||||
// __path__ = argv[i+1];
|
||||
// }
|
||||
// }
|
||||
__path__ = "/usr/share/ukui-search/res/friso.ini";
|
||||
|
||||
if(__path__ == NULL) {
|
||||
println("Usage: friso -init lexicon path");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
s_time = clock();
|
||||
|
||||
//initialize
|
||||
friso = friso_new();
|
||||
config = friso_new_config();
|
||||
/*friso_dic_t dic = friso_dic_new();
|
||||
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
|
||||
friso_set_dic( friso, dic );
|
||||
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
|
||||
if(friso_init_from_ifile(friso, config, __path__) != 1) {
|
||||
printf("fail to initialize friso and config.\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
switch(config->mode) {
|
||||
case __FRISO_SIMPLE_MODE__:
|
||||
mode = "Simple";
|
||||
break;
|
||||
case __FRISO_COMPLEX_MODE__:
|
||||
mode = "Complex";
|
||||
break;
|
||||
case __FRISO_DETECT_MODE__:
|
||||
mode = "Detect";
|
||||
break;
|
||||
}
|
||||
|
||||
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
|
||||
//printf("clr_stw=%d\n", friso->clr_stw);
|
||||
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
|
||||
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
|
||||
|
||||
e_time = clock();
|
||||
|
||||
printf("Initialized in %fsec\n", (double)(e_time - s_time) / CLOCKS_PER_SEC);
|
||||
printf("Mode: %s\n", mode);
|
||||
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK");
|
||||
___ABOUT___;
|
||||
|
||||
//set the task.
|
||||
task = friso_new_task();
|
||||
|
||||
while(1) {
|
||||
print("friso>> ");
|
||||
getLine(stdin, line);
|
||||
//exit the programe
|
||||
if(strcasecmp(line, "quit") == 0) {
|
||||
___EXIT_INFO___
|
||||
}
|
||||
|
||||
//for ( i = 0; i < 1000000; i++ ) {
|
||||
//set the task text.
|
||||
friso_set_text(task, line);
|
||||
println("分词结果:");
|
||||
|
||||
s_time = clock();
|
||||
while((config->next_token(friso, config, task)) != NULL) {
|
||||
printf(
|
||||
"%s[%d, %d, %d] ",
|
||||
task->token->word,
|
||||
task->token->offset,
|
||||
task->token->length,
|
||||
task->token->rlen
|
||||
);
|
||||
// printf("%s ", task->token->word);
|
||||
}
|
||||
//}
|
||||
e_time = clock();
|
||||
printf("\nDone, cost < %fsec\n", ((double)(e_time - s_time)) / CLOCKS_PER_SEC);
|
||||
|
||||
}
|
||||
|
||||
friso_free_task(task);
|
||||
|
||||
//error block.
|
||||
err:
|
||||
friso_free_config(config);
|
||||
friso_free(friso);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,10 +0,0 @@
|
|||
/*
|
||||
* temporary use friso.ini, it should be removed in the future.
|
||||
* MouseZhangZh
|
||||
*/
|
||||
#include "friso/src/friso_API.h"
|
||||
#include "friso/src/friso.h"
|
||||
#include "friso/src/friso_ctype.h"
|
||||
|
||||
//int friso_test(int argc, char **argv);
|
||||
int friso_test();
|
|
@ -1,225 +0,0 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
==========================================================================
|
||||
The following license applies to the Friso ANSI C library
|
||||
--------------------------------------------------------------------------
|
||||
Copyright (c) 2010 lionsoul<chenxin619315@gmail.com>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@ -1,68 +0,0 @@
|
|||
# friso configuration file.
|
||||
# do not change the name of the left key.
|
||||
# @email chenxin619315@gmail.com
|
||||
# @date 2012-12-20
|
||||
#
|
||||
|
||||
# charset; only UTF8 and GBK are supported.
|
||||
# set it with UTF8(0) or GBK(1)
|
||||
friso.charset = 0
|
||||
|
||||
# lexicon directory absolute path.
|
||||
# the value must end with '/'
|
||||
# this will tell friso how to find friso.lex.ini configuration file and all the lexicon files.
|
||||
#
|
||||
# if the value does not start with '/' on Linux, or contains no ':' on WinNT,
|
||||
# friso will search the friso.lex.ini relative to friso.ini
|
||||
# absolute path search:
|
||||
# linux: friso.lex_dir = /c/products/friso/dict/UTF-8/
|
||||
# Winnt: friso.lex_dir = D:/products/friso/dict/UTF-8/
|
||||
# relative path search (All system)
|
||||
friso.lex_dir = ./dict/UTF-8/
|
||||
|
||||
# the maximum matching length.
|
||||
friso.max_len = 5
|
||||
|
||||
# 1 for recognition chinese name.
|
||||
# and 0 for closed it.
|
||||
friso.r_name = 1
|
||||
|
||||
# the maximum length for the cjk words in a
|
||||
# chinese and english mixed word.
|
||||
friso.mix_len = 2
|
||||
|
||||
# the maximum length of the Chinese last-name adornment.
|
||||
friso.lna_len = 1
|
||||
|
||||
# append the synonyms words
|
||||
friso.add_syn = 1
|
||||
|
||||
# clear the stopwords or not (1 to open it and 0 to close it)
|
||||
# @date 2013-06-13
|
||||
friso.clr_stw = 0
|
||||
|
||||
# keep the unrecognized words or not (1 to open it and 0 to close it)
|
||||
# @date 2013-06-13
|
||||
friso.keep_urec = 0
|
||||
|
||||
# use sphinx output style like 'admire|love|enjoy einsten'
|
||||
# @date 2013-10-25
|
||||
friso.spx_out = 0
|
||||
|
||||
# start the secondary segmentation for complex english token.
|
||||
friso.en_sseg = 1
|
||||
|
||||
# min length of the secondary segmentation token. (better larger than 1)
|
||||
friso.st_minl = 2
|
||||
|
||||
# default keep punctuations for english token.
|
||||
friso.kpuncs = @%.#&+
|
||||
|
||||
# the threshold value for a char not a part of a chinese name.
|
||||
friso.nthreshold = 2000000
|
||||
|
||||
# default mode for friso.
|
||||
# 1 : simple mode - simple maximum matching algorithm.
|
||||
# 2 : complex mode - four rules of the mmseg algorithm.
|
||||
# 3 : detect mode - only return the words that exist in the lexicon
|
||||
friso.mode = 2
|
|
@ -1,18 +0,0 @@
|
|||
# qmake include fragment for the bundled friso sources.
# Every path is anchored at $$PWD, i.e. the directory holding this .pri file.

# let including projects resolve headers relative to this directory
INCLUDEPATH += $$PWD

# public headers
HEADERS += $$PWD/src/friso_API.h
HEADERS += $$PWD/src/friso.h
HEADERS += $$PWD/src/friso_ctype.h

# implementation files (same order as before)
SOURCES += $$PWD/src/friso.c
SOURCES += $$PWD/src/friso_lexicon.c
SOURCES += $$PWD/src/friso_string.c
SOURCES += $$PWD/src/friso_array.c
SOURCES += $$PWD/src/friso_ctype.c
SOURCES += $$PWD/src/friso_GBK.c
SOURCES += $$PWD/src/friso_hash.c
SOURCES += $$PWD/src/friso_link.c
SOURCES += $$PWD/src/friso_UTF8.c
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -1,370 +0,0 @@
|
|||
/*
|
||||
* main interface file for friso tokenizer.
|
||||
* you could modify it and re-release and free for commercial use.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#ifndef _friso_h
|
||||
#define _friso_h
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/* {{{ friso main interface define :: start*/
|
||||
#define FRISO_VERSION "1.6.4"
|
||||
#define friso_version() FRISO_VERSION
|
||||
|
||||
|
||||
#define DEFAULT_SEGMENT_LENGTH 5
|
||||
#define DEFAULT_MIX_LENGTH 2
|
||||
#define DEFAULT_LNA_LENGTH 1
|
||||
#define DEFAULT_NTHRESHOLD 1000000
|
||||
#define DEFAULT_SEGMENT_MODE 2
|
||||
|
||||
/*
|
||||
* Type: friso_lex_t
|
||||
* -----------
|
||||
* This type used to represent the type of the lexicon.
|
||||
*/
|
||||
typedef enum {
    /* values 0..11 are contiguous */
    __LEX_CJK_WORDS__    = 0,
    __LEX_CJK_UNITS__    = 1,
    __LEX_ECM_WORDS__    = 2,   /* english and chinese mixed words */
    __LEX_CEM_WORDS__    = 3,   /* chinese and english mixed words */
    __LEX_CN_LNAME__     = 4,
    __LEX_CN_SNAME__     = 5,
    __LEX_CN_DNAME1__    = 6,
    __LEX_CN_DNAME2__    = 7,
    __LEX_CN_LNA__       = 8,
    __LEX_STOPWORDS__    = 9,
    __LEX_ENPUN_WORDS__  = 10,
    __LEX_EN_WORDS__     = 11,
    /* the numbering deliberately jumps from 11 to 15 */
    __LEX_OTHER_WORDS__  = 15,
    __LEX_NCSYN_WORDS__  = 16,
    __LEX_PUNC_WORDS__   = 17,  /* punctuations */
    __LEX_UNKNOW_WORDS__ = 18   /* unrecognized words */
} friso_lex_t;
|
||||
|
||||
typedef friso_hash_t * friso_dic_t;
|
||||
#define __FRISO_LEXICON_LENGTH__ 12
|
||||
|
||||
|
||||
/* charsets that Friso currently supports */
typedef enum {
    FRISO_UTF8 = 0,     /* UTF-8 */
    FRISO_GBK  = 1      /* GBK */
} friso_charset_t;
|
||||
|
||||
/*
|
||||
* Type: friso_mode_t
|
||||
* ------------------
|
||||
* use to identidy the mode that the friso use.
|
||||
*/
|
||||
typedef enum {
    __FRISO_SIMPLE_MODE__  = 1,
    __FRISO_COMPLEX_MODE__ = 2,
    __FRISO_DETECT_MODE__  = 3
} friso_mode_t;
|
||||
|
||||
/* friso entry.*/
|
||||
typedef struct {
|
||||
friso_dic_t dic; //friso dictionary
|
||||
friso_charset_t charset; //project charset.
|
||||
} friso_entry;
|
||||
typedef friso_entry * friso_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Type: lex_entry_cdt
|
||||
* -------------------
|
||||
* This type used to represent the lexicon entry struct.
|
||||
*/
|
||||
#define _LEX_APPENSYN_MASK (1 << 0) //append synoyums words.
|
||||
#define lex_appensyn_open(e) e->ctrlMask |= _LEX_APPENSYN_MASK
|
||||
#define lex_appensyn_close(e) e->ctrlMask &= ~_LEX_APPENSYN_MASK
|
||||
#define lex_appensyn_check(e) ((e->ctrlMask & _LEX_APPENSYN_MASK) != 0)
|
||||
typedef struct {
|
||||
/*
|
||||
* the type of the lexicon item.
|
||||
* available value is all the elements in friso_lex_t enum.
|
||||
* and if it is __LEX_OTHER_WORDS__, we need to free it after use it.
|
||||
*/
|
||||
uchar_t length; //the length of the token.(after the convertor of Friso.)
|
||||
uchar_t rlen; //the real length of the token.(before any convert)
|
||||
uchar_t type;
|
||||
uchar_t ctrlMask; //function control mask, like append the synoyums words.
|
||||
uint_t offset; //offset index.
|
||||
fstring word;
|
||||
//fstring py; //pinyin of the word.(invalid)
|
||||
friso_array_t syn; //synoyums words.
|
||||
friso_array_t pos; //part of speech.
|
||||
uint_t fre; //single word frequency.
|
||||
} lex_entry_cdt;
|
||||
typedef lex_entry_cdt * lex_entry_t;
|
||||
|
||||
|
||||
/*the segmentation token entry.*/
|
||||
#define __HITS_WORD_LENGTH__ 64
|
||||
|
||||
typedef struct {
|
||||
uchar_t type; //type of the word. (item of friso_lex_t)
|
||||
uchar_t length; //length of the token.
|
||||
uchar_t rlen; //the real length of the token.(in orgin string)
|
||||
char pos; //part of speech.
|
||||
int offset; //start offset of the word.
|
||||
char word[__HITS_WORD_LENGTH__];
|
||||
//char py[0];
|
||||
} friso_token_entry;
|
||||
typedef friso_token_entry * friso_token_t;
|
||||
|
||||
|
||||
/*
|
||||
* Type: friso_task_entry
|
||||
* This type used to represent the current segmentation content.
|
||||
* like the text to split, and the current index, token buffer eg....
|
||||
*/
|
||||
//action control mask for #FRISO_TASK_T#.
|
||||
#define _TASK_CHECK_CF_MASK (1 << 0) //Wether to check the chinese fraction.
|
||||
#define _TASK_START_SS_MASK (1 << 1) //Wether to start the secondary segmentation.
|
||||
#define task_ssseg_open(task) task->ctrlMask |= _TASK_START_SS_MASK
|
||||
#define task_ssseg_close(task) task->ctrlMask &= ~_TASK_START_SS_MASK
|
||||
#define task_ssseg_check(task) ((task->ctrlMask & _TASK_START_SS_MASK) != 0)
|
||||
typedef struct {
|
||||
fstring text; //text to tokenize
|
||||
uint_t idx; //start offset index.
|
||||
uint_t length; //length of the text.
|
||||
uint_t bytes; //latest word bytes in C.
|
||||
uint_t unicode; //latest word unicode number.
|
||||
uint_t ctrlMask; //action control mask.
|
||||
friso_link_t pool; //task pool.
|
||||
string_buffer_t sbuf; //string buffer.
|
||||
friso_token_t token; //token result token;
|
||||
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
|
||||
} friso_task_entry;
|
||||
typedef friso_task_entry * friso_task_t;
|
||||
|
||||
|
||||
/* task configuration entry.*/
|
||||
#define _FRISO_KEEP_PUNC_LEN 13
|
||||
#define friso_en_kpunc(config, ch) (strchr(config->kpuncs, ch) != 0)
|
||||
//typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t );
|
||||
//typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t );
|
||||
struct friso_config_struct {
|
||||
ushort_t max_len; //the max match length (4 - 7).
|
||||
ushort_t r_name; //1 for open chinese name recognition 0 for close it.
|
||||
ushort_t mix_len; //the max length for the CJK words in a mix string.
|
||||
ushort_t lna_len; //the max length for the chinese last name adron.
|
||||
ushort_t add_syn; //append synonyms tokenizer words.
|
||||
ushort_t clr_stw; //clear the stopwords.
|
||||
ushort_t keep_urec; //keep the unrecongnized words.
|
||||
ushort_t spx_out; //use sphinx output customize.
|
||||
ushort_t en_sseg; //start the secondary segmentation.
|
||||
ushort_t st_minl; //min length of the secondary segmentation token.
|
||||
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
|
||||
friso_mode_t mode; //Complex mode or simple mode
|
||||
|
||||
//pointer to the function to get the next token
|
||||
friso_token_t (*next_token)(friso_t, struct friso_config_struct *, friso_task_t);
|
||||
//pointer to the function to get the next cjk lex_entry_t
|
||||
lex_entry_t (*next_cjk)(friso_t, struct friso_config_struct *, friso_task_t);
|
||||
|
||||
char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer.
|
||||
};
|
||||
typedef struct friso_config_struct friso_config_entry;
|
||||
typedef friso_config_entry * friso_config_t;
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* Function: friso_new;
|
||||
* Usage: vars = friso_new( void );
|
||||
* --------------------------------
|
||||
* This function used to create a new empty friso friso_t;
|
||||
* with default value.
|
||||
*/
|
||||
FRISO_API friso_t friso_new(void);
|
||||
|
||||
//create a friso entry with default values from a configuration file.
|
||||
//@return 1 on success and 0 on failure.
|
||||
FRISO_API int friso_init_from_ifile(friso_t, friso_config_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: friso_free;
|
||||
* Usage: friso_free( vars );
|
||||
* --------------------------
|
||||
* This function is used to free the allocation of the given vars.
|
||||
*/
|
||||
FRISO_API void friso_free(friso_t);
|
||||
|
||||
/*
|
||||
* Function: friso_set_dic
|
||||
* Usage: friso_set_dic( vars, dic );
|
||||
* ----------------------------------------
|
||||
* This function is used to set the dictionary for friso.
|
||||
* and friso_dic_t is the pointer of a hash table array.
|
||||
*/
|
||||
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
|
||||
#define friso_set_dic(friso, dic)\
|
||||
do {\
|
||||
friso->dic = dic;\
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Function: friso_set_mode
|
||||
* Usage: friso_set_mode( vars, mode );
|
||||
* ------------------------------------
|
||||
* This function is used to set the mode(complex or simple) that you want to friso to use.
|
||||
*/
|
||||
FRISO_API void friso_set_mode(friso_config_t, friso_mode_t);
|
||||
|
||||
/*create a new friso configuration entry and initialize
|
||||
it with the default value.*/
|
||||
FRISO_API friso_config_t friso_new_config(void);
|
||||
|
||||
//initialize the specified friso config entry with default value.
|
||||
FRISO_API void friso_init_config(friso_config_t);
|
||||
|
||||
//free the specified friso configuration entry.
|
||||
//FRISO_API void friso_free_config( friso_config_t );
|
||||
#define friso_free_config(cfg) FRISO_FREE(cfg)
|
||||
|
||||
/*
|
||||
* Function: friso_new_task;
|
||||
* Usage: segment = friso_new_task( void );
|
||||
* ----------------------------------------
|
||||
* This function is used to create a new friso segment type;
|
||||
*/
|
||||
FRISO_API friso_task_t friso_new_task(void);
|
||||
|
||||
/*
|
||||
* Function: friso_free_task;
|
||||
* Usage: friso_free_task( task );
|
||||
* -------------------------------
|
||||
* This function is used to free the allocation of function friso_new_task();
|
||||
*/
|
||||
FRISO_API void friso_free_task(friso_task_t);
|
||||
|
||||
//create a new friso token
|
||||
FRISO_API friso_token_t friso_new_token(void);
|
||||
|
||||
//free the given friso token
|
||||
//FRISO_API void friso_free_token( friso_token_t );
|
||||
#define friso_free_token(token) FRISO_FREE(token)
|
||||
|
||||
/*
|
||||
* Function: friso_set_text
|
||||
* Usage: friso_set_text( task, text );
|
||||
* ------------------------------------
|
||||
* This function is used to set the text that is going to segment.
|
||||
*/
|
||||
FRISO_API void friso_set_text(friso_task_t, fstring);
|
||||
|
||||
|
||||
//get the next cjk word with mmseg simple mode
|
||||
FRISO_API lex_entry_t next_simple_cjk(friso_t, friso_config_t, friso_task_t);
|
||||
|
||||
//get the next cjk word with mmseg complex mode(mmseg core algorithm)
|
||||
FRISO_API lex_entry_t next_complex_cjk(friso_t, friso_config_t, friso_task_t);
|
||||
|
||||
/*
|
||||
* Function: next_mmseg_token
|
||||
* Usage: word = next_mmseg_token( vars, seg );
|
||||
* --------------------------------------
|
||||
* This function is used to get next word that friso segmented
|
||||
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
|
||||
*/
|
||||
FRISO_API friso_token_t next_mmseg_token(friso_t, friso_config_t, friso_task_t);
|
||||
|
||||
//__FRISO_DETECT_MODE__
|
||||
FRISO_API friso_token_t next_detect_token(friso_t, friso_config_t, friso_task_t);
|
||||
/* }}} friso main interface define :: end*/
|
||||
|
||||
/* {{{ lexicon interface define :: start*/
|
||||
|
||||
/*
|
||||
* Function: friso_dic_new
|
||||
* Usage: dic = friso_dic_new();
|
||||
* -----------------------------
|
||||
* This function used to create a new dictionary.(memory allocation).
|
||||
*/
|
||||
FRISO_API friso_dic_t friso_dic_new(void);
|
||||
|
||||
FRISO_API fstring file_get_line(fstring, FILE *);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_free
|
||||
* Usage: friso_dic_free( dic );
|
||||
* ------------------------------
|
||||
* This function is used to free all the allocation of friso_dic_new.
|
||||
*/
|
||||
FRISO_API void friso_dic_free(friso_dic_t);
|
||||
|
||||
//create a new lexicon entry.
|
||||
FRISO_API lex_entry_t new_lex_entry(fstring, friso_array_t, uint_t, uint_t, uint_t);
|
||||
|
||||
//free the given lexicon entry.
|
||||
//free all the allocations that its synonyms word's items pointed to
|
||||
//when the second arguments is 1
|
||||
FRISO_API void free_lex_entry_full(lex_entry_t);
|
||||
FRISO_API void free_lex_entry(lex_entry_t);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_load
|
||||
* Usage: friso_dic_load( friso, friso_lex_t, path, length );
|
||||
* --------------------------------------------------
|
||||
* This function is used to load dictionary from a given path.
|
||||
* no length limit when length less than 0.
|
||||
*/
|
||||
FRISO_API void friso_dic_load(friso_t, friso_config_t,
|
||||
friso_lex_t, fstring, uint_t);
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the conf file.
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile(friso_t, friso_config_t, fstring, uint_t);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_add
|
||||
* Usage: friso_dic_add( dic, friso_lex_t, word, syn );
|
||||
* ----------------------------------------------
|
||||
* This function used to put new word into the dictionary.
|
||||
*/
|
||||
FRISO_API void friso_dic_add(friso_dic_t, friso_lex_t, fstring, friso_array_t);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_add_with_fre
|
||||
* Usage: friso_dic_add_with_fre( dic, friso_lex_t, word, syn, fre );
|
||||
* -------------------------------------------------------------------
|
||||
* This function is used to put a new word with frequency into the dictionary.
|
||||
*/
|
||||
FRISO_API void friso_dic_add_with_fre(friso_dic_t, friso_lex_t, fstring, friso_array_t, uint_t);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_match
|
||||
* Usage: result = friso_dic_match( dic, friso_lex_t, word );
|
||||
* ----------------------------------------------------
|
||||
* This function is used to check the given word is in the dictionary or not.
|
||||
*/
|
||||
FRISO_API int friso_dic_match(friso_dic_t, friso_lex_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: friso_dic_get
|
||||
* Usage: friso_dic_get( dic, friso_lex_t, word );
|
||||
* -----------------------------------------
|
||||
* This function is used to search the specified lex_entry_t.
|
||||
*/
|
||||
FRISO_API lex_entry_t friso_dic_get(friso_dic_t, friso_lex_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: friso_spec_dic_size
|
||||
* Usage: friso_spec_dic_size( dic, friso_lex_t )
|
||||
* This function is used to get the size of the dictionary with a specified type.
|
||||
*/
|
||||
FRISO_API uint_t friso_spec_dic_size(friso_dic_t, friso_lex_t);
|
||||
FRISO_API uint_t friso_all_dic_size(friso_dic_t);
|
||||
/* }}} lexicon interface define :: end*/
|
||||
|
||||
#endif /*end ifndef*/
|
|
@ -1,412 +0,0 @@
|
|||
/*
|
||||
* friso ADT application interface header source file.
|
||||
* 1. string buffer interface.
|
||||
* 2. hashmap interface.
|
||||
* 3. dynamic array interface.
|
||||
* 4. double link list interface.
|
||||
*
|
||||
* @author chenxin <chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#ifndef _friso_api_h
|
||||
#define _friso_api_h
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
//yat, just take it as this way, 99 percent you will find no problem
|
||||
#if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) )
|
||||
# define FRISO_WINNT
|
||||
#else
|
||||
# define FRISO_LINUX
|
||||
#endif
|
||||
|
||||
#ifdef FRISO_WINNT
|
||||
# define FRISO_API extern __declspec(dllexport)
|
||||
# define __STATIC_API__ static
|
||||
#else
|
||||
/*platform shared library statement :: unix*/
|
||||
# define FRISO_API extern
|
||||
# define __STATIC_API__ static inline
|
||||
#endif
|
||||
|
||||
/*
 * Report an allocation failure on stdout and terminate the process with
 * exit status 1.
 *
 * Both statements live in ONE compound statement, so an unbraced
 * `if (p == NULL) ___ALLOCATION_ERROR___` guards the exit() as well
 * (as two bare statements the exit() ran unconditionally in that form).
 * The macro can still be written with or without a trailing semicolon.
 */
#define ___ALLOCATION_ERROR___ \
    { \
        printf("Unable to do the memory allocation, program will now exit\n" ); \
        exit(1); \
    }
|
||||
|
||||
#define print(str) printf("%s", str )
|
||||
#define println(str) printf("%s\n", str )
|
||||
|
||||
/*
|
||||
* memory allocation macro definitions which make it more convenient
|
||||
* to change to use your favorite or a better memory manage library.
|
||||
*/
|
||||
#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks)
|
||||
#define FRISO_MALLOC(_bytes) malloc(_bytes)
|
||||
#define FRISO_FREE(_ptr) free( _ptr )
|
||||
|
||||
typedef unsigned short ushort_t;
|
||||
typedef unsigned char uchar_t;
|
||||
typedef unsigned int uint_t;
|
||||
typedef char * fstring;
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ fstring handle interface define::start. */
|
||||
#define __CHAR_BYTES__ 8
|
||||
#define __BUFFER_DEFAULT_LENGTH__ 16
|
||||
|
||||
typedef struct {
|
||||
fstring buffer;
|
||||
uint_t length;
|
||||
uint_t allocs;
|
||||
} string_buffer_entry;
|
||||
|
||||
typedef string_buffer_entry * string_buffer_t;
|
||||
|
||||
//FRISO_API string_buffer_t new_string_buffer( void );
|
||||
#define new_string_buffer() \
|
||||
new_string_buffer_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
|
||||
FRISO_API string_buffer_t new_string_buffer_with_opacity(uint_t);
|
||||
FRISO_API string_buffer_t new_string_buffer_with_string(fstring str);
|
||||
|
||||
/*
|
||||
* this function will copy the chars that the fstring pointed.
|
||||
* to the buffer.
|
||||
* this may cause the resize action of the buffer.
|
||||
*/
|
||||
FRISO_API void string_buffer_append(string_buffer_t, fstring);
|
||||
FRISO_API void string_buffer_append_char(string_buffer_t, char);
|
||||
|
||||
//insert the given fstring from the specified position.
|
||||
FRISO_API void string_buffer_insert(string_buffer_t, uint_t idx, fstring);
|
||||
|
||||
//remove the char in the specified position.
|
||||
FRISO_API fstring string_buffer_remove(string_buffer_t, uint_t idx, uint_t);
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim(string_buffer_t);
|
||||
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* the string_buffer_t->buffer, we return it to you, if there is
|
||||
* a need you can free it yourself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote(string_buffer_t);
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear(string_buffer_t);
|
||||
|
||||
//free the fstring buffer include the buffer.
|
||||
FRISO_API void free_string_buffer(string_buffer_t);
|
||||
|
||||
/**
|
||||
* fstring specified chars tokenizer functions
|
||||
*
|
||||
* @date 2013-06-08
|
||||
*/
|
||||
typedef struct {
|
||||
fstring source;
|
||||
uint_t srcLen;
|
||||
fstring delimiter;
|
||||
uint_t delLen;
|
||||
uint_t idx;
|
||||
} string_split_entry;
|
||||
typedef string_split_entry * string_split_t;
|
||||
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split(fstring, fstring);
|
||||
|
||||
FRISO_API void string_split_reset(string_split_t, fstring, fstring);
|
||||
|
||||
FRISO_API void string_split_set_source(string_split_t, fstring);
|
||||
|
||||
FRISO_API void string_split_set_delimiter(string_split_t, fstring);
|
||||
|
||||
FRISO_API void free_string_split(string_split_t);
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next(string_split_t, fstring);
|
||||
/* }}} */
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ dynamic array interface define::start*/
|
||||
#define __DEFAULT_ARRAY_LIST_OPACITY__ 8
|
||||
|
||||
/*friso array list entry struct*/
|
||||
typedef struct {
|
||||
void **items;
|
||||
uint_t allocs;
|
||||
uint_t length;
|
||||
} friso_array_entry;
|
||||
|
||||
typedef friso_array_entry * friso_array_t;
|
||||
|
||||
//create a new friso dynamic array.
|
||||
//FRISO_API friso_array_t new_array_list( void );
|
||||
#define new_array_list() new_array_list_with_opacity(__DEFAULT_ARRAY_LIST_OPACITY__)
|
||||
|
||||
//create a new friso dynamic array with the given opacity
|
||||
FRISO_API friso_array_t new_array_list_with_opacity(uint_t);
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items array, but never what the items point to.
|
||||
*/
|
||||
FRISO_API void free_array_list(friso_array_t);
|
||||
|
||||
//add a new item to the array.
|
||||
FRISO_API void array_list_add(friso_array_t, void *);
|
||||
|
||||
//insert a new item at a specified position.
|
||||
FRISO_API void array_list_insert(friso_array_t, uint_t, void *);
|
||||
|
||||
//get a item at a specified position.
|
||||
FRISO_API void *array_list_get(friso_array_t, uint_t);
|
||||
|
||||
/*
|
||||
* set the item at a specified position.
|
||||
* this will return the old value.
|
||||
*/
|
||||
FRISO_API void *array_list_set(friso_array_t, uint_t, void *);
|
||||
|
||||
/*
|
||||
* remove the given item at a specified position.
|
||||
* this will return the value of the removed item.
|
||||
*/
|
||||
FRISO_API void *array_list_remove(friso_array_t, uint_t);
|
||||
|
||||
/*trim the array list for final use.*/
|
||||
FRISO_API friso_array_t array_list_trim(friso_array_t);
|
||||
|
||||
/*
|
||||
* clear the array list.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
*/
|
||||
FRISO_API friso_array_t array_list_clear(friso_array_t);
|
||||
|
||||
//return the size of the array.
|
||||
//FRISO_API uint_t array_list_size( friso_array_t );
|
||||
#define array_list_size( array ) array->length
|
||||
|
||||
//return the allocations of the array.
|
||||
//FRISO_API uint_t array_list_allocs( friso_array_t );
|
||||
#define array_list_allocs( array ) array->allocs
|
||||
|
||||
//check if the array is empty.
|
||||
//FRISO_API int array_list_empty( friso_array_t );
|
||||
#define array_list_empty( array ) ( array->length == 0 )
|
||||
/* }}} dynamic array interface define::end*/
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ link list interface define::start*/
|
||||
struct friso_link_node {
|
||||
void *value;
|
||||
struct friso_link_node *prev;
|
||||
struct friso_link_node *next;
|
||||
};
|
||||
typedef struct friso_link_node link_node_entry;
|
||||
typedef link_node_entry * link_node_t;
|
||||
|
||||
/*
|
||||
* link list adt
|
||||
*/
|
||||
typedef struct {
|
||||
link_node_t head;
|
||||
link_node_t tail;
|
||||
uint_t size;
|
||||
} friso_link_entry;
|
||||
|
||||
typedef friso_link_entry * friso_link_t;
|
||||
|
||||
//create a new link list
|
||||
FRISO_API friso_link_t new_link_list(void);
|
||||
|
||||
//free the specified link list
|
||||
FRISO_API void free_link_list(friso_link_t);
|
||||
|
||||
//return the size of the current link list.
|
||||
//FRISO_API uint_t link_list_size( friso_link_t );
|
||||
#define link_list_size( link ) link->size
|
||||
|
||||
//check the given link is empty or not.
|
||||
//FRISO_API int link_list_empty( friso_link_t );
|
||||
#define link_list_empty( link ) (link->size == 0)
|
||||
|
||||
//clear all the nodes in the link list( except the head and the tail ).
|
||||
FRISO_API friso_link_t link_list_clear(friso_link_t link);
|
||||
|
||||
//add a new node to the link list.(append from the tail)
|
||||
FRISO_API void link_list_add(friso_link_t, void *);
|
||||
|
||||
//add a new node before the specified node
|
||||
FRISO_API void link_list_insert_before(friso_link_t, uint_t, void *);
|
||||
|
||||
//get the node in the current index.
|
||||
FRISO_API void *link_list_get(friso_link_t, uint_t);
|
||||
|
||||
//modify the node in the current index.
|
||||
FRISO_API void *link_list_set(friso_link_t, uint_t, void *);
|
||||
|
||||
//remove the specified link node
|
||||
FRISO_API void *link_list_remove(friso_link_t, uint_t);
|
||||
|
||||
//remove the given node
|
||||
FRISO_API void *link_list_remove_node(friso_link_t, link_node_t);
|
||||
|
||||
//remove the first node of the link list.
|
||||
FRISO_API void *link_list_remove_first(friso_link_t);
|
||||
|
||||
//remove the last node from the link list
|
||||
FRISO_API void *link_list_remove_last(friso_link_t);
|
||||
|
||||
//append a node from the end.
|
||||
FRISO_API void link_list_add_last(friso_link_t, void *);
|
||||
|
||||
//add a node at the beginning of the link list.
|
||||
FRISO_API void link_list_add_first(friso_link_t, void *);
|
||||
/* }}} link list interface define::end*/
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ hashtable interface define :: start*/
|
||||
struct hash_entry {
|
||||
fstring _key; //the node key
|
||||
void * _val; //the node value
|
||||
struct hash_entry * _next;
|
||||
};
|
||||
typedef struct hash_entry friso_hash_entry;
|
||||
typedef friso_hash_entry * hash_entry_t;
|
||||
typedef void (*fhash_callback_fn_t)(hash_entry_t);
|
||||
|
||||
typedef struct {
|
||||
uint_t length;
|
||||
uint_t size;
|
||||
float factor;
|
||||
uint_t threshold;
|
||||
hash_entry_t *table;
|
||||
} friso_hash_cdt;
|
||||
|
||||
typedef friso_hash_cdt * friso_hash_t;
|
||||
|
||||
//default value for friso_hash_cdt
|
||||
#define DEFAULT_LENGTH 31
|
||||
#define DEFAULT_FACTOR 0.85f
|
||||
|
||||
/*
|
||||
* Function: new_hash_table
|
||||
* Usage: table = new_hash_table();
|
||||
* --------------------------------
|
||||
* this function allocates a new symbol table with no entries.
|
||||
*/
|
||||
FRISO_API friso_hash_t new_hash_table(void);
|
||||
|
||||
/*
|
||||
* Function: free_hash_table
|
||||
* Usage: free_hash_table( table );
|
||||
* --------------------------------------
|
||||
* this function will free all the allocation for memory.
|
||||
*/
|
||||
FRISO_API void free_hash_table(friso_hash_t, fhash_callback_fn_t);
|
||||
|
||||
/*
|
||||
* Function: hash_put_mapping
|
||||
* Usage: hash_put_mapping( table, key, value );
|
||||
* ----------------------------------------
|
||||
* the function associates the specified key with the given value.
|
||||
*/
|
||||
FRISO_API void *hash_put_mapping(friso_hash_t, fstring, void *);
|
||||
|
||||
/*
|
||||
* Function: hash_exist_mapping
|
||||
* Usage: bool = hash_exist_mapping( table, key );
|
||||
* ----------------------------------------------
|
||||
* this function check the given key mapping is exists or not.
|
||||
*/
|
||||
FRISO_API int hash_exist_mapping(friso_hash_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: hash_get_value
|
||||
* Usage: value = hash_get_value( table, key );
|
||||
* -----------------------------------------------
|
||||
* this function return the value associated with the given key.
|
||||
* UNDEFINED will be return if the mapping is not exists.
|
||||
*/
|
||||
FRISO_API void * hash_get_value(friso_hash_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: hash_remove_mapping
|
||||
* Usage: hash_remove_mapping( table, key );
|
||||
* ------------------------------------
|
||||
* This function is used to remove the mapping associated with the given key.
|
||||
*/
|
||||
FRISO_API hash_entry_t hash_remove_mapping(friso_hash_t, fstring);
|
||||
|
||||
/*
|
||||
* Function: get_table_size
|
||||
* Usage: size = get_table_size( table );
|
||||
* --------------------------------------
|
||||
* This function is used to count the size of the specified table.
|
||||
*/
|
||||
//FRISO_API uint_t hash_get_size( friso_hash_t );
|
||||
#define hash_get_size( hash ) hash->size
|
||||
/* }}} hashtable interface define :: end*/
|
||||
|
||||
|
||||
|
||||
|
||||
/* {{{ utf8 string interface define :: start*/
|
||||
|
||||
/*
|
||||
* Function: get_utf8_bytes
|
||||
*
|
||||
* */
|
||||
FRISO_API int get_utf8_bytes(char);
|
||||
|
||||
|
||||
/*
|
||||
* Function: get_utf8_unicode
|
||||
*
|
||||
* */
|
||||
FRISO_API int get_utf8_unicode(const fstring);
|
||||
|
||||
|
||||
/*
|
||||
* Function: unicode_to_utf8
|
||||
*
|
||||
* */
|
||||
FRISO_API int unicode_to_utf8(uint_t, fstring);
|
||||
|
||||
|
||||
/* }}} utf8 string interface define :: end*/
|
||||
|
||||
|
||||
#endif /*end ifndef*/
|
|
@ -1,283 +0,0 @@
|
|||
/**
|
||||
* Friso GBK serial functions implementation source file.
|
||||
* @package src/friso_GBK.c .
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
#include "friso_ctype.h"
|
||||
|
||||
/* read the next GBK word from the specified position.
|
||||
*
|
||||
* @return int the bytes of the current readed word.
|
||||
*/
|
||||
FRISO_API int gbk_next_word(
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word) {
|
||||
int c;
|
||||
if(*idx >= task->length) return 0;
|
||||
|
||||
c = (uchar_t)task->text[*idx];
|
||||
if(c <= 0x80) {
|
||||
task->bytes = 1;
|
||||
} else {
|
||||
task->bytes = 2;
|
||||
}
|
||||
|
||||
//copy the word to the buffer.
|
||||
memcpy(__word, task->text + (*idx), task->bytes);
|
||||
(*idx) += task->bytes;
|
||||
__word[task->bytes] = '\0';
|
||||
|
||||
return task->bytes;
|
||||
}
|
||||
|
||||
//get the bytes of a gbk char.
|
||||
//FRISO_API int get_gbk_bytes( char c )
|
||||
//{
|
||||
// return 1;
|
||||
//}
|
||||
|
||||
//check if the given buffer is a gbk word (ANSII string).
|
||||
// included the simplified and traditional words.
|
||||
FRISO_API int gbk_cn_string(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
//GBK/2: gb2312 chinese word.
|
||||
return (((c1 >= 0xb0 && c1 <= 0xf7)
|
||||
&& (c2 >= 0xa1 && c2 <= 0xfe))
|
||||
//GBK/3: extend chinese words.
|
||||
|| ((c1 >= 0x81 && c1 <= 0xa0)
|
||||
&& ((c2 >= 0x40 && c2 <= 0x7e)
|
||||
|| (c2 >= 0x80 && c2 <= 0xfe)))
|
||||
//GBK/4: extend chinese words.
|
||||
|| ((c1 >= 0xaa && c1 <= 0xfe)
|
||||
&& ((c2 >= 0x40 && c2 <= 0xfe)
|
||||
|| (c2 >= 0x80 && c2 <= 0xa0))));
|
||||
}
|
||||
|
||||
/*check if the given char is a ASCII letter
|
||||
* include all the arabic number, letters and english puntuations.*/
|
||||
FRISO_API int gbk_halfwidth_en_char(char c) {
|
||||
int u = (uchar_t) c;
|
||||
return (u >= 32 && u <= 126);
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a full-width latain.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width puntuations.
|
||||
*/
|
||||
FRISO_API int gbk_fullwidth_en_char(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
return ((c1 == 0xA3)
|
||||
&& ((c2 >= 0xB0 && c2 <= 0xB9) //arabic numbers.
|
||||
|| (c2 >= 0xC1 && c2 <= 0xDA) //uppercase letters.
|
||||
|| (c2 >= 0xE1 && c2 <= 0xFA))); //lowercase letters.
|
||||
}
|
||||
|
||||
//check if the given char is a upper case english letter.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int gbk_uppercase_letter(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if(c1 <= 0x80) { //half-width
|
||||
return (c1 >= 65 && c1 <= 90);
|
||||
} else { //full-width
|
||||
return (c1 == 0xa3 && (c2 >= 0xc1 && c2 <= 0xda));
|
||||
}
|
||||
}
|
||||
|
||||
//check if the given char is a lower case char.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int gbk_lowercase_letter(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if(c1 <= 0x80) { //half-width
|
||||
return (c1 >= 97 && c1 <= 122);
|
||||
} else { //full-width
|
||||
return (c1 == 0xa3 && (c2 >= 0xe1 && c2 <= 0xfa));
|
||||
}
|
||||
}
|
||||
|
||||
//check if the given char is a arabic numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int gbk_numeric_letter(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if(c1 <= 0x80) { //half-width
|
||||
return (c1 >= 48 && c1 <= 57);
|
||||
} else { //full-width
|
||||
return ((c1 == 0xa3) && (c2 >= 0xb0 && c2 <= 0xb9));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given fstring is make up with numeric chars.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*/
|
||||
FRISO_API int gbk_numeric_string(char *str) {
|
||||
char *s = str;
|
||||
int c1 = 0;
|
||||
int c2 = 0;
|
||||
|
||||
while(*s != '\0') {
|
||||
c1 = (uchar_t)(*s++);
|
||||
if(c1 <= 0x80) { //half-width
|
||||
if(c1 < 48 || c2 > 57) return 0;
|
||||
} else { //full-width
|
||||
if(c1 != 0xa3) return 0;
|
||||
c2 = (uchar_t)(*s++);
|
||||
if(c2 < 0xb0 || c2 > 0xb9) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
FRISO_API int gbk_decimal_string(char *str) {
|
||||
int c1 = 0;
|
||||
int c2 = 0;
|
||||
int len = strlen(str), i, p = 0;
|
||||
|
||||
//point header check.
|
||||
if(str[0] == '.' || str[len - 1] == '.') return 0;
|
||||
|
||||
for(i = 0; i < len;) {
|
||||
c1 = (uchar_t) str[i++];
|
||||
//count the number of the points.
|
||||
if(c1 == 46) {
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if(c1 <= 0x80) { //half-width
|
||||
if(c1 < 48 || c1 > 57) return 0;
|
||||
} else { //full-width
|
||||
if(c1 != 0xa3) return 0;
|
||||
c2 = (uchar_t) str[i++];
|
||||
if(c2 < 0xb0 || c2 > 0xb9) return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return (p == 1);
|
||||
}
|
||||
|
||||
//check if the given char is a english(ASCII) letter.
|
||||
// (full-width and half-width), not the punctuation/arabic of course.
|
||||
FRISO_API int gbk_en_letter(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if(c1 <= 0x80) {
|
||||
return ((c1 >= 65 && c1 <= 90) //lowercase
|
||||
|| (c1 >= 97 && c1 <= 122)); //uppercase
|
||||
} else {
|
||||
return ((c1 == 0xa3)
|
||||
&& ((c2 >= 0xc1 && c2 <= 0xda) //lowercase
|
||||
|| (c2 >= 0xe1 && c2 <= 0xfa))); //uppercase
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the given char is a whitespace or not.
|
||||
// included full-width and half-width whitespace.
|
||||
FRISO_API int gbk_whitespace(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
if(c1 <= 0x80) {
|
||||
return (c1 == 32);
|
||||
} else {
|
||||
return (c1 == 0xa3 && c2 == 0xa0);
|
||||
}
|
||||
}
|
||||
|
||||
/* check if the given char is a letter number like 'ⅠⅡ'
|
||||
*/
|
||||
FRISO_API int gbk_letter_number(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
return ((c1 == 0xa2)
|
||||
&& ((c2 >= 0xa1 && c2 <= 0xb0) //lowercase
|
||||
|| (c2 >= 0xf0 && c2 <= 0xfe))); //uppercase
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a other number like '①⑩⑽㈩'
|
||||
*/
|
||||
FRISO_API int gbk_other_number(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
return ((c1 == 0xa2) && (c2 >= 0xc5 && c2 <= 0xee));
|
||||
}
|
||||
|
||||
//check if the given char is a english punctuation.
|
||||
FRISO_API int gbk_en_punctuation(char c) {
|
||||
int u = (uchar_t) c;
|
||||
return ((u > 32 && u < 48)
|
||||
|| (u > 57 && u < 65)
|
||||
|| (u > 90 && u < 97)
|
||||
|| (u > 122 && u < 127));
|
||||
}
|
||||
|
||||
//check the given char is a chinese punctuation.
|
||||
FRISO_API int gbk_cn_punctuation(char *str) {
|
||||
int c1 = (uchar_t) str[0];
|
||||
int c2 = (uchar_t) str[1];
|
||||
//full-width en punctuation.
|
||||
return ((c1 == 0xa3 && ((c2 >= 0xa1 && c2 <= 0xaf)
|
||||
|| (c2 >= 0xba && c2 <= 0xc0)
|
||||
|| (c2 >= 0xdb && c2 <= 0xe0)
|
||||
|| (c2 >= 0xfb && c2 <= 0xfe)))
|
||||
//chinese punctuation.
|
||||
|| (c1 == 0xa1 && ((c2 >= 0xa1 && c2 <= 0xae)
|
||||
|| (c2 >= 0xb0 && c2 <= 0xbf)))
|
||||
//A6 area special punctuations:" "
|
||||
|| (c1 == 0xa6 && (c2 >= 0xf9 && c2 <= 0xfe))
|
||||
//A8 area special punctuations: " ˊˋ˙–―‥‵℅ "
|
||||
|| (c1 == 0xa8 && (c2 >= 0x40 && c2 <= 0x47)));
|
||||
}
|
||||
|
||||
/* {{{
|
||||
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
|
||||
*/
|
||||
//cause it it the same as utf-8, we use utf8's interface instead.
|
||||
//@see the friso_ctype.h#gbk_keep_punctuation macro defined.
|
||||
|
||||
//static friso_hash_t __keep_punctuations_hash__ = NULL;
|
||||
|
||||
/* @Deprecated
|
||||
* check the given char is an english keep punctuation.*/
|
||||
//FRISO_API int gbk_keep_punctuation( char *str )
|
||||
//{
|
||||
// if ( __keep_punctuations_hash__ == NULL ) {
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// }
|
||||
// //check the hash.
|
||||
// return hash_exist_mapping( __keep_punctuations_hash__, str );
|
||||
//}
|
||||
/* }}} */
|
||||
|
||||
//check if the given english char is a full-width char or not.
|
||||
//FRISO_API int gbk_fullwidth_char( char *str )
|
||||
//{
|
||||
// return 1;
|
||||
//}
|
|
@ -1,467 +0,0 @@
|
|||
/**
|
||||
* Friso utf8 serial function implementation source file.
|
||||
* @package src/friso_UTF8.c .
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
#include "friso_ctype.h"
|
||||
|
||||
/* read the next utf-8 word from the specified position.
|
||||
*
|
||||
* @return int the bytes of the current readed word.
|
||||
*/
|
||||
FRISO_API int utf8_next_word(
|
||||
friso_task_t task,
|
||||
uint_t *idx,
|
||||
fstring __word) {
|
||||
if(*idx >= task->length) return 0;
|
||||
|
||||
//register uint_t t;
|
||||
task->bytes = get_utf8_bytes(task->text[ *idx ]);
|
||||
|
||||
//for ( t = 0; t < task->bytes; t++ ) {
|
||||
// __word[t] = task->text[ (*idx)++ ];
|
||||
//}
|
||||
|
||||
//change the loop to memcpy.
|
||||
//it is more efficient.
|
||||
//@date 2013-09-04
|
||||
memcpy(__word, task->text + (*idx), task->bytes);
|
||||
(*idx) += task->bytes;
|
||||
__word[task->bytes] = '\0';
|
||||
|
||||
//the unicode counter was moved here from version 1.6.0
|
||||
task->unicode = get_utf8_unicode(__word);
|
||||
|
||||
return task->bytes;
|
||||
}
|
||||
|
||||
/*
|
||||
* print a character in a binary style.
|
||||
*
|
||||
* @param int
|
||||
*/
|
||||
FRISO_API void print_char_binary(char value) {
|
||||
register uint_t t;
|
||||
|
||||
for(t = 0; t < __CHAR_BYTES__; t++) {
|
||||
if((value & 0x80) == 0x80) {
|
||||
printf("1");
|
||||
} else {
|
||||
printf("0");
|
||||
}
|
||||
value <<= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the bytes of a utf-8 char.
|
||||
* between 1 - 6.
|
||||
*
|
||||
* @param __char
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int get_utf8_bytes(char value) {
|
||||
register uint_t t = 0;
|
||||
|
||||
//one byte ascii char.
|
||||
if((value & 0x80) == 0) return 1;
|
||||
for(; (value & 0x80) != 0; value <<= 1) {
|
||||
t++;
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
/*
|
||||
* get the unicode serial of a utf-8 char.
|
||||
*
|
||||
* @param ch
|
||||
* @return int.
|
||||
*/
|
||||
FRISO_API int get_utf8_unicode(const fstring ch) {
|
||||
int code = 0, bytes = get_utf8_bytes(*ch);
|
||||
register uchar_t *bit = (uchar_t *) &code;
|
||||
register char b1, b2, b3;
|
||||
|
||||
switch(bytes) {
|
||||
case 1:
|
||||
*bit = *ch;
|
||||
break;
|
||||
case 2:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
|
||||
*bit = (b1 << 6) + (b2 & 0x3F);
|
||||
*(bit + 1) = (b1 >> 2) & 0x07;
|
||||
break;
|
||||
case 3:
|
||||
b1 = *ch;
|
||||
b2 = *(ch + 1);
|
||||
b3 = *(ch + 2);
|
||||
|
||||
*bit = (b2 << 6) + (b3 & 0x3F);
|
||||
*(bit + 1) = (b1 << 4) + ((b2 >> 2) & 0x0F);
|
||||
break;
|
||||
//ignore the ones that are larger than 3 bytes;
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
//turn the unicode serial to a utf-8 string.
|
||||
FRISO_API int unicode_to_utf8(uint_t u, fstring __word) {
|
||||
if(u <= 0x0000007F) {
|
||||
//U-00000000 - U-0000007F
|
||||
//0xxxxxxx
|
||||
*__word = (u & 0x7F);
|
||||
return 1;
|
||||
} else if(u >= 0x00000080 && u <= 0x000007FF) {
|
||||
//U-00000080 - U-000007FF
|
||||
//110xxxxx 10xxxxxx
|
||||
*(__word + 1) = (u & 0x3F) | 0x80;
|
||||
*__word = ((u >> 6) & 0x1F) | 0xC0;
|
||||
return 2;
|
||||
} else if(u >= 0x00000800 && u <= 0x0000FFFF) {
|
||||
//U-00000800 - U-0000FFFF
|
||||
//1110xxxx 10xxxxxx 10xxxxxx
|
||||
*(__word + 2) = (u & 0x3F) | 0x80;
|
||||
*(__word + 1) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 12) & 0x0F) | 0xE0;
|
||||
return 3;
|
||||
} else if(u >= 0x00010000 && u <= 0x001FFFFF) {
|
||||
//U-00010000 - U-001FFFFF
|
||||
//11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*(__word + 3) = (u & 0x3F) | 0x80;
|
||||
*(__word + 2) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*(__word + 1) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 18) & 0x07) | 0xF0;
|
||||
return 4;
|
||||
} else if(u >= 0x00200000 && u <= 0x03FFFFFF) {
|
||||
//U-00200000 - U-03FFFFFF
|
||||
//111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*(__word + 4) = (u & 0x3F) | 0x80;
|
||||
*(__word + 3) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*(__word + 2) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*(__word + 1) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 24) & 0x03) | 0xF8;
|
||||
return 5;
|
||||
} else if(u >= 0x04000000 && u <= 0x7FFFFFFF) {
|
||||
//U-04000000 - U-7FFFFFFF
|
||||
//1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
*(__word + 5) = (u & 0x3F) | 0x80;
|
||||
*(__word + 4) = ((u >> 6) & 0x3F) | 0x80;
|
||||
*(__word + 3) = ((u >> 12) & 0x3F) | 0x80;
|
||||
*(__word + 2) = ((u >> 18) & 0x3F) | 0x80;
|
||||
*(__word + 1) = ((u >> 24) & 0x3F) | 0x80;
|
||||
*__word = ((u >> 30) & 0x01) | 0xFC;
|
||||
return 6;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a CJK char or not.
|
||||
* 2E80-2EFF CJK 部首补充
|
||||
* 2F00-2FDF 康熙字典部首
|
||||
* 3000-303F CJK 符号和标点 --ignore
|
||||
* 31C0-31EF CJK 笔画
|
||||
* 3200-32FF 封闭式 CJK 文字和月份 --ignore.
|
||||
* 3300-33FF CJK 兼容
|
||||
* 3400-4DBF CJK 统一表意符号扩展 A
|
||||
* 4DC0-4DFF 易经六十四卦符号
|
||||
* 4E00-9FBF CJK 统一表意符号
|
||||
* F900-FAFF CJK 兼容象形文字
|
||||
* FE30-FE4F CJK 兼容形式
|
||||
* FF00-FFEF 全角ASCII、全角标点 --ignore (as basic latin)
|
||||
*
|
||||
* Japanese:
|
||||
* 3040-309F 日本平假名
|
||||
* 30A0-30FF 日本片假名
|
||||
* 31F0-31FF 日本片假名拼音扩展
|
||||
*
|
||||
* Korean:
|
||||
* AC00-D7AF 韩文拼音
|
||||
* 1100-11FF 韩文字母
|
||||
* 3130-318F 韩文兼容字母
|
||||
*
|
||||
* @param ch :pointer to the char
|
||||
* @return int : 1 for yes and 0 for not.
|
||||
*/
|
||||
|
||||
//Comment one of the following macro define
|
||||
//to clear the check of the specified language.
|
||||
#define FRISO_CJK_CHK_C
|
||||
//#define FRISO_CJK_CHK_J
|
||||
//#define FRISO_CJK_CHK_K
|
||||
FRISO_API int utf8_cjk_string(uint_t u) {
|
||||
int c = 0, j = 0, k = 0;
|
||||
//Chinese.
|
||||
#ifdef FRISO_CJK_CHK_C
|
||||
c = ((u >= 0x4E00 && u <= 0x9FBF)
|
||||
|| (u >= 0x2E80 && u <= 0x2EFF) || (u >= 0x2F00 && u <= 0x2FDF)
|
||||
|| (u >= 0x31C0 && u <= 0x31EF) //|| ( u >= 0x3200 && u <= 0x32FF )
|
||||
|| (u >= 0x3300 && u <= 0x33FF) //|| ( u >= 0x3400 && u <= 0x4DBF )
|
||||
|| (u >= 0x4DC0 && u <= 0x4DFF) || (u >= 0xF900 && u <= 0xFAFF)
|
||||
|| (u >= 0xFE30 && u <= 0xFE4F));
|
||||
#endif
|
||||
|
||||
//Japanese.
|
||||
#ifdef FRISO_CJK_CHK_J
|
||||
j = ((u >= 0x3040 && u <= 0x309F)
|
||||
|| (u >= 0x30A0 && u <= 0x30FF) || (u >= 0x31F0 && u <= 0x31FF));
|
||||
#endif
|
||||
|
||||
//Korean
|
||||
#ifdef FRISO_CJK_CHK_K
|
||||
k = ((u >= 0xAC00 && u <= 0xD7AF)
|
||||
|| (u >= 0x1100 && u <= 0x11FF) || (u >= 0x3130 && u <= 0x318F));
|
||||
#endif
|
||||
|
||||
return (c || j || k);
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a Basic Latin letter or not.
|
||||
* include all the letters and english punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int 1 for yes and 0 for not.
|
||||
*/
|
||||
FRISO_API int utf8_halfwidth_en_char(uint_t u) {
|
||||
return (u >= 32 && u <= 126);
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a full-width latain or not.
|
||||
* include the full-width arabic numeber, letters.
|
||||
* but not the full-width punctuations.
|
||||
*
|
||||
* @param c
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char(uint_t u) {
|
||||
return ((u >= 65296 && u <= 65305) //arabic number
|
||||
|| (u >= 65313 && u <= 65338) //upper case letters
|
||||
|| (u >= 65345 && u <= 65370)); //lower case letters
|
||||
}
|
||||
|
||||
//check the given char is a upper case letters or not.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int utf8_uppercase_letter(uint_t u) {
|
||||
if(u > 65280) u -= 65248;
|
||||
return (u >= 65 && u <= 90);
|
||||
}
|
||||
|
||||
//check the given char is a upper case letters or not.
|
||||
// included the full-width and half-width letters.
|
||||
FRISO_API int utf8_lowercase_letter(uint_t u) {
|
||||
if(u > 65280) u -= 65248;
|
||||
return (u >= 97 && u <= 122);
|
||||
}
|
||||
|
||||
//check the given char is a numeric
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int utf8_numeric_letter(uint_t u) {
|
||||
if(u > 65280) u -= 65248; //make full-width half-width.
|
||||
return ((u >= 48 && u <= 57));
|
||||
}
|
||||
|
||||
//check the given char is a english letter.(included the full-width)
|
||||
// not the punctuation of course.
|
||||
FRISO_API int utf8_en_letter(uint_t u) {
|
||||
if(u > 65280) u -= 65248;
|
||||
return ((u >= 65 && u <= 90)
|
||||
|| (u >= 97 && u <= 122));
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given fstring is make up with numeric.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*
|
||||
* @param str
|
||||
* @return int
|
||||
* 65296, 0
|
||||
* 65297, 1
|
||||
* 65298, 2
|
||||
* 65299, 3
|
||||
* 65300, 4
|
||||
* 65301, 5
|
||||
* 65302, 6
|
||||
* 65303, 7
|
||||
* 65304, 8
|
||||
* 65305, 9
|
||||
*/
|
||||
FRISO_API int utf8_numeric_string(const fstring str) {
|
||||
fstring s = str;
|
||||
int bytes, u;
|
||||
|
||||
while(*s != '\0') {
|
||||
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
//new implemention.
|
||||
//@date 2013-10-14
|
||||
bytes = 1;
|
||||
if(*s < 0) { //full-width chars.
|
||||
u = get_utf8_unicode(s);
|
||||
bytes = get_utf8_bytes(*s);
|
||||
if(u < 65296 || u > 65305) return 0;
|
||||
} else if(*s < 48 || *s > 57) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
s += bytes;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
FRISO_API int utf8_decimal_string(const fstring str) {
|
||||
int len = strlen(str), i, p = 0;
|
||||
int bytes = 0, u;
|
||||
|
||||
if(str[0] == '.' || str[len - 1] == '.') return 0;
|
||||
|
||||
for(i = 1; i < len; bytes = 1) {
|
||||
//count the number of char '.'
|
||||
if(str[i] == '.') {
|
||||
i++;
|
||||
p++;
|
||||
continue;
|
||||
} else if(str[i] < 0) {
|
||||
//full-width numeric.
|
||||
u = get_utf8_unicode(str + i);
|
||||
bytes = get_utf8_bytes(str[i]);
|
||||
if(u < 65296 || u > 65305) return 0;
|
||||
} else if(str[i] < 48 || str[i] > 57) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
i += bytes;
|
||||
}
|
||||
|
||||
return (p == 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a whitespace or not.
|
||||
*
|
||||
* @param ch
|
||||
* @return int 1 for yes and 0 for not.
|
||||
*/
|
||||
FRISO_API int utf8_whitespace(uint_t u) {
|
||||
if(u == 32 || u == 12288) {
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* check the given char is a english punctuation.
|
||||
*
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_en_punctuation(uint_t u) {
|
||||
//if ( u > 65280 ) u = u - 65248; //make full-width half-width
|
||||
return ((u > 32 && u < 48)
|
||||
|| (u > 57 && u < 65)
|
||||
|| (u > 90 && u < 97) //added @2013-08-31
|
||||
|| (u > 122 && u < 127));
|
||||
}
|
||||
|
||||
/*
|
||||
* check the given char is a chinese punctuation.
|
||||
* @date 2013-08-31 added.
|
||||
*
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_cn_punctuation(uint_t u) {
|
||||
return ((u > 65280 && u < 65296)
|
||||
|| (u > 65305 && u < 65312)
|
||||
|| (u > 65338 && u < 65345)
|
||||
|| (u > 65370 && u < 65382)
|
||||
//cjk symbol and punctuation.(added 2013-09-06)
|
||||
//from http://www.unicode.org/charts/PDF/U3000.pdf
|
||||
|| (u >= 12289 && u <= 12319));
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a letter number in unicode.
|
||||
* like 'ⅠⅡ'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_letter_number(uint_t u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* check if the given char is a other number in unicode.
|
||||
* like '①⑩⑽㈩'.
|
||||
* @param ch
|
||||
* @return int
|
||||
*/
|
||||
FRISO_API int utf8_other_number(uint_t u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//A macro define has replace this.
|
||||
//FRISO_API int is_en_punctuation( char c )
|
||||
//{
|
||||
// return utf8_en_punctuation( (uint_t) c );
|
||||
//}
|
||||
|
||||
/* {{{
|
||||
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
|
||||
*/
|
||||
//static friso_hash_t __keep_punctuations_hash__ = NULL;
|
||||
|
||||
/* @Deprecated
|
||||
* check the given char is an english keep punctuation.*/
|
||||
//FRISO_API int utf8_keep_punctuation( fstring str )
|
||||
//{
|
||||
// if ( __keep_punctuations_hash__ == NULL )
|
||||
// {
|
||||
// __keep_punctuations_hash__ = new_hash_table();
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
|
||||
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
|
||||
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
|
||||
// }
|
||||
// //check the hash.
|
||||
// return hash_exist_mapping( __keep_punctuations_hash__, str );
|
||||
//}
|
||||
/* }}} */
|
||||
|
||||
/*
|
||||
* check the given english char is a full-width char or not.
|
||||
*
|
||||
* @param ch
|
||||
* @return 1 for yes and 0 for not.
|
||||
*/
|
||||
//FRISO_API int utf8_fullwidth_char( uint_t u )
|
||||
//{
|
||||
// if ( u == 12288 )
|
||||
// return 1; //full-width space
|
||||
// //(32 - 126) ascii code
|
||||
// return (u > 65280 && u <= 65406);
|
||||
//}
|
|
@ -1,209 +0,0 @@
|
|||
/*
|
||||
* friso dynamaic Array interface implementation defined in header file "friso_API.h".
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/* ********************************************
|
||||
* friso array list static functions block *
|
||||
**********************************************/
|
||||
/*
 * Allocate a block of __blocks item slots through FRISO_CALLOC and set
 * every slot to NULL. A failed allocation is handed to the
 * ___ALLOCATION_ERROR___ macro.
 *
 * @param __blocks number of slots to allocate
 * @return the new slot block
 */
__STATIC_API__ void **create_array_entries(uint_t __blocks) {
    void **slots = (void **) FRISO_CALLOC(sizeof(void *), __blocks);
    register uint_t pos = 0;

    if(slots == NULL) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    while(pos < __blocks) {
        slots[pos] = NULL;
        pos++;
    }

    return slots;
}
|
||||
|
||||
//resize the array. (the opacity should not be smaller than array->length)
|
||||
__STATIC_API__ friso_array_t resize_array_list(
|
||||
friso_array_t array,
|
||||
uint_t opacity) {
|
||||
register uint_t t;
|
||||
void **block = create_array_entries(opacity);
|
||||
|
||||
for(t = 0; t < array->length ; t++) {
|
||||
block[t] = array->items[t];
|
||||
}
|
||||
|
||||
FRISO_FREE(array->items);
|
||||
array->items = block;
|
||||
array->allocs = opacity;
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
|
||||
/* ********************************************
|
||||
* friso array list FRISO_API functions block *
|
||||
**********************************************/
|
||||
//create a new array list. (A macro define has replace this.)
|
||||
//FRISO_API friso_array_t new_array_list( void ) {
|
||||
// return new_array_list_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
|
||||
//}
|
||||
|
||||
//create a new array list with a given opacity.
|
||||
FRISO_API friso_array_t new_array_list_with_opacity(uint_t opacity) {
|
||||
friso_array_t array = (friso_array_t)
|
||||
FRISO_MALLOC(sizeof(friso_array_entry));
|
||||
if(array == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize
|
||||
array->items = create_array_entries(opacity);
|
||||
array->allocs = opacity;
|
||||
array->length = 0;
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
/*
|
||||
* free the given friso array.
|
||||
* and its items, but never where its items item pointed to .
|
||||
*/
|
||||
FRISO_API void free_array_list(friso_array_t array) {
|
||||
//free the allocation that all the items pointed to
|
||||
//register int t;
|
||||
//if ( flag == 1 ) {
|
||||
// for ( t = 0; t < array->length; t++ ) {
|
||||
// if ( array->items[t] == NULL ) continue;
|
||||
// FRISO_FREE( array->items[t] );
|
||||
// array->items[t] = NULL;
|
||||
// }
|
||||
//}
|
||||
|
||||
FRISO_FREE(array->items);
|
||||
FRISO_FREE(array);
|
||||
}
|
||||
|
||||
//add a new item to the array.
|
||||
FRISO_API void array_list_add(friso_array_t array, void *value) {
|
||||
//check the condition to resize.
|
||||
if(array->length == array->allocs) {
|
||||
resize_array_list(array, array->length * 2 + 1);
|
||||
}
|
||||
array->items[array->length++] = value;
|
||||
}
|
||||
|
||||
//insert a new item at a specified position.
|
||||
FRISO_API void array_list_insert(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void *value) {
|
||||
register uint_t t;
|
||||
|
||||
if(idx <= array->length) {
|
||||
//check the condition to resize the array.
|
||||
if(array->length == array->allocs) {
|
||||
resize_array_list(array, array->length * 2 + 1);
|
||||
}
|
||||
|
||||
//move the elements after idx.
|
||||
//for ( t = idx; t < array->length; t++ ) {
|
||||
// array->items[t+1] = array->items[t];
|
||||
//}
|
||||
for(t = array->length - 1; t >= idx; t--) {
|
||||
array->items[t + 1] = array->items[t];
|
||||
}
|
||||
|
||||
array->items[idx] = value;
|
||||
array->length++;
|
||||
}
|
||||
}
|
||||
|
||||
//get the item at a specified position.
|
||||
FRISO_API void *array_list_get(friso_array_t array, uint_t idx) {
|
||||
if(idx < array->length) {
|
||||
return array->items[idx];
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//set the value of the item at a specified position.
|
||||
//this will return the old value.
|
||||
FRISO_API void * array_list_set(
|
||||
friso_array_t array,
|
||||
uint_t idx,
|
||||
void * value) {
|
||||
void * oval = NULL;
|
||||
if(idx < array->length) {
|
||||
oval = array->items[idx];
|
||||
array->items[idx] = value;
|
||||
}
|
||||
return oval;
|
||||
}
|
||||
|
||||
//remove the item at a specified position.
|
||||
//this will return the value of the removed item.
|
||||
FRISO_API void * array_list_remove(
|
||||
friso_array_t array, uint_t idx) {
|
||||
register uint_t t;
|
||||
void *oval = NULL;
|
||||
|
||||
if(idx < array->length) {
|
||||
oval = array->items[idx];
|
||||
//move the elements after idx.
|
||||
for(t = idx; t < array->length - 1; t++) {
|
||||
array->items[t] = array->items[ t + 1 ];
|
||||
}
|
||||
array->items[array->length - 1] = NULL;
|
||||
array->length--;
|
||||
}
|
||||
|
||||
return oval;
|
||||
}
|
||||
|
||||
/*trim the array list*/
|
||||
FRISO_API friso_array_t array_list_trim(friso_array_t array) {
|
||||
if(array->length < array->allocs) {
|
||||
return resize_array_list(array, array->length);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
/*
|
||||
* clear the array list.
|
||||
* this function will free all the allocations that the pointer pointed.
|
||||
* but will not free the point array allocations,
|
||||
* and will reset the length of it.
|
||||
*/
|
||||
FRISO_API friso_array_t array_list_clear(friso_array_t array) {
|
||||
register uint_t t;
|
||||
//free all the allocations that the array->length's pointer pointed.
|
||||
for(t = 0; t < array->length; t++) {
|
||||
/*if ( array->items[t] == NULL ) continue;
|
||||
FRISO_FREE( array->items[t] ); */
|
||||
array->items[t] = NULL;
|
||||
}
|
||||
//attribute reset.
|
||||
array->length = 0;
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
//get the size of the array list. (A macro define has replace this.)
|
||||
//FRISO_API uint_t array_list_size( friso_array_t array ) {
|
||||
// return array->length;
|
||||
//}
|
||||
|
||||
//return the allocations of the array list.(A macro define has replace this)
|
||||
//FRISO_API uint_t array_list_allocs( friso_array_t array ) {
|
||||
// return array->allocs;
|
||||
//}
|
||||
|
||||
//check if the array is empty.(A macro define has replace this.)
|
||||
//FRISO_API int array_list_empty( friso_array_t array )
|
||||
//{
|
||||
// return ( array->length == 0 );
|
||||
//}
|
|
@ -1,244 +0,0 @@
|
|||
/**
|
||||
* friso string type check functions,
|
||||
* like english/CJK, full-wdith/half-width, punctuation or not.
|
||||
* @see friso_UTF8.c and friso_GBK.c for detail.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_ctype.h"
|
||||
#include "friso_API.h"
|
||||
|
||||
/* check if the specified string is a cn string.
|
||||
*
|
||||
* @return int (true for cn string or false)
|
||||
* */
|
||||
FRISO_API int friso_cn_string(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_cjk_string(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_cn_string(task->buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a whitespace.
|
||||
FRISO_API int friso_whitespace(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_whitespace(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_whitespace(task->buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specifiled word is a numeric letter.
|
||||
FRISO_API int friso_numeric_letter(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_numeric_letter((uint_t) task->text[task->idx]);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_numeric_letter(task->text + task->idx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is aa english letter.
|
||||
FRISO_API int friso_en_letter(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_en_letter((uint_t) task->text[task->idx]);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_en_letter(task->text + task->idx);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a half-width letter.
|
||||
// punctuations are inclued.
|
||||
FRISO_API int friso_halfwidth_en_char(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_halfwidth_en_char(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_halfwidth_en_char(task->buffer[0]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is a full-width letter.
|
||||
// full-width punctuations are not included.
|
||||
FRISO_API int friso_fullwidth_en_char(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_fullwidth_en_char(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_fullwidth_en_char(task->buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word is an english punctuations.
|
||||
FRISO_API int friso_en_punctuation(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_en_punctuation(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_en_punctuation(task->buffer[0]);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified word ia sn chinese punctuation.
|
||||
FRISO_API int friso_cn_punctuation(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_cn_punctuation(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_cn_punctuation(task->buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
FRISO_API int friso_letter_number(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
FRISO_API int friso_other_number(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the word is a keep punctuation.
|
||||
//@Deprecated
|
||||
//FRISO_API int friso_keep_punctuation(
|
||||
// friso_charset_t charset,
|
||||
// friso_task_t task )
|
||||
//{
|
||||
// if ( charset == FRISO_UTF8 )
|
||||
// return utf8_keep_punctuation( task->buffer );
|
||||
// else if ( charset == FRISO_GBK )
|
||||
// return gbk_keep_punctuation( task->buffer );
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
//check if the specified char is en english punctuation.
|
||||
// this function is the same as friso_en_punctuation.
|
||||
FRISO_API int is_en_punctuation(
|
||||
friso_charset_t charset, char c) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_en_punctuation((uint_t) c);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_en_punctuation(c);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the specified string is make up with numeric.
|
||||
FRISO_API int friso_numeric_string(
|
||||
friso_charset_t charset,
|
||||
char *buffer) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_numeric_string(buffer);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_numeric_string(buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check the specified string is a decimal string.
|
||||
FRISO_API int friso_decimal_string(
|
||||
friso_charset_t charset, char *buffer) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_decimal_string(buffer);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_decimal_string(buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//check if the specified char is english uppercase letter.
|
||||
// included full-width and half-width letters.
|
||||
FRISO_API int friso_uppercase_letter(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
if(charset == FRISO_UTF8) {
|
||||
return utf8_uppercase_letter(task->unicode);
|
||||
} else if(charset == FRISO_GBK) {
|
||||
return gbk_uppercase_letter(task->buffer);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* get the type of the specified char.
|
||||
* the type will be the constants defined above.
|
||||
* (include the fullwidth english char.)
|
||||
*/
|
||||
FRISO_API friso_enchar_t friso_enchar_type(
|
||||
friso_charset_t charset,
|
||||
friso_task_t task) {
|
||||
//Unicode or ASCII.(Both UTF-8 and GBK are valid)
|
||||
uint_t u = 0;
|
||||
|
||||
if(charset == FRISO_UTF8) {
|
||||
u = task->unicode;
|
||||
//if ( u >= 65280 ) u = 65280 - 65248;
|
||||
} else if(charset == FRISO_GBK) {
|
||||
u = (uchar_t)task->buffer[0];
|
||||
//if ( u == 0xa3 ) ; //full-width.
|
||||
}
|
||||
|
||||
//range check.
|
||||
if(u > 126 || u < 32) return FRISO_EN_UNKNOW;
|
||||
if(u == 32) return FRISO_EN_WHITESPACE;
|
||||
if(u >= 48 && u <= 57) return FRISO_EN_NUMERIC;
|
||||
if(u >= 65 && u <= 90) return FRISO_EN_LETTER;
|
||||
if(u >= 97 && u <= 122) return FRISO_EN_LETTER;
|
||||
|
||||
return FRISO_EN_PUNCTUATION;
|
||||
}
|
||||
|
||||
/* get the type of the specified en char.
|
||||
* the type will be the constants defined above.
|
||||
* (the char should be half-width english char only)
|
||||
*/
|
||||
FRISO_API friso_enchar_t get_enchar_type(char ch) {
|
||||
uint_t u = (uchar_t) ch;
|
||||
|
||||
//range check.
|
||||
if(u > 126 || u < 32) return FRISO_EN_UNKNOW;
|
||||
if(u == 32) return FRISO_EN_WHITESPACE;
|
||||
if(u >= 48 && u <= 57) return FRISO_EN_NUMERIC;
|
||||
if(u >= 65 && u <= 90) return FRISO_EN_LETTER;
|
||||
if(u >= 97 && u <= 122) return FRISO_EN_LETTER;
|
||||
|
||||
return FRISO_EN_PUNCTUATION;
|
||||
}
|
|
@ -1,261 +0,0 @@
|
|||
/**
|
||||
* Friso charset about function interface header file.
|
||||
* @package src/friso_charset.h .
|
||||
* Available charset for now:
|
||||
* 1. UTF8 - function start with utf8
|
||||
* 2. GBK - function start with gbk
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#ifndef _friso_charset_h
|
||||
#define _friso_charset_h
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso.h"
|
||||
#include "friso_API.h"
|
||||
|
||||
/** {{{ wrap interface */
|
||||
/* check if the specified string is a cn string.
|
||||
*
|
||||
* @return int (true for cn string or false)
|
||||
* */
|
||||
FRISO_API int friso_cn_string(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a whitespace.
|
||||
FRISO_API int friso_whitespace(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a numeric letter.
|
||||
FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a english letter.
|
||||
FRISO_API int friso_en_letter(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a half-width letter.
|
||||
// punctuations are included.
|
||||
FRISO_API int friso_halfwidth_en_char(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a full-width letter.
|
||||
// full-width punctuations are not included.
|
||||
FRISO_API int friso_fullwidth_en_char(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is an english punctuations.
|
||||
FRISO_API int friso_en_punctuation(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the specified word is a chinese punctuation.
|
||||
FRISO_API int friso_cn_punctuation(friso_charset_t, friso_task_t);
|
||||
|
||||
FRISO_API int friso_letter_number(friso_charset_t, friso_task_t);
|
||||
FRISO_API int friso_other_number(friso_charset_t, friso_task_t);
|
||||
|
||||
//check if the word is a keep punctuation.
|
||||
//@Deprecated
|
||||
//FRISO_API int friso_keep_punctuation( friso_charset_t, friso_task_t );
|
||||
|
||||
//check the specified string is numeric string.
|
||||
FRISO_API int friso_numeric_string(friso_charset_t, char *);
|
||||
|
||||
//check the specified string is a decimal string.
|
||||
FRISO_API int friso_decimal_string(friso_charset_t, char *);
|
||||
|
||||
//check if the specified char is english uppercase letter.
|
||||
// included full-width and half-width letters.
|
||||
FRISO_API int friso_uppercase_letter(friso_charset_t, friso_task_t);
|
||||
|
||||
|
||||
//en char type.
|
||||
//#define FRISO_EN_LETTER 0 //a-z && A-Z
|
||||
//#define FRISO_EN_NUMERIC 1 //0-9
|
||||
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
|
||||
//#define FRISO_EN_WHITESPACE 3 //whitespace
|
||||
//#define FRISO_EN_UNKNOW -1 //beyond 32-126
|
||||
/*
 * english char types, as returned by friso_enchar_type()
 * and get_enchar_type().
 */
typedef enum {
    FRISO_EN_UNKNOW      = -1,  //unknown (beyond 32-126)
    FRISO_EN_LETTER      = 0,   //A-Z, a-z
    FRISO_EN_NUMERIC     = 1,   //0-9
    FRISO_EN_PUNCTUATION = 2,   //english punctuations
    FRISO_EN_WHITESPACE  = 3    //whitespace
} friso_enchar_t;
|
||||
|
||||
/* get the type of the specified char.
|
||||
* the type will be the constants defined above.
|
||||
* (include the fullwidth english char.)
|
||||
*/
|
||||
FRISO_API friso_enchar_t friso_enchar_type(friso_charset_t, friso_task_t);
|
||||
|
||||
/* get the type of the specified en char.
|
||||
* the type will be the constants defined above.
|
||||
* (the char should be half-width english char only)
|
||||
*/
|
||||
FRISO_API friso_enchar_t get_enchar_type(char);
|
||||
|
||||
/* }}} */
|
||||
|
||||
|
||||
|
||||
|
||||
/** {{{ UTF8 interface*/
|
||||
|
||||
/* read the next utf-8 word from the specified position.
|
||||
*
|
||||
* @return int the number of bytes of the word just read.
|
||||
*/
|
||||
FRISO_API int utf8_next_word(friso_task_t, uint_t *, fstring);
|
||||
|
||||
//get the bytes of a utf-8 char.
|
||||
FRISO_API int get_utf8_bytes(char);
|
||||
|
||||
//return the unicode serial number of a given string.
|
||||
FRISO_API int get_utf8_unicode(const fstring);
|
||||
|
||||
//convert the unicode serial to a utf-8 string.
|
||||
FRISO_API int unicode_to_utf8(uint_t, fstring);
|
||||
|
||||
//check if the given char is a CJK.
|
||||
FRISO_API int utf8_cjk_string(uint_t) ;
|
||||
|
||||
/*check the given char is a Basic Latin letter or not.
|
||||
* include all the letters and english punctuations.*/
|
||||
FRISO_API int utf8_halfwidth_en_char(uint_t);
|
||||
|
||||
/*
|
||||
* check the given char is a full-width latin or not.
|
||||
* include the full-width arabic numbers, letters.
|
||||
* but not the full-width punctuations.
|
||||
*/
|
||||
FRISO_API int utf8_fullwidth_en_char(uint_t);
|
||||
|
||||
//check the given char is a upper case letter or not.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int utf8_uppercase_letter(uint_t);
|
||||
|
||||
//check the given char is a lower case letter or not.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int utf8_lowercase_letter(uint_t);
|
||||
|
||||
//check the given char is a numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int utf8_numeric_letter(uint_t);
|
||||
|
||||
/*
|
||||
* check if the given fstring is made up of numeric chars.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*/
|
||||
FRISO_API int utf8_numeric_string(char *);
|
||||
|
||||
FRISO_API int utf8_decimal_string(char *);
|
||||
|
||||
//check the given char is a english char.
|
||||
//(full-width and half-width)
|
||||
//not the punctuation of course.
|
||||
FRISO_API int utf8_en_letter(uint_t);
|
||||
|
||||
//check the given char is a whitespace or not.
|
||||
FRISO_API int utf8_whitespace(uint_t);
|
||||
|
||||
/* check if the given char is a letter number like 'ⅠⅡ'
|
||||
*/
|
||||
FRISO_API int utf8_letter_number(uint_t);
|
||||
|
||||
/*
|
||||
* check if the given char is an other number like '①⑩⑽㈩'
|
||||
*/
|
||||
FRISO_API int utf8_other_number(uint_t);
|
||||
|
||||
//check if the given char is a english punctuation.
|
||||
FRISO_API int utf8_en_punctuation(uint_t) ;
|
||||
|
||||
//check if the given char is a chinese punctuation.
|
||||
FRISO_API int utf8_cn_punctuation(uint_t u);
|
||||
|
||||
FRISO_API int is_en_punctuation(friso_charset_t, char);
|
||||
//#define is_en_punctuation( c ) utf8_en_punctuation((uint_t) c)
|
||||
|
||||
//@Deprecated
|
||||
//FRISO_API int utf8_keep_punctuation( fstring );
|
||||
/* }}} */
|
||||
|
||||
|
||||
|
||||
|
||||
/** {{{ GBK interface */
|
||||
|
||||
/* read the next GBK word from the specified position.
|
||||
*
|
||||
* @return int the number of bytes of the word just read.
|
||||
*/
|
||||
FRISO_API int gbk_next_word(friso_task_t, uint_t *, fstring);
|
||||
|
||||
//get the bytes of a GBK char.
|
||||
FRISO_API int get_gbk_bytes(char);
|
||||
|
||||
//check if the given char is a gbk char (ANSI string).
|
||||
FRISO_API int gbk_cn_string(char *) ;
|
||||
|
||||
/*check if the given char is a ASCII letter
|
||||
* include all the letters and english punctuations.*/
|
||||
FRISO_API int gbk_halfwidth_en_char(char);
|
||||
|
||||
/*
|
||||
* check if the given char is a full-width latin.
|
||||
* include the full-width arabic numbers, letters.
|
||||
* but not the full-width punctuations.
|
||||
*/
|
||||
FRISO_API int gbk_fullwidth_en_char(char *);
|
||||
|
||||
//check if the given char is a upper case char.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int gbk_uppercase_letter(char *);
|
||||
|
||||
//check if the given char is a lower case char.
|
||||
// included all the full-width and half-width letters.
|
||||
FRISO_API int gbk_lowercase_letter(char *);
|
||||
|
||||
//check if the given char is a numeric.
|
||||
// included the full-width and half-width arabic numeric.
|
||||
FRISO_API int gbk_numeric_letter(char *);
|
||||
|
||||
/*
|
||||
* check if the given fstring is made up of numeric chars.
|
||||
* both full-width,half-width numeric is ok.
|
||||
*/
|
||||
FRISO_API int gbk_numeric_string(char *);
|
||||
|
||||
FRISO_API int gbk_decimal_string(char *);
|
||||
|
||||
//check if the given char is a english(ASCII) char.
|
||||
//(full-width and half-width)
|
||||
//not the punctuation of course.
|
||||
FRISO_API int gbk_en_letter(char *);
|
||||
|
||||
//check the specified char is a whitespace or not.
|
||||
FRISO_API int gbk_whitespace(char *);
|
||||
|
||||
/* check if the given char is a letter number like 'ⅠⅡ'
|
||||
*/
|
||||
FRISO_API int gbk_letter_number(char *);
|
||||
|
||||
/*
|
||||
* check if the given char is an other number like '①⑩⑽㈩'
|
||||
*/
|
||||
FRISO_API int gbk_other_number(char *);
|
||||
|
||||
//check if the given char is a english punctuation.
|
||||
FRISO_API int gbk_en_punctuation(char) ;
|
||||
|
||||
//check the given char is a chinese punctuation.
|
||||
FRISO_API int gbk_cn_punctuation(char *);
|
||||
|
||||
//cause the logic handle is the same as the utf8.
|
||||
// here invoke the utf8 interface directly.
|
||||
//FRISO_API int gbk_keep_punctuation( char * );
|
||||
//@Deprecated
|
||||
//#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str)
|
||||
|
||||
//check if the given english char is a full-width char or not.
|
||||
//FRISO_API int gbk_fullwidth_char( char * ) ;
|
||||
/* }}}*/
|
||||
|
||||
#endif /*end _friso_charset_h*/
|
|
@ -1,285 +0,0 @@
|
|||
/*
|
||||
* friso hash table functions implementation defined in header file "friso_API.h".
|
||||
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
//-166411799L
|
||||
//31 131 1331 13331 133331 ..
|
||||
//31 131 1313 13131 131313 .. the best
|
||||
#define HASH_FACTOR 1313131
|
||||
|
||||
/* ************************
|
||||
* mapping function area *
|
||||
**************************/
|
||||
/*
 * Map a NUL-terminated key to a bucket index.
 *
 * The hash code is accumulated char by char with HASH_FACTOR as the
 * multiplier and then reduced modulo length.
 *
 * @param  str     the key, must not be NULL
 * @param  length  number of buckets, must not be 0
 * @return the bucket index, in [0, length)
 */
__STATIC_API__ uint_t hash(fstring str, uint_t length)
{
    uint_t code = 0;
    fstring p;

    for (p = str; *p != '\0'; p++) {
        code = code * HASH_FACTOR + (*p);
    }

    return code % length;
}
|
||||
|
||||
/*
 * Test if an integer is a prime.
 *
 * @param  n  the value to test
 * @return 1 if n is a prime, 0 otherwise (every value below 2 is not a prime)
 */
__STATIC_API__ int is_prime(int n)
{
    int j;

    if (n < 2) {
        return 0;
    }

    if (n == 2 || n == 3) {
        return 1;
    }

    if (n % 2 == 0) {
        return 0;
    }

    //try the odd divisors up to and including sqrt(n).
    //the bound has to be inclusive, or the squares of the primes
    //(9, 25, 49, ...) would be taken as primes.
    //'j <= n / j' means 'j * j <= n' but cannot overflow.
    for (j = 3; j <= n / j; j += 2) {
        if (n % j == 0) {
            return 0;
        }
    }

    return 1;
}
|
||||
|
||||
/*get the next prime just after the speicified integer.*/
|
||||
__STATIC_API__ int next_prime(int n) {
|
||||
if(n % 2 == 0) n++;
|
||||
for(; ! is_prime(n); n = n + 2) ;
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
//fstring copy, return the pointer of the new string.
|
||||
//static fstring string_copy( fstring _src ) {
|
||||
//int bytes = strlen( _src );
|
||||
//fstring _dst = ( fstring ) FRISO_MALLOC( bytes + 1 );
|
||||
//register int t = 0;
|
||||
|
||||
//do {
|
||||
//_dst[t] = _src[t];
|
||||
//t++;
|
||||
//} while ( _src[t] != '\0' );
|
||||
//_dst[t] = '\0';
|
||||
|
||||
//return _dst;
|
||||
//}
|
||||
|
||||
/* *********************************
|
||||
* static hashtable function area. *
|
||||
***********************************/
|
||||
/*
 * Allocate a hash entry and link it in front of next.
 *
 * @param  key    the key; only the pointer is stored, the string is not copied
 * @param  value  the value to store
 * @param  next   the entry that will follow the new one (may be NULL)
 * @return the new entry
 */
__STATIC_API__ hash_entry_t new_hash_entry(
    fstring key,
    void * value,
    hash_entry_t next)
{
    hash_entry_t entry = (hash_entry_t)
        FRISO_MALLOC(sizeof(friso_hash_entry));

    if (entry == NULL) {
        ___ALLOCATION_ERROR___
    }

    entry->_next = next;
    entry->_val  = value;
    entry->_key  = key;

    return entry;
}
|
||||
|
||||
//create blocks copy of entries.
|
||||
__STATIC_API__ hash_entry_t * create_hash_entries(uint_t blocks) {
|
||||
register uint_t t;
|
||||
hash_entry_t *e = (hash_entry_t *)
|
||||
FRISO_CALLOC(sizeof(hash_entry_t), blocks);
|
||||
if(e == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
for(t = 0; t < blocks; t++) {
|
||||
e[t] = NULL;
|
||||
}
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
//a static function to do the re-hash work.
|
||||
__STATIC_API__ void rebuild_hash(friso_hash_t _hash) {
|
||||
//printf("rehashed.\n");
|
||||
//find the next prime as the length of the hashtable.
|
||||
uint_t t, length = next_prime(_hash->length * 2 + 1);
|
||||
hash_entry_t e, next, *_src = _hash->table, \
|
||||
*table = create_hash_entries(length);
|
||||
uint_t bucket;
|
||||
|
||||
//copy the nodes
|
||||
for(t = 0; t < _hash->length; t++) {
|
||||
e = *(_src + t);
|
||||
if(e != NULL) {
|
||||
do {
|
||||
next = e->_next;
|
||||
bucket = hash(e->_key, length);
|
||||
e->_next = table[bucket];
|
||||
table[bucket] = e;
|
||||
e = next;
|
||||
} while(e != NULL);
|
||||
}
|
||||
}
|
||||
|
||||
_hash->table = table;
|
||||
_hash->length = length;
|
||||
_hash->threshold = (uint_t)(_hash->length * _hash->factor);
|
||||
|
||||
//free the old hash_entry_t blocks allocations.
|
||||
FRISO_FREE(_src);
|
||||
}
|
||||
|
||||
/* ********************************
|
||||
* hashtable interface functions. *
|
||||
* ********************************/
|
||||
|
||||
//create a new hash table.
|
||||
FRISO_API friso_hash_t new_hash_table(void) {
|
||||
friso_hash_t _hash = (friso_hash_t) FRISO_MALLOC(sizeof(friso_hash_cdt));
|
||||
if(_hash == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize the the hashtable
|
||||
_hash->length = DEFAULT_LENGTH;
|
||||
_hash->size = 0;
|
||||
_hash->factor = DEFAULT_FACTOR;
|
||||
_hash->threshold = (uint_t)(_hash->length * _hash->factor);
|
||||
_hash->table = create_hash_entries(_hash->length);
|
||||
|
||||
return _hash;
|
||||
}
|
||||
|
||||
FRISO_API void free_hash_table(
|
||||
friso_hash_t _hash,
|
||||
fhash_callback_fn_t fentry_func) {
|
||||
register uint_t j;
|
||||
hash_entry_t e, n;
|
||||
|
||||
for(j = 0; j < _hash->length; j++) {
|
||||
e = *(_hash->table + j);
|
||||
for(; e != NULL ;) {
|
||||
n = e->_next;
|
||||
if(fentry_func != NULL) fentry_func(e);
|
||||
FRISO_FREE(e);
|
||||
e = n;
|
||||
}
|
||||
}
|
||||
|
||||
//free the pointer array block ( 4 * htable->length continuous bytes ).
|
||||
FRISO_FREE(_hash->table);
|
||||
FRISO_FREE(_hash);
|
||||
}
|
||||
|
||||
|
||||
//put a new mapping insite.
|
||||
//the value cannot be NULL.
|
||||
FRISO_API void *hash_put_mapping(
|
||||
friso_hash_t _hash,
|
||||
fstring key,
|
||||
void * value) {
|
||||
uint_t bucket = (key == NULL) ? 0 : hash(key, _hash->length);
|
||||
hash_entry_t e = *(_hash->table + bucket);
|
||||
void *oval = NULL;
|
||||
|
||||
//check the given key is already exists or not.
|
||||
for(; e != NULL; e = e->_next) {
|
||||
if(key == e->_key
|
||||
|| (key != NULL && e->_key != NULL
|
||||
&& strcmp(key, e->_key) == 0)) {
|
||||
oval = e->_val; //bak the old value
|
||||
e->_key = key;
|
||||
e->_val = value;
|
||||
return oval;
|
||||
}
|
||||
}
|
||||
|
||||
//put a new mapping into the hashtable.
|
||||
_hash->table[bucket] = new_hash_entry(key, value, _hash->table[bucket]);
|
||||
_hash->size++;
|
||||
|
||||
//check the condition to rebuild the hashtable.
|
||||
if(_hash->size >= _hash->threshold) {
|
||||
rebuild_hash(_hash);
|
||||
}
|
||||
|
||||
return oval;
|
||||
}
|
||||
|
||||
//check the existence of the mapping associated with the given key.
|
||||
FRISO_API int hash_exist_mapping(
|
||||
friso_hash_t _hash, fstring key) {
|
||||
uint_t bucket = (key == NULL) ? 0 : hash(key, _hash->length);
|
||||
hash_entry_t e;
|
||||
|
||||
for(e = *(_hash->table + bucket);
|
||||
e != NULL; e = e->_next) {
|
||||
if(key == e->_key
|
||||
|| (key != NULL && e->_key != NULL
|
||||
&& strcmp(key, e->_key) == 0)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get the value associated with the given key.
|
||||
FRISO_API void *hash_get_value(friso_hash_t _hash, fstring key) {
|
||||
uint_t bucket = (key == NULL) ? 0 : hash(key, _hash->length);
|
||||
hash_entry_t e;
|
||||
|
||||
for(e = *(_hash->table + bucket);
|
||||
e != NULL; e = e->_next) {
|
||||
if(key == e->_key
|
||||
|| (key != NULL && e->_key != NULL
|
||||
&& strcmp(key, e->_key) == 0)) {
|
||||
return e->_val;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//remove the mapping associated with the given key.
|
||||
FRISO_API hash_entry_t hash_remove_mapping(
|
||||
friso_hash_t _hash, fstring key) {
|
||||
uint_t bucket = (key == NULL) ? 0 : hash(key, _hash->length);
|
||||
hash_entry_t e, prev = NULL;
|
||||
hash_entry_t b;
|
||||
|
||||
for(e = *(_hash->table + bucket);
|
||||
e != NULL; prev = e, e = e->_next) {
|
||||
if(key == e->_key
|
||||
|| (key != NULL && e->_key != NULL
|
||||
&& strcmp(key, e->_key) == 0)) {
|
||||
b = e;
|
||||
//the node located at *( htable->table + bucket )
|
||||
if(prev == NULL) {
|
||||
_hash->table[bucket] = e->_next;
|
||||
} else {
|
||||
prev->_next = e->_next;
|
||||
}
|
||||
//printf("%s was removed\n", b->_key);
|
||||
_hash->size--;
|
||||
//FRISO_FREE( b );
|
||||
return b;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//count the size.(A macro define has replace this.)
|
||||
//FRISO_API uint_t hash_get_size( friso_hash_t _hash ) {
|
||||
// return _hash->size;
|
||||
//}
|
|
@ -1,540 +0,0 @@
|
|||
/*
|
||||
* friso lexicon functions implementation.
|
||||
* used to deal with the friso lexicon, like: load,remove,match...
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "friso_API.h"
|
||||
#include "friso.h"
|
||||
|
||||
#define __SPLIT_MAX_TOKENS__ 5
|
||||
#define __LEX_FILE_DELIME__ '#'
|
||||
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
|
||||
|
||||
//create a new lexicon
|
||||
FRISO_API friso_dic_t friso_dic_new() {
|
||||
register uint_t t;
|
||||
friso_dic_t dic = (friso_dic_t) FRISO_CALLOC(
|
||||
sizeof(friso_hash_t), __FRISO_LEXICON_LENGTH__);
|
||||
if(dic == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
for(t = 0; t < __FRISO_LEXICON_LENGTH__; t++) {
|
||||
dic[t] = new_hash_table();
|
||||
}
|
||||
|
||||
return dic;
|
||||
}
|
||||
|
||||
/**
|
||||
* default callback function to invoke
|
||||
* when free the friso dictionary .
|
||||
*
|
||||
* @date 2013-06-12
|
||||
*/
|
||||
__STATIC_API__ void default_fdic_callback(hash_entry_t e) {
|
||||
register uint_t i;
|
||||
friso_array_t syn;
|
||||
lex_entry_t lex = (lex_entry_t) e->_val;
|
||||
//free the lex->word
|
||||
FRISO_FREE(lex->word);
|
||||
//free the lex->syn if it is not NULL
|
||||
if(lex->syn != NULL) {
|
||||
syn = lex->syn;
|
||||
for(i = 0; i < syn->length; i++) {
|
||||
FRISO_FREE(syn->items[i]);
|
||||
}
|
||||
free_array_list(syn);
|
||||
}
|
||||
|
||||
//free the e->_val
|
||||
//@date 2014-01-28 posted by mlemay@gmail.com
|
||||
FRISO_FREE(lex);
|
||||
}
|
||||
|
||||
/*
 * Free the friso dictionary: every hash table of the lexicon, with
 * default_fdic_callback to release the lexicon entries, then the
 * array of tables.
 */
FRISO_API void friso_dic_free(friso_dic_t dic)
{
    uint_t type = 0;

    while (type < __FRISO_LEXICON_LENGTH__) {
        free_hash_table(dic[type], default_fdic_callback);
        type++;
    }

    FRISO_FREE(dic);
}
|
||||
|
||||
|
||||
//create a new lexicon entry
|
||||
FRISO_API lex_entry_t new_lex_entry(
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t fre,
|
||||
uint_t length,
|
||||
uint_t type) {
|
||||
lex_entry_t e = (lex_entry_t)
|
||||
FRISO_MALLOC(sizeof(lex_entry_cdt));
|
||||
if(e == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize.
|
||||
e->word = word;
|
||||
e->syn = syn; //synoyum words array list.
|
||||
e->pos = NULL; //part of speech array list.
|
||||
//e->py = NULL; //set to NULL first.
|
||||
e->fre = fre;
|
||||
e->length = (uchar_t) length; //length
|
||||
e->rlen = (uchar_t) length; //set to length by default.
|
||||
e->type = (uchar_t) type; //type
|
||||
e->ctrlMask = 0; //control mask.
|
||||
e->offset = -1;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
/**
|
||||
* free the given lexicon entry.
|
||||
* you have to do three thing maybe:
|
||||
* 1. free where its syn items points to. (not implemented)
|
||||
* 2. free its syn. (friso_array_t)
|
||||
* 3. free its pos. (friso_array_t)
|
||||
* 4. free the lex_entry_t.
|
||||
*/
|
||||
FRISO_API void free_lex_entry_full(lex_entry_t e) {
|
||||
register uint_t i;
|
||||
friso_array_t syn;
|
||||
|
||||
//free the lex->word
|
||||
FRISO_FREE(e->word);
|
||||
//free the lex->syn if it is not NULL
|
||||
if(e->syn != NULL) {
|
||||
syn = e->syn;
|
||||
for(i = 0; i < syn->length; i++) {
|
||||
FRISO_FREE(syn->items[i]);
|
||||
}
|
||||
free_array_list(syn);
|
||||
}
|
||||
|
||||
//free the e->_val
|
||||
//@date 2014-01-28 posted by mlemay@gmail.com
|
||||
FRISO_FREE(e);
|
||||
}
|
||||
|
||||
/*
 * free the lexicon entry structure only.
 * its word, syn and pos are not released here.
 */
FRISO_API void free_lex_entry(lex_entry_t e)
{
    FRISO_FREE(e);
}
|
||||
|
||||
|
||||
//add a new entry to the dictionary.
|
||||
FRISO_API void friso_dic_add(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn) {
|
||||
void *olex = NULL;
|
||||
if(lex >= 0 && lex < __FRISO_LEXICON_LENGTH__) {
|
||||
//printf("lex=%d, word=%s, syn=%s\n", lex, word, syn);
|
||||
olex = hash_put_mapping(dic[lex], word,
|
||||
new_lex_entry(word, syn, 0,
|
||||
(uint_t) strlen(word), (uint_t) lex));
|
||||
if(olex != NULL) {
|
||||
free_lex_entry_full((lex_entry_t)olex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FRISO_API void friso_dic_add_with_fre(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word,
|
||||
friso_array_t syn,
|
||||
uint_t frequency) {
|
||||
void *olex = NULL;
|
||||
if(lex >= 0 && lex < __FRISO_LEXICON_LENGTH__) {
|
||||
olex = hash_put_mapping(dic[lex], word,
|
||||
new_lex_entry(word, syn, frequency,
|
||||
(uint_t) strlen(word), (uint_t) lex));
|
||||
if(olex != NULL) {
|
||||
free_lex_entry_full((lex_entry_t)olex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* read a line from a specified stream.
|
||||
* the newline will be cleared.
|
||||
*
|
||||
* @date 2012-11-24
|
||||
*/
|
||||
FRISO_API fstring file_get_line(fstring __dst, FILE * _stream) {
|
||||
register int c;
|
||||
fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while((c = fgetc(_stream)) != EOF) {
|
||||
if(c == '\n') break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return (c == EOF && cs == __dst) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*
|
||||
* static function to copy a string.
|
||||
*/
|
||||
///instead of memcpy
|
||||
__STATIC_API__ fstring string_copy(
|
||||
fstring _src,
|
||||
fstring __dst,
|
||||
uint_t blocks) {
|
||||
|
||||
register fstring __src = _src;
|
||||
register uint_t t;
|
||||
|
||||
for(t = 0; t < blocks; t++) {
|
||||
if(*__src == '\0') break;
|
||||
__dst[t] = *__src++;
|
||||
}
|
||||
__dst[t] = '\0';
|
||||
|
||||
return __dst;
|
||||
}
|
||||
|
||||
/**
|
||||
* make a heap allocation, and copy the
|
||||
* source fstring to the new allocation, and
|
||||
* you should free it after use it .
|
||||
*
|
||||
* @param _src source fstring
|
||||
* @param blocks number of bytes to copy
|
||||
*/
|
||||
__STATIC_API__ fstring string_copy_heap(
|
||||
fstring _src, uint_t blocks) {
|
||||
register uint_t t;
|
||||
|
||||
fstring str = (fstring) FRISO_MALLOC(blocks + 1);
|
||||
if(str == NULL) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
for(t = 0; t < blocks; t++) {
|
||||
//if ( *_src == '\0' ) break;
|
||||
str[t] = *_src++;
|
||||
}
|
||||
|
||||
str[t] = '\0';
|
||||
return str;
|
||||
}
|
||||
|
||||
/*
|
||||
* find the postion of the first appear of the given char.
|
||||
* address of the char in the fstring will be return .
|
||||
* if not found NULL will be return .
|
||||
*/
|
||||
__STATIC_API__ fstring indexOf(fstring __str, char delimiter) {
|
||||
uint_t i, __length__;
|
||||
|
||||
__length__ = strlen(__str);
|
||||
for(i = 0; i < __length__; i++) {
|
||||
if(__str[i] == delimiter) {
|
||||
return __str + i;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* load all the valid wors from a specified lexicon file .
|
||||
*
|
||||
* @param dic friso dictionary instance (A hash array)
|
||||
* @param lex the lexicon type
|
||||
* @param lex_file the path of the lexicon file
|
||||
* @param length the maximum length of the word item
|
||||
*/
|
||||
FRISO_API void friso_dic_load(
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
friso_lex_t lex,
|
||||
fstring lex_file,
|
||||
uint_t length) {
|
||||
|
||||
FILE * _stream;
|
||||
char __char[1024], _buffer[512];
|
||||
fstring _line;
|
||||
string_split_entry sse;
|
||||
|
||||
fstring _word;
|
||||
char _sbuffer[512];
|
||||
fstring _syn;
|
||||
friso_array_t sywords;
|
||||
uint_t _fre;
|
||||
|
||||
if((_stream = fopen(lex_file, "rb")) != NULL) {
|
||||
while((_line = file_get_line(__char, _stream)) != NULL) {
|
||||
//clear up the notes
|
||||
//make sure the length of the line is greater than 1.
|
||||
//like the single '#' mark in stopwords dictionary.
|
||||
if(_line[0] == '#' && strlen(_line) > 1) continue;
|
||||
|
||||
//handle the stopwords.
|
||||
if(lex == __LEX_STOPWORDS__) {
|
||||
//clean the chinese words that its length is greater than max length.
|
||||
if(((int)_line[0]) < 0 && strlen(_line) > length) continue;
|
||||
friso_dic_add(friso->dic, __LEX_STOPWORDS__,
|
||||
string_copy_heap(_line, strlen(_line)), NULL);
|
||||
continue;
|
||||
}
|
||||
|
||||
//split the fstring with '/'.
|
||||
string_split_reset(&sse, "/", _line);
|
||||
if(string_split_next(&sse, _buffer) == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
//1. get the word.
|
||||
_word = string_copy_heap(_buffer, strlen(_buffer));
|
||||
|
||||
if(string_split_next(&sse, _buffer) == NULL) {
|
||||
//normal lexicon type,
|
||||
//add them to the dictionary directly
|
||||
friso_dic_add(friso->dic, lex, _word, NULL);
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* filter out the words that its length is larger
|
||||
* than the specified limit.
|
||||
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
|
||||
* and __LEX_CEM_WORDS__.
|
||||
*/
|
||||
if(!(lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__)
|
||||
&& strlen(_word) > length) {
|
||||
FRISO_FREE(_word);
|
||||
continue;
|
||||
}
|
||||
|
||||
//2. get the synonyms words.
|
||||
_syn = NULL;
|
||||
if(strcmp(_buffer, "null") != 0) {
|
||||
_syn = string_copy(_buffer, _sbuffer, strlen(_buffer));
|
||||
}
|
||||
|
||||
//3. get the word frequency if it available.
|
||||
_fre = 0;
|
||||
if(string_split_next(&sse, _buffer) != NULL) {
|
||||
_fre = atoi(_buffer);
|
||||
}
|
||||
|
||||
/**
|
||||
* Here:
|
||||
* split the synonyms words with mark ","
|
||||
* and put them in a array list if the synonyms is not NULL
|
||||
*/
|
||||
sywords = NULL;
|
||||
if(config->add_syn && _syn != NULL) {
|
||||
string_split_reset(&sse, ",", _sbuffer);
|
||||
sywords = new_array_list_with_opacity(5);
|
||||
while(string_split_next(&sse, _buffer) != NULL) {
|
||||
if(strlen(_buffer) > length) continue;
|
||||
array_list_add(sywords,
|
||||
string_copy_heap(_buffer, strlen(_buffer)));
|
||||
}
|
||||
sywords = array_list_trim(sywords);
|
||||
}
|
||||
|
||||
//4. add the word item
|
||||
friso_dic_add_with_fre(
|
||||
friso->dic, lex, _word, sywords, _fre);
|
||||
}
|
||||
|
||||
fclose(_stream);
|
||||
} else {
|
||||
fprintf(stderr, "Warning: Fail to open lexicon file %s\n", lex_file);
|
||||
fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* get the lexicon type index with the specified
|
||||
* type keywords .
|
||||
*
|
||||
* @see friso.h#friso_lex_t
|
||||
* @param _key
|
||||
* @return int
|
||||
*/
|
||||
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant(fstring _key) {
|
||||
if(strcmp(_key, "__LEX_CJK_WORDS__") == 0) {
|
||||
return __LEX_CJK_WORDS__;
|
||||
} else if(strcmp(_key, "__LEX_CJK_UNITS__") == 0) {
|
||||
return __LEX_CJK_UNITS__;
|
||||
} else if(strcmp(_key, "__LEX_ECM_WORDS__") == 0) {
|
||||
return __LEX_ECM_WORDS__;
|
||||
} else if(strcmp(_key, "__LEX_CEM_WORDS__") == 0) {
|
||||
return __LEX_CEM_WORDS__;
|
||||
} else if(strcmp(_key, "__LEX_CN_LNAME__") == 0) {
|
||||
return __LEX_CN_LNAME__;
|
||||
} else if(strcmp(_key, "__LEX_CN_SNAME__") == 0) {
|
||||
return __LEX_CN_SNAME__;
|
||||
} else if(strcmp(_key, "__LEX_CN_DNAME1__") == 0) {
|
||||
return __LEX_CN_DNAME1__;
|
||||
} else if(strcmp(_key, "__LEX_CN_DNAME2__") == 0) {
|
||||
return __LEX_CN_DNAME2__;
|
||||
} else if(strcmp(_key, "__LEX_CN_LNA__") == 0) {
|
||||
return __LEX_CN_LNA__;
|
||||
} else if(strcmp(_key, "__LEX_STOPWORDS__") == 0) {
|
||||
return __LEX_STOPWORDS__;
|
||||
} else if(strcmp(_key, "__LEX_ENPUN_WORDS__") == 0) {
|
||||
return __LEX_ENPUN_WORDS__;
|
||||
} else if(strcmp(_key, "__LEX_EN_WORDS__") == 0) {
|
||||
return __LEX_EN_WORDS__;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* load the lexicon configuration file.
|
||||
* and load all the valid lexicon from the configuration file.
|
||||
*
|
||||
* @param friso friso instance
|
||||
* @param config friso_config instance
|
||||
* @param _path dictionary directory
|
||||
* @param _limitts words length limit
|
||||
*/
|
||||
FRISO_API void friso_dic_load_from_ifile(
|
||||
friso_t friso,
|
||||
friso_config_t config,
|
||||
fstring _path,
|
||||
uint_t _limits) {
|
||||
|
||||
//1.parse the configuration file.
|
||||
FILE *__stream;
|
||||
char __chars__[1024], __key__[30], *__line__;
|
||||
uint_t __length__, i, t;
|
||||
friso_lex_t lex_t;
|
||||
string_buffer_t sb;
|
||||
|
||||
//get the lexicon configruation file path
|
||||
sb = new_string_buffer();
|
||||
|
||||
string_buffer_append(sb, _path);
|
||||
string_buffer_append(sb, __FRISO_LEX_IFILE__);
|
||||
//printf("%s\n", sb->buffer);
|
||||
|
||||
if((__stream = fopen(sb->buffer, "rb")) != NULL) {
|
||||
while((__line__ =
|
||||
file_get_line(__chars__, __stream)) != NULL) {
|
||||
//comment filter.
|
||||
if(__line__[0] == '#') continue;
|
||||
if(__line__[0] == '\0') continue;
|
||||
|
||||
__length__ = strlen(__line__);
|
||||
//item start
|
||||
if(__line__[ __length__ - 1 ] == '[') {
|
||||
//get the type key
|
||||
for(i = 0; i < __length__
|
||||
&& (__line__[i] == ' ' || __line__[i] == '\t'); i++);
|
||||
for(t = 0; i < __length__; i++, t++) {
|
||||
if(__line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ':') break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//get the lexicon type
|
||||
lex_t = get_lexicon_type_with_constant(__key__);
|
||||
if(lex_t == -1) continue;
|
||||
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t );
|
||||
while((__line__ = file_get_line(__chars__, __stream)) != NULL) {
|
||||
//comments filter.
|
||||
if(__line__[0] == '#') continue;
|
||||
if(__line__[0] == '\0') continue;
|
||||
|
||||
__length__ = strlen(__line__);
|
||||
if(__line__[ __length__ - 1 ] == ']') break;
|
||||
|
||||
for(i = 0; i < __length__
|
||||
&& (__line__[i] == ' ' || __line__[i] == '\t'); i++);
|
||||
for(t = 0; i < __length__; i++, t++) {
|
||||
if(__line__[i] == ' '
|
||||
|| __line__[i] == '\t' || __line__[i] == ';') break;
|
||||
__key__[t] = __line__[i];
|
||||
}
|
||||
__key__[t] = '\0';
|
||||
|
||||
//load the lexicon item from the lexicon file.
|
||||
string_buffer_clear(sb);
|
||||
string_buffer_append(sb, _path);
|
||||
string_buffer_append(sb, __key__);
|
||||
//printf("key=%s, type=%d\n", __key__, lex_t);
|
||||
friso_dic_load(friso, config, lex_t, sb->buffer, _limits);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} //end while
|
||||
|
||||
fclose(__stream);
|
||||
} else {
|
||||
fprintf(stderr, "Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
|
||||
fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
|
||||
}
|
||||
|
||||
free_string_buffer(sb);
|
||||
}
|
||||
|
||||
//match the item.
|
||||
FRISO_API int friso_dic_match(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word) {
|
||||
if(lex >= 0 && lex < __FRISO_LEXICON_LENGTH__) {
|
||||
return hash_exist_mapping(dic[lex], word);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get the lex_entry_t associated with the word.
|
||||
FRISO_API lex_entry_t friso_dic_get(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex,
|
||||
fstring word) {
|
||||
if(lex >= 0 && lex < __FRISO_LEXICON_LENGTH__) {
|
||||
return (lex_entry_t) hash_get_value(dic[lex], word);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//get the size of the specified type dictionary.
|
||||
FRISO_API uint_t friso_spec_dic_size(
|
||||
friso_dic_t dic,
|
||||
friso_lex_t lex) {
|
||||
if(lex >= 0 && lex < __FRISO_LEXICON_LENGTH__) {
|
||||
return hash_get_size(dic[lex]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
//get size of the whole dictionary.
|
||||
FRISO_API uint_t friso_all_dic_size(
|
||||
friso_dic_t dic) {
|
||||
register uint_t size = 0, t;
|
||||
|
||||
for(t = 0; t < __FRISO_LEXICON_LENGTH__; t++) {
|
||||
size += hash_get_size(dic[t]);
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
|
@ -1,266 +0,0 @@
|
|||
/*
|
||||
* link list functions implementation defined in header file "friso_API.h".
|
||||
* when the link_node is being deleted, here we just free
|
||||
* the allocation of the node, not the allocation of its value.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
//create a new link list node.
|
||||
__STATIC_API__ link_node_t new_node_entry(
|
||||
void * value,
|
||||
link_node_t prev,
|
||||
link_node_t next) {
|
||||
link_node_t node = (link_node_t)
|
||||
FRISO_MALLOC(sizeof(link_node_entry));
|
||||
if(node == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
node->value = value;
|
||||
node->prev = prev;
|
||||
node->next = next;
|
||||
|
||||
return node;
|
||||
}
|
||||
|
||||
//create a new link list
|
||||
FRISO_API friso_link_t new_link_list(void) {
|
||||
friso_link_t e = (friso_link_t)
|
||||
FRISO_MALLOC(sizeof(friso_link_entry));
|
||||
if(e == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize the entry
|
||||
e->head = new_node_entry(NULL, NULL, NULL);
|
||||
e->tail = new_node_entry(NULL, e->head, NULL);
|
||||
e->head->next = e->tail;
|
||||
e->size = 0;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
//free the given link list
|
||||
FRISO_API void free_link_list(friso_link_t link) {
|
||||
link_node_t node, next;
|
||||
for(node = link->head; node != NULL;) {
|
||||
next = node->next;
|
||||
FRISO_FREE(node);
|
||||
node = next;
|
||||
}
|
||||
|
||||
FRISO_FREE(link);
|
||||
}
|
||||
|
||||
//clear all nodes in the link list.
|
||||
FRISO_API friso_link_t link_list_clear(
|
||||
friso_link_t link) {
|
||||
link_node_t node, next;
|
||||
//free all the middle nodes.
|
||||
for(node = link->head->next; node != link->tail;) {
|
||||
next = node->next;
|
||||
FRISO_FREE(node);
|
||||
node = next;
|
||||
}
|
||||
|
||||
link->head->next = link->tail;
|
||||
link->tail->prev = link->head;
|
||||
link->size = 0;
|
||||
|
||||
return link;
|
||||
}
|
||||
|
||||
//get the size of the link list.
|
||||
//FRISO_API uint_t link_list_size( friso_link_t link ) {
|
||||
// return link->size;
|
||||
//}
|
||||
|
||||
//check if the link list is empty
|
||||
//FRISO_API int link_list_empty( friso_link_t link ) {
|
||||
// return ( link->size == 0 );
|
||||
//}
|
||||
|
||||
|
||||
/*
|
||||
* find the node at a specified position.
|
||||
* static
|
||||
*/
|
||||
__STATIC_API__ link_node_t get_node(
|
||||
friso_link_t link, uint_t idx) {
|
||||
link_node_t p = NULL;
|
||||
register uint_t t;
|
||||
|
||||
if(idx >= 0 && idx < link->size) {
|
||||
if(idx < link->size / 2) { //find from the head.
|
||||
p = link->head;
|
||||
for(t = 0; t <= idx; t++)
|
||||
p = p->next;
|
||||
} else { //find from the tail.
|
||||
p = link->tail;
|
||||
for(t = link->size; t > idx; t--)
|
||||
p = p->prev;
|
||||
}
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
/*
 * insert a new node holding value just before the given node and
 * increase the size of the list.
 * static
 *
 * written as a do { } while(0) statement macro so that
 * "insert_before(...);" is a single statement wherever it is used,
 * every argument is parenthesized and node is evaluated only once.
 */
#define insert_before( link, node, value ) \
do { \
    link_node_t _ib_at  = (node); \
    link_node_t _ib_new = new_node_entry( (value), _ib_at->prev, _ib_at ); \
    _ib_new->prev->next = _ib_new; \
    _ib_new->next->prev = _ib_new; \
    (link)->size++; \
} while(0)
|
||||
|
||||
/*
|
||||
* static function:
|
||||
* remove the given node, the allocation of the value will not free,
|
||||
* but we return it to you, you will free it youself when there is a necessary.
|
||||
*
|
||||
* @return the value of the removed node.
|
||||
*/
|
||||
__STATIC_API__ void * remove_node(
|
||||
friso_link_t link, link_node_t node) {
|
||||
void * _value = node->value;
|
||||
|
||||
node->prev->next = node->next;
|
||||
node->next->prev = node->prev;
|
||||
link->size--;
|
||||
|
||||
FRISO_FREE(node);
|
||||
|
||||
return _value;
|
||||
}
|
||||
|
||||
|
||||
//add a new node to the link list.(insert just before the tail)
|
||||
FRISO_API void link_list_add(
|
||||
friso_link_t link, void * value) {
|
||||
insert_before(link, link->tail, value);
|
||||
}
|
||||
|
||||
//add a new node before the given index.
|
||||
FRISO_API void link_list_insert_before(
|
||||
friso_link_t link, uint_t idx, void * value) {
|
||||
link_node_t node = get_node(link, idx);
|
||||
if(node != NULL) {
|
||||
insert_before(link, node, value);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* get the value with the specified node.
|
||||
*
|
||||
* @return the value of the node.
|
||||
*/
|
||||
FRISO_API void * link_list_get(
|
||||
friso_link_t link, uint_t idx) {
|
||||
link_node_t node = get_node(link, idx);
|
||||
if(node != NULL) {
|
||||
return node->value;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* set the value of the node that located in the specified position.
|
||||
* we did't free the allocation of the old value, we return it to you.
|
||||
* free it yourself when it is necessary.
|
||||
*
|
||||
* @return the old value.
|
||||
*/
|
||||
FRISO_API void *link_list_set(
|
||||
friso_link_t link,
|
||||
uint_t idx, void * value) {
|
||||
link_node_t node = get_node(link, idx);
|
||||
void * _value = NULL;
|
||||
|
||||
if(node != NULL) {
|
||||
_value = node->value;
|
||||
node->value = value;
|
||||
}
|
||||
|
||||
return _value;
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the node located in the specified position.
|
||||
*
|
||||
* @see remove_node
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove(
|
||||
friso_link_t link, uint_t idx) {
|
||||
link_node_t node = get_node(link, idx);
|
||||
|
||||
if(node != NULL) {
|
||||
//printf("idx=%d, node->value=%s\n", idx, (string) node->value );
|
||||
return remove_node(link, node);
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the given node from the given link list.
|
||||
*
|
||||
* @see remove_node.
|
||||
* @return the value of the node removed.
|
||||
*/
|
||||
FRISO_API void *link_list_remove_node(
|
||||
friso_link_t link,
|
||||
link_node_t node) {
|
||||
return remove_node(link, node);
|
||||
}
|
||||
|
||||
//remove the first node after the head
|
||||
FRISO_API void *link_list_remove_first(
|
||||
friso_link_t link) {
|
||||
if(link->size > 0) {
|
||||
return remove_node(link, link->head->next);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//remove the last node just before the tail.
|
||||
FRISO_API void *link_list_remove_last(
|
||||
friso_link_t link) {
|
||||
if(link->size > 0) {
|
||||
return remove_node(link, link->tail->prev);
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
//append a node from the tail.
|
||||
FRISO_API void link_list_add_last(
|
||||
friso_link_t link,
|
||||
void *value) {
|
||||
insert_before(link, link->tail, value);
|
||||
}
|
||||
|
||||
//append a note just after the head.
|
||||
FRISO_API void link_list_add_first(
|
||||
friso_link_t link, void *value) {
|
||||
insert_before(link, link->head->next, value);
|
||||
}
|
|
@ -1,298 +0,0 @@
|
|||
/*
|
||||
* utf-8 handle functions implementation.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* ******************************************
|
||||
* fstring buffer functions implements. *
|
||||
********************************************/
|
||||
/**
|
||||
* create a new buffer
|
||||
* @Note:
|
||||
* 1. it's real length is 1 byte greater than the specifield value
|
||||
* 2. we did not do any optimization for the memory allocation to ...
|
||||
* avoid the memory defragmentation.
|
||||
*
|
||||
* @date: 2014-10-16
|
||||
*/
|
||||
__STATIC_API__ fstring create_buffer(uint_t length) {
|
||||
fstring buffer = (fstring) FRISO_MALLOC(length + 1);
|
||||
if(buffer == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
memset(buffer, 0x00, length + 1);
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
//the __allocs should not be smaller than sb->length
|
||||
__STATIC_API__ string_buffer_t resize_buffer(
|
||||
string_buffer_t sb, uint_t __allocs) {
|
||||
//create a new buffer.
|
||||
//if ( __allocs < sb->length ) __allocs = sb->length + 1;
|
||||
fstring str = create_buffer(__allocs);
|
||||
|
||||
//register uint_t t;
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// str[t] = sb->buffer[t];
|
||||
//}
|
||||
memcpy(str, sb->buffer, sb->length);
|
||||
FRISO_FREE(sb->buffer);
|
||||
|
||||
sb->buffer = str;
|
||||
sb->allocs = __allocs;
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
//create a new fstring buffer with a default opacity.
|
||||
//FRISO_API string_buffer_t new_string_buffer( void )
|
||||
//{
|
||||
// return new_string_buffer_with_opacity( __BUFFER_DEFAULT_LENGTH__ );
|
||||
//}
|
||||
|
||||
//create a new fstring buffer with the given opacity.
|
||||
FRISO_API string_buffer_t new_string_buffer_with_opacity(uint_t opacity) {
|
||||
string_buffer_t sb = (string_buffer_t)
|
||||
FRISO_MALLOC(sizeof(string_buffer_entry));
|
||||
if(sb == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
sb->buffer = create_buffer(opacity);
|
||||
sb->length = 0;
|
||||
sb->allocs = opacity;
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
//create a buffer with the given string.
|
||||
FRISO_API string_buffer_t new_string_buffer_with_string(fstring str) {
|
||||
//buffer allocations.
|
||||
string_buffer_t sb = (string_buffer_t)
|
||||
FRISO_MALLOC(sizeof(string_buffer_entry));
|
||||
if(sb == NULL) {
|
||||
___ALLOCATION_ERROR___
|
||||
}
|
||||
|
||||
//initialize
|
||||
sb->length = strlen(str);
|
||||
sb->buffer = create_buffer(sb->length + __BUFFER_DEFAULT_LENGTH__);
|
||||
sb->allocs = sb->length + __BUFFER_DEFAULT_LENGTH__;
|
||||
|
||||
//register uint_t t;
|
||||
//copy the str to the buffer.
|
||||
//for ( t = 0; t < sb->length; t++ ) {
|
||||
// sb->buffer[t] = str[t];
|
||||
//}
|
||||
memcpy(sb->buffer, str, sb->length);
|
||||
|
||||
return sb;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_append(
|
||||
string_buffer_t sb, fstring __str) {
|
||||
register uint_t __len__ = strlen(__str);
|
||||
|
||||
//check the necessity to resize the buffer.
|
||||
if(sb->length + __len__ > sb->allocs) {
|
||||
sb = resize_buffer(sb, (sb->length + __len__) * 2 + 1);
|
||||
}
|
||||
|
||||
//register uint_t t;
|
||||
////copy the __str to the buffer.
|
||||
//for ( t = 0; t < __len__; t++ ) {
|
||||
// sb->buffer[ sb->length++ ] = __str[t];
|
||||
//}
|
||||
memcpy(sb->buffer + sb->length, __str, __len__);
|
||||
sb->length += __len__;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_append_char(
|
||||
string_buffer_t sb, char ch) {
|
||||
//check the necessity to resize the buffer.
|
||||
if(sb->length + 1 > sb->allocs) {
|
||||
sb = resize_buffer(sb, sb->length * 2 + 1);
|
||||
}
|
||||
|
||||
sb->buffer[sb->length++] = ch;
|
||||
}
|
||||
|
||||
FRISO_API void string_buffer_insert(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
fstring __str) {
|
||||
}
|
||||
|
||||
/*
|
||||
* remove the given bytes from the buffer start from idx.
|
||||
* this will cause the byte move after the idx+length.
|
||||
*
|
||||
* @return the new string.
|
||||
*/
|
||||
FRISO_API fstring string_buffer_remove(
|
||||
string_buffer_t sb,
|
||||
uint_t idx,
|
||||
uint_t length) {
|
||||
uint_t t;
|
||||
//move the bytes after the idx + length
|
||||
for(t = idx + length; t < sb->length; t++) {
|
||||
sb->buffer[t - length] = sb->buffer[t];
|
||||
}
|
||||
sb->buffer[t] = '\0';
|
||||
//memcpy( sb->buffer + idx,
|
||||
// sb->buffer + idx + length,
|
||||
// sb->length - idx - length );
|
||||
|
||||
t = sb->length - idx;
|
||||
if(t > 0) {
|
||||
sb->length -= (t > length) ? length : t;
|
||||
}
|
||||
sb->buffer[sb->length - 1] = '\0';
|
||||
|
||||
return sb->buffer;
|
||||
}
|
||||
|
||||
/*
|
||||
* turn the string_buffer to a string.
|
||||
* or return the buffer of the string_buffer.
|
||||
*/
|
||||
FRISO_API string_buffer_t string_buffer_trim(string_buffer_t sb) {
|
||||
//resize the buffer.
|
||||
if(sb->length < sb->allocs - 1) {
|
||||
sb = resize_buffer(sb, sb->length + 1);
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
|
||||
/*
|
||||
* free the given fstring buffer.
|
||||
* and this function will not free the allocations of the
|
||||
* string_buffer_t->buffer, we return it to you, if there is
|
||||
* a necessary you could free it youself by calling free();
|
||||
*/
|
||||
FRISO_API fstring string_buffer_devote(string_buffer_t sb) {
|
||||
fstring buffer = sb->buffer;
|
||||
FRISO_FREE(sb);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/*
|
||||
* clear the given fstring buffer.
|
||||
* reset its buffer with 0 and reset its length to 0.
|
||||
*/
|
||||
FRISO_API void string_buffer_clear(string_buffer_t sb) {
|
||||
memset(sb->buffer, 0x00, sb->length);
|
||||
sb->length = 0;
|
||||
}
|
||||
|
||||
//free everything of the fstring buffer.
|
||||
FRISO_API void free_string_buffer(string_buffer_t sb) {
|
||||
FRISO_FREE(sb->buffer);
|
||||
FRISO_FREE(sb);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* create a new string_split_entry.
|
||||
*
|
||||
* @param source
|
||||
* @return string_split_t;
|
||||
*/
|
||||
FRISO_API string_split_t new_string_split(
|
||||
fstring delimiter,
|
||||
fstring source) {
|
||||
string_split_t e = (string_split_t)
|
||||
FRISO_MALLOC(sizeof(string_split_entry));
|
||||
if(e == NULL) {
|
||||
___ALLOCATION_ERROR___;
|
||||
}
|
||||
|
||||
e->delimiter = delimiter;
|
||||
e->delLen = strlen(delimiter);
|
||||
e->source = source;
|
||||
e->srcLen = strlen(source);
|
||||
e->idx = 0;
|
||||
|
||||
return e;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_reset(
|
||||
string_split_t sst,
|
||||
fstring delimiter,
|
||||
fstring source) {
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen(delimiter);
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_source(
|
||||
string_split_t sst, fstring source) {
|
||||
sst->source = source;
|
||||
sst->srcLen = strlen(source);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
FRISO_API void string_split_set_delimiter(
|
||||
string_split_t sst, fstring delimiter) {
|
||||
sst->delimiter = delimiter;
|
||||
sst->delLen = strlen(delimiter);
|
||||
sst->idx = 0;
|
||||
}
|
||||
|
||||
//release the splitter entry, its source and delimiter are not freed.
FRISO_API void free_string_split(string_split_t sst)
{
    FRISO_FREE(sst);
}
|
||||
|
||||
/**
|
||||
* get the next split fstring, and copy the
|
||||
* splited fstring into the __dst buffer .
|
||||
*
|
||||
* @param string_split_t
|
||||
* @param __dst
|
||||
* @return fstring (NULL if reach the end of the source
|
||||
* or there is no more segmentation)
|
||||
*/
|
||||
FRISO_API fstring string_split_next(
|
||||
string_split_t sst, fstring __dst) {
|
||||
uint_t i, _ok;
|
||||
fstring _dst = __dst;
|
||||
|
||||
//check if reach the end of the fstring
|
||||
if(sst->idx >= sst->srcLen) return NULL;
|
||||
|
||||
while(1) {
|
||||
_ok = 1;
|
||||
for(i = 0; i < sst->delLen
|
||||
&& (sst->idx + i < sst->srcLen); i++) {
|
||||
if(sst->source[sst->idx + i] != sst->delimiter[i]) {
|
||||
_ok = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//find the delimiter here,
|
||||
//break the loop and self plus the sst->idx, then return the buffer .
|
||||
if(_ok == 1) {
|
||||
sst->idx += sst->delLen;
|
||||
break;
|
||||
}
|
||||
|
||||
//coy the char to the buffer
|
||||
*_dst++ = sst->source[sst->idx++];
|
||||
//check if reach the end of the fstring
|
||||
if(sst->idx >= sst->srcLen) break;
|
||||
}
|
||||
|
||||
*_dst = '\0';
|
||||
return _dst;
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
/*
|
||||
* dynamic array test program.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int main(int argc, char **args) {
|
||||
|
||||
//create a new array list.
|
||||
friso_array_t array = new_array_list();
|
||||
fstring keys[] = {
|
||||
"chenmanwen", "yangqinghua",
|
||||
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
|
||||
"zhangrenfang", "yangjian",
|
||||
"liuxiao", "pankai",
|
||||
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
|
||||
"caizaili", "panpan", "xiaolude", "yintanwen"
|
||||
};
|
||||
int j, idx = 2, len = sizeof(keys) / sizeof(fstring);
|
||||
|
||||
for(j = 0; j < len; j++) {
|
||||
array_list_add(array, keys[j]);
|
||||
}
|
||||
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs);
|
||||
array_list_trim(array);
|
||||
printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs);
|
||||
printf("idx=%d, value=%s\n", idx, (fstring) array_list_get(array, idx));
|
||||
|
||||
printf("\nAfter set %dth item.\n", idx);
|
||||
array_list_set(array, idx, "chenxin__");
|
||||
printf("idx=%d, value=%s\n", idx, (fstring) array_list_get(array, idx));
|
||||
|
||||
printf("\nAfter remove %dth item.\n", idx);
|
||||
array_list_remove(array, idx);
|
||||
printf("length=%d, allocations=%d\n", array->length, array->allocs);
|
||||
printf("idx=%d, value=%s\n", idx, (fstring) array_list_get(array, idx));
|
||||
|
||||
printf("\nInsert a item at %dth\n", idx);
|
||||
array_list_insert(array, idx, "*chenxin*");
|
||||
printf("idx=%d, value=%s\n", idx, (fstring) array_list_get(array, idx));
|
||||
|
||||
free_array_list(array);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,161 +0,0 @@
|
|||
/*
|
||||
* Friso test program.
|
||||
* Of course you can make it a perfect demo for friso.
|
||||
* all threads or processes share the same friso_t,
|
||||
* different threads/processes use different friso_task_t.
|
||||
* and you could share the friso_config_t if you wish...
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include "friso.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define __INPUT_LENGTH__ 20480
|
||||
#define ___EXIT_INFO___ \
|
||||
println("Thanks for trying friso."); \
|
||||
break;
|
||||
|
||||
#define ___ABOUT___ \
|
||||
println("+---------------------------------------------------------------+"); \
|
||||
println("| Friso - a Chinese word segmentation writen by c. |"); \
|
||||
println("| bug report email - chenxin619315@gmail.com. |"); \
|
||||
println("| or: visit https://github.com/lionsoul2014/friso. |"); \
|
||||
println("| java version for https://github.com/lionsoul2014/jcseg |"); \
|
||||
println("| type 'quit' to exit the program. |"); \
|
||||
println("+---------------------------------------------------------------+");
|
||||
|
||||
//read a line from a command line.
|
||||
static fstring getLine(FILE *fp, fstring __dst) {
|
||||
register int c;
|
||||
register fstring cs;
|
||||
|
||||
cs = __dst;
|
||||
while((c = getc(fp)) != EOF) {
|
||||
if(c == '\n') break;
|
||||
*cs++ = c;
|
||||
}
|
||||
*cs = '\0';
|
||||
|
||||
return (c == EOF && cs == __dst) ? NULL : __dst;
|
||||
}
|
||||
|
||||
/*static void printcode( fstring str ) {
|
||||
int i,length;
|
||||
length = strlen( str );
|
||||
printf("str:length=%d\n", length );
|
||||
for ( i = 0; i < length; i++ ) {
|
||||
printf("%d ", str[i] );
|
||||
}
|
||||
putchar('\n');
|
||||
}*/
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
clock_t s_time, e_time;
|
||||
char line[__INPUT_LENGTH__] = {0};
|
||||
int i;
|
||||
fstring __path__ = NULL, mode = NULL;
|
||||
|
||||
friso_t friso;
|
||||
friso_config_t config;
|
||||
friso_task_t task;
|
||||
|
||||
// get the lexicon directory from command line arguments
|
||||
for(i = 0; i < argc; i++) {
|
||||
if(strcasecmp("-init", argv[i]) == 0) {
|
||||
__path__ = argv[i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
if(__path__ == NULL) {
|
||||
println("Usage: friso -init lexicon path");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
s_time = clock();
|
||||
|
||||
//initialize
|
||||
friso = friso_new();
|
||||
config = friso_new_config();
|
||||
/*friso_dic_t dic = friso_dic_new();
|
||||
friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
|
||||
friso_set_dic( friso, dic );
|
||||
friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
|
||||
if(friso_init_from_ifile(friso, config, __path__) != 1) {
|
||||
printf("fail to initialize friso and config.\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
switch(config->mode) {
|
||||
case __FRISO_SIMPLE_MODE__:
|
||||
mode = "Simple";
|
||||
break;
|
||||
case __FRISO_COMPLEX_MODE__:
|
||||
mode = "Complex";
|
||||
break;
|
||||
case __FRISO_DETECT_MODE__:
|
||||
mode = "Detect";
|
||||
break;
|
||||
}
|
||||
|
||||
//friso_set_mode( config, __FRISO_DETECT_MODE__ );
|
||||
//printf("clr_stw=%d\n", friso->clr_stw);
|
||||
//printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
|
||||
//printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
|
||||
|
||||
e_time = clock();
|
||||
|
||||
printf("Initialized in %fsec\n", (double)(e_time - s_time) / CLOCKS_PER_SEC);
|
||||
printf("Mode: %s\n", mode);
|
||||
printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK");
|
||||
___ABOUT___;
|
||||
|
||||
//set the task.
|
||||
task = friso_new_task();
|
||||
|
||||
while(1) {
|
||||
print("friso>> ");
|
||||
getLine(stdin, line);
|
||||
//exit the programe
|
||||
if(strcasecmp(line, "quit") == 0) {
|
||||
___EXIT_INFO___
|
||||
}
|
||||
|
||||
//for ( i = 0; i < 1000000; i++ ) {
|
||||
//set the task text.
|
||||
friso_set_text(task, line);
|
||||
println("分词结果:");
|
||||
|
||||
s_time = clock();
|
||||
while((config->next_token(friso, config, task)) != NULL) {
|
||||
printf(
|
||||
"%s[%d, %d, %d] ",
|
||||
task->token->word,
|
||||
task->token->offset,
|
||||
task->token->length,
|
||||
task->token->rlen
|
||||
);
|
||||
// printf("%s ", task->token->word);
|
||||
}
|
||||
//}
|
||||
e_time = clock();
|
||||
printf("\nDone, cost < %fsec\n", ((double)(e_time - s_time)) / CLOCKS_PER_SEC);
|
||||
|
||||
}
|
||||
|
||||
friso_free_task(task);
|
||||
|
||||
//error block.
|
||||
err:
|
||||
friso_free_config(config);
|
||||
friso_free(friso);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,65 +0,0 @@
|
|||
/**
|
||||
* hashmap testing program
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso_API.h"
|
||||
#include <stdio.h>
|
||||
|
||||
/* print the length, size, factor and threshold fields of the hash table. */
void print_hash_info(friso_hash_t _hash)
{
    printf(
        "info:length=%d, size=%d, facotr=%f, threshold=%d\n",
        _hash->length,
        _hash->size,
        _hash->factor,
        _hash->threshold
    );
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
friso_hash_t _hash = new_hash_table();
|
||||
char *names[] = {
|
||||
"陈满文", "阳清华",
|
||||
"陈鑫", "罗江艳",
|
||||
"小燕子", "比比",
|
||||
"张仁芳", "阳建",
|
||||
"陈配", "李恒",
|
||||
"张志刚", "张怡少",
|
||||
"阳江波", "蔡再利",
|
||||
"阳绘章", "尹唐文",
|
||||
"谭志鹏", "肖路德",
|
||||
"潘凯", "刘潇",
|
||||
"马朝辉", "张强",
|
||||
"殷美林", "元明清",
|
||||
"周安", "郭桥安",
|
||||
"刘敏", "黄广华",
|
||||
"李胜", "黄海清"
|
||||
};
|
||||
//char *str[] = {"陈鑫", "张仁芳", "比比"};
|
||||
char **str = names;
|
||||
int j, len = 30;
|
||||
|
||||
print_hash_info(_hash);
|
||||
for(j = 0; j < len; j++) {
|
||||
hash_put_mapping(_hash, names[j], names[j]);
|
||||
}
|
||||
|
||||
print_hash_info(_hash);
|
||||
|
||||
printf("Press any key to continue.");
|
||||
getchar();
|
||||
|
||||
//remove mappings
|
||||
for(j = 0; j < len; j++) {
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping(_hash, str[j]));
|
||||
printf("Now, remove %s\n", str[j]);
|
||||
hash_remove_mapping(_hash, str[j]);
|
||||
printf("Exist %s?%2d\n", str[j], hash_exist_mapping(_hash, str[j]));
|
||||
printf("*********************************\n");
|
||||
}
|
||||
|
||||
printf("Press any key to continue.");
|
||||
getchar();
|
||||
|
||||
print_hash_info(_hash);
|
||||
//free the table
|
||||
free_hash_table(_hash, 0);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,108 +0,0 @@
|
|||
/*
|
||||
* lex functions test program.
|
||||
*
|
||||
* @author lionsoul<chenxin619315@gmail.com>
|
||||
*/
|
||||
|
||||
#include "friso.h"
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
#include <string.h>
|
||||
|
||||
#define __LENGTH__ 15
|
||||
#define ___PRINT_HELP_INFO___ \
|
||||
printf("1. help print the current menu.\n"); \
|
||||
printf("2. #set set the classify of the dictionary.\n"); \
|
||||
printf("3. other search the words in the dictionary.\n"); \
|
||||
printf("4. quit exit the programe.\n");
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
lex_entry_t e;
|
||||
int lex = __LEX_CJK_WORDS__;
|
||||
char _line[__LENGTH__];
|
||||
clock_t s_time, e_time;
|
||||
friso_t friso;
|
||||
friso_config_t config;
|
||||
|
||||
s_time = clock();
|
||||
friso = friso_new();
|
||||
config = friso_new_config();
|
||||
config->add_syn = 0;
|
||||
friso->dic = friso_dic_new();
|
||||
|
||||
//__CJK_WORDS__
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-main.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-admin.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-chars.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-mz.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-place.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-company.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-festival.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-flname.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-food.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-lang.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-nation.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-net.lex", __LENGTH__);
|
||||
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-org.lex", __LENGTH__);
|
||||
|
||||
//__CJK_UNITS__
|
||||
friso_dic_load(friso, config, __LEX_CJK_UNITS__, "../vendors/dict/UTF-8/lex-units.lex", __LENGTH__);
|
||||
//__MIX_WORDS__
|
||||
friso_dic_load(friso, config, __LEX_ECM_WORDS__, "../vendors/dict/UTF-8/lex-ecmixed.lex", __LENGTH__);
|
||||
//__CN_LNAME__
|
||||
friso_dic_load(friso, config, __LEX_CN_LNAME__, "../vendors/dict/UTF-8/lex-lname.lex", __LENGTH__);
|
||||
//__CN_SNAME__
|
||||
friso_dic_load(friso, config, __LEX_CN_SNAME__, "../vendors/dict/UTF-8/lex-sname.lex", __LENGTH__);
|
||||
//__CN_DNAME1__
|
||||
friso_dic_load(friso, config, __LEX_CN_DNAME1__, "../vendors/dict/UTF-8/lex-dname-1.lex", __LENGTH__);
|
||||
//__CN_DNAME2__
|
||||
friso_dic_load(friso, config, __LEX_CN_DNAME2__, "../vendors/dict/UTF-8/lex-dname-2.lex", __LENGTH__);
|
||||
//__CN_LNA__
|
||||
friso_dic_load(friso, config, __LEX_CN_LNA__, "../vendors/dict/UTF-8/lex-ln-adorn.lex", __LENGTH__);
|
||||
|
||||
e_time = clock();
|
||||
printf(
|
||||
"Done, cost: %f sec, size=%d\n",
|
||||
(double)(e_time - s_time) / CLOCKS_PER_SEC,
|
||||
friso_all_dic_size(friso->dic)
|
||||
);
|
||||
|
||||
while(1) {
|
||||
printf("friso-%d>> ", lex);
|
||||
if(scanf("%s", _line) != 1) {
|
||||
printf("Invalid input\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
if(strcmp(_line, "quit") == 0) {
|
||||
break;
|
||||
} else if(strcmp(_line, "help") == 0) {
|
||||
___PRINT_HELP_INFO___
|
||||
} else if(strcmp(_line, "#set") == 0) {
|
||||
printf("lex_t>> ");
|
||||
if(scanf("%d", &lex) != 1) {
|
||||
printf("Warning: Invalid lex type input\n");
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
s_time = clock();
|
||||
e = friso_dic_get(friso->dic, lex, _line);
|
||||
e_time = clock();
|
||||
if(e != NULL) {
|
||||
printf(
|
||||
"word=%s, syn=%s, fre=%d, cost:%fsec\n",
|
||||
e->word, e->syn == NULL ? "NULL" : (char *)e->syn->items[0],
|
||||
e->fre,
|
||||
(double)(e_time - s_time) / CLOCKS_PER_SEC
|
||||
);
|
||||
} else {
|
||||
printf("%s was not found.\n", _line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// friso_dic_free( friso->dic );
|
||||
friso_free(friso);
|
||||
|
||||
return 0;
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue