Merge remote-tracking branch 'upstream-gitee/openkylin/yangtze' into packaging/openkylin/yangtze

This commit is contained in:
iaom 2022-10-24 10:45:13 +08:00
commit 1832b01721
348 changed files with 459999 additions and 391548 deletions

View File

@ -1,9 +1,9 @@
<schemalist gettext-domain="ukui-search">
<schema id="org.ukui.search.settings" path="/org/ukui/ukui-search/settings/">
<key name="index-search" type="b">
<key name="file-index-enable" type="b">
<default>false</default>
<summary>search method</summary>
<description>Is current search-method index-search.</description>
<summary>file index switch</summary>
<description>Enable or disable file index service.</description>
</key>
<key name="web-engine" type="s">
<default>"baidu"</default>

View File

@ -0,0 +1,16 @@
[Desktop Entry]
Name=ukui-search-app-data-service
Name[zh_CN]=应用数据搜索服务
GenericName=ukui-search-app-data-service
GenericName[zh_CN]=应用数据搜索服务
Comment=ukui-search-app-data-service
Comment[zh_CN]=应用数据搜索服务
Exec=/usr/bin/ukui-search-app-data-service %U
Type=Application
Icon=kylin-search
X-UKUI-AutoRestart=true
OnlyShowIn=UKUI
NoDisplay=true
X-UKUI-Autostart-Phase=Application
Terminal=false

View File

@ -0,0 +1,15 @@
[Desktop Entry]
Name=ukui-search-service-dir-manager
Name[zh_CN]=搜索服务目录管理
GenericName=ukui-search-service-dir-manager
GenericName[zh_CN]=搜索服务目录管理
Comment=ukui-search-service-dir-manager
Comment[zh_CN]=搜索服务目录管理
Exec=/usr/bin/ukui-search-service-dir-manager %U
Type=Application
Icon=kylin-search
X-UKUI-AutoRestart=true
OnlyShowIn=UKUI
NoDisplay=true
X-UKUI-Autostart-Phase=Application
Terminal=false

View File

@ -1 +1 @@
3.0 (quilt)
3.0 (native)

View File

@ -21,7 +21,7 @@
#include "create-index-ask-dialog.h"
#include <QPainterPath>
#include "kwindowsystem.h"
#include <KWindowSystem>
#define MAIN_SIZE QSize(380, 202)
#define MAIN_SPACING 0

View File

@ -45,7 +45,7 @@ SearchLineEdit::SearchLineEdit(QWidget *parent) : QLineEdit(parent) {
pixmap = QPixmap(QIcon(":/res/icons/system-search.symbolic.png").pixmap(QSize(18, 18)));
}
m_queryIcon->setProperty("useIconHighlightEffect", 0x10);
m_queryIcon->setFixedSize(pixmap.size());
m_queryIcon->setFixedSize(pixmap.size() / pixmap.devicePixelRatio());
m_queryIcon->setPixmap(pixmap);
m_ly = new QHBoxLayout(this);
@ -75,6 +75,11 @@ SearchLineEdit::SearchLineEdit(QWidget *parent) : QLineEdit(parent) {
m_timer->start(0.1 * 1000);
}
});
//跟随主题透明度变化
connect(qApp, &QApplication::paletteChanged, this, [=]() {
update();
});
}
SearchLineEdit::~SearchLineEdit() {

View File

@ -32,7 +32,7 @@
#include <QPushButton>
#include <QScrollArea>
#include <QTimer>
#include <libsearch.h>
#include "libsearch.h"
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
#include "xatom-helper.h"
#endif

View File

@ -341,24 +341,25 @@ void ResultArea::mouseReleaseEvent(QMouseEvent *event)
bool ResultArea::viewportEvent(QEvent *event)
{
if(event->type() == QEvent::TouchBegin) {
QTouchEvent *e = dynamic_cast<QTouchEvent *>(event);
if(e->touchPoints().size() == 1) {
m_pressPoint = m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint());
if (event->type() == QEvent::MouseButtonPress) {
QMouseEvent *e = dynamic_cast<QMouseEvent *>(event);
if (e->source() == Qt::MouseEventSynthesizedByApplication) {
qDebug() << "MouseButtonPress MouseEventSynthesizedByApplication";
m_pressPoint = m_widget->mapFrom(this, e->pos());
event->accept();
return true;
}
} else if (event->type() == QEvent::TouchUpdate) {
QTouchEvent *e = dynamic_cast<QTouchEvent *>(event);
// qDebug() << "touchpoint===========" << e->touchPoints().size();
if(e->touchPoints().size() == 1) {
int delta = m_pressPoint.y() - m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint()).y();
} else if (event->type() == QEvent::MouseMove) {
QMouseEvent *e = dynamic_cast<QMouseEvent *>(event);
if (e->source() == Qt::MouseEventSynthesizedByApplication) {
qDebug() << "MouseMove MouseEventSynthesizedByApplication";
int delta = m_pressPoint.y() - m_widget->mapFrom(this, e->pos()).y();
// qDebug() << "last pos:" << m_pressPoint.y();
// qDebug() << "new pos:" << m_widget->mapFrom(this, e->touchPoints().at(0).pos().toPoint()).y();
// qDebug() << "delta" << delta;
// qDebug() << "height" << m_widget->height() << "--" << verticalScrollBar()->maximum();
// qDebug() << "value" << verticalScrollBar()->value() << "--" << verticalScrollBar()->value() + delta;
this->verticalScrollBar()->setValue(verticalScrollBar()->value() + delta);
m_pressPoint = m_widget->mapFrom(this,e->touchPoints().at(0).pos().toPoint());
m_pressPoint = m_widget->mapFrom(this,e->pos());
return true;
}
}
@ -458,12 +459,13 @@ void ResultArea::initConnections()
connect(this->m_titleLabel, &TitleLabel::retractClicked, this, [=] () {
Q_FOREACH(auto widget, m_widget_list) {
if (widget->pluginName() == m_titleLabel->text()) {
widget->reduceListSlot();
widget->resetTitleLabel();
if (!m_titleLabel->isHidden()) {
m_titleLabel->hide();
this->setViewportMargins(0,0,0,0);
}
widget->reduceListSlot();
this->verticalScrollBar()->setValue(widget->pos().ry());
widget->resetTitleLabel();
}
}
});
@ -510,8 +512,8 @@ void ResultArea::setupConnectionsForWidget(ResultWidget *widget)
});
connect(widget, &ResultWidget::retractClicked, this, [=] () {//点击收起搜索结果后
if (!m_titleLabel->isHidden()) {
m_titleLabel->hide();
this->setViewportMargins(0,0,0,0);
m_titleLabel->hide();
}
});
connect(widget, &ResultWidget::sendBestListData, m_bestListWidget, &BestListWidget::sendBestListData);

View File

@ -173,6 +173,10 @@ void SearchResultPage::initConnections()
sendResizeWidthSignal(280);
});
connect(this, &SearchResultPage::setSelectionInfo, m_resultArea, &ResultArea::setSelectionInfo);
//跟随主题透明度变化
connect(qApp, &QApplication::paletteChanged, this, [=]() {
update();
});
}
void SearchResultPage::setupConnectionsForWidget(ResultWidget *widget)

View File

@ -1,13 +1,13 @@
QT += core gui dbus KWindowSystem xml x11extras
QT += core gui dbus KWindowSystem xml x11extras sql
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
VERSION = 1.0.0
VERSION = 2.2.3
DEFINES += VERSION='\\"$${VERSION}\\"'
TARGET = ukui-search
TEMPLATE = app
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0
PKGCONFIG += gio-2.0 glib-2.0 gio-unix-2.0 kysdk-waylandhelper
CONFIG += c++11 link_pkgconfig no_keywords lrelease
LIBS += -lxapian -lgsettings-qt -lquazip5 -lX11
#LIBS += -lukui-log4qt
@ -59,7 +59,7 @@ RESOURCES += \
TRANSLATIONS += \
../translations/ukui-search/zh_CN.ts \
../translations/ukui-search/tr.ts \
../translations/ukui-search/bo.ts
../translations/ukui-search/bo_CN.ts
qm_files.path = /usr/share/ukui-search/translations/
qm_files.files = $$OUT_PWD/.qm/*.qm

View File

@ -31,15 +31,18 @@
#include <QPixmap>
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
#include <KWindowEffects>
#include "kwindowsystem.h"
#include <KWindowSystem>
#endif
#include "global-settings.h"
#include <QtX11Extras/QX11Info>
#include "ukuistylehelper/ukuistylehelper.h"
#include "windowmanager/windowmanager.h"
#include "global-settings.h"
#define MAIN_MARGINS 0, 0, 0, 0
#define TITLE_MARGINS 0,0,0,0
#define UKUI_SEARCH_SCHEMAS "org.ukui.search.settings"
#define SEARCH_METHOD_KEY "indexSearch"
#define SEARCH_METHOD_KEY "fileIndexEnable"
#define WEB_ENGINE_KEY "webEngine"
#define WINDOW_WIDTH 700
#define WINDOW_HEIGHT 610
@ -79,12 +82,19 @@ MainWindow::MainWindow(QWidget *parent) :
initConnections();
initGsettings();
connect(KWindowSystem::self(), &KWindowSystem::activeWindowChanged, this,[&](WId activeWindowId){
if (activeWindowId != this->winId()) {
tryHideMainwindow();
}
});
// connect(KWindowSystem::self(), &KWindowSystem::activeWindowChanged, this,[&](WId activeWindowId){
// qDebug() << "activeWindowChanged!!!" << activeWindowId;
// if (activeWindowId != this->winId()) {
// tryHideMainwindow();
// }
// });
m_appWidgetPlugin = new AppWidgetPlugin;
connect(m_appWidgetPlugin, &AppWidgetPlugin::startSearch, this, [ & ] (QString keyword){
this->bootOptionsFilter("-s");
this->setText(keyword);
});
//NEW_TODO, register plugins
// SearchPluginManager::getInstance()->registerPlugin(\\);
// m_stackedWidget->setPlugins(SearchPluginManager::getInstance()->getPluginIds());
@ -150,11 +160,11 @@ void MainWindow::initUi() {
//创建索引询问弹窗
m_askDialog = new CreateIndexAskDialog(this);
#if (QT_VERSION >= QT_VERSION_CHECK(5, 12, 0))
MotifWmHints ask_dialog_hints;
ask_dialog_hints.flags = MWM_HINTS_FUNCTIONS | MWM_HINTS_DECORATIONS;
ask_dialog_hints.functions = MWM_FUNC_ALL;
ask_dialog_hints.decorations = MWM_DECOR_BORDER;
XAtomHelper::getInstance()->setWindowMotifHint(m_askDialog->winId(), ask_dialog_hints);
// MotifWmHints ask_dialog_hints;
// ask_dialog_hints.flags = MWM_HINTS_FUNCTIONS | MWM_HINTS_DECORATIONS;
// ask_dialog_hints.functions = MWM_FUNC_ALL;
// ask_dialog_hints.decorations = MWM_DECOR_BORDER;
// XAtomHelper::getInstance()->setWindowMotifHint(m_askDialog->winId(), ask_dialog_hints);
#endif
}
@ -192,7 +202,6 @@ void MainWindow::bootOptionsFilter(QString opt) {
if (this->isHidden()) {
clearSearchResult();
centerToScreen(this);
this->show();
this->m_searchBarWidget->setFocus();
this->activateWindow();
}
@ -217,7 +226,6 @@ void MainWindow::trayIconActivatedSlot(QSystemTrayIcon::ActivationReason reason)
if(!this->isVisible()) {
clearSearchResult();
centerToScreen(this);
this->show();
// this->m_searchLineEdit->focusIn(); //打开主界面时输入框夺焦,可直接输入
this->raise();
this->activateWindow();
@ -301,10 +309,12 @@ void MainWindow::searchKeywordSlot(const QString &keyword)
//NEW_TODO
if(keyword == "") {
// m_stackedWidget->setPage(int(StackedPage::HomePage));
QTimer::singleShot(10, this, [ = ]() {
m_askTimer->stop();
Q_EMIT m_searchResultPage->stopSearch();
// Q_EMIT m_searchResultPage->stopSearch();
m_searchResultPage->hide();
this->resizeHeight(68);
});
} else {
// m_stackedWidget->setPage(int(StackedPage::SearchPage));
@ -327,6 +337,11 @@ void MainWindow::resizeHeight(int height)
this->setFixedHeight(height);
}
void MainWindow::tryHide()
{
this->tryHideMainwindow();
}
/**
* @brief monitorResolutionChange
* @param rect
@ -421,7 +436,16 @@ void MainWindow::centerToScreen(QWidget* widget) {
// desk_x = width;
// desk_y = height;
// }
widget->move(desk_x / 2 - x / 2 + desk_rect.left(), desk_y / 3 + desk_rect.top());
widget->show();
kdk::WindowManager::setGeometry(this->windowHandle(),QRect(desk_x / 2 - x / 2 + desk_rect.left(),
desk_y / 3 + desk_rect.top(),
this->width(),
this->height()));
//设置跳过多任务视图
kdk::WindowManager::setSkipSwitcher(this->windowHandle(),true);
//设置跳过任务栏
kdk::WindowManager::setSkipTaskBar(this->windowHandle(),true);
// widget->move(desk_x / 2 - x / 2 + desk_rect.left(), desk_y / 3 + desk_rect.top());
}
void MainWindow::initGsettings() {
@ -460,7 +484,12 @@ void MainWindow::initTimer() {
connect(m_askTimer, &QTimer::timeout, this, [ = ]() {
if(this->isVisible()) {
m_isAskDialogVisible = true;
kdk::UkuiStyleHelper::self()->removeHeader(m_askDialog);
m_askDialog->show();
//设置跳过多任务视图
kdk::WindowManager::setSkipSwitcher(m_askDialog->windowHandle(),true);
//设置跳过任务栏
kdk::WindowManager::setSkipTaskBar(m_askDialog->windowHandle(),true);
m_currentSearchAsked = true;
}
m_askTimer->stop();
@ -558,11 +587,21 @@ void MainWindow::paintEvent(QPaintEvent *event) {
bool MainWindow::eventFilter(QObject *watched, QEvent *event)
{
if (watched == this) {
//失焦退出
if (event->type() == QEvent::ActivationChange) {
if (QApplication::activeWindow() != this) {
tryHideMainwindow();
return true;
}
}
//kwin alt+f4发出close事件, 需要在存在子窗口时屏蔽该事件
if ((watched == this) && (event->type() == QEvent::Close)) {
if (event->type() == QEvent::Close) {
event->ignore();
tryHideMainwindow();
return true;
}
}
return QObject::eventFilter(watched, event);
}

View File

@ -46,6 +46,7 @@
#include <QSystemTrayIcon>
#include <QTimer>
#include "search-app-widget-plugin/search.h"
#include "index-generator.h"
#include "libsearch.h"
#include "create-index-ask-dialog.h"
@ -110,6 +111,7 @@ public Q_SLOTS:
void settingsBtnClickedSlot();
void searchKeywordSlot(const QString&);
void resizeHeight(int height);
void tryHide();
private:
@ -142,6 +144,7 @@ private:
QTimer * m_researchTimer = nullptr; //创建索引后重新执行一次搜索的计时器
bool m_currentSearchAsked = false; //本次搜索是否已经询问过是否创建索引了
QGSettings * m_search_gsettings = nullptr;
AppWidgetPlugin *m_appWidgetPlugin = nullptr;
void setSearchMethod(const bool&);
double getTransparentData();

View File

@ -67,6 +67,7 @@ ReceiveResultThread::ReceiveResultThread(DataQueue<SearchPluginIface::ResultInfo
/**
 * @brief Asks the thread to stop and blocks until it has finished.
 *
 * The shutdown requests are issued before waiting. requestInterruption()
 * covers a run() that polls isInterruptionRequested(); quit() covers a run()
 * that executes an event loop (it does nothing if there is no event loop).
 * Calling quit() only after wait() returned could never have an effect, and
 * would block forever if the thread was sitting in an event loop.
 *
 * NOTE(review): run() is not visible here — confirm which of the two
 * mechanisms it relies on.
 */
void ReceiveResultThread::stop()
{
    this->requestInterruption();
    this->quit();
    // Block the caller until run() has returned.
    this->wait();
}

View File

@ -3,7 +3,6 @@
<file>res/icons/edit-find-symbolic.svg</file>
<file>res/icons/desktop.png</file>
<file>res/icons/close.svg</file>
<file>res/qt-translations/qt_zh_CN.qm</file>
<file>res/icons/net-disconnected.svg</file>
<file>res/icons/system-search.symbolic.png</file>
<file>res/icons/ukui-up-symbolic.svg</file>

View File

@ -12,6 +12,15 @@ void UkuiSearchDbusServices::searchKeyword(QString keyword)
m_mainWindow->setText(keyword);
}
/**
 * @brief Toggles the main window.
 *
 * When the main window is not the active window it is brought up through
 * bootOptionsFilter("-s"); when it is the active window it is asked to hide.
 */
void UkuiSearchDbusServices::mainWindowSwitch()
{
    if (!m_mainWindow->isActiveWindow()) {
        m_mainWindow->bootOptionsFilter("-s");
        return;
    }
    m_mainWindow->tryHide();
}
UkuiSearchDbusServices::UkuiSearchDbusServices(MainWindow *m)
{
m_mainWindow = m;

View File

@ -20,9 +20,10 @@ public:
public Q_SLOTS:
void showWindow();
void searchKeyword(QString keyword);
void mainWindowSwitch();
private:
MainWindow *m_mainWindow;
MainWindow *m_mainWindow = nullptr;
};
}

View File

@ -35,7 +35,7 @@ UkuiSearchGui::UkuiSearchGui(int &argc, char *argv[], const QString &application
QTranslator *qt_translator = new QTranslator(this);
try {
if(! qt_translator->load(":/res/qt-translations/qt_zh_CN.qm")) throw - 1;
if(! qt_translator->load("/usr/share/qt5/translations/qt_" + QLocale::system().name())) throw - 1;
this->installTranslator(qt_translator);
} catch(...) {
qDebug() << "Load translations file" << QLocale() << "failed!";

View File

@ -31,13 +31,24 @@ bool BestListView::isSelected()
int BestListView::showHeight()
{
int height;
int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
int height(0);
// int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
// if (this->isExpanded()) {
// height = m_count * rowheight;
// } else {
// int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
// height = show_count * rowheight;
// }
if (this->isExpanded()) {
height = m_count * rowheight;
for (int i = 0; i<m_count; ++i) {
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
}
} else {
int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
height = show_count * rowheight;
for (int i = 0; i<show_count; ++i) {
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
}
}
return height;
}

View File

@ -3,15 +3,15 @@
using namespace UkuiSearch;
static ResultItemStyle *global_instance_of_item_style = nullptr;
ResultViewDelegate::ResultViewDelegate(QObject *parent) : QStyledItemDelegate(parent)
ResultViewDelegate::ResultViewDelegate(QObject *parent) : QStyledItemDelegate(parent),
m_textDoc(new QTextDocument(this)),
m_hightLightEffectHelper(new HightLightEffectHelper(this))
{
}
void ResultViewDelegate::setSearchKeyword(const QString &regFindKeyWords)
{
m_regFindKeyWords.clear();
m_regFindKeyWords = regFindKeyWords;
m_hightLightEffectHelper->setExpression(regFindKeyWords);
}
QSize ResultViewDelegate::sizeHint(const QStyleOptionViewItem &option, const QModelIndex &index) const
@ -21,90 +21,36 @@ QSize ResultViewDelegate::sizeHint(const QStyleOptionViewItem &option, const QMo
return size;
}
void ResultViewDelegate::paint(QPainter * painter, const QStyleOptionViewItem & option, const QModelIndex & index) const {
void ResultViewDelegate::paint(QPainter *painter, const QStyleOptionViewItem &option, const QModelIndex &index) const
{
QStyleOptionViewItem opt = option;
initStyleOption(&opt, index);
QStyle *style = opt.widget->style();
opt.displayAlignment = Qt::Alignment(Qt::AlignLeft|Qt::AlignVCenter);
QString text = opt.text;
if(text.isEmpty()) {
return;
}
opt.text = QString();
QStyle *style = opt.widget->style();
style->proxy()->drawControl(QStyle::CE_ItemViewItem, &opt, painter, opt.widget); //绘制非文本区域内容
opt.text = text;
QTextDocument doc;
doc.setHtml(getHtmlText(painter, opt, index)); //提取富文本
QAbstractTextDocumentLayout* layout = doc.documentLayout();
const double height = layout->documentSize().height();
QRect textRect = style->subElementRect(QStyle::SE_ItemViewItemText, &opt, opt.widget);
//使图标和文本间隔与原来保持一致故文本区域右移4
// textRect.adjust(4, 0, 0, 0);
double y = textRect.y();
y += (textRect.height() - height) / 2;
QFontMetrics fontMetrics(opt.font);
text = fontMetrics.elidedText(text, Qt::ElideRight, textRect.width() - 5); //富余5px的宽度
opt.text = text;
QAbstractTextDocumentLayout::PaintContext context;
QPalette::ColorGroup cg = opt.state & QStyle::State_Enabled
? QPalette::Normal : QPalette::Disabled;
if (cg == QPalette::Normal && !(opt.state & QStyle::State_Active))
cg = QPalette::Inactive;
if(opt.state & QStyle::State_Selected) {
painter->setPen(opt.palette.color(cg, QPalette::HighlightedText));
} else {
painter->setPen(opt.palette.color(cg, QPalette::Text));
}
painter->save();
painter->translate(QPointF(textRect.x(), y));
layout->draw(painter, context); //绘制文本区域内容
if(opt.state & QStyle::State_Selected) {
m_hightLightEffectHelper->setTextColor(QBrush(opt.palette.highlightedText().color()));
} else {
m_hightLightEffectHelper->setTextColor(QBrush(opt.palette.text().color()));
}
painter->translate(textRect.topLeft());
m_textDoc->setPlainText(text);
m_hightLightEffectHelper->setDocument(m_textDoc);
m_hightLightEffectHelper->rehighlight();
m_textDoc->drawContents(painter);
painter->restore();
}
QString ResultViewDelegate::getHtmlText(QPainter *painter, const QStyleOptionViewItem &itemOption, const QModelIndex &index) const
{
int indexFindLeft = 0;
QString indexString = index.model()->data(index, Qt::DisplayRole).toString();
QFont ft(painter->font().family(), GlobalSettings::getInstance()->getValue(FONT_SIZE_KEY).toInt());
QFontMetrics fm(ft);
QString indexColString = fm.elidedText(indexString, Qt::ElideRight, itemOption.rect.width() - 30 - 10); //当字体超过Item的长度时显示为省略号
QString htmlString;
if((indexColString.toUpper()).contains((m_regFindKeyWords.toUpper()))) {
indexFindLeft = indexColString.toUpper().indexOf(m_regFindKeyWords.toUpper()); //得到查找字体在当前整个Item字体中的位置
htmlString = escapeHtml(indexColString.left(indexFindLeft)) + "<b>" + escapeHtml(indexColString.mid(indexFindLeft, m_regFindKeyWords.length())) + "</b>" + escapeHtml(indexColString.right(indexColString.length() - indexFindLeft - m_regFindKeyWords.length()));
} else {
bool boldOpenned = false;
for(int i = 0; i < indexColString.length(); i++) {
if((m_regFindKeyWords.toUpper()).contains(QString(indexColString.at(i)).toUpper())) {
if(! boldOpenned) {
boldOpenned = true;
htmlString.append(QString("<b>"));
}
htmlString.append(escapeHtml(QString(indexColString.at(i))));
} else {
if(boldOpenned) {
boldOpenned = false;
htmlString.append(QString("</b>"));
}
htmlString.append(escapeHtml(QString(indexColString.at(i))));
}
}
}
// qDebug()<<indexColString<<"---->"<<htmlString;
return "<pre>" + htmlString + "</pre>";
}
QString ResultViewDelegate::escapeHtml(const QString &str) const
{
QString temp = str;
temp.replace("<", "&lt;");
temp.replace(">", "&gt;");
return temp;
}
ResultItemStyle *ResultItemStyle::getStyle()
@ -259,3 +205,32 @@ void ResultItemStyle::drawControl(QStyle::ControlElement element, const QStyleOp
break;
}
}
/**
 * @brief Creates the highlighter and configures its search expression.
 *
 * The expression is matched as a fixed string (no regular-expression
 * metacharacters) and without regard to letter case.
 */
HightLightEffectHelper::HightLightEffectHelper(QObject *parent)
    : QSyntaxHighlighter(parent)
{
    m_expression.setPatternSyntax(QRegExp::FixedString);
    m_expression.setCaseSensitivity(Qt::CaseInsensitive);
}
/**
 * @brief Sets the pattern of the expression this highlighter searches for.
 * @param text New pattern stored in m_expression.
 */
void HightLightEffectHelper::setExpression(const QString &text)
{
m_expression.setPattern(text);
}
/**
 * @brief Sets the foreground brush of the character format used for highlighting.
 * @param brush Brush applied as the foreground of m_textCharFormat.
 */
void HightLightEffectHelper::setTextColor(const QBrush &brush)
{
m_textCharFormat.setForeground(brush);
}
/**
 * @brief Formats one text block: the whole block gets the current character
 *        format, and every match of the search expression is made bold.
 * @param text Contents of the block being highlighted.
 */
void HightLightEffectHelper::highlightBlock(const QString &text)
{
    // Apply the base format (colour) to the entire block first.
    setFormat(0, text.length(), m_textCharFormat);

    // Temporarily switch the shared format to bold for the matched ranges.
    m_textCharFormat.setFontWeight(QFont::Bold);
    int index = text.indexOf(m_expression);
    while (index >= 0) {
        const int length = m_expression.matchedLength();
        if (length <= 0) {
            // A zero-length match (e.g. an empty pattern) would leave the
            // search position unchanged and the loop would never terminate.
            break;
        }
        setFormat(index, length, m_textCharFormat);
        index = text.indexOf(m_expression, index + length);
    }
    // Restore the normal weight so the next block's base format is not bold.
    m_textCharFormat.setFontWeight(QFont::Normal);
}

View File

@ -27,10 +27,29 @@
#include <QTextDocument>
#include <QAbstractTextDocumentLayout>
#include <QProxyStyle>
#include <QSyntaxHighlighter>
#include <QTextCharFormat>
#include <QRegExp>
#include "global-settings.h"
namespace UkuiSearch {
class ResultViewDelegate : public QStyledItemDelegate {
class HightLightEffectHelper : public QSyntaxHighlighter
{
public:
explicit HightLightEffectHelper(QObject *parent = nullptr);
void setExpression(const QString &text);
void setTextColor(const QBrush &brush);
protected:
void highlightBlock(const QString &text);
private:
QRegExp m_expression;
QTextCharFormat m_textCharFormat;
};
class ResultViewDelegate : public QStyledItemDelegate
{
Q_OBJECT
public:
explicit ResultViewDelegate(QObject *parent = nullptr);
@ -38,11 +57,12 @@ public:
void setSearchKeyword(const QString &);
protected:
QSize sizeHint(const QStyleOptionViewItem &option, const QModelIndex &index) const;
private:
QString m_regFindKeyWords = 0;
void paint(QPainter *, const QStyleOptionViewItem &, const QModelIndex &) const override;
QString getHtmlText(QPainter *, const QStyleOptionViewItem &, const QModelIndex &) const;
QString escapeHtml(const QString&) const;
private:
QTextDocument *m_textDoc = nullptr;
HightLightEffectHelper *m_hightLightEffectHelper = nullptr;
};
class ResultItemStyle : public QProxyStyle

View File

@ -168,6 +168,7 @@ ResultView::ResultView(const QString &plugin_id, QWidget *parent) : QTreeView(pa
{
// setStyle(ResultItemStyle::getStyle());
this->setFrameShape(QFrame::NoFrame);
this->viewport()->setAttribute(Qt::WA_AcceptTouchEvents);
this->viewport()->setAutoFillBackground(false);
this->setIconSize(QSize(VIEW_ICON_SIZE, VIEW_ICON_SIZE));
this->setRootIsDecorated(false);
@ -181,6 +182,9 @@ ResultView::ResultView(const QString &plugin_id, QWidget *parent) : QTreeView(pa
m_plugin_id = plugin_id;
m_styleDelegate = new ResultViewDelegate(this);
this->setItemDelegate(m_styleDelegate);
m_touchTimer = new QTimer(this);
m_touchTimer->setSingleShot(true);
m_touchTimer->setInterval(100);
}
bool ResultView::isSelected()
@ -190,13 +194,23 @@ bool ResultView::isSelected()
int ResultView::showHeight()
{
int height;
int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
int height(0);
// int rowheight = this->rowHeight(this->model()->index(0, 0, QModelIndex()));
// if (this->isExpanded()) {
// height = m_count * rowheight;
// } else {
// int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
// height = show_count * rowheight;
// }
if (this->isExpanded()) {
height = m_count * rowheight;
for (int i = 0; i<m_count; ++i) {
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
}
} else {
int show_count = m_count > NUM_LIMIT_SHOWN_DEFAULT ? NUM_LIMIT_SHOWN_DEFAULT : m_count;
height = show_count * rowheight;
for (int i = 0; i<show_count; ++i) {
height += this->rowHeight(this->model()->index(i, 0, QModelIndex()));
}
}
return height;
}
@ -268,10 +282,10 @@ void ResultView::onRowSelectedSlot(const QModelIndex &index)
void ResultView::onItemListChanged(const int &count)
{
m_count = count;
Q_EMIT this->listLengthChanged(count);
QModelIndex index = this->currentIndex();
m_model->refresh();
this->setCurrentIndex(index);
Q_EMIT this->listLengthChanged(count);
}
void ResultView::setExpanded(const bool &is_expanded)
@ -334,6 +348,53 @@ void ResultView::mouseMoveEvent(QMouseEvent *event)
return QTreeView::mouseMoveEvent(event);
}
/**
 * @brief Translates touch events on the viewport into synthesized mouse events.
 *
 * - TouchBegin:  sends a MouseButtonPress to parent() and starts m_touchTimer.
 * - TouchUpdate: sends a MouseMove to parent().
 * - TouchEnd:    if m_touchTimer is still running, sends a press followed by a
 *                release to this view's own viewport.
 * All three touch events are consumed; every other event goes to QTreeView.
 *
 * The synthesized events carry Qt::MouseEventSynthesizedByApplication as their source.
 *
 * @param event Event delivered to the viewport.
 * @return true when the event was a handled touch event, otherwise the
 *         result of QTreeView::viewportEvent().
 */
bool ResultView::viewportEvent(QEvent *event)
{
    // Builds a left-button mouse event from the first touch point and delivers
    // it synchronously. Does nothing when there is no touch point or receiver,
    // instead of indexing an empty list / sending to a null object.
    auto sendSynthesizedMouse = [this](QTouchEvent *touch, QEvent::Type mouseType, QObject *receiver) {
        const auto &points = touch->touchPoints();
        if (points.isEmpty() || !receiver) {
            return;
        }
        const QPointF localPos = points.at(0).pos();
        const QPoint localPoint = localPos.toPoint();
        QMouseEvent mouseEvent(mouseType,
                               localPos,
                               this->mapTo(this->window(), localPoint),
                               this->mapToGlobal(localPoint),
                               Qt::LeftButton, Qt::LeftButton, Qt::NoModifier,
                               Qt::MouseEventSynthesizedByApplication);
        QApplication::sendEvent(receiver, &mouseEvent);
    };

    switch (event->type()) {
    case QEvent::TouchBegin: {
        qDebug() << "TouchBegin==============";
        // The event type has been checked, so a static_cast is sufficient.
        sendSynthesizedMouse(static_cast<QTouchEvent *>(event), QEvent::MouseButtonPress, parent());
        m_touchTimer->start();
        event->accept();
        return true;
    }
    case QEvent::TouchEnd: {
        qDebug() << "touchend==============" << m_touchTimer->remainingTime();
        // remainingTime() is an int, positive only while the timer is still
        // running, i.e. the touch ended before the timer's interval elapsed.
        if (m_touchTimer->remainingTime() > 0) {
            QTouchEvent *touch = static_cast<QTouchEvent *>(event);
            sendSynthesizedMouse(touch, QEvent::MouseButtonPress, this->viewport());
            sendSynthesizedMouse(touch, QEvent::MouseButtonRelease, this->viewport());
        }
        return true;
    }
    case QEvent::TouchUpdate: {
        qDebug() << "touchupdate==============";
        sendSynthesizedMouse(static_cast<QTouchEvent *>(event), QEvent::MouseMove, parent());
        return true;
    }
    default:
        break;
    }
    return QTreeView::viewportEvent(event);
}
void ResultView::initConnections()
{
connect(this, &ResultView::startSearch, [ = ](const QString &keyword) {

View File

@ -37,6 +37,7 @@ protected:
void mousePressEvent(QMouseEvent *event);
void mouseReleaseEvent(QMouseEvent *event);
void mouseMoveEvent(QMouseEvent *event);
bool viewportEvent(QEvent *event);
private:
void initConnections();
@ -47,6 +48,7 @@ private:
int m_count = 0;
QModelIndex m_tmpCurrentIndex;
QModelIndex m_tmpMousePressIndex;
QTimer *m_touchTimer;
Q_SIGNALS:
void startSearch(const QString &);

View File

@ -18,6 +18,7 @@
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include <QDBusReply>
#include "web-search-view.h"
#define MAIN_MARGINS 0,0,0,0
#define MAIN_SPACING 0
@ -97,6 +98,30 @@ void WebSearchView::LaunchBrowser()
} else { //默认值
address = "http://baidu.com/s?word=" + m_keyWord ; //百度
}
bool res(false);
QDBusInterface * appLaunchInterface = new QDBusInterface("com.kylin.AppManager",
"/com/kylin/AppManager",
"com.kylin.AppManager",
QDBusConnection::sessionBus());
if(!appLaunchInterface->isValid()) {
qWarning() << qPrintable(QDBusConnection::sessionBus().lastError().message());
res = false;
} else {
appLaunchInterface->setTimeout(10000);
QDBusReply<bool> reply = appLaunchInterface->call("LaunchDefaultAppWithUrl", address);
if(reply.isValid()) {
res = reply;
} else {
qWarning() << "SoftWareCenter dbus called failed!";
res = false;
}
}
if(appLaunchInterface) {
delete appLaunchInterface;
}
appLaunchInterface = NULL;
if (res)
return;
QDesktopServices::openUrl(address);
}

View File

@ -0,0 +1,674 @@
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
<program> Copyright (C) <year> <name of author>
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.

View File

@ -0,0 +1,33 @@
#ifndef CHINESESEGMENTATIONPRIVATE_H
#define CHINESESEGMENTATIONPRIVATE_H
#include "chinese-segmentation.h"
#include "cppjieba/Jieba.hpp"
#include "cppjieba/KeywordExtractor.hpp"
/**
 * @brief Private implementation object behind ChineseSegmentation.
 *
 * Holds the cppjieba::Jieba engine through the raw pointer m_jieba, which the
 * destructor releases. Because the class manages that pointer by hand, copying
 * is disabled: a copied object would hold the same pointer and delete it twice.
 */
class ChineseSegmentationPrivate
{
public:
    explicit ChineseSegmentationPrivate(ChineseSegmentation *parent = nullptr);
    ~ChineseSegmentationPrivate();

    // Non-copyable: m_jieba is owned by exactly one instance.
    ChineseSegmentationPrivate(const ChineseSegmentationPrivate &) = delete;
    ChineseSegmentationPrivate &operator=(const ChineseSegmentationPrivate &) = delete;

    /// Keyword extraction over @p sentence.
    vector<KeyWord> callSegment(const string& sentence);

    /// Segmentation of @p sentence returned as plain strings.
    vector<string> callMixSegmentCutStr(const string& sentence);

    /// Same segmentation as callMixSegmentCutStr, returned as Word records.
    vector<Word> callMixSegmentCutWord(const string& sentence);

    /// Looks up the tag of a single @p word.
    string lookUpTagOfWord(const string& word);

    /// Segments @p sentence and returns (word, tag) pairs.
    vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);

    /// "Full" segmentation of @p sentence.
    vector<Word> callFullSegment(const string& sentence);

    /// "Query" (search-oriented) segmentation of @p sentence.
    vector<Word> callQuerySegment(const string& sentence);

    /// HMM-based segmentation of @p sentence.
    vector<Word> callHMMSegment(const string& sentence);

    /// MP-based segmentation of @p sentence.
    vector<Word> callMPSegment(const string& sentence);

private:
    cppjieba::Jieba *m_jieba = nullptr;   // segmentation engine, released in the destructor
    ChineseSegmentation *q = nullptr;     // back-pointer to the public object passed to the constructor
};
#endif // CHINESESEGMENTATIONPRIVATE_H

View File

@ -19,72 +19,144 @@
*
*/
#include "chinese-segmentation.h"
#include <QFileInfo>
#include <QDebug>
static ChineseSegmentation *global_instance_chinese_segmentation = nullptr;
QMutex ChineseSegmentation::m_mutex;
#include "chinese-segmentation-private.h"
ChineseSegmentation::ChineseSegmentation() {
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
ChineseSegmentationPrivate::ChineseSegmentationPrivate(ChineseSegmentation *parent) : q(parent)
{
//const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
const char * const HMM_PATH = "/usr/share/ukui-search/res/dict/hmm_model.utf8";
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
//const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
//const char * const IDF_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
const char * const STOP_WORD_PATH = "/usr/share/ukui-search/res/dict/stop_words.utf8";
m_jieba = new cppjieba::Jieba(DICT_PATH,
HMM_PATH,
USER_DICT_PATH,
IDF_PATH,
IDF_DICT_PATH,
STOP_WORD_PATH,
"");
}
ChineseSegmentation::~ChineseSegmentation() {
ChineseSegmentationPrivate::~ChineseSegmentationPrivate() {
if(m_jieba)
delete m_jieba;
m_jieba = nullptr;
}
ChineseSegmentation *ChineseSegmentation::getInstance() {
QMutexLocker locker(&m_mutex);
if(!global_instance_chinese_segmentation) {
global_instance_chinese_segmentation = new ChineseSegmentation;
}
return global_instance_chinese_segmentation;
}
QVector<SKeyWord> ChineseSegmentation::callSegement(std::string s) {
// std::string s;
// s = str.toStdString();
// str.squeeze();
vector<KeyWord> ChineseSegmentationPrivate::callSegment(const string &sentence) {
const size_t topk = -1;
std::vector<cppjieba::KeyWord> keywordres;
ChineseSegmentation::m_jieba->extractor.Extract(s, keywordres, topk);
std::string().swap(s);
QVector<SKeyWord> vecNeeds;
convert(keywordres, vecNeeds);
vector<KeyWord> keywordres;
ChineseSegmentationPrivate::m_jieba->extractor.Extract(sentence, keywordres, topk);
keywordres.clear();
// keywordres.shrink_to_fit();
return vecNeeds;
return keywordres;
}
std::vector<cppjieba::KeyWord> ChineseSegmentation::callSegementStd(const std::string &str) {
const size_t topk = -1;
std::vector<cppjieba::KeyWord> keywordres;
ChineseSegmentation::m_jieba->extractor.Extract(str, keywordres, topk);
// Segments `sentence` with Jieba::Cut and returns the pieces as plain strings.
vector<string> ChineseSegmentationPrivate::callMixSegmentCutStr(const string &sentence)
{
    vector<string> segments;
    m_jieba->Cut(sentence, segments);
    return segments;
}
void ChineseSegmentation::convert(std::vector<cppjieba::KeyWord> &keywordres, QVector<SKeyWord> &kw) {
for(auto i : keywordres) {
SKeyWord temp;
temp.word = i.word;
temp.offsets = QVector<size_t>::fromStdVector(i.offsets);
temp.weight = i.weight;
kw.append(temp);
// Segments `sentence` with Jieba::Cut and returns the pieces as Word records.
vector<Word> ChineseSegmentationPrivate::callMixSegmentCutWord(const string &sentence)
{
    vector<Word> segments;
    m_jieba->Cut(sentence, segments);
    return segments;
}
// Returns whatever tag Jieba::LookupTag reports for the single word `word`.
string ChineseSegmentationPrivate::lookUpTagOfWord(const string &word)
{
    return m_jieba->LookupTag(word);
}
// Runs Jieba::Tag over `sentence` and returns the (word, tag) pairs it fills in.
vector<pair<string, string>> ChineseSegmentationPrivate::getTagOfWordsInSentence(const string &sentence)
{
    vector<pair<string, string>> taggedWords;
    m_jieba->Tag(sentence, taggedWords);
    return taggedWords;
}
// Segments `sentence` with Jieba::CutAll and returns the resulting Word records.
vector<Word> ChineseSegmentationPrivate::callFullSegment(const string &sentence)
{
    vector<Word> segments;
    m_jieba->CutAll(sentence, segments);
    return segments;
}
// Segments `sentence` with Jieba::CutForSearch and returns the resulting Word records.
vector<Word> ChineseSegmentationPrivate::callQuerySegment(const string &sentence)
{
    vector<Word> segments;
    m_jieba->CutForSearch(sentence, segments);
    return segments;
}
// Segments `sentence` with Jieba::CutHMM and returns the resulting Word records.
vector<Word> ChineseSegmentationPrivate::callHMMSegment(const string &sentence)
{
    vector<Word> segments;
    m_jieba->CutHMM(sentence, segments);
    return segments;
}
// Segments `sentence` with Jieba::CutSmall, passing 512 as the maximum word
// length, and returns the resulting Word records.
vector<Word> ChineseSegmentationPrivate::callMPSegment(const string &sentence)
{
    vector<Word> segments;
    size_t maxWordLen{512};
    m_jieba->CutSmall(sentence, segments, maxWordLen);
    return segments;
}
// Returns the process-wide ChineseSegmentation object. It is allocated the
// first time this function runs and is not deleted here.
ChineseSegmentation *ChineseSegmentation::getInstance()
{
    static ChineseSegmentation *const instance = new ChineseSegmentation;
    return instance;
}
// Public entry point for keyword extraction: forwards `sentence` unchanged to
// the private implementation object `d` and returns its result.
vector<KeyWord> ChineseSegmentation::callSegment(const string &sentence)
{
return d->callSegment(sentence);
}
// Public entry point for string-valued segmentation: forwards `sentence`
// unchanged to the private implementation object `d` and returns its result.
vector<string> ChineseSegmentation::callMixSegmentCutStr(const string &sentence)
{
return d->callMixSegmentCutStr(sentence);
}
// Public entry point for Word-valued segmentation: forwards `str` unchanged to
// the private implementation object `d` and returns its result.
vector<Word> ChineseSegmentation::callMixSegmentCutWord(const string &str)
{
return d->callMixSegmentCutWord(str);
}
// Public entry point for a single-word tag lookup: forwards `word` unchanged
// to the private implementation object `d` and returns its result.
string ChineseSegmentation::lookUpTagOfWord(const string &word)
{
return d->lookUpTagOfWord(word);
}
// Public entry point for tagging every word of a sentence: forwards `sentence`
// unchanged to the private implementation object `d` and returns the
// (word, tag) pairs it produces.
vector<pair<string, string> > ChineseSegmentation::getTagOfWordsInSentence(const string &sentence)
{
return d->getTagOfWordsInSentence(sentence);
}
// Public entry point for "Full" segmentation: forwards `sentence` unchanged to
// the private implementation object `d` and returns its result.
vector<Word> ChineseSegmentation::callFullSegment(const string &sentence)
{
return d->callFullSegment(sentence);
}
// Public entry point for "Query" segmentation: forwards `sentence` unchanged
// to the private implementation object `d` and returns its result.
vector<Word> ChineseSegmentation::callQuerySegment(const string &sentence)
{
return d->callQuerySegment(sentence);
}
// Public entry point for HMM segmentation: forwards `sentence` unchanged to
// the private implementation object `d` and returns its result.
vector<Word> ChineseSegmentation::callHMMSegment(const string &sentence)
{
return d->callHMMSegment(sentence);
}
// Public entry point for MP segmentation: forwards `sentence` unchanged to the
// private implementation object `d` and returns its result.
vector<Word> ChineseSegmentation::callMPSegment(const string &sentence)
{
return d->callMPSegment(sentence);
}
// Constructs the public object and heap-allocates its private implementation,
// stored in `d`. The ChineseSegmentationPrivate is built with its default
// argument, i.e. without a parent pointer.
// NOTE(review): nothing visible here deletes `d` — confirm that is intended
// for an object that is only reachable through getInstance().
ChineseSegmentation::ChineseSegmentation() : d(new ChineseSegmentationPrivate)
{
}

View File

@ -22,42 +22,95 @@
#define CHINESESEGMENTATION_H
#include "libchinese-segmentation_global.h"
#include "cppjieba/Jieba.hpp"
//#include "Logging.hpp"
//#include "LocalVector.hpp"
//#include "cppjieba/QuerySegment.hpp"
#include "cppjieba/KeywordExtractor.hpp"
#include <QVector>
#include <QString>
#include <QDebug>
#include <QMutex>
struct SKeyWord {
std::string word;
QVector<size_t> offsets;
double weight;
~SKeyWord() {
word = std::move("");
offsets.clear();
offsets.shrink_to_fit();
}
};
#include "common-struct.h"
class ChineseSegmentationPrivate;
class CHINESESEGMENTATION_EXPORT ChineseSegmentation {
public:
static ChineseSegmentation *getInstance();
QVector<SKeyWord> callSegement(std::string s);
std::vector<cppjieba::KeyWord> callSegementStd(const std::string& str);
/**
* @brief ChineseSegmentation::callSegment
* extractor进行关键词提取使Mix方式初步分词使Idf词典进行关键词提取
*
* @param sentence
* @return vector<KeyWord>
*/
vector<KeyWord> callSegment(const string &sentence);
/**
* @brief ChineseSegmentation::callMixSegmentCutStr
* 使Mix方法进行分词使MP初步分词HMM进一步分词
*
* @param sentence
* @return vector<string>
*/
vector<string> callMixSegmentCutStr(const string& sentence);
/**
* @brief ChineseSegmentation::callMixSegmentCutWord
* callMixSegmentCutStr功能相同
* @param sentence
* @return vector<Word>
*/
vector<Word> callMixSegmentCutWord(const string& str);
/**
* @brief ChineseSegmentation::lookUpTagOfWord
* word的词性
* @param word
* @return string word的词性
*/
string lookUpTagOfWord(const string& word);
/**
* @brief ChineseSegmentation::getTagOfWordsInSentence
* 使Mix分词后获取每个词的词性
* @param sentence
* @return vector<pair<string, string>> (firsr)(second)
*/
vector<pair<string, string>> getTagOfWordsInSentence(const string &sentence);
/**
* @brief ChineseSegmentation::callFullSegment
* 使Full进行分词Full会切出字典里所有的词
* @param sentence
* @return vector<Word>
*/
vector<Word> callFullSegment(const string& sentence);
/**
* @brief ChineseSegmentation::callQuerySegment
* 使Query进行分词使MixFull
* @param sentence
* @return vector<Word>
*/
vector<Word> callQuerySegment(const string& sentence);
/**
* @brief ChineseSegmentation::callHMMSegment
* 使HMM进行分词
* @param sentence
* @return vector<Word>
*/
vector<Word> callHMMSegment(const string& sentence);
/**
* @brief ChineseSegmentation::callMPSegment
* 使MP进行分词
* @param sentence
* @return vector<Word>
*/
vector<Word> callMPSegment(const string& sentence);
private:
explicit ChineseSegmentation();
~ChineseSegmentation();
void convert(std::vector<cppjieba::KeyWord>& keywordres, QVector<SKeyWord>& kw);
~ChineseSegmentation() = default;
ChineseSegmentation(const ChineseSegmentation&) = delete;
ChineseSegmentation& operator =(const ChineseSegmentation&) = delete;
private:
static QMutex m_mutex;
cppjieba::Jieba *m_jieba;
ChineseSegmentationPrivate *d = nullptr;
};
#endif // CHINESESEGMENTATION_H

View File

@ -0,0 +1,52 @@
#ifndef COMMONSTRUCT_H
#define COMMONSTRUCT_H
#include <string>
#include <vector>
using namespace std;
/**
 * @brief The KeyWord struct
 *
 * Plain value type (Rule of Zero): no special member functions are declared,
 * so the compiler generates copy AND move operations. A hand-written
 * destructor would add nothing (the members release their own storage) and
 * would suppress the implicit move operations, turning every move of a
 * KeyWord into a deep copy.
 *
 * @property word the content of keyword
 * @property offsets the Unicode offsets, can be used to check the word pos in a sentence
 * @property weight the weight of the keyword
 *
 * @note `KeyWord k;` leaves weight indeterminate; use `KeyWord k{};` (or a
 *       container's value-initialization) to start from 0.
 */
struct KeyWord {
    std::string word;
    std::vector<size_t> offsets;
    double weight;
};
/**
 * @brief The Word struct
 *
 * No destructor is declared: std::string releases its own storage, and leaving
 * the destructor implicit keeps the compiler-generated move operations, so
 * vectors of Word can relocate elements without deep copies.
 *
 * @property word the content of word
 * @property offset the offset of the word(absolute pos, Chinese 3 , English 1) can be used to check the word pos in a sentence
 * @property unicode_offset the Unicode offset of the word
 * @property unicode_length the Unicode length of the word
 */
struct Word {
    std::string word;
    uint32_t offset;
    uint32_t unicode_offset = 0;
    uint32_t unicode_length = 0;

    /// Builds a Word from its text and byte offset; the Unicode fields stay 0
    /// instead of being left indeterminate.
    Word(const std::string& w, uint32_t o)
        : word(w), offset(o) {
    }

    /// Builds a fully specified Word.
    Word(const std::string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
        : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
    }
}; // struct Word
#endif // COMMONSTRUCT_H

View File

@ -13,7 +13,12 @@
#include "limonp/Md5.hpp"
#include "Unicode.hpp"
#include "darts.h"
//#define USE_DARTS_CLONE
#ifdef USE_DARTS_CLONE
#include "../storage-base/darts-clone/darts.h"
#else
#include "../storage-base/cedar/cedar.h"
#endif
namespace cppjieba {
@ -60,20 +65,6 @@ inline std::ostream & operator << (std::ostream& os, const DatElement & elem) {
return os << "word=" << elem.word << "/tag=" << elem.tag << "/weight=" << elem.weight;
}
struct DatMemElem {
double weight = 0.0;
char tag[8] = {};
void SetTag(const string & str) {
memset(&tag[0], 0, sizeof(tag));
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
}
string GetTag() const {
return &tag[0];
}
};
struct PinYinMemElem {
char tag[6] = {};
@ -90,14 +81,11 @@ struct PinYinMemElem {
inline std::ostream & operator << (std::ostream& os, const DatMemElem & elem) {
return os << "/tag=" << elem.GetTag() << "/weight=" << elem.weight;
}
struct DatDag {
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
double max_weight;
int max_next;
};
#ifdef USE_DARTS_CLONE
typedef Darts::DoubleArray JiebaDAT;
#else
typedef cedar::da<int, -1, -2, false> JiebaDAT;
#endif
struct CacheFileHeader {
@ -124,6 +112,7 @@ public:
}
const DatMemElem * Find(const string & key) const {
#ifdef USE_DARTS_CLONE
JiebaDAT::result_pair_type find_result;
dat_.exactMatchSearch(key.c_str(), find_result);
@ -132,9 +121,16 @@ public:
}
return &elements_ptr_[ find_result.value ];
#else
int result = dat_.exactMatchSearch<int>(key.c_str());
if (result < 0)
return nullptr;
return &elements_ptr_[result];
#endif
}
const double Find(const string & key, std::size_t length, std::size_t node_pos) const {
#ifdef USE_DARTS_CLONE
JiebaDAT::result_pair_type find_result;
dat_.exactMatchSearch(key.c_str(), find_result, length, node_pos);
@ -143,9 +139,16 @@ public:
}
return idf_elements_ptr_[ find_result.value ];
#else
int result = dat_.exactMatchSearch<int>(key.c_str(), length, node_pos);
if (result < 0)
return -1;
return idf_elements_ptr_[result];
#endif
}
const PinYinMemElem * PinYinFind(const string & key) const {
#ifdef USE_DARTS_CLONE
JiebaDAT::result_pair_type find_result;
dat_.exactMatchSearch(key.c_str(), find_result);
@ -154,6 +157,12 @@ public:
}
return &pinyin_elements_ptr_[ find_result.value ];
#else
int result = dat_.exactMatchSearch<int>(key.c_str());
if (result < 0)
return nullptr;
return &pinyin_elements_ptr_[result];
#endif
}
void Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
@ -259,7 +268,7 @@ public:
max_weight[i] = -3.14e+100;
}
int max_next[str_size];//存放动态规划后的分词结果
memset(max_next,-1,str_size);
//memset(max_next,-1,str_size);
double val(0);
for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
@ -367,7 +376,7 @@ public:
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(DatMemElem) + header.dat_size * dat_.unit_size());
elements_ptr_ = (const DatMemElem *)(mmap_addr_ + sizeof(header));
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(DatMemElem) * elements_num_;
dat_.set_array(dat_ptr, header.dat_size);
return true;
}
@ -398,7 +407,7 @@ public:
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(double) + header.dat_size * dat_.unit_size());
idf_elements_ptr_ = (const double *)(mmap_addr_ + sizeof(header));
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(double) * elements_num_;
dat_.set_array(dat_ptr, header.dat_size);
return true;
}
@ -429,7 +438,7 @@ public:
assert(mmap_length_ == sizeof(header) + header.elements_num * sizeof(PinYinMemElem) + header.dat_size * dat_.unit_size());
pinyin_elements_ptr_ = (const PinYinMemElem *)(mmap_addr_ + sizeof(header));
const char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
char * dat_ptr = mmap_addr_ + sizeof(header) + sizeof(PinYinMemElem) * elements_num_;
dat_.set_array(dat_ptr, header.dat_size);
return true;
}
@ -469,7 +478,6 @@ private:
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
::umask(S_IWGRP | S_IWOTH);
//const int fd =::mkstemp(&tmp_filepath[0]);
//原mkstemp用法有误已修复--jxx20210519
const int fd =::mkstemp((char *)tmp_filepath.data());
qDebug() << "mkstemp :" << errno << tmp_filepath.data();
assert(fd >= 0);
@ -518,7 +526,6 @@ private:
string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
::umask(S_IWGRP | S_IWOTH);
//const int fd =::mkstemp(&tmp_filepath[0]);
//原mkstemp用法有误已修复--jxx20210519
const int fd =::mkstemp((char *)tmp_filepath.data());
qDebug() << "mkstemp error:" << errno << tmp_filepath.data();
assert(fd >= 0);

View File

@ -18,7 +18,6 @@ namespace cppjieba {
using namespace limonp;
const double MIN_DOUBLE = -3.14e+100;
const double MAX_DOUBLE = 3.14e+100;
const size_t DICT_COLUMN_NUM = 3;
const char* const UNKNOWN_TAG = "";
@ -42,14 +41,14 @@ public:
return dat_.Find(word);
}
void Find(RuneStrArray::const_iterator begin,
void FindDatDag(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<struct DatDag>&res,
size_t max_word_len = MAX_WORD_LENGTH) const {
dat_.Find(begin, end, res, max_word_len);
}
void Find(RuneStrArray::const_iterator begin,
void FindWordRange(RuneStrArray::const_iterator begin,
RuneStrArray::const_iterator end,
vector<WordRange>& words,
size_t max_word_len = MAX_WORD_LENGTH) const {
@ -134,9 +133,9 @@ private:
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
}
dat_cache_path += VERSION;
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########Dict path:" << path;
if (dat_.InitAttachDat(dat_cache_path, md5)) {

View File

@ -4,7 +4,8 @@
#include <set>
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "segment-trie/segment-trie.h"
//#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "Unicode.hpp"
@ -22,7 +23,7 @@ public:
vector<WordRange>& res, bool, size_t) const override {
assert(dictTrie_);
vector<struct DatDag> dags;
dictTrie_->Find(begin, end, dags);
dictTrie_->FindDatDag(begin, end, dags);
size_t max_word_end_pos = 0;
for (size_t i = 0; i < dags.size(); i++) {
@ -45,11 +46,19 @@ public:
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
private:
const DictTrie* dictTrie_;

View File

@ -1,12 +1,18 @@
#pragma once
#include "limonp/StringUtil.hpp"
//#define USE_CEDAR_SEGMENT //使用cedar初步测试性能损失3%-5%左右内存占用降低近1M
#ifdef USE_CEDAR_SEGMENT
#include "cedar/cedar.h"
#endif
namespace cppjieba {
using namespace limonp;
#ifdef USE_CEDAR_SEGMENT
typedef cedar::da<float, -1, -2, false> EmitProbMap;
#else
typedef unordered_map<Rune, double> EmitProbMap;
#endif
struct HMMModel {
/*
* STATUS:
@ -73,6 +79,12 @@ struct HMMModel {
}
double GetEmitProb(const EmitProbMap* ptMp, Rune key,
double defVal)const {
#ifdef USE_CEDAR_SEGMENT
char str_key[8];
snprintf(str_key, sizeof(str_key), "%d", key);
float result = ptMp->exactMatchSearch<float>(str_key);
return result < 0 ? defVal : result;
#else
EmitProbMap::const_iterator cit = ptMp->find(key);
if (cit == ptMp->end()) {
@ -80,6 +92,7 @@ struct HMMModel {
}
return cit->second;
#endif
}
bool GetLine(ifstream& ifile, string& line) {
while (getline(ifile, line)) {
@ -119,8 +132,13 @@ struct HMMModel {
XLOG(ERROR) << "TransCode failed.";
return false;
}
#ifdef USE_CEDAR_SEGMENT
char str_key[8];
snprintf(str_key, sizeof(str_key), "%d", unicode[0]);
mp.update(str_key, std::strlen(str_key), atof(tmp2[1].c_str()));
#else
mp[unicode[0]] = atof(tmp2[1].c_str());
#endif
}
return true;

View File

@ -8,6 +8,9 @@
#include "SegmentBase.hpp"
namespace cppjieba {
const double MIN_DOUBLE = -3.14e+100;
class HMMSegment: public SegmentBase {
public:
HMMSegment(const HMMModel* model)
@ -59,11 +62,19 @@ public:
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
private:
// sequential letters rule

View File

@ -51,9 +51,9 @@ private:
total_dict_size_ = file_size_sum;
if (dat_cache_path.empty()) {
//未指定词库数据文件存储位置的默认存储在tmp目录下--jxx20200519
dat_cache_path = /*dict_path*/"/tmp/" + md5 + "." + to_string(user_word_weight_opt) + ".dat_cache";
dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
}
dat_cache_path += VERSION;
QString path = QString::fromStdString(dat_cache_path);
qDebug() << "#########Idf path:" << path;
if (dat_.InitIdfAttachDat(dat_cache_path, md5)) {

View File

@ -3,6 +3,7 @@
#include <memory>
#include "QuerySegment.hpp"
#include "KeywordExtractor.hpp"
#include "segment-trie/segment-trie.h"
namespace cppjieba {
@ -61,9 +62,6 @@ public:
string LookupTag(const string &str) const {
return mix_seg_.LookupTag(str);
}
bool Find(const string& word) {
return nullptr != dict_trie_.Find(word);
}
void ResetSeparators(const string& s) {
//TODO

View File

@ -2,7 +2,8 @@
#include <cmath>
#include "MixSegment.hpp"
#include "IdfTrie.hpp"
//#include "IdfTrie.hpp"
#include "idf-trie/idf-trie.h"
namespace cppjieba {
@ -64,7 +65,7 @@ public:
if (-1 != idf) {//IDF词典查找
itr->second.weight *= idf;
} else {
itr->second.weight *= idf_trie_.idfAverage_;
itr->second.weight *= idf_trie_.GetIdfAverage();
}
itr->second.word = itr->first;

View File

@ -4,7 +4,8 @@
#include <set>
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "segment-trie/segment-trie.h"
//#include "DictTrie.hpp"
#include "SegmentTagged.hpp"
#include "PosTagger.hpp"
@ -22,20 +23,24 @@ public:
RuneStrArray::const_iterator end,
vector<WordRange>& words,
bool, size_t max_word_len) const override {
// vector<DatDag> dags;
// dictTrie_->Find(begin, end, dags, max_word_len);//依据DAG词典生成DAG--jxx
// CalcDP(dags);//动态规划Dynamic ProgrammingDP根据DAG计算最优动态规划路径--jxx
// CutByDag(begin, end, dags, words);//依据DAG最优路径分词--jxx
dictTrie_->Find(begin, end, words, max_word_len);
dictTrie_->FindWordRange(begin, end, words, max_word_len);
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
const DictTrie* GetDictTrie() const override {
return dictTrie_;
@ -77,6 +82,7 @@ private:
}
*/
/* 倒叙方式重写CalcDP函数初步测试未发现问题*/
/*
void CalcDP(vector<DatDag>& dags) const {
double val(0);
size_t size = dags.size();
@ -87,8 +93,6 @@ private:
for (const auto & it : dags[size - 1 - i].nexts) {
const auto nextPos = it.first;
val = dictTrie_->GetMinWeight();
if (nullptr != it.second) {
val = it.second->weight;
}
@ -119,7 +123,7 @@ private:
i = next;
}
}
*///相关功能已集成到Find函数中
const DictTrie* dictTrie_;
PosTagger tagger_;

View File

@ -5,6 +5,10 @@
#include "HMMSegment.hpp"
#include "limonp/StringUtil.hpp"
#include "PosTagger.hpp"
#define STOP_WORDS_USE_CEDAR_SEGMENT //使用cedar初步测试性能提升3%-5%左右,内存占用降低近不明显
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
#include "cedar/cedar.h"
#endif
namespace cppjieba {
class MixSegment: public SegmentTagged {
@ -73,7 +77,7 @@ public:
// mpSeg_.CutRuneArray(begin, end, res);
// return;
// }
std::ignore = hmm;
vector<WordRange> words;
assert(end >= begin);
words.reserve(end - begin);
@ -122,6 +126,7 @@ public:
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
std::ignore = hmm;
vector<WordRange> words;
vector<WordRange> hmmRes;
assert(end >= begin);
@ -139,9 +144,15 @@ public:
string str = GetStringFromRunes(s, words[i].left, words[i].right);
if (words[i].left != words[i].right) {
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
continue;
}
#else
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
#endif
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
@ -149,9 +160,15 @@ public:
if (mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune)
|| i == (words.size() - 1)) {//i++后如果是最后一个字符则直接push_back
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
if (0 < stopWords_.exactMatchSearch<int>(str.c_str(), str.size())) {
continue;
}
#else
if (stopWords_.find(str) != stopWords_.end()) {
continue;
}
#endif
res[str].offsets.push_back(words[i].left->offset);
res[str].weight += 1.0;
continue;
@ -181,9 +198,16 @@ public:
//put hmm result to result
for (size_t k = 0; k < hmmRes.size(); k++) {
string hmmStr = GetStringFromRunes(s, hmmRes[k].left, hmmRes[k].right);
if (IsSingleWord(hmmStr) || stopWords_.find(hmmStr) != stopWords_.end()) {
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
if (0 < stopWords_.exactMatchSearch<int>(hmmStr.c_str(), hmmStr.size())) {
continue;
}
#else
if (/*IsSingleWord(hmmStr) || */stopWords_.find(hmmStr) != stopWords_.end()) {
continue;
}
#endif
res[hmmStr].offsets.push_back(hmmRes[k].left->offset);
res[hmmStr].weight += 1.0;
}
@ -227,14 +251,21 @@ public:
string line ;
while (getline(ifs, line)) {
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
stopWords_.update(line.c_str(), line.size(), 1);
#else
stopWords_.insert(line);
#endif
}
assert(stopWords_.size());
}
private:
#ifdef STOP_WORDS_USE_CEDAR_SEGMENT
cedar::da<int, -1, -2, false> stopWords_;
#else
unordered_set<string> stopWords_;
#endif
MPSegment mpSeg_;
HMMSegment hmmSeg_;
PosTagger tagger_;

View File

@ -63,7 +63,7 @@ public:
return false;
}
bool isMultiTone(string &word) {
bool isMultiTone(const string &word) {
if (qmap_chinese2pinyin.contains(QString::fromStdString(word)))
return true;
// if (map_chinese2pinyin.contains(word))

View File

@ -1,8 +1,9 @@
#pragma once
#include "limonp/StringUtil.hpp"
#include "DictTrie.hpp"
#include "SegmentTagged.hpp"
#include "segment-trie/segment-trie.h"
//#include "DictTrie.hpp"
//#include "SegmentTagged.hpp"
namespace cppjieba {
using namespace limonp;
@ -31,10 +32,10 @@ public:
string LookupTag(const string &str, const SegmentTagged& segment) const {
const DictTrie * dict = segment.GetDictTrie();
assert(dict != NULL);
assert(dict != nullptr);
const auto tmp = dict->Find(str);
if (tmp == NULL || tmp->GetTag().empty()) {
if (tmp == nullptr || tmp->GetTag().empty()) {
RuneStrArray runes;
if (!DecodeRunesInString(str, runes)) {

View File

@ -69,6 +69,7 @@ public:
}
cursor_ ++;
}
return false;
}
int max_num = 0;

View File

@ -4,12 +4,10 @@
#include <set>
#include <cassert>
#include "limonp/Logging.hpp"
#include "DictTrie.hpp"
#include "SegmentBase.hpp"
#include "FullSegment.hpp"
#include "MixSegment.hpp"
#include "Unicode.hpp"
#include "DictTrie.hpp"
namespace cppjieba {
class QuerySegment: public SegmentBase {
@ -35,7 +33,7 @@ public:
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 2);
if (trie_->Find(text) != NULL) {
if (trie_->Find(text) != nullptr) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
res.push_back(wr);
}
@ -46,7 +44,7 @@ public:
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
string text = EncodeRunesToString(mixResItr->left + i, mixResItr->left + i + 3);
if (trie_->Find(text) != NULL) {
if (trie_->Find(text) != nullptr) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 2);
res.push_back(wr);
}
@ -59,11 +57,19 @@ public:
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<string>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
virtual void CutWithSentence(const string& s, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, unordered_map<string, KeyWord>& res, bool hmm,
size_t) const override {
std::ignore = s;
std::ignore = begin;
std::ignore = end;
std::ignore = res;
std::ignore = hmm;
}
private:
bool IsAllAscii(const RuneArray& s) const {

View File

@ -7,6 +7,7 @@
#include <ostream>
#include "limonp/LocalVector.hpp"
#include "limonp/StringUtil.hpp"
#include "common-struct.h"
namespace cppjieba {
@ -15,29 +16,30 @@ using std::vector;
typedef uint32_t Rune;
struct KeyWord {
string word;
vector<size_t> offsets;
double weight;
}; // struct Word
struct Word {
string word;
uint32_t offset;
uint32_t unicode_offset;
uint32_t unicode_length;
Word(const string& w, uint32_t o)
: word(w), offset(o) {
}
Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
: word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
}
}; // struct Word
inline std::ostream& operator << (std::ostream& os, const Word& w) {
return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}";
}
struct DatMemElem {
double weight = 0.0;
char tag[8] = {};
void SetTag(const string & str) {
memset(&tag[0], 0, sizeof(tag));
strncpy(&tag[0], str.c_str(), std::min(str.size(), sizeof(tag) - 1));
}
string GetTag() const {
return &tag[0];
}
};
struct DatDag {
limonp::LocalVector<pair<size_t, const DatMemElem *> > nexts;
//double max_weight;
//size_t max_next;
};
struct RuneInfo {
Rune rune;
uint32_t offset;
@ -95,7 +97,6 @@ inline RuneArray DecodeRunesInString(const string& s) {
return result;
}
//重写DecodeRunesInString函数将实现放入函数中降低内存占用加快处理流程--jxx20210518
inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) {
uint32_t tmp;

View File

@ -17,6 +17,27 @@ HEADERS += \
$$PWD/SegmentBase.hpp \
$$PWD/SegmentTagged.hpp \
$$PWD/TextRankExtractor.hpp \
$$PWD/Trie.hpp \
$$PWD/Unicode.hpp
# $$PWD/Trie.hpp \
$$PWD/Unicode.hpp \
$$PWD/DatTrie.hpp \
$$PWD/idf-trie/idf-trie.h \
$$PWD/segment-trie/segment-trie.h
DISTFILES += \
dict/README.md \
dict/hmm_model.utf8 \
dict/idf.utf8 \
dict/jieba.dict.utf8 \
dict/pos_dict/char_state_tab.utf8 \
dict/pos_dict/prob_emit.utf8 \
dict/pos_dict/prob_start.utf8 \
dict/pos_dict/prob_trans.utf8 \
dict/stop_words.utf8 \
dict/user.dict.utf8
#dict/pinyinWithoutTone.txt \
include(limonp/limonp.pri)
SOURCES += \
$$PWD/idf-trie/idf-trie.cpp \
$$PWD/segment-trie/segment-trie.cpp

View File

@ -0,0 +1,97 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "idf-trie.h"
/// Builds the IDF trie from a list of source files, handing both the list and the
/// cache path to the StorageBase constructor before running Init().
IdfTrie::IdfTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(file_paths, dat_cache_path)
{
    Init();
}
/// Convenience constructor for a single source file: wraps it in a one-element
/// list for the StorageBase constructor, then runs Init().
IdfTrie::IdfTrie(string file_path, string dat_cache_path)
    : StorageBase<double, false, IdfCacheFileHeader>(vector<string>{file_path}, dat_cache_path)
{
    Init();
}
/**
 * @brief Builds the IDF cache file from the text dictionary at IDF_DICT_PATH.
 *
 * The cache is written to a temporary file next to @p dat_cache_file and renamed
 * into place at the end. File layout, in write order: IdfCacheFileHeader, one
 * double per dictionary entry, then the trie array. The header fields after the
 * md5 are patched in once all counts are known.
 *
 * @param dat_cache_file final path of the cache file
 * @param md5            digest stored in the header; must be exactly
 *                       sizeof(header.md5_hex) bytes long
 */
void IdfTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
{
    IdfCacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    double idf_sum(0), idf_average(0), tmp(0);

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    umask(S_IWGRP | S_IWOTH);
    // mkstemp() rewrites the XXXXXX suffix in place, so it needs a mutable buffer;
    // &str[0] is writable, whereas writing through a cast of the const data() is not.
    const int fd = mkstemp(&tmp_filepath[0]);
    assert(fd >= 0);
    fchmod(fd, 0644);

    write_bytes = write(fd, (const char *)&header, sizeof(IdfCacheFileHeader));

    ifstream ifs(IDF_DICT_PATH);
    string line;
    vector<string> buf;
    for (; getline(ifs, line);) {
        if (limonp::StartsWith(line, "#") or line.empty()) {
            continue;
        }
        limonp::Split(line, buf, " ");
        if (buf.size() != 2)
            continue;

        // The trie maps the word to its index in the array of doubles written below.
        this->Update(buf[0].c_str(), buf[0].size(), elements_num);
        offset += sizeof(double);
        elements_num++;

        tmp = atof(buf[1].c_str());
        write_bytes += write(fd, &tmp, sizeof(double));
        idf_sum += tmp;
    }

    // An empty or missing dictionary would otherwise store 0/0 = NaN in the header.
    idf_average = (elements_num > 0) ? idf_sum / elements_num : 0.0;

    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());

    // Go back and fill in the header fields that follow the md5.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    write(fd, &idf_average, sizeof(double));
    close(fd);

    assert((size_t)write_bytes == sizeof(IdfCacheFileHeader) + offset + this->GetDataTrieTotalSize());

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    static_cast<void>(rename_ret); // only read by the assert; avoids an unused warning under NDEBUG
}
/// Looks up the stored value for @p key.
/// @return the stored double, or -1 when the exact-match search reports no entry.
double IdfTrie::Find(const string &key) const
{
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    if (index >= 0) {
        return this->GetElementPtr()[index];
    }
    return -1;
}
/// Returns the idf_average field read from the cache-file header.
double IdfTrie::GetIdfAverage() const
{
    const double average = this->GetCacheFileHeaderPtr()->idf_average;
    return average;
}

View File

@ -0,0 +1,45 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef IdfTrie_H
#define IdfTrie_H
#include "storage-base.hpp"
const char * const IDF_DICT_PATH = "/usr/share/ukui-search/res/dict/idf.utf8";
// Header of the IDF cache file: the CacheFileHeaderBase fields followed by the
// average IDF value. IdfTrie::LoadSourceFile writes this struct to disk with a raw
// write(), so the member order and types are part of the file format.
struct IdfCacheFileHeader : CacheFileHeaderBase
{
double idf_average = 0; // mean of all IDF values loaded; read back by IdfTrie::GetIdfAverage()
};
// Trie that maps a word to its IDF value (a double), stored through
// StorageBase<double, false, IdfCacheFileHeader>.
class IdfTrie : public StorageBase<double, false, IdfCacheFileHeader>
{
public:
// Construct from several source files and a cache path; both are forwarded to StorageBase.
IdfTrie(const vector<string> file_paths, string dat_cache_path);
// Construct from a single source file and a cache path.
IdfTrie(string file_path, string dat_cache_path);
// Builds the cache file at dat_cache_file, stamping md5 into its header.
// NOTE(review): the implementation reads IDF_DICT_PATH rather than the paths given to the constructor.
void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
// Returns the IDF value stored for key, or -1 when key is not present.
double Find(const string &key) const;
// Returns the idf_average field of the cache-file header.
double GetIdfAverage() const;
private:
};
#endif // IdfTrie_H

View File

@ -0,0 +1,276 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include <cmath>
#include "segment-trie.h"
/// Builds the segmentation dictionary trie from a list of source files; the list
/// and cache path go to the StorageBase constructor, after which Init() runs.
DictTrie::DictTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(file_paths, dat_cache_path)
{
    Init();
}
/// Convenience constructor: the main and user dictionary paths are packed into a
/// two-element list for the StorageBase constructor, after which Init() runs.
DictTrie::DictTrie(const string &dict_path, const string &user_dict_paths, const string &dat_cache_path)
    : StorageBase<DatMemElem, false, DictCacheFileHeader>(vector<string>{dict_path, user_dict_paths}, dat_cache_path)
{
    Init();
}
/**
 * @brief Builds the dictionary cache file from the default and user dictionaries.
 *
 * The cache is written to a temporary file next to @p dat_cache_file and renamed
 * into place at the end. File layout, in write order: DictCacheFileHeader, one
 * DatMemElem per dictionary entry, then the trie array. The header fields after
 * the md5 are patched in once all counts are known.
 *
 * @param dat_cache_file final path of the cache file
 * @param md5            digest stored in the header; must be exactly
 *                       sizeof(header.md5_hex) bytes long
 */
void DictTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
{
    DictCacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    umask(S_IWGRP | S_IWOTH);
    // mkstemp() rewrites the XXXXXX suffix in place, so it needs a mutable buffer;
    // &str[0] is writable, whereas writing through a cast of the const data() is not.
    const int fd = mkstemp(&tmp_filepath[0]);
    assert(fd >= 0);
    fchmod(fd, 0644);

    write_bytes = write(fd, (const char *)&header, sizeof(DictCacheFileHeader));

    // PreLoad() must run first: it sums the frequencies that the two loaders divide by.
    this->PreLoad();
    this->LoadDefaultDict(fd, write_bytes, offset, elements_num);
    this->LoadUserDict(fd, write_bytes, offset, elements_num);

    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());

    // Go back and fill in the header fields that follow the md5.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    write(fd, &m_min_weight, sizeof(double));
    close(fd);

    assert((size_t)write_bytes == sizeof(DictCacheFileHeader) + offset + this->GetDataTrieTotalSize());

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    static_cast<void>(rename_ret); // only read by the assert; avoids an unused warning under NDEBUG
}
/// Looks up the dictionary entry for @p key.
/// @return pointer to the stored element, or nullptr when the exact-match search
///         reports no entry.
const DatMemElem * DictTrie::Find(const string &key) const
{
    const int index = this->ExactMatchSearch(key.c_str(), key.size());
    if (index >= 0) {
        return &this->GetElementPtr()[index];
    }
    return nullptr;
}
/**
 * @brief Builds the word DAG for the rune range [begin, end).
 *
 * For every rune index i, res[i].nexts lists the possible next indices together
 * with the dictionary element of the word spanning [i, next). nexts[0] is always
 * the single-rune step (i + 1); its element stays nullptr when that rune is not
 * itself a dictionary word.
 *
 * @param begin,end     rune range to analyse
 * @param res           output; cleared and resized to the number of runes
 * @param max_word_len  matches longer than this many characters are ignored
 */
void DictTrie::FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<DatDag> &res, size_t max_word_len) const {
    const size_t rune_count = size_t(end - begin);
    res.clear();
    res.resize(rune_count);

    string text_str;
    EncodeRunesToString(begin, end, text_str);

    static const size_t max_num = 128;
    result_pair_type result_pairs[max_num] = {};

    for (size_t i = 0, begin_pos = 0; i < rune_count; i++) {
        std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
        // Only max_num slots exist; never read past them even if more matches are reported.
        if (num_results > max_num) {
            num_results = max_num;
        }

        res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + 1, nullptr));

        for (std::size_t idx = 0; idx < num_results; ++idx) {
            const auto &match = result_pairs[idx];
            if ((match.value < 0) || (static_cast<size_t>(match.value) >= this->GetCacheFileHeaderPtr()->elements_size)) {
                continue;
            }

            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
            if (char_num > max_word_len) {
                continue;
            }

            const DatMemElem * pValue = &this->GetElementPtr()[match.value];
            if (1 == char_num) {
                // Single-rune word: attach it to the default i+1 edge instead of adding a new one.
                res[i].nexts[0].second = pValue;
                continue;
            }
            res[i].nexts.push_back(pair<size_t, const DatMemElem *>(i + char_num, pValue));
        }

        // Advance the byte cursor by the UTF-8 width of the rune just processed.
        begin_pos += limonp::UnicodeToUtf8Bytes((begin + i)->rune);
    }
}
/**
 * @brief Segments the rune range [begin, end) into the maximum-weight word sequence.
 *
 * The range is scanned right to left. For each start index the best (highest summed
 * weight) next index is recorded, then the chosen path is walked left to right and
 * one WordRange per word is appended to @p words.
 *
 * @param begin,end     rune range to segment
 * @param words         output; ranges are appended (existing content is kept)
 * @param max_word_len  dictionary matches longer than this many characters are ignored
 */
void DictTrie::FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange> &words, size_t max_word_len) const {
    string text_str;
    EncodeRunesToString(begin, end, text_str);

    static const size_t max_num = 128;
    result_pair_type result_pairs[max_num] = {}; // dictionary lookup results

    const size_t str_size = end - begin;
    // Heap-backed tables instead of variable-length arrays: standard C++, no stack
    // blow-up on long input, and well-defined for an empty range.
    vector<double> max_weight(str_size, -3.14e+100); // best summed weight from index i to the end
    vector<size_t> max_next(str_size, 0);            // chosen next index; 0 means "not decided yet"

    for (size_t i = 0, begin_pos = text_str.size(); i < str_size; i++) {
        const size_t cur = str_size - i - 1; // scanning backwards
        const size_t nextPos = cur + 1;
        begin_pos -= (end - i - 1)->len;

        std::size_t num_results = this->CommonPrefixSearch(&text_str[begin_pos], &result_pairs[0], max_num);
        // Only max_num slots exist; never read past them even if more matches are reported.
        if (num_results > max_num) {
            num_results = max_num;
        }

        // Try every dictionary word starting at cur and keep the best continuation.
        for (std::size_t idx = 0; idx < num_results; ++idx) {
            const auto &match = result_pairs[idx];
            if ((match.value < 0) || (static_cast<uint32_t>(match.value) >= this->GetCacheFileHeaderPtr()->elements_size)) {
                continue;
            }
            auto const char_num = Utf8CharNum(&text_str[begin_pos], match.length);
            if (char_num == 0 || char_num > max_word_len) {
                continue;
            }
            const size_t candidate_next = cur + char_num;
            if (candidate_next > str_size) {
                continue;
            }
            double val = this->GetElementPtr()[match.value].weight;
            if (candidate_next < str_size) {
                val += max_weight[candidate_next];
            }
            if (val > max_weight[cur]) {
                max_weight[cur] = val;
                max_next[cur] = candidate_next;
            }
        }

        // No usable dictionary word (none found, all filtered out, or all with a
        // weight of -inf): emit the rune on its own with the minimum weight so the
        // path is always complete.
        if (max_next[cur] == 0) {
            double val = GetMinWeight();
            if (nextPos < str_size) {
                val += max_weight[nextPos];
            }
            max_weight[cur] = val;
            max_next[cur] = nextPos;
        }
    }

    // Walk the chosen path and collect the words.
    for (size_t i = 0; i < str_size;) {
        assert(max_next[i] > i);
        assert(max_next[i] <= str_size);
        WordRange wr(begin + i, begin + max_next[i] - 1);
        words.push_back(wr);
        i = max_next[i];
    }
}
/// Reports whether @p word is in the set of single-character words collected from
/// the user dictionary.
bool DictTrie::IsUserDictSingleChineseWord(const Rune &word) const {
    const bool known = IsIn(m_user_dict_single_chinese_word, word);
    return known;
}
void DictTrie::PreLoad()
{
ifstream ifs(DICT_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, " ");
if (buf.size() != 3)
continue;
m_freq_sum += atof(buf[1].c_str());
}
}
void DictTrie::LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
{
ifstream ifs(DICT_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, " ");
if (buf.size() != 3)
continue;
DatMemElem node_info;
node_info.weight = log(atof(buf[1].c_str()) / m_freq_sum);
node_info.SetTag(buf[2]);
this->Update(buf[0].c_str(), buf[0].size(), elements_num);
offset += (sizeof(DatMemElem));
elements_num++;
if (m_min_weight > node_info.weight) {
m_min_weight = node_info.weight;
}
write_bytes += write(fd, &node_info, sizeof(DatMemElem));
}
}
void DictTrie::LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
{
ifstream ifs(USER_DICT_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, " ");
if (buf.size() != 3)
continue;
DatMemElem node_info;
assert(m_freq_sum > 0.0);
const int freq = atoi(buf[1].c_str());
node_info.weight = log(1.0 * freq / m_freq_sum);
node_info.SetTag(buf[2]);
this->Update(buf[0].c_str(), buf[0].size(), elements_num);
offset += (sizeof(DatMemElem));
elements_num++;
write_bytes += write(fd, &node_info, sizeof(DatMemElem));
if (Utf8CharNum(buf[0]) == 1) {
RuneArray word;
if (DecodeRunesInString(buf[0], word)) {
m_user_dict_single_chinese_word.insert(word[0]);
}
}
}
}
/// Returns the min_weight field read from the cache-file header.
inline double DictTrie::GetMinWeight() const
{
    const double weight = this->GetCacheFileHeaderPtr()->min_weight;
    return weight;
}

View File

@ -0,0 +1,62 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef SegmentTrie_H
#define SegmentTrie_H
#include "storage-base.hpp"
#include "cppjieba/Unicode.hpp"
using namespace cppjieba;
const char * const DICT_PATH = "/usr/share/ukui-search/res/dict/jieba.dict.utf8";
const char * const USER_DICT_PATH = "/usr/share/ukui-search/res/dict/user.dict.utf8";
// Header of the dictionary cache file: the CacheFileHeaderBase fields followed by the
// minimum word weight. DictTrie::LoadSourceFile writes this struct to disk with a raw
// write(), so the member order and types are part of the file format.
struct DictCacheFileHeader : CacheFileHeaderBase
{
double min_weight = 0; // smallest weight among default-dictionary entries; read back by DictTrie::GetMinWeight()
};
// Segmentation dictionary: maps a word to its DatMemElem (weight and tag), stored
// through StorageBase<DatMemElem, false, DictCacheFileHeader>.
class DictTrie : public StorageBase<DatMemElem, false, DictCacheFileHeader>
{
public:
// Construct from a list of source files and an optional cache path.
DictTrie(const vector<string> file_paths, string dat_cache_path = "");
// Construct from the main dictionary path, an optional user dictionary path and an optional cache path.
DictTrie(const string& dict_path, const string& user_dict_paths = "", const string & dat_cache_path = "");
// Builds the cache file at dat_cache_file, stamping md5 into its header.
// NOTE(review): the loaders read DICT_PATH / USER_DICT_PATH rather than the paths given to the constructor.
void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
// Returns the element stored for key, or nullptr when key is not in the dictionary.
const DatMemElem *Find(const string &key) const;
// Fills res with, for every rune in [begin, end), the possible next positions and their dictionary elements.
void FindDatDag(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<struct DatDag>&res, size_t max_word_len = MAX_WORD_LENGTH) const;
// Appends to words the maximum-weight segmentation of [begin, end).
void FindWordRange(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end,
vector<WordRange>& words, size_t max_word_len = MAX_WORD_LENGTH) const;
// True when word was loaded from the user dictionary as a one-character entry.
bool IsUserDictSingleChineseWord(const Rune& word) const;
private:
DictTrie();
// Sums the frequency column of DICT_PATH into m_freq_sum.
void PreLoad();
// Load DICT_PATH / USER_DICT_PATH into the trie and append their elements to fd.
void LoadDefaultDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
void LoadUserDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
// Returns min_weight from the cache-file header.
double GetMinWeight() const;
double m_freq_sum = 0.0; // total frequency of the default dictionary, filled by PreLoad()
double m_min_weight = 3.14e+100; // running minimum weight; starts at a large sentinel
unordered_set<Rune> m_user_dict_single_chinese_word; // one-character words seen in the user dictionary
};
#endif // SegmentTrie_H

View File

@ -0,0 +1 @@
#include "chinese-segmentation.h"

View File

@ -0,0 +1 @@
#include "hanzi-to-pinyin.h"

View File

@ -0,0 +1,74 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef HANZITOPINYINPRIVATE_H
#define HANZITOPINYINPRIVATE_H
#include <QtCore/qglobal.h>
#include <QHash>
#include "pinyin4cpp_dictTrie.h"
#include "hanzi-to-pinyin.h"
#include "pinyin4cpp-trie.h"
using namespace std;
static const QHash<QString, QString> PhoneticSymbol = {
{"ā", "a1"}, {"á", "a2"}, {"ǎ", "a3"}, {"à", "a4"},
{"ē", "e1"}, {"é", "e2"}, {"ě", "e3"}, {"è", "e4"},
{"ō", "o1"}, {"ó", "o2"}, {"ǒ", "o3"}, {"ò", "o4"},
{"ī", "i1"}, {"í", "i2"}, {"ǐ", "i3"}, {"ì", "i4"},
{"ū", "u1"}, {"ú", "u2"}, {"ǔ", "u3"}, {"ù", "u4"},
// üe
{"ü", "v"},
{"ǖ", "v1"}, {"ǘ", "v2"}, {"ǚ", "v3"}, {"ǜ", "v4"},
{"ń", "n2"}, {"ň", "n3"}, {"ǹ", "n4"},
{"", "m1"}, {"ḿ", "m2"}, {"", "m4"},
{"ê̄", "ê1"}, {"ế", "ê2"}, {"ê̌", "ê3"}, {"", "ê4"}
};
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
// Implementation object behind HanZiToPinYin: owns the pinyin trie and the
// conversion options, and does the actual lookups and formatting.
class PINYINMANAGER_EXPORT HanZiToPinYinPrivate
{
public:
HanZiToPinYinPrivate(HanZiToPinYin *parent = nullptr);
~HanZiToPinYinPrivate();
public:
// Forwards to Pinyin4cppTrie::IsMultiTone, preserving the value category of the argument.
template <typename T>
bool isMultiTone(T &&t) {return m_pinYinTrie.IsMultiTone(std::forward<T>(t));}
// Forwards to Pinyin4cppTrie::Contains.
bool contains(string &word);
// Clears results, fills it with the pinyin of word and applies the configured data style.
// Returns -1 when word has no direct entry and segmentation is disabled, 0 otherwise.
int getResults(string &word, QStringList &results);
// Stores the four conversion options used by getResults().
void setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType);
private:
// Rewrites every entry of results in place according to m_pinyinDataStyle.
void convertDataStyle(QStringList &results);
HanZiToPinYin *q = nullptr; // owning public object as passed to the constructor
// Previous pointer-based trie member, kept for reference:
//Pinyin4cppDictTrie *m_pinYinTrie = nullptr;
Pinyin4cppTrie m_pinYinTrie;
SegType m_segType = SegType::Segmentation;
PolyphoneType m_polyphoneType = PolyphoneType::Disable;
PinyinDataStyle m_pinyinDataStyle = PinyinDataStyle::Default;
ExDataProcessType m_exDataProcessType = ExDataProcessType::Default;
};
#endif // HANZITOPINYINPRIVATE_H

View File

@ -0,0 +1,360 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include <mutex>
#include <cctype>
#include "hanzi-to-pinyin.h"
#include "hanzi-to-pinyin-private.h"
#include "chinese-segmentation.h"
#include "cppjieba/Unicode.hpp"
HanZiToPinYin * HanZiToPinYin::g_pinYinManager = nullptr;
std::once_flag g_singleFlag;
/// Reports whether the pinyin trie has an entry for @p word.
bool HanZiToPinYinPrivate::contains(string &word)
{
    const bool found = m_pinYinTrie.Contains(word);
    return found;
}
int HanZiToPinYinPrivate::getResults(string &word, QStringList &results)
{
results.clear();
string directResult = m_pinYinTrie.Find(word);
if (directResult == string()) {
if (m_segType == SegType::NoSegmentation) {//无分词、无结果直接返回-1
return -1;
} else {//无结果、启用分词
vector<string> segResults = ChineseSegmentation::getInstance()->callMixSegmentCutStr(word);
string data;
for (string &info : segResults) {
if (info == string()) {
continue;
}
data = m_pinYinTrie.Find(info);
if (data == string()) {//分词后无结果
if (cppjieba::IsSingleWord(info)) {//单个字符
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
results.append(QString().fromStdString(info));
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
continue;
}
} else {//多个字符
string oneWord;
cppjieba::RuneStrArray runeArray;
cppjieba::DecodeRunesInString(info, runeArray);
for (auto i = runeArray.begin(); i != runeArray.end(); ++i) {
oneWord = cppjieba::GetStringFromRunes(info, i, i);
data = m_pinYinTrie.Find(oneWord);
if (data == string()) {//单字无结果则按设置返回
if (m_exDataProcessType == ExDataProcessType::Default) {//原数据返回
results.append(QString().fromStdString(oneWord));
} else if (m_exDataProcessType == ExDataProcessType::Delete) {//忽略
continue;
}
}
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(data));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(data, ',')) {
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(data));
}
}
}
}
} else {//分词后有结果
if (cppjieba::IsSingleWord(info)) {//单个字符
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(data));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(data, ',')) {
results.append(QString().fromStdString(data.substr(0, data.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(data));
}
}
} else {//多个字符
vector<string> dataVec = limonp::Split(data, "/");
if (dataVec.size() == 1) {//无多音词
vector<string> dataVec = limonp::Split(data, ",");
for (auto &oneResult : dataVec) {
results.append(QString().fromStdString(oneResult));
}
} else {
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
int wordSize = limonp::Split(dataVec[0], ",").size();
for (int i = 0; i < wordSize; ++i) {
QStringList oneResult;
for (size_t j = 0; j < dataVec.size(); ++j) {
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
}
results.append(oneResult.join('/'));
}
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
vector<string> tmp = limonp::Split(dataVec[0], ",");
for (auto &oneResult : tmp) {
results.append(QString().fromStdString(oneResult));
}
}
}
}
}
}
}
} else {//可以直接查到结果
if (cppjieba::IsSingleWord(word)) {//单个字符
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
results.append(QString().fromStdString(directResult));
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
if (limonp::IsInStr(directResult, ',')) {
results.append(QString().fromStdString(directResult.substr(0, directResult.find_first_of(",", 0))));
} else {
results.append(QString().fromStdString(directResult));
}
}
} else {//多个字符
vector<string> dataVec = limonp::Split(directResult, "/");
if (dataVec.size() == 1) {//无多音词
vector<string> dataVec = limonp::Split(directResult, ",");
for (auto &oneResult : dataVec) {
results.append(QString().fromStdString(oneResult));
}
} else {
if (m_polyphoneType == PolyphoneType::Enable) {//启用多音字
int wordSize = limonp::Split(dataVec[0], ",").size();
for (int i = 0; i < wordSize; ++i) {
QStringList oneResult;
for (size_t j = 0; j < dataVec.size(); ++j) {
oneResult.append(QString().fromStdString(limonp::Split(dataVec[j], ",")[i]));
}
results.append(oneResult.join('/'));
}
} else if (m_polyphoneType == PolyphoneType::Disable) {//不启用多音字
vector<string> tmp = limonp::Split(dataVec[0], ",");
for (auto &oneResult : tmp) {
results.append(QString().fromStdString(oneResult));
}
}
}
}
}
convertDataStyle(results);
return 0;//todo
}
/// Stores the four conversion options that getResults() and convertDataStyle() consult.
void HanZiToPinYinPrivate::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
{
    m_segType = segType;
    m_polyphoneType = polyphoneType;
    m_exDataProcessType = processType;
    m_pinyinDataStyle = dataStyle;
}
// Rewrites every entry of `results` in place according to m_pinyinDataStyle.
// Characters are translated through the PhoneticSymbol table, whose values are a
// base letter optionally followed by a tone digit (e.g. "a1").
// NOTE(review): isalpha() receives c.toLatin1(), a plain char; for Latin-1 letters
// such as 'á' that value may be negative - confirm this is safe on all targets.
void HanZiToPinYinPrivate::convertDataStyle(QStringList &results)
{
QString value;
if (m_pinyinDataStyle == PinyinDataStyle::Default) {
// Default: replace each mapped character by the first character of its mapped value,
// then remove repeated comma-separated items.
for (QString &info : results) {
if(info == ",") {
continue;
}
// The replacement below must keep info's length unchanged; changing the length
// while iterating over info's characters would break the traversal.
for (const QChar &c : info) {
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, value.at(0));
}
}
}
QStringList tmpList = info.split(',', QString::SkipEmptyParts); // de-duplicate, keeping the original order
QStringList tmpValue;
for (auto &str : tmpList) {
if (!tmpValue.contains(str)) {
tmpValue.push_back(str);
}
}
info = tmpValue.join(",");
}
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone) {
// nothing to do
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone2) {
// Tone2: replace each mapped character by its full mapped value, in place.
for (QString &info : results) {
for (int i = 0; i < info.size();) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, PhoneticSymbol.value(c));
// skip over the text that was just inserted
i += PhoneticSymbol.value(c).size();
continue;
}
}
i++;
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::Tone3) {
// Tone3: replace the mapped character by the first character of its mapped value and
// move the second character (value.at(1)) to the end of the current reading.
// NOTE(review): value.at(1) is used for every non-empty value; a one-character value
// (the table maps "ü" to "v") would be indexed out of range - confirm.
for (QString &info : results) {
if(info == "/") {
continue;
}
bool isPolyphoneWords(false);
if (info.contains("/")) {
// work with ',' as the separator while scanning
isPolyphoneWords = true;
info.replace("/", ",");
}
for (int i = 0; i < info.size();) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(i, 1, value.at(0));
// entry with several readings
if (info.contains(",")) {
int pos = info.indexOf(',', i);
if (isPolyphoneWords) {
info.replace(",", "/");
}
// this was the last reading
if (pos == -1) {
info.append(value.at(1));
break;
}
info.insert(pos, value.at(1));
i = pos + 1; // the insert moved the separator one position right; step onto it
i++;
continue;
} else {
info.append(value.at(1));
break;
}
}
}
i++;
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::FirstLetter) {
// FirstLetter: replace mapped characters as in Default, then keep only the first
// character of each reading.
for (QString &info : results) {
if(info == "," or info == "/") {
continue;
}
bool isPolyphoneWords(false);
if (info.contains("/")) {
isPolyphoneWords = true;
info.replace("/", ",");
}
for (int i = 0; i < info.size();i++) {
auto c = info.at(i);
if (!isalpha(c.toLatin1())) {
value = PhoneticSymbol.value(c);
if (!value.isEmpty()) {
info.replace(c, value.at(0));
}
}
}
QStringList tmpList = info.split(',', QString::SkipEmptyParts); // de-duplicate, keeping the original order
QStringList tmpValue;
// NOTE(review): the check compares the full reading against a list that stores only
// first letters, so identical first letters are not de-duplicated - confirm intent.
for (auto &str : tmpList) {
if (!tmpValue.contains(str)) {
tmpValue.push_back(str.at(0));
}
}
if (isPolyphoneWords) {
info = tmpValue.join("/");
} else {
info = tmpValue.join(",");
}
}
} else if (m_pinyinDataStyle == PinyinDataStyle::English) {
// not supported yet
}
}
// Stores the owning public object; m_pinYinTrie is default-constructed as a member.
HanZiToPinYinPrivate::HanZiToPinYinPrivate(HanZiToPinYin *parent) : q(parent)
{
// Earlier pointer-based initialisation of the trie, kept for reference:
//const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
//const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
//m_pinYinTrie = new Pinyin4cppDictTrie(SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH);
//m_pinYinTrie = new Pinyin4cppTrie;
}
// Nothing to release explicitly: m_pinYinTrie is a value member.
HanZiToPinYinPrivate::~HanZiToPinYinPrivate()
{
// Cleanup that was needed while m_pinYinTrie was a raw pointer, kept for reference:
// if (m_pinYinTrie){
// delete m_pinYinTrie;
// m_pinYinTrie = nullptr;
// }
}
/// Returns the process-wide instance, creating it on the first call.
/// std::call_once ensures the creation step runs only once.
HanZiToPinYin * HanZiToPinYin::getInstance()
{
    const auto createInstance = []() {
        g_pinYinManager = new HanZiToPinYin;
    };
    call_once(g_singleFlag, createInstance);
    return g_pinYinManager;
}
/// Forwards to the private implementation's contains().
bool HanZiToPinYin::contains(string &word)
{
    const bool found = d->contains(word);
    return found;
}
/// Mutable-lvalue overload; forwards to the private implementation's isMultiTone().
bool HanZiToPinYin::isMultiTone(string &word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
/// Rvalue overload; the named parameter is passed on as an lvalue to the private
/// implementation's isMultiTone().
bool HanZiToPinYin::isMultiTone(string &&word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
/// Const-lvalue overload; forwards to the private implementation's isMultiTone().
bool HanZiToPinYin::isMultiTone(const string &word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
/// Const-rvalue overload; the named parameter is passed on as a const lvalue to the
/// private implementation's isMultiTone().
bool HanZiToPinYin::isMultiTone(const string &&word)
{
    const bool multiTone = d->isMultiTone(word);
    return multiTone;
}
/// Converts @p word to pinyin through the private implementation.
/// @return the status code produced by HanZiToPinYinPrivate::getResults()
int HanZiToPinYin::getResults(string word, QStringList &results)
{
    const int status = d->getResults(word, results);
    return status;
}
/// Passes the four conversion options on to the private implementation unchanged.
void HanZiToPinYin::setConfig(PinyinDataStyle dataStyle, SegType segType, PolyphoneType polyphoneType, ExDataProcessType processType)
{
    HanZiToPinYinPrivate *const impl = d;
    impl->setConfig(dataStyle, segType, polyphoneType, processType);
}
/// Allocates the private implementation object; its parent pointer takes the default (nullptr).
HanZiToPinYin::HanZiToPinYin()
    : d(new HanZiToPinYinPrivate())
{
}

View File

@ -0,0 +1,82 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef HANZITOPINYIN_H
#define HANZITOPINYIN_H
#include <QtCore/qglobal.h>
#include <QStringList>
#include "pinyin4cpp-common.h"
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
using namespace std;
class HanZiToPinYinPrivate;
// Public singleton facade for Chinese-to-pinyin conversion. All work is delegated
// to a HanZiToPinYinPrivate instance held in `d`.
class PINYINMANAGER_EXPORT HanZiToPinYin
{
public:
// Returns the shared instance, creating it on first use.
static HanZiToPinYin * getInstance();
public:
/**
* @brief HanZiToPinYin::isMultiTone forwards to the pinyin trie's IsMultiTone()
* (presumably: whether the word has more than one reading - confirm)
* @param word text to check
* @return bool result of the trie's IsMultiTone()
*/
bool isMultiTone(string &word);
bool isMultiTone(string &&word);
bool isMultiTone(const string &word);
bool isMultiTone(const string &&word);
/**
* @brief HanZiToPinYin::contains checks whether the pinyin trie has an entry for word
* @param word text to look up
* @return bool result of the trie's Contains()
*/
bool contains(string &word);
/**
* @brief HanZiToPinYin::getResults converts word to pinyin
* @param word text to convert
* @param results receives the pinyin list of word; results is cleared first
* @return int 0 on completion; -1 when word has no direct entry and segmentation is disabled
*/
int getResults(string word, QStringList &results);
/**
* @brief setConfig sets the conversion options of HanZiToPinYin; the option types are declared in pinyin4cpp-common.h
* @param dataStyle output format of the pinyin (the implementation starts with PinyinDataStyle::Default)
* @param segType whether input without a direct entry is segmented before lookup
* @param polyphoneType whether all readings of a polyphone are returned or only the first
* @param processType how text without a pinyin entry is handled (Default keeps it, Delete drops it)
*/
void setConfig(PinyinDataStyle dataStyle,SegType segType,PolyphoneType polyphoneType,ExDataProcessType processType);
protected:
HanZiToPinYin();
~HanZiToPinYin();
// Non-copyable: there is a single shared instance.
HanZiToPinYin(const HanZiToPinYin&) = delete;
HanZiToPinYin& operator =(const HanZiToPinYin&) = delete;
private:
static HanZiToPinYin *g_pinYinManager; // the shared instance, set by getInstance()
HanZiToPinYinPrivate *d = nullptr; // private implementation, allocated in the constructor
};
#endif // HANZITOPINYIN_H

View File

@ -1,39 +1,50 @@
QT -= gui
VERSION = 0.0.1
VERSION = 1.1.0
TARGET = chinese-segmentation
TEMPLATE = lib
DEFINES += LIBCHINESESEGMENTATION_LIBRARY
DEFINES += VERSION='\\"$${VERSION}\\"'
CONFIG += c++11
CONFIG += c++11 create_pc create_prl no_install_prl
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
QMAKE_CXXFLAGS += -Werror=return-type -Werror=return-local-addr
#QMAKE_CXXFLAGS += -Werror=uninitialized
QMAKE_CXXFLAGS += -execution-charset:utf-8
# You can also make your code fail to compile if it uses deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
include(cppjieba/cppjieba.pri)
include(pinyin4cpp/pinyin4cpp.pri)
include(storage-base/storage-base-cedar.pri)
#LIBS += -L/usr/local/lib/libjemalloc -ljemalloc
SOURCES += \
chinese-segmentation.cpp \
pinyinmanager.cpp
hanzi-to-pinyin.cpp
HEADERS += \
chinese-segmentation-private.h \
chinese-segmentation.h \
libchinese-segmentation_global.h \
pinyinmanager.h
common-struct.h \
hanzi-to-pinyin-private.h \
hanzi-to-pinyin.h \
pinyin4cpp-common.h \
libchinese-segmentation_global.h
dict_files.path = /usr/share/ukui-search/res/dict/
dict_files.files = $$PWD/dict/*.utf8\
dict_files.files += $$PWD/dict/pos_dict/*.utf8\
dict_files.files += $$PWD/dict/*.txt\
dict_files.files += $$PWD/pinyin4cpp/dict/*.txt
INSTALLS += \
dict_files \
@ -41,28 +52,28 @@ INSTALLS += \
# Default rules for deployment.
unix {
target.path = $$[QT_INSTALL_LIBS]
}
QMAKE_PKGCONFIG_NAME = chinese-segmentation
QMAKE_PKGCONFIG_DESCRIPTION = chinese-segmentation Header files
QMAKE_PKGCONFIG_VERSION = $$VERSION
QMAKE_PKGCONFIG_LIBDIR = $$target.path
QMAKE_PKGCONFIG_DESTDIR = pkgconfig
QMAKE_PKGCONFIG_INCDIR = /usr/include/chinese-seg
QMAKE_PKGCONFIG_CFLAGS += -I/usr/include/chinese-seg
!isEmpty(target.path): INSTALLS += target
header.path = /usr/include/chinese-seg/
header.files += *.h
headercppjieba.path = /usr/include/chinese-seg/cppjieba/
headercppjieba.files = cppjieba/*
INSTALLS += header headercppjieba
header.path = /usr/include/chinese-seg
header.files += chinese-segmentation.h libchinese-segmentation_global.h common-struct.h hanzi-to-pinyin.h pinyin4cpp-common.h
header.files += development-files/header-files/*
# headercppjieba.path = /usr/include/chinese-seg/cppjieba/
# headercppjieba.files = cppjieba/*
INSTALLS += header
}
#DISTFILES += \
# jiaba/jieba.pri
DISTFILES += \
dict/README.md \
dict/hmm_model.utf8 \
dict/idf.utf8 \
dict/jieba.dict.utf8 \
dict/pos_dict/char_state_tab.utf8 \
dict/pos_dict/prob_emit.utf8 \
dict/pos_dict/prob_start.utf8 \
dict/pos_dict/prob_trans.utf8 \
dict/stop_words.utf8 \
dict/user.dict.utf8 \
dict/pinyinWithoutTone.txt
development-files/header-files/* \
pinyin4cpp/pinyin4cpp.pri

View File

@ -0,0 +1,73 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef PINYIN4CPP_COMMON_H
#define PINYIN4CPP_COMMON_H
/**
 * @brief Output style of the pinyin produced for Chinese text.
 *
 * Every enumerator owns a distinct bit. The sample outputs are the ones the
 * original author listed, all for the same two-character input:
 *   Default      plain syllables, no tone information -> "zhong xin"
 *   Tone         tone marks on the vowel              -> "zhōng xīn"
 *   Tone2        tone digit right after the vowel     -> "zho1ng xi1n"
 *   Tone3        tone digit after the whole syllable  -> "zhong1 xin1"
 *   FirstLetter  first letter of every syllable       -> "z x"
 *   English      English glosses                      -> "center,heart,core"
 */
enum class PinyinDataStyle {
    Default     = 0x01,
    Tone        = 0x02,
    Tone2       = 0x04,
    Tone3       = 0x08,
    FirstLetter = 0x10,
    English     = 0x20
};
/**
 * @brief Selects whether the input is word-segmented before conversion.
 *
 *   Segmentation    segment the text first
 *   NoSegmentation  convert the text without segmenting it
 *
 * NOTE(review): the original per-value descriptions did not survive; the
 * wording above is inferred from the enumerator names - confirm.
 */
enum class SegType {
    Segmentation   = 0x01,
    NoSegmentation = 0x02
};
/**
 * @brief Controls how characters with several readings are reported.
 *
 * Sample outputs as listed by the original author:
 *   Disable  one reading per character       -> "qi an xin"
 *   Enable   all readings are kept           -> "qi,ji an xin"
 *            (a further sample in the original reads "zhao/chao yang/yang")
 */
enum class PolyphoneType {
    Disable = 0x01,
    Enable  = 0x02
};
/**
 * @brief Controls what happens to input that is not converted to pinyin.
 *
 * Sample outputs as listed by the original author, for an input that starts
 * with the digits "123":
 *   Default  such data is kept in the result    -> "123 mu tou ren"
 *   Delete   such data is removed from the result -> "mu tou ren"
 */
enum class ExDataProcessType {
    Default = 0x01,
    Delete  = 0x02
};
#endif //PINYIN4CPP_COMMON_H

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,127 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "pinyin4cpp-trie.h"
/**
 * Constructs the trie from the two built-in dictionary files
 * (SINGLE_WORD_PINYIN_PATH and WORDS_PINYIN_PATH).
 *
 * @param dat_cache_path forwarded unchanged to the StorageBase constructor.
 */
Pinyin4cppTrie::Pinyin4cppTrie(string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(
          vector<string>{SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH},
          dat_cache_path)
{
    // All set-up work is delegated to the base-class Init().
    this->Init();
}
/**
 * Constructs the trie from a caller-supplied list of source files.
 *
 * @param file_paths     forwarded unchanged to the StorageBase constructor.
 * @param dat_cache_path forwarded unchanged to the StorageBase constructor.
 */
Pinyin4cppTrie::Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path)
    : StorageBase<char, false, CacheFileHeaderBase>(file_paths, dat_cache_path)
{
    // All set-up work is delegated to the base-class Init().
    this->Init();
}
/**
 * Tells whether the trie holds an entry for `word`.
 *
 * @param word key to look up.
 * @return true when Find() yields a non-empty string, false otherwise.
 */
bool Pinyin4cppTrie::Contains(string &word) {
    const string pinyin = this->Find(word);
    return !pinyin.empty();
}
bool Pinyin4cppTrie::IsMultiTone(const string &word) {
string result = this->Find(word);
if (result.find(",") == result.npos)
return true;
return false;
}
/**
 * Builds the cache file `dat_cache_file` from the text dictionaries.
 *
 * The data is written to a temporary file next to the target and renamed
 * into place at the end. File content, in order:
 *   1. a CacheFileHeaderBase carrying `md5` (its counters are patched later),
 *   2. the '\0'-terminated values emitted by LoadSingleWordDict() and
 *      LoadWordsDict(),
 *   3. the trie array (GetDataTrieArray(), GetDataTrieTotalSize() bytes).
 * After the payload is written, three ints (element count, value-section
 * size, GetDataTrieSize()) are written right behind the md5 field.
 * NOTE(review): this assumes CacheFileHeaderBase places three int-sized
 * counters directly after md5_hex in that order - its definition is not
 * visible here; confirm.
 *
 * Changes compared with the previous version:
 *   - the process umask is restored after mkstemp() instead of being left
 *     modified for the rest of the process;
 *   - when asserts are compiled out, a failed mkstemp() or a short write no
 *     longer results in writing to fd -1 / publishing a truncated cache.
 *
 * @param dat_cache_file final path of the cache file.
 * @param md5            digest stored in the header; must be exactly
 *                       sizeof(header.md5_hex) characters long.
 */
void Pinyin4cppTrie::LoadSourceFile(const string &dat_cache_file, const string &md5)
{
    CacheFileHeaderBase header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    int offset(0), elements_num(0), write_bytes(0), data_trie_size(0);
    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";

    // mkstemp() rewrites the XXXXXX suffix in place, hence the mutable buffer.
    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_umask);
    assert(fd >= 0);
    if (fd < 0) {
        return;
    }
    fchmod(fd, 0644);

    // Header first (counters still zero), then values, then the trie array.
    write_bytes = write(fd, (const char *)&header, sizeof(CacheFileHeaderBase));
    this->LoadSingleWordDict(fd, write_bytes, offset, elements_num);
    this->LoadWordsDict(fd, write_bytes, offset, elements_num);
    write_bytes += write(fd, this->GetDataTrieArray(), this->GetDataTrieTotalSize());

    // Go back and fill in the counters that follow the md5 field.
    lseek(fd, sizeof(header.md5_hex), SEEK_SET);
    write(fd, &elements_num, sizeof(int));
    write(fd, &offset, sizeof(int));
    data_trie_size = this->GetDataTrieSize();
    write(fd, &data_trie_size, sizeof(int));
    close(fd);

    const bool complete =
        ((size_t)write_bytes == sizeof(CacheFileHeaderBase) + offset + this->GetDataTrieTotalSize());
    assert(complete);
    if (!complete) {
        // Never publish a partially written cache.
        unlink(tmp_filepath.c_str());
        return;
    }

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    (void)rename_ret;
}
/**
 * Looks up the value stored for `key`.
 *
 * @param key exact key to search for.
 * @return the '\0'-terminated string found at the matched offset of the
 *         element area, or an empty string when ExactMatchSearch() reports
 *         a negative result.
 */
string Pinyin4cppTrie::Find(const string &key)
{
    const int element_offset = this->ExactMatchSearch(key.c_str(), key.size());
    if (element_offset >= 0) {
        return string(&this->GetElementPtr()[element_offset]);
    }
    return string();
}
void Pinyin4cppTrie::LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
{
ifstream ifs(SINGLE_WORD_PINYIN_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, ":");
if (buf.size() != 3)
continue;
this->Update(buf[2].c_str(), buf[2].size(), offset);
offset += (buf[1].size() + 1);
elements_num++;
write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
}
}
void Pinyin4cppTrie::LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num)
{
ifstream ifs(WORDS_PINYIN_PATH);
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#") or line.empty()) {
continue;
}
limonp::Split(line, buf, ":");
if (buf.size() != 2)
continue;
this->Update(buf[0].c_str(), buf[0].size(), offset);
offset += (buf[1].size() + 1);
elements_num++;
write_bytes += write(fd, buf[1].c_str(), buf[1].size() + 1);
}
}

View File

@ -0,0 +1,43 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef PINYIN4CPPTRIE_H
#define PINYIN4CPPTRIE_H
#include "storage-base.hpp"
const char * const SINGLE_WORD_PINYIN_PATH = "/usr/share/ukui-search/res/dict/singleWordPinyin.txt";
const char * const WORDS_PINYIN_PATH = "/usr/share/ukui-search/res/dict/wordsPinyin.txt";
/**
 * Pinyin dictionary built on StorageBase<char, false, CacheFileHeaderBase>.
 * Keys are Chinese characters/words, values are their pinyin strings.
 */
class Pinyin4cppTrie : public StorageBase<char, false, CacheFileHeaderBase>
{
public:
// Uses the two built-in dictionaries (SINGLE_WORD_PINYIN_PATH, WORDS_PINYIN_PATH).
Pinyin4cppTrie(string dat_cache_path = "");
// Uses a caller-supplied list of source files.
Pinyin4cppTrie(const vector<string> file_paths, string dat_cache_path = "");
// Writes the cache file `dat_cache_file` from the text dictionaries, stamping it with `md5`.
void LoadSourceFile(const string &dat_cache_file, const string &md5) override;
// Returns the value stored for `key`, or an empty string when there is no exact match.
string Find(const string &key);
// True when Find(word) is non-empty.
bool Contains(string &word);
// Classifies `word` by whether its stored value contains a ','; see the
// definition for the exact return convention.
bool IsMultiTone(const string &word);
private:
// Helpers of LoadSourceFile(): each reads one dictionary file, adds its keys
// to the trie and appends its values to `fd`, updating the running counters.
void LoadSingleWordDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
void LoadWordsDict(const int &fd, int &write_bytes, int &offset, int &elements_num);
};
#endif // PINYIN4CPPTRIE_H

View File

@ -0,0 +1,15 @@
INCLUDEPATH += $$PWD
HEADERS += \
$$PWD/pinyin4cpp-trie.h \
$$PWD/pinyin4cpp_dataTrie.h \
$$PWD/pinyin4cpp_dictTrie.h
SOURCES += \
$$PWD/pinyin4cpp-trie.cpp \
$$PWD/pinyin4cpp_dataTrie.cpp \
$$PWD/pinyin4cpp_dictTrie.cpp
DISTFILES += \
pinyin4cpp/dict/wordsPinyin.txt \
pinyin4cpp/dict/singleWordPinyin.txt

View File

@ -0,0 +1,135 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "pinyin4cpp_dataTrie.h"
/// Creates an unattached trie; every member keeps its in-class default.
Pinyin4cppDataTrie::Pinyin4cppDataTrie() = default;
/**
 * Releases the cache-file mapping and its descriptor.
 *
 * Both resources are released only when they were actually acquired: an
 * object that was never attached holds m_mmapAddr == nullptr and
 * m_mmapFd == -1, and a failed mmap() may have left MAP_FAILED behind.
 * Calling munmap()/close() with those values is an error.
 */
Pinyin4cppDataTrie::~Pinyin4cppDataTrie()
{
    if (m_mmapAddr != nullptr && m_mmapAddr != static_cast<char *>(MAP_FAILED)) {
        munmap(m_mmapAddr, m_mmapLength);
    }
    m_mmapAddr = nullptr;

    if (m_mmapFd >= 0) {
        close(m_mmapFd);
    }
    m_mmapFd = -1;
}
/**
 * Looks up the value stored for `key` through the darts-clone double array.
 *
 * @param key exact key to search for.
 * @return the '\0'-terminated string at the matched offset of the element
 *         area; an empty string when nothing matched or when the returned
 *         offset lies outside [0, m_elementsSize).
 */
string Pinyin4cppDataTrie::Find(const string &key) const {
    Darts::DoubleArray::result_pair_type match;
    m_DoubleArrayDataTrie.exactMatchSearch(key.c_str(), match);

    const bool no_match = (0 == match.length);
    const bool bad_offset = (match.value < 0) || ((size_t)match.value >= m_elementsSize); // todo
    if (no_match || bad_offset) {
        return string();
    }
    return string(&m_elementsPtr[match.value]);

    // Equivalent lookup when the cedar trie is used instead of darts-clone:
    //   int result = m_DoubleArrayDataTrie.exactMatchSearch<int>(key.c_str(), key.size());
    //   if (result < 0)
    //       return string();
    //   return string(&m_elementsPtr[result]);
}
/**
 * Writes a fresh cache file from `elements` and then attaches to it.
 *
 * @param elements       key -> value pairs to store.
 * @param dat_cache_file path of the cache file to create.
 * @param md5            digest stamped into, and later checked against, the file.
 * @return the result of InitAttachDat() on the newly written file.
 */
bool Pinyin4cppDataTrie::InitBuildDat(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    BuildDatCache(elements, dat_cache_file, md5);
    const bool attached = InitAttachDat(dat_cache_file, md5);
    return attached;
}
/**
 * Maps an existing cache file and points the trie at it.
 *
 * The file is accepted only when it is at least one header long, its md5
 * field equals `md5`, and its size equals header + value section + trie
 * array as announced by the header.
 *
 * Changes compared with the previous version:
 *   - a rejected file is unmapped and closed again; before, the descriptor
 *     and mapping stayed in the members and were overwritten (leaked) by the
 *     next attach, e.g. the one issued by InitBuildDat();
 *   - a mapping that is being replaced is released;
 *   - a too-short or inconsistent file makes the call return false so the
 *     caller can rebuild the cache. These conditions used to be checked by
 *     assert only, i.e. not at all when asserts are compiled out.
 * The object is modified only when the call succeeds.
 *
 * @param dat_cache_file path of the cache file.
 * @param md5            expected digest; must be sizeof(md5_hex) characters.
 * @return true when the file was accepted and attached, false otherwise.
 */
bool Pinyin4cppDataTrie::InitAttachDat(const string &dat_cache_file, const string &md5) {
    const int fd = open(dat_cache_file.c_str(), O_RDONLY);
    if (fd < 0) {
        return false;
    }

    const auto seek_off = lseek(fd, 0, SEEK_END);
    if (seek_off < 0 || static_cast<size_t>(seek_off) < sizeof(CacheFileHeader)) {
        close(fd);
        return false;
    }
    const size_t length = static_cast<size_t>(seek_off);

    void *const mapping = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
    if (MAP_FAILED == mapping) {
        close(fd);
        return false;
    }
    char *const addr = static_cast<char *>(mapping);

    const CacheFileHeader &header = *reinterpret_cast<const CacheFileHeader *>(addr);
    assert(sizeof(header.md5_hex) == md5.size());
    const bool md5_matches = (sizeof(header.md5_hex) == md5.size())
                             && (0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size()));
    const size_t expected_length = sizeof(CacheFileHeader) + header.elements_size
                                   + header.dat_size * m_DoubleArrayDataTrie.unit_size();
    if (!md5_matches || length != expected_length) {
        munmap(addr, length);
        close(fd);
        return false;
    }

    // Remember what is currently attached so it can be released afterwards.
    char *const previous_addr = m_mmapAddr;
    const size_t previous_length = m_mmapLength;
    const int previous_fd = m_mmapFd;

    m_mmapFd = fd;
    m_mmapAddr = addr;
    m_mmapLength = length;
    m_elementsNum = header.elements_num;
    m_elementsSize = header.elements_size;
    // Values start right after the header; the trie array follows the values.
    m_elementsPtr = (const char *)(addr + sizeof(CacheFileHeader));
    char *const dat_ptr = addr + sizeof(CacheFileHeader) + header.elements_size;
    m_DoubleArrayDataTrie.set_array(dat_ptr, header.dat_size);

    if (previous_addr != nullptr && previous_addr != static_cast<char *>(MAP_FAILED)) {
        munmap(previous_addr, previous_length);
    }
    if (previous_fd >= 0) {
        close(previous_fd);
    }
    return true;
}
/**
 * Builds the double-array trie from `elements` and writes the cache file.
 *
 * File content, in order: a CacheFileHeader, every value as a
 * '\0'-terminated string (in the map's iteration order), then the trie
 * array. Each key is stored in the trie with the byte offset of its value
 * inside the value section. The data goes to a temporary file that is
 * renamed to `dat_cache_file` at the end.
 *
 * Changes compared with the previous version:
 *   - keys/values are handed to build() via data(), so an empty `elements`
 *     no longer indexes element 0 of an empty vector;
 *   - values are written straight from the map instead of being copied into
 *     a second vector first;
 *   - the process umask is restored after mkstemp();
 *   - when asserts are compiled out, a failed build()/mkstemp() or a short
 *     write no longer leads to writing to fd -1 or publishing a truncated
 *     cache file.
 *
 * @param elements       key -> value pairs; values must not be empty.
 * @param dat_cache_file final path of the cache file.
 * @param md5            digest stored in the header; must be exactly
 *                       sizeof(header.md5_hex) characters long.
 */
void Pinyin4cppDataTrie::BuildDatCache(map<string, string> &elements, const string &dat_cache_file, const string &md5) {
    vector<const char*> keys_ptr_vec;
    vector<int> values_vec;
    keys_ptr_vec.reserve(elements.size());
    values_vec.reserve(elements.size());

    CacheFileHeader header;
    assert(sizeof(header.md5_hex) == md5.size());
    memcpy(&header.md5_hex[0], md5.c_str(), md5.size());

    int offset(0);
    for (const auto &info : elements) {
        assert(info.second.size() > 0);
        keys_ptr_vec.push_back(info.first.c_str());
        values_vec.push_back(offset);
        offset += (info.second.size() + 1); // +1 for the '\0' written after each value
    }

    auto const ret = m_DoubleArrayDataTrie.build(keys_ptr_vec.size(), keys_ptr_vec.data(), NULL, values_vec.data());
    assert(0 == ret);
    if (0 != ret) {
        return;
    }

    header.elements_num = elements.size();
    header.elements_size = offset;
    header.dat_size = m_DoubleArrayDataTrie.size();

    string tmp_filepath = string(dat_cache_file) + "_XXXXXX";
    // mkstemp() rewrites the XXXXXX suffix in place, hence the mutable buffer.
    const mode_t previous_umask = umask(S_IWGRP | S_IWOTH);
    const int fd = mkstemp(&tmp_filepath[0]);
    umask(previous_umask);
    assert(fd >= 0);
    if (fd < 0) {
        return;
    }
    fchmod(fd, 0644);

    auto write_bytes = write(fd, (const char *)&header, sizeof(header));
    for (const auto &info : elements) {
        write_bytes += write(fd, info.second.c_str(), info.second.size() + 1);
    }
    write_bytes += write(fd, m_DoubleArrayDataTrie.array(), m_DoubleArrayDataTrie.total_size());
    close(fd);

    const bool complete =
        ((size_t)write_bytes == sizeof(header) + offset + m_DoubleArrayDataTrie.total_size());
    assert(complete);
    if (!complete) {
        // Never publish a partially written cache.
        unlink(tmp_filepath.c_str());
        return;
    }

    const auto rename_ret = rename(tmp_filepath.c_str(), dat_cache_file.c_str());
    assert(0 == rename_ret);
    (void)rename_ret;
}

View File

@ -0,0 +1,74 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef PINYIN4cpp_DATATRIE_H
#define PINYIN4cpp_DATATRIE_H
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <QDebug>
#include "Md5.hpp"
#include "LocalVector.hpp"
#include "StringUtil.hpp"
//#define USE_REDUCED_TRIE
#include "../storage-base/cedar/cedar.h"
#include "../storage-base/darts-clone/darts.h"
using namespace std;
using std::pair;
// Header found at the very start of a cache file. It is written and read
// back as raw bytes, so the member order and sizes are part of the file format.
struct CacheFileHeader { // TODO: byte alignment
char md5_hex[32] = {};       // digest of the source dictionaries, not '\0'-terminated
uint32_t elements_num = 0;   // number of stored values
uint32_t elements_size = 0;  // size in bytes of the value section that follows the header
uint32_t dat_size = 0;       // size of the double array, in units (see unit_size())
};
// Read-only key -> string store backed by a memory-mapped cache file and a
// darts-clone double array.
class Pinyin4cppDataTrie {
public:
Pinyin4cppDataTrie();
// Unmaps the cache file and closes its descriptor.
~Pinyin4cppDataTrie();
// Returns the value stored for `key`, or an empty string when there is none.
string Find(const string & key) const;
// Writes a new cache file from `elements`, then attaches to it.
bool InitBuildDat(map<string, string>& elements, const string & dat_cache_file, const string & md5);
// Attaches to an existing cache file; false when it cannot be opened or its md5 differs.
bool InitAttachDat(const string & dat_cache_file, const string & md5);
private:
// Builds the double array and writes header, values and array to `dat_cache_file`.
void BuildDatCache(map<string, string>& elements, const string & dat_cache_file, const string & md5);
// Declared private - presumably to forbid copying; no definition is visible here.
Pinyin4cppDataTrie(const Pinyin4cppDataTrie &);
Pinyin4cppDataTrie &operator=(const Pinyin4cppDataTrie &);
private:
Darts::DoubleArray m_DoubleArrayDataTrie;
//cedar::da<int, -1, -2, true> m_DoubleArrayDataTrie;
const char * m_elementsPtr = nullptr; // start of the value section inside the mapping
size_t m_elementsNum = 0;             // copied from the file header
size_t m_elementsSize = 0;            // copied from the file header; upper bound for value offsets
size_t m_mmapLength = 0;              // size of the mapped file in bytes
int m_mmapFd = -1;                    // descriptor of the mapped file, -1 when none
char * m_mmapAddr = nullptr;          // base address of the mapping, nullptr when none
};
#endif //PINYIN4cpp_DATATRIE_H

View File

@ -0,0 +1,156 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#include "pinyin4cpp_dictTrie.h"
#include "malloc.h"
/**
 * Creates the dictionary and immediately loads it via Init().
 *
 * @param single_word_dict_path path of the single-character dictionary.
 * @param words_dict_paths      path of the multi-character dictionary.
 * @param dat_cache_path        cache file location; see Init() for the
 *                              behaviour when it is empty.
 */
Pinyin4cppDictTrie::Pinyin4cppDictTrie(const string &single_word_dict_path, const string &words_dict_paths, const string &dat_cache_path)
{
    this->Init(single_word_dict_path, words_dict_paths, dat_cache_path);
}
/**
 * Returns the value stored for `word`.
 *
 * @param word key to look up.
 * @return whatever the underlying data trie's Find() returns for `word`.
 */
string Pinyin4cppDictTrie::Find(const string &word) const
{
    string pinyin = m_DataTrie.Find(word);
    return pinyin;
}
/**
 * Tells whether the dictionary holds an entry for `word`.
 *
 * @param word key to look up.
 * @return true when the data trie yields a non-empty string for `word`.
 */
bool Pinyin4cppDictTrie::Contains(string &word)
{
    const string pinyin = m_DataTrie.Find(word);
    return !pinyin.empty();
}
bool Pinyin4cppDictTrie::IsMultiTone(const string &word) {
string result = m_DataTrie.Find(word);
if (result.find(",") == result.npos)
return true;
return false;
}
size_t Pinyin4cppDictTrie::GetTotalDictSize() const {
return m_TotalDictSize_;
}
/**
 * Attaches to the cache file for the given dictionaries, building it first
 * when no usable cache exists.
 *
 * The MD5 over both dictionary files identifies the cache. When
 * `dat_cache_path` is empty the cache lives at "/tmp/<md5>.dat_cache".
 * NOTE(review): that default is a predictable name in a world-writable
 * directory - confirm this is acceptable for the deployment.
 * NOTE(review): CalcFileListMD5() treats '|' and ';' as list separators,
 * while LoadWordsDict() opens `words_dict_paths` as a single path - confirm
 * that only one words dictionary is ever passed.
 *
 * @param single_word_dict_path path of the single-character dictionary.
 * @param words_dict_paths      path of the multi-character dictionary.
 * @param dat_cache_path        cache file location, may be empty.
 */
void Pinyin4cppDictTrie::Init(const string &single_word_dict_path, const string &words_dict_paths, string dat_cache_path) {
    const string dict_list = single_word_dict_path + "|" + words_dict_paths;
    size_t file_size_sum = 0;
    const string md5 = CalcFileListMD5(dict_list, file_size_sum);
    m_TotalDictSize_ = file_size_sum;

    if (dat_cache_path.empty()) {
        // No location given: fall back to a file in /tmp named after the digest.
        dat_cache_path = "/tmp/" + md5 + ".dat_cache";
    }
    qDebug() << "#####Pinyin Dict path:" << dat_cache_path.c_str();

    const bool cache_attached = m_DataTrie.InitAttachDat(dat_cache_path, md5);
    if (!cache_attached) {
        // No usable cache: parse the text dictionaries and build one.
        LoadSingleWordDict(single_word_dict_path);
        LoadWordsDict(words_dict_paths);
        bool build_ret = m_DataTrie.InitBuildDat(m_StaticNodeInfos, dat_cache_path, md5);
        assert(build_ret);

        // The parsed entries are only needed for building; hand the memory back.
        m_StaticNodeInfos.clear();
        malloc_trim(0);
    }
}
void Pinyin4cppDictTrie::LoadSingleWordDict(const string &filePath) {
ifstream ifs(filePath.c_str());
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#")) {
continue;
}
limonp::Split(line, buf, ":");
assert(buf.size() == SINGLE_WORD_DICT_COLUMN_NUM);
if (m_StaticNodeInfos.find(buf[2]) != m_StaticNodeInfos.end()) {
vector<string> tmp;
bool isfind(false);
limonp::Split(m_StaticNodeInfos[buf[2]], tmp, ",");
for (auto &onePinyin:tmp) {
if (onePinyin == buf[1]) {
isfind = true;
break;
}
}
if (!isfind) {
m_StaticNodeInfos[buf[2]] += ("," + buf[2]);
}
} else {
m_StaticNodeInfos[buf[2]] = buf[1];
}
}
}
void Pinyin4cppDictTrie::LoadWordsDict(const string &filePath) {
ifstream ifs(filePath.c_str());
string line;
vector<string> buf;
for (; getline(ifs, line);) {
if (limonp::StartsWith(line, "#")) {
continue;
}
limonp::Split(line, buf, ":");
assert(buf.size() == WORDS_DICT_COLUMN_NUM);
if (m_StaticNodeInfos.find(buf[0]) != m_StaticNodeInfos.end()) {
vector<string> tmp;
bool isfind(false);
limonp::Split(m_StaticNodeInfos[buf[0]], tmp, "/");
for (auto &onePinyin:tmp) {
if (onePinyin == buf[1]) {
isfind = true;
break;
}
}
if (!isfind) {
m_StaticNodeInfos[buf[0]] += ("/" + buf[1]);
}
} else {
m_StaticNodeInfos[buf[0]] = buf[1];
}
}
}
/**
 * Computes one MD5 digest over the contents of several files.
 *
 * @param files_list    file paths separated by '|' or ';'.
 * @param file_size_sum receives the summed size of all files that were hashed.
 * @return the digest as text (md5.digestChars).
 *
 * Files that cannot be opened, and files whose size is not positive, are
 * left out of both the digest and the size sum.
 */
string CalcFileListMD5(const string &files_list, size_t &file_size_sum) {
    limonp::MD5 md5;
    file_size_sum = 0;

    const auto files = limonp::Split(files_list, "|;");
    for (auto const &local_path : files) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd >= 0) {
            // Seeking to the end yields the file size.
            auto const len = lseek(fd, 0, SEEK_END);
            if (len > 0) {
                void *addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                assert(MAP_FAILED != addr);

                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;

                munmap(addr, len);
            }
            close(fd);
        }
    }

    md5.Final();
    return string(md5.digestChars);
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef PINYIN4cpp_DICTTRIE_H
#define PINYIN4cpp_DICTTRIE_H
#include "pinyin4cpp_dataTrie.h"
using namespace std;
const size_t SINGLE_WORD_DICT_COLUMN_NUM = 3;
const size_t WORDS_DICT_COLUMN_NUM = 2;
// Pinyin dictionary: parses the text dictionaries, keeps them in an on-disk
// cache and answers lookups through a Pinyin4cppDataTrie.
class Pinyin4cppDictTrie {
public:
// Loads (or builds) the cache for the two dictionary files right away.
Pinyin4cppDictTrie(const string& single_word_dict_path, const string& words_dict_paths, const string & dat_cache_path = "");
~Pinyin4cppDictTrie() {}
// Returns the value stored for `word`, as reported by the data trie.
string Find(const string &word) const;
// True when Find(word) is non-empty.
bool Contains(string &word);
// Classifies `word` by whether its stored value contains a ','; see the
// definition for the exact return convention.
bool IsMultiTone(const string &word);
// Summed byte size of the dictionary files, recorded during Init().
size_t GetTotalDictSize() const;
private:
// Attaches to the cache file, building it from the dictionaries when needed.
void Init(const string& single_word_dict_path, const string& words_dict_paths, string dat_cache_path);
// Parse one dictionary file each into m_StaticNodeInfos.
void LoadSingleWordDict(const string& filePath);
void LoadWordsDict(const string& filePath);
private:
map<string, string> m_StaticNodeInfos; // parsed key -> value pairs; cleared once the cache is built
size_t m_TotalDictSize_ = 0;
Pinyin4cppDataTrie m_DataTrie;
};
// Computes one MD5 digest over the files named in `files_list` ('|' or ';'
// separated) and reports their summed size through `file_size_sum`.
// NOTE(review): declared inline although no definition is given in this
// header - a translation unit that calls it without seeing the definition
// will not link; confirm this is intentional.
inline string CalcFileListMD5(const string & files_list, size_t & file_size_sum);
#endif //PINYIN4cpp_DICTTRIE_H

View File

@ -1,55 +0,0 @@
#include "pinyinmanager.h"
#include <mutex>
PinYinManager * PinYinManager::g_pinYinManager = nullptr;
std::once_flag g_singleFlag;
/**
 * Returns the process-wide PinYinManager.
 *
 * The instance is created on the first call; call_once with g_singleFlag
 * makes sure the creation runs a single time.
 */
PinYinManager * PinYinManager::getInstance()
{
    auto createInstance = []() {
        g_pinYinManager = new PinYinManager;
    };
    call_once(g_singleFlag, createInstance);
    return g_pinYinManager;
}
/// Forwards to PinYinTrie::contains() and returns its answer for `word`.
bool PinYinManager::contains(string &word)
{
    const bool known = m_pinYinTrie->contains(word);
    return known;
}
/// Forwards to PinYinTrie::isMultiTone() and returns its answer for `word`
/// (overload taking the word by reference).
bool PinYinManager::isMultiTon(string &word)
{
    const bool multiTone = m_pinYinTrie->isMultiTone(word);
    return multiTone;
}
/// Forwards to PinYinTrie::isMultiTone() and returns its answer for `word`
/// (overload taking the word by value).
bool PinYinManager::isMultiTon(string word)
{
    const bool multiTone = m_pinYinTrie->isMultiTone(word);
    return multiTone;
}
/**
 * Collects the pinyin readings of `word`.
 *
 * @param word    text to look up.
 * @param results cleared first, then filled with the readings found.
 * @return 0 when the multi-tone or the single-tone lookup succeeded,
 *         -1 when both reported -1.
 */
int PinYinManager::getResults(string word, QStringList &results)
{
    results.clear();

    // The multi-tone lookup fills `results` itself.
    if (m_pinYinTrie->getMultiTonResults(word, results) != -1) {
        return 0;
    }

    // Otherwise fall back to the single-tone lookup.
    QString singleReading;
    if (m_pinYinTrie->getSingleTonResult(word, singleReading) == -1) {
        return -1;
    }
    results.append(singleReading);
    return 0;
}
/// Creates the underlying PinYinTrie from the installed tone-less pinyin dictionary.
PinYinManager::PinYinManager()
{
    const char * const dictionaryPath = "/usr/share/ukui-search/res/dict/pinyinWithoutTone.txt";
    m_pinYinTrie = new cppjieba::PinYinTrie(dictionaryPath);
}
/// Destroys the owned PinYinTrie and resets the pointer.
PinYinManager::~PinYinManager()
{
    // Deleting a null pointer is a no-op, so no separate check is needed.
    delete m_pinYinTrie;
    m_pinYinTrie = nullptr;
}

View File

@ -1,33 +0,0 @@
#ifndef PINYINMANAGER_H
#define PINYINMANAGER_H
#include <QtCore/qglobal.h>
#include "cppjieba/PinYinTrie.hpp"
#define PINYINMANAGER_EXPORT Q_DECL_IMPORT
using namespace std;
// Singleton front end for hanzi -> pinyin lookups backed by cppjieba::PinYinTrie.
class PINYINMANAGER_EXPORT PinYinManager
{
public:
// Returns the single shared instance, creating it on first use.
static PinYinManager * getInstance();
public:
// Forwards to PinYinTrie::contains().
bool contains(string &word);
// Both overloads forward to PinYinTrie::isMultiTone().
bool isMultiTon(string &word);
bool isMultiTon(string word);
// Clears `results`, fills it with the readings of `word`; 0 on success, -1 when none were found.
int getResults(string word, QStringList &results);
protected:
// Construction and destruction are not public; use getInstance().
PinYinManager();
~PinYinManager();
private:
static PinYinManager *g_pinYinManager;
cppjieba::PinYinTrie *m_pinYinTrie = nullptr; // owned; released in the destructor
};
};
#endif // PINYINMANAGER_H

View File

@ -0,0 +1,682 @@
// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
// $Id: cedar.h 1938 2022-03-17 16:22:30Z ynaga $
// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
#ifndef CEDAR_H
#define CEDAR_H
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cassert>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
namespace cedar {
// typedefs
typedef unsigned char uchar;
template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
// dynamic double array
template <typename value_type,
const int NO_VALUE = NaN <value_type>::N1,
const int NO_PATH = NaN <value_type>::N2,
const bool ORDERED = true,
const int MAX_TRIAL = 1,
const size_t NUM_TRACKING_NODES = 0>
class da {
public:
enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH, CEDAR_VALUE_LIMIT = 2147483647 };
typedef value_type result_type;
struct result_pair_type {
value_type value;
size_t length; // prefix length
};
struct result_triple_type { // for predict ()
value_type value;
size_t length; // suffix length
size_t id; // node id of value
};
struct node {
union { int base_; value_type value; }; // negative means prev empty index
int check; // negative means next empty index
node (const int base__ = 0, const int check_ = 0)
: base_ (base__), check (check_) {}
#ifdef USE_REDUCED_TRIE
int base () const { return - (base_ + 1); } // ~ in two's complement system
#else
int base () const { return base_; }
#endif
};
struct ninfo { // x1.5 update speed; +.25 % memory (8n -> 10n)
uchar sibling; // right sibling (= 0 if not exist)
uchar child; // first child
ninfo () : sibling (0), child (0) {}
};
struct block { // a block w/ 256 elements
int prev; // prev block; 3 bytes
int next; // next block; 3 bytes
short num; // # empty elements; 0 - 256
short reject; // minimum # branching failed to locate; soft limit
int trial; // # trial
int ehead; // first empty item
block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
};
da () : tracking_node (), _array (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _no_delete (false), _reject () {
STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index
);
_initialize ();
}
~da () { clear (false); }
size_t capacity () const { return static_cast <size_t> (_capacity); }
size_t size () const { return static_cast <size_t> (_size); }
size_t total_size () const { return sizeof (node) * _size; }
size_t unit_size () const { return sizeof (node); }
size_t nonzero_size () const {
size_t i = 0;
for (int to = 0; to < _size; ++to)
if (_array[to].check >= 0) ++i;
return i;
}
size_t num_keys () const {
size_t i = 0;
for (int to = 0; to < _size; ++to)
#ifdef USE_REDUCED_TRIE
if (_array[to].check >= 0 && _array[to].value >= 0) ++i;
#else
if (_array[to].check >= 0 && _array[_array[to].check].base () == to) ++i;
#endif
return i;
}
// interfance
template <typename T>
T exactMatchSearch (const char* key) const
{ return exactMatchSearch <T> (key, std::strlen (key)); }
template <typename T>
T exactMatchSearch (const char* key, size_t len, size_t from = 0) const {
union { int i; value_type x; } b;
size_t pos = 0;
b.i = _find (key, from, pos, len);
if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
T result;
_set_result (&result, b.x, len, from);
return result;
}
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
{ return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) const {
size_t num = 0;
for (size_t pos = 0; pos < len; ) {
union { int i; value_type x; } b;
b.i = _find (key, from, pos, pos + 1);
if (b.i == CEDAR_NO_VALUE) continue;
if (b.i == CEDAR_NO_PATH) return num;
if (num < result_len) _set_result (&result[num], b.x, pos, from);
++num;
}
return num;
}
// predict key from double array
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
{ return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, size_t from = 0) {
size_t num (0), pos (0), p (0);
if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
union { int i; value_type x; } b;
size_t root = from;
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
if (num < result_len) _set_result (&result[num], b.x, p, from);
++num;
}
return num;
}
void suffix (char* key, size_t len, size_t to) const {
key[len] = '\0';
while (len--) {
const int from = _array[to].check;
key[len]
= static_cast <char> (_array[from].base () ^ static_cast <int> (to));
to = static_cast <size_t> (from);
}
}
value_type traverse (const char* key, size_t& from, size_t& pos) const
{ return traverse (key, from, pos, std::strlen (key)); }
value_type traverse (const char* key, size_t& from, size_t& pos, size_t len) const {
union { int i; value_type x; } b;
b.i = _find (key, from, pos, len);
return b.x;
}
struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
value_type& update (const char* key)
{ return update (key, std::strlen (key)); }
value_type& update (const char* key, size_t len, value_type val = value_type (0))
{ size_t from (0), pos (0); return update (key, from, pos, len, val); }
value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val = value_type (0))
{ empty_callback cf; return update (key, from, pos, len, val, cf); }
template <typename T>
value_type& update (const char* key, size_t& from, size_t& pos, size_t len, value_type val, T& cf) {
if (! len && ! from)
_err (__FILE__, __LINE__, "failed to insert zero-length key\n");
#ifndef USE_FAST_LOAD
if (! _ninfo || ! _block) restore ();
#endif
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
pos < len; ++pos) {
#ifdef USE_REDUCED_TRIE
const value_type val_ = _array[from].value;
if (val_ >= 0 && val_ != CEDAR_VALUE_LIMIT) // always new; correct this!
{ const int to = _follow (from, 0, cf); _array[to].value = val_; }
#endif
from = static_cast <size_t> (_follow (from, key_[pos], cf));
}
#ifdef USE_REDUCED_TRIE
const int to = _array[from].value >= 0 ? static_cast <int> (from) : _follow (from, 0, cf);
if (_array[to].value == CEDAR_VALUE_LIMIT) _array[to].value = 0;
#else
const int to = _follow (from, 0, cf);
#endif
return _array[to].value += val;
}
// easy-going erase () without compression
int erase (const char* key) { return erase (key, std::strlen (key)); }
int erase (const char* key, size_t len, size_t from = 0) {
size_t pos = 0;
const int i = _find (key, from, pos, len);
if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
erase (from);
return 0;
}
void erase (size_t from) {
// _test ();
#ifdef USE_REDUCED_TRIE
int e = _array[from].value >= 0 ? static_cast <int> (from) : _array[from].base () ^ 0;
from = static_cast <size_t> (_array[e].check);
#else
int e = _array[from].base () ^ 0;
#endif
bool flag = false; // have sibling
do {
const node& n = _array[from];
flag = _ninfo[n.base () ^ _ninfo[from].child].sibling;
if (flag) _pop_sibling (from, n.base (), static_cast <uchar> (n.base () ^ e));
_push_enode (e);
e = static_cast <int> (from);
from = static_cast <size_t> (_array[from].check);
} while (! flag);
}
int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
for (size_t i = 0; i < num; ++i)
update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
return 0;
}
template <typename T>
void dump (T* result, const size_t result_len) {
union { int i; value_type x; } b;
size_t num (0), from (0), p (0);
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
if (num < result_len)
_set_result (&result[num++], b.x, p, from);
else
_err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
}
// Write the node array to `fn` (opened with `mode`). With USE_FAST_LOAD the
// block heads, node info and block table are additionally written to
// "<fn>.sbl". Returns 0 on success and -1 when a file cannot be opened, a
// write comes up short, or closing (flushing) a file fails.
int save (const char* fn, const char* mode = "wb") const {
  // _test ();
  FILE* fp = std::fopen (fn, mode);
  if (! fp) return -1;
  const size_t num_nodes = static_cast <size_t> (_size);
  bool ok = std::fwrite (_array, sizeof (node), num_nodes, fp) == num_nodes;
  if (std::fclose (fp) != 0) ok = false; // buffered data may only fail here
  if (! ok) return -1;
#ifdef USE_FAST_LOAD
  const char* const info
    = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
  fp = std::fopen (info, mode);
  delete [] info; // resolve memory leak
  if (! fp) return -1;
  const size_t num_blocks = static_cast <size_t> (_size >> 8);
  ok = std::fwrite (&_bheadF, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadC, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadO, sizeof (int), 1, fp) == 1
    && std::fwrite (_ninfo, sizeof (ninfo), num_nodes, fp) == num_nodes
    && std::fwrite (_block, sizeof (block), num_blocks, fp) == num_blocks;
  if (std::fclose (fp) != 0) ok = false;
  if (! ok) return -1;
#endif
  return 0;
}
// Load a node array previously written by save ().
//   fn     : file to read
//   mode   : fopen mode
//   offset : byte position in the file where the array starts
//   size_  : byte position where the array ends; 0 means "end of file"
// Returns 0 on success, -1 on any I/O failure. The file handle is closed on
// every return path. The current contents are discarded (clear (false)) once
// the size checks have passed. Allocation failure is reported through _err ().
int open (const char* fn, const char* mode = "rb",
          const size_t offset = 0, size_t size_ = 0) {
  FILE* fp = std::fopen (fn, mode);
  if (! fp) return -1;
  // get size
  if (! size_) {
    if (std::fseek (fp, 0, SEEK_END) != 0) { std::fclose (fp); return -1; }
    const long end_pos = std::ftell (fp);
    if (end_pos < 0 || std::fseek (fp, 0, SEEK_SET) != 0)
      { std::fclose (fp); return -1; }
    size_ = static_cast <size_t> (end_pos);
  }
  if (size_ <= offset) { std::fclose (fp); return -1; }
  // set array
  clear (false);
  size_ = (size_ - offset) / sizeof (node);
  if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0)
    { std::fclose (fp); return -1; }
  _array = static_cast <node*> (std::malloc (sizeof (node) * size_));
#ifdef USE_FAST_LOAD
  _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
  _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
  if (! _array || ! _ninfo || ! _block)
#else
  if (! _array)
#endif
    _err (__FILE__, __LINE__, "memory allocation failed\n");
  if (size_ != std::fread (_array, sizeof (node), size_, fp))
    { std::fclose (fp); return -1; }
  std::fclose (fp);
  _size = static_cast <int> (size_);
#ifdef USE_FAST_LOAD
  const char* const info
    = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
  fp = std::fopen (info, mode);
  delete [] info; // resolve memory leak
  if (! fp) return -1;
  if (std::fread (&_bheadF, sizeof (int), 1, fp) != 1 ||
      std::fread (&_bheadC, sizeof (int), 1, fp) != 1 ||
      std::fread (&_bheadO, sizeof (int), 1, fp) != 1 ||
      size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
      size_ != std::fread (_block, sizeof (block), size_ >> 8, fp) << 8)
    { std::fclose (fp); return -1; }
  std::fclose (fp);
  _capacity = _size;
#endif
  return 0;
}
#ifndef USE_FAST_LOAD
// Rebuild whichever of the block table and node info is missing, then treat
// the loaded size as the capacity so the trie can be updated.
void restore () {
  if (_block == 0) _restore_block ();
  if (_ninfo == 0) _restore_ninfo ();
  _capacity = _size;
}
#endif
// ad-hoc: adopt an externally provided node array of `size_` nodes. Current
// contents are dropped first; _no_delete is set so clear () will not free `p`.
void set_array (void* p, size_t size_ = 0) {
  clear (false);
  _no_delete = true;
  _size = static_cast <int> (size_);
  _array = static_cast <node*> (p);
}
// Read-only access to the node array.
const void* array () const {
  return _array;
}
// Release all storage and zero the bookkeeping. The node array is left alone
// when _no_delete is set. With `reuse` the first block is re-created via
// _initialize () so the trie is immediately usable again.
void clear (const bool reuse = true) {
  if (! _no_delete && _array) std::free (_array);
  if (_ninfo) std::free (_ninfo);
  if (_block) std::free (_block);
  _array = 0;
  _ninfo = 0;
  _block = 0;
  _bheadF = 0; _bheadC = 0; _bheadO = 0; // *
  _capacity = 0;
  _size = 0;
  if (reuse) _initialize ();
  _no_delete = false;
}
// return the first child for a tree rooted by a given node
// Descends from `from` along first-child links, adding one to `len` per edge,
// until a node whose first-child label is 0 is reached. `from` is left at
// that node. Returns the int stored for it, or CEDAR_NO_PATH when the search
// starts at node 0 and there is nothing to descend into.
int begin (size_t& from, size_t& len) {
#ifndef USE_FAST_LOAD
if (! _ninfo) _restore_ninfo (); // child/sibling links are rebuilt on demand
#endif
int base = _array[from].base ();
uchar c = _ninfo[from].child;
// at node 0 the descent starts from the right sibling of the first child
if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
return CEDAR_NO_PATH; // no entry
for (; c; ++len) { // follow first-child labels until the label is 0
from = static_cast <size_t> (_array[from].base ()) ^ c;
c = _ninfo[from].child;
}
#ifdef USE_REDUCED_TRIE
if (_array[from].value >= 0) return _array[from].value; // value kept in the node itself
#endif
// c is 0 here: read base_ of the label-0 child of `from`
return _array[_array[from].base () ^ c].base_;
}
// return the next child if any
// Continues an enumeration started with begin (): `from`/`len` are the
// position returned by the previous call and `root` is the node the
// enumeration must not climb above. Returns the next stored int, or
// CEDAR_NO_PATH when no further sibling exists below `root`.
int next (size_t& from, size_t& len, const size_t root = 0) {
uchar c = 0;
#ifdef USE_REDUCED_TRIE
if (_array[from].value < 0)
#endif
c = _ninfo[_array[from].base () ^ 0].sibling; // sibling of the label-0 child of `from`
// climb towards `root` until some node on the way has a right sibling
for (; ! c && from != root; --len) {
c = _ninfo[from].sibling;
from = static_cast <size_t> (_array[from].check);
}
// step into that sibling (one edge, hence ++len) and descend with begin ()
return c ?
begin (from = static_cast <size_t> (_array[from].base ()) ^ c, ++len) :
CEDAR_NO_PATH;
}
// test the validity of double array for debug
// Walks the child list of `from` (first child, then sibling links) and asserts,
// for every non-root `from`, that each child's check points back to `from`.
// Recurses into children with a non-zero label whose value field is negative.
void test (const size_t from = 0) const {
const int base = _array[from].base ();
uchar c = _ninfo[from].child;
do {
if (from) assert (_array[base ^ c].check == static_cast <int> (from));
if (c && _array[base ^ c].value < 0) // correct this
test (static_cast <size_t> (base ^ c));
} while ((c = _ninfo[base ^ c].sibling)); // a sibling label of 0 ends the list
}
// node ids kept up to date by _resolve () when nodes move; zero-terminated
size_t tracking_node[NUM_TRACKING_NODES + 1];
private:
// currently disabled; implement these if you need
da (const da&);
da& operator= (const da&);
node* _array; // the double array itself
ninfo* _ninfo; // per-node first-child / sibling labels
block* _block; // per-256-node block bookkeeping
int _bheadF; // first block of Full; 0
int _bheadC; // first block of Closed; 0 if no Closed
int _bheadO; // first block of Open; 0 if no Open
int _capacity; // number of nodes allocated
int _size; // number of nodes in use (grows by 256 per block)
int _no_delete; // set by set_array (): clear () must not free _array
short _reject[257]; // indexed by a block's empty-node count; see _find_place ()
//
// Report a fatal error on stderr as "cedar: <file> [<line>]: <msg>" and
// terminate the process with exit status 1.
static void _err (const char* fn, const int ln, const char* msg) {
  std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
  std::exit (1);
}
// Resize `p` to hold `size_n` elements of T and default-initialise the
// elements in [size_p, size_n). On allocation failure the old buffer is freed
// and _err () terminates the process.
template <typename T>
static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
  const size_t bytes = sizeof (T) * static_cast <size_t> (size_n);
  void* const grown = std::realloc (p, bytes);
  if (! grown) {
    std::free (p);
    _err (__FILE__, __LINE__, "memory reallocation failed\n");
  }
  p = static_cast <T*> (grown);
  static const T T0 = T ();
  T* cur = p + size_p;
  T* const stop = p + size_n;
  while (cur != stop) *cur++ = T0;
}
// Allocate and set up the first 256-node block so the trie is ready for use.
void _initialize () { // initialize the first special block
_realloc_array (_array, 256, 256); // size_p == size_n: nothing is default-filled here
_realloc_array (_ninfo, 256);
_realloc_array (_block, 1);
#ifdef USE_REDUCED_TRIE
_array[0] = node (-1, -1);
#else
_array[0] = node (0, -1);
#endif
// nodes 1..255 form a circular doubly-linked ring of empty nodes: the first
// field holds the negated previous index, check the negated next index
for (int i = 1; i < 256; ++i)
_array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
_block[0].ehead = 1; // bug fix for erase
_capacity = _size = 256;
for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
for (short i = 0; i <= 256; ++i) _reject[i] = i + 1;
}
// follow/create edge
// Returns the node reached from `from` via `label`, creating it when absent.
// `from` is taken by reference because _resolve () may relocate it; `cf` is
// the callback handed on to _resolve ().
template <typename T>
int _follow (size_t& from, const uchar& label, T& cf) {
int to = 0;
const int base = _array[from].base ();
// no base yet, or the target slot is an empty node: take an empty node
if (base < 0 || _array[to = base ^ label].check < 0) {
to = _pop_enode (base, label, static_cast <int> (from));
_push_sibling (from, to ^ label, label, base >= 0);
} else if (_array[to].check != static_cast <int> (from)) // slot belongs to another parent
to = _resolve (from, base, label, cf);
return to;
}
// find key from double array
// Walks key[pos..len) from node `from`, advancing both `from` and `pos` for
// every matched byte. Returns CEDAR_NO_PATH when a byte has no edge,
// CEDAR_NO_VALUE when the whole range matched but no label-0 child exists,
// and otherwise the int stored for the key.
int _find (const char* key, size_t& from, size_t& pos, const size_t len) const {
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
pos < len; ) { // follow link
#ifdef USE_REDUCED_TRIE
if (_array[from].value >= 0) return CEDAR_NO_PATH;
#endif
size_t to = static_cast <size_t> (_array[from].base ()); to ^= key_[pos];
if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
++pos;
from = to;
}
#ifdef USE_REDUCED_TRIE
if (_array[from].value >= 0) // get value from leaf; only allow integer key
return _array[from].value;
#endif
const node n = _array[_array[from].base () ^ 0]; // label-0 child slot
if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
return n.base_;
}
#ifndef USE_FAST_LOAD
// Rebuild the child/sibling table from the node array alone: every used node
// with a non-zero label is registered with its parent through _push_sibling ().
void _restore_ninfo () {
_realloc_array (_ninfo, _size);
for (int to = 0; to < _size; ++to) {
const int from = _array[to].check; // parent of `to`
if (from < 0) continue; // skip empty node
const int base = _array[from].base ();
if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
_push_sibling (static_cast <size_t> (from), base, label,
! from || _ninfo[from].child || _array[base ^ 0].check == from);
}
}
// Rebuild the block table from the node array: count the empty nodes
// (check < 0) of every 256-node block, remember the first one as ehead, and
// push the block onto the Full (0 empty), Closed (1 empty) or Open list.
void _restore_block () {
_realloc_array (_block, _size >> 8);
_bheadF = _bheadC = _bheadO = 0;
for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
block& b = _block[bi];
b.num = 0;
for (; e < (bi << 8) + 256; ++e)
if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
_push_block (bi, head_out, ! head_out && b.num);
}
}
#endif
// Store a lookup result; which fields are filled depends on the result type.
// result_type: the value only.
void _set_result (result_type* x, value_type r, size_t = 0, size_t = 0) const {
  *x = r;
}
// result_pair_type: value and length.
void _set_result (result_pair_type* x, value_type r, size_t l, size_t = 0) const {
  x->length = l;
  x->value = r;
}
// result_triple_type: value, length and node id.
void _set_result (result_triple_type* x, value_type r, size_t l, size_t from) const {
  x->id = from;
  x->length = l;
  x->value = r;
}
void _pop_block (const int bi, int& head_in, const bool last) {
if (last) { // last one poped; Closed or Open
head_in = 0;
} else {
const block& b = _block[bi];
_block[b.prev].next = b.next;
_block[b.next].prev = b.prev;
if (bi == head_in) head_in = b.next;
}
}
// Link block `bi` into the circular doubly-linked block list headed by
// `head_out`. `empty` tells that the list currently holds no block. The
// pushed block becomes the new head.
void _push_block (const int bi, int& head_out, const bool empty) {
block& b = _block[bi];
if (empty) { // the destination is empty
head_out = b.prev = b.next = bi; // single-element ring
} else { // use most recently pushed
int& tail_out = _block[head_out].prev; // prev link of the current head
b.prev = tail_out;
b.next = head_out;
// old tail's next, old head's prev and the list head all become bi
head_out = tail_out = _block[tail_out].next = bi;
}
}
// Append a fresh block of 256 empty nodes at index _size, growing the arrays
// first when capacity is exhausted. Returns the index of the new block.
int _add_block () {
if (_size == _capacity) { // allocate memory if needed
#ifdef USE_EXACT_FIT
_capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
#else
_capacity += _capacity; // double
#endif
_realloc_array (_array, _capacity, _capacity);
_realloc_array (_ninfo, _capacity, _size);
_realloc_array (_block, _capacity >> 8, _size >> 8);
}
_block[_size >> 8].ehead = _size;
// link the 256 new nodes into a circular ring of empty nodes
// (negated previous index, negated next index)
_array[_size] = node (- (_size + 255), - (_size + 1));
for (int i = _size + 1; i < _size + 255; ++i)
_array[i] = node (-(i - 1), -(i + 1));
_array[_size + 255] = node (- (_size + 254), -_size);
_push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
_size += 256;
return (_size >> 8) - 1;
}
// transfer block from one start w/ head_in to one start w/ head_out
void _transfer_block (const int bi, int& head_in, int& head_out) {
_pop_block (bi, head_in, bi == _block[bi].next);
_push_block (bi, head_out, ! head_out && _block[bi].num);
}
// pop empty node from block; never transfer the special block (bi = 0)
// Takes an empty node for the edge `label` out of `from` and returns its
// index. With base < 0 the slot comes from _find_place () and the base of
// `from` is set accordingly; otherwise the slot is base ^ label.
int _pop_enode (const int base, const uchar label, const int from) {
const int e = base < 0 ? _find_place () : base ^ label;
const int bi = e >> 8; // block containing e
node& n = _array[e];
block& b = _block[bi];
if (--b.num == 0) {
if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
} else { // release empty node from empty ring
_array[-n.base_].check = n.check;
_array[-n.check].base_ = n.base_;
if (e == b.ehead) b.ehead = -n.check; // set ehead
if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
_transfer_block (bi, _bheadO, _bheadC);
}
// initialize the released node
#ifdef USE_REDUCED_TRIE
n.value = CEDAR_VALUE_LIMIT; n.check = from;
if (base < 0) _array[from].base_ = - (e ^ label) - 1;
#else
// note: `n.check = from` below is unconditional; only the first two
// statements belong to the if/else
if (label) n.base_ = -1; else n.value = value_type (0); n.check = from;
if (base < 0) _array[from].base_ = e ^ label;
#endif
return e;
}
// push empty node into empty ring
// Returns node `e` to the empty ring of its block, moves the block between
// the Full/Closed/Open lists as its empty count changes, and resets the
// node's child/sibling info.
void _push_enode (const int e) {
const int bi = e >> 8;
block& b = _block[bi];
if (++b.num == 1) { // Full to Closed
b.ehead = e;
_array[e] = node (-e, -e); // ring of one: prev and next are e itself
if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
} else {
// insert e between the ring head and the head's successor
const int prev = b.ehead;
const int next = -_array[prev].check;
_array[e] = node (-prev, -next);
_array[prev].check = _array[next].base_ = -e;
if (b.num == 2 || b.trial == MAX_TRIAL) // Closed to Open
if (bi) _transfer_block (bi, _bheadC, _bheadO);
b.trial = 0;
}
if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
_ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
}
// push label to from's child
// Inserts `label` into the child list of `from` (children live at
// base ^ label). `c` points at the link that will receive the new label:
// initially from's first-child slot; when `flag` is set and the head must be
// skipped it is advanced along the sibling links. With ORDERED the walk stops
// in front of the first label that is not smaller than `label`.
void _push_sibling (const size_t from, const int base, const uchar label, const bool flag = true) {
uchar* c = &_ninfo[from].child;
if (flag && (ORDERED ? label > *c : ! *c))
do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
// splice: the new node inherits the old link target, the link now names label
_ninfo[base ^ label].sibling = *c, *c = label;
}
// pop label from from's child
void _pop_sibling (const size_t from, const int base, const uchar label) {
uchar* c = &_ninfo[from].child;
while (*c != label) c = &_ninfo[base ^ *c].sibling;
*c = _ninfo[base ^ label].sibling;
}
// check whether to replace branching w/ the newly added node
// Advances along both sibling lists in lock step, starting from the first
// children c_n (under base_n) and c_p (under base_p). Returns false as soon
// as the base_p list runs out, and true when the base_n list runs out while
// the base_p list still had a further sibling.
bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
do if (! (c_p = _ninfo[base_p ^ c_p].sibling)) return false;
while ((c_n = _ninfo[base_n ^ c_n].sibling));
return true;
}
// enumerate (equal to or more than one) child nodes
// Writes the child labels of the list starting at `c` (children at base ^ c)
// into the buffer at `p`, optionally adding `label` (-1 = none); with ORDERED
// it is placed after all smaller non-zero labels. Returns a pointer to the
// LAST label written, not one past it.
uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
--p; // so that every store below can use pre-increment
if (! c) { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
if (ORDERED)
while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
if (label != -1) *++p = static_cast <uchar> (label);
while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
return p;
}
// explore new block to settle down
// Single-node variant: use the first empty node of the first Closed block,
// else of the first Open block, else the first node of a newly added block.
int _find_place () {
  const int bi = _bheadC ? _bheadC : _bheadO;
  if (bi) return _block[bi].ehead;
  return _add_block () << 8;
}
// Multi-node variant: find an empty node e such that, with base = e ^ *first,
// every label in [first, last] maps to an empty node. Only Open blocks are
// searched; when none fits, a new block is added and its first node returned.
int _find_place (const uchar* const first, const uchar* const last) {
if (int bi = _bheadO) {
const int bz = _block[_bheadO].prev; // last block of the Open ring
const short nc = static_cast <short> (last - first + 1); // number of labels to place
while (1) { // set candidate block
block& b = _block[bi];
if (b.num >= nc && nc < b.reject) // explore configuration
for (int e = b.ehead;;) { // walk the empty ring of this block
const int base = e ^ *first;
for (const uchar* p = first; _array[base ^ *++p].check < 0; )
if (p == last) return b.ehead = e; // no conflict
if ((e = -_array[e].check) == b.ehead) break;
}
// this block could not host nc labels: remember that to skip it sooner
b.reject = nc;
if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
const int bi_ = b.next; // fetched before the block may be moved
if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
if (bi == bz) break;
bi = bi_;
};
}
return _add_block () << 8;
}
// resolve conflict on base_n ^ label_n = base_p ^ label_p
// The slot wanted for the new edge (from_n, label_n) is owned by another
// parent from_p. One of the two sibling groups is moved to a fresh base
// (chosen by _consult ()), and the index of the node for (from_n, label_n) is
// returned. `from_n` is updated when the parent itself is among the moved
// nodes. `cf (old, new)` is invoked for every relocated node.
template <typename T>
int _resolve (size_t& from_n, const int base_n, const uchar label_n, T& cf) {
// examine siblings of conflicted nodes
const int to_pn = base_n ^ label_n; // the contested slot
const int from_p = _array[to_pn].check; // its current owner
const int base_p = _array[from_p].base ();
const bool flag // whether to replace siblings of newly added
= _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
uchar child[256];
uchar* const first = &child[0];
// labels of the group to move; `last` points AT the final label
uchar* const last =
flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
: _set_child (first, base_p, _ninfo[from_p].child);
const int base =
(first == last ? _find_place () : _find_place (first, last)) ^ *first;
// replace & modify empty list
const int from = flag ? static_cast <int> (from_n) : from_p;
const int base_ = flag ? base_n : base_p; // old base of the moved group
if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
#ifdef USE_REDUCED_TRIE
_array[from].base_ = -base - 1; // new base
#else
_array[from].base_ = base; // new base
#endif
for (const uchar* p = first; p <= last; ++p) { // to_ => to
const int to = _pop_enode (base, *p, from);
const int to_ = base_ ^ *p;
_ninfo[to].sibling = (p == last ? 0 : *(p + 1));
if (flag && to_ == to_pn) continue; // skip newcomer (no child)
cf (to_, to); // user-defined callback function to handle moved nodes
node& n = _array[to];
node& n_ = _array[to_];
#ifdef USE_REDUCED_TRIE
if ((n.base_ = n_.base_) < 0 && *p) // copy base; bug fix
#else
if ((n.base_ = n_.base_) > 0 && *p) // copy base; bug fix
#endif
{
uchar c = _ninfo[to].child = _ninfo[to_].child;
do _array[n.base () ^ c].check = to; // adjust grand son's check
while ((c = _ninfo[n.base () ^ c].sibling));
}
if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
from_n = static_cast <size_t> (to); // bug fix
if (! flag && to_ == to_pn) { // the address is immediately used
_push_sibling (from_n, to_pn ^ label_n, label_n);
_ninfo[to_].child = 0; // remember to reset child
#ifdef USE_REDUCED_TRIE
n_.value = CEDAR_VALUE_LIMIT;
#else
if (label_n) n_.base_ = -1; else n_.value = value_type (0);
#endif
n_.check = static_cast <int> (from_n);
} else
_push_enode (to_); // old slot becomes an empty node again
if (NUM_TRACKING_NODES) // keep the traversed node updated
for (size_t j = 0; tracking_node[j] != 0; ++j)
if (tracking_node[j] == static_cast <size_t> (to_))
{ tracking_node[j] = static_cast <size_t> (to); break; }
}
return flag ? base ^ label_n : to_pn;
}
};
}
#endif

View File

@ -0,0 +1,834 @@
// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
// $Id: cedarpp.h 1916 2017-07-12 07:30:56Z ynaga $
// Copyright (c) 2009-2015 Naoki Yoshinaga <ynaga@tkl.iis.u-tokyo.ac.jp>
#ifndef CEDAR_H
#define CEDAR_H
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <climits>
#include <cassert>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
// Compile-time assertion: declares a char-array typedef named `msg` whose
// size is -1 (ill-formed) when `e` is false, so `msg` shows up in the error.
#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
namespace cedar {
// typedefs
// npos_t is the node-position type used by the public interface; both masks
// below address 32-bit halves of it.
// NOTE(review): LONG_BIT is not guaranteed to be defined by <climits>; when it
// is not, the #else branch is taken.
#if LONG_BIT == 64
typedef unsigned long npos_t; // possibly compatible with size_t
#else
typedef unsigned long long npos_t;
#endif
typedef unsigned char uchar;
// low 32 bits of an npos_t
static const npos_t TAIL_OFFSET_MASK = static_cast <npos_t> (0xffffffff);
// high 32 bits of an npos_t
static const npos_t NODE_INDEX_MASK = static_cast <npos_t> (0xffffffff) << 32;
// Default sentinel ints for "no value" (N1) and "no path" (N2).
template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
// float values use distinct bit patterns instead (presumably IEEE-754 NaN encodings).
template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
// dynamic double array
template <typename value_type,
const int NO_VALUE = NaN <value_type>::N1,
const int NO_PATH = NaN <value_type>::N2,
const bool ORDERED = true,
const int MAX_TRIAL = 1,
const size_t NUM_TRACKING_NODES = 0>
class da {
public:
// Sentinel ints returned by lookups; taken from the template parameters.
enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH };
// Result flavours accepted by the search functions (see _set_result ()).
typedef value_type result_type; // value only
struct result_pair_type { // value plus length
value_type value;
size_t length; // prefix length
};
struct result_triple_type { // for predict ()
value_type value;
size_t length; // suffix length
npos_t id; // node id of value
};
// One element of the double array.
struct node {
union { int base; value_type value; }; // negative means prev empty index
int check; // negative means next empty index
node (const int base_ = 0, const int check_ = 0)
: base (base_), check (check_) {}
};
// Child/sibling labels kept next to each node.
struct ninfo { // x1.5 update speed; +.25 % memory (8n -> 10n)
uchar sibling; // right sibling (= 0 if not exist)
uchar child; // first child
ninfo () : sibling (0), child (0) {}
};
// Bookkeeping for one group of 256 consecutive nodes.
struct block { // a block w/ 256 elements
int prev; // prev block; 3 bytes
int next; // next block; 3 bytes
short num; // # empty elements; 0 - 256
short reject; // minimum # branching failed to locate; soft limit
int trial; // # trial
int ehead; // first empty item
// a fresh block starts with all 256 elements empty
block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
};
// Construct an empty trie: zero every member, check at compile time that
// value_type fits into an int, then set up the initial state via _initialize ().
da () : tracking_node (), _array (0), _tail (0), _tail0 (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _quota (0), _quota0 (0), _no_delete (false), _reject () {
// the typedef STATIC_ASSERT expands to is never used; silence that warning
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index_to_trie
);
// NOTE(review): this sets the diagnostic to "warning" instead of restoring
// whatever state was active before the "ignored" pragma above
#pragma GCC diagnostic warning "-Wunused-local-typedefs"
_initialize ();
}
// Release everything without re-initialising.
~da () {
  clear (false);
}
// Number of nodes allocated.
size_t capacity () const {
  return static_cast <size_t> (_capacity);
}
// Number of nodes in use.
size_t size () const {
  return static_cast <size_t> (_size);
}
// Current tail length, read through _length.
size_t length () const {
  return static_cast <size_t> (*_length);
}
// Bytes occupied by the nodes in use.
size_t total_size () const {
  return sizeof (node) * _size;
}
// Bytes per node.
size_t unit_size () const {
  return sizeof (node);
}
// Count the nodes that are in use, i.e. whose check is not negative.
size_t nonzero_size () const {
  size_t used = 0;
  for (int to = _size; to-- > 0; )
    used += (_array[to].check >= 0) ? 1 : 0;
  return used;
}
// Sum the tail bytes referenced from the trie: for every used node with a
// negative base that is not located at its parent's base, add the length of
// its tail string plus one terminator and one value_type.
size_t nonzero_length () const {
  size_t chars = 0;
  size_t tails = 0;
  for (int to = 0; to < _size; ++to) {
    const node& n = _array[to];
    if (n.check < 0) continue;
    if (_array[n.check].base == to) continue;
    if (n.base >= 0) continue;
    ++tails;
    chars += std::strlen (&_tail[-n.base]);
  }
  return chars + tails * (1 + sizeof (value_type));
}
// Count stored keys: used nodes that either sit at their parent's base or
// have a negative base.
size_t num_keys () const {
  size_t count = 0;
  for (int to = 0; to < _size; ++to) {
    const node& n = _array[to];
    if (n.check < 0) continue;
    if (_array[n.check].base == to || n.base < 0) ++count;
  }
  return count;
}
// interface
// Look up a NUL-terminated key; see the length-taking overload.
template <typename T>
T exactMatchSearch (const char* key) const {
  const size_t key_len = std::strlen (key);
  return exactMatchSearch <T> (key, key_len);
}
// Look up key[0..len) starting at node `from`. The result carries
// CEDAR_NO_VALUE both when the key has no value and when there is no path.
template <typename T>
T exactMatchSearch (const char* key, size_t len, npos_t from = 0) const {
  size_t consumed = 0;
  union { int i; value_type x; } found;
  found.i = _find (key, from, consumed, len);
  if (found.i == CEDAR_NO_PATH) found.i = CEDAR_NO_VALUE;
  T result;
  _set_result (&result, found.x, len, from);
  return result;
}
// Find every stored key that is a prefix of a NUL-terminated `key`.
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const {
  const size_t key_len = std::strlen (key);
  return commonPrefixSearch (key, result, result_len, key_len);
}
// Find every stored key that is a prefix of key[0..len), advancing one byte
// at a time from node `from`. At most `result_len` results are stored; the
// return value is the total number of matches found.
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) const {
  size_t num = 0;
  size_t pos = 0;
  while (pos < len) {
    union { int i; value_type x; } b;
    const size_t stop = pos + 1;
    b.i = _find (key, from, pos, stop);
    if (b.i == CEDAR_NO_VALUE) continue;
    if (b.i == CEDAR_NO_PATH) return num;
    if (num < result_len) _set_result (&result[num], b.x, pos, from);
    ++num;
  }
  return num;
}
// predict key from double array
// NUL-terminated convenience overload.
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len) {
  const size_t key_len = std::strlen (key);
  return commonPrefixPredict (key, result, result_len, key_len);
}
// Enumerate every entry below the position reached by key[0..len). At most
// `result_len` results are stored; the return value is the number of entries
// found (0 when the prefix has no path).
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) {
  size_t num (0), pos (0), p (0);
  if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
  const npos_t root = from;
  union { int i; value_type x; } b;
  b.i = begin (from, p);
  while (b.i != CEDAR_NO_PATH) {
    if (num < result_len) _set_result (&result[num], b.x, p, from);
    ++num;
    b.i = next (from, p, root);
  }
  return num;
}
// Reconstruct the `len` bytes that lead to position `to` and write them,
// NUL-terminated, into `key` (which must hold len + 1 bytes). `to` carries a
// node index in its low 32 bits and, when non-zero, a tail offset in its
// high 32 bits.
void suffix (char* key, size_t len, npos_t to) const {
key[len] = '\0';
if (const int offset = static_cast <int> (to >> 32)) {
to &= TAIL_OFFSET_MASK; // keep the node index only
size_t len_tail = std::strlen (&_tail[-_array[to].base]);
// the last len_tail bytes (at most len) come from the tail
if (len > len_tail) len -= len_tail; else len_tail = len, len = 0;
std::memcpy (&key[len], &_tail[static_cast <size_t> (offset) - len_tail], len_tail);
}
// the remaining bytes are the edge labels on the way up to the root
while (len--) {
const int from = _array[to].check;
key[len] = static_cast <char> (_array[from].base ^ static_cast <int> (to));
to = static_cast <npos_t> (from);
}
}
// Resume a lookup of a NUL-terminated key at (`from`, `pos`).
value_type traverse (const char* key, npos_t& from, size_t& pos) const {
  const size_t key_len = std::strlen (key);
  return traverse (key, from, pos, key_len);
}
// Resume a lookup of key[pos..len) at node `from`; both `from` and `pos` are
// advanced by _find (). The int result of _find () is returned reinterpreted
// as value_type.
value_type traverse (const char* key, npos_t& from, size_t& pos, size_t len) const {
  union { int i; value_type x; } found;
  found.i = _find (key, from, pos, len);
  return found.x;
}
// No-op callback used when the caller does not care about moved nodes.
struct empty_callback {
  void operator () (const int, const int) {}
};
// Insert/update a NUL-terminated key with an increment of value_type (0).
value_type& update (const char* key) {
  const size_t key_len = std::strlen (key);
  return update (key, key_len);
}
// Insert/update key[0..len) from the root, adding `val` to its value.
value_type& update (const char* key, size_t len, value_type val = value_type (0)) {
  npos_t from = 0;
  size_t pos = 0;
  return update (key, from, pos, len, val);
}
// Insert/update starting at (`from`, `pos`) without a relocation callback.
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val = value_type (0)) {
  empty_callback cf;
  return update (key, from, pos, len, val, cf);
}
// Insert key[pos..len) starting at position `from`, or update it when it is
// already stored, and return a reference to its value after adding `val`.
// `from` packs a node index (low 32 bits) and a tail offset (high 32 bits);
// both `from` and `pos` are advanced. `cf` is handed to _follow () and is
// called for nodes that get relocated. A zero-length key at the root is
// rejected through _err ().
template <typename T>
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val, T& cf) {
if (! len && ! from)
_err (__FILE__, __LINE__, "failed to insert zero-length key\n");
#ifndef USE_FAST_LOAD
if (! _ninfo || ! _block) restore (); // update needs the auxiliary tables
#endif
npos_t offset = from >> 32;
if (! offset) { // node on trie
// walk/create trie edges while the current node has a non-negative base
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
_array[from].base >= 0; ++pos) {
if (pos == len) // could be reduced
{ const int to = _follow (from, 0, cf); return _array[to].value += val; }
from = static_cast <size_t> (_follow (from, key_[pos], cf));
}
// a negative base is the negated offset of this node's tail string
offset = static_cast <npos_t> (-_array[from].base);
}
if (offset >= sizeof (int)) { // go to _tail
const size_t pos_orig = pos;
char* const tail = &_tail[offset] - pos; // biased so that tail[pos] pairs with key[pos]
while (pos < len && key[pos] == tail[pos]) ++pos;
//
if (pos == len && tail[pos] == '\0') { // found exact key
if (const npos_t moved = pos - pos_orig) { // search end on tail
from &= TAIL_OFFSET_MASK;
from |= (offset + moved) << 32;
}
// the value is stored right behind the terminating NUL
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
}
// otherwise, insert the common prefix in tail if any
if (from >> 32) {
from &= TAIL_OFFSET_MASK; // reset to update tail offset
// re-create as trie edges the tail bytes that precede `offset`
for (npos_t offset_ = static_cast <npos_t> (-_array[from].base);
offset_ < offset; ) {
from = static_cast <size_t>
(_follow (from, static_cast <uchar> (_tail[offset_]), cf));
++offset_;
// this shows intricacy in debugging updatable double array trie
if (NUM_TRACKING_NODES) // keep the traversed node (on tail) updated
for (size_t j = 0; tracking_node[j] != 0; ++j)
if (tracking_node[j] >> 32 == offset_)
tracking_node[j] = static_cast <npos_t> (from);
}
}
// turn the bytes shared by key and tail into trie edges
for (size_t pos_ = pos_orig; pos_ < pos; ++pos_)
from = static_cast <size_t>
(_follow (from, static_cast <uchar> (key[pos_]), cf));
npos_t moved = pos - pos_orig;
if (tail[pos]) { // remember to move offset to existing tail
const int to_ = _follow (from, static_cast <uchar> (tail[pos]), cf);
_array[to_].base = - static_cast <int> (offset + ++moved);
moved -= 1 + sizeof (value_type); // keep record
}
moved += offset;
// record offsets in [offset, moved], one per (1 + sizeof (value_type))
// bytes, in _tail0; the "reuse" branch further down takes them from there
for (npos_t i = offset; i <= moved; i += 1 + sizeof (value_type)) {
if (_quota0 == ++*_length0) {
#ifdef USE_EXACT_FIT
_quota0 += *_length0 >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length0;
#else
_quota0 += _quota0;
#endif
_realloc_array (_tail0, _quota0, *_length0);
}
_tail0[*_length0] = static_cast <int> (i);
}
if (pos == len || tail[pos] == '\0') {
const int to = _follow (from, 0, cf); // could be reduced
if (pos == len) return _array[to].value += val; // set value on trie
// the existing key ended here: carry its value over from the tail
_array[to].value += *reinterpret_cast <value_type*> (&tail[pos + 1]);
}
from = static_cast <size_t> (_follow (from, static_cast <uchar> (key[pos]), cf));
++pos;
}
// bytes needed in the tail: remaining key bytes + NUL + value
const int needed = static_cast <int> (len - pos + 1 + sizeof (value_type));
if (pos == len && *_length0) { // reuse
const int offset0 = _tail0[*_length0];
_tail[offset0] = '\0';
_array[from].base = -offset0;
--*_length0;
return *reinterpret_cast <value_type*> (&_tail[offset0 + 1]) = val;
}
if (_quota < *_length + needed) { // grow the tail buffer
#ifdef USE_EXACT_FIT
_quota += needed > *_length || needed > MAX_ALLOC_SIZE ? needed :
(*_length >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length);
#else
_quota += _quota >= needed ? _quota : needed;
#endif
_realloc_array (_tail, _quota, *_length);
}
_array[from].base = -*_length; // the node now points at the new tail entry
const size_t pos_orig = pos;
char* const tail = &_tail[*_length] - pos;
if (pos < len) {
do tail[pos] = key[pos]; while (++pos < len);
from |= (static_cast <npos_t> (*_length) + (len - pos_orig)) << 32;
}
*_length += needed;
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
}
// easy-going erase () without compression
// Erase a NUL-terminated key; see the length-taking overload.
int erase (const char* key) { return erase (key, std::strlen (key)); }
// Erase key[0..len) looked up from `from`. Returns -1 when the key is not
// stored, 0 after removal. Tail bytes are not reclaimed.
int erase (const char* key, size_t len, npos_t from = 0) {
size_t pos = 0;
const int i = _find (key, from, pos, len);
if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
if (from >> 32) from &= TAIL_OFFSET_MASK; // leave tail as is
bool flag = _array[from].base < 0; // have sibling
// release `from` itself when its base is negative, else its label-0 child
int e = flag ? static_cast <int> (from) : _array[from].base ^ 0;
from = _array[e].check; // parent of the node to release
do {
const node& n = _array[from];
// non-zero when the first child of `from` has a right sibling
flag = _ninfo[n.base ^ _ninfo[from].child].sibling;
if (flag) _pop_sibling (from, n.base, static_cast <uchar> (n.base ^ e));
_push_enode (e); // hand `e` back to the empty ring of its block
e = static_cast <int> (from); // climb one level
from = static_cast <size_t> (_array[from].check);
} while (! flag); // stop once a node with a sibling has been unlinked
return 0;
}
int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
for (size_t i = 0; i < num; ++i)
update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
return 0;
}
// Enumerate every entry with begin ()/next () starting at node 0 and store each
// one into result[0..result_len). Calls _err () (which exits) as soon as an
// entry is found for which no slot is left.
template <typename T>
void dump (T* result, const size_t result_len) {
  union { int i; value_type x; } b;
  size_t num (0), p (0);
  npos_t from = 0;
  b.i = begin (from, p);
  while (b.i != CEDAR_NO_PATH) {
    if (num >= result_len)
      _err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
    else
      _set_result (&result[num++], b.x, p, from);
    b.i = next (from, p);
  }
}
// Compact the tail buffer: copy every tail string still referenced from the
// trie (with its value) into a freshly allocated buffer, repoint the nodes,
// and drop the list of recorded tail offsets.
void shrink_tail () {
// the first int of a tail buffer holds its length, hence the two views
union { char* tail; int* length; } t;
const size_t length_
= static_cast <size_t> (*_length)
- static_cast <size_t> (*_length0) * (1 + sizeof (value_type));
t.tail = static_cast <char*> (std::malloc (length_));
if (! t.tail) _err (__FILE__, __LINE__, "memory allocation failed\n");
*t.length = static_cast <int> (sizeof (int)); // data starts after the length field
for (int to = 0; to < _size; ++to) {
node& n = _array[to];
// same test as nonzero_length (): a used node that refers to a tail string
if (n.check >= 0 && _array[n.check].base != to && n.base < 0) {
char* const tail (&t.tail[*t.length]), * const tail_ (&_tail[-n.base]);
n.base = - *t.length; // repoint the node at the copy
int i = 0; do tail[i] = tail_[i]; while (tail[i++]); // copy including the NUL
*reinterpret_cast <value_type*> (&tail[i])
= *reinterpret_cast <const value_type*> (&tail_[i]);
*t.length += i + static_cast <int> (sizeof (value_type));
}
}
std::free (_tail);
_tail = t.tail;
_realloc_array (_tail, *_length, *_length); // trim to the length just written
_quota = *_length;
_realloc_array (_tail0, 1);
_quota0 = 1;
}
// Save to `fn`, compacting the tail first when `shrink` is set.
// Returns whatever the two-argument save () returns.
int save (const char* fn, const char* mode, const bool shrink) {
  if (! shrink) return save (fn, mode);
  shrink_tail ();
  return save (fn, mode);
}
// Write the tail buffer followed by the node array to `fn` (opened with
// `mode`). With USE_FAST_LOAD the block heads, node info and block table are
// additionally written to "<fn>.sbl". Returns 0 on success and -1 when a file
// cannot be opened, a write comes up short, or closing (flushing) fails.
int save (const char* fn, const char* mode = "wb") const {
  // _test ();
  FILE* fp = std::fopen (fn, mode);
  if (! fp) return -1;
  const size_t tail_len = static_cast <size_t> (*_length);
  const size_t num_nodes = static_cast <size_t> (_size);
  bool ok = std::fwrite (_tail, sizeof (char), tail_len, fp) == tail_len
    && std::fwrite (_array, sizeof (node), num_nodes, fp) == num_nodes;
  if (std::fclose (fp) != 0) ok = false; // buffered data may only fail here
  if (! ok) return -1;
#ifdef USE_FAST_LOAD
  const char* const info
    = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
  fp = std::fopen (info, mode);
  delete [] info; // resolve memory leak
  if (! fp) return -1;
  const size_t num_blocks = static_cast <size_t> (_size >> 8);
  ok = std::fwrite (&_bheadF, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadC, sizeof (int), 1, fp) == 1
    && std::fwrite (&_bheadO, sizeof (int), 1, fp) == 1
    && std::fwrite (_ninfo, sizeof (ninfo), num_nodes, fp) == num_nodes
    && std::fwrite (_block, sizeof (block), num_blocks, fp) == num_blocks;
  if (std::fclose (fp) != 0) ok = false;
  if (! ok) return -1;
#endif
  return 0;
}
// Load a trie previously written by save (): a tail buffer (whose first int
// is its own length) followed by the node array.
//   fn     : file to read
//   mode   : fopen mode
//   offset : byte position in the file where the data starts
//   size_  : byte position where the data ends; 0 means "end of file"
// Returns 0 on success, -1 on any I/O failure or when the stored tail length
// is implausible. The file handle is closed on every return path. The current
// contents are discarded (clear (false)) once the header checks have passed.
// Allocation failure is reported through _err ().
int open (const char* fn, const char* mode = "rb",
          const size_t offset = 0, size_t size_ = 0) {
  FILE* fp = std::fopen (fn, mode);
  if (! fp) return -1;
  // get size
  if (! size_) {
    if (std::fseek (fp, 0, SEEK_END) != 0) { std::fclose (fp); return -1; }
    const long end_pos = std::ftell (fp);
    if (end_pos < 0 || std::fseek (fp, 0, SEEK_SET) != 0)
      { std::fclose (fp); return -1; }
    size_ = static_cast <size_t> (end_pos);
  }
  if (size_ <= offset ||
      std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0)
    { std::fclose (fp); return -1; }
  int len = 0;
  // the tail starts with its own length, so it can never be shorter than an
  // int; this also rejects negative values before they are cast to size_t
  if (std::fread (&len, sizeof (int), 1, fp) != 1 ||
      len < static_cast <int> (sizeof (int)))
    { std::fclose (fp); return -1; }
  const size_t length_ = static_cast <size_t> (len);
  if (size_ <= offset + length_) { std::fclose (fp); return -1; }
  // set array
  clear (false);
  size_ = (size_ - offset - length_) / sizeof (node);
  _array = static_cast <node*> (std::malloc (sizeof (node) * size_));
  _tail = static_cast <char*> (std::malloc (length_));
  _tail0 = static_cast <int*> (std::malloc (sizeof (int)));
#ifdef USE_FAST_LOAD
  _ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
  _block = static_cast <block*> (std::malloc (sizeof (block) * size_));
  if (! _array || ! _tail || ! _tail0 || ! _ninfo || ! _block)
#else
  if (! _array || ! _tail || ! _tail0)
#endif
    _err (__FILE__, __LINE__, "memory allocation failed\n");
  // rewind to the start of the tail (the length field is part of it)
  if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0 ||
      length_ != std::fread (_tail, sizeof (char), length_, fp) ||
      size_ != std::fread (_array, sizeof (node), size_, fp))
    { std::fclose (fp); return -1; }
  std::fclose (fp);
  _size = static_cast <int> (size_);
  *_length0 = 0;
#ifdef USE_FAST_LOAD
  const char* const info
    = std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
  fp = std::fopen (info, mode);
  delete [] info; // resolve memory leak
  if (! fp) return -1;
  if (std::fread (&_bheadF, sizeof (int), 1, fp) != 1 ||
      std::fread (&_bheadC, sizeof (int), 1, fp) != 1 ||
      std::fread (&_bheadO, sizeof (int), 1, fp) != 1 ||
      size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
      size_ >> 8 != std::fread (_block, sizeof (block), size_ >> 8, fp))
    { std::fclose (fp); return -1; }
  std::fclose (fp);
  _capacity = _size;
  _quota = *_length;
  _quota0 = 1;
#endif
  return 0;
}
#ifndef USE_FAST_LOAD
// Rebuild whichever of the block table and node info is missing, then take
// the loaded sizes as the current capacities so the trie can be updated.
void restore () {
  if (_block == 0) _restore_block ();
  if (_ninfo == 0) _restore_ninfo ();
  _capacity = _size;
  _quota = *_length;
  _quota0 = 1;
}
#endif
void set_array (void* p, size_t size_ = 0) { // ad-hoc
clear (false);
if (size_)
size_ = size_ * unit_size () - static_cast <size_t> (*static_cast <int*> (p));
_tail = static_cast <char*> (p);
_array = reinterpret_cast <node*> (_tail + *_length);
_size = static_cast <int> (size_ / unit_size () + (size_ % unit_size () ? 1 : 0));
_no_delete = true;
}
const void* array () const {
    // read-only view of the node array
    return _array;
}
void clear (const bool reuse = true) {
    // Release every buffer and zero the bookkeeping; when `reuse` is set the
    // trie is re-initialised so it can be filled again.
    if (_no_delete) {
        // _array / _tail are not owned in this mode: drop them without freeing
        _array = 0;
        _tail  = 0;
    }
    std::free (_array);   // std::free on a null pointer is a no-op
    std::free (_tail);
    std::free (_tail0);
    std::free (_ninfo);
    std::free (_block);
    _array = 0;
    _tail  = 0;
    _tail0 = 0;
    _ninfo = 0;
    _block = 0;
    _bheadF = 0;
    _bheadC = 0;
    _bheadO = 0;
    _capacity = 0;
    _size = 0;
    _quota = 0;
    _quota0 = 0;
    if (reuse) {
        _initialize ();
    }
    _no_delete = false;
}
// return the first child for a tree rooted by a given node
int begin (npos_t& from, size_t& len) {
// Walks down from `from` following first-child links, updating `from` and
// `len` in place, and returns the int stored at the entry that is reached.
// Returns CEDAR_NO_PATH when called on node 0 and no entry is found.
#ifndef USE_FAST_LOAD
if (! _ninfo) _restore_ninfo ();
#endif
// a non-zero upper 32 bits of `from` is used (negated) as the base, i.e. a
// tail offset; otherwise the base is read from the node itself
int base = from >> 32 ? - static_cast <int> (from >> 32) : _array[from].base;
if (base >= 0) { // on trie
uchar c = _ninfo[from].child;
// for node 0, continue from the sibling of the first child instead
if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
return CEDAR_NO_PATH; // no entry
// descend while the current node has a non-zero child label
for (; c && base >= 0; ++len) {
from = static_cast <size_t> (base) ^ c;
base = _array[from].base;
c = _ninfo[from].child;
}
// c is 0 here, so this reads the node at index `base`
if (base >= 0) return _array[base ^ c].base;
}
// remaining suffix lives in the tail at offset -base
const size_t len_ = std::strlen (&_tail[-base]);
// keep the bits selected by TAIL_OFFSET_MASK and store the offset of the end
// of the suffix in the upper 32 bits
from &= TAIL_OFFSET_MASK;
from |= static_cast <npos_t> (static_cast <size_t> (-base) + len_) << 32;
len += len_;
// the returned int is read right after the suffix's terminating NUL
return *reinterpret_cast <int*> (&_tail[-base] + len_ + 1);
}
// return the next child if any
int next (npos_t& from, size_t& len, const npos_t root = 0) {
// Advances from the current position to the next one below `root`, updating
// `from` and `len`; returns CEDAR_NO_PATH when no sibling is found before
// reaching `root`, otherwise the result of begin () on the sibling found.
uchar c = 0;
if (const int offset = static_cast <int> (from >> 32)) { // on tail
// a root that is itself on the tail has nothing further to enumerate
if (root >> 32) return CEDAR_NO_PATH;
from &= TAIL_OFFSET_MASK;
// remove the part of the length that was contributed by the tail
len -= static_cast <size_t> (offset - (-_array[from].base));
} else
c = _ninfo[_array[from].base ^ 0].sibling;
// climb through `check` links until a sibling label is found or root is hit
for (; ! c && from != root; --len) {
c = _ninfo[from].sibling;
from = static_cast <size_t> (_array[from].check);
}
if (! c) return CEDAR_NO_PATH;
// step to the sibling node and descend from there
return begin (from = static_cast <size_t> (_array[from].base) ^ c, ++len);
}
// node positions that _resolve () rewrites when the node they refer to is moved;
// zeroed in _initialize ()
npos_t tracking_node[NUM_TRACKING_NODES + 1];
private:
// currently disabled; implement these if you need
da (const da&);
da& operator= (const da&);
node* _array; // node array, indexed by node id
// one pointer viewed two ways: _tail as bytes, _length as the int at its start
union { char* _tail; int* _length; };
union { int* _tail0; int* _length0; };
ninfo* _ninfo; // per-node child / sibling labels
block* _block; // per-256-node block bookkeeping
int _bheadF; // first block of Full; 0
int _bheadC; // first block of Closed; 0 if no Closed
int _bheadO; // first block of Open; 0 if no Open
int _capacity; // number of allocated nodes
int _size; // number of nodes in use (grows by 256 per added block)
int _quota;
int _quota0;
int _no_delete; // set by set_array (); clear () then drops _array/_tail without freeing
short _reject[257]; // filled with i + 1 in _initialize (); lowered in _find_place ()
//
static void _err (const char* fn, const int ln, const char* msg) {
    // Print "cedar: <file> [<line>]: <msg>" to stderr, then exit with status 1.
    std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg);
    std::exit (1);
}
template <typename T>
static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
    // Resize `p` to hold size_n elements and set elements [size_p, size_n)
    // to a value-initialised T. On allocation failure the old buffer is
    // freed and _err () terminates the process.
    const size_t bytes = sizeof (T) * static_cast <size_t> (size_n);
    void* const grown = std::realloc (p, bytes);
    if (grown == 0) {
        std::free (p);
        _err (__FILE__, __LINE__, "memory reallocation failed\n");
    }
    p = static_cast <T*> (grown);
    static const T T0 = T ();
    T* const stop = p + size_n;
    for (T* cur = p + size_p; cur != stop; ++cur) {
        *cur = T0;
    }
}
void _initialize () { // initialize the first special block
    _realloc_array (_array, 256, 256);
    _realloc_array (_tail, sizeof (int));
    _realloc_array (_tail0, 1);
    _realloc_array (_ninfo, 256);
    _realloc_array (_block, 1);
    // node 0 is set up on its own; nodes 1..255 are chained into a circular
    // doubly-linked list whose links are stored as negated indices
    _array[0] = node (0, -1);
    for (int i = 1; i < 256; ++i) {
        const int prev = (i == 1)   ? 255 : i - 1;
        const int next = (i == 255) ? 1   : i + 1;
        _array[i] = node (-prev, -next);
    }
    _size = 256;
    _capacity = _size;
    _block[0].ehead = 1; // bug fix for erase
    *_length = static_cast <int> (sizeof (int));
    _quota = *_length;
    _quota0 = 1;
    for (size_t i = 0; i <= NUM_TRACKING_NODES; ++i) {
        tracking_node[i] = 0;
    }
    for (int i = 0; i <= 256; ++i) {
        _reject[i] = static_cast <short> (i + 1);
    }
}
// follow/create edge
template <typename T>
int _follow (npos_t& from, const uchar& label, T& cf) {
// Returns the index of the child of `from` reached via `label`, creating the
// edge when it does not exist yet. `cf` is only forwarded to _resolve ().
int to = 0;
const int base = _array[from].base;
// `to` is assigned inside the condition; it stays 0 when base < 0
if (base < 0 || _array[to = base ^ label].check < 0) {
// no base yet, or the target slot is empty: take an empty node for the child
to = _pop_enode (base, label, static_cast <int> (from));
_push_sibling (from, to ^ label, label, base >= 0);
} else if (_array[to].check != static_cast <int> (from))
// the slot belongs to another parent: relocate one of the two families
to = _resolve (from, base, label, cf);
return to;
}
// find key from double array
int _find (const char* key, npos_t& from, size_t& pos, const size_t len) const {
// Matches key[pos..len) starting at `from`, advancing `from` and `pos`.
// Returns the stored int on an exact match, CEDAR_NO_VALUE when the key ends
// without a stored value, and CEDAR_NO_PATH when the key cannot be followed.
npos_t offset = from >> 32;
if (! offset) { // node on trie
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
_array[from].base >= 0; ) {
if (pos == len) {
// key consumed: the value, if any, sits in the child reached via label 0
const node& n = _array[_array[from].base ^ 0];
if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
return n.base;
}
size_t to = static_cast <size_t> (_array[from].base); to ^= key_[pos];
if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
++pos;
from = to;
}
// negative base: the rest of the key is stored in the tail at -base
offset = static_cast <npos_t> (-_array[from].base);
}
// switch to _tail to match suffix
const size_t pos_orig = pos; // start position in reading _tail
// biased so that tail[pos] lines up with key[pos]
const char* const tail = &_tail[offset] - pos;
if (pos < len) {
do if (key[pos] != tail[pos]) break; while (++pos < len);
if (const npos_t moved = pos - pos_orig) {
// record how far into the tail the match went in the upper 32 bits
from &= TAIL_OFFSET_MASK;
from |= (offset + moved) << 32;
}
if (pos < len) return CEDAR_NO_PATH; // input > tail, input != tail
}
if (tail[pos]) return CEDAR_NO_VALUE; // input < tail
// the int is read from just past the tail string's terminating NUL
return *reinterpret_cast <const int*> (&tail[len + 1]);
}
#ifndef USE_FAST_LOAD
void _restore_ninfo () {
// Rebuilds the child / sibling table from the base / check values in _array.
// the whole table is (re)allocated and default-filled first
_realloc_array (_ninfo, _size);
for (int to = 0; to < _size; ++to) {
const int from = _array[to].check;
if (from < 0) continue; // skip empty node
const int base = _array[from].base;
// the label of the edge from -> to is recovered as base ^ to
if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
_push_sibling (static_cast <size_t> (from), base, label,
! from || _ninfo[from].child || _array[base ^ 0].check == from);
}
}
void _restore_block () {
// Rebuilds the per-block bookkeeping by counting the empty nodes
// (check < 0) in every run of 256 nodes.
_realloc_array (_block, _size >> 8);
_bheadF = _bheadC = _bheadO = 0;
for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
block& b = _block[bi];
b.num = 0;
// ehead is set to the first empty node found in the block
for (; e < (bi << 8) + 256; ++e)
if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
// exactly one empty node -> Closed list, none -> Full list, otherwise Open list
int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
_push_block (bi, head_out, ! head_out && b.num);
}
}
#endif
// Store a lookup result into the caller-supplied result object; each
// overload fills only the fields its result type has.
void _set_result (result_type* x, value_type r, size_t = 0, npos_t = 0) const {
    *x = r;
}
void _set_result (result_pair_type* x, value_type r, size_t l, npos_t = 0) const {
    x->value  = r;
    x->length = l;
}
void _set_result (result_triple_type* x, value_type r, size_t l, npos_t from) const {
    x->value  = r;
    x->length = l;
    x->id     = from;
}
void _pop_block (const int bi, int& head_in, const bool last) {
if (last) { // last one poped; Closed or Open
head_in = 0;
} else {
const block& b = _block[bi];
_block[b.prev].next = b.next;
_block[b.next].prev = b.prev;
if (bi == head_in) head_in = b.next;
}
}
void _push_block (const int bi, int& head_out, const bool empty) {
block& b = _block[bi];
if (empty) { // the destination is empty
head_out = b.prev = b.next = bi;
} else { // use most recently pushed
int& tail_out = _block[head_out].prev;
b.prev = tail_out;
b.next = head_out;
head_out = tail_out = _block[tail_out].next = bi;
}
}
int _add_block () {
// Appends one block of 256 empty nodes and returns its block index.
if (_size == _capacity) { // allocate memory if needed
#ifdef USE_EXACT_FIT
// grow by the current size, but by no more than MAX_ALLOC_SIZE
_capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
#else
_capacity += _capacity;
#endif
// size_p == size_n for _array, so none of its elements are default-filled here;
// _ninfo and _block get their new elements default-filled
_realloc_array (_array, _capacity, _capacity);
_realloc_array (_ninfo, _capacity, _size);
_realloc_array (_block, _capacity >> 8, _size >> 8);
}
_block[_size >> 8].ehead = _size;
// chain the 256 new nodes into a circular list using negated indices
_array[_size] = node (- (_size + 255), - (_size + 1));
for (int i = _size + 1; i < _size + 255; ++i)
_array[i] = node (-(i - 1), -(i + 1));
_array[_size + 255] = node (- (_size + 254), -_size);
_push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
_size += 256;
return (_size >> 8) - 1;
}
// transfer block from one start w/ head_in to one start w/ head_out
void _transfer_block (const int bi, int& head_in, int& head_out) {
    // Move block `bi` from the list headed by `head_in` to the one headed
    // by `head_out`.
    const bool only_one = (bi == _block[bi].next);
    _pop_block (bi, head_in, only_one);
    // evaluated after the pop, exactly as the destination state is at that point
    const bool dest_empty = ! head_out && _block[bi].num;
    _push_block (bi, head_out, dest_empty);
}
// pop empty node from block; never transfer the special block (bi = 0)
int _pop_enode (const int base, const uchar label, const int from) {
// Takes an empty node for the child of `from` labelled `label` and returns
// its index. With base < 0 the node is chosen by _find_place () and the
// parent's base is set accordingly; otherwise the node is base ^ label.
const int e = base < 0 ? _find_place () : base ^ label;
const int bi = e >> 8;
node& n = _array[e];
block& b = _block[bi];
if (--b.num == 0) {
if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
} else { // release empty node from empty ring
// links are stored negated: -n.base is the previous, -n.check the next node
_array[-n.base].check = n.check;
_array[-n.check].base = n.base;
if (e == b.ehead) b.ehead = -n.check; // set ehead
if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
_transfer_block (bi, _bheadO, _bheadC);
}
// initialize the released node
// a non-zero label gets base -1; label 0 gets a zero value
if (label) n.base = -1; else n.value = value_type (0);
n.check = from;
// choose the parent's base so that base ^ label == e
if (base < 0) _array[from].base = e ^ label;
return e;
}
// push empty node into empty ring
void _push_enode (const int e) {
// Returns node `e` to the ring of empty nodes of its block and moves the
// block between the Full / Closed / Open lists as its empty count changes.
const int bi = e >> 8;
block& b = _block[bi];
if (++b.num == 1) { // Full to Closed
b.ehead = e;
// a ring of one: the node links to itself
_array[e] = node (-e, -e);
if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
} else {
// insert e between ehead and the node that followed it
const int prev = b.ehead;
const int next = -_array[prev].check;
_array[e] = node (-prev, -next);
_array[prev].check = _array[next].base = -e;
if (b.num == 2 || b.trial == MAX_TRIAL) { // Closed to Open
if (bi) _transfer_block (bi, _bheadC, _bheadO);
}
b.trial = 0;
}
// raise the block's reject threshold to the shared one for this empty count
if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
_ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
}
// push label to from's child
void _push_sibling (const npos_t from, const int base, const uchar label, const bool flag = true) {
// Inserts `label` into the child list of `from`. `c` points at the link
// slot (child or sibling field) where the label will be spliced in.
uchar* c = &_ninfo[from].child;
// when `flag` is set, skip past the head: in ORDERED mode if label is larger
// than it (continuing while the next label is smaller), otherwise if the
// head label is 0
if (flag && (ORDERED ? label > *c : ! *c))
do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
// the new node inherits the old successor, then takes its place in the slot
_ninfo[base ^ label].sibling = *c, *c = label;
}
// pop label from from's child
void _pop_sibling (const npos_t from, const int base, const uchar label) {
uchar* c = &_ninfo[from].child;
while (*c != label) c = &_ninfo[base ^ *c].sibling;
*c = _ninfo[base ^ label].sibling;
}
// check whether to replace branching w/ the newly added node
bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
    // Advance both sibling chains in lock-step: false when the chain under
    // base_p ends first (or at the same step), true when the chain under
    // base_n ends first.
    for (;;) {
        c_p = _ninfo[base_p ^ c_p].sibling;
        if (! c_p) return false;
        c_n = _ninfo[base_n ^ c_n].sibling;
        if (! c_n) return true;
    }
}
// enumerate (equal to or more than one) child nodes
uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
// Writes the child labels, starting with `c` and following sibling links
// under `base`, into the buffer at `p`; when `label` != -1 it is written as
// well. Returns a pointer to the last element written.
--p; // pre-decrement so every write below can use *++p
if (! c) { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
// in ORDERED mode, labels smaller than `label` are emitted before it
if (ORDERED)
while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
if (label != -1) *++p = static_cast <uchar> (label);
while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
return p;
}
// explore new block to settle down
int _find_place () {
    // Prefer the head of the Closed list, then the head of the Open list;
    // with neither available, add a fresh block and use its first node.
    const int bi = _bheadC ? _bheadC : _bheadO;
    if (bi) {
        return _block[bi].ehead;
    }
    return _add_block () << 8;
}
int _find_place (const uchar* const first, const uchar* const last) {
// Searches the Open blocks for an empty node e such that, with
// base = e ^ *first, every label in [first, last] maps to an empty node.
// Returns that e, or the first node of a newly added block if none fits.
// The inner scan starts at first + 1; the visible caller (_resolve) only
// uses this overload when first != last.
if (int bi = _bheadO) {
const int bz = _block[_bheadO].prev; // last block on the Open list
const short nc = static_cast <short> (last - first + 1); // number of labels
while (1) { // set candidate block
block& b = _block[bi];
if (b.num >= nc && nc < b.reject) // explore configuration
// try every empty node on this block's ring as the slot for *first
for (int e = b.ehead;;) {
const int base = e ^ *first;
for (const uchar* p = first; _array[base ^ *++p].check < 0; )
if (p == last) return b.ehead = e; // no conflict
if ((e = -_array[e].check) == b.ehead) break;
}
// record the failed label count for this block and for its empty count
b.reject = nc;
if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
const int bi_ = b.next; // read before the block is possibly moved
if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
if (bi == bz) break;
bi = bi_;
}
}
return _add_block () << 8;
}
// resolve conflict on base_n ^ label_n = base_p ^ label_p
template <typename T>
int _resolve (npos_t& from_n, const int base_n, const uchar label_n, T& cf) {
// Called when the slot base_n ^ label_n wanted by the new edge of from_n is
// already owned by another parent (from_p). The children of one of the two
// parents are moved to a new base; returns the index of the node for the
// new edge. `cf` is invoked as cf (old_index, new_index) for each moved node.
// examine siblings of conflicted nodes
const int to_pn = base_n ^ label_n;
const int from_p = _array[to_pn].check;
const int base_p = _array[from_p].base;
const bool flag // whether to replace siblings of newly added
= _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
// labels of the family that will move (plus label_n when from_n's moves)
uchar child[256];
uchar* const first = &child[0];
uchar* const last =
flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
: _set_child (first, base_p, _ninfo[from_p].child);
const int base =
(first == last ? _find_place () : _find_place (first, last)) ^ *first;
// replace & modify empty list
const int from = flag ? static_cast <int> (from_n) : from_p;
const int base_ = flag ? base_n : base_p;
if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
_array[from].base = base; // new base
for (const uchar* p = first; p <= last; ++p) { // to_ => to
const int to = _pop_enode (base, *p, from);
const int to_ = base_ ^ *p;
_ninfo[to].sibling = (p == last ? 0 : *(p + 1));
if (flag && to_ == to_pn) continue; // skip newcomer (no child)
cf (to_, to);
node& n = _array[to];
node& n_ = _array[to_];
if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix
uchar c = _ninfo[to].child = _ninfo[to_].child;
do _array[n.base ^ c].check = to; // adjust grand son's check
while ((c = _ninfo[n.base ^ c].sibling));
}
if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
from_n = static_cast <size_t> (to); // bug fix
if (! flag && to_ == to_pn) { // the address is immediately used
// the vacated slot becomes the new child of from_n right away
_push_sibling (from_n, to_pn ^ label_n, label_n);
_ninfo[to_].child = 0; // remember to reset child
if (label_n) n_.base = -1; else n_.value = value_type (0);
n_.check = static_cast <int> (from_n);
} else
_push_enode (to_);
if (NUM_TRACKING_NODES) // keep the traversed node updated
for (size_t j = 0; tracking_node[j] != 0; ++j) {
if (static_cast <int> (tracking_node[j] & TAIL_OFFSET_MASK) == to_) {
tracking_node[j] &= NODE_INDEX_MASK;
tracking_node[j] |= static_cast <npos_t> (to);
}
}
}
return flag ? base ^ label_n : to_pn;
}
// test the validity of double array for debug
void _test (const npos_t from = 0) const {
// Debug helper: recursively asserts consistency of the subtree under `from`.
const int base = _array[from].base;
if (base < 0) { // validate tail offset
// the tail must be long enough to hold at least a NUL and a value at -base
assert (*_length >= static_cast <int> (-base + 1 + sizeof (value_type)));
return;
}
uchar c = _ninfo[from].child;
do {
// every listed child must name `from` as its parent (not checked for node 0)
if (from) assert (_array[base ^ c].check == static_cast <int> (from));
// label 0 is not descended into
if (c) _test (static_cast <npos_t> (base ^ c));
} while ((c = _ninfo[base ^ c].sibling));
}
};
}
#endif

View File

@ -0,0 +1,12 @@
# qmake include file: adds this directory to the include path and registers
# the storage-base headers and source with the including project.
# NOTE(review): storage-base.h and storage-base.hpp both appear to use the
# include guard STORAGEBASE_H and define the same names - confirm that only
# one of them is meant to be included by any given translation unit.
INCLUDEPATH += $$PWD
HEADERS += \
$$PWD/darts-clone/darts.h \
$$PWD/cedar/cedarpp.h \
$$PWD/cedar/cedar.h \
$$PWD/storage-base.h \
$$PWD/storage-base.hpp
SOURCES += \
$$PWD/storage-base.cpp

View File

@ -0,0 +1,202 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef STORAGEBASE_CPP
#define STORAGEBASE_CPP
#include "storage-base.h"
// Stores the dictionary source paths and the optional cache path, and
// allocates the double-array trie. Nothing is loaded until Init() is called.
template<const bool ordered, typename cache_file_header>
StorageBase<ordered, cache_file_header>::StorageBase(const vector<string> file_paths, string dat_cache_path)
    : m_file_paths(file_paths)
    , m_dat_cache_path(dat_cache_path)
    , m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)
{
    static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value,
                  "CacheFileHeader class not derived from CacheFileHeaderBase!");
}
// Attaches to the cached .dat file for the current source files, building
// it through LoadSourceFile() first when no matching cache can be attached.
template<const bool ordered, typename cache_file_header>
void StorageBase<ordered, cache_file_header>::Init()
{
    int file_size_sum = 0;
    const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
    m_total_dict_size = file_size_sum;

    // No cache location given: default to a file under /tmp named after the MD5.
    if (m_dat_cache_path.empty()) {
        m_dat_cache_path = "/tmp/" + md5 + ".dat_";
    }
    m_dat_cache_path += VERSION;

    if (!InitAttachDat(m_dat_cache_path, md5)) {
        // Build the double-array trie and write it to the .dat file, then retry.
        LoadSourceFile(m_dat_cache_path, md5);
        bool build_ret = InitAttachDat(m_dat_cache_path, md5);
        assert(build_ret);
    }
}
// Looks `key` up in the trie and returns the NUL-terminated element stored
// at the resulting offset into the element area, or an empty string when the
// lookup yields a negative result.
template<const bool ordered, typename cache_file_header>
string StorageBase<ordered, cache_file_header>::Find(const string &key)
{
    // The trie type depends on `ordered`, so the member template call needs the
    // `template` disambiguator to be parsed as a call rather than a comparison.
    const int result = m_double_array_data_trie->template exactMatchSearch<int>(key.c_str(), key.size());
    if (result < 0) {
        return string();
    }
    return string(&m_elements_ptr[result]);
}
// True when Find() returns a non-empty element for `word`.
template<const bool ordered, typename cache_file_header>
bool StorageBase<ordered, cache_file_header>::Contains(string &word)
{
    return !this->Find(word).empty();
}
// Returns true when the element found for `word` contains no ',' (this
// includes the case where nothing is found and the element is empty).
// NOTE(review): if readings are stored comma-separated, a multi-tone word
// would be the one that DOES contain ',' - confirm the intended polarity.
template<const bool ordered, typename cache_file_header>
bool StorageBase<ordered, cache_file_header>::IsMultiTone(const string &word)
{
    const string element = this->Find(word);
    return element.find(",") == string::npos;
}
// Total byte size of the source files, as summed by Init().
template<const bool ordered, typename cache_file_header>
int StorageBase<ordered, cache_file_header>::GetTotalDictSize() const
{
    return this->m_total_dict_size;
}
// Releases the cache mapping, its file descriptor and the trie object.
template<const bool ordered, typename cache_file_header>
StorageBase<ordered, cache_file_header>::~StorageBase()
{
    // Only unmap a mapping that actually exists: the pointer is still nullptr
    // when Init() never ran, and may hold MAP_FAILED after a failed attach.
    if (m_mmap_addr != nullptr && m_mmap_addr != MAP_FAILED) {
        munmap(m_mmap_addr, m_mmap_length);
    }
    m_mmap_addr = nullptr;
    // Likewise only close a descriptor that was really opened.
    if (m_mmap_fd >= 0) {
        close(m_mmap_fd);
    }
    m_mmap_fd = -1;
    delete m_double_array_data_trie;  // deleting a null pointer is a no-op
    m_double_array_data_trie = nullptr;
}
// The trie object owned by this storage.
template<const bool ordered, typename cache_file_header>
cedar::da<int, -1, -2, ordered> *StorageBase<ordered, cache_file_header>::GetDoubleArrayDataTrie()
{
    return this->m_double_array_data_trie;
}

// Forwards to the trie's array().
template<const bool ordered, typename cache_file_header>
const void *StorageBase<ordered, cache_file_header>::GetDataTrieArray()
{
    auto *trie = m_double_array_data_trie;
    return trie->array();
}

// Forwards to the trie's size().
template<const bool ordered, typename cache_file_header>
int StorageBase<ordered, cache_file_header>::GetDataTrieSize()
{
    auto *trie = m_double_array_data_trie;
    return trie->size();
}

// Forwards to the trie's total_size().
template<const bool ordered, typename cache_file_header>
int StorageBase<ordered, cache_file_header>::GetDataTrieTotalSize()
{
    auto *trie = m_double_array_data_trie;
    return trie->total_size();
}

// The start of the mapped cache file, viewed as its header.
template<const bool ordered, typename cache_file_header>
cache_file_header *StorageBase<ordered, cache_file_header>::GetCacheFileHeaderPtr()
{
    return reinterpret_cast<header_type*>(this->m_mmap_addr);
}
// Maps `dat_cache_file` read-only and, if its header matches `md5` and the
// file size is consistent with the header, points the element pointer and the
// trie at the mapped data. Returns false (with nothing left mapped or open)
// when the file is missing, unreadable, too small or does not match.
template<const bool ordered, typename cache_file_header>
bool StorageBase<ordered, cache_file_header>::InitAttachDat(const string &dat_cache_file, const string &md5)
{
    // Undo a partial attach so the object is back in its "nothing mapped"
    // state. In particular the address must never be left as MAP_FAILED,
    // because the destructor would later hand it to munmap().
    const auto release = [this](const bool mapped) {
        if (mapped) {
            munmap(m_mmap_addr, m_mmap_length);
        }
        m_mmap_addr = nullptr;
        m_mmap_length = 0;
        close(m_mmap_fd);
        m_mmap_fd = -1;
    };

    m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
    if (m_mmap_fd < 0) {
        return false;
    }
    const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
    if (seek_off < 0) {
        release(false);
        return false;
    }
    m_mmap_length = seek_off;
    m_mmap_addr = reinterpret_cast<char *>(mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0));
    if (m_mmap_addr == MAP_FAILED) {
        release(false);
        return false;
    }
    if (static_cast<size_t>(m_mmap_length) < sizeof(header_type)) {
        release(true);
        return false;
    }

    const header_type &header = *reinterpret_cast<header_type*>(m_mmap_addr);
    const size_t expected_length = sizeof(header_type) + header.elements_size
                                   + header.dat_size * m_double_array_data_trie->unit_size();
    // Never compare more bytes than the header's MD5 field can hold.
    const bool md5_matches = md5.size() <= sizeof(header.md5_hex)
                             && 0 == memcmp(&header.md5_hex[0], md5.c_str(), md5.size());
    if (!md5_matches || static_cast<size_t>(m_mmap_length) != expected_length) {
        release(true);
        return false;
    }

    // File layout: header, then the element area, then the trie data.
    m_elements_ptr = (const char *)(m_mmap_addr + sizeof(header_type));
    const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
    this->m_double_array_data_trie->set_array((char *)dat_ptr, header.dat_size);
    return true;
}
// Feeds the contents of every file in `files_list` that can be opened into
// one MD5 and returns its hex digest. `file_size_sum` receives the total
// number of bytes hashed. Files that cannot be opened, are empty, or cannot
// be mapped contribute nothing.
string CalcFileListMD5(const vector<string> &files_list, int &file_size_sum) {
    limonp::MD5 md5;
    file_size_sum = 0;
    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            assert(MAP_FAILED != addr);
            // The assert vanishes under NDEBUG, so guard explicitly: hashing
            // through MAP_FAILED would read from an invalid address.
            if (MAP_FAILED != addr) {
                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }
    md5.Final();
    return string(md5.digestChars);
}
#endif

View File

@ -0,0 +1,93 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef STORAGEBASE_H
#define STORAGEBASE_H
#include <string>
#include <vector>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include "Md5.hpp"
#include "StringUtil.hpp"
#include "cedar.h"
using namespace std;
// Header found at the start of a cache (.dat) file; InitAttachDat() reads it
// directly from the mapped file, so the field layout is part of the format.
struct CacheFileHeaderBase { // TODO: byte alignment
char md5_hex[32] = {}; // compared against the MD5 of the source files
uint32_t elements_num = 0;
uint32_t elements_size = 0; // byte size of the element area following the header
uint32_t dat_size = 0; // trie size, in units of the trie's unit_size()
};
// Base class for a dictionary stored in a cedar double-array trie that is
// backed by a memory-mapped cache file. Subclasses implement LoadSourceFile()
// to build that cache file from the source files.
template<const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
class StorageBase
{
public:
typedef cache_file_header header_type;
// file_paths: source files; dat_cache_path: cache file location (may be empty)
StorageBase(const vector<string> file_paths, string dat_cache_path = "");
// attaches to the cache file, building it first when necessary
virtual void Init();
virtual string Find(const string &key);
virtual bool Contains(string &word);
virtual bool IsMultiTone(const string &word);
virtual int GetTotalDictSize() const;
// builds the trie from the source files and writes it to dat_cache_file
virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
virtual ~StorageBase();
cedar::da<int, -1, -2, ordered> * GetDoubleArrayDataTrie();
const void * GetDataTrieArray();
int GetDataTrieSize();
int GetDataTrieTotalSize();
cache_file_header * GetCacheFileHeaderPtr();
private:
// declared but not defined here: default construction and copying are disabled
StorageBase();
StorageBase(const StorageBase&);
StorageBase& operator = (const StorageBase&);
bool InitAttachDat(const string &dat_cache_file, const string &md5);
vector<string> m_file_paths;
string m_dat_cache_path;
cedar::da<int, -1, -2, ordered> * m_double_array_data_trie = nullptr;
const char * m_elements_ptr = nullptr; // element area inside the mapping
int m_mmap_fd = -1;
int m_mmap_length = 0;
char * m_mmap_addr = nullptr;
int m_total_dict_size = 0;
};
inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum);
#include "storage-base.cpp"
#endif // STORAGEBASE_H

View File

@ -0,0 +1,232 @@
/*
* Copyright (C) 2022, KylinSoft Co., Ltd.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*
* Authors: jixiaoxu <jixiaoxu@kylinos.cn>
*
*/
#ifndef STORAGEBASE_H
#define STORAGEBASE_H
#include <string>
#include <vector>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include "Md5.hpp"
#include "StringUtil.hpp"
//#define USE_DARTS
#ifdef USE_DARTS
#include "../storage-base/darts-clone/darts.h"
#include <cassert>
#else
#include "../storage-base/cedar/cedar.h"
#endif
using namespace std;
// Feeds the contents of every file in `files_list` that can be opened into
// one MD5 and returns its hex digest. `file_size_sum` receives the total
// number of bytes hashed. Files that cannot be opened, are empty, or cannot
// be mapped contribute nothing.
inline string CalcFileListMD5(const vector<string> &files_list, int & file_size_sum)
{
    limonp::MD5 md5;
    file_size_sum = 0;
    for (auto const & local_path : files_list) {
        const int fd = open(local_path.c_str(), O_RDONLY);
        if (fd < 0) {
            continue;
        }
        auto const len = lseek(fd, 0, SEEK_END);
        if (len > 0) {
            void * addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
            assert(MAP_FAILED != addr);
            // The assert vanishes under NDEBUG, so guard explicitly: hashing
            // through MAP_FAILED would read from an invalid address.
            if (MAP_FAILED != addr) {
                md5.Update((unsigned char *) addr, len);
                file_size_sum += len;
                munmap(addr, len);
            }
        }
        close(fd);
    }
    md5.Final();
    return string(md5.digestChars);
}
// Header found at the start of a cache (.dat) file; InitAttachDat() reads it
// directly from the mapped file, so the field layout is part of the format.
struct CacheFileHeaderBase { // TODO: byte alignment
char md5_hex[32] = {}; // compared against the MD5 of the source files
uint32_t elements_num = 0;
uint32_t elements_size = 0; // byte size of the element area following the header
uint32_t dat_size = 0; // trie size, in units of the trie's unit_size()
};
template<typename element_ptr_type, const bool ordered = false, typename cache_file_header = CacheFileHeaderBase>
class StorageBase
{
public:
typedef cache_file_header header_type;
#ifdef USE_DARTS
typedef typename Darts::DoubleArray::result_pair_type result_pair_type;
StorageBase(const vector<string> file_paths, string dat_cache_path = "")
:m_file_paths(file_paths), m_dat_cache_path(dat_cache_path), m_double_array_data_trie(new Darts::DoubleArray)
{
static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
}
#else
typedef typename cedar::da<int, -1, -2, ordered>::result_pair_type result_pair_type;
StorageBase(const vector<string> file_paths, string dat_cache_path = "")
:m_file_paths(file_paths), m_dat_cache_path(dat_cache_path)/*, m_double_array_data_trie(new cedar::da<int, -1, -2, ordered>)*/
{
static_assert(std::is_base_of<CacheFileHeaderBase, header_type>::value, "CacheFileHeader class not derived from CacheFileHeaderBase!");
}
#endif
virtual void Init()
{
int file_size_sum = 0;
const string md5 = CalcFileListMD5(m_file_paths, file_size_sum);
m_total_dict_size = file_size_sum;
if (m_dat_cache_path.empty()) {
m_dat_cache_path = "/tmp/" + md5 + ".dat_";//未指定词库数据文件存储位置的默认存储在tmp目录下
}
m_dat_cache_path += VERSION;
if (InitAttachDat(m_dat_cache_path, md5)) {
return;
}
LoadSourceFile(m_dat_cache_path, md5);//构建DATrie写入dat文件
bool build_ret = InitAttachDat(m_dat_cache_path, md5);
assert(build_ret);
}
virtual void LoadSourceFile(const string &dat_cache_file, const string &md5) = 0;
virtual ~StorageBase()
{
munmap(m_mmap_addr, m_mmap_length);
m_mmap_addr = nullptr;
close(m_mmap_fd);
m_mmap_fd = -1;
}
#ifndef USE_DARTS
inline int Update(const char* key, size_t len, int val)
{
return m_double_array_data_trie.update(key, len, val);
}
#endif
inline size_t CommonPrefixSearch(const char* key, result_pair_type* result, size_t result_len) const
{
return m_double_array_data_trie.commonPrefixSearch(key, result, result_len);
}
inline int ExactMatchSearch(const char* key, size_t len) const
{
return m_double_array_data_trie.template exactMatchSearch<int>(key, len);
}
inline const void * GetDataTrieArray()
{
return m_double_array_data_trie.array();
}
inline int GetDataTrieSize()
{
return m_double_array_data_trie.size();
}
inline int GetDataTrieTotalSize()
{
return m_double_array_data_trie.total_size();
}
inline cache_file_header * GetCacheFileHeaderPtr() const
{
return reinterpret_cast<header_type*>(m_mmap_addr);
}
inline const element_ptr_type * GetElementPtr() const
{
return m_elements_ptr;
}
private:
StorageBase();
StorageBase(const StorageBase&);
StorageBase& operator = (const StorageBase&);
bool InitAttachDat(const string &dat_cache_file, const string &md5)
{
m_mmap_fd = open(dat_cache_file.c_str(), O_RDONLY);
if (m_mmap_fd < 0) {
return false;
}
const auto seek_off = lseek(m_mmap_fd, 0, SEEK_END);
if (seek_off < 0){
close(m_mmap_fd);
m_mmap_fd = -1;
return false;
};
m_mmap_length = seek_off;
m_mmap_addr = reinterpret_cast<char *>(mmap(NULL, m_mmap_length, PROT_READ, MAP_SHARED, m_mmap_fd, 0));
if (m_mmap_addr == MAP_FAILED) {
close(m_mmap_fd);
m_mmap_fd = -1;
return false;
}
if (m_mmap_length < sizeof(header_type)) {
munmap(m_mmap_addr, m_mmap_length);
m_mmap_addr = nullptr;
close(m_mmap_fd);
m_mmap_fd = -1;
return false;
}
header_type & header = *reinterpret_cast<header_type*>(m_mmap_addr);
if (0 != memcmp(&header.md5_hex[0], md5.c_str(), md5.size())
or m_mmap_length != sizeof(header_type) + header.elements_size + header.dat_size * m_double_array_data_trie.unit_size()) {
munmap(m_mmap_addr, m_mmap_length);
m_mmap_addr = nullptr;
close(m_mmap_fd);
m_mmap_fd = -1;
return false;
}
m_elements_ptr = (const element_ptr_type *)(m_mmap_addr + sizeof(header_type));
const char * dat_ptr = m_mmap_addr + sizeof(header_type) + header.elements_size;
this->m_double_array_data_trie.set_array((char *)dat_ptr, header.dat_size);
return true;
}
vector<string> m_file_paths;
string m_dat_cache_path;
#ifdef USE_DARTS
Darts::DoubleArray m_double_array_data_trie;
#else
cedar::da<int, -1, -2, ordered> m_double_array_data_trie;
#endif
const element_ptr_type * m_elements_ptr = nullptr;
int m_mmap_fd = -1;
size_t m_mmap_length = 0;
char * m_mmap_addr = nullptr;
int m_total_dict_size = 0;
};
#endif // STORAGEBASE_H

View File

@ -0,0 +1,11 @@
#include "mainwindow.h"
#include <QApplication>
int main(int argc, char *argv[])
{
QApplication a(argc, argv);
MainWindow w;
w.show();
return a.exec();
}

View File

@ -0,0 +1,92 @@
#include "mainwindow.h"
#include "ui_mainwindow.h"
#include <HanZiToPinYin>
#include <ChineseSegmentation>
#include <QMenu>
#include <QDebug>
#include <QStringList>
// Builds the UI, fills the tool button's menu with the available pinyin
// style names, wires up the signal handlers and focuses the input field.
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    QMenu *styleMenu = new QMenu(this);
    static const char *const styleNames[] = {"Default", "Tone", "Tone2", "Tone3", "FirstLetter"};
    for (const char *styleName : styleNames) {
        styleMenu->addAction(styleName);
    }
    ui->toolButton->setMenu(styleMenu);

    initconnections();
    ui->lineEdit_2->setFocus();
}
// Frees the generated UI object allocated in the constructor.
MainWindow::~MainWindow()
{
    delete this->ui;
}
void MainWindow::initconnections()
{
connect(ui->toolButton->menu(), &QMenu::triggered, [&](QAction *action){
qDebug() << "tool button:" << action->text();
m_action = action->text();
ui->toolButton->setText(action->text());
});
connect(ui->pushButton, &QPushButton::pressed, [&]() {
PinyinDataStyle dataStyle;
SegType segType;
PolyphoneType polyType;
ExDataProcessType exType;
if (m_action == "Default") {
dataStyle = PinyinDataStyle::Default;
} else if (m_action == "Tone") {
dataStyle = PinyinDataStyle::Tone;
} else if (m_action == "Tone2") {
dataStyle = PinyinDataStyle::Tone2;
} else if (m_action == "Tone3") {
dataStyle = PinyinDataStyle::Tone3;
} else if (m_action == "FirstLetter") {
dataStyle = PinyinDataStyle::FirstLetter;
}
if(!ui->checkSegBox->isChecked())
segType = SegType::Segmentation;
else
segType = SegType::NoSegmentation;
if(ui->checkPolyBox_2->isChecked())
polyType = PolyphoneType::Enable;
else
polyType = PolyphoneType::Disable;
if (ui->checkExBox_3->isChecked())
exType = ExDataProcessType::Default;
else
exType = ExDataProcessType::Delete;
HanZiToPinYin::getInstance()->setConfig(dataStyle, segType, polyType, exType);
ui->lineEdit_4->clear();
QString text = ui->lineEdit_2->text();
qDebug() << "input:" << text;
QStringList list;
HanZiToPinYin::getInstance()->getResults(text.toStdString(), list);
ui->lineEdit_4->setText(list.join(" "));
qDebug() << "result:" << list.join(" ");
vector<KeyWord> result = ChineseSegmentation::getInstance()->callSegment(ui->lineEdit_2->text().toStdString());
list.clear();
for (auto &info:result) {
list.append(QString().fromStdString(info.word));
}
ui->lineEdit_6->setText(list.join("/"));
});
}

View File

@ -0,0 +1,23 @@
#ifndef MAINWINDOW_H
#define MAINWINDOW_H
#include <QtWidgets>
QT_BEGIN_NAMESPACE
namespace Ui { class MainWindow; }
QT_END_NAMESPACE
// Main window of the test application for pinyin conversion and segmentation.
class MainWindow : public QMainWindow
{
Q_OBJECT
public:
MainWindow(QWidget *parent = nullptr);
~MainWindow();
private:
// connects the menu and button signals to their handlers
void initconnections();
Ui::MainWindow *ui; // generated UI object; deleted in the destructor
QString m_action; // text of the most recently triggered style menu entry
};
#endif // MAINWINDOW_H

View File

@ -0,0 +1,181 @@
<?xml version="1.0" encoding="UTF-8"?>
<ui version="4.0">
<class>MainWindow</class>
<widget class="QMainWindow" name="MainWindow">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>800</width>
<height>600</height>
</rect>
</property>
<property name="windowTitle">
<string>MainWindow</string>
</property>
<widget class="QWidget" name="centralwidget">
<widget class="QPushButton" name="pushButton">
<property name="geometry">
<rect>
<x>40</x>
<y>440</y>
<width>191</width>
<height>81</height>
</rect>
</property>
<property name="text">
<string>点击开始</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit">
<property name="geometry">
<rect>
<x>40</x>
<y>20</y>
<width>91</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>输入文字:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_2">
<property name="geometry">
<rect>
<x>40</x>
<y>70</y>
<width>711</width>
<height>41</height>
</rect>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_3">
<property name="geometry">
<rect>
<x>40</x>
<y>310</y>
<width>121</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>拼音转换结果:</string>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_4">
<property name="geometry">
<rect>
<x>40</x>
<y>360</y>
<width>711</width>
<height>41</height>
</rect>
</property>
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
<widget class="QCheckBox" name="checkSegBox">
<property name="geometry">
<rect>
<x>280</x>
<y>430</y>
<width>111</width>
<height>29</height>
</rect>
</property>
<property name="text">
<string>不启用分词</string>
</property>
</widget>
<widget class="QCheckBox" name="checkPolyBox_2">
<property name="geometry">
<rect>
<x>280</x>
<y>470</y>
<width>131</width>
<height>29</height>
</rect>
</property>
<property name="text">
<string>启用多音字</string>
</property>
</widget>
<widget class="QToolButton" name="toolButton">
<property name="geometry">
<rect>
<x>530</x>
<y>460</y>
<width>181</width>
<height>30</height>
</rect>
</property>
<property name="text">
<string>数据形式...</string>
</property>
<property name="popupMode">
<enum>QToolButton::MenuButtonPopup</enum>
</property>
<property name="autoRaise">
<bool>false</bool>
</property>
</widget>
<widget class="QCheckBox" name="checkExBox_3">
<property name="geometry">
<rect>
<x>280</x>
<y>510</y>
<width>181</width>
<height>29</height>
</rect>
</property>
<property name="text">
<string>无拼音数据原数据返回</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_5">
<property name="geometry">
<rect>
<x>40</x>
<y>160</y>
<width>113</width>
<height>31</height>
</rect>
</property>
<property name="text">
<string>分词结果:</string>
</property>
</widget>
<widget class="QLineEdit" name="lineEdit_6">
<property name="geometry">
<rect>
<x>40</x>
<y>220</y>
<width>711</width>
<height>41</height>
</rect>
</property>
</widget>
</widget>
<widget class="QMenuBar" name="menubar">
<property name="geometry">
<rect>
<x>0</x>
<y>0</y>
<width>800</width>
<height>28</height>
</rect>
</property>
</widget>
<widget class="QStatusBar" name="statusbar"/>
</widget>
<resources/>
<connections/>
</ui>

View File

@ -0,0 +1,26 @@
QT += core gui
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
CONFIG += c++11 link_pkgconfig
PKGCONFIG += chinese-segmentation
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
# You can also make your code fail to compile if it uses deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
HEADERS += \
mainwindow.h
SOURCES += \
main.cpp \
mainwindow.cpp
FORMS += \
mainwindow.ui

View File

@ -1,161 +0,0 @@
/*
* Friso test program.
* Of course you can turn it into a complete demo for friso.
* All threads or processes share the same friso_t,
* while different threads/processes use different friso_task_t instances.
* You can also share the friso_config_t if you wish...
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso-interface.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Only referenced from commented-out code in this file. */
#define __LENGTH__ 15
/* Size in bytes of the input line buffer declared in friso_test(). */
#define __INPUT_LENGTH__ 20480
/*
 * Prints the goodbye message and then executes `break;`.
 * Because of the embedded break, this macro is only valid directly inside
 * the loop it is supposed to leave.
 */
#define ___EXIT_INFO___ \
println("Thanks for trying friso."); \
break;
/* Prints the banner shown once after initialization. */
#define ___ABOUT___ \
println("+---------------------------------------------------------------+"); \
println("| Friso - a Chinese word segmentation writen by c. |"); \
println("| bug report email - chenxin619315@gmail.com. |"); \
println("| or: visit https://github.com/lionsoul2014/friso. |"); \
println("| java version for https://github.com/lionsoul2014/jcseg |"); \
println("| type 'quit' to exit the program. |"); \
println("+---------------------------------------------------------------+");
/*
 * Read one line from fp into __dst, without the trailing '\n'.
 *
 * __dst must provide at least __INPUT_LENGTH__ bytes (the only caller passes
 * a buffer of exactly that size). At most __INPUT_LENGTH__ - 1 characters are
 * stored; the remainder of an over-long line is consumed and discarded so
 * that the buffer can never be overrun and the next call starts on a fresh
 * line.
 *
 * Returns __dst, or NULL when EOF is hit before any character was read.
 */
static fstring getLine(FILE *fp, fstring __dst) {
    int c;
    fstring cs = __dst;
    /* one byte is reserved for the terminating '\0' */
    fstring limit = __dst + __INPUT_LENGTH__ - 1;

    while ((c = getc(fp)) != EOF) {
        if (c == '\n') break;
        /* keep draining the line, but stop storing once the buffer is full */
        if (cs < limit) {
            *cs++ = (char) c;
        }
    }
    *cs = '\0';
    return (c == EOF && cs == __dst) ? NULL : __dst;
}
/*static void printcode( fstring str ) {
int i,length;
length = strlen( str );
printf("str:length=%d\n", length );
for ( i = 0; i < length; i++ ) {
printf("%d ", str[i] );
}
putchar('\n');
}*/
//int friso_test(int argc, char **argv)
/*
 * Interactive friso demo.
 *
 * Loads friso and its configuration from the installed friso.ini, prints some
 * start-up information, then repeatedly reads a line from stdin and prints the
 * tokens produced for it together with the time spent.
 * The loop ends when the user types "quit" (case-insensitive) or when stdin
 * reaches end-of-file.
 *
 * Always returns 0, including when initialization fails.
 */
int friso_test() {
    clock_t s_time, e_time;
    char line[__INPUT_LENGTH__] = {0};
    fstring __path__ = NULL, mode = NULL;
    friso_t friso;
    friso_config_t config;
    friso_task_t task;

    //the configuration path is fixed; this entry point takes no arguments.
    __path__ = "/usr/share/ukui-search/res/friso.ini";

    s_time = clock();

    //initialize
    friso = friso_new();
    config = friso_new_config();
    if (friso_init_from_ifile(friso, config, __path__) != 1) {
        printf("fail to initialize friso and config.\n");
        goto err;
    }

    switch (config->mode) {
    case __FRISO_SIMPLE_MODE__:
        mode = "Simple";
        break;
    case __FRISO_COMPLEX_MODE__:
        mode = "Complex";
        break;
    case __FRISO_DETECT_MODE__:
        mode = "Detect";
        break;
    default:
        //never hand a NULL pointer to the "%s" conversion below
        mode = "Unknown";
        break;
    }

    e_time = clock();
    printf("Initialized in %fsec\n", (double)(e_time - s_time) / CLOCKS_PER_SEC);
    printf("Mode: %s\n", mode);
    printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK");
    ___ABOUT___;

    //set the task.
    task = friso_new_task();

    while (1) {
        print("friso>> ");

        //getLine() returns NULL once stdin is exhausted; without this check
        //the loop would keep prompting and tokenizing an empty line forever.
        if (getLine(stdin, line) == NULL) {
            ___EXIT_INFO___
        }

        //exit the program
        if (strcasecmp(line, "quit") == 0) {
            ___EXIT_INFO___
        }

        //set the task text.
        friso_set_text(task, line);
        println("分词结果:");

        s_time = clock();
        while ((config->next_token(friso, config, task)) != NULL) {
            printf(
                "%s[%d, %d, %d] ",
                task->token->word,
                task->token->offset,
                task->token->length,
                task->token->rlen
            );
        }
        e_time = clock();
        printf("\nDone, cost < %fsec\n", ((double)(e_time - s_time)) / CLOCKS_PER_SEC);
    }

    friso_free_task(task);

    //error block.
err:
    friso_free_config(config);
    friso_free(friso);
    return 0;
}

View File

@ -1,10 +0,0 @@
/*
* temporary use friso.ini, it should be removed in the future.
* MouseZhangZh
*/
#include "friso/src/friso_API.h"
#include "friso/src/friso.h"
#include "friso/src/friso_ctype.h"
//int friso_test(int argc, char **argv);
/*
 * Entry point of the interactive friso demo.
 * Declared with an explicit (void) so that C compilers treat it as a real
 * prototype and reject calls that pass arguments.
 */
int friso_test(void);

View File

@ -1,225 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==========================================================================
The following license applies to the Friso ANSI C library
--------------------------------------------------------------------------
Copyright (c) 2010 lionsoul<chenxin619315@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,68 +0,0 @@
# friso configuration file.
# do not change the name of the left key.
# @email chenxin619315@gmail.com
# @date 2012-12-20
#
# charset: only UTF8 and GBK are supported.
# set it to 0 for UTF8 or 1 for GBK
friso.charset = 0
# lexicon directory absolute path.
# the value must end with '/'
# this will tell friso how to find friso.lex.ini configuration file and all the lexicon files.
#
# if the value does not start with '/' (linux) or contains no ':' (winnt),
# friso will search for friso.lex.ini relative to friso.ini
# absolute path search:
# linux: friso.lex_dir = /c/products/friso/dict/UTF-8/
# Winnt: friso.lex_dir = D:/products/friso/dict/UTF-8/
# relative path search (All system)
friso.lex_dir = ./dict/UTF-8/
# the maximum matching length.
friso.max_len = 5
# 1 to enable chinese name recognition,
# and 0 to disable it.
friso.r_name = 1
# the maximum length for the cjk words in a
# chinese and english mixed word.
friso.mix_len = 2
# the maximum length for the chinese last name adorn.
friso.lna_len = 1
# append the synonyms words
friso.add_syn = 1
# clear the stopwords or not (1 to open it and 0 to close it)
# @date 2013-06-13
friso.clr_stw = 0
# keep the unrecognized words or not (1 to open it and 0 to close it)
# @date 2013-06-13
friso.keep_urec = 0
# use sphinx output style like 'admire|love|enjoy einsten'
# @date 2013-10-25
friso.spx_out = 0
# start the secondary segmentation for complex english token.
friso.en_sseg = 1
# min length of the secondary segmentation token. (better larger than 1)
friso.st_minl = 2
# default keep punctuations for english token.
friso.kpuncs = @%.#&+
# the threshold value for a char not a part of a chinese name.
friso.nthreshold = 2000000
# default mode for friso.
# 1 : simple mode - simple maximum matching algorithm.
# 2 : complex mode - four rules of the mmseg algorithm.
# 3 : detect mode - only return the words that exist in the lexicon
friso.mode = 2

View File

@ -1,18 +0,0 @@
INCLUDEPATH += $$PWD
HEADERS += \
$$PWD/src/friso_API.h \
$$PWD/src/friso.h \
$$PWD/src/friso_ctype.h
SOURCES += \
$$PWD/src/friso.c \
$$PWD/src/friso_lexicon.c \
$$PWD/src/friso_string.c \
$$PWD/src/friso_array.c \
$$PWD/src/friso_ctype.c \
$$PWD/src/friso_GBK.c \
$$PWD/src/friso_hash.c \
$$PWD/src/friso_link.c \
$$PWD/src/friso_UTF8.c

File diff suppressed because it is too large Load Diff

View File

@ -1,370 +0,0 @@
/*
* main interface file for friso tokenizer.
* you could modify it and re-release and free for commercial use.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#ifndef _friso_h
#define _friso_h
#include "friso_API.h"
#include <stdio.h>
/* {{{ friso main interface define :: start*/
/* Library version string; friso_version() expands to this literal. */
#define FRISO_VERSION "1.6.4"
#define friso_version() FRISO_VERSION
/*
 * Built-in default values.
 * NOTE(review): judging by the names these presumably seed the max_len,
 * mix_len, lna_len and nthreshold fields of friso_config_struct -- confirm
 * against friso.c.
 */
#define DEFAULT_SEGMENT_LENGTH 5
#define DEFAULT_MIX_LENGTH 2
#define DEFAULT_LNA_LENGTH 1
#define DEFAULT_NTHRESHOLD 1000000
/* 2 is the numeric value of __FRISO_COMPLEX_MODE__ in friso_mode_t. */
#define DEFAULT_SEGMENT_MODE 2
/*
* Type: friso_lex_t
* -----------
* Identifies the type of a lexicon (and of the words that come from it).
* The enumerators carry explicit values; 12, 13 and 14 are not assigned.
*/
typedef enum {
__LEX_CJK_WORDS__ = 0,
__LEX_CJK_UNITS__ = 1,
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
__LEX_CN_LNAME__ = 4,
__LEX_CN_SNAME__ = 5,
__LEX_CN_DNAME1__ = 6,
__LEX_CN_DNAME2__ = 7,
__LEX_CN_LNA__ = 8,
__LEX_STOPWORDS__ = 9,
__LEX_ENPUN_WORDS__ = 10,
__LEX_EN_WORDS__ = 11,
__LEX_OTHER_WORDS__ = 15,
__LEX_NCSYN_WORDS__ = 16,
__LEX_PUNC_WORDS__ = 17, //punctuations
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
} friso_lex_t;
/* A dictionary handle is a pointer to friso_hash_t. */
typedef friso_hash_t * friso_dic_t;
/*
 * NOTE(review): presumably the number of hash tables held by a dictionary
 * (one per lexicon type 0..11 of friso_lex_t) -- confirm in friso_lexicon.c.
 */
#define __FRISO_LEXICON_LENGTH__ 12
//character sets that Friso currently supports.
//the numeric values match the ones documented for friso.charset in friso.ini.
typedef enum {
FRISO_UTF8 = 0, //UTF-8
FRISO_GBK = 1 //GBK
} friso_charset_t;
/*
* Type: friso_mode_t
* ------------------
* Identifies the segmentation mode that friso uses.
* The numeric values match the ones documented for friso.mode in friso.ini.
*/
typedef enum {
__FRISO_SIMPLE_MODE__ = 1,
__FRISO_COMPLEX_MODE__ = 2,
__FRISO_DETECT_MODE__ = 3
} friso_mode_t;
/*
 * friso entry: the state shared by all segmentation tasks.
 * friso_t is a pointer to this struct.
 */
typedef struct {
friso_dic_t dic; //friso dictionary
friso_charset_t charset; //project charset.
} friso_entry;
typedef friso_entry * friso_t;
/*
* Type: lex_entry_cdt
* -------------------
* This type used to represent the lexicon entry struct.
*/
/*
 * Control-mask helpers for a lexicon entry pointer `e`.
 * The argument and the whole expansion are parenthesised so that the macros
 * behave like ordinary expressions: they accept any pointer expression
 * (e.g. `cond ? a : b`) and can be combined with other operators safely.
 */
#define _LEX_APPENSYN_MASK (1 << 0) //append synonym words.
#define lex_appensyn_open(e) ((e)->ctrlMask |= _LEX_APPENSYN_MASK)
#define lex_appensyn_close(e) ((e)->ctrlMask &= ~_LEX_APPENSYN_MASK)
#define lex_appensyn_check(e) (((e)->ctrlMask & _LEX_APPENSYN_MASK) != 0)
/* One lexicon entry; lex_entry_t is a pointer to this struct. */
typedef struct {
uchar_t length; //the length of the token.(after the conversion done by Friso.)
uchar_t rlen; //the real length of the token.(before any conversion)
/*
* the type of the lexicon item.
* valid values are the enumerators of friso_lex_t.
* if it is __LEX_OTHER_WORDS__, the entry needs to be freed after use.
*/
uchar_t type;
uchar_t ctrlMask; //function control mask, e.g. append the synonym words (see lex_appensyn_*).
uint_t offset; //offset index.
fstring word;
//fstring py; //pinyin of the word.(invalid)
friso_array_t syn; //synonym words.
friso_array_t pos; //part of speech.
uint_t fre; //single word frequency.
} lex_entry_cdt;
typedef lex_entry_cdt * lex_entry_t;
/*
 * The segmentation token entry: one token produced by the tokenizer.
 * friso_token_t is a pointer to this struct.
 */
#define __HITS_WORD_LENGTH__ 64 //size in bytes of the word buffer below.
typedef struct {
uchar_t type; //type of the word. (item of friso_lex_t)
uchar_t length; //length of the token.
uchar_t rlen; //the real length of the token.(in the original string)
char pos; //part of speech.
int offset; //start offset of the word.
char word[__HITS_WORD_LENGTH__];
//char py[0];
} friso_token_entry;
typedef friso_token_entry * friso_token_t;
/*
* Type: friso_task_entry
* This type used to represent the current segmentation content.
* like the text to split, and the current index, token buffer eg....
*/
//action control masks for friso_task_t.
//The task argument and the whole expansion are parenthesised so the macros
//behave like ordinary expressions for any pointer expression passed in.
#define _TASK_CHECK_CF_MASK (1 << 0) //Whether to check the chinese fraction.
#define _TASK_START_SS_MASK (1 << 1) //Whether to start the secondary segmentation.
#define task_ssseg_open(task) ((task)->ctrlMask |= _TASK_START_SS_MASK)
#define task_ssseg_close(task) ((task)->ctrlMask &= ~_TASK_START_SS_MASK)
#define task_ssseg_check(task) (((task)->ctrlMask & _TASK_START_SS_MASK) != 0)
/* State of one segmentation run; friso_task_t is a pointer to this struct. */
typedef struct {
fstring text; //text to tokenize
uint_t idx; //start offset index.
uint_t length; //length of the text.
uint_t bytes; //latest word bytes in C.
uint_t unicode; //latest word unicode number.
uint_t ctrlMask; //action control mask (see the _TASK_*_MASK bits).
friso_link_t pool; //task pool.
string_buffer_t sbuf; //string buffer.
friso_token_t token; //the result token.
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
} friso_task_entry;
typedef friso_task_entry * friso_task_t;
/* task configuration entry.*/
//size in bytes of the kpuncs buffer in friso_config_struct.
#define _FRISO_KEEP_PUNC_LEN 13
//non-zero when ch occurs in config->kpuncs (the "keep punctuations" string).
//Both arguments are parenthesised so that arbitrary expressions can be passed.
//Note: strchr() also matches the terminating '\0', so ch == 0 yields true.
#define friso_en_kpunc(config, ch) (strchr((config)->kpuncs, (ch)) != 0)
//typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t );
//typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t );
/*
 * Segmentation configuration. friso_config_t is a pointer to this struct.
 * Besides the tunables it carries the two function pointers used to fetch
 * the next token / the next CJK lexicon entry.
 */
struct friso_config_struct {
ushort_t max_len; //the max match length (4 - 7).
ushort_t r_name; //1 to enable chinese name recognition, 0 to disable it.
ushort_t mix_len; //the max length for the CJK words in a mix string.
ushort_t lna_len; //the max length for the chinese last name adorn.
ushort_t add_syn; //append synonyms tokenizer words.
ushort_t clr_stw; //clear the stopwords.
ushort_t keep_urec; //keep the unrecognized words.
ushort_t spx_out; //use sphinx output customize.
ushort_t en_sseg; //start the secondary segmentation.
ushort_t st_minl; //min length of the secondary segmentation token.
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
friso_mode_t mode; //Complex mode or simple mode
//pointer to the function to get the next token
friso_token_t (*next_token)(friso_t, struct friso_config_struct *, friso_task_t);
//pointer to the function to get the next cjk lex_entry_t
lex_entry_t (*next_cjk)(friso_t, struct friso_config_struct *, friso_task_t);
char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer.
};
typedef struct friso_config_struct friso_config_entry;
typedef friso_config_entry * friso_config_t;
/*
* Function: friso_new;
* Usage: friso = friso_new( void );
* --------------------------------
* This function is used to create a new, empty friso_t
* with default values.
*/
FRISO_API friso_t friso_new(void);
//initialize a friso entry and a config entry from a configuration file
//(the third argument is the path of that file, e.g. friso.ini).
//@return 1 on success and 0 on failure.
FRISO_API int friso_init_from_ifile(friso_t, friso_config_t, fstring);
/*
* Function: friso_free;
* Usage: friso_free( friso );
* --------------------------
* This function is used to free the allocation of the given friso_t.
*/
FRISO_API void friso_free(friso_t);
/*
* Function: friso_set_dic
* Usage: friso_set_dic( friso, dic );
* ----------------------------------------
* This macro sets the dictionary of the given friso entry;
* friso_dic_t is the pointer of a hash table array.
*
* The macro parameters are deliberately NOT named `dic`: a parameter with
* the same name as the struct member would also rewrite the member name, so
* that e.g. friso_set_dic(f, my_dict) expanded to `f->my_dict = my_dict`.
*/
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
#define friso_set_dic(entry_, dict_)\
do {\
    (entry_)->dic = (dict_);\
} while (0)
/*
* Function: friso_set_mode
* Usage: friso_set_mode( config, mode );
* ------------------------------------
* This function is used to set the segmentation mode (a friso_mode_t value)
* on the given configuration entry.
*/
FRISO_API void friso_set_mode(friso_config_t, friso_mode_t);
/*create a new friso configuration entry and initialize
it with the default values.*/
FRISO_API friso_config_t friso_new_config(void);
//initialize the specified friso config entry with default values.
FRISO_API void friso_init_config(friso_config_t);
//free the specified friso configuration entry.
//implemented as a macro that forwards to FRISO_FREE.
//FRISO_API void friso_free_config( friso_config_t );
#define friso_free_config(cfg) FRISO_FREE(cfg)
/*
* Function: friso_new_task;
* Usage: task = friso_new_task( void );
* ----------------------------------------
* This function is used to create a new friso segmentation task.
*/
FRISO_API friso_task_t friso_new_task(void);
/*
* Function: friso_free_task;
* Usage: friso_free_task( task );
* -------------------------------
* This function is used to free the allocation made by friso_new_task().
*/
FRISO_API void friso_free_task(friso_task_t);
//create a new friso token
FRISO_API friso_token_t friso_new_token(void);
//free the given friso token
//implemented as a macro that forwards to FRISO_FREE.
//FRISO_API void friso_free_token( friso_token_t );
#define friso_free_token(token) FRISO_FREE(token)
/*
* Function: friso_set_text
* Usage: friso_set_text( task, text );
* ------------------------------------
* This function is used to set the text that is going to be segmented.
*/
FRISO_API void friso_set_text(friso_task_t, fstring);
//get the next cjk word with mmseg simple mode
FRISO_API lex_entry_t next_simple_cjk(friso_t, friso_config_t, friso_task_t);
//get the next cjk word with mmseg complex mode(mmseg core algorithm)
FRISO_API lex_entry_t next_complex_cjk(friso_t, friso_config_t, friso_task_t);
/*
* Function: next_mmseg_token
* Usage: token = next_mmseg_token( friso, config, task );
* --------------------------------------
* This function is used to get the next word that friso segmented
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
*/
FRISO_API friso_token_t next_mmseg_token(friso_t, friso_config_t, friso_task_t);
//get the next token in __FRISO_DETECT_MODE__
FRISO_API friso_token_t next_detect_token(friso_t, friso_config_t, friso_task_t);
/* }}} friso main interface define :: end*/
/* {{{ lexicon interface define :: start*/
/*
* Function: friso_dic_new
* Usage: dic = friso_dic_new();
* -----------------------------
* This function is used to create a new dictionary (memory allocation).
*/
FRISO_API friso_dic_t friso_dic_new(void);
//NOTE(review): presumably reads one line from the FILE into the given buffer
//and returns it -- confirm against the implementation.
FRISO_API fstring file_get_line(fstring, FILE *);
/*
* Function: friso_dic_free
* Usage: friso_dic_free( dic );
* ------------------------------
* This function is used to free all the allocations of friso_dic_new.
*/
FRISO_API void friso_dic_free(friso_dic_t);
//create a new lexicon entry.
FRISO_API lex_entry_t new_lex_entry(fstring, friso_array_t, uint_t, uint_t, uint_t);
//free the given lexicon entry.
//NOTE(review): the original note spoke of a "second argument", but both
//functions take only the entry; presumably free_lex_entry_full also frees the
//allocations that the synonym items point to -- confirm in friso_lexicon.c.
FRISO_API void free_lex_entry_full(lex_entry_t);
FRISO_API void free_lex_entry(lex_entry_t);
/*
* Function: friso_dic_load
* Usage: friso_dic_load( friso, friso_lex_t, path, length );
* --------------------------------------------------
* This function is used to load dictionary from a given path.
* no length limit when length less than 0.
*/
FRISO_API void friso_dic_load(friso_t, friso_config_t,
friso_lex_t, fstring, uint_t);
/*
* load the lexicon configuration file.
* and load all the valid lexicon from the conf file.
*/
FRISO_API void friso_dic_load_from_ifile(friso_t, friso_config_t, fstring, uint_t);
/*
* Function: friso_dic_add
* Usage: friso_dic_add( dic, friso_lex_t, word, syn );
* ----------------------------------------------
* This function is used to put a new word into the dictionary.
*/
FRISO_API void friso_dic_add(friso_dic_t, friso_lex_t, fstring, friso_array_t);
/*
* Function: friso_dic_add_with_fre
* Usage: friso_dic_add_with_fre( dic, friso_lex_t, word, syn, fre );
* -------------------------------------------------------------------
* This function is used to put a new word with its frequency into the dictionary.
*/
FRISO_API void friso_dic_add_with_fre(friso_dic_t, friso_lex_t, fstring, friso_array_t, uint_t);
/*
* Function: friso_dic_match
* Usage: result = friso_dic_match( dic, friso_lex_t, word );
* ----------------------------------------------------
* This function is used to check the given word is in the dictionary or not.
*/
FRISO_API int friso_dic_match(friso_dic_t, friso_lex_t, fstring);
/*
* Function: friso_dic_get
* Usage: friso_dic_get( dic, friso_lex_t, word );
* -----------------------------------------
* This function is used to search the specified lex_entry_t.
*/
FRISO_API lex_entry_t friso_dic_get(friso_dic_t, friso_lex_t, fstring);
/*
* Function: friso_spec_dic_size
* Usage: friso_spec_dic_size( dic, friso_lex_t )
* This function is used to get the size of the dictionary with a specified type.
*/
FRISO_API uint_t friso_spec_dic_size(friso_dic_t, friso_lex_t);
FRISO_API uint_t friso_all_dic_size(friso_dic_t);
/* }}} lexicon interface define :: end*/
#endif /*end ifndef*/

View File

@ -1,412 +0,0 @@
/*
* friso ADT application interface header source file.
* 1. string buffer interface.
* 2. hashmap interface.
* 3. dynamic array interface.
* 4. double link list interface.
*
* @author chenxin <chenxin619315@gmail.com>
*/
#ifndef _friso_api_h
#define _friso_api_h
#include <stdio.h>
#include <stdlib.h>
//yat, just take it as this way, 99 percent you will find no problem
#if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) )
# define FRISO_WINNT
#else
# define FRISO_LINUX
#endif
#ifdef FRISO_WINNT
# define FRISO_API extern __declspec(dllexport)
# define __STATIC_API__ static
#else
/*platform shared library statement :: unix*/
# define FRISO_API extern
# define __STATIC_API__ static inline
#endif
/*
* Fatal allocation-failure handler: prints a message and terminates the
* process with exit status 1.
* The expansion is two complete statements (each already ends with ';'),
* so it must be used inside a braced block and without a trailing ';'.
*/
#define ___ALLOCATION_ERROR___ \
printf("Unable to do the memory allocation, program will now exit\n" ); \
exit(1);
//print a string to stdout without / with a trailing newline.
#define print(str) printf("%s", str )
#define println(str) printf("%s\n", str )
/*
* memory allocation macro definitions which make it more convenient
* to switch to your favorite or a better memory management library.
*
* FRISO_CALLOC forwards its two arguments to calloc() unchanged and in
* the same order, so the first one lands in calloc's element-count slot
* and the second one in its element-size slot, whatever the parameter
* names suggest.
*/
#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks)
#define FRISO_MALLOC(_bytes) malloc(_bytes)
#define FRISO_FREE(_ptr) free( _ptr )
typedef unsigned short ushort_t;
typedef unsigned char uchar_t;
typedef unsigned int uint_t;
typedef char * fstring;
/* {{{ fstring handle interface define::start. */
//loop bound used by print_char_binary when it walks the bits of a char,
//i.e. a bit count despite the name.
#define __CHAR_BYTES__ 8
//NOTE(review): not referenced anywhere in the visible part of this header;
//new_string_buffer() below uses __DEFAULT_ARRAY_LIST_OPACITY__ instead.
#define __BUFFER_DEFAULT_LENGTH__ 16
/*
* growable string buffer.
* (field meanings below are inferred from the names - TODO confirm
* against the string buffer implementation.)
*/
typedef struct {
fstring buffer; //the character storage
uint_t length; //presumably the number of chars currently stored
uint_t allocs; //presumably the number of chars allocated for buffer
} string_buffer_entry;
typedef string_buffer_entry * string_buffer_t;
//FRISO_API string_buffer_t new_string_buffer( void );
//create a new string buffer with the default opacity.
//NOTE(review): the expansion already ends with ';', so this macro can only
//be used as the tail of a statement/initializer, never inside a larger
//expression such as a function argument.
#define new_string_buffer() \
new_string_buffer_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
FRISO_API string_buffer_t new_string_buffer_with_opacity(uint_t);
FRISO_API string_buffer_t new_string_buffer_with_string(fstring str);
/*
* this function will copy the chars that the fstring pointed.
* to the buffer.
* this may cause the resize action of the buffer.
*/
FRISO_API void string_buffer_append(string_buffer_t, fstring);
FRISO_API void string_buffer_append_char(string_buffer_t, char);
//insert the given fstring from the specified position.
FRISO_API void string_buffer_insert(string_buffer_t, uint_t idx, fstring);
//remove the char in the specified position.
FRISO_API fstring string_buffer_remove(string_buffer_t, uint_t idx, uint_t);
/*
* turn the string_buffer to a string.
* or return the buffer of the string_buffer.
*/
FRISO_API string_buffer_t string_buffer_trim(string_buffer_t);
/*
* free the given fstring buffer.
* and this function will not free the allocations of the
* the string_buffer_t->buffer, we return it to you, if there is
* a necessary you could free it youself by calling free();
*/
FRISO_API fstring string_buffer_devote(string_buffer_t);
/*
* clear the given fstring buffer.
* reset its buffer with 0 and reset its length to 0.
*/
FRISO_API void string_buffer_clear(string_buffer_t);
//free the fstring buffer include the buffer.
FRISO_API void free_string_buffer(string_buffer_t);
/**
* fstring specified chars tokenizer functions
*
* State of one split operation; created by new_string_split() and walked
* with string_split_next().
* (field meanings below are inferred from the names - TODO confirm
* against the string split implementation.)
*
* @date 2013-06-08
*/
typedef struct {
fstring source; //the string to split
uint_t srcLen; //presumably the length of source
fstring delimiter; //the delimiter string
uint_t delLen; //presumably the length of delimiter
uint_t idx; //presumably the current scan position inside source
} string_split_entry;
typedef string_split_entry * string_split_t;
/**
* create a new string_split_entry.
*
* @param source
* @return string_split_t;
*/
FRISO_API string_split_t new_string_split(fstring, fstring);
FRISO_API void string_split_reset(string_split_t, fstring, fstring);
FRISO_API void string_split_set_source(string_split_t, fstring);
FRISO_API void string_split_set_delimiter(string_split_t, fstring);
FRISO_API void free_string_split(string_split_t);
/**
* get the next split fstring, and copy the
* splited fstring into the __dst buffer .
*
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
*/
FRISO_API fstring string_split_next(string_split_t, fstring);
/* }}} */
/* {{{ dynamic array interface define::start*/
//default opacity (initial number of item slots) of a new array list.
#define __DEFAULT_ARRAY_LIST_OPACITY__ 8
/*friso array list entry struct*/
typedef struct {
void **items; //block of item pointers; the array never owns what they point to
uint_t allocs; //number of slots allocated in items
uint_t length; //number of slots currently in use
} friso_array_entry;
typedef friso_array_entry * friso_array_t;
//create a new friso dynamic array.
//FRISO_API friso_array_t new_array_list( void );
//(implemented as a macro on top of new_array_list_with_opacity.)
#define new_array_list() new_array_list_with_opacity(__DEFAULT_ARRAY_LIST_OPACITY__)
//create a new friso dynamic array with the given opacity
FRISO_API friso_array_t new_array_list_with_opacity(uint_t);
/*
* free the given friso array and its block of item pointers,
* but never the memory that the individual items point to.
*/
FRISO_API void free_array_list(friso_array_t);
//add a new item to the array.
FRISO_API void array_list_add(friso_array_t, void *);
//insert a new item at a specifed position.
FRISO_API void array_list_insert(friso_array_t, uint_t, void *);
//get a item at a specified position.
FRISO_API void *array_list_get(friso_array_t, uint_t);
/*
* set the item at a specified position.
* this will return the old value.
*/
FRISO_API void *array_list_set(friso_array_t, uint_t, void *);
/*
* remove the given item at a specified position.
* this will return the value of the removed item.
*/
FRISO_API void *array_list_remove(friso_array_t, uint_t);
/*trim the array list for final use.*/
FRISO_API friso_array_t array_list_trim(friso_array_t);
/*
* clear the array list.
* this function will free all the allocations that the pointer pointed.
* but will not free the point array allocations,
* and will reset the length of it.
*/
FRISO_API friso_array_t array_list_clear(friso_array_t);
//return the size of the array.
//FRISO_API uint_t array_list_size( friso_array_t );
#define array_list_size( array ) array->length
//return the allocations of the array.
//FRISO_API uint_t array_list_allocs( friso_array_t );
#define array_list_allocs( array ) array->allocs
//check if the array is empty.
//FRISO_API int array_list_empty( friso_array_t );
#define array_list_empty( array ) ( array->length == 0 )
/* }}} dynamaic array interface define::end*/
/* {{{ link list interface define::start*/
//one node of the double link list.
struct friso_link_node {
void *value; //the payload pointer stored in this node
struct friso_link_node *prev; //the previous node
struct friso_link_node *next; //the next node
};
typedef struct friso_link_node link_node_entry;
typedef link_node_entry * link_node_t;
/*
* link list adt
* (head and tail look like sentinel nodes that survive link_list_clear,
* judging by that function's comment - TODO confirm against the
* implementation.)
*/
typedef struct {
link_node_t head; //first node of the list
link_node_t tail; //last node of the list
uint_t size; //value reported by link_list_size / link_list_empty
} friso_link_entry;
typedef friso_link_entry * friso_link_t;
//create a new link list
FRISO_API friso_link_t new_link_list(void);
//free the specified link list
FRISO_API void free_link_list(friso_link_t);
//return the size of the current link list.
//FRISO_API uint_t link_list_size( friso_link_t );
#define link_list_size( link ) link->size
//check the given link is empty or not.
//FRISO_API int link_list_empty( friso_link_t );
#define link_list_empty( link ) (link->size == 0)
//clear all the nodes in the link list( except the head and the tail ).
FRISO_API friso_link_t link_list_clear(friso_link_t link);
//add a new node to the link list.(append from the tail)
FRISO_API void link_list_add(friso_link_t, void *);
//add a new node before the specified node
FRISO_API void link_list_insert_before(friso_link_t, uint_t, void *);
//get the node in the current index.
FRISO_API void *link_list_get(friso_link_t, uint_t);
//modify the node in the current index.
FRISO_API void *link_list_set(friso_link_t, uint_t, void *);
//remove the specified link node
FRISO_API void *link_list_remove(friso_link_t, uint_t);
//remove the given node
FRISO_API void *link_list_remove_node(friso_link_t, link_node_t);
//remove the node from the frist.
FRISO_API void *link_list_remove_first(friso_link_t);
//remove the last node from the link list
FRISO_API void *link_list_remove_last(friso_link_t);
//append a node from the end.
FRISO_API void link_list_add_last(friso_link_t, void *);
//add a node at the begining of the link list.
FRISO_API void link_list_add_first(friso_link_t, void *);
/* }}} link list interface define::end*/
/* {{{ hashtable interface define :: start*/
//one key/value mapping of the hash table.
struct hash_entry {
fstring _key; //the node key
void * _val; //the node value
struct hash_entry * _next; //link to another entry (presumably the next one in the same bucket)
};
typedef struct hash_entry friso_hash_entry;
typedef friso_hash_entry * hash_entry_t;
//callback that receives one entry; it is the second argument of free_hash_table.
typedef void (*fhash_callback_fn_t)(hash_entry_t);
/*
* the hash table itself.
* (field meanings below are inferred from the names - TODO confirm
* against the hash implementation.)
*/
typedef struct {
uint_t length; //presumably the number of slots in table
uint_t size; //value reported by hash_get_size, presumably the number of stored mappings
float factor; //presumably the load factor
uint_t threshold; //presumably the size at which the table is rebuilt
hash_entry_t *table; //array of entry pointers
} friso_hash_cdt;
typedef friso_hash_cdt * friso_hash_t;
//default value for friso_hash_cdt
#define DEFAULT_LENGTH 31
#define DEFAULT_FACTOR 0.85f
/*
* Function: new_hash_table
* Usage: table = new_hash_table();
* --------------------------------
* this function allocates a new symbol table with no entries.
*/
FRISO_API friso_hash_t new_hash_table(void);
/*
* Function: free_hash_table
* Usage: free_hash_table( table, callback );
* --------------------------------------
* this function will free all the memory allocated for the table.
* the second argument is a fhash_callback_fn_t callback.
*/
FRISO_API void free_hash_table(friso_hash_t, fhash_callback_fn_t);
/*
* Function: hash_put_mapping
* Usage: hash_put_mapping( table, key, value );
* ----------------------------------------
* the function associates the specified key with the given value.
*/
FRISO_API void *hash_put_mapping(friso_hash_t, fstring, void *);
/*
* Function: hash_exist_mapping
* Usage: bool = hash_exist_mapping( table, key );
* ----------------------------------------------
* this function checks whether a mapping for the given key exists or not.
*/
FRISO_API int hash_exist_mapping(friso_hash_t, fstring);
/*
* Function: hash_get_value
* Usage: value = hash_get_value( table, key );
* -----------------------------------------------
* this function returns the value associated with the given key.
* UNDEFINED will be returned if the mapping does not exist.
*/
FRISO_API void * hash_get_value(friso_hash_t, fstring);
/*
* Function: hash_remove_mapping
* Usage: entry = hash_remove_mapping( table, key );
* ------------------------------------
* This function is used to remove the mapping associated with the given key.
*/
FRISO_API hash_entry_t hash_remove_mapping(friso_hash_t, fstring);
/*
* Function: get_table_size
* Usage: size = get_table_size( table );
* --------------------------------------
* This function is used to count the size of the specified table.
*/
//FRISO_API uint_t hash_get_size( friso_hash_t );
#define hash_get_size( hash ) hash->size
/* }}} hashtable interface define :: end*/
/* {{{ utf8 string interface define :: start*/
/*
* Function: get_utf8_bytes
*
* */
FRISO_API int get_utf8_bytes(char);
/*
* Function: get_utf8_unicode
*
* */
FRISO_API int get_utf8_unicode(const fstring);
/*
* Function: unicode_to_utf8
*
* */
FRISO_API int unicode_to_utf8(uint_t, fstring);
/* }}} utf8 string interface define :: end*/
#endif /*end ifndef*/

View File

@ -1,283 +0,0 @@
/**
* Friso GBK serial functions implementation source file.
* @package src/friso_GBK.c .
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso_ctype.h"
/* Read the next GBK word of the task text, starting at position *idx.
 *
 * A lead byte of 0x80 or below is taken as a single-byte word, anything
 * above as a double-byte word. The word is copied into __word (which must
 * have room for the word plus the terminating '\0'), *idx is advanced past
 * it and task->bytes is set to its width.
 * Nothing here checks that a double-byte word still fits inside
 * task->length.
 *
 * @return int the bytes of the word just read, 0 when *idx has reached
 *         the end of the text.
 */
FRISO_API int gbk_next_word(
        friso_task_t task,
        uint_t *idx,
        fstring __word) {
    int lead;

    //nothing left to read.
    if (*idx >= task->length) {
        return 0;
    }

    //decide the width of the word from its lead byte.
    lead = (uchar_t) task->text[*idx];
    task->bytes = (lead > 0x80) ? 2 : 1;

    //hand the word over to the caller's buffer and step past it.
    memcpy(__word, task->text + (*idx), task->bytes);
    (*idx) += task->bytes;
    __word[task->bytes] = '\0';

    return task->bytes;
}
//get the bytes of a gbk char.
//FRISO_API int get_gbk_bytes( char c )
//{
// return 1;
//}
/*
 * check if the given buffer starts with a GBK chinese word
 * (simplified and traditional words included).
 *
 * Only the first two bytes of str are inspected:
 *  GBK/2: lead 0xB0-0xF7, trail 0xA1-0xFE (the gb2312 chinese words).
 *  GBK/3: lead 0x81-0xA0, trail 0x40-0x7E or 0x80-0xFE.
 *  GBK/4: lead 0xAA-0xFE, trail 0x40-0x7E or 0x80-0xA0.
 *
 * The first GBK/4 trail range used to end at 0xFE, which made the second
 * range dead code and accepted both the invalid trail byte 0x7F and the
 * 0xA1-0xFE trail bytes that do not belong to GBK/4.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_cn_string(char *str) {
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    //GBK/2: gb2312 chinese word.
    return (((c1 >= 0xb0 && c1 <= 0xf7)
             && (c2 >= 0xa1 && c2 <= 0xfe))
            //GBK/3: extend chinese words.
            || ((c1 >= 0x81 && c1 <= 0xa0)
                && ((c2 >= 0x40 && c2 <= 0x7e)
                    || (c2 >= 0x80 && c2 <= 0xfe)))
            //GBK/4: extend chinese words.
            || ((c1 >= 0xaa && c1 <= 0xfe)
                && ((c2 >= 0x40 && c2 <= 0x7e)
                    || (c2 >= 0x80 && c2 <= 0xa0))));
}
/*
 * check if the given char is a printable ASCII char (32 - 126):
 * all the arabic numbers, letters and english punctuations.
 *
 * @param c
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_halfwidth_en_char(char c) {
    const int code = (uchar_t) c;

    if (code < 32) {
        return 0;
    }
    return (code <= 126);
}
/*
 * check if the first two bytes of str are a full-width latin char:
 * the full-width arabic numbers and letters,
 * but not the full-width punctuations.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_fullwidth_en_char(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    //all of them live in the 0xA3 row.
    if (lead != 0xA3) {
        return 0;
    }
    //arabic numbers.
    if (trail >= 0xB0 && trail <= 0xB9) {
        return 1;
    }
    //uppercase letters.
    if (trail >= 0xC1 && trail <= 0xDA) {
        return 1;
    }
    //lowercase letters.
    return (trail >= 0xE1 && trail <= 0xFA);
}
/*
 * check if the given char is an upper case english letter,
 * full-width and half-width letters included.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_uppercase_letter(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    return (lead <= 0x80)
           ? (lead >= 65 && lead <= 90)                         //half-width
           : (lead == 0xa3 && (trail >= 0xc1 && trail <= 0xda)); //full-width
}
/*
 * check if the given char is a lower case english letter,
 * full-width and half-width letters included.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_lowercase_letter(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    return (lead <= 0x80)
           ? (lead >= 97 && lead <= 122)                        //half-width
           : (lead == 0xa3 && (trail >= 0xe1 && trail <= 0xfa)); //full-width
}
/*
 * check if the given char is an arabic numeric,
 * full-width and half-width ones included.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_numeric_letter(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    return (lead <= 0x80)
           ? (lead >= 48 && lead <= 57)                           //half-width
           : ((lead == 0xa3) && (trail >= 0xb0 && trail <= 0xb9)); //full-width
}
/*
 * check if the given fstring is made up of numeric chars only.
 * both full-width and half-width numerics are ok, an empty string
 * counts as numeric.
 *
 * The half-width upper bound used to be tested against c2 (the trail byte
 * of the last full-width digit, initially 0) instead of c1, so letters
 * such as "12a" were accepted and half-width digits following a
 * full-width digit were rejected.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_numeric_string(char *str) {
    char *s = str;
    int c1 = 0;
    int c2 = 0;

    while (*s != '\0') {
        c1 = (uchar_t)(*s++);
        if (c1 <= 0x80) {   //half-width
            if (c1 < 48 || c1 > 57) return 0;
        } else {            //full-width
            if (c1 != 0xa3) return 0;
            //a truncated word yields the '\0' here and is rejected
            //before s is dereferenced again.
            c2 = (uchar_t)(*s++);
            if (c2 < 0xb0 || c2 > 0xb9) return 0;
        }
    }

    return 1;
}
/*
 * check if the given fstring is a decimal: numeric chars (full-width or
 * half-width) with exactly one '.' that is neither the first nor the
 * last char.
 *
 * An empty string is rejected up front; it used to read str[-1].
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_decimal_string(char *str) {
    int c1 = 0;
    int c2 = 0;
    int len = (int) strlen(str), i, p = 0;

    //an empty string is no decimal (and has no last char to look at).
    if (len == 0) return 0;

    //point header check.
    if (str[0] == '.' || str[len - 1] == '.') return 0;

    for (i = 0; i < len;) {
        c1 = (uchar_t) str[i++];
        //count the number of the points.
        if (c1 == 46) {
            p++;
            continue;
        }

        if (c1 <= 0x80) {   //half-width
            if (c1 < 48 || c1 > 57) return 0;
        } else {            //full-width
            if (c1 != 0xa3) return 0;
            c2 = (uchar_t) str[i++];
            if (c2 < 0xb0 || c2 > 0xb9) return 0;
        }
    }

    return (p == 1);
}
/*
 * check if the given char is an english (ASCII) letter, full-width or
 * half-width; punctuations and arabic numbers do not count.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_en_letter(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    if (lead > 0x80) {
        //full-width letters all live in the 0xA3 row.
        return ((lead == 0xa3)
                && ((trail >= 0xc1 && trail <= 0xda)      //uppercase
                    || (trail >= 0xe1 && trail <= 0xfa))); //lowercase
    }

    //half-width letters.
    return ((lead >= 65 && lead <= 90)       //uppercase
            || (lead >= 97 && lead <= 122)); //lowercase
}
/*
 * check the given char is a whitespace or not:
 * the half-width space (32) or the double-byte word 0xA3 0xA0.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_whitespace(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    return (lead <= 0x80)
           ? (lead == 32)
           : (lead == 0xa3 && trail == 0xa0);
}
/*
 * check if the given char is a letter number like 'ⅠⅡ':
 * lead byte 0xA2 with a trail byte in 0xA1-0xB0 or 0xF0-0xFE.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_letter_number(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    if (lead != 0xa2) {
        return 0;
    }
    return ((trail >= 0xa1 && trail <= 0xb0)      //lowercase
            || (trail >= 0xf0 && trail <= 0xfe)); //uppercase
}
/*
 * check if the given char is an "other number":
 * lead byte 0xA2 with a trail byte in 0xC5-0xEE.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_other_number(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    if (lead != 0xa2) {
        return 0;
    }
    return (trail >= 0xc5 && trail <= 0xee);
}
/*
 * check if the given char is an english punctuation: every printable
 * ASCII char that is neither a space, an arabic number nor a letter.
 *
 * @param c
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_en_punctuation(char c) {
    const int code = (uchar_t) c;

    return ((code >= 33 && code <= 47)         // ! ... /
            || (code >= 58 && code <= 64)      // : ... @
            || (code >= 91 && code <= 96)      // [ ... `
            || (code >= 123 && code <= 126));  // { ... ~
}
/*
 * check the given char is a chinese punctuation.
 * the decision is made per lead byte (row), each row having its own
 * trail byte ranges.
 *
 * @param str
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int gbk_cn_punctuation(char *str) {
    const int lead = (uchar_t) str[0];
    const int trail = (uchar_t) str[1];

    switch (lead) {
    case 0xa3:
        //full-width en punctuation.
        return ((trail >= 0xa1 && trail <= 0xaf)
                || (trail >= 0xba && trail <= 0xc0)
                || (trail >= 0xdb && trail <= 0xe0)
                || (trail >= 0xfb && trail <= 0xfe));
    case 0xa1:
        //chinese punctuation.
        return ((trail >= 0xa1 && trail <= 0xae)
                || (trail >= 0xb0 && trail <= 0xbf));
    case 0xa6:
        //A6 area special punctuations.
        return (trail >= 0xf9 && trail <= 0xfe);
    case 0xa8:
        //A8 area special punctuations: " ˊˋ˙–―‥‵℅ "
        return (trail >= 0x40 && trail <= 0x47);
    default:
        return 0;
    }
}
/* {{{
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
*/
//cause it it the same as utf-8, we use utf8's interface instead.
//@see the friso_ctype.h#gbk_keep_punctuation macro defined.
//static friso_hash_t __keep_punctuations_hash__ = NULL;
/* @Deprecated
* check the given char is an english keep punctuation.*/
//FRISO_API int gbk_keep_punctuation( char *str )
//{
// if ( __keep_punctuations_hash__ == NULL ) {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );
//}
/* }}} */
//check if the given english char is a full-width char or not.
//FRISO_API int gbk_fullwidth_char( char *str )
//{
// return 1;
//}

View File

@ -1,467 +0,0 @@
/**
* Friso utf8 serial function implementation source file.
* @package src/friso_UTF8.c .
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso_ctype.h"
/* Read the next utf-8 word of the task text, starting at position *idx.
 *
 * The width of the word comes from get_utf8_bytes() on its first byte.
 * The word is copied into __word (which must have room for the word plus
 * the terminating '\0'), *idx is advanced past it, task->bytes is set to
 * its width and task->unicode to its code point (the unicode counter was
 * moved here from version 1.6.0).
 * Nothing here checks that the whole word still fits inside task->length.
 *
 * @return int the bytes of the word just read, 0 when *idx has reached
 *         the end of the text.
 */
FRISO_API int utf8_next_word(
        friso_task_t task,
        uint_t *idx,
        fstring __word) {
    const uint_t start = *idx;

    //nothing left to read.
    if (start >= task->length) {
        return 0;
    }

    task->bytes = get_utf8_bytes(task->text[start]);

    //one memcpy instead of a byte loop (@date 2013-09-04).
    memcpy(__word, task->text + start, task->bytes);
    (*idx) += task->bytes;
    __word[task->bytes] = '\0';

    task->unicode = get_utf8_unicode(__word);

    return task->bytes;
}
/*
 * print a character in a binary style: __CHAR_BYTES__ digits,
 * most significant bit first, no trailing newline.
 *
 * @param value
 */
FRISO_API void print_char_binary(char value) {
    uint_t bit = 0;

    while (bit < __CHAR_BYTES__) {
        //always look at the top bit, then shift the next one into place.
        if ((value & 0x80) != 0x80) {
            printf("0");
        } else {
            printf("1");
        }
        value <<= 1;
        bit++;
    }
}
/*
 * get the bytes of a utf-8 char from its first byte.
 *
 * The result is the number of leading 1 bits of the byte, or 1 when the
 * top bit is clear. For a well-formed lead byte that is 1 - 6; a
 * continuation byte (10xxxxxx) also yields 1, and the bytes 0xFE / 0xFF
 * yield 7 / 8.
 *
 * @param value the first byte of the char
 * @return int
 */
FRISO_API int get_utf8_bytes(char value) {
    uint_t ones;

    //one byte ascii char.
    if ((value & 0x80) == 0) {
        return 1;
    }

    //count the leading 1 bits.
    ones = 0;
    while ((value & 0x80) != 0) {
        ones++;
        value <<= 1;
    }

    return ones;
}
/*
 * get the unicode serial of a utf-8 char.
 *
 * Only chars of 1 - 3 bytes are decoded, anything longer yields 0.
 * A byte for which get_utf8_bytes() reports 1 (ascii, but also a stray
 * continuation byte) is returned as its unsigned byte value.
 *
 * The code point is assembled with shifts on unsigned bytes. The former
 * implementation wrote the two low bytes of the result through a uchar_t
 * pointer, which only gives the right value on little-endian machines,
 * and it left-shifted negative char values.
 *
 * @param ch pointer to the first byte of the char
 * @return int.
 */
FRISO_API int get_utf8_unicode(const fstring ch) {
    const uchar_t *b = (const uchar_t *) ch;

    switch (get_utf8_bytes(*ch)) {
    case 1:
        return b[0];
    case 2:
        //110xxxxx 10xxxxxx
        return ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
    case 3:
        //1110xxxx 10xxxxxx 10xxxxxx
        return ((b[0] & 0x0F) << 12)
               | ((b[1] & 0x3F) << 6)
               | (b[2] & 0x3F);
    default:
        //ignore the ones that are larger than 3 bytes;
        return 0;
    }
}
/*
 * turn the unicode serial to a utf-8 string.
 *
 * The encoded bytes are written to __word without a terminating '\0',
 * so the buffer needs room for up to 6 bytes.
 *
 * @param u the unicode serial
 * @param __word the destination buffer
 * @return int the number of bytes written (1 - 6),
 *         0 when u is above 0x7FFFFFFF (nothing is written then).
 */
FRISO_API int unicode_to_utf8(uint_t u, fstring __word) {
    //U-00000000 - U-0000007F
    //0xxxxxxx
    if (u <= 0x0000007F) {
        __word[0] = (u & 0x7F);
        return 1;
    }

    //U-00000080 - U-000007FF
    //110xxxxx 10xxxxxx
    if (u <= 0x000007FF) {
        __word[0] = ((u >> 6) & 0x1F) | 0xC0;
        __word[1] = (u & 0x3F) | 0x80;
        return 2;
    }

    //U-00000800 - U-0000FFFF
    //1110xxxx 10xxxxxx 10xxxxxx
    if (u <= 0x0000FFFF) {
        __word[0] = ((u >> 12) & 0x0F) | 0xE0;
        __word[1] = ((u >> 6) & 0x3F) | 0x80;
        __word[2] = (u & 0x3F) | 0x80;
        return 3;
    }

    //U-00010000 - U-001FFFFF
    //11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (u <= 0x001FFFFF) {
        __word[0] = ((u >> 18) & 0x07) | 0xF0;
        __word[1] = ((u >> 12) & 0x3F) | 0x80;
        __word[2] = ((u >> 6) & 0x3F) | 0x80;
        __word[3] = (u & 0x3F) | 0x80;
        return 4;
    }

    //U-00200000 - U-03FFFFFF
    //111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (u <= 0x03FFFFFF) {
        __word[0] = ((u >> 24) & 0x03) | 0xF8;
        __word[1] = ((u >> 18) & 0x3F) | 0x80;
        __word[2] = ((u >> 12) & 0x3F) | 0x80;
        __word[3] = ((u >> 6) & 0x3F) | 0x80;
        __word[4] = (u & 0x3F) | 0x80;
        return 5;
    }

    //U-04000000 - U-7FFFFFFF
    //1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (u <= 0x7FFFFFFF) {
        __word[0] = ((u >> 30) & 0x01) | 0xFC;
        __word[1] = ((u >> 24) & 0x3F) | 0x80;
        __word[2] = ((u >> 18) & 0x3F) | 0x80;
        __word[3] = ((u >> 12) & 0x3F) | 0x80;
        __word[4] = ((u >> 6) & 0x3F) | 0x80;
        __word[5] = (u & 0x3F) | 0x80;
        return 6;
    }

    return 0;
}
/*
* check the given char is a CJK char or not.
* 2E80-2EFF CJK
* 2F00-2FDF
* 3000-303F CJK --ignore
* 31C0-31EF CJK
* 3200-32FF CJK --ignore.
* 3300-33FF CJK
* 3400-4DBF CJK A
* 4DC0-4DFF
* 4E00-9FBF CJK
* F900-FAFF CJK
* FE30-FE4F CJK
* FF00-FFEF ASCII --ignore (as basic latin)
*
* Japanese:
* 3040-309F
* 30A0-30FF
* 31F0-31FF
*
* Korean:
* AC00-D7AF
* 1100-11FF
* 3130-318F
*
* @param ch :pointer to the char
* @return int : 1 for yes and 0 for not.
*/
//Each macro below switches on the check of one language;
//comment a define out to drop the check of that language.
#define FRISO_CJK_CHK_C
//#define FRISO_CJK_CHK_J
//#define FRISO_CJK_CHK_K
/*
 * check whether the given code point falls into one of the CJK ranges
 * enabled by the FRISO_CJK_CHK_* switches.
 * The 3200-32FF and 3400-4DBF blocks are deliberately left out of the
 * chinese check.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_cjk_string(uint_t u) {
    int chinese = 0, japanese = 0, korean = 0;

    //Chinese.
#ifdef FRISO_CJK_CHK_C
    chinese = ((u >= 0x2E80 && u <= 0x2EFF)
               || (u >= 0x2F00 && u <= 0x2FDF)
               || (u >= 0x31C0 && u <= 0x31EF)
               || (u >= 0x3300 && u <= 0x33FF)
               || (u >= 0x4DC0 && u <= 0x4DFF)
               || (u >= 0x4E00 && u <= 0x9FBF)
               || (u >= 0xF900 && u <= 0xFAFF)
               || (u >= 0xFE30 && u <= 0xFE4F));
#endif

    //Japanese.
#ifdef FRISO_CJK_CHK_J
    japanese = ((u >= 0x3040 && u <= 0x309F)
                || (u >= 0x30A0 && u <= 0x30FF)
                || (u >= 0x31F0 && u <= 0x31FF));
#endif

    //Korean
#ifdef FRISO_CJK_CHK_K
    korean = ((u >= 0x1100 && u <= 0x11FF)
              || (u >= 0x3130 && u <= 0x318F)
              || (u >= 0xAC00 && u <= 0xD7AF));
#endif

    return (chinese || japanese || korean);
}
/*
 * check the given char is a printable Basic Latin char (32 - 126) or not.
 * that covers the space, the arabic numbers, the letters and the
 * english punctuations.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_halfwidth_en_char(uint_t u) {
    if (u < 32) {
        return 0;
    }
    return (u <= 126);
}
/*
 * check the given char is a full-width latin char or not:
 * the full-width arabic numbers and letters,
 * but not the full-width punctuations.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_fullwidth_en_char(uint_t u) {
    //arabic number (65296 - 65305)
    if (u >= 0xFF10 && u <= 0xFF19) {
        return 1;
    }
    //upper case letters (65313 - 65338)
    if (u >= 0xFF21 && u <= 0xFF3A) {
        return 1;
    }
    //lower case letters (65345 - 65370)
    return (u >= 0xFF41 && u <= 0xFF5A);
}
/*
 * check the given char is an upper case letter or not,
 * full-width and half-width letters included.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_uppercase_letter(uint_t u) {
    //make full-width half-width.
    const uint_t half = (u > 65280) ? (u - 65248) : u;

    return (half >= 'A' && half <= 'Z');
}
/*
 * check the given char is a lower case letter or not,
 * full-width and half-width letters included.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_lowercase_letter(uint_t u) {
    //make full-width half-width.
    const uint_t half = (u > 65280) ? (u - 65248) : u;

    return (half >= 'a' && half <= 'z');
}
/*
 * check the given char is an arabic numeric or not,
 * full-width and half-width ones included.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_numeric_letter(uint_t u) {
    //make full-width half-width.
    const uint_t half = (u > 65280) ? (u - 65248) : u;

    return (half >= '0' && half <= '9');
}
/*
 * check the given char is an english letter (full-width included);
 * punctuations do not count.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_en_letter(uint_t u) {
    //make full-width half-width.
    const uint_t half = (u > 65280) ? (u - 65248) : u;

    if (half >= 'A' && half <= 'Z') {
        return 1;
    }
    return (half >= 'a' && half <= 'z');
}
/*
* check if the given fstring is make up with numeric.
* both full-width,half-width numeric is ok.
*
* @param str
* @return int
* 65296,
* 65297,
* 65298,
* 65299,
* 65300,
* 65301,
* 65302,
* 65303,
* 65304,
* 65305,
*/
FRISO_API int utf8_numeric_string(const fstring str) {
fstring s = str;
int bytes, u;
while(*s != '\0') {
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
// return 0;
//}
//new implemention.
//@date 2013-10-14
bytes = 1;
if(*s < 0) { //full-width chars.
u = get_utf8_unicode(s);
bytes = get_utf8_bytes(*s);
if(u < 65296 || u > 65305) return 0;
} else if(*s < 48 || *s > 57) {
return 0;
}
s += bytes;
}
return 1;
}
FRISO_API int utf8_decimal_string(const fstring str) {
int len = strlen(str), i, p = 0;
int bytes = 0, u;
if(str[0] == '.' || str[len - 1] == '.') return 0;
for(i = 1; i < len; bytes = 1) {
//count the number of char '.'
if(str[i] == '.') {
i++;
p++;
continue;
} else if(str[i] < 0) {
//full-width numeric.
u = get_utf8_unicode(str + i);
bytes = get_utf8_bytes(str[i]);
if(u < 65296 || u > 65305) return 0;
} else if(str[i] < 48 || str[i] > 57) {
return 0;
}
i += bytes;
}
return (p == 1);
}
/*
 * check the given char is a whitespace or not:
 * the half-width space (32) or the full-width space (12288).
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_whitespace(uint_t u) {
    return (u == 32 || u == 12288) ? 1 : 0;
}
/*
 * check the given char is an english punctuation: every printable
 * ASCII char that is neither a space, an arabic number nor a letter.
 * full-width punctuations are not folded to half-width here.
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_en_punctuation(uint_t u) {
    return ((u >= 33 && u <= 47)         // ! ... /
            || (u >= 58 && u <= 64)      // : ... @
            || (u >= 91 && u <= 96)      // [ ... `  (added @2013-08-31)
            || (u >= 123 && u <= 126));  // { ... ~
}
/*
 * check the given char is a chinese punctuation.
 * @date 2013-08-31 added.
 *
 * that is one of the full-width punctuation ranges around the full-width
 * numbers and letters, or a cjk symbol and punctuation (added 2013-09-06,
 * from http://www.unicode.org/charts/PDF/U3000.pdf).
 *
 * @param u the unicode serial
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_cn_punctuation(uint_t u) {
    //cjk symbol and punctuation (12289 - 12319).
    if (u >= 0x3001 && u <= 0x301F) {
        return 1;
    }

    //full-width punctuations.
    return ((u >= 0xFF01 && u <= 0xFF0F)      //65281 - 65295
            || (u >= 0xFF1A && u <= 0xFF1F)   //65306 - 65311
            || (u >= 0xFF3B && u <= 0xFF40)   //65339 - 65344
            || (u >= 0xFF5B && u <= 0xFF65)); //65371 - 65381
}
/*
 * check if the given char is a letter number in unicode.
 * this is a stub for now: it reports 0 for every input.
 *
 * @param u the unicode serial (not looked at)
 * @return int always 0.
 */
FRISO_API int utf8_letter_number(uint_t u) {
    (void) u;
    return 0;
}
/*
 * check if the given char is an other number in unicode.
 * this is a stub for now: it reports 0 for every input.
 *
 * @param u the unicode serial (not looked at)
 * @return int always 0.
 */
FRISO_API int utf8_other_number(uint_t u) {
    (void) u;
    return 0;
}
//A macro define has replace this.
//FRISO_API int is_en_punctuation( char c )
//{
// return utf8_en_punctuation( (uint_t) c );
//}
/* {{{
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
*/
//static friso_hash_t __keep_punctuations_hash__ = NULL;
/* @Deprecated
* check the given char is an english keep punctuation.*/
//FRISO_API int utf8_keep_punctuation( fstring str )
//{
// if ( __keep_punctuations_hash__ == NULL )
// {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );
//}
/* }}} */
/*
* check the given english char is a full-width char or not.
*
* @param ch
* @return 1 for yes and 0 for not.
*/
//FRISO_API int utf8_fullwidth_char( uint_t u )
//{
// if ( u == 12288 )
// return 1; //full-width space
// //(32 - 126) ascii code
// return (u > 65280 && u <= 65406);
//}

View File

@ -1,209 +0,0 @@
/*
* friso dynamaic Array interface implementation defined in header file "friso_API.h".
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
/* ********************************************
* friso array list static functions block *
**********************************************/
/*
 * allocate a block of __blocks item pointers, every one of them NULL.
 * the program exits through ___ALLOCATION_ERROR___ when the allocation
 * yields NULL.
 */
__STATIC_API__ void **create_array_entries(uint_t __blocks) {
    void **entries = (void **) FRISO_CALLOC(sizeof(void *), __blocks);
    uint_t i = 0;

    if (entries == NULL) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    while (i < __blocks) {
        entries[i] = NULL;
        i++;
    }

    return entries;
}
/*
 * resize the array: move its items into a fresh block of opacity slots
 * and release the old block.
 * the opacity should not be smaller than array->length, since all the
 * array->length items are copied without a bounds check.
 *
 * @return the given array.
 */
__STATIC_API__ friso_array_t resize_array_list(
        friso_array_t array,
        uint_t opacity) {
    void **grown = create_array_entries(opacity);
    uint_t i = 0;

    //carry the items in use over to the new block.
    while (i < array->length) {
        grown[i] = array->items[i];
        i++;
    }

    FRISO_FREE(array->items);
    array->allocs = opacity;
    array->items = grown;

    return array;
}
/* ********************************************
* friso array list FRISO_API functions block *
**********************************************/
//create a new array list. (A macro define has replace this.)
//FRISO_API friso_array_t new_array_list( void ) {
// return new_array_list_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
//}
/*
 * Create an empty array list whose item block starts with `opacity` slots.
 */
FRISO_API friso_array_t new_array_list_with_opacity(uint_t opacity) {
    friso_array_t list =
        (friso_array_t) FRISO_MALLOC(sizeof(friso_array_entry));
    if(list == NULL) {
        ___ALLOCATION_ERROR___
    }
    list->length = 0;
    list->allocs = opacity;
    list->items = create_array_entries(opacity);
    return list;
}
/*
 * Release the list: its slot block and the list structure itself.
 * Whatever the stored item pointers refer to is NOT freed here.
 */
FRISO_API void free_array_list(friso_array_t array) {
    void **slots = array->items;
    FRISO_FREE(slots);
    FRISO_FREE(array);
}
/*
 * Append `value` to the end of the list, growing the slot block to
 * (2 * length + 1) slots first when it is full.
 */
FRISO_API void array_list_add(friso_array_t array, void *value) {
    uint_t end = array->length;
    if(end == array->allocs) {
        resize_array_list(array, end * 2 + 1);
    }
    array->items[end] = value;
    array->length = end + 1;
}
/*
 * Insert `value` at position `idx`, shifting the items at idx and after it
 * one slot to the right.
 *
 * idx == array->length appends; idx > array->length is silently ignored.
 *
 * @param array the list to modify
 * @param idx   position of the new item (0 .. array->length)
 * @param value the pointer to store
 */
FRISO_API void array_list_insert(
    friso_array_t array,
    uint_t idx,
    void *value) {
    uint_t t;
    if(idx > array->length) {
        return;
    }
    //check the condition to resize the array.
    if(array->length == array->allocs) {
        resize_array_list(array, array->length * 2 + 1);
    }
    /*
     * Shift from the end downwards. The index is unsigned, so the loop is
     * written as `t > idx`: a `t >= idx` test can never become false for
     * idx == 0 and `length - 1` wraps around for an empty list.
     */
    for(t = array->length; t > idx; t--) {
        array->items[t] = array->items[t - 1];
    }
    array->items[idx] = value;
    array->length++;
}
/*
 * Return the item stored at `idx`, or NULL when `idx` is out of range.
 */
FRISO_API void *array_list_get(friso_array_t array, uint_t idx) {
    return (idx < array->length) ? array->items[idx] : NULL;
}
/*
 * Replace the item at `idx` with `value`.
 * Returns the previous item, or NULL (list untouched) when `idx` is out of
 * range.
 */
FRISO_API void * array_list_set(
    friso_array_t array,
    uint_t idx,
    void * value) {
    void *previous;
    if(idx >= array->length) {
        return NULL;
    }
    previous = array->items[idx];
    array->items[idx] = value;
    return previous;
}
/*
 * Remove the item at `idx`, closing the gap by moving later items one slot
 * to the left.
 * Returns the removed item, or NULL when `idx` is out of range.
 */
FRISO_API void * array_list_remove(
    friso_array_t array, uint_t idx) {
    uint_t last;
    void *removed;
    if(idx >= array->length) {
        return NULL;
    }
    removed = array->items[idx];
    last = array->length - 1;
    for(; idx < last; idx++) {
        array->items[idx] = array->items[idx + 1];
    }
    array->items[last] = NULL;
    array->length = last;
    return removed;
}
/*
 * Shrink the slot block so that it holds exactly array->length slots.
 * Nothing happens when the block is already full.
 */
FRISO_API friso_array_t array_list_trim(friso_array_t array) {
    if(array->length >= array->allocs) {
        return array;
    }
    return resize_array_list(array, array->length);
}
/*
 * Empty the list: every used slot is reset to NULL and the length becomes 0.
 * The slot block is kept, and the memory the items pointed to is not freed.
 */
FRISO_API friso_array_t array_list_clear(friso_array_t array) {
    uint_t left = array->length;
    while(left > 0) {
        array->items[--left] = NULL;
    }
    array->length = 0;
    return array;
}
//get the size of the array list. (A macro define has replace this.)
//FRISO_API uint_t array_list_size( friso_array_t array ) {
// return array->length;
//}
//return the allocations of the array list.(A macro define has replace this)
//FRISO_API uint_t array_list_allocs( friso_array_t array ) {
// return array->allocs;
//}
//check if the array is empty.(A macro define has replace this.)
//FRISO_API int array_list_empty( friso_array_t array )
//{
// return ( array->length == 0 );
//}

View File

@ -1,244 +0,0 @@
/**
* friso string type check functions,
* like english/CJK, full-wdith/half-width, punctuation or not.
* @see friso_UTF8.c and friso_GBK.c for detail.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_ctype.h"
#include "friso_API.h"
/*
 * Check whether the current word of the task is a CN string.
 * UTF-8 tests task->unicode, GBK tests task->buffer; any other charset
 * yields 0.
 *
 * @return int (non-zero for a cn string, 0 otherwise)
 */
FRISO_API int friso_cn_string(
    friso_charset_t charset,
    friso_task_t task) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_cjk_string(task->unicode);
    case FRISO_GBK:
        return gbk_cn_string(task->buffer);
    default:
        return 0;
    }
}
/*
 * Check whether the current word of the task is a whitespace.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_whitespace(
    friso_charset_t charset,
    friso_task_t task) {
    if(charset == FRISO_UTF8) {
        return utf8_whitespace(task->unicode);
    }
    if(charset == FRISO_GBK) {
        return gbk_whitespace(task->buffer);
    }
    return 0;
}
/*
 * Check whether the char at task->text[task->idx] is a numeric letter.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_numeric_letter(
    friso_charset_t charset,
    friso_task_t task) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_numeric_letter((uint_t) task->text[task->idx]);
    case FRISO_GBK:
        return gbk_numeric_letter(task->text + task->idx);
    default:
        return 0;
    }
}
/*
 * Check whether the char at task->text[task->idx] is an english letter.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_en_letter(
    friso_charset_t charset,
    friso_task_t task) {
    if(charset == FRISO_UTF8) {
        return utf8_en_letter((uint_t) task->text[task->idx]);
    }
    if(charset == FRISO_GBK) {
        return gbk_en_letter(task->text + task->idx);
    }
    return 0;
}
/*
 * Check whether the current word of the task is a half-width english char
 * (punctuations are included).
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_halfwidth_en_char(
    friso_charset_t charset,
    friso_task_t task) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_halfwidth_en_char(task->unicode);
    case FRISO_GBK:
        return gbk_halfwidth_en_char(task->buffer[0]);
    default:
        return 0;
    }
}
/*
 * Check whether the current word of the task is a full-width english char
 * (full-width punctuations are not included).
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_fullwidth_en_char(
    friso_charset_t charset,
    friso_task_t task) {
    if(charset == FRISO_UTF8) {
        return utf8_fullwidth_en_char(task->unicode);
    }
    if(charset == FRISO_GBK) {
        return gbk_fullwidth_en_char(task->buffer);
    }
    return 0;
}
/*
 * Check whether the current word of the task is an english punctuation.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_en_punctuation(
    friso_charset_t charset,
    friso_task_t task) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_en_punctuation(task->unicode);
    case FRISO_GBK:
        return gbk_en_punctuation(task->buffer[0]);
    default:
        return 0;
    }
}
/*
 * Check whether the current word of the task is a chinese punctuation.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_cn_punctuation(
    friso_charset_t charset,
    friso_task_t task) {
    if(charset == FRISO_UTF8) {
        return utf8_cn_punctuation(task->unicode);
    }
    if(charset == FRISO_GBK) {
        return gbk_cn_punctuation(task->buffer);
    }
    return 0;
}
/*
 * Placeholder: always reports 0, neither argument is inspected.
 */
FRISO_API int friso_letter_number(
    friso_charset_t charset,
    friso_task_t task) {
    (void) charset;
    (void) task;
    return 0;
}
/*
 * Placeholder: always reports 0, neither argument is inspected.
 */
FRISO_API int friso_other_number(
    friso_charset_t charset,
    friso_task_t task) {
    (void) charset;
    (void) task;
    return 0;
}
//check if the word is a keep punctuation.
//@Deprecated
//FRISO_API int friso_keep_punctuation(
// friso_charset_t charset,
// friso_task_t task )
//{
// if ( charset == FRISO_UTF8 )
// return utf8_keep_punctuation( task->buffer );
// else if ( charset == FRISO_GBK )
// return gbk_keep_punctuation( task->buffer );
// return 0;
//}
/*
 * Check whether the single char `c` is an english punctuation.
 * Same test as friso_en_punctuation, but on a plain char instead of a task.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int is_en_punctuation(
    friso_charset_t charset, char c) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_en_punctuation((uint_t) c);
    case FRISO_GBK:
        return gbk_en_punctuation(c);
    default:
        return 0;
    }
}
/*
 * Check whether `buffer` is made up of numeric chars only.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_numeric_string(
    friso_charset_t charset,
    char *buffer) {
    if(charset == FRISO_UTF8) {
        return utf8_numeric_string(buffer);
    }
    if(charset == FRISO_GBK) {
        return gbk_numeric_string(buffer);
    }
    return 0;
}
/*
 * Check whether `buffer` is a decimal string.
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_decimal_string(
    friso_charset_t charset, char *buffer) {
    switch(charset) {
    case FRISO_UTF8:
        return utf8_decimal_string(buffer);
    case FRISO_GBK:
        return gbk_decimal_string(buffer);
    default:
        return 0;
    }
}
/*
 * Check whether the current word of the task is an english uppercase
 * letter (full-width and half-width letters are both considered).
 * Returns 0 for a charset that is neither UTF-8 nor GBK.
 */
FRISO_API int friso_uppercase_letter(
    friso_charset_t charset,
    friso_task_t task) {
    if(charset == FRISO_UTF8) {
        return utf8_uppercase_letter(task->unicode);
    }
    if(charset == FRISO_GBK) {
        return gbk_uppercase_letter(task->buffer);
    }
    return 0;
}
/*
 * Classify the current char of the task as one of the friso_enchar_t
 * constants.
 * The code point is task->unicode for UTF-8 and the first buffer byte for
 * GBK; with any other charset it stays 0 and the result is FRISO_EN_UNKNOW.
 * Only codes 32..126 are classified, everything else is FRISO_EN_UNKNOW.
 */
FRISO_API friso_enchar_t friso_enchar_type(
    friso_charset_t charset,
    friso_task_t task) {
    uint_t code = 0;
    switch(charset) {
    case FRISO_UTF8:
        code = task->unicode;
        break;
    case FRISO_GBK:
        code = (uchar_t) task->buffer[0];
        break;
    default:
        break;
    }
    if(code < 32 || code > 126) {
        return FRISO_EN_UNKNOW;
    }
    if(code == 32) {
        return FRISO_EN_WHITESPACE;
    }
    /* 48..57 are the digits 0-9 */
    if(code >= 48 && code <= 57) {
        return FRISO_EN_NUMERIC;
    }
    /* 65..90 are A-Z, 97..122 are a-z */
    if((code >= 65 && code <= 90) || (code >= 97 && code <= 122)) {
        return FRISO_EN_LETTER;
    }
    return FRISO_EN_PUNCTUATION;
}
/*
 * Classify a half-width english char as one of the friso_enchar_t
 * constants. Only codes 32..126 are classified, everything else is
 * FRISO_EN_UNKNOW.
 */
FRISO_API friso_enchar_t get_enchar_type(char ch) {
    uint_t code = (uchar_t) ch;
    if(code < 32 || code > 126) {
        return FRISO_EN_UNKNOW;
    }
    if(code == 32) {
        return FRISO_EN_WHITESPACE;
    }
    /* 48..57 are the digits 0-9 */
    if(code >= 48 && code <= 57) {
        return FRISO_EN_NUMERIC;
    }
    /* 65..90 are A-Z, 97..122 are a-z */
    if((code >= 65 && code <= 90) || (code >= 97 && code <= 122)) {
        return FRISO_EN_LETTER;
    }
    return FRISO_EN_PUNCTUATION;
}

View File

@ -1,261 +0,0 @@
/**
* Friso charset about function interface header file.
* @package src/friso_charset.h .
* Available charset for now:
* 1. UTF8 - function start with utf8
* 2. GBK - function start with gbk
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#ifndef _friso_charset_h
#define _friso_charset_h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso.h"
#include "friso_API.h"
/** {{{ wrap interface */
/*
* Charset-dispatching wrappers: each one selects the utf8_* or gbk_*
* implementation from the friso_charset_t argument.
*/
/* check if the specified string is a cn string.
*
* @return int (true for cn string or false)
* */
FRISO_API int friso_cn_string(friso_charset_t, friso_task_t);
//check if the specified word is a whitespace.
FRISO_API int friso_whitespace(friso_charset_t, friso_task_t);
//check if the specified word is a numeric letter.
FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t);
//check if the specified word is an english letter.
FRISO_API int friso_en_letter(friso_charset_t, friso_task_t);
//check if the specified word is a half-width letter.
// punctuations are included.
FRISO_API int friso_halfwidth_en_char(friso_charset_t, friso_task_t);
//check if the specified word is a full-width letter.
// full-width punctuations are not included.
FRISO_API int friso_fullwidth_en_char(friso_charset_t, friso_task_t);
//check if the specified word is an english punctuation.
FRISO_API int friso_en_punctuation(friso_charset_t, friso_task_t);
//check if the specified word is a chinese punctuation.
FRISO_API int friso_cn_punctuation(friso_charset_t, friso_task_t);
//letter-number check for the current word of the task.
FRISO_API int friso_letter_number(friso_charset_t, friso_task_t);
//other-number check for the current word of the task.
FRISO_API int friso_other_number(friso_charset_t, friso_task_t);
//check if the word is a keep punctuation.
//@Deprecated
//FRISO_API int friso_keep_punctuation( friso_charset_t, friso_task_t );
//check if the specified string is a numeric string.
FRISO_API int friso_numeric_string(friso_charset_t, char *);
//check if the specified string is a decimal string.
FRISO_API int friso_decimal_string(friso_charset_t, char *);
//check if the specified char is an english uppercase letter.
// full-width and half-width letters are both included.
FRISO_API int friso_uppercase_letter(friso_charset_t, friso_task_t);
//en char type.
//#define FRISO_EN_LETTER 0 //a-z && A-Z
//#define FRISO_EN_NUMERIC 1 //0-9
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
//#define FRISO_EN_WHITESPACE 3 //whitespace
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
typedef enum {
FRISO_EN_LETTER = 0, //A-Z, a-z
FRISO_EN_NUMERIC = 1, //0-9
FRISO_EN_PUNCTUATION = 2, //english punctuations
FRISO_EN_WHITESPACE = 3, //whitespace
FRISO_EN_UNKNOW = -1 //unknown (outside 32-126)
} friso_enchar_t;
/* get the type of the specified char.
* the type will be one of the friso_enchar_t constants defined above.
* (include the fullwidth english char.)
*/
FRISO_API friso_enchar_t friso_enchar_type(friso_charset_t, friso_task_t);
/* get the type of the specified en char.
* the type will be one of the friso_enchar_t constants defined above.
* (the char should be half-width english char only)
*/
FRISO_API friso_enchar_t get_enchar_type(char);
/* }}} */
/** {{{ UTF8 interface*/
/* read the next utf-8 word from the specified position.
*
* @return int the number of bytes of the word that was read.
*/
FRISO_API int utf8_next_word(friso_task_t, uint_t *, fstring);
//get the bytes of a utf-8 char.
FRISO_API int get_utf8_bytes(char);
//return the unicode serial number of a given string.
FRISO_API int get_utf8_unicode(const fstring);
//convert the unicode serial to a utf-8 string.
FRISO_API int unicode_to_utf8(uint_t, fstring);
//check if the given char is a CJK.
FRISO_API int utf8_cjk_string(uint_t) ;
/*check the given char is a Basic Latin letter or not.
* include all the letters and english punctuations.*/
FRISO_API int utf8_halfwidth_en_char(uint_t);
/*
* check the given char is a full-width latin or not.
* include the full-width arabic numbers and letters,
* but not the full-width punctuations.
*/
FRISO_API int utf8_fullwidth_en_char(uint_t);
//check the given char is an upper case letter or not.
// included all the full-width and half-width letters.
FRISO_API int utf8_uppercase_letter(uint_t);
//check the given char is a lower case letter or not.
// included all the full-width and half-width letters.
FRISO_API int utf8_lowercase_letter(uint_t);
//check the given char is a numeric.
// included the full-width and half-width arabic numeric.
FRISO_API int utf8_numeric_letter(uint_t);
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
*/
FRISO_API int utf8_numeric_string(char *);
//check if the given string is a decimal string.
FRISO_API int utf8_decimal_string(char *);
//check the given char is an english char.
//(full-width and half-width)
//not the punctuation of course.
FRISO_API int utf8_en_letter(uint_t);
//check the given char is a whitespace or not.
FRISO_API int utf8_whitespace(uint_t);
/* check if the given char is a letter number like 'ⅠⅡ'
*/
FRISO_API int utf8_letter_number(uint_t);
/*
* check if the given char is an "other number".
* NOTE(review): the example characters of the original comment were lost;
* presumably symbols such as circled digits - confirm against upstream.
*/
FRISO_API int utf8_other_number(uint_t);
//check if the given char is an english punctuation.
FRISO_API int utf8_en_punctuation(uint_t) ;
//check if the given char is a chinese punctuation.
FRISO_API int utf8_cn_punctuation(uint_t u);
//charset-dispatching variant of the english punctuation check for one char.
FRISO_API int is_en_punctuation(friso_charset_t, char);
//#define is_en_punctuation( c ) utf8_en_punctuation((uint_t) c)
//@Deprecated
//FRISO_API int utf8_keep_punctuation( fstring );
/* }}} */
/** {{{ GBK interface */
/* read the next GBK word from the specified position.
*
* @return int the number of bytes of the word that was read.
*/
FRISO_API int gbk_next_word(friso_task_t, uint_t *, fstring);
//get the bytes of a GBK char.
FRISO_API int get_gbk_bytes(char);
//check if the given char is a gbk char (ANSI string).
FRISO_API int gbk_cn_string(char *) ;
/*check if the given char is an ASCII letter
* include all the letters and english punctuations.*/
FRISO_API int gbk_halfwidth_en_char(char);
/*
* check if the given char is a full-width latin.
* include the full-width arabic numbers and letters,
* but not the full-width punctuations.
*/
FRISO_API int gbk_fullwidth_en_char(char *);
//check if the given char is an upper case char.
// included all the full-width and half-width letters.
FRISO_API int gbk_uppercase_letter(char *);
//check if the given char is a lower case char.
// included all the full-width and half-width letters.
FRISO_API int gbk_lowercase_letter(char *);
//check if the given char is a numeric.
// included the full-width and half-width arabic numeric.
FRISO_API int gbk_numeric_letter(char *);
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
*/
FRISO_API int gbk_numeric_string(char *);
//check if the given string is a decimal string.
FRISO_API int gbk_decimal_string(char *);
//check if the given char is an english (ASCII) char.
//(full-width and half-width)
//not the punctuation of course.
FRISO_API int gbk_en_letter(char *);
//check the specified char is a whitespace or not.
FRISO_API int gbk_whitespace(char *);
/* check if the given char is a letter number like 'ⅠⅡ'
*/
FRISO_API int gbk_letter_number(char *);
/*
* check if the given char is an "other number".
* NOTE(review): the example characters of the original comment were lost;
* presumably symbols such as circled digits - confirm against upstream.
*/
FRISO_API int gbk_other_number(char *);
//check if the given char is an english punctuation.
FRISO_API int gbk_en_punctuation(char) ;
//check the given char is a chinese punctuation.
FRISO_API int gbk_cn_punctuation(char *);
//because the logic is the same as for utf8,
// the utf8 interface is invoked directly here.
//FRISO_API int gbk_keep_punctuation( char * );
//@Deprecated
//#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str)
//check if the given english char is a full-width char or not.
//FRISO_API int gbk_fullwidth_char( char * ) ;
/* }}}*/
#endif /*end _friso_charset_h*/

View File

@ -1,285 +0,0 @@
/*
* friso hash table functions implementation defined in header file "friso_API.h".
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
#include <string.h>
//-166411799L
//31 131 1331 13331 133331 ..
//31 131 1313 13131 131313 .. the best
#define HASH_FACTOR 1313131
/* ************************
* mapping function area *
**************************/
/*
 * Map a NUL-terminated key to a bucket index in [0, length).
 * The code is accumulated as code * HASH_FACTOR + byte over every byte of
 * the key and then reduced modulo `length`.
 */
__STATIC_API__ uint_t hash(fstring str, uint_t length) {
    uint_t code = 0;
    fstring p;
    for(p = str; *p != '\0'; p++) {
        code = code * HASH_FACTOR + (*p);
    }
    return code % length;
}
/*
 * Test whether an integer is a prime.
 *
 * Trial division by the odd numbers up to the square root of n. The bound
 * is inclusive (written as j <= n / j, which also avoids overflowing
 * j * j), so squares of primes such as 9, 25 or 49 are correctly rejected.
 *
 * @param n the number to test
 * @return 1 when n is a prime, 0 otherwise (including every n < 2)
 */
__STATIC_API__ int is_prime(int n) {
    int j;
    if(n < 2) {
        return 0;
    }
    if(n == 2 || n == 3) {
        return 1;
    }
    if(n % 2 == 0) {
        return 0;
    }
    for(j = 3; j <= n / j; j += 2) {
        if(n % j == 0) {
            return 0;
        }
    }
    return 1;
}
/*
 * Return the first odd number >= n that is_prime() accepts
 * (an even n is bumped to n + 1 before the search starts).
 */
__STATIC_API__ int next_prime(int n) {
    int candidate = (n % 2 == 0) ? n + 1 : n;
    while(!is_prime(candidate)) {
        candidate += 2;
    }
    return candidate;
}
//fstring copy, return the pointer of the new string.
//static fstring string_copy( fstring _src ) {
//int bytes = strlen( _src );
//fstring _dst = ( fstring ) FRISO_MALLOC( bytes + 1 );
//register int t = 0;
//do {
//_dst[t] = _src[t];
//t++;
//} while ( _src[t] != '\0' );
//_dst[t] = '\0';
//return _dst;
//}
/* *********************************
* static hashtable function area. *
***********************************/
/*
 * Allocate a chain node holding `key`/`value` and linked in front of `next`.
 * The key pointer is stored as given; the string is not copied.
 */
__STATIC_API__ hash_entry_t new_hash_entry(
    fstring key,
    void * value,
    hash_entry_t next) {
    hash_entry_t node =
        (hash_entry_t) FRISO_MALLOC(sizeof(friso_hash_entry));
    if(node == NULL) {
        ___ALLOCATION_ERROR___
    }
    node->_next = next;
    node->_val = value;
    node->_key = key;
    return node;
}
/*
 * Allocate a bucket array of `blocks` chain heads, all set to NULL.
 */
__STATIC_API__ hash_entry_t * create_hash_entries(uint_t blocks) {
    hash_entry_t *buckets =
        (hash_entry_t *) FRISO_CALLOC(sizeof(hash_entry_t), blocks);
    uint_t i = 0;
    if(buckets == NULL) {
        ___ALLOCATION_ERROR___
    }
    while(i < blocks) {
        buckets[i++] = NULL;
    }
    return buckets;
}
/*
 * Re-hash the table into a larger bucket array.
 *
 * The new length is the next prime after (2 * length + 1). Every existing
 * node is relinked into its new bucket (nodes are reused, not copied), the
 * threshold is recomputed from the load factor and the old bucket array is
 * released.
 *
 * @param _hash the table to grow
 */
__STATIC_API__ void rebuild_hash(friso_hash_t _hash) {
    //find the next prime as the length of the hashtable.
    uint_t t, bucket;
    uint_t length = next_prime(_hash->length * 2 + 1);
    hash_entry_t e, next;
    hash_entry_t *_src = _hash->table;
    hash_entry_t *table = create_hash_entries(length);
    //relink the nodes
    for(t = 0; t < _hash->length; t++) {
        for(e = _src[t]; e != NULL; e = next) {
            next = e->_next;
            /*
             * A NULL key is legal and always lives in bucket 0, exactly as
             * put/get/exist/remove compute it; hash() itself would
             * dereference the NULL pointer.
             */
            bucket = (e->_key == NULL) ? 0 : hash(e->_key, length);
            e->_next = table[bucket];
            table[bucket] = e;
        }
    }
    _hash->table = table;
    _hash->length = length;
    _hash->threshold = (uint_t)(_hash->length * _hash->factor);
    //free the old hash_entry_t blocks allocations.
    FRISO_FREE(_src);
}
/* ********************************
* hashtable interface functions. *
* ********************************/
/*
 * Create an empty hash table with DEFAULT_LENGTH buckets and the
 * DEFAULT_FACTOR load factor; the rebuild threshold is length * factor.
 */
FRISO_API friso_hash_t new_hash_table(void) {
    friso_hash_t table =
        (friso_hash_t) FRISO_MALLOC(sizeof(friso_hash_cdt));
    if(table == NULL) {
        ___ALLOCATION_ERROR___
    }
    table->size = 0;
    table->length = DEFAULT_LENGTH;
    table->factor = DEFAULT_FACTOR;
    table->threshold = (uint_t)(table->length * table->factor);
    table->table = create_hash_entries(table->length);
    return table;
}
/*
 * Release the whole table: every chain node, the bucket array and the
 * table structure.
 * When `fentry_func` is not NULL it is called with each node right before
 * the node is freed, so the caller can release the key/value it owns.
 */
FRISO_API void free_hash_table(
    friso_hash_t _hash,
    fhash_callback_fn_t fentry_func) {
    uint_t slot;
    hash_entry_t node, following;
    for(slot = 0; slot < _hash->length; slot++) {
        node = _hash->table[slot];
        while(node != NULL) {
            following = node->_next;
            if(fentry_func != NULL) {
                fentry_func(node);
            }
            FRISO_FREE(node);
            node = following;
        }
    }
    FRISO_FREE(_hash->table);
    FRISO_FREE(_hash);
}
/*
 * Store `value` under `key`.
 *
 * When the key is already present (same pointer, or equal strings) its
 * node takes over the new key pointer and value, and the old value is
 * returned. Otherwise a node is added at the head of the bucket, the table
 * is rebuilt once size reaches the threshold, and NULL is returned.
 * A NULL key is kept in bucket 0. The value should not be NULL, since NULL
 * is also the "nothing replaced" result.
 */
FRISO_API void *hash_put_mapping(
    friso_hash_t _hash,
    fstring key,
    void * value) {
    uint_t slot = (key == NULL) ? 0 : hash(key, _hash->length);
    hash_entry_t node;
    void *previous;
    for(node = _hash->table[slot]; node != NULL; node = node->_next) {
        int same = (key == node->_key);
        if(!same && key != NULL && node->_key != NULL) {
            same = (strcmp(key, node->_key) == 0);
        }
        if(same) {
            previous = node->_val;
            node->_key = key;
            node->_val = value;
            return previous;
        }
    }
    _hash->table[slot] = new_hash_entry(key, value, _hash->table[slot]);
    _hash->size++;
    if(_hash->size >= _hash->threshold) {
        rebuild_hash(_hash);
    }
    return NULL;
}
/*
 * Report whether a mapping for `key` exists (1) or not (0).
 * Keys match when they are the same pointer or equal strings.
 */
FRISO_API int hash_exist_mapping(
    friso_hash_t _hash, fstring key) {
    uint_t slot = (key == NULL) ? 0 : hash(key, _hash->length);
    hash_entry_t node = _hash->table[slot];
    while(node != NULL) {
        if(key == node->_key) {
            return 1;
        }
        if(key != NULL && node->_key != NULL
                && strcmp(key, node->_key) == 0) {
            return 1;
        }
        node = node->_next;
    }
    return 0;
}
/*
 * Return the value stored under `key`, or NULL when there is no mapping.
 * Keys match when they are the same pointer or equal strings.
 */
FRISO_API void *hash_get_value(friso_hash_t _hash, fstring key) {
    uint_t slot = (key == NULL) ? 0 : hash(key, _hash->length);
    hash_entry_t node = _hash->table[slot];
    while(node != NULL) {
        if(key == node->_key) {
            return node->_val;
        }
        if(key != NULL && node->_key != NULL
                && strcmp(key, node->_key) == 0) {
            return node->_val;
        }
        node = node->_next;
    }
    return NULL;
}
/*
 * Unlink the mapping stored under `key` and return its node, or NULL when
 * the key is not present.
 * The node is NOT freed here; the caller receives it as-is.
 */
FRISO_API hash_entry_t hash_remove_mapping(
    friso_hash_t _hash, fstring key) {
    uint_t slot = (key == NULL) ? 0 : hash(key, _hash->length);
    hash_entry_t node = _hash->table[slot];
    hash_entry_t before = NULL;
    while(node != NULL) {
        int same = (key == node->_key);
        if(!same && key != NULL && node->_key != NULL) {
            same = (strcmp(key, node->_key) == 0);
        }
        if(same) {
            if(before != NULL) {
                before->_next = node->_next;
            } else {
                //the node is the head of its bucket
                _hash->table[slot] = node->_next;
            }
            _hash->size--;
            return node;
        }
        before = node;
        node = node->_next;
    }
    return NULL;
}
//count the size.(A macro define has replace this.)
//FRISO_API uint_t hash_get_size( friso_hash_t _hash ) {
// return _hash->size;
//}

View File

@ -1,540 +0,0 @@
/*
* friso lexicon functions implementation.
* used to deal with the friso lexicon, like: load,remove,match...
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso.h"
#define __SPLIT_MAX_TOKENS__ 5
#define __LEX_FILE_DELIME__ '#'
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
/*
 * Create a new dictionary: an array of __FRISO_LEXICON_LENGTH__ hash
 * tables, one per lexicon type.
 */
FRISO_API friso_dic_t friso_dic_new() {
    friso_dic_t tables = (friso_dic_t) FRISO_CALLOC(
                             sizeof(friso_hash_t), __FRISO_LEXICON_LENGTH__);
    uint_t i = 0;
    if(tables == NULL) {
        ___ALLOCATION_ERROR___
    }
    while(i < __FRISO_LEXICON_LENGTH__) {
        tables[i] = new_hash_table();
        i++;
    }
    return tables;
}
/**
 * Per-node callback used while the dictionary is freed.
 * Releases the lexicon entry stored as the node value: its word, every
 * synonym string together with the synonym list, and the entry itself.
 *
 * @date 2013-06-12
 */
__STATIC_API__ void default_fdic_callback(hash_entry_t e) {
    lex_entry_t entry = (lex_entry_t) e->_val;
    friso_array_t synonyms = entry->syn;
    uint_t n;
    FRISO_FREE(entry->word);
    if(synonyms != NULL) {
        for(n = 0; n < synonyms->length; n++) {
            FRISO_FREE(synonyms->items[n]);
        }
        free_array_list(synonyms);
    }
    //free the entry itself
    //@date 2014-01-28 posted by mlemay@gmail.com
    FRISO_FREE(entry);
}
/*
 * Free the dictionary: each of its __FRISO_LEXICON_LENGTH__ hash tables
 * (entries are released through default_fdic_callback) and the table array.
 */
FRISO_API void friso_dic_free(friso_dic_t dic) {
    uint_t i = 0;
    while(i < __FRISO_LEXICON_LENGTH__) {
        free_hash_table(dic[i], default_fdic_callback);
        i++;
    }
    FRISO_FREE(dic);
}
/*
 * Create a lexicon entry.
 * `word` and `syn` are stored as given (not copied). `length` and `type`
 * are narrowed to uchar_t, rlen starts equal to length, pos is NULL, the
 * control mask is 0 and the offset is -1.
 */
FRISO_API lex_entry_t new_lex_entry(
    fstring word,
    friso_array_t syn,
    uint_t fre,
    uint_t length,
    uint_t type) {
    lex_entry_t entry =
        (lex_entry_t) FRISO_MALLOC(sizeof(lex_entry_cdt));
    if(entry == NULL) {
        ___ALLOCATION_ERROR___
    }
    entry->word = word;
    entry->syn = syn;                   //synonym words array list.
    entry->pos = NULL;                  //part of speech array list.
    entry->fre = fre;
    entry->type = (uchar_t) type;
    entry->length = (uchar_t) length;
    entry->rlen = (uchar_t) length;     //same as length by default.
    entry->ctrlMask = 0;
    entry->offset = -1;
    return entry;
}
/**
 * Free the given lexicon entry together with the memory it owns:
 *  1. the word string,
 *  2. every synonym string and the synonym list (when syn is not NULL),
 *  3. the entry itself.
 * The pos list is not touched by this function.
 */
FRISO_API void free_lex_entry_full(lex_entry_t e) {
    friso_array_t synonyms = e->syn;
    uint_t n;
    FRISO_FREE(e->word);
    if(synonyms != NULL) {
        for(n = 0; n < synonyms->length; n++) {
            FRISO_FREE(synonyms->items[n]);
        }
        free_array_list(synonyms);
    }
    //free the entry itself
    //@date 2014-01-28 posted by mlemay@gmail.com
    FRISO_FREE(e);
}
/*
 * Free only the entry structure; its word, synonym list and pos list are
 * left untouched.
 */
FRISO_API void free_lex_entry(lex_entry_t e) {
    lex_entry_t entry = e;
    FRISO_FREE(entry);
}
/*
 * Add `word` (frequency 0) with its synonym list to the lexicon `lex`.
 * An entry previously stored under the same word is freed completely.
 * Nothing happens when `lex` is outside [0, __FRISO_LEXICON_LENGTH__).
 */
FRISO_API void friso_dic_add(
    friso_dic_t dic,
    friso_lex_t lex,
    fstring word,
    friso_array_t syn) {
    lex_entry_t fresh;
    void *replaced;
    if(lex < 0 || lex >= __FRISO_LEXICON_LENGTH__) {
        return;
    }
    fresh = new_lex_entry(word, syn, 0,
                          (uint_t) strlen(word), (uint_t) lex);
    replaced = hash_put_mapping(dic[lex], word, fresh);
    if(replaced != NULL) {
        free_lex_entry_full((lex_entry_t) replaced);
    }
}
/*
 * Add `word` with its synonym list and the given frequency to the lexicon
 * `lex`. An entry previously stored under the same word is freed
 * completely.
 * Nothing happens when `lex` is outside [0, __FRISO_LEXICON_LENGTH__).
 */
FRISO_API void friso_dic_add_with_fre(
    friso_dic_t dic,
    friso_lex_t lex,
    fstring word,
    friso_array_t syn,
    uint_t frequency) {
    lex_entry_t fresh;
    void *replaced;
    if(lex < 0 || lex >= __FRISO_LEXICON_LENGTH__) {
        return;
    }
    fresh = new_lex_entry(word, syn, frequency,
                          (uint_t) strlen(word), (uint_t) lex);
    replaced = hash_put_mapping(dic[lex], word, fresh);
    if(replaced != NULL) {
        free_lex_entry_full((lex_entry_t) replaced);
    }
}
/*
 * Read one line from `_stream` into `__dst`; the newline is not stored.
 * Returns `__dst`, or NULL when the end of the stream is reached before any
 * byte was read.
 * No length limit is applied: `__dst` must be large enough for the longest
 * line of the stream.
 *
 * @date 2012-11-24
 */
FRISO_API fstring file_get_line(fstring __dst, FILE * _stream) {
    fstring out = __dst;
    int ch = fgetc(_stream);
    while(ch != EOF && ch != '\n') {
        *out++ = ch;
        ch = fgetc(_stream);
    }
    *out = '\0';
    if(ch == EOF && out == __dst) {
        return NULL;
    }
    return __dst;
}
/*
 * Copy at most `blocks` bytes of `_src` into `__dst`, stopping early at the
 * end of the source, and NUL-terminate the result (used instead of memcpy).
 * `__dst` needs room for blocks + 1 bytes.
 */
__STATIC_API__ fstring string_copy(
    fstring _src,
    fstring __dst,
    uint_t blocks) {
    uint_t n = 0;
    while(n < blocks && _src[n] != '\0') {
        __dst[n] = _src[n];
        n++;
    }
    __dst[n] = '\0';
    return __dst;
}
/**
 * Duplicate the first `blocks` bytes of `_src` into a new heap allocation
 * of blocks + 1 bytes and NUL-terminate it.
 * Exactly `blocks` bytes are read (the copy does not stop at a NUL), and
 * the caller has to free the result.
 *
 * @param _src   source fstring
 * @param blocks number of bytes to copy
 */
__STATIC_API__ fstring string_copy_heap(
    fstring _src, uint_t blocks) {
    fstring copy = (fstring) FRISO_MALLOC(blocks + 1);
    uint_t n;
    if(copy == NULL) {
        ___ALLOCATION_ERROR___;
    }
    for(n = 0; n < blocks; n++) {
        copy[n] = _src[n];
    }
    copy[blocks] = '\0';
    return copy;
}
/*
 * Find the first occurrence of `delimiter` within the string.
 * Returns the address of that char inside `__str`, or NULL when it does not
 * occur (the terminating NUL itself is never matched).
 */
__STATIC_API__ fstring indexOf(fstring __str, char delimiter) {
    uint_t remaining = strlen(__str);
    fstring p = __str;
    for(; remaining > 0; remaining--, p++) {
        if(*p == delimiter) {
            return p;
        }
    }
    return NULL;
}
/**
* load all the valid words from a specified lexicon file .
*
* Stop-word files hold one word per line. Every other lexicon line is split
* on '/' into word, synonyms ("null" for none, otherwise a ','-separated
* list) and an optional frequency. Lines starting with '#' and longer than
* one char are comments.
*
* NOTE(review): file_get_line() applies no length limit, so a line longer
* than the 1024-byte line buffer overflows it; the 512-byte token buffers
* presumably depend on string_split_next() in the same way - confirm.
*
* @param friso friso instance whose dictionary receives the words
* @param config friso configuration (add_syn decides if synonyms are kept)
* @param lex the lexicon type
* @param lex_file the path of the lexicon file
* @param length the maximum length of the word item
*/
FRISO_API void friso_dic_load(
friso_t friso,
friso_config_t config,
friso_lex_t lex,
fstring lex_file,
uint_t length) {
FILE * _stream;
char __char[1024], _buffer[512];
fstring _line;
string_split_entry sse;
fstring _word;
char _sbuffer[512];
fstring _syn;
friso_array_t sywords;
uint_t _fre;
if((_stream = fopen(lex_file, "rb")) != NULL) {
while((_line = file_get_line(__char, _stream)) != NULL) {
//skip the comment lines.
//a comment has to be longer than 1 char, so that the single '#'
//mark can still be an entry of the stopwords dictionary.
if(_line[0] == '#' && strlen(_line) > 1) continue;
//handle the stopwords.
if(lex == __LEX_STOPWORDS__) {
//drop the chinese words (first byte negative as a signed char)
//whose length is greater than the max length.
if(((int)_line[0]) < 0 && strlen(_line) > length) continue;
friso_dic_add(friso->dic, __LEX_STOPWORDS__,
string_copy_heap(_line, strlen(_line)), NULL);
continue;
}
//split the fstring with '/'.
string_split_reset(&sse, "/", _line);
if(string_split_next(&sse, _buffer) == NULL) {
continue;
}
//1. get the word (heap copy, owned by the dictionary once added).
_word = string_copy_heap(_buffer, strlen(_buffer));
if(string_split_next(&sse, _buffer) == NULL) {
//normal lexicon type (no further field),
//add them to the dictionary directly.
//note: the length filter below is not applied on this path.
friso_dic_add(friso->dic, lex, _word, NULL);
continue;
}
/*
* filter out the words that its length is larger
* than the specified limit.
* but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
* and __LEX_CEM_WORDS__.
*/
if(!(lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__)
&& strlen(_word) > length) {
FRISO_FREE(_word);
continue;
}
//2. get the synonyms words.
//they are saved in _sbuffer because _buffer is reused below.
_syn = NULL;
if(strcmp(_buffer, "null") != 0) {
_syn = string_copy(_buffer, _sbuffer, strlen(_buffer));
}
//3. get the word frequency if it is available.
_fre = 0;
if(string_split_next(&sse, _buffer) != NULL) {
_fre = atoi(_buffer);
}
/**
* Here:
* split the synonyms words with mark ","
* and put them in an array list if the synonyms is not NULL.
* synonyms longer than the length limit are skipped.
*/
sywords = NULL;
if(config->add_syn && _syn != NULL) {
string_split_reset(&sse, ",", _sbuffer);
sywords = new_array_list_with_opacity(5);
while(string_split_next(&sse, _buffer) != NULL) {
if(strlen(_buffer) > length) continue;
array_list_add(sywords,
string_copy_heap(_buffer, strlen(_buffer)));
}
sywords = array_list_trim(sywords);
}
//4. add the word item
friso_dic_add_with_fre(
friso->dic, lex, _word, sywords, _fre);
}
fclose(_stream);
} else {
fprintf(stderr, "Warning: Fail to open lexicon file %s\n", lex_file);
fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
}
}
/**
 * Translate a lexicon type keyword, as written in the lexicon
 * configuration file, into its friso_lex_t value.
 *
 * @see friso.h#friso_lex_t
 * @param _key the keyword, e.g. "__LEX_CJK_WORDS__"
 * @return the matching lexicon type, or -1 for an unknown keyword
 */
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant(fstring _key) {
    static const struct {
        const char *name;
        friso_lex_t type;
    } lex_names[] = {
        {"__LEX_CJK_WORDS__",   __LEX_CJK_WORDS__},
        {"__LEX_CJK_UNITS__",   __LEX_CJK_UNITS__},
        {"__LEX_ECM_WORDS__",   __LEX_ECM_WORDS__},
        {"__LEX_CEM_WORDS__",   __LEX_CEM_WORDS__},
        {"__LEX_CN_LNAME__",    __LEX_CN_LNAME__},
        {"__LEX_CN_SNAME__",    __LEX_CN_SNAME__},
        {"__LEX_CN_DNAME1__",   __LEX_CN_DNAME1__},
        {"__LEX_CN_DNAME2__",   __LEX_CN_DNAME2__},
        {"__LEX_CN_LNA__",      __LEX_CN_LNA__},
        {"__LEX_STOPWORDS__",   __LEX_STOPWORDS__},
        {"__LEX_ENPUN_WORDS__", __LEX_ENPUN_WORDS__},
        {"__LEX_EN_WORDS__",    __LEX_EN_WORDS__}
    };
    uint_t i;
    for(i = 0; i < sizeof(lex_names) / sizeof(lex_names[0]); i++) {
        if(strcmp(_key, lex_names[i].name) == 0) {
            return lex_names[i].type;
        }
    }
    return -1;
}
/*
 * copy the first token of a configuration line into dst.
 * leading spaces/tabs are skipped, the token ends at the first space,
 * tab or `stop` character, and at most size - 1 bytes are stored so
 * dst is always NUL-terminated and never overrun.
 */
static void friso_lex_conf_token(
    const char *line,
    uint_t length,
    char stop,
    char *dst,
    uint_t size) {
    uint_t i = 0, t = 0;
    while(i < length && (line[i] == ' ' || line[i] == '\t')) i++;
    for(; i < length && t + 1 < size; i++, t++) {
        if(line[i] == ' ' || line[i] == '\t' || line[i] == stop) break;
        dst[t] = line[i];
    }
    dst[t] = '\0';
}
/*
 * load the lexicon configuration file.
 * and load all the valid lexicon from the configuration file.
 *
 * the configuration file is _path + __FRISO_LEX_IFILE__. Lines starting
 * with '#' and empty lines are ignored. A line ending with '[' opens an
 * item whose first token (up to ':') names the lexicon type; every
 * following line up to the one ending with ']' names a lexicon file
 * (first token, up to ';') that is loaded from _path.
 * Items with an unknown type keyword are skipped.
 * If the configuration file cannot be opened a warning is written to
 * stderr and nothing is loaded.
 *
 * @param friso     friso instance
 * @param config    friso_config instance
 * @param _path     dictionary directory
 * @param _limits   words length limit
 */
FRISO_API void friso_dic_load_from_ifile(
    friso_t friso,
    friso_config_t config,
    fstring _path,
    uint_t _limits) {
    FILE *__stream;
    //the key buffer is as large as the line buffer and every copy into
    //it is bounded, so long keywords/file names cannot overflow it.
    char __chars__[1024], __key__[1024], *__line__;
    uint_t __length__;
    friso_lex_t lex_t;
    string_buffer_t sb;

    //get the lexicon configuration file path
    sb = new_string_buffer();
    string_buffer_append(sb, _path);
    string_buffer_append(sb, __FRISO_LEX_IFILE__);

    if((__stream = fopen(sb->buffer, "rb")) == NULL) {
        fprintf(stderr, "Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
        fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
        free_string_buffer(sb);
        return;
    }

    //1.parse the configuration file.
    while((__line__ = file_get_line(__chars__, __stream)) != NULL) {
        //comment and empty line filter.
        if(__line__[0] == '#' || __line__[0] == '\0') continue;
        __length__ = strlen(__line__);
        //only a line ending with '[' starts an item.
        if(__line__[ __length__ - 1 ] != '[') continue;

        //get the type key and the lexicon type
        friso_lex_conf_token(__line__, __length__, ':', __key__, sizeof(__key__));
        lex_t = get_lexicon_type_with_constant(__key__);
        if(lex_t == -1) continue;

        //2.load every lexicon file listed in the item.
        while((__line__ = file_get_line(__chars__, __stream)) != NULL) {
            //comments filter.
            if(__line__[0] == '#' || __line__[0] == '\0') continue;
            __length__ = strlen(__line__);
            //item end
            if(__line__[ __length__ - 1 ] == ']') break;

            friso_lex_conf_token(__line__, __length__, ';', __key__, sizeof(__key__));
            //build the lexicon file path and load it.
            string_buffer_clear(sb);
            string_buffer_append(sb, _path);
            string_buffer_append(sb, __key__);
            friso_dic_load(friso, config, lex_t, sb->buffer, _limits);
        }
    } //end while

    fclose(__stream);
    free_string_buffer(sb);
}
/*
 * check whether the word exists in the dictionary of the given type.
 *
 * @return the result of hash_exist_mapping, or 0 when lex is out of range.
 */
FRISO_API int friso_dic_match(
    friso_dic_t dic,
    friso_lex_t lex,
    fstring word) {
    //reject an invalid lexicon type first.
    if(lex < 0 || lex >= __FRISO_LEXICON_LENGTH__) {
        return 0;
    }
    return hash_exist_mapping(dic[lex], word);
}
/*
 * fetch the lex_entry_t stored for the word in the dictionary of the
 * given type.
 *
 * @return the value found by hash_get_value, or NULL when lex is out of range.
 */
FRISO_API lex_entry_t friso_dic_get(
    friso_dic_t dic,
    friso_lex_t lex,
    fstring word) {
    //reject an invalid lexicon type first.
    if(lex < 0 || lex >= __FRISO_LEXICON_LENGTH__) {
        return NULL;
    }
    return (lex_entry_t) hash_get_value(dic[lex], word);
}
/*
 * number of entries in the dictionary of the given type.
 *
 * @return hash_get_size of that dictionary, or 0 when lex is out of range.
 */
FRISO_API uint_t friso_spec_dic_size(
    friso_dic_t dic,
    friso_lex_t lex) {
    //reject an invalid lexicon type first.
    if(lex < 0 || lex >= __FRISO_LEXICON_LENGTH__) {
        return 0;
    }
    return hash_get_size(dic[lex]);
}
/*
 * total number of entries over all __FRISO_LEXICON_LENGTH__ dictionaries.
 */
FRISO_API uint_t friso_all_dic_size(
    friso_dic_t dic) {
    uint_t total = 0;
    uint_t type = 0;
    while(type < __FRISO_LEXICON_LENGTH__) {
        total += hash_get_size(dic[type]);
        type++;
    }
    return total;
}

View File

@ -1,266 +0,0 @@
/*
 * link list functions implementation defined in header file "friso_API.h".
 * when a link_node is deleted, we free only the allocation
 * of the node itself, not the allocation of its value.
 *
 * @author lionsoul<chenxin619315@gmail.com>
 */
#include "friso_API.h"
#include <stdlib.h>
/*
 * allocate a link list node and fill in its three fields.
 * ___ALLOCATION_ERROR___ is invoked when the allocation fails.
 */
__STATIC_API__ link_node_t new_node_entry(
    void * value,
    link_node_t prev,
    link_node_t next) {
    link_node_t entry = (link_node_t) FRISO_MALLOC(sizeof(link_node_entry));
    if(entry == NULL) {
        ___ALLOCATION_ERROR___
    }
    entry->next  = next;
    entry->prev  = prev;
    entry->value = value;
    return entry;
}
/*
 * create an empty link list.
 * the list owns two value-less sentinel nodes, head and tail, that are
 * linked to each other; real nodes live between them.
 */
FRISO_API friso_link_t new_link_list(void) {
    friso_link_t list = (friso_link_t) FRISO_MALLOC(sizeof(friso_link_entry));
    if(list == NULL) {
        ___ALLOCATION_ERROR___
    }
    list->size = 0;
    //head <-> tail
    list->head = new_node_entry(NULL, NULL, NULL);
    list->tail = new_node_entry(NULL, list->head, NULL);
    list->head->next = list->tail;
    return list;
}
/*
 * release every node of the list (sentinels included) and the list
 * entry itself. The values stored in the nodes are not freed.
 */
FRISO_API void free_link_list(friso_link_t link) {
    link_node_t cur = link->head;
    while(cur != NULL) {
        link_node_t following = cur->next;
        FRISO_FREE(cur);
        cur = following;
    }
    FRISO_FREE(link);
}
/*
 * remove all the nodes between head and tail and reset the size to 0.
 * the values stored in the removed nodes are not freed.
 *
 * @return the same link list.
 */
FRISO_API friso_link_t link_list_clear(
    friso_link_t link) {
    link_node_t cur = link->head->next;
    //free all the middle nodes.
    while(cur != link->tail) {
        link_node_t following = cur->next;
        FRISO_FREE(cur);
        cur = following;
    }
    //relink the two sentinels.
    link->tail->prev = link->head;
    link->head->next = link->tail;
    link->size = 0;
    return link;
}
//get the size of the link list.
//FRISO_API uint_t link_list_size( friso_link_t link ) {
// return link->size;
//}
//check if the link list is empty
//FRISO_API int link_list_empty( friso_link_t link ) {
// return ( link->size == 0 );
//}
/*
 * find the node at a specified position.
 * positions in the first half are reached walking forward from the
 * head, the others walking backward from the tail.
 * static
 *
 * @return the node, or NULL when idx is not a valid position.
 */
__STATIC_API__ link_node_t get_node(
    friso_link_t link, uint_t idx) {
    link_node_t cur;
    uint_t hops;
    if(idx < 0 || idx >= link->size) {
        return NULL;
    }
    if(idx < link->size / 2) {
        //head is a sentinel: position idx is idx + 1 hops away.
        cur = link->head;
        for(hops = idx + 1; hops > 0; hops--) {
            cur = cur->next;
        }
    } else {
        //tail is a sentinel: position idx is size - idx hops away.
        cur = link->tail;
        for(hops = link->size - idx; hops > 0; hops--) {
            cur = cur->prev;
        }
    }
    return cur;
}
/*
* insert a node before the given node.
* static
*/
//__STATIC_API__ void insert_before(
// friso_link_t link,
// link_node_t node,
// void * value )
//{
// link_node_t e = new_node_entry( value, node->prev, node );
// e->prev->next = e;
// e->next->prev = e;
// //node->prev = e;
//
// link->size++;
//}
/*
 * insert_before(link, node, value):
 * link a new node holding `value` right in front of `node` and
 * increase link->size.
 *
 * the body is wrapped in do { } while(0) so that an invocation followed
 * by ';' is exactly one statement (safe in an unbraced if/else), every
 * argument is parenthesized and evaluated once, and the temporaries use
 * names that cannot capture a caller variable.
 */
#define insert_before( link, node, value )                          \
    do {                                                            \
        friso_link_t ib_link_ = (link);                             \
        link_node_t  ib_next_ = (node);                             \
        link_node_t  ib_new_  = new_node_entry(                     \
                (value), ib_next_->prev, ib_next_ );                \
        ib_new_->prev->next = ib_new_;                              \
        ib_new_->next->prev = ib_new_;                              \
        ib_link_->size++;                                           \
    } while(0)
/*
 * static function:
 * unlink the given node from the list and free the node.
 * the value it carried is not freed, it is handed back to the caller
 * who frees it when necessary.
 *
 * @return the value of the removed node.
 */
__STATIC_API__ void * remove_node(
    friso_link_t link, link_node_t node) {
    link_node_t before = node->prev;
    link_node_t after  = node->next;
    void * payload     = node->value;
    //bridge the two neighbours over the node.
    before->next = after;
    after->prev  = before;
    FRISO_FREE(node);
    link->size--;
    return payload;
}
/*
 * append a value to the link list: a new node is linked
 * just before the tail sentinel and the size grows by one.
 */
FRISO_API void link_list_add(
    friso_link_t link, void * value) {
    link_node_t added = new_node_entry(value, link->tail->prev, link->tail);
    added->prev->next = added;
    added->next->prev = added;
    link->size++;
}
/*
 * insert a value in front of the node at position idx.
 * nothing happens when idx is not a valid position.
 */
FRISO_API void link_list_insert_before(
    friso_link_t link, uint_t idx, void * value) {
    link_node_t target = get_node(link, idx);
    if(target == NULL) {
        return;
    }
    insert_before(link, target, value);
}
/*
 * get the value stored at the specified position.
 *
 * @return the value of the node, or NULL when idx is not a valid position.
 */
FRISO_API void * link_list_get(
    friso_link_t link, uint_t idx) {
    link_node_t found = get_node(link, idx);
    return (found == NULL) ? NULL : found->value;
}
/*
 * replace the value of the node located at the specified position.
 * the old value is not freed, it is returned to the caller
 * who frees it when necessary.
 *
 * @return the old value, or NULL when idx is not a valid position.
 */
FRISO_API void *link_list_set(
    friso_link_t link,
    uint_t idx, void * value) {
    void * previous;
    link_node_t target = get_node(link, idx);
    if(target == NULL) {
        return NULL;
    }
    previous = target->value;
    target->value = value;
    return previous;
}
/*
 * remove the node located at the specified position.
 *
 * @see remove_node
 * @return the value of the node removed,
 *         or NULL when idx is not a valid position.
 */
FRISO_API void *link_list_remove(
    friso_link_t link, uint_t idx) {
    link_node_t target = get_node(link, idx);
    return (target == NULL) ? NULL : remove_node(link, target);
}
/*
 * public wrapper of remove_node:
 * unlink and free the given node of the given link list.
 *
 * @see remove_node.
 * @return the value of the node removed.
 */
FRISO_API void *link_list_remove_node(
    friso_link_t link,
    link_node_t node) {
    void * payload = remove_node(link, node);
    return payload;
}
/*
 * remove the first node (the one right after the head sentinel).
 *
 * @return the value of the node removed, or NULL when the list is empty.
 */
FRISO_API void *link_list_remove_first(
    friso_link_t link) {
    if(!(link->size > 0)) {
        return NULL;
    }
    return remove_node(link, link->head->next);
}
/*
 * remove the last node (the one right before the tail sentinel).
 *
 * @return the value of the node removed, or NULL when the list is empty.
 */
FRISO_API void *link_list_remove_last(
    friso_link_t link) {
    if(!(link->size > 0)) {
        return NULL;
    }
    return remove_node(link, link->tail->prev);
}
/*
 * append a value at the end of the list,
 * i.e. in front of the tail sentinel.
 */
FRISO_API void link_list_add_last(
    friso_link_t link,
    void *value) {
    link_node_t sentinel = link->tail;
    insert_before(link, sentinel, value);
}
/*
 * put a value at the beginning of the list,
 * i.e. in front of the node that currently follows the head sentinel.
 */
FRISO_API void link_list_add_first(
    friso_link_t link, void *value) {
    link_node_t first = link->head->next;
    insert_before(link, first, value);
}

View File

@ -1,298 +0,0 @@
/*
* utf-8 handle functions implementation.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ******************************************
* fstring buffer functions implements. *
********************************************/
/**
 * create a new zero-filled buffer
 * @Note:
 * 1. its real length is 1 byte greater than the specified value
 * 2. we did not do any optimization for the memory allocation to ...
 *      avoid the memory defragmentation.
 *
 * @date: 2014-10-16
 */
__STATIC_API__ fstring create_buffer(uint_t length) {
    const uint_t bytes = length + 1;
    fstring fresh = (fstring) FRISO_MALLOC(bytes);
    if(fresh == NULL) {
        ___ALLOCATION_ERROR___
    }
    memset(fresh, 0x00, bytes);
    return fresh;
}
/*
 * move the content of sb into a newly created buffer of __allocs bytes
 * (plus the extra byte added by create_buffer) and free the old one.
 * the __allocs should not be smaller than sb->length: the first
 * sb->length bytes are copied without any further check.
 *
 * @return the same string buffer.
 */
__STATIC_API__ string_buffer_t resize_buffer(
    string_buffer_t sb, uint_t __allocs) {
    fstring grown = create_buffer(__allocs);
    memcpy(grown, sb->buffer, sb->length);
    FRISO_FREE(sb->buffer);
    sb->allocs = __allocs;
    sb->buffer = grown;
    return sb;
}
//create a new fstring buffer with a default opacity.
//FRISO_API string_buffer_t new_string_buffer( void )
//{
// return new_string_buffer_with_opacity( __BUFFER_DEFAULT_LENGTH__ );
//}
/*
 * create a new, empty fstring buffer able to hold `opacity` bytes.
 */
FRISO_API string_buffer_t new_string_buffer_with_opacity(uint_t opacity) {
    string_buffer_t fresh = (string_buffer_t)
                            FRISO_MALLOC(sizeof(string_buffer_entry));
    if(fresh == NULL) {
        ___ALLOCATION_ERROR___
    }
    fresh->allocs = opacity;
    fresh->length = 0;
    fresh->buffer = create_buffer(opacity);
    return fresh;
}
/*
 * create a fstring buffer initialized with a copy of the given string.
 * the buffer is allocated __BUFFER_DEFAULT_LENGTH__ bytes larger
 * than the string.
 */
FRISO_API string_buffer_t new_string_buffer_with_string(fstring str) {
    string_buffer_t fresh = (string_buffer_t)
                            FRISO_MALLOC(sizeof(string_buffer_entry));
    if(fresh == NULL) {
        ___ALLOCATION_ERROR___
    }
    fresh->length = strlen(str);
    fresh->allocs = fresh->length + __BUFFER_DEFAULT_LENGTH__;
    fresh->buffer = create_buffer(fresh->length + __BUFFER_DEFAULT_LENGTH__);
    //copy the str to the buffer.
    memcpy(fresh->buffer, str, fresh->length);
    return fresh;
}
/*
 * append the string __str at the end of the buffer.
 * when the result would not fit in sb->allocs the buffer is first
 * resized to twice the needed length plus one.
 */
FRISO_API void string_buffer_append(
    string_buffer_t sb, fstring __str) {
    uint_t extra  = strlen(__str);
    uint_t needed = sb->length + extra;
    //check the necessity to resize the buffer.
    if(needed > sb->allocs) {
        sb = resize_buffer(sb, needed * 2 + 1);
    }
    //copy the __str to the buffer.
    memcpy(sb->buffer + sb->length, __str, extra);
    sb->length = needed;
}
/*
 * append a single char at the end of the buffer.
 * when there is no room left the buffer is first resized to twice
 * its current length plus one.
 */
FRISO_API void string_buffer_append_char(
    string_buffer_t sb, char ch) {
    const uint_t wanted = sb->length + 1;
    //check the necessity to resize the buffer.
    if(wanted > sb->allocs) {
        sb = resize_buffer(sb, sb->length * 2 + 1);
    }
    sb->buffer[sb->length] = ch;
    sb->length++;
}
/*
 * NOTE(review): this function has an empty body, so calling it leaves
 * sb untouched. Judging by its name and parameters it is presumably
 * meant to insert __str into the buffer at position idx -- confirm the
 * intended behaviour and implement it (or remove it from the API).
 */
FRISO_API void string_buffer_insert(
string_buffer_t sb,
uint_t idx,
fstring __str) {
}
/*
 * remove `length` bytes from the buffer starting at idx.
 * the bytes located after idx + length are moved down to idx.
 *
 * - when idx is at or past the end of the content, or length is 0,
 *   nothing is removed.
 * - when idx + length goes past the end of the content only the bytes
 *   up to the end are removed.
 * - the bytes freed at the end of the content are reset to 0, so the
 *   buffer stays NUL-terminated at sb->length and later appends keep
 *   working on a zero-filled tail.
 *
 * @param  sb       the string buffer to modify
 * @param  idx      position of the first byte to remove
 * @param  length   number of bytes to remove
 * @return the new string (sb->buffer).
 */
FRISO_API fstring string_buffer_remove(
    string_buffer_t sb,
    uint_t idx,
    uint_t length) {
    uint_t tail;
    //nothing to remove.
    if(idx >= sb->length || length == 0) {
        return sb->buffer;
    }
    //never remove more than what follows idx.
    if(length > sb->length - idx) {
        length = sb->length - idx;
    }
    //move the bytes after the idx + length (the ranges may overlap)
    tail = sb->length - idx - length;
    memmove(sb->buffer + idx, sb->buffer + idx + length, tail);
    //wipe the vacated bytes, this also writes the terminator.
    memset(sb->buffer + idx + tail, 0x00, length);
    sb->length -= length;
    return sb->buffer;
}
/*
 * shrink the allocation of the string_buffer:
 * when sb->length is smaller than sb->allocs - 1 the buffer is
 * resized to sb->length + 1, otherwise it is left as it is.
 *
 * @return the same string buffer.
 */
FRISO_API string_buffer_t string_buffer_trim(string_buffer_t sb) {
    //already tight, nothing to do.
    if(sb->length >= sb->allocs - 1) {
        return sb;
    }
    return resize_buffer(sb, sb->length + 1);
}
/*
 * free the given string_buffer entry but keep its content alive.
 * the allocation of string_buffer_t->buffer is not freed, it is
 * returned to the caller, who frees it when necessary.
 *
 * @return the detached buffer.
 */
FRISO_API fstring string_buffer_devote(string_buffer_t sb) {
    fstring detached = sb->buffer;
    FRISO_FREE(sb);
    return detached;
}
/*
 * clear the given fstring buffer:
 * the bytes in use are reset to 0 and the length goes back to 0.
 * the allocation is kept.
 */
FRISO_API void string_buffer_clear(string_buffer_t sb) {
    const uint_t used = sb->length;
    sb->length = 0;
    memset(sb->buffer, 0x00, used);
}
/*
 * free the fstring buffer entirely:
 * first its content, then the entry itself.
 */
FRISO_API void free_string_buffer(string_buffer_t sb) {
    fstring content = sb->buffer;
    FRISO_FREE(content);
    FRISO_FREE(sb);
}
/**
 * create a new string_split_entry positioned at the start of source.
 * delimiter and source are referenced, not copied.
 *
 * @param    delimiter
 * @param    source
 * @return   string_split_t;
 */
FRISO_API string_split_t new_string_split(
    fstring delimiter,
    fstring source) {
    string_split_t split = (string_split_t)
                           FRISO_MALLOC(sizeof(string_split_entry));
    if(split == NULL) {
        ___ALLOCATION_ERROR___;
    }
    split->idx       = 0;
    split->source    = source;
    split->srcLen    = strlen(source);
    split->delimiter = delimiter;
    split->delLen    = strlen(delimiter);
    return split;
}
/*
 * reuse a string_split_entry with a new delimiter and a new source,
 * the position goes back to the start of the source.
 */
FRISO_API void string_split_reset(
    string_split_t sst,
    fstring delimiter,
    fstring source) {
    sst->idx       = 0;
    sst->source    = source;
    sst->srcLen    = strlen(source);
    sst->delimiter = delimiter;
    sst->delLen    = strlen(delimiter);
}
/*
 * change the source of a string_split_entry,
 * the position goes back to the start of the new source.
 */
FRISO_API void string_split_set_source(
    string_split_t sst, fstring source) {
    sst->idx    = 0;
    sst->srcLen = strlen(source);
    sst->source = source;
}
/*
 * change the delimiter of a string_split_entry,
 * the position goes back to the start of the source.
 */
FRISO_API void string_split_set_delimiter(
    string_split_t sst, fstring delimiter) {
    sst->idx       = 0;
    sst->delLen    = strlen(delimiter);
    sst->delimiter = delimiter;
}
/*
 * free the string_split_entry.
 * the delimiter and the source it refers to are not freed.
 */
FRISO_API void free_string_split(
    string_split_t sst) {
    FRISO_FREE(sst);
}
/**
 * get the next split fstring, and copy the
 *  split fstring into the __dst buffer .
 *
 * the bytes of the source are copied to __dst up to the next complete
 * occurrence of the delimiter (which is skipped) or up to the end of
 * the source. A delimiter only matches when all its bytes are present:
 * a truncated delimiter at the end of the source is copied as ordinary
 * text, and an empty delimiter never matches, so every call makes
 * progress.
 *
 * __dst is not bounds-checked here: it must be large enough to hold the
 * longest segment plus the terminator.
 *
 * @param    string_split_t
 * @param    __dst
 * @return   fstring (NULL if reach the end of the source
 *  or there is no more segmentation); otherwise a pointer to the
 *  '\0' written at the end of the copied segment inside __dst.
 */
FRISO_API fstring string_split_next(
    string_split_t sst, fstring __dst) {
    fstring _dst = __dst;
    //check if reach the end of the fstring
    if(sst->idx >= sst->srcLen) return NULL;
    while(sst->idx < sst->srcLen) {
        //find the complete delimiter here,
        //skip it and return the buffer .
        if(sst->delLen > 0
                && sst->delLen <= sst->srcLen - sst->idx
                && memcmp(sst->source + sst->idx,
                          sst->delimiter, sst->delLen) == 0) {
            sst->idx += sst->delLen;
            break;
        }
        //copy the char to the buffer
        *_dst++ = sst->source[sst->idx++];
    }
    *_dst = '\0';
    return _dst;
}

View File

@ -1,50 +0,0 @@
/*
 * dynamic array test program.
 *
 * @author lionsoul<chenxin619315@gmail.com>
 */
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
/*
 * walk through the array list API: add, trim, get, set, remove and
 * insert, printing the state of the list after each step.
 */
int main(int argc, char **args) {
    fstring names[] = {
        "chenmanwen", "yangqinghua",
        "chenxin", "luojiangyan", "xiaoyanzi", "bibi",
        "zhangrenfang", "yangjian",
        "liuxiao", "pankai",
        "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
        "caizaili", "panpan", "xiaolude", "yintanwen"
    };
    const int total = sizeof(names) / sizeof(names[0]);
    const int pos = 2;
    int k;
    //create a new array list.
    friso_array_t list = new_array_list();

    //fill it.
    k = 0;
    while(k < total) {
        array_list_add(list, names[k]);
        k++;
    }
    printf("length=%d, allocations=%d\n", list->length, list->allocs);

    //trim
    array_list_trim(list);
    printf("after tirm length=%d, allocations=%d\n", list->length, list->allocs);
    printf("idx=%d, value=%s\n", pos, (fstring) array_list_get(list, pos));

    //set
    printf("\nAfter set %dth item.\n", pos);
    array_list_set(list, pos, "chenxin__");
    printf("idx=%d, value=%s\n", pos, (fstring) array_list_get(list, pos));

    //remove
    printf("\nAfter remove %dth item.\n", pos);
    array_list_remove(list, pos);
    printf("length=%d, allocations=%d\n", list->length, list->allocs);
    printf("idx=%d, value=%s\n", pos, (fstring) array_list_get(list, pos));

    //insert
    printf("\nInsert a item at %dth\n", pos);
    array_list_insert(list, pos, "*chenxin*");
    printf("idx=%d, value=%s\n", pos, (fstring) array_list_get(list, pos));

    free_array_list(list);
    return 0;
}

View File

@ -1,161 +0,0 @@
/*
 * Friso test program.
 * Of course you can make it a perfect demo for friso.
 * all threads or processes share the same friso_t,
 * different threads/processes use different friso_task_t.
 * and you could share the friso_config_t if you wish...
 *
 * @author lionsoul<chenxin619315@gmail.com>
 */
#include "friso_API.h"
#include "friso.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* NOTE(review): __LENGTH__ is only referenced from commented-out code in this program. */
#define __LENGTH__ 15
/* size in bytes of the buffer that receives one line of user input. */
#define __INPUT_LENGTH__ 20480
/*
 * print the goodbye message and leave the enclosing loop.
 * the expansion ends with `break;`, so it is only usable inside a
 * loop (or switch) body.
 */
#define ___EXIT_INFO___ \
println("Thanks for trying friso."); \
break;
/* print the banner shown once the initialization is done. */
#define ___ABOUT___ \
println("+---------------------------------------------------------------+"); \
println("| Friso - a Chinese word segmentation writen by c. |"); \
println("| bug report email - chenxin619315@gmail.com. |"); \
println("| or: visit https://github.com/lionsoul2014/friso. |"); \
println("| java version for https://github.com/lionsoul2014/jcseg |"); \
println("| type 'quit' to exit the program. |"); \
println("+---------------------------------------------------------------+");
/*
 * read a line from a command line.
 *
 * characters are read from fp up to (and not including) the next '\n'
 * or the end of the stream and stored in __dst as a NUL-terminated
 * string. __dst must be at least __INPUT_LENGTH__ bytes long: at most
 * __INPUT_LENGTH__ - 1 characters are stored, the rest of a longer
 * line is read and dropped so that the buffer is never overrun.
 *
 * @param  fp       the stream to read from
 * @param  __dst    the destination buffer
 * @return __dst, or NULL when the end of the stream is reached
 *         before any character could be read.
 */
static fstring getLine(FILE *fp, fstring __dst) {
    int c;
    size_t used = 0;
    while((c = getc(fp)) != EOF && c != '\n') {
        //keep one byte for the terminator.
        if(used < __INPUT_LENGTH__ - 1) {
            __dst[used++] = (char) c;
        }
    }
    __dst[used] = '\0';
    return (c == EOF && used == 0) ? NULL : __dst;
}
/*static void printcode( fstring str ) {
int i,length;
length = strlen( str );
printf("str:length=%d\n", length );
for ( i = 0; i < length; i++ ) {
printf("%d ", str[i] );
}
putchar('\n');
}*/
/*
 * interactive friso demo.
 *
 * usage: friso -init <lexicon configuration path>
 *
 * friso and its config are initialized from the given file, then every
 * line typed on stdin is segmented and the tokens are printed with
 * their offset, length and rlen. The loop ends when the user types
 * "quit" or when stdin reaches its end.
 */
int main(int argc, char **argv) {
    clock_t s_time, e_time;
    char line[__INPUT_LENGTH__] = {0};
    int i;
    fstring __path__ = NULL, mode = NULL;
    friso_t friso;
    friso_config_t config;
    friso_task_t task;
    // get the lexicon directory from command line arguments
    // (argv[argc] is NULL, so a trailing "-init" leaves __path__ NULL)
    for(i = 0; i < argc; i++) {
        if(strcasecmp("-init", argv[i]) == 0) {
            __path__ = argv[i + 1];
        }
    }
    if(__path__ == NULL) {
        println("Usage: friso -init lexicon path");
        exit(0);
    }
    s_time = clock();
    //initialize
    friso = friso_new();
    config = friso_new_config();
    /*friso_dic_t dic = friso_dic_new();
    friso_dic_load_from_ifile( dic, __path__, __LENGTH__ );
    friso_set_dic( friso, dic );
    friso_set_mode( friso, __FRISO_COMPLEX_MODE__ );*/
    if(friso_init_from_ifile(friso, config, __path__) != 1) {
        printf("fail to initialize friso and config.\n");
        goto err;
    }
    switch(config->mode) {
    case __FRISO_SIMPLE_MODE__:
        mode = "Simple";
        break;
    case __FRISO_COMPLEX_MODE__:
        mode = "Complex";
        break;
    case __FRISO_DETECT_MODE__:
        mode = "Detect";
        break;
    default:
        //never hand a NULL pointer to the "%s" below.
        mode = "Unknown";
        break;
    }
    //friso_set_mode( config, __FRISO_DETECT_MODE__ );
    //printf("clr_stw=%d\n", friso->clr_stw);
    //printf("match c++?%d\n", friso_dic_match( friso->dic, __LEX_ENPUN_WORDS__, "c++" ));
    //printf("match(研究)?%d\n", friso_dic_match( friso->dic, __LEX_CJK_WORDS__, "研究"));
    e_time = clock();
    printf("Initialized in %fsec\n", (double)(e_time - s_time) / CLOCKS_PER_SEC);
    printf("Mode: %s\n", mode);
    printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK");
    ___ABOUT___;
    //set the task.
    task = friso_new_task();
    while(1) {
        print("friso>> ");
        //end of input (EOF): leave instead of prompting forever.
        if(getLine(stdin, line) == NULL) {
            break;
        }
        //exit the programe
        if(strcasecmp(line, "quit") == 0) {
            ___EXIT_INFO___
        }
        //for ( i = 0; i < 1000000; i++ ) {
        //set the task text.
        friso_set_text(task, line);
        println("分词结果:");
        s_time = clock();
        while((config->next_token(friso, config, task)) != NULL) {
            printf(
                "%s[%d, %d, %d] ",
                task->token->word,
                task->token->offset,
                task->token->length,
                task->token->rlen
            );
            // printf("%s ", task->token->word);
        }
        //}
        e_time = clock();
        printf("\nDone, cost < %fsec\n", ((double)(e_time - s_time)) / CLOCKS_PER_SEC);
    }
    friso_free_task(task);
    //error block.
err:
    friso_free_config(config);
    friso_free(friso);
    return 0;
}

View File

@ -1,65 +0,0 @@
/**
* hashmap testing program
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
/*
 * print the length, size, factor and threshold fields
 * of the hash table on one line.
 */
void print_hash_info(friso_hash_t _hash) {
    printf(
        "info:length=%d, size=%d, facotr=%f, threshold=%d\n",
        _hash->length,
        _hash->size,
        _hash->factor,
        _hash->threshold
    );
}
/*
 * fill a hash table with 30 names (each name is both key and value),
 * then remove them one by one, printing whether each mapping exists
 * before and after its removal. The program waits for a key press
 * between the two phases and before the final report.
 */
int main(int argc, char **argv) {
    char *names[] = {
        "陈满文", "阳清华",
        "陈鑫", "罗江艳",
        "小燕子", "比比",
        "张仁芳", "阳建",
        "陈配", "李恒",
        "张志刚", "张怡少",
        "阳江波", "蔡再利",
        "阳绘章", "尹唐文",
        "谭志鹏", "肖路德",
        "潘凯", "刘潇",
        "马朝辉", "张强",
        "殷美林", "元明清",
        "周安", "郭桥安",
        "刘敏", "黄广华",
        "李胜", "黄海清"
    };
    //30 names in the table above.
    const int total = sizeof(names) / sizeof(names[0]);
    int k;
    friso_hash_t table = new_hash_table();

    print_hash_info(table);
    //put mappings
    k = 0;
    while(k < total) {
        hash_put_mapping(table, names[k], names[k]);
        k++;
    }
    print_hash_info(table);
    printf("Press any key to continue.");
    getchar();

    //remove mappings
    k = 0;
    while(k < total) {
        char *name = names[k];
        printf("Exist %s?%2d\n", name, hash_exist_mapping(table, name));
        printf("Now, remove %s\n", name);
        hash_remove_mapping(table, name);
        printf("Exist %s?%2d\n", name, hash_exist_mapping(table, name));
        printf("*********************************\n");
        k++;
    }
    printf("Press any key to continue.");
    getchar();
    print_hash_info(table);

    //free the table
    free_hash_table(table, 0);
    return 0;
}

View File

@ -1,108 +0,0 @@
/*
* lex functions test program.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso.h"
#include <stdio.h>
#include <time.h>
#include <string.h>
/*
 * used in main() both as the word length limit handed to
 * friso_dic_load and as the size of the input buffer.
 */
#define __LENGTH__ 15
/* print the menu of the commands understood by the prompt. */
#define ___PRINT_HELP_INFO___ \
printf("1. help print the current menu.\n"); \
printf("2. #set set the classify of the dictionary.\n"); \
printf("3. other search the words in the dictionary.\n"); \
printf("4. quit exit the programe.\n");
/*
 * interactive dictionary lookup.
 *
 * the UTF-8 lexicons under ../vendors/dict/UTF-8/ are loaded, then a
 * prompt is shown where:
 *   help   prints the menu
 *   #set   reads the lexicon type (an integer) used by the lookups
 *   quit   leaves the program
 *   other  any other word is searched in the current lexicon
 * the loop also ends when stdin reaches its end.
 */
int main(int argc, char **argv) {
    lex_entry_t e;
    int lex = __LEX_CJK_WORDS__;
    //the input buffer is independent from the word length limit and
    //the scanf below is bounded to it ("%255s"), so a long word can
    //no longer overrun it.
    char _line[256];
    int got;
    clock_t s_time, e_time;
    friso_t friso;
    friso_config_t config;
    s_time = clock();
    friso = friso_new();
    config = friso_new_config();
    config->add_syn = 0;
    friso->dic = friso_dic_new();
    //__CJK_WORDS__
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-main.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-admin.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-chars.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-mz.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-place.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-company.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-festival.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-flname.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-food.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-lang.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-nation.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-net.lex", __LENGTH__);
    friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-org.lex", __LENGTH__);
    //__CJK_UNITS__
    friso_dic_load(friso, config, __LEX_CJK_UNITS__, "../vendors/dict/UTF-8/lex-units.lex", __LENGTH__);
    //__MIX_WORDS__
    friso_dic_load(friso, config, __LEX_ECM_WORDS__, "../vendors/dict/UTF-8/lex-ecmixed.lex", __LENGTH__);
    //__CN_LNAME__
    friso_dic_load(friso, config, __LEX_CN_LNAME__, "../vendors/dict/UTF-8/lex-lname.lex", __LENGTH__);
    //__CN_SNAME__
    friso_dic_load(friso, config, __LEX_CN_SNAME__, "../vendors/dict/UTF-8/lex-sname.lex", __LENGTH__);
    //__CN_DNAME1__
    friso_dic_load(friso, config, __LEX_CN_DNAME1__, "../vendors/dict/UTF-8/lex-dname-1.lex", __LENGTH__);
    //__CN_DNAME2__
    friso_dic_load(friso, config, __LEX_CN_DNAME2__, "../vendors/dict/UTF-8/lex-dname-2.lex", __LENGTH__);
    //__CN_LNA__
    friso_dic_load(friso, config, __LEX_CN_LNA__, "../vendors/dict/UTF-8/lex-ln-adorn.lex", __LENGTH__);
    e_time = clock();
    printf(
        "Done, cost: %f sec, size=%d\n",
        (double)(e_time - s_time) / CLOCKS_PER_SEC,
        friso_all_dic_size(friso->dic)
    );
    while(1) {
        printf("friso-%d>> ", lex);
        got = scanf("%255s", _line);
        //end of input: leave instead of looping on "Invalid input".
        if(got == EOF) {
            break;
        }
        if(got != 1) {
            printf("Invalid input\n");
            continue;
        }
        if(strcmp(_line, "quit") == 0) {
            break;
        } else if(strcmp(_line, "help") == 0) {
            ___PRINT_HELP_INFO___
        } else if(strcmp(_line, "#set") == 0) {
            printf("lex_t>> ");
            if(scanf("%d", &lex) != 1) {
                printf("Warning: Invalid lex type input\n");
                continue;
            }
        } else {
            s_time = clock();
            e = friso_dic_get(friso->dic, lex, _line);
            e_time = clock();
            if(e != NULL) {
                printf(
                    "word=%s, syn=%s, fre=%d, cost:%fsec\n",
                    e->word, e->syn == NULL ? "NULL" : (char *)e->syn->items[0],
                    e->fre,
                    (double)(e_time - s_time) / CLOCKS_PER_SEC
                );
            } else {
                printf("%s was not found.\n", _line);
            }
        }
    }
    // friso_dic_free( friso->dic );
    //release the config too, it is not owned by the friso instance here.
    friso_free_config(config);
    friso_free(friso);
    return 0;
}

Some files were not shown because too many files have changed in this diff Show More