Upload a new chinese segmentation lib --friso

This commit is contained in:
MouseZhangZh 2021-02-19 10:46:53 +08:00
parent 9acf580c0c
commit 361df062be
83 changed files with 381805 additions and 5 deletions

163
libfriso/friso-interface.c Normal file
View File

@ -0,0 +1,163 @@
/*
* Friso test program.
* Of course, you can also turn it into a full demo for friso.
* All threads or processes share the same friso_t,
* while different threads/processes use different friso_task_t instances.
* You can also share the friso_config_t if you wish.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso-interface.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <time.h>
/* Only mentioned by the commented-out dictionary loading snippet in friso_test(). */
#define __LENGTH__ 15
/* Capacity, in bytes, of the interactive input line buffer. */
#define __INPUT_LENGTH__ 20480
/*
 * Print the farewell message and leave the enclosing loop.
 * NOTE: this expands to two statements ending in a bare `break;`, so it must
 * be placed inside a braced block that sits directly in the loop to exit.
 */
#define ___EXIT_INFO___ \
    println("Thanks for trying friso."); \
    break;
/*
 * Print the program banner.
 * Wrapped in do { ... } while (0) so that the whole banner behaves as one
 * statement (for instance under an unbraced `if`); invoke it as `___ABOUT___;`.
 */
#define ___ABOUT___ \
    do { \
        println("+---------------------------------------------------------------+"); \
        println("| Friso - a Chinese word segmentation writen by c. |"); \
        println("| bug report email - chenxin619315@gmail.com. |"); \
        println("| or: visit https://github.com/lionsoul2014/friso. |"); \
        println("| java version for https://github.com/lionsoul2014/jcseg |"); \
        println("| type 'quit' to exit the program. |"); \
        println("+---------------------------------------------------------------+"); \
    } while (0)
/* Fallback so this helper stays self-contained; normally defined at the top of the file. */
#ifndef __INPUT_LENGTH__
#define __INPUT_LENGTH__ 20480
#endif
/*
 * Read one line from fp into __dst, without the trailing '\n'.
 *
 * __dst must provide room for at least __INPUT_LENGTH__ bytes. At most
 * __INPUT_LENGTH__ - 1 characters are stored, followed by a terminating '\0';
 * the rest of an over-long line is read and discarded, so the next call
 * starts at the beginning of the following line.
 *
 * Returns __dst, or NULL when getc() reports EOF before any character of the
 * line has been read.
 */
static fstring getLine( FILE *fp, fstring __dst )
{
    size_t stored = 0;
    int c;

    while ( ( c = getc( fp ) ) != EOF && c != '\n' ) {
        /* keep one byte free for the terminator; drop whatever does not fit */
        if ( stored < (size_t) __INPUT_LENGTH__ - 1 ) {
            __dst[stored++] = (char) c;
        }
    }
    __dst[stored] = '\0';

    return ( c == EOF && stored == 0 ) ? NULL : __dst;
}
/*static void printcode( fstring str ) {
int i,length;
length = strlen( str );
printf("str:length=%d\n", length );
for ( i = 0; i < length; i++ ) {
printf("%d ", str[i] );
}
putchar('\n');
}*/
//int friso_test(int argc, char **argv)
/*
 * Interactive segmentation demo.
 *
 * Initializes friso from the hard-coded configuration file, prints the active
 * mode and the library version, then reads lines from stdin and prints every
 * token as word[offset, length, rlen]. The loop ends when "quit"
 * (case-insensitive) is entered or when stdin reaches end-of-file.
 *
 * Returns 0 in every case; an initialization failure is only reported on
 * stdout.
 */
int friso_test(void)
{
    clock_t s_time, e_time;
    char line[__INPUT_LENGTH__] = {0};
    fstring __path__ = "/usr/share/ukui-search/res/friso.ini";
    /* label printed when config->mode is none of the known modes */
    fstring mode = "Unknown";
    friso_t friso;
    friso_config_t config;
    friso_task_t task;

    s_time = clock();

    //initialize
    friso = friso_new();
    config = friso_new_config();
    if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
        printf("fail to initialize friso and config.\n");
        goto err;
    }

    switch ( config->mode ) {
    case __FRISO_SIMPLE_MODE__:
        mode = "Simple";
        break;
    case __FRISO_COMPLEX_MODE__:
        mode = "Complex";
        break;
    case __FRISO_DETECT_MODE__:
        mode = "Detect";
        break;
    default:
        break;
    }

    e_time = clock();
    printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
    printf("Mode: %s\n", mode);
    printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
    ___ABOUT___;

    //set the task.
    task = friso_new_task();
    while ( 1 ) {
        print("friso>> ");

        //stop at end-of-file instead of looping forever on an empty line
        if ( getLine( stdin, line ) == NULL ) {
            ___EXIT_INFO___
        }

        //exit the program
        if ( strcasecmp( line, "quit" ) == 0 ) {
            ___EXIT_INFO___
        }

        //set the task text.
        friso_set_text( task, line );
        println("分词结果:");

        s_time = clock();
        while ( ( config->next_token( friso, config, task ) ) != NULL ) {
            printf(
                "%s[%d, %d, %d] ",
                task->token->word,
                task->token->offset,
                task->token->length,
                task->token->rlen
            );
        }
        e_time = clock();
        printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
    }
    friso_free_task(task);

    //error block.
err:
    friso_free_config(config);
    friso_free(friso);
    return 0;
}

View File

@ -0,0 +1,10 @@
/*
 * Temporary interface that uses friso.ini; it should be removed in the future.
 * MouseZhangZh
 */
#ifndef FRISO_INTERFACE_H
#define FRISO_INTERFACE_H

#include "friso/src/friso_API.h"
#include "friso/src/friso.h"
#include "friso/src/friso_ctype.h"

//int friso_test(int argc, char **argv);
/* Run the interactive friso segmentation demo (defined in friso-interface.c). */
int friso_test(void);

#endif /* FRISO_INTERFACE_H */

15
libfriso/friso/.gitignore vendored Normal file
View File

@ -0,0 +1,15 @@
*.o
*.lo
*.la
*.so
*.out
src/friso
src/*-testing
# lib/ #
CHANGES.txt
# vim #
*.swp
*.vim
*.viminfo
.libs/
.idea/

225
libfriso/friso/LICENSE.md Normal file
View File

@ -0,0 +1,225 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==========================================================================
The following license applies to the Friso ANSI C library
--------------------------------------------------------------------------
Copyright (c) 2010 lionsoul<chenxin619315@gmail.com>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

68
libfriso/friso/friso.ini Normal file
View File

@ -0,0 +1,68 @@
# friso configuration file.
# do not change the name of the left key.
# @email chenxin619315@gmail.com
# @date 2012-12-20
#
# charset: only UTF8 and GBK are supported.
# set it to 0 for UTF8 or 1 for GBK.
friso.charset = 0
# lexicon directory path.
# the value must end with '/'
# this tells friso where to find the friso.lex.ini configuration file and all the lexicon files.
#
# if the value does not start with '/' on linux, or contains no ':' on winnt,
# friso will search for friso.lex.ini relative to friso.ini
# absolute path search:
# linux: friso.lex_dir = /c/products/friso/dict/UTF-8/
# Winnt: friso.lex_dir = D:/products/friso/dict/UTF-8/
# relative path search (All system)
friso.lex_dir = ./dict/UTF-8/
# the maximum matching length.
friso.max_len = 5
# 1 to enable chinese name recognition,
# and 0 to disable it.
friso.r_name = 1
# the maximum length for the cjk words in a
# chinese and english mixed word.
friso.mix_len = 2
# the maximum length for the chinese last name adorn.
friso.lna_len = 1
# append the synonyms words
friso.add_syn = 1
# clear the stopwords or not (1 to open it and 0 to close it)
# @date 2013-06-13
friso.clr_stw = 0
# keep the unrecognized words or not (1 to open it and 0 to close it)
# @date 2013-06-13
friso.keep_urec = 0
# use sphinx output style like 'admire|love|enjoy einsten'
# @date 2013-10-25
friso.spx_out = 0
# start the secondary segmentation for complex english token.
friso.en_sseg = 1
# min length of the secondary segmentation token. (better larger than 1)
friso.st_minl = 2
# default keep punctuations for english token.
friso.kpuncs = @%.#&+
# the threshold value for a char not a part of a chinese name.
friso.nthreshold = 2000000
# default mode for friso.
# 1 : simple mode - simple maximum matching algorithm.
# 2 : complex mode - four rules of the mmseg algorithm.
# 3 : detect mode - only return the words that do exist in the lexicon
friso.mode = 2

18
libfriso/friso/friso.pri Normal file
View File

@ -0,0 +1,18 @@
INCLUDEPATH += $$PWD
HEADERS += \
$$PWD/src/friso_API.h \
$$PWD/src/friso.h \
$$PWD/src/friso_ctype.h
SOURCES += \
$$PWD/src/friso.c \
$$PWD/src/friso_lexicon.c \
$$PWD/src/friso_string.c \
$$PWD/src/friso_array.c \
$$PWD/src/friso_ctype.c \
$$PWD/src/friso_GBK.c \
$$PWD/src/friso_hash.c \
$$PWD/src/friso_link.c \
$$PWD/src/friso_UTF8.c

1824
libfriso/friso/src/friso.c Normal file

File diff suppressed because it is too large Load Diff

370
libfriso/friso/src/friso.h Normal file
View File

@ -0,0 +1,370 @@
/*
* main interface file for friso tokenizer.
* you could modify it and re-release and free for commercial use.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#ifndef _friso_h
#define _friso_h
#include "friso_API.h"
#include <stdio.h>
/* {{{ friso main interface define :: start*/
//library version string; friso_version() expands to the same literal.
#define FRISO_VERSION "1.6.4"
#define friso_version() FRISO_VERSION
//built-in default values.
//NOTE(review): presumably the values friso_init_config() stores before a
//configuration file overrides them - confirm in friso.c.
#define DEFAULT_SEGMENT_LENGTH 5
#define DEFAULT_MIX_LENGTH 2
#define DEFAULT_LNA_LENGTH 1
#define DEFAULT_NTHRESHOLD 1000000
#define DEFAULT_SEGMENT_MODE 2
/*
* Type: friso_lex_t
* -----------------
* This type is used to represent the type of a lexicon.
* The values 12 - 14 are not assigned.
*/
typedef enum {
__LEX_CJK_WORDS__ = 0,
__LEX_CJK_UNITS__ = 1,
__LEX_ECM_WORDS__ = 2, //english and chinese mixed words.
__LEX_CEM_WORDS__ = 3, //chinese and english mixed words.
__LEX_CN_LNAME__ = 4,
__LEX_CN_SNAME__ = 5,
__LEX_CN_DNAME1__ = 6,
__LEX_CN_DNAME2__ = 7,
__LEX_CN_LNA__ = 8,
__LEX_STOPWORDS__ = 9,
__LEX_ENPUN_WORDS__ = 10,
__LEX_EN_WORDS__ = 11,
__LEX_OTHER_WORDS__ = 15,
__LEX_NCSYN_WORDS__ = 16,
__LEX_PUNC_WORDS__ = 17, //punctuations
__LEX_UNKNOW_WORDS__ = 18 //unrecognized words.
} friso_lex_t;
//dictionary handle: a pointer to friso_hash_t.
typedef friso_hash_t * friso_dic_t;
//NOTE(review): presumably the number of lexicon tables kept in a dictionary
//(the friso_lex_t values 0 - 11) - confirm in friso_lexicon.c.
#define __FRISO_LEXICON_LENGTH__ 12
//charsets that friso currently supports.
//the numeric values match the friso.charset setting of friso.ini.
typedef enum {
FRISO_UTF8 = 0, //UTF-8
FRISO_GBK = 1 //GBK
} friso_charset_t;
/*
* Type: friso_mode_t
* ------------------
* used to identify the segmentation mode that friso uses.
* the numeric values match the friso.mode setting of friso.ini.
*/
typedef enum {
__FRISO_SIMPLE_MODE__ = 1, //mmseg simple mode.
__FRISO_COMPLEX_MODE__ = 2, //mmseg complex mode.
__FRISO_DETECT_MODE__ = 3 //detect mode.
} friso_mode_t;
/*
* friso entry: the dictionary plus the charset it is used with.
* friso_t is the pointer type handed to the friso functions.
*/
typedef struct {
friso_dic_t dic; //friso dictionary
friso_charset_t charset; //project charset.
} friso_entry;
typedef friso_entry * friso_t;
/*
* Type: lex_entry_cdt
* -------------------
* This type used to represent the lexicon entry struct.
*/
/* ctrlMask bit: append the synonyms words. */
#define _LEX_APPENSYN_MASK (1 << 0)
/*
 * Set, clear and test the append-synonyms bit.
 * `e` is a pointer to an entry with a ctrlMask member; it is parenthesized so
 * that expressions such as `&entry` work, and every expansion is a single
 * parenthesized expression.
 */
#define lex_appensyn_open(e) ((e)->ctrlMask |= _LEX_APPENSYN_MASK)
#define lex_appensyn_close(e) ((e)->ctrlMask &= ~_LEX_APPENSYN_MASK)
#define lex_appensyn_check(e) (((e)->ctrlMask & _LEX_APPENSYN_MASK) != 0)
//lexicon entry struct; lex_entry_t is the pointer type used by the API.
typedef struct {
uchar_t length; //the length of the token.(after the conversion done by Friso.)
uchar_t rlen; //the real length of the token.(before any conversion)
/*
* the type of the lexicon item.
* available values are all the elements of the friso_lex_t enum.
* and if it is __LEX_OTHER_WORDS__, we need to free it after using it.
*/
uchar_t type;
uchar_t ctrlMask; //function control mask, like append the synonyms words.
uint_t offset; //offset index.
fstring word;
//fstring py; //pinyin of the word.(invalid)
friso_array_t syn; //synonyms words.
friso_array_t pos; //part of speech.
uint_t fre; //single word frequency.
} lex_entry_cdt;
typedef lex_entry_cdt * lex_entry_t;
/*the segmentation token entry.*/
//size in bytes of the word buffer of a token.
#define __HITS_WORD_LENGTH__ 64
typedef struct {
uchar_t type; //type of the word. (item of friso_lex_t)
uchar_t length; //length of the token.
uchar_t rlen; //the real length of the token.(in the original string)
char pos; //part of speech.
int offset; //start offset of the word.
char word[__HITS_WORD_LENGTH__]; //the word itself (printed with %s by the demo).
//char py[0];
} friso_token_entry;
typedef friso_token_entry * friso_token_t;
/*
* Type: friso_task_entry
* This type used to represent the current segmentation content.
* like the text to split, and the current index, token buffer eg....
*/
//action control mask for #FRISO_TASK_T#.
#define _TASK_CHECK_CF_MASK (1 << 0) //Whether to check the chinese fraction.
#define _TASK_START_SS_MASK (1 << 1) //Whether to start the secondary segmentation.
/*
 * Set, clear and test the secondary-segmentation bit.
 * `task` is a pointer to a task with a ctrlMask member; it is parenthesized
 * so that expressions such as `&entry` work, and every expansion is a single
 * parenthesized expression.
 */
#define task_ssseg_open(task) ((task)->ctrlMask |= _TASK_START_SS_MASK)
#define task_ssseg_close(task) ((task)->ctrlMask &= ~_TASK_START_SS_MASK)
#define task_ssseg_check(task) (((task)->ctrlMask & _TASK_START_SS_MASK) != 0)
//the state of one segmentation run: the text, the current position and the
//buffers used while tokenizing. friso_task_t is the pointer type used by the API.
typedef struct {
fstring text; //text to tokenize
uint_t idx; //start offset index.
uint_t length; //length of the text.
uint_t bytes; //latest word bytes in C.
uint_t unicode; //latest word unicode number.
uint_t ctrlMask; //action control mask (see the _TASK_*_MASK bits).
friso_link_t pool; //task pool.
string_buffer_t sbuf; //string buffer.
friso_token_t token; //the result token.
char buffer[7]; //word buffer. (1-6 bytes for an utf-8 word in C).
} friso_task_entry;
typedef friso_task_entry * friso_task_t;
/* task configuration entry.*/
/* size of the kpuncs buffer of the configuration entry. */
#define _FRISO_KEEP_PUNC_LEN 13
/*
 * Non-zero when ch occurs in config->kpuncs (searched with strchr, so kpuncs
 * must be a NUL-terminated string and <string.h> must be included by the user).
 * Both arguments are parenthesized so that expressions such as `&cfg` work.
 */
#define friso_en_kpunc(config, ch) (strchr((config)->kpuncs, (ch)) != NULL)
//typedef friso_token_t ( * friso_next_hit_fn ) ( friso_t, void *, friso_task_t );
//typedef lex_entry_t ( * friso_next_lex_fn ) ( friso_t, void *, friso_task_t );
//segmentation settings plus the function pointers used to fetch tokens.
//most fields carry the name of the matching friso.* key of friso.ini.
struct friso_config_struct {
ushort_t max_len; //the max match length (4 - 7).
ushort_t r_name; //1 to open chinese name recognition, 0 to close it.
ushort_t mix_len; //the max length for the CJK words in a mix string.
ushort_t lna_len; //the max length for the chinese last name adorn.
ushort_t add_syn; //append synonyms tokenizer words.
ushort_t clr_stw; //clear the stopwords.
ushort_t keep_urec; //keep the unrecognized words.
ushort_t spx_out; //use the sphinx output style.
ushort_t en_sseg; //start the secondary segmentation.
ushort_t st_minl; //min length of the secondary segmentation token.
uint_t nthreshold; //the threshold value for a char to make up a chinese name.
friso_mode_t mode; //segmentation mode (a friso_mode_t value).
//pointer to the function to get the next token
friso_token_t (*next_token) (friso_t, struct friso_config_struct *, friso_task_t);
//pointer to the function to get the next cjk lex_entry_t
lex_entry_t (*next_cjk ) (friso_t, struct friso_config_struct *, friso_task_t);
char kpuncs[_FRISO_KEEP_PUNC_LEN]; //keep punctuations buffer.
};
typedef struct friso_config_struct friso_config_entry;
typedef friso_config_entry * friso_config_t;
/*
* Function: friso_new;
* Usage: vars = friso_new( void );
* --------------------------------
* This function used to create a new empty friso friso_t;
* with default value.
*/
FRISO_API friso_t friso_new( void );
//initialize a friso entry and its configuration from a configuration file.
//@return 1 on success and 0 on failure.
FRISO_API int friso_init_from_ifile( friso_t, friso_config_t, fstring );
/*
* Function: friso_free;
* Usage: friso_free( friso );
* ---------------------------
* This function is used to free the allocations of the given friso_t.
*/
FRISO_API void friso_free( friso_t );
/*
 * Macro: friso_set_dic
 * Usage: friso_set_dic( friso, dic );
 * -----------------------------------
 * This macro is used to set the dictionary for friso.
 * and friso_dic_t is the pointer of a hash table array.
 * It expands to a statement and yields no value.
 */
//FRISO_API void friso_set_dic( friso_t, friso_dic_t );
/*
 * The parameter names must differ from the member name `dic`: a parameter
 * called `dic` would make the preprocessor replace the member name as well.
 */
#define friso_set_dic(_friso, _dic)\
do {\
    (_friso)->dic = (_dic);\
} while (0)
/*
* Function: friso_set_mode
* Usage: friso_set_mode( vars, mode );
* ------------------------------------
* This function is used to set the mode(complex or simple) that you want to friso to use.
*/
FRISO_API void friso_set_mode( friso_config_t, friso_mode_t );
/*create a new friso configuration entry and initialize
it with the default value.*/
FRISO_API friso_config_t friso_new_config( void );
//initialize the specified friso config entry with default value.
FRISO_API void friso_init_config( friso_config_t );
//free the specified friso configuration entry.
//implemented as a macro that hands cfg straight to FRISO_FREE; the former
//function declaration is kept below for reference.
//FRISO_API void friso_free_config( friso_config_t );
#define friso_free_config(cfg) FRISO_FREE(cfg)
/*
* Function: friso_new_task;
* Usage: segment = friso_new_task( void );
* ----------------------------------------
* This function is used to create a new friso segment type;
*/
FRISO_API friso_task_t friso_new_task( void );
/*
* Function: friso_free_task;
* Usage: friso_free_task( task );
* -------------------------------
* This function is used to free the allocations made by friso_new_task();
*/
FRISO_API void friso_free_task( friso_task_t );
//create a new friso token
FRISO_API friso_token_t friso_new_token( void );
//free the given friso token
//implemented as a macro that hands token straight to FRISO_FREE; the former
//function declaration is kept below for reference.
//FRISO_API void friso_free_token( friso_token_t );
#define friso_free_token(token) FRISO_FREE(token)
/*
* Function: friso_set_text
* Usage: friso_set_text( task, text );
* ------------------------------------
* This function is used to set the text that is going to segment.
*/
FRISO_API void friso_set_text( friso_task_t, fstring );
//get the next cjk word with mmseg simple mode
FRISO_API lex_entry_t next_simple_cjk( friso_t, friso_config_t, friso_task_t );
//get the next cjk word with mmseg complex mode(mmseg core algorithm)
FRISO_API lex_entry_t next_complex_cjk( friso_t, friso_config_t, friso_task_t );
/*
* Function: next_mmseg_token
* Usage: word = next_mmseg_token( vars, seg );
* --------------------------------------
* This function is used to get next word that friso segmented
* with a split mode of __FRISO_SIMPLE_MODE__ or __FRISO_COMPLEX_MODE__
*/
FRISO_API friso_token_t next_mmseg_token( friso_t, friso_config_t, friso_task_t );
//__FRISO_DETECT_MODE__
FRISO_API friso_token_t next_detect_token( friso_t, friso_config_t, friso_task_t );
/* }}} friso main interface define :: end*/
/* {{{ lexicon interface define :: start*/
/*
* Function: friso_dic_new
* Usage: dic = friso_dic_new();
* -----------------------------
* This function is used to create a new dictionary (memory allocation).
*/
FRISO_API friso_dic_t friso_dic_new( void );
FRISO_API fstring file_get_line( fstring, FILE * );
/*
* Function: friso_dic_free
* Usage: friso_dic_free( dic );
* -----------------------------
* This function is used to free all the allocations of friso_dic_new.
*/
FRISO_API void friso_dic_free( friso_dic_t );
//create a new lexicon entry.
FRISO_API lex_entry_t new_lex_entry( fstring, friso_array_t, uint_t, uint_t, uint_t );
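//free the given lexicon entry.
//NOTE(review): the former "second argument" no longer exists; presumably
//free_lex_entry_full also frees the allocations that the items of its
//synonyms words point to, while free_lex_entry does not - confirm in friso_lexicon.c.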
FRISO_API void free_lex_entry_full( lex_entry_t );
FRISO_API void free_lex_entry( lex_entry_t );
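/*
* Function: friso_dic_load
* Usage: friso_dic_load( friso, config, friso_lex_t, path, length );
* ------------------------------------------------------------------
* This function is used to load a dictionary from a given path.
* no length limit when length less than 0.
* NOTE(review): length is declared as uint_t, so it can never be negative -
* confirm how "no limit" is meant to be expressed.
*/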
FRISO_API void friso_dic_load( friso_t, friso_config_t,
friso_lex_t, fstring, uint_t );
/*
* load the lexicon configuration file.
* and load all the valid lexicon from the conf file.
*/
FRISO_API void friso_dic_load_from_ifile( friso_t, friso_config_t, fstring, uint_t );
/*
* Function: friso_dic_add
* Usage: friso_dic_add( dic, friso_lex_t, word, syn );
* ----------------------------------------------------
* This function is used to put a new word into the dictionary.
*/
FRISO_API void friso_dic_add( friso_dic_t, friso_lex_t, fstring, friso_array_t );
/*
* Function: friso_dic_add_with_fre
* Usage: friso_dic_add_with_fre( dic, friso_lex_t, word, syn, fre );
* ------------------------------------------------------------------
* This function is used to put a new word with its frequency into the dictionary.
*/
FRISO_API void friso_dic_add_with_fre( friso_dic_t, friso_lex_t, fstring, friso_array_t, uint_t );
/*
* Function: friso_dic_match
* Usage: result = friso_dic_match( dic, friso_lex_t, word );
* ----------------------------------------------------
* This function is used to check the given word is in the dictionary or not.
*/
FRISO_API int friso_dic_match( friso_dic_t, friso_lex_t, fstring );
/*
* Function: friso_dic_get
* Usage: friso_dic_get( dic, friso_lex_t, word );
* -----------------------------------------
* This function is used to search the specified lex_entry_t.
*/
FRISO_API lex_entry_t friso_dic_get( friso_dic_t, friso_lex_t, fstring );
/*
* Function: friso_spec_dic_size
* Usage: friso_spec_dic_size( dic, friso_lex_t )
* This function is used to get the size of the dictionary with a specified type.
*/
FRISO_API uint_t friso_spec_dic_size( friso_dic_t, friso_lex_t );
FRISO_API uint_t friso_all_dic_size( friso_dic_t );
/* }}} lexicon interface define :: end*/
#endif /*end ifndef*/

View File

@ -0,0 +1,412 @@
/*
* friso ADT application interface header file.
* 1. string buffer interface.
* 2. hashmap interface.
* 3. dynamic array interface.
* 4. double link list interface.
*
* @author chenxin <chenxin619315@gmail.com>
*/
#ifndef _friso_api_h
#define _friso_api_h
#include <stdio.h>
#include <stdlib.h>
//platform detection: any of the windows macros below selects FRISO_WINNT,
//everything else is treated as FRISO_LINUX.
#if ( defined(_WIN32) || defined(_WINDOWS_) || defined(__WINDOWS_) )
# define FRISO_WINNT
#else
# define FRISO_LINUX
#endif
//FRISO_API marks the exported functions; __STATIC_API__ marks the file local ones.
#ifdef FRISO_WINNT
# define FRISO_API extern __declspec(dllexport)
# define __STATIC_API__ static
#else
/*platform shared library statement :: unix*/
# define FRISO_API extern
# define __STATIC_API__ static inline
#endif
/*
 * Report an allocation failure on stdout and terminate with exit status 1.
 * Both statements are grouped in one block so that the macro behaves as a
 * single statement, e.g. as the body of an unbraced `if`. It can be written
 * with or without a trailing semicolon.
 */
#define ___ALLOCATION_ERROR___ \
    { \
        printf("Unable to do the memory allocation, program will now exit\n" ); \
        exit(1); \
    }
/* print a string as it is / followed by a newline. */
#define print(str) printf("%s", (str) )
#define println(str) printf("%s\n", (str) )
/*
* memory allocation macros, which make it more convenient
* to switch to your favorite or a better memory management library.
* the arguments are forwarded to calloc/malloc/free unchanged and in the
* given order.
*/
#define FRISO_CALLOC(_bytes, _blocks) calloc(_bytes, _blocks)
#define FRISO_MALLOC(_bytes) malloc(_bytes)
#define FRISO_FREE(_ptr) free( _ptr )
//short aliases for the unsigned integer types.
typedef unsigned short ushort_t;
typedef unsigned char uchar_t;
typedef unsigned int uint_t;
//friso string type: a plain char pointer.
typedef char * fstring;
/* {{{ fstring handle interface define::start. */
#define __CHAR_BYTES__ 8
#define __BUFFER_DEFAULT_LENGTH__ 16
/*
 * string buffer entry, handled through the string_buffer_t pointer type
 * by the string_buffer_* functions declared below.
 */
typedef struct {
fstring buffer; //the char storage of the buffer.
uint_t length; //presumably the number of chars currently stored - TODO confirm against the implementation.
uint_t allocs; //presumably the number of chars allocated for buffer - TODO confirm against the implementation.
} string_buffer_entry;
typedef string_buffer_entry * string_buffer_t;
//FRISO_API string_buffer_t new_string_buffer( void );
/*
 * create a new string buffer with the default opacity.
 * the expansion is a plain expression without a trailing semicolon,
 * so the macro can be used as a function argument or inside
 * a larger expression as well as in a normal statement.
 */
#define new_string_buffer() \
    new_string_buffer_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ )
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t );
FRISO_API string_buffer_t new_string_buffer_with_string( fstring str );
/*
* this function will copy the chars that the fstring pointed.
* to the buffer.
* this may cause the resize action of the buffer.
*/
FRISO_API void string_buffer_append( string_buffer_t, fstring );
FRISO_API void string_buffer_append_char( string_buffer_t, char );
//insert the given fstring from the specified position.
FRISO_API void string_buffer_insert( string_buffer_t, uint_t idx, fstring );
//remove the char in the specified position.
FRISO_API fstring string_buffer_remove( string_buffer_t, uint_t idx, uint_t );
/*
* turn the string_buffer to a string.
* or return the buffer of the string_buffer.
*/
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t );
/*
* free the given string buffer entry.
* this function will not free the allocation of the
* string_buffer_t->buffer; the buffer is returned to you, and if
* necessary you can free it yourself by calling free().
*/
FRISO_API fstring string_buffer_devote( string_buffer_t );
/*
* clear the given fstring buffer.
* reset its buffer with 0 and reset its length to 0.
*/
FRISO_API void string_buffer_clear( string_buffer_t );
//free the fstring buffer include the buffer.
FRISO_API void free_string_buffer( string_buffer_t );
/**
* fstring specified chars tokenizer functions
*
* @date 2013-06-08
*/
/*
 * tokenizer state, handled through the string_split_t pointer type
 * by the string_split_* functions declared below.
 */
typedef struct {
fstring source; //the string to split.
uint_t srcLen; //presumably the length of source - TODO confirm against the implementation.
fstring delimiter; //the delimiter string.
uint_t delLen; //presumably the length of delimiter - TODO confirm against the implementation.
uint_t idx; //presumably the current scan offset inside source - TODO confirm against the implementation.
} string_split_entry;
typedef string_split_entry * string_split_t;
/**
* create a new string_split_entry.
*
* @param source
* @return string_split_t;
*/
FRISO_API string_split_t new_string_split( fstring, fstring );
FRISO_API void string_split_reset( string_split_t, fstring, fstring );
FRISO_API void string_split_set_source( string_split_t, fstring );
FRISO_API void string_split_set_delimiter( string_split_t, fstring );
FRISO_API void free_string_split( string_split_t );
/**
* get the next split fstring, and copy the
* splited fstring into the __dst buffer .
*
* @param string_split_t
* @param __dst
* @return fstring (NULL if reach the end of the source
* or there is no more segmentation)
*/
FRISO_API fstring string_split_next( string_split_t, fstring );
/* }}} */
/* {{{ dynamaic array interface define::start*/
#define __DEFAULT_ARRAY_LIST_OPACITY__ 8
/*friso array list entry struct*/
typedef struct {
void **items; //the slot table, every slot holds one item pointer.
uint_t allocs; //the number of slots allocated for items.
uint_t length; //the number of slots currently in use.
} friso_array_entry;
typedef friso_array_entry * friso_array_t;
//create a new friso dynamic array.
//FRISO_API friso_array_t new_array_list( void );
#define new_array_list() new_array_list_with_opacity(__DEFAULT_ARRAY_LIST_OPACITY__)
//create a new friso dynamic array with the given opacity
FRISO_API friso_array_t new_array_list_with_opacity( uint_t );
/*
* free the given friso array
* and its item table, but never what the individual items point to.
*/
FRISO_API void free_array_list( friso_array_t );
//add a new item to the array.
FRISO_API void array_list_add( friso_array_t, void * );
//insert a new item at a specifed position.
FRISO_API void array_list_insert( friso_array_t, uint_t, void * );
//get a item at a specified position.
FRISO_API void *array_list_get( friso_array_t, uint_t );
/*
* set the item at a specified position.
* this will return the old value.
*/
FRISO_API void *array_list_set( friso_array_t, uint_t, void * );
/*
* remove the given item at a specified position.
* this will return the value of the removed item.
*/
FRISO_API void *array_list_remove( friso_array_t, uint_t );
/*trim the array list for final use.*/
FRISO_API friso_array_t array_list_trim( friso_array_t );
/*
* clear the array list.
* this function will free all the allocations that the pointer pointed.
* but will not free the point array allocations,
* and will reset the length of it.
*/
FRISO_API friso_array_t array_list_clear( friso_array_t );
/*
 * array list accessor macros.
 * the argument and the whole expansion are parenthesized, so any
 * pointer expression (like 'list + 1') can be passed safely.
 */
//return the size of the array.
//FRISO_API uint_t array_list_size( friso_array_t );
#define array_list_size( array ) ( (array)->length )
//return the allocations of the array.
//FRISO_API uint_t array_list_allocs( friso_array_t );
#define array_list_allocs( array ) ( (array)->allocs )
//check if the array is empty.
//FRISO_API int array_list_empty( friso_array_t );
#define array_list_empty( array ) ( (array)->length == 0 )
/* }}} dynamaic array interface define::end*/
/* {{{ link list interface define::start*/
//one node of the double link list.
struct friso_link_node {
void *value; //the value carried by the node.
struct friso_link_node *prev; //link to the previous node.
struct friso_link_node *next; //link to the next node.
};
typedef struct friso_link_node link_node_entry;
typedef link_node_entry * link_node_t;
/*
* link list adt
*/
typedef struct {
link_node_t head; //head node (presumably a sentinel that carries no value - TODO confirm against the implementation).
link_node_t tail; //tail node (presumably a sentinel that carries no value - TODO confirm against the implementation).
uint_t size; //the value reported by the link_list_size macro.
} friso_link_entry;
typedef friso_link_entry * friso_link_t;
//create a new link list
FRISO_API friso_link_t new_link_list( void );
//free the specified link list
FRISO_API void free_link_list( friso_link_t );
/*
 * link list accessor macros.
 * the argument and the whole expansion are parenthesized, so any
 * pointer expression can be passed safely.
 */
//return the size of the current link list.
//FRISO_API uint_t link_list_size( friso_link_t );
#define link_list_size( link ) ( (link)->size )
//check the given link is empty or not.
//FRISO_API int link_list_empty( friso_link_t );
#define link_list_empty( link ) ( (link)->size == 0 )
//clear all the nodes in the link list( except the head and the tail ).
FRISO_API friso_link_t link_list_clear( friso_link_t link );
//add a new node to the link list.(append from the tail)
FRISO_API void link_list_add( friso_link_t, void * );
//add a new node before the specified node
FRISO_API void link_list_insert_before( friso_link_t, uint_t, void * );
//get the node in the current index.
FRISO_API void *link_list_get( friso_link_t, uint_t );
//modify the node in the current index.
FRISO_API void *link_list_set( friso_link_t, uint_t, void * );
//remove the specified link node
FRISO_API void *link_list_remove( friso_link_t, uint_t );
//remove the given node
FRISO_API void *link_list_remove_node( friso_link_t, link_node_t );
//remove the node from the frist.
FRISO_API void *link_list_remove_first( friso_link_t );
//remove the last node from the link list
FRISO_API void *link_list_remove_last( friso_link_t );
//append a node from the end.
FRISO_API void link_list_add_last( friso_link_t, void * );
//add a node at the begining of the link list.
FRISO_API void link_list_add_first( friso_link_t, void * );
/* }}} link list interface define::end*/
/* {{{ hashtable interface define :: start*/
//one key/value node of the hash table.
struct hash_entry {
fstring _key; //the node key
void * _val; //the node value
struct hash_entry * _next; //link to another node (presumably the next node of the same bucket - TODO confirm against the implementation).
};
typedef struct hash_entry friso_hash_entry;
typedef friso_hash_entry * hash_entry_t;
//callback type that receives one hash entry; free_hash_table takes one as its second argument.
typedef void (*fhash_callback_fn_t)( hash_entry_t );
//the hash table itself, handled through the friso_hash_t pointer type.
typedef struct {
uint_t length; //presumably the number of slots in table - TODO confirm against the implementation.
uint_t size; //the value reported by the hash_get_size macro.
float factor; //presumably the load factor (see DEFAULT_FACTOR) - TODO confirm against the implementation.
uint_t threshold; //presumably the size that triggers a rebuild of the table - TODO confirm against the implementation.
hash_entry_t *table; //the array of entry pointers.
} friso_hash_cdt;
typedef friso_hash_cdt * friso_hash_t;
//default value for friso_hash_cdt
#define DEFAULT_LENGTH 31
#define DEFAULT_FACTOR 0.85f
/*
* Function: new_hash_table
* Usage: table = new_hash_table();
* --------------------------------
* this function allocates a new symbol table with no entries.
*/
FRISO_API friso_hash_t new_hash_table( void );
/*
* Function: free_hash_table
* Usage: free_hash_table( table );
* --------------------------------------
* this function will free all the allocation for memory.
*/
FRISO_API void free_hash_table( friso_hash_t, fhash_callback_fn_t );
/*
* Function: hash_put_mapping
* Usage: hash_put_mapping( table, key, value );
* ----------------------------------------
* the function associates the specified key with the given value.
*/
FRISO_API void *hash_put_mapping( friso_hash_t, fstring, void * );
/*
* Function: hash_exist_mapping
* Usage: bool = hash_exist_mapping( table, key );
* ----------------------------------------------
* this function check the given key mapping is exists or not.
*/
FRISO_API int hash_exist_mapping( friso_hash_t, fstring );
/*
* Function: hash_get_value
* Usage: value = hash_get_value( table, key );
* -----------------------------------------------
* this function return the value associated with the given key.
* UNDEFINED will be return if the mapping is not exists.
*/
FRISO_API void * hash_get_value( friso_hash_t, fstring );
/*
* Function: hash_remove_mapping
* Usage: entry = hash_remove_mapping( table, key );
* ------------------------------------
* This function is used to remove the mapping associated with the given key.
*/
FRISO_API hash_entry_t hash_remove_mapping( friso_hash_t, fstring );
/*
 * Function: hash_get_size
 * Usage: size = hash_get_size( table );
 * --------------------------------------
 * This macro yields the size field of the specified table.
 * The argument and the whole expansion are parenthesized, so any
 * pointer expression can be passed safely.
 */
//FRISO_API uint_t hash_get_size( friso_hash_t );
#define hash_get_size( hash ) ( (hash)->size )
/* }}} hashtable interface define :: end*/
/* {{{ utf8 string interface define :: start*/
/*
* Function: get_utf8_bytes
*
* */
FRISO_API int get_utf8_bytes(char);
/*
* Function: get_utf8_unicode
*
* */
FRISO_API int get_utf8_unicode(const fstring);
/*
* Function: unicode_to_utf8
*
* */
FRISO_API int unicode_to_utf8(uint_t, fstring);
/* }}} utf8 string interface define :: end*/
#endif /*end ifndef*/

View File

@ -0,0 +1,298 @@
/**
* Friso GBK serial functions implementation source file.
* @package src/friso_GBK.c .
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso_ctype.h"
/* read the next GBK word that starts at the byte offset *idx.
 *
 * the word is copied into __word and NUL terminated, its byte count
 * is stored in task->bytes and *idx is moved past it.
 *
 * @return int the bytes of the word just read, 0 once *idx
 *          has reached task->length.
 */
FRISO_API int gbk_next_word( 
        friso_task_t task, 
        uint_t *idx, 
        fstring __word )
{
    uint_t start = *idx;

    //nothing left to read.
    if ( start >= task->length ) return 0;

    //a first byte above 0x80 opens a two bytes word.
    task->bytes = ( (uchar_t) task->text[start] > 0x80 ) ? 2 : 1;

    memcpy( __word, task->text + start, task->bytes );
    __word[task->bytes] = '\0';
    *idx = start + task->bytes;

    return task->bytes;
}
//get the bytes of a gbk char.
//FRISO_API int get_gbk_bytes( char c )
//{
// return 1;
//}
//check if the given buffer starts with a GBK chinese word.
// both the simplified and the traditional words are included.
// the first two bytes of str are always read.
//
// @param  str
// @return int 1 for yes and 0 for not.
FRISO_API int gbk_cn_string(char *str)
{
    int c1 = (uchar_t) str[0];
    int c2 = (uchar_t) str[1];

    //GBK/2: gb2312 chinese word.
    if ( c1 >= 0xb0 && c1 <= 0xf7 
            && c2 >= 0xa1 && c2 <= 0xfe ) return 1;

    //GBK/3: extend chinese words.
    if ( c1 >= 0x81 && c1 <= 0xa0 
            && ( (c2 >= 0x40 && c2 <= 0x7e) 
                || (c2 >= 0x80 && c2 <= 0xfe) ) ) return 1;

    //GBK/4: extend chinese words.
    //the second byte is 0x40-0x7e or 0x80-0xa0 (0x7f is never valid).
    //the upper bound of the first range used to be 0xfe, which made
    //the second range dead code and also accepted the 0xa1-0xfe
    //second bytes that belong to other areas for these first bytes.
    return ( c1 >= 0xaa && c1 <= 0xfe 
            && ( (c2 >= 0x40 && c2 <= 0x7e) 
                || (c2 >= 0x80 && c2 <= 0xa0) ) );
}
/* check if the given char is a printable ASCII char (32 - 126),
 * this covers the space, the arabic numbers, the letters
 * and the english punctuations. */
FRISO_API int gbk_halfwidth_en_char( char c )
{
    int code = (uchar_t) c;

    if ( code < 32 ) return 0;
    return ( code <= 126 );
}
/*
 * check if the given word is a full-width latin char.
 * the full-width arabic numbers and letters are included,
 * the full-width punctuations are not.
 * the first two bytes of str are always read.
 */
FRISO_API int gbk_fullwidth_en_char( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    if ( lead != 0xA3 ) return 0;

    if ( trail >= 0xB0 && trail <= 0xB9 ) return 1;     //arabic numbers.
    if ( trail >= 0xC1 && trail <= 0xDA ) return 1;     //uppercase letters.
    return ( trail >= 0xE1 && trail <= 0xFA );          //lowercase letters.
}
//check if the given word is an upper case english letter,
// either half-width (one byte) or full-width (two bytes).
FRISO_API int gbk_uppercase_letter( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    //full-width
    if ( lead > 0x80 ) {
        return ( lead == 0xa3 && trail >= 0xc1 && trail <= 0xda );
    }

    //half-width
    return ( lead >= 65 && lead <= 90 );
}
//check if the given word is a lower case english letter,
// either half-width (one byte) or full-width (two bytes).
FRISO_API int gbk_lowercase_letter( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    //full-width
    if ( lead > 0x80 ) {
        return ( lead == 0xa3 && trail >= 0xe1 && trail <= 0xfa );
    }

    //half-width
    return ( lead >= 97 && lead <= 122 );
}
//check if the given word is an arabic number,
// either half-width (one byte) or full-width (two bytes).
FRISO_API int gbk_numeric_letter( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    //full-width
    if ( lead > 0x80 ) {
        return ( lead == 0xa3 && trail >= 0xb0 && trail <= 0xb9 );
    }

    //half-width
    return ( lead >= 48 && lead <= 57 );
}
/*
 * check if the given fstring is made up of numeric chars only.
 * both the full-width and the half-width numerics are ok,
 * and they may be mixed in one string.
 *
 * @param  str a NUL terminated GBK string.
 * @return int 1 for yes and 0 for not (an empty string yields 1).
 */
FRISO_API int gbk_numeric_string( char *str )
{
    char *s = str;
    int c1 = 0;
    int c2 = 0;

    while ( *s != '\0' ) {
        c1 = (uchar_t) (*s++);
        if ( c1 <= 0x80 ) {     //half-width
            //the upper bound must test c1 too: it used to test c2, 
            //which let any byte above '9' pass and rejected the
            //half-width digits that follow a full-width one.
            if ( c1 < 48 || c1 > 57 ) return 0;
        } else {                //full-width
            if ( c1 != 0xa3 ) return 0;
            //a truncated word yields c2 == 0 here and is rejected
            //before the scan could move past the terminator.
            c2 = (uchar_t) (*s++);
            if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
        }
    }

    return 1;
}
/*
 * check if the given fstring is a decimal: numeric chars
 * (full-width or half-width) with exactly one '.' that is neither
 * the first nor the last char.
 *
 * @param  str a NUL terminated GBK string.
 * @return int 1 for yes and 0 for not (an empty string yields 0).
 */
FRISO_API int gbk_decimal_string( char *str )
{
    int c1 = 0;
    int c2 = 0;
    size_t len = strlen(str), i = 0;
    int p = 0;

    //an empty string is not a decimal, and must not reach the
    //str[len - 1] check below (it would read in front of the string).
    if ( len == 0 ) return 0;

    //point header and tail check.
    if ( str[0] == '.' || str[len - 1] == '.' ) return 0;

    while ( i < len ) {
        c1 = (uchar_t) str[i++];

        //count the number of the points.
        if ( c1 == 46 ) {
            p++;
            continue;
        }

        if ( c1 <= 0x80 ) {     //half-width
            if ( c1 < 48 || c1 > 57 ) return 0;
        } else {                //full-width
            if ( c1 != 0xa3 ) return 0;
            //for a truncated word this reads the terminator and fails.
            c2 = (uchar_t) str[i++];
            if ( c2 < 0xb0 || c2 > 0xb9 ) return 0;
        }
    }

    return (p == 1);
}
//check if the given word is an english letter,
// either half-width (one byte) or full-width (two bytes).
// the arabic numbers and the punctuations are not letters.
FRISO_API int gbk_en_letter( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    //full-width
    if ( lead > 0x80 ) {
        if ( lead != 0xa3 ) return 0;
        return ( ( trail >= 0xc1 && trail <= 0xda )     //uppercase
              || ( trail >= 0xe1 && trail <= 0xfa ) );  //lowercase
    }

    //half-width
    return ( ( lead >= 65 && lead <= 90 )               //uppercase
          || ( lead >= 97 && lead <= 122 ) );           //lowercase
}
//check if the given word is a whitespace: the ASCII space (32)
// for a one byte word or the byte pair 0xa3 0xa0 for a two bytes word.
FRISO_API int gbk_whitespace( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    return ( lead <= 0x80 ) 
        ? ( lead == 32 ) 
        : ( lead == 0xa3 && trail == 0xa0 );
}
/* check if the given word is a letter number: first byte 0xa2
 * with a second byte in 0xa1 - 0xb0 or in 0xf0 - 0xfe.
 */
FRISO_API int gbk_letter_number( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    if ( lead != 0xa2 ) return 0;
    if ( trail >= 0xa1 && trail <= 0xb0 ) return 1;
    return ( trail >= 0xf0 && trail <= 0xfe );
}
/*
 * check if the given word is an other number: first byte 0xa2
 * with a second byte in 0xc5 - 0xee.
 */
FRISO_API int gbk_other_number( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    if ( lead != 0xa2 ) return 0;
    return ( trail >= 0xc5 && trail <= 0xee );
}
//check if the given char is an english punctuation: a printable
// ASCII char (33 - 126) that is neither a digit nor a letter.
FRISO_API int gbk_en_punctuation( char c )
{
    int u = (uchar_t) c;

    if ( u <= 32 || u >= 127 ) return 0;    //space, control or non ASCII.
    if ( u >= 48 && u <= 57 )  return 0;    //arabic numbers.
    if ( u >= 65 && u <= 90 )  return 0;    //uppercase letters.
    if ( u >= 97 && u <= 122 ) return 0;    //lowercase letters.

    return 1;
}
//check if the given word is a chinese punctuation.
// the first byte selects the area, the second byte is then
// matched against the punctuation ranges of that area.
FRISO_API int gbk_cn_punctuation( char *str )
{
    int lead  = (uchar_t) str[0];
    int trail = (uchar_t) str[1];

    switch ( lead ) {
    //full-width en punctuation.
    case 0xa3:
        return ( ( trail >= 0xa1 && trail <= 0xaf ) 
              || ( trail >= 0xba && trail <= 0xc0 ) 
              || ( trail >= 0xdb && trail <= 0xe0 ) 
              || ( trail >= 0xfb && trail <= 0xfe ) );
    //chinese punctuation.
    case 0xa1:
        return ( ( trail >= 0xa1 && trail <= 0xae ) 
              || ( trail >= 0xb0 && trail <= 0xbf ) );
    //A6 area special punctuations.
    case 0xa6:
        return ( trail >= 0xf9 && trail <= 0xfe );
    //A8 area special punctuations.
    case 0xa8:
        return ( trail >= 0x40 && trail <= 0x47 );
    default:
        return 0;
    }
}
/* {{{
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
*/
//cause it it the same as utf-8, we use utf8's interface instead.
//@see the friso_ctype.h#gbk_keep_punctuation macro defined.
//static friso_hash_t __keep_punctuations_hash__ = NULL;
/* @Deprecated
* check the given char is an english keep punctuation.*/
//FRISO_API int gbk_keep_punctuation( char *str )
//{
// if ( __keep_punctuations_hash__ == NULL ) {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );
//}
/* }}} */
//check if the given english char is a full-width char or not.
//FRISO_API int gbk_fullwidth_char( char *str )
//{
// return 1;
//}

View File

@ -0,0 +1,486 @@
/**
* Friso utf8 serial function implementation source file.
* @package src/friso_UTF8.c .
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso_ctype.h"
/* read the next utf-8 word that starts at the byte offset *idx.
 *
 * the word is copied into __word and NUL terminated, its byte count
 * is stored in task->bytes, its unicode in task->unicode and *idx 
 * is moved past it.
 * NOTE(review): __word must have room for the word plus the
 * terminator, its size is not visible from here.
 *
 * @return int the bytes of the word just read, 0 once *idx
 *          has reached task->length.
 */
FRISO_API int utf8_next_word( 
        friso_task_t task, 
        uint_t *idx, 
        fstring __word )
{
    uint_t remaining;

    if ( *idx >= task->length ) return 0;

    task->bytes = get_utf8_bytes( task->text[ *idx ] );

    //the first byte of a broken sequence at the end of the text
    //may announce more bytes than there are left: never copy 
    //(or step) past the end of the text.
    remaining = task->length - *idx;
    if ( task->bytes > remaining ) {
        task->bytes = remaining;
    }

    //memcpy instead of a byte loop, it is more efficient.
    //@date 2013-09-04
    memcpy(__word, task->text + (*idx), task->bytes);
    (*idx) += task->bytes;
    __word[task->bytes] = '\0';

    //the unicode counter was moved here from version 1.6.0
    task->unicode = get_utf8_unicode( __word );

    return task->bytes;
}
/*
* print a character in a binary style.
*
* @param int
*/
FRISO_API void print_char_binary( char value )
{
register uint_t t;
for ( t = 0; t < __CHAR_BYTES__; t++ ) {
if ( ( value & 0x80 ) == 0x80 ) {
printf("1");
} else {
printf("0");
}
value <<= 1;
}
}
/*
* get the bytes of a utf-8 char.
* between 1 - 6.
*
* @param __char
* @return int
*/
FRISO_API int get_utf8_bytes( char value )
{
register uint_t t = 0;
//one byte ascii char.
if ( ( value & 0x80 ) == 0 ) return 1;
for ( ; ( value & 0x80 ) != 0; value <<= 1 ) {
t++;
}
return t;
}
/*
 * get the unicode serial of a utf-8 char.
 *
 * only the chars of 1 to 3 bytes are decoded, 0 is returned for 
 * the longer ones. A lone continuation byte counts as a one byte 
 * char and its own value is returned.
 *
 * The value is built with plain arithmetic on unsigned bytes, so 
 * the result does not depend on the byte order of the machine or 
 * on the signedness of char (the bytes used to be stored through a
 * byte pointer into the int, which is only right on little-endian).
 *
 * @param  ch pointer to the first byte of the char.
 * @return int.
 */
FRISO_API int get_utf8_unicode( const fstring ch )
{
    const uchar_t *s = ( const uchar_t * ) ch;
    int code = 0;

    switch ( get_utf8_bytes( *ch ) ) {
    case 1:
        code = s[0];
        break;
    case 2:
        //110xxxxx 10xxxxxx
        code = ( ( s[0] & 0x1F ) << 6 ) | ( s[1] & 0x3F );
        break;
    case 3:
        //1110xxxx 10xxxxxx 10xxxxxx
        code = ( ( s[0] & 0x0F ) << 12 ) 
             | ( ( s[1] & 0x3F ) << 6 ) 
             |   ( s[2] & 0x3F );
        break;
    default:
        //ignore the ones that are larger than 3 bytes;
        break;
    }

    return code;
}
//turn the unicode serial to a utf-8 byte sequence.
// the bytes are stored in __word, no terminator is appended.
//
// @param  u      the unicode serial.
// @param  __word the output buffer (up to 6 bytes are written).
// @return int    the number of bytes written, 
//                0 (nothing written) if u is larger than 0x7FFFFFFF.
FRISO_API int unicode_to_utf8( uint_t u, fstring __word )
{
    int bytes, t;
    uint_t prefix;

    //U-00000000 - U-0000007F : 0xxxxxxx
    if ( u <= 0x0000007F ) {
        *__word = ( u & 0x7F );
        return 1;
    }

    //pick the length and the marker bits of the first byte.
    if ( u <= 0x000007FF ) {            //110xxxxx + 1 more
        bytes = 2;  prefix = 0xC0;
    } else if ( u <= 0x0000FFFF ) {     //1110xxxx + 2 more
        bytes = 3;  prefix = 0xE0;
    } else if ( u <= 0x001FFFFF ) {     //11110xxx + 3 more
        bytes = 4;  prefix = 0xF0;
    } else if ( u <= 0x03FFFFFF ) {     //111110xx + 4 more
        bytes = 5;  prefix = 0xF8;
    } else if ( u <= 0x7FFFFFFF ) {     //1111110x + 5 more
        bytes = 6;  prefix = 0xFC;
    } else {
        return 0;
    }

    //fill the 10xxxxxx bytes from the tail, six bits each.
    for ( t = bytes - 1; t > 0; t-- ) {
        __word[t] = ( u & 0x3F ) | 0x80;
        u >>= 6;
    }

    //what is left of u fits the payload bits of the first byte.
    __word[0] = u | prefix;

    return bytes;
}
/*
* check the given char is a CJK char or not.
* 2E80-2EFF CJK
* 2F00-2FDF
* 3000-303F CJK --ignore
* 31C0-31EF CJK
* 3200-32FF CJK --ignore.
* 3300-33FF CJK
* 3400-4DBF CJK A
* 4DC0-4DFF
* 4E00-9FBF CJK
* F900-FAFF CJK
* FE30-FE4F CJK
* FF00-FFEF ASCII --ignore (as basic latin)
*
* Japanese:
* 3040-309F
* 30A0-30FF
* 31F0-31FF
*
* Korean:
* AC00-D7AF
* 1100-11FF
* 3130-318F
*
* @param ch :pointer to the char
* @return int : 1 for yes and 0 for not.
*/
//Comment one of the following macro define
//to clear the check of the specified language.
#define FRISO_CJK_CHK_C
//#define FRISO_CJK_CHK_J
//#define FRISO_CJK_CHK_K
//check the given unicode against the ranges of every language 
// whose FRISO_CJK_CHK_* macro is defined.
// @return int 1 for a match and 0 for not.
FRISO_API int utf8_cjk_string( uint_t u ) 
{
    //Chinese.
#ifdef FRISO_CJK_CHK_C
    if ( u >= 0x4E00 && u <= 0x9FBF ) return 1;
    if ( u >= 0x2E80 && u <= 0x2EFF ) return 1;
    if ( u >= 0x2F00 && u <= 0x2FDF ) return 1;
    if ( u >= 0x31C0 && u <= 0x31EF ) return 1;
    //0x3200 - 0x32FF is not checked.
    if ( u >= 0x3300 && u <= 0x33FF ) return 1;
    //0x3400 - 0x4DBF is not checked.
    if ( u >= 0x4DC0 && u <= 0x4DFF ) return 1;
    if ( u >= 0xF900 && u <= 0xFAFF ) return 1;
    if ( u >= 0xFE30 && u <= 0xFE4F ) return 1;
#endif

    //Japanese.
#ifdef FRISO_CJK_CHK_J
    if ( u >= 0x3040 && u <= 0x309F ) return 1;
    if ( u >= 0x30A0 && u <= 0x30FF ) return 1;
    if ( u >= 0x31F0 && u <= 0x31FF ) return 1;
#endif

    //Korean
#ifdef FRISO_CJK_CHK_K
    if ( u >= 0xAC00 && u <= 0xD7AF ) return 1;
    if ( u >= 0x1100 && u <= 0x11FF ) return 1;
    if ( u >= 0x3130 && u <= 0x318F ) return 1;
#endif

    return 0;
}
/*
 * check if the given unicode is a printable Basic Latin char (32 - 126),
 * this covers the space, the numbers, the letters 
 * and the english punctuations.
 *
 * @param  u
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_halfwidth_en_char( uint_t u ) 
{
    if ( u < 32 ) return 0;
    return ( u <= 126 );
}
/*
 * check if the given unicode is a full-width latin char.
 * the full-width arabic numbers and letters are included,
 * the full-width punctuations are not.
 *
 * @param  u
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_fullwidth_en_char( uint_t u ) 
{
    if ( u >= 0xFF10 && u <= 0xFF19 ) return 1;     //arabic number (65296 - 65305)
    if ( u >= 0xFF21 && u <= 0xFF3A ) return 1;     //upper case letters (65313 - 65338)
    return ( u >= 0xFF41 && u <= 0xFF5A );          //lower case letters (65345 - 65370)
}
//check if the given unicode is an upper case letter,
// either half-width or full-width.
FRISO_API int utf8_uppercase_letter( uint_t u ) 
{
    //make full-width half-width (anything above 0xFF00 is shifted down by 0xFEE0).
    uint_t half = ( u > 0xFF00 ) ? ( u - 0xFEE0 ) : u;

    return ( half >= 65 && half <= 90 );
}
//check if the given unicode is a lower case letter,
// either half-width or full-width.
FRISO_API int utf8_lowercase_letter( uint_t u ) 
{
    //make full-width half-width (anything above 0xFF00 is shifted down by 0xFEE0).
    uint_t half = ( u > 0xFF00 ) ? ( u - 0xFEE0 ) : u;

    return ( half >= 97 && half <= 122 );
}
//check if the given unicode is an arabic numeric,
// either half-width or full-width.
FRISO_API int utf8_numeric_letter( uint_t u ) 
{
    //make full-width half-width (anything above 0xFF00 is shifted down by 0xFEE0).
    uint_t half = ( u > 0xFF00 ) ? ( u - 0xFEE0 ) : u;

    return ( half >= 48 && half <= 57 );
}
//check if the given unicode is an english letter,
// either half-width or full-width. 
// the numbers and the punctuations are not letters.
FRISO_API int utf8_en_letter( uint_t u ) 
{
    //make full-width half-width (anything above 0xFF00 is shifted down by 0xFEE0).
    uint_t half = ( u > 0xFF00 ) ? ( u - 0xFEE0 ) : u;

    if ( half >= 65 && half <= 90 ) return 1;       //upper case
    return ( half >= 97 && half <= 122 );           //lower case
}
/*
* check if the given fstring is make up with numeric.
* both full-width,half-width numeric is ok.
*
* @param str
* @return int
* 65296,
* 65297,
* 65298,
* 65299,
* 65300,
* 65301,
* 65302,
* 65303,
* 65304,
* 65305,
*/
FRISO_API int utf8_numeric_string( const fstring str )
{
fstring s = str;
int bytes, u;
while ( *s != '\0' ) {
//if ( ! utf8_numeric_letter( get_utf8_unicode( s++ ) ) ) {
// return 0;
//}
//new implemention.
//@date 2013-10-14
bytes = 1;
if ( *s < 0 ) { //full-width chars.
u = get_utf8_unicode(s);
bytes = get_utf8_bytes(*s);
if ( u < 65296 || u > 65305 ) return 0;
} else if ( *s < 48 || *s > 57 ) {
return 0;
}
s += bytes;
}
return 1;
}
FRISO_API int utf8_decimal_string( const fstring str )
{
int len = strlen(str), i, p = 0;
int bytes = 0, u;
if ( str[0] == '.' || str[len-1] == '.' ) return 0;
for ( i = 1; i < len; bytes = 1 ) {
//count the number of char '.'
if ( str[i] == '.' ) {
i++;
p++;
continue;
} else if ( str[i] < 0 ) {
//full-width numeric.
u = get_utf8_unicode(str+i);
bytes = get_utf8_bytes(str[i]);
if ( u < 65296 || u > 65305 ) return 0;
} else if ( str[i] < 48 || str[i] > 57 ) {
return 0;
}
i += bytes;
}
return (p == 1);
}
/*
 * check if the given unicode is a whitespace: 
 * the ascii space (32) or the unicode 12288 (0x3000).
 *
 * @param  u
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_whitespace( uint_t u ) 
{
    return ( u == 32 || u == 12288 ) ? 1 : 0;
}
/*
 * check if the given unicode is an english punctuation: a printable
 * ascii char (33 - 126) that is neither a digit nor a letter.
 * the full-width forms are not converted here.
 *
 * @param  u
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_en_punctuation( uint_t u ) 
{
    if ( u <= 32 || u >= 127 ) return 0;    //space, control or non ascii.
    if ( u >= 48 && u <= 57 )  return 0;    //arabic numbers.
    if ( u >= 65 && u <= 90 )  return 0;    //upper case letters.
    if ( u >= 97 && u <= 122 ) return 0;    //lower case letters.

    return 1;
}
/*
 * check if the given unicode is a chinese punctuation: 
 * one of the punctuation gaps of the full-width forms block, 
 * or a cjk symbol and punctuation (0x3001 - 0x301F).
 * @date 2013-08-31 added.
 *
 * @param  u
 * @return int 1 for yes and 0 for not.
 */
FRISO_API int utf8_cn_punctuation( uint_t u ) 
{
    //the gaps around the full-width numbers and letters.
    if ( u >= 0xFF01 && u <= 0xFF0F ) return 1;     //65281 - 65295
    if ( u >= 0xFF1A && u <= 0xFF1F ) return 1;     //65306 - 65311
    if ( u >= 0xFF3B && u <= 0xFF40 ) return 1;     //65339 - 65344
    if ( u >= 0xFF5B && u <= 0xFF65 ) return 1;     //65371 - 65381

    //cjk symbol and punctuation.(added 2013-09-06)
    //from http://www.unicode.org/charts/PDF/U3000.pdf
    return ( u >= 0x3001 && u <= 0x301F );          //12289 - 12319
}
/*
 * check if the given unicode is a letter number.
 * this is a stub: the argument is not consulted 
 * and 0 is returned for every input.
 *
 * @param  u
 * @return int always 0.
 */
FRISO_API int utf8_letter_number( uint_t u ) 
{
    (void) u;
    return 0;
}
/*
 * check if the given unicode is an other number.
 * this is a stub: the argument is not consulted 
 * and 0 is returned for every input.
 *
 * @param  u
 * @return int always 0.
 */
FRISO_API int utf8_other_number( uint_t u ) 
{
    (void) u;
    return 0;
}
//A macro define has replace this.
//FRISO_API int is_en_punctuation( char c )
//{
// return utf8_en_punctuation( (uint_t) c );
//}
/* {{{
'@', '$','%', '^', '&', '-', ':', '.', '/', '\'', '#', '+'
*/
//static friso_hash_t __keep_punctuations_hash__ = NULL;
/* @Deprecated
* check the given char is an english keep punctuation.*/
//FRISO_API int utf8_keep_punctuation( fstring str )
//{
// if ( __keep_punctuations_hash__ == NULL )
// {
// __keep_punctuations_hash__ = new_hash_table();
// hash_put_mapping( __keep_punctuations_hash__, "@", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "$", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "%", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "^", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "&", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "-", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, ":", NULL );
// hash_put_mapping( __keep_punctuations_hash__, ".", NULL );
// //hash_put_mapping( __keep_punctuations_hash__, "/", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "'", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "#", NULL );
// hash_put_mapping( __keep_punctuations_hash__, "+", NULL );
// }
// //check the hash.
// return hash_exist_mapping( __keep_punctuations_hash__, str );
//}
/* }}} */
/*
* check the given english char is a full-width char or not.
*
* @param ch
* @return 1 for yes and 0 for not.
*/
//FRISO_API int utf8_fullwidth_char( uint_t u )
//{
// if ( u == 12288 )
// return 1; //full-width space
// //(32 - 126) ascii code
// return (u > 65280 && u <= 65406);
//}

View File

@ -0,0 +1,220 @@
/*
* friso dynamaic Array interface implementation defined in header file "friso_API.h".
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
/* ********************************************
* friso array list static functions block *
**********************************************/
/*
 * allocate a table of __blocks item slots, every slot set to NULL.
 * the program exits through ___ALLOCATION_ERROR___ when the 
 * allocation of a non-empty table fails.
 *
 * @param  __blocks the number of slots.
 * @return void **  the new table (may be NULL when __blocks is 0).
 */
__STATIC_API__ void **create_array_entries( uint_t __blocks ) 
{
    register uint_t t;
    void **block = ( void ** ) FRISO_CALLOC( sizeof( void * ), __blocks );

    //calloc is allowed to return NULL for a zero sized request 
    //(e.g. trimming an empty list), that is not a failure.
    if ( block == NULL && __blocks > 0 ) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    for ( t = 0; t < __blocks; t++ ) {
        block[t] = NULL;
    }

    return block;
}
//move the items of the array to a new table of 'opacity' slots
// and free the old table. 
// (the opacity should not be smaller than array->length)
__STATIC_API__ friso_array_t resize_array_list( 
        friso_array_t array, 
        uint_t opacity ) 
{
    void **fresh = create_array_entries( opacity );
    void **stale = array->items;
    uint_t n     = array->length;

    //copy the used slots, from the last one down to the first.
    while ( n-- > 0 ) {
        fresh[n] = stale[n];
    }

    FRISO_FREE( stale );
    array->items  = fresh;
    array->allocs = opacity;

    return array;
}
/* ********************************************
* friso array list FRISO_API functions block *
**********************************************/
//create a new array list. (A macro define has replace this.)
//FRISO_API friso_array_t new_array_list( void ) {
// return new_array_list_with_opacity( __DEFAULT_ARRAY_LIST_OPACITY__ );
//}
//create a new, empty array list with room for 'opacity' items.
// the program exits through ___ALLOCATION_ERROR___ 
// when the entry can not be allocated.
FRISO_API friso_array_t new_array_list_with_opacity( uint_t opacity ) 
{
    friso_array_t list = ( friso_array_t ) 
        FRISO_MALLOC( sizeof( *list ) );
    if ( list == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize
    list->length = 0;
    list->allocs = opacity;
    list->items  = create_array_entries( opacity );

    return list;
}
/*
 * free the given friso array and its item table, 
 * but never what the individual items point to.
 * a NULL array is ignored, like free() does.
 */
FRISO_API void free_array_list( friso_array_t array ) 
{
    if ( array == NULL ) return;

    FRISO_FREE( array->items );
    FRISO_FREE( array );
}
//append a new item to the end of the array.
// a full array is grown to (length * 2 + 1) slots first.
FRISO_API void array_list_add( friso_array_t array, void *value ) 
{
    uint_t used = array->length;

    //check the condition to resize.
    if ( used == array->allocs ) {
        resize_array_list( array, used * 2 + 1 );
    }

    //the item table is read after the resize, which replaces it.
    array->items[used] = value;
    array->length = used + 1;
}
//insert a new item at a specified position.
// the items from idx on are moved one slot towards the end.
// idx may be equal to the length (append), 
// a larger idx is silently ignored.
FRISO_API void array_list_insert( 
        friso_array_t array, 
        uint_t idx, 
        void *value ) 
{
    register uint_t t;

    if ( idx > array->length ) return;

    //check the condition to resize the array.
    if ( array->length == array->allocs ) {
        resize_array_list( array, array->length * 2 + 1 );
    }

    //move the elements after idx, from the back.
    //t is the destination slot and stops above idx, so the unsigned
    //counter never has to go below zero: the former 
    //'t = length - 1; t >= idx; t--' loop could not end for idx == 0 
    //and ran off the table.
    for ( t = array->length; t > idx; t-- ) {
        array->items[t] = array->items[t - 1];
    }

    array->items[idx] = value;
    array->length++;
}
//get the item at a specified position,
// NULL if idx is not smaller than the length.
FRISO_API void *array_list_get( friso_array_t array, uint_t idx ) 
{
    return ( idx < array->length ) ? array->items[idx] : NULL;
}
//replace the item at a specified position.
// the old value is returned; for an idx that is not smaller than 
// the length nothing is stored and NULL is returned.
FRISO_API void * array_list_set( 
        friso_array_t array, 
        uint_t idx, 
        void * value ) 
{
    void *previous;

    if ( idx >= array->length ) return NULL;

    previous = array->items[idx];
    array->items[idx] = value;

    return previous;
}
//remove the item at a specified position.
// the items after idx are moved one slot towards the front.
// the removed value is returned, NULL if idx is not smaller 
// than the length.
FRISO_API void * array_list_remove( 
        friso_array_t array, uint_t idx ) 
{
    void *removed;
    uint_t last;

    if ( idx >= array->length ) return NULL;

    removed = array->items[idx];
    last    = array->length - 1;

    //close the gap.
    for ( ; idx < last; idx++ ) {
        array->items[idx] = array->items[idx + 1];
    }

    array->items[last] = NULL;
    array->length = last;

    return removed;
}
/*trim the array list: shrink its table to exactly length slots.
 * an array that is already full is returned untouched.*/
FRISO_API friso_array_t array_list_trim( friso_array_t array ) 
{
    if ( array->length >= array->allocs ) {
        return array;
    }

    return resize_array_list( array, array->length );
}
/*
 * clear the array list.
 * every used slot is set to NULL and the length is set to 0. 
 * neither the item table nor what the items pointed to is freed.
 */
FRISO_API friso_array_t array_list_clear( friso_array_t array ) 
{
    //drop the used slots one by one, from the last one.
    while ( array->length > 0 ) {
        array->items[ --array->length ] = NULL;
    }

    return array;
}
//get the size of the array list. (A macro define has replace this.)
//FRISO_API uint_t array_list_size( friso_array_t array ) {
// return array->length;
//}
//return the allocations of the array list.(A macro define has replace this)
//FRISO_API uint_t array_list_allocs( friso_array_t array ) {
// return array->allocs;
//}
//check if the array is empty.(A macro define has replace this.)
//FRISO_API int array_list_empty( friso_array_t array )
//{
// return ( array->length == 0 );
//}

View File

@ -0,0 +1,260 @@
/**
* friso string type check functions,
* like english/CJK, full-width/half-width, punctuation or not.
* @see friso_UTF8.c and friso_GBK.c for detail.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso_ctype.h"
#include "friso_API.h"
/* 
 * check if the current word of the task is a cn string.
 * UTF-8 checks task->unicode, GBK checks task->buffer.
 *
 * @return int the result of the charset checker,
 *  or 0 for a charset that is neither UTF-8 nor GBK
 */
FRISO_API int friso_cn_string( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    if ( charset == FRISO_UTF8 ) return utf8_cjk_string( task->unicode );
    if ( charset == FRISO_GBK )  return gbk_cn_string( task->buffer );

    return 0;
}
/*
 * check if the current word of the task is a whitespace.
 * UTF-8 checks task->unicode, GBK checks task->buffer,
 * any other charset yields 0.
 */
FRISO_API int friso_whitespace( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    return ( charset == FRISO_UTF8 ) ? utf8_whitespace( task->unicode )
         : ( charset == FRISO_GBK )  ? gbk_whitespace( task->buffer )
         : 0;
}
/*
 * check if the char at the current position of the task text
 * (task->text[task->idx]) is a numeric letter.
 * UTF-8 gets the byte value, GBK gets the address of the byte,
 * any other charset yields 0.
 */
FRISO_API int friso_numeric_letter(
        friso_charset_t charset, 
        friso_task_t task) 
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_numeric_letter( (uint_t) task->text[task->idx] );
    }

    if ( charset == FRISO_GBK ) {
        return gbk_numeric_letter( task->text + task->idx );
    }

    return 0;
}
/*
 * check if the char at the current position of the task text
 * (task->text[task->idx]) is an english letter.
 * UTF-8 gets the byte value, GBK gets the address of the byte,
 * any other charset yields 0.
 */
FRISO_API int friso_en_letter( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    return ( charset == FRISO_UTF8 ) ? utf8_en_letter( ( uint_t ) task->text[task->idx] )
         : ( charset == FRISO_GBK )  ? gbk_en_letter( task->text + task->idx )
         : 0;
}
/*
 * check if the current word of the task is a half-width char
 * (punctuations included).
 * UTF-8 checks task->unicode, GBK checks the first byte of
 * task->buffer, any other charset yields 0.
 */
FRISO_API int friso_halfwidth_en_char( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    if ( charset == FRISO_UTF8 ) return utf8_halfwidth_en_char( task->unicode );
    if ( charset == FRISO_GBK )  return gbk_halfwidth_en_char( task->buffer[0] );

    return 0;
}
/*
 * check if the current word of the task is a full-width char
 * (full-width punctuations are not included).
 * UTF-8 checks task->unicode, GBK checks task->buffer,
 * any other charset yields 0.
 */
FRISO_API int friso_fullwidth_en_char( 
        friso_charset_t charset, 
        friso_task_t task ) 
{
    return ( charset == FRISO_UTF8 ) ? utf8_fullwidth_en_char( task->unicode )
         : ( charset == FRISO_GBK )  ? gbk_fullwidth_en_char( task->buffer )
         : 0;
}
/*
 * check if the current word of the task is an english punctuation.
 * UTF-8 checks task->unicode, GBK checks the first byte of
 * task->buffer, any other charset yields 0.
 */
FRISO_API int friso_en_punctuation( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) return utf8_en_punctuation( task->unicode );
    if ( charset == FRISO_GBK )  return gbk_en_punctuation( task->buffer[0] );

    return 0;
}
/*
 * check if the current word of the task is a chinese punctuation.
 * UTF-8 checks task->unicode, GBK checks task->buffer,
 * any other charset yields 0.
 */
FRISO_API int friso_cn_punctuation( 
        friso_charset_t charset, 
        friso_task_t task )
{
    return ( charset == FRISO_UTF8 ) ? utf8_cn_punctuation( task->unicode )
         : ( charset == FRISO_GBK )  ? gbk_cn_punctuation( task->buffer )
         : 0;
}
/*
 * letter number check of the wrap interface.
 * not implemented: both arguments are ignored and 0 is returned.
 */
FRISO_API int friso_letter_number( 
        friso_charset_t charset, 
        friso_task_t task )
{
    (void) charset;
    (void) task;

    return 0;
}
/*
 * other number check of the wrap interface.
 * not implemented: both arguments are ignored and 0 is returned.
 */
FRISO_API int friso_other_number( 
        friso_charset_t charset, 
        friso_task_t task )
{
    (void) charset;
    (void) task;

    return 0;
}
//check if the word is a keep punctuation.
//@Deprecated
//FRISO_API int friso_keep_punctuation(
// friso_charset_t charset,
// friso_task_t task )
//{
// if ( charset == FRISO_UTF8 )
// return utf8_keep_punctuation( task->buffer );
// else if ( charset == FRISO_GBK )
// return gbk_keep_punctuation( task->buffer );
// return 0;
//}
/*
 * check if the given char is an english punctuation.
 * same check as friso_en_punctuation, but for a plain char
 * instead of the current word of a task.
 * any charset other than UTF-8 and GBK yields 0.
 */
FRISO_API int is_en_punctuation( 
        friso_charset_t charset, char c )
{
    return ( charset == FRISO_UTF8 ) ? utf8_en_punctuation( (uint_t) c )
         : ( charset == FRISO_GBK )  ? gbk_en_punctuation( c )
         : 0;
}
/*
 * check if the given string is made up of numeric chars.
 * the buffer is handed to the checker of the given charset,
 * any charset other than UTF-8 and GBK yields 0.
 */
FRISO_API int friso_numeric_string( 
        friso_charset_t charset, 
        char *buffer )
{
    if ( charset == FRISO_UTF8 ) return utf8_numeric_string( buffer );
    if ( charset == FRISO_GBK )  return gbk_numeric_string( buffer );

    return 0;
}
/*
 * check if the given string is a decimal string.
 * the buffer is handed to the checker of the given charset,
 * any charset other than UTF-8 and GBK yields 0.
 */
FRISO_API int friso_decimal_string( 
        friso_charset_t charset, char *buffer )
{
    return ( charset == FRISO_UTF8 ) ? utf8_decimal_string( buffer )
         : ( charset == FRISO_GBK )  ? gbk_decimal_string( buffer )
         : 0;
}
/*
 * check if the current word of the task is an english uppercase letter.
 * UTF-8 checks task->unicode, GBK checks task->buffer,
 * any other charset yields 0.
 */
FRISO_API int friso_uppercase_letter( 
        friso_charset_t charset, 
        friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) return utf8_uppercase_letter( task->unicode );
    if ( charset == FRISO_GBK )  return gbk_uppercase_letter( task->buffer );

    return 0;
}
/* 
 * classify the current word of the task as an english char type.
 * the code to classify is task->unicode for UTF-8, the first byte of
 * task->buffer for GBK and 0 for any other charset.
 *
 * @return friso_enchar_t
 *  FRISO_EN_UNKNOW      for codes outside of 32 - 126
 *  FRISO_EN_WHITESPACE  for 32
 *  FRISO_EN_NUMERIC     for 48 - 57   (0-9)
 *  FRISO_EN_LETTER      for 65 - 90   (A-Z) and 97 - 122 (a-z)
 *  FRISO_EN_PUNCTUATION for everything else inside of 32 - 126
 */
FRISO_API friso_enchar_t friso_enchar_type( 
        friso_charset_t charset, 
        friso_task_t task )
{
    uint_t code = 0;

    if ( charset == FRISO_UTF8 ) {
        code = task->unicode;
    } else if ( charset == FRISO_GBK ) {
        code = (uchar_t)task->buffer[0];
    }

    if ( code < 32 || code > 126 ) {
        return FRISO_EN_UNKNOW;
    }

    if ( code == 32 ) {
        return FRISO_EN_WHITESPACE;
    }

    if ( code >= 48 && code <= 57 ) {
        return FRISO_EN_NUMERIC;
    }

    if ( ( code >= 65 && code <= 90 ) 
            || ( code >= 97 && code <= 122 ) ) {
        return FRISO_EN_LETTER;
    }

    return FRISO_EN_PUNCTUATION;
}
/* 
 * classify a single (half-width) char as an english char type.
 * the char is read as an unsigned byte value.
 *
 * @return friso_enchar_t
 *  FRISO_EN_UNKNOW      for values outside of 32 - 126
 *  FRISO_EN_WHITESPACE  for 32
 *  FRISO_EN_NUMERIC     for 48 - 57   (0-9)
 *  FRISO_EN_LETTER      for 65 - 90   (A-Z) and 97 - 122 (a-z)
 *  FRISO_EN_PUNCTUATION for everything else inside of 32 - 126
 */
FRISO_API friso_enchar_t get_enchar_type( char ch )
{
    uint_t code = (uchar_t) ch;

    if ( code < 32 || code > 126 ) {
        return FRISO_EN_UNKNOW;
    }

    if ( code == 32 ) {
        return FRISO_EN_WHITESPACE;
    }

    if ( code >= 48 && code <= 57 ) {
        return FRISO_EN_NUMERIC;
    }

    if ( ( code >= 65 && code <= 90 ) 
            || ( code >= 97 && code <= 122 ) ) {
        return FRISO_EN_LETTER;
    }

    return FRISO_EN_PUNCTUATION;
}

View File

@ -0,0 +1,261 @@
/**
* Friso charset about function interface header file.
* @package src/friso_charset.h .
* Available charset for now:
* 1. UTF8 - function start with utf8
* 2. GBK - function start with gbk
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#ifndef _friso_charset_h
#define _friso_charset_h
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "friso.h"
#include "friso_API.h"
/** {{{ wrap interface */
/*
 * the functions of this section take the charset and dispatch to the
 * utf8_xxx or the gbk_xxx function declared further below.
 */
/* check if the specified string is a cn string.
*
* @return int (true for cn string or false)
* */
FRISO_API int friso_cn_string( friso_charset_t, friso_task_t );
//check if the specified word is a whitespace.
FRISO_API int friso_whitespace( friso_charset_t, friso_task_t );
//check if the specified word is a numeric letter.
FRISO_API int friso_numeric_letter(friso_charset_t, friso_task_t);
//check if the specified word is an english letter.
FRISO_API int friso_en_letter( friso_charset_t, friso_task_t );
//check if the specified word is a half-width letter.
// punctuations are included.
FRISO_API int friso_halfwidth_en_char( friso_charset_t, friso_task_t );
//check if the specified word is a full-width letter.
// full-width punctuations are not included.
FRISO_API int friso_fullwidth_en_char( friso_charset_t, friso_task_t );
//check if the specified word is an english punctuation.
FRISO_API int friso_en_punctuation( friso_charset_t, friso_task_t );
//check if the specified word is a chinese punctuation.
FRISO_API int friso_cn_punctuation( friso_charset_t, friso_task_t );
//letter number check.
// NOTE(review): the implementation in this commit is a stub that returns 0.
FRISO_API int friso_letter_number( friso_charset_t, friso_task_t );
//other number check.
// NOTE(review): the implementation in this commit is a stub that returns 0.
FRISO_API int friso_other_number( friso_charset_t, friso_task_t );
//check if the word is a keep punctuation.
//@Deprecated
//FRISO_API int friso_keep_punctuation( friso_charset_t, friso_task_t );
//check if the specified string is a numeric string.
FRISO_API int friso_numeric_string( friso_charset_t, char * );
//check if the specified string is a decimal string.
FRISO_API int friso_decimal_string( friso_charset_t, char * );
//check if the specified char is an english uppercase letter.
// full-width and half-width letters are included.
FRISO_API int friso_uppercase_letter( friso_charset_t, friso_task_t );
//en char type.
// (the old macro constants are kept for reference, the enum below replaced them)
//#define FRISO_EN_LETTER 0 //a-z && A-Z
//#define FRISO_EN_NUMERIC 1 //0-9
//#define FRISO_EN_PUNCTUATION 2 //english punctuations
//#define FRISO_EN_WHITESPACE 3 //whitespace
//#define FRISO_EN_UNKNOW -1 //beyond 32-122
typedef enum {
FRISO_EN_LETTER = 0, //A-Z, a-z
FRISO_EN_NUMERIC = 1, //0-9
FRISO_EN_PUNCTUATION = 2, //english punctuations
FRISO_EN_WHITESPACE = 3, //whitespace
FRISO_EN_UNKNOW = -1 //unknown (beyond 32-126)
} friso_enchar_t;
/* get the type of the specified char.
* the type will be one of the friso_enchar_t constants defined above.
* NOTE(review): this comment used to say full-width english chars are
* included, but the implementation returns FRISO_EN_UNKNOW for every
* code outside of 32-126.
*/
FRISO_API friso_enchar_t friso_enchar_type( friso_charset_t, friso_task_t );
/* get the type of the specified en char.
* the type will be one of the friso_enchar_t constants defined above.
* (the char should be a half-width english char only)
*/
FRISO_API friso_enchar_t get_enchar_type( char );
/* }}} */
/** {{{ UTF8 interface*/
/* read the next utf-8 word from the specified position.
*
* @return int the number of bytes of the word that was read.
*/
FRISO_API int utf8_next_word( friso_task_t, uint_t *, fstring );
//get the number of bytes of a utf-8 char.
FRISO_API int get_utf8_bytes( char );
//return the unicode serial number of a given string.
FRISO_API int get_utf8_unicode( const fstring );
//convert the unicode serial number to a utf-8 string.
FRISO_API int unicode_to_utf8( uint_t, fstring );
//check if the given char is a CJK.
FRISO_API int utf8_cjk_string( uint_t ) ;
/*check if the given char is a Basic Latin letter or not.
* all the letters and english punctuations are included.*/
FRISO_API int utf8_halfwidth_en_char( uint_t );
/*
* check if the given char is a full-width latin char or not.
* the full-width arabic numbers and letters are included,
* but not the full-width punctuations.
*/
FRISO_API int utf8_fullwidth_en_char( uint_t );
//check if the given char is an upper case letter or not.
// all the full-width and half-width letters are included.
FRISO_API int utf8_uppercase_letter( uint_t );
//check if the given char is a lower case letter or not.
// all the full-width and half-width letters are included.
FRISO_API int utf8_lowercase_letter( uint_t );
//check if the given char is a numeric.
// the full-width and half-width arabic numerics are included.
FRISO_API int utf8_numeric_letter( uint_t );
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
*/
FRISO_API int utf8_numeric_string( char * );
//check if the given fstring is a decimal string.
FRISO_API int utf8_decimal_string( char * );
//check if the given char is an english char.
//(full-width and half-width)
//not the punctuation of course.
FRISO_API int utf8_en_letter( uint_t );
//check if the given char is a whitespace or not.
FRISO_API int utf8_whitespace( uint_t );
/* check if the given char is a letter number like 'ⅠⅡ'
*/
FRISO_API int utf8_letter_number( uint_t );
/*
* check if the given char is an other number.
* NOTE(review): the example chars of this comment were lost (an empty '' was
* left), presumably enclosed numbers like '①' - TODO confirm.
*/
FRISO_API int utf8_other_number( uint_t );
//check if the given char is an english punctuation.
FRISO_API int utf8_en_punctuation( uint_t ) ;
//check if the given char is a chinese punctuation.
FRISO_API int utf8_cn_punctuation( uint_t u );
//check if the given char is an english punctuation for the given charset.
FRISO_API int is_en_punctuation( friso_charset_t, char );
//#define is_en_punctuation( c ) utf8_en_punctuation((uint_t) c)
//@Deprecated
//FRISO_API int utf8_keep_punctuation( fstring );
/* }}} */
/** {{{ GBK interface */
/* read the next GBK word from the specified position.
*
* @return int the number of bytes of the word that was read.
*/
FRISO_API int gbk_next_word( friso_task_t, uint_t *, fstring );
//get the number of bytes of a GBK char.
// (this comment used to say utf-8, apparently copied from the UTF8 section)
FRISO_API int get_gbk_bytes( char );
//check if the given char is a gbk char (ANSI string).
FRISO_API int gbk_cn_string( char * ) ;
/*check if the given char is an ASCII letter.
* all the letters and english punctuations are included.*/
FRISO_API int gbk_halfwidth_en_char( char );
/*
* check if the given char is a full-width latin char.
* the full-width arabic numbers and letters are included,
* but not the full-width punctuations.
*/
FRISO_API int gbk_fullwidth_en_char( char * );
//check if the given char is an upper case char.
// all the full-width and half-width letters are included.
FRISO_API int gbk_uppercase_letter( char * );
//check if the given char is a lower case char.
// all the full-width and half-width letters are included.
FRISO_API int gbk_lowercase_letter( char * );
//check if the given char is a numeric.
// the full-width and half-width arabic numerics are included.
FRISO_API int gbk_numeric_letter( char * );
/*
* check if the given fstring is made up of numeric chars.
* both full-width and half-width numerics are ok.
*/
FRISO_API int gbk_numeric_string( char * );
//check if the given fstring is a decimal string.
FRISO_API int gbk_decimal_string( char * );
//check if the given char is an english (ASCII) char.
//(full-width and half-width)
//not the punctuation of course.
FRISO_API int gbk_en_letter( char * );
//check if the specified char is a whitespace or not.
FRISO_API int gbk_whitespace( char * );
/* check if the given char is a letter number like 'ⅠⅡ'
*/
FRISO_API int gbk_letter_number( char * );
/*
* check if the given char is an other number.
* NOTE(review): the example chars of this comment were lost (an empty '' was
* left), presumably enclosed numbers like '①' - TODO confirm.
*/
FRISO_API int gbk_other_number( char * );
//check if the given char is an english punctuation.
FRISO_API int gbk_en_punctuation( char ) ;
//check if the given char is a chinese punctuation.
FRISO_API int gbk_cn_punctuation( char * );
//because the logic is the same as for utf8,
// the utf8 interface was invoked directly here.
//FRISO_API int gbk_keep_punctuation( char * );
//@Deprecated
//#define gbk_keep_punctuation( str ) utf8_keep_punctuation(str)
//check if the given english char is a full-width char or not.
//FRISO_API int gbk_fullwidth_char( char * ) ;
/* }}}*/
#endif /*end _friso_charset_h*/

View File

@ -0,0 +1,297 @@
/*
* friso hash table functions implementation defined in header file "friso_API.h".
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
#include <string.h>
//-166411799L
//31 131 1331 13331 133331 ..
//31 131 1313 13131 131313 .. the best
#define HASH_FACTOR 1313131
/* ************************
* mapping function area *
**************************/
/*
 * map a NUL-terminated key to a bucket index.
 * every char of the key is folded into the code with
 * code = code * HASH_FACTOR + char, the result is the code modulo length.
 *
 * @param str    the key, must not be NULL
 * @param length the number of buckets, must not be 0
 * @return uint_t a bucket index in 0 .. length - 1
 */
__STATIC_API__ uint_t hash( fstring str, uint_t length ) 
{
    uint_t code = 0;
    fstring cursor;

    for ( cursor = str; *cursor != '\0'; cursor++ ) {
        code = code * HASH_FACTOR + ( *cursor );
    }

    return ( code % length );
}
/*
 * test if an integer is a prime.
 *
 * @param n the number to test
 * @return int 1 when n is a prime, 0 otherwise (0 for every n below 2)
 */
__STATIC_API__ int is_prime( int n ) 
{
    int j;

    //0, 1 and the negative numbers are not primes.
    if ( n < 2 ) {
        return 0;
    }

    if ( n == 2 || n == 3 ) {
        return 1;
    }

    if ( n % 2 == 0 ) {
        return 0;
    }

    /*
     * n is odd here, so only the odd divisors up to sqrt(n) are tried.
     * the bound has to include sqrt(n) itself, or else the squares of
     * the primes (9, 25, 49 ...) pass as primes.
     * "j <= n / j" is "j * j <= n" without the risk of an int overflow.
     */
    for ( j = 3; j <= n / j; j += 2 ) {
        if ( n % j == 0 ) {
            return 0;
        }
    }

    return 1;
}
/*
 * find a prime at or after the given integer.
 * an even n is bumped to n + 1 first, then only the odd
 * candidates are tried until is_prime accepts one.
 */
__STATIC_API__ int next_prime( int n ) 
{
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while ( ! is_prime( candidate ) ) {
        candidate = candidate + 2;
    }

    return candidate;
}
//fstring copy, return the pointer of the new string.
//static fstring string_copy( fstring _src ) {
//int bytes = strlen( _src );
//fstring _dst = ( fstring ) FRISO_MALLOC( bytes + 1 );
//register int t = 0;
//do {
//_dst[t] = _src[t];
//t++;
//} while ( _src[t] != '\0' );
//_dst[t] = '\0';
//return _dst;
//}
/* *********************************
* static hashtable function area. *
***********************************/
/*
 * allocate a hash entry and fill it in.
 * the key pointer is stored as it is (the key string is not copied).
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 *
 * @param key   the key of the mapping
 * @param value the value of the mapping
 * @param next  the entry that follows the new one in its bucket chain
 */
__STATIC_API__ hash_entry_t new_hash_entry( 
        fstring key, 
        void * value, 
        hash_entry_t next ) 
{
    hash_entry_t entry = ( hash_entry_t ) 
        FRISO_MALLOC( sizeof( friso_hash_entry ) );

    if ( entry == NULL ) {
        ___ALLOCATION_ERROR___
    }

    entry->_next = next;
    entry->_val  = value;
    entry->_key  = key;

    return entry;
}
/*
 * allocate the bucket array of a hash table.
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 *
 * @param blocks the number of buckets
 * @return hash_entry_t * an array of blocks entries, all set to NULL
 */
__STATIC_API__ hash_entry_t * create_hash_entries( uint_t blocks ) 
{
    hash_entry_t *slots = ( hash_entry_t * ) 
        FRISO_CALLOC( sizeof( hash_entry_t ), blocks );
    hash_entry_t *cursor;

    if ( slots == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //mark every bucket as empty.
    for ( cursor = slots; cursor < slots + blocks; cursor++ ) {
        *cursor = NULL;
    }

    return slots;
}
/*
 * a static function to do the re-hash work.
 * the bucket array grows to the next prime after 2 * length + 1, all the
 * entries are relinked into the new buckets (the entries themselves are
 * reused, not copied) and the old bucket array is freed.
 */
__STATIC_API__ void rebuild_hash( friso_hash_t _hash ) 
{
    //printf("rehashed.\n");
    //find the next prime as the length of the hashtable.
    uint_t t, length = next_prime( _hash->length * 2 + 1 );
    hash_entry_t e, next, *_src = _hash->table, \
                 *table = create_hash_entries( length );
    uint_t bucket;

    //copy the nodes
    for ( t = 0; t < _hash->length; t++ ) {
        for ( e = *( _src + t ); e != NULL; e = next ) {
            next = e->_next;

            /*
             * a NULL key lives in bucket 0, the same rule that
             * hash_put_mapping and the lookup functions use.
             * hash() reads the key string, so it must not get NULL.
             */
            bucket = ( e->_key == NULL ) ? 0 : hash( e->_key, length );

            //push the entry at the head of its new bucket.
            e->_next = table[bucket];
            table[bucket] = e;
        }
    }

    _hash->table = table;
    _hash->length = length;
    _hash->threshold = ( uint_t ) ( _hash->length * _hash->factor );

    //free the old hash_entry_t blocks allocations.
    FRISO_FREE( _src );
}
/* ********************************
* hashtable interface functions. *
* ********************************/
/*
 * create an empty hash table with DEFAULT_LENGTH buckets
 * and DEFAULT_FACTOR as its factor.
 * the threshold is length * factor (hash_put_mapping compares the size
 * with it to decide about a rebuild).
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 */
FRISO_API friso_hash_t new_hash_table( void ) 
{
    friso_hash_t fresh = ( friso_hash_t ) FRISO_MALLOC( sizeof ( friso_hash_cdt ) );

    if ( fresh == NULL ) {
        ___ALLOCATION_ERROR___
    }

    fresh->size      = 0;
    fresh->length    = DEFAULT_LENGTH;
    fresh->factor    = DEFAULT_FACTOR;
    fresh->threshold = ( uint_t ) ( fresh->length * fresh->factor );
    fresh->table     = create_hash_entries( fresh->length );

    return fresh;
}
/*
 * free a hash table: all its entries, its bucket array and the table itself.
 *
 * @param _hash       the table to free
 * @param fentry_func optional callback that is invoked with every entry
 *  right before the entry is freed (to release its key/value), may be NULL
 */
FRISO_API void free_hash_table( 
        friso_hash_t _hash, 
        fhash_callback_fn_t fentry_func ) 
{
    register uint_t slot;
    hash_entry_t node, following;

    for ( slot = 0; slot < _hash->length; slot++ ) {
        node = _hash->table[slot];
        while ( node != NULL ) {
            //remember the successor, the node is gone after the free.
            following = node->_next;
            if ( fentry_func != NULL ) {
                fentry_func( node );
            }
            FRISO_FREE( node );
            node = following;
        }
    }

    //the bucket array, then the table itself.
    FRISO_FREE( _hash->table );
    FRISO_FREE( _hash );
}
/*
 * put a mapping into the hash table.
 * a NULL key is kept in bucket 0. when the key is already there
 * (same pointer or equal string) its entry takes over the new key pointer
 * and the new value. otherwise a new entry is pushed at the head of the
 * bucket and the table is rebuilt once the size reaches the threshold.
 *
 * @return void * the value that was replaced, or NULL for a new mapping
 */
FRISO_API void *hash_put_mapping( 
        friso_hash_t _hash, 
        fstring key, 
        void * value ) 
{
    uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
    hash_entry_t node;
    void *previous;

    //look for an entry that already holds the key.
    for ( node = _hash->table[bucket]; node != NULL; node = node->_next ) {
        int same = ( key == node->_key );
        if ( ! same && key != NULL && node->_key != NULL ) {
            same = ( strcmp( key, node->_key ) == 0 );
        }

        if ( ! same ) {
            continue;
        }

        previous = node->_val;
        node->_key = key;
        node->_val = value;
        return previous;
    }

    //not found: link a new entry in front of the bucket chain.
    _hash->table[bucket] = new_hash_entry( key, value, _hash->table[bucket] );
    _hash->size++;

    if ( _hash->size >= _hash->threshold ) {
        rebuild_hash( _hash );
    }

    return NULL;
}
/*
 * check if the hash table holds a mapping for the given key.
 * a key matches by pointer or by string comparison, a NULL key
 * is looked up in bucket 0.
 *
 * @return int 1 when the key is there, 0 otherwise
 */
FRISO_API int hash_exist_mapping( 
        friso_hash_t _hash, fstring key ) 
{
    uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
    hash_entry_t node = _hash->table[bucket];

    while ( node != NULL ) {
        if ( key == node->_key ) {
            return 1;
        }

        if ( key != NULL && node->_key != NULL 
                && strcmp( key, node->_key ) == 0 ) {
            return 1;
        }

        node = node->_next;
    }

    return 0;
}
/*
 * look up the value that is mapped to the given key.
 * a key matches by pointer or by string comparison, a NULL key
 * is looked up in bucket 0.
 *
 * @return void * the value, or NULL when the key is not there
 */
FRISO_API void *hash_get_value( friso_hash_t _hash, fstring key ) 
{
    uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
    hash_entry_t node = _hash->table[bucket];

    while ( node != NULL ) {
        if ( key == node->_key ) {
            return node->_val;
        }

        if ( key != NULL && node->_key != NULL 
                && strcmp( key, node->_key ) == 0 ) {
            return node->_val;
        }

        node = node->_next;
    }

    return NULL;
}
/*
 * unlink the mapping of the given key from the hash table.
 * the entry is not freed here, it is handed back to the caller.
 *
 * @return hash_entry_t the unlinked entry, or NULL when the key is not there
 */
FRISO_API hash_entry_t hash_remove_mapping( 
        friso_hash_t _hash, fstring key ) 
{
    uint_t bucket = ( key == NULL ) ? 0 : hash( key, _hash->length );
    //link is the pointer that leads to the current node:
    // the bucket slot first, the _next field of the predecessor afterwards.
    hash_entry_t *link = &_hash->table[bucket];
    hash_entry_t node;

    for ( node = *link; node != NULL; node = *link ) {
        int same = ( key == node->_key );
        if ( ! same && key != NULL && node->_key != NULL ) {
            same = ( strcmp( key, node->_key ) == 0 );
        }

        if ( same ) {
            *link = node->_next;
            _hash->size--;
            return node;
        }

        link = &node->_next;
    }

    return NULL;
}
//count the size.(A macro define has replace this.)
//FRISO_API uint_t hash_get_size( friso_hash_t _hash ) {
// return _hash->size;
//}

View File

@ -0,0 +1,559 @@
/*
* friso lexicon functions implementation.
* used to deal with the friso lexicon, like: load,remove,match...
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include <stdlib.h>
#include <string.h>
#include "friso_API.h"
#include "friso.h"
#define __SPLIT_MAX_TOKENS__ 5
#define __LEX_FILE_DELIME__ '#'
#define __FRISO_LEX_IFILE__ "friso.lex.ini"
/*
 * create a new lexicon: an array of __FRISO_LEXICON_LENGTH__ hash tables,
 * one new (empty) hash table for every slot.
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 */
FRISO_API friso_dic_t friso_dic_new() 
{
    friso_dic_t tables = ( friso_dic_t ) FRISO_CALLOC( 
            sizeof( friso_hash_t ), __FRISO_LEXICON_LENGTH__ );
    register uint_t idx = 0;

    if ( tables == NULL ) {
        ___ALLOCATION_ERROR___
    }

    while ( idx < __FRISO_LEXICON_LENGTH__ ) {
        tables[idx] = new_hash_table();
        idx++;
    }

    return tables;
}
/**
 * default callback function to invoke 
 *     when free the friso dictionary .
 * it releases what the lex entry stored as the value of the hash entry
 * owns: the word, every synonym string, the synonym list
 * and finally the lex entry itself.
 *
 * @date 2013-06-12
 */
__STATIC_API__ void default_fdic_callback( hash_entry_t e ) 
{
    lex_entry_t entry      = ( lex_entry_t ) e->_val;
    friso_array_t synonyms = entry->syn;
    register uint_t k;

    FRISO_FREE( entry->word );

    //the synonyms are optional.
    if ( synonyms != NULL ) {
        for ( k = 0; k < synonyms->length; k++ ) {
            FRISO_FREE( synonyms->items[k] );
        }
        free_array_list( synonyms );
    }

    //the lex entry itself.
    //@date 2014-01-28 posted by mlemay@gmail.com
    FRISO_FREE( entry );
}
/*
 * free the whole dictionary: every hash table of it (the entries are
 * released through default_fdic_callback) and then the table array.
 */
FRISO_API void friso_dic_free( friso_dic_t dic ) 
{
    register uint_t idx = 0;

    while ( idx < __FRISO_LEXICON_LENGTH__ ) {
        free_hash_table( dic[idx], default_fdic_callback );
        idx++;
    }

    FRISO_FREE( dic );
}
/*
 * create a new lexicon entry.
 * the word and the syn pointers are stored as they are (nothing is copied).
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 *
 * @param word   the word of the entry
 * @param syn    the synonyms array list, may be NULL
 * @param fre    the frequency of the word
 * @param length the length of the word, stored as uchar_t in length and rlen
 * @param type   the lexicon type, stored as uchar_t
 */
FRISO_API lex_entry_t new_lex_entry( 
        fstring word, 
        friso_array_t syn, 
        uint_t fre, 
        uint_t length, 
        uint_t type ) 
{
    lex_entry_t entry = ( lex_entry_t ) 
        FRISO_MALLOC( sizeof( lex_entry_cdt ) );

    if ( entry == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //what the caller passed in.
    entry->word   = word;
    entry->syn    = syn;                //synonyms words array list.
    entry->fre    = fre;
    entry->type   = (uchar_t) type;
    entry->length = (uchar_t) length;
    entry->rlen   = (uchar_t) length;   //the same as length by default.

    //the defaults of the remaining fields.
    entry->pos      = NULL;             //part of speech array list.
    entry->ctrlMask = 0;                //control mask.
    entry->offset   = -1;

    return entry;
}
/**
 * free the given lexicon entry together with what it owns:
 *  1. its word.
 *  2. the strings that its syn items point to.
 *  3. its syn. (friso_array_t)
 *  4. the lex_entry_t itself.
 * its pos is not touched here.
 */
FRISO_API void free_lex_entry_full( lex_entry_t e )
{
    friso_array_t synonyms = e->syn;
    register uint_t k;

    FRISO_FREE( e->word );

    //the synonyms are optional.
    if ( synonyms != NULL ) {
        for ( k = 0; k < synonyms->length; k++ ) {
            FRISO_FREE( synonyms->items[k] );
        }
        free_array_list( synonyms );
    }

    //the lex entry itself.
    //@date 2014-01-28 posted by mlemay@gmail.com
    FRISO_FREE( e );
}
/*
 * free the lexicon entry only.
 * its word, its syn and its pos are left alone,
 * use free_lex_entry_full to release the word and the synonyms as well.
 */
FRISO_API void free_lex_entry( lex_entry_t e )
{
    FRISO_FREE( e );
}
/*
 * add a new entry (with a frequency of 0) to the dictionary.
 * the word and the syn pointers go into the entry as they are.
 * an entry that was stored for the same word before is released
 * with free_lex_entry_full. nothing happens for an invalid lex.
 *
 * @param dic  the dictionary
 * @param lex  the lexicon type, 0 .. __FRISO_LEXICON_LENGTH__ - 1
 * @param word the word to add, must not be NULL
 * @param syn  the synonyms of the word, may be NULL
 */
FRISO_API void friso_dic_add( 
        friso_dic_t dic, 
        friso_lex_t lex,
        fstring word, 
        friso_array_t syn ) 
{
    lex_entry_t fresh;
    void *replaced;

    if ( lex < 0 || lex >= __FRISO_LEXICON_LENGTH__ ) {
        return;
    }

    fresh = new_lex_entry( word, syn, 0, 
            (uint_t) strlen(word), (uint_t) lex );
    replaced = hash_put_mapping( dic[lex], word, fresh );

    if ( replaced != NULL ) {
        free_lex_entry_full( (lex_entry_t) replaced );
    }
}
/*
 * add a new entry with the given frequency to the dictionary.
 * the word and the syn pointers go into the entry as they are.
 * an entry that was stored for the same word before is released
 * with free_lex_entry_full. nothing happens for an invalid lex.
 *
 * @param dic       the dictionary
 * @param lex       the lexicon type, 0 .. __FRISO_LEXICON_LENGTH__ - 1
 * @param word      the word to add, must not be NULL
 * @param syn       the synonyms of the word, may be NULL
 * @param frequency the frequency of the word
 */
FRISO_API void friso_dic_add_with_fre( 
        friso_dic_t dic, 
        friso_lex_t lex,
        fstring word, 
        friso_array_t syn, 
        uint_t frequency ) 
{
    lex_entry_t fresh;
    void *replaced;

    if ( lex < 0 || lex >= __FRISO_LEXICON_LENGTH__ ) {
        return;
    }

    fresh = new_lex_entry( word, syn, frequency, 
            ( uint_t ) strlen(word), ( uint_t ) lex );
    replaced = hash_put_mapping( dic[lex], word, fresh );

    if ( replaced != NULL ) {
        free_lex_entry_full( (lex_entry_t) replaced );
    }
}
/*
 * read a line from a specified stream.
 * the chars up to the next newline (or the end of the stream) are stored
 * in __dst and NUL-terminated, the newline itself is dropped.
 *
 * NOTE: the size of __dst is not known here and not checked,
 * the caller has to pass a buffer that holds the longest line.
 *
 * @return fstring __dst, or NULL when the stream ended before
 *  a single char was read
 * @date   2012-11-24
 */
FRISO_API fstring file_get_line( fstring __dst, FILE * _stream ) 
{
    fstring out = __dst;
    int ch = fgetc( _stream );

    while ( ch != EOF && ch != '\n' ) {
        *out++ = ch;
        ch = fgetc( _stream );
    }

    *out = '\0';

    if ( ch == EOF && out == __dst ) {
        return NULL;
    }

    return __dst;
}
/*
 * static function to copy a string (instead of memcpy).
 * at most blocks chars are copied, the copy stops early at the end
 * of the source. __dst is always NUL-terminated, so it has to hold
 * blocks + 1 chars.
 *
 * @return fstring __dst
 */
__STATIC_API__ fstring string_copy( 
        fstring _src, 
        fstring __dst, 
        uint_t blocks ) 
{
    register uint_t n = 0;

    while ( n < blocks && _src[n] != '\0' ) {
        __dst[n] = _src[n];
        n++;
    }

    __dst[n] = '\0';

    return __dst;
}
/**
 * copy the first bytes of the source fstring into a new heap
 *     allocation of blocks + 1 bytes and NUL-terminate it.
 * exactly blocks bytes are read from the source (there is no check for
 *     its end), and the result has to be freed after use.
 * a failed allocation is handed to ___ALLOCATION_ERROR___.
 *
 * @param _src      source fstring
 * @param blocks    number of bytes to copy
 */
__STATIC_API__ fstring string_copy_heap( 
        fstring _src, uint_t blocks ) 
{
    fstring copy = ( fstring ) FRISO_MALLOC( blocks + 1 );
    register uint_t n = 0;

    if ( copy == NULL ) {
        ___ALLOCATION_ERROR___;
    }

    while ( n < blocks ) {
        copy[n] = _src[n];
        n++;
    }

    copy[n] = '\0';

    return copy;
}
/*
 * find the first appearance of the given char in the fstring.
 * the terminating NUL of the fstring is not searched.
 *
 * @return fstring the address of the char inside of __str,
 *  or NULL when it does not appear
 */
__STATIC_API__ fstring indexOf( fstring __str, char delimiter ) 
{
    fstring cursor;

    for ( cursor = __str; *cursor != '\0'; cursor++ ) {
        if ( *cursor == delimiter ) {
            return cursor;
        }
    }

    return NULL;
}
/**
 * load all the valid words from a specified lexicon file .
 *
 * lines that start with '#' and are longer than one char are notes.
 * for __LEX_STOPWORDS__ every other line is one word.
 * for the other lexicons a line is split on '/' into
 *     word[/synonyms[/frequency]]
 * where synonyms is a ',' separated list or the literal "null".
 * the synonyms are stored only when config->add_syn is set.
 * a warning is printed to stderr when the file cannot be opened.
 *
 * @param friso        friso instance, the words go into friso->dic
 * @param config       friso config instance
 * @param lex          the lexicon type
 * @param lex_file     the path of the lexicon file
 * @param length       the maximum length (in bytes) of the word item
 */
FRISO_API void friso_dic_load( 
        friso_t friso,
        friso_config_t config,
        friso_lex_t lex,
        fstring lex_file,
        uint_t length ) 
{
    FILE * _stream;
    char __char[1024], _buffer[512];
    fstring _line;
    string_split_entry sse;

    fstring _word;
    char _sbuffer[512];
    fstring _syn;
    friso_array_t sywords;
    uint_t _fre;

    if ( ( _stream = fopen( lex_file, "rb" ) ) != NULL ) {
        while ( ( _line = file_get_line( __char, _stream ) ) != NULL ) {
            //clear up the notes
            //make sure the length of the line is greater than 1.
            //like the single '#' mark in stopwords dictionary.
            if ( _line[0] == '#' && strlen(_line) > 1 ) continue;

            //handle the stopwords.
            if ( lex == __LEX_STOPWORDS__ ) {
                /*
                 * clean the chinese words that its length is greater than
                 * max length. a multi-byte word starts with a byte that
                 * has its high bit set. the byte is tested as unsigned
                 * char: plain char may be unsigned, and then a
                 * "(int) c < 0" test never matches and no stopword
                 * would ever be filtered.
                 */
                if ( ((unsigned char) _line[0]) > 127 
                        && strlen( _line ) > length ) continue;
                friso_dic_add( friso->dic, __LEX_STOPWORDS__, 
                        string_copy_heap( _line, strlen(_line) ), NULL ); 
                continue;
            }

            //split the fstring with '/'.
            string_split_reset( &sse, "/", _line); 
            if ( string_split_next( &sse, _buffer ) == NULL ) {
                continue;
            }

            //1. get the word.
            _word = string_copy_heap( _buffer, strlen(_buffer) );

            if ( string_split_next( &sse, _buffer ) == NULL ) {
                //normal lexicon type, 
                //add them to the dictionary directly
                friso_dic_add( friso->dic, lex, _word, NULL ); 
                continue;
            }

            /*
             * filter out the words that its length is larger
             *     than the specified limit.
             * but not for __LEX_ECM_WORDS__ and english __LEX_STOPWORDS__
             *     and __LEX_CEM_WORDS__.
             */
            if ( ! ( lex == __LEX_ECM_WORDS__ || lex == __LEX_CEM_WORDS__ )
                    && strlen( _word ) > length ) {
                //the word was not handed to the dictionary, free it here.
                FRISO_FREE(_word);
                continue;
            }

            //2. get the synonyms words.
            _syn = NULL;
            if ( strcmp( _buffer, "null" ) != 0 ) {
                //_buffer is reused by the next split, keep a copy.
                _syn = string_copy( _buffer, _sbuffer, strlen(_buffer) );
            }

            //3. get the word frequency if it available.
            _fre = 0;
            if ( string_split_next( &sse, _buffer ) != NULL ) {
                _fre = atoi( _buffer );
            }

            /**
             * Here:
             * split the synonyms words with mark "," 
             *     and put them in a array list if the synonyms is not NULL
             */
            sywords = NULL;
            if ( config->add_syn && _syn != NULL ) {
                string_split_reset( &sse, ",", _sbuffer );
                sywords = new_array_list_with_opacity(5);
                while ( string_split_next( &sse, _buffer ) != NULL ) {
                    //the synonyms obey the length limit as well.
                    if ( strlen(_buffer) > length ) continue;
                    array_list_add( sywords, 
                            string_copy_heap(_buffer, strlen(_buffer)) );
                }
                sywords = array_list_trim( sywords );
            }

            //4. add the word item
            friso_dic_add_with_fre( 
                    friso->dic, lex, _word, sywords, _fre );
        } 

        fclose( _stream );
    } else {
        fprintf(stderr, "Warning: Fail to open lexicon file %s\n", lex_file);
        fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
    } 
}
/**
 * get the lexicon type index with the specified 
 *     type keywords .
 * the key is compared (case sensitive) with the names
 *     of the lexicon type constants.
 *
 * @see     friso.h#friso_lex_t
 * @param   _key
 * @return  the lexicon type, or -1 for an unknown key
 */
__STATIC_API__ friso_lex_t get_lexicon_type_with_constant( fstring _key )
{
    //name -> constant, checked in this order.
    const struct {
        const char *name;
        friso_lex_t type;
    } known[] = {
        { "__LEX_CJK_WORDS__",   __LEX_CJK_WORDS__   },
        { "__LEX_CJK_UNITS__",   __LEX_CJK_UNITS__   },
        { "__LEX_ECM_WORDS__",   __LEX_ECM_WORDS__   },
        { "__LEX_CEM_WORDS__",   __LEX_CEM_WORDS__   },
        { "__LEX_CN_LNAME__",    __LEX_CN_LNAME__    },
        { "__LEX_CN_SNAME__",    __LEX_CN_SNAME__    },
        { "__LEX_CN_DNAME1__",   __LEX_CN_DNAME1__   },
        { "__LEX_CN_DNAME2__",   __LEX_CN_DNAME2__   },
        { "__LEX_CN_LNA__",      __LEX_CN_LNA__      },
        { "__LEX_STOPWORDS__",   __LEX_STOPWORDS__   },
        { "__LEX_ENPUN_WORDS__", __LEX_ENPUN_WORDS__ },
        { "__LEX_EN_WORDS__",    __LEX_EN_WORDS__    }
    };
    uint_t idx;

    for ( idx = 0; idx < sizeof( known ) / sizeof( known[0] ); idx++ ) {
        if ( strcmp( _key, known[idx].name ) == 0 ) {
            return known[idx].type;
        }
    }

    return -1;
}
/*
 * copy the first blank-delimited token of a configuration line into dst.
 * leading spaces/tabs are skipped, the token ends at the next space, tab,
 * the given stop character or the end of the line.
 *
 * @param   line        the configuration line
 * @param   length      number of bytes of line
 * @param   stop        extra terminator (':' for type keys, ';' for file names)
 * @param   dst         destination buffer
 * @param   dst_size    size of dst in bytes, terminator included
 * @return  1 if a non-empty token was stored, 0 if the token is empty
 *          or does not fit into dst (dst is an empty string then)
 */
static int lex_ifile_read_token(
        const char *line,
        uint_t length,
        char stop,
        char *dst,
        uint_t dst_size )
{
    uint_t i = 0, t = 0;

    if ( dst_size == 0 ) return 0;

    //skip the leading blanks
    while ( i < length && ( line[i] == ' ' || line[i] == '\t' ) ) i++;

    for ( ; i < length; i++ ) {
        if ( line[i] == ' ' || line[i] == '\t' || line[i] == stop ) break;
        //always keep one byte for the terminator
        if ( t + 1 >= dst_size ) {
            dst[0] = '\0';
            return 0;
        }
        dst[t++] = line[i];
    }
    dst[t] = '\0';

    return ( t > 0 ) ? 1 : 0;
}

/*
 * load the lexicon configuration file.
 * and load all the valid lexicon from the configuration file.
 *
 * the file <_path><__FRISO_LEX_IFILE__> is read line by line: lines starting
 * with '#' and empty lines are ignored, a line ending with '[' opens a
 * section whose first token is the lexicon type key, every following line
 * names a lexicon file (relative to _path) until a line ending with ']'.
 * sections with an unknown type key are ignored.
 *
 * @param   friso       friso instance
 * @param   config      friso_config instance
 * @param   _path       dictionary directory
 * @param   _limits     words length limit
 */
FRISO_API void friso_dic_load_from_ifile(
        friso_t friso,
        friso_config_t config,
        fstring _path,
        uint_t _limits )
{
    FILE *__stream;
    char __chars__[1024], __key__[256], *__line__;
    uint_t __length__;
    friso_lex_t lex_t;
    string_buffer_t sb;

    //get the lexicon configuration file path
    sb = new_string_buffer();
    string_buffer_append( sb, _path );
    string_buffer_append( sb, __FRISO_LEX_IFILE__ );

    if ( ( __stream = fopen( sb->buffer, "rb" ) ) == NULL ) {
        fprintf(stderr, "Warning: Fail to open the lexicon configuration file %s\n", sb->buffer);
        fprintf(stderr, "Warning: Without lexicon file, segment results will not correct \n");
        free_string_buffer(sb);
        return;
    }

    while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) {
        //comment and empty line filter.
        if ( __line__[0] == '#' ) continue;
        if ( __line__[0] == '\0' ) continue;

        //only a line ending with '[' starts an item
        __length__ = strlen( __line__ );
        if ( __line__[ __length__ - 1 ] != '[' ) continue;

        //get the type key, a key that does not fit cannot be a known type.
        if ( ! lex_ifile_read_token( __line__,
                    __length__, ':', __key__, sizeof( __key__ ) ) ) continue;

        //get the lexicon type
        lex_t = get_lexicon_type_with_constant(__key__);
        if ( lex_t == -1 ) continue;

        //load every lexicon file listed until the closing ']'
        while ( ( __line__ = file_get_line( __chars__, __stream ) ) != NULL ) {
            //comments filter.
            if ( __line__[0] == '#' ) continue;
            if ( __line__[0] == '\0' ) continue;

            __length__ = strlen( __line__ );
            if ( __line__[ __length__ - 1 ] == ']' ) break;

            //a blank or over-long entry is skipped instead of
            //overflowing __key__ or loading the bare directory.
            if ( ! lex_ifile_read_token( __line__,
                        __length__, ';', __key__, sizeof( __key__ ) ) ) {
                fprintf(stderr, "Warning: Skip invalid lexicon file entry \"%s\"\n", __line__);
                continue;
            }

            //load the lexicon item from the lexicon file.
            string_buffer_clear( sb );
            string_buffer_append( sb, _path );
            string_buffer_append( sb, __key__ );
            friso_dic_load( friso, config, lex_t, sb->buffer, _limits );
        }
    } //end while

    fclose( __stream );
    free_string_buffer(sb);
}
/*
 * check whether the word is stored in the dictionary of the given type.
 *
 * @return the result of hash_exist_mapping for dic[lex],
 *         or 0 when lex is not a valid lexicon type.
 */
FRISO_API int friso_dic_match(
        friso_dic_t dic,
        friso_lex_t lex,
        fstring word )
{
    //reject the lexicon types outside of the dic array.
    if ( lex < 0 || lex >= __FRISO_LEXICON_LENGTH__ ) {
        return 0;
    }

    return hash_exist_mapping( dic[lex], word );
}
/*
 * get the lex_entry_t associated with the word.
 *
 * @return the value stored for word in dic[lex] (cast to lex_entry_t),
 *         or NULL when lex is not a valid lexicon type.
 */
FRISO_API lex_entry_t friso_dic_get(
        friso_dic_t dic,
        friso_lex_t lex,
        fstring word )
{
    //reject the lexicon types outside of the dic array.
    if ( lex < 0 || lex >= __FRISO_LEXICON_LENGTH__ ) {
        return NULL;
    }

    return ( lex_entry_t ) hash_get_value( dic[lex], word );
}
/*
 * get the size of the dictionary of the specified type.
 *
 * @return hash_get_size of dic[lex], or 0 when lex is not a valid lexicon type.
 */
FRISO_API uint_t friso_spec_dic_size(
        friso_dic_t dic,
        friso_lex_t lex )
{
    //reject the lexicon types outside of the dic array.
    if ( lex < 0 || lex >= __FRISO_LEXICON_LENGTH__ ) {
        return 0;
    }

    return hash_get_size( dic[lex] );
}
/*
 * get the size of the whole dictionary.
 *
 * @return the sum of hash_get_size over all the
 *         __FRISO_LEXICON_LENGTH__ entries of dic.
 */
FRISO_API uint_t friso_all_dic_size(
        friso_dic_t dic )
{
    uint_t total = 0;
    uint_t type = 0;

    while ( type < __FRISO_LEXICON_LENGTH__ ) {
        total += hash_get_size( dic[type] );
        type++;
    }

    return total;
}

View File

@ -0,0 +1,284 @@
/*
* link list functions implementation defined in header file "friso_API.h".
* when a link_node is deleted, we only free the allocation
* of the node itself, not the allocation of its value.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdlib.h>
/*
 * create a new link list node.
 * an allocation failure is handed over to ___ALLOCATION_ERROR___.
 *
 * @param   value   the value the node carries
 * @param   prev    the node before the new one
 * @param   next    the node after the new one
 * @return  the new node
 */
__STATIC_API__ link_node_t new_node_entry(
        void * value,
        link_node_t prev,
        link_node_t next )
{
    link_node_t entry = ( link_node_t )
        FRISO_MALLOC( sizeof( link_node_entry ) );

    if ( entry == NULL ) {
        ___ALLOCATION_ERROR___
    }

    entry->next  = next;
    entry->prev  = prev;
    entry->value = value;

    return entry;
}
/*
 * create a new and empty link list.
 * the list owns two value-less sentinel nodes, head and tail,
 * that are linked to each other; size counts the nodes between them.
 */
FRISO_API friso_link_t new_link_list( void )
{
    friso_link_t list = ( friso_link_t )
        FRISO_MALLOC( sizeof( friso_link_entry ) );

    if ( list == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //initialize the entry
    list->size = 0;
    list->head = new_node_entry( NULL, NULL, NULL );
    list->tail = new_node_entry( NULL, list->head, NULL );
    list->head->next = list->tail;

    return list;
}
/*
 * free the given link list.
 * every node from head to the end of the chain is released,
 * then the list entry itself; the node values are not touched.
 */
FRISO_API void free_link_list( friso_link_t link )
{
    link_node_t cur = link->head;

    while ( cur != NULL ) {
        //remember the successor before the node is gone.
        link_node_t after = cur->next;
        FRISO_FREE( cur );
        cur = after;
    }

    FRISO_FREE( link );
}
/*
 * clear all the nodes of the link list.
 * the nodes between head and tail are freed (their values are not),
 * head and tail are linked to each other again and size is reset to 0.
 *
 * @return the given link list
 */
FRISO_API friso_link_t link_list_clear(
        friso_link_t link )
{
    link_node_t cur = link->head->next;

    //free all the middle nodes.
    while ( cur != link->tail ) {
        link_node_t after = cur->next;
        FRISO_FREE( cur );
        cur = after;
    }

    link->size = 0;
    link->tail->prev = link->head;
    link->head->next = link->tail;

    return link;
}
//get the size of the link list.
//FRISO_API uint_t link_list_size( friso_link_t link ) {
// return link->size;
//}
//check if the link list is empty
//FRISO_API int link_list_empty( friso_link_t link ) {
// return ( link->size == 0 );
//}
/*
 * find the node at the specified position.
 * positions in the lower half are reached from the head,
 * all the others from the tail.
 *
 * @return the node, or NULL when idx is not inside [0, link->size)
 */
__STATIC_API__ link_node_t get_node(
        friso_link_t link, uint_t idx )
{
    link_node_t cur;
    uint_t steps;

    if ( ! ( idx >= 0 && idx < link->size ) ) {
        return NULL;
    }

    if ( idx < link->size / 2 ) {
        //idx + 1 hops forward, the head sentinel is not a position.
        cur = link->head;
        for ( steps = idx + 1; steps > 0; steps-- ) {
            cur = cur->next;
        }
    } else {
        //size - idx hops backward starting at the tail sentinel.
        cur = link->tail;
        for ( steps = link->size - idx; steps > 0; steps-- ) {
            cur = cur->prev;
        }
    }

    return cur;
}
/*
* insert a node before the given node.
* static
*/
//__STATIC_API__ void insert_before(
// friso_link_t link,
// link_node_t node,
// void * value )
//{
// link_node_t e = new_node_entry( value, node->prev, node );
// e->prev->next = e;
// e->next->prev = e;
// //node->prev = e;
//
// link->size++;
//}
/*
 * link a new node carrying value right before node and count it.
 * wrapped in do { } while ( 0 ) so that it is one statement wherever
 * it is used; every argument is evaluated exactly once.
 */
#define insert_before( link, node, value )                              \
do {                                                                    \
    friso_link_t ib_link_ = ( link );                                   \
    link_node_t  ib_next_ = ( node );                                   \
    link_node_t  ib_new_  =                                             \
        new_node_entry( ( value ), ib_next_->prev, ib_next_ );          \
    ib_new_->prev->next = ib_new_;                                      \
    ib_new_->next->prev = ib_new_;                                      \
    ib_link_->size++;                                                   \
} while ( 0 )
/*
 * static function:
 * unlink the given node from the list and free the node.
 * the allocation of the value is not freed, it is returned
 * to the caller who frees it when that is necessary.
 *
 * @return the value of the removed node.
 */
__STATIC_API__ void * remove_node(
        friso_link_t link, link_node_t node )
{
    link_node_t before = node->prev;
    link_node_t after  = node->next;
    void * payload     = node->value;

    before->next = after;
    after->prev  = before;
    link->size--;

    FRISO_FREE( node );

    return payload;
}
/*
 * add a new value to the link list.
 * the new node is inserted just before the tail sentinel.
 */
FRISO_API void link_list_add( friso_link_t link, void * value )
{
    insert_before( link, link->tail, value );
}
/*
 * add a new node before the node at the given index.
 * nothing happens when get_node finds no node at idx.
 */
FRISO_API void link_list_insert_before(
        friso_link_t link, uint_t idx, void * value )
{
    link_node_t target = get_node( link, idx );

    if ( target == NULL ) {
        return;
    }

    insert_before( link, target, value );
}
/*
 * get the value of the node at the specified position.
 *
 * @return the value of the node, or NULL when there is no node at idx.
 */
FRISO_API void * link_list_get(
        friso_link_t link, uint_t idx )
{
    link_node_t found = get_node( link, idx );

    return ( found == NULL ) ? NULL : found->value;
}
/*
 * replace the value of the node at the specified position.
 * the allocation of the old value is not freed, it is returned
 * to the caller who frees it when that is necessary.
 *
 * @return the old value, or NULL when there is no node at idx.
 */
FRISO_API void *link_list_set(
        friso_link_t link,
        uint_t idx, void * value )
{
    void * old;
    link_node_t target = get_node( link, idx );

    if ( target == NULL ) {
        return NULL;
    }

    old = target->value;
    target->value = value;

    return old;
}
/*
 * remove the node at the specified position.
 *
 * @see remove_node
 * @return the value of the removed node,
 *         or NULL when there is no node at idx.
 */
FRISO_API void *link_list_remove(
        friso_link_t link, uint_t idx )
{
    link_node_t target = get_node( link, idx );

    return ( target == NULL ) ? NULL : remove_node( link, target );
}
/*
 * public entry of remove_node:
 * remove the given node from the given link list.
 *
 * @see remove_node.
 * @return the value of the removed node.
 */
FRISO_API void *link_list_remove_node( friso_link_t link, link_node_t node )
{
    void * payload = remove_node( link, node );
    return payload;
}
/*
 * remove the first node, the one right after the head.
 *
 * @return the value of the removed node, or NULL when size is not above 0.
 */
FRISO_API void *link_list_remove_first(
        friso_link_t link )
{
    return ( link->size > 0 )
        ? remove_node( link, link->head->next ) : NULL;
}
/*
 * remove the last node, the one right before the tail.
 *
 * @return the value of the removed node, or NULL when size is not above 0.
 */
FRISO_API void *link_list_remove_last(
        friso_link_t link )
{
    return ( link->size > 0 )
        ? remove_node( link, link->tail->prev ) : NULL;
}
/*
 * append a value at the end of the link list.
 * the new node becomes the one right before the tail sentinel.
 */
FRISO_API void link_list_add_last( friso_link_t link, void *value )
{
    insert_before( link, link->tail, value );
}
/*
 * put a value at the front of the link list.
 * the new node becomes the one right after the head sentinel.
 */
FRISO_API void link_list_add_first( friso_link_t link, void *value )
{
    insert_before( link, link->head->next, value );
}

View File

@ -0,0 +1,316 @@
/*
* utf-8 handle functions implementation.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ******************************************
* fstring buffer functions implements. *
********************************************/
/**
 * create a new zero-filled buffer.
 * @Note:
 * 1. its real size is 1 byte greater than the specified length.
 * 2. an allocation failure is handed over to ___ALLOCATION_ERROR___.
 *
 * @param   length  number of usable bytes
 * @return  the new buffer
 */
__STATIC_API__ fstring create_buffer( uint_t length )
{
    fstring fresh = ( fstring ) FRISO_MALLOC( length + 1 );

    if ( fresh == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //the extra byte is zeroed as well.
    memset( fresh, 0x00, length + 1 );

    return fresh;
}
/*
 * move the content of the string buffer into a new buffer of
 * __allocs (+1) bytes and free the old one.
 * the __allocs should not be smaller than sb->length,
 * sb->length bytes are copied without any further check.
 *
 * @return the given string buffer
 */
__STATIC_API__ string_buffer_t resize_buffer(
        string_buffer_t sb, uint_t __allocs )
{
    fstring grown = create_buffer( __allocs );

    memcpy( grown, sb->buffer, sb->length );
    FRISO_FREE( sb->buffer );

    sb->allocs = __allocs;
    sb->buffer = grown;

    return sb;
}
//create a new fstring buffer with a default opacity.
//FRISO_API string_buffer_t new_string_buffer( void )
//{
// return new_string_buffer_with_opacity( __BUFFER_DEFAULT_LENGTH__ );
//}
/*
 * create a new and empty fstring buffer with the given opacity (capacity).
 * an allocation failure is handed over to ___ALLOCATION_ERROR___.
 */
FRISO_API string_buffer_t new_string_buffer_with_opacity( uint_t opacity )
{
    string_buffer_t created = ( string_buffer_t )
        FRISO_MALLOC( sizeof( string_buffer_entry ) );

    if ( created == NULL ) {
        ___ALLOCATION_ERROR___
    }

    created->allocs = opacity;
    created->length = 0;
    created->buffer = create_buffer( opacity );

    return created;
}
/*
 * create a fstring buffer that holds a copy of the given string.
 * __BUFFER_DEFAULT_LENGTH__ bytes are allocated on top of
 * the length of the string.
 */
FRISO_API string_buffer_t new_string_buffer_with_string( fstring str )
{
    string_buffer_t created = ( string_buffer_t )
        FRISO_MALLOC( sizeof( string_buffer_entry ) );

    if ( created == NULL ) {
        ___ALLOCATION_ERROR___
    }

    //size the buffer after the source string.
    created->length = strlen( str );
    created->allocs = created->length + __BUFFER_DEFAULT_LENGTH__;
    created->buffer = create_buffer( created->length + __BUFFER_DEFAULT_LENGTH__ );

    //copy the str to the buffer.
    memcpy( created->buffer, str, created->length );

    return created;
}
/*
 * append the given string to the end of the buffer.
 * the buffer is resized to ( length + strlen ) * 2 + 1
 * when the string does not fit into the current allocation.
 */
FRISO_API void string_buffer_append( string_buffer_t sb, fstring __str )
{
    uint_t extra = strlen( __str );

    //check the necessity to resize the buffer.
    if ( sb->length + extra > sb->allocs ) {
        sb = resize_buffer( sb, ( sb->length + extra ) * 2 + 1 );
    }

    //copy the __str behind the current content.
    memcpy( sb->buffer + sb->length, __str, extra );
    sb->length += extra;
}
/*
 * append a single char to the end of the buffer.
 * the buffer is resized to length * 2 + 1 when it is full.
 */
FRISO_API void string_buffer_append_char( string_buffer_t sb, char ch )
{
    //check the necessity to resize the buffer.
    if ( sb->length + 1 > sb->allocs ) {
        sb = resize_buffer( sb, sb->length * 2 + 1 );
    }

    sb->buffer[sb->length] = ch;
    sb->length++;
}
/*
 * insert the given string into the buffer at byte offset idx.
 * the bytes from idx on are moved behind the inserted string,
 * idx == sb->length appends the string.
 * nothing happens when idx is greater than sb->length.
 *
 * @param   sb      the string buffer
 * @param   idx     byte offset to insert at
 * @param   __str   the string to insert
 */
FRISO_API void string_buffer_insert(
        string_buffer_t sb,
        uint_t idx,
        fstring __str )
{
    uint_t __len__;

    if ( idx > sb->length ) return;

    __len__ = strlen( __str );
    if ( __len__ == 0 ) return;

    //check the necessity to resize the buffer.
    if ( sb->length + __len__ > sb->allocs ) {
        sb = resize_buffer( sb, ( sb->length + __len__ ) * 2 + 1 );
    }

    //the source and the destination overlap, memmove is required.
    memmove( sb->buffer + idx + __len__,
            sb->buffer + idx, sb->length - idx );
    memcpy( sb->buffer + idx, __str, __len__ );
    sb->length += __len__;

    //create_buffer always allocates allocs + 1 bytes.
    sb->buffer[sb->length] = '\0';
}
/*
 * remove length bytes from the buffer starting at idx.
 * the bytes behind idx + length are moved forward to idx.
 * a request that reaches over the end is cut at the end of the content,
 * nothing happens when idx is at or behind the end or length is 0.
 *
 * @param   sb      the string buffer
 * @param   idx     byte offset of the first byte to remove
 * @param   length  number of bytes to remove
 * @return  the buffer of the string buffer.
 */
FRISO_API fstring string_buffer_remove(
        string_buffer_t sb,
        uint_t idx,
        uint_t length )
{
    //nothing to remove, and no unsigned underflow below.
    if ( idx >= sb->length || length == 0 ) {
        return sb->buffer;
    }

    //never remove more than what is stored behind idx.
    if ( length > sb->length - idx ) {
        length = sb->length - idx;
    }

    //move the bytes after the idx + length
    memmove( sb->buffer + idx,
            sb->buffer + idx + length,
            sb->length - idx - length );
    sb->length -= length;

    //terminate right behind the remaining content,
    //the last remaining byte must stay untouched.
    sb->buffer[sb->length] = '\0';

    return sb->buffer;
}
/*
 * shrink the allocation of the string buffer.
 * when length is smaller than allocs - 1 the
 * buffer is resized to length + 1.
 *
 * @return the given string buffer
 */
FRISO_API string_buffer_t string_buffer_trim( string_buffer_t sb )
{
    //resize the buffer.
    if ( sb->length < sb->allocs - 1 ) {
        resize_buffer( sb, sb->length + 1 );
    }

    return sb;
}
/*
 * free the given string buffer entry but keep its content.
 * the allocation of string_buffer_t->buffer is not freed,
 * it is returned to the caller who frees it when that is necessary.
 *
 * @return the buffer of the freed string buffer
 */
FRISO_API fstring string_buffer_devote( string_buffer_t sb )
{
    fstring content = sb->buffer;

    FRISO_FREE( sb );

    return content;
}
/*
 * clear the given fstring buffer.
 * the first length bytes of its buffer are reset to 0
 * and its length is reset to 0, the allocation is kept.
 */
FRISO_API void string_buffer_clear( string_buffer_t sb )
{
    uint_t used = sb->length;

    sb->length = 0;
    memset( sb->buffer, 0x00, used );
}
/*
 * free everything of the fstring buffer:
 * its buffer first, then the entry itself.
 */
FRISO_API void free_string_buffer( string_buffer_t sb )
{
    fstring content = sb->buffer;

    FRISO_FREE( content );
    FRISO_FREE( sb );
}
/**
 * create a new string_split_entry.
 * delimiter and source are referenced, not copied;
 * their lengths are measured with strlen and the position starts at 0.
 *
 * @param   delimiter
 * @param   source
 * @return  string_split_t;
 */
FRISO_API string_split_t new_string_split(
        fstring delimiter,
        fstring source )
{
    string_split_t created = ( string_split_t )
        FRISO_MALLOC( sizeof( string_split_entry ) );

    if ( created == NULL ) {
        ___ALLOCATION_ERROR___;
    }

    created->idx       = 0;
    created->source    = source;
    created->srcLen    = strlen(source);
    created->delimiter = delimiter;
    created->delLen    = strlen(delimiter);

    return created;
}
/*
 * reuse the split entry for a new delimiter and a new source.
 * both lengths are measured again and the position restarts at 0.
 */
FRISO_API void string_split_reset(
        string_split_t sst,
        fstring delimiter,
        fstring source )
{
    sst->idx       = 0;
    sst->source    = source;
    sst->srcLen    = strlen(source);
    sst->delimiter = delimiter;
    sst->delLen    = strlen(delimiter);
}
/*
 * replace the source of the split entry.
 * the source length is measured again and the position restarts at 0.
 */
FRISO_API void string_split_set_source( string_split_t sst, fstring source )
{
    sst->idx    = 0;
    sst->srcLen = strlen(source);
    sst->source = source;
}
/*
 * replace the delimiter of the split entry.
 * the delimiter length is measured again and the position restarts at 0.
 */
FRISO_API void string_split_set_delimiter( string_split_t sst, fstring delimiter )
{
    sst->idx       = 0;
    sst->delLen    = strlen( delimiter );
    sst->delimiter = delimiter;
}
/*
 * free the split entry itself.
 * its delimiter and source are only referenced and are not freed here.
 */
FRISO_API void free_string_split( string_split_t sst )
{
    FRISO_FREE( sst );
}
/**
 * get the next split fstring, and copy the
 * splited fstring into the __dst buffer.
 *
 * a delimiter only matches when it is completely inside of the source,
 * a delimiter cut off by the end of the source is copied as normal text.
 * with an empty delimiter the rest of the source is returned as one piece.
 * no size of __dst is known here, the caller has to make sure that
 * it can take the longest piece plus the terminator.
 *
 * @param   string_split_t
 * @param   __dst
 * @return  NULL if the end of the source was reached before the call,
 *          else a pointer to the terminator written into __dst
 *          (the piece itself starts at __dst)
 */
FRISO_API fstring string_split_next(
        string_split_t sst, fstring __dst)
{
    fstring _dst = __dst;

    //check if reach the end of the fstring
    if ( sst->idx >= sst->srcLen ) return NULL;

    while ( sst->idx < sst->srcLen ) {
        //find the delimiter here,
        //step over it and finish the current piece.
        if ( sst->delLen > 0
                && sst->srcLen - sst->idx >= sst->delLen
                && memcmp( sst->source + sst->idx,
                    sst->delimiter, sst->delLen ) == 0 ) {
            sst->idx += sst->delLen;
            break;
        }

        //copy the char to the buffer
        *_dst++ = sst->source[sst->idx++];
    }

    *_dst = '\0';
    return _dst;
}

View File

@ -0,0 +1,50 @@
/*
* dynamic array test program.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
//print the length and the allocations of the array.
static void show_usage( friso_array_t array )
{
    printf("length=%d, allocations=%d\n", array->length, array->allocs );
}

//print the item stored at the given index.
static void show_item( friso_array_t array, int idx )
{
    printf("idx=%d, value=%s\n", idx, ( fstring ) array_list_get( array, idx ) );
}

/*
 * walk through add, trim, get, set, remove and insert of the
 * dynamic array and print the state of the array after each step.
 */
int main( int argc, char **args ) {
    //create a new array list.
    friso_array_t array = new_array_list();
    fstring keys[] = {
        "chenmanwen", "yangqinghua",
        "chenxin", "luojiangyan", "xiaoyanzi", "bibi",
        "zhangrenfang", "yangjian",
        "liuxiao", "pankai",
        "chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
        "caizaili", "panpan", "xiaolude", "yintanwen"
    };
    int idx = 2;
    int len = sizeof( keys ) / sizeof( keys[0] );
    int j = 0;

    while ( j < len ) {
        array_list_add( array, keys[j] );
        j++;
    }
    show_usage( array );

    array_list_trim( array );
    printf("after tirm length=%d, allocations=%d\n", array->length, array->allocs );
    show_item( array, idx );

    printf("\nAfter set %dth item.\n", idx );
    array_list_set( array, idx, "chenxin__" );
    show_item( array, idx );

    printf("\nAfter remove %dth item.\n", idx );
    array_list_remove( array, idx );
    show_usage( array );
    show_item( array, idx );

    printf("\nInsert a item at %dth\n", idx );
    array_list_insert( array, idx, "*chenxin*" );
    show_item( array, idx );

    free_array_list( array );

    return 0;
}

View File

@ -0,0 +1,163 @@
/*
* Friso test program.
* Of course you can make it a perfect demo for friso.
* all threads or processes share the same friso_t,
* different threads/processes use different friso_task_t.
* and you could share the friso_config_t if you wish...
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include "friso.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define __LENGTH__ 15
#define __INPUT_LENGTH__ 20480
#define ___EXIT_INFO___ \
println("Thanks for trying friso."); \
break;
#define ___ABOUT___ \
println("+---------------------------------------------------------------+"); \
println("| Friso - a Chinese word segmentation writen by c. |"); \
println("| bug report email - chenxin619315@gmail.com. |"); \
println("| or: visit https://github.com/lionsoul2014/friso. |"); \
println("| java version for https://github.com/lionsoul2014/jcseg |"); \
println("| type 'quit' to exit the program. |"); \
println("+---------------------------------------------------------------+");
/*
 * read a line from the given stream into __dst.
 * the '\n' is consumed but not stored.
 * __dst must offer at least __INPUT_LENGTH__ bytes: at most
 * __INPUT_LENGTH__ - 1 bytes are stored, the rest of a longer line is
 * read and dropped (this may cut a multi-byte character at the limit).
 *
 * @param   fp      the stream to read from
 * @param   __dst   the destination buffer
 * @return  __dst, or NULL when the end of the stream was
 *          reached before anything could be read
 */
static fstring getLine( FILE *fp, fstring __dst )
{
    int c;
    int used = 0;

    while ( ( c = getc( fp ) ) != EOF ) {
        if ( c == '\n' ) break;
        //keep one byte for the terminator.
        if ( used < __INPUT_LENGTH__ - 1 ) {
            __dst[used++] = (char) c;
        }
    }
    __dst[used] = '\0';

    return ( c == EOF && used == 0 ) ? NULL : __dst;
}
/*static void printcode( fstring str ) {
int i,length;
length = strlen( str );
printf("str:length=%d\n", length );
for ( i = 0; i < length; i++ ) {
printf("%d ", str[i] );
}
putchar('\n');
}*/
/*
 * interactive friso tester.
 * usage: friso -init <lexicon path>
 * reads lines from stdin, segments each of them and prints the tokens
 * until "quit" is entered or the end of the input is reached.
 */
int main(int argc, char **argv)
{
    clock_t s_time, e_time;
    char line[__INPUT_LENGTH__] = {0};
    int i;
    fstring __path__ = NULL, mode = NULL;
    friso_t friso;
    friso_config_t config;
    friso_task_t task;

    // get the lexicon directory from command line arguments
    // (argv[argc] is NULL, so a trailing -init leaves __path__ NULL)
    for ( i = 0; i < argc; i++ ) {
        if ( strcasecmp( "-init", argv[i] ) == 0 ) {
            __path__ = argv[i+1];
        }
    }

    if ( __path__ == NULL ) {
        println("Usage: friso -init lexicon path");
        exit(0);
    }

    s_time = clock();

    //initialize
    friso = friso_new();
    config = friso_new_config();
    if ( friso_init_from_ifile(friso, config, __path__) != 1 ) {
        printf("fail to initialize friso and config.\n");
        goto err;
    }

    switch ( config->mode ) {
    case __FRISO_SIMPLE_MODE__:
        mode = "Simple";
        break;
    case __FRISO_COMPLEX_MODE__:
        mode = "Complex";
        break;
    case __FRISO_DETECT_MODE__:
        mode = "Detect";
        break;
    default:
        //never hand a NULL over to %s
        mode = "Unknown";
        break;
    }

    e_time = clock();

    printf("Initialized in %fsec\n", (double) ( e_time - s_time ) / CLOCKS_PER_SEC );
    printf("Mode: %s\n", mode);
    printf("+-Version: %s (%s)\n", friso_version(), friso->charset == FRISO_UTF8 ? "UTF-8" : "GBK" );
    ___ABOUT___;

    //set the task.
    task = friso_new_task();

    while ( 1 ) {
        print("friso>> ");

        //stop at the end of the input instead of spinning on it
        if ( getLine( stdin, line ) == NULL ) {
            ___EXIT_INFO___
        }

        //exit the programe
        if (strcasecmp( line, "quit") == 0) {
            ___EXIT_INFO___
        }

        //set the task text.
        friso_set_text( task, line );
        println("分词结果:");

        s_time = clock();
        while ( ( config->next_token( friso, config, task ) ) != NULL ) {
            printf(
                "%s[%d, %d, %d] ",
                task->token->word,
                task->token->offset,
                task->token->length,
                task->token->rlen
            );
        }
        e_time = clock();

        printf("\nDone, cost < %fsec\n", ( (double)(e_time - s_time) ) / CLOCKS_PER_SEC );
    }

    friso_free_task(task);

    //error block.
err:
    friso_free_config(config);
    friso_free(friso);

    return 0;
}

View File

@ -0,0 +1,66 @@
/**
* hashmap testing program
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
//print the length, size, factor and threshold fields of the hash table.
void print_hash_info( friso_hash_t _hash )
{
    printf(
        "info:length=%d, size=%d, facotr=%f, threshold=%d\n",
        _hash->length,
        _hash->size,
        _hash->factor,
        _hash->threshold
    );
}
/*
 * put 30 names into a hash table (each name is its own value),
 * then remove them one by one and print whether
 * the mapping exists before and after the removal.
 */
int main(int argc, char **argv)
{
    friso_hash_t table = new_hash_table();
    char *names[] = {
        "陈满文", "阳清华",
        "陈鑫", "罗江艳",
        "小燕子", "比比",
        "张仁芳", "阳建",
        "陈配", "李恒",
        "张志刚", "张怡少",
        "阳江波", "蔡再利",
        "阳绘章", "尹唐文",
        "谭志鹏", "肖路德",
        "潘凯", "刘潇",
        "马朝辉", "张强",
        "殷美林", "元明清",
        "周安", "郭桥安",
        "刘敏", "黄广华",
        "李胜", "黄海清"
    };
    int total = (int) ( sizeof( names ) / sizeof( names[0] ) );
    int k;

    print_hash_info( table );

    //fill the table
    k = 0;
    while ( k < total ) {
        hash_put_mapping( table, names[k], names[k] );
        k++;
    }

    print_hash_info( table );
    printf("Press any key to continue.");
    getchar();

    //remove mappings
    for ( k = 0; k < total; k++ ) {
        char *key = names[k];

        printf("Exist %s?%2d\n", key, hash_exist_mapping( table, key ));
        printf("Now, remove %s\n", key);
        hash_remove_mapping( table, key );
        printf("Exist %s?%2d\n", key, hash_exist_mapping( table, key ));
        printf("*********************************\n");
    }

    printf("Press any key to continue.");
    getchar();
    print_hash_info( table );

    //free the table
    free_hash_table( table, 0 );

    return 0;
}

View File

@ -0,0 +1,109 @@
/*
* lex functions test program.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso.h"
#include <stdio.h>
#include <time.h>
#include <string.h>
#define __LENGTH__ 15
#define ___PRINT_HELP_INFO___ \
printf("1. help print the current menu.\n"); \
printf("2. #set set the classify of the dictionary.\n"); \
printf("3. other search the words in the dictionary.\n"); \
printf("4. quit exit the programe.\n");
int main(int argc, char **argv)
{
lex_entry_t e;
int lex = __LEX_CJK_WORDS__;
char _line[__LENGTH__];
clock_t s_time, e_time;
friso_t friso;
friso_config_t config;
s_time = clock();
friso = friso_new();
config = friso_new_config();
config->add_syn = 0;
friso->dic = friso_dic_new();
//__CJK_WORDS__
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-main.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-admin.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-chars.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-mz.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-cn-place.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-company.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-festival.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-flname.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-food.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-lang.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-nation.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-net.lex", __LENGTH__);
friso_dic_load(friso, config, __LEX_CJK_WORDS__, "../vendors/dict/UTF-8/lex-org.lex", __LENGTH__);
//__CJK_UNITS__
friso_dic_load(friso, config, __LEX_CJK_UNITS__, "../vendors/dict/UTF-8/lex-units.lex", __LENGTH__);
//__MIX_WORDS__
friso_dic_load(friso, config, __LEX_ECM_WORDS__, "../vendors/dict/UTF-8/lex-ecmixed.lex", __LENGTH__);
//__CN_LNAME__
friso_dic_load(friso, config, __LEX_CN_LNAME__, "../vendors/dict/UTF-8/lex-lname.lex", __LENGTH__);
//__CN_SNAME__
friso_dic_load(friso, config, __LEX_CN_SNAME__, "../vendors/dict/UTF-8/lex-sname.lex", __LENGTH__);
//__CN_DNAME1__
friso_dic_load(friso, config, __LEX_CN_DNAME1__, "../vendors/dict/UTF-8/lex-dname-1.lex", __LENGTH__);
//__CN_DNAME2__
friso_dic_load(friso, config, __LEX_CN_DNAME2__, "../vendors/dict/UTF-8/lex-dname-2.lex", __LENGTH__);
//__CN_LNA__
friso_dic_load(friso, config, __LEX_CN_LNA__, "../vendors/dict/UTF-8/lex-ln-adorn.lex", __LENGTH__ );
e_time = clock();
printf(
"Done, cost: %f sec, size=%d\n",
(double) (e_time - s_time) / CLOCKS_PER_SEC,
friso_all_dic_size(friso->dic)
);
while (1) {
printf("friso-%d>> ", lex);
if (scanf("%s", _line) != 1) {
printf("Invalid input\n");
continue;
}
if (strcmp( _line, "quit" ) == 0) {
break;
} else if ( strcmp(_line, "help") == 0 ) {
___PRINT_HELP_INFO___
} else if ( strcmp( _line, "#set" ) == 0 ) {
printf("lex_t>> ");
if (scanf("%d", &lex) != 1) {
printf("Warning: Invalid lex type input\n");
continue;
}
} else {
s_time = clock();
e = friso_dic_get( friso->dic, lex, _line );
e_time = clock();
if (e != NULL) {
printf(
"word=%s, syn=%s, fre=%d, cost:%fsec\n",
e->word, e->syn==NULL? "NULL" : (char *)e->syn->items[0],
e->fre,
(double) (e_time - s_time) / CLOCKS_PER_SEC
);
} else {
printf("%s was not found.\n", _line);
}
}
}
// friso_dic_free( friso->dic );
friso_free(friso);
return 0;
}

View File

@ -0,0 +1,50 @@
/*
* link list test program.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
int main( int argc, char **args ) {
friso_link_t link;
fstring keys[] = {
"chenmanwen", "yangqinghua",
"chenxin", "luojiangyan", "xiaoyanzi", "bibi",
"zhangrenfang", "yangjian",
"liuxiao", "pankai",
"chenpei", "liheng", "zhangzhigang", "zhgangyishao", "yangjiangbo",
"caizaili", "panpan", "xiaolude", "yintanwen"
};
int j, len = sizeof( keys ) / sizeof( fstring );
link = new_link_list();
//print the size of the link
printf("size=%d\n", link->size );
for ( j = 0; j < len; j++ ) {
//link_add( link, keys[j] );
link_list_add_last( link, keys[j] );
}
printf("size=%d\n", link->size );
for ( j = 0; j < len / 2; j++ ) {
//printf("idx=%d, remove %s\n", j, ( fstring ) link_remove( link, 0 ) );
printf("idx=%d, remove %s\n", j, ( fstring ) link_list_remove_first( link ) );
}
printf("size=%d\n", link->size );
//clear all the nodes
link_list_clear( link );
printf("size=%d, head->next->value=%s\n", link->size, ( fstring ) link->head->next->value );
free_link_list( link );
return 0;
}

View File

@ -0,0 +1,28 @@
/**
* friso fstring split test program .
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
/*
 * split a comma separated sentence and print every
 * piece together with the initial fields of the split entry.
 */
int main ( int argc, char **args )
{
    char piece[128];
    fstring text = ",I am a chinese,,my name is Lion,and i am the author of friso,bug report email chenxin619315@gmail.com,qq:1187582057";
    string_split_t sp = new_string_split(",", text );

    printf("sst->idx=%d\n", sp->idx);
    printf("sst->srcLen=%d\n", sp->srcLen);
    printf("sst->delLen=%d\n", sp->delLen);

    for ( ; string_split_next(sp, piece) != NULL; ) {
        printf("buffer:%s\n", piece);
    }

    free_string_split(sp);

    return 0;
}

View File

@ -0,0 +1,51 @@
/*
* fstring handle mode test program.
*
* @author lionsoul<chenxin619315@gmail.com>
*/
#include "friso_API.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * split a utf-8 string into its characters, collect them in
 * a string buffer and exercise remove, trim and devote on it.
 */
int main( int argc, char **args ) {
    fstring str = "康熙字典部首, 符号和标点, 统一表意符号扩展 A ,CJK㈩兼Ⅱ容形式⑩.";
    //room for the longest sequence a leading byte can announce + terminator
    char word[8];
    int bytes, t, j, length = strlen( str );
    int final_length;
    fstring result;
    string_buffer_t sb = new_string_buffer();

    printf("str=%s, length=%d\n", str, length );

    for (t = 0; t < length; t += bytes) {
        bytes = get_utf8_bytes(*(str + t));

        //always advance, a size of 0 would repeat this byte forever.
        if ( bytes <= 0 ) {
            bytes = 1;
            continue;
        }

        //never read behind the string or write behind the word buffer.
        if ( bytes > length - t ) bytes = length - t;
        if ( bytes > (int) sizeof( word ) - 1 ) bytes = (int) sizeof( word ) - 1;

        for ( j = 0; j < bytes; j++ ) {
            word[j] = *(str + t + j );
        }
        word[j] = '\0';

        string_buffer_append( sb, word );
        printf("word=%s\n", word );
    }

    printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
    string_buffer_remove( sb, 0, 3 );
    printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
    string_buffer_remove( sb, 0, 3 );
    printf("length=%d, buffer=%s\n", sb->length, sb->buffer );
    string_buffer_remove( sb, sb->length - 3, 6 );
    sb = string_buffer_trim( sb );

    //string_buffer_devote frees sb, read the length before that:
    //the evaluation order of function arguments is unspecified.
    final_length = (int) sb->length;
    result = string_buffer_devote( sb );
    printf("length=%d, buffer=%s\n", final_length, result );

    //the devoted buffer belongs to us now.
    FRISO_FREE( result );

    //00011110 - yuan ma
    //11100001 - fa ma
    //11100010 - bu ma

    return 0;
}

View File

@ -0,0 +1,65 @@
# friso lexicon configuration file.
# @email chenxin619315@gmail.com
# @date 2012-12-19
# main lexicon
__LEX_CJK_WORDS__ :[
lex-main.lex;
lex-admin.lex;
lex-chars.lex;
lex-cn-mz.lex;
lex-cn-place.lex;
lex-company.lex;
lex-festival.lex;
lex-flname.lex;
lex-food.lex;
lex-lang.lex;
lex-nation.lex;
lex-net.lex;
lex-org.lex;
lex-touris.lex;
# add more here
]
# single chinese unit lexicon
__LEX_CJK_UNITS__ :[
lex-units.lex;
]
# english and chinese mixed word lexicon like "b超".
__LEX_ECM_WORDS__:[
lex-ecmixed.lex;
]
# chinese and english mixed word lexicon like "卡拉ok".
__LEX_CEM_WORDS__:[
lex-cemixed.lex;
]
# chinese last name lexicon.
__LEX_CN_LNAME__:[
lex-lname.lex;
]
# single name words lexicon.
__LEX_CN_SNAME__:[
lex-sname.lex;
]
# first word of a double chinese name.
__LEX_CN_DNAME1__:[
lex-dname-1.lex;
]
# second word of a double chinese name.
__LEX_CN_DNAME2__:[
lex-dname-2.lex;
]
# chinese last name decorate word.
__LEX_CN_LNA__:[
lex-ln-adorn.lex;
]
# stopwords lexicon
__LEX_STOPWORDS__:[
lex-stopword.lex;
]
# english and punctuation mixed words lexicon.
__LEX_ENPUN_WORDS__:[
lex-en-pun.lex;
]
# english words(for synonyms words)
__LEX_EN_WORDS__:[
lex-en.lex;
]

View File

@ -0,0 +1,27 @@
人事部/人事管理部门,人事管理部
人事管理部/人事管理部门,人事部
人事管理部/事管理部门,人事部
信息产业部/null
农业部/null
医管局/医疗管理部门,医疗管理部
医疗管理部/医疗管理部门,医管局
医疗管理部门/医管局,医疗管理部
发改委/null
国土资源部/null
国防部/人民武装力量部,军事部,防卫厅
军事部/人民武装力量部,防卫厅
外交部/国务院,政治部,对外关系部,外务省
外交部长/null
教育部/null
文化部/null
民政部/null
能源部/null
财政部/null
铁道部/null
防卫厅/null
防卫省/null
革命委员会/null
交通运输部/null
对外经济贸易部/null
技术部/null
总装备部/null

View File

@ -0,0 +1,6 @@
#中文英文混合词词库
卡拉ok/null
漂亮mm/null
拳皇ova/拳皇动漫
奇都ktv/null
哆啦a梦/null

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,168 @@
汉族/null
汉族人/null
汉族语/null
蒙古族/null
蒙古族人/null
蒙古族语/null
满族/null
满族人/null
满族语/null
朝鲜族/null
朝鲜族人/null
朝鲜族语/null
赫哲族/null
赫哲族人/null
赫哲族语/null
达斡尔族/null
达斡尔族人/null
达斡尔族语/null
鄂温克族/null
鄂温克族人/null
鄂温克族语/null
鄂伦春族/null
鄂伦春族人/null
鄂伦春族语/null
回族/null
回族人/null
回族语/null
东乡族/null
东乡族人/null
东乡族语/null
土族/null
土族人/null
土族语/null
撒拉族/null
撒拉族人/null
撒拉族语/null
保安族/null
保安族人/null
保安族语/null
裕固族/null
裕固族人/null
裕固族语/null
维吾尔族/null
维吾尔族人/null
维吾尔族语/null
哈萨克族/null
哈萨克族人/null
哈萨克族语/null
柯尔克孜族/null
柯尔克孜族人/null
柯尔克孜族语/null
锡伯族/null
锡伯族人/null
锡伯族语/null
塔吉克族/null
塔吉克族人/null
塔吉克族语/null
乌孜别克族/null
乌孜别克族人/null
乌孜别克族语/null
俄罗斯族/null
俄罗斯族人/null
俄罗斯族语/null
塔塔尔族/null
塔塔尔族人/null
塔塔尔族语/null
藏族/null
藏族人/null
藏族语/null
门巴族/null
门巴族人/null
门巴族语/null
珞巴族/null
珞巴族人/null
珞巴族语/null
羌族/null
羌族人/null
羌族语/null
彝族/null
彝族人/null
彝族语/null
白族/null
白族人/null
白族语/null
哈尼族/null
哈尼族人/null
哈尼族语/null
傣族/null
傣族人/null
傣族语/null
僳僳族/null
僳僳族人/null
僳僳族语/null
佤族/null
佤族人/null
佤族语/null
拉祜族/null
拉祜族人/null
拉祜族语/null
纳西族/null
纳西族人/null
纳西族语/null
景颇族/null
景颇族人/null
景颇族语/null
布朗族/null
布朗族人/null
布朗族语/null
阿昌族/null
阿昌族人/null
阿昌族语/null
普米族/null
普米族人/null
普米族语/null
怒族/null
怒族人/null
怒族语/null
德昂族/null
德昂族人/null
德昂族语/null
独龙族/null
独龙族人/null
独龙族语/null
基诺族/null
基诺族人/null
基诺族语/null
苗族/null
苗族人/null
苗族语/null
布依族/null
布依族人/null
布依族语/null
侗族/null
侗族人/null
侗族语/null
水族/null
水族人/null
水族语/null
仡佬族/null
仡佬族人/null
仡佬族语/null
壮族/null
壮族人/null
壮族语/null
瑶族/null
瑶族人/null
瑶族语/null
仫佬族/null
仫佬族人/null
仫佬族语/null
毛南族/null
毛南族人/null
毛南族语/null
京族/null
京族人/null
京族语/null
土家族/null
土家族人/null
土家族语/null
黎族/null
黎族人/null
黎族语/null
畲族/null
畲族人/null
畲族语/null
高山族/null
高山族人/null
高山族语/null

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,100 @@
央视/null
电信/null
移动/null
网通/null
联通/null
铁通/null
百度/null
环球网/null
长城网/null
新浪/null
腾讯/null
搜搜/soso
谷歌/null
雅虎/null
微软/null
中关村/null
搜狐/null
网易/null
硅谷/null
维基百科/null
巨人网络/null
阿里巴巴/null
阿里旺旺/旺旺
旺旺/null
淘宝/null
赶集网/null
猪八戒网/null
唯你英语/null
拉手网/null
百贯福泰/null
汇划算/null
汇划算网/null
聚划算/null
天猫/null
天猫网/null
亚马逊/null
亚马逊网/null
拍拍/null
拍拍网/null
京东/null
京东商城/null
返利网/null
支付宝/null
支付宝担保/null
支付宝及时到帐/null
支付宝双工能/null
财付通/null
财付通及时到帐/null
网银在线/null
苏宁易购/null
苏宁电器/null
仙童公司/null
开源中国/null
畅想网络/null
快乐大本营/null
越策越开心/null
超级男声/null
超男/null
超级女声/超女
超女/超级女声
好声音/null
快乐男声/快男
快男/快乐男声
快乐女声/null
快女/null
德克士/null
肯德基/null
奥利奥/null
回头客/null
苏波尔/null
苏宁/null
苏宁电器/null
苏宁易购/null
中央银行/null
人民银行/null
工商银行/null
农业银行/null
中国银行/null
建设银行/null
交通银行/null
华夏银行/null
光大银行/null
招商银行/null
中信银行/null
兴业银行/null
民生银行/null
深圳发展银行/null
广东发展银行/null
上海浦东发展银行/null
恒丰银行/null
农业发展银行/null
国家进出口信贷银行/null
国家开发银行/null
北京商业银行/null
上海银行/null
济南商业银行/null
信用社/null
农村信用社/null
邮政局/null
邮政储蓄银行/null

View File

@ -0,0 +1,210 @@
#双姓名首字词库
广
耀
鸿
怀

View File

@ -0,0 +1,211 @@
#双姓名尾字词库
鸿

View File

@ -0,0 +1,131 @@
#英文中文混合字, 注意英文字符均为小写
a咖/主角
a片/毛片,av
a座/null
a股/股票
a型/null
a杯/a罩杯
a罩杯/a杯
a计划/null
aa制/null
ab型/null
ab档案/null
a美a/null
a梦/null
x-射线/null
#
b座/null
b股/null
b型/null
b树/null
b计划/null
b超/null
b杯/b罩杯
b罩杯/b杯
bb机/call机
bb仔/null
bp机/null
#
c盘/null
c座/null
c语言/null
c杯/c罩杯
c罩杯/c杯
cd盒/null
cd机/null
call机/bb机
#
d盘/null
d座/null
d版/null
d杯/d罩杯
d罩杯/d杯
dna鉴定/null
#
e盘/null
e座/null
e化/null
e通/null
e仔/null
e语言/易语言
e杯/e罩杯
e罩杯/e杯
#
f盘/null
f座/null
f杯/f罩杯
f罩杯/f杯
#
g盘/null
g点/null
g杯/g罩杯
g罩杯/g杯
#
h盘/null
h股/null
h杯/h罩杯
h罩杯/h杯
#
i盘/null
ic卡/null
ip卡/null
ip段/null
ip电话/null
ip地址/null
it行业/null
it民工/码农
it男/null
#
j盘/null
#
k仔/null
k盘/null
k党/null
k书/看书,搞学习
k粉/氯胺酮
k歌/唱歌,嗨歌
k他命/null
k歌之王/null
#
n年/很久
#
o型/null
#
pc机/null
ph值/null
#
sim卡/null
#
u盘/null
u形/null
usb手指/null
usb接口/null
usb插口/null
usb记忆棒/null
#
visa卡/null
v沟/null
#
z盘/null
#
q版/null
qq号/null
q立方/null
#
rss订阅/null
#
t盘/null
#
x光/null
x光线/x射线
x射线/x光线
γ射线/null
#
t恤衫/t恤
t恤/t恤衫
t字帐/null
t型台/null
#
250g硬盘/null
160g硬盘/null
500g硬盘/null

View File

@ -0,0 +1,6 @@
#英文和标点组合成的词,英文字母统一使用小写。
c++
g++
c#
i++
x-

View File

@ -0,0 +1,4 @@
#英文词条, 做英文词语同义词追加用
decimal/decimals,fraction
spirit/mind
admire/appreciate,like,love,enjoy

View File

@ -0,0 +1,186 @@
七七纪念日/null
七夕/七夕情人节,情人节,中国情人节
七夕情人节/七夕,中国情人节,情人节
七夕节/七夕,情人节,中国情人节
万圣节/鬼节
世界人权日/null
世界儿歌节/null
世界儿童节/null
世界动物日/null
世界卫生日/null
世界地球日/null
世界教师日/null
世界无烟日/null
世界无童工日/null
世界林业节/null
世界森林日/null
世界水日/null
世界海洋日/null
世界湿地日/null
世界献血日/null
世界环境日/null
世界电视日/null
世界睡眠日/null
世界粮食日/null
世界精神卫生日/null
世界红十字日/null
世界问候日/null
中国人民抗日战争纪念日/null
抗日战争纪念日/null
中国国耻日/null
中国学生营养日/null
中国爱牙日/null
中国爱耳日/null
中国青年志愿者服务日/null
中国青年节/null
中秋/null
中秋节/null
人口日/null
人权日/null
儿歌节/null
儿童节/null
元宵/null
元宵节/null
元旦/null
元旦节/null
党生日/null
全国中小学生安全教育日/null
全国助残日/null
全国爱眼日/null
全国爱耳日/null
六十亿人口日/null
六四纪念日/null
冬至/null
减轻自然灾害日/null
动物日/null
助残日/null
劳动妇女节/null
劳动节/null
博物馆日/null
卫生日/null
和平日/null
国庆/null
国庆节/null
国耻日/null
国际儿童节/null
国际减轻自然灾害日/null
国际劳动妇女节/null
国际劳动节/null
国际博物馆日/null
国际和平日/null
国际奥林匹克日/null
国际妇女节/null
国际容忍日/null
国际左撇子日/null
国际志愿者日/null
国际护士节/null
国际无车日/null
国际残疾人日/null
国际母语日/null
国际气象节/null
国际消费者权益日/null
国际牛奶日/null
国际盲人节/null
国际禁毒日/null
国际老人日/null
国际臭氧层保护日/null
国际非洲儿童日/null
国际音乐日/null
国际麻风日/null
圣诞节/null
地球日/null
处暑/null
复活节/null
夏至/null
大寒/null
大暑/null
大雪/null
奥林匹克日/null
妇女节/null
三八节/null
三八妇女节/null
学生营养日/null
安全教育日/null
安全日/null
容忍日/null
寒露/null
小寒/null
小年/null
小暑/null
小满/null
小雪/null
左撇子日/null
平安夜/null
建党日/null
建军节/null
志愿人员日/null
志愿者日/null
情人节/null
惊蛰/null
愚人节/null
感恩节/null
扫房日/null
抗日战争纪念日/null
抗日纪念日/null
护士节/null
教师日/null
教师节/null
文化遗产日/null
无烟日/null
无童工日/null
无车日/null
春分/null
春节/null
植树节/null
残疾人日/null
母亲节/null
母语日/null
气象节/null
水日/null
海洋日/null
消费者权益日/null
清明/null
清明节/null
湿地日/null
爱牙日/null
爱眼日/null
爱耳日/null
父亲节/null
牛奶日/null
独立日/null
献血日/null
环境日/null
电视日/null
白露/null
盲人节/null
睡眠日/null
秋分/null
立冬/null
立夏/null
立春/null
立秋/null
端午节/null
粮食日/null
精神卫生日/null
红十字日/null
老人日/null
联合国日/null
腊八节/null
腊日/null
臭氧保护日/null
臭氧层保护日/null
芒种/null
营养日/null
谷雨/null
重阳/null
重阳节/null
问候日/null
除夕/null
雨水/null
霜降/null
青年志愿者服务日/null
青年节/null
非洲儿童日/null
音乐日/null
麻风日/null
龙头节/null

View File

@ -0,0 +1,11 @@
#西方姓氏词库
亚历山大/null
克林顿/null
克里斯汀/null
布什/null
布莱尔/null
科特勒/null
约翰/null
约翰逊/null
蒂娜/null
安妮/null

View File

@ -0,0 +1,12 @@
雪碧/null
可口可乐/null
冰红茶/null
奶茶/null
花生奶/null
芬达/null
珍珠奶茶/null
达利源/null
肯德鸡/null
炸薯条/null
麻辣烫/null
麻辣干锅/null

View File

@ -0,0 +1,20 @@
中文/国语
国语/null
台湾话/台语
台语/台湾话
客家话/null
汉字/null
汉语/国语,中文
法文/法语
法语/法文
福建话/null
粤语/广东话
美语/英语,英文
英文/英语
英语/英文
西班牙语/null
闽南语/null
泰语/null
西班牙语/null
俄罗斯语/null
拉丁语/null

View File

@ -0,0 +1,4 @@
#姓氏修饰,例如:老陈,小陈,中的老,小
#如果他已经是姓氏(lex-lname.lex中的词),则无须放在这里。

View File

@ -0,0 +1,513 @@
#中文姓氏词库
#单姓
#向
宿
#和
寿
鹿
#但
#复姓
欧阳
上官
司徒
刘付
皇甫
长孙
相里
令狐
诸葛

169459
libfriso/friso/vendors/dict/GBK/lex-main.lex vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,54 @@
东非/null
中华/null
中华/null
中华人民共和国/null
中华民国/null
中国/null
中國/null
中非/null
乌克兰/null
也门/null
以色列/null
伊拉克/null
伊朗/null
俄罗斯/null
分类/null
加拿大/null
南非/null
古巴/null
台湾/null
埃及/null
塞尔维亚/null
墨西哥/null
威尔士/null
尼日利亚/null
巴比伦/null
希腊/null
德国/null
德意志/null
意大利/null
捷克/null
日本/null
朝鲜/null
比利时/null
法兰西/null
法国/null
波兰/null
波黑/null
瑞典/null
瑞士/null
白俄罗斯/null
缅甸/null
美利坚/null
美利坚合众国/null
美国/null
老挝/null
苏格兰/null
苏联/null
英国/null
英格兰/null
葡萄牙/null
蒙古/null
西班牙/null
越南/null
韩国/null

View File

@ -0,0 +1,23 @@
油条哥/null
活雷锋/null
夕阳红/null
帮扶村/null
后援会/null
复炸油/null
献血哥/null
放心姐/null
啃老族/null
特训班/null
平头男/null
爆头哥/null
楼主/null
有两把刷子/null
非典/null
微信/null
微博/null
吊丝/null
高富帅/null
矮穷挫/null
白富美/null
狮子的魂/null
仓老师/仓井空

View File

@ -0,0 +1,15 @@
上海合作组织/null
世卫/null
世界卫生组织/null
世界银行/null
东盟/null
亚太经合组织/null
人权理事会/null
六方会谈/null
北约/null
哈马斯/null
安全理事会/null
安理会/null
欧佩克/null
红十字会/null
联合国/null

View File

@ -0,0 +1,207 @@
#中文单名词库

View File

@ -0,0 +1,887 @@
#en-punctuation
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
#0
#1
#2
#3
#4
#5
#6
#7
#8
#9
:
;
<
=
>
?
@
[
\
]
^
_
`
#a
#b
#c
#d
#e
#f
#g
#h
#i
#j
#k
#l
#m
#n
#o
#p
#q
#r
#s
#t
#u
#v
#w
#x
#y
#z
{
|
}
~
!
#fullwidth
_
?
?
?
?
?
?
?
#cn-punctuation
?
?
?
?
?
?
?
#中文
使
沿
#英文
to
can
could
dare
do
did
does
may
might
would
should
must
will
ought
shall
need
is
a
am
are
about
according
after
against
all
almost
also
although
among
an
and
another
any
anything
approximately
as
asked
at
back
because
before
besides
between
both
but
by
call
called
currently
despite
did
do
dr
during
each
earlier
eight
even
eventually
every
everything
five
for
four
from
he
her
here
his
how
however
i
if
in
indeed
instead
it
its
just
last
like
major
many
may
maybe
meanwhile
more
moreover
most
mr
mrs
ms
much
my
neither
net
never
nevertheless
nine
no
none
not
nothing
now
of
on
once
one
only
or
other
our
over
partly
perhaps
prior
regarding
separately
seven
several
she
should
similarly
since
six
so
some
somehow
still
such
ten
that
the
their
then
there
therefore
these
they
this
those
though
three
to
two
under
unless
unlike
until
volume
we
what
whatever
whats
when
where
which
while
why
with
without
yesterday
yet
you
your
aboard
about
above
according to
across
afore
after
against
agin
along
alongside
amid
amidst
among
amongst
anent
around
as
aslant
astride
at
athwart
bar
because of
before
behind
below
beneath
beside
besides
between
betwixt
beyond
but
by
circa
despite
down
during
due to
ere
except
for
from
in
inside
into
less
like
mid
midst
minus
near
next
nigh
nigher
nighest
notwithstanding
of
off
on
on to
onto
out
out of
outside
over
past
pending
per
plus
qua
re
round
sans
save
since
through
throughout
thru
till
to
toward
towards
under
underneath
unlike
until
unto
up
upon
versus
via
vice
with
within
without
he
her
herself
hers
him
himself
his
I
it
its
itself
me
mine
my
myself
ours
she
their
theirs
them
themselves
they
us
we
our
ourselves
you
your
yours
yourselves
yourself
this
that
these
those
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fifty
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
#other number

View File

@ -0,0 +1,7 @@
世博园/null
世博会/null
长城/null
黄山/null
衡山/null
华山/null
泰山/null

View File

@ -0,0 +1,39 @@
#中文单字单位词库
#长度
#时间
#分
#币
#容量
#重量
#地积
#其他

View File

@ -0,0 +1,65 @@
# friso lexicon configuration file.
# @email chenxin619315@gmail.com
# @date 2012-12-19
# main lexicon
__LEX_CJK_WORDS__ :[
lex-main.lex;
lex-admin.lex;
lex-chars.lex;
lex-cn-mz.lex;
lex-cn-place.lex;
lex-company.lex;
lex-festival.lex;
lex-flname.lex;
lex-food.lex;
lex-lang.lex;
lex-nation.lex;
lex-net.lex;
lex-org.lex;
lex-touris.lex;
# add more here
]
# single chinese unit lexicon
__LEX_CJK_UNITS__ :[
lex-units.lex;
]
# english and chinese mixed word lexicon like "b超".
__LEX_ECM_WORDS__:[
lex-ecmixed.lex;
]
# chinese and english mixed word lexicon like "卡拉ok".
__LEX_CEM_WORDS__:[
lex-cemixed.lex;
]
# chinese last name lexicon.
__LEX_CN_LNAME__:[
lex-lname.lex;
]
# single name words lexicon.
__LEX_CN_SNAME__:[
lex-sname.lex;
]
# first character of a two-character chinese given name.
__LEX_CN_DNAME1__:[
lex-dname-1.lex;
]
# second character of a two-character chinese given name.
__LEX_CN_DNAME2__:[
lex-dname-2.lex;
]
# chinese last name adornment words (e.g. the 老/小 in 老陈/小陈).
__LEX_CN_LNA__:[
lex-ln-adorn.lex;
]
# stopwords lexicon
__LEX_STOPWORDS__:[
lex-stopword.lex;
]
# english and punctuation mixed words lexicon.
__LEX_ENPUN_WORDS__:[
lex-en-pun.lex;
]
# english words (used for appending english synonyms)
__LEX_EN_WORDS__:[
lex-en.lex;
]

View File

@ -0,0 +1,26 @@
人事部/人事管理部门,人事管理部
人事管理部/人事管理部门,人事部
信息产业部/null
农业部/null
医管局/医疗管理部门,医疗管理部
医疗管理部/医疗管理部门,医管局
医疗管理部门/医管局,医疗管理部
发改委/null
国土资源部/null
国防部/人民武装力量部,军事部,防卫厅
军事部/人民武装力量部,防卫厅
外交部/国务院,政治部,对外关系部,外务省
外交部长/null
教育部/null
文化部/null
民政部/null
能源部/null
财政部/null
铁道部/null
防卫厅/null
防卫省/null
革命委员会/null
交通运输部/null
对外经济贸易部/null
技术部/null
总装备部/null

View File

@ -0,0 +1,9 @@
#中文英文混合词词库
卡拉ok/null
漂亮mm/null
拳皇ova/拳皇动漫
奇都ktv/null
哆啦a梦/null
高3/高三
高2/高二
高1/高一

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,168 @@
汉族/null
汉族人/null
汉族语/null
蒙古族/null
蒙古族人/null
蒙古族语/null
满族/null
满族人/null
满族语/null
朝鲜族/null
朝鲜族人/null
朝鲜族语/null
赫哲族/null
赫哲族人/null
赫哲族语/null
达斡尔族/null
达斡尔族人/null
达斡尔族语/null
鄂温克族/null
鄂温克族人/null
鄂温克族语/null
鄂伦春族/null
鄂伦春族人/null
鄂伦春族语/null
回族/null
回族人/null
回族语/null
东乡族/null
东乡族人/null
东乡族语/null
土族/null
土族人/null
土族语/null
撒拉族/null
撒拉族人/null
撒拉族语/null
保安族/null
保安族人/null
保安族语/null
裕固族/null
裕固族人/null
裕固族语/null
维吾尔族/null
维吾尔族人/null
维吾尔族语/null
哈萨克族/null
哈萨克族人/null
哈萨克族语/null
柯尔克孜族/null
柯尔克孜族人/null
柯尔克孜族语/null
锡伯族/null
锡伯族人/null
锡伯族语/null
塔吉克族/null
塔吉克族人/null
塔吉克族语/null
乌孜别克族/null
乌孜别克族人/null
乌孜别克族语/null
俄罗斯族/null
俄罗斯族人/null
俄罗斯族语/null
塔塔尔族/null
塔塔尔族人/null
塔塔尔族语/null
藏族/null
藏族人/null
藏族语/null
门巴族/null
门巴族人/null
门巴族语/null
珞巴族/null
珞巴族人/null
珞巴族语/null
羌族/null
羌族人/null
羌族语/null
彝族/null
彝族人/null
彝族语/null
白族/null
白族人/null
白族语/null
哈尼族/null
哈尼族人/null
哈尼族语/null
傣族/null
傣族人/null
傣族语/null
僳僳族/null
僳僳族人/null
僳僳族语/null
佤族/null
佤族人/null
佤族语/null
拉祜族/null
拉祜族人/null
拉祜族语/null
纳西族/null
纳西族人/null
纳西族语/null
景颇族/null
景颇族人/null
景颇族语/null
布朗族/null
布朗族人/null
布朗族语/null
阿昌族/null
阿昌族人/null
阿昌族语/null
普米族/null
普米族人/null
普米族语/null
怒族/null
怒族人/null
怒族语/null
德昂族/null
德昂族人/null
德昂族语/null
独龙族/null
独龙族人/null
独龙族语/null
基诺族/null
基诺族人/null
基诺族语/null
苗族/null
苗族人/null
苗族语/null
布依族/null
布依族人/null
布依族语/null
侗族/null
侗族人/null
侗族语/null
水族/null
水族人/null
水族语/null
仡佬族/null
仡佬族人/null
仡佬族语/null
壮族/null
壮族人/null
壮族语/null
瑶族/null
瑶族人/null
瑶族语/null
仫佬族/null
仫佬族人/null
仫佬族语/null
毛南族/null
毛南族人/null
毛南族语/null
京族/null
京族人/null
京族语/null
土家族/null
土家族人/null
土家族语/null
黎族/null
黎族人/null
黎族语/null
畲族/null
畲族人/null
畲族语/null
高山族/null
高山族人/null
高山族语/null

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,100 @@
央视/null
电信/null
移动/null
网通/null
联通/null
铁通/null
百度/null
环球网/null
长城网/null
新浪/null
腾讯/null
搜搜/soso
谷歌/null
雅虎/null
微软/null
中关村/null
搜狐/null
网易/null
硅谷/null
维基百科/null
巨人网络/null
阿里巴巴/null
阿里旺旺/旺旺
旺旺/null
淘宝/null
赶集网/null
猪八戒网/null
唯你英语/null
拉手网/null
百贯福泰/null
汇划算/null
汇划算网/null
聚划算/null
天猫/null
天猫网/null
亚马逊/null
亚马逊网/null
拍拍/null
拍拍网/null
京东/null
京东商城/null
返利网/null
支付宝/null
支付宝担保/null
支付宝及时到帐/null
支付宝双工能/null
财付通/null
财付通及时到帐/null
网银在线/null
苏宁易购/null
苏宁电器/null
仙童公司/null
开源中国/null
畅想网络/null
快乐大本营/null
越策越开心/null
超级男声/null
超男/null
超级女声/超女
超女/超级女声
好声音/null
快乐男声/快男
快男/快乐男声
快乐女声/null
快女/null
德克士/null
肯德基/null
奥利奥/null
回头客/null
苏波尔/null
苏宁/null
苏宁电器/null
苏宁易购/null
中央银行/null
人民银行/null
工商银行/null
农业银行/null
中国银行/null
建设银行/null
交通银行/null
华夏银行/null
光大银行/null
招商银行/null
中信银行/null
兴业银行/null
民生银行/null
深圳发展银行/null
广东发展银行/null
上海浦东发展银行/null
恒丰银行/null
农业发展银行/null
国家进出口信贷银行/null
国家开发银行/null
北京商业银行/null
上海银行/null
济南商业银行/null
信用社/null
农村信用社/null
邮政局/null
邮政储蓄银行/null

View File

@ -0,0 +1,210 @@
#双姓名首字词库
广
耀
鸿
怀

View File

@ -0,0 +1,211 @@
#双姓名尾字词库
鸿

View File

@ -0,0 +1,131 @@
#英文中文混合字, 注意英文字符均为小写
a咖/主角
a片/毛片,av
a座/null
a股/股票
a型/null
a杯/a罩杯
a罩杯/a杯
a计划/null
aa制/null
ab型/null
ab档案/null
a美a/null
a梦/null
x-射线/null
#
b座/null
b股/null
b型/null
b树/null
b计划/null
b超/null
b杯/b罩杯
b罩杯/b杯
bb机/call机
bb仔/null
bp机/null
#
c盘/null
c座/null
c语言/null
c杯/c罩杯
c罩杯/c杯
cd盒/null
cd机/null
call机/bb机
#
d盘/null
d座/null
d版/null
d杯/d罩杯
d罩杯/d杯
dna鉴定/null
#
e盘/null
e座/null
e化/null
e通/null
e仔/null
e语言/易语言
e杯/e罩杯
e罩杯/e杯
#
f盘/null
f座/null
f杯/f罩杯
f罩杯/f杯
#
g盘/null
g点/null
g杯/g罩杯
g罩杯/g杯
#
h盘/null
h股/null
h杯/h罩杯
h罩杯/h杯
#
i盘/null
ic卡/null
ip卡/null
ip段/null
ip电话/null
ip地址/null
it行业/null
it民工/码农
it男/null
#
j盘/null
#
k仔/null
k盘/null
k党/null
k书/看书,搞学习
k粉/氯胺酮
k歌/唱歌,嗨歌
k他命/null
k歌之王/null
#
n年/很久
#
o型/null
#
pc机/null
ph值/null
#
sim卡/null
#
u盘/null
u形/null
usb手指/null
usb接口/null
usb插口/null
usb记忆棒/null
#
visa卡/null
v沟/null
#
z盘/null
#
q版/null
qq号/null
q立方/null
#
rss订阅/null
#
t盘/null
#
x光/null
x光线/x射线
x射线/x光线
γ射线/null
#
t恤衫/t恤
t恤/t恤衫
t字帐/null
t型台/null
#
250g硬盘/null
160g硬盘/null
500g硬盘/null

View File

@ -0,0 +1,6 @@
#英文和标点组合成的词,英文字母统一使用小写。
c++
g++
c#
i++
x-

View File

@ -0,0 +1,5 @@
#英文词条, 做英文词语同义词追加用
decimal/decimals,fraction
spirit/mind
admire/appreciate,like,love,enjoy
chenxin12/chenxin,lionsoul

View File

@ -0,0 +1,186 @@
七七纪念日/null
七夕/七夕情人节,情人节,中国情人节
七夕情人节/七夕,中国情人节,情人节
七夕节/七夕,情人节,中国情人节
万圣节/鬼节
世界人权日/null
世界儿歌节/null
世界儿童节/null
世界动物日/null
世界卫生日/null
世界地球日/null
世界教师日/null
世界无烟日/null
世界无童工日/null
世界林业节/null
世界森林日/null
世界水日/null
世界海洋日/null
世界湿地日/null
世界献血日/null
世界环境日/null
世界电视日/null
世界睡眠日/null
世界粮食日/null
世界精神卫生日/null
世界红十字日/null
世界问候日/null
中国人民抗日战争纪念日/null
抗日战争纪念日/null
中国国耻日/null
中国学生营养日/null
中国爱牙日/null
中国爱耳日/null
中国青年志愿者服务日/null
中国青年节/null
中秋/null
中秋节/null
人口日/null
人权日/null
儿歌节/null
儿童节/null
元宵/null
元宵节/null
元旦/null
元旦节/null
党生日/null
全国中小学生安全教育日/null
全国助残日/null
全国爱眼日/null
全国爱耳日/null
六十亿人口日/null
六四纪念日/null
冬至/null
减轻自然灾害日/null
动物日/null
助残日/null
劳动妇女节/null
劳动节/null
博物馆日/null
卫生日/null
和平日/null
国庆/null
国庆节/null
国耻日/null
国际儿童节/null
国际减轻自然灾害日/null
国际劳动妇女节/null
国际劳动节/null
国际博物馆日/null
国际和平日/null
国际奥林匹克日/null
国际妇女节/null
国际容忍日/null
国际左撇子日/null
国际志愿者日/null
国际护士节/null
国际无车日/null
国际残疾人日/null
国际母语日/null
国际气象节/null
国际消费者权益日/null
国际牛奶日/null
国际盲人节/null
国际禁毒日/null
国际老人日/null
国际臭氧层保护日/null
国际非洲儿童日/null
国际音乐日/null
国际麻风日/null
圣诞节/null
地球日/null
处暑/null
复活节/null
夏至/null
大寒/null
大暑/null
大雪/null
奥林匹克日/null
妇女节/null
三八节/null
三八妇女节/null
学生营养日/null
安全教育日/null
安全日/null
容忍日/null
寒露/null
小寒/null
小年/null
小暑/null
小满/null
小雪/null
左撇子日/null
平安夜/null
建党日/null
建军节/null
志愿人员日/null
志愿者日/null
情人节/null
惊蛰/null
愚人节/null
感恩节/null
扫房日/null
抗日战争纪念日/null
抗日纪念日/null
护士节/null
教师日/null
教师节/null
文化遗产日/null
无烟日/null
无童工日/null
无车日/null
春分/null
春节/null
植树节/null
残疾人日/null
母亲节/null
母语日/null
气象节/null
水日/null
海洋日/null
消费者权益日/null
清明/null
清明节/null
湿地日/null
爱牙日/null
爱眼日/null
爱耳日/null
父亲节/null
牛奶日/null
独立日/null
献血日/null
环境日/null
电视日/null
白露/null
盲人节/null
睡眠日/null
秋分/null
立冬/null
立夏/null
立春/null
立秋/null
端午节/null
粮食日/null
精神卫生日/null
红十字日/null
老人日/null
联合国日/null
腊八节/null
腊日/null
臭氧保护日/null
臭氧层保护日/null
芒种/null
营养日/null
谷雨/null
重阳/null
重阳节/null
问候日/null
除夕/null
雨水/null
霜降/null
青年志愿者服务日/null
青年节/null
非洲儿童日/null
音乐日/null
麻风日/null
龙头节/null

View File

@ -0,0 +1,11 @@
#西方姓氏词库
亚历山大/null
克林顿/null
克里斯汀/null
布什/null
布莱尔/null
科特勒/null
约翰/null
约翰逊/null
蒂娜/null
安妮/null

View File

@ -0,0 +1,12 @@
雪碧/null
可口可乐/null
冰红茶/null
奶茶/null
花生奶/null
芬达/null
珍珠奶茶/null
达利源/null
肯德鸡/null
炸薯条/null
麻辣烫/null
麻辣干锅/null

View File

@ -0,0 +1,20 @@
中文/国语
国语/null
台湾话/台语
台语/台湾话
客家话/null
汉字/null
汉语/国语,中文
法文/法语
法语/法文
福建话/null
粤语/广东话
美语/英语,英文
英文/英语
英语/英文
西班牙语/null
闽南语/null
泰语/null
西班牙语/null
俄罗斯语/null
拉丁语/null

View File

@ -0,0 +1,4 @@
#姓氏修饰,例如:老陈,小陈,中的老,小
#如果他已经是姓氏(lex-lname.lex中的词),则无须放在这里。

View File

@ -0,0 +1,513 @@
#中文姓氏词库
#单姓
#向
宿
#和
寿
鹿
#但
#复姓
欧阳
上官
司徒
刘付
皇甫
长孙
相里
令狐
诸葛

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,54 @@
东非/null
中华/null
中华/null
中华人民共和国/null
中华民国/null
中国/null
中國/null
中非/null
乌克兰/null
也门/null
以色列/null
伊拉克/null
伊朗/null
俄罗斯/null
分类/null
加拿大/null
南非/null
古巴/null
台湾/null
埃及/null
塞尔维亚/null
墨西哥/null
威尔士/null
尼日利亚/null
巴比伦/null
希腊/null
德国/null
德意志/null
意大利/null
捷克/null
日本/null
朝鲜/null
比利时/null
法兰西/null
法国/null
波兰/null
波黑/null
瑞典/null
瑞士/null
白俄罗斯/null
缅甸/null
美利坚/null
美利坚合众国/null
美国/null
老挝/null
苏格兰/null
苏联/null
英国/null
英格兰/null
葡萄牙/null
蒙古/null
西班牙/null
越南/null
韩国/null

View File

@ -0,0 +1,26 @@
油条哥/null
活雷锋/null
夕阳红/null
帮扶村/null
后援会/null
复炸油/null
献血哥/null
放心姐/null
啃老族/null
特训班/null
平头男/null
爆头哥/null
楼主/null
有两把刷子/null
非典/null
微信/null
微博/null
吊丝/null
高富帅/null
矮穷挫/null
白富美/null
狮子的魂/null
仓老师/仓井空
郭德纲/null
单田芳/null
李笑笑/null

View File

@ -0,0 +1,15 @@
上海合作组织/null
世卫/null
世界卫生组织/null
世界银行/null
东盟/null
亚太经合组织/null
人权理事会/null
六方会谈/null
北约/null
哈马斯/null
安全理事会/null
安理会/null
欧佩克/null
红十字会/null
联合国/null

View File

@ -0,0 +1,207 @@
#中文单名词库

View File

@ -0,0 +1,887 @@
#en-punctuation
!
"
#
$
%
&
'
(
)
*
+
,
-
.
/
#0
#1
#2
#3
#4
#5
#6
#7
#8
#9
:
;
<
=
>
?
@
[
\
]
^
_
`
#a
#b
#c
#d
#e
#f
#g
#h
#i
#j
#k
#l
#m
#n
#o
#p
#q
#r
#s
#t
#u
#v
#w
#x
#y
#z
{
|
}
~
!
#fullwidth
_
#cn-punctuation
#中文
使
沿
#英文
to
can
could
dare
do
did
does
may
might
would
should
must
will
ought
shall
need
is
a
am
are
about
according
after
against
all
almost
also
although
among
an
and
another
any
anything
approximately
as
asked
at
back
because
before
besides
between
both
but
by
call
called
currently
despite
did
do
dr
during
each
earlier
eight
even
eventually
every
everything
five
for
four
from
he
her
here
his
how
however
i
if
in
indeed
instead
it
its
just
last
like
major
many
may
maybe
meanwhile
more
moreover
most
mr
mrs
ms
much
my
neither
net
never
nevertheless
nine
no
none
not
nothing
now
of
on
once
one
only
or
other
our
over
partly
perhaps
prior
regarding
separately
seven
several
she
should
similarly
since
six
so
some
somehow
still
such
ten
that
the
their
then
there
therefore
these
they
this
those
though
three
to
two
under
unless
unlike
until
volume
we
what
whatever
whats
when
where
which
while
why
with
without
yesterday
yet
you
your
aboard
about
above
according to
across
afore
after
against
agin
along
alongside
amid
amidst
among
amongst
anent
around
as
aslant
astride
at
athwart
bar
because of
before
behind
below
beneath
beside
besides
between
betwixt
beyond
but
by
circa
despite
down
during
due to
ere
except
for
from
in
inside
into
less
like
mid
midst
minus
near
next
nigh
nigher
nighest
notwithstanding
of
off
on
on to
onto
out
out of
outside
over
past
pending
per
plus
qua
re
round
sans
save
since
through
throughout
thru
till
to
toward
towards
under
underneath
unlike
until
unto
up
upon
versus
via
vice
with
within
without
he
her
herself
hers
him
himself
his
I
it
its
itself
me
mine
my
myself
ours
she
their
theirs
them
themselves
they
us
we
our
ourselves
you
your
yours
yourselves
yourself
this
that
these
those
a
about
above
across
after
afterwards
again
against
all
almost
alone
along
already
also
although
always
am
among
amongst
amoungst
amount
an
and
another
any
anyhow
anyone
anything
anyway
anywhere
are
around
as
at
back
be
became
because
become
becomes
becoming
been
before
beforehand
behind
being
below
beside
besides
between
beyond
bill
both
bottom
but
by
call
can
cannot
cant
co
computer
con
could
couldnt
cry
de
describe
detail
do
done
down
due
during
each
eg
eight
either
eleven
else
elsewhere
empty
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifteen
fifty
fill
find
fire
first
five
for
former
formerly
forty
found
four
from
front
full
further
get
give
go
had
has
hasnt
have
he
hence
her
here
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
ie
if
in
inc
indeed
interest
into
is
it
its
itself
keep
last
latter
latterly
least
less
ltd
made
many
may
me
meanwhile
might
mill
mine
more
moreover
most
mostly
move
much
must
my
myself
name
namely
neither
never
nevertheless
next
nine
no
nobody
none
noone
nor
not
nothing
now
nowhere
of
off
often
on
once
one
only
onto
or
other
others
otherwise
our
ours
ourselves
out
over
own
part
per
perhaps
please
put
rather
re
same
see
seem
seemed
seeming
seems
serious
several
she
should
show
side
since
sincere
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
such
take
ten
than
that
the
their
them
themselves
then
thence
there
thereafter
thereby
therefore
therein
thereupon
these
they
thick
thin
third
this
those
though
three
through
throughout
thru
thus
to
together
too
top
toward
towards
twelve
twenty
two
un
under
until
up
upon
us
very
via
was
we
well
were
what
whatever
when
whence
whenever
where
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
whoever
whole
whom
whose
why
will
with
within
without
would
yet
you
your
yours
yourself
yourselves
#other number

View File

@ -0,0 +1,7 @@
世博园/null
世博会/null
长城/null
黄山/null
衡山/null
华山/null
泰山/null

View File

@ -0,0 +1,39 @@
#中文单字单位词库
#长度
#时间
#分
#币
#容量
#重量
#地积
#其他

52
libfriso/libfriso.pro Normal file
View File

@ -0,0 +1,52 @@
QT -= gui
VERSION = 0.0.1
TARGET = friso
TEMPLATE = lib
DEFINES += LIBFRISO_LIBRARY
CONFIG += c++11
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
# You can also make your code fail to compile if it uses deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
include(friso/friso.pri)
SOURCES += \
friso-interface.c
HEADERS += \
friso-interface.h \
dict_utf_files.path = /usr/share/ukui-search/res/dict/UTF-8/
dict_utf_files.files = $$PWD/friso/vendors/dict/UTF-8/*
dict_gbk_files.path = /usr/share/ukui-search/res/dict/GBK/
dict_gbk_files.files = $$PWD/friso/vendors/dict/GBK/*
friso_ini.path = /usr/share/ukui-search/res/
friso_ini.files = $$PWD/friso/friso.ini
INSTALLS += \
dict_utf_files \
dict_gbk_files \
friso_ini
# Default rules for deployment.
unix {
target.path = /usr/lib
}
!isEmpty(target.path): INSTALLS += target
DISTFILES += \
friso/vendors/dict/UTF-8/* \
friso/vendors/dict/GBK/* \
friso/friso.ini

View File

@ -559,7 +559,7 @@ void FileUtils::getTxtContent(QString &path, QString &textcontent)
const char *codec = uchardet_get_charset(chardet);
if(QTextCodec::codecForName(codec) == 0)
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec);
qWarning()<<"Unsupported Text encoding format"<<path<<QString::fromLocal8Bit(codec)<<"zpf666";
QTextStream stream(encodedString,QIODevice::ReadOnly);
stream.setCodec(codec);

View File

@ -68,5 +68,8 @@ unix {
INCLUDEPATH += $$PWD/../libchinese-segmentation
DEPENDPATH += $$PWD/../libchinese-segmentation
INCLUDEPATH += $$PWD/../libfriso
DEPENDPATH += $$PWD/../libfriso
#DISTFILES += \
# ../translations/libsearch/libukui-search_zh_CN.ts

View File

@ -243,7 +243,7 @@ int main(int argc, char *argv[])
// FirstIndex fi("/home/zhangzihao/Desktop/qwerty");
// FirstIndex* fi = new FirstIndex("/home/zhangzihao/Desktop/qwerty");
FirstIndex fi("/home");
FirstIndex fi("/home/zhangzihao/Desktop");
fi.start();
// fi.wait();
// fi->wait();

View File

@ -66,11 +66,16 @@ qm_files.files = $$OUT_PWD/.qm/*.qm
INSTALLS += qm_files
LIBS += -L$$OUT_PWD/../libsearch -lukui-search -L$$OUT_PWD/../libchinese-segmentation/ -lchinese-segmentation
LIBS += -L$$OUT_PWD/../libsearch -lukui-search \
-L$$OUT_PWD/../libchinese-segmentation -lchinese-segmentation \
-L$$OUT_PWD/../libfriso -lfriso
INCLUDEPATH += $$PWD/../libsearch
DEPENDPATH += $$PWD/../libsearch
INCLUDEPATH += $$PWD/../libfriso
DEPENDPATH += $$PWD/../libfriso
#DISTFILES += \
# ../data/ukui-search-menu.desktop \
# $$OUT_PWD/.qm/bo.qm \

View File

@ -2,7 +2,8 @@ TEMPLATE = subdirs
SUBDIRS += $$PWD/libchinese-segmentation \
$$PWD/libsearch \
$$PWD/src \
$$PWD/ukuisearch-systemdbus
$$PWD/ukuisearch-systemdbus \
$$PWD/libfriso
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
@ -14,7 +15,8 @@ DEFINES += QT_DEPRECATED_WARNINGS
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
libsearch.depends = libchinese-segmentation
libsearch.depends += libchinese-segmentation \
libfriso
src.depends = libsearch
CONFIG += ordered