mirror of https://gitee.com/openkylin/wget.git
249 lines
10 KiB
C
249 lines
10 KiB
C
/* Normalization forms (composition and decomposition) of Unicode strings.
   Copyright (C) 2001-2002, 2009-2019 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
|
|
|
|
#ifndef _UNINORM_H
|
|
#define _UNINORM_H
|
|
|
|
/* Get size_t. */
|
|
#include <stddef.h>
|
|
|
|
#include "unitypes.h"
|
|
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
|
|
/* Conventions:

   All functions prefixed with u8_ operate on UTF-8 encoded strings.
   Their unit is a uint8_t (1 byte).

   All functions prefixed with u16_ operate on UTF-16 encoded strings.
   Their unit is a uint16_t (a 2-byte word).

   All functions prefixed with u32_ operate on UCS-4 encoded strings.
   Their unit is a uint32_t (a 4-byte word).

   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
   n units.

   Functions returning a string result take a (resultbuf, lengthp) argument
   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
   allocated string is returned.  In both cases, *lengthp is set to the
   length (number of units) of the returned string.  In case of error,
   NULL is returned and errno is set.  */
|
|
|
|
|
|
/* Kinds of character decomposition mappings.
   The text in angle brackets is the tag associated with each kind;
   UC_DECOMP_CANONICAL has no tag.  The enumerators are numbered
   consecutively starting at 0.  */
enum
{
  UC_DECOMP_CANONICAL, /* Canonical decomposition.  */
  UC_DECOMP_FONT,      /* <font> A font variant (e.g. a blackletter form).  */
  UC_DECOMP_NOBREAK,   /* <noBreak> A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL,   /* <initial> An initial presentation form (Arabic).  */
  UC_DECOMP_MEDIAL,    /* <medial> A medial presentation form (Arabic).  */
  UC_DECOMP_FINAL,     /* <final> A final presentation form (Arabic).  */
  UC_DECOMP_ISOLATED,  /* <isolated> An isolated presentation form (Arabic).  */
  UC_DECOMP_CIRCLE,    /* <circle> An encircled form.  */
  UC_DECOMP_SUPER,     /* <super> A superscript form.  */
  UC_DECOMP_SUB,       /* <sub> A subscript form.  */
  UC_DECOMP_VERTICAL,  /* <vertical> A vertical layout presentation form.  */
  UC_DECOMP_WIDE,      /* <wide> A wide (or zenkaku) compatibility character.  */
  UC_DECOMP_NARROW,    /* <narrow> A narrow (or hankaku) compatibility character.  */
  UC_DECOMP_SMALL,     /* <small> A small variant form (CNS compatibility).  */
  UC_DECOMP_SQUARE,    /* <square> A CJK squared font variant.  */
  UC_DECOMP_FRACTION,  /* <fraction> A vulgar fraction form.  */
  UC_DECOMP_COMPAT     /* <compat> Otherwise unspecified compatibility character.  */
};
|
|
|
|
/* Maximum size (number of characters) of the decomposition of a single
   Unicode character.  Arrays passed as DECOMPOSITION to uc_decomposition()
   and uc_canonical_decomposition() must have at least this many elements.  */
#define UC_DECOMPOSITION_MAX_LENGTH 32
|
|
|
|
/* Return the character decomposition mapping of a Unicode character.
|
|
DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
|
|
ucs_t elements.
|
|
When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
|
|
filled and N is returned. Otherwise -1 is returned. */
|
|
extern int
|
|
uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
|
|
|
|
/* Return the canonical character decomposition mapping of a Unicode character.
|
|
DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
|
|
ucs_t elements.
|
|
When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
|
|
returned. Otherwise -1 is returned. */
|
|
extern int
|
|
uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
|
|
|
|
|
|
/* Attempt to combine the Unicode characters uc1, uc2.
|
|
uc1 is known to have canonical combining class 0.
|
|
Return the combination of uc1 and uc2, if it exists.
|
|
Return 0 otherwise.
|
|
Not all decompositions can be recombined using this function. See the
|
|
Unicode file CompositionExclusions.txt for details. */
|
|
extern ucs4_t
|
|
uc_composition (ucs4_t uc1, ucs4_t uc2)
|
|
_UC_ATTRIBUTE_CONST;
|
|
|
|
|
|
/* An object of type uninorm_t denotes a Unicode normalization form.
   It is a pointer to a constant structure whose layout is not declared in
   this header.  */
struct unicode_normalization_form;
typedef const struct unicode_normalization_form *uninorm_t;
|
|
|
|
/* UNINORM_NFD: Normalization form D: canonical decomposition.
   The macro yields the address of the uninorm_nfd object declared here.  */
extern const struct unicode_normalization_form uninorm_nfd;
#define UNINORM_NFD (&uninorm_nfd)
|
|
|
|
/* UNINORM_NFC: Normalization form C: canonical decomposition, then
   canonical composition.
   The macro yields the address of the uninorm_nfc object declared here.  */
extern const struct unicode_normalization_form uninorm_nfc;
#define UNINORM_NFC (&uninorm_nfc)
|
|
|
|
/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.
   The macro yields the address of the uninorm_nfkd object declared here.  */
extern const struct unicode_normalization_form uninorm_nfkd;
#define UNINORM_NFKD (&uninorm_nfkd)
|
|
|
|
/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
   canonical composition.
   The macro yields the address of the uninorm_nfkc object declared here.  */
extern const struct unicode_normalization_form uninorm_nfkc;
#define UNINORM_NFKC (&uninorm_nfkc)
|
|
|
|
/* Test whether a normalization form does compatibility decomposition.
   NF is read as a pointer to an unsigned int; the result is bit 0 of that
   value (0 or 1).  NF is evaluated exactly once.  */
#define uninorm_is_compat_decomposing(nf) \
  ((* (const unsigned int *) (nf) >> 0) & 1)
|
|
|
|
/* Test whether a normalization form includes canonical composition.
   NF is read as a pointer to an unsigned int; the result is bit 1 of that
   value (0 or 1).  NF is evaluated exactly once.  */
#define uninorm_is_composing(nf) \
  ((* (const unsigned int *) (nf) >> 1) & 1)
|
|
|
|
/* Return the decomposing variant of a normalization form.
|
|
This maps NFC,NFD -> NFD and NFKC,NFKD -> NFKD. */
|
|
extern uninorm_t
|
|
uninorm_decomposing_form (uninorm_t nf)
|
|
_UC_ATTRIBUTE_PURE;
|
|
|
|
|
|
/* Return the specified normalization form of a string. */
|
|
extern uint8_t *
|
|
u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
|
|
uint8_t *resultbuf, size_t *lengthp);
|
|
extern uint16_t *
|
|
u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
|
|
uint16_t *resultbuf, size_t *lengthp);
|
|
extern uint32_t *
|
|
u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
|
|
uint32_t *resultbuf, size_t *lengthp);
|
|
|
|
|
|
/* Compare S1 and S2, ignoring differences in normalization.
|
|
NF must be either UNINORM_NFD or UNINORM_NFKD.
|
|
If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
|
|
return 0. Upon failure, return -1 with errno set. */
|
|
extern int
|
|
u8_normcmp (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
extern int
|
|
u16_normcmp (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
extern int
|
|
u32_normcmp (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
|
|
|
|
/* Converts the string S of length N to a NUL-terminated byte sequence, in such
|
|
a way that comparing uN_normxfrm (S1) and uN_normxfrm (S2) with uN_cmp2() is
|
|
equivalent to comparing S1 and S2 with uN_normcoll().
|
|
NF must be either UNINORM_NFC or UNINORM_NFKC. */
|
|
extern char *
|
|
u8_normxfrm (const uint8_t *s, size_t n, uninorm_t nf,
|
|
char *resultbuf, size_t *lengthp);
|
|
extern char *
|
|
u16_normxfrm (const uint16_t *s, size_t n, uninorm_t nf,
|
|
char *resultbuf, size_t *lengthp);
|
|
extern char *
|
|
u32_normxfrm (const uint32_t *s, size_t n, uninorm_t nf,
|
|
char *resultbuf, size_t *lengthp);
|
|
|
|
|
|
/* Compare S1 and S2, ignoring differences in normalization, using the
|
|
collation rules of the current locale.
|
|
NF must be either UNINORM_NFC or UNINORM_NFKC.
|
|
If successful, set *RESULTP to -1 if S1 < S2, 0 if S1 = S2, 1 if S1 > S2, and
|
|
return 0. Upon failure, return -1 with errno set. */
|
|
extern int
|
|
u8_normcoll (const uint8_t *s1, size_t n1, const uint8_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
extern int
|
|
u16_normcoll (const uint16_t *s1, size_t n1, const uint16_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
extern int
|
|
u32_normcoll (const uint32_t *s1, size_t n1, const uint32_t *s2, size_t n2,
|
|
uninorm_t nf, int *resultp);
|
|
|
|
|
|
/* Normalization of a stream of Unicode characters.

   A "stream of Unicode characters" is essentially a function that accepts an
   ucs4_t argument repeatedly, optionally combined with a function that
   "flushes" the stream.  */
|
|
|
|
/* Data type of a stream of Unicode characters that normalizes its input
   according to a given normalization form and passes the normalized character
   sequence to the encapsulated stream of Unicode characters.
   Only the tag is declared here; the type is used through pointers.  */
struct uninorm_filter;
|
|
|
|
/* Create and return a normalization filter for Unicode characters.
|
|
The pair (stream_func, stream_data) is the encapsulated stream.
|
|
stream_func (stream_data, uc) receives the Unicode character uc
|
|
and returns 0 if successful, or -1 with errno set upon failure.
|
|
Return the new filter, or NULL with errno set upon failure. */
|
|
extern struct uninorm_filter *
|
|
uninorm_filter_create (uninorm_t nf,
|
|
int (*stream_func) (void *stream_data, ucs4_t uc),
|
|
void *stream_data);
|
|
|
|
/* Stuff a Unicode character into a normalizing filter.
|
|
Return 0 if successful, or -1 with errno set upon failure. */
|
|
extern int
|
|
uninorm_filter_write (struct uninorm_filter *filter, ucs4_t uc);
|
|
|
|
/* Bring data buffered in the filter to its destination, the encapsulated
   stream.
   Return 0 if successful, or -1 with errno set upon failure.
   Note! If after calling this function, additional characters are written
   into the filter, the resulting character sequence in the encapsulated stream
   will not necessarily be normalized.  */
extern int
       uninorm_filter_flush (struct uninorm_filter *filter);
|
|
|
|
/* Bring data buffered in the filter to its destination, the encapsulated
   stream, then close and free the filter.
   Return 0 if successful, or -1 with errno set upon failure.  */
extern int
       uninorm_filter_free (struct uninorm_filter *filter);
|
|
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
|
|
|
|
#endif /* _UNINORM_H */
|