/* GRegex -- regular expression API wrapper around PCRE. * * Copyright (C) 1999, 2000 Scott Wimer * Copyright (C) 2004, Matthias Clasen * Copyright (C) 2005 - 2007, Marco Barisione * Copyright (C) 2022, Marco Trevisan * * SPDX-License-Identifier: LGPL-2.1-or-later * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, see . */ #include "config.h" #include #include #define PCRE2_CODE_UNIT_WIDTH 8 #include #include "gtypes.h" #include "gregex.h" #include "glibintl.h" #include "glist.h" #include "gmessages.h" #include "gstrfuncs.h" #include "gatomic.h" #include "gtestutils.h" #include "gthread.h" /** * GRegex: * * A `GRegex` is the "compiled" form of a regular expression pattern. * * `GRegex` implements regular expression pattern matching using syntax and * semantics similar to Perl regular expression. See the * [PCRE documentation](man:pcrepattern(3)) for the syntax definition. * * Some functions accept a @start_position argument, setting it differs * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL * in the case of a pattern that begins with any kind of lookbehind assertion. * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" * in the middle of words. ("\B" matches only if the current position in the * subject is not a word boundary.) 
When applied to the string "Mississippi" * from byte offset 4, namely "issippi", it does not match, because "\B" is * always false at the start of the subject, which is deemed to be a word * boundary. However, if the entire string is passed, but with * @start_position set to 4, it finds the second occurrence of "iss" because * it is able to look behind the starting point to discover that it is * preceded by a letter. * * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed * to these functions must be encoded in UTF-8. The lengths and the positions * inside the strings are in bytes and not in characters, so, for instance, * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a * single character. If you set %G_REGEX_RAW the strings can be non-valid * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two * bytes and two characters long. * * When matching a pattern, "\n" matches only against a "\n" character in * the string, and "\r" matches only a "\r" character. To match any newline * sequence use "\R". This particular group matches either the two-character * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line * separator, U+2028), or PS (paragraph separator, U+2029). * * The behaviour of the dot, circumflex, and dollar metacharacters is * affected by newline characters; the default is to recognize any newline * character (the same characters recognized by "\R"). This can be changed * with `G_REGEX_NEWLINE_CR`, `G_REGEX_NEWLINE_LF` and `G_REGEX_NEWLINE_CRLF` * compile options, and with `G_REGEX_MATCH_NEWLINE_ANY`, * `G_REGEX_MATCH_NEWLINE_CR`, `G_REGEX_MATCH_NEWLINE_LF` and * `G_REGEX_MATCH_NEWLINE_CRLF` match options.
These settings are also * relevant when compiling a pattern if `G_REGEX_EXTENDED` is set, and an * unescaped "#" outside a character class is encountered. This indicates * a comment that lasts until after the next newline. * * Creating and manipulating the same `GRegex` structure from different * threads is not a problem as `GRegex` does not modify its internal * state between creation and destruction, on the other hand `GMatchInfo` * is not threadsafe. * * The regular expressions low-level functionalities are obtained through * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel. * * Since: 2.14 */ #define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \ PCRE2_NO_UTF_CHECK | \ PCRE2_ENDANCHORED) /* Mask of all the possible values for GRegexCompileFlags. */ #define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \ G_REGEX_CASELESS | \ G_REGEX_MULTILINE | \ G_REGEX_DOTALL | \ G_REGEX_EXTENDED | \ G_REGEX_ANCHORED | \ G_REGEX_DOLLAR_ENDONLY | \ G_REGEX_UNGREEDY | \ G_REGEX_RAW | \ G_REGEX_NO_AUTO_CAPTURE | \ G_REGEX_OPTIMIZE | \ G_REGEX_FIRSTLINE | \ G_REGEX_DUPNAMES | \ G_REGEX_NEWLINE_CR | \ G_REGEX_NEWLINE_LF | \ G_REGEX_NEWLINE_CRLF | \ G_REGEX_NEWLINE_ANYCRLF | \ G_REGEX_BSR_ANYCRLF) #define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \ PCRE2_ALT_BSUX | \ PCRE2_AUTO_CALLOUT | \ PCRE2_CASELESS | \ PCRE2_DOLLAR_ENDONLY | \ PCRE2_DOTALL | \ PCRE2_DUPNAMES | \ PCRE2_EXTENDED | \ PCRE2_FIRSTLINE | \ PCRE2_MATCH_UNSET_BACKREF | \ PCRE2_MULTILINE | \ PCRE2_NEVER_UCP | \ PCRE2_NEVER_UTF | \ PCRE2_NO_AUTO_CAPTURE | \ PCRE2_NO_AUTO_POSSESS | \ PCRE2_NO_DOTSTAR_ANCHOR | \ PCRE2_NO_START_OPTIMIZE | \ PCRE2_UCP | \ PCRE2_UNGREEDY | \ PCRE2_UTF | \ PCRE2_NEVER_BACKSLASH_C | \ PCRE2_ALT_CIRCUMFLEX | \ PCRE2_ALT_VERBNAMES | \ PCRE2_USE_OFFSET_LIMIT | \ PCRE2_EXTENDED_MORE | \ PCRE2_LITERAL | \ PCRE2_MATCH_INVALID_UTF | \ G_REGEX_PCRE_GENERIC_MASK) #define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF) /* Mask of all the possible values for GRegexMatchFlags. 
*/ #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \ G_REGEX_MATCH_ANCHORED | \ G_REGEX_MATCH_NOTBOL | \ G_REGEX_MATCH_NOTEOL | \ G_REGEX_MATCH_NOTEMPTY | \ G_REGEX_MATCH_PARTIAL | \ G_REGEX_MATCH_NEWLINE_CR | \ G_REGEX_MATCH_NEWLINE_LF | \ G_REGEX_MATCH_NEWLINE_CRLF | \ G_REGEX_MATCH_NEWLINE_ANY | \ G_REGEX_MATCH_NEWLINE_ANYCRLF | \ G_REGEX_MATCH_BSR_ANYCRLF | \ G_REGEX_MATCH_BSR_ANY | \ G_REGEX_MATCH_PARTIAL_SOFT | \ G_REGEX_MATCH_PARTIAL_HARD | \ G_REGEX_MATCH_NOTEMPTY_ATSTART) #define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\ PCRE2_NOTEOL |\ PCRE2_NOTEMPTY |\ PCRE2_NOTEMPTY_ATSTART |\ PCRE2_PARTIAL_SOFT |\ PCRE2_PARTIAL_HARD |\ PCRE2_NO_JIT |\ PCRE2_COPY_MATCHED_SUBJECT |\ G_REGEX_PCRE_GENERIC_MASK) /* TODO: Support PCRE2_NEWLINE_NUL */ #define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ PCRE2_NEWLINE_LF | \ PCRE2_NEWLINE_CRLF | \ PCRE2_NEWLINE_ANYCRLF) /* Some match options are not supported when using JIT as stated in the * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section: * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5 */ #define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \ PCRE2_ENDANCHORED) #define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \ G_REGEX_NEWLINE_LF | \ G_REGEX_NEWLINE_CRLF | \ G_REGEX_NEWLINE_ANYCRLF) #define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \ G_REGEX_MATCH_NEWLINE_LF | \ G_REGEX_MATCH_NEWLINE_CRLF | \ G_REGEX_MATCH_NEWLINE_ANY | \ G_REGEX_MATCH_NEWLINE_ANYCRLF) /* if the string is in UTF-8 use g_utf8_ functions, else use * use just +/- 1. */ #define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \ ((s) + 1) : \ g_utf8_next_char (s)) #define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? 
\ ((s) - 1) : \ g_utf8_prev_char (s)) struct _GMatchInfo { gint ref_count; /* the ref count (atomic) */ GRegex *regex; /* the regex */ uint32_t match_opts; /* pcre match options used at match time on the regex */ gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */ uint32_t n_subpatterns; /* total number of sub patterns in the regex */ gint pos; /* position in the string where last match left off */ uint32_t n_offsets; /* number of offsets */ gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */ gint *workspace; /* workspace for pcre2_dfa_match() */ PCRE2_SIZE n_workspace; /* number of workspace elements */ const gchar *string; /* string passed to the match function */ gssize string_len; /* length of string, in bytes */ pcre2_match_context *match_context; pcre2_match_data *match_data; pcre2_jit_stack *jit_stack; }; typedef enum { JIT_STATUS_DEFAULT, JIT_STATUS_ENABLED, JIT_STATUS_DISABLED } JITStatus; struct _GRegex { gint ref_count; /* the ref count for the immutable part (atomic) */ gchar *pattern; /* the pattern */ pcre2_code *pcre_re; /* compiled form of the pattern */ uint32_t compile_opts; /* options used at compile time on the pattern, pcre2 values */ GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */ uint32_t match_opts; /* pcre2 options used at match time on the regex */ GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */ uint32_t jit_options; /* options which were enabled for jit compiler */ JITStatus jit_status; /* indicates the status of jit compiler for this compiled regex */ /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation, * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever * enabled for the current regex AND current set of jit_options. 
* JIT_STATUS_DEFAULT means enablement was never tried, * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it), * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again). */ }; /* TRUE if ret is an error code, FALSE otherwise. */ #define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL) typedef struct _InterpolationData InterpolationData; static gboolean interpolation_list_needs_match (GList *list); static gboolean interpolate_replacement (const GMatchInfo *match_info, GString *result, gpointer data); static GList *split_replacement (const gchar *replacement, GError **error); static void free_interpolation_data (InterpolationData *data); static uint32_t get_pcre2_compile_options (GRegexCompileFlags compile_flags) { /* Maps compile flags to pcre2 values */ uint32_t pcre2_flags = 0; if (compile_flags & G_REGEX_CASELESS) pcre2_flags |= PCRE2_CASELESS; if (compile_flags & G_REGEX_MULTILINE) pcre2_flags |= PCRE2_MULTILINE; if (compile_flags & G_REGEX_DOTALL) pcre2_flags |= PCRE2_DOTALL; if (compile_flags & G_REGEX_EXTENDED) pcre2_flags |= PCRE2_EXTENDED; if (compile_flags & G_REGEX_ANCHORED) pcre2_flags |= PCRE2_ANCHORED; if (compile_flags & G_REGEX_DOLLAR_ENDONLY) pcre2_flags |= PCRE2_DOLLAR_ENDONLY; if (compile_flags & G_REGEX_UNGREEDY) pcre2_flags |= PCRE2_UNGREEDY; if (!(compile_flags & G_REGEX_RAW)) pcre2_flags |= PCRE2_UTF; if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; if (compile_flags & G_REGEX_FIRSTLINE) pcre2_flags |= PCRE2_FIRSTLINE; if (compile_flags & G_REGEX_DUPNAMES) pcre2_flags |= PCRE2_DUPNAMES; return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; } static uint32_t get_pcre2_match_options (GRegexMatchFlags match_flags, GRegexCompileFlags compile_flags) { /* Maps match flags to pcre2 values */ uint32_t pcre2_flags = 0; if (match_flags & G_REGEX_MATCH_ANCHORED) pcre2_flags |= PCRE2_ANCHORED; if (match_flags & 
G_REGEX_MATCH_NOTBOL) pcre2_flags |= PCRE2_NOTBOL; if (match_flags & G_REGEX_MATCH_NOTEOL) pcre2_flags |= PCRE2_NOTEOL; if (match_flags & G_REGEX_MATCH_NOTEMPTY) pcre2_flags |= PCRE2_NOTEMPTY; if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) pcre2_flags |= PCRE2_PARTIAL_SOFT; if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) pcre2_flags |= PCRE2_PARTIAL_HARD; if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; if (compile_flags & G_REGEX_RAW) pcre2_flags |= PCRE2_NO_UTF_CHECK; return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; } static GRegexCompileFlags g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) { GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; if (pcre2_flags & PCRE2_CASELESS) compile_flags |= G_REGEX_CASELESS; if (pcre2_flags & PCRE2_MULTILINE) compile_flags |= G_REGEX_MULTILINE; if (pcre2_flags & PCRE2_DOTALL) compile_flags |= G_REGEX_DOTALL; if (pcre2_flags & PCRE2_EXTENDED) compile_flags |= G_REGEX_EXTENDED; if (pcre2_flags & PCRE2_ANCHORED) compile_flags |= G_REGEX_ANCHORED; if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) compile_flags |= G_REGEX_DOLLAR_ENDONLY; if (pcre2_flags & PCRE2_UNGREEDY) compile_flags |= G_REGEX_UNGREEDY; if (!(pcre2_flags & PCRE2_UTF)) compile_flags |= G_REGEX_RAW; if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) compile_flags |= G_REGEX_NO_AUTO_CAPTURE; if (pcre2_flags & PCRE2_FIRSTLINE) compile_flags |= G_REGEX_FIRSTLINE; if (pcre2_flags & PCRE2_DUPNAMES) compile_flags |= G_REGEX_DUPNAMES; return compile_flags & G_REGEX_COMPILE_MASK; } static GRegexMatchFlags g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) { GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; if (pcre2_flags & PCRE2_ANCHORED) match_flags |= G_REGEX_MATCH_ANCHORED; if (pcre2_flags & PCRE2_NOTBOL) match_flags |= G_REGEX_MATCH_NOTBOL; if (pcre2_flags & PCRE2_NOTEOL) match_flags |= G_REGEX_MATCH_NOTEOL; if (pcre2_flags & PCRE2_NOTEMPTY) match_flags |= G_REGEX_MATCH_NOTEMPTY; if (pcre2_flags & PCRE2_PARTIAL_SOFT) match_flags |= 
G_REGEX_MATCH_PARTIAL_SOFT; if (pcre2_flags & PCRE2_PARTIAL_HARD) match_flags |= G_REGEX_MATCH_PARTIAL_HARD; if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; return (match_flags & G_REGEX_MATCH_MASK); } static uint32_t get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) { compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; switch (compile_flags) { case G_REGEX_NEWLINE_CR: return PCRE2_NEWLINE_CR; case G_REGEX_NEWLINE_LF: return PCRE2_NEWLINE_LF; case G_REGEX_NEWLINE_CRLF: return PCRE2_NEWLINE_CRLF; case G_REGEX_NEWLINE_ANYCRLF: return PCRE2_NEWLINE_ANYCRLF; default: if (compile_flags != 0) return 0; return PCRE2_NEWLINE_ANY; } } static uint32_t get_pcre2_newline_match_options (GRegexMatchFlags match_flags) { switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) { case G_REGEX_MATCH_NEWLINE_CR: return PCRE2_NEWLINE_CR; case G_REGEX_MATCH_NEWLINE_LF: return PCRE2_NEWLINE_LF; case G_REGEX_MATCH_NEWLINE_CRLF: return PCRE2_NEWLINE_CRLF; case G_REGEX_MATCH_NEWLINE_ANY: return PCRE2_NEWLINE_ANY; case G_REGEX_MATCH_NEWLINE_ANYCRLF: return PCRE2_NEWLINE_ANYCRLF; default: return 0; } } static uint32_t get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) { if (compile_flags & G_REGEX_BSR_ANYCRLF) return PCRE2_BSR_ANYCRLF; return PCRE2_BSR_UNICODE; } static uint32_t get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) { if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) return PCRE2_BSR_ANYCRLF; if (match_flags & G_REGEX_MATCH_BSR_ANY) return PCRE2_BSR_UNICODE; return 0; } static char * get_pcre2_error_string (int errcode) { PCRE2_UCHAR8 error_msg[2048]; int err_length; err_length = pcre2_get_error_message (errcode, error_msg, G_N_ELEMENTS (error_msg)); if (err_length <= 0) return NULL; /* The array is always filled with a trailing zero */ g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); return g_memdup2 (error_msg, err_length + 1); } static const gchar * translate_match_error (gint errcode) { switch 
(errcode) { case PCRE2_ERROR_NOMATCH: /* not an error */ break; case PCRE2_ERROR_NULL: /* NULL argument, this should not happen in GRegex */ g_critical ("A NULL argument was passed to PCRE"); break; case PCRE2_ERROR_BADOPTION: return "bad options"; case PCRE2_ERROR_BADMAGIC: return _("corrupted object"); case PCRE2_ERROR_NOMEMORY: return _("out of memory"); case PCRE2_ERROR_NOSUBSTRING: /* not used by pcre2_match() */ break; case PCRE2_ERROR_MATCHLIMIT: case PCRE2_ERROR_CALLOUT: /* callouts are not implemented */ break; case PCRE2_ERROR_BADUTFOFFSET: /* we do not check if strings are valid */ break; case PCRE2_ERROR_PARTIAL: /* not an error */ break; case PCRE2_ERROR_INTERNAL: return _("internal error"); case PCRE2_ERROR_DFA_UITEM: return _("the pattern contains items not supported for partial matching"); case PCRE2_ERROR_DFA_UCOND: return _("back references as conditions are not supported for partial matching"); case PCRE2_ERROR_DFA_WSSIZE: /* handled expanding the workspace */ break; case PCRE2_ERROR_DFA_RECURSE: case PCRE2_ERROR_RECURSIONLIMIT: return _("recursion limit reached"); case PCRE2_ERROR_BADOFFSET: return _("bad offset"); case PCRE2_ERROR_RECURSELOOP: return _("recursion loop"); case PCRE2_ERROR_JIT_BADOPTION: /* should not happen in GRegex since we check modes before each match */ return _("matching mode is requested that was not compiled for JIT"); default: break; } return NULL; } static char * get_match_error_message (int errcode) { const char *msg = translate_match_error (errcode); char *error_string; if (msg) return g_strdup (msg); error_string = get_pcre2_error_string (errcode); if (error_string) return error_string; return g_strdup (_("unknown error")); } static void translate_compile_error (gint *errcode, const gchar **errmsg) { /* If errcode is known we put the translatable error message in * errmsg. If errcode is unknown we put the generic * G_REGEX_ERROR_COMPILE error code in errcode. 
* Note that there can be more PCRE errors with the same GRegexError
   * and that some PCRE errors are useless for us.
   */
  gint original_errcode = *errcode;

  /* -1 is a placeholder: every branch of the switch below overwrites it,
   * and the g_assert() at the end of the function verifies that. */
  *errcode = -1;
  /* Branches without a dedicated message leave *errmsg set to NULL. */
  *errmsg = NULL;

  switch (original_errcode)
    {
    case PCRE2_ERROR_END_BACKSLASH:
      *errcode = G_REGEX_ERROR_STRAY_BACKSLASH;
      *errmsg = _("\\ at end of pattern");
      break;
    case PCRE2_ERROR_END_BACKSLASH_C:
      *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR;
      *errmsg = _("\\c at end of pattern");
      break;
    case PCRE2_ERROR_UNKNOWN_ESCAPE:
    case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE:
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
      *errmsg = _("unrecognized character following \\");
      break;
    case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER:
      *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER;
      *errmsg = _("numbers out of order in {} quantifier");
      break;
    case PCRE2_ERROR_QUANTIFIER_TOO_BIG:
      *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG;
      *errmsg = _("number too big in {} quantifier");
      break;
    case PCRE2_ERROR_MISSING_SQUARE_BRACKET:
      *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS;
      *errmsg = _("missing terminating ] for character class");
      break;
    case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS:
      *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS;
      *errmsg = _("invalid escape sequence in character class");
      break;
    case PCRE2_ERROR_CLASS_RANGE_ORDER:
      *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER;
      *errmsg = _("range out of order in character class");
      break;
    case PCRE2_ERROR_QUANTIFIER_INVALID:
    case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT:
      *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT;
      *errmsg = _("nothing to repeat");
      break;
    case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY:
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (? or (?-");
      break;
    case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS:
      *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS;
      *errmsg = _("POSIX named classes are supported only within a class");
      break;
    case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING:
      *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED;
      *errmsg = _("POSIX collating elements are not supported");
      break;
    case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS:
    case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS:
    case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING:
      *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
      *errmsg = _("missing terminating )");
      break;
    case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE:
      *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE;
      *errmsg = _("reference to non-existent subpattern");
      break;
    case PCRE2_ERROR_MISSING_COMMENT_CLOSING:
      *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT;
      *errmsg = _("missing ) after comment");
      break;
    case PCRE2_ERROR_PATTERN_TOO_LARGE:
      *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE;
      *errmsg = _("regular expression is too large");
      break;
    case PCRE2_ERROR_MISSING_CONDITION_CLOSING:
      *errcode = G_REGEX_ERROR_MALFORMED_CONDITION;
      *errmsg = _("malformed number or name after (?(");
      break;
    case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH:
      *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND;
      *errmsg = _("lookbehind assertion is not fixed length");
      break;
    case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES:
      *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES;
      *errmsg = _("conditional group contains more than two branches");
      break;
    case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED:
      *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED;
      *errmsg = _("assertion expected after (?(");
      break;
    case PCRE2_ERROR_BAD_RELATIVE_REFERENCE:
      *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE;
      *errmsg = _("a numbered reference must not be zero");
      break;
    case PCRE2_ERROR_UNKNOWN_POSIX_CLASS:
      *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME;
      *errmsg = _("unknown POSIX class name");
      break;
    case PCRE2_ERROR_CODE_POINT_TOO_BIG:
    case PCRE2_ERROR_INVALID_HEXADECIMAL:
      *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE;
      *errmsg = _("character value in \\x{...} sequence is too large");
      break;
    case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C:
      *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND;
      *errmsg = _("\\C not allowed in lookbehind assertion");
      break;
    case PCRE2_ERROR_MISSING_NAME_TERMINATOR:
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR;
      *errmsg = _("missing terminator in subpattern name");
      break;
    case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME:
      *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME;
      *errmsg = _("two named subpatterns have the same name");
      break;
    case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY:
      *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY;
      *errmsg = _("malformed \\P or \\p sequence");
      break;
    case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY:
      *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY;
      *errmsg = _("unknown property name after \\P or \\p");
      break;
    case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG:
      *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG;
      *errmsg = _("subpattern name is too long (maximum 32 characters)");
      break;
    case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS:
      *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS;
      *errmsg = _("too many named subpatterns (maximum 10,000)");
      break;
    case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG:
      *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE;
      *errmsg = _("octal value is greater than \\377");
      break;
    case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES:
      *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE;
      *errmsg = _("DEFINE group contains more than one branch");
      break;
    case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE:
      *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS;
      *errmsg = _("inconsistent NEWLINE options");
      break;
    case PCRE2_ERROR_BACKSLASH_G_SYNTAX:
      *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE;
      *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
                  "number, or by a plain number");
      break;
    case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED:
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN;
      *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
      break;
    case PCRE2_ERROR_VERB_UNKNOWN:
      *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB;
      *errmsg = _("(*VERB) not recognized");
      break;
    case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG:
      *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG;
      *errmsg = _("number is too big");
      break;
    case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED:
      *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME;
      *errmsg = _("missing subpattern name after (?&");
      break;
    case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH:
      *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME;
      *errmsg = _("different names for subpatterns of the same number are not allowed");
      break;
    case PCRE2_ERROR_MARK_MISSING_ARGUMENT:
      *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED;
      *errmsg = _("(*MARK) must have an argument");
      break;
    case PCRE2_ERROR_BACKSLASH_C_SYNTAX:
      *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR;
      *errmsg = _( "\\c must be followed by an ASCII character");
      break;
    case PCRE2_ERROR_BACKSLASH_K_SYNTAX:
      *errcode = G_REGEX_ERROR_MISSING_NAME;
      *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
      break;
    case PCRE2_ERROR_BACKSLASH_N_IN_CLASS:
      *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS;
      *errmsg = _("\\N is not supported in a class");
      break;
    case PCRE2_ERROR_VERB_NAME_TOO_LONG:
      *errcode = G_REGEX_ERROR_NAME_TOO_LONG;
      *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
      break;
    case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW:
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("code overflow");
      break;
    case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P:
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (?P");
      break;
    case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE:
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("overran compiling workspace");
      break;
    case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN:
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("previously-checked referenced subpattern not found");
      break;
    /* The following codes are reported as G_REGEX_ERROR_INTERNAL without
     * a dedicated message (*errmsg stays NULL). */
    case PCRE2_ERROR_HEAP_FAILED:
    case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW:
    case PCRE2_ERROR_UNICODE_NOT_SUPPORTED:
    case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT:
    case PCRE2_ERROR_NO_SURROGATES_IN_UTF16:
    case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS:
    case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE:
    case PCRE2_ERROR_INTERNAL_STUDY_ERROR:
    case PCRE2_ERROR_UTF_IS_DISABLED:
    case PCRE2_ERROR_UCP_IS_DISABLED:
    case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS:
    case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED:
    case PCRE2_ERROR_INTERNAL_BAD_CODE:
    case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP:
      *errcode = G_REGEX_ERROR_INTERNAL;
      break;
    /* The following codes, and any code not listed in this switch, are
     * reported as the generic G_REGEX_ERROR_COMPILE without a dedicated
     * message (*errmsg stays NULL). */
    case PCRE2_ERROR_INVALID_SUBPATTERN_NAME:
    case PCRE2_ERROR_CLASS_INVALID_RANGE:
    case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE:
    case PCRE2_ERROR_PARENTHESES_STACK_CHECK:
    case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED:
    case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG:
    case PCRE2_ERROR_MISSING_CALLOUT_CLOSING:
    case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB:
    case PCRE2_ERROR_NULL_PATTERN:
    case PCRE2_ERROR_BAD_OPTIONS:
    case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP:
    case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE:
    case PCRE2_ERROR_INVALID_OCTAL:
    case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG:
    case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG:
    case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS:
    case PCRE2_ERROR_VERSION_CONDITION_SYNTAX:
    case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER:
    case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER:
    case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED:
    case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP:
    case PCRE2_ERROR_PATTERN_TOO_COMPLICATED:
    case PCRE2_ERROR_LOOKBEHIND_TOO_LONG:
    case PCRE2_ERROR_PATTERN_STRING_TOO_LONG:
    case PCRE2_ERROR_BAD_LITERAL_OPTIONS:
    default:
      *errcode = G_REGEX_ERROR_COMPILE;
      break;
    }

  /* Every branch above must have replaced the -1 placeholder. */
  g_assert (*errcode != -1);
}

/* GMatchInfo */

/* match_info_new:
 * Allocates a zero-filled GMatchInfo with a reference count of 1.
 *  - a reference is taken on @regex;
 *  - the @string pointer is stored as given (no copy is made here);
 *  - a negative @string_len is replaced by strlen (@string);
 *  - @start_position is stored in pos, and matches starts out as
 *    PCRE2_ERROR_NOMATCH;
 *  - @match_options are translated to PCRE2 match options, using the
 *    regex's original compile flags;
 *  - when @is_dfa is TRUE a 100-element workspace is allocated.
 */
static GMatchInfo *
match_info_new (const GRegex     *regex,
                const gchar      *string,
                gint              string_len,
                gint              start_position,
                GRegexMatchFlags  match_options,
                gboolean          is_dfa)
{
  GMatchInfo *match_info;

  if (string_len < 0)
    string_len = strlen (string);

  match_info = g_new0 (GMatchInfo, 1);
  match_info->ref_count = 1;
  match_info->regex = g_regex_ref ((GRegex *)regex);
  match_info->string = string;
  match_info->string_len = string_len;
  match_info->matches = PCRE2_ERROR_NOMATCH;
  match_info->pos = start_position;
  match_info->match_opts =
    get_pcre2_match_options (match_options, regex->orig_compile_opts);

  /* Number of capturing sub patterns in the compiled pattern. */
  pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT,
                      &match_info->n_subpatterns);

  match_info->match_context = pcre2_match_context_create (NULL);

  if (is_dfa)
    {
      /* These values should be enough for most cases, if they are not
       * enough g_regex_match_all_full() will expand them. */
      match_info->n_workspace = 100;
      match_info->workspace = g_new (gint, match_info->n_workspace);
    }

  /* Room for a single start/end pair. */
  match_info->n_offsets = 2;
  match_info->offsets = g_new0 (gint, match_info->n_offsets);
  /* Set an invalid position for the previous match.
*/ match_info->offsets[0] = -1; match_info->offsets[1] = -1; match_info->match_data = pcre2_match_data_create_from_pattern ( match_info->regex->pcre_re, NULL); return match_info; } static gboolean recalc_match_offsets (GMatchInfo *match_info, GError **error) { PCRE2_SIZE *ovector; uint32_t ovector_size = 0; uint32_t pre_n_offset; uint32_t i; g_assert (!IS_PCRE2_ERROR (match_info->matches)); if (match_info->matches == PCRE2_ERROR_PARTIAL) ovector_size = 1; else if (match_info->matches > 0) ovector_size = match_info->matches; g_assert (ovector_size != 0); if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) { g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, _("Error while matching regular expression %s: %s"), match_info->regex->pattern, _("code overflow")); return FALSE; } pre_n_offset = match_info->n_offsets; match_info->n_offsets = ovector_size * 2; ovector = pcre2_get_ovector_pointer (match_info->match_data); if (match_info->n_offsets != pre_n_offset) { match_info->offsets = g_realloc_n (match_info->offsets, match_info->n_offsets, sizeof (gint)); } for (i = 0; i < match_info->n_offsets; i++) { match_info->offsets[i] = (int) ovector[i]; } return TRUE; } static JITStatus enable_jit_with_match_options (GMatchInfo *match_info, uint32_t match_options) { gint retval; uint32_t old_jit_options, new_jit_options; if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE)) return JIT_STATUS_DISABLED; if (match_info->regex->jit_status == JIT_STATUS_DISABLED) return JIT_STATUS_DISABLED; if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) return JIT_STATUS_DISABLED; old_jit_options = match_info->regex->jit_options; new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; if (match_options & PCRE2_PARTIAL_HARD) new_jit_options |= PCRE2_JIT_PARTIAL_HARD; if (match_options & PCRE2_PARTIAL_SOFT) new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; /* no new options enabled */ if (new_jit_options == old_jit_options) { g_assert (match_info->regex->jit_status != 
JIT_STATUS_DEFAULT); return match_info->regex->jit_status; } retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options); if (retval == 0) { match_info->regex->jit_status = JIT_STATUS_ENABLED; match_info->regex->jit_options = new_jit_options; /* Set min stack size for JIT to 32KiB and max to 512KiB */ match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL); pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack); } else { match_info->regex->jit_status = JIT_STATUS_DISABLED; switch (retval) { case PCRE2_ERROR_NOMEMORY: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " "but JIT was unable to allocate executable memory for the " "compiler. Falling back to interpretive code."); break; case PCRE2_ERROR_JIT_BADOPTION: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " "but JIT support is not available. Falling back to " "interpretive code."); break; default: g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " "but request for JIT support had unexpectedly failed (error %d). " "Falling back to interpretive code.", retval); break; } } return match_info->regex->jit_status; g_assert_not_reached (); } /** * g_match_info_get_regex: * @match_info: a #GMatchInfo * * Returns #GRegex object used in @match_info. It belongs to Glib * and must not be freed. Use g_regex_ref() if you need to keep it * after you free @match_info object. * * Returns: (transfer none): #GRegex object used in @match_info * * Since: 2.14 */ GRegex * g_match_info_get_regex (const GMatchInfo *match_info) { g_return_val_if_fail (match_info != NULL, NULL); return match_info->regex; } /** * g_match_info_get_string: * @match_info: a #GMatchInfo * * Returns the string searched with @match_info. This is the * string passed to g_regex_match() or g_regex_replace() so * you may not free it before calling this function. 
* * Returns: the string searched with @match_info * * Since: 2.14 */ const gchar * g_match_info_get_string (const GMatchInfo *match_info) { g_return_val_if_fail (match_info != NULL, NULL); return match_info->string; } /** * g_match_info_ref: * @match_info: a #GMatchInfo * * Increases reference count of @match_info by 1. * * Returns: @match_info * * Since: 2.30 */ GMatchInfo * g_match_info_ref (GMatchInfo *match_info) { g_return_val_if_fail (match_info != NULL, NULL); g_atomic_int_inc (&match_info->ref_count); return match_info; } /** * g_match_info_unref: * @match_info: a #GMatchInfo * * Decreases reference count of @match_info by 1. When reference count drops * to zero, it frees all the memory associated with the match_info structure. * * Since: 2.30 */ void g_match_info_unref (GMatchInfo *match_info) { if (g_atomic_int_dec_and_test (&match_info->ref_count)) { g_regex_unref (match_info->regex); if (match_info->match_context) pcre2_match_context_free (match_info->match_context); if (match_info->jit_stack) pcre2_jit_stack_free (match_info->jit_stack); if (match_info->match_data) pcre2_match_data_free (match_info->match_data); g_free (match_info->offsets); g_free (match_info->workspace); g_free (match_info); } } /** * g_match_info_free: * @match_info: (nullable): a #GMatchInfo, or %NULL * * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does * nothing. * * Since: 2.14 */ void g_match_info_free (GMatchInfo *match_info) { if (match_info == NULL) return; g_match_info_unref (match_info); } /** * g_match_info_next: * @match_info: a #GMatchInfo structure * @error: location to store the error occurring, or %NULL to ignore errors * * Scans for the next match using the same parameters of the previous * call to g_regex_match_full() or g_regex_match() that returned * @match_info. * * The match is done on the string passed to the match function, so you * cannot free it before calling this function. 
*
 * Returns: %TRUE is the string matched, %FALSE otherwise
 *
 * Since: 2.14
 */
gboolean
g_match_info_next (GMatchInfo  *match_info,
                   GError     **error)
{
  JITStatus jit_status;
  gint prev_match_start;
  gint prev_match_end;
  uint32_t opts;

  g_return_val_if_fail (match_info != NULL, FALSE);
  g_return_val_if_fail (error == NULL || *error == NULL, FALSE);
  g_return_val_if_fail (match_info->pos >= 0, FALSE);

  /* Remember the bounds currently stored in offsets[0..1] so that an
   * identical result can be recognised and skipped at the end of this
   * function. */
  prev_match_start = match_info->offsets[0];
  prev_match_end = match_info->offsets[1];

  if (match_info->pos > match_info->string_len)
    {
      /* we have reached the end of the string */
      match_info->pos = -1;
      match_info->matches = PCRE2_ERROR_NOMATCH;
      return FALSE;
    }

  /* Options stored on the regex are combined with those stored on this
   * match info. */
  opts = match_info->regex->match_opts | match_info->match_opts;

  /* Use the JIT matcher only when the helper reports it as enabled. */
  jit_status = enable_jit_with_match_options (match_info, opts);
  if (jit_status == JIT_STATUS_ENABLED)
    {
      match_info->matches = pcre2_jit_match (match_info->regex->pcre_re,
                                             (PCRE2_SPTR8) match_info->string,
                                             match_info->string_len,
                                             match_info->pos,
                                             opts,
                                             match_info->match_data,
                                             match_info->match_context);
      /* if the JIT stack limit was reached, fall back to non-JIT matching in
       * the next conditional statement */
      if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT)
        {
          g_debug ("PCRE2 JIT stack limit reached, falling back to "
                   "non-optimized matching.");
          opts |= PCRE2_NO_JIT;
          jit_status = JIT_STATUS_DISABLED;
        }
    }

  /* Interpreted matcher: taken when JIT is not enabled, or after the
   * stack-limit fallback above reset jit_status. */
  if (jit_status != JIT_STATUS_ENABLED)
    {
      match_info->matches = pcre2_match (match_info->regex->pcre_re,
                                         (PCRE2_SPTR8) match_info->string,
                                         match_info->string_len,
                                         match_info->pos,
                                         opts,
                                         match_info->match_data,
                                         match_info->match_context);
    }

  if (IS_PCRE2_ERROR (match_info->matches))
    {
      gchar *error_msg = get_match_error_message (match_info->matches);

      g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,
                   _("Error while matching regular expression %s: %s"),
                   match_info->regex->pattern, error_msg);
      g_clear_pointer (&error_msg, g_free);
      return FALSE;
    }
  else if (match_info->matches == 0)
    {
      /* info->offsets is too small. */
      /* Double the offsets array, recreate the match data with the new
       * size, and retry the same search. */
      match_info->n_offsets *= 2;
      match_info->offsets = g_realloc_n (match_info->offsets,
                                         match_info->n_offsets,
                                         sizeof (gint));

      pcre2_match_data_free (match_info->match_data);
      match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL);

      return g_match_info_next (match_info, error);
    }
  else if (match_info->matches == PCRE2_ERROR_NOMATCH)
    {
      /* We're done with this match info */
      match_info->pos = -1;
      return FALSE;
    }
  else
    if (!recalc_match_offsets (match_info, error))
      return FALSE;

  /* avoid infinite loops if the pattern is an empty string or something
   * equivalent */
  if (match_info->pos == match_info->offsets[1])
    {
      if (match_info->pos > match_info->string_len)
        {
          /* we have reached the end of the string */
          match_info->pos = -1;
          match_info->matches = PCRE2_ERROR_NOMATCH;
          return FALSE;
        }

      /* The match ended where the search started, so step the position
       * forward with NEXT_CHAR (macro defined elsewhere; presumably one
       * character, not necessarily one byte - confirm there). */
      match_info->pos = NEXT_CHAR (match_info->regex,
                                   &match_info->string[match_info->pos]) -
                                   match_info->string;
    }
  else
    {
      /* Otherwise the next search starts right after this match. */
      match_info->pos = match_info->offsets[1];
    }

  g_assert (match_info->matches < 0 ||
            (uint32_t) match_info->matches <= match_info->n_subpatterns + 1);

  /* it's possible to get two identical matches when we are matching
   * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and
   * the string is "RegExTest" we have:
   *  - search at position 0: match from 0 to 0
   *  - search at position 1: match from 3 to 3
   *  - search at position 3: match from 3 to 3 (duplicate)
   *  - search at position 4: match from 5 to 5
   *  - search at position 5: match from 5 to 5 (duplicate)
   *  - search at position 6: no match -> stop
   * so we have to ignore the duplicates.
   * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */
  if (match_info->matches >= 0 &&
      prev_match_start == match_info->offsets[0] &&
      prev_match_end == match_info->offsets[1])
    {
      /* ignore this match and search the next one */
      return g_match_info_next (match_info, error);
    }

  /* A non-negative count is a match; remaining negative codes that were
   * not handled above yield FALSE. */
  return match_info->matches >= 0;
}

/**
 * g_match_info_matches:
 * @match_info: a #GMatchInfo structure
 *
 * Returns whether the previous match operation succeeded.
 *
 * Returns: %TRUE if the previous match operation succeeded,
 *   %FALSE otherwise
 *
 * Since: 2.14
 */
gboolean
g_match_info_matches (const GMatchInfo *match_info)
{
  g_return_val_if_fail (match_info != NULL, FALSE);

  /* The stored result is negative when the last operation did not
   * produce a match. */
  return match_info->matches >= 0;
}

/**
 * g_match_info_get_match_count:
 * @match_info: a #GMatchInfo structure
 *
 * Retrieves the number of matched substrings (including substring 0,
 * that is the whole matched text), so 1 is returned if the pattern
 * has no substrings in it and 0 is returned if the match failed.
 *
 * If the last match was obtained using the DFA algorithm, that is
 * using g_regex_match_all() or g_regex_match_all_full(), the retrieved
 * count is not that of the number of capturing parentheses but that of
 * the number of matched substrings.
 *
 * Returns: Number of matched substrings, or -1 if an error occurred
 *
 * Since: 2.14
 */
gint
g_match_info_get_match_count (const GMatchInfo *match_info)
{
  g_return_val_if_fail (match_info, -1);

  /* Values below PCRE2_ERROR_NOMATCH are reported as -1, the no-match
   * code itself as 0, and anything above it is returned unchanged. */
  if (match_info->matches == PCRE2_ERROR_NOMATCH)
    /* no match */
    return 0;
  else if (match_info->matches < PCRE2_ERROR_NOMATCH)
    /* error */
    return -1;
  else
    /* match */
    return match_info->matches;
}

/**
 * g_match_info_is_partial_match:
 * @match_info: a #GMatchInfo structure
 *
 * Usually if the string passed to g_regex_match*() matches as far as
 * it goes, but is too short to match the entire pattern, %FALSE is
 * returned. There are circumstances where it might be helpful to
 * distinguish this case from other cases in which there is no match.
*
 * Consider, for example, an application where a human is required to
 * type in data for a field with specific formatting requirements. An
 * example might be a date in the form ddmmmyy, defined by the pattern
 * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".
 * If the application sees the user’s keystrokes one by one, and can
 * check that what has been typed so far is potentially valid, it is
 * able to raise an error as soon as a mistake is made.
 *
 * GRegex supports the concept of partial matching by means of the
 * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags.
 * When they are used, the return code for
 * g_regex_match() or g_regex_match_full() is, as usual, %TRUE
 * for a complete match, %FALSE otherwise. But, when these functions
 * return %FALSE, you can check if the match was partial calling
 * g_match_info_is_partial_match().
 *
 * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and
 * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered
 * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a
 * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching
 * stops at the partial match.
 * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD
 * are set, the latter takes precedence.
 *
 * There were formerly some restrictions on the pattern for partial matching.
 * The restrictions no longer apply.
 *
 * See pcrepartial(3) for more information on partial matching.
 *
 * Returns: %TRUE if the match was partial, %FALSE otherwise
 *
 * Since: 2.14
 */
gboolean
g_match_info_is_partial_match (const GMatchInfo *match_info)
{
  gboolean was_partial;

  g_return_val_if_fail (match_info != NULL, FALSE);

  /* A partial match is recorded as the PCRE2 "partial" result code. */
  was_partial = (match_info->matches == PCRE2_ERROR_PARTIAL);

  return was_partial;
}

/**
 * g_match_info_expand_references:
 * @match_info: (nullable): a #GMatchInfo or %NULL
 * @string_to_expand: the string to expand
 * @error: location to store the error occurring, or %NULL to ignore errors
 *
 * Returns a new string containing the text in @string_to_expand with
 * references and escape sequences expanded. References refer to the last
 * match done with @string against @regex and have the same syntax used by
 * g_regex_replace().
 *
 * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was
 * passed to g_regex_new().
 *
 * The backreferences are extracted from the string passed to the match
 * function, so you cannot call this function after freeing the string.
 *
 * @match_info may be %NULL in which case @string_to_expand must not
 * contain references. For instance "foo\n" does not refer to an actual
 * pattern and '\n' merely will be replaced with \n character,
 * while to expand "\0" (whole match) one needs the result of a match.
 * Use g_regex_check_replacement() to find out whether @string_to_expand
 * contains references.
* * Returns: (nullable): the expanded string, or %NULL if an error occurred * * Since: 2.14 */ gchar * g_match_info_expand_references (const GMatchInfo *match_info, const gchar *string_to_expand, GError **error) { GString *result; GList *list; GError *tmp_error = NULL; g_return_val_if_fail (string_to_expand != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); list = split_replacement (string_to_expand, &tmp_error); if (tmp_error != NULL) { g_propagate_error (error, tmp_error); return NULL; } if (!match_info && interpolation_list_needs_match (list)) { g_critical ("String '%s' contains references to the match, can't " "expand references without GMatchInfo object", string_to_expand); return NULL; } result = g_string_sized_new (strlen (string_to_expand)); interpolate_replacement (match_info, result, list); g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return g_string_free (result, FALSE); } /** * g_match_info_fetch: * @match_info: #GMatchInfo structure * @match_num: number of the sub expression * * Retrieves the text matching the @match_num'th capturing * parentheses. 0 is the full text of the match, 1 is the first paren * set, 2 the second, and so on. * * If @match_num is a valid sub pattern but it didn't match anything * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty * string is returned. * * If the match was obtained using the DFA algorithm, that is using * g_regex_match_all() or g_regex_match_all_full(), the retrieved * string is not that of a set of parentheses but that of a matched * substring. Substrings are matched in reverse order of length, so * 0 is the longest match. * * The string is fetched from the string passed to the match function, * so you cannot call this function after freeing the string. * * Returns: (nullable): The matched substring, or %NULL if an error * occurred. 
You have to free the string yourself
 *
 * Since: 2.14
 */
gchar *
g_match_info_fetch (const GMatchInfo *match_info,
                    gint              match_num)
{
  gint first_byte;
  gint past_last_byte;

  g_return_val_if_fail (match_info != NULL, NULL);
  g_return_val_if_fail (match_num >= 0, NULL);

  /* Nothing to return when the position cannot be fetched, i.e. the
   * group does not exist or the last match operation failed. */
  if (!g_match_info_fetch_pos (match_info, match_num,
                               &first_byte, &past_last_byte))
    return NULL;

  /* The group exists but did not take part in the match, e.g. group 1
   * when matching "b" against "(a)?b": report it as an empty string. */
  if (first_byte == -1)
    return g_strdup ("");

  /* Copy the matched bytes out of the subject string. */
  return g_strndup (&match_info->string[first_byte],
                    past_last_byte - first_byte);
}

/**
 * g_match_info_fetch_pos:
 * @match_info: #GMatchInfo structure
 * @match_num: number of the sub expression
 * @start_pos: (out) (optional): pointer to location where to store
 *     the start position, or %NULL
 * @end_pos: (out) (optional): pointer to location where to store
 *     the end position, or %NULL
 *
 * Retrieves the position in bytes of the @match_num'th capturing
 * parentheses. 0 is the full text of the match, 1 is the first
 * paren set, 2 the second, and so on.
 *
 * If @match_num is a valid sub pattern but it didn't match anything
 * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos
 * and @end_pos are set to -1 and %TRUE is returned.
 *
 * If the match was obtained using the DFA algorithm, that is using
 * g_regex_match_all() or g_regex_match_all_full(), the retrieved
 * position is not that of a set of parentheses but that of a matched
 * substring. Substrings are matched in reverse order of length, so
 * 0 is the longest match.
 *
 * Returns: %TRUE if the position was fetched, %FALSE otherwise.
If
 *   the position cannot be fetched, @start_pos and @end_pos are left
 *   unchanged
 *
 * Since: 2.14
 */
gboolean
g_match_info_fetch_pos (const GMatchInfo *match_info,
                        gint              match_num,
                        gint             *start_pos,
                        gint             *end_pos)
{
  uint32_t group_limit;
  gboolean group_matched;

  g_return_val_if_fail (match_info != NULL, FALSE);
  g_return_val_if_fail (match_num >= 0, FALSE);

  /* A negative result means the last operation produced no match. */
  if (match_info->matches < 0)
    return FALSE;

  /* The requested number must be below the number of sub expressions in
   * the regex (plus the whole match). When matching all
   * (g_regex_match_all()) the number of matches may be the larger bound,
   * so take the maximum of the two. */
  group_limit = MAX (match_info->n_subpatterns + 1,
                     (uint32_t) match_info->matches);
  if ((uint32_t) match_num >= group_limit)
    return FALSE;

  /* Groups at or beyond the match count exist in the pattern but did not
   * participate; they are reported as -1. */
  group_matched = (match_num < match_info->matches);

  if (start_pos != NULL)
    *start_pos = group_matched ? match_info->offsets[2 * match_num] : -1;

  if (end_pos != NULL)
    *end_pos = group_matched ? match_info->offsets[2 * match_num + 1] : -1;

  return TRUE;
}

/*
 * Returns number of first matched subpattern with name @name.
 * There may be more than one in case when DUPNAMES is used,
 * and not all subpatterns with that name match;
 * pcre2_substring_number_from_name() does not work in that case.
*/
static gint
get_matched_substring_number (const GMatchInfo *match_info,
                              const gchar      *name)
{
  PCRE2_SPTR table_first, table_last;
  gint stride;
  guchar *cursor;

  /* Without DUPNAMES a name maps to a single group, so the plain PCRE2
   * lookup is enough. */
  if ((match_info->regex->compile_opts & PCRE2_DUPNAMES) == 0)
    return pcre2_substring_number_from_name (match_info->regex->pcre_re,
                                             (PCRE2_SPTR8) name);

  /* This code is analogous to code from pcre2_substring.c:
   * pcre2_substring_get_byname() */
  stride = pcre2_substring_nametable_scan (match_info->regex->pcre_re,
                                           (PCRE2_SPTR8) name,
                                           &table_first,
                                           &table_last);

  /* Zero or negative: nothing usable was found, pass the value on. */
  if (stride <= 0)
    return stride;

  /* Walk the name-table entries for @name; the first two bytes of each
   * entry hold the group number, most significant byte first. */
  cursor = (guchar*) table_first;
  while (cursor <= (guchar*) table_last)
    {
      guint group = (cursor[0] << 8) + cursor[1];

      /* Pick the first group whose start offset is set. */
      if (group * 2 < match_info->n_offsets &&
          match_info->offsets[group * 2] >= 0)
        return group;

      cursor += stride;
    }

  /* No group with that name matched: fall back to the first entry. */
  return (table_first[0] << 8) + table_first[1];
}

/**
 * g_match_info_fetch_named:
 * @match_info: #GMatchInfo structure
 * @name: name of the subexpression
 *
 * Retrieves the text matching the capturing parentheses named @name.
 *
 * If @name is a valid sub pattern name but it didn't match anything
 * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")
 * then an empty string is returned.
 *
 * The string is fetched from the string passed to the match function,
 * so you cannot call this function after freeing the string.
 *
 * Returns: (nullable): The matched substring, or %NULL if an error
 *   occurred.
You have to free the string yourself * * Since: 2.14 */ gchar * g_match_info_fetch_named (const GMatchInfo *match_info, const gchar *name) { gint num; g_return_val_if_fail (match_info != NULL, NULL); g_return_val_if_fail (name != NULL, NULL); num = get_matched_substring_number (match_info, name); if (num < 0) return NULL; else return g_match_info_fetch (match_info, num); } /** * g_match_info_fetch_named_pos: * @match_info: #GMatchInfo structure * @name: name of the subexpression * @start_pos: (out) (optional): pointer to location where to store * the start position, or %NULL * @end_pos: (out) (optional): pointer to location where to store * the end position, or %NULL * * Retrieves the position in bytes of the capturing parentheses named @name. * * If @name is a valid sub pattern name but it didn't match anything * (e.g. sub pattern "X", matching "b" against "(?Pa)?b") * then @start_pos and @end_pos are set to -1 and %TRUE is returned. * * Returns: %TRUE if the position was fetched, %FALSE otherwise. * If the position cannot be fetched, @start_pos and @end_pos * are left unchanged. * * Since: 2.14 */ gboolean g_match_info_fetch_named_pos (const GMatchInfo *match_info, const gchar *name, gint *start_pos, gint *end_pos) { gint num; g_return_val_if_fail (match_info != NULL, FALSE); g_return_val_if_fail (name != NULL, FALSE); num = get_matched_substring_number (match_info, name); if (num < 0) return FALSE; return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); } /** * g_match_info_fetch_all: * @match_info: a #GMatchInfo structure * * Bundles up pointers to each of the matching substrings from a match * and stores them in an array of gchar pointers. The first element in * the returned array is the match number 0, i.e. the entire matched * text. * * If a sub pattern didn't match anything (e.g. sub pattern 1, matching * "b" against "(a)?b") then an empty string is inserted. 
* * If the last match was obtained using the DFA algorithm, that is using * g_regex_match_all() or g_regex_match_all_full(), the retrieved * strings are not that matched by sets of parentheses but that of the * matched substring. Substrings are matched in reverse order of length, * so the first one is the longest match. * * The strings are fetched from the string passed to the match function, * so you cannot call this function after freeing the string. * * Returns: (transfer full): a %NULL-terminated array of gchar * * pointers. It must be freed using g_strfreev(). If the previous * match failed %NULL is returned * * Since: 2.14 */ gchar ** g_match_info_fetch_all (const GMatchInfo *match_info) { gchar **result; gint i; g_return_val_if_fail (match_info != NULL, NULL); if (match_info->matches < 0) return NULL; result = g_new (gchar *, match_info->matches + 1); for (i = 0; i < match_info->matches; i++) result[i] = g_match_info_fetch (match_info, i); result[i] = NULL; return result; } /* GRegex */ G_DEFINE_QUARK (g-regex-error-quark, g_regex_error) /** * g_regex_ref: * @regex: a #GRegex * * Increases reference count of @regex by 1. * * Returns: @regex * * Since: 2.14 */ GRegex * g_regex_ref (GRegex *regex) { g_return_val_if_fail (regex != NULL, NULL); g_atomic_int_inc (®ex->ref_count); return regex; } /** * g_regex_unref: * @regex: a #GRegex * * Decreases reference count of @regex by 1. When reference count drops * to zero, it frees all the memory associated with the regex structure. 
* * Since: 2.14 */ void g_regex_unref (GRegex *regex) { g_return_if_fail (regex != NULL); if (g_atomic_int_dec_and_test (®ex->ref_count)) { g_free (regex->pattern); if (regex->pcre_re != NULL) pcre2_code_free (regex->pcre_re); g_free (regex); } } static pcre2_code * regex_compile (const gchar *pattern, uint32_t compile_options, uint32_t newline_options, uint32_t bsr_options, GError **error); static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, uint32_t compile_options); /** * g_regex_new: * @pattern: the regular expression * @compile_options: compile options for the regular expression, or 0 * @match_options: match options for the regular expression, or 0 * @error: return location for a #GError * * Compiles the regular expression to an internal form, and does * the initial setup of the #GRegex structure. * * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call * g_regex_unref() when you are done with it * * Since: 2.14 */ GRegex * g_regex_new (const gchar *pattern, GRegexCompileFlags compile_options, GRegexMatchFlags match_options, GError **error) { GRegex *regex; pcre2_code *re; static gsize initialised = 0; uint32_t pcre_compile_options; uint32_t pcre_match_options; uint32_t newline_options; uint32_t bsr_options; g_return_val_if_fail (pattern != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); G_GNUC_BEGIN_IGNORE_DEPRECATIONS g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); G_GNUC_END_IGNORE_DEPRECATIONS g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); if (g_once_init_enter (&initialised)) { int supports_utf8; pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); if (!supports_utf8) g_critical (_("PCRE library is compiled without UTF8 support")); g_once_init_leave (&initialised, supports_utf8 ? 
1 : 2); } if (G_UNLIKELY (initialised != 1)) { g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, _("PCRE library is compiled with incompatible options")); return NULL; } pcre_compile_options = get_pcre2_compile_options (compile_options); pcre_match_options = get_pcre2_match_options (match_options, compile_options); newline_options = get_pcre2_newline_match_options (match_options); if (newline_options == 0) newline_options = get_pcre2_newline_compile_options (compile_options); if (newline_options == 0) { g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, "Invalid newline flags"); return NULL; } bsr_options = get_pcre2_bsr_match_options (match_options); if (!bsr_options) bsr_options = get_pcre2_bsr_compile_options (compile_options); re = regex_compile (pattern, pcre_compile_options, newline_options, bsr_options, error); if (re == NULL) return NULL; pcre_compile_options |= get_pcre2_inline_compile_options (re, pcre_compile_options); regex = g_new0 (GRegex, 1); regex->ref_count = 1; regex->pattern = g_strdup (pattern); regex->pcre_re = re; regex->compile_opts = pcre_compile_options; regex->orig_compile_opts = compile_options; regex->match_opts = pcre_match_options; regex->orig_match_opts = match_options; return regex; } static pcre2_code * regex_compile (const gchar *pattern, uint32_t compile_options, uint32_t newline_options, uint32_t bsr_options, GError **error) { pcre2_code *re; pcre2_compile_context *context; const gchar *errmsg; PCRE2_SIZE erroffset; gint errcode; context = pcre2_compile_context_create (NULL); /* set newline options */ if (pcre2_set_newline (context, newline_options) != 0) { g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, "Invalid newline flags"); pcre2_compile_context_free (context); return NULL; } /* set bsr options */ if (pcre2_set_bsr (context, bsr_options) != 0) { g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, "Invalid BSR flags"); 
pcre2_compile_context_free (context); return NULL; } /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ if (compile_options & PCRE2_UTF) compile_options |= PCRE2_NO_UTF_CHECK; compile_options |= PCRE2_UCP; /* compile the pattern */ re = pcre2_compile ((PCRE2_SPTR8) pattern, PCRE2_ZERO_TERMINATED, compile_options, &errcode, &erroffset, context); pcre2_compile_context_free (context); /* if the compilation failed, set the error member and return * immediately */ if (re == NULL) { GError *tmp_error; gchar *offset_str; gchar *pcre2_errmsg = NULL; int original_errcode; /* Translate the PCRE error code to GRegexError and use a translated * error message if possible */ original_errcode = errcode; translate_compile_error (&errcode, &errmsg); if (!errmsg) { errmsg = _("unknown error"); pcre2_errmsg = get_pcre2_error_string (original_errcode); } /* PCRE uses byte offsets but we want to show character offsets */ erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); tmp_error = g_error_new (G_REGEX_ERROR, errcode, _("Error while compiling regular expression ‘%s’ " "at char %s: %s"), pattern, offset_str, pcre2_errmsg ? pcre2_errmsg : errmsg); g_propagate_error (error, tmp_error); g_free (offset_str); g_clear_pointer (&pcre2_errmsg, g_free); return NULL; } return re; } static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, uint32_t compile_options) { uint32_t pcre_compile_options; uint32_t nonpcre_compile_options; /* For options set at the beginning of the pattern, pcre puts them into * compile options, e.g. "(?i)foo" will make the pcre structure store * PCRE2_CASELESS even though it wasn't explicitly given for compilation. 
*/ nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; compile_options |= nonpcre_compile_options; if (!(compile_options & PCRE2_DUPNAMES)) { uint32_t jchanged = 0; pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); if (jchanged) compile_options |= PCRE2_DUPNAMES; } return compile_options; } /** * g_regex_get_pattern: * @regex: a #GRegex structure * * Gets the pattern string associated with @regex, i.e. a copy of * the string passed to g_regex_new(). * * Returns: the pattern of @regex * * Since: 2.14 */ const gchar * g_regex_get_pattern (const GRegex *regex) { g_return_val_if_fail (regex != NULL, NULL); return regex->pattern; } /** * g_regex_get_max_backref: * @regex: a #GRegex * * Returns the number of the highest back reference * in the pattern, or 0 if the pattern does not contain * back references. * * Returns: the number of the highest back reference * * Since: 2.14 */ gint g_regex_get_max_backref (const GRegex *regex) { uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); return value; } /** * g_regex_get_capture_count: * @regex: a #GRegex * * Returns the number of capturing subpatterns in the pattern. * * Returns: the number of capturing subpatterns * * Since: 2.14 */ gint g_regex_get_capture_count (const GRegex *regex) { uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); return value; } /** * g_regex_get_has_cr_or_lf: * @regex: a #GRegex structure * * Checks whether the pattern contains explicit CR or LF references. 
* * Returns: %TRUE if the pattern contains explicit CR or LF references * * Since: 2.34 */ gboolean g_regex_get_has_cr_or_lf (const GRegex *regex) { uint32_t value; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); return !!value; } /** * g_regex_get_max_lookbehind: * @regex: a #GRegex structure * * Gets the number of characters in the longest lookbehind assertion in the * pattern. This information is useful when doing multi-segment matching using * the partial matching facilities. * * Returns: the number of characters in the longest lookbehind assertion. * * Since: 2.38 */ gint g_regex_get_max_lookbehind (const GRegex *regex) { uint32_t max_lookbehind; pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, &max_lookbehind); return max_lookbehind; } /** * g_regex_get_compile_flags: * @regex: a #GRegex * * Returns the compile options that @regex was created with. * * Depending on the version of PCRE that is used, this may or may not * include flags set by option expressions such as `(?i)` found at the * top-level within the compiled pattern. 
* * Returns: flags from #GRegexCompileFlags * * Since: 2.26 */ GRegexCompileFlags g_regex_get_compile_flags (const GRegex *regex) { GRegexCompileFlags extra_flags; uint32_t info_value; g_return_val_if_fail (regex != NULL, 0); /* Preserve original G_REGEX_OPTIMIZE */ extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); /* Also include the newline options */ pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); switch (info_value) { case PCRE2_NEWLINE_ANYCRLF: extra_flags |= G_REGEX_NEWLINE_ANYCRLF; break; case PCRE2_NEWLINE_CRLF: extra_flags |= G_REGEX_NEWLINE_CRLF; break; case PCRE2_NEWLINE_LF: extra_flags |= G_REGEX_NEWLINE_LF; break; case PCRE2_NEWLINE_CR: extra_flags |= G_REGEX_NEWLINE_CR; break; default: break; } /* Also include the bsr options */ pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); switch (info_value) { case PCRE2_BSR_ANYCRLF: extra_flags |= G_REGEX_BSR_ANYCRLF; break; default: break; } return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; } /** * g_regex_get_match_flags: * @regex: a #GRegex * * Returns the match options that @regex was created with. * * Returns: flags from #GRegexMatchFlags * * Since: 2.26 */ GRegexMatchFlags g_regex_get_match_flags (const GRegex *regex) { uint32_t flags; g_return_val_if_fail (regex != NULL, 0); flags = g_regex_match_flags_from_pcre2 (regex->match_opts); flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); return flags; } /** * g_regex_match_simple: * @pattern: the regular expression * @string: the string to scan for matches * @compile_options: compile options for the regular expression, or 0 * @match_options: match options, or 0 * * Scans for a match in @string for @pattern. 
* * This function is equivalent to g_regex_match() but it does not * require to compile the pattern with g_regex_new(), avoiding some * lines of code when you need just to do a match without extracting * substrings, capture counts, and so on. * * If this function is to be called on the same @pattern more than * once, it's more efficient to compile the pattern once with * g_regex_new() and then use g_regex_match(). * * Returns: %TRUE if the string matched, %FALSE otherwise * * Since: 2.14 */ gboolean g_regex_match_simple (const gchar *pattern, const gchar *string, GRegexCompileFlags compile_options, GRegexMatchFlags match_options) { GRegex *regex; gboolean result; regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); if (!regex) return FALSE; result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); g_regex_unref (regex); return result; } /** * g_regex_match: * @regex: a #GRegex structure from g_regex_new() * @string: the string to scan for matches * @match_options: match options * @match_info: (out) (optional): pointer to location where to store * the #GMatchInfo, or %NULL if you do not need it * * Scans for a match in @string for the pattern in @regex. * The @match_options are combined with the match options specified * when the @regex structure was created, letting you have more * flexibility in reusing #GRegex structures. * * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. * * A #GMatchInfo structure, used to get information on the match, * is stored in @match_info if not %NULL. Note that if @match_info * is not %NULL then it is created even if the function returns %FALSE, * i.e. you must free it regardless if regular expression actually matched. * * To retrieve all the non-overlapping matches of the pattern in * string you can use g_match_info_next(). * * |[ * static void * print_uppercase_words (const gchar *string) * { * // Print all uppercase-only words. 
* GRegex *regex; * GMatchInfo *match_info; * * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * g_regex_match (regex, string, 0, &match_info); * while (g_match_info_matches (match_info)) * { * gchar *word = g_match_info_fetch (match_info, 0); * g_print ("Found: %s\n", word); * g_free (word); * g_match_info_next (match_info, NULL); * } * g_match_info_free (match_info); * g_regex_unref (regex); * } * ]| * * @string is not copied and is used in #GMatchInfo internally. If * you use any #GMatchInfo method (except g_match_info_free()) after * freeing or modifying @string then the behaviour is undefined. * * Returns: %TRUE is the string matched, %FALSE otherwise * * Since: 2.14 */ gboolean g_regex_match (const GRegex *regex, const gchar *string, GRegexMatchFlags match_options, GMatchInfo **match_info) { return g_regex_match_full (regex, string, -1, 0, match_options, match_info, NULL); } /** * g_regex_match_full: * @regex: a #GRegex structure from g_regex_new() * @string: (array length=string_len): the string to scan for matches * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated * @start_position: starting index of the string to match, in bytes * @match_options: match options * @match_info: (out) (optional): pointer to location where to store * the #GMatchInfo, or %NULL if you do not need it * @error: location to store the error occurring, or %NULL to ignore errors * * Scans for a match in @string for the pattern in @regex. * The @match_options are combined with the match options specified * when the @regex structure was created, letting you have more * flexibility in reusing #GRegex structures. * * Setting @start_position differs from just passing over a shortened * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern * that begins with any kind of lookbehind assertion, such as "\b". * * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. 
* * A #GMatchInfo structure, used to get information on the match, is * stored in @match_info if not %NULL. Note that if @match_info is * not %NULL then it is created even if the function returns %FALSE, * i.e. you must free it regardless if regular expression actually * matched. * * @string is not copied and is used in #GMatchInfo internally. If * you use any #GMatchInfo method (except g_match_info_free()) after * freeing or modifying @string then the behaviour is undefined. * * To retrieve all the non-overlapping matches of the pattern in * string you can use g_match_info_next(). * * |[ * static void * print_uppercase_words (const gchar *string) * { * // Print all uppercase-only words. * GRegex *regex; * GMatchInfo *match_info; * GError *error = NULL; * * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); * while (g_match_info_matches (match_info)) * { * gchar *word = g_match_info_fetch (match_info, 0); * g_print ("Found: %s\n", word); * g_free (word); * g_match_info_next (match_info, &error); * } * g_match_info_free (match_info); * g_regex_unref (regex); * if (error != NULL) * { * g_printerr ("Error while matching: %s\n", error->message); * g_error_free (error); * } * } * ]| * * Returns: %TRUE is the string matched, %FALSE otherwise * * Since: 2.14 */ gboolean g_regex_match_full (const GRegex *regex, const gchar *string, gssize string_len, gint start_position, GRegexMatchFlags match_options, GMatchInfo **match_info, GError **error) { GMatchInfo *info; gboolean match_ok; g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (string != NULL, FALSE); g_return_val_if_fail (start_position >= 0, FALSE); g_return_val_if_fail (error == NULL || *error == NULL, FALSE); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); info = match_info_new (regex, string, string_len, start_position, match_options, FALSE); match_ok = g_match_info_next (info, 
error); if (match_info != NULL) *match_info = info; else g_match_info_free (info); return match_ok; } /** * g_regex_match_all: * @regex: a #GRegex structure from g_regex_new() * @string: the string to scan for matches * @match_options: match options * @match_info: (out) (optional): pointer to location where to store * the #GMatchInfo, or %NULL if you do not need it * * Using the standard algorithm for regular expression matching only * the longest match in the string is retrieved. This function uses * a different algorithm so it can retrieve all the possible matches. * For more documentation see g_regex_match_all_full(). * * A #GMatchInfo structure, used to get information on the match, is * stored in @match_info if not %NULL. Note that if @match_info is * not %NULL then it is created even if the function returns %FALSE, * i.e. you must free it regardless if regular expression actually * matched. * * @string is not copied and is used in #GMatchInfo internally. If * you use any #GMatchInfo method (except g_match_info_free()) after * freeing or modifying @string then the behaviour is undefined. 
*
 * Returns: %TRUE if the string matched, %FALSE otherwise
 *
 * Since: 2.14
 */
gboolean
g_regex_match_all (const GRegex      *regex,
                   const gchar       *string,
                   GRegexMatchFlags   match_options,
                   GMatchInfo       **match_info)
{
  gboolean matched;

  /* Convenience form of g_regex_match_all_full(): nul-terminated
   * @string (length -1), start at byte 0, errors discarded. */
  matched = g_regex_match_all_full (regex, string, -1, 0,
                                    match_options, match_info, NULL);

  return matched;
}

/**
 * g_regex_match_all_full:
 * @regex: a #GRegex structure from g_regex_new()
 * @string: (array length=string_len): the string to scan for matches
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
 * @start_position: starting index of the string to match, in bytes
 * @match_options: match options
 * @match_info: (out) (optional): pointer to location where to store
 *     the #GMatchInfo, or %NULL if you do not need it
 * @error: location to store the error occurring, or %NULL to ignore errors
 *
 * Using the standard algorithm for regular expression matching only
 * the longest match in the @string is retrieved, it is not possible
 * to obtain all the available matches. For instance matching
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
 * you get `"<a> <b> <c>"`.
 *
 * This function uses a different algorithm (called DFA, i.e. deterministic
 * finite automaton), so it can retrieve all the possible matches, all
 * starting at the same point in the string. For instance matching
 * `"<a> <b> <c>"` against the pattern `"<.*>"`
 * you would obtain three matches: `"<a> <b> <c>"`,
 * `"<a> <b>"` and `"<a>"`.
 *
 * The number of matched strings is retrieved using
 * g_match_info_get_match_count(). To obtain the matched strings and
 * their position you can use, respectively, g_match_info_fetch() and
 * g_match_info_fetch_pos(). Note that the strings are returned in
 * reverse order of length; that is, the longest matching string is
 * given first.
 *
 * Note that the DFA algorithm is slower than the standard one and it
 * is not able to capture substrings, so backreferences do not work.
* * Setting @start_position differs from just passing over a shortened * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern * that begins with any kind of lookbehind assertion, such as "\b". * * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. * * A #GMatchInfo structure, used to get information on the match, is * stored in @match_info if not %NULL. Note that if @match_info is * not %NULL then it is created even if the function returns %FALSE, * i.e. you must free it regardless if regular expression actually * matched. * * @string is not copied and is used in #GMatchInfo internally. If * you use any #GMatchInfo method (except g_match_info_free()) after * freeing or modifying @string then the behaviour is undefined. * * Returns: %TRUE is the string matched, %FALSE otherwise * * Since: 2.14 */ gboolean g_regex_match_all_full (const GRegex *regex, const gchar *string, gssize string_len, gint start_position, GRegexMatchFlags match_options, GMatchInfo **match_info, GError **error) { GMatchInfo *info; gboolean done; pcre2_code *pcre_re; gboolean retval; uint32_t newline_options; uint32_t bsr_options; g_return_val_if_fail (regex != NULL, FALSE); g_return_val_if_fail (string != NULL, FALSE); g_return_val_if_fail (start_position >= 0, FALSE); g_return_val_if_fail (error == NULL || *error == NULL, FALSE); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); newline_options = get_pcre2_newline_match_options (match_options); if (!newline_options) newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); bsr_options = get_pcre2_bsr_match_options (match_options); if (!bsr_options) bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an * optimization for normal regex matching, but results in omitting some * shorter matches here, and an observable behaviour change. 
* * DFA matching is rather niche, and very rarely used according to * codesearch.debian.net, so don't bother caching the recompiled RE. */ pcre_re = regex_compile (regex->pattern, regex->compile_opts | PCRE2_NO_AUTO_POSSESS, newline_options, bsr_options, error); if (pcre_re == NULL) return FALSE; info = match_info_new (regex, string, string_len, start_position, match_options, TRUE); done = FALSE; while (!done) { done = TRUE; info->matches = pcre2_dfa_match (pcre_re, (PCRE2_SPTR8) info->string, info->string_len, info->pos, (regex->match_opts | info->match_opts), info->match_data, info->match_context, info->workspace, info->n_workspace); if (info->matches == PCRE2_ERROR_DFA_WSSIZE) { /* info->workspace is too small. */ info->n_workspace *= 2; info->workspace = g_realloc_n (info->workspace, info->n_workspace, sizeof (gint)); done = FALSE; } else if (info->matches == 0) { /* info->offsets is too small. */ info->n_offsets *= 2; info->offsets = g_realloc_n (info->offsets, info->n_offsets, sizeof (gint)); pcre2_match_data_free (info->match_data); info->match_data = pcre2_match_data_create (info->n_offsets, NULL); done = FALSE; } else if (IS_PCRE2_ERROR (info->matches)) { gchar *error_msg = get_match_error_message (info->matches); g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, _("Error while matching regular expression %s: %s"), regex->pattern, error_msg); g_clear_pointer (&error_msg, g_free); } else if (info->matches != PCRE2_ERROR_NOMATCH) { if (!recalc_match_offsets (info, error)) info->matches = PCRE2_ERROR_NOMATCH; } } pcre2_code_free (pcre_re); /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only * holds true for a single match, rather than matching all */ /* set info->pos to -1 so that a call to g_match_info_next() fails. 
*/ info->pos = -1; retval = info->matches >= 0; if (match_info != NULL) *match_info = info; else g_match_info_free (info); return retval; } /** * g_regex_get_string_number: * @regex: #GRegex structure * @name: name of the subexpression * * Retrieves the number of the subexpression named @name. * * Returns: The number of the subexpression or -1 if @name * does not exists * * Since: 2.14 */ gint g_regex_get_string_number (const GRegex *regex, const gchar *name) { gint num; g_return_val_if_fail (regex != NULL, -1); g_return_val_if_fail (name != NULL, -1); num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); if (num == PCRE2_ERROR_NOSUBSTRING) num = -1; return num; } /** * g_regex_split_simple: * @pattern: the regular expression * @string: the string to scan for matches * @compile_options: compile options for the regular expression, or 0 * @match_options: match options, or 0 * * Breaks the string on the pattern, and returns an array of * the tokens. If the pattern contains capturing parentheses, * then the text for each of the substrings will also be returned. * If the pattern does not match anywhere in the string, then the * whole string is returned as the first token. * * This function is equivalent to g_regex_split() but it does * not require to compile the pattern with g_regex_new(), avoiding * some lines of code when you need just to do a split without * extracting substrings, capture counts, and so on. * * If this function is to be called on the same @pattern more than * once, it's more efficient to compile the pattern once with * g_regex_new() and then use g_regex_split(). * * As a special case, the result of splitting the empty string "" * is an empty vector, not a vector containing a single string. * The reason for this special case is that being able to represent * an empty vector is typically more useful than consistent handling * of empty elements. 
If you do need to represent empty elements, * you'll need to check for the empty string before calling this * function. * * A pattern that can match empty strings splits @string into * separate characters wherever it matches the empty string between * characters. For example splitting "ab c" using as a separator * "\s*", you will get "a", "b" and "c". * * Returns: (transfer full): a %NULL-terminated array of strings. Free * it using g_strfreev() * * Since: 2.14 **/ gchar ** g_regex_split_simple (const gchar *pattern, const gchar *string, GRegexCompileFlags compile_options, GRegexMatchFlags match_options) { GRegex *regex; gchar **result; regex = g_regex_new (pattern, compile_options, 0, NULL); if (!regex) return NULL; result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); g_regex_unref (regex); return result; } /** * g_regex_split: * @regex: a #GRegex structure * @string: the string to split with the pattern * @match_options: match time option flags * * Breaks the string on the pattern, and returns an array of the tokens. * If the pattern contains capturing parentheses, then the text for each * of the substrings will also be returned. If the pattern does not match * anywhere in the string, then the whole string is returned as the first * token. * * As a special case, the result of splitting the empty string "" is an * empty vector, not a vector containing a single string. The reason for * this special case is that being able to represent an empty vector is * typically more useful than consistent handling of empty elements. If * you do need to represent empty elements, you'll need to check for the * empty string before calling this function. * * A pattern that can match empty strings splits @string into separate * characters wherever it matches the empty string between characters. * For example splitting "ab c" using as a separator "\s*", you will get * "a", "b" and "c". * * Returns: (transfer full): a %NULL-terminated gchar ** array. 
Free * it using g_strfreev() * * Since: 2.14 **/ gchar ** g_regex_split (const GRegex *regex, const gchar *string, GRegexMatchFlags match_options) { return g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); } /** * g_regex_split_full: * @regex: a #GRegex structure * @string: (array length=string_len): the string to split with the pattern * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated * @start_position: starting index of the string to match, in bytes * @match_options: match time option flags * @max_tokens: the maximum number of tokens to split @string into. * If this is less than 1, the string is split completely * @error: return location for a #GError * * Breaks the string on the pattern, and returns an array of the tokens. * If the pattern contains capturing parentheses, then the text for each * of the substrings will also be returned. If the pattern does not match * anywhere in the string, then the whole string is returned as the first * token. * * As a special case, the result of splitting the empty string "" is an * empty vector, not a vector containing a single string. The reason for * this special case is that being able to represent an empty vector is * typically more useful than consistent handling of empty elements. If * you do need to represent empty elements, you'll need to check for the * empty string before calling this function. * * A pattern that can match empty strings splits @string into separate * characters wherever it matches the empty string between characters. * For example splitting "ab c" using as a separator "\s*", you will get * "a", "b" and "c". * * Setting @start_position differs from just passing over a shortened * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern * that begins with any kind of lookbehind assertion, such as "\b". * * Returns: (transfer full): a %NULL-terminated gchar ** array. 
* Free it using g_strfreev()
 *
 * Since: 2.14
 **/
gchar **
g_regex_split_full (const GRegex      *regex,
                    const gchar       *string,
                    gssize             string_len,
                    gint               start_position,
                    GRegexMatchFlags   match_options,
                    gint               max_tokens,
                    GError           **error)
{
  GError *tmp_error = NULL;
  GMatchInfo *match_info;
  GList *list, *last;
  gint i;
  gint token_count;
  gboolean match_ok;
  /* position of the last separator. */
  gint last_separator_end;
  /* was the last match 0 bytes long? */
  gboolean last_match_is_empty;
  /* the returned array of char **s */
  gchar **string_list;

  g_return_val_if_fail (regex != NULL, NULL);
  g_return_val_if_fail (string != NULL, NULL);
  g_return_val_if_fail (start_position >= 0, NULL);
  g_return_val_if_fail (error == NULL || *error == NULL, NULL);
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

  /* a non-positive limit means "no limit" */
  if (max_tokens <= 0)
    max_tokens = G_MAXINT;

  if (string_len < 0)
    string_len = strlen (string);

  /* NOTE(review): start_position > string_len is not rejected here, and
   * would make the lengths computed below negative - confirm whether
   * callers or match_info_new() guarantee it cannot happen. */

  /* zero-length string */
  if (string_len - start_position == 0)
    return g_new0 (gchar *, 1);

  /* a single token requested: no matching needed, return the remainder */
  if (max_tokens == 1)
    {
      string_list = g_new0 (gchar *, 2);
      string_list[0] = g_strndup (&string[start_position],
                                  string_len - start_position);
      return string_list;
    }

  /* tokens are prepended to @list, so it is built in reverse order */
  list = NULL;
  token_count = 0;
  last_separator_end = start_position;
  last_match_is_empty = FALSE;

  match_ok = g_regex_match_full (regex, string, string_len, start_position,
                                 match_options, &match_info, &tmp_error);

  while (tmp_error == NULL)
    {
      if (match_ok)
        {
          last_match_is_empty =
                    (match_info->offsets[0] == match_info->offsets[1]);

          /* we need to skip empty separators at the same position of the end
           * of another separator. e.g. the string is "a b" and the separator
           * is " *", so from 1 to 2 we have a match and at position 2 we have
           * an empty match. */
          if (last_separator_end != match_info->offsets[1])
            {
              gchar *token;
              gint match_count;

              token = g_strndup (string + last_separator_end,
                                 match_info->offsets[0] - last_separator_end);
              list = g_list_prepend (list, token);
              token_count++;

              /* if there were substrings, these need to be added to
               * the list. */
              match_count = g_match_info_get_match_count (match_info);
              if (match_count > 1)
                {
                  for (i = 1; i < match_count; i++)
                    list = g_list_prepend (list, g_match_info_fetch (match_info, i));
                }
            }
        }
      else
        {
          /* if there was no match, copy to end of string. */
          if (!last_match_is_empty)
            {
              gchar *token = g_strndup (string + last_separator_end,
                                        match_info->string_len - last_separator_end);
              list = g_list_prepend (list, token);
            }
          /* no more tokens, end the loop. */
          break;
        }

      /* -1 to leave room for the last part. */
      if (token_count >= max_tokens - 1)
        {
          /* we have reached the maximum number of tokens, so we copy
           * the remaining part of the string. */
          if (last_match_is_empty)
            {
              /* the last match was empty, so we have moved one char
               * after the real position to avoid empty matches at the
               * same position. */
              match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;
            }
          /* the if is needed in the case we have terminated the available
           * tokens, but we are at the end of the string, so there are no
           * characters left to copy. */
          if (string_len > match_info->pos)
            {
              gchar *token = g_strndup (string + match_info->pos,
                                        string_len - match_info->pos);
              list = g_list_prepend (list, token);
            }
          /* end the loop. */
          break;
        }

      last_separator_end = match_info->pos;
      if (last_match_is_empty)
        /* if the last match was empty, g_match_info_next() has moved
         * forward to avoid infinite loops, but we still need to copy that
         * character. */
        last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;

      match_ok = g_match_info_next (match_info, &tmp_error);
    }
  g_match_info_free (match_info);
  if (tmp_error != NULL)
    {
      /* a matching error discards every token collected so far */
      g_propagate_error (error, tmp_error);
      g_list_free_full (list, g_free);
      return NULL;
    }

  /* walk the (reversed) list back to front so the array is in string
   * order; the strings themselves are moved, not copied */
  string_list = g_new (gchar *, g_list_length (list) + 1);
  i = 0;
  for (last = g_list_last (list); last; last = g_list_previous (last))
    string_list[i++] = last->data;
  string_list[i] = NULL;
  g_list_free (list);

  return string_list;
}

/* Kinds of pieces a parsed replacement string is made of; stored in
 * InterpolationData.type. */
enum
{
  REPL_TYPE_STRING,
  REPL_TYPE_CHARACTER,
  REPL_TYPE_SYMBOLIC_REFERENCE,
  REPL_TYPE_NUMERIC_REFERENCE,
  REPL_TYPE_CHANGE_CASE
};

/* Case conversion state selected by the \l, \u, \L, \U and \E escapes.
 * The *_SINGLE values are the one-character variants (\l and \u); the
 * masks group values by direction or by "single character" scope. */
typedef enum
{
  CHANGE_CASE_NONE         = 1 << 0,
  CHANGE_CASE_UPPER        = 1 << 1,
  CHANGE_CASE_LOWER        = 1 << 2,
  CHANGE_CASE_UPPER_SINGLE = 1 << 3,
  CHANGE_CASE_LOWER_SINGLE = 1 << 4,
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;

/* One piece of a parsed replacement string. Which of the fields is
 * meaningful depends on @type: @text for REPL_TYPE_STRING and
 * REPL_TYPE_SYMBOLIC_REFERENCE, @num for REPL_TYPE_NUMERIC_REFERENCE,
 * @c for REPL_TYPE_CHARACTER and @change_case for REPL_TYPE_CHANGE_CASE. */
struct _InterpolationData
{
  gchar     *text;
  gint       type;
  gint       num;
  gchar      c;
  ChangeCase change_case;
};

/* Frees @data together with the text it owns. */
static void
free_interpolation_data (InterpolationData *data)
{
  g_free (data->text);
  g_free (data);
}

/* Parses the single escape sequence whose backslash is at @p (inside
 * @replacement) and stores the result in @data.
 *
 * Returns a pointer to the first character after the escape sequence, or
 * NULL with @error set if the sequence is malformed. The error message
 * reports the offset of the offending character within @replacement. */
static const gchar *
expand_escape (const gchar        *replacement,
               const gchar        *p,
               InterpolationData  *data,
               GError            **error)
{
  const gchar *q, *r;
  gint x, d, h, i;
  const gchar *error_detail;
  gint base = 0;
  GError *tmp_error = NULL;

  /* skip the backslash itself */
  p++;
  switch (*p)
    {
    case 't':
      p++;
      data->c = '\t';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'n':
      p++;
      data->c = '\n';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'v':
      p++;
      data->c = '\v';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'r':
      p++;
      data->c = '\r';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'f':
      p++;
      data->c = '\f';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'a':
      p++;
      data->c = '\a';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'b':
      p++;
      data->c = '\b';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case '\\':
      p++;
      data->c = '\\';
      data->type = REPL_TYPE_CHARACTER;
      break;
    case 'x':
      /* \xHH (exactly two hex digits) or \x{H...} (one or more) */
      p++;
      x = 0;
      if (*p == '{')
        {
          p++;
          /* NOTE(review): the number of digits is unbounded, so @x can
           * overflow and is not range-checked before being encoded. */
          do
            {
              h = g_ascii_xdigit_value (*p);
              if (h < 0)
                {
                  error_detail = _("hexadecimal digit or “}” expected");
                  goto error;
                }
              x = x * 16 + h;
              p++;
            }
          while (*p != '}');
          p++;
        }
      else
        {
          for (i = 0; i < 2; i++)
            {
              h = g_ascii_xdigit_value (*p);
              if (h < 0)
                {
                  error_detail = _("hexadecimal digit expected");
                  goto error;
                }
              x = x * 16 + h;
              p++;
            }
        }
      /* the character is stored UTF-8 encoded, as a string piece */
      data->type = REPL_TYPE_STRING;
      data->text = g_new0 (gchar, 8);
      g_unichar_to_utf8 (x, data->text);
      break;
    case 'l':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_LOWER_SINGLE;
      break;
    case 'u':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_UPPER_SINGLE;
      break;
    case 'L':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_LOWER;
      break;
    case 'U':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_UPPER;
      break;
    case 'E':
      p++;
      data->type = REPL_TYPE_CHANGE_CASE;
      data->change_case = CHANGE_CASE_NONE;
      break;
    case 'g':
      /* \g<...>: numeric reference if the text between the angle
       * brackets starts with a digit, symbolic reference otherwise */
      p++;
      if (*p != '<')
        {
          error_detail = _("missing “<” in symbolic reference");
          goto error;
        }
      /* q: first character of the reference, p: scans for the '>' */
      q = p + 1;
      do
        {
          p++;
          if (!*p)
            {
              error_detail = _("unfinished symbolic reference");
              goto error;
            }
        }
      while (*p != '>');
      if (p - q == 0)
        {
          error_detail = _("zero-length symbolic reference");
          goto error;
        }
      if (g_ascii_isdigit (*q))
        {
          x = 0;
          do
            {
              h = g_ascii_digit_value (*q);
              if (h < 0)
                {
                  error_detail = _("digit expected");
                  /* report the position of the bad character */
                  p = q;
                  goto error;
                }
              x = x * 10 + h;
              q++;
            }
          while (q != p);
          data->num = x;
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
        }
      else
        {
          r = q;
          do
            {
              if (!g_ascii_isalnum (*r))
                {
                  error_detail = _("illegal symbolic reference");
                  /* report the position of the bad character */
                  p = r;
                  goto error;
                }
              r++;
            }
          while (r != p);
          data->text = g_strndup (q, p - q);
          data->type = REPL_TYPE_SYMBOLIC_REFERENCE;
        }
      /* skip the closing '>' */
      p++;
      break;
    case '0':
      /* if \0 is followed by a number is an octal number representing a
       * character, else it is a numeric reference. */
      if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)
        {
          base = 8;
          p = g_utf8_next_char (p);
        }
      G_GNUC_FALLTHROUGH;
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      /* read up to three digits, accumulating both the octal value (@x)
       * and the decimal value (@d) until it is known which one applies;
       * a digit above 7 settles it as decimal, limited to two digits */
      x = 0;
      d = 0;
      for (i = 0; i < 3; i++)
        {
          h = g_ascii_digit_value (*p);
          if (h < 0)
            break;
          if (h > 7)
            {
              if (base == 8)
                break;
              else
                base = 10;
            }
          if (i == 2 && base == 10)
            break;
          x = x * 8 + h;
          d = d * 10 + h;
          p++;
        }
      /* an explicit \0 prefix or three digits make it an octal character,
       * anything else is a numeric back reference */
      if (base == 8 || i == 3)
        {
          data->type = REPL_TYPE_STRING;
          data->text = g_new0 (gchar, 8);
          g_unichar_to_utf8 (x, data->text);
        }
      else
        {
          data->type = REPL_TYPE_NUMERIC_REFERENCE;
          data->num = d;
        }
      break;
    case 0:
      error_detail = _("stray final “\\”");
      goto error;
      break;
    default:
      error_detail = _("unknown escape sequence");
      goto error;
    }

  return p;

 error:
  /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */
  tmp_error = g_error_new (G_REGEX_ERROR,
                           G_REGEX_ERROR_REPLACE,
                           _("Error while parsing replacement "
                             "text “%s” at char %lu: %s"),
                           replacement,
                           (gulong)(p - replacement),
                           error_detail);
  g_propagate_error (error, tmp_error);

  return NULL;
}

/* Splits @replacement into a list of InterpolationData pieces, in the
 * order they appear: runs of literal text become REPL_TYPE_STRING pieces
 * and each backslash escape is parsed by expand_escape().
 *
 * Returns NULL with @error set if an escape is invalid. An empty
 * @replacement also yields NULL (the empty list), without an error. */
static GList *
split_replacement (const gchar  *replacement,
                   GError      **error)
{
  GList *list = NULL;
  InterpolationData *data;
  const gchar *p, *start;

  /* start: beginning of the literal run currently being scanned */
  start = p = replacement;
  while (*p)
    {
      if (*p == '\\')
        {
          data = g_new0 (InterpolationData, 1);
          start = p = expand_escape (replacement, p, data, error);
          if (p == NULL)
            {
              g_list_free_full (list, (GDestroyNotify) free_interpolation_data);
              free_interpolation_data (data);

              return NULL;
            }
          list = g_list_prepend (list, data);
        }
      else
        {
          p++;
          /* the literal run ends at the next escape or at the end */
          if (*p == '\\' || *p == '\0')
            {
              if (p - start > 0)
                {
                  data = g_new0 (InterpolationData, 1);
                  data->text = g_strndup (start, p - start);
                  data->type = REPL_TYPE_STRING;
                  list = g_list_prepend (list, data);
                }
            }
        }
    }

  /* pieces were prepended, restore the original order */
  return g_list_reverse (list);
}

/* Change the case of c based on change_case.
 * Any value outside CHANGE_CASE_LOWER_MASK, CHANGE_CASE_NONE included,
 * selects the upper case conversion. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))

/* Appends @text to @string, converted according to *@change_case.
 * For the single-character modes only the first character of @text is
 * converted and *@change_case is reset to CHANGE_CASE_NONE afterwards.
 * An empty @text is ignored and leaves *@change_case untouched. */
static void
string_append (GString     *string,
               const gchar *text,
               ChangeCase  *change_case)
{
  gunichar c;

  if (text[0] == '\0')
    return;

  if (*change_case == CHANGE_CASE_NONE)
    {
      g_string_append (string, text);
    }
  else if (*change_case & CHANGE_CASE_SINGLE_MASK)
    {
      c = g_utf8_get_char (text);
      g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
      g_string_append (string, g_utf8_next_char (text));
      *change_case = CHANGE_CASE_NONE;
    }
  else
    {
      while (*text != '\0')
        {
          c = g_utf8_get_char (text);
          g_string_append_unichar (string, CHANGE_CASE (c, *change_case));
          text = g_utf8_next_char (text);
        }
    }
}

/* Callback handed to g_regex_replace_eval() by g_regex_replace(): appends
 * to @result the replacement for the current match, built from the list
 * of InterpolationData pieces passed as @data. Always returns FALSE. */
static gboolean
interpolate_replacement (const GMatchInfo *match_info,
                         GString          *result,
                         gpointer          data)
{
  GList *list;
  InterpolationData *idata;
  gchar *match;
  /* case conversion in effect; starts afresh for every match */
  ChangeCase change_case = CHANGE_CASE_NONE;

  for (list = data; list; list = list->next)
    {
      idata = list->data;
      switch (idata->type)
        {
        case REPL_TYPE_STRING:
          string_append (result, idata->text, &change_case);
          break;
        case REPL_TYPE_CHARACTER:
          g_string_append_c (result, CHANGE_CASE (idata->c, change_case));
          if (change_case & CHANGE_CASE_SINGLE_MASK)
            change_case = CHANGE_CASE_NONE;
          break;
        case REPL_TYPE_NUMERIC_REFERENCE:
          match = g_match_info_fetch (match_info, idata->num);
          if (match)
            {
              string_append (result, match, &change_case);
              g_free (match);
            }
          break;
        case REPL_TYPE_SYMBOLIC_REFERENCE:
          match = g_match_info_fetch_named (match_info, idata->text);
          if (match)
            {
              string_append (result, match, &change_case);
              g_free (match);
            }
          break;
        case REPL_TYPE_CHANGE_CASE:
          change_case = idata->change_case;
          break;
        }
    }

  return FALSE;
}

/* whether actual match_info is needed for replacement, i.e.
* whether there are references
 */
static gboolean
interpolation_list_needs_match (GList *list)
{
  GList *node;

  /* A single numeric or symbolic reference is enough to need a match. */
  for (node = list; node != NULL; node = node->next)
    {
      const InterpolationData *piece = node->data;

      switch (piece->type)
        {
        case REPL_TYPE_SYMBOLIC_REFERENCE:
        case REPL_TYPE_NUMERIC_REFERENCE:
          return TRUE;
        default:
          break;
        }
    }

  return FALSE;
}

/**
 * g_regex_replace:
 * @regex: a #GRegex structure
 * @string: (array length=string_len): the string to perform matches against
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
 * @start_position: starting index of the string to match, in bytes
 * @replacement: text to replace each match with
 * @match_options: options for the match
 * @error: location to store the error occurring, or %NULL to ignore errors
 *
 * Replaces all occurrences of the pattern in @regex with the
 * replacement text. Backreferences of the form '\number' or
 * '\g<number>' in the replacement text are interpolated by the
 * number-th captured subexpression of the match, '\g<name>' refers
 * to the captured subexpression with the given name. '\0' refers
 * to the complete match, but '\0' followed by a number is the octal
 * representation of a character. To include a literal '\' in the
 * replacement, write '\\\\'.
 *
 * There are also escapes that change the case of the following text:
 *
 * - \l: Convert to lower case the next character
 * - \u: Convert to upper case the next character
 * - \L: Convert to lower case till \E
 * - \U: Convert to upper case till \E
 * - \E: End case modification
 *
 * If you do not need to use backreferences use g_regex_replace_literal().
 *
 * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was
 * passed to g_regex_new(). If you want to use not UTF-8 encoded strings
 * you can use g_regex_replace_literal().
 *
 * Setting @start_position differs from just passing over a shortened
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that
 * begins with any kind of lookbehind assertion, such as "\b".
* * Returns: a newly allocated string containing the replacements * * Since: 2.14 */ gchar * g_regex_replace (const GRegex *regex, const gchar *string, gssize string_len, gint start_position, const gchar *replacement, GRegexMatchFlags match_options, GError **error) { gchar *result; GList *list; GError *tmp_error = NULL; g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (string != NULL, NULL); g_return_val_if_fail (start_position >= 0, NULL); g_return_val_if_fail (replacement != NULL, NULL); g_return_val_if_fail (error == NULL || *error == NULL, NULL); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); list = split_replacement (replacement, &tmp_error); if (tmp_error != NULL) { g_propagate_error (error, tmp_error); return NULL; } result = g_regex_replace_eval (regex, string, string_len, start_position, match_options, interpolate_replacement, (gpointer)list, &tmp_error); if (tmp_error != NULL) g_propagate_error (error, tmp_error); g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return result; } static gboolean literal_replacement (const GMatchInfo *match_info, GString *result, gpointer data) { g_string_append (result, data); return FALSE; } /** * g_regex_replace_literal: * @regex: a #GRegex structure * @string: (array length=string_len): the string to perform matches against * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated * @start_position: starting index of the string to match, in bytes * @replacement: text to replace each match with * @match_options: options for the match * @error: location to store the error occurring, or %NULL to ignore errors * * Replaces all occurrences of the pattern in @regex with the * replacement text. @replacement is replaced literally, to * include backreferences use g_regex_replace(). 
*
 * Setting @start_position differs from just passing over a
 * shortened string and setting %G_REGEX_MATCH_NOTBOL in the
 * case of a pattern that begins with any kind of lookbehind
 * assertion, such as "\b".
 *
 * Returns: a newly allocated string containing the replacements
 *
 * Since: 2.14
 */
gchar *
g_regex_replace_literal (const GRegex      *regex,
                         const gchar       *string,
                         gssize             string_len,
                         gint               start_position,
                         const gchar       *replacement,
                         GRegexMatchFlags   match_options,
                         GError           **error)
{
  gchar *replaced;

  g_return_val_if_fail (replacement != NULL, NULL);
  g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

  /* The remaining arguments are validated by g_regex_replace_eval(),
   * which does the work with a callback that appends @replacement
   * verbatim. */
  replaced = g_regex_replace_eval (regex,
                                   string, string_len, start_position,
                                   match_options,
                                   literal_replacement,
                                   (gpointer) replacement,
                                   error);

  return replaced;
}

/**
 * g_regex_replace_eval:
 * @regex: a #GRegex structure from g_regex_new()
 * @string: (array length=string_len): string to perform matches against
 * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated
 * @start_position: starting index of the string to match, in bytes
 * @match_options: options for the match
 * @eval: (scope call): a function to call for each match
 * @user_data: user data to pass to the function
 * @error: location to store the error occurring, or %NULL to ignore errors
 *
 * Replaces occurrences of the pattern in regex with the output of
 * @eval for that occurrence.
 *
 * Setting @start_position differs from just passing over a shortened
 * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern
 * that begins with any kind of lookbehind assertion, such as "\b".
 *
 * The following example uses g_regex_replace_eval() to replace multiple
 * strings at once:
 * |[
 * static gboolean
 * eval_cb (const GMatchInfo *info,
 *          GString          *res,
 *          gpointer          data)
 * {
 *   gchar *match;
 *   gchar *r;
 *
 *   match = g_match_info_fetch (info, 0);
 *   r = g_hash_table_lookup ((GHashTable *)data, match);
 *   g_string_append (res, r);
 *   g_free (match);
 *
 *   return FALSE;
 * }
 *
 * ...
* * GRegex *reg; * GHashTable *h; * gchar *res; * * h = g_hash_table_new (g_str_hash, g_str_equal); * * g_hash_table_insert (h, "1", "ONE"); * g_hash_table_insert (h, "2", "TWO"); * g_hash_table_insert (h, "3", "THREE"); * g_hash_table_insert (h, "4", "FOUR"); * * reg = g_regex_new ("1|2|3|4", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); * g_hash_table_destroy (h); * * ... * ]| * * Returns: a newly allocated string containing the replacements * * Since: 2.14 */ gchar * g_regex_replace_eval (const GRegex *regex, const gchar *string, gssize string_len, gint start_position, GRegexMatchFlags match_options, GRegexEvalCallback eval, gpointer user_data, GError **error) { GMatchInfo *match_info; GString *result; gint str_pos = 0; gboolean done = FALSE; GError *tmp_error = NULL; g_return_val_if_fail (regex != NULL, NULL); g_return_val_if_fail (string != NULL, NULL); g_return_val_if_fail (start_position >= 0, NULL); g_return_val_if_fail (eval != NULL, NULL); g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); if (string_len < 0) string_len = strlen (string); result = g_string_sized_new (string_len); /* run down the string making matches. 
*/ g_regex_match_full (regex, string, string_len, start_position, match_options, &match_info, &tmp_error); while (!done && g_match_info_matches (match_info)) { g_string_append_len (result, string + str_pos, match_info->offsets[0] - str_pos); done = (*eval) (match_info, result, user_data); str_pos = match_info->offsets[1]; g_match_info_next (match_info, &tmp_error); } g_match_info_free (match_info); if (tmp_error != NULL) { g_propagate_error (error, tmp_error); g_string_free (result, TRUE); return NULL; } g_string_append_len (result, string + str_pos, string_len - str_pos); return g_string_free (result, FALSE); } /** * g_regex_check_replacement: * @replacement: the replacement string * @has_references: (out) (optional): location to store information about * references in @replacement or %NULL * @error: location to store error * * Checks whether @replacement is a valid replacement string * (see g_regex_replace()), i.e. that all escape sequences in * it are valid. * * If @has_references is not %NULL then @replacement is checked * for pattern references. For instance, replacement text 'foo\n' * does not contain references and may be evaluated without information * about actual match, but '\0\1' (whole match followed by first * subpattern) requires valid #GMatchInfo object. * * Returns: whether @replacement is a valid replacement string * * Since: 2.14 */ gboolean g_regex_check_replacement (const gchar *replacement, gboolean *has_references, GError **error) { GList *list; GError *tmp = NULL; list = split_replacement (replacement, &tmp); if (tmp) { g_propagate_error (error, tmp); return FALSE; } if (has_references) *has_references = interpolation_list_needs_match (list); g_list_free_full (list, (GDestroyNotify) free_interpolation_data); return TRUE; } /** * g_regex_escape_nul: * @string: the string to escape * @length: the length of @string * * Escapes the nul characters in @string to "\x00". It can be used * to compile a regex with embedded nul characters. 
*
 * For completeness, @length can be -1 for a nul-terminated string.
 * In this case the output string will be of course equal to @string.
 *
 * @string is scanned one byte at a time, so it does not have to be
 * valid UTF-8: a nul byte is escaped wherever it appears.
 *
 * Returns: a newly-allocated escaped string
 *
 * Since: 2.30
 */
gchar *
g_regex_escape_nul (const gchar *string,
                    gint         length)
{
  GString *escaped;
  const gchar *p, *piece_start, *end;
  gint backslashes = 0;   /* length of the backslash run just before p */

  g_return_val_if_fail (string != NULL, NULL);

  /* A nul-terminated string cannot contain embedded nuls. */
  if (length < 0)
    return g_strdup (string);

  end = string + length;
  piece_start = string;
  escaped = g_string_sized_new (length + 1);

  /* Advance byte by byte. The bytes of interest ('\0' and '\\') are ASCII
   * and never occur inside a valid UTF-8 multi-byte sequence, so this gives
   * the same result as stepping by character for valid input. Unlike
   * stepping by the length announced in a lead byte, it can neither jump
   * over a nul in non-UTF-8 data nor move p beyond end. */
  for (p = string; p < end; p++)
    {
      if (*p == '\\')
        {
          backslashes++;
          continue;
        }

      if (*p == '\0')
        {
          /* Flush the literal text that precedes the nul. */
          if (p != piece_start)
            g_string_append_len (escaped, piece_start, p - piece_start);

          /* After an odd run of backslashes the last one is reused as
           * the backslash of "\x00"; otherwise one is added. */
          if ((backslashes & 1) == 0)
            g_string_append_c (escaped, '\\');
          g_string_append_len (escaped, "x00", 3);

          piece_start = p + 1;
        }

      backslashes = 0;
    }

  /* Copy whatever follows the last nul. */
  if (piece_start < end)
    g_string_append_len (escaped, piece_start, end - piece_start);

  return g_string_free (escaped, FALSE);
}

/**
 * g_regex_escape_string:
 * @string: the string to escape
 * @length: the length of @string, in bytes, or -1 if @string is nul-terminated
 *
 * Escapes the special characters used for regular expressions
 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
 * function is useful to dynamically generate regular expressions.
 *
 * @string can contain nul characters that are replaced with "\0",
 * in this case remember to specify the correct length of @string
 * in @length.
* * Returns: a newly-allocated escaped string * * Since: 2.14 */ gchar * g_regex_escape_string (const gchar *string, gint length) { GString *escaped; const char *p, *piece_start, *end; g_return_val_if_fail (string != NULL, NULL); if (length < 0) length = strlen (string); end = string + length; p = piece_start = string; escaped = g_string_sized_new (length + 1); while (p < end) { switch (*p) { case '\0': case '\\': case '|': case '(': case ')': case '[': case ']': case '{': case '}': case '^': case '$': case '*': case '+': case '?': case '.': if (p != piece_start) /* copy the previous piece. */ g_string_append_len (escaped, piece_start, p - piece_start); g_string_append_c (escaped, '\\'); if (*p == '\0') g_string_append_c (escaped, '0'); else g_string_append_c (escaped, *p); piece_start = ++p; break; default: p = g_utf8_next_char (p); break; } } if (piece_start < end) g_string_append_len (escaped, piece_start, end - piece_start); return g_string_free (escaped, FALSE); }