mirror of https://github.com/python/cpython.git
Updates from Fredrik Lundh <effbot@telia.com> about Unicode-related
behavior.
This commit is contained in:
parent
af57431701
commit
e53793bf4c
|
@ -175,13 +175,14 @@ Extensions usually do not create a new group;
|
||||||
\regexp{(?P<\var{name}>...)} is the only exception to this rule.
|
\regexp{(?P<\var{name}>...)} is the only exception to this rule.
|
||||||
Following are the currently supported extensions.
|
Following are the currently supported extensions.
|
||||||
|
|
||||||
\item[\code{(?iLmsx)}] (One or more letters from the set \character{i},
|
\item[\code{(?iLmsux)}] (One or more letters from the set \character{i},
|
||||||
\character{L}, \character{m}, \character{s}, \character{x}.) The group matches
|
\character{L}, \character{m}, \character{s}, \character{u},
|
||||||
the empty string; the letters set the corresponding flags
|
\character{x}.) The group matches the empty string; the letters set
|
||||||
(\constant{re.I}, \constant{re.L}, \constant{re.M}, \constant{re.S},
|
the corresponding flags (\constant{re.I}, \constant{re.L},
|
||||||
\constant{re.X}) for the entire regular expression. This is useful if
|
\constant{re.M}, \constant{re.S}, \constant{re.U}, \constant{re.X})
|
||||||
you wish to include the flags as part of the regular expression, instead
|
for the entire regular expression. This is useful if you wish to
|
||||||
of passing a \var{flag} argument to the \function{compile()} function.
|
include the flags as part of the regular expression, instead of
|
||||||
|
passing a \var{flag} argument to the \function{compile()} function.
|
||||||
|
|
||||||
\item[\code{(?:...)}] A non-grouping version of regular parentheses.
|
\item[\code{(?:...)}] A non-grouping version of regular parentheses.
|
||||||
Matches whatever regular expression is inside the parentheses, but the
|
Matches whatever regular expression is inside the parentheses, but the
|
||||||
|
@ -227,7 +228,6 @@ resulting RE will match the second character. For example,
|
||||||
|
|
||||||
\begin{list}{}{\leftmargin 0.7in \labelwidth 0.65in}
|
\begin{list}{}{\leftmargin 0.7in \labelwidth 0.65in}
|
||||||
|
|
||||||
%
|
|
||||||
\item[\code{\e \var{number}}] Matches the contents of the group of the
|
\item[\code{\e \var{number}}] Matches the contents of the group of the
|
||||||
same number. Groups are numbered starting from 1. For example,
|
same number. Groups are numbered starting from 1. For example,
|
||||||
\regexp{(.+) \e 1} matches \code{'the the'} or \code{'55 55'}, but not
|
\regexp{(.+) \e 1} matches \code{'the the'} or \code{'55 55'}, but not
|
||||||
|
@ -238,45 +238,50 @@ is 0, or \var{number} is 3 octal digits long, it will not be interpreted
|
||||||
as a group match, but as the character with octal value \var{number}.
|
as a group match, but as the character with octal value \var{number}.
|
||||||
Inside the \character{[} and \character{]} of a character class, all numeric
|
Inside the \character{[} and \character{]} of a character class, all numeric
|
||||||
escapes are treated as characters.
|
escapes are treated as characters.
|
||||||
%
|
|
||||||
\item[\code{\e A}] Matches only at the start of the string.
|
\item[\code{\e A}] Matches only at the start of the string.
|
||||||
%
|
|
||||||
\item[\code{\e b}] Matches the empty string, but only at the
|
\item[\code{\e b}] Matches the empty string, but only at the
|
||||||
beginning or end of a word. A word is defined as a sequence of
|
beginning or end of a word. A word is defined as a sequence of
|
||||||
alphanumeric characters, so the end of a word is indicated by
|
alphanumeric characters, so the end of a word is indicated by
|
||||||
whitespace or a non-alphanumeric character. Inside a character range,
|
whitespace or a non-alphanumeric character. Inside a character range,
|
||||||
\regexp{\e b} represents the backspace character, for compatibility with
|
\regexp{\e b} represents the backspace character, for compatibility with
|
||||||
Python's string literals.
|
Python's string literals.
|
||||||
%
|
|
||||||
\item[\code{\e B}] Matches the empty string, but only when it is
|
\item[\code{\e B}] Matches the empty string, but only when it is
|
||||||
\emph{not} at the beginning or end of a word.
|
\emph{not} at the beginning or end of a word.
|
||||||
%
|
|
||||||
\item[\code{\e d}]Matches any decimal digit; this is
|
\item[\code{\e d}]Matches any decimal digit; this is
|
||||||
equivalent to the set \regexp{[0-9]}.
|
equivalent to the set \regexp{[0-9]}.
|
||||||
%
|
|
||||||
\item[\code{\e D}]Matches any non-digit character; this is
|
\item[\code{\e D}]Matches any non-digit character; this is
|
||||||
equivalent to the set \regexp{[{\^}0-9]}.
|
equivalent to the set \regexp{[{\^}0-9]}.
|
||||||
%
|
|
||||||
\item[\code{\e s}]Matches any whitespace character; this is
|
\item[\code{\e s}]Matches any whitespace character; this is
|
||||||
equivalent to the set \regexp{[ \e t\e n\e r\e f\e v]}.
|
equivalent to the set \regexp{[ \e t\e n\e r\e f\e v]}.
|
||||||
%
|
|
||||||
\item[\code{\e S}]Matches any non-whitespace character; this is
|
\item[\code{\e S}]Matches any non-whitespace character; this is
|
||||||
equivalent to the set \regexp{[\^\ \e t\e n\e r\e f\e v]}.
|
equivalent to the set \regexp{[\^\ \e t\e n\e r\e f\e v]}.
|
||||||
%
|
|
||||||
\item[\code{\e w}]When the \constant{LOCALE} flag is not specified,
|
\item[\code{\e w}]When the \constant{LOCALE} and \constant{UNICODE}
|
||||||
|
flags are not specified,
|
||||||
matches any alphanumeric character; this is equivalent to the set
|
matches any alphanumeric character; this is equivalent to the set
|
||||||
\regexp{[a-zA-Z0-9_]}. With \constant{LOCALE}, it will match the set
|
\regexp{[a-zA-Z0-9_]}. With \constant{LOCALE}, it will match the set
|
||||||
\regexp{[0-9_]} plus whatever characters are defined as letters for the
|
\regexp{[0-9_]} plus whatever characters are defined as letters for
|
||||||
current locale.
|
the current locale. If \constant{UNICODE} is set, this will match the
|
||||||
%
|
characters \regexp{[0-9_]} plus whatever is classified as alphanumeric
|
||||||
\item[\code{\e W}]When the \constant{LOCALE} flag is not specified,
|
in the Unicode character properties database.
|
||||||
matches any non-alphanumeric character; this is equivalent to the set
|
|
||||||
\regexp{[{\^}a-zA-Z0-9_]}. With \constant{LOCALE}, it will match any
|
\item[\code{\e W}]When the \constant{LOCALE} and \constant{UNICODE}
|
||||||
character not in the set \regexp{[0-9_]}, and not defined as a letter
|
flags are not specified, matches any non-alphanumeric character; this
|
||||||
for the current locale.
|
is equivalent to the set \regexp{[{\^}a-zA-Z0-9_]}. With
|
||||||
|
\constant{LOCALE}, it will match any character not in the set
|
||||||
|
\regexp{[0-9_]}, and not defined as a letter for the current locale.
|
||||||
|
If \constant{UNICODE} is set, this will match anything other than
|
||||||
|
\regexp{[0-9_]} and characters marked as alphanumeric in the Unicode
|
||||||
|
character properties database.
|
||||||
|
|
||||||
\item[\code{\e Z}]Matches only at the end of the string.
|
\item[\code{\e Z}]Matches only at the end of the string.
|
||||||
%
|
|
||||||
|
|
||||||
\item[\code{\e \e}] Matches a literal backslash.
|
\item[\code{\e \e}] Matches a literal backslash.
|
||||||
|
|
||||||
|
@ -354,8 +359,8 @@ lowercase letters, too. This is not affected by the current locale.
|
||||||
|
|
||||||
\begin{datadesc}{L}
|
\begin{datadesc}{L}
|
||||||
\dataline{LOCALE}
|
\dataline{LOCALE}
|
||||||
Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b},
|
Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
|
||||||
\regexp{\e B}, dependent on the current locale.
|
\regexp{\e B} dependent on the current locale.
|
||||||
\end{datadesc}
|
\end{datadesc}
|
||||||
|
|
||||||
\begin{datadesc}{M}
|
\begin{datadesc}{M}
|
||||||
|
@ -372,9 +377,16 @@ newline (if any) at the end of the string.
|
||||||
|
|
||||||
\begin{datadesc}{S}
|
\begin{datadesc}{S}
|
||||||
\dataline{DOTALL}
|
\dataline{DOTALL}
|
||||||
Make the \character{.} special character match any character at all, including a
|
Make the \character{.} special character match any character at all,
|
||||||
newline; without this flag, \character{.} will match anything \emph{except}
|
including a newline; without this flag, \character{.} will match
|
||||||
a newline.
|
anything \emph{except} a newline.
|
||||||
|
\end{datadesc}
|
||||||
|
|
||||||
|
\begin{datadesc}{U}
|
||||||
|
\dataline{UNICODE}
|
||||||
|
Make \regexp{\e w}, \regexp{\e W}, \regexp{\e b}, and
|
||||||
|
\regexp{\e B} dependent on the Unicode character properties database.
|
||||||
|
\versionadded{2.0}
|
||||||
\end{datadesc}
|
\end{datadesc}
|
||||||
|
|
||||||
\begin{datadesc}{X}
|
\begin{datadesc}{X}
|
||||||
|
|
Loading…
Reference in New Issue