mirror of https://github.com/python/cpython.git
Applying modified version of patch #1018386, which fixes
some escaping bugs in SRE.
This commit is contained in:
parent
ab9351bf36
commit
a01a2ee933
|
@ -387,7 +387,8 @@ also accepted by the regular expression parser:
|
||||||
|
|
||||||
Octal escapes are included in a limited form: If the first digit is a
|
Octal escapes are included in a limited form: If the first digit is a
|
||||||
0, or if there are three octal digits, it is considered an octal
|
0, or if there are three octal digits, it is considered an octal
|
||||||
escape. Otherwise, it is a group reference.
|
escape. Otherwise, it is a group reference. As for string literals,
|
||||||
|
octal escapes are always at most three digits in length.
|
||||||
|
|
||||||
|
|
||||||
% Note the lack of a period in the section title; it causes problems
|
% Note the lack of a period in the section title; it causes problems
|
||||||
|
|
|
@ -217,21 +217,11 @@ def isname(name):
|
||||||
# check that group name is a valid string
|
# check that group name is a valid string
|
||||||
if not isident(name[0]):
|
if not isident(name[0]):
|
||||||
return False
|
return False
|
||||||
for char in name:
|
for char in name[1:]:
|
||||||
if not isident(char) and not isdigit(char):
|
if not isident(char) and not isdigit(char):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def _group(escape, groups):
|
|
||||||
# check if the escape string represents a valid group
|
|
||||||
try:
|
|
||||||
gid = int(escape[1:])
|
|
||||||
if gid and gid < groups:
|
|
||||||
return gid
|
|
||||||
except ValueError:
|
|
||||||
pass
|
|
||||||
return None # not a valid group
|
|
||||||
|
|
||||||
def _class_escape(source, escape):
|
def _class_escape(source, escape):
|
||||||
# handle escape code inside character class
|
# handle escape code inside character class
|
||||||
code = ESCAPES.get(escape)
|
code = ESCAPES.get(escape)
|
||||||
|
@ -241,7 +231,8 @@ def _class_escape(source, escape):
|
||||||
if code:
|
if code:
|
||||||
return code
|
return code
|
||||||
try:
|
try:
|
||||||
if escape[1:2] == "x":
|
c = escape[1:2]
|
||||||
|
if c == "x":
|
||||||
# hexadecimal escape (exactly two digits)
|
# hexadecimal escape (exactly two digits)
|
||||||
while source.next in HEXDIGITS and len(escape) < 4:
|
while source.next in HEXDIGITS and len(escape) < 4:
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
|
@ -249,12 +240,14 @@ def _class_escape(source, escape):
|
||||||
if len(escape) != 2:
|
if len(escape) != 2:
|
||||||
raise error, "bogus escape: %s" % repr("\\" + escape)
|
raise error, "bogus escape: %s" % repr("\\" + escape)
|
||||||
return LITERAL, int(escape, 16) & 0xff
|
return LITERAL, int(escape, 16) & 0xff
|
||||||
elif escape[1:2] in OCTDIGITS:
|
elif c in OCTDIGITS:
|
||||||
# octal escape (up to three digits)
|
# octal escape (up to three digits)
|
||||||
while source.next in OCTDIGITS and len(escape) < 5:
|
while source.next in OCTDIGITS and len(escape) < 4:
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
escape = escape[1:]
|
escape = escape[1:]
|
||||||
return LITERAL, int(escape, 8) & 0xff
|
return LITERAL, int(escape, 8) & 0xff
|
||||||
|
elif c in DIGITS:
|
||||||
|
raise error, "bogus escape: %s" % repr(escape)
|
||||||
if len(escape) == 2:
|
if len(escape) == 2:
|
||||||
return LITERAL, ord(escape[1])
|
return LITERAL, ord(escape[1])
|
||||||
except ValueError:
|
except ValueError:
|
||||||
|
@ -270,19 +263,20 @@ def _escape(source, escape, state):
|
||||||
if code:
|
if code:
|
||||||
return code
|
return code
|
||||||
try:
|
try:
|
||||||
if escape[1:2] == "x":
|
c = escape[1:2]
|
||||||
|
if c == "x":
|
||||||
# hexadecimal escape
|
# hexadecimal escape
|
||||||
while source.next in HEXDIGITS and len(escape) < 4:
|
while source.next in HEXDIGITS and len(escape) < 4:
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
if len(escape) != 4:
|
if len(escape) != 4:
|
||||||
raise ValueError
|
raise ValueError
|
||||||
return LITERAL, int(escape[2:], 16) & 0xff
|
return LITERAL, int(escape[2:], 16) & 0xff
|
||||||
elif escape[1:2] == "0":
|
elif c == "0":
|
||||||
# octal escape
|
# octal escape
|
||||||
while source.next in OCTDIGITS and len(escape) < 4:
|
while source.next in OCTDIGITS and len(escape) < 4:
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
return LITERAL, int(escape[1:], 8) & 0xff
|
return LITERAL, int(escape[1:], 8) & 0xff
|
||||||
elif escape[1:2] in DIGITS:
|
elif c in DIGITS:
|
||||||
# octal escape *or* decimal group reference (sigh)
|
# octal escape *or* decimal group reference (sigh)
|
||||||
if source.next in DIGITS:
|
if source.next in DIGITS:
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
|
@ -291,9 +285,9 @@ def _escape(source, escape, state):
|
||||||
# got three octal digits; this is an octal escape
|
# got three octal digits; this is an octal escape
|
||||||
escape = escape + source.get()
|
escape = escape + source.get()
|
||||||
return LITERAL, int(escape[1:], 8) & 0xff
|
return LITERAL, int(escape[1:], 8) & 0xff
|
||||||
# got at least one decimal digit; this is a group reference
|
# not an octal escape, so this is a group reference
|
||||||
group = _group(escape, state.groups)
|
group = int(escape[1:])
|
||||||
if group:
|
if group < state.groups:
|
||||||
if not state.checkgroup(group):
|
if not state.checkgroup(group):
|
||||||
raise error, "cannot refer to open group"
|
raise error, "cannot refer to open group"
|
||||||
return GROUPREF, group
|
return GROUPREF, group
|
||||||
|
@ -709,7 +703,8 @@ def literal(literal, p=p, pappend=a):
|
||||||
break # end of replacement string
|
break # end of replacement string
|
||||||
if this and this[0] == "\\":
|
if this and this[0] == "\\":
|
||||||
# group
|
# group
|
||||||
if this == "\\g":
|
c = this[1:2]
|
||||||
|
if c == "g":
|
||||||
name = ""
|
name = ""
|
||||||
if s.match("<"):
|
if s.match("<"):
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -723,6 +718,8 @@ def literal(literal, p=p, pappend=a):
|
||||||
raise error, "bad group name"
|
raise error, "bad group name"
|
||||||
try:
|
try:
|
||||||
index = int(name)
|
index = int(name)
|
||||||
|
if index < 0:
|
||||||
|
raise error, "negative group number"
|
||||||
except ValueError:
|
except ValueError:
|
||||||
if not isname(name):
|
if not isname(name):
|
||||||
raise error, "bad character in group name"
|
raise error, "bad character in group name"
|
||||||
|
@ -731,26 +728,23 @@ def literal(literal, p=p, pappend=a):
|
||||||
except KeyError:
|
except KeyError:
|
||||||
raise IndexError, "unknown group name"
|
raise IndexError, "unknown group name"
|
||||||
a((MARK, index))
|
a((MARK, index))
|
||||||
elif len(this) > 1 and this[1] in DIGITS:
|
elif c == "0":
|
||||||
code = None
|
if s.next in OCTDIGITS:
|
||||||
while 1:
|
this = this + sget()
|
||||||
group = _group(this, pattern.groups+1)
|
if s.next in OCTDIGITS:
|
||||||
if group:
|
|
||||||
if (s.next not in DIGITS or
|
|
||||||
not _group(this + s.next, pattern.groups+1)):
|
|
||||||
code = MARK, group
|
|
||||||
break
|
|
||||||
elif s.next in OCTDIGITS:
|
|
||||||
this = this + sget()
|
this = this + sget()
|
||||||
else:
|
literal(makechar(int(this[1:], 8) & 0xff))
|
||||||
break
|
elif c in DIGITS:
|
||||||
if not code:
|
isoctal = False
|
||||||
this = this[1:]
|
if s.next in DIGITS:
|
||||||
code = LITERAL, makechar(int(this[-6:], 8) & 0xff)
|
this = this + sget()
|
||||||
if code[0] is LITERAL:
|
if (c in OCTDIGITS and s.next in OCTDIGITS and
|
||||||
literal(code[1])
|
this[2] in OCTDIGITS):
|
||||||
else:
|
this = this + sget()
|
||||||
a(code)
|
isoctal = True
|
||||||
|
literal(makechar(int(this[1:], 8) & 0xff))
|
||||||
|
if not isoctal:
|
||||||
|
a((MARK, int(this[1:])))
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
this = makechar(ESCAPES[this][1])
|
this = makechar(ESCAPES[this][1])
|
||||||
|
@ -782,7 +776,7 @@ def expand_template(template, match):
|
||||||
for index, group in groups:
|
for index, group in groups:
|
||||||
literals[index] = s = g(group)
|
literals[index] = s = g(group)
|
||||||
if s is None:
|
if s is None:
|
||||||
raise IndexError
|
raise error, "unmatched group"
|
||||||
except IndexError:
|
except IndexError:
|
||||||
raise error, "empty group"
|
raise error, "invalid group reference"
|
||||||
return sep.join(literals)
|
return sep.join(literals)
|
||||||
|
|
|
@ -83,6 +83,48 @@ def test_bug_449000(self):
|
||||||
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
|
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
|
||||||
'abc\ndef\n')
|
'abc\ndef\n')
|
||||||
|
|
||||||
|
def test_sub_template_numeric_escape(self):
|
||||||
|
# bug 776311 and friends
|
||||||
|
self.assertEqual(re.sub('x', r'\0', 'x'), '\0')
|
||||||
|
self.assertEqual(re.sub('x', r'\000', 'x'), '\000')
|
||||||
|
self.assertEqual(re.sub('x', r'\001', 'x'), '\001')
|
||||||
|
self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8')
|
||||||
|
self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9')
|
||||||
|
self.assertEqual(re.sub('x', r'\111', 'x'), '\111')
|
||||||
|
self.assertEqual(re.sub('x', r'\117', 'x'), '\117')
|
||||||
|
|
||||||
|
self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111')
|
||||||
|
self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1')
|
||||||
|
|
||||||
|
self.assertEqual(re.sub('x', r'\00', 'x'), '\x00')
|
||||||
|
self.assertEqual(re.sub('x', r'\07', 'x'), '\x07')
|
||||||
|
self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8')
|
||||||
|
self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9')
|
||||||
|
self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a')
|
||||||
|
|
||||||
|
self.assertEqual(re.sub('x', r'\400', 'x'), '\0')
|
||||||
|
self.assertEqual(re.sub('x', r'\777', 'x'), '\377')
|
||||||
|
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\1', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\8', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\9', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\11', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\18', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\90', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\99', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8'
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x')
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1'
|
||||||
|
self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0'
|
||||||
|
|
||||||
|
# in python2.3 (etc), these loop endlessly in sre_parser.py
|
||||||
|
self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x')
|
||||||
|
self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'),
|
||||||
|
'xz8')
|
||||||
|
self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'),
|
||||||
|
'xza')
|
||||||
|
|
||||||
def test_qualified_re_sub(self):
|
def test_qualified_re_sub(self):
|
||||||
self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
|
self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb')
|
||||||
self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
|
self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa')
|
||||||
|
@ -105,6 +147,7 @@ def test_symbolic_refs(self):
|
||||||
self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
|
self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx')
|
||||||
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
|
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx')
|
||||||
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
|
self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx')
|
||||||
|
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx')
|
||||||
|
|
||||||
def test_re_subn(self):
|
def test_re_subn(self):
|
||||||
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
|
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
|
||||||
|
@ -386,6 +429,16 @@ def test_sre_character_literals(self):
|
||||||
self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
|
self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None)
|
||||||
self.assertRaises(re.error, re.match, "\911", "")
|
self.assertRaises(re.error, re.match, "\911", "")
|
||||||
|
|
||||||
|
def test_sre_character_class_literals(self):
|
||||||
|
for i in [0, 8, 16, 32, 64, 127, 128, 255]:
|
||||||
|
self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None)
|
||||||
|
self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None)
|
||||||
|
self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None)
|
||||||
|
self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None)
|
||||||
|
self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None)
|
||||||
|
self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None)
|
||||||
|
self.assertRaises(re.error, re.match, "[\911]", "")
|
||||||
|
|
||||||
def test_bug_113254(self):
|
def test_bug_113254(self):
|
||||||
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
|
self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1)
|
||||||
self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
|
self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1)
|
||||||
|
|
Loading…
Reference in New Issue