mirror of https://github.com/python/cpython.git
SF bug #1504333: sgmllib should allow angle brackets in quoted values
(Modified patch by Sam Ruby: changed to use separate regular expressions for start and end tags, reducing the matching cost for end tags; extended the tests; updated to avoid breaking the earlier changes that support IPv6 addresses in unquoted attribute values.)
This commit is contained in:
parent
960a3f88e5
commit
a136210a9f
|
@ -29,7 +29,12 @@
|
||||||
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
|
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
|
||||||
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
|
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
endbracket = re.compile('[<>]')
|
starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
|
||||||
|
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
|
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
|
||||||
|
r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
|
||||||
|
r')*\s*/?\s*(?=[<>])')
|
||||||
|
endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
|
||||||
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
||||||
attrfind = re.compile(
|
attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
|
@ -249,14 +254,10 @@ def parse_starttag(self, i):
|
||||||
self.finish_shorttag(tag, data)
|
self.finish_shorttag(tag, data)
|
||||||
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
|
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
|
||||||
return k
|
return k
|
||||||
# XXX The following should skip matching quotes (' or ")
|
match = starttag.match(rawdata, i)
|
||||||
# As a shortcut way to exit, this isn't so bad, but shouldn't
|
|
||||||
# be used to locate the actual end of the start tag since the
|
|
||||||
# < or > characters may be embedded in an attribute value.
|
|
||||||
match = endbracket.search(rawdata, i+1)
|
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
j = match.start(0)
|
j = match.end(0)
|
||||||
# Now parse the data between i+1 and j into a tag and attrs
|
# Now parse the data between i+1 and j into a tag and attrs
|
||||||
attrs = []
|
attrs = []
|
||||||
if rawdata[i:i+2] == '<>':
|
if rawdata[i:i+2] == '<>':
|
||||||
|
@ -305,10 +306,10 @@ def _convert_ref(self, match):
|
||||||
# Internal -- parse endtag
|
# Internal -- parse endtag
|
||||||
def parse_endtag(self, i):
|
def parse_endtag(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
match = endbracket.search(rawdata, i+1)
|
match = endtag.match(rawdata, i)
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
j = match.start(0)
|
j = match.end(0)
|
||||||
tag = rawdata[i+2:j].strip().lower()
|
tag = rawdata[i+2:j].strip().lower()
|
||||||
if rawdata[j] == '>':
|
if rawdata[j] == '>':
|
||||||
j = j+1
|
j = j+1
|
||||||
|
|
|
@ -286,6 +286,21 @@ def test_convert_overrides(self):
|
||||||
('codepoint', 'convert', 42),
|
('codepoint', 'convert', 42),
|
||||||
])
|
])
|
||||||
|
|
||||||
|
def test_attr_values_quoted_markup(self):
|
||||||
|
"""Multi-line and markup in attribute values"""
|
||||||
|
self.check_events("""<a title='foo\n<br>bar'>text</a>""",
|
||||||
|
[("starttag", "a", [("title", "foo\n<br>bar")]),
|
||||||
|
("data", "text"),
|
||||||
|
("endtag", "a")])
|
||||||
|
self.check_events("""<a title='less < than'>text</a>""",
|
||||||
|
[("starttag", "a", [("title", "less < than")]),
|
||||||
|
("data", "text"),
|
||||||
|
("endtag", "a")])
|
||||||
|
self.check_events("""<a title='greater > than'>text</a>""",
|
||||||
|
[("starttag", "a", [("title", "greater > than")]),
|
||||||
|
("data", "text"),
|
||||||
|
("endtag", "a")])
|
||||||
|
|
||||||
def test_attr_funky_names(self):
|
def test_attr_funky_names(self):
|
||||||
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
|
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
|
||||||
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
|
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
|
||||||
|
|
Loading…
Reference in New Issue