Skip to content

Commit 9499d39

Browse files
authored
Update html from 3.13.5 (RustPython#6031)
1 parent 6a9579e commit 9499d39

File tree

4 files changed: +154 additions, -34 deletions (lines changed)

Lib/html/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ def escape(s, quote=True):
2525
return s
2626

2727

28-
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
28+
# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
2929

3030
_invalid_charrefs = {
3131
0x00: '\ufffd', # REPLACEMENT CHARACTER

Lib/html/entities.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
44

55

6-
# maps the HTML entity name to the Unicode code point
7-
# from https://html.spec.whatwg.org/multipage/named-characters.html
6+
# maps HTML4 entity name to the Unicode code point
87
name2codepoint = {
98
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
109
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
@@ -261,7 +260,11 @@
261260
}
262261

263262

264-
# maps the HTML5 named character references to the equivalent Unicode character(s)
263+
# HTML5 named character references
264+
# Generated by Tools/build/parse_html5_entities.py
265+
# from https://html.spec.whatwg.org/entities.json and
266+
# https://html.spec.whatwg.org/multipage/named-characters.html.
267+
# Map HTML5 named character references to the equivalent Unicode character(s).
265268
html5 = {
266269
'Aacute': '\xc1',
267270
'aacute': '\xe1',

Lib/html/parser.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import _markupbase
1313

1414
from html import unescape
15+
from html.entities import html5 as html5_entities
1516

1617

1718
__all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
2324

2425
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2526
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2628

2729
starttagopen = re.compile('<[a-zA-Z]')
2830
piclose = re.compile('>')
@@ -57,6 +59,22 @@
5759
# </ and the tag name, so maybe this should be fixed
5860
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
5961

62+
# Character reference processing logic specific to attribute values
63+
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
64+
def _replace_attr_charref(match):
65+
ref = match.group(0)
66+
# Numeric / hex char refs must always be unescaped
67+
if ref.startswith('&#'):
68+
return unescape(ref)
69+
# Named character / entity references must only be unescaped
70+
# if they are an exact match, and they are not followed by an equals sign
71+
if not ref.endswith('=') and ref[1:] in html5_entities:
72+
return unescape(ref)
73+
# Otherwise do not unescape
74+
return ref
75+
76+
def _unescape_attrvalue(s):
77+
return attr_charref.sub(_replace_attr_charref, s)
6078

6179

6280
class HTMLParser(_markupbase.ParserBase):
@@ -89,6 +107,7 @@ def __init__(self, *, convert_charrefs=True):
89107
If convert_charrefs is True (the default), all character references
90108
are automatically converted to the corresponding Unicode characters.
91109
"""
110+
super().__init__()
92111
self.convert_charrefs = convert_charrefs
93112
self.reset()
94113

@@ -98,7 +117,7 @@ def reset(self):
98117
self.lasttag = '???'
99118
self.interesting = interesting_normal
100119
self.cdata_elem = None
101-
_markupbase.ParserBase.reset(self)
120+
super().reset()
102121

103122
def feed(self, data):
104123
r"""Feed data to the parser.
@@ -241,7 +260,7 @@ def goahead(self, end):
241260
else:
242261
assert 0, "interesting.search() lied"
243262
# end while
244-
if end and i < n and not self.cdata_elem:
263+
if end and i < n:
245264
if self.convert_charrefs and not self.cdata_elem:
246265
self.handle_data(unescape(rawdata[i:n]))
247266
else:
@@ -259,7 +278,7 @@ def parse_html_declaration(self, i):
259278
if rawdata[i:i+4] == '<!--':
260279
# this case is actually already handled in goahead()
261280
return self.parse_comment(i)
262-
elif rawdata[i:i+3] == '<![':
281+
elif rawdata[i:i+9] == '<![CDATA[':
263282
return self.parse_marked_section(i)
264283
elif rawdata[i:i+9].lower() == '<!doctype':
265284
# find the closing >
@@ -276,7 +295,7 @@ def parse_html_declaration(self, i):
276295
def parse_bogus_comment(self, i, report=1):
277296
rawdata = self.rawdata
278297
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
279-
'parse_comment()')
298+
'parse_bogus_comment()')
280299
pos = rawdata.find('>', i+2)
281300
if pos == -1:
282301
return -1
@@ -322,7 +341,7 @@ def parse_starttag(self, i):
322341
attrvalue[:1] == '"' == attrvalue[-1:]:
323342
attrvalue = attrvalue[1:-1]
324343
if attrvalue:
325-
attrvalue = unescape(attrvalue)
344+
attrvalue = _unescape_attrvalue(attrvalue)
326345
attrs.append((attrname.lower(), attrvalue))
327346
k = m.end()
328347

Lib/test/test_htmlparser.py

Lines changed: 123 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import pprint
55
import unittest
66

7+
from unittest.mock import patch
8+
79

810
class EventCollector(html.parser.HTMLParser):
911

@@ -315,6 +317,16 @@ def get_events(self):
315317
("endtag", element_lower)],
316318
collector=Collector(convert_charrefs=False))
317319

320+
def test_EOF_in_cdata(self):
321+
content = """<!-- not a comment --> &not-an-entity-ref;
322+
<a href="" /> </p><p> <span></span></style>
323+
'</script' + '>'"""
324+
s = f'<script>{content}'
325+
self._run_check(s, [
326+
("starttag", 'script', []),
327+
("data", content)
328+
])
329+
318330
def test_comments(self):
319331
html = ("<!-- I'm a valid comment -->"
320332
'<!--me too!-->'
@@ -346,18 +358,16 @@ def test_convert_charrefs(self):
346358
collector = lambda: EventCollectorCharrefs()
347359
self.assertTrue(collector().convert_charrefs)
348360
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
349-
# check charrefs in the middle of the text/attributes
350-
expected = [('starttag', 'a', [('href', 'foo"zar')]),
351-
('data', 'a"z'), ('endtag', 'a')]
361+
# check charrefs in the middle of the text
362+
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
352363
for charref in charrefs:
353-
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
364+
self._run_check('<a>a{0}z</a>'.format(charref),
354365
expected, collector=collector())
355-
# check charrefs at the beginning/end of the text/attributes
356-
expected = [('data', '"'),
357-
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
366+
# check charrefs at the beginning/end of the text
367+
expected = [('data', '"'), ('starttag', 'a', []),
358368
('data', '"'), ('endtag', 'a'), ('data', '"')]
359369
for charref in charrefs:
360-
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
370+
self._run_check('{0}<a>'
361371
'{0}</a>{0}'.format(charref),
362372
expected, collector=collector())
363373
# check charrefs in <script>/<style> elements
@@ -380,6 +390,35 @@ def test_convert_charrefs(self):
380390
self._run_check('no charrefs here', [('data', 'no charrefs here')],
381391
collector=collector())
382392

393+
def test_convert_charrefs_in_attribute_values(self):
394+
# default value for convert_charrefs is now True
395+
collector = lambda: EventCollectorCharrefs()
396+
self.assertTrue(collector().convert_charrefs)
397+
398+
# always unescape terminated entity refs, numeric and hex char refs:
399+
# - regardless of whether they are at the start, middle, or end of the attribute
400+
# - or followed by alphanumeric, non-alphanumeric, or equals char
401+
charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
402+
expected = [('starttag', 'a',
403+
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
404+
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
405+
('endtag', 'a')]
406+
for charref in charrefs:
407+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
408+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
409+
.format(charref), expected, collector=collector())
410+
411+
# only unescape unterminated entity matches if they are not followed by
412+
# an alphanumeric or an equals sign
413+
charref = '&cent'
414+
expected = [('starttag', 'a',
415+
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
416+
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
417+
('endtag', 'a')]
418+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
419+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
420+
.format(charref), expected, collector=collector())
421+
383422
# the remaining tests were for the "tolerant" parser (which is now
384423
# the default), and check various kinds of broken markup
385424
def test_tolerant_parsing(self):
@@ -537,52 +576,99 @@ def test_EOF_in_charref(self):
537576
for html, expected in data:
538577
self._run_check(html, expected)
539578

540-
def test_broken_comments(self):
579+
def test_EOF_in_comments_or_decls(self):
580+
data = [
581+
('<!', [('data', '<!')]),
582+
('<!-', [('data', '<!-')]),
583+
('<!--', [('data', '<!--')]),
584+
('<![', [('data', '<![')]),
585+
('<![CDATA[', [('data', '<![CDATA[')]),
586+
('<![CDATA[x', [('data', '<![CDATA[x')]),
587+
('<!DOCTYPE', [('data', '<!DOCTYPE')]),
588+
('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
589+
]
590+
for html, expected in data:
591+
self._run_check(html, expected)
592+
def test_bogus_comments(self):
541593
html = ('<! not really a comment >'
542594
'<! not a comment either -->'
543595
'<! -- close enough -->'
544596
'<!><!<-- this was an empty comment>'
545-
'<!!! another bogus comment !!!>')
597+
'<!!! another bogus comment !!!>'
598+
# see #32876
599+
'<![with square brackets]!>'
600+
'<![\nmultiline\nbogusness\n]!>'
601+
'<![more brackets]-[and a hyphen]!>'
602+
'<![cdata[should be uppercase]]>'
603+
'<![CDATA [whitespaces are not ignored]]>'
604+
'<![CDATA]]>' # required '[' after CDATA
605+
)
546606
expected = [
547607
('comment', ' not really a comment '),
548608
('comment', ' not a comment either --'),
549609
('comment', ' -- close enough --'),
550610
('comment', ''),
551611
('comment', '<-- this was an empty comment'),
552612
('comment', '!! another bogus comment !!!'),
613+
('comment', '[with square brackets]!'),
614+
('comment', '[\nmultiline\nbogusness\n]!'),
615+
('comment', '[more brackets]-[and a hyphen]!'),
616+
('comment', '[cdata[should be uppercase]]'),
617+
('comment', '[CDATA [whitespaces are not ignored]]'),
618+
('comment', '[CDATA]]'),
553619
]
554620
self._run_check(html, expected)
555621

556622
def test_broken_condcoms(self):
557623
# these condcoms are missing the '--' after '<!' and before the '>'
624+
# and they are considered bogus comments according to
625+
# "8.2.4.42. Markup declaration open state"
558626
html = ('<![if !(IE)]>broken condcom<![endif]>'
559627
'<![if ! IE]><link href="favicon.tiff"/><![endif]>'
560628
'<![if !IE 6]><img src="firefox.png" /><![endif]>'
561629
'<![if !ie 6]><b>foo</b><![endif]>'
562630
'<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
563-
# According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
564-
# and "8.2.4.45 Markup declaration open state", comment tokens should
565-
# be emitted instead of 'unknown decl', but calling unknown_decl
566-
# provides more flexibility.
567-
# See also Lib/_markupbase.py:parse_declaration
568631
expected = [
569-
('unknown decl', 'if !(IE)'),
632+
('comment', '[if !(IE)]'),
570633
('data', 'broken condcom'),
571-
('unknown decl', 'endif'),
572-
('unknown decl', 'if ! IE'),
634+
('comment', '[endif]'),
635+
('comment', '[if ! IE]'),
573636
('startendtag', 'link', [('href', 'favicon.tiff')]),
574-
('unknown decl', 'endif'),
575-
('unknown decl', 'if !IE 6'),
637+
('comment', '[endif]'),
638+
('comment', '[if !IE 6]'),
576639
('startendtag', 'img', [('src', 'firefox.png')]),
577-
('unknown decl', 'endif'),
578-
('unknown decl', 'if !ie 6'),
640+
('comment', '[endif]'),
641+
('comment', '[if !ie 6]'),
579642
('starttag', 'b', []),
580643
('data', 'foo'),
581644
('endtag', 'b'),
582-
('unknown decl', 'endif'),
583-
('unknown decl', 'if (!IE)|(lt IE 9)'),
645+
('comment', '[endif]'),
646+
('comment', '[if (!IE)|(lt IE 9)]'),
584647
('startendtag', 'img', [('src', 'mammoth.bmp')]),
585-
('unknown decl', 'endif')
648+
('comment', '[endif]')
649+
]
650+
self._run_check(html, expected)
651+
652+
def test_cdata_declarations(self):
653+
# More tests should be added. See also "8.2.4.42. Markup
654+
# declaration open state", "8.2.4.69. CDATA section state",
655+
# and issue 32876
656+
html = ('<![CDATA[just some plain text]]>')
657+
expected = [('unknown decl', 'CDATA[just some plain text')]
658+
self._run_check(html, expected)
659+
660+
def test_cdata_declarations_multiline(self):
661+
html = ('<code><![CDATA['
662+
' if (a < b && a > b) {'
663+
' printf("[<marquee>How?</marquee>]");'
664+
' }'
665+
']]></code>')
666+
expected = [
667+
('starttag', 'code', []),
668+
('unknown decl',
669+
'CDATA[ if (a < b && a > b) { '
670+
'printf("[<marquee>How?</marquee>]"); }'),
671+
('endtag', 'code')
586672
]
587673
self._run_check(html, expected)
588674

@@ -787,5 +873,17 @@ def test_weird_chars_in_unquoted_attribute_values(self):
787873
('starttag', 'form',
788874
[('action', 'bogus|&#()value')])])
789875

876+
877+
class TestInheritance(unittest.TestCase):
878+
879+
@patch("_markupbase.ParserBase.__init__")
880+
@patch("_markupbase.ParserBase.reset")
881+
def test_base_class_methods_called(self, super_reset_method, super_init_method):
882+
with patch('_markupbase.ParserBase') as parser_base:
883+
EventCollector()
884+
super_init_method.assert_called_once()
885+
super_reset_method.assert_called_once()
886+
887+
790888
if __name__ == "__main__":
791889
unittest.main()

0 commit comments

Comments
 (0)