Update html from 3.13.5 (RustPython#6031)

ShaharNaveh · web-flow · commit 9499d39f55b7 · 2025-07-25T10:58:21.000+09:00
diff --git a/Lib/html/__init__.py b/Lib/html/__init__.py
@@ -25,7 +25,7 @@ def escape(s, quote=True):
     return s
 
 
-# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
+# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
 
 _invalid_charrefs = {
     0x00: '\ufffd',  # REPLACEMENT CHARACTER
diff --git a/Lib/html/entities.py b/Lib/html/entities.py
@@ -3,8 +3,7 @@
 __all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
 
 
-# maps the HTML entity name to the Unicode code point
-# from https://html.spec.whatwg.org/multipage/named-characters.html
+# maps HTML4 entity name to the Unicode code point
 name2codepoint = {
     'AElig':    0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
     'Aacute':   0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
@@ -261,7 +260,11 @@
 }
 
 
-# maps the HTML5 named character references to the equivalent Unicode character(s)
+# HTML5 named character references
+# Generated by Tools/build/parse_html5_entities.py
+# from https://html.spec.whatwg.org/entities.json and
+# https://html.spec.whatwg.org/multipage/named-characters.html.
+# Map HTML5 named character references to the equivalent Unicode character(s).
 html5 = {
     'Aacute': '\xc1',
     'aacute': '\xe1',
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
@@ -12,6 +12,7 @@
 import _markupbase
 
 from html import unescape
+from html.entities import html5 as html5_entities
 
 
 __all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
 
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
 
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
@@ -57,6 +59,22 @@
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# Character reference processing logic specific to attribute values
+# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+def _replace_attr_charref(match):
+    ref = match.group(0)  # whole match: '&', the reference, then an optional ';' or '='
+    # Numeric / hex char refs are always unescaped, whatever follows them
+    if ref.startswith('&#'):
+        return unescape(ref)
+    # A named reference is unescaped only when the text after '&' is itself a
+    # key of html5_entities and the match does not end with an equals sign
+    if not ref.endswith('=') and ref[1:] in html5_entities:
+        return unescape(ref)
+    # Otherwise return the matched text unchanged
+    return ref
+
+def _unescape_attrvalue(s):  # return s with every attr_charref match run through the callback
+    return attr_charref.sub(_replace_attr_charref, s)
 
 
 class HTMLParser(_markupbase.ParserBase):
@@ -89,6 +107,7 @@ def __init__(self, *, convert_charrefs=True):
         If convert_charrefs is True (the default), all character references
         are automatically converted to the corresponding Unicode characters.
         """
+        super().__init__()
         self.convert_charrefs = convert_charrefs
         self.reset()
 
@@ -98,7 +117,7 @@ def reset(self):
         self.lasttag = '???'
         self.interesting = interesting_normal
         self.cdata_elem = None
-        _markupbase.ParserBase.reset(self)
+        super().reset()
 
     def feed(self, data):
         r"""Feed data to the parser.
@@ -241,7 +260,7 @@ def goahead(self, end):
             else:
                 assert 0, "interesting.search() lied"
         # end while
-        if end and i < n and not self.cdata_elem:
+        if end and i < n:
             if self.convert_charrefs and not self.cdata_elem:
                 self.handle_data(unescape(rawdata[i:n]))
             else:
@@ -259,7 +278,7 @@ def parse_html_declaration(self, i):
         if rawdata[i:i+4] == '<!--':
             # this case is actually already handled in goahead()
             return self.parse_comment(i)
-        elif rawdata[i:i+3] == '<![':
+        elif rawdata[i:i+9] == '<![CDATA[':
             return self.parse_marked_section(i)
         elif rawdata[i:i+9].lower() == '<!doctype':
             # find the closing >
@@ -276,7 +295,7 @@ def parse_html_declaration(self, i):
     def parse_bogus_comment(self, i, report=1):
         rawdata = self.rawdata
         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
-                                                'parse_comment()')
+                                                'parse_bogus_comment()')
         pos = rawdata.find('>', i+2)
         if pos == -1:
             return -1
@@ -322,7 +341,7 @@ def parse_starttag(self, i):
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
-                attrvalue = unescape(attrvalue)
+                attrvalue = _unescape_attrvalue(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
@@ -4,6 +4,8 @@
 import pprint
 import unittest
 
+from unittest.mock import patch
+
 
 class EventCollector(html.parser.HTMLParser):
 
@@ -315,6 +317,16 @@ def get_events(self):
                                 ("endtag", element_lower)],
                             collector=Collector(convert_charrefs=False))
 
+    def test_EOF_in_cdata(self):  # input ends while still inside <script>
+        content = """<!-- not a comment --> &not-an-entity-ref;
+                  <a href="" /> </p><p> <span></span></style>
+                  '</script' + '>'"""
+        s = f'<script>{content}'  # no literal '</script>' follows the start tag
+        self._run_check(s, [
+            ("starttag", 'script', []),
+            ("data", content)  # everything after <script> is expected back verbatim as data
+        ])
+
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'
@@ -346,18 +358,16 @@ def test_convert_charrefs(self):
         collector = lambda: EventCollectorCharrefs()
         self.assertTrue(collector().convert_charrefs)
         charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
-        # check charrefs in the middle of the text/attributes
-        expected = [('starttag', 'a', [('href', 'foo"zar')]),
-                    ('data', 'a"z'), ('endtag', 'a')]
+        # check charrefs in the middle of the text
+        expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
         for charref in charrefs:
-            self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
+            self._run_check('<a>a{0}z</a>'.format(charref),
                             expected, collector=collector())
-        # check charrefs at the beginning/end of the text/attributes
-        expected = [('data', '"'),
-                    ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
+        # check charrefs at the beginning/end of the text
+        expected = [('data', '"'), ('starttag', 'a', []),
                     ('data', '"'), ('endtag', 'a'), ('data', '"')]
         for charref in charrefs:
-            self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
+            self._run_check('{0}<a>'
                             '{0}</a>{0}'.format(charref),
                             expected, collector=collector())
         # check charrefs in <script>/<style> elements
@@ -380,6 +390,35 @@ def test_convert_charrefs(self):
         self._run_check('no charrefs here', [('data', 'no charrefs here')],
                         collector=collector())
 
+    def test_convert_charrefs_in_attribute_values(self):
+        # convert_charrefs is expected to default to True; asserted just below
+        collector = lambda: EventCollectorCharrefs()
+        self.assertTrue(collector().convert_charrefs)
+
+        # always unescape terminated entity refs, numeric and hex char refs:
+        # - regardless of whether they are at the start, middle or end of the value
+        # - or followed by alphanumeric, non-alphanumeric, or equals char
+        charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
+                      ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
+                    ('endtag', 'a')]
+        for charref in charrefs:
+            self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                            '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                            .format(charref), expected, collector=collector())
+
+        # only unescape unterminated entity matches if they are not followed by
+        # an alphanumeric or an equals sign
+        charref = '&cent'  # named reference without the terminating ';'
+        expected = [('starttag', 'a',
+                     [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
+                      ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
+                    ('endtag', 'a')]
+        self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
+                        '   x="z{0}z" x="{0} z" x="{0}=z"></a>'
+                        .format(charref), expected, collector=collector())
+
     # the remaining tests were for the "tolerant" parser (which is now
     # the default), and check various kind of broken markup
     def test_tolerant_parsing(self):
@@ -537,52 +576,99 @@ def test_EOF_in_charref(self):
         for html, expected in data:
             self._run_check(html, expected)
 
-    def test_broken_comments(self):
+    def test_EOF_in_comments_or_decls(self):  # unterminated '<!...' constructs at end of input
+        data = [
+            ('<!', [('data', '<!')]),
+            ('<!-', [('data', '<!-')]),
+            ('<!--', [('data', '<!--')]),
+            ('<![', [('data', '<![')]),
+            ('<![CDATA[', [('data', '<![CDATA[')]),
+            ('<![CDATA[x', [('data', '<![CDATA[x')]),
+            ('<!DOCTYPE', [('data', '<!DOCTYPE')]),
+            ('<!DOCTYPE HTML', [('data', '<!DOCTYPE HTML')]),
+        ]
+        for html, expected in data:  # each input is expected back unchanged as data
+            self._run_check(html, expected)
+    def test_bogus_comments(self):
         html = ('<! not really a comment >'
                 '<! not a comment either -->'
                 '<! -- close enough -->'
                 '<!><!<-- this was an empty comment>'
-                '<!!! another bogus comment !!!>')
+                '<!!! another bogus comment !!!>'
+                # '<![' not followed by exactly 'CDATA[' is a bogus comment too (see #32876)
+                '<![with square brackets]!>'
+                '<![\nmultiline\nbogusness\n]!>'
+                '<![more brackets]-[and a hyphen]!>'
+                '<![cdata[should be uppercase]]>'
+                '<![CDATA [whitespaces are not ignored]]>'
+                '<![CDATA]]>'  # required '[' after CDATA
+        )
         expected = [
             ('comment', ' not really a comment '),
             ('comment', ' not a comment either --'),
             ('comment', ' -- close enough --'),
             ('comment', ''),
             ('comment', '<-- this was an empty comment'),
             ('comment', '!! another bogus comment !!!'),
+            ('comment', '[with square brackets]!'),
+            ('comment', '[\nmultiline\nbogusness\n]!'),
+            ('comment', '[more brackets]-[and a hyphen]!'),
+            ('comment', '[cdata[should be uppercase]]'),
+            ('comment', '[CDATA [whitespaces are not ignored]]'),
+            ('comment', '[CDATA]]'),
         ]
         self._run_check(html, expected)
 
     def test_broken_condcoms(self):
         # these condcoms are missing the '--' after '<!' and before the '>'
+        # so they are expected as 'comment' events, square brackets included,
+        # per "8.2.4.42. Markup declaration open state" (bogus comments)
         html = ('<![if !(IE)]>broken condcom<![endif]>'
                 '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
                 '<![if !IE 6]><img src="firefox.png" /><![endif]>'
                 '<![if !ie 6]><b>foo</b><![endif]>'
                 '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
-        # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
-        # and "8.2.4.45 Markup declaration open state", comment tokens should
-        # be emitted instead of 'unknown decl', but calling unknown_decl
-        # provides more flexibility.
-        # See also Lib/_markupbase.py:parse_declaration
         expected = [
-            ('unknown decl', 'if !(IE)'),
+            ('comment', '[if !(IE)]'),
             ('data', 'broken condcom'),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if ! IE'),
+            ('comment', '[endif]'),
+            ('comment', '[if ! IE]'),
             ('startendtag', 'link', [('href', 'favicon.tiff')]),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if !IE 6'),
+            ('comment', '[endif]'),
+            ('comment', '[if !IE 6]'),
             ('startendtag', 'img', [('src', 'firefox.png')]),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if !ie 6'),
+            ('comment', '[endif]'),
+            ('comment', '[if !ie 6]'),
             ('starttag', 'b', []),
             ('data', 'foo'),
             ('endtag', 'b'),
-            ('unknown decl', 'endif'),
-            ('unknown decl', 'if (!IE)|(lt IE 9)'),
+            ('comment', '[endif]'),
+            ('comment', '[if (!IE)|(lt IE 9)]'),
             ('startendtag', 'img', [('src', 'mammoth.bmp')]),
-            ('unknown decl', 'endif')
+            ('comment', '[endif]')
+        ]
+        self._run_check(html, expected)
+
+    def test_cdata_declarations(self):
+        # More tests should be added. See also "8.2.4.42. Markup
+        # declaration open state", "8.2.4.69. CDATA section state",
+        # and issue 32876
+        html = ('<![CDATA[just some plain text]]>')
+        expected = [('unknown decl', 'CDATA[just some plain text')]  # closing ']]>' is not part of the reported text
+        self._run_check(html, expected)
+
+    def test_cdata_declarations_multiline(self):  # tag-like text inside CDATA must not produce tag events
+        html = ('<code><![CDATA['  # NOTE: adjacent literals are concatenated; the input has no newline characters
+                '    if (a < b && a > b) {'
+                '        printf("[<marquee>How?</marquee>]");'
+                '    }'
+                ']]></code>')
+        expected = [
+            ('starttag', 'code', []),
+            ('unknown decl',
+             'CDATA[    if (a < b && a > b) {        '
+             'printf("[<marquee>How?</marquee>]");    }'),
+            ('endtag', 'code')
         ]
         self._run_check(html, expected)
 
@@ -787,5 +873,17 @@ def test_weird_chars_in_unquoted_attribute_values(self):
                             ('starttag', 'form',
                                 [('action', 'bogus|&#()value')])])
 
+
+class TestInheritance(unittest.TestCase):  # checks that the parser constructor reaches ParserBase.__init__ and reset
+
+    @patch("_markupbase.ParserBase.__init__")
+    @patch("_markupbase.ParserBase.reset")  # bottom decorator supplies the first mock argument
+    def test_base_class_methods_called(self, super_reset_method, super_init_method):
+        with patch('_markupbase.ParserBase') as parser_base:  # NOTE(review): parser_base is unused; presumably this hides the module-level name so only super() calls reach the mocks - confirm
+            EventCollector()
+            super_init_method.assert_called_once()
+            super_reset_method.assert_called_once()
+
+
 if __name__ == "__main__":
     unittest.main()