Skip to content

Commit a335967

Browse files
[3.13] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665) (GH-137773)
"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">". (cherry picked from commit 0cbbfc4) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 46b2577 commit a335967

File tree

3 files changed

+94
-25
lines changed

3 files changed

+94
-25
lines changed

Lib/html/parser.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def reset(self):
146146
self.lasttag = '???'
147147
self.interesting = interesting_normal
148148
self.cdata_elem = None
149+
self._support_cdata = True
149150
self._escapable = True
150151
super().reset()
151152

@@ -183,6 +184,19 @@ def clear_cdata_mode(self):
183184
self.cdata_elem = None
184185
self._escapable = True
185186

187+
def _set_support_cdata(self, flag=True):
188+
"""Enable or disable support of the CDATA sections.
189+
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
190+
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
191+
192+
This method is not called by default. Its purpose is to be called
193+
in custom handle_starttag() and handle_endtag() methods, with
194+
value that depends on the adjusted current node.
195+
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
196+
for details.
197+
"""
198+
self._support_cdata = flag
199+
186200
# Internal -- handle data as far as reasonable. May leave state
187201
# and data to be processed by a subsequent call. If 'end' is
188202
# true, force handling all data as if followed by EOF marker.
@@ -258,7 +272,10 @@ def goahead(self, end):
258272
break
259273
self.handle_comment(rawdata[i+4:j])
260274
elif startswith("<![CDATA[", i):
261-
self.unknown_decl(rawdata[i+3:])
275+
if self._support_cdata:
276+
self.unknown_decl(rawdata[i+3:])
277+
else:
278+
self.handle_comment(rawdata[i+1:])
262279
elif rawdata[i:i+9].lower() == '<!doctype':
263280
self.handle_decl(rawdata[i+2:])
264281
elif startswith("<!", i):
@@ -334,7 +351,14 @@ def parse_html_declaration(self, i):
334351
# this case is actually already handled in goahead()
335352
return self.parse_comment(i)
336353
elif rawdata[i:i+9] == '<![CDATA[':
337-
return self.parse_marked_section(i)
354+
if self._support_cdata:
355+
j = rawdata.find(']]>', i+9)
356+
if j < 0:
357+
return -1
358+
self.unknown_decl(rawdata[i+3: j])
359+
return j + 3
360+
else:
361+
return self.parse_bogus_comment(i)
338362
elif rawdata[i:i+9].lower() == '<!doctype':
339363
# find the closing >
340364
gtpos = rawdata.find('>', i+9)

Lib/test/test_htmlparser.py

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010

1111
class EventCollector(html.parser.HTMLParser):
1212

13-
def __init__(self, *args, **kw):
13+
def __init__(self, *args, autocdata=False, **kw):
14+
self.autocdata = autocdata
1415
self.events = []
1516
self.append = self.events.append
1617
html.parser.HTMLParser.__init__(self, *args, **kw)
18+
if autocdata:
19+
self._set_support_cdata(False)
1720

1821
def get_events(self):
1922
# Normalize the list of events so that buffer artefacts don't
@@ -34,12 +37,16 @@ def get_events(self):
3437

3538
def handle_starttag(self, tag, attrs):
3639
self.append(("starttag", tag, attrs))
40+
if self.autocdata and tag == 'svg':
41+
self._set_support_cdata(True)
3742

3843
def handle_startendtag(self, tag, attrs):
3944
self.append(("startendtag", tag, attrs))
4045

4146
def handle_endtag(self, tag):
4247
self.append(("endtag", tag))
48+
if self.autocdata and tag == 'svg':
49+
self._set_support_cdata(False)
4350

4451
# all other markup
4552

@@ -767,10 +774,6 @@ def test_eof_in_declarations(self):
767774
('<!', [('comment', '')]),
768775
('<!-', [('comment', '-')]),
769776
('<![', [('comment', '[')]),
770-
('<![CDATA[', [('unknown decl', 'CDATA[')]),
771-
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
772-
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
773-
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
774777
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
775778
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
776779
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -783,6 +786,18 @@ def test_eof_in_declarations(self):
783786
for html, expected in data:
784787
self._run_check(html, expected)
785788

789+
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
790+
def test_eof_in_cdata(self, content):
791+
self._run_check('<![CDATA[' + content,
792+
[('unknown decl', 'CDATA[' + content)])
793+
self._run_check('<![CDATA[' + content,
794+
[('comment', '![CDATA[' + content)],
795+
collector=EventCollector(autocdata=True))
796+
self._run_check('<svg><text y="100"><![CDATA[' + content,
797+
[('starttag', 'svg', []),
798+
('starttag', 'text', [('y', '100')]),
799+
('unknown decl', 'CDATA[' + content)])
800+
786801
def test_bogus_comments(self):
787802
html = ('<!ELEMENT br EMPTY>'
788803
'<! not really a comment >'
@@ -845,28 +860,53 @@ def test_broken_condcoms(self):
845860
]
846861
self._run_check(html, expected)
847862

848-
def test_cdata_declarations(self):
849-
# More tests should be added. See also "8.2.4.42. Markup
850-
# declaration open state", "8.2.4.69. CDATA section state",
851-
# and issue 32876
852-
html = ('<![CDATA[just some plain text]]>')
853-
expected = [('unknown decl', 'CDATA[just some plain text')]
863+
@support.subTests('content', [
864+
'just some plain text',
865+
'<!-- not a comment -->',
866+
'&not-an-entity-ref;',
867+
"<not a='start tag'>",
868+
'',
869+
'[[I have many brackets]]',
870+
'I have a > in the middle',
871+
'I have a ]] in the middle',
872+
'] ]>',
873+
']] >',
874+
('\n'
875+
' if (a < b && a > b) {\n'
876+
' printf("[<marquee>How?</marquee>]");\n'
877+
' }\n'),
878+
])
879+
def test_cdata_section_content(self, content):
880+
# See "13.2.5.42 Markup declaration open state",
881+
# "13.2.5.69 CDATA section state", and issue bpo-32876.
882+
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
883+
expected = [
884+
('starttag', 'svg', []),
885+
('starttag', 'text', [('y', '100')]),
886+
('unknown decl', 'CDATA[' + content),
887+
('endtag', 'text'),
888+
('endtag', 'svg'),
889+
]
854890
self._run_check(html, expected)
891+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
855892

856-
def test_cdata_declarations_multiline(self):
857-
html = ('<code><![CDATA['
858-
' if (a < b && a > b) {'
859-
' printf("[<marquee>How?</marquee>]");'
860-
' }'
861-
']]></code>')
893+
def test_cdata_section(self):
894+
# See "13.2.5.42 Markup declaration open state".
895+
html = ('<![CDATA[foo<br>bar]]>'
896+
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
897+
'<![CDATA[foo<br>bar]]>')
862898
expected = [
863-
('starttag', 'code', []),
864-
('unknown decl',
865-
'CDATA[ if (a < b && a > b) { '
866-
'printf("[<marquee>How?</marquee>]"); }'),
867-
('endtag', 'code')
899+
('comment', '[CDATA[foo<br'),
900+
('data', 'bar]]>'),
901+
('starttag', 'svg', []),
902+
('starttag', 'text', [('y', '100')]),
903+
('unknown decl', 'CDATA[foo<br>bar'),
904+
('endtag', 'text'),
905+
('endtag', 'svg'),
906+
('comment', '[CDATA[foo<br'),
907+
('data', 'bar]]>'),
868908
]
869-
self._run_check(html, expected)
909+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
870910

871911
def test_convert_charrefs_dropped_text(self):
872912
# #23144: make sure that all the events are triggered when
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
2+
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
3+
Add private method ``_set_support_cdata()`` which can be used to specify
4+
how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
5+
(SVG or MathML) or as a bogus comment in the HTML namespace.

0 commit comments

Comments
 (0)