Skip to content

Commit 1904262

Browse files
[3.12] gh-135661: Fix CDATA section parsing in HTMLParser (GH-135665)
"] ]>" and "]] >" no longer end the CDATA section. Make CDATA section parsing context-dependent. Add private method HTMLParser._set_support_cdata() to change the context. If called with True, "<![CDATA[" starts a CDATA section which ends with "]]>". If called with False, "<![CDATA[" starts a bogus comment which ends with ">". (cherry picked from commit 0cbbfc4) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent c9d9f78 commit 1904262

File tree

3 files changed

+104
-8
lines changed

3 files changed

+104
-8
lines changed

Lib/html/parser.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def reset(self):
128128
self.lasttag = '???'
129129
self.interesting = interesting_normal
130130
self.cdata_elem = None
131+
self._support_cdata = True
131132
self._escapable = True
132133
super().reset()
133134

@@ -165,6 +166,19 @@ def clear_cdata_mode(self):
165166
self.cdata_elem = None
166167
self._escapable = True
167168

169+
def _set_support_cdata(self, flag=True):
170+
"""Enable or disable support of the CDATA sections.
171+
If enabled, "<![CDATA[" starts a CDATA section which ends with "]]>".
172+
If disabled, "<![CDATA[" starts a bogus comment which ends with ">".
173+
174+
This method is not called by default. Its purpose is to be called
175+
in custom handle_starttag() and handle_endtag() methods, with
176+
a value that depends on the adjusted current node.
177+
See https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
178+
for details.
179+
"""
180+
self._support_cdata = flag
181+
168182
# Internal -- handle data as far as reasonable. May leave state
169183
# and data to be processed by a subsequent call. If 'end' is
170184
# true, force handling all data as if followed by EOF marker.
@@ -239,7 +253,7 @@ def goahead(self, end):
239253
j -= len(suffix)
240254
break
241255
self.handle_comment(rawdata[i+4:j])
242-
elif startswith("<![CDATA[", i):
256+
elif startswith("<![CDATA[", i) and self._support_cdata:
243257
self.unknown_decl(rawdata[i+3:])
244258
elif rawdata[i:i+9].lower() == '<!doctype':
245259
self.handle_decl(rawdata[i+2:])
@@ -315,15 +329,28 @@ def parse_html_declaration(self, i):
315329
if rawdata[i:i+4] == '<!--':
316330
# this case is actually already handled in goahead()
317331
return self.parse_comment(i)
318-
elif rawdata[i:i+3] == '<![':
319-
return self.parse_marked_section(i)
332+
elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
333+
j = rawdata.find(']]>', i+9)
334+
if j < 0:
335+
return -1
336+
self.unknown_decl(rawdata[i+3: j])
337+
return j + 3
320338
elif rawdata[i:i+9].lower() == '<!doctype':
321339
# find the closing >
322340
gtpos = rawdata.find('>', i+9)
323341
if gtpos == -1:
324342
return -1
325343
self.handle_decl(rawdata[i+2:gtpos])
326344
return gtpos+1
345+
elif rawdata[i:i+3] == '<![':
346+
j = rawdata.find('>', i+3)
347+
if j < 0:
348+
return -1
349+
if rawdata[j-1] == ']':
350+
self.unknown_decl(rawdata[i+3: j-1])
351+
else:
352+
self.handle_comment(rawdata[i+2: j])
353+
return j + 1
327354
else:
328355
return self.parse_bogus_comment(i)
329356

Lib/test/test_htmlparser.py

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010

1111
class EventCollector(html.parser.HTMLParser):
1212

13-
def __init__(self, *args, **kw):
13+
def __init__(self, *args, autocdata=False, **kw):
14+
self.autocdata = autocdata
1415
self.events = []
1516
self.append = self.events.append
1617
html.parser.HTMLParser.__init__(self, *args, **kw)
18+
if autocdata:
19+
self._set_support_cdata(False)
1720

1821
def get_events(self):
1922
# Normalize the list of events so that buffer artefacts don't
@@ -34,12 +37,16 @@ def get_events(self):
3437

3538
def handle_starttag(self, tag, attrs):
3639
self.append(("starttag", tag, attrs))
40+
if self.autocdata and tag == 'svg':
41+
self._set_support_cdata(True)
3742

3843
def handle_startendtag(self, tag, attrs):
3944
self.append(("startendtag", tag, attrs))
4045

4146
def handle_endtag(self, tag):
4247
self.append(("endtag", tag))
48+
if self.autocdata and tag == 'svg':
49+
self._set_support_cdata(False)
4350

4451
# all other markup
4552

@@ -740,10 +747,6 @@ def test_eof_in_declarations(self):
740747
('<!', [('comment', '')]),
741748
('<!-', [('comment', '-')]),
742749
('<![', [('comment', '[')]),
743-
('<![CDATA[', [('unknown decl', 'CDATA[')]),
744-
('<![CDATA[x', [('unknown decl', 'CDATA[x')]),
745-
('<![CDATA[x]', [('unknown decl', 'CDATA[x]')]),
746-
('<![CDATA[x]]', [('unknown decl', 'CDATA[x]]')]),
747750
('<!DOCTYPE', [('decl', 'DOCTYPE')]),
748751
('<!DOCTYPE ', [('decl', 'DOCTYPE ')]),
749752
('<!DOCTYPE html', [('decl', 'DOCTYPE html')]),
@@ -756,6 +759,18 @@ def test_eof_in_declarations(self):
756759
for html, expected in data:
757760
self._run_check(html, expected)
758761

762+
@support.subTests('content', ['', 'x', 'x]', 'x]]'])
763+
def test_eof_in_cdata(self, content):
764+
self._run_check('<![CDATA[' + content,
765+
[('unknown decl', 'CDATA[' + content)])
766+
self._run_check('<![CDATA[' + content,
767+
[('comment', '[CDATA[' + content)],
768+
collector=EventCollector(autocdata=True))
769+
self._run_check('<svg><text y="100"><![CDATA[' + content,
770+
[('starttag', 'svg', []),
771+
('starttag', 'text', [('y', '100')]),
772+
('unknown decl', 'CDATA[' + content)])
773+
759774
def test_bogus_comments(self):
760775
html = ('<!ELEMENT br EMPTY>'
761776
'<! not really a comment >'
@@ -805,8 +820,57 @@ def test_broken_condcoms(self):
805820
('startendtag', 'img', [('src', 'mammoth.bmp')]),
806821
('unknown decl', 'endif')
807822
]
823+
808824
self._run_check(html, expected)
809825

826+
@support.subTests('content', [
827+
'just some plain text',
828+
'<!-- not a comment -->',
829+
'&not-an-entity-ref;',
830+
"<not a='start tag'>",
831+
'',
832+
'[[I have many brackets]]',
833+
'I have a > in the middle',
834+
'I have a ]] in the middle',
835+
'] ]>',
836+
']] >',
837+
('\n'
838+
' if (a < b && a > b) {\n'
839+
' printf("[<marquee>How?</marquee>]");\n'
840+
' }\n'),
841+
])
842+
def test_cdata_section_content(self, content):
843+
# See "13.2.5.42 Markup declaration open state",
844+
# "13.2.5.69 CDATA section state", and issue bpo-32876.
845+
html = f'<svg><text y="100"><![CDATA[{content}]]></text></svg>'
846+
expected = [
847+
('starttag', 'svg', []),
848+
('starttag', 'text', [('y', '100')]),
849+
('unknown decl', 'CDATA[' + content),
850+
('endtag', 'text'),
851+
('endtag', 'svg'),
852+
]
853+
self._run_check(html, expected)
854+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
855+
856+
def test_cdata_section(self):
857+
# See "13.2.5.42 Markup declaration open state".
858+
html = ('<![CDATA[foo<br>bar]]>'
859+
'<svg><text y="100"><![CDATA[foo<br>bar]]></text></svg>'
860+
'<![CDATA[foo<br>bar]]>')
861+
expected = [
862+
('comment', '[CDATA[foo<br'),
863+
('data', 'bar]]>'),
864+
('starttag', 'svg', []),
865+
('starttag', 'text', [('y', '100')]),
866+
('unknown decl', 'CDATA[foo<br>bar'),
867+
('endtag', 'text'),
868+
('endtag', 'svg'),
869+
('comment', '[CDATA[foo<br'),
870+
('data', 'bar]]>'),
871+
]
872+
self._run_check(html, expected, collector=EventCollector(autocdata=True))
873+
810874
def test_convert_charrefs_dropped_text(self):
811875
# #23144: make sure that all the events are triggered when
812876
# convert_charrefs is True, even if we don't call .close()
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to
2+
the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section.
3+
Add private method ``_set_support_cdata()`` which can be used to specify
4+
how to parse ``<![CDATA[`` --- as a CDATA section in foreign content
5+
(SVG or MathML) or as a bogus comment in the HTML namespace.

0 commit comments

Comments
 (0)