Skip to content

Commit 4ca65bc

Browse files
timonviola, serhiy-storchaka, ambv
committed
[3.11] pythongh-118350: Fix support of elements "textarea" and "title" in HTMLParser (pythonGH-135310)
(cherry picked from commit 4d02f31)
Co-authored-by: Timon Viola <[email protected]>
Co-authored-by: Serhiy Storchaka <[email protected]>
Co-authored-by: Łukasz Langa <[email protected]>
1 parent 3511c2e commit 4ca65bc

File tree

3 files changed

+113
-5
lines changed

3 files changed

+113
-5
lines changed

Lib/html/parser.py

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,7 @@ class HTMLParser(_markupbase.ParserBase):
110110
"""
111111

112112
CDATA_CONTENT_ELEMENTS = ("script", "style")
113+
RCDATA_CONTENT_ELEMENTS = ("textarea", "title")
113114

114115
def __init__(self, *, convert_charrefs=True):
115116
"""Initialize and reset this instance.
@@ -126,6 +127,7 @@ def reset(self):
126127
self.lasttag = '???'
127128
self.interesting = interesting_normal
128129
self.cdata_elem = None
130+
self._escapable = True
129131
_markupbase.ParserBase.reset(self)
130132

131133
def feed(self, data):
@@ -147,14 +149,20 @@ def get_starttag_text(self):
147149
"""Return full source of start tag: '<...>'."""
148150
return self.__starttag_text
149151

150-
def set_cdata_mode(self, elem):
152+
def set_cdata_mode(self, elem, *, escapable=False):
151153
self.cdata_elem = elem.lower()
152-
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
153-
re.IGNORECASE|re.ASCII)
154+
self._escapable = escapable
155+
if escapable and not self.convert_charrefs:
156+
self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
157+
re.IGNORECASE|re.ASCII)
158+
else:
159+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
160+
re.IGNORECASE|re.ASCII)
154161

155162
def clear_cdata_mode(self):
156163
self.interesting = interesting_normal
157164
self.cdata_elem = None
165+
self._escapable = True
158166

159167
# Internal -- handle data as far as reasonable. May leave state
160168
# and data to be processed by a subsequent call. If 'end' is
@@ -187,7 +195,7 @@ def goahead(self, end):
187195
break
188196
j = n
189197
if i < j:
190-
if self.convert_charrefs and not self.cdata_elem:
198+
if self.convert_charrefs and self._escapable:
191199
self.handle_data(unescape(rawdata[i:j]))
192200
else:
193201
self.handle_data(rawdata[i:j])
@@ -289,7 +297,7 @@ def goahead(self, end):
289297
assert 0, "interesting.search() lied"
290298
# end while
291299
if end and i < n:
292-
if self.convert_charrefs and not self.cdata_elem:
300+
if self.convert_charrefs and self._escapable:
293301
self.handle_data(unescape(rawdata[i:n]))
294302
else:
295303
self.handle_data(rawdata[i:n])
@@ -401,6 +409,8 @@ def parse_starttag(self, i):
401409
self.handle_starttag(tag, attrs)
402410
if tag in self.CDATA_CONTENT_ELEMENTS:
403411
self.set_cdata_mode(tag)
412+
elif tag in self.RCDATA_CONTENT_ELEMENTS:
413+
self.set_cdata_mode(tag, escapable=True)
404414
return endpos
405415

406416
# Internal -- check to see if we have a complete starttag; return end

Lib/test/test_htmlparser.py

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,49 @@ def test_style_content(self, content):
316316
("data", content),
317317
("endtag", "style")])
318318

319+
@support.subTests('content', [
320+
'<!-- not a comment -->',
321+
"<not a='start tag'>",
322+
'<![CDATA[not a cdata]]>',
323+
'<!not a bogus comment>',
324+
'</not a bogus comment>',
325+
'\u2603',
326+
'< /title>',
327+
'</ title>',
328+
'</titled>',
329+
'</title\v>',
330+
'</title\xa0>',
331+
'</tıtle>',
332+
])
333+
def test_title_content(self, content):
334+
source = f"<title>{content}</title>"
335+
self._run_check(source, [
336+
("starttag", "title", []),
337+
("data", content),
338+
("endtag", "title"),
339+
])
340+
341+
@support.subTests('content', [
342+
'<!-- not a comment -->',
343+
"<not a='start tag'>",
344+
'<![CDATA[not a cdata]]>',
345+
'<!not a bogus comment>',
346+
'</not a bogus comment>',
347+
'\u2603',
348+
'< /textarea>',
349+
'</ textarea>',
350+
'</textareable>',
351+
'</textarea\v>',
352+
'</textarea\xa0>',
353+
])
354+
def test_textarea_content(self, content):
355+
source = f"<textarea>{content}</textarea>"
356+
self._run_check(source, [
357+
("starttag", "textarea", []),
358+
("data", content),
359+
("endtag", "textarea"),
360+
])
361+
319362
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
320363
'script/', 'script foo=bar', 'script foo=">"'])
321364
def test_script_closing_tag(self, endtag):
@@ -345,6 +388,38 @@ def test_style_closing_tag(self, endtag):
345388
("endtag", "style")],
346389
collector=EventCollectorNoNormalize(convert_charrefs=False))
347390

391+
@support.subTests('endtag', ['title', 'TITLE', 'title ', 'title\n',
392+
'title/', 'title foo=bar', 'title foo=">"'])
393+
def test_title_closing_tag(self, endtag):
394+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
395+
s = f'<TitLe>{content}</{endtag}>'
396+
self._run_check(s, [("starttag", "title", []),
397+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
398+
("endtag", "title")],
399+
collector=EventCollectorNoNormalize(convert_charrefs=True))
400+
self._run_check(s, [("starttag", "title", []),
401+
('data', '<!-- not a comment --><i>Egg '),
402+
('entityref', 'amp'),
403+
('data', ' Spam</i>'),
404+
("endtag", "title")],
405+
collector=EventCollectorNoNormalize(convert_charrefs=False))
406+
407+
@support.subTests('endtag', ['textarea', 'TEXTAREA', 'textarea ', 'textarea\n',
408+
'textarea/', 'textarea foo=bar', 'textarea foo=">"'])
409+
def test_textarea_closing_tag(self, endtag):
410+
content = "<!-- not a comment --><i>Egg &amp; Spam</i>"
411+
s = f'<TexTarEa>{content}</{endtag}>'
412+
self._run_check(s, [("starttag", "textarea", []),
413+
('data', '<!-- not a comment --><i>Egg & Spam</i>'),
414+
("endtag", "textarea")],
415+
collector=EventCollectorNoNormalize(convert_charrefs=True))
416+
self._run_check(s, [("starttag", "textarea", []),
417+
('data', '<!-- not a comment --><i>Egg '),
418+
('entityref', 'amp'),
419+
('data', ' Spam</i>'),
420+
("endtag", "textarea")],
421+
collector=EventCollectorNoNormalize(convert_charrefs=False))
422+
348423
@support.subTests('tail,end', [
349424
('', False),
350425
('<', False),
@@ -362,6 +437,27 @@ def test_eof_in_script(self, tail, end):
362437
("data", content if end else content + tail)],
363438
collector=EventCollectorNoNormalize(convert_charrefs=False))
364439

440+
@support.subTests('tail,end', [
441+
('', False),
442+
('<', False),
443+
('</', False),
444+
('</t', False),
445+
('</title', False),
446+
('</title ', True),
447+
('</title foo=bar', True),
448+
('</title foo=">', True),
449+
])
450+
def test_eof_in_title(self, tail, end):
451+
s = f'<TitLe>Egg &amp; Spam{tail}'
452+
self._run_check(s, [("starttag", "title", []),
453+
("data", "Egg & Spam" + ('' if end else tail))],
454+
collector=EventCollectorNoNormalize(convert_charrefs=True))
455+
self._run_check(s, [("starttag", "title", []),
456+
('data', 'Egg '),
457+
('entityref', 'amp'),
458+
('data', ' Spam' + ('' if end else tail))],
459+
collector=EventCollectorNoNormalize(convert_charrefs=False))
460+
365461
def test_comments(self):
366462
html = ("<!-- I'm a valid comment -->"
367463
'<!--me too!-->'
Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Fix support of escapable raw text mode (elements "textarea" and "title")
2+
in :class:`html.parser.HTMLParser`.

0 commit comments

Comments
 (0)