4
4
import pprint
5
5
import unittest
6
6
7
+ from unittest .mock import patch
8
+
7
9
8
10
class EventCollector (html .parser .HTMLParser ):
9
11
@@ -315,6 +317,16 @@ def get_events(self):
315
317
("endtag" , element_lower )],
316
318
collector = Collector (convert_charrefs = False ))
317
319
320
def test_EOF_in_cdata(self):
    """Unterminated <script> content must come back as one raw data event.

    The input ends without a closing </script> tag, so everything after
    the start tag is expected verbatim: comment-like text, tags, and the
    entity-like '&not-an-entity-ref;' are all left untouched.
    """
    content = ('<!-- not a comment --> &not-an-entity-ref;\n'
               '<a href="" /> </p><p> <span></span></style>\n'
               "'</script' + '>'")
    # Nothing may follow the placeholder: the expected data event below is
    # exactly `content`, so any extra character here would break the match.
    s = f'<script>{content}'
    self._run_check(s, [
        ("starttag", 'script', []),
        ("data", content),
    ])
329
+
318
330
def test_comments (self ):
319
331
html = ("<!-- I'm a valid comment -->"
320
332
'<!--me too!-->'
@@ -346,18 +358,16 @@ def test_convert_charrefs(self):
346
358
collector = lambda : EventCollectorCharrefs ()
347
359
self .assertTrue (collector ().convert_charrefs )
348
360
charrefs = ['"' , '"' , '"' , '"' , '"' , '"' ]
349
- # check charrefs in the middle of the text/attributes
350
- expected = [('starttag' , 'a' , [('href' , 'foo"zar' )]),
351
- ('data' , 'a"z' ), ('endtag' , 'a' )]
361
+ # check charrefs in the middle of the text
362
+ expected = [('starttag' , 'a' , []), ('data' , 'a"z' ), ('endtag' , 'a' )]
352
363
for charref in charrefs :
353
- self ._run_check ('<a href="foo{0}zar" >a{0}z</a>' .format (charref ),
364
+ self ._run_check ('<a>a{0}z</a>' .format (charref ),
354
365
expected , collector = collector ())
355
- # check charrefs at the beginning/end of the text/attributes
356
- expected = [('data' , '"' ),
357
- ('starttag' , 'a' , [('x' , '"' ), ('y' , '"X' ), ('z' , 'X"' )]),
366
+ # check charrefs at the beginning/end of the text
367
+ expected = [('data' , '"' ), ('starttag' , 'a' , []),
358
368
('data' , '"' ), ('endtag' , 'a' ), ('data' , '"' )]
359
369
for charref in charrefs :
360
- self ._run_check ('{0}<a x="{0}" y="{0}X" z="X{0}" >'
370
+ self ._run_check ('{0}<a>'
361
371
'{0}</a>{0}' .format (charref ),
362
372
expected , collector = collector ())
363
373
# check charrefs in <script>/<style> elements
@@ -380,6 +390,35 @@ def test_convert_charrefs(self):
380
390
self ._run_check ('no charrefs here' , [('data' , 'no charrefs here' )],
381
391
collector = collector ())
382
392
393
def test_convert_charrefs_in_attribute_values(self):
    """Check how character references inside attribute values are unescaped.

    Two groups are exercised with the same six attribute layouts
    (alone, at the end, at the start, in the middle, followed by a space,
    followed by '='):

    * terminated entity refs and numeric/hex char refs, which are always
      expected to be unescaped;
    * an unterminated entity ref, which is expected to be left alone when
      an alphanumeric character or '=' follows it.
    """
    # default value for convert_charrefs is now True
    collector = lambda: EventCollectorCharrefs()
    self.assertTrue(collector().convert_charrefs)

    markup = ('<a x="{0}" x="z{0}" x="{0}z" '
              ' x="z{0}z" x="{0} z" x="{0}=z"></a>')

    # always unescape terminated entity refs, numeric and hex char refs:
    # - regardless whether they are at start, middle, end of attribute
    # - or followed by alphanumeric, non-alphanumeric, or equals char
    # The references must be spelled out as markup here; a literal '¢'
    # would give the parser nothing to unescape.
    charrefs = ['&cent;', '&#162;', '&#xa2;', '&#162', '&#xa2']
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
                  ('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
                ('endtag', 'a')]
    for charref in charrefs:
        self._run_check(markup.format(charref), expected,
                        collector=collector())

    # only unescape unterminated entity matches if they are not followed by
    # an alphanumeric or an equals sign
    charref = '&cent'
    expected = [('starttag', 'a',
                 [('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
                  ('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
                ('endtag', 'a')]
    self._run_check(markup.format(charref), expected,
                    collector=collector())
421
+
383
422
# the remaining tests were for the "tolerant" parser (which is now
384
423
# the default), and check various kind of broken markup
385
424
def test_tolerant_parsing (self ):
@@ -537,52 +576,99 @@ def test_EOF_in_charref(self):
537
576
for html , expected in data :
538
577
self ._run_check (html , expected )
539
578
540
- def test_broken_comments (self ):
579
def test_EOF_in_comments_or_decls(self):
    """Input that ends inside a comment/declaration opener is plain data.

    Every fragment below stops before its construct is closed, and each
    one is expected back unchanged as a single 'data' event.
    """
    unterminated = (
        '<!',
        '<!-',
        '<!--',
        '<![',
        '<![CDATA[',
        '<![CDATA[x',
        '<!DOCTYPE',
        '<!DOCTYPE HTML',
    )
    for fragment in unterminated:
        self._run_check(fragment, [('data', fragment)])
592
def test_bogus_comments(self):
    """'<!...>' markup that is not a well-formed comment yields 'comment' events.

    Each entry pairs one piece of markup with the comment text expected
    for it; the pieces are concatenated and parsed as a single document,
    so the events must come out in the same order.
    """
    cases = [
        ('<! not really a comment >', ' not really a comment '),
        ('<! not a comment either -->', ' not a comment either --'),
        ('<! -- close enough -->', ' -- close enough --'),
        ('<!>', ''),
        ('<!<-- this was an empty comment>',
         '<-- this was an empty comment'),
        ('<!!! another bogus comment !!!>', '!! another bogus comment !!!'),
        # see #32876
        ('<![with square brackets]!>', '[with square brackets]!'),
        ('<![\n multiline\n bogusness\n ]!>',
         '[\n multiline\n bogusness\n ]!'),
        ('<![more brackets]-[and a hyphen]!>',
         '[more brackets]-[and a hyphen]!'),
        ('<![cdata[should be uppercase]]>', '[cdata[should be uppercase]]'),
        ('<![CDATA [whitespaces are not ignored]]>',
         '[CDATA [whitespaces are not ignored]]'),
        ('<![CDATA]]>', '[CDATA]]'),  # required '[' after CDATA
    ]
    html = ''.join(markup for markup, _ in cases)
    expected = [('comment', text) for _, text in cases]
    self._run_check(html, expected)
555
621
556
622
def test_broken_condcoms(self):
    """Conditional comments lacking the '--' delimiters become 'comment' events.

    The expected comment text is everything between '<!' and '>',
    brackets included, exactly as written in the input.
    """
    # these condcoms are missing the '--' after '<!' and before the '>'
    # and they are considered bogus comments according to
    # "8.2.4.42. Markup declaration open state"
    html = ('<![if !(IE)]>broken condcom<![endif]>'
            '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
            '<![if !IE 6]><img src="firefox.png" /><![endif]>'
            '<![if !ie 6]><b>foo</b><![endif]>'
            '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
    expected = [
        ('comment', '[if !(IE)]'),
        ('data', 'broken condcom'),
        ('comment', '[endif]'),
        ('comment', '[if ! IE]'),
        ('startendtag', 'link', [('href', 'favicon.tiff')]),
        ('comment', '[endif]'),
        ('comment', '[if !IE 6]'),
        ('startendtag', 'img', [('src', 'firefox.png')]),
        ('comment', '[endif]'),
        ('comment', '[if !ie 6]'),
        ('starttag', 'b', []),
        ('data', 'foo'),
        ('endtag', 'b'),
        ('comment', '[endif]'),
        ('comment', '[if (!IE)|(lt IE 9)]'),
        ('startendtag', 'img', [('src', 'mammoth.bmp')]),
        ('comment', '[endif]'),
    ]
    self._run_check(html, expected)
651
+
652
def test_cdata_declarations(self):
    """A single-line CDATA section is expected as one 'unknown decl' event."""
    # More tests should be added. See also "8.2.4.42. Markup
    # declaration open state", "8.2.4.69. CDATA section state",
    # and issue 32876
    self._run_check(
        '<![CDATA[just some plain text]]>',
        [('unknown decl', 'CDATA[just some plain text')],
    )
659
+
660
def test_cdata_declarations_multiline(self):
    """CDATA content holding '<', '>', '&&' and tags stays in one 'unknown decl'.

    The section sits inside a <code> element; the markup-like text within
    it is expected to be reported verbatim rather than parsed.
    """
    cdata_body = (' if (a < b && a > b) {'
                  ' printf("[<marquee>How?</marquee>]");'
                  ' }')
    html = '<code><![CDATA[' + cdata_body + ']]></code>'
    expected = [
        ('starttag', 'code', []),
        # The reported declaration drops the leading '<![' and trailing ']]>'.
        ('unknown decl', 'CDATA[' + cdata_body),
        ('endtag', 'code'),
    ]
    self._run_check(html, expected)
588
674
@@ -787,5 +873,17 @@ def test_weird_chars_in_unquoted_attribute_values(self):
787
873
('starttag' , 'form' ,
788
874
[('action' , 'bogus|&#()value' )])])
789
875
876
+
877
class TestInheritance(unittest.TestCase):
    """Checks that building a parser goes through the ParserBase hooks."""

    def test_base_class_methods_called(self):
        """Instantiating EventCollector calls ParserBase.__init__ and reset once each."""
        # Patch order matches the original stacked decorators: reset first,
        # then __init__, then the module-level ParserBase name itself.
        # NOTE(review): replacing the module attribute as well presumably
        # ensures the base class is reached through inheritance rather than
        # a module-level lookup - confirm.
        with patch("_markupbase.ParserBase.reset") as reset_mock, \
                patch("_markupbase.ParserBase.__init__") as init_mock, \
                patch('_markupbase.ParserBase'):
            EventCollector()
            init_mock.assert_called_once()
            reset_mock.assert_called_once()
886
+
887
+
790
888
# Run every test case in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()
0 commit comments