Skip to content

Commit f1c6297

Browse files
committed
Fix discarding html[lang]
`DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag. This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody` field, would be parsed with the wrong encoding. In f14428e, we tried to resolve this by putting a `meta[charset]` tag at the start of the HTML fragment. Unfortunately, it turns out that this causes the parser to auto-insert an `html` element, losing the attributes of the original `html` tag. Let’s instead try to insert the `meta[charset]` tag into the proper place in the HTML document. We do not need to use the same trick with `JSLikeHTMLElement::__set`: it expects smaller HTML fragments, not whole `html` documents, so having the parser create the `html` and `head` elements will not be a problem. (cherry picked from commit efbbc86) Had to strip type hints since we still target PHP 5.6.
1 parent 5afefcf commit f1c6297

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed

src/Readability.php

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1435,7 +1435,7 @@ private function loadHtml()
14351435
unset($tidy);
14361436
}
14371437

1438-
$this->html = '<meta charset="utf-8">' . (string) $this->html;
1438+
$this->html = self::ensureMetaCharset((string) $this->html);
14391439

14401440
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14411441
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1453,4 +1453,45 @@ private function loadHtml()
14531453

14541454
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
14551455
}
1456+
1457+
/**
 * Tries to insert a `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments, such as those coming from the JSON-LD `articleBody`
 * field, would be parsed with the wrong encoding.
 * We cannot just put the tag at the start of the document, since that would cause the parser to
 * auto-insert an `html` element, losing the attributes of the original `html` tag.
 *
 * Insertion point, in order of preference:
 *  1. nowhere, when a `<meta ... charset` declaration is already present,
 *  2. right after the opening `<head>` tag,
 *  3. right after the opening `<html>` tag (the parser then creates `<head>` itself),
 *  4. at the very start of the input (plain fragment).
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document with a `meta[charset]` declaration
 */
private static function ensureMetaCharset($html)
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at the first 1024 bytes since, according to the HTML5 specification,
    // that’s where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // Opening tags that can host the <meta>, most specific first.
    // The tag name must be followed by whitespace or `>` so that `<header>`
    // is not mistaken for `<head>`.
    $hostTagPatterns = [
        '/<head(?:\s[^>]*)?>/i',
        '/<html(?:\s[^>]*)?>/i',
    ];

    foreach ($hostTagPatterns as $pattern) {
        if (1 !== preg_match($pattern, $start)) {
            continue;
        }

        $patched = preg_replace($pattern, '$0' . $charsetTag, $html, 1);

        // preg_replace() returns null on a PCRE error; never hand that back to the caller.
        if (null !== $patched) {
            return $patched;
        }
    }

    // Fallback – just plop the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
14561497
}

tests/ReadabilityTest.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -486,6 +486,48 @@ public function testWithWipedBody()
486486
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
487487
}
488488

489+
/**
 * Data provider for testHtmlLang().
 *
 * Each dataset is: input HTML, expected `lang` attribute of the document element,
 * and optionally whether tidy should be used (the test defaults it to true).
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public function dataForHtmlLang()
{
    // Article markup shared by every dataset.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    return [
        // Document that already declares its charset.
        'meta' => [
            '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
            'fr',
        ],
        // Document with a <head> but no charset declaration.
        'head' => [
            '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
            'fr',
        ],
        // Document without a <head>.
        'headless' => [
            '<html lang="fr"><body>' . $article . '</body></html>',
            'fr',
            // tidy would add <head> tag.
            false,
        ],
        // Bare fragment without <html>, hence no lang attribute expected.
        'fragment' => [
            $article,
            '',
            // tidy would add <html>.
            false,
        ],
    ];
}
517+
518+
/**
 * Checks that the `lang` attribute of the original `html` tag is still present
 * on the document element after the document has been loaded.
 *
 * @dataProvider dataForHtmlLang
 *
 * @param string $html    input document or fragment
 * @param string $lang    expected value of the document element's `lang` attribute
 * @param bool   $useTidy forwarded to getReadability()
 */
public function testHtmlLang($html, $lang, $useTidy = true)
{
    $readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);
    $res = $readability->init();

    $this->assertTrue($res);
    $this->assertInstanceOf(\DOMDocument::class, $readability->dom);
    $this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang'));
}
530+
489531
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
490532
{
491533
$readability = new Readability($html, $url, $parser, $useTidy);

0 commit comments

Comments
 (0)