Skip to content

Commit 109a226

Browse files
authored
Merge pull request #103 from jtojnar/backports-local-no-domain
[1.x] Backport parser_url + html[lang] fixes
2 parents 487ce3a + f1c6297 commit 109a226

File tree

2 files changed

+93
-27
lines changed

2 files changed

+93
-27
lines changed

src/Readability.php

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1396,7 +1396,10 @@ private function loadHtml()
13961396
$this->logger->debug('Parsing URL: ' . $this->url);
13971397

13981398
if ($this->url) {
1399-
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', parse_url($this->url, \PHP_URL_HOST)), ['.' => '\.']) . '/';
1399+
$host = parse_url($this->url, \PHP_URL_HOST);
1400+
if (null !== $host) {
1401+
$this->domainRegExp = '/' . strtr(preg_replace('/www\d*\./', '', $host), ['.' => '\.']) . '/';
1402+
}
14001403
}
14011404

14021405
mb_internal_encoding('UTF-8');
@@ -1432,7 +1435,7 @@ private function loadHtml()
14321435
unset($tidy);
14331436
}
14341437

1435-
$this->html = '<meta charset="utf-8">' . (string) $this->html;
1438+
$this->html = self::ensureMetaCharset((string) $this->html);
14361439

14371440
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14381441
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1450,4 +1453,45 @@ private function loadHtml()
14501453

14511454
$this->dom->registerNodeClass('DOMElement', 'Readability\JSLikeHTMLElement');
14521455
}
1456+
1457+
/**
 * Tries to insert a `meta[charset]` tag into the proper place in the passed HTML document.
 *
 * `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
 * This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field
 * would be parsed with incorrect encoding.
 * Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause
 * the parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
 *
 * Placement rules, in order:
 *  1. a `<meta ... charset` is already present in the inspected prefix: the input is returned untouched;
 *  2. a `<head>` start tag is present: the tag is inserted right after it;
 *  3. a `<html>` start tag is present: the tag is inserted right after it;
 *  4. otherwise the tag is prepended to the input.
 *
 * @param string $html UTF-8 encoded document
 *
 * @return string the document with a charset declaration
 */
private static function ensureMetaCharset($html)
{
    $charsetTag = '<meta charset="utf-8">';

    // Only look at the first 1024 bytes since, according to the HTML5 specification,
    // that is where <meta> elements declaring a character encoding must be located.
    // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
    $start = substr($html, 0, 1024);

    if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
        // <meta> tag is already present, no need for modification.
        return $html;
    }

    // The look-ahead requires the tag name to end right there (whitespace, `/` or `>`),
    // so that e.g. <header> is not mistaken for <head>.
    $headPattern = '/<head(?=[\s\/>])[^>]*>/i';
    $htmlPattern = '/<html(?=[\s\/>])[^>]*>/i';

    if (1 === preg_match($headPattern, $start)) {
        // <head> tag was located, <meta> tags go there.
        $patched = preg_replace($headPattern, '$0' . $charsetTag, $html, 1);
        if (null !== $patched) {
            return $patched;
        }
    } elseif (1 === preg_match($htmlPattern, $start)) {
        // <html> tag was located, put the tag inside and have the parser create <head>.
        $patched = preg_replace($htmlPattern, '$0' . $charsetTag, $html, 1);
        if (null !== $patched) {
            return $patched;
        }
    }

    // Fallback (also used when the replacement failed with a PCRE error):
    // just put the <meta> at the start of the fragment.
    return $charsetTag . $html;
}
14531497
}

tests/ReadabilityTest.php

Lines changed: 47 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ public function testConstructDefault()
1919
$readability = $this->getReadability('');
2020

2121
$this->assertNull($readability->url);
22-
$this->assertInstanceOf('DomDocument', $readability->dom);
22+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
2323
}
2424

2525
public function testConstructHtml5Parser()
2626
{
2727
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'html5lib');
2828

2929
$this->assertSame('http://0.0.0.0', $readability->url);
30-
$this->assertInstanceOf('DomDocument', $readability->dom);
30+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
3131
$this->assertSame('<html/>', $readability->original_html);
3232
}
3333

@@ -39,7 +39,7 @@ public function testConstructSimple()
3939
$readability = $this->getReadability('<html/>', 'http://0.0.0.0');
4040

4141
$this->assertSame('http://0.0.0.0', $readability->url);
42-
$this->assertInstanceOf('DomDocument', $readability->dom);
42+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
4343
$this->assertSame('<html/>', $readability->original_html);
4444
$this->assertTrue($readability->tidied);
4545
}
@@ -52,15 +52,15 @@ public function testConstructDefaultWithoutTidy()
5252
$this->assertSame('', $readability->original_html);
5353
$this->assertFalse($readability->tidied);
5454

55-
$this->assertInstanceOf('DomDocument', $readability->dom);
55+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
5656
}
5757

5858
public function testConstructSimpleWithoutTidy()
5959
{
6060
$readability = $this->getReadability('<html/>', 'http://0.0.0.0', 'libxml', false);
6161

6262
$this->assertSame('http://0.0.0.0', $readability->url);
63-
$this->assertInstanceOf('DomDocument', $readability->dom);
63+
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
6464
$this->assertSame('<html/>', $readability->original_html);
6565
$this->assertFalse($readability->tidied);
6666
}
@@ -106,7 +106,6 @@ public function testInitDivP()
106106
public function testInitDiv()
107107
{
108108
$readability = $this->getReadability('<div>' . str_repeat('This is the awesome content :)', 7) . '</div>', 'http://0.0.0.0');
109-
$readability->debug = true;
110109
$res = $readability->init();
111110

112111
$this->assertTrue($res);
@@ -120,7 +119,6 @@ public function testInitDiv()
120119
public function testWithFootnotes()
121120
{
122121
$readability = $this->getReadability('<div>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '</div>', 'http://0.0.0.0');
123-
$readability->debug = true;
124122
$readability->convertLinksToFootnotes = true;
125123
$res = $readability->init();
126124

@@ -137,7 +135,6 @@ public function testWithFootnotes()
137135
public function testStandardClean()
138136
{
139137
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<a href="#nofollow" rel="nofollow">will NOT be removed</a></div>', 'http://0.0.0.0');
140-
$readability->debug = true;
141138
$readability->lightClean = false;
142139
$res = $readability->init();
143140

@@ -154,7 +151,6 @@ public function testStandardClean()
154151
public function testWithIframe()
155152
{
156153
$readability = $this->getReadability('<div><h2>Title</h2>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe><iframe>http://soundcloud.com/test</iframe></p></div>', 'http://0.0.0.0');
157-
$readability->debug = true;
158154
$res = $readability->init();
159155

160156
$this->assertTrue($res);
@@ -169,7 +165,6 @@ public function testWithIframe()
169165
public function testWithArticle()
170166
{
171167
$readability = $this->getReadability('<article><p>' . str_repeat('This is an awesome text with some links, here there are: the awesome', 20) . '</p><p>This is an awesome text with some links, here there are <iframe src="http://youtube.com/test" href="#nofollow" rel="nofollow"></iframe></p></article>', 'http://0.0.0.0');
172-
$readability->debug = true;
173168
$res = $readability->init();
174169

175170
$this->assertTrue($res);
@@ -184,7 +179,6 @@ public function testWithArticle()
184179
public function testWithAside()
185180
{
186181
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<footer><aside>' . str_repeat('<p>This is an awesome text with some links, here there are</p>', 8) . '</aside></footer></article>', 'http://0.0.0.0');
187-
$readability->debug = true;
188182
$res = $readability->init();
189183

190184
$this->assertTrue($res);
@@ -199,7 +193,6 @@ public function testWithAside()
199193
public function testWithClasses()
200194
{
201195
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
202-
$readability->debug = true;
203196
$res = $readability->init();
204197

205198
$this->assertTrue($res);
@@ -214,7 +207,6 @@ public function testWithClasses()
214207
public function testWithClassesWithoutLightClean()
215208
{
216209
$readability = $this->getReadability('<article>' . str_repeat('<p>This is an awesome text with some links, here there are: <a href="http://0.0.0.0/test.html">the awesome</a></p>', 7) . '<div style="display:none">' . str_repeat('<p class="clock">This text should be removed</p>', 10) . '</div></article>', 'http://0.0.0.0');
217-
$readability->debug = true;
218210
$readability->lightClean = false;
219211
$res = $readability->init();
220212

@@ -230,7 +222,6 @@ public function testWithClassesWithoutLightClean()
230222
public function testWithTd()
231223
{
232224
$readability = $this->getReadability('<table><tr>' . str_repeat('<td><p>This is an awesome text with some links, here there are the awesome</td>', 7) . '</tr></table>', 'http://0.0.0.0');
233-
$readability->debug = true;
234225
$res = $readability->init();
235226

236227
$this->assertTrue($res);
@@ -243,7 +234,6 @@ public function testWithTd()
243234
public function testWithSameClasses()
244235
{
245236
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<div class="awesomecontent">This text is also an awesome text and you should know that !</div></article>', 'http://0.0.0.0');
246-
$readability->debug = true;
247237
$res = $readability->init();
248238

249239
$this->assertTrue($res);
@@ -257,7 +247,6 @@ public function testWithSameClasses()
257247
public function testWithScript()
258248
{
259249
$readability = $this->getReadability('<article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p><script>This text is also an awesome text and you should know that !</script></p></article>', 'http://0.0.0.0');
260-
$readability->debug = true;
261250
$res = $readability->init();
262251

263252
$this->assertTrue($res);
@@ -271,7 +260,6 @@ public function testWithScript()
271260
public function testTitle()
272261
{
273262
$readability = $this->getReadability('<title>this is my title</title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
274-
$readability->debug = true;
275263
$res = $readability->init();
276264

277265
$this->assertTrue($res);
@@ -285,7 +273,6 @@ public function testTitle()
285273
public function testTitleWithDash()
286274
{
287275
$readability = $this->getReadability('<title> title2 - title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
288-
$readability->debug = true;
289276
$res = $readability->init();
290277

291278
$this->assertTrue($res);
@@ -299,7 +286,6 @@ public function testTitleWithDash()
299286
public function testTitleWithDoubleDot()
300287
{
301288
$readability = $this->getReadability('<title> title2 : title3 </title><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
302-
$readability->debug = true;
303289
$res = $readability->init();
304290

305291
$this->assertTrue($res);
@@ -313,7 +299,6 @@ public function testTitleWithDoubleDot()
313299
public function testTitleTooShortUseH1()
314300
{
315301
$readability = $this->getReadability('<title>too short</title><h1>this is my h1 title !</h1><article class="awesomecontent">' . str_repeat('<p>This is an awesome text with some links, here there are the awesome</p>', 7) . '<p></p></article>', 'http://0.0.0.0');
316-
$readability->debug = true;
317302
$res = $readability->init();
318303

319304
$this->assertTrue($res);
@@ -365,7 +350,6 @@ public function testAutoClosingIframeNotThrowingException()
365350
</html>';
366351

367352
$readability = $this->getReadability($data, 'http://iosgames.ru/?p=22030');
368-
$readability->debug = true;
369353

370354
$res = $readability->init();
371355

@@ -433,7 +417,6 @@ public function testAppendIdAlreadyHere()
433417
</html>';
434418

435419
$readability = $this->getReadability($data, 'http://0.0.0.0');
436-
$readability->debug = true;
437420

438421
$res = $readability->init();
439422

@@ -472,7 +455,6 @@ public function testChildNodeGoneNull()
472455
$html = file_get_contents('tests/fixtures/childNodeGoesNull.html');
473456

474457
$readability = $this->getReadability($html, 'http://0.0.0.0');
475-
$readability->debug = true;
476458
$readability->convertLinksToFootnotes = true;
477459
$res = $readability->init();
478460

@@ -485,7 +467,6 @@ public function testKeepFootnotes()
485467
$html = file_get_contents('tests/fixtures/keepFootnotes.html');
486468

487469
$readability = $this->getReadability($html, 'http://0.0.0.0');
488-
$readability->debug = true;
489470
$res = $readability->init();
490471

491472
$this->assertTrue($res);
@@ -499,13 +480,54 @@ public function testWithWipedBody()
499480
$html = file_get_contents('tests/fixtures/wipedBody.html');
500481

501482
$readability = $this->getReadability($html, 'http://0.0.0.0', 'libxml', false);
502-
$readability->debug = true;
503483
$res = $readability->init();
504484

505485
$this->assertTrue($res);
506486
$this->assertStringContainsString('<a href="alice-I.html">Down the Rabbit-Hole</a>', $readability->getContent()->getInnerHtml());
507487
}
508488

489+
/**
 * Data sets for testHtmlLang().
 *
 * Each entry is `[html, expected lang attribute, use tidy]`. The third element is optional;
 * testHtmlLang() defaults it to true.
 *
 * Declared static so that the provider also works with PHPUnit versions that no longer
 * accept non-static data providers.
 *
 * @return array<string, array{0: string, 1: string, 2?: bool}>
 */
public static function dataForHtmlLang()
{
    // Article body shared by every data set.
    $article = '<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>';

    return [
        'meta' => [
            '<html lang="fr"><head><meta charset="utf-8"></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'head' => [
            '<html lang="fr"><head><title>Foo</title></head><body>' . $article . '</body></html>',
            'fr',
        ],
        'headless' => [
            '<html lang="fr"><body>' . $article . '</body></html>',
            'fr',
            // tidy would add <head> tag.
            false,
        ],
        'fragment' => [
            $article,
            '',
            // tidy would add <html>.
            false,
        ],
    ];
}
517+
518+
/**
 * Checks that the `lang` attribute of the original `html` tag is still present
 * on the document element after the document has been loaded and processed.
 *
 * @dataProvider dataForHtmlLang
 */
public function testHtmlLang($html, $lang, $useTidy = true)
{
    $subject = $this->getReadability($html, 'http://0.0.0.0', 'libxml', $useTidy);

    $this->assertTrue($subject->init());

    $document = $subject->dom;
    $this->assertInstanceOf(\DOMDocument::class, $document);
    $this->assertSame($lang, $document->documentElement->getAttribute('lang'));
}
530+
509531
private function getReadability($html, $url = null, $parser = 'libxml', $useTidy = true)
510532
{
511533
$readability = new Readability($html, $url, $parser, $useTidy);

0 commit comments

Comments
 (0)