From d36cc68da3cbd02febae9886d7193e37cb54a73f Mon Sep 17 00:00:00 2001
From: hoping <14806317+htmambo@users.noreply.github.com>
Date: Sat, 2 Aug 2025 13:23:49 +0800
Subject: [PATCH] Check and fix incomplete URLs in content.

---
 src/Readability.php | 179 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/src/Readability.php b/src/Readability.php
index c4ea980..1a9e3bf 100644
--- a/src/Readability.php
+++ b/src/Readability.php
@@ -281,11 +281,190 @@ public function init(): bool
      */
     public function postProcessContent(\DOMElement $articleContent): void
     {
+        $this->fixIncompleteURLs($articleContent);
         if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) {
             $this->addFootnotes($articleContent);
         }
     }
 
+    /**
+     * Parse an absolute URL and derive the prefixes used to resolve relative links.
+     *
+     * On top of the components reported by PHP's native parse_url(), the result holds:
+     *  - 'path': always set, defaulting to '/';
+     *  - 'base_url': scheme://host[:port], without a trailing slash;
+     *  - 'path_url': base_url followed by the directory of the path, with a trailing slash.
+     *
+     * @param mixed $url URL of the document being processed
+     *
+     * @return array<string, mixed>|null null when the URL is malformed or has no scheme/host
+     */
+    private function parse_url($url): ?array
+    {
+        $urlInfo = is_string($url) ? parse_url($url) : false;
+        if (false === $urlInfo) {
+            $this->logger->error('Failed to parse URL: ' . $url);
+
+            return null;
+        }
+
+        // Without a scheme and a host the URL cannot act as a base for resolution.
+        if (!isset($urlInfo['scheme'], $urlInfo['host'])) {
+            return null;
+        }
+
+        if (!isset($urlInfo['path'])) {
+            $urlInfo['path'] = '/';
+        }
+
+        // Build the base URL; a port is kept unless it is the default one for the scheme.
+        $defaultPorts = ['http' => 80, 'https' => 443];
+        $defaultPort = $defaultPorts[strtolower($urlInfo['scheme'])] ?? null;
+        $urlInfo['base_url'] = $urlInfo['scheme'] . '://' . $urlInfo['host'];
+        if (!empty($urlInfo['port']) && $urlInfo['port'] !== $defaultPort) {
+            $urlInfo['base_url'] .= ':' . $urlInfo['port'];
+        }
+
+        // Keep everything up to the last slash so that "/blog/" stays "/blog/"
+        // (dirname() would climb one level too far for directory-style URLs).
+        $lastSlash = strrpos($urlInfo['path'], '/');
+        $directory = false === $lastSlash ? '/' : substr($urlInfo['path'], 0, $lastSlash + 1);
+        $urlInfo['path_url'] = $urlInfo['base_url'] . $directory;
+
+        return $urlInfo;
+    }
+
+    /**
+     * Check and fix incomplete URLs in content.
+     *
+     * Rewrites the href of every <a> and the src of every <img> below the given
+     * node into an absolute URL, using $this->url as the base. Nothing is changed
+     * when $this->url is not an absolute URL; an error is logged instead.
+     *
+     * @param \DOMElement $articleContent Node whose descendants are processed
+     */
+    public function fixIncompleteURLs($articleContent): void
+    {
+        $urlInfo = $this->parse_url($this->url);
+        if (null === $urlInfo) {
+            $this->logger->error('Invalid URL provided for post-processing: ' . $this->url);
+
+            return;
+        }
+
+        $this->processUrlElements($articleContent, 'a', 'href', $urlInfo);
+        $this->processUrlElements($articleContent, 'img', 'src', $urlInfo);
+    }
+
+    /**
+     * Make one URL attribute absolute on every element with the given tag name.
+     *
+     * Elements that do not carry the attribute are left alone, so that e.g.
+     * named anchors without href are not turned into links.
+     *
+     * @param \DOMElement          $articleContent Node whose descendants are processed
+     * @param string               $tagName        Tag to look for, e.g. 'a'
+     * @param string               $attribute      Attribute holding the URL, e.g. 'href'
+     * @param array<string, mixed> $urlInfo        Base URL data built by parse_url()
+     */
+    private function processUrlElements($articleContent, string $tagName, string $attribute, array $urlInfo): void
+    {
+        foreach ($articleContent->getElementsByTagName($tagName) as $element) {
+            if (!$element->hasAttribute($attribute)) {
+                continue;
+            }
+
+            $url = $element->getAttribute($attribute);
+            $completedUrl = $this->completeUrl($url, $urlInfo);
+            if ($completedUrl !== $url) {
+                $element->setAttribute($attribute, $completedUrl);
+            }
+        }
+    }
+
+    /**
+     * Resolve a possibly relative reference against the document URL.
+     *
+     * @param string               $url     Raw attribute value
+     * @param array<string, mixed> $urlInfo Base URL data built by parse_url()
+     *
+     * @return string Absolute URL, or the trimmed input when it needs no resolution
+     */
+    private function completeUrl(string $url, array $urlInfo): string
+    {
+        $url = trim($url);
+
+        // Empty references and bare fragments point at the current document.
+        if ('' === $url || '#' === $url[0]) {
+            return $url;
+        }
+
+        // Anything carrying a scheme (http:, mailto:, tel:, data:, javascript:) is complete.
+        if (1 === preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
+            return $url;
+        }
+
+        // Protocol-relative reference: inherit the scheme of the document.
+        if (0 === strpos($url, '//')) {
+            return $urlInfo['scheme'] . ':' . $url;
+        }
+
+        // Root-relative reference.
+        if ('/' === $url[0]) {
+            return $urlInfo['base_url'] . $url;
+        }
+
+        // Query-only reference: keep the path of the document itself.
+        if ('?' === $url[0]) {
+            return $urlInfo['base_url'] . $urlInfo['path'] . $url;
+        }
+
+        // Path-relative reference: merge with the document directory, then collapse
+        // "." and ".." in the path only, never in the host, query or fragment.
+        $directory = substr($urlInfo['path_url'], strlen($urlInfo['base_url']));
+        $pathLength = strcspn($url, '?#');
+        $path = $this->removeDotSegments($directory . substr($url, 0, $pathLength));
+
+        return $urlInfo['base_url'] . $path . substr($url, $pathLength);
+    }
+
+    /**
+     * Collapse "." and ".." segments of an absolute path (RFC 3986, section 5.2.4).
+     *
+     * @param string $path Path starting with a slash
+     *
+     * @return string Normalised path; ".." never climbs above the root
+     */
+    private function removeDotSegments(string $path): string
+    {
+        $segments = explode('/', $path);
+        $resolved = [];
+        foreach ($segments as $segment) {
+            if ('.' === $segment) {
+                continue;
+            }
+
+            if ('..' === $segment) {
+                // Index 0 is the empty segment in front of the leading slash: keep it.
+                if (count($resolved) > 1) {
+                    array_pop($resolved);
+                }
+
+                continue;
+            }
+
+            $resolved[] = $segment;
+        }
+
+        // A trailing dot segment refers to a directory, so the result ends with a slash.
+        $last = end($segments);
+        if ('.' === $last || '..' === $last) {
+            $resolved[] = '';
+        }
+
+        return implode('/', $resolved);
+    }
+
     /**
      * For easier reading, convert this document to have footnotes at the bottom rather than inline links.
      *