Commit 4fdbec1

begoniezhaolyingbug authored and committed
feat: add web page parser class, optimize dependencies and image encoding support
1 parent 2d66abe commit 4fdbec1

File tree

5 files changed: +168 -71 lines changed

docreader/parser/base_parser.py

Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ def __init__(
         max_concurrent_tasks: int = 5,  # Max concurrent tasks
         max_chunks: int = 1000,  # Max number of returned chunks
         chunking_config: Optional[ChunkingConfig] = None,
+        **kwargs,
     ):
         """Initialize parser
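The new catch-all makes the base initializer tolerant of extra keyword arguments, so pipeline code can hand one uniform kwargs dict to heterogeneous parser subclasses. A minimal sketch of the effect, assuming a BaseParser-like signature; the enable_ocr keyword is hypothetical, not an option from this commit:

class BaseParser:
    def __init__(self, file_name: str = "", max_chunks: int = 1000, **kwargs):
        # Unknown keywords are absorbed instead of raising TypeError, so
        # every parser in a pipeline can be built from the same kwargs.
        self.file_name = file_name
        self.max_chunks = max_chunks

# Without **kwargs this call would raise:
# TypeError: __init__() got an unexpected keyword argument 'enable_ocr'
BaseParser(file_name="page.html", enable_ocr=True)  # enable_ocr is hypothetical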

docreader/parser/markdown_image_util.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 class MarkdownImageUtil:
     def __init__(self):
         self.b64_pattern = re.compile(
-            r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
+            r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)"
         )
         self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
         self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
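The previous subtype group (\w+) stopped at the first non-word character, so base64 data URIs with compound MIME subtypes such as image/svg+xml never matched; the appended \+?\w* accepts them while still capturing only the leading token (e.g. svg). A quick standalone check of the widened pattern, with a sample Markdown line invented for illustration:

import re

# The widened pattern from this commit.
b64_pattern = re.compile(r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)")

# "PHN2ZyAvPg==" is base64 for "<svg />"; the old pattern rejected the
# "+xml" part of the MIME subtype and never matched this line.
md = "![logo](data:image/svg+xml;base64,PHN2ZyAvPg==)"
m = b64_pattern.search(md)
print(m.group(1), m.group(2))  # -> logo svg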

docreader/parser/web_parser.py

Lines changed: 41 additions & 70 deletions

@@ -1,19 +1,20 @@
 import asyncio
 import logging
 import os
-from typing import Any
 
-from bs4 import BeautifulSoup
 from playwright.async_api import async_playwright
+from trafilatura import extract
 
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_parser import MarkdownParser
 from docreader.utils import endecode
 
 logger = logging.getLogger(__name__)
 
 
-class WebParser(BaseParser):
+class StdWebParser(BaseParser):
     """Web page parser"""
 
     def __init__(self, title: str, **kwargs):
@@ -22,7 +23,7 @@ def __init__(self, title: str, **kwargs):
         super().__init__(file_name=title, **kwargs)
         logger.info(f"Initialized WebParser with title: {title}")
 
-    async def scrape(self, url: str) -> Any:
+    async def scrape(self, url: str) -> str:
         logger.info(f"Starting web page scraping for URL: {url}")
         try:
             async with async_playwright() as p:
@@ -40,9 +41,7 @@ async def scrape(self, url: str) -> Any:
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
-                   return BeautifulSoup(
-                       "", "html.parser"
-                   )  # Return empty soup on navigation error
+                   return ""
 
                logger.info("Retrieving page HTML content")
                content = await page.content()
@@ -53,14 +52,13 @@ async def scrape(self, url: str) -> Any:
 
                # Parse HTML content with BeautifulSoup
                logger.info("Parsing HTML with BeautifulSoup")
-               soup = BeautifulSoup(content, "html.parser")
                logger.info("Successfully parsed HTML content")
-               return soup
+               return content
 
         except Exception as e:
             logger.error(f"Failed to scrape web page: {str(e)}")
             # Return empty BeautifulSoup object on error
-            return BeautifulSoup("", "html.parser")
+            return ""
 
     def parse_into_text(self, content: bytes) -> Document:
         """Parse web page
@@ -71,63 +69,36 @@ def parse_into_text(self, content: bytes) -> Document:
         Returns:
             Parse result
         """
-        logger.info("Starting web page parsing")
-
-        # Call async method synchronously
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-        try:
-            # Run async method
-            # Handle content possibly being a string
-            if isinstance(content, bytes):
-                url = endecode.decode_bytes(content)
-                logger.info(f"Decoded URL from bytes: {url}")
-            else:
-                url = str(content)
-                logger.info(f"Using content as URL directly: {url}")
-
-            logger.info(f"Scraping web page: {url}")
-            soup = loop.run_until_complete(self.scrape(url))
-
-            # Extract page text
-            logger.info("Extracting text from web page")
-            text = soup.get_text("\n")
-            logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
-
-            # Get title, usually in <title> or <h1> tag
-            if self.title != "":
-                title = self.title
-                logger.info(f"Using provided title: {title}")
-            else:
-                title = soup.title.string if soup.title else None
-                logger.info(f"Found title tag: {title}")
-
-            if not title:  # If <title> tag does not exist or is empty, try <h1> tag
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.get_text()
-                    logger.info(f"Using h1 tag as title: {title}")
-                else:
-                    title = "Untitled Web Page"
-                    logger.info("No title found, using default")
-
-            logger.info(f"Web page title: {title}")
-            text = "\n".join(
-                (line.strip() for line in text.splitlines() if line.strip())
-            )
-
-            result = title + "\n\n" + text
-            logger.info(
-                f"Web page parsing complete, total content: {len(result)} characters"
-            )
-            return Document(content=result)
-
-        except Exception as e:
-            logger.error(f"Error parsing web page: {str(e)}")
-            return Document(content=f"Error parsing web page: {str(e)}")
-
-        finally:
-            # Close event loop
-            logger.info("Closing event loop")
-            loop.close()
+        url = endecode.decode_bytes(content)
+
+        logger.info(f"Scraping web page: {url}")
+        chtml = asyncio.run(self.scrape(url))
+        md_text = extract(
+            chtml,
+            output_format="markdown",
+            with_metadata=True,
+            include_images=True,
+            include_tables=True,
+            include_links=True,
+            deduplicate=True,
+        )
+        if not md_text:
+            logger.error("Failed to parse web page")
+            return Document(content=f"Error parsing web page: {url}")
+        return Document(content=md_text)
+
+
+class WebParser(PipelineParser):
+    _parser_cls = (StdWebParser, MarkdownParser)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    logger.setLevel(logging.DEBUG)
+
+    url = "https://cloud.tencent.com/document/product/457/6759"
+
+    parser = WebParser(title="")
+    cc = parser.parse_into_text(url.encode())
+    with open("./tencent.md", "w") as f:
+        f.write(cc.content)
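WebParser is now composed as a pipeline, StdWebParser followed by MarkdownParser (PipelineParser's chaining logic is not shown in this diff), and readability extraction is delegated to trafilatura. The extract call can be exercised on its own; a minimal sketch using the same parameters as the commit, where the sample HTML is invented and extract returns None when it finds no usable content:

from trafilatura import extract

html = """<html><head><title>Demo</title></head><body><article>
<h1>Demo</h1><p>Body text with a <a href="https://example.com">link</a>
and enough prose for the extractor to keep the paragraph.</p>
</article></body></html>"""

md_text = extract(
    html,
    output_format="markdown",
    with_metadata=True,
    include_images=True,
    include_tables=True,
    include_links=True,
    deduplicate=True,
)
print(md_text)  # Markdown string (with metadata header) or None on failure

Note that parse_into_text also swaps the hand-managed event loop for asyncio.run, which creates, runs, and closes a fresh loop per call, so the explicit finally/loop.close() bookkeeping disappears.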

docreader/pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -33,5 +33,6 @@ dependencies = [
     "python-docx>=1.2.0",
     "requests>=2.32.5",
     "textract==1.5.0",
+    "trafilatura>=2.0.0",
     "urllib3>=2.5.0",
 ]

docreader/uv.lock

Lines changed: 124 additions & 0 deletions
Some generated files are not rendered by default.
