Commit 4fdbec1

begoniezhaolyingbug authored and committed
feat: add web page parser class, optimize dependencies and image encoding support
1 parent 2d66abe commit 4fdbec1

File tree

5 files changed: +168 -71 lines changed

docreader/parser/base_parser.py

Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ def __init__(
         max_concurrent_tasks: int = 5,  # Max concurrent tasks
         max_chunks: int = 1000,  # Max number of returned chunks
         chunking_config: Optional[ChunkingConfig] = None,
+        **kwargs,
     ):
         """Initialize parser
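The new catch-all makes the base initializer tolerant of extra keyword arguments, so pipeline code can hand one uniform kwargs dict to heterogeneous parser subclasses. A minimal sketch of the effect, assuming a BaseParser-like signature; the enable_ocr keyword is hypothetical, not an option from this commit:

class BaseParser:
    def __init__(self, file_name: str = "", max_chunks: int = 1000, **kwargs):
        # Unknown keywords are absorbed instead of raising TypeError, so
        # every parser in a pipeline can be built from the same kwargs.
        self.file_name = file_name
        self.max_chunks = max_chunks

# Without **kwargs this call would raise:
# TypeError: __init__() got an unexpected keyword argument 'enable_ocr'
BaseParser(file_name="page.html", enable_ocr=True)  # enable_ocr is hypothetical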

docreader/parser/markdown_image_util.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 class MarkdownImageUtil:
     def __init__(self):
         self.b64_pattern = re.compile(
-            r"!\[([^\]]*)\]\(data:image/(\w+);base64,([^\)]+)\)"
+            r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)"
         )
         self.image_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
         self.replace_pattern = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
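The previous subtype group (\w+) stopped at the first non-word character, so base64 data URIs with compound MIME subtypes such as image/svg+xml never matched; the appended \+?\w* accepts them while still capturing only the leading token (e.g. svg). A quick standalone check of the widened pattern, with a sample Markdown line invented for illustration:

import re

# The widened pattern from this commit.
b64_pattern = re.compile(r"!\[([^\]]*)\]\(data:image/(\w+)\+?\w*;base64,([^\)]+)\)")

# "PHN2ZyAvPg==" is base64 for "<svg />"; the old pattern rejected the
# "+xml" part of the MIME subtype and never matched this line.
md = "![logo](data:image/svg+xml;base64,PHN2ZyAvPg==)"
m = b64_pattern.search(md)
print(m.group(1), m.group(2))  # -> logo svg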

docreader/parser/web_parser.py

Lines changed: 41 additions & 70 deletions

@@ -1,19 +1,20 @@
 import asyncio
 import logging
 import os
-from typing import Any
 
-from bs4 import BeautifulSoup
 from playwright.async_api import async_playwright
+from trafilatura import extract
 
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_parser import MarkdownParser
 from docreader.utils import endecode
 
 logger = logging.getLogger(__name__)
 
 
-class WebParser(BaseParser):
+class StdWebParser(BaseParser):
     """Web page parser"""
 
     def __init__(self, title: str, **kwargs):
@@ -22,7 +23,7 @@ def __init__(self, title: str, **kwargs):
         super().__init__(file_name=title, **kwargs)
         logger.info(f"Initialized WebParser with title: {title}")
 
-    async def scrape(self, url: str) -> Any:
+    async def scrape(self, url: str) -> str:
         logger.info(f"Starting web page scraping for URL: {url}")
         try:
             async with async_playwright() as p:
@@ -40,9 +41,7 @@ async def scrape(self, url: str) -> Any:
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
-                   return BeautifulSoup(
-                       "", "html.parser"
-                   )  # Return empty soup on navigation error
+                   return ""
 
                logger.info("Retrieving page HTML content")
                content = await page.content()
@@ -53,14 +52,13 @@ async def scrape(self, url: str) -> Any:
 
                # Parse HTML content with BeautifulSoup
                logger.info("Parsing HTML with BeautifulSoup")
-               soup = BeautifulSoup(content, "html.parser")
                logger.info("Successfully parsed HTML content")
-               return soup
+               return content
 
         except Exception as e:
             logger.error(f"Failed to scrape web page: {str(e)}")
             # Return empty BeautifulSoup object on error
-            return BeautifulSoup("", "html.parser")
+            return ""
 
     def parse_into_text(self, content: bytes) -> Document:
         """Parse web page
@@ -71,63 +69,36 @@ def parse_into_text(self, content: bytes) -> Document:
         Returns:
             Parse result
         """
-        logger.info("Starting web page parsing")
-
-        # Call async method synchronously
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-        try:
-            # Run async method
-            # Handle content possibly being a string
-            if isinstance(content, bytes):
-                url = endecode.decode_bytes(content)
-                logger.info(f"Decoded URL from bytes: {url}")
-            else:
-                url = str(content)
-                logger.info(f"Using content as URL directly: {url}")
-
-            logger.info(f"Scraping web page: {url}")
-            soup = loop.run_until_complete(self.scrape(url))
-
-            # Extract page text
-            logger.info("Extracting text from web page")
-            text = soup.get_text("\n")
-            logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
-
-            # Get title, usually in <title> or <h1> tag
-            if self.title != "":
-                title = self.title
-                logger.info(f"Using provided title: {title}")
-            else:
-                title = soup.title.string if soup.title else None
-                logger.info(f"Found title tag: {title}")
-
-            if not title:  # If <title> tag does not exist or is empty, try <h1> tag
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.get_text()
-                    logger.info(f"Using h1 tag as title: {title}")
-                else:
-                    title = "Untitled Web Page"
-                    logger.info("No title found, using default")
-
-            logger.info(f"Web page title: {title}")
-            text = "\n".join(
-                (line.strip() for line in text.splitlines() if line.strip())
-            )
-
-            result = title + "\n\n" + text
-            logger.info(
-                f"Web page parsing complete, total content: {len(result)} characters"
-            )
-            return Document(content=result)
-
-        except Exception as e:
-            logger.error(f"Error parsing web page: {str(e)}")
-            return Document(content=f"Error parsing web page: {str(e)}")
-
-        finally:
-            # Close event loop
-            logger.info("Closing event loop")
-            loop.close()
+        url = endecode.decode_bytes(content)
+
+        logger.info(f"Scraping web page: {url}")
+        chtml = asyncio.run(self.scrape(url))
+        md_text = extract(
+            chtml,
+            output_format="markdown",
+            with_metadata=True,
+            include_images=True,
+            include_tables=True,
+            include_links=True,
+            deduplicate=True,
+        )
+        if not md_text:
+            logger.error("Failed to parse web page")
+            return Document(content=f"Error parsing web page: {url}")
+        return Document(content=md_text)
+
+
+class WebParser(PipelineParser):
+    _parser_cls = (StdWebParser, MarkdownParser)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    logger.setLevel(logging.DEBUG)
+
+    url = "https://cloud.tencent.com/document/product/457/6759"
+
+    parser = WebParser(title="")
+    cc = parser.parse_into_text(url.encode())
+    with open("./tencent.md", "w") as f:
+        f.write(cc.content)
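WebParser is now composed as a pipeline, StdWebParser followed by MarkdownParser (PipelineParser's chaining logic is not shown in this diff), and readability extraction is delegated to trafilatura. The extract call can be exercised on its own; a minimal sketch using the same parameters as the commit, where the sample HTML is invented and extract returns None when it finds no usable content:

from trafilatura import extract

html = """<html><head><title>Demo</title></head><body><article>
<h1>Demo</h1><p>Body text with a <a href="https://example.com">link</a>
and enough prose for the extractor to keep the paragraph.</p>
</article></body></html>"""

md_text = extract(
    html,
    output_format="markdown",
    with_metadata=True,
    include_images=True,
    include_tables=True,
    include_links=True,
    deduplicate=True,
)
print(md_text)  # Markdown string (with metadata header) or None on failure

Note that parse_into_text also swaps the hand-managed event loop for asyncio.run, which creates, runs, and closes a fresh loop per call, so the explicit finally/loop.close() bookkeeping disappears.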

docreader/pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -33,5 +33,6 @@ dependencies = [
     "python-docx>=1.2.0",
     "requests>=2.32.5",
     "textract==1.5.0",
+    "trafilatura>=2.0.0",
     "urllib3>=2.5.0",
 ]

docreader/uv.lock

Lines changed: 124 additions & 0 deletions
Some generated files are not rendered by default.
