@@ -1,19 +1,20 @@
 import asyncio
 import logging
 import os
-from typing import Any
 
-from bs4 import BeautifulSoup
 from playwright.async_api import async_playwright
+from trafilatura import extract
 
 from docreader.models.document import Document
 from docreader.parser.base_parser import BaseParser
+from docreader.parser.chain_parser import PipelineParser
+from docreader.parser.markdown_parser import MarkdownParser
 from docreader.utils import endecode
 
 logger = logging.getLogger(__name__)
 
 
-class WebParser(BaseParser):
+class StdWebParser(BaseParser):
     """Web page parser"""
 
     def __init__(self, title: str, **kwargs):
@@ -22,7 +23,7 @@ def __init__(self, title: str, **kwargs):
         super().__init__(file_name=title, **kwargs)
         logger.info(f"Initialized WebParser with title: {title}")
 
-    async def scrape(self, url: str) -> Any:
+    async def scrape(self, url: str) -> str:
         logger.info(f"Starting web page scraping for URL: {url}")
         try:
             async with async_playwright() as p:
@@ -40,9 +41,7 @@ async def scrape(self, url: str) -> Any:
                except Exception as e:
                    logger.error(f"Error navigating to URL: {str(e)}")
                    await browser.close()
-                    return BeautifulSoup(
-                        "", "html.parser"
-                    )  # Return empty soup on navigation error
+                    return ""  # Return empty HTML on navigation error
 
                logger.info("Retrieving page HTML content")
                content = await page.content()
@@ -53,14 +52,13 @@ async def scrape(self, url: str) -> Any:
 
                # Parse HTML content with BeautifulSoup
                logger.info("Parsing HTML with BeautifulSoup")
-                soup = BeautifulSoup(content, "html.parser")
                logger.info("Successfully parsed HTML content")
-                return soup
+                return content
 
        except Exception as e:
            logger.error(f"Failed to scrape web page: {str(e)}")
            # Return empty BeautifulSoup object on error
-            return BeautifulSoup("", "html.parser")
+            return ""
 
    def parse_into_text(self, content: bytes) -> Document:
        """Parse web page
@@ -71,63 +69,41 @@ def parse_into_text(self, content: bytes) -> Document:
        Returns:
            Parse result
        """
-        logger.info("Starting web page parsing")
-
-        # Call async method synchronously
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-
-        try:
-            # Run async method
-            # Handle content possibly being a string
-            if isinstance(content, bytes):
-                url = endecode.decode_bytes(content)
-                logger.info(f"Decoded URL from bytes: {url}")
-            else:
-                url = str(content)
-                logger.info(f"Using content as URL directly: {url}")
-
-            logger.info(f"Scraping web page: {url}")
-            soup = loop.run_until_complete(self.scrape(url))
-
-            # Extract page text
-            logger.info("Extracting text from web page")
-            text = soup.get_text("\n")
-            logger.info(f"Extracted {len(text)} characters of text from URL: {url}")
-
-            # Get title, usually in <title> or <h1> tag
-            if self.title != "":
-                title = self.title
-                logger.info(f"Using provided title: {title}")
-            else:
-                title = soup.title.string if soup.title else None
-                logger.info(f"Found title tag: {title}")
-
-            if not title:  # If <title> tag does not exist or is empty, try <h1> tag
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.get_text()
-                    logger.info(f"Using h1 tag as title: {title}")
-                else:
-                    title = "Untitled Web Page"
-                    logger.info("No title found, using default")
-
-            logger.info(f"Web page title: {title}")
-            text = "\n".join(
-                (line.strip() for line in text.splitlines() if line.strip())
-            )
-
-            result = title + "\n\n" + text
-            logger.info(
-                f"Web page parsing complete, total content: {len(result)} characters"
-            )
-            return Document(content=result)
-
-        except Exception as e:
-            logger.error(f"Error parsing web page: {str(e)}")
-            return Document(content=f"Error parsing web page: {str(e)}")
-
-        finally:
-            # Close event loop
-            logger.info("Closing event loop")
-            loop.close()
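+        # The incoming content bytes carry the page URL to fetch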
+        url = endecode.decode_bytes(content)
+
+        logger.info(f"Scraping web page: {url}")
+        chtml = asyncio.run(self.scrape(url))
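+        # Convert the rendered HTML to markdown with trafilatura, keeping links, images, and tables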
+        md_text = extract(
+            chtml,
+            output_format="markdown",
+            with_metadata=True,
+            include_images=True,
+            include_tables=True,
+            include_links=True,
+            deduplicate=True,
+        )
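+        # extract() returns None when no main content could be extracted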
+        if not md_text:
+            logger.error("Failed to parse web page")
+            return Document(content=f"Error parsing web page: {url}")
+        return Document(content=md_text)
+
+
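+# Pipeline: StdWebParser runs first, then MarkdownParser processes its markdown output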
+class WebParser(PipelineParser):
+    _parser_cls = (StdWebParser, MarkdownParser)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    logger.setLevel(logging.DEBUG)
+
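+    # Smoke test: scrape a Tencent Cloud docs page and write the markdown to disk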
+    url = "https://cloud.tencent.com/document/product/457/6759"
+
+    parser = WebParser(title="")
+    cc = parser.parse_into_text(url.encode())
+    with open("./tencent.md", "w") as f:
+        f.write(cc.content)