Skip to content

Commit 73539a0

Browse files
committed
feat: add files for htmlify
1 parent 787df8f commit 73539a0

16 files changed

+2406
-0
lines changed
Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
"""
2+
Example demonstrating how to use the HTMLfy API with the scrapegraph-py async SDK.
3+
4+
This example shows how to:
5+
1. Set up the async client for HTMLfy
6+
2. Make the API call to get HTML content from a website
7+
3. Handle the response and save the HTML content
8+
4. Demonstrate both regular and heavy JS rendering modes
9+
5. Display the results and metadata
10+
11+
Requirements:
12+
- Python 3.7+
13+
- scrapegraph-py
14+
- python-dotenv
15+
- aiohttp
16+
- A .env file with your SGAI_API_KEY
17+
18+
Example .env file:
19+
SGAI_API_KEY=your_api_key_here
20+
"""
21+
22+
import asyncio
23+
import json
24+
import os
25+
import time
26+
from pathlib import Path
27+
from typing import Optional
28+
29+
from dotenv import load_dotenv
30+
31+
from scrapegraph_py import AsyncClient
32+
33+
# Load environment variables from .env file
34+
load_dotenv()
35+
36+
37+
async def htmlfy_website(
    client: AsyncClient,
    website_url: str,
    render_heavy_js: bool = False,
    headers: Optional[dict[str, str]] = None,
) -> dict:
    """
    Fetch the raw HTML of a website through the HTMLfy API.

    Args:
        client: An initialized scrapegraph-py async client.
        website_url: URL of the page whose HTML should be retrieved.
        render_heavy_js: When True, render JavaScript-heavy pages before capture.
        headers: Optional HTTP headers forwarded with the request.

    Returns:
        dict: The API response containing the HTML content and metadata.

    Raises:
        Exception: Propagated unchanged when the API request fails.
    """
    mode_label = (
        "with heavy JS rendering" if render_heavy_js else "without JS rendering"
    )
    print(f"Getting HTML content from: {website_url}")
    print(f"Mode: {mode_label}")

    started = time.time()
    try:
        response = await client.htmlfy(
            website_url=website_url,
            render_heavy_js=render_heavy_js,
            headers=headers,
        )
    except Exception as e:
        # Surface the failure before re-raising so the caller still sees it.
        print(f"Error: {str(e)}")
        raise
    elapsed = time.time() - started
    print(f"Execution time: {elapsed:.2f} seconds")
    return response
76+
77+
78+
def save_html_content(
    html_content: str, filename: str, output_dir: str = "htmlfy_output"
):
    """
    Save HTML content to a file.

    Args:
        html_content: The HTML content to save.
        filename: The name of the file (without extension).
        output_dir: The directory to save the file in.

    Returns:
        Path: The path of the ``.html`` file that was written.
    """
    # Create the output directory (including parents) if it doesn't exist.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Bug fix: the f-string previously had no placeholder, so the
    # `filename` argument was ignored and every page overwrote the
    # same file on disk.
    html_file = output_path / f"{filename}.html"
    html_file.write_text(html_content, encoding="utf-8")

    print(f"HTML content saved to: {html_file}")
    return html_file
100+
101+
102+
def analyze_html_content(html_content: str) -> dict:
    """
    Analyze HTML content and provide basic statistics.

    Args:
        html_content: The HTML content to analyze.

    Returns:
        dict: Basic statistics about the HTML content — total length, line
        count, presence of structural tags (DOCTYPE/html/head/body), and
        occurrence counts of common tags.
    """
    # Lowercase once so the case-insensitive checks below don't re-scan
    # the (potentially large) document for every statistic.
    lowered = html_content.lower()
    return {
        "total_length": len(html_content),
        "lines": len(html_content.splitlines()),
        "has_doctype": html_content.strip().startswith("<!DOCTYPE"),
        "has_html_tag": "<html" in lowered,
        "has_head_tag": "<head" in lowered,
        "has_body_tag": "<body" in lowered,
        # Substring counts, so "<p" also matches "<pre" etc. — kept
        # intentionally identical to the original heuristic.
        "script_tags": lowered.count("<script"),
        "style_tags": lowered.count("<style"),
        "div_tags": lowered.count("<div"),
        "p_tags": lowered.count("<p"),
        "img_tags": lowered.count("<img"),
        "link_tags": lowered.count("<link"),
    }
128+
129+
130+
async def main():
    """
    Demonstrate HTMLfy API usage end to end.

    Iterates over a couple of sample sites, fetches each page's HTML via the
    async client, prints response metadata and content statistics, and saves
    the HTML to disk.
    """
    # Sample targets: simple static pages, fetched without JS rendering.
    test_websites = [
        {
            "url": "https://example.com",
            "name": "example",
            "render_heavy_js": False,
            "description": "Simple static website",
        },
        {
            "url": "https://httpbin.org/html",
            "name": "httpbin_html",
            "render_heavy_js": False,
            "description": "HTTP testing service",
        },
    ]

    print("HTMLfy API Example with scrapegraph-py Async SDK")
    print("=" * 65)

    try:
        # The context manager guarantees the client session is closed.
        async with AsyncClient.from_env() as client:
            print("✅ Async client initialized successfully")

            for site in test_websites:
                print(f"\nTesting: {site['description']}")
                print("-" * 40)

                try:
                    result = await htmlfy_website(
                        client=client,
                        website_url=site["url"],
                        render_heavy_js=site["render_heavy_js"],
                    )

                    # Response metadata.
                    print(f"Request ID: {result.get('htmlfy_request_id', 'N/A')}")
                    print(f"Status: {result.get('status', 'N/A')}")
                    print(f"Error: {result.get('error', 'None')}")

                    html_content = result.get("html", "")
                    if not html_content:
                        print("No HTML content received")
                    else:
                        stats = analyze_html_content(html_content)
                        # Label/value pairs, printed in a fixed order below.
                        report = [
                            ("Total length", f"{stats['total_length']:,} characters"),
                            ("Lines", f"{stats['lines']:,}"),
                            ("Has DOCTYPE", stats["has_doctype"]),
                            ("Has HTML tag", stats["has_html_tag"]),
                            ("Has Head tag", stats["has_head_tag"]),
                            ("Has Body tag", stats["has_body_tag"]),
                            ("Script tags", stats["script_tags"]),
                            ("Style tags", stats["style_tags"]),
                            ("Div tags", stats["div_tags"]),
                            ("Paragraph tags", stats["p_tags"]),
                            ("Image tags", stats["img_tags"]),
                            ("Link tags", stats["link_tags"]),
                        ]
                        print("\nHTML Content Analysis:")
                        for label, value in report:
                            print(f"  {label}: {value}")

                        # Persist the page, tagging the file with the JS mode.
                        suffix = "js" if site["render_heavy_js"] else "nojs"
                        save_html_content(html_content, f"{site['name']}_{suffix}")

                        # Preview: first 500 characters with newlines flattened.
                        preview = html_content[:500].replace("\n", " ").strip()
                        print("\nHTML Preview (first 500 chars):")
                        print(f"  {preview}...")

                except Exception as e:
                    print(f"Error processing {site['url']}: {str(e)}")

                print("\n" + "=" * 65)

            print("\n✅ Async client closed successfully")

    except Exception as e:
        print(f"❌ Failed to initialize async client: {str(e)}")
        print("Make sure you have SGAI_API_KEY in your .env file")
214+
# Run the example only when executed as a script (not on import).
if __name__ == "__main__":
    asyncio.run(main())

0 commit comments

Comments
 (0)