"""
Example demonstrating how to use the HTMLfy API with the scrapegraph-py async SDK.

This example shows how to:
1. Set up the async client for HTMLfy
2. Make the API call to get HTML content from a website
3. Handle the response and save the HTML content
4. Demonstrate both regular and heavy JS rendering modes
5. Display the results and metadata

Requirements:
- Python 3.9+ (the type hints use built-in generics such as dict[str, str])
- scrapegraph-py
- python-dotenv
- aiohttp
- A .env file with your SGAI_API_KEY

Example .env file:
SGAI_API_KEY=your_api_key_here
"""

import asyncio
import time
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv

from scrapegraph_py import AsyncClient

# Load environment variables from .env file
load_dotenv()

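# Optional fail-fast check (illustrative sketch, not part of the original flow):
# AsyncClient.from_env() will raise on a missing key anyway, but an explicit
# check gives a clearer message.
#
#     import os
#     if not os.getenv("SGAI_API_KEY"):
#         raise SystemExit("SGAI_API_KEY is not set; see the .env example above.")
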
async def htmlfy_website(
    client: AsyncClient,
    website_url: str,
    render_heavy_js: bool = False,
    headers: Optional[dict[str, str]] = None,
) -> dict:
    """
    Get HTML content from a website using the HTMLfy API.

    Args:
        client: The scrapegraph-py async client instance
        website_url: The URL of the website to get HTML from
        render_heavy_js: Whether to render heavy JavaScript (defaults to False)
        headers: Optional headers to send with the request

    Returns:
        dict: A dictionary containing the HTML content and metadata

    Raises:
        Exception: If the API request fails
    """
    js_mode = "with heavy JS rendering" if render_heavy_js else "without heavy JS rendering"
    print(f"Getting HTML content from: {website_url}")
    print(f"Mode: {js_mode}")

    start_time = time.time()

    try:
        result = await client.htmlfy(
            website_url=website_url,
            render_heavy_js=render_heavy_js,
            headers=headers,
        )
        execution_time = time.time() - start_time
        print(f"Execution time: {execution_time:.2f} seconds")
        return result
    except Exception as e:
        print(f"Error: {str(e)}")
        raise

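# Illustrative use of the optional `headers` parameter (the header values below
# are hypothetical; any realistic browser-like headers would do):
#
#     async with AsyncClient.from_env() as client:
#         result = await htmlfy_website(
#             client,
#             "https://example.com",
#             headers={
#                 "User-Agent": "Mozilla/5.0 (X11; Linux x86_64)",
#                 "Accept-Language": "en-US,en;q=0.9",
#             },
#         )
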
def save_html_content(
    html_content: str, filename: str, output_dir: str = "htmlfy_output"
) -> Path:
    """
    Save HTML content to a file.

    Args:
        html_content: The HTML content to save
        filename: The name of the file (without extension)
        output_dir: The directory to save the file in

    Returns:
        Path: The path of the saved HTML file
    """
    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    # Save HTML file
    html_file = output_path / f"{filename}.html"
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(html_content)

    print(f"HTML content saved to: {html_file}")
    return html_file

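# Example (illustrative): save_html_content("<html>...</html>", "example")
# writes htmlfy_output/example.html and returns that Path.
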
def analyze_html_content(html_content: str) -> dict:
    """
    Analyze HTML content and provide basic statistics.

    Args:
        html_content: The HTML content to analyze

    Returns:
        dict: Basic statistics about the HTML content
    """
    # Naive substring counts: "<p" also matches "<pre", "<param", etc.
    # Good enough for a rough overview without pulling in an HTML parser.
    lowered = html_content.lower()
    stats = {
        "total_length": len(html_content),
        "lines": len(html_content.splitlines()),
        "has_doctype": html_content.strip().startswith("<!DOCTYPE"),
        "has_html_tag": "<html" in lowered,
        "has_head_tag": "<head" in lowered,
        "has_body_tag": "<body" in lowered,
        "script_tags": lowered.count("<script"),
        "style_tags": lowered.count("<style"),
        "div_tags": lowered.count("<div"),
        "p_tags": lowered.count("<p"),
        "img_tags": lowered.count("<img"),
        "link_tags": lowered.count("<link"),
    }

    return stats

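# Quick sanity check (illustrative) for analyze_html_content:
#
#     stats = analyze_html_content(
#         "<!DOCTYPE html><html><body><p>hi</p></body></html>"
#     )
#     # stats["total_length"] == 50, stats["lines"] == 1,
#     # stats["has_doctype"] is True, stats["p_tags"] == 1
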
async def main():
    """
    Main function demonstrating HTMLfy API usage.
    """
    # Example websites to test
    test_websites = [
        {
            "url": "https://example.com",
            "name": "example",
            "render_heavy_js": False,
            "description": "Simple static website",
        },
        {
            "url": "https://httpbin.org/html",
            "name": "httpbin_html",
            "render_heavy_js": False,
            "description": "HTTP testing service",
        },
        # Heavy JS rendering mode (slower; repeats the first page so the two
        # outputs can be compared side by side)
        {
            "url": "https://example.com",
            "name": "example",
            "render_heavy_js": True,
            "description": "Simple static website, heavy JS rendering enabled",
        },
    ]

    print("HTMLfy API Example with scrapegraph-py Async SDK")
    print("=" * 65)

    # Initialize the async client
    try:
        async with AsyncClient.from_env() as client:
            print("✅ Async client initialized successfully")

            for website in test_websites:
                print(f"\nTesting: {website['description']}")
                print("-" * 40)

                try:
                    # Get HTML content
                    result = await htmlfy_website(
                        client=client,
                        website_url=website["url"],
                        render_heavy_js=website["render_heavy_js"],
                    )

                    # Display response metadata
                    print(f"Request ID: {result.get('htmlfy_request_id', 'N/A')}")
                    print(f"Status: {result.get('status', 'N/A')}")
                    print(f"Error: {result.get('error', 'None')}")

                    # Analyze HTML content
                    html_content = result.get("html", "")
                    if html_content:
                        stats = analyze_html_content(html_content)
                        print("\nHTML Content Analysis:")
                        print(f"  Total length: {stats['total_length']:,} characters")
                        print(f"  Lines: {stats['lines']:,}")
                        print(f"  Has DOCTYPE: {stats['has_doctype']}")
                        print(f"  Has HTML tag: {stats['has_html_tag']}")
                        print(f"  Has Head tag: {stats['has_head_tag']}")
                        print(f"  Has Body tag: {stats['has_body_tag']}")
                        print(f"  Script tags: {stats['script_tags']}")
                        print(f"  Style tags: {stats['style_tags']}")
                        print(f"  Div tags: {stats['div_tags']}")
                        print(f"  Paragraph tags: {stats['p_tags']}")
                        print(f"  Image tags: {stats['img_tags']}")
                        print(f"  Link tags: {stats['link_tags']}")

                        # Save HTML content
                        filename = f"{website['name']}_{'js' if website['render_heavy_js'] else 'nojs'}"
                        save_html_content(html_content, filename)

                        # Show first 500 characters as preview
                        preview = html_content[:500].replace("\n", " ").strip()
                        print("\nHTML Preview (first 500 chars):")
                        print(f"  {preview}...")
                    else:
                        print("No HTML content received")

                except Exception as e:
                    print(f"Error processing {website['url']}: {str(e)}")

            print("\n" + "=" * 65)

        # The async with block has exited here, so the client really is closed
        print("\n✅ Async client closed successfully")

    except Exception as e:
        print(f"❌ Failed to initialize async client: {str(e)}")
        print("Make sure you have SGAI_API_KEY in your .env file")

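# A minimal concurrency sketch (not called by main): fetch several pages in
# parallel with asyncio.gather, reusing one client for all requests. Assumes
# your API plan tolerates this parallelism; wrap the calls in an
# asyncio.Semaphore to throttle if it does not.
async def htmlfy_many(urls: list[str]) -> list:
    async with AsyncClient.from_env() as client:
        tasks = [htmlfy_website(client, url) for url in urls]
        # return_exceptions=True so one failing URL doesn't cancel the rest
        return await asyncio.gather(*tasks, return_exceptions=True)
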
if __name__ == "__main__":
    asyncio.run(main())