From 73539a0c486baad7c479ee2fa488e3571ca369df Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Mon, 25 Aug 2025 14:36:44 +0200 Subject: [PATCH 1/4] feat: add files for htmlify --- .../examples/async/async_htmlfy_example.py | 217 ++++++++++ .../async_step_by_step_htmlfy_example.py | 184 ++++++++ .../sync/htmlfy_comprehensive_example.py | 408 ++++++++++++++++++ .../examples/sync/htmlfy_example.py | 217 ++++++++++ .../sync/htmlfy_output/example_nojs.html | 46 ++ .../sync/htmlfy_output/httpbin_html_nojs.html | 12 + .../sync/steps/step_by_step_htmlfy_example.py | 183 ++++++++ scrapegraph-py/scrapegraph_py/async_client.py | 45 ++ scrapegraph-py/scrapegraph_py/client.py | 44 ++ .../scrapegraph_py/models/__init__.py | 3 + .../scrapegraph_py/models/htmlfy.py | 55 +++ scrapegraph-py/test_htmlfy_integration.py | 143 ++++++ scrapegraph-py/tests/test_async_client.py | 250 +++++++++++ scrapegraph-py/tests/test_client.py | 220 ++++++++++ scrapegraph-py/tests/test_htmlfy_models.py | 249 +++++++++++ scrapegraph-py/tests/test_mocked_apis.py | 130 ++++++ 16 files changed, 2406 insertions(+) create mode 100644 scrapegraph-py/examples/async/async_htmlfy_example.py create mode 100644 scrapegraph-py/examples/async/steps/async_step_by_step_htmlfy_example.py create mode 100644 scrapegraph-py/examples/sync/htmlfy_comprehensive_example.py create mode 100644 scrapegraph-py/examples/sync/htmlfy_example.py create mode 100644 scrapegraph-py/examples/sync/htmlfy_output/example_nojs.html create mode 100644 scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html create mode 100644 scrapegraph-py/examples/sync/steps/step_by_step_htmlfy_example.py create mode 100644 scrapegraph-py/scrapegraph_py/models/htmlfy.py create mode 100644 scrapegraph-py/test_htmlfy_integration.py create mode 100644 scrapegraph-py/tests/test_htmlfy_models.py diff --git a/scrapegraph-py/examples/async/async_htmlfy_example.py b/scrapegraph-py/examples/async/async_htmlfy_example.py new file mode 100644 index 0000000..d056fff --- /dev/null +++ b/scrapegraph-py/examples/async/async_htmlfy_example.py @@ -0,0 +1,217 @@ +""" +Example demonstrating how to use the HTMLfy API with the scrapegraph-py async SDK. + +This example shows how to: +1. Set up the async client for HTMLfy +2. Make the API call to get HTML content from a website +3. Handle the response and save the HTML content +4. Demonstrate both regular and heavy JS rendering modes +5. Display the results and metadata + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- aiohttp +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import asyncio +import json +import os +import time +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv + +from scrapegraph_py import AsyncClient + +# Load environment variables from .env file +load_dotenv() + + +async def htmlfy_website( + client: AsyncClient, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, +) -> dict: + """ + Get HTML content from a website using the HTMLfy API. 
+ + Args: + client: The scrapegraph-py async client instance + website_url: The URL of the website to get HTML from + render_heavy_js: Whether to render heavy JavaScript (defaults to False) + headers: Optional headers to send with the request + + Returns: + dict: A dictionary containing the HTML content and metadata + + Raises: + Exception: If the API request fails + """ + js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" + print(f"Getting HTML content from: {website_url}") + print(f"Mode: {js_mode}") + + start_time = time.time() + + try: + result = await client.htmlfy( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + execution_time = time.time() - start_time + print(f"Execution time: {execution_time:.2f} seconds") + return result + except Exception as e: + print(f"Error: {str(e)}") + raise + + +def save_html_content( + html_content: str, filename: str, output_dir: str = "htmlfy_output" +): + """ + Save HTML content to a file. + + Args: + html_content: The HTML content to save + filename: The name of the file (without extension) + output_dir: The directory to save the file in + """ + # Create output directory if it doesn't exist + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Save HTML file + html_file = output_path / f"{filename}.html" + with open(html_file, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"HTML content saved to: {html_file}") + return html_file + + +def analyze_html_content(html_content: str) -> dict: + """ + Analyze HTML content and provide basic statistics. + + Args: + html_content: The HTML content to analyze + + Returns: + dict: Basic statistics about the HTML content + """ + stats = { + "total_length": len(html_content), + "lines": len(html_content.splitlines()), + "has_doctype": html_content.strip().startswith(" 0: + print(f" {element}: {count}") + + # Check for JavaScript and CSS + has_js = elements["script"] > 0 + has_css = elements["style"] > 0 + + print(f"\n🎨 Content types:") + print(f" JavaScript: {'Yes' if has_js else 'No'}") + print(f" CSS: {'Yes' if has_css else 'No'}") + + return elements + + +async def main(): + """Main function demonstrating async step-by-step HTMLfy usage.""" + print("πŸš€ Async Step-by-Step HTMLfy API Example") + print("=" * 55) + + # Test URL + test_url = "https://example.com" + + try: + # Step 1: Initialize async client + async with AsyncClient.from_env() as client: + print("βœ… Async client initialized successfully") + + # Step 2: Make async HTMLfy request + result = await step_2_make_async_htmlfy_request(client, test_url, render_js=False) + + # Step 3: Handle response + html_content = step_3_handle_response(result) + if not html_content: + print("❌ Cannot proceed without HTML content") + return + + # Step 4: Save content + filename = "async_example_website" + saved_file = step_4_save_html_content(html_content, filename) + + # Step 5: Basic analysis + elements = step_5_basic_analysis(html_content) + + # Summary + print(f"\n🎯 Summary:") + print(f"βœ… Successfully processed {test_url} asynchronously") + print(f"πŸ’Ύ HTML saved to: {saved_file}") + print(f"πŸ“Š Analyzed {len(html_content):,} characters of HTML content") + + print("βœ… Async client closed successfully") + + except Exception as e: + print(f"\nπŸ’₯ Error occurred: {str(e)}") + print("Check your API key and internet connection") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/scrapegraph-py/examples/sync/htmlfy_comprehensive_example.py 
b/scrapegraph-py/examples/sync/htmlfy_comprehensive_example.py new file mode 100644 index 0000000..f7830b6 --- /dev/null +++ b/scrapegraph-py/examples/sync/htmlfy_comprehensive_example.py @@ -0,0 +1,408 @@ +""" +Comprehensive example demonstrating advanced usage of the HTMLfy API with the scrapegraph-py SDK. + +This example shows how to: +1. Set up the client for HTMLfy with various configurations +2. Handle different types of websites and rendering modes +3. Implement error handling and retry logic +4. Process multiple websites concurrently +5. Save and analyze HTML content with detailed metadata +6. Use custom headers and cookies for authentication +7. Compare different rendering modes + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +import time +from pathlib import Path +from typing import Optional, Dict, List +from concurrent.futures import ThreadPoolExecutor, as_completed + +from dotenv import load_dotenv + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +class HtmlfyProcessor: + """A comprehensive HTMLfy processor with advanced features""" + + def __init__(self, api_key: Optional[str] = None): + """ + Initialize the HTMLfy processor. + + Args: + api_key: API key for authentication. If None, will try to load from environment + """ + try: + if api_key: + self.client = Client(api_key=api_key) + else: + self.client = Client.from_env() + print("βœ… Client initialized successfully") + except Exception as e: + print(f"❌ Failed to initialize client: {str(e)}") + raise + + def htmlfy_website( + self, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, + max_retries: int = 3, + ) -> dict: + """ + Get HTML content from a website using the HTMLfy API with retry logic. + + Args: + website_url: The URL of the website to get HTML from + render_heavy_js: Whether to render heavy JavaScript + headers: Optional headers to send with the request + max_retries: Maximum number of retry attempts + + Returns: + dict: A dictionary containing the HTML content and metadata + """ + js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" + print(f"🌐 Getting HTML content from: {website_url}") + print(f"πŸ”§ Mode: {js_mode}") + + for attempt in range(max_retries): + try: + start_time = time.time() + result = self.client.htmlfy( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + execution_time = time.time() - start_time + + print(f"βœ… Success! Execution time: {execution_time:.2f} seconds") + return { + **result, + "execution_time": execution_time, + "attempts": attempt + 1, + } + + except Exception as e: + print(f"❌ Attempt {attempt + 1} failed: {str(e)}") + if attempt < max_retries - 1: + wait_time = 2 ** attempt # Exponential backoff + print(f"⏳ Waiting {wait_time}s before retry...") + time.sleep(wait_time) + else: + print(f"πŸ’₯ All {max_retries} attempts failed for {website_url}") + raise + + def process_website_batch( + self, + websites: List[Dict], + max_workers: int = 3, + output_dir: str = "htmlfy_output" + ) -> List[Dict]: + """ + Process multiple websites concurrently. 
+ + Args: + websites: List of website configurations + max_workers: Maximum number of concurrent workers + output_dir: Directory to save output files + + Returns: + List of results for each website + """ + results = [] + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_website = { + executor.submit( + self._process_single_website, website, output_dir + ): website + for website in websites + } + + # Process completed tasks + for future in as_completed(future_to_website): + website = future_to_website[future] + try: + result = future.result() + results.append(result) + print(f"βœ… Completed: {website['url']}") + except Exception as e: + print(f"❌ Failed: {website['url']} - {str(e)}") + results.append({ + "website": website, + "error": str(e), + "status": "failed" + }) + + return results + + def _process_single_website( + self, website: Dict, output_dir: str + ) -> Dict: + """Process a single website and return results.""" + try: + # Get HTML content + result = self.htmlfy_website( + website_url=website["url"], + render_heavy_js=website.get("render_heavy_js", False), + headers=website.get("headers"), + ) + + # Analyze HTML content + html_content = result.get("html", "") + if html_content: + stats = self.analyze_html_content(html_content) + result["analysis"] = stats + + # Save HTML content + filename = self._generate_filename(website, result) + saved_file = self.save_html_content(html_content, filename, output_dir) + result["saved_file"] = str(saved_file) + + # Generate summary + result["summary"] = self.generate_summary(stats, result) + + return { + "website": website, + "result": result, + "status": "success" + } + + except Exception as e: + return { + "website": website, + "error": str(e), + "status": "failed" + } + + def analyze_html_content(self, html_content: str) -> dict: + """ + Analyze HTML content and provide comprehensive statistics. + + Args: + html_content: The HTML content to analyze + + Returns: + dict: Comprehensive statistics about the HTML content + """ + stats = { + "basic": { + "total_length": len(html_content), + "lines": len(html_content.splitlines()), + "words": len(html_content.split()), + "characters_no_spaces": len(html_content.replace(" ", "")), + }, + "structure": { + "has_doctype": html_content.strip().startswith(" str: + """Generate a human-readable summary of the HTML content.""" + basic = stats["basic"] + elements = stats["elements"] + + summary = f"HTML document with {basic['total_length']:,} characters " + summary += f"({basic['lines']:,} lines, {basic['words']:,} words). " + + if elements["div_tags"] > 0: + summary += f"Contains {elements['div_tags']} div elements, " + if elements["p_tags"] > 0: + summary += f"{elements['p_tags']} paragraphs, " + if elements["img_tags"] > 0: + summary += f"{elements['img_tags']} images, " + if elements["script_tags"] > 0: + summary += f"{elements['script_tags']} script tags, " + if elements["style_tags"] > 0: + summary += f"{elements['style_tags']} style tags. " + + execution_time = result.get("execution_time", 0) + summary += f"Processed in {execution_time:.2f} seconds." 
+ + return summary + + def _generate_filename(self, website: Dict, result: Dict) -> str: + """Generate a filename for the saved HTML content.""" + name = website.get("name", "website") + js_mode = "js" if website.get("render_heavy_js", False) else "nojs" + timestamp = int(time.time()) + return f"{name}_{js_mode}_{timestamp}" + + def save_html_content( + self, html_content: str, filename: str, output_dir: str = "htmlfy_output" + ) -> Path: + """ + Save HTML content to a file with metadata. + + Args: + html_content: The HTML content to save + filename: The name of the file (without extension) + output_dir: The directory to save the file in + + Returns: + Path to the saved file + """ + # Create output directory if it doesn't exist + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Save HTML file + html_file = output_path / f"{filename}.html" + with open(html_file, "w", encoding="utf-8") as f: + f.write(html_content) + + # Save metadata file + metadata_file = output_path / f"{filename}_metadata.json" + metadata = { + "filename": filename, + "saved_at": time.strftime("%Y-%m-%d %H:%M:%S"), + "file_size": len(html_content), + "encoding": "utf-8" + } + + with open(metadata_file, "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + print(f"πŸ’Ύ HTML content saved to: {html_file}") + print(f"πŸ“Š Metadata saved to: {metadata_file}") + return html_file + + def close(self): + """Close the client to free up resources.""" + self.client.close() + print("πŸ”’ Client closed successfully") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +def main(): + """Main function demonstrating comprehensive HTMLfy usage.""" + + # Example websites with different configurations + test_websites = [ + { + "url": "https://example.com", + "name": "example", + "render_heavy_js": False, + "description": "Simple static website", + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + } + }, + { + "url": "https://httpbin.org/html", + "name": "httpbin_html", + "render_heavy_js": False, + "description": "HTTP testing service", + }, + { + "url": "https://httpbin.org/json", + "name": "httpbin_json", + "render_heavy_js": False, + "description": "JSON endpoint", + }, + { + "url": "https://httpbin.org/headers", + "name": "httpbin_headers", + "render_heavy_js": False, + "description": "Headers endpoint", + "headers": { + "User-Agent": "ScrapeGraph-HTMLfy-Example/1.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + } + } + ] + + print("πŸš€ Comprehensive HTMLfy API Example with scrapegraph-py SDK") + print("=" * 70) + + try: + with HtmlfyProcessor() as processor: + print("\nπŸ“‹ Processing websites...") + + # Process websites concurrently + results = processor.process_website_batch( + websites=test_websites, + max_workers=2, # Limit concurrent requests + output_dir="htmlfy_comprehensive_output" + ) + + # Display summary + print("\nπŸ“Š Processing Summary") + print("=" * 50) + + successful = [r for r in results if r["status"] == "success"] + failed = [r for r in results if r["status"] == "failed"] + + print(f"βœ… Successful: {len(successful)}") + print(f"❌ Failed: {len(failed)}") + + if successful: + print(f"\n🎯 Successful Results:") + for result in successful: + website = result["website"] + data = result["result"] + summary = data.get("summary", "No summary available") + print(f" 🌐 {website['url']}: {summary}") + + if failed: + print(f"\nπŸ’₯ Failed Results:") + for 
result in failed: + website = result["website"] + error = result["error"] + print(f" 🌐 {website['url']}: {error}") + + print(f"\nπŸ“ Output saved to: htmlfy_comprehensive_output/") + + except Exception as e: + print(f"❌ Fatal error: {str(e)}") + print("Make sure you have SGAI_API_KEY in your .env file") + + +if __name__ == "__main__": + main() diff --git a/scrapegraph-py/examples/sync/htmlfy_example.py b/scrapegraph-py/examples/sync/htmlfy_example.py new file mode 100644 index 0000000..dc178d7 --- /dev/null +++ b/scrapegraph-py/examples/sync/htmlfy_example.py @@ -0,0 +1,217 @@ +""" +Example demonstrating how to use the HTMLfy API with the scrapegraph-py SDK. + +This example shows how to: +1. Set up the client for HTMLfy +2. Make the API call to get HTML content from a website +3. Handle the response and save the HTML content +4. Demonstrate both regular and heavy JS rendering modes +5. Display the results and metadata + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import json +import os +import time +from pathlib import Path +from typing import Optional + +from dotenv import load_dotenv + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +def htmlfy_website( + client: Client, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, +) -> dict: + """ + Get HTML content from a website using the HTMLfy API. + + Args: + client: The scrapegraph-py client instance + website_url: The URL of the website to get HTML from + render_heavy_js: Whether to render heavy JavaScript (defaults to False) + headers: Optional headers to send with the request + + Returns: + dict: A dictionary containing the HTML content and metadata + + Raises: + Exception: If the API request fails + """ + js_mode = "with heavy JS rendering" if render_heavy_js else "without JS rendering" + print(f"Getting HTML content from: {website_url}") + print(f"Mode: {js_mode}") + + start_time = time.time() + + try: + result = client.htmlfy( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + execution_time = time.time() - start_time + print(f"Execution time: {execution_time:.2f} seconds") + return result + except Exception as e: + print(f"Error: {str(e)}") + raise + + +def save_html_content( + html_content: str, filename: str, output_dir: str = "htmlfy_output" +): + """ + Save HTML content to a file. + + Args: + html_content: The HTML content to save + filename: The name of the file (without extension) + output_dir: The directory to save the file in + """ + # Create output directory if it doesn't exist + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Save HTML file + html_file = output_path / f"{filename}.html" + with open(html_file, "w", encoding="utf-8") as f: + f.write(html_content) + + print(f"HTML content saved to: {html_file}") + return html_file + + +def analyze_html_content(html_content: str) -> dict: + """ + Analyze HTML content and provide basic statistics. + + Args: + html_content: The HTML content to analyze + + Returns: + dict: Basic statistics about the HTML content + """ + stats = { + "total_length": len(html_content), + "lines": len(html_content.splitlines()), + "has_doctype": html_content.strip().startswith(" + + + Example Domain + + + + + + + + +
+<div>
+    <h1>Example Domain</h1>
+    <p>This domain is for use in illustrative examples in documents. You may use this
+    domain in literature without prior coordination or asking for permission.</p>
+    <p><a href="https://www.iana.org/domains/example">More information...</a></p>
+</div>
+</body>
+</html>
diff --git a/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html b/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html
new file mode 100644
index 0000000..05a40c6
--- /dev/null
+++ b/scrapegraph-py/examples/sync/htmlfy_output/httpbin_html_nojs.html
@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+      <h1>Herman Melville - Moby-Dick</h1>
+
+      <div>
+        <p>
+ Availing himself of the mild, summer-cool weather that now reigned in these latitudes, and in preparation for the peculiarly active pursuits shortly to be anticipated, Perth, the begrimed, blistered old blacksmith, had not removed his portable forge to the hold again, after concluding his contributory work for Ahab's leg, but still retained it on deck, fast lashed to ringbolts by the foremast; being now almost incessantly invoked by the headsmen, and harpooneers, and bowsmen to do some little job for them; altering, or repairing, or new shaping their various weapons and boat furniture. Often he would be surrounded by an eager circle, all waiting to be served; holding boat-spades, pike-heads, harpoons, and lances, and jealously watching his every sooty movement, as he toiled. Nevertheless, this old man's was a patient hammer wielded by a patient arm. No murmur, no impatience, no petulance did come from him. Silent, slow, and solemn; bowing over still further his chronically broken back, he toiled away, as if toil were life itself, and the heavy beating of his hammer the heavy beating of his heart. And so it was.β€”Most miserable! A peculiar walk in this old man, a certain slight but painful appearing yawing in his gait, had at an early period of the voyage excited the curiosity of the mariners. And to the importunity of their persisted questionings he had finally given in; and so it came to pass that every one now knew the shameful story of his wretched fate. Belated, and not innocently, one bitter winter's midnight, on the road running between two country towns, the blacksmith half-stupidly felt the deadly numbness stealing over him, and sought refuge in a leaning, dilapidated barn. The issue was, the loss of the extremities of both feet. Out of this revelation, part by part, at last came out the four acts of the gladness, and the one long, and as yet uncatastrophied fifth act of the grief of his life's drama. He was an old man, who, at the age of nearly sixty, had postponedly encountered that thing in sorrow's technicals called ruin. He had been an artisan of famed excellence, and with plenty to do; owned a house and garden; embraced a youthful, daughter-like, loving wife, and three blithe, ruddy children; every Sunday went to a cheerful-looking church, planted in a grove. But one night, under cover of darkness, and further concealed in a most cunning disguisement, a desperate burglar slid into his happy home, and robbed them all of everything. And darker yet to tell, the blacksmith himself did ignorantly conduct this burglar into his family's heart. It was the Bottle Conjuror! Upon the opening of that fatal cork, forth flew the fiend, and shrivelled up his home. Now, for prudent, most wise, and economic reasons, the blacksmith's shop was in the basement of his dwelling, but with a separate entrance to it; so that always had the young and loving healthy wife listened with no unhappy nervousness, but with vigorous pleasure, to the stout ringing of her young-armed old husband's hammer; whose reverberations, muffled by passing through the floors and walls, came up to her, not unsweetly, in her nursery; and so, to stout Labor's iron lullaby, the blacksmith's infants were rocked to slumber. Oh, woe on woe! Oh, Death, why canst thou not sometimes be timely? 
Hadst thou taken this old blacksmith to thyself ere his full ruin came upon him, then had the young widow had a delicious grief, and her orphans a truly venerable, legendary sire to dream of in their after years; and all of them a care-killing competency.
+        </p>
+      </div>
+ + \ No newline at end of file diff --git a/scrapegraph-py/examples/sync/steps/step_by_step_htmlfy_example.py b/scrapegraph-py/examples/sync/steps/step_by_step_htmlfy_example.py new file mode 100644 index 0000000..981862d --- /dev/null +++ b/scrapegraph-py/examples/sync/steps/step_by_step_htmlfy_example.py @@ -0,0 +1,183 @@ +""" +Step-by-step example demonstrating how to use the HTMLfy API with the scrapegraph-py SDK. + +This example shows the basic workflow: +1. Initialize the client +2. Make an HTMLfy request +3. Handle the response +4. Save the HTML content +5. Basic analysis + +Requirements: +- Python 3.7+ +- scrapegraph-py +- python-dotenv +- A .env file with your SGAI_API_KEY + +Example .env file: +SGAI_API_KEY=your_api_key_here +""" + +import os +from pathlib import Path +from dotenv import load_dotenv + +from scrapegraph_py import Client + +# Load environment variables from .env file +load_dotenv() + + +def step_1_initialize_client(): + """Step 1: Initialize the scrapegraph-py client.""" + print("πŸ”‘ Step 1: Initializing client...") + + try: + # Initialize client using environment variable + client = Client.from_env() + print("βœ… Client initialized successfully") + return client + except Exception as e: + print(f"❌ Failed to initialize client: {str(e)}") + print("Make sure you have SGAI_API_KEY in your .env file") + raise + + +def step_2_make_htmlfy_request(client, url, render_js=False): + """Step 2: Make an HTMLfy request.""" + print(f"\n🌐 Step 2: Making HTMLfy request to {url}") + print(f"πŸ”§ Render heavy JS: {render_js}") + + try: + # Make the HTMLfy request + result = client.htmlfy( + website_url=url, + render_heavy_js=render_js + ) + print("βœ… HTMLfy request completed successfully") + return result + except Exception as e: + print(f"❌ HTMLfy request failed: {str(e)}") + raise + + +def step_3_handle_response(result): + """Step 3: Handle and analyze the response.""" + print(f"\nπŸ“Š Step 3: Analyzing response...") + + # Check if we got HTML content + html_content = result.get("html", "") + if not html_content: + print("❌ No HTML content received") + return None + + # Basic response analysis + print(f"βœ… Received HTML content") + print(f"πŸ“ Content length: {len(html_content):,} characters") + print(f"πŸ“„ Lines: {len(html_content.splitlines()):,}") + + # Check for common HTML elements + has_doctype = html_content.strip().startswith(" 0: + print(f" {element}: {count}") + + # Check for JavaScript and CSS + has_js = elements["script"] > 0 + has_css = elements["style"] > 0 + + print(f"\n🎨 Content types:") + print(f" JavaScript: {'Yes' if has_js else 'No'}") + print(f" CSS: {'Yes' if has_css else 'No'}") + + return elements + + +def main(): + """Main function demonstrating step-by-step HTMLfy usage.""" + print("πŸš€ Step-by-Step HTMLfy API Example") + print("=" * 50) + + # Test URL + test_url = "https://example.com" + + try: + # Step 1: Initialize client + client = step_1_initialize_client() + + # Step 2: Make HTMLfy request + result = step_2_make_htmlfy_request(client, test_url, render_js=False) + + # Step 3: Handle response + html_content = step_3_handle_response(result) + if not html_content: + print("❌ Cannot proceed without HTML content") + return + + # Step 4: Save content + filename = "example_website" + saved_file = step_4_save_html_content(html_content, filename) + + # Step 5: Basic analysis + elements = step_5_basic_analysis(html_content) + + # Summary + print(f"\n🎯 Summary:") + print(f"βœ… Successfully processed {test_url}") + print(f"πŸ’Ύ HTML saved to: 
{saved_file}") + print(f"πŸ“Š Analyzed {len(html_content):,} characters of HTML content") + + # Close client + client.close() + print("πŸ”’ Client closed successfully") + + except Exception as e: + print(f"\nπŸ’₯ Error occurred: {str(e)}") + print("Check your API key and internet connection") + + +if __name__ == "__main__": + main() diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py index e98e6a7..3fd3874 100644 --- a/scrapegraph-py/scrapegraph_py/async_client.py +++ b/scrapegraph-py/scrapegraph_py/async_client.py @@ -14,6 +14,7 @@ ) from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest +from scrapegraph_py.models.htmlfy import GetHtmlfyRequest, HtmlfyRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.searchscraper import ( GetSearchScraperRequest, @@ -175,6 +176,50 @@ async def get_markdownify(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result + async def htmlfy( + self, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, + ): + """Send an HTMLfy request to get HTML content from a website + + Args: + website_url: The URL of the website to get HTML from + render_heavy_js: Whether to render heavy JavaScript (defaults to False) + headers: Optional headers to send with the request + """ + logger.info(f"πŸ” Starting HTMLfy request for {website_url}") + logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") + if headers: + logger.debug("πŸ”§ Using custom headers") + + request = HtmlfyRequest( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + logger.debug("βœ… Request validation passed") + + result = await self._make_request( + "POST", f"{API_BASE_URL}/htmlfy", json=request.model_dump() + ) + logger.info("✨ HTMLfy request completed successfully") + return result + + async def get_htmlfy(self, request_id: str): + """Get the result of a previous HTMLfy request""" + logger.info(f"πŸ” Fetching HTMLfy result for request {request_id}") + + # Validate input using Pydantic model + GetHtmlfyRequest(request_id=request_id) + logger.debug("βœ… Request ID validation passed") + + result = await self._make_request( + "GET", f"{API_BASE_URL}/htmlfy/{request_id}") + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + async def smartscraper( self, user_prompt: str, diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py index f78620d..045c0db 100644 --- a/scrapegraph-py/scrapegraph_py/client.py +++ b/scrapegraph-py/scrapegraph_py/client.py @@ -15,6 +15,7 @@ ) from scrapegraph_py.models.crawl import CrawlRequest, GetCrawlRequest from scrapegraph_py.models.feedback import FeedbackRequest +from scrapegraph_py.models.htmlfy import GetHtmlfyRequest, HtmlfyRequest from scrapegraph_py.models.markdownify import GetMarkdownifyRequest, MarkdownifyRequest from scrapegraph_py.models.searchscraper import ( GetSearchScraperRequest, @@ -182,6 +183,49 @@ def get_markdownify(self, request_id: str): logger.info(f"✨ Successfully retrieved result for request {request_id}") return result + def htmlfy( + self, + website_url: str, + render_heavy_js: bool = False, + headers: Optional[dict[str, str]] = None, + ): + """Send an HTMLfy request to get HTML content from a website + + Args: + website_url: The URL of the 
website to get HTML from + render_heavy_js: Whether to render heavy JavaScript (defaults to False) + headers: Optional headers to send with the request + """ + logger.info(f"πŸ” Starting HTMLfy request for {website_url}") + logger.debug(f"πŸ”§ Render heavy JS: {render_heavy_js}") + if headers: + logger.debug("πŸ”§ Using custom headers") + + request = HtmlfyRequest( + website_url=website_url, + render_heavy_js=render_heavy_js, + headers=headers, + ) + logger.debug("βœ… Request validation passed") + + result = self._make_request( + "POST", f"{API_BASE_URL}/htmlfy", json=request.model_dump() + ) + logger.info("✨ HTMLfy request completed successfully") + return result + + def get_htmlfy(self, request_id: str): + """Get the result of a previous HTMLfy request""" + logger.info(f"πŸ” Fetching HTMLfy result for request {request_id}") + + # Validate input using Pydantic model + GetHtmlfyRequest(request_id=request_id) + logger.debug("βœ… Request ID validation passed") + + result = self._make_request("GET", f"{API_BASE_URL}/htmlfy/{request_id}") + logger.info(f"✨ Successfully retrieved result for request {request_id}") + return result + def smartscraper( self, user_prompt: str, diff --git a/scrapegraph-py/scrapegraph_py/models/__init__.py b/scrapegraph-py/scrapegraph_py/models/__init__.py index cbde5de..97f9371 100644 --- a/scrapegraph-py/scrapegraph_py/models/__init__.py +++ b/scrapegraph-py/scrapegraph_py/models/__init__.py @@ -1,6 +1,7 @@ from .agenticscraper import AgenticScraperRequest, GetAgenticScraperRequest from .crawl import CrawlRequest, GetCrawlRequest from .feedback import FeedbackRequest +from .htmlfy import GetHtmlfyRequest, HtmlfyRequest from .markdownify import GetMarkdownifyRequest, MarkdownifyRequest from .searchscraper import GetSearchScraperRequest, SearchScraperRequest from .smartscraper import GetSmartScraperRequest, SmartScraperRequest @@ -11,6 +12,8 @@ "CrawlRequest", "GetCrawlRequest", "FeedbackRequest", + "GetHtmlfyRequest", + "HtmlfyRequest", "GetMarkdownifyRequest", "MarkdownifyRequest", "GetSearchScraperRequest", diff --git a/scrapegraph-py/scrapegraph_py/models/htmlfy.py b/scrapegraph-py/scrapegraph_py/models/htmlfy.py new file mode 100644 index 0000000..b32dcbf --- /dev/null +++ b/scrapegraph-py/scrapegraph_py/models/htmlfy.py @@ -0,0 +1,55 @@ +# Models for htmlfy endpoint + +from typing import Optional +from uuid import UUID + +from pydantic import BaseModel, Field, model_validator + + +class HtmlfyRequest(BaseModel): + website_url: str = Field(..., example="https://scrapegraphai.com/") + render_heavy_js: bool = Field( + False, + description="Whether to render heavy JavaScript (defaults to False)", + ) + headers: Optional[dict[str, str]] = Field( + None, + example={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36", + "Cookie": "cookie1=value1; cookie2=value2", + }, + description="Optional headers to send with the request, including cookies " + "and user agent", + ) + + @model_validator(mode="after") + def validate_url(self) -> "HtmlfyRequest": + if self.website_url is None or not self.website_url.strip(): + raise ValueError("Website URL cannot be empty") + if not ( + self.website_url.startswith("http://") + or self.website_url.startswith("https://") + ): + raise ValueError("Invalid URL") + return self + + def model_dump(self, *args, **kwargs) -> dict: + # Set exclude_none=True to exclude None values from serialization + kwargs.setdefault("exclude_none", True) + return super().model_dump(*args, **kwargs) + + +class 
GetHtmlfyRequest(BaseModel): + """Request model for get_htmlfy endpoint""" + + request_id: str = Field(..., example="123e4567-e89b-12d3-a456-426614174000") + + @model_validator(mode="after") + def validate_request_id(self) -> "GetHtmlfyRequest": + try: + # Validate the request_id is a valid UUID + UUID(self.request_id) + except ValueError: + raise ValueError("request_id must be a valid UUID") + return self diff --git a/scrapegraph-py/test_htmlfy_integration.py b/scrapegraph-py/test_htmlfy_integration.py new file mode 100644 index 0000000..ae3fb4a --- /dev/null +++ b/scrapegraph-py/test_htmlfy_integration.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +""" +Simple integration test for HTMLfy functionality. +This script tests the basic HTMLfy operations without requiring a real API key. +""" + +import os +import sys +from pathlib import Path + +# Add the src directory to the path +sys.path.insert(0, str(Path(__file__).parent / "scrapegraph_py")) + +from models.htmlfy import HtmlfyRequest, GetHtmlfyRequest + + +def test_htmlfy_models(): + """Test HTMLfy model validation""" + print("πŸ§ͺ Testing HTMLfy models...") + + # Test valid requests + try: + request = HtmlfyRequest( + website_url="https://example.com", + render_heavy_js=False + ) + print("βœ… Basic HTMLfy request validation passed") + + request_with_headers = HtmlfyRequest( + website_url="https://example.com", + render_heavy_js=True, + headers={"User-Agent": "Test Agent"} + ) + print("βœ… HTMLfy request with headers validation passed") + + except Exception as e: + print(f"❌ HTMLfy request validation failed: {e}") + return False + + # Test invalid requests + try: + HtmlfyRequest(website_url="") + print("❌ Empty URL should have failed validation") + return False + except ValueError: + print("βœ… Empty URL validation correctly failed") + + try: + HtmlfyRequest(website_url="invalid-url") + print("❌ Invalid URL should have failed validation") + return False + except ValueError: + print("βœ… Invalid URL validation correctly failed") + + # Test GetHtmlfyRequest + try: + get_request = GetHtmlfyRequest( + request_id="123e4567-e89b-12d3-a456-426614174000" + ) + print("βœ… Get HTMLfy request validation passed") + except Exception as e: + print(f"❌ Get HTMLfy request validation failed: {e}") + return False + + try: + GetHtmlfyRequest(request_id="invalid-uuid") + print("❌ Invalid UUID should have failed validation") + return False + except ValueError: + print("βœ… Invalid UUID validation correctly failed") + + print("βœ… All HTMLfy model tests passed!") + return True + + +def test_htmlfy_model_serialization(): + """Test HTMLfy model serialization""" + print("\nπŸ§ͺ Testing HTMLfy model serialization...") + + try: + # Test basic serialization + request = HtmlfyRequest( + website_url="https://example.com", + render_heavy_js=False + ) + data = request.model_dump() + + assert "website_url" in data + assert "render_heavy_js" in data + assert "headers" not in data # Should be excluded as None + print("βœ… Basic serialization test passed") + + # Test serialization with headers + request_with_headers = HtmlfyRequest( + website_url="https://example.com", + render_heavy_js=True, + headers={"User-Agent": "Test Agent"} + ) + data_with_headers = request_with_headers.model_dump() + + assert data_with_headers["headers"] == {"User-Agent": "Test Agent"} + print("βœ… Serialization with headers test passed") + + print("βœ… All serialization tests passed!") + return True + + except Exception as e: + print(f"❌ Serialization test failed: {e}") + return False + + +def 
main(): + """Run all HTMLfy tests""" + print("πŸš€ HTMLfy Integration Tests") + print("=" * 40) + + tests = [ + test_htmlfy_models, + test_htmlfy_model_serialization, + ] + + passed = 0 + total = len(tests) + + for test in tests: + if test(): + passed += 1 + print() + + print("πŸ“Š Test Results") + print("=" * 20) + print(f"Passed: {passed}/{total}") + + if passed == total: + print("πŸŽ‰ All tests passed!") + return 0 + else: + print("❌ Some tests failed!") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scrapegraph-py/tests/test_async_client.py b/scrapegraph-py/tests/test_async_client.py index c663814..90ff782 100644 --- a/scrapegraph-py/tests/test_async_client.py +++ b/scrapegraph-py/tests/test_async_client.py @@ -549,3 +549,253 @@ async def test_crawl_markdown_mode_validation(mock_api_key): "Data schema should not be provided when extraction_mode=False" in str(e) ) + + +# ============================================================================ +# ASYNC HTMLFY TESTS +# ============================================================================ + + +@pytest.mark.asyncio +async def test_async_htmlfy_basic(mock_api_key): + """Test basic async HTMLfy request""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "htmlfy_request_id": str(uuid4()), + "status": "completed", + "html": "
<html><body><h1>Example Page</h1><p>This is HTML content.</p></body></html>
", + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.htmlfy(website_url="https://example.com") + assert response["status"] == "completed" + assert "html" in response + assert "
<h1>Example Page</h1>
" in response["html"] + + +@pytest.mark.asyncio +async def test_async_htmlfy_with_heavy_js(mock_api_key): + """Test async HTMLfy request with heavy JavaScript rendering""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "htmlfy_request_id": str(uuid4()), + "status": "completed", + "html": "
<html><body>JavaScript rendered content</body></html>
", + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.htmlfy( + website_url="https://example.com", + render_heavy_js=True + ) + assert response["status"] == "completed" + assert "html" in response + assert "JavaScript rendered content" in response["html"] + + +@pytest.mark.asyncio +async def test_async_htmlfy_with_headers(mock_api_key): + """Test async HTMLfy request with custom headers""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "htmlfy_request_id": str(uuid4()), + "status": "completed", + "html": "
<html><body>Content with custom headers</body></html>
", + }, + ) + + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Cookie": "session=123" + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.htmlfy( + website_url="https://example.com", + headers=headers + ) + assert response["status"] == "completed" + assert "html" in response + + +@pytest.mark.asyncio +async def test_async_htmlfy_with_all_options(mock_api_key): + """Test async HTMLfy request with all options enabled""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "htmlfy_request_id": str(uuid4()), + "status": "completed", + "html": "
<html><body>Full featured content</body></html>
", + }, + ) + + headers = { + "User-Agent": "Custom Agent", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" + } + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.htmlfy( + website_url="https://example.com", + render_heavy_js=True, + headers=headers + ) + assert response["status"] == "completed" + assert "html" in response + + +@pytest.mark.asyncio +async def test_async_get_htmlfy(mock_api_key, mock_uuid): + """Test async get HTMLfy result""" + with aioresponses() as mocked: + mocked.get( + f"https://api.scrapegraphai.com/v1/htmlfy/{mock_uuid}", + payload={ + "htmlfy_request_id": mock_uuid, + "status": "completed", + "html": "
<html><body>Retrieved HTML content</body></html>
", + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.get_htmlfy(mock_uuid) + assert response["status"] == "completed" + assert response["htmlfy_request_id"] == mock_uuid + assert "html" in response + + +@pytest.mark.asyncio +async def test_async_htmlfy_error_response(mock_api_key): + """Test async HTMLfy error response handling""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "error": "Website not accessible", + "status": "error" + }, + status=400 + ) + + async with AsyncClient(api_key=mock_api_key) as client: + with pytest.raises(Exception): + await client.htmlfy(website_url="https://inaccessible-site.com") + + +@pytest.mark.asyncio +async def test_async_htmlfy_processing_status(mock_api_key): + """Test async HTMLfy processing status response""" + with aioresponses() as mocked: + mocked.post( + "https://api.scrapegraphai.com/v1/htmlfy", + payload={ + "htmlfy_request_id": str(uuid4()), + "status": "processing", + "message": "HTMLfy job started" + }, + ) + + async with AsyncClient(api_key=mock_api_key) as client: + response = await client.htmlfy(website_url="https://example.com") + assert response["status"] == "processing" + assert "htmlfy_request_id" in response + + +@pytest.mark.asyncio +async def test_async_htmlfy_complex_html_response(mock_api_key): + """Test async HTMLfy with complex HTML response""" + complex_html = """ + + + + + + Complex Page + + + +
+        <nav>
+            <a href="/">Home</a>
+        </nav>
+        <main>
+            <h1>Welcome</h1>
+            <p>This is a complex HTML page with multiple elements.</p>
+            <img src="sample.jpg" alt="Sample image">
+            <table>
+                <tr><td>Data 1</td><td>Data 2</td></tr>
+            </table>
+        </main>
+    </body>
+    </html>
+    """
+
+    with aioresponses() as mocked:
+        mocked.post(
+            "https://api.scrapegraphai.com/v1/htmlfy",
+            payload={
+                "htmlfy_request_id": str(uuid4()),
+                "status": "completed",
+                "html": complex_html,
+            },
+        )
+
+        async with AsyncClient(api_key=mock_api_key) as client:
+            response = await client.htmlfy(website_url="https://complex-example.com")
+            assert response["status"] == "completed"
+            assert "html" in response
+            assert "<html" in response["html"]
+            assert "Complex Page" in response["html"]
+            assert "<nav>" in response["html"]
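
For quick reference, a minimal sync usage sketch of the endpoint this patch adds. This is editor commentary, not part of the patch: it uses only calls the patch itself defines (Client.from_env(), client.htmlfy(), client.get_htmlfy()) and the response keys the tests exercise (status, html, htmlfy_request_id); the URL and User-Agent are placeholder values.

    from scrapegraph_py import Client

    # Client.from_env() reads SGAI_API_KEY from the environment, as in the examples above.
    with Client.from_env() as client:
        # render_heavy_js and headers are optional; shown here for completeness.
        result = client.htmlfy(
            website_url="https://example.com",
            render_heavy_js=False,
            headers={"User-Agent": "scrapegraph-py-example/1.0"},  # placeholder UA
        )

        if result.get("status") == "completed":
            # Completed responses carry the raw page HTML directly.
            print(result["html"][:200])
        elif result.get("status") == "processing":
            # Processing responses can be polled later by request id.
            result = client.get_htmlfy(result["htmlfy_request_id"])
            print(result.get("status"))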
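
The async surface mirrors this one-for-one; a minimal sketch under the same assumptions, with a single follow-up poll standing in for a real retry-with-backoff loop:

    import asyncio

    from scrapegraph_py import AsyncClient


    async def main() -> None:
        # AsyncClient.from_env() mirrors the sync client's environment lookup.
        async with AsyncClient.from_env() as client:
            result = await client.htmlfy(website_url="https://example.com")
            if result.get("status") == "processing":
                # One follow-up poll for brevity; real callers should retry with backoff.
                result = await client.get_htmlfy(result["htmlfy_request_id"])
            print(result.get("status"), len(result.get("html", "")))


    if __name__ == "__main__":
        asyncio.run(main())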