
Commit f1e9efe

feat: update crawl integration
1 parent e62d4ec commit f1e9efe

8 files changed: +98 additions, -153 deletions


scrapegraph-py/examples/async/async_crawl_example.py

Lines changed: 16 additions & 47 deletions
@@ -1,5 +1,5 @@
 """
-Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema using the async client.
+Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint using the async client.
 
 Requirements:
 - Python 3.7+
@@ -31,59 +31,28 @@ async def main():
         print("SGAI_API_KEY=your_api_key_here")
         return
 
-    # Example schema (from your curl command)
-    schema: Dict[str, Any] = {
-        "$schema": "http://json-schema.org/draft-07/schema#",
-        "title": "ScrapeGraphAI Website Content",
+    # Simple schema for founders' information
+    schema = {
         "type": "object",
         "properties": {
-            "company": {
-                "type": "object",
-                "properties": {
-                    "name": {"type": "string"},
-                    "description": {"type": "string"},
-                    "features": {"type": "array", "items": {"type": "string"}},
-                    "contact_email": {"type": "string", "format": "email"},
-                    "social_links": {
-                        "type": "object",
-                        "properties": {
-                            "github": {"type": "string", "format": "uri"},
-                            "linkedin": {"type": "string", "format": "uri"},
-                            "twitter": {"type": "string", "format": "uri"},
-                        },
-                        "additionalProperties": False,
-                    },
-                },
-                "required": ["name", "description"],
-            },
-            "services": {
+            "founders": {
                 "type": "array",
                 "items": {
                     "type": "object",
                     "properties": {
-                        "service_name": {"type": "string"},
-                        "description": {"type": "string"},
-                        "features": {"type": "array", "items": {"type": "string"}},
-                    },
-                    "required": ["service_name", "description"],
-                },
-            },
-            "legal": {
-                "type": "object",
-                "properties": {
-                    "privacy_policy": {"type": "string"},
-                    "terms_of_service": {"type": "string"},
-                },
-                "required": ["privacy_policy", "terms_of_service"],
-            },
-        },
-        "required": ["company", "services", "legal"],
+                        "name": {"type": "string"},
+                        "title": {"type": "string"},
+                        "bio": {"type": "string"},
+                        "linkedin": {"type": "string"},
+                        "twitter": {"type": "string"}
+                    }
+                }
+            }
+        }
     }
 
-    url = "https://scrapegraphai.com/"
-    prompt = (
-        "What does the company do? and I need text content from there privacy and terms"
-    )
+    url = "https://scrapegraphai.com"
+    prompt = "extract the founders'infos"
 
     try:
         # Initialize the async client
@@ -99,7 +68,7 @@ async def main():
             depth=2,
            max_pages=2,
             same_domain_only=True,
-            batch_size=1,
+            # batch_size is optional and will be excluded if not provided
         )
         execution_time = time.time() - start_time
         print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds")

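Note: a minimal end-to-end sketch of the updated async flow follows. It assumes scrapegraph_py exports AsyncClient, that the client resolves SGAI_API_KEY from the environment, and that no explicit close or context manager is required; none of those details are shown in this diff.

import asyncio

from scrapegraph_py import AsyncClient  # assumed export name, not confirmed by this diff


async def run_crawl() -> dict:
    client = AsyncClient()  # assumption: API key resolved from the SGAI_API_KEY env var
    schema = {
        "type": "object",
        "properties": {
            "founders": {
                "type": "array",
                "items": {"type": "object", "properties": {"name": {"type": "string"}}},
            }
        },
    }
    # batch_size is simply omitted; the updated client leaves it out of the payload.
    return await client.crawl(
        url="https://scrapegraphai.com",
        prompt="extract the founders' infos",
        data_schema=schema,
        depth=2,
        max_pages=2,
        same_domain_only=True,
    )


if __name__ == "__main__":
    print(asyncio.run(run_crawl()))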
scrapegraph-py/examples/sync/crawl_example.py

Lines changed: 30 additions & 41 deletions
@@ -1,5 +1,5 @@
 """
-Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema.
+Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint.
 
 Requirements:
 - Python 3.7+
@@ -13,57 +13,44 @@
 import json
 import os
 import time
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any
 
 from dotenv import load_dotenv
 
-from pydantic import BaseModel, EmailStr, HttpUrl
 from scrapegraph_py import Client
 
 # Load environment variables from .env file
 load_dotenv()
 
-# Pydantic models for schema
-class SocialLinks(BaseModel):
-    github: Optional[HttpUrl]
-    linkedin: Optional[HttpUrl]
-    twitter: Optional[HttpUrl]
-
-class Company(BaseModel):
-    name: str
-    description: str
-    features: Optional[List[str]] = None
-    contact_email: Optional[EmailStr] = None
-    social_links: Optional[SocialLinks] = None
-
-class Service(BaseModel):
-    service_name: str
-    description: str
-    features: Optional[List[str]] = None
-
-class Legal(BaseModel):
-    privacy_policy: str
-    terms_of_service: str
-
-class WebsiteContent(BaseModel):
-    company: Company
-    services: List[Service]
-    legal: Legal
-
 def main():
     if not os.getenv("SGAI_API_KEY"):
         print("Error: SGAI_API_KEY not found in .env file")
         print("Please create a .env file with your API key:")
         print("SGAI_API_KEY=your_api_key_here")
         return
 
-    # Example schema (from your curl command)
-    schema = WebsiteContent.schema()
-
-    url = "https://scrapegraphai.com/"
-    prompt = (
-        "What does the company do? and I need text content from there privacy and terms"
-    )
+    # Simple schema for founders' information
+    schema = {
+        "type": "object",
+        "properties": {
+            "founders": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "name": {"type": "string"},
+                        "title": {"type": "string"},
+                        "bio": {"type": "string"},
+                        "linkedin": {"type": "string"},
+                        "twitter": {"type": "string"}
+                    }
+                }
+            }
+        }
+    }
+
+    url = "https://scrapegraphai.com"
+    prompt = "extract the founders'infos"
 
     try:
         # Initialize the client
@@ -80,7 +67,7 @@ def main():
             depth=2,
             max_pages=2,
             same_domain_only=True,
-            batch_size=1,
+            # batch_size is optional and will be excluded if not provided
         )
         execution_time = time.time() - start_time
         print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds")
@@ -92,7 +79,8 @@ def main():
         start_time = time.time()
         if crawl_id:
             print("\nPolling for crawl result...")
-            for _ in range(10):
+            # Increase timeout to 5 minutes (60 iterations × 5 seconds)
+            for i in range(60):
                 time.sleep(5)
                 result = client.get_crawl(crawl_id)
                 if result.get("status") == "success" and result.get("result"):
@@ -108,9 +96,10 @@ def main():
                     print(json.dumps(result, indent=2))
                     break
                 else:
-                    print(f"Status: {result.get('status')}, waiting...")
+                    elapsed_time = (i + 1) * 5
+                    print(f"Status: {result.get('status')}, waiting... ({elapsed_time}s elapsed)")
             else:
-                print("Crawl did not complete in time.")
+                print("Crawl did not complete within 5 minutes.")
         else:
             print("No crawl ID found in response. Synchronous result:")
             print(json.dumps(crawl_response, indent=2))

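The longer polling window in this example can also be factored into a small helper. A sketch under the assumption, taken from the loop above, that client.get_crawl(crawl_id) returns a dict carrying "status" and "result" keys; the helper name poll_crawl is hypothetical.

import time
from typing import Any, Dict, Optional


def poll_crawl(client: Any, crawl_id: str,
               interval: int = 5, max_iterations: int = 60) -> Optional[Dict[str, Any]]:
    """Hypothetical helper: poll get_crawl() until success or ~5 minutes elapse."""
    for i in range(max_iterations):
        time.sleep(interval)
        result = client.get_crawl(crawl_id)  # assumed to return a dict, as in the example
        if result.get("status") == "success" and result.get("result"):
            return result
        print(f"Status: {result.get('status')}, waiting... ({(i + 1) * interval}s elapsed)")
    return None  # caller decides how to report the timeout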
scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 22 additions & 14 deletions
@@ -319,7 +319,7 @@ async def crawl(
         depth: int = 2,
         max_pages: int = 2,
         same_domain_only: bool = True,
-        batch_size: int = 1,
+        batch_size: Optional[int] = None,
     ):
         """Send a crawl request"""
         logger.info("🔍 Starting crawl request")
@@ -330,22 +330,30 @@
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
-        logger.debug(f"📦 Batch size: {batch_size}")
-
-        request = CrawlRequest(
-            url=url,
-            prompt=prompt,
-            data_schema=data_schema,
-            cache_website=cache_website,
-            depth=depth,
-            max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            batch_size=batch_size,
-        )
+        if batch_size is not None:
+            logger.debug(f"📦 Batch size: {batch_size}")
+
+        # Build request data, excluding batch_size if not provided
+        request_data = {
+            "url": url,
+            "prompt": prompt,
+            "data_schema": data_schema,
+            "cache_website": cache_website,
+            "depth": depth,
+            "max_pages": max_pages,
+            "same_domain_only": same_domain_only,
+        }
+
+        if batch_size is not None:
+            request_data["batch_size"] = batch_size
+
+        request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
 
+        # Serialize the request, excluding None values
+        request_json = request.model_dump(exclude_none=True)
         result = await self._make_request(
-            "POST", f"{API_BASE_URL}/crawl", json=request.model_dump()
+            "POST", f"{API_BASE_URL}/crawl", json=request_json
         )
         logger.info("✨ Crawl request completed successfully")
         return result

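The essence of this change: build a plain dict, add batch_size only when the caller supplied it, validate through the pydantic model, then serialize with model_dump(exclude_none=True) so unset fields never reach the API. A self-contained sketch with a stand-in model (DemoCrawlRequest is illustrative, not the real CrawlRequest):

from typing import Optional

from pydantic import BaseModel


class DemoCrawlRequest(BaseModel):
    """Stand-in for CrawlRequest, only to show the serialization behaviour."""
    url: str
    prompt: str
    batch_size: Optional[int] = None


def build_payload(url: str, prompt: str, batch_size: Optional[int] = None) -> dict:
    request_data: dict = {"url": url, "prompt": prompt}
    if batch_size is not None:
        request_data["batch_size"] = batch_size      # only included when provided
    request = DemoCrawlRequest(**request_data)       # validation still runs on every field
    return request.model_dump(exclude_none=True)     # None-valued fields are dropped


print(build_payload("https://scrapegraphai.com", "extract the founders' infos"))
# {'url': 'https://scrapegraphai.com', 'prompt': "extract the founders' infos"}
print(build_payload("https://scrapegraphai.com", "extract the founders' infos", batch_size=5))
# same payload plus 'batch_size': 5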
scrapegraph-py/scrapegraph_py/client.py

Lines changed: 22 additions & 14 deletions
@@ -322,7 +322,7 @@ def crawl(
         depth: int = 2,
         max_pages: int = 2,
         same_domain_only: bool = True,
-        batch_size: int = 1,
+        batch_size: Optional[int] = None,
     ):
         """Send a crawl request"""
         logger.info("🔍 Starting crawl request")
@@ -333,22 +333,30 @@
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
-        logger.debug(f"📦 Batch size: {batch_size}")
-
-        request = CrawlRequest(
-            url=url,
-            prompt=prompt,
-            data_schema=data_schema,
-            cache_website=cache_website,
-            depth=depth,
-            max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            batch_size=batch_size,
-        )
+        if batch_size is not None:
+            logger.debug(f"📦 Batch size: {batch_size}")
+
+        # Build request data, excluding batch_size if not provided
+        request_data = {
+            "url": url,
+            "prompt": prompt,
+            "data_schema": data_schema,
+            "cache_website": cache_website,
+            "depth": depth,
+            "max_pages": max_pages,
+            "same_domain_only": same_domain_only,
+        }
+
+        if batch_size is not None:
+            request_data["batch_size"] = batch_size
+
+        request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
 
+        # Serialize the request, excluding None values
+        request_json = request.model_dump(exclude_none=True)
         result = self._make_request(
-            "POST", f"{API_BASE_URL}/crawl", json=request.model_dump()
+            "POST", f"{API_BASE_URL}/crawl", json=request_json
        )
         logger.info("✨ Crawl request completed successfully")
         return result

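The sync client mirrors the async change, so callers that do want batching keep passing the argument explicitly, while omitting it keeps the field off the wire. A usage sketch, assuming Client() resolves SGAI_API_KEY from the environment (the constructor is not part of this diff):

from scrapegraph_py import Client

client = Client()  # assumption: API key resolved from the SGAI_API_KEY env var

# Explicit batch_size is still accepted and validated against the 1-10 range;
# leaving it out now omits the field from the request body entirely.
response = client.crawl(
    url="https://scrapegraphai.com",
    prompt="extract the founders' infos",
    data_schema={"type": "object", "properties": {"founders": {"type": "array"}}},
    depth=2,
    max_pages=2,
    same_domain_only=True,
    batch_size=5,
)
print(response)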
scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 8 additions & 2 deletions
@@ -37,8 +37,8 @@ class CrawlRequest(BaseModel):
         default=True,
         description="Whether to only crawl pages from the same domain"
     )
-    batch_size: conint(ge=1, le=10) = Field(
-        default=1,
+    batch_size: Optional[conint(ge=1, le=10)] = Field(
+        default=None,
         description="Batch size for processing pages (1-10)"
     )
 
@@ -69,6 +69,12 @@ def validate_data_schema(self) -> "CrawlRequest":
             raise ValueError("Data schema cannot be empty")
         return self
 
+    @model_validator(mode="after")
+    def validate_batch_size(self) -> "CrawlRequest":
+        if self.batch_size is not None and (self.batch_size < 1 or self.batch_size > 10):
+            raise ValueError("Batch size must be between 1 and 10")
+        return self
+
 
 class GetCrawlRequest(BaseModel):
     """Request model for get_crawl endpoint"""

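The Optional[conint(ge=1, le=10)] annotation already rejects out-of-range values, so the added validator is a defensive duplicate of that bound check. A sketch with a stand-in model (DemoRequest is illustrative, not the real CrawlRequest) showing that None passes, an in-range value is kept, and an out-of-range value is rejected:

from typing import Optional

from pydantic import BaseModel, Field, ValidationError, conint, model_validator


class DemoRequest(BaseModel):
    """Stand-in mirroring the batch_size field of CrawlRequest."""
    batch_size: Optional[conint(ge=1, le=10)] = Field(
        default=None,
        description="Batch size for processing pages (1-10)",
    )

    @model_validator(mode="after")
    def validate_batch_size(self) -> "DemoRequest":
        # Redundant with conint's ge/le bounds, kept to mirror the model's explicit check.
        if self.batch_size is not None and not (1 <= self.batch_size <= 10):
            raise ValueError("Batch size must be between 1 and 10")
        return self


print(DemoRequest().model_dump(exclude_none=True))              # {}  (field omitted)
print(DemoRequest(batch_size=3).model_dump(exclude_none=True))  # {'batch_size': 3}

try:
    DemoRequest(batch_size=0)
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])  # the conint lower bound fails first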
scrapegraph-py/test_schema_fix.py

Lines changed: 0 additions & 35 deletions
This file was deleted.
