
Commit f1e9efe

feat: update crawl integration
1 parent e62d4ec commit f1e9efe

8 files changed: +98 additions, -153 deletions


scrapegraph-py/examples/async/async_crawl_example.py

Lines changed: 16 additions & 47 deletions
@@ -1,5 +1,5 @@
 """
-Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema using the async client.
+Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint using the async client.
 
 Requirements:
 - Python 3.7+
@@ -31,59 +31,28 @@ async def main():
         print("SGAI_API_KEY=your_api_key_here")
         return
 
-    # Example schema (from your curl command)
-    schema: Dict[str, Any] = {
-        "$schema": "http://json-schema.org/draft-07/schema#",
-        "title": "ScrapeGraphAI Website Content",
+    # Simple schema for founders' information
+    schema = {
         "type": "object",
         "properties": {
-            "company": {
-                "type": "object",
-                "properties": {
-                    "name": {"type": "string"},
-                    "description": {"type": "string"},
-                    "features": {"type": "array", "items": {"type": "string"}},
-                    "contact_email": {"type": "string", "format": "email"},
-                    "social_links": {
-                        "type": "object",
-                        "properties": {
-                            "github": {"type": "string", "format": "uri"},
-                            "linkedin": {"type": "string", "format": "uri"},
-                            "twitter": {"type": "string", "format": "uri"},
-                        },
-                        "additionalProperties": False,
-                    },
-                },
-                "required": ["name", "description"],
-            },
-            "services": {
+            "founders": {
                 "type": "array",
                 "items": {
                     "type": "object",
                     "properties": {
-                        "service_name": {"type": "string"},
-                        "description": {"type": "string"},
-                        "features": {"type": "array", "items": {"type": "string"}},
-                    },
-                    "required": ["service_name", "description"],
-                },
-            },
-            "legal": {
-                "type": "object",
-                "properties": {
-                    "privacy_policy": {"type": "string"},
-                    "terms_of_service": {"type": "string"},
-                },
-                "required": ["privacy_policy", "terms_of_service"],
-            },
-        },
-        "required": ["company", "services", "legal"],
+                        "name": {"type": "string"},
+                        "title": {"type": "string"},
+                        "bio": {"type": "string"},
+                        "linkedin": {"type": "string"},
+                        "twitter": {"type": "string"}
+                    }
+                }
+            }
+        }
     }
 
-    url = "https://scrapegraphai.com/"
-    prompt = (
-        "What does the company do? and I need text content from there privacy and terms"
-    )
+    url = "https://scrapegraphai.com"
+    prompt = "extract the founders'infos"
 
     try:
         # Initialize the async client
@@ -99,7 +68,7 @@ async def main():
             depth=2,
            max_pages=2,
             same_domain_only=True,
-            batch_size=1,
+            # batch_size is optional and will be excluded if not provided
         )
         execution_time = time.time() - start_time
         print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds")

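Note: a minimal end-to-end sketch of the updated async flow follows. It assumes scrapegraph_py exports AsyncClient, that the client resolves SGAI_API_KEY from the environment, and that no explicit close or context manager is required; none of those details are shown in this diff.

import asyncio

from scrapegraph_py import AsyncClient  # assumed export name, not confirmed by this diff


async def run_crawl() -> dict:
    client = AsyncClient()  # assumption: API key resolved from the SGAI_API_KEY env var
    schema = {
        "type": "object",
        "properties": {
            "founders": {
                "type": "array",
                "items": {"type": "object", "properties": {"name": {"type": "string"}}},
            }
        },
    }
    # batch_size is simply omitted; the updated client leaves it out of the payload.
    return await client.crawl(
        url="https://scrapegraphai.com",
        prompt="extract the founders' infos",
        data_schema=schema,
        depth=2,
        max_pages=2,
        same_domain_only=True,
    )


if __name__ == "__main__":
    print(asyncio.run(run_crawl()))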
scrapegraph-py/examples/sync/crawl_example.py

Lines changed: 30 additions & 41 deletions
@@ -1,5 +1,5 @@
 """
-Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint with a custom schema.
+Example demonstrating how to use the ScrapeGraphAI /v1/crawl/ API endpoint.
 
 Requirements:
 - Python 3.7+
@@ -13,57 +13,44 @@
 import json
 import os
 import time
-from typing import Dict, Any, List, Optional
+from typing import Dict, Any
 
 from dotenv import load_dotenv
 
-from pydantic import BaseModel, EmailStr, HttpUrl
 from scrapegraph_py import Client
 
 # Load environment variables from .env file
 load_dotenv()
 
-# Pydantic models for schema
-class SocialLinks(BaseModel):
-    github: Optional[HttpUrl]
-    linkedin: Optional[HttpUrl]
-    twitter: Optional[HttpUrl]
-
-class Company(BaseModel):
-    name: str
-    description: str
-    features: Optional[List[str]] = None
-    contact_email: Optional[EmailStr] = None
-    social_links: Optional[SocialLinks] = None
-
-class Service(BaseModel):
-    service_name: str
-    description: str
-    features: Optional[List[str]] = None
-
-class Legal(BaseModel):
-    privacy_policy: str
-    terms_of_service: str
-
-class WebsiteContent(BaseModel):
-    company: Company
-    services: List[Service]
-    legal: Legal
-
 def main():
     if not os.getenv("SGAI_API_KEY"):
         print("Error: SGAI_API_KEY not found in .env file")
         print("Please create a .env file with your API key:")
         print("SGAI_API_KEY=your_api_key_here")
         return
 
-    # Example schema (from your curl command)
-    schema = WebsiteContent.schema()
-
-    url = "https://scrapegraphai.com/"
-    prompt = (
-        "What does the company do? and I need text content from there privacy and terms"
-    )
+    # Simple schema for founders' information
+    schema = {
+        "type": "object",
+        "properties": {
+            "founders": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "name": {"type": "string"},
+                        "title": {"type": "string"},
+                        "bio": {"type": "string"},
+                        "linkedin": {"type": "string"},
+                        "twitter": {"type": "string"}
+                    }
+                }
+            }
+        }
+    }
+
+    url = "https://scrapegraphai.com"
+    prompt = "extract the founders'infos"
 
     try:
         # Initialize the client
@@ -80,7 +67,7 @@ def main():
             depth=2,
             max_pages=2,
             same_domain_only=True,
-            batch_size=1,
+            # batch_size is optional and will be excluded if not provided
         )
         execution_time = time.time() - start_time
         print(f"POST /v1/crawl/ execution time: {execution_time:.2f} seconds")
@@ -92,7 +79,8 @@ def main():
         start_time = time.time()
         if crawl_id:
             print("\nPolling for crawl result...")
-            for _ in range(10):
+            # Increase timeout to 5 minutes (60 iterations × 5 seconds)
+            for i in range(60):
                 time.sleep(5)
                 result = client.get_crawl(crawl_id)
                 if result.get("status") == "success" and result.get("result"):
@@ -108,9 +96,10 @@ def main():
                     print(json.dumps(result, indent=2))
                     break
                 else:
-                    print(f"Status: {result.get('status')}, waiting...")
+                    elapsed_time = (i + 1) * 5
+                    print(f"Status: {result.get('status')}, waiting... ({elapsed_time}s elapsed)")
             else:
-                print("Crawl did not complete in time.")
+                print("Crawl did not complete within 5 minutes.")
         else:
             print("No crawl ID found in response. Synchronous result:")
             print(json.dumps(crawl_response, indent=2))

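The longer polling window in this example can also be factored into a small helper. A sketch under the assumption, taken from the loop above, that client.get_crawl(crawl_id) returns a dict carrying "status" and "result" keys; the helper name poll_crawl is hypothetical.

import time
from typing import Any, Dict, Optional


def poll_crawl(client: Any, crawl_id: str,
               interval: int = 5, max_iterations: int = 60) -> Optional[Dict[str, Any]]:
    """Hypothetical helper: poll get_crawl() until success or ~5 minutes elapse."""
    for i in range(max_iterations):
        time.sleep(interval)
        result = client.get_crawl(crawl_id)  # assumed to return a dict, as in the example
        if result.get("status") == "success" and result.get("result"):
            return result
        print(f"Status: {result.get('status')}, waiting... ({(i + 1) * interval}s elapsed)")
    return None  # caller decides how to report the timeout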
scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 22 additions & 14 deletions
@@ -319,7 +319,7 @@ async def crawl(
         depth: int = 2,
         max_pages: int = 2,
         same_domain_only: bool = True,
-        batch_size: int = 1,
+        batch_size: Optional[int] = None,
     ):
         """Send a crawl request"""
         logger.info("🔍 Starting crawl request")
@@ -330,22 +330,30 @@
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
-        logger.debug(f"📦 Batch size: {batch_size}")
-
-        request = CrawlRequest(
-            url=url,
-            prompt=prompt,
-            data_schema=data_schema,
-            cache_website=cache_website,
-            depth=depth,
-            max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            batch_size=batch_size,
-        )
+        if batch_size is not None:
+            logger.debug(f"📦 Batch size: {batch_size}")
+
+        # Build request data, excluding batch_size if not provided
+        request_data = {
+            "url": url,
+            "prompt": prompt,
+            "data_schema": data_schema,
+            "cache_website": cache_website,
+            "depth": depth,
+            "max_pages": max_pages,
+            "same_domain_only": same_domain_only,
+        }
+
+        if batch_size is not None:
+            request_data["batch_size"] = batch_size
+
+        request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
 
+        # Serialize the request, excluding None values
+        request_json = request.model_dump(exclude_none=True)
         result = await self._make_request(
-            "POST", f"{API_BASE_URL}/crawl", json=request.model_dump()
+            "POST", f"{API_BASE_URL}/crawl", json=request_json
         )
         logger.info("✨ Crawl request completed successfully")
         return result

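The essence of this change: build a plain dict, add batch_size only when the caller supplied it, validate through the pydantic model, then serialize with model_dump(exclude_none=True) so unset fields never reach the API. A self-contained sketch with a stand-in model (DemoCrawlRequest is illustrative, not the real CrawlRequest):

from typing import Optional

from pydantic import BaseModel


class DemoCrawlRequest(BaseModel):
    """Stand-in for CrawlRequest, only to show the serialization behaviour."""
    url: str
    prompt: str
    batch_size: Optional[int] = None


def build_payload(url: str, prompt: str, batch_size: Optional[int] = None) -> dict:
    request_data: dict = {"url": url, "prompt": prompt}
    if batch_size is not None:
        request_data["batch_size"] = batch_size      # only included when provided
    request = DemoCrawlRequest(**request_data)       # validation still runs on every field
    return request.model_dump(exclude_none=True)     # None-valued fields are dropped


print(build_payload("https://scrapegraphai.com", "extract the founders' infos"))
# {'url': 'https://scrapegraphai.com', 'prompt': "extract the founders' infos"}
print(build_payload("https://scrapegraphai.com", "extract the founders' infos", batch_size=5))
# same payload plus 'batch_size': 5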
scrapegraph-py/scrapegraph_py/client.py

Lines changed: 22 additions & 14 deletions
@@ -322,7 +322,7 @@ def crawl(
         depth: int = 2,
         max_pages: int = 2,
         same_domain_only: bool = True,
-        batch_size: int = 1,
+        batch_size: Optional[int] = None,
     ):
         """Send a crawl request"""
         logger.info("🔍 Starting crawl request")
@@ -333,22 +333,30 @@
         logger.debug(f"🔍 Depth: {depth}")
         logger.debug(f"📄 Max pages: {max_pages}")
         logger.debug(f"🏠 Same domain only: {same_domain_only}")
-        logger.debug(f"📦 Batch size: {batch_size}")
-
-        request = CrawlRequest(
-            url=url,
-            prompt=prompt,
-            data_schema=data_schema,
-            cache_website=cache_website,
-            depth=depth,
-            max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            batch_size=batch_size,
-        )
+        if batch_size is not None:
+            logger.debug(f"📦 Batch size: {batch_size}")
+
+        # Build request data, excluding batch_size if not provided
+        request_data = {
+            "url": url,
+            "prompt": prompt,
+            "data_schema": data_schema,
+            "cache_website": cache_website,
+            "depth": depth,
+            "max_pages": max_pages,
+            "same_domain_only": same_domain_only,
+        }
+
+        if batch_size is not None:
+            request_data["batch_size"] = batch_size
+
+        request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
 
+        # Serialize the request, excluding None values
+        request_json = request.model_dump(exclude_none=True)
         result = self._make_request(
-            "POST", f"{API_BASE_URL}/crawl", json=request.model_dump()
+            "POST", f"{API_BASE_URL}/crawl", json=request_json
        )
         logger.info("✨ Crawl request completed successfully")
         return result

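The sync client mirrors the async change, so callers that do want batching keep passing the argument explicitly, while omitting it keeps the field off the wire. A usage sketch, assuming Client() resolves SGAI_API_KEY from the environment (the constructor is not part of this diff):

from scrapegraph_py import Client

client = Client()  # assumption: API key resolved from the SGAI_API_KEY env var

# Explicit batch_size is still accepted and validated against the 1-10 range;
# leaving it out now omits the field from the request body entirely.
response = client.crawl(
    url="https://scrapegraphai.com",
    prompt="extract the founders' infos",
    data_schema={"type": "object", "properties": {"founders": {"type": "array"}}},
    depth=2,
    max_pages=2,
    same_domain_only=True,
    batch_size=5,
)
print(response)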
scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 8 additions & 2 deletions
@@ -37,8 +37,8 @@ class CrawlRequest(BaseModel):
         default=True,
         description="Whether to only crawl pages from the same domain"
     )
-    batch_size: conint(ge=1, le=10) = Field(
-        default=1,
+    batch_size: Optional[conint(ge=1, le=10)] = Field(
+        default=None,
         description="Batch size for processing pages (1-10)"
     )
 
@@ -69,6 +69,12 @@ def validate_data_schema(self) -> "CrawlRequest":
             raise ValueError("Data schema cannot be empty")
         return self
 
+    @model_validator(mode="after")
+    def validate_batch_size(self) -> "CrawlRequest":
+        if self.batch_size is not None and (self.batch_size < 1 or self.batch_size > 10):
+            raise ValueError("Batch size must be between 1 and 10")
+        return self
+
 
 class GetCrawlRequest(BaseModel):
     """Request model for get_crawl endpoint"""

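The Optional[conint(ge=1, le=10)] annotation already rejects out-of-range values, so the added validator is a defensive duplicate of that bound check. A sketch with a stand-in model (DemoRequest is illustrative, not the real CrawlRequest) showing that None passes, an in-range value is kept, and an out-of-range value is rejected:

from typing import Optional

from pydantic import BaseModel, Field, ValidationError, conint, model_validator


class DemoRequest(BaseModel):
    """Stand-in mirroring the batch_size field of CrawlRequest."""
    batch_size: Optional[conint(ge=1, le=10)] = Field(
        default=None,
        description="Batch size for processing pages (1-10)",
    )

    @model_validator(mode="after")
    def validate_batch_size(self) -> "DemoRequest":
        # Redundant with conint's ge/le bounds, kept to mirror the model's explicit check.
        if self.batch_size is not None and not (1 <= self.batch_size <= 10):
            raise ValueError("Batch size must be between 1 and 10")
        return self


print(DemoRequest().model_dump(exclude_none=True))              # {}  (field omitted)
print(DemoRequest(batch_size=3).model_dump(exclude_none=True))  # {'batch_size': 3}

try:
    DemoRequest(batch_size=0)
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["type"])  # the conint lower bound fails first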
scrapegraph-py/test_schema_fix.py

Lines changed: 0 additions & 35 deletions
This file was deleted.
