
Commit b75053d

Merge pull request #4 from Vikrant-Khedkar/fix/smartcrawler
Refactor the web crawling methods in server.py to use SmartCrawler terminology. Update method signatures and documentation for clarity on the AI extraction and markdown conversion modes.
2 parents 16808d0 + 54b330d commit b75053d
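For context, the renamed tools map onto the two crawling modes this commit introduces. A minimal sketch of how they might be invoked (the URL, prompt, and option values are illustrative only; the function names and parameters come from the diff below):

# Illustrative only: the values and calling context are assumptions.
# AI Extraction Mode (10 credits per page) requires a prompt.
ai_request = smartcrawler_initiate(
    url="https://example.com",
    prompt="Extract product names and prices",
    extraction_mode="ai",
    depth=2,
    max_pages=10,
    same_domain_only=True,
)

# Markdown Conversion Mode (2 credits per page) needs no prompt.
md_request = smartcrawler_initiate(
    url="https://example.com",
    extraction_mode="markdown",
)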

File tree

1 file changed: +66, −50 lines changed

src/scrapegraph_mcp/server.py

Lines changed: 66 additions & 50 deletions
@@ -5,8 +5,8 @@
 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
-- crawl_requester: Initiate intelligent web crawling requests (step 1)
-- crawl_fetcher: Fetch results from crawling requests (step 2)
+- smartcrawler_initiate: Initiate intelligent multi-page web crawling with AI extraction or markdown conversion
+- smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
 """
 
 import os
@@ -126,49 +126,58 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
 
         return response.json()
 
-    def crawl_requester(
+    def smartcrawler_initiate(
         self,
         url: str,
         prompt: str = None,
-        cache_website: bool = None,
+        extraction_mode: str = "ai",
         depth: int = None,
         max_pages: int = None,
-        same_domain_only: bool = None,
-        markdown_only: bool = None
+        same_domain_only: bool = None
     ) -> Dict[str, Any]:
         """
-        Initiate a web crawling request and get a request ID.
+        Initiate a SmartCrawler request for multi-page web crawling.
+
+        SmartCrawler supports two modes:
+        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
+
+        SmartCrawler processes the request asynchronously and returns a request ID.
+        Poll smartcrawler_fetch_results with that ID until the status is "completed".
 
         Args:
             url: Starting URL to crawl
-            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-            cache_website: Whether to cache the website content (optional)
-            depth: Maximum crawling depth (optional)
+            prompt: AI prompt for data extraction (required for AI mode)
+            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+            depth: Maximum link traversal depth (optional)
             max_pages: Maximum number of pages to crawl (optional)
             same_domain_only: Whether to crawl only within the same domain (optional)
-            markdown_only: Whether to return only markdown content without AI processing (optional)
 
         Returns:
-            Dictionary containing the request ID and status
+            Dictionary containing the request ID for async processing
         """
-        endpoint = f"{self.BASE_URL}/crawl/requester"
+        endpoint = f"{self.BASE_URL}/crawl"
         data = {
             "url": url
         }
 
-        # Add optional parameters if provided
-        if prompt is not None:
+        # Handle extraction mode
+        if extraction_mode == "markdown":
+            data["markdown_only"] = True
+        elif extraction_mode == "ai":
+            if prompt is None:
+                raise ValueError("prompt is required when extraction_mode is 'ai'")
             data["prompt"] = prompt
-        if cache_website is not None:
-            data["cache_website"] = cache_website
+        else:
+            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")
         if depth is not None:
             data["depth"] = depth
         if max_pages is not None:
             data["max_pages"] = max_pages
         if same_domain_only is not None:
             data["same_domain_only"] = same_domain_only
-        if markdown_only is not None:
-            data["markdown_only"] = markdown_only
 
         response = self.client.post(endpoint, headers=self.headers, json=data)
 
@@ -178,22 +187,27 @@ def crawl_requester(
 
         return response.json()
 
-    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
         """
-        Fetch the results of a crawling request using the request ID.
+        Fetch the results of a SmartCrawler operation.
 
         Args:
-            request_id: The request ID returned by crawl_requester
+            request_id: The request ID returned by smartcrawler_initiate
 
         Returns:
-            Dictionary containing the crawl results or status
+            Dictionary containing the crawled data (structured extraction or markdown)
+            and metadata about processed pages
+
+        Note:
+            Processing takes some time. While the crawl is in progress, this method
+            returns the current status; keep polling until the status is "completed",
+            at which point the results are included.
         """
-        endpoint = f"{self.BASE_URL}/crawl/fetcher"
-        data = {
-            "request_id": request_id
-        }
-
-        response = self.client.post(endpoint, headers=self.headers, json=data)
+        endpoint = f"{self.BASE_URL}/crawl/{request_id}"
+
+        response = self.client.get(endpoint, headers=self.headers)
 
         if response.status_code != 200:
             error_msg = f"Error {response.status_code}: {response.text}"
@@ -291,66 +305,68 @@ def searchscraper(
         return {"error": str(e)}
 
 
-# Add tool for crawl requester (smartcrawler step 1)
+# Add tool for SmartCrawler initiation
 @mcp.tool()
-def crawl_requester(
+def smartcrawler_initiate(
     url: str,
     prompt: str = None,
-    cache_website: bool = None,
+    extraction_mode: str = "ai",
     depth: int = None,
     max_pages: int = None,
-    same_domain_only: bool = None,
-    markdown_only: bool = None
+    same_domain_only: bool = None
 ) -> Dict[str, Any]:
     """
-    Initiate a web crawling request and get a request ID.
+    Initiate a SmartCrawler request for intelligent multi-page web crawling.
+
+    SmartCrawler supports two modes:
+    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
 
     Args:
         url: Starting URL to crawl
-        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-        cache_website: Whether to cache the website content (optional)
-        depth: Maximum crawling depth (optional)
+        prompt: AI prompt for data extraction (required for AI mode)
+        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+        depth: Maximum link traversal depth (optional)
         max_pages: Maximum number of pages to crawl (optional)
         same_domain_only: Whether to crawl only within the same domain (optional)
-        markdown_only: Whether to return only markdown content without AI processing (optional)
 
     Returns:
-        Dictionary containing the request ID and status
+        Dictionary containing the request ID for async processing
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_requester(
+        return scrapegraph_client.smartcrawler_initiate(
             url=url,
             prompt=prompt,
-            cache_website=cache_website,
+            extraction_mode=extraction_mode,
             depth=depth,
             max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            markdown_only=markdown_only
+            same_domain_only=same_domain_only
         )
     except Exception as e:
         return {"error": str(e)}
 
 
-# Add tool for crawl fetcher (smartcrawler step 2)
+# Add tool for fetching SmartCrawler results
 @mcp.tool()
-def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
     """
-    Fetch the results of a crawling request using the request ID.
+    Fetch the results of a SmartCrawler operation.
 
     Args:
-        request_id: The request ID returned by crawl_requester
+        request_id: The request ID returned by smartcrawler_initiate
 
     Returns:
-        Dictionary containing the crawl results or status
+        Dictionary containing the crawled data (structured extraction or markdown)
+        and metadata about processed pages
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.crawl_fetcher(request_id)
+        return scrapegraph_client.smartcrawler_fetch_results(request_id)
     except Exception as e:
         return {"error": str(e)}
 
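Taken together, the two tools form the asynchronous workflow the new docstrings describe: initiate a crawl, then poll for results. A minimal polling sketch (the "request_id" response key and the sleep interval are assumptions; only the "completed" status value is stated in the diff):

import time

request = smartcrawler_initiate(
    url="https://example.com",
    prompt="Summarize each page",
)
request_id = request["request_id"]  # assumed key name for the returned request ID

# Keep polling until the crawl reports completion, as the docstrings advise.
result = smartcrawler_fetch_results(request_id)
while result.get("status") != "completed":
    time.sleep(5)  # polling interval is an assumption, not specified by the API
    result = smartcrawler_fetch_results(request_id)

print(result)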