- markdownify: Convert any webpage into clean, formatted markdown
- smartscraper: Extract structured data from any webpage using AI
- searchscraper: Perform AI-powered web searches with structured results
- - crawl_requester: Initiate intelligent web crawling requests (step 1)
- - crawl_fetcher: Fetch results from crawling requests (step 2)
+ - smartcrawler_initiate: Initiate intelligent multi-page web crawling with AI extraction or markdown conversion
+ - smartcrawler_fetch_results: Retrieve results from asynchronous crawling operations
"""

import os
@@ -126,49 +126,58 @@ def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scr
        return response.json()

-    def crawl_requester(
+    def smartcrawler_initiate(
        self,
        url: str,
        prompt: str = None,
-        cache_website: bool = None,
+        extraction_mode: str = "ai",
        depth: int = None,
        max_pages: int = None,
-        same_domain_only: bool = None,
-        markdown_only: bool = None
+        same_domain_only: bool = None
    ) -> Dict[str, Any]:
        """
-        Initiate a web crawling request and get a request ID.
+        Initiate a SmartCrawler request for multi-page web crawling.
+
+        SmartCrawler supports two modes:
+        - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+        - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown
+
+        SmartCrawler processes the request asynchronously and returns a request ID.
+        Pass that ID to smartcrawler_fetch_results and keep polling until the returned
+        status is "completed".

        Args:
            url: Starting URL to crawl
-            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-            cache_website: Whether to cache the website content (optional)
-            depth: Maximum crawling depth (optional)
+            prompt: AI prompt for data extraction (required for AI mode)
+            extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+            depth: Maximum link traversal depth (optional)
            max_pages: Maximum number of pages to crawl (optional)
            same_domain_only: Whether to crawl only within the same domain (optional)
-            markdown_only: Whether to return only markdown content without AI processing (optional)

        Returns:
-            Dictionary containing the request ID and status
+            Dictionary containing the request ID for async processing
        """
-        endpoint = f"{self.BASE_URL}/crawl/requester"
+        endpoint = f"{self.BASE_URL}/crawl"
        data = {
            "url": url
        }

-        # Add optional parameters if provided
-        if prompt is not None:
+        # Handle extraction mode
+        if extraction_mode == "markdown":
+            data["markdown_only"] = True
+        elif extraction_mode == "ai":
+            if prompt is None:
+                raise ValueError("prompt is required when extraction_mode is 'ai'")
            data["prompt"] = prompt
-        if cache_website is not None:
-            data["cache_website"] = cache_website
+        else:
+            raise ValueError(f"Invalid extraction_mode: {extraction_mode}. Must be 'ai' or 'markdown'")

        if depth is not None:
            data["depth"] = depth
        if max_pages is not None:
            data["max_pages"] = max_pages
        if same_domain_only is not None:
            data["same_domain_only"] = same_domain_only
-        if markdown_only is not None:
-            data["markdown_only"] = markdown_only

        response = self.client.post(endpoint, headers=self.headers, json=data)
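
For reference, a minimal sketch of how the new method might be called under each extraction mode (assuming an initialized client instance, here called scrapegraph_client as elsewhere in this module; URLs and prompts are illustrative):

    # AI extraction mode: prompt is required, billed at 10 credits per page
    ai_job = scrapegraph_client.smartcrawler_initiate(
        url="https://example.com",
        prompt="Extract product names and prices",
        extraction_mode="ai",
        depth=2,
        max_pages=10,
        same_domain_only=True,
    )

    # Markdown conversion mode: no prompt needed, billed at 2 credits per page
    md_job = scrapegraph_client.smartcrawler_initiate(
        url="https://example.com",
        extraction_mode="markdown",
        max_pages=5,
    )
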
@@ -178,22 +187,27 @@ def crawl_requester(
        return response.json()

-    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+    def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]:
        """
-        Fetch the results of a crawling request using the request ID.
+        Fetch the results of a SmartCrawler operation.

        Args:
-            request_id: The request ID returned by crawl_requester
+            request_id: The request ID returned by smartcrawler_initiate

        Returns:
-            Dictionary containing the crawl results or status
+            Dictionary containing the crawled data (structured extraction or markdown)
+            and metadata about the processed pages
+
+        Note:
+            Crawling runs asynchronously. While the crawl is still in progress this
+            call returns only the request status; keep polling until the status is
+            "completed", at which point the results are included.
        """
-        endpoint = f"{self.BASE_URL}/crawl/fetcher"
-        data = {
-            "request_id": request_id
-        }
-
-        response = self.client.post(endpoint, headers=self.headers, json=data)
+        endpoint = f"{self.BASE_URL}/crawl/{request_id}"
+
+        response = self.client.get(endpoint, headers=self.headers)

        if response.status_code != 200:
            error_msg = f"Error {response.status_code}: {response.text}"
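
A minimal polling sketch matching the Note above (the "status"/"completed" values come from the docstring; the "request_id" key name is an assumption, so inspect the actual initiate response before relying on it):

    import time

    job = scrapegraph_client.smartcrawler_initiate(
        url="https://example.com",
        prompt="Extract article titles and authors",
    )
    request_id = job.get("request_id")  # assumed key name; check the initiate response

    while True:
        result = scrapegraph_client.smartcrawler_fetch_results(request_id)
        if result.get("status") == "completed":
            break  # result now carries the extracted data or markdown
        time.sleep(5)  # poll interval is arbitrary
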
@@ -291,66 +305,68 @@ def searchscraper(
        return {"error": str(e)}


- # Add tool for crawl requester (smartcrawler step 1)
+ # Add tool for SmartCrawler initiation
@mcp.tool()
- def crawl_requester(
+ def smartcrawler_initiate(
    url: str,
    prompt: str = None,
-    cache_website: bool = None,
+    extraction_mode: str = "ai",
    depth: int = None,
    max_pages: int = None,
-    same_domain_only: bool = None,
-    markdown_only: bool = None
+    same_domain_only: bool = None
) -> Dict[str, Any]:
    """
-    Initiate a web crawling request and get a request ID.
+    Initiate a SmartCrawler request for intelligent multi-page web crawling.
+
+    SmartCrawler supports two modes:
+    - AI Extraction Mode (10 credits per page): Extracts structured data based on your prompt
+    - Markdown Conversion Mode (2 credits per page): Converts pages to clean markdown

    Args:
        url: Starting URL to crawl
-        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
-        cache_website: Whether to cache the website content (optional)
-        depth: Maximum crawling depth (optional)
+        prompt: AI prompt for data extraction (required for AI mode)
+        extraction_mode: "ai" for AI extraction or "markdown" for markdown conversion (default: "ai")
+        depth: Maximum link traversal depth (optional)
        max_pages: Maximum number of pages to crawl (optional)
        same_domain_only: Whether to crawl only within the same domain (optional)
-        markdown_only: Whether to return only markdown content without AI processing (optional)

    Returns:
-        Dictionary containing the request ID and status
+        Dictionary containing the request ID for async processing
    """
    if scrapegraph_client is None:
        return {"error": "ScapeGraph client not initialized. Please provide an API key."}

    try:
-        return scrapegraph_client.crawl_requester(
+        return scrapegraph_client.smartcrawler_initiate(
            url=url,
            prompt=prompt,
-            cache_website=cache_website,
+            extraction_mode=extraction_mode,
            depth=depth,
            max_pages=max_pages,
-            same_domain_only=same_domain_only,
-            markdown_only=markdown_only
+            same_domain_only=same_domain_only
        )
    except Exception as e:
        return {"error": str(e)}


- # Add tool for crawl fetcher (smartcrawler step 2)
+ # Add tool for fetching SmartCrawler results
@mcp.tool()
- def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+ def smartcrawler_fetch_results(request_id: str) -> Dict[str, Any]:
    """
-    Fetch the results of a crawling request using the request ID.
+    Fetch the results of a SmartCrawler operation.

    Args:
-        request_id: The request ID returned by crawl_requester
+        request_id: The request ID returned by smartcrawler_initiate

    Returns:
-        Dictionary containing the crawl results or status
+        Dictionary containing the crawled data (structured extraction or markdown)
+        and metadata about processed pages
    """
    if scrapegraph_client is None:
        return {"error": "ScapeGraph client not initialized. Please provide an API key."}

    try:
-        return scrapegraph_client.crawl_fetcher(request_id)
+        return scrapegraph_client.smartcrawler_fetch_results(request_id)
    except Exception as e:
        return {"error": str(e)}
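
Because the MCP tools report failures as an "error" key instead of raising, a caller might drive the two-step flow like this (a sketch calling the tool functions directly, outside the MCP transport; the "request_id" key is again an assumption):

    job = smartcrawler_initiate(
        url="https://example.com",
        extraction_mode="markdown",
        max_pages=3,
    )
    if "error" in job:
        raise RuntimeError(job["error"])

    results = smartcrawler_fetch_results(job.get("request_id"))
    if "error" in results:
        raise RuntimeError(results["error"])
    if results.get("status") != "completed":
        print("Crawl still in progress; call smartcrawler_fetch_results again later")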