
Commit 6e07c7f

Merge pull request #3 from ScrapeGraphAI/update-to-new-features-and-add-smartcrawler
Update server.py
2 parents 5d0be4d + 16808d0 · commit 6e07c7f

File tree

1 file changed: +177 -8 lines changed

src/scrapegraph_mcp/server.py

Lines changed: 177 additions & 8 deletions
@@ -5,6 +5,8 @@
 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
+- crawl_requester: Initiate intelligent web crawling requests (step 1)
+- crawl_fetcher: Fetch results from crawling requests (step 2)
 """

 import os
@@ -56,22 +58,32 @@ def markdownify(self, website_url: str) -> Dict[str, Any]:

         return response.json()

-    def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:
+    def smartscraper(self, user_prompt: str, website_url: str, number_of_scrolls: int = None, markdown_only: bool = None) -> Dict[str, Any]:
         """
         Extract structured data from a webpage using AI.

         Args:
             user_prompt: Instructions for what data to extract
             website_url: URL of the webpage to scrape
+            number_of_scrolls: Number of infinite scrolls to perform (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)

         Returns:
-            Dictionary containing the extracted data
+            Dictionary containing the extracted data or markdown content
         """
         url = f"{self.BASE_URL}/smartscraper"
         data = {
             "user_prompt": user_prompt,
             "website_url": website_url
         }
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls
+
+        # Add markdown_only to the request if provided
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only

         response = self.client.post(url, headers=self.headers, json=data)

@@ -81,12 +93,14 @@ def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:

         return response.json()

-    def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
+    def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None) -> Dict[str, Any]:
         """
         Perform AI-powered web searches with structured results.

         Args:
             user_prompt: Search query or instructions
+            num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+            number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

         Returns:
             Dictionary containing search results and reference URLs
@@ -95,6 +109,14 @@ def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
         data = {
             "user_prompt": user_prompt
         }
+
+        # Add num_results to the request if provided
+        if num_results is not None:
+            data["num_results"] = num_results
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls

         response = self.client.post(url, headers=self.headers, json=data)

@@ -104,6 +126,81 @@ def searchscraper(self, user_prompt: str) -> Dict[str, Any]:

         return response.json()

+    def crawl_requester(
+        self,
+        url: str,
+        prompt: str = None,
+        cache_website: bool = None,
+        depth: int = None,
+        max_pages: int = None,
+        same_domain_only: bool = None,
+        markdown_only: bool = None
+    ) -> Dict[str, Any]:
+        """
+        Initiate a web crawling request and get a request ID.
+
+        Args:
+            url: Starting URL to crawl
+            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+            cache_website: Whether to cache the website content (optional)
+            depth: Maximum crawling depth (optional)
+            max_pages: Maximum number of pages to crawl (optional)
+            same_domain_only: Whether to crawl only within the same domain (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)
+
+        Returns:
+            Dictionary containing the request ID and status
+        """
+        endpoint = f"{self.BASE_URL}/crawl/requester"
+        data = {
+            "url": url
+        }
+
+        # Add optional parameters if provided
+        if prompt is not None:
+            data["prompt"] = prompt
+        if cache_website is not None:
+            data["cache_website"] = cache_website
+        if depth is not None:
+            data["depth"] = depth
+        if max_pages is not None:
+            data["max_pages"] = max_pages
+        if same_domain_only is not None:
+            data["same_domain_only"] = same_domain_only
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only
+
+        response = self.client.post(endpoint, headers=self.headers, json=data)
+
+        if response.status_code != 200:
+            error_msg = f"Error {response.status_code}: {response.text}"
+            raise Exception(error_msg)
+
+        return response.json()
+
+    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+        """
+        Fetch the results of a crawling request using the request ID.
+
+        Args:
+            request_id: The request ID returned by crawl_requester
+
+        Returns:
+            Dictionary containing the crawl results or status
+        """
+        endpoint = f"{self.BASE_URL}/crawl/fetcher"
+        data = {
+            "request_id": request_id
+        }
+
+        response = self.client.post(endpoint, headers=self.headers, json=data)
+
+        if response.status_code != 200:
+            error_msg = f"Error {response.status_code}: {response.text}"
+            raise Exception(error_msg)
+
+        return response.json()
+
     def close(self) -> None:
         """Close the HTTP client."""
         self.client.close()
@@ -142,37 +239,45 @@ def markdownify(website_url: str) -> Dict[str, Any]:
 @mcp.tool()
 def smartscraper(
     user_prompt: str,
-    website_url: str
+    website_url: str,
+    number_of_scrolls: int = None,
+    markdown_only: bool = None
 ) -> Dict[str, Any]:
     """
     Extract structured data from a webpage using AI.

     Args:
         user_prompt: Instructions for what data to extract
         website_url: URL of the webpage to scrape
+        number_of_scrolls: Number of infinite scrolls to perform (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)

     Returns:
-        Dictionary containing the extracted data
+        Dictionary containing the extracted data or markdown content
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}

     try:
-        return scrapegraph_client.smartscraper(user_prompt, website_url)
+        return scrapegraph_client.smartscraper(user_prompt, website_url, number_of_scrolls, markdown_only)
     except Exception as e:
         return {"error": str(e)}


 # Add tool for searchscraper
 @mcp.tool()
 def searchscraper(
-    user_prompt: str
+    user_prompt: str,
+    num_results: int = None,
+    number_of_scrolls: int = None
 ) -> Dict[str, Any]:
     """
     Perform AI-powered web searches with structured results.

     Args:
         user_prompt: Search query or instructions
+        num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+        number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

     Returns:
         Dictionary containing search results and reference URLs
@@ -181,7 +286,71 @@ def searchscraper(
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}

     try:
-        return scrapegraph_client.searchscraper(user_prompt)
+        return scrapegraph_client.searchscraper(user_prompt, num_results, number_of_scrolls)
+    except Exception as e:
+        return {"error": str(e)}
+
+
+# Add tool for crawl requester (smartcrawler step 1)
+@mcp.tool()
+def crawl_requester(
+    url: str,
+    prompt: str = None,
+    cache_website: bool = None,
+    depth: int = None,
+    max_pages: int = None,
+    same_domain_only: bool = None,
+    markdown_only: bool = None
+) -> Dict[str, Any]:
+    """
+    Initiate a web crawling request and get a request ID.
+
+    Args:
+        url: Starting URL to crawl
+        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+        cache_website: Whether to cache the website content (optional)
+        depth: Maximum crawling depth (optional)
+        max_pages: Maximum number of pages to crawl (optional)
+        same_domain_only: Whether to crawl only within the same domain (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)
+
+    Returns:
+        Dictionary containing the request ID and status
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.crawl_requester(
+            url=url,
+            prompt=prompt,
+            cache_website=cache_website,
+            depth=depth,
+            max_pages=max_pages,
+            same_domain_only=same_domain_only,
+            markdown_only=markdown_only
+        )
+    except Exception as e:
+        return {"error": str(e)}
+
+
+# Add tool for crawl fetcher (smartcrawler step 2)
+@mcp.tool()
+def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+    """
+    Fetch the results of a crawling request using the request ID.
+
+    Args:
+        request_id: The request ID returned by crawl_requester
+
+    Returns:
+        Dictionary containing the crawl results or status
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.crawl_fetcher(request_id)
     except Exception as e:
         return {"error": str(e)}
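Usage note: the smartscraper and searchscraper methods above gain optional parameters (number_of_scrolls, markdown_only, num_results). A minimal sketch of calling them is below, assuming the client class is exported as ScapeGraphClient and takes the API key in its constructor (neither detail appears in this diff); the key and URLs are placeholders.

    # Sketch only: the class name, constructor, and URLs are assumptions, not from this commit.
    from scrapegraph_mcp.server import ScapeGraphClient

    client = ScapeGraphClient(api_key="sgai-...")  # placeholder key

    # Scroll the page three times before AI extraction.
    products = client.smartscraper(
        user_prompt="List product names and prices",
        website_url="https://example.com/catalog",  # placeholder URL
        number_of_scrolls=3,
    )

    # Skip AI processing and return markdown only.
    page_md = client.smartscraper(
        user_prompt="Convert to markdown",
        website_url="https://example.com",
        markdown_only=True,
    )

    # Search 5 websites instead of the default 3 noted in the docstring.
    results = client.searchscraper(
        user_prompt="Latest LLM evaluation benchmarks",
        num_results=5,
    )

    client.close()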

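The smartcrawler flow added in this commit is two-step: crawl_requester starts a crawl and returns a request ID, and crawl_fetcher is called with that ID to retrieve the results once they are ready. A rough polling sketch under the same assumptions as above; the "request_id" and "status" response keys and the "processing" status value are guesses, since the commit only documents the return values loosely.

    # Sketch only: response key names and status values are assumptions.
    import time

    from scrapegraph_mcp.server import ScapeGraphClient

    client = ScapeGraphClient(api_key="sgai-...")  # placeholder key

    # Step 1: initiate the crawl and keep the request ID.
    request = client.crawl_requester(
        url="https://example.com",  # placeholder URL
        prompt="Collect the title and summary of every article",
        depth=2,
        max_pages=10,
        same_domain_only=True,
    )
    request_id = request.get("request_id")  # assumed key name

    # Step 2: poll until the crawl has finished, then read the results.
    while True:
        result = client.crawl_fetcher(request_id)
        if result.get("status") != "processing":  # assumed status field/value
            break
        time.sleep(5)

    print(result)
    client.close()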