Commit d8e140f

Update server.py

1 parent 5d0be4d commit d8e140f

1 file changed: 132 additions, 8 deletions

src/scrapegraph_mcp/server.py
@@ -5,6 +5,7 @@
 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
+- crawl: Perform intelligent web crawling with AI-powered data extraction
 """
 
 import os
@@ -56,22 +57,32 @@ def markdownify(self, website_url: str) -> Dict[str, Any]:
 
         return response.json()
 
-    def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:
+    def smartscraper(self, user_prompt: str, website_url: str, number_of_scrolls: int = None, markdown_only: bool = None) -> Dict[str, Any]:
         """
         Extract structured data from a webpage using AI.
 
         Args:
             user_prompt: Instructions for what data to extract
             website_url: URL of the webpage to scrape
+            number_of_scrolls: Number of infinite scrolls to perform (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)
 
         Returns:
-            Dictionary containing the extracted data
+            Dictionary containing the extracted data or markdown content
         """
         url = f"{self.BASE_URL}/smartscraper"
         data = {
            "user_prompt": user_prompt,
            "website_url": website_url
         }
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls
+
+        # Add markdown_only to the request if provided
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only
 
         response = self.client.post(url, headers=self.headers, json=data)
 
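Note: the new parameters are added to the request body only when explicitly passed, so existing callers and older API deployments see an unchanged payload. A minimal usage sketch follows; the client class name ScapeGraphClient and its constructor are assumptions, since the diff does not show them.

    # Hypothetical sketch; the client class name and constructor are assumed.
    from scrapegraph_mcp.server import ScapeGraphClient

    client = ScapeGraphClient(api_key="sgai-...")

    # Sends only {"user_prompt": ..., "website_url": ...}, as before.
    basic = client.smartscraper(
        user_prompt="Extract the product name and price",
        website_url="https://example.com/product",
    )

    # Also sends number_of_scrolls and markdown_only, since both are not None.
    raw_md = client.smartscraper(
        user_prompt="List every item loaded while scrolling",
        website_url="https://example.com/feed",
        number_of_scrolls=5,
        markdown_only=True,
    )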
@@ -81,12 +92,14 @@ def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:
 
         return response.json()
 
-    def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
+    def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None) -> Dict[str, Any]:
         """
         Perform AI-powered web searches with structured results.
 
         Args:
             user_prompt: Search query or instructions
+            num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+            number_of_scrolls: Number of infinite scrolls to perform on each website (optional)
 
         Returns:
             Dictionary containing search results and reference URLs
@@ -95,6 +108,14 @@ def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
         data = {
             "user_prompt": user_prompt
         }
+
+        # Add num_results to the request if provided
+        if num_results is not None:
+            data["num_results"] = num_results
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls
 
         response = self.client.post(url, headers=self.headers, json=data)
 
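Note: as with smartscraper, omitted arguments are left out of the JSON body entirely rather than sent as null. A short sketch of a widened search, under the same assumptions about the client object as above:

    # Hypothetical sketch; the default is 3 websites (30 credits) when
    # num_results is omitted.
    results = client.searchscraper(
        user_prompt="Latest stable Python release and its release date",
        num_results=5,          # search 5 websites instead of the default 3
        number_of_scrolls=2,    # scroll each result page twice before extraction
    )
    # The exact response keys are not shown in this diff; the docstring
    # promises search results plus reference URLs.
    print(results)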
@@ -104,6 +125,58 @@
 
         return response.json()
 
+    def crawl(
+        self,
+        url: str,
+        prompt: str = None,
+        cache_website: bool = None,
+        depth: int = None,
+        max_pages: int = None,
+        same_domain_only: bool = None,
+        markdown_only: bool = None
+    ) -> Dict[str, Any]:
+        """
+        Perform intelligent web crawling with AI-powered data extraction.
+
+        Args:
+            url: Starting URL to crawl
+            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+            cache_website: Whether to cache the website content (optional)
+            depth: Maximum crawling depth (optional)
+            max_pages: Maximum number of pages to crawl (optional)
+            same_domain_only: Whether to crawl only within the same domain (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)
+
+        Returns:
+            Dictionary containing the crawl results
+        """
+        endpoint = f"{self.BASE_URL}/crawl"
+        data = {
+            "url": url
+        }
+
+        # Add optional parameters if provided
+        if prompt is not None:
+            data["prompt"] = prompt
+        if cache_website is not None:
+            data["cache_website"] = cache_website
+        if depth is not None:
+            data["depth"] = depth
+        if max_pages is not None:
+            data["max_pages"] = max_pages
+        if same_domain_only is not None:
+            data["same_domain_only"] = same_domain_only
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only
+
+        response = self.client.post(endpoint, headers=self.headers, json=data)
+
+        if response.status_code != 200:
+            error_msg = f"Error {response.status_code}: {response.text}"
+            raise Exception(error_msg)
+
+        return response.json()
+
     def close(self) -> None:
         """Close the HTTP client."""
         self.client.close()
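Note: the new crawl method follows the same only-send-what-was-passed pattern, so the API's own defaults apply to every omitted option. A hedged sketch of the two modes the docstring describes, markdown-only versus prompt-driven extraction (client construction assumed, as above):

    # Hypothetical sketch. Mode 1: no prompt, markdown_only=True,
    # which the docstring says returns markdown content only.
    pages = client.crawl(
        url="https://example.com",
        markdown_only=True,
        depth=2,
        max_pages=10,
        same_domain_only=True,
    )

    # Mode 2: AI extraction driven by a prompt, with caching enabled.
    extracted = client.crawl(
        url="https://example.com/docs",
        prompt="Collect every API endpoint and its HTTP method",
        cache_website=True,
    )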
@@ -142,37 +215,45 @@ def markdownify(website_url: str) -> Dict[str, Any]:
 @mcp.tool()
 def smartscraper(
     user_prompt: str,
-    website_url: str
+    website_url: str,
+    number_of_scrolls: int = None,
+    markdown_only: bool = None
 ) -> Dict[str, Any]:
     """
     Extract structured data from a webpage using AI.
 
     Args:
         user_prompt: Instructions for what data to extract
         website_url: URL of the webpage to scrape
+        number_of_scrolls: Number of infinite scrolls to perform (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)
 
     Returns:
-        Dictionary containing the extracted data
+        Dictionary containing the extracted data or markdown content
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.smartscraper(user_prompt, website_url)
+        return scrapegraph_client.smartscraper(user_prompt, website_url, number_of_scrolls, markdown_only)
     except Exception as e:
         return {"error": str(e)}
 
 
 # Add tool for searchscraper
 @mcp.tool()
 def searchscraper(
-    user_prompt: str
+    user_prompt: str,
+    num_results: int = None,
+    number_of_scrolls: int = None
 ) -> Dict[str, Any]:
     """
     Perform AI-powered web searches with structured results.
 
     Args:
         user_prompt: Search query or instructions
+        num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+        number_of_scrolls: Number of infinite scrolls to perform on each website (optional)
 
     Returns:
         Dictionary containing search results and reference URLs
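Note: because the optional parameters now appear in the @mcp.tool() signatures, an MCP host exposes them in the tool schema and a connected model can pass them directly. A hypothetical arguments payload for the updated smartscraper tool (field values are illustrative only):

    # Arguments an MCP host might forward when a model calls the tool;
    # optional fields can be omitted entirely.
    arguments = {
        "user_prompt": "Extract the article headlines",
        "website_url": "https://example.com/news",
        "number_of_scrolls": 3,
    }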
@@ -181,7 +262,50 @@ def searchscraper(
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}
 
     try:
-        return scrapegraph_client.searchscraper(user_prompt)
+        return scrapegraph_client.searchscraper(user_prompt, num_results, number_of_scrolls)
+    except Exception as e:
+        return {"error": str(e)}
+
+
+# Add tool for crawl (smartcrawler)
+@mcp.tool()
+def crawl(
+    url: str,
+    prompt: str = None,
+    cache_website: bool = None,
+    depth: int = None,
+    max_pages: int = None,
+    same_domain_only: bool = None,
+    markdown_only: bool = None
+) -> Dict[str, Any]:
+    """
+    Perform intelligent web crawling with AI-powered data extraction.
+
+    Args:
+        url: Starting URL to crawl
+        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+        cache_website: Whether to cache the website content (optional)
+        depth: Maximum crawling depth (optional)
+        max_pages: Maximum number of pages to crawl (optional)
+        same_domain_only: Whether to crawl only within the same domain (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)
+
+    Returns:
+        Dictionary containing the crawl results
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.crawl(
+            url=url,
+            prompt=prompt,
+            cache_website=cache_website,
+            depth=depth,
+            max_pages=max_pages,
+            same_domain_only=same_domain_only,
+            markdown_only=markdown_only
+        )
     except Exception as e:
         return {"error": str(e)}
 
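Note the error contract of the tool wrappers: they never raise. Both the missing-API-key case and any exception from the client (including the non-200 Exception raised inside crawl) are flattened into an {"error": ...} dict, so the MCP host always receives a JSON-serializable result. A small sketch of handling that shape, assuming the module is imported, no API key is configured, and the decorated function remains directly callable:

    # Hypothetical sketch of the tool-level error contract.
    result = crawl(url="https://example.com", depth=1)
    if "error" in result:
        print("crawl failed:", result["error"])   # e.g. client not initialized
    else:
        print("crawl result keys:", list(result.keys()))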
