
Commit 6e07c7f

Merge pull request #3 from ScrapeGraphAI/update-to-new-features-and-add-smartcrawler
Update server.py
2 parents 5d0be4d + 16808d0 · commit 6e07c7f

File tree

1 file changed: +177 -8 lines changed

src/scrapegraph_mcp/server.py

Lines changed: 177 additions & 8 deletions
@@ -5,6 +5,8 @@
 - markdownify: Convert any webpage into clean, formatted markdown
 - smartscraper: Extract structured data from any webpage using AI
 - searchscraper: Perform AI-powered web searches with structured results
+- crawl_requester: Initiate intelligent web crawling requests (step 1)
+- crawl_fetcher: Fetch results from crawling requests (step 2)
 """

 import os
@@ -56,22 +58,32 @@ def markdownify(self, website_url: str) -> Dict[str, Any]:

         return response.json()

-    def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:
+    def smartscraper(self, user_prompt: str, website_url: str, number_of_scrolls: int = None, markdown_only: bool = None) -> Dict[str, Any]:
         """
         Extract structured data from a webpage using AI.

         Args:
             user_prompt: Instructions for what data to extract
             website_url: URL of the webpage to scrape
+            number_of_scrolls: Number of infinite scrolls to perform (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)

         Returns:
-            Dictionary containing the extracted data
+            Dictionary containing the extracted data or markdown content
         """
         url = f"{self.BASE_URL}/smartscraper"
         data = {
             "user_prompt": user_prompt,
             "website_url": website_url
         }
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls
+
+        # Add markdown_only to the request if provided
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only

         response = self.client.post(url, headers=self.headers, json=data)

@@ -81,12 +93,14 @@ def smartscraper(self, user_prompt: str, website_url: str) -> Dict[str, Any]:

         return response.json()

-    def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
+    def searchscraper(self, user_prompt: str, num_results: int = None, number_of_scrolls: int = None) -> Dict[str, Any]:
         """
         Perform AI-powered web searches with structured results.

         Args:
             user_prompt: Search query or instructions
+            num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+            number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

         Returns:
             Dictionary containing search results and reference URLs
@@ -95,6 +109,14 @@ def searchscraper(self, user_prompt: str) -> Dict[str, Any]:
         data = {
             "user_prompt": user_prompt
         }
+
+        # Add num_results to the request if provided
+        if num_results is not None:
+            data["num_results"] = num_results
+
+        # Add number_of_scrolls to the request if provided
+        if number_of_scrolls is not None:
+            data["number_of_scrolls"] = number_of_scrolls

         response = self.client.post(url, headers=self.headers, json=data)

@@ -104,6 +126,81 @@ def searchscraper(self, user_prompt: str) -> Dict[str, Any]:

         return response.json()

+    def crawl_requester(
+        self,
+        url: str,
+        prompt: str = None,
+        cache_website: bool = None,
+        depth: int = None,
+        max_pages: int = None,
+        same_domain_only: bool = None,
+        markdown_only: bool = None
+    ) -> Dict[str, Any]:
+        """
+        Initiate a web crawling request and get a request ID.
+
+        Args:
+            url: Starting URL to crawl
+            prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+            cache_website: Whether to cache the website content (optional)
+            depth: Maximum crawling depth (optional)
+            max_pages: Maximum number of pages to crawl (optional)
+            same_domain_only: Whether to crawl only within the same domain (optional)
+            markdown_only: Whether to return only markdown content without AI processing (optional)
+
+        Returns:
+            Dictionary containing the request ID and status
+        """
+        endpoint = f"{self.BASE_URL}/crawl/requester"
+        data = {
+            "url": url
+        }
+
+        # Add optional parameters if provided
+        if prompt is not None:
+            data["prompt"] = prompt
+        if cache_website is not None:
+            data["cache_website"] = cache_website
+        if depth is not None:
+            data["depth"] = depth
+        if max_pages is not None:
+            data["max_pages"] = max_pages
+        if same_domain_only is not None:
+            data["same_domain_only"] = same_domain_only
+        if markdown_only is not None:
+            data["markdown_only"] = markdown_only
+
+        response = self.client.post(endpoint, headers=self.headers, json=data)
+
+        if response.status_code != 200:
+            error_msg = f"Error {response.status_code}: {response.text}"
+            raise Exception(error_msg)
+
+        return response.json()
+
+    def crawl_fetcher(self, request_id: str) -> Dict[str, Any]:
+        """
+        Fetch the results of a crawling request using the request ID.
+
+        Args:
+            request_id: The request ID returned by crawl_requester
+
+        Returns:
+            Dictionary containing the crawl results or status
+        """
+        endpoint = f"{self.BASE_URL}/crawl/fetcher"
+        data = {
+            "request_id": request_id
+        }
+
+        response = self.client.post(endpoint, headers=self.headers, json=data)
+
+        if response.status_code != 200:
+            error_msg = f"Error {response.status_code}: {response.text}"
+            raise Exception(error_msg)
+
+        return response.json()
+
     def close(self) -> None:
         """Close the HTTP client."""
         self.client.close()
@@ -142,37 +239,45 @@ def markdownify(website_url: str) -> Dict[str, Any]:
 @mcp.tool()
 def smartscraper(
     user_prompt: str,
-    website_url: str
+    website_url: str,
+    number_of_scrolls: int = None,
+    markdown_only: bool = None
 ) -> Dict[str, Any]:
     """
     Extract structured data from a webpage using AI.

     Args:
         user_prompt: Instructions for what data to extract
         website_url: URL of the webpage to scrape
+        number_of_scrolls: Number of infinite scrolls to perform (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)

     Returns:
-        Dictionary containing the extracted data
+        Dictionary containing the extracted data or markdown content
     """
     if scrapegraph_client is None:
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}

     try:
-        return scrapegraph_client.smartscraper(user_prompt, website_url)
+        return scrapegraph_client.smartscraper(user_prompt, website_url, number_of_scrolls, markdown_only)
     except Exception as e:
         return {"error": str(e)}


 # Add tool for searchscraper
 @mcp.tool()
 def searchscraper(
-    user_prompt: str
+    user_prompt: str,
+    num_results: int = None,
+    number_of_scrolls: int = None
 ) -> Dict[str, Any]:
     """
     Perform AI-powered web searches with structured results.

     Args:
         user_prompt: Search query or instructions
+        num_results: Number of websites to search (optional, default: 3 websites = 30 credits)
+        number_of_scrolls: Number of infinite scrolls to perform on each website (optional)

     Returns:
         Dictionary containing search results and reference URLs
@@ -181,7 +286,71 @@ def searchscraper(
         return {"error": "ScapeGraph client not initialized. Please provide an API key."}

     try:
-        return scrapegraph_client.searchscraper(user_prompt)
+        return scrapegraph_client.searchscraper(user_prompt, num_results, number_of_scrolls)
+    except Exception as e:
+        return {"error": str(e)}
+
+
+# Add tool for crawl requester (smartcrawler step 1)
+@mcp.tool()
+def crawl_requester(
+    url: str,
+    prompt: str = None,
+    cache_website: bool = None,
+    depth: int = None,
+    max_pages: int = None,
+    same_domain_only: bool = None,
+    markdown_only: bool = None
+) -> Dict[str, Any]:
+    """
+    Initiate a web crawling request and get a request ID.
+
+    Args:
+        url: Starting URL to crawl
+        prompt: AI prompt for data extraction (optional, if not provided returns markdown only)
+        cache_website: Whether to cache the website content (optional)
+        depth: Maximum crawling depth (optional)
+        max_pages: Maximum number of pages to crawl (optional)
+        same_domain_only: Whether to crawl only within the same domain (optional)
+        markdown_only: Whether to return only markdown content without AI processing (optional)
+
+    Returns:
+        Dictionary containing the request ID and status
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.crawl_requester(
+            url=url,
+            prompt=prompt,
+            cache_website=cache_website,
+            depth=depth,
+            max_pages=max_pages,
+            same_domain_only=same_domain_only,
+            markdown_only=markdown_only
+        )
+    except Exception as e:
+        return {"error": str(e)}
+
+
+# Add tool for crawl fetcher (smartcrawler step 2)
+@mcp.tool()
+def crawl_fetcher(request_id: str) -> Dict[str, Any]:
+    """
+    Fetch the results of a crawling request using the request ID.
+
+    Args:
+        request_id: The request ID returned by crawl_requester
+
+    Returns:
+        Dictionary containing the crawl results or status
+    """
+    if scrapegraph_client is None:
+        return {"error": "ScapeGraph client not initialized. Please provide an API key."}
+
+    try:
+        return scrapegraph_client.crawl_fetcher(request_id)
     except Exception as e:
         return {"error": str(e)}
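Usage note: the smartscraper and searchscraper methods above gain optional parameters (number_of_scrolls, markdown_only, num_results). A minimal sketch of calling them is below, assuming the client class is exported as ScapeGraphClient and takes the API key in its constructor (neither detail appears in this diff); the key and URLs are placeholders.

    # Sketch only: the class name, constructor, and URLs are assumptions, not from this commit.
    from scrapegraph_mcp.server import ScapeGraphClient

    client = ScapeGraphClient(api_key="sgai-...")  # placeholder key

    # Scroll the page three times before AI extraction.
    products = client.smartscraper(
        user_prompt="List product names and prices",
        website_url="https://example.com/catalog",  # placeholder URL
        number_of_scrolls=3,
    )

    # Skip AI processing and return markdown only.
    page_md = client.smartscraper(
        user_prompt="Convert to markdown",
        website_url="https://example.com",
        markdown_only=True,
    )

    # Search 5 websites instead of the default 3 noted in the docstring.
    results = client.searchscraper(
        user_prompt="Latest LLM evaluation benchmarks",
        num_results=5,
    )

    client.close()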

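The smartcrawler flow added in this commit is two-step: crawl_requester starts a crawl and returns a request ID, and crawl_fetcher is called with that ID to retrieve the results once they are ready. A rough polling sketch under the same assumptions as above; the "request_id" and "status" response keys and the "processing" status value are guesses, since the commit only documents the return values loosely.

    # Sketch only: response key names and status values are assumptions.
    import time

    from scrapegraph_mcp.server import ScapeGraphClient

    client = ScapeGraphClient(api_key="sgai-...")  # placeholder key

    # Step 1: initiate the crawl and keep the request ID.
    request = client.crawl_requester(
        url="https://example.com",  # placeholder URL
        prompt="Collect the title and summary of every article",
        depth=2,
        max_pages=10,
        same_domain_only=True,
    )
    request_id = request.get("request_id")  # assumed key name

    # Step 2: poll until the crawl has finished, then read the results.
    while True:
        result = client.crawl_fetcher(request_id)
        if result.get("status") != "processing":  # assumed status field/value
            break
        time.sleep(5)

    print(result)
    client.close()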