data-artisans-centre · arisha8809 · Nov 26, 2024
diff --git a/agents/website_metadata_extractor/README.rst b/agents/website_metadata_extractor/README.rst
@@ -0,0 +1,94 @@
+Website Metadata Extractor Plugin
+=================================
+
+The **Website Metadata Extractor Plugin** is designed to fetch metadata (such as title, description, and keywords) from a website. This plugin allows you to extract relevant metadata from a webpage for further analysis or processing.
+
+Features
+--------
+- Extract metadata from any website using its URL.
+- Retrieve the **title**, **meta description**, and **meta keywords**.
+- Simple integration with the **PlugFlow** framework.
+- Handles error cases such as invalid URLs or network issues.
+
+Installation
+------------
+Ensure that the required dependencies are installed. Add the following to your environment or `requirements.txt`:
+
+.. code-block:: text
+
+    requests
+    beautifulsoup4
+
+Install the dependencies using pip:
+
+.. code-block:: bash
+
+    pip install requests beautifulsoup4
+
+Parameters
+----------
+The plugin accepts the following parameters:
+
+- `url` (str): The URL of the website to extract metadata from.
+- `max_comments` (int): Maximum number of comments to fetch (default: 10). **Not currently supported:** the plugin's `execute` method accepts only `url`. This parameter would become relevant only if the plugin is extended to also fetch comments.
+
+Example Usage
+-------------
+To execute the plugin, use the **PlugFlow CLI**:
+
+.. code-block:: bash
+
+    python main.py execute website-metadata-extractor --params '{"url": "https://example.com"}'
+
+Output
+------
+The plugin returns the fetched metadata as a JSON object with the following fields:
+
+- `Title`: The title of the webpage.
+- `Meta Description`: The content of the meta description tag.
+- `Meta Keywords`: The content of the meta keywords tag.
+
+Example:
+
+.. code-block:: json
+
+    {
+        "Title": "Example Domain",
+        "Meta Description": "This domain is established to be used for illustrative examples in documents.",
+        "Meta Keywords": "example, domain, illustrative"
+    }
+
+Health Check
+-------------
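+The plugin includes a `health_check` method to verify its operational status. The method sends a request to a known test URL (``https://example.com``) and reports ``healthy`` when the response status is 200; any other status code or a network error is reported as ``unhealthy`` together with a message describing the problem.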
+
+Example health check output:
+
+.. code-block:: json
+
+    {
+        "status": "healthy",
+        "message": "Service is operational"
+    }
+
+Testing
+-------
+To test the plugin, use the provided test suite located in the `tests` directory.
+
+Run all tests:
+
+.. code-block:: bash
+
+    pytest agents/website_metadata_extractor/tests
+
+Contributing
+------------
+Contributions to improve or enhance the plugin are welcome. Follow these steps:
+
+1. Fork the repository.
+2. Create a new branch for your changes.
+3. Submit a pull request with a detailed description of your changes.
+
+License
+-------
+This plugin is distributed under the MIT License. See the LICENSE file for more information.
diff --git a/agents/website_metadata_extractor/__init__.py b/agents/website_metadata_extractor/__init__.py
@@ -0,0 +1,55 @@
+import requests
+import json
+from bs4 import BeautifulSoup
+from core.base import AgentBase
+from log import logger
+
+class WebsiteMetadataAgent(AgentBase):
+    """Agent to extract metadata from a website."""
+
+    def execute(self, url):
+        """Extract metadata from a website."""
+        try:
+            logger.info(f"Fetching metadata for URL: {url}")
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()  # Raise error for HTTP issues
+
+            soup = BeautifulSoup(response.content, "html.parser")
+            metadata = {
+                "Title": self._get_meta_data(soup, "title"),
+                "Meta Description": self._get_meta_data(soup, "description"),
+                "Meta Keywords": self._get_meta_data(soup, "keywords"),
+            }
+            metadata_json = json.dumps(metadata, indent=4)
+            logger.info(f"Extracted Metadata:\n{metadata_json}")
+            return metadata
+
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Request failed: {e}")
+            raise ValueError(f"Failed to fetch metadata. Error: {e}")
+
+    def health_check(self):
+        """Check if the agent can fetch a test URL."""
+        try:
+            logger.info("Performing health check...")
+            test_url = "https://example.com"
+            response = requests.get(test_url, timeout=5)
+            if response.status_code == 200:
+                logger.info("Health check passed.")
+                return {"status": "healthy", "message": "Service is operational"}
+            else:
+                logger.error(f"Health check failed: Unexpected response {response.status_code}.")
+                return {"status": "unhealthy", "message": f"Unexpected response: {response.status_code}"}
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Health check failed: {e}")
+            return {"status": "unhealthy", "message": f"Health check failed with error: {str(e)}"}
+
+    def _get_meta_data(self, soup, meta_name):
+        """Helper function to extract metadata from HTML content."""
+        if meta_name == "title" and soup.title:
+            return soup.title.string.strip()
+        elif meta_name == "description" or meta_name == "keywords":
+            meta_tag = soup.find("meta", attrs={"name": meta_name})
+            if meta_tag and "content" in meta_tag.attrs:
+                return meta_tag["content"].strip()
+        return f"No {meta_name} found"
diff --git a/agents/website_metadata_extractor/manifest.json b/agents/website_metadata_extractor/manifest.json
@@ -0,0 +1,7 @@
+{
+    "name": "website-metadata-extractor",
+    "description": "An agent to extract metadata from a website.",
+    "class_name": "WebsiteMetadataAgent",
+    "authors": ["Arisha gupta"],
+    "entry_point": "__init__"
+}
diff --git a/agents/website_metadata_extractor/tests/__init__.py b/agents/website_metadata_extractor/tests/__init__.py
diff --git a/agents/website_metadata_extractor/tests/test_website_metadata_extractor.py b/agents/website_metadata_extractor/tests/test_website_metadata_extractor.py
@@ -0,0 +1,100 @@
+import pytest
+from agents.website_metadata_extractor import WebsiteMetadataAgent
+import requests
+
+class MockRequests:
+    """Mock class for requests.get."""
+    @staticmethod
+    def get(url, timeout=10):
+        if "invalid" in url:
+            raise ValueError("Invalid URL")
+        elif "error" in url:
+            # Simulate network error
+            raise Exception("Simulated network error")
+
+        class MockResponse:
+            status_code = 200
+
+            @staticmethod
+            def raise_for_status():
+                pass
+
+            @property
+            def content(self):
+                # Simulate a simple HTML response for testing
+                return """
+                    <html>
+                        <head>
+                            <title>Test Title</title>
+                            <meta name="description" content="This is a test description">
+                            <meta name="keywords" content="test, metadata, extractor">
+                        </head>
+                        <body>
+                            <h1>Header 1</h1>
+                            <h2>Header 2</h2>
+                        </body>
+                    </html>
+                """
+        return MockResponse()
+
+
+@pytest.fixture
+def website_metadata_agent(monkeypatch):
+    """Fixture to initialize the WebsiteMetadataAgent with a mock requests.get."""
+    agent = WebsiteMetadataAgent()
+    # Patch the requests.get method with the mock class
+    monkeypatch.setattr("requests.get", MockRequests.get)
+    return agent
+
+
+def test_execute_success(website_metadata_agent):
+    """Test successful execution of the WebsiteMetadataAgent."""
+    url = "https://valid-url.com"
+    metadata = website_metadata_agent.execute(url)
+
+    # Assertions for metadata fields
+    assert metadata["Title"] == "Test Title", "Expected title to be 'Test Title'."
+    assert metadata["Meta Description"] == "This is a test description", "Expected meta description to match."
+    assert metadata["Meta Keywords"] == "test, metadata, extractor", "Expected meta keywords to match."
+
+
+def test_execute_invalid_url(website_metadata_agent):
+    """Test execution with an invalid URL."""
+    url = "https://invalid-url.com"
+    with pytest.raises(ValueError) as excinfo:
+        website_metadata_agent.execute(url)
+
+    assert "Invalid URL" in str(excinfo.value)
+
+
+def test_execute_network_error(website_metadata_agent):
+    """Test execution with a simulated network error."""
+    url = "https://error-url.com"
+    with pytest.raises(Exception, match="Simulated network error"):
+        website_metadata_agent.execute(url)
+
+
+def test_health_check_success(website_metadata_agent):
+    """Test health check success."""
+    health = website_metadata_agent.health_check()
+    assert health["status"] == "healthy", "Expected health status to be 'healthy'."
+    assert "Service is operational" in health["message"], "Expected success message in health check."
+
+
+def test_health_check_failure(monkeypatch, website_metadata_agent):
+    """Test health check failure by mocking requests.get to raise an exception."""
+
+    # Define the mock method to simulate a failure in health check
+    def mock_get(*args, **kwargs):
+        # Raising the correct exception (generic exception)
+        raise requests.exceptions.RequestException("Mock health check failure")
+
+    # Patch the requests.get method to simulate a failure during the health check
+    monkeypatch.setattr("requests.get", mock_get)
+
+    # Run the health check and assert the error
+    health = website_metadata_agent.health_check()
+
+    # Validate the output after the simulated failure
+    assert health["status"] == "unhealthy", "Expected health status to be 'unhealthy'."
+    assert "Mock health check failure" in health["message"], "Expected failure message in health check."