diff --git a/agents/website_metadata_extractor/README.rst b/agents/website_metadata_extractor/README.rst new file mode 100644 index 0000000..e1651d2 --- /dev/null +++ b/agents/website_metadata_extractor/README.rst @@ -0,0 +1,94 @@ +Website Metadata Extractor Plugin +================================= + +The **Website Metadata Extractor Plugin** is designed to fetch metadata (such as title, description, and keywords) from a website. This plugin allows you to extract relevant metadata from a webpage for further analysis or processing. + +Features +-------- +- Extract metadata from any website using its URL. +- Retrieve the **title**, **meta description**, and **meta keywords**. +- Simple integration with the **PlugFlow** framework. +- Handles error cases such as invalid URLs or network issues. + +Installation +------------ +Ensure that the required dependencies are installed. Add the following to your environment or `requirements.txt`: + +.. code-block:: text + + requests + beautifulsoup4 + +Install the dependencies using pip: + +.. code-block:: bash + + pip install requests beautifulsoup4 + +Parameters +---------- +The plugin accepts the following parameters: + +- `url` (str): The URL of the website to extract metadata from. +- `max_comments` (int): Not used by this plugin at present; only `url` is read. It would become relevant only if the plugin were extended to also fetch comments (maximum number of comments to fetch, default: 10). + +Example Usage +------------- +To execute the plugin, use the **PlugFlow CLI**: + +.. code-block:: bash + + python main.py execute website-metadata-extractor --params '{"url": "https://example.com"}' + +Output +------ +The plugin returns the fetched metadata as a JSON object with the following fields: + +- `Title`: The title of the webpage. +- `Meta Description`: The content of the meta description tag. +- `Meta Keywords`: The content of the meta keywords tag. + +Example: + +.. 
code-block:: json + + { + "Title": "Example Domain", + "Meta Description": "This domain is established to be used for illustrative examples in documents.", + "Meta Keywords": "example, domain, illustrative" + } + +Health Check +------------ +The plugin includes a `health_check` method to verify its operational status. The method sends a request to a known test page (https://example.com) and returns a status message: `healthy` when the page responds with HTTP 200, `unhealthy` otherwise. + +Example health check output: + +.. code-block:: json + + { + "status": "healthy", + "message": "Service is operational" + } + +Testing +------- +To test the plugin, use the provided test suite located in the `tests` directory. + +Run all tests: + +.. code-block:: bash + + pytest agents/website_metadata_extractor/tests + +Contributing +------------ +Contributions to improve or enhance the plugin are welcome. Follow these steps: + +1. Fork the repository. +2. Create a new branch for your changes. +3. Submit a pull request with a detailed description of your changes. + +License +------- +This plugin is distributed under the MIT License. See the LICENSE file for more information. 
import json

import requests
from bs4 import BeautifulSoup

from core.base import AgentBase
from log import logger


class WebsiteMetadataAgent(AgentBase):
    """Agent that extracts the title, description and keywords of a web page."""

    def execute(self, url):
        """Fetch ``url`` and return its basic metadata.

        Args:
            url: Address of the page to inspect.

        Returns:
            A dict with the keys ``"Title"``, ``"Meta Description"`` and
            ``"Meta Keywords"``. A value that cannot be found in the page is
            reported as ``"No <name> found"``.

        Raises:
            ValueError: If the request fails (connection problem, malformed
                URL, timeout or HTTP error status). The underlying
                ``requests`` exception is chained as ``__cause__``.
        """
        # Only the network call can raise RequestException, so keep the
        # ``try`` body limited to it.
        try:
            logger.info(f"Fetching metadata for URL: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Turn 4xx/5xx responses into errors
        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed: {e}")
            # Chain the original exception so callers can inspect the cause.
            raise ValueError(f"Failed to fetch metadata. Error: {e}") from e

        soup = BeautifulSoup(response.content, "html.parser")
        metadata = {
            "Title": self._get_meta_data(soup, "title"),
            "Meta Description": self._get_meta_data(soup, "description"),
            "Meta Keywords": self._get_meta_data(soup, "keywords"),
        }
        # The JSON rendering is used for logging only; the dict is returned.
        metadata_json = json.dumps(metadata, indent=4)
        logger.info(f"Extracted Metadata:\n{metadata_json}")
        return metadata

    def health_check(self):
        """Check whether the agent can reach a known test URL.

        Returns:
            A dict with ``"status"`` (``"healthy"`` or ``"unhealthy"``) and a
            human-readable ``"message"``. Request errors are reported in the
            result instead of being raised.
        """
        try:
            logger.info("Performing health check...")
            test_url = "https://example.com"
            response = requests.get(test_url, timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Health check failed: {e}")
            return {"status": "unhealthy", "message": f"Health check failed with error: {str(e)}"}

        if response.status_code == 200:
            logger.info("Health check passed.")
            return {"status": "healthy", "message": "Service is operational"}

        logger.error(f"Health check failed: Unexpected response {response.status_code}.")
        return {"status": "unhealthy", "message": f"Unexpected response: {response.status_code}"}

    def _get_meta_data(self, soup, meta_name):
        """Extract one metadata value from parsed HTML.

        Args:
            soup: Parsed document (``BeautifulSoup`` object).
            meta_name: ``"title"``, ``"description"`` or ``"keywords"``.

        Returns:
            The stripped text of the ``<title>`` tag, or the stripped
            ``content`` attribute of the matching ``<meta name=...>`` tag.
            Falls back to ``"No <meta_name> found"`` when the value is
            missing or, for the title, empty.
        """
        if meta_name == "title":
            title_tag = soup.title
            if title_tag is not None:
                # ``Tag.string`` is None for an empty <title> or one with
                # several children, so calling ``.strip()`` on it would raise
                # AttributeError; ``get_text`` always returns a string.
                title_text = title_tag.get_text(strip=True)
                if title_text:
                    return title_text
        elif meta_name in ("description", "keywords"):
            meta_tag = soup.find("meta", attrs={"name": meta_name})
            if meta_tag and "content" in meta_tag.attrs:
                return meta_tag["content"].strip()
        return f"No {meta_name} found"
b/agents/website_metadata_extractor/tests/test_website_metadata_extractor.py @@ -0,0 +1,100 @@ +import pytest +from agents.website_metadata_extractor import WebsiteMetadataAgent +import requests + +class MockRequests: + """Mock class for requests.get.""" + @staticmethod + def get(url, timeout=10): + if "invalid" in url: + raise ValueError("Invalid URL") + elif "error" in url: + # Simulate network error + raise Exception("Simulated network error") + + class MockResponse: + status_code = 200 + + @staticmethod + def raise_for_status(): + pass + + @property + def content(self): + # Simulate a simple HTML response for testing + return """ + +
+