Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions agents/website_metadata_extractor/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
Website Metadata Extractor Plugin
=================================

The **Website Metadata Extractor Plugin** is designed to fetch metadata (such as title, description, and keywords) from a website. This plugin allows you to extract relevant metadata from a webpage for further analysis or processing.

Features
--------
- Extract metadata from any website using its URL.
- Retrieve the **title**, **meta description**, and **meta keywords**.
- Simple integration with the **PlugFlow** framework.
- Handles error cases such as invalid URLs or network issues.

Installation
------------
Ensure that the required dependencies are installed. Add the following to your environment or `requirements.txt`:

.. code-block:: text

requests
beautifulsoup4

Install the dependencies using pip:

.. code-block:: bash

pip install requests beautifulsoup4

Parameters
----------
The plugin accepts the following parameters:

- `url` (str): The URL of the website to extract metadata from.
- `max_comments` (int): Not currently supported. The agent's `execute` method accepts only `url`; this parameter (maximum number of comments to fetch, default: 10) would become relevant only if the plugin is extended to also extract comments.

Example Usage
-------------
To execute the plugin, use the **PlugFlow CLI**:

.. code-block:: bash

python main.py execute website-metadata-extractor --params '{"url": "https://example.com"}'

Output
------
The plugin returns the fetched metadata as a JSON object with the following fields:

- `Title`: The title of the webpage.
- `Meta Description`: The content of the meta description tag.
- `Meta Keywords`: The content of the meta keywords tag.

Example:

.. code-block:: json

{
"Title": "Example Domain",
"Meta Description": "This domain is established to be used for illustrative examples in documents.",
"Meta Keywords": "example, domain, illustrative"
}

Health Check
-------------
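The plugin includes a `health_check` method to verify its operational status. The method sends a GET request to `https://example.com` and reports `healthy` when the response status code is 200; any other status code or a request error is reported as `unhealthy`, together with a message describing the failure.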

Example health check output:

.. code-block:: json

{
"status": "healthy",
"message": "Service is operational"
}

Testing
-------
To test the plugin, use the provided test suite located in the `tests` directory.

Run all tests:

.. code-block:: bash

pytest agents/website_metadata_extractor/tests

Contributing
------------
Contributions to improve or enhance the plugin are welcome. Follow these steps:

1. Fork the repository.
2. Create a new branch for your changes.
3. Submit a pull request with a detailed description of your changes.

License
-------
This plugin is distributed under the MIT License. See the LICENSE file for more information.
55 changes: 55 additions & 0 deletions agents/website_metadata_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import requests
import json
from bs4 import BeautifulSoup
from core.base import AgentBase
from log import logger

class WebsiteMetadataAgent(AgentBase):
    """Agent that extracts the title, meta description and meta keywords of a web page."""

    def execute(self, url):
        """Fetch ``url`` and extract its metadata.

        Args:
            url: Address of the page to inspect.

        Returns:
            A dict with the keys ``"Title"``, ``"Meta Description"`` and
            ``"Meta Keywords"``. A field that cannot be found on the page
            holds the placeholder ``"No <name> found"``.

        Raises:
            ValueError: If the request fails or the server answers with an
                HTTP error status. The underlying ``requests`` exception is
                chained as ``__cause__``.
        """
        try:
            logger.info(f"Fetching metadata for URL: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # Raise error for HTTP issues

            soup = BeautifulSoup(response.content, "html.parser")
            metadata = {
                "Title": self._get_meta_data(soup, "title"),
                "Meta Description": self._get_meta_data(soup, "description"),
                "Meta Keywords": self._get_meta_data(soup, "keywords"),
            }
            # Serialized only for the log line; callers receive the dict itself.
            metadata_json = json.dumps(metadata, indent=4)
            logger.info(f"Extracted Metadata:\n{metadata_json}")
            return metadata

        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed: {e}")
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Failed to fetch metadata. Error: {e}") from e

    def health_check(self):
        """Check whether the agent can reach a test URL.

        Returns:
            A dict with a ``"status"`` of ``"healthy"`` or ``"unhealthy"``
            and a human-readable ``"message"``. Request errors are reported
            in the returned dict rather than raised.
        """
        try:
            logger.info("Performing health check...")
            test_url = "https://example.com"
            response = requests.get(test_url, timeout=5)
            if response.status_code == 200:
                logger.info("Health check passed.")
                return {"status": "healthy", "message": "Service is operational"}
            else:
                logger.error(f"Health check failed: Unexpected response {response.status_code}.")
                return {"status": "unhealthy", "message": f"Unexpected response: {response.status_code}"}
        except requests.exceptions.RequestException as e:
            logger.error(f"Health check failed: {e}")
            return {"status": "unhealthy", "message": f"Health check failed with error: {str(e)}"}

    def _get_meta_data(self, soup, meta_name):
        """Return one metadata field from parsed HTML.

        Args:
            soup: Parsed document (a ``BeautifulSoup`` object).
            meta_name: ``"title"``, ``"description"`` or ``"keywords"``.

        Returns:
            The stripped field text, or ``"No <meta_name> found"`` when the
            field is absent. An empty ``<title>`` counts as absent.
        """
        if meta_name == "title":
            title_tag = soup.title
            if title_tag is not None:
                # ``.string`` is None when <title> is empty or contains
                # nested markup; fall back to the concatenated text so that
                # case no longer raises AttributeError.
                raw_title = title_tag.string
                if raw_title is None:
                    raw_title = title_tag.get_text()
                title_text = raw_title.strip()
                if title_text:
                    return title_text
        elif meta_name == "description" or meta_name == "keywords":
            meta_tag = soup.find("meta", attrs={"name": meta_name})
            if meta_tag and "content" in meta_tag.attrs:
                return meta_tag["content"].strip()
        return f"No {meta_name} found"
7 changes: 7 additions & 0 deletions agents/website_metadata_extractor/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"name": "website-metadata-extractor",
"description": "An agent to extract metadata from a website.",
"class_name": "WebsiteMetadataAgent",
"authors": ["Arisha gupta"],
"entry_point": "__init__"
}
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import pytest
from agents.website_metadata_extractor import WebsiteMetadataAgent
import requests

class MockRequests:
    """Stand-in for ``requests.get`` that never touches the network.

    Failures are raised as ``requests`` exception types, the same family the
    real ``requests.get`` raises, so the agent's own error handling is what
    the tests exercise.
    """

    @staticmethod
    def get(url, timeout=10):
        """Return a canned response, or raise for URLs marked as failing.

        Args:
            url: Requested address. A URL containing ``"invalid"`` simulates
                a malformed URL; one containing ``"error"`` simulates a
                network failure.
            timeout: Accepted for signature compatibility; ignored.

        Returns:
            An object exposing ``status_code``, ``raise_for_status()`` and
            ``content`` like a successful response.

        Raises:
            requests.exceptions.InvalidURL: For URLs containing ``"invalid"``.
            requests.exceptions.ConnectionError: For URLs containing ``"error"``.
        """
        if "invalid" in url:
            # InvalidURL is both a RequestException and a ValueError.
            raise requests.exceptions.InvalidURL("Invalid URL")
        elif "error" in url:
            # Simulate network error
            raise requests.exceptions.ConnectionError("Simulated network error")

        class MockResponse:
            status_code = 200

            @staticmethod
            def raise_for_status():
                pass

            @property
            def content(self):
                # Simulate a simple HTML response for testing
                return """
                <html>
                    <head>
                        <title>Test Title</title>
                        <meta name="description" content="This is a test description">
                        <meta name="keywords" content="test, metadata, extractor">
                    </head>
                    <body>
                        <h1>Header 1</h1>
                        <h2>Header 2</h2>
                    </body>
                </html>
                """

        return MockResponse()


@pytest.fixture
def website_metadata_agent(monkeypatch):
    """Provide a WebsiteMetadataAgent whose ``requests.get`` calls hit MockRequests."""
    instance = WebsiteMetadataAgent()
    # Replace the real HTTP call with the canned mock; monkeypatch undoes
    # the replacement when the test finishes.
    monkeypatch.setattr(requests, "get", MockRequests.get)
    return instance


def test_execute_success(website_metadata_agent):
    """A reachable page yields the title, description and keywords from its HTML."""
    result = website_metadata_agent.execute("https://valid-url.com")

    # (field, expected value, message shown on failure)
    expectations = (
        ("Title", "Test Title", "Expected title to be 'Test Title'."),
        ("Meta Description", "This is a test description", "Expected meta description to match."),
        ("Meta Keywords", "test, metadata, extractor", "Expected meta keywords to match."),
    )
    for field, expected, failure_note in expectations:
        assert result[field] == expected, failure_note


def test_execute_invalid_url(website_metadata_agent):
    """An invalid URL surfaces as a ValueError mentioning 'Invalid URL'."""
    with pytest.raises(ValueError) as raised:
        website_metadata_agent.execute("https://invalid-url.com")

    error_text = str(raised.value)
    assert "Invalid URL" in error_text


def test_execute_network_error(website_metadata_agent):
    """A simulated network failure propagates out of execute()."""
    expect_failure = pytest.raises(Exception, match="Simulated network error")
    with expect_failure:
        website_metadata_agent.execute("https://error-url.com")


def test_health_check_success(website_metadata_agent):
    """The health check reports 'healthy' when the mocked GET returns 200."""
    report = website_metadata_agent.health_check()

    assert report["status"] == "healthy", "Expected health status to be 'healthy'."
    reported_message = report["message"]
    assert "Service is operational" in reported_message, "Expected success message in health check."


def test_health_check_failure(monkeypatch, website_metadata_agent):
    """The health check reports 'unhealthy' when requests.get raises."""

    def failing_get(*args, **kwargs):
        # Any call fails the way a broken connection would inside requests.
        raise requests.exceptions.RequestException("Mock health check failure")

    # Override the fixture's mock so every GET now fails.
    monkeypatch.setattr(requests, "get", failing_get)

    outcome = website_metadata_agent.health_check()

    # The error must be reported in the result, not raised.
    assert outcome["status"] == "unhealthy", "Expected health status to be 'unhealthy'."
    assert "Mock health check failure" in outcome["message"], "Expected failure message in health check."