From 5e316b4d2652e015ccfd1571cf16d60707368bf9 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 16:24:41 -0500 Subject: [PATCH 1/4] Add web fetch tool --- .../aieng/agent_evals/tools/web.py | 234 +++++++++++ .../tests/aieng/agent_evals/tools/test_web.py | 381 ++++++++++++++++++ 2 files changed, 615 insertions(+) create mode 100644 aieng-eval-agents/aieng/agent_evals/tools/web.py create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py new file mode 100644 index 0000000..c91b0aa --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -0,0 +1,234 @@ +"""Web fetch tool for retrieving content from URLs. + +Provides the web_fetch tool which fetches content from any URL (HTML pages or PDFs) +and returns the content for the agent to analyze. Similar to Anthropic's web_fetch tool. +""" + +import logging +from io import BytesIO +from typing import Any +from urllib.parse import urljoin + +import httpx +from google.adk.tools.function_tool import FunctionTool +from html_to_markdown import convert as html_to_markdown +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential + + +logger = logging.getLogger(__name__) + +MAX_CONTENT_CHARS = 100_000 + +_http_retry = retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), + reraise=True, +) + + +@_http_retry +def _fetch_with_retry(client: httpx.Client, url: str) -> httpx.Response: + """Fetch URL with automatic retry on transient failures.""" + response = client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() + return response + + +def _html_to_markdown(html: str, base_url: str | None = None) -> str: + """Convert HTML to Markdown, preserving links, tables, and structure. + + Parameters + ---------- + html : str + The HTML content to convert. + base_url : str, optional + Base URL for resolving relative links. + + Returns + ------- + str + Markdown-formatted text with preserved links and tables. + """ + # Use html-to-markdown library for high-quality conversion + # It preserves links, tables, headings, lists, and other structure + markdown = html_to_markdown(html) + + # If base_url provided, convert relative URLs to absolute + if base_url: + import re # noqa: PLC0415 + + def make_absolute(match: re.Match) -> str: + """Convert relative URL to absolute.""" + prefix = match.group(1) # [text]( or src=" + url = match.group(2) + suffix = match.group(3) # ) or " + + # Skip if already absolute or is a data URI + if url.startswith(("http://", "https://", "data:", "mailto:", "#")): + return match.group(0) + + absolute_url = urljoin(base_url, url) + return f"{prefix}{absolute_url}{suffix}" + + # Fix markdown links: [text](url) + markdown = re.sub(r"(\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) + + # Fix markdown images: ![alt](url) + markdown = re.sub(r"(!\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) + + return markdown.strip() + + +def _extract_pdf_text(content: bytes, max_pages: int = 10) -> tuple[str, int]: + """Extract text from PDF bytes. + + Parameters + ---------- + content : bytes + The PDF file content. + max_pages : int + Maximum number of pages to extract. + + Returns + ------- + tuple[str, int] + The extracted text and total number of pages. 
+ """ + from pypdf import PdfReader # noqa: PLC0415 + + pdf_file = BytesIO(content) + reader = PdfReader(pdf_file) + num_pages = len(reader.pages) + + pages_to_read = min(num_pages, max_pages) + text_parts = [] + + for i in range(pages_to_read): + page_text = reader.pages[i].extract_text() + if page_text: + text_parts.append(f"--- Page {i + 1} ---\n{page_text}") + + if pages_to_read < num_pages: + text_parts.append(f"\n[Document has {num_pages} pages. Showing first {pages_to_read}.]") + + return "\n\n".join(text_parts), num_pages + + +def _truncate_content(text: str) -> tuple[str, bool]: + """Truncate content if it exceeds the maximum length.""" + truncated = len(text) > MAX_CONTENT_CHARS + if truncated: + text = text[:MAX_CONTENT_CHARS] + "\n\n[Content truncated due to length]" + return text, truncated + + +def _make_error_response(error: str, url: str) -> dict[str, Any]: + """Create an error response dict.""" + return {"status": "error", "error": error, "url": url} + + +def _make_success_response(url: str, content: str, content_type: str, truncated: bool, **extra: Any) -> dict[str, Any]: + """Create a success response dict.""" + result = { + "status": "success", + "url": url, + "content": content, + "content_type": content_type, + "content_length": len(content), + "truncated": truncated, + } + result.update(extra) + return result + + +def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: + """Fetch content from a URL (HTML page or PDF document). + + This tool retrieves the full content from a URL for analysis. It handles + both HTML pages (converted to readable text) and PDF documents (text extracted). + + For large data files (CSV, XLSX) that need searching, use fetch_file instead. + + Parameters + ---------- + url : str + The URL to fetch. Must be a valid HTTP or HTTPS URL. + max_pages : int, optional + For PDFs, maximum number of pages to extract (default 10). + + Returns + ------- + dict + On success: 'status', 'url', 'content', 'content_type', + 'content_length', 'truncated'. For PDFs also includes: + 'num_pages', 'pages_extracted'. On error: 'status', 'error', 'url'. + + Examples + -------- + >>> # Fetch an HTML page + >>> result = web_fetch("https://example.com/about") + >>> print(result["content"]) + + >>> # Fetch a PDF + >>> result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf") + >>> print(f"Pages: {result['num_pages']}") + >>> print(result["content"]) + """ + # Validate URL + if not url.startswith(("http://", "https://")): + return _make_error_response("Invalid URL. 
Must start with http:// or https://", url) + + try: + with httpx.Client(timeout=60.0, follow_redirects=True) as client: + response = _fetch_with_retry(client, url) + content_type = response.headers.get("content-type", "") + final_url = str(response.url) + + # Handle PDF documents + if "application/pdf" in content_type or url.lower().endswith(".pdf"): + return _handle_pdf_response(response.content, max_pages, final_url, url) + + # Handle HTML and text content + if "text/html" in content_type or not content_type: + text = _html_to_markdown(response.text, base_url=final_url) + else: + text = response.text + text, truncated = _truncate_content(text) + + return _make_success_response(final_url, text, content_type or "text/html", truncated) + + except httpx.HTTPStatusError as e: + logger.warning(f"HTTP error fetching {url}: {e}") + return _make_error_response(f"HTTP {e.response.status_code}: {e.response.reason_phrase}", url) + except httpx.RequestError as e: + logger.warning(f"Request error fetching {url}: {e}") + return _make_error_response(f"Request failed: {e!s}", url) + except Exception as e: + logger.exception(f"Unexpected error in web_fetch for {url}") + return _make_error_response(f"Unexpected error: {e!s}", url) + + +def _handle_pdf_response(content: bytes, max_pages: int, final_url: str, url: str) -> dict[str, Any]: + """Handle PDF content extraction and response creation.""" + try: + text, num_pages = _extract_pdf_text(content, max_pages) + text, truncated = _truncate_content(text) + + return _make_success_response( + final_url, + text, + "application/pdf", + truncated, + num_pages=num_pages, + pages_extracted=min(num_pages, max_pages), + ) + except ImportError: + return _make_error_response("PDF support requires pypdf. Install with: pip install pypdf", url) + except Exception as e: + return _make_error_response(f"Failed to extract PDF text: {e!s}", url) + + +def create_web_fetch_tool() -> FunctionTool: + """Create an ADK FunctionTool for fetching web content.""" + return FunctionTool(func=web_fetch) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py new file mode 100644 index 0000000..7fc251d --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py @@ -0,0 +1,381 @@ +"""Tests for the web tools module. + +Tests web_fetch which handles both HTML pages and PDF documents. +""" + +from io import BytesIO +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +import pytest +from aieng.agent_evals.tools._redirect import ( + _redirect_cache, + resolve_redirect_url_async, + resolve_redirect_urls_async, +) +from aieng.agent_evals.tools.web import ( + _html_to_markdown, + create_web_fetch_tool, + web_fetch, +) +from pypdf import PdfWriter + + +class TestHtmlToMarkdown: + """Tests for the _html_to_markdown function.""" + + def test_removes_script_tags(self): + """Test that script tags are removed.""" + html = "
<html><head><script>alert('hi')</script></head><body><p>Hello</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "alert" not in result
+        assert "Hello" in result
+
+    def test_removes_style_tags(self):
+        """Test that style tags are removed."""
+        html = "<html><head><style>p { color: red; }</style></head><body><p>Text</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "color" not in result
+        assert "Text" in result
+
+    def test_converts_paragraphs(self):
+        """Test that paragraphs are preserved."""
+        html = "<html><body><p>Para 1</p><p>Para 2</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "Para 1" in result
+        assert "Para 2" in result
+
+    def test_decodes_html_entities(self):
+        """Test that HTML entities are decoded."""
+        html = "<html><body><p>Tom &amp; Jerry</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "Tom & Jerry" in result
+
+    def test_preserves_links(self):
+        """Test that links are preserved in markdown format."""
+        html = '<a href="https://example.com">Example Link</a>'
+        result = _html_to_markdown(html)
+        assert "[Example Link]" in result
+        assert "https://example.com" in result
+
+    def test_preserves_links_with_base_url(self):
+        """Test that relative links are converted to absolute."""
+        html = '<a href="/page">Link</a>'
+        result = _html_to_markdown(html, base_url="https://example.com")
+        assert "https://example.com/page" in result
+
+    def test_preserves_headings(self):
+        """Test that headings are converted to markdown."""
+        html = "<html><body><h1>Title</h1><h2>Subtitle</h2></body></html>
" + result = _html_to_markdown(html) + assert "Title" in result + assert "Subtitle" in result + + +class TestWebFetch: + """Tests for the web_fetch function.""" + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_html_success(self, mock_client_class): + """Test successful HTML fetch returns content.""" + mock_response = MagicMock() + mock_response.text = "
<html><body><h1>Hello World</h1></body></html>
" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + assert "content" in result + assert "Hello World" in result["content"] + assert result["content_type"] == "text/html" + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_pdf_success(self, mock_client_class): + """Test that PDF content is extracted successfully.""" + # Create a PDF with text + writer = PdfWriter() + writer.add_blank_page(width=200, height=200) + pdf_bytes = BytesIO() + writer.write(pdf_bytes) + pdf_content = pdf_bytes.getvalue() + + mock_response = MagicMock() + mock_response.content = pdf_content + mock_response.headers = {"content-type": "application/pdf"} + mock_response.url = "https://example.com/doc.pdf" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com/doc.pdf") + + assert result["status"] == "success" + assert result["content_type"] == "application/pdf" + assert "num_pages" in result + assert result["num_pages"] >= 1 + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_returns_content_length(self, mock_client_class): + """Test that fetch returns content length.""" + long_text = "A" * 10000 + mock_response = MagicMock() + mock_response.text = f"
<html><body><p>{long_text}</p></body></html>
" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + # Content length should include the 10000 As (may have some markdown formatting) + assert result["content_length"] >= 10000 + assert not result["truncated"] + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_truncates_large_content(self, mock_client_class): + """Test that very large content is truncated.""" + # Create content larger than MAX_CONTENT_CHARS (100KB) + large_text = "A" * 150_000 + mock_response = MagicMock() + mock_response.text = f"{large_text}" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + assert result["truncated"] is True + assert "[Content truncated" in result["content"] + + def test_fetch_invalid_url(self): + """Test that invalid URLs return error.""" + result = web_fetch("not-a-url") + assert result["status"] == "error" + assert "Invalid URL" in result["error"] + + +class TestCreateWebFetchTool: + """Tests for the create_web_fetch_tool function.""" + + def test_creates_tool_with_correct_function(self): + """Test that web fetch tool is created with the correct function.""" + tool = create_web_fetch_tool() + assert tool is not None + assert tool.func == web_fetch + + +class TestResolveRedirectUrlAsync: + """Tests for async redirect URL resolution.""" + + @pytest.mark.asyncio + async def test_non_redirect_url_returns_unchanged(self): + """Test that non-redirect URLs are returned unchanged.""" + url = "https://example.com/page" + result = await resolve_redirect_url_async(url) + assert result == url + + @pytest.mark.asyncio + async def test_resolves_redirect_url_async(self): + """Test async resolution of redirect URLs.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/async123" + final_url = "https://example.com/actual-page-async" + + mock_response = MagicMock() + mock_response.url = final_url + + # Create async mock for head method + async def mock_head(*_args, **_kwargs): + return mock_response + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await resolve_redirect_url_async(redirect_url) + assert result == final_url + + @pytest.mark.asyncio + async def test_resolve_multiple_urls_in_parallel(self): + """Test that multiple URLs can be resolved in parallel.""" + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ] + results = await resolve_redirect_urls_async(urls) + + # Non-redirect URLs should be returned as-is + assert results == urls + + 
@pytest.mark.asyncio + async def test_empty_list_returns_empty(self): + """Test that empty list input returns empty list.""" + results = await resolve_redirect_urls_async([]) + assert results == [] + + @pytest.mark.asyncio + async def test_caches_resolved_urls(self): + """Test that resolved URLs are cached to avoid repeated HTTP calls.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/cache-test" + final_url = "https://example.com/cached-page" + + call_count = 0 + + async def mock_head(*_args, **_kwargs): + nonlocal call_count + call_count += 1 + mock_response = MagicMock() + mock_response.url = final_url + return mock_response + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + # First call should make HTTP request + result1 = await resolve_redirect_url_async(redirect_url) + assert result1 == final_url + assert call_count == 1 + + # Second call should use cache (no HTTP request) + result2 = await resolve_redirect_url_async(redirect_url) + assert result2 == final_url + assert call_count == 1 # Still 1, used cache + + @pytest.mark.asyncio + async def test_retries_on_timeout(self): + """Test that resolution retries on transient failures.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/retry-test" + final_url = "https://example.com/retried-page" + + head_call_count = 0 + stream_call_count = 0 + + # HEAD will always fail (return None triggers GET fallback) + async def mock_head(*_args, **_kwargs): + nonlocal head_call_count + head_call_count += 1 + # Return None to simulate HEAD not supported, triggers GET fallback + raise httpx.HTTPStatusError( + "Method Not Allowed", + request=MagicMock(), + response=MagicMock(status_code=405), + ) + + # Stream (GET) will fail first two times, then succeed + class MockStreamContext: + def __init__(self, fail_count): + self.fail_count = fail_count + + async def __aenter__(self): + nonlocal stream_call_count + stream_call_count += 1 + if stream_call_count <= self.fail_count: + raise httpx.TimeoutException("Connection timed out") + mock_response = MagicMock() + mock_response.url = final_url + return mock_response + + async def __aexit__(self, *_args): + pass + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.stream.return_value = MockStreamContext(fail_count=2) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await resolve_redirect_url_async(redirect_url) + assert result == final_url + # HEAD called once per retry attempt (3 times) + # Stream called 3 times (2 failures + 1 success) + assert stream_call_count == 3 + + +@pytest.mark.integration_test +class TestWebFetchIntegration: + """Integration tests for web_fetch (requires network). + + These tests verify that web_fetch works correctly for both HTML pages + and PDF documents, returning content suitable for the agent to analyze. 
+ """ + + def test_fetch_html_page_returns_readable_content(self): + """Test that HTML pages are converted to readable markdown.""" + result = web_fetch("https://www.iana.org/help/example-domains") + assert result["status"] == "success" + assert result["content_type"] == "text/html" or "html" in result["content_type"].lower() + + # Verify content is markdown (no raw HTML tags) + content = result["content"] + assert "" not in content.lower() + assert "" not in content.lower() + + # Verify content has meaningful text + assert len(content) > 100 + assert "example" in content.lower() + + # Verify links are preserved in markdown format (if any exist) + # The page should have links that are converted to [text](url) format + if "http" in content: + # Links should be in markdown format, not raw tags + assert " 0 + + # Verify extracted text is substantial + content = result["content"] + assert len(content) > 500 + + # Verify page markers are present + assert "--- Page" in content + + def test_fetch_pdf_pagination(self): + """Test that PDF max_pages parameter limits extraction.""" + result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) + assert result["status"] == "success" + assert result["pages_extracted"] == 1 + assert result["num_pages"] >= 1 From de06de529788f989d7f4c4e24f3d440fb727931f Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 17:12:35 -0500 Subject: [PATCH 2/4] Update web fetch tool to async --- .../aieng/agent_evals/tools/web.py | 36 ++++---- .../tests/aieng/agent_evals/tools/test_web.py | 84 ++++++++++++------- 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py index c91b0aa..9eb14d0 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/web.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -12,26 +12,28 @@ import httpx from google.adk.tools.function_tool import FunctionTool from html_to_markdown import convert as html_to_markdown -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential +from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) MAX_CONTENT_CHARS = 100_000 -_http_retry = retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), - reraise=True, -) - -@_http_retry -def _fetch_with_retry(client: httpx.Client, url: str) -> httpx.Response: +async def _fetch_with_retry(client: httpx.AsyncClient, url: str) -> httpx.Response: """Fetch URL with automatic retry on transient failures.""" - response = client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) - response.raise_for_status() + response: httpx.Response | None = None + async for attempt in AsyncRetrying( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), + ): + with attempt: + response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() + + # AsyncRetrying ensures response is set on success + assert response is not None return response @@ -142,7 +144,7 @@ def _make_success_response(url: str, content: str, content_type: str, truncated: return result -def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: +async def web_fetch(url: 
str, max_pages: int = 10) -> dict[str, Any]: """Fetch content from a URL (HTML page or PDF document). This tool retrieves the full content from a URL for analysis. It handles @@ -167,11 +169,11 @@ def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: Examples -------- >>> # Fetch an HTML page - >>> result = web_fetch("https://example.com/about") + >>> result = await web_fetch("https://example.com/about") >>> print(result["content"]) >>> # Fetch a PDF - >>> result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf") + >>> result = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf") >>> print(f"Pages: {result['num_pages']}") >>> print(result["content"]) """ @@ -180,8 +182,8 @@ def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: return _make_error_response("Invalid URL. Must start with http:// or https://", url) try: - with httpx.Client(timeout=60.0, follow_redirects=True) as client: - response = _fetch_with_retry(client, url) + async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client: + response = await _fetch_with_retry(client, url) content_type = response.headers.get("content-type", "") final_url = str(response.url) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py index 7fc251d..d0e83f4 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py @@ -75,29 +75,34 @@ def test_preserves_headings(self): class TestWebFetch: """Tests for the web_fetch function.""" - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_html_success(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_html_success(self, mock_client_class): """Test successful HTML fetch returns content.""" mock_response = MagicMock() mock_response.text = "
<html><body><h1>Hello World</h1></body></html>
" mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" assert "content" in result assert "Hello World" in result["content"] assert result["content_type"] == "text/html" - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_pdf_success(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_pdf_success(self, mock_client_class): """Test that PDF content is extracted successfully.""" # Create a PDF with text writer = PdfWriter() @@ -111,21 +116,25 @@ def test_fetch_pdf_success(self, mock_client_class): mock_response.headers = {"content-type": "application/pdf"} mock_response.url = "https://example.com/doc.pdf" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com/doc.pdf") + result = await web_fetch("https://example.com/doc.pdf") assert result["status"] == "success" assert result["content_type"] == "application/pdf" assert "num_pages" in result assert result["num_pages"] >= 1 - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_returns_content_length(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_returns_content_length(self, mock_client_class): """Test that fetch returns content length.""" long_text = "A" * 10000 mock_response = MagicMock() @@ -133,21 +142,25 @@ def test_fetch_returns_content_length(self, mock_client_class): mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" # Content length should include the 10000 As (may have some markdown formatting) assert result["content_length"] >= 10000 assert not result["truncated"] - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_truncates_large_content(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def 
test_fetch_truncates_large_content(self, mock_client_class): """Test that very large content is truncated.""" # Create content larger than MAX_CONTENT_CHARS (100KB) large_text = "A" * 150_000 @@ -156,21 +169,25 @@ def test_fetch_truncates_large_content(self, mock_client_class): mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" assert result["truncated"] is True assert "[Content truncated" in result["content"] - def test_fetch_invalid_url(self): + @pytest.mark.asyncio + async def test_fetch_invalid_url(self): """Test that invalid URLs return error.""" - result = web_fetch("not-a-url") + result = await web_fetch("not-a-url") assert result["status"] == "error" assert "Invalid URL" in result["error"] @@ -338,9 +355,10 @@ class TestWebFetchIntegration: and PDF documents, returning content suitable for the agent to analyze. """ - def test_fetch_html_page_returns_readable_content(self): + @pytest.mark.asyncio + async def test_fetch_html_page_returns_readable_content(self): """Test that HTML pages are converted to readable markdown.""" - result = web_fetch("https://www.iana.org/help/example-domains") + result = await web_fetch("https://www.iana.org/help/example-domains") assert result["status"] == "success" assert result["content_type"] == "text/html" or "html" in result["content_type"].lower() @@ -359,9 +377,10 @@ def test_fetch_html_page_returns_readable_content(self): # Links should be in markdown format, not raw
tags assert " 0 @@ -373,9 +392,10 @@ def test_fetch_pdf_extracts_text(self): # Verify page markers are present assert "--- Page" in content - def test_fetch_pdf_pagination(self): + @pytest.mark.asyncio + async def test_fetch_pdf_pagination(self): """Test that PDF max_pages parameter limits extraction.""" - result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) + result = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) assert result["status"] == "success" assert result["pages_extracted"] == 1 assert result["num_pages"] >= 1 From 33726ad303c4a4f4eb5068dbd2fd9133a9e15250 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Thu, 5 Feb 2026 15:37:17 -0500 Subject: [PATCH 3/4] Fixes based on code review --- .../aieng/agent_evals/tools/search.py | 3 + .../aieng/agent_evals/tools/web.py | 75 +++++++++++-------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/search.py b/aieng-eval-agents/aieng/agent_evals/tools/search.py index 33d18ae..46602c5 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/search.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/search.py @@ -156,6 +156,9 @@ async def _google_search_async(query: str, model: str) -> dict[str, Any]: "summary": "", "sources": [], } + finally: + # Properly close the client to avoid aiohttp session leaks + client.close() async def google_search(query: str, model: str | None = None) -> dict[str, Any]: diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py index 9eb14d0..1c99d44 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/web.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -5,6 +5,8 @@ """ import logging +import re +from collections.abc import Callable from io import BytesIO from typing import Any from urllib.parse import urljoin @@ -12,7 +14,8 @@ import httpx from google.adk.tools.function_tool import FunctionTool from html_to_markdown import convert as html_to_markdown -from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential +from pypdf import PdfReader +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) @@ -20,20 +23,45 @@ MAX_CONTENT_CHARS = 100_000 +def _make_absolute_url(base_url: str) -> Callable[[re.Match[str]], str]: + """Create a function that converts relative URLs to absolute URLs. + + Parameters + ---------- + base_url : str + Base URL for resolving relative links. + + Returns + ------- + Callable[[re.Match[str]], str] + Function that takes a regex match and returns the URL converted to absolute. 
+ """ + + def make_absolute(match: re.Match) -> str: + """Convert relative URL to absolute.""" + prefix = match.group(1) # [text]( or src=" + url = match.group(2) + suffix = match.group(3) # ) or " + + # Skip if already absolute or is a data URI + if url.startswith(("http://", "https://", "data:", "mailto:", "#")): + return match.group(0) + + absolute_url = urljoin(base_url, url) + return f"{prefix}{absolute_url}{suffix}" + + return make_absolute + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), +) async def _fetch_with_retry(client: httpx.AsyncClient, url: str) -> httpx.Response: """Fetch URL with automatic retry on transient failures.""" - response: httpx.Response | None = None - async for attempt in AsyncRetrying( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), - ): - with attempt: - response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) - response.raise_for_status() - - # AsyncRetrying ensures response is set on success - assert response is not None + response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() return response @@ -58,20 +86,7 @@ def _html_to_markdown(html: str, base_url: str | None = None) -> str: # If base_url provided, convert relative URLs to absolute if base_url: - import re # noqa: PLC0415 - - def make_absolute(match: re.Match) -> str: - """Convert relative URL to absolute.""" - prefix = match.group(1) # [text]( or src=" - url = match.group(2) - suffix = match.group(3) # ) or " - - # Skip if already absolute or is a data URI - if url.startswith(("http://", "https://", "data:", "mailto:", "#")): - return match.group(0) - - absolute_url = urljoin(base_url, url) - return f"{prefix}{absolute_url}{suffix}" + make_absolute = _make_absolute_url(base_url) # Fix markdown links: [text](url) markdown = re.sub(r"(\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) @@ -97,8 +112,6 @@ def _extract_pdf_text(content: bytes, max_pages: int = 10) -> tuple[str, int]: tuple[str, int] The extracted text and total number of pages. """ - from pypdf import PdfReader # noqa: PLC0415 - pdf_file = BytesIO(content) reader = PdfReader(pdf_file) num_pages = len(reader.pages) @@ -192,7 +205,7 @@ async def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: return _handle_pdf_response(response.content, max_pages, final_url, url) # Handle HTML and text content - if "text/html" in content_type or not content_type: + if "text/html" in content_type or content_type == "": text = _html_to_markdown(response.text, base_url=final_url) else: text = response.text @@ -225,8 +238,6 @@ def _handle_pdf_response(content: bytes, max_pages: int, final_url: str, url: st num_pages=num_pages, pages_extracted=min(num_pages, max_pages), ) - except ImportError: - return _make_error_response("PDF support requires pypdf. 
Install with: pip install pypdf", url) except Exception as e: return _make_error_response(f"Failed to extract PDF text: {e!s}", url) From 698e812a38f678558a0e7fdcdbdc8e00d47a3e8a Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Thu, 5 Feb 2026 15:42:51 -0500 Subject: [PATCH 4/4] Use retry decorator for the search tool's redirect url --- .../aieng/agent_evals/tools/_redirect.py | 72 +++++++------------ 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py b/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py index 76a9594..05772a3 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py @@ -9,7 +9,7 @@ import logging import httpx -from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) @@ -63,12 +63,28 @@ async def _resolve_with_get_async(client: httpx.AsyncClient, url: str) -> str: return str(response.url) -async def _resolve_single_url_async( - client: httpx.AsyncClient, - url: str, - max_retries: int = 3, - base_delay: float = 1.0, -) -> str: +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1.0, min=1.0, max=60.0), + retry=retry_if_exception_type((httpx.TimeoutException, httpx.ConnectError, httpx.ReadError)), +) +async def _resolve_single_url_with_retry(client: httpx.AsyncClient, url: str) -> str: + """Resolve URL with retry handling (internal helper).""" + # Try HEAD first (faster, no body download) + final_url = await _resolve_with_head_async(client, url) + + # Fall back to GET if HEAD failed + if final_url is None: + logger.debug(f"HEAD failed for {url[:60]}..., trying GET") + final_url = await _resolve_with_get_async(client, url) + + if final_url != url: + logger.debug(f"Resolved redirect: {url[:60]}... -> {final_url[:60]}...") + + return final_url + + +async def _resolve_single_url_async(client: httpx.AsyncClient, url: str) -> str: """Resolve a single URL with retries and exponential backoff. Uses tenacity for automatic retry handling with exponential backoff. @@ -79,10 +95,6 @@ async def _resolve_single_url_async( The HTTP client to use. url : str The URL to resolve. - max_retries : int - Maximum number of retry attempts. - base_delay : float - Base delay between retries (doubles each retry). Returns ------- @@ -98,27 +110,9 @@ async def _resolve_single_url_async( return _redirect_cache[url] try: - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=base_delay, min=base_delay, max=60.0), - retry=retry_if_exception_type((httpx.TimeoutException, httpx.ConnectError, httpx.ReadError)), - ): - with attempt: - # Try HEAD first (faster, no body download) - final_url = await _resolve_with_head_async(client, url) - - # Fall back to GET if HEAD failed - if final_url is None: - logger.debug(f"HEAD failed for {url[:60]}..., trying GET") - final_url = await _resolve_with_get_async(client, url) - - if final_url != url: - logger.debug(f"Resolved redirect: {url[:60]}... 
-> {final_url[:60]}...") - - _redirect_cache[url] = final_url - - # If we reach here, the retry loop succeeded - return _redirect_cache[url] + final_url = await _resolve_single_url_with_retry(client, url) + _redirect_cache[url] = final_url + return final_url except Exception as e: # All retries exhausted or non-retryable error @@ -140,19 +134,7 @@ async def resolve_redirect_url_async(url: str) -> str: str The final destination URL after following redirects. """ - # Skip resolution for non-redirect URLs (fast path) - if not _is_redirect_url(url): - return url - - # Check cache first (fast path) - if url in _redirect_cache: - return _redirect_cache[url] - - async with httpx.AsyncClient( - timeout=_get_redirect_timeout(), - follow_redirects=True, - ) as client: - return await _resolve_single_url_async(client, url) + return (await resolve_redirect_urls_async([url]))[0] async def resolve_redirect_urls_async(urls: list[str]) -> list[str]:
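Usage sketch (not part of the patches above): a minimal illustration of how the async web_fetch added in PATCH 1/4 and made async in PATCH 2/4 is meant to be called directly and wrapped for an agent. The import path follows the file layout of the patches; the URLs and the event-loop setup are placeholder assumptions.

import asyncio

from aieng.agent_evals.tools.web import create_web_fetch_tool, web_fetch

# Wrap the coroutine as an ADK FunctionTool for registration on an agent.
web_fetch_tool = create_web_fetch_tool()


async def main() -> None:
    # HTML page: content comes back as markdown-converted text plus metadata.
    page = await web_fetch("https://example.com/about")
    if page["status"] == "success":
        print(page["content_type"], page["content_length"], page["truncated"])
    else:
        print("fetch failed:", page["error"])

    # PDF document: text is extracted per page, capped by max_pages.
    pdf = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=2)
    if pdf["status"] == "success":
        print(pdf["num_pages"], "pages total,", pdf["pages_extracted"], "extracted")


if __name__ == "__main__":
    asyncio.run(main())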