From 5e316b4d2652e015ccfd1571cf16d60707368bf9 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 16:24:41 -0500 Subject: [PATCH 1/4] Add web fetch tool --- .../aieng/agent_evals/tools/web.py | 234 +++++++++++ .../tests/aieng/agent_evals/tools/test_web.py | 381 ++++++++++++++++++ 2 files changed, 615 insertions(+) create mode 100644 aieng-eval-agents/aieng/agent_evals/tools/web.py create mode 100644 aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py new file mode 100644 index 0000000..c91b0aa --- /dev/null +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -0,0 +1,234 @@ +"""Web fetch tool for retrieving content from URLs. + +Provides the web_fetch tool which fetches content from any URL (HTML pages or PDFs) +and returns the content for the agent to analyze. Similar to Anthropic's web_fetch tool. +""" + +import logging +from io import BytesIO +from typing import Any +from urllib.parse import urljoin + +import httpx +from google.adk.tools.function_tool import FunctionTool +from html_to_markdown import convert as html_to_markdown +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential + + +logger = logging.getLogger(__name__) + +MAX_CONTENT_CHARS = 100_000 + +_http_retry = retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), + reraise=True, +) + + +@_http_retry +def _fetch_with_retry(client: httpx.Client, url: str) -> httpx.Response: + """Fetch URL with automatic retry on transient failures.""" + response = client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() + return response + + +def _html_to_markdown(html: str, base_url: str | None = None) -> str: + """Convert HTML to Markdown, preserving links, tables, and structure. + + Parameters + ---------- + html : str + The HTML content to convert. + base_url : str, optional + Base URL for resolving relative links. + + Returns + ------- + str + Markdown-formatted text with preserved links and tables. + """ + # Use html-to-markdown library for high-quality conversion + # It preserves links, tables, headings, lists, and other structure + markdown = html_to_markdown(html) + + # If base_url provided, convert relative URLs to absolute + if base_url: + import re # noqa: PLC0415 + + def make_absolute(match: re.Match) -> str: + """Convert relative URL to absolute.""" + prefix = match.group(1) # [text]( or src=" + url = match.group(2) + suffix = match.group(3) # ) or " + + # Skip if already absolute or is a data URI + if url.startswith(("http://", "https://", "data:", "mailto:", "#")): + return match.group(0) + + absolute_url = urljoin(base_url, url) + return f"{prefix}{absolute_url}{suffix}" + + # Fix markdown links: [text](url) + markdown = re.sub(r"(\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) + + # Fix markdown images: ![alt](url) + markdown = re.sub(r"(!\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) + + return markdown.strip() + + +def _extract_pdf_text(content: bytes, max_pages: int = 10) -> tuple[str, int]: + """Extract text from PDF bytes. + + Parameters + ---------- + content : bytes + The PDF file content. + max_pages : int + Maximum number of pages to extract. + + Returns + ------- + tuple[str, int] + The extracted text and total number of pages. 
+ """ + from pypdf import PdfReader # noqa: PLC0415 + + pdf_file = BytesIO(content) + reader = PdfReader(pdf_file) + num_pages = len(reader.pages) + + pages_to_read = min(num_pages, max_pages) + text_parts = [] + + for i in range(pages_to_read): + page_text = reader.pages[i].extract_text() + if page_text: + text_parts.append(f"--- Page {i + 1} ---\n{page_text}") + + if pages_to_read < num_pages: + text_parts.append(f"\n[Document has {num_pages} pages. Showing first {pages_to_read}.]") + + return "\n\n".join(text_parts), num_pages + + +def _truncate_content(text: str) -> tuple[str, bool]: + """Truncate content if it exceeds the maximum length.""" + truncated = len(text) > MAX_CONTENT_CHARS + if truncated: + text = text[:MAX_CONTENT_CHARS] + "\n\n[Content truncated due to length]" + return text, truncated + + +def _make_error_response(error: str, url: str) -> dict[str, Any]: + """Create an error response dict.""" + return {"status": "error", "error": error, "url": url} + + +def _make_success_response(url: str, content: str, content_type: str, truncated: bool, **extra: Any) -> dict[str, Any]: + """Create a success response dict.""" + result = { + "status": "success", + "url": url, + "content": content, + "content_type": content_type, + "content_length": len(content), + "truncated": truncated, + } + result.update(extra) + return result + + +def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: + """Fetch content from a URL (HTML page or PDF document). + + This tool retrieves the full content from a URL for analysis. It handles + both HTML pages (converted to readable text) and PDF documents (text extracted). + + For large data files (CSV, XLSX) that need searching, use fetch_file instead. + + Parameters + ---------- + url : str + The URL to fetch. Must be a valid HTTP or HTTPS URL. + max_pages : int, optional + For PDFs, maximum number of pages to extract (default 10). + + Returns + ------- + dict + On success: 'status', 'url', 'content', 'content_type', + 'content_length', 'truncated'. For PDFs also includes: + 'num_pages', 'pages_extracted'. On error: 'status', 'error', 'url'. + + Examples + -------- + >>> # Fetch an HTML page + >>> result = web_fetch("https://example.com/about") + >>> print(result["content"]) + + >>> # Fetch a PDF + >>> result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf") + >>> print(f"Pages: {result['num_pages']}") + >>> print(result["content"]) + """ + # Validate URL + if not url.startswith(("http://", "https://")): + return _make_error_response("Invalid URL. 
Must start with http:// or https://", url) + + try: + with httpx.Client(timeout=60.0, follow_redirects=True) as client: + response = _fetch_with_retry(client, url) + content_type = response.headers.get("content-type", "") + final_url = str(response.url) + + # Handle PDF documents + if "application/pdf" in content_type or url.lower().endswith(".pdf"): + return _handle_pdf_response(response.content, max_pages, final_url, url) + + # Handle HTML and text content + if "text/html" in content_type or not content_type: + text = _html_to_markdown(response.text, base_url=final_url) + else: + text = response.text + text, truncated = _truncate_content(text) + + return _make_success_response(final_url, text, content_type or "text/html", truncated) + + except httpx.HTTPStatusError as e: + logger.warning(f"HTTP error fetching {url}: {e}") + return _make_error_response(f"HTTP {e.response.status_code}: {e.response.reason_phrase}", url) + except httpx.RequestError as e: + logger.warning(f"Request error fetching {url}: {e}") + return _make_error_response(f"Request failed: {e!s}", url) + except Exception as e: + logger.exception(f"Unexpected error in web_fetch for {url}") + return _make_error_response(f"Unexpected error: {e!s}", url) + + +def _handle_pdf_response(content: bytes, max_pages: int, final_url: str, url: str) -> dict[str, Any]: + """Handle PDF content extraction and response creation.""" + try: + text, num_pages = _extract_pdf_text(content, max_pages) + text, truncated = _truncate_content(text) + + return _make_success_response( + final_url, + text, + "application/pdf", + truncated, + num_pages=num_pages, + pages_extracted=min(num_pages, max_pages), + ) + except ImportError: + return _make_error_response("PDF support requires pypdf. Install with: pip install pypdf", url) + except Exception as e: + return _make_error_response(f"Failed to extract PDF text: {e!s}", url) + + +def create_web_fetch_tool() -> FunctionTool: + """Create an ADK FunctionTool for fetching web content.""" + return FunctionTool(func=web_fetch) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py new file mode 100644 index 0000000..7fc251d --- /dev/null +++ b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py @@ -0,0 +1,381 @@ +"""Tests for the web tools module. + +Tests web_fetch which handles both HTML pages and PDF documents. +""" + +from io import BytesIO +from unittest.mock import AsyncMock, MagicMock, patch + +import httpx +import pytest +from aieng.agent_evals.tools._redirect import ( + _redirect_cache, + resolve_redirect_url_async, + resolve_redirect_urls_async, +) +from aieng.agent_evals.tools.web import ( + _html_to_markdown, + create_web_fetch_tool, + web_fetch, +) +from pypdf import PdfWriter + + +class TestHtmlToMarkdown: + """Tests for the _html_to_markdown function.""" + + def test_removes_script_tags(self): + """Test that script tags are removed.""" + html = "
<html><head><script>alert('hi')</script></head><body><p>Hello</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "alert" not in result
+        assert "Hello" in result
+
+    def test_removes_style_tags(self):
+        """Test that style tags are removed."""
+        html = "<html><head><style>p { color: red; }</style></head><body><p>Text</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "color" not in result
+        assert "Text" in result
+
+    def test_converts_paragraphs(self):
+        """Test that paragraphs are preserved."""
+        html = "<html><body><p>Para 1</p><p>Para 2</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "Para 1" in result
+        assert "Para 2" in result
+
+    def test_decodes_html_entities(self):
+        """Test that HTML entities are decoded."""
+        html = "<html><body><p>Tom &amp; Jerry</p></body></html>"
+        result = _html_to_markdown(html)
+        assert "Tom & Jerry" in result
+
+    def test_preserves_links(self):
+        """Test that links are preserved in markdown format."""
+        html = '<a href="https://example.com">Example Link</a>'
+        result = _html_to_markdown(html)
+        assert "[Example Link]" in result
+        assert "https://example.com" in result
+
+    def test_preserves_links_with_base_url(self):
+        """Test that relative links are converted to absolute."""
+        html = '<a href="/page">Link</a>'
+        result = _html_to_markdown(html, base_url="https://example.com")
+        assert "https://example.com/page" in result
+
+    def test_preserves_headings(self):
+        """Test that headings are converted to markdown."""
+        html = "<html><body><h1>Title</h1><h2>Subtitle</h2></body></html>
" + result = _html_to_markdown(html) + assert "Title" in result + assert "Subtitle" in result + + +class TestWebFetch: + """Tests for the web_fetch function.""" + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_html_success(self, mock_client_class): + """Test successful HTML fetch returns content.""" + mock_response = MagicMock() + mock_response.text = "
<html><body><h1>Hello World</h1></body></html>
" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + assert "content" in result + assert "Hello World" in result["content"] + assert result["content_type"] == "text/html" + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_pdf_success(self, mock_client_class): + """Test that PDF content is extracted successfully.""" + # Create a PDF with text + writer = PdfWriter() + writer.add_blank_page(width=200, height=200) + pdf_bytes = BytesIO() + writer.write(pdf_bytes) + pdf_content = pdf_bytes.getvalue() + + mock_response = MagicMock() + mock_response.content = pdf_content + mock_response.headers = {"content-type": "application/pdf"} + mock_response.url = "https://example.com/doc.pdf" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com/doc.pdf") + + assert result["status"] == "success" + assert result["content_type"] == "application/pdf" + assert "num_pages" in result + assert result["num_pages"] >= 1 + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_returns_content_length(self, mock_client_class): + """Test that fetch returns content length.""" + long_text = "A" * 10000 + mock_response = MagicMock() + mock_response.text = f"
<html><body><p>{long_text}</p></body></html>
" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + # Content length should include the 10000 As (may have some markdown formatting) + assert result["content_length"] >= 10000 + assert not result["truncated"] + + @patch("aieng.agent_evals.tools.web.httpx.Client") + def test_fetch_truncates_large_content(self, mock_client_class): + """Test that very large content is truncated.""" + # Create content larger than MAX_CONTENT_CHARS (100KB) + large_text = "A" * 150_000 + mock_response = MagicMock() + mock_response.text = f"{large_text}" + mock_response.headers = {"content-type": "text/html"} + mock_response.url = "https://example.com" + + mock_client = MagicMock() + mock_client.get.return_value = mock_response + mock_client.__enter__ = MagicMock(return_value=mock_client) + mock_client.__exit__ = MagicMock(return_value=False) + mock_client_class.return_value = mock_client + + result = web_fetch("https://example.com") + + assert result["status"] == "success" + assert result["truncated"] is True + assert "[Content truncated" in result["content"] + + def test_fetch_invalid_url(self): + """Test that invalid URLs return error.""" + result = web_fetch("not-a-url") + assert result["status"] == "error" + assert "Invalid URL" in result["error"] + + +class TestCreateWebFetchTool: + """Tests for the create_web_fetch_tool function.""" + + def test_creates_tool_with_correct_function(self): + """Test that web fetch tool is created with the correct function.""" + tool = create_web_fetch_tool() + assert tool is not None + assert tool.func == web_fetch + + +class TestResolveRedirectUrlAsync: + """Tests for async redirect URL resolution.""" + + @pytest.mark.asyncio + async def test_non_redirect_url_returns_unchanged(self): + """Test that non-redirect URLs are returned unchanged.""" + url = "https://example.com/page" + result = await resolve_redirect_url_async(url) + assert result == url + + @pytest.mark.asyncio + async def test_resolves_redirect_url_async(self): + """Test async resolution of redirect URLs.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/async123" + final_url = "https://example.com/actual-page-async" + + mock_response = MagicMock() + mock_response.url = final_url + + # Create async mock for head method + async def mock_head(*_args, **_kwargs): + return mock_response + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await resolve_redirect_url_async(redirect_url) + assert result == final_url + + @pytest.mark.asyncio + async def test_resolve_multiple_urls_in_parallel(self): + """Test that multiple URLs can be resolved in parallel.""" + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ] + results = await resolve_redirect_urls_async(urls) + + # Non-redirect URLs should be returned as-is + assert results == urls + + 
@pytest.mark.asyncio + async def test_empty_list_returns_empty(self): + """Test that empty list input returns empty list.""" + results = await resolve_redirect_urls_async([]) + assert results == [] + + @pytest.mark.asyncio + async def test_caches_resolved_urls(self): + """Test that resolved URLs are cached to avoid repeated HTTP calls.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/cache-test" + final_url = "https://example.com/cached-page" + + call_count = 0 + + async def mock_head(*_args, **_kwargs): + nonlocal call_count + call_count += 1 + mock_response = MagicMock() + mock_response.url = final_url + return mock_response + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + # First call should make HTTP request + result1 = await resolve_redirect_url_async(redirect_url) + assert result1 == final_url + assert call_count == 1 + + # Second call should use cache (no HTTP request) + result2 = await resolve_redirect_url_async(redirect_url) + assert result2 == final_url + assert call_count == 1 # Still 1, used cache + + @pytest.mark.asyncio + async def test_retries_on_timeout(self): + """Test that resolution retries on transient failures.""" + redirect_url = "https://vertexaisearch.cloud.google.com/grounding-api-redirect/retry-test" + final_url = "https://example.com/retried-page" + + head_call_count = 0 + stream_call_count = 0 + + # HEAD will always fail (return None triggers GET fallback) + async def mock_head(*_args, **_kwargs): + nonlocal head_call_count + head_call_count += 1 + # Return None to simulate HEAD not supported, triggers GET fallback + raise httpx.HTTPStatusError( + "Method Not Allowed", + request=MagicMock(), + response=MagicMock(status_code=405), + ) + + # Stream (GET) will fail first two times, then succeed + class MockStreamContext: + def __init__(self, fail_count): + self.fail_count = fail_count + + async def __aenter__(self): + nonlocal stream_call_count + stream_call_count += 1 + if stream_call_count <= self.fail_count: + raise httpx.TimeoutException("Connection timed out") + mock_response = MagicMock() + mock_response.url = final_url + return mock_response + + async def __aexit__(self, *_args): + pass + + # Clear the cache + _redirect_cache.clear() + + with patch("aieng.agent_evals.tools.web.httpx.AsyncClient") as mock_client_class: + mock_client = MagicMock() + mock_client.head = mock_head + mock_client.stream.return_value = MockStreamContext(fail_count=2) + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) + mock_client_class.return_value = mock_client + + result = await resolve_redirect_url_async(redirect_url) + assert result == final_url + # HEAD called once per retry attempt (3 times) + # Stream called 3 times (2 failures + 1 success) + assert stream_call_count == 3 + + +@pytest.mark.integration_test +class TestWebFetchIntegration: + """Integration tests for web_fetch (requires network). + + These tests verify that web_fetch works correctly for both HTML pages + and PDF documents, returning content suitable for the agent to analyze. 
+ """ + + def test_fetch_html_page_returns_readable_content(self): + """Test that HTML pages are converted to readable markdown.""" + result = web_fetch("https://www.iana.org/help/example-domains") + assert result["status"] == "success" + assert result["content_type"] == "text/html" or "html" in result["content_type"].lower() + + # Verify content is markdown (no raw HTML tags) + content = result["content"] + assert "" not in content.lower() + assert "" not in content.lower() + + # Verify content has meaningful text + assert len(content) > 100 + assert "example" in content.lower() + + # Verify links are preserved in markdown format (if any exist) + # The page should have links that are converted to [text](url) format + if "http" in content: + # Links should be in markdown format, not raw tags + assert " 0 + + # Verify extracted text is substantial + content = result["content"] + assert len(content) > 500 + + # Verify page markers are present + assert "--- Page" in content + + def test_fetch_pdf_pagination(self): + """Test that PDF max_pages parameter limits extraction.""" + result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) + assert result["status"] == "success" + assert result["pages_extracted"] == 1 + assert result["num_pages"] >= 1 From de06de529788f989d7f4c4e24f3d440fb727931f Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Wed, 4 Feb 2026 17:12:35 -0500 Subject: [PATCH 2/4] Update web fetch tool to async --- .../aieng/agent_evals/tools/web.py | 36 ++++---- .../tests/aieng/agent_evals/tools/test_web.py | 84 ++++++++++++------- 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py index c91b0aa..9eb14d0 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/web.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -12,26 +12,28 @@ import httpx from google.adk.tools.function_tool import FunctionTool from html_to_markdown import convert as html_to_markdown -from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential +from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) MAX_CONTENT_CHARS = 100_000 -_http_retry = retry( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), - reraise=True, -) - -@_http_retry -def _fetch_with_retry(client: httpx.Client, url: str) -> httpx.Response: +async def _fetch_with_retry(client: httpx.AsyncClient, url: str) -> httpx.Response: """Fetch URL with automatic retry on transient failures.""" - response = client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) - response.raise_for_status() + response: httpx.Response | None = None + async for attempt in AsyncRetrying( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), + ): + with attempt: + response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() + + # AsyncRetrying ensures response is set on success + assert response is not None return response @@ -142,7 +144,7 @@ def _make_success_response(url: str, content: str, content_type: str, truncated: return result -def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: +async def web_fetch(url: 
str, max_pages: int = 10) -> dict[str, Any]: """Fetch content from a URL (HTML page or PDF document). This tool retrieves the full content from a URL for analysis. It handles @@ -167,11 +169,11 @@ def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: Examples -------- >>> # Fetch an HTML page - >>> result = web_fetch("https://example.com/about") + >>> result = await web_fetch("https://example.com/about") >>> print(result["content"]) >>> # Fetch a PDF - >>> result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf") + >>> result = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf") >>> print(f"Pages: {result['num_pages']}") >>> print(result["content"]) """ @@ -180,8 +182,8 @@ def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: return _make_error_response("Invalid URL. Must start with http:// or https://", url) try: - with httpx.Client(timeout=60.0, follow_redirects=True) as client: - response = _fetch_with_retry(client, url) + async with httpx.AsyncClient(timeout=60.0, follow_redirects=True) as client: + response = await _fetch_with_retry(client, url) content_type = response.headers.get("content-type", "") final_url = str(response.url) diff --git a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py index 7fc251d..d0e83f4 100644 --- a/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py +++ b/aieng-eval-agents/tests/aieng/agent_evals/tools/test_web.py @@ -75,29 +75,34 @@ def test_preserves_headings(self): class TestWebFetch: """Tests for the web_fetch function.""" - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_html_success(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_html_success(self, mock_client_class): """Test successful HTML fetch returns content.""" mock_response = MagicMock() mock_response.text = "
<html><body><h1>Hello World</h1></body></html>
" mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" assert "content" in result assert "Hello World" in result["content"] assert result["content_type"] == "text/html" - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_pdf_success(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_pdf_success(self, mock_client_class): """Test that PDF content is extracted successfully.""" # Create a PDF with text writer = PdfWriter() @@ -111,21 +116,25 @@ def test_fetch_pdf_success(self, mock_client_class): mock_response.headers = {"content-type": "application/pdf"} mock_response.url = "https://example.com/doc.pdf" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com/doc.pdf") + result = await web_fetch("https://example.com/doc.pdf") assert result["status"] == "success" assert result["content_type"] == "application/pdf" assert "num_pages" in result assert result["num_pages"] >= 1 - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_returns_content_length(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def test_fetch_returns_content_length(self, mock_client_class): """Test that fetch returns content length.""" long_text = "A" * 10000 mock_response = MagicMock() @@ -133,21 +142,25 @@ def test_fetch_returns_content_length(self, mock_client_class): mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" # Content length should include the 10000 As (may have some markdown formatting) assert result["content_length"] >= 10000 assert not result["truncated"] - @patch("aieng.agent_evals.tools.web.httpx.Client") - def test_fetch_truncates_large_content(self, mock_client_class): + @pytest.mark.asyncio + @patch("aieng.agent_evals.tools.web.httpx.AsyncClient") + async def 
test_fetch_truncates_large_content(self, mock_client_class): """Test that very large content is truncated.""" # Create content larger than MAX_CONTENT_CHARS (100KB) large_text = "A" * 150_000 @@ -156,21 +169,25 @@ def test_fetch_truncates_large_content(self, mock_client_class): mock_response.headers = {"content-type": "text/html"} mock_response.url = "https://example.com" + async def mock_get(*_args, **_kwargs): + return mock_response + mock_client = MagicMock() - mock_client.get.return_value = mock_response - mock_client.__enter__ = MagicMock(return_value=mock_client) - mock_client.__exit__ = MagicMock(return_value=False) + mock_client.get = mock_get + mock_client.__aenter__ = AsyncMock(return_value=mock_client) + mock_client.__aexit__ = AsyncMock(return_value=None) mock_client_class.return_value = mock_client - result = web_fetch("https://example.com") + result = await web_fetch("https://example.com") assert result["status"] == "success" assert result["truncated"] is True assert "[Content truncated" in result["content"] - def test_fetch_invalid_url(self): + @pytest.mark.asyncio + async def test_fetch_invalid_url(self): """Test that invalid URLs return error.""" - result = web_fetch("not-a-url") + result = await web_fetch("not-a-url") assert result["status"] == "error" assert "Invalid URL" in result["error"] @@ -338,9 +355,10 @@ class TestWebFetchIntegration: and PDF documents, returning content suitable for the agent to analyze. """ - def test_fetch_html_page_returns_readable_content(self): + @pytest.mark.asyncio + async def test_fetch_html_page_returns_readable_content(self): """Test that HTML pages are converted to readable markdown.""" - result = web_fetch("https://www.iana.org/help/example-domains") + result = await web_fetch("https://www.iana.org/help/example-domains") assert result["status"] == "success" assert result["content_type"] == "text/html" or "html" in result["content_type"].lower() @@ -359,9 +377,10 @@ def test_fetch_html_page_returns_readable_content(self): # Links should be in markdown format, not raw
tags assert " 0 @@ -373,9 +392,10 @@ def test_fetch_pdf_extracts_text(self): # Verify page markers are present assert "--- Page" in content - def test_fetch_pdf_pagination(self): + @pytest.mark.asyncio + async def test_fetch_pdf_pagination(self): """Test that PDF max_pages parameter limits extraction.""" - result = web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) + result = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=1) assert result["status"] == "success" assert result["pages_extracted"] == 1 assert result["num_pages"] >= 1 From 33726ad303c4a4f4eb5068dbd2fd9133a9e15250 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Thu, 5 Feb 2026 15:37:17 -0500 Subject: [PATCH 3/4] Fixes based on code review --- .../aieng/agent_evals/tools/search.py | 3 + .../aieng/agent_evals/tools/web.py | 75 +++++++++++-------- 2 files changed, 46 insertions(+), 32 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/search.py b/aieng-eval-agents/aieng/agent_evals/tools/search.py index 33d18ae..46602c5 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/search.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/search.py @@ -156,6 +156,9 @@ async def _google_search_async(query: str, model: str) -> dict[str, Any]: "summary": "", "sources": [], } + finally: + # Properly close the client to avoid aiohttp session leaks + client.close() async def google_search(query: str, model: str | None = None) -> dict[str, Any]: diff --git a/aieng-eval-agents/aieng/agent_evals/tools/web.py b/aieng-eval-agents/aieng/agent_evals/tools/web.py index 9eb14d0..1c99d44 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/web.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/web.py @@ -5,6 +5,8 @@ """ import logging +import re +from collections.abc import Callable from io import BytesIO from typing import Any from urllib.parse import urljoin @@ -12,7 +14,8 @@ import httpx from google.adk.tools.function_tool import FunctionTool from html_to_markdown import convert as html_to_markdown -from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential +from pypdf import PdfReader +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) @@ -20,20 +23,45 @@ MAX_CONTENT_CHARS = 100_000 +def _make_absolute_url(base_url: str) -> Callable[[re.Match[str]], str]: + """Create a function that converts relative URLs to absolute URLs. + + Parameters + ---------- + base_url : str + Base URL for resolving relative links. + + Returns + ------- + Callable[[re.Match[str]], str] + Function that takes a regex match and returns the URL converted to absolute. 
+ """ + + def make_absolute(match: re.Match) -> str: + """Convert relative URL to absolute.""" + prefix = match.group(1) # [text]( or src=" + url = match.group(2) + suffix = match.group(3) # ) or " + + # Skip if already absolute or is a data URI + if url.startswith(("http://", "https://", "data:", "mailto:", "#")): + return match.group(0) + + absolute_url = urljoin(base_url, url) + return f"{prefix}{absolute_url}{suffix}" + + return make_absolute + + +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1, min=1, max=10), + retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), +) async def _fetch_with_retry(client: httpx.AsyncClient, url: str) -> httpx.Response: """Fetch URL with automatic retry on transient failures.""" - response: httpx.Response | None = None - async for attempt in AsyncRetrying( - stop=stop_after_attempt(3), - wait=wait_exponential(multiplier=1, min=1, max=10), - retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)), - ): - with attempt: - response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) - response.raise_for_status() - - # AsyncRetrying ensures response is set on success - assert response is not None + response = await client.get(url, headers={"User-Agent": "Mozilla/5.0 (compatible; ResearchBot/1.0)"}) + response.raise_for_status() return response @@ -58,20 +86,7 @@ def _html_to_markdown(html: str, base_url: str | None = None) -> str: # If base_url provided, convert relative URLs to absolute if base_url: - import re # noqa: PLC0415 - - def make_absolute(match: re.Match) -> str: - """Convert relative URL to absolute.""" - prefix = match.group(1) # [text]( or src=" - url = match.group(2) - suffix = match.group(3) # ) or " - - # Skip if already absolute or is a data URI - if url.startswith(("http://", "https://", "data:", "mailto:", "#")): - return match.group(0) - - absolute_url = urljoin(base_url, url) - return f"{prefix}{absolute_url}{suffix}" + make_absolute = _make_absolute_url(base_url) # Fix markdown links: [text](url) markdown = re.sub(r"(\[[^\]]*\]\()([^)]+)(\))", make_absolute, markdown) @@ -97,8 +112,6 @@ def _extract_pdf_text(content: bytes, max_pages: int = 10) -> tuple[str, int]: tuple[str, int] The extracted text and total number of pages. """ - from pypdf import PdfReader # noqa: PLC0415 - pdf_file = BytesIO(content) reader = PdfReader(pdf_file) num_pages = len(reader.pages) @@ -192,7 +205,7 @@ async def web_fetch(url: str, max_pages: int = 10) -> dict[str, Any]: return _handle_pdf_response(response.content, max_pages, final_url, url) # Handle HTML and text content - if "text/html" in content_type or not content_type: + if "text/html" in content_type or content_type == "": text = _html_to_markdown(response.text, base_url=final_url) else: text = response.text @@ -225,8 +238,6 @@ def _handle_pdf_response(content: bytes, max_pages: int, final_url: str, url: st num_pages=num_pages, pages_extracted=min(num_pages, max_pages), ) - except ImportError: - return _make_error_response("PDF support requires pypdf. 
Install with: pip install pypdf", url) except Exception as e: return _make_error_response(f"Failed to extract PDF text: {e!s}", url) From 698e812a38f678558a0e7fdcdbdc8e00d47a3e8a Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Thu, 5 Feb 2026 15:42:51 -0500 Subject: [PATCH 4/4] Use retry decorator for the search tool's redirect url --- .../aieng/agent_evals/tools/_redirect.py | 72 +++++++------------ 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py b/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py index 76a9594..05772a3 100644 --- a/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py +++ b/aieng-eval-agents/aieng/agent_evals/tools/_redirect.py @@ -9,7 +9,7 @@ import logging import httpx -from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential logger = logging.getLogger(__name__) @@ -63,12 +63,28 @@ async def _resolve_with_get_async(client: httpx.AsyncClient, url: str) -> str: return str(response.url) -async def _resolve_single_url_async( - client: httpx.AsyncClient, - url: str, - max_retries: int = 3, - base_delay: float = 1.0, -) -> str: +@retry( + stop=stop_after_attempt(3), + wait=wait_exponential(multiplier=1.0, min=1.0, max=60.0), + retry=retry_if_exception_type((httpx.TimeoutException, httpx.ConnectError, httpx.ReadError)), +) +async def _resolve_single_url_with_retry(client: httpx.AsyncClient, url: str) -> str: + """Resolve URL with retry handling (internal helper).""" + # Try HEAD first (faster, no body download) + final_url = await _resolve_with_head_async(client, url) + + # Fall back to GET if HEAD failed + if final_url is None: + logger.debug(f"HEAD failed for {url[:60]}..., trying GET") + final_url = await _resolve_with_get_async(client, url) + + if final_url != url: + logger.debug(f"Resolved redirect: {url[:60]}... -> {final_url[:60]}...") + + return final_url + + +async def _resolve_single_url_async(client: httpx.AsyncClient, url: str) -> str: """Resolve a single URL with retries and exponential backoff. Uses tenacity for automatic retry handling with exponential backoff. @@ -79,10 +95,6 @@ async def _resolve_single_url_async( The HTTP client to use. url : str The URL to resolve. - max_retries : int - Maximum number of retry attempts. - base_delay : float - Base delay between retries (doubles each retry). Returns ------- @@ -98,27 +110,9 @@ async def _resolve_single_url_async( return _redirect_cache[url] try: - async for attempt in AsyncRetrying( - stop=stop_after_attempt(max_retries), - wait=wait_exponential(multiplier=base_delay, min=base_delay, max=60.0), - retry=retry_if_exception_type((httpx.TimeoutException, httpx.ConnectError, httpx.ReadError)), - ): - with attempt: - # Try HEAD first (faster, no body download) - final_url = await _resolve_with_head_async(client, url) - - # Fall back to GET if HEAD failed - if final_url is None: - logger.debug(f"HEAD failed for {url[:60]}..., trying GET") - final_url = await _resolve_with_get_async(client, url) - - if final_url != url: - logger.debug(f"Resolved redirect: {url[:60]}... 
-> {final_url[:60]}...") - - _redirect_cache[url] = final_url - - # If we reach here, the retry loop succeeded - return _redirect_cache[url] + final_url = await _resolve_single_url_with_retry(client, url) + _redirect_cache[url] = final_url + return final_url except Exception as e: # All retries exhausted or non-retryable error @@ -140,19 +134,7 @@ async def resolve_redirect_url_async(url: str) -> str: str The final destination URL after following redirects. """ - # Skip resolution for non-redirect URLs (fast path) - if not _is_redirect_url(url): - return url - - # Check cache first (fast path) - if url in _redirect_cache: - return _redirect_cache[url] - - async with httpx.AsyncClient( - timeout=_get_redirect_timeout(), - follow_redirects=True, - ) as client: - return await _resolve_single_url_async(client, url) + return (await resolve_redirect_urls_async([url]))[0] async def resolve_redirect_urls_async(urls: list[str]) -> list[str]:
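Usage sketch (not part of the patches above): a minimal illustration of how the async web_fetch added in PATCH 1/4 and made async in PATCH 2/4 is meant to be called directly and wrapped for an agent. The import path follows the file layout of the patches; the URLs and the event-loop setup are placeholder assumptions.

import asyncio

from aieng.agent_evals.tools.web import create_web_fetch_tool, web_fetch

# Wrap the coroutine as an ADK FunctionTool for registration on an agent.
web_fetch_tool = create_web_fetch_tool()


async def main() -> None:
    # HTML page: content comes back as markdown-converted text plus metadata.
    page = await web_fetch("https://example.com/about")
    if page["status"] == "success":
        print(page["content_type"], page["content_length"], page["truncated"])
    else:
        print("fetch failed:", page["error"])

    # PDF document: text is extracted per page, capped by max_pages.
    pdf = await web_fetch("https://arxiv.org/pdf/2301.00234.pdf", max_pages=2)
    if pdf["status"] == "success":
        print(pdf["num_pages"], "pages total,", pdf["pages_extracted"], "extracted")


if __name__ == "__main__":
    asyncio.run(main())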