generated from VectorInstitute/aieng-template-implementation
-
Notifications
You must be signed in to change notification settings - Fork 0
Improve search tool to extract resolved urls #28
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
23 commits
Select commit
Hold shift + click to select a range
438a383
Improve search tool to extract resolved urls
amrit110 c2b09d2
Add web fetch tool
amrit110 67bb38d
Add html_to_markdown dependency
amrit110 da81995
Add html_to_markdown dependency
amrit110 05471dc
Refactor out the redirect url code
amrit110 5685b65
Remove unused synchronous version
amrit110 fc29b79
Fix merge conflict
amrit110 ce62b75
Merge branch 'main' into ak/improve_search_tool
amrit110 ca7d319
Merge branch 'main' into ak/improve_search_tool
amrit110 3e8938a
Merge branch 'ak/improve_search_tool' of github.com:VectorInstitute/e…
amrit110 5ada2fa
Fix merge conflicts
amrit110 1caea9e
Fix typing issues
amrit110 c6b1207
Merge branch 'main' into ak/improve_search_tool
amrit110 fa7a951
Update search fn to async
amrit110 871530f
Merge branch 'ak/improve_search_tool' of github.com:VectorInstitute/e…
amrit110 f6792cf
Use modern operator to denote union of types
amrit110 5ecb97f
[pre-commit.ci] Add auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 1fee32a
Use tenacity for retries
amrit110 11655f8
Merge branch 'ak/improve_search_tool' of github.com:VectorInstitute/e…
amrit110 5eebe9f
Fix config in test using mock
amrit110 9a73472
Improve return docstring
amrit110 6f93ea0
Fix test
amrit110 2a9f4bb
Remove use of cast, lets stop lying to the type checker
amrit110 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,183 @@ | ||
| """URL redirect resolution utilities. | ||
|
|
||
| Provides utilities for resolving redirect URLs (especially Vertex AI grounding | ||
| redirects) to their final destinations. Used by search and web fetch tools to | ||
| display actual URLs. | ||
| """ | ||
|
|
||
| import asyncio | ||
| import logging | ||
|
|
||
| import httpx | ||
| from tenacity import AsyncRetrying, retry_if_exception_type, stop_after_attempt, wait_exponential | ||
|
|
||
|
|
||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Substrings that mark a URL as a known redirect. They are matched with a
# plain ``in`` containment test (see _is_redirect_url), not as a prefix.
REDIRECT_URL_PATTERNS = (
    "vertexaisearch.cloud.google.com/grounding-api-redirect",
    "vertexaisearch.cloud.google.com/redirect",
)

# Connect / read timeouts handed to httpx.Timeout by _get_redirect_timeout.
_REDIRECT_CONNECT_TIMEOUT = 10.0
_REDIRECT_READ_TIMEOUT = 15.0
# User-Agent header sent with the HEAD and GET resolution requests.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
# Module-level cache mapping an original URL to its resolved URL. Failed
# resolutions are stored as url -> url, and entries are never evicted.
_redirect_cache: dict[str, str] = {}
|
|
||
|
|
||
def _is_redirect_url(url: str) -> bool:
    """Return True when ``url`` contains one of the known redirect patterns."""
    for known_pattern in REDIRECT_URL_PATTERNS:
        if known_pattern in url:
            return True
    return False
|
|
||
|
|
||
def _get_redirect_timeout() -> httpx.Timeout:
    """Build the httpx timeout settings used while resolving redirects."""
    timeout_settings = {
        "connect": _REDIRECT_CONNECT_TIMEOUT,
        "read": _REDIRECT_READ_TIMEOUT,
        "write": 10.0,
        "pool": 10.0,
    }
    return httpx.Timeout(**timeout_settings)
|
|
||
|
|
||
async def _resolve_with_head_async(client: httpx.AsyncClient, url: str) -> str | None:
    """Try to resolve a redirect using an async HEAD request.

    Parameters
    ----------
    client : httpx.AsyncClient
        The HTTP client to use.
    url : str
        The URL to resolve.

    Returns
    -------
    str | None
        The URL of the final response, or ``None`` when HEAD could not be
        used (the request failed, or the server answered 405/501) and the
        caller should fall back to GET.
    """
    try:
        response = await client.head(url, headers={"User-Agent": _USER_AGENT})
    except Exception:
        # Best effort: any HEAD failure just signals the caller to try GET.
        return None

    # httpx only raises HTTPStatusError from raise_for_status(), so a server
    # rejecting HEAD (405 Method Not Allowed / 501 Not Implemented) has to be
    # detected from the status code of the returned response.
    if response.status_code in (405, 501):
        return None  # Signal to try GET
    return str(response.url)
|
|
||
|
|
||
async def _resolve_with_get_async(client: httpx.AsyncClient, url: str) -> str:
    """Resolve a redirect with an async GET request (used when HEAD gave no answer)."""
    request_headers = {"User-Agent": _USER_AGENT}
    # Streaming the response means the body is not downloaded; only the
    # final response URL is read before the stream is closed.
    async with client.stream("GET", url, headers=request_headers) as streamed:
        final_location = streamed.url
        return str(final_location)
|
|
||
|
|
||
async def _resolve_single_url_async(
    client: httpx.AsyncClient,
    url: str,
    max_retries: int = 3,
    base_delay: float = 1.0,
) -> str:
    """Resolve a single URL with retries and exponential backoff.

    Uses tenacity for automatic retry handling with exponential backoff.
    Only timeouts, connection errors and read errors are retried; any other
    exception ends resolution immediately.

    Parameters
    ----------
    client : httpx.AsyncClient
        The HTTP client to use.
    url : str
        The URL to resolve.
    max_retries : int
        Maximum number of attempts in total (passed to ``stop_after_attempt``).
    base_delay : float
        Base delay between attempts (doubles each retry, capped at 60.0).

    Returns
    -------
    str
        The resolved URL, or original URL if not a redirect or on failure.
        Both outcomes are stored in the module-level cache.
    """
    # Skip resolution for non-redirect URLs
    if not _is_redirect_url(url):
        return url

    # Check cache first
    if url in _redirect_cache:
        return _redirect_cache[url]

    try:
        async for attempt in AsyncRetrying(
            stop=stop_after_attempt(max_retries),
            wait=wait_exponential(multiplier=base_delay, min=base_delay, max=60.0),
            retry=retry_if_exception_type((httpx.TimeoutException, httpx.ConnectError, httpx.ReadError)),
            # Surface the last underlying error instead of tenacity's
            # RetryError wrapper so the warning below names the real cause.
            reraise=True,
        ):
            with attempt:
                # Try HEAD first (faster, no body download)
                final_url = await _resolve_with_head_async(client, url)

                # Fall back to GET if HEAD failed
                if final_url is None:
                    logger.debug("HEAD failed for %s..., trying GET", url[:60])
                    final_url = await _resolve_with_get_async(client, url)

                if final_url != url:
                    logger.debug("Resolved redirect: %s... -> %s...", url[:60], final_url[:60])

                _redirect_cache[url] = final_url

        # If we reach here, the retry loop succeeded
        return _redirect_cache[url]

    except Exception as e:
        # All retries exhausted or non-retryable error
        logger.warning("Failed to resolve redirect URL %s...: %s: %s", url[:60], type(e).__name__, e)
        _redirect_cache[url] = url  # Cache failures to avoid repeated attempts
        return url
|
|
||
|
|
||
async def resolve_redirect_url_async(url: str) -> str:
    """Resolve one URL to its final destination, with caching and retries.

    Parameters
    ----------
    url : str
        The URL to resolve (may be a redirect URL).

    Returns
    -------
    str
        The final destination URL after following redirects.
    """
    # Fast paths that need no HTTP client: ordinary URLs and cached results.
    if not _is_redirect_url(url):
        return url
    if url in _redirect_cache:
        return _redirect_cache[url]

    redirect_client = httpx.AsyncClient(
        timeout=_get_redirect_timeout(),
        follow_redirects=True,
    )
    async with redirect_client as client:
        resolved = await _resolve_single_url_async(client, url)
        return resolved
|
|
||
|
|
||
async def resolve_redirect_urls_async(urls: list[str]) -> list[str]:
    """Resolve multiple redirect URLs in parallel.

    Resolves URLs concurrently with proper error handling per URL. Repeated
    URLs are resolved only once and the result is shared between every
    occurrence.

    Parameters
    ----------
    urls : list[str]
        List of URLs to resolve.

    Returns
    -------
    list[str]
        List of resolved URLs in the same order (and of the same length) as
        ``urls``.
    """
    if not urls:
        return []

    # All tasks start before any of them can populate the cache, so without
    # de-duplication every repeated URL would trigger its own network request.
    unique_urls = list(dict.fromkeys(urls))

    async with httpx.AsyncClient(
        timeout=_get_redirect_timeout(),
        follow_redirects=True,
        limits=httpx.Limits(max_connections=20, max_keepalive_connections=10),
    ) as client:
        # Resolve all distinct URLs in parallel
        tasks = [_resolve_single_url_async(client, url) for url in unique_urls]
        results = await asyncio.gather(*tasks)

    resolved_by_url = dict(zip(unique_urls, results))
    return [resolved_by_url[url] for url in urls]
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.