28 changes: 20 additions & 8 deletions crawl4ai/utils.py
@@ -2177,19 +2177,19 @@ def normalize_url(
     str | None
         A clean, canonical URL or None if href is empty/None.
     """
-    if not href:
+    if not href or not href.strip():
         return None

     # Resolve relative paths first
     full_url = urljoin(base_url, href.strip())

     # Preserve HTTPS if requested and original scheme was HTTPS
     if preserve_https and original_scheme == 'https':
         parsed_full = urlparse(full_url)
         parsed_base = urlparse(base_url)
         # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
         # Protocol-relative URLs (//example.com) should follow the base URL's scheme
         if (parsed_full.scheme == 'http' and
             parsed_full.netloc == parsed_base.netloc and
             not href.strip().startswith('//')):
             full_url = full_url.replace('http://', 'https://', 1)
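Note: the protocol-relative guard above leans on `urljoin` already resolving `//host/...` against the base URL's scheme, so there is never an `http://` prefix to upgrade in that case. A minimal standalone sketch of the two paths (illustrative only, not part of the diff):

```python
from urllib.parse import urljoin, urlparse

base_url = "https://example.com/docs/"

# Protocol-relative href: urljoin adopts the base URL's scheme,
# so the scheme check below is skipped.
print(urljoin(base_url, "//cdn.example.org/lib.js"))
# https://cdn.example.org/lib.js

# Same-domain http href: this is the case the hunk upgrades to https.
full_url = urljoin(base_url, "http://example.com/page")
parsed_full, parsed_base = urlparse(full_url), urlparse(base_url)
if (parsed_full.scheme == 'http' and
        parsed_full.netloc == parsed_base.netloc):
    full_url = full_url.replace('http://', 'https://', 1)
print(full_url)  # https://example.com/page
```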
@@ -2199,6 +2199,14 @@ def normalize_url(

     # ── netloc ──
     netloc = parsed.netloc.lower()
+
+    # Remove default ports
+    if ':' in netloc:
+        host, port = netloc.rsplit(':', 1)
+        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
+            netloc = host
+        else:
+            netloc = f"{host}:{port}"

     # ── path ──
     # Strip duplicate slashes and trailing "/" (except root)
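For reviewers, here is a self-contained sketch of what the added default-port logic does, wrapped in a hypothetical `strip_default_port` helper (the helper name and wrapper are not from the diff):

```python
from urllib.parse import urlparse

def strip_default_port(url: str) -> str:
    """Hypothetical helper mirroring the added netloc logic above."""
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()
    if ':' in netloc:
        host, port = netloc.rsplit(':', 1)
        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
            netloc = host  # drop the scheme's default port
    return parsed._replace(netloc=netloc).geturl()

print(strip_default_port("https://Example.com:443/a"))  # https://example.com/a
print(strip_default_port("http://example.com:8080/a"))  # http://example.com:8080/a (non-default port kept)
```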
@@ -2212,21 +2212,25 @@ def normalize_url(
     query = parsed.query
     if query:
         # explode, mutate, then rebuild
-        params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
+        params = list(parse_qsl(query, keep_blank_values=True))  # Parse query string into key-value pairs, preserving blank values

         if drop_query_tracking:
+            # Define default tracking parameters to remove for cleaner URLs
             default_tracking = {
                 'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
                 'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
             }
             if extra_drop_params:
-                default_tracking |= {p.lower() for p in extra_drop_params}
-            params = [(k, v) for k, v in params if k not in default_tracking]
+                default_tracking |= {p.lower() for p in extra_drop_params}  # Add any extra parameters to drop, case-insensitive
+            params = [(k, v) for k, v in params if k.lower() not in default_tracking]  # Filter out tracking parameters, matching keys case-insensitively since they are no longer lowercased at parse time

+        # Normalize parameter keys to lowercase
+        params = [(k.lower(), v) for k, v in params]

         if sort_query:
-            params.sort(key=lambda kv: kv[0])
+            params.sort(key=lambda kv: kv[0])  # Sort parameters alphabetically by key (now lowercase)

-        query = urlencode(params, doseq=True) if params else ''
+        query = urlencode(params, doseq=True) if params else ''  # Rebuild query string, handling sequences properly

     # ── fragment ──
     fragment = parsed.fragment if keep_fragment else ''
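Taken together, the query hunk (with the case-handling fix noted inline) behaves like this abbreviated sketch; the tracking set here is a subset of the real default list:

```python
from urllib.parse import parse_qsl, urlencode

query = "b=2&UTM_Source=x&A=1&gclid=abc"
default_tracking = {'utm_source', 'utm_medium', 'utm_campaign', 'gclid', 'fbclid'}  # abbreviated

params = list(parse_qsl(query, keep_blank_values=True))
params = [(k, v) for k, v in params if k.lower() not in default_tracking]  # drop tracking params
params = [(k.lower(), v) for k, v in params]  # normalize keys to lowercase
params.sort(key=lambda kv: kv[0])             # deterministic ordering
print(urlencode(params, doseq=True))          # a=1&b=2
```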