28 changes: 20 additions & 8 deletions crawl4ai/utils.py
@@ -2177,19 +2177,19 @@ def normalize_url(
     str | None
         A clean, canonical URL or None if href is empty/None.
     """
-    if not href:
+    if not href or not href.strip():
         return None

     # Resolve relative paths first
     full_url = urljoin(base_url, href.strip())

     # Preserve HTTPS if requested and original scheme was HTTPS
     if preserve_https and original_scheme == 'https':
         parsed_full = urlparse(full_url)
         parsed_base = urlparse(base_url)
         # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
         # Protocol-relative URLs (//example.com) should follow the base URL's scheme
         if (parsed_full.scheme == 'http' and
             parsed_full.netloc == parsed_base.netloc and
             not href.strip().startswith('//')):
             full_url = full_url.replace('http://', 'https://', 1)
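Note: the protocol-relative guard above leans on `urljoin` already resolving `//host/...` against the base URL's scheme, so there is never an `http://` prefix to upgrade in that case. A minimal standalone sketch of the two paths (illustrative only, not part of the diff):

```python
from urllib.parse import urljoin, urlparse

base_url = "https://example.com/docs/"

# Protocol-relative href: urljoin adopts the base URL's scheme,
# so the scheme check below is skipped.
print(urljoin(base_url, "//cdn.example.org/lib.js"))
# https://cdn.example.org/lib.js

# Same-domain http href: this is the case the hunk upgrades to https.
full_url = urljoin(base_url, "http://example.com/page")
parsed_full, parsed_base = urlparse(full_url), urlparse(base_url)
if (parsed_full.scheme == 'http' and
        parsed_full.netloc == parsed_base.netloc):
    full_url = full_url.replace('http://', 'https://', 1)
print(full_url)  # https://example.com/page
```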
@@ -2199,6 +2199,14 @@ def normalize_url(

     # ── netloc ──
     netloc = parsed.netloc.lower()
+
+    # Remove default ports
+    if ':' in netloc:
+        host, port = netloc.rsplit(':', 1)
+        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
+            netloc = host
+        else:
+            netloc = f"{host}:{port}"

     # ── path ──
     # Strip duplicate slashes and trailing "/" (except root)
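For reviewers, here is a self-contained sketch of what the added default-port logic does, wrapped in a hypothetical `strip_default_port` helper (the helper name and wrapper are not from the diff):

```python
from urllib.parse import urlparse

def strip_default_port(url: str) -> str:
    """Hypothetical helper mirroring the added netloc logic above."""
    parsed = urlparse(url)
    netloc = parsed.netloc.lower()
    if ':' in netloc:
        host, port = netloc.rsplit(':', 1)
        if (parsed.scheme == 'http' and port == '80') or (parsed.scheme == 'https' and port == '443'):
            netloc = host  # drop the scheme's default port
    return parsed._replace(netloc=netloc).geturl()

print(strip_default_port("https://Example.com:443/a"))  # https://example.com/a
print(strip_default_port("http://example.com:8080/a"))  # http://example.com:8080/a (non-default port kept)
```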
@@ -2212,21 +2212,25 @@ def normalize_url(
     query = parsed.query
     if query:
         # explode, mutate, then rebuild
-        params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
+        params = list(parse_qsl(query, keep_blank_values=True))  # Parse query string into key-value pairs, preserving blank values

         if drop_query_tracking:
+            # Define default tracking parameters to remove for cleaner URLs
             default_tracking = {
                 'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
                 'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
             }
             if extra_drop_params:
-                default_tracking |= {p.lower() for p in extra_drop_params}
-            params = [(k, v) for k, v in params if k not in default_tracking]
+                default_tracking |= {p.lower() for p in extra_drop_params}  # Add any extra parameters to drop, case-insensitive
+            params = [(k, v) for k, v in params if k.lower() not in default_tracking]  # Filter out tracking parameters, matching keys case-insensitively since they are no longer lowercased at parse time

+        # Normalize parameter keys to lowercase
+        params = [(k.lower(), v) for k, v in params]

         if sort_query:
-            params.sort(key=lambda kv: kv[0])
+            params.sort(key=lambda kv: kv[0])  # Sort parameters alphabetically by key (now lowercase)

-        query = urlencode(params, doseq=True) if params else ''
+        query = urlencode(params, doseq=True) if params else ''  # Rebuild query string, handling sequences properly

     # ── fragment ──
     fragment = parsed.fragment if keep_fragment else ''
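Taken together, the query hunk (with the case-handling fix noted inline) behaves like this abbreviated sketch; the tracking set here is a subset of the real default list:

```python
from urllib.parse import parse_qsl, urlencode

query = "b=2&UTM_Source=x&A=1&gclid=abc"
default_tracking = {'utm_source', 'utm_medium', 'utm_campaign', 'gclid', 'fbclid'}  # abbreviated

params = list(parse_qsl(query, keep_blank_values=True))
params = [(k, v) for k, v in params if k.lower() not in default_tracking]  # drop tracking params
params = [(k.lower(), v) for k, v in params]  # normalize keys to lowercase
params.sort(key=lambda kv: kv[0])             # deterministic ordering
print(urlencode(params, doseq=True))          # a=1&b=2
```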