Commits (changes shown from 57 of 89 commits)
1e1c887
fix(docker-api): migrate to modern datetime library API
emmanuel-ferdman May 13, 2025
8e3c411
Merge branch 'main' into main
emmanuel-ferdman Jul 29, 2025
7a8190e
Fix examples in README.md
NezarAli Aug 6, 2025
be63c98
feat(docker): add user-provided hooks support to Docker API
ntohidi Aug 11, 2025
88a9fbb
fix(deep-crawl): BestFirst priority inversion; remove pre-scoring tru…
ntohidi Aug 11, 2025
ecbe5ff
docs: Update URL seeding examples to use proper async context managers
SohamKukreti Aug 13, 2025
f4a4328
fix(crawler): Removed the incorrect reference in browser_config varia…
ntohidi Aug 18, 2025
dad7c51
Merge pull request #1398 from unclecode/fix/update-url-seeding-docs
ntohidi Aug 18, 2025
9447054
docs: update Docker instructions to use the latest release tag
ntohidi Aug 18, 2025
f4206d6
Merge pull request #1369 from NezarAli/main
ntohidi Aug 18, 2025
ef174a4
Merge pull request #1104 from emmanuel-ferdman/main
ntohidi Aug 20, 2025
69961cf
Merge branch 'develop' of https://github.com/unclecode/crawl4ai into …
ntohidi Aug 20, 2025
9505102
fix(docker): Fix LLM API key handling for multi-provider support
ntohidi Aug 21, 2025
8bb0e68
Merge pull request #1422 from unclecode/fix/docker-llmEnvFile
ntohidi Aug 21, 2025
90af453
Merge branch 'develop' of https://github.com/unclecode/crawl4ai into …
ntohidi Aug 21, 2025
c09a576
docs: update adaptive crawler docs and cache defaults; remove depreca…
SohamKukreti Aug 21, 2025
40ab287
fix(utils): Improve URL normalization by avoiding quote/unquote to pr…
ntohidi Aug 22, 2025
b1dff5a
feat: Add comprehensive website to API example with frontend
SohamKukreti Aug 24, 2025
f2da460
fix(dependencies): add cssselect to project dependencies
Thermofish Aug 25, 2025
102352e
fix(docker): resolve filter serialization and JSON encoding errors in…
ntohidi Aug 25, 2025
38f3ea4
fix(logger): ensure logger is a Logger instance in crawling strategie…
ntohidi Aug 26, 2025
159207b
feat(docker): Add temperature and base_url parameters for LLM configu…
ntohidi Aug 26, 2025
4fe2d01
Merge pull request #1440 from unclecode/feature/docker-llm-parameters
ntohidi Aug 26, 2025
cce3390
Merge pull request #1426 from unclecode/fix/update-quickstart-and-ada…
ntohidi Aug 26, 2025
2ad3fb5
feat(docker): improve docker error handling
SohamKukreti Aug 26, 2025
4e1c4bd
Merge pull request #1436 from unclecode/fix/docker-filter
ntohidi Aug 27, 2025
f7a3366
#1375 : refactor(proxy) Deprecate 'proxy' parameter in BrowserConfig …
Ahmed-Tawfik94 Aug 28, 2025
4ed33fc
Remove deprecated test for 'proxy' parameter in BrowserConfig and upd…
Ahmed-Tawfik94 Aug 28, 2025
f566c5a
feat: add preserve_https_for_internal_links flag to maintain HTTPS du…
ntohidi Aug 28, 2025
bdacf61
feat: update documentation for preserve_https_for_internal_links. ref…
ntohidi Aug 28, 2025
70f473b
fix: drop Python 3.9 support and require Python >=3.10.
SohamKukreti Aug 28, 2025
9749e28
issue #1329 refactor(crawler): move unwanted properties to CrawlerRun…
nafeqq-1306 Aug 29, 2025
2de200c
Merge pull request #1433 from Thermofish/fix/excluded_selector
ntohidi Aug 29, 2025
6e72809
fix(auth): fixed Docker JWT authentication. ref #1442
ntohidi Sep 1, 2025
5e7fcb1
Merge pull request #1448 from unclecode/fix/https-reditrect
ntohidi Sep 1, 2025
af28e84
Merge pull request #1441 from unclecode/fix/improve-docker-error-hand…
ntohidi Sep 2, 2025
ae67d66
Merge pull request #1454 from nafeqq-1306/docstring-changes
ntohidi Sep 2, 2025
6772134
remove: delete unused yoyo snapshot subproject
ntohidi Sep 2, 2025
4878396
fix: raise error on last attempt failure in perform_completion_with_b…
ntohidi Sep 2, 2025
bc6d814
Merge pull request #1451 from unclecode/fix/remove-python3.9-version
ntohidi Sep 2, 2025
1eacea1
Merge pull request #1432 from unclecode/example/web2api-example
ntohidi Sep 3, 2025
6a3b3e9
Commit without API
Ahmed-Tawfik94 Sep 3, 2025
0482c1e
Merge pull request #1469 from unclecode/fix/docker-jwt
ntohidi Sep 4, 2025
1874a7b
fix: update option labels in request builder for clarity
Ahmed-Tawfik94 Sep 5, 2025
3bc56dd
fix: allow custom LLM providers for adaptive crawler embedding config…
ntohidi Sep 9, 2025
14b42b1
Merge pull request #1471 from unclecode/fix/adaptive-crawler-llm-config
ntohidi Sep 9, 2025
f8eaf01
Merge pull request #1467 from unclecode/fix/request-crawl-stream
ntohidi Sep 11, 2025
1717827
refactor(BrowserConfig): change deprecation warning for 'proxy' param…
Ahmed-Tawfik94 Sep 12, 2025
23431d8
Merge pull request #1389 from unclecode/fix/deep-crawl-scoring
ntohidi Sep 16, 2025
3899ac3
Merge pull request #1464 from unclecode/fix/proxy_deprecation
ntohidi Sep 16, 2025
77559f3
feat(StealthAdapter): fix stealth features for Playwright integration…
ntohidi Sep 18, 2025
d0eb5a6
Merge pull request #1501 from unclecode/fix/n-playwright-stealth
ntohidi Sep 19, 2025
a1950af
#1505 fix(api): update config handling to only set base config if not…
Ahmed-Tawfik94 Sep 22, 2025
69e8ca3
Merge pull request #1508 from unclecode/docker/base_config_overrides
ntohidi Sep 22, 2025
fef715a
Merge branch 'feature/docker-hooks' into develop
ntohidi Sep 25, 2025
3fe49a7
fix(docker-deployment): replace console.log with print for metadata e…
ntohidi Sep 25, 2025
361499d
Release v0.7.5: The Update
ntohidi Sep 29, 2025
70af81d
refactor(release): remove memory management section for cleaner docum…
ntohidi Sep 30, 2025
0d8d043
feat(docs): add brand book and page copy functionality
unclecode Sep 30, 2025
ef46df1
Update gitignore add local scripts folder
unclecode Sep 30, 2025
8d30662
fix: remove this import as it causes python to treat "json" as a vari…
Sjoeborg Oct 2, 2025
35dd206
fix: always return a list, even if we catch an exception
Sjoeborg Oct 2, 2025
408ad1b
feat(marketplace): Add Crawl4AI marketplace with secure configuration
unclecode Oct 2, 2025
749d200
fix(marketplace): Update URLs to use /marketplace path and relative A…
unclecode Oct 2, 2025
80aa6c1
Merge pull request #1530 from Sjoeborg/fix/arun-many-returns-none
ntohidi Oct 3, 2025
9292b26
Merge branch 'develop' of https://github.com/unclecode/crawl4ai into …
ntohidi Oct 3, 2025
9900f63
Merge pull request #1531 from unclecode/develop
ntohidi Oct 3, 2025
5145d42
fix(docs): hide copy menu on non-markdown pages
unclecode Oct 3, 2025
8c62277
feat(marketplace): add sponsor logo uploads
unclecode Oct 6, 2025
d2c7f34
feat(docs): add chatgpt quick link to page actions
unclecode Oct 7, 2025
2c373f0
fix(marketplace): align admin api with backend endpoints
unclecode Oct 8, 2025
936397e
Merge branch 'develop' of https://github.com/unclecode/crawl4ai into …
ntohidi Oct 9, 2025
611d48f
Merge branch 'develop' into release/v0.7.5
ntohidi Oct 9, 2025
5a4f21f
fix(marketplace): isolate api under marketplace prefix
unclecode Oct 9, 2025
abe8a92
fix(marketplace): resolve app detail page routing and styling issues
unclecode Oct 11, 2025
216019f
fix(marketplace): prevent hero image overflow and secondary card stre…
unclecode Oct 11, 2025
a3f057e
feat: Add hooks utility for function-based hooks with Docker client i…
ntohidi Oct 13, 2025
7dadb65
Merge branch 'develop' into release/v0.7.5
ntohidi Oct 13, 2025
4a04b85
feat: Add hooks utility for function-based hooks with Docker client i…
ntohidi Oct 13, 2025
aadab30
fix(docs): clarify Docker Hooks System with function-based API in README
ntohidi Oct 13, 2025
8fc1747
docs: Add demonstration files for v0.7.5 release, showcasing the new …
ntohidi Oct 13, 2025
c91b235
docs: Update 0.7.5 video walkthrough
ntohidi Oct 14, 2025
c7288dd
docs: add complete SDK reference documentation
unclecode Oct 19, 2025
749232b
feat: add AI assistant skill package for Crawl4AI
unclecode Oct 19, 2025
1bf85bc
fix: remove non-existent wiki link and clarify skill usage instructions
unclecode Oct 19, 2025
69d0ef8
fix: update Crawl4AI skill with corrected parameters and examples
unclecode Oct 19, 2025
c107617
fix: thoroughly verify and fix all Crawl4AI skill examples
unclecode Oct 19, 2025
6d1a398
feat(ci): split release pipeline and add Docker caching
unclecode Oct 21, 2025
f6a02c4
Merge branch 'develop' into release/v0.7.5
ntohidi Oct 21, 2025
2 changes: 1 addition & 1 deletion .gitignore
@@ -265,7 +265,7 @@ CLAUDE.md
tests/**/test_site
tests/**/reports
tests/**/benchmark_reports

test_scripts/
docs/**/data
.codecat/

10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- **πŸ”’ HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
  - Maintains HTTPS scheme for internal links even when servers redirect to HTTP
  - Prevents security downgrades during deep crawling
  - Useful for security-conscious crawling and sites supporting both protocols
  - Fully backward compatible with opt-in flag (default: `False`)
  - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
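
A minimal sketch of opting in, assuming the flag is accepted as a `CrawlerRunConfig` keyword:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    # Opt-in (default: False): keep internal links on HTTPS even when
    # the server redirects to HTTP during a deep crawl.
    config = CrawlerRunConfig(preserve_https_for_internal_links=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        print(result.success)

asyncio.run(main())
```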

## [0.7.3] - 2025-08-09

### Added
60 changes: 52 additions & 8 deletions README.md
@@ -27,11 +27,13 @@

Crawl4AI turns the web into clean, LLM-ready Markdown for RAG, agents, and data pipelines. Fast, controllable, and battle-tested by a 50k+ star community.

[✨ Check out latest update v0.7.4](#-recent-updates)
[✨ Check out latest update v0.7.5](#-recent-updates)

✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
✨ New in v0.7.5: Docker Hooks System for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)

✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
✨ Recent v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)

✨ Previous v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)

<details>
<summary>πŸ€“ <strong>My Personal Story</strong></summary>
@@ -304,9 +306,9 @@ The new Docker implementation includes:
### Getting Started

```bash
# Pull and run the latest release candidate
docker pull unclecode/crawl4ai:0.7.0
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
# Pull and run the latest release
docker pull unclecode/crawl4ai:latest
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest

# Visit the playground at http://localhost:11235/playground
```
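
A quick liveness check from Python; the `/health` endpoint and its JSON payload shape are assumptions based on the Docker server docs:

```python
import requests

# Assumes the container started above is listening on localhost:11235.
resp = requests.get("http://localhost:11235/health", timeout=10)
print(resp.status_code, resp.json())
```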
@@ -373,7 +375,7 @@ async def main():

async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://docs.micronaut.io/4.7.6/guide/",
url="https://docs.micronaut.io/4.9.9/guide/",
config=run_config
)
print(len(result.markdown.raw_markdown))
@@ -425,7 +427,7 @@ async def main():
"type": "attribute",
"attribute": "src"
}
}
]
}

extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
@@ -544,6 +546,48 @@ async def test_news_crawl():

## ✨ Recent Updates

<details>
<summary><strong>Version 0.7.5 Release Highlights - The Docker Hooks & Security Update</strong></summary>

- **πŸ”§ Docker Hooks System**: Complete pipeline customization with user-provided Python functions:
```python
import requests

# Real working hooks for httpbin.org
hooks_config = {
"on_page_context_created": """
async def hook(page, context, **kwargs):
print("Hook: Setting up page context")
# Block images to speed up crawling
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
return page
""",
"before_goto": """
async def hook(page, context, url, **kwargs):
print(f"Hook: About to navigate to {url}")
# Add custom headers
await page.set_extra_http_headers({'X-Test-Header': 'crawl4ai-hooks-test'})
return page
"""
}

# Test with Docker API
payload = {
"urls": ["https://httpbin.org/html"],
"hooks": {"code": hooks_config, "timeout": 30}
}
response = requests.post("http://localhost:11235/crawl", json=payload)
```
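
Continuing the snippet above, a minimal way to inspect the result; the exact response schema is an assumption here, so the sketch prints the top-level keys before drilling in:

```python
data = response.json()
print(list(data.keys()))  # discover the actual schema first

# "results" and its per-item fields are assumptions, not guaranteed:
for item in data.get("results", []):
    print(item.get("url"), item.get("success"))
```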

- **πŸ€– Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration (see the sketch after this list)
- **πŸ”’ HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True`
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
- **πŸ› οΈ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration

[Full v0.7.5 Release Notes β†’](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)

</details>

<details>
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>

2 changes: 1 addition & 1 deletion crawl4ai/__version__.py
@@ -1,7 +1,7 @@
# crawl4ai/__version__.py

# This is the version that will be used for stable releases
__version__ = "0.7.4"
__version__ = "0.7.5"

# For nightly builds, this gets set during build process
__nightly_version__ = None
78 changes: 60 additions & 18 deletions crawl4ai/adaptive_crawler.py
@@ -19,7 +19,7 @@
from pathlib import Path

from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
from crawl4ai.models import Link, CrawlResult
import numpy as np

@@ -178,7 +178,7 @@ class AdaptiveConfig:

# Embedding strategy parameters
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
n_query_variations: int = 10
coverage_threshold: float = 0.85
alpha_shape_alpha: float = 0.5
@@ -250,6 +250,30 @@ def validate(self):
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"

@property
def _embedding_llm_config_dict(self) -> Optional[Dict]:
"""Convert LLMConfig to dict format for backward compatibility."""
if self.embedding_llm_config is None:
return None

if isinstance(self.embedding_llm_config, dict):
# Already a dict - return as-is for backward compatibility
return self.embedding_llm_config

# Convert LLMConfig object to dict format
return {
'provider': self.embedding_llm_config.provider,
'api_token': self.embedding_llm_config.api_token,
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
'stop': getattr(self.embedding_llm_config, 'stop', None),
'n': getattr(self.embedding_llm_config, 'n', None),
}


class CrawlStrategy(ABC):
@@ -593,7 +617,7 @@ def _get_document_terms(self, crawl_result: CrawlResult) -> List[str]:
class EmbeddingStrategy(CrawlStrategy):
"""Embedding-based adaptive crawling using semantic space coverage"""

def __init__(self, embedding_model: str = None, llm_config: Dict = None):
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
self.llm_config = llm_config
self._embedding_cache = {}
@@ -605,14 +629,24 @@ def __init__(self, embedding_model: str = None, llm_config: Dict = None):
self._kb_embeddings_hash = None # Track KB changes
self._validation_embeddings_cache = None # Cache validation query embeddings
self._kb_similarity_threshold = 0.95 # Threshold for deduplication

def _get_embedding_llm_config_dict(self) -> Dict:
"""Get embedding LLM config as dict with fallback to default."""
if hasattr(self, 'config') and self.config:
config_dict = self.config._embedding_llm_config_dict
if config_dict:
return config_dict

# Fallback to default if no config provided
return {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}

async def _get_embeddings(self, texts: List[str]) -> Any:
"""Get embeddings using configured method"""
from .utils import get_text_embeddings
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
embedding_llm_config = self._get_embedding_llm_config_dict()
return await get_text_embeddings(
texts,
embedding_llm_config,
@@ -679,8 +713,20 @@ async def map_query_semantic_space(self, query: str, n_synthetic: int = 10) -> A
Return as a JSON array of strings."""

# Use the LLM for query generation
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
api_token = self.llm_config.get('api_token') if self.llm_config else None
# Convert LLMConfig to dict if needed
llm_config_dict = None
if self.llm_config:
if isinstance(self.llm_config, dict):
llm_config_dict = self.llm_config
else:
# Convert LLMConfig object to dict
llm_config_dict = {
'provider': self.llm_config.provider,
'api_token': self.llm_config.api_token
}

provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
api_token = llm_config_dict.get('api_token') if llm_config_dict else None

Comment on lines +728 to 730

⚠️ Potential issue | πŸ”΄ Critical

Hardcoded query variations (fried rice) are a blocker.

map_query_semantic_space returns a static, irrelevant set of queries. This breaks adaptive crawling for any real query.

Immediate fix: call the LLM when configured; fall back to simple perturbations otherwise.

-        # response = perform_completion_with_backoff(
-        #     provider=provider,
-        #     prompt_with_variables=prompt,
-        #     api_token=api_token,
-        #     json_response=True
-        # )
-        # variations = json.loads(response.choices[0].message.content)
-        # # Mock data with more variations for split
-        variations ={'queries': ['what are the best vegetables to use in fried rice?', 'how do I make vegetable fried rice from scratch?', 'can you provide a quick recipe for vegetable fried rice?', 'what cooking techniques are essential for perfect fried rice with vegetables?', 'how to add flavor to vegetable fried rice?', 'are there any tips for making healthy fried rice with vegetables?']}
+        variations = None
+        if llm_config_dict:
+            try:
+                response = perform_completion_with_backoff(
+                    provider=llm_config_dict.get('provider', 'openai/gpt-4o-mini'),
+                    prompt_with_variables=prompt,
+                    api_token=llm_config_dict.get('api_token'),
+                    json_response=True
+                )
+                variations = json.loads(response.choices[0].message.content)
+            except Exception:
+                variations = None
+        if not variations or 'queries' not in variations:
+            # Lightweight deterministic fallback
+            base = query.strip().rstrip("?")
+            variations = {'queries': [
+                base,
+                f"{base} overview",
+                f"{base} tutorial",
+                f"{base} examples",
+                f"{base} best practices",
+                f"{base} troubleshooting",
+                f"{base} advanced",
+                f"{base} quick start",
+                f"{base} guide",
+                f"{base} faq",
+                f"{base} tips"
+            ]}
@@
-        other_queries = variations['queries'].copy()
+        other_queries = [q for q in variations['queries'] if q != query]

Also applies to: 741-744, 765-781

🧰 Tools
πŸͺ› Ruff (0.13.1)

728-728: Local variable `provider` is assigned to but never used; remove assignment to unused variable `provider` (F841)

729-729: Local variable `api_token` is assigned to but never used; remove assignment to unused variable `api_token` (F841)

# response = perform_completion_with_backoff(
# provider=provider,
@@ -843,10 +889,7 @@ async def select_links_for_expansion(

# Batch embed only uncached links
if texts_to_embed:
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
embedding_llm_config = self._get_embedding_llm_config_dict()
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)

# Cache the new embeddings
@@ -1184,10 +1227,7 @@ async def update_state(self, state: CrawlState, new_results: List[CrawlResult])
return

# Get embeddings for new texts
embedding_llm_config = {
'provider': 'openai/text-embedding-3-small',
'api_token': os.getenv('OPENAI_API_KEY')
}
embedding_llm_config = self._get_embedding_llm_config_dict()
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)

# Deduplicate embeddings before adding to KB
@@ -1256,10 +1296,12 @@ def _create_strategy(self, strategy_name: str) -> CrawlStrategy:
if strategy_name == "statistical":
return StatisticalStrategy()
elif strategy_name == "embedding":
return EmbeddingStrategy(
strategy = EmbeddingStrategy(
embedding_model=self.config.embedding_model,
llm_config=self.config.embedding_llm_config
)
strategy.config = self.config # Pass config to strategy
return strategy
else:
raise ValueError(f"Unknown strategy: {strategy_name}")

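Taken together, these changes let `embedding_llm_config` accept either a plain dict (backward compatible) or an `LLMConfig` object. A minimal sketch; the `strategy` field name is an assumption based on `_create_strategy` above:

```python
import os

from crawl4ai.adaptive_crawler import AdaptiveConfig
from crawl4ai.async_configs import LLMConfig

# New style: pass an LLMConfig object.
config_from_object = AdaptiveConfig(
    strategy="embedding",
    embedding_llm_config=LLMConfig(
        provider="openai/text-embedding-3-small",
        api_token=os.getenv("OPENAI_API_KEY"),
    ),
)

# Old style: a plain dict still works.
config_from_dict = AdaptiveConfig(
    strategy="embedding",
    embedding_llm_config={
        "provider": "openai/text-embedding-3-small",
        "api_token": os.getenv("OPENAI_API_KEY"),
    },
)
```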