229 changes: 158 additions & 71 deletions Cargo.lock

Large diffs are not rendered by default.

17 changes: 6 additions & 11 deletions Cargo.toml
@@ -23,9 +23,9 @@ exclude = [
]

[features]
default = ["fastembed", "huggingface"]
fastembed = ["dep:fastembed"]
huggingface = ["dep:candle-core", "dep:candle-nn", "dep:candle-transformers", "dep:tokenizers", "dep:hf-hub"]
default = []
fastembed = ["octolib/fastembed"]
huggingface = ["octolib/huggingface"]

# Optimized release profile for static linking
[profile.release]
@@ -74,30 +74,25 @@ clap = { version = "4.5.45", features = ["derive"] }
clap_complete = "4.5.57"
notify = { version = "8.2.0", default-features = false, features = ["crossbeam-channel", "macos_fsevent"] }
notify-debouncer-mini = "0.7.0"
fastembed = { version = "5.0.2", optional = true }
toml = "0.9.5"
lazy_static = "1.5.0"
futures = { version = "0.3.31", default-features = false, features = ["std"] }
globset = { version = "0.4.16", default-features = false }
regex = { version = "1.11.1", default-features = false, features = ["std"] }
dirs = "6.0.0"
# Candle dependencies for HuggingFace support (optional)
candle-core = { version = "0.9.1", optional = true }
candle-nn = { version = "0.9.1", optional = true }
candle-transformers = { version = "0.9.1", optional = true }
tokenizers = { version = "0.21.4", optional = true }
hf-hub = { version = "0.4.3", features = ["tokio"], optional = true }
# EditorConfig parsing and formatting
ec4rs = "1.2.0"
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.20", features = ["env-filter", "json"] }
tracing-appender = "0.2.3"
tiktoken-rs = "0.7.0"
# LSP integration dependencies
lsp-types = "0.97.0"
url = "2.5.4"
dotenvy = "0.15"

# Local dependency on octolib with embedding features
octolib = { path = "../octolib", features = ["fastembed", "huggingface"] }

[profile.dev]
opt-level = 1 # Basic optimizations without slowing compilation too much
debug = true # Keep debug symbols for backtraces
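Note on the `[features]` change above: `fastembed` and `huggingface` no longer pull in `fastembed` and the candle stack directly; they now forward to octolib's features of the same name. A minimal sketch of what that means for code in this crate, assuming the feature names stay exactly as declared in the table above:

```rust
// Compiled only when octocode is built with `--features huggingface`,
// which (per the [features] table above) also enables octolib/huggingface.
#[cfg(feature = "huggingface")]
fn huggingface_enabled() -> bool {
    true
}

#[cfg(not(feature = "huggingface"))]
fn huggingface_enabled() -> bool {
    false
}
```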
268 changes: 122 additions & 146 deletions src/embedding/mod.rs
@@ -12,195 +12,136 @@
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod provider;
#[cfg(test)]
mod tests;
pub mod types;
//! Re-export embedding functionality from octolib and add octocode-specific logic

use crate::config::Config;
use anyhow::Result;
use tiktoken_rs::cl100k_base;

pub use provider::{create_embedding_provider_from_parts, EmbeddingProvider};
pub use types::*;
// Re-export core functionality from octolib::embedding
pub use octolib::embedding::{
count_tokens, create_embedding_provider_from_parts, split_texts_into_token_limited_batches,
truncate_output, EmbeddingProvider, InputType,
};

// Re-export types for backward compatibility
pub use octolib::embedding::types::{parse_provider_model, EmbeddingProviderType};

// Create a types module for backward compatibility
pub mod types {
pub use octolib::embedding::types::*;
}

// Create a provider module for backward compatibility
pub mod provider {
pub use octolib::embedding::provider::*;
}
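A quick sanity sketch of the compatibility shim: because `types` and `provider` are pure re-export modules, the legacy path and the direct octolib path name the same items, so existing imports keep compiling. This assumes the module above lives at `crate::embedding`:

```rust
// Both paths resolve to the same type, so no conversion is needed.
use crate::embedding::types::EmbeddingProviderType as ViaShim;
use octolib::embedding::types::EmbeddingProviderType as Direct;

fn identity(p: ViaShim) -> Direct {
    p // compiles only because the shim is a pure re-export
}
```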

/// Configuration for embedding generation (octocode-specific)
#[derive(Debug, Clone)]
pub struct EmbeddingGenerationConfig {
/// Code embedding model (format: "provider:model")
pub code_model: String,
/// Text embedding model (format: "provider:model")
pub text_model: String,
/// Batch size for embedding generation
pub batch_size: usize,
/// Maximum tokens per batch
pub max_tokens_per_batch: usize,
}

impl Default for EmbeddingGenerationConfig {
fn default() -> Self {
Self {
code_model: "voyage:voyage-code-3".to_string(),
text_model: "voyage:voyage-3.5-lite".to_string(),
batch_size: 16,
max_tokens_per_batch: 100_000,
}
}
}

/// Convert octocode Config to octocode EmbeddingGenerationConfig
impl From<&Config> for EmbeddingGenerationConfig {
fn from(config: &Config) -> Self {
Self {
code_model: config.embedding.code_model.clone(),
text_model: config.embedding.text_model.clone(),
batch_size: config.index.embeddings_batch_size,
max_tokens_per_batch: config.index.embeddings_max_tokens_per_batch,
}
}
}
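A short usage sketch for the config type, mirroring the `split_once(':')` parsing that the wrappers below use on the "provider:model" strings:

```rust
fn default_code_provider() -> (String, String) {
    let cfg = EmbeddingGenerationConfig::default();
    // "voyage:voyage-code-3" splits at the first ':'.
    let (provider, model) = cfg
        .code_model
        .split_once(':')
        .expect("model string must be in provider:model format");
    (provider.to_string(), model.to_string())
}
```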

/// Generate embeddings based on configured provider (supports provider:model format)
/// Compatibility wrapper for octocode Config
pub async fn generate_embeddings(
contents: &str,
is_code: bool,
config: &Config,
) -> Result<Vec<f32>> {
let embedding_config = EmbeddingGenerationConfig::from(config);

// Get the model string from config
let model_string = if is_code {
&config.embedding.code_model
&embedding_config.code_model
} else {
&config.embedding.text_model
&embedding_config.text_model
};

// Parse provider and model from the string
let (provider, model) = parse_provider_model(model_string);

let provider_impl = create_embedding_provider_from_parts(&provider, &model).await?;
provider_impl.generate_embedding(contents).await
}

/// Count tokens in a text using tiktoken (cl100k_base tokenizer)
pub fn count_tokens(text: &str) -> usize {
let bpe = cl100k_base().expect("Failed to load cl100k_base tokenizer");
bpe.encode_with_special_tokens(text).len()
}

/// Truncate output if it exceeds token limit
pub fn truncate_output(output: &str, max_tokens: usize) -> String {
if max_tokens == 0 {
return output.to_string();
}

let token_count = count_tokens(output);

if token_count <= max_tokens {
return output.to_string();
}

// Simple truncation - cut at character boundary
// Estimate roughly where to cut (tokens are ~4 chars average)
let estimated_chars = max_tokens * 3; // Conservative estimate
let truncated = if output.len() > estimated_chars {
&output[..estimated_chars]
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
output
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};

// Find last newline to avoid cutting mid-line
let last_newline = truncated.rfind('\n').unwrap_or(truncated.len());
let final_truncated = &truncated[..last_newline];

format!(
"{}\n\n[Output truncated - {} tokens estimated, max {} allowed. Use more specific queries to reduce output size]",
final_truncated,
token_count,
max_tokens
)
}

/// Split texts into batches respecting both count and token limits
pub fn split_texts_into_token_limited_batches(
texts: Vec<String>,
max_batch_size: usize,
max_tokens_per_batch: usize,
) -> Vec<Vec<String>> {
let mut batches = Vec::new();
let mut current_batch = Vec::new();
let mut current_token_count = 0;

for text in texts {
let text_tokens = count_tokens(&text);

// If adding this text would exceed either limit, start a new batch
if !current_batch.is_empty()
&& (current_batch.len() >= max_batch_size
|| current_token_count + text_tokens > max_tokens_per_batch)
{
batches.push(current_batch);
current_batch = Vec::new();
current_token_count = 0;
}

current_batch.push(text);
current_token_count += text_tokens;
}

// Add the last batch if it's not empty
if !current_batch.is_empty() {
batches.push(current_batch);
}

batches
octolib::embedding::generate_embeddings(contents, provider, model).await
}
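A hypothetical call site for this wrapper; only the signature is taken from the diff, the surrounding setup is assumed:

```rust
use crate::config::Config;
use anyhow::Result;

async fn embed_one(config: &Config) -> Result<()> {
    // `true` selects the code model (config.embedding.code_model).
    let vector = generate_embeddings("fn main() {}", true, config).await?;
    println!("embedding has {} dimensions", vector.len());
    Ok(())
}
```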

/// Generate batch embeddings based on configured provider (supports provider:model format)
/// Now includes token-aware batching and input_type support
/// Compatibility wrapper for octocode Config
pub async fn generate_embeddings_batch(
texts: Vec<String>,
is_code: bool,
config: &Config,
input_type: types::InputType,
input_type: InputType,
) -> Result<Vec<Vec<f32>>> {
let embedding_config = EmbeddingGenerationConfig::from(config);

// Get the model string from config
let model_string = if is_code {
&config.embedding.code_model
&embedding_config.code_model
} else {
&config.embedding.text_model
&embedding_config.text_model
};

// Parse provider and model from the string
let (provider, model) = parse_provider_model(model_string);

let provider_impl = create_embedding_provider_from_parts(&provider, &model).await?;
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};

// Split texts into token-limited batches
let batches = split_texts_into_token_limited_batches(
octolib::embedding::generate_embeddings_batch(
texts,
config.index.embeddings_batch_size,
config.index.embeddings_max_tokens_per_batch,
);

let mut all_embeddings = Vec::new();

// Process each batch with input_type
for batch in batches {
let batch_embeddings = provider_impl
.generate_embeddings_batch(batch, input_type.clone())
.await?;
all_embeddings.extend(batch_embeddings);
}

Ok(all_embeddings)
}

/// Calculate a unique hash for content including file path
pub fn calculate_unique_content_hash(contents: &str, file_path: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate a unique hash for content including file path and line ranges
/// This ensures blocks are reindexed when their position changes in the file
pub fn calculate_content_hash_with_lines(
contents: &str,
file_path: &str,
start_line: usize,
end_line: usize,
) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
hasher.update(start_line.to_string().as_bytes());
hasher.update(end_line.to_string().as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate content hash without file path
pub fn calculate_content_hash(contents: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
format!("{:x}", hasher.finalize())
provider,
model,
input_type,
embedding_config.batch_size,
embedding_config.max_tokens_per_batch,
)
.await
}
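A hedged batch sketch. Batching and token limits are now handled inside `octolib::embedding::generate_embeddings_batch`, so the caller just passes the limits from config; the `InputType::Document` variant name is an assumption, not confirmed by this diff:

```rust
use crate::config::Config;
use anyhow::Result;

async fn embed_docs(config: &Config, docs: Vec<String>) -> Result<()> {
    // InputType::Document is an assumed variant; substitute whatever
    // octolib::embedding::InputType actually defines.
    let embeddings =
        generate_embeddings_batch(docs, false, config, InputType::Document).await?;
    assert!(embeddings.iter().all(|e| !e.is_empty()));
    Ok(())
}
```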

/// Search mode embeddings result
/// Search mode embeddings result (octocode-specific)
#[derive(Debug, Clone)]
pub struct SearchModeEmbeddings {
pub code_embeddings: Option<Vec<f32>>,
pub text_embeddings: Option<Vec<f32>>,
}

/// Generate embeddings for search based on mode - centralized logic to avoid duplication
/// This ensures consistent behavior across CLI and MCP interfaces
/// Compatibility wrapper for octocode Config (octocode-specific)
pub async fn generate_search_embeddings(
query: &str,
mode: &str,
@@ -226,8 +226,9 @@ pub async fn generate_search_embeddings(
"all" => {
// For "all" mode, check if code and text models are different
// If different, generate separate embeddings; if same, use one set
let code_model = &config.embedding.code_model;
let text_model = &config.embedding.text_model;
let embedding_config = EmbeddingGenerationConfig::from(config);
let code_model = &embedding_config.code_model;
let text_model = &embedding_config.text_model;

if code_model == text_model {
// Same model for both - generate once and reuse
Expand All @@ -252,3 +194,37 @@ pub async fn generate_search_embeddings(
)),
}
}
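Usage sketch for the search wrapper. The mode strings come from the match arms above; the signature is truncated in this diff, so the trailing `config: &Config` parameter is assumed from the function body:

```rust
async fn search_all(config: &crate::config::Config, query: &str) -> anyhow::Result<()> {
    // "all" yields one or two vectors depending on whether the code and
    // text models differ (see the same-model fast path above).
    let result = generate_search_embeddings(query, "all", config).await?;
    if let Some(code) = result.code_embeddings {
        debug_assert!(!code.is_empty()); // query the code index with `code`
    }
    if let Some(text) = result.text_embeddings {
        debug_assert!(!text.is_empty()); // query the text index with `text`
    }
    Ok(())
}
```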

/// Calculate a unique hash for content including file path (octocode-specific)
pub fn calculate_unique_content_hash(contents: &str, file_path: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate a unique hash for content including file path and line ranges (octocode-specific)
/// This ensures blocks are reindexed when their position changes in the file
pub fn calculate_content_hash_with_lines(
contents: &str,
file_path: &str,
start_line: usize,
end_line: usize,
) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
hasher.update(start_line.to_string().as_bytes());
hasher.update(end_line.to_string().as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate content hash without file path (octocode-specific)
pub fn calculate_content_hash(contents: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
format!("{:x}", hasher.finalize())
}
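The line-range hash exists precisely so that a moved block rehashes, per the doc comment above. A small check of that property using only the functions defined here:

```rust
fn moved_block_rehashes() {
    let a = calculate_content_hash_with_lines("fn f() {}", "src/lib.rs", 10, 12);
    let b = calculate_content_hash_with_lines("fn f() {}", "src/lib.rs", 20, 22);
    // Same content and path, different position => different hash.
    assert_ne!(a, b);
    // Without positional input, the plain content hash stays stable.
    assert_eq!(
        calculate_content_hash("fn f() {}"),
        calculate_content_hash("fn f() {}")
    );
}
```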