229 changes: 158 additions & 71 deletions Cargo.lock

Large diffs are not rendered by default.

17 changes: 6 additions & 11 deletions Cargo.toml
@@ -23,9 +23,9 @@ exclude = [
]

[features]
default = ["fastembed", "huggingface"]
fastembed = ["dep:fastembed"]
huggingface = ["dep:candle-core", "dep:candle-nn", "dep:candle-transformers", "dep:tokenizers", "dep:hf-hub"]
default = []
fastembed = ["octolib/fastembed"]
huggingface = ["octolib/huggingface"]

# Optimized release profile for static linking
[profile.release]
@@ -74,30 +74,25 @@ clap = { version = "4.5.45", features = ["derive"] }
clap_complete = "4.5.57"
notify = { version = "8.2.0", default-features = false, features = ["crossbeam-channel", "macos_fsevent"] }
notify-debouncer-mini = "0.7.0"
fastembed = { version = "5.0.2", optional = true }
toml = "0.9.5"
lazy_static = "1.5.0"
futures = { version = "0.3.31", default-features = false, features = ["std"] }
globset = { version = "0.4.16", default-features = false }
regex = { version = "1.11.1", default-features = false, features = ["std"] }
dirs = "6.0.0"
# Candle dependencies for HuggingFace support (optional)
candle-core = { version = "0.9.1", optional = true }
candle-nn = { version = "0.9.1", optional = true }
candle-transformers = { version = "0.9.1", optional = true }
tokenizers = { version = "0.21.4", optional = true }
hf-hub = { version = "0.4.3", features = ["tokio"], optional = true }
# EditorConfig parsing and formatting
ec4rs = "1.2.0"
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.20", features = ["env-filter", "json"] }
tracing-appender = "0.2.3"
tiktoken-rs = "0.7.0"
# LSP integration dependencies
lsp-types = "0.97.0"
url = "2.5.4"
dotenvy = "0.15"

# Local dependency on octolib with embedding features
octolib = { path = "../octolib", features = ["fastembed", "huggingface"] }

[profile.dev]
opt-level = 1 # Basic optimizations without slowing compilation too much
debug = true # Keep debug symbols for backtraces
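Note on the `[features]` change above: `fastembed` and `huggingface` no longer pull in `fastembed` and the candle stack directly; they now forward to octolib's features of the same name. A minimal sketch of what that means for code in this crate, assuming the feature names stay exactly as declared in the table above:

```rust
// Compiled only when octocode is built with `--features huggingface`,
// which (per the [features] table above) also enables octolib/huggingface.
#[cfg(feature = "huggingface")]
fn huggingface_enabled() -> bool {
    true
}

#[cfg(not(feature = "huggingface"))]
fn huggingface_enabled() -> bool {
    false
}
```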
268 changes: 122 additions & 146 deletions src/embedding/mod.rs
@@ -12,195 +12,136 @@
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod provider;
#[cfg(test)]
mod tests;
pub mod types;
//! Re-export embedding functionality from octolib and add octocode-specific logic

use crate::config::Config;
use anyhow::Result;
use tiktoken_rs::cl100k_base;

pub use provider::{create_embedding_provider_from_parts, EmbeddingProvider};
pub use types::*;
// Re-export core functionality from octolib::embedding
pub use octolib::embedding::{
count_tokens, create_embedding_provider_from_parts, split_texts_into_token_limited_batches,
truncate_output, EmbeddingProvider, InputType,
};

// Re-export types for backward compatibility
pub use octolib::embedding::types::{parse_provider_model, EmbeddingProviderType};

// Create a types module for backward compatibility
pub mod types {
pub use octolib::embedding::types::*;
}

// Create a provider module for backward compatibility
pub mod provider {
pub use octolib::embedding::provider::*;
}
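A quick sanity sketch of the compatibility shim: because `types` and `provider` are pure re-export modules, the legacy path and the direct octolib path name the same items, so existing imports keep compiling. This assumes the module above lives at `crate::embedding`:

```rust
// Both paths resolve to the same type, so no conversion is needed.
use crate::embedding::types::EmbeddingProviderType as ViaShim;
use octolib::embedding::types::EmbeddingProviderType as Direct;

fn identity(p: ViaShim) -> Direct {
    p // compiles only because the shim is a pure re-export
}
```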

/// Configuration for embedding generation (octocode-specific)
#[derive(Debug, Clone)]
pub struct EmbeddingGenerationConfig {
/// Code embedding model (format: "provider:model")
pub code_model: String,
/// Text embedding model (format: "provider:model")
pub text_model: String,
/// Batch size for embedding generation
pub batch_size: usize,
/// Maximum tokens per batch
pub max_tokens_per_batch: usize,
}

impl Default for EmbeddingGenerationConfig {
fn default() -> Self {
Self {
code_model: "voyage:voyage-code-3".to_string(),
text_model: "voyage:voyage-3.5-lite".to_string(),
batch_size: 16,
max_tokens_per_batch: 100_000,
}
}
}

/// Convert octocode Config to octocode EmbeddingGenerationConfig
impl From<&Config> for EmbeddingGenerationConfig {
fn from(config: &Config) -> Self {
Self {
code_model: config.embedding.code_model.clone(),
text_model: config.embedding.text_model.clone(),
batch_size: config.index.embeddings_batch_size,
max_tokens_per_batch: config.index.embeddings_max_tokens_per_batch,
}
}
}
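A short usage sketch for the config type, mirroring the `split_once(':')` parsing that the wrappers below use on the "provider:model" strings:

```rust
fn default_code_provider() -> (String, String) {
    let cfg = EmbeddingGenerationConfig::default();
    // "voyage:voyage-code-3" splits at the first ':'.
    let (provider, model) = cfg
        .code_model
        .split_once(':')
        .expect("model string must be in provider:model format");
    (provider.to_string(), model.to_string())
}
```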

/// Generate embeddings based on configured provider (supports provider:model format)
/// Compatibility wrapper for octocode Config
pub async fn generate_embeddings(
contents: &str,
is_code: bool,
config: &Config,
) -> Result<Vec<f32>> {
let embedding_config = EmbeddingGenerationConfig::from(config);

// Get the model string from config
let model_string = if is_code {
&config.embedding.code_model
&embedding_config.code_model
} else {
&config.embedding.text_model
&embedding_config.text_model
};

// Parse provider and model from the string
let (provider, model) = parse_provider_model(model_string);

let provider_impl = create_embedding_provider_from_parts(&provider, &model).await?;
provider_impl.generate_embedding(contents).await
}

/// Count tokens in a text using tiktoken (cl100k_base tokenizer)
pub fn count_tokens(text: &str) -> usize {
let bpe = cl100k_base().expect("Failed to load cl100k_base tokenizer");
bpe.encode_with_special_tokens(text).len()
}

/// Truncate output if it exceeds token limit
pub fn truncate_output(output: &str, max_tokens: usize) -> String {
if max_tokens == 0 {
return output.to_string();
}

let token_count = count_tokens(output);

if token_count <= max_tokens {
return output.to_string();
}

// Simple truncation - cut at character boundary
// Estimate roughly where to cut (tokens are ~4 chars average)
let estimated_chars = max_tokens * 3; // Conservative estimate
let truncated = if output.len() > estimated_chars {
&output[..estimated_chars]
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
output
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};

// Find last newline to avoid cutting mid-line
let last_newline = truncated.rfind('\n').unwrap_or(truncated.len());
let final_truncated = &truncated[..last_newline];

format!(
"{}\n\n[Output truncated - {} tokens estimated, max {} allowed. Use more specific queries to reduce output size]",
final_truncated,
token_count,
max_tokens
)
}

/// Split texts into batches respecting both count and token limits
pub fn split_texts_into_token_limited_batches(
texts: Vec<String>,
max_batch_size: usize,
max_tokens_per_batch: usize,
) -> Vec<Vec<String>> {
let mut batches = Vec::new();
let mut current_batch = Vec::new();
let mut current_token_count = 0;

for text in texts {
let text_tokens = count_tokens(&text);

// If adding this text would exceed either limit, start a new batch
if !current_batch.is_empty()
&& (current_batch.len() >= max_batch_size
|| current_token_count + text_tokens > max_tokens_per_batch)
{
batches.push(current_batch);
current_batch = Vec::new();
current_token_count = 0;
}

current_batch.push(text);
current_token_count += text_tokens;
}

// Add the last batch if it's not empty
if !current_batch.is_empty() {
batches.push(current_batch);
}

batches
octolib::embedding::generate_embeddings(contents, provider, model).await
}
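A hypothetical call site for this wrapper; only the signature is taken from the diff, the surrounding setup is assumed:

```rust
use crate::config::Config;
use anyhow::Result;

async fn embed_one(config: &Config) -> Result<()> {
    // `true` selects the code model (config.embedding.code_model).
    let vector = generate_embeddings("fn main() {}", true, config).await?;
    println!("embedding has {} dimensions", vector.len());
    Ok(())
}
```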

/// Generate batch embeddings based on configured provider (supports provider:model format)
/// Now includes token-aware batching and input_type support
/// Compatibility wrapper for octocode Config
pub async fn generate_embeddings_batch(
texts: Vec<String>,
is_code: bool,
config: &Config,
input_type: types::InputType,
input_type: InputType,
) -> Result<Vec<Vec<f32>>> {
let embedding_config = EmbeddingGenerationConfig::from(config);

// Get the model string from config
let model_string = if is_code {
&config.embedding.code_model
&embedding_config.code_model
} else {
&config.embedding.text_model
&embedding_config.text_model
};

// Parse provider and model from the string
let (provider, model) = parse_provider_model(model_string);

let provider_impl = create_embedding_provider_from_parts(&provider, &model).await?;
let (provider, model) = if let Some((p, m)) = model_string.split_once(':') {
(p, m)
} else {
return Err(anyhow::anyhow!("Invalid model format: {}", model_string));
};

// Split texts into token-limited batches
let batches = split_texts_into_token_limited_batches(
octolib::embedding::generate_embeddings_batch(
texts,
config.index.embeddings_batch_size,
config.index.embeddings_max_tokens_per_batch,
);

let mut all_embeddings = Vec::new();

// Process each batch with input_type
for batch in batches {
let batch_embeddings = provider_impl
.generate_embeddings_batch(batch, input_type.clone())
.await?;
all_embeddings.extend(batch_embeddings);
}

Ok(all_embeddings)
}

/// Calculate a unique hash for content including file path
pub fn calculate_unique_content_hash(contents: &str, file_path: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate a unique hash for content including file path and line ranges
/// This ensures blocks are reindexed when their position changes in the file
pub fn calculate_content_hash_with_lines(
contents: &str,
file_path: &str,
start_line: usize,
end_line: usize,
) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
hasher.update(start_line.to_string().as_bytes());
hasher.update(end_line.to_string().as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate content hash without file path
pub fn calculate_content_hash(contents: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
format!("{:x}", hasher.finalize())
provider,
model,
input_type,
embedding_config.batch_size,
embedding_config.max_tokens_per_batch,
)
.await
}
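A hedged batch sketch. Batching and token limits are now handled inside `octolib::embedding::generate_embeddings_batch`, so the caller just passes the limits from config; the `InputType::Document` variant name is an assumption, not confirmed by this diff:

```rust
use crate::config::Config;
use anyhow::Result;

async fn embed_docs(config: &Config, docs: Vec<String>) -> Result<()> {
    // InputType::Document is an assumed variant; substitute whatever
    // octolib::embedding::InputType actually defines.
    let embeddings =
        generate_embeddings_batch(docs, false, config, InputType::Document).await?;
    assert!(embeddings.iter().all(|e| !e.is_empty()));
    Ok(())
}
```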

/// Search mode embeddings result
/// Search mode embeddings result (octocode-specific)
#[derive(Debug, Clone)]
pub struct SearchModeEmbeddings {
pub code_embeddings: Option<Vec<f32>>,
pub text_embeddings: Option<Vec<f32>>,
}

/// Generate embeddings for search based on mode - centralized logic to avoid duplication
/// This ensures consistent behavior across CLI and MCP interfaces
/// Compatibility wrapper for octocode Config (octocode-specific)
pub async fn generate_search_embeddings(
query: &str,
mode: &str,
@@ -226,8 +226,9 @@ pub async fn generate_search_embeddings(
"all" => {
// For "all" mode, check if code and text models are different
// If different, generate separate embeddings; if same, use one set
let code_model = &config.embedding.code_model;
let text_model = &config.embedding.text_model;
let embedding_config = EmbeddingGenerationConfig::from(config);
let code_model = &embedding_config.code_model;
let text_model = &embedding_config.text_model;

if code_model == text_model {
// Same model for both - generate once and reuse
Expand All @@ -252,3 +194,37 @@ pub async fn generate_search_embeddings(
)),
}
}
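Usage sketch for the search wrapper. The mode strings come from the match arms above; the signature is truncated in this diff, so the trailing `config: &Config` parameter is assumed from the function body:

```rust
async fn search_all(config: &crate::config::Config, query: &str) -> anyhow::Result<()> {
    // "all" yields one or two vectors depending on whether the code and
    // text models differ (see the same-model fast path above).
    let result = generate_search_embeddings(query, "all", config).await?;
    if let Some(code) = result.code_embeddings {
        debug_assert!(!code.is_empty()); // query the code index with `code`
    }
    if let Some(text) = result.text_embeddings {
        debug_assert!(!text.is_empty()); // query the text index with `text`
    }
    Ok(())
}
```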

/// Calculate a unique hash for content including file path (octocode-specific)
pub fn calculate_unique_content_hash(contents: &str, file_path: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate a unique hash for content including file path and line ranges (octocode-specific)
/// This ensures blocks are reindexed when their position changes in the file
pub fn calculate_content_hash_with_lines(
contents: &str,
file_path: &str,
start_line: usize,
end_line: usize,
) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
hasher.update(file_path.as_bytes());
hasher.update(start_line.to_string().as_bytes());
hasher.update(end_line.to_string().as_bytes());
format!("{:x}", hasher.finalize())
}

/// Calculate content hash without file path (octocode-specific)
pub fn calculate_content_hash(contents: &str) -> String {
use sha2::{Digest, Sha256};
let mut hasher = Sha256::new();
hasher.update(contents.as_bytes());
format!("{:x}", hasher.finalize())
}
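The line-range hash exists precisely so that a moved block rehashes, per the doc comment above. A small check of that property using only the functions defined here:

```rust
fn moved_block_rehashes() {
    let a = calculate_content_hash_with_lines("fn f() {}", "src/lib.rs", 10, 12);
    let b = calculate_content_hash_with_lines("fn f() {}", "src/lib.rs", 20, 22);
    // Same content and path, different position => different hash.
    assert_ne!(a, b);
    // Without positional input, the plain content hash stays stable.
    assert_eq!(
        calculate_content_hash("fn f() {}"),
        calculate_content_hash("fn f() {}")
    );
}
```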