Skip to content

Commit 548f036

Browse files
Authored: Merge pull request #137 from utilityai/sample-rep — added `sample_repetition_penalty`
2 parents 14b8187 + 4c2cd79 commit 548f036

File tree

12 files changed

+228
-69
lines changed

12 files changed

+228
-69
lines changed

embeddings/src/main.rs

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,27 @@
11
//! This is a translation of embedding.cpp in llama.cpp using llama-cpp-2.
22
#![allow(
3-
clippy::cast_possible_wrap,
4-
clippy::cast_possible_truncation,
5-
clippy::cast_precision_loss,
6-
clippy::cast_sign_loss
3+
clippy::cast_possible_wrap,
4+
clippy::cast_possible_truncation,
5+
clippy::cast_precision_loss,
6+
clippy::cast_sign_loss
77
)]
88

99
use std::io::Write;
1010
use std::path::PathBuf;
11-
use std::str::FromStr;
1211
use std::time::Duration;
1312

1413
use anyhow::{bail, Context, Result};
1514
use clap::Parser;
1615
use hf_hub::api::sync::ApiBuilder;
1716

18-
use llama_cpp_2::context::LlamaContext;
1917
use llama_cpp_2::context::params::LlamaContextParams;
18+
use llama_cpp_2::context::LlamaContext;
2019
use llama_cpp_2::ggml_time_us;
2120
use llama_cpp_2::llama_backend::LlamaBackend;
2221
use llama_cpp_2::llama_batch::LlamaBatch;
22+
use llama_cpp_2::model::params::LlamaModelParams;
2323
use llama_cpp_2::model::AddBos;
2424
use llama_cpp_2::model::LlamaModel;
25-
use llama_cpp_2::model::params::LlamaModelParams;
2625

2726
#[derive(clap::Parser, Debug, Clone)]
2827
struct Args {
@@ -41,7 +40,6 @@ struct Args {
4140
disable_gpu: bool,
4241
}
4342

44-
4543
#[derive(clap::Subcommand, Debug, Clone)]
4644
enum Model {
4745
/// Use an already downloaded model
@@ -119,7 +117,8 @@ fn main() -> Result<()> {
119117
let prompt_lines = prompt.lines();
120118

121119
// tokenize the prompt
122-
let tokens_lines_list = prompt_lines.map(|line| model.str_to_token(&line, AddBos::Always))
120+
let tokens_lines_list = prompt_lines
121+
.map(|line| model.str_to_token(line, AddBos::Always))
123122
.collect::<Result<Vec<_>, _>>()
124123
.with_context(|| format!("failed to tokenize {prompt}"))?;
125124

@@ -140,7 +139,7 @@ fn main() -> Result<()> {
140139
for token in token_line {
141140
eprintln!(" {} --> {}", token, model.token_to_str(*token)?);
142141
}
143-
eprintln!()
142+
eprintln!();
144143
}
145144

146145
std::io::stderr().flush()?;
@@ -157,15 +156,27 @@ fn main() -> Result<()> {
157156
for tokens in &tokens_lines_list {
158157
// Flush the batch if the next prompt would exceed our batch size
159158
if (batch.n_tokens() as usize + tokens.len()) > n_ctx {
160-
batch_decode(&mut ctx, &mut batch, max_seq_id_batch, &mut output, normalise)?;
159+
batch_decode(
160+
&mut ctx,
161+
&mut batch,
162+
max_seq_id_batch,
163+
&mut output,
164+
normalise,
165+
)?;
161166
max_seq_id_batch = 0;
162167
}
163168

164-
batch.add_sequence(&tokens, max_seq_id_batch, false)?;
169+
batch.add_sequence(tokens, max_seq_id_batch, false)?;
165170
max_seq_id_batch += 1;
166171
}
167172
// Handle final batch
168-
batch_decode(&mut ctx, &mut batch, max_seq_id_batch, &mut output, normalise)?;
173+
batch_decode(
174+
&mut ctx,
175+
&mut batch,
176+
max_seq_id_batch,
177+
&mut output,
178+
normalise,
179+
)?;
169180

170181
let t_main_end = ggml_time_us();
171182

@@ -175,7 +186,7 @@ fn main() -> Result<()> {
175186
}
176187

177188
let duration = Duration::from_micros((t_main_end - t_main_start) as u64);
178-
let total_tokens: usize = tokens_lines_list.iter().map(|v| v.len()).sum();
189+
let total_tokens: usize = tokens_lines_list.iter().map(Vec::len).sum();
179190
eprintln!(
180191
"Created embeddings for {} tokens in {:.2} s, speed {:.2} t/s\n",
181192
total_tokens,
@@ -188,12 +199,20 @@ fn main() -> Result<()> {
188199
Ok(())
189200
}
190201

191-
fn batch_decode(ctx: &mut LlamaContext, batch: &mut LlamaBatch, s_batch: i32, output: &mut Vec<Vec<f32>>, normalise: bool) -> Result<()> {
202+
fn batch_decode(
203+
ctx: &mut LlamaContext,
204+
batch: &mut LlamaBatch,
205+
s_batch: i32,
206+
output: &mut Vec<Vec<f32>>,
207+
normalise: bool,
208+
) -> Result<()> {
192209
ctx.clear_kv_cache();
193210
ctx.decode(batch).with_context(|| "llama_decode() failed")?;
194211

195212
for i in 0..s_batch {
196-
let embedding = ctx.embeddings_seq_ith(i).with_context(|| "Failed to get embeddings")?;
213+
let embedding = ctx
214+
.embeddings_seq_ith(i)
215+
.with_context(|| "Failed to get embeddings")?;
197216
let output_embeddings = if normalise {
198217
normalize(embedding)
199218
} else {
@@ -209,7 +228,10 @@ fn batch_decode(ctx: &mut LlamaContext, batch: &mut LlamaBatch, s_batch: i32, ou
209228
}
210229

211230
fn normalize(input: &[f32]) -> Vec<f32> {
212-
let magnitude = input.iter().fold(0.0, |acc, &val| val.mul_add(val, acc)).sqrt();
231+
let magnitude = input
232+
.iter()
233+
.fold(0.0, |acc, &val| val.mul_add(val, acc))
234+
.sqrt();
213235

214236
input.iter().map(|&val| val / magnitude).collect()
215237
}

llama-cpp-2/src/context.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,19 +95,26 @@ impl<'model> LlamaContext<'model> {
9595
/// - When the current context was constructed without enabling embeddings.
9696
/// - If the current model had a pooling type of [`llama_cpp_sys_2::LLAMA_POOLING_TYPE_NONE`]
9797
/// - If the given sequence index exceeds the max sequence id.
98+
///
99+
/// # Panics
100+
///
101+
/// * `n_embd` does not fit into a usize
98102
pub fn embeddings_seq_ith(&self, i: i32) -> Result<&[f32], EmbeddingsError> {
99103
if !self.embeddings_enabled {
100104
return Err(EmbeddingsError::NotEnabled);
101105
}
102106

107+
let n_embd =
108+
usize::try_from(self.model.n_embd()).expect("n_embd does not fit into a usize");
109+
103110
unsafe {
104111
let embedding = llama_cpp_sys_2::llama_get_embeddings_seq(self.context.as_ptr(), i);
105112

106113
// Technically also possible whenever `i >= max(batch.n_seq)`, but can't check that here.
107114
if embedding.is_null() {
108115
Err(EmbeddingsError::NonePoolType)
109116
} else {
110-
Ok(std::slice::from_raw_parts(embedding, self.model.n_embd() as usize))
117+
Ok(slice::from_raw_parts(embedding, n_embd))
111118
}
112119
}
113120
}
@@ -124,18 +131,25 @@ impl<'model> LlamaContext<'model> {
124131
/// - When the current context was constructed without enabling embeddings.
125132
/// - When the given token didn't have logits enabled when it was passed.
126133
/// - If the given token index exceeds the max token id.
134+
///
135+
/// # Panics
136+
///
137+
/// * `n_embd` does not fit into a usize
127138
pub fn embeddings_ith(&self, i: i32) -> Result<&[f32], EmbeddingsError> {
128139
if !self.embeddings_enabled {
129140
return Err(EmbeddingsError::NotEnabled);
130141
}
131142

143+
let n_embd =
144+
usize::try_from(self.model.n_embd()).expect("n_embd does not fit into a usize");
145+
132146
unsafe {
133147
let embedding = llama_cpp_sys_2::llama_get_embeddings_ith(self.context.as_ptr(), i);
134148
// Technically also possible whenever `i >= batch.n_tokens`, but no good way of checking `n_tokens` here.
135149
if embedding.is_null() {
136150
Err(EmbeddingsError::LogitsNotEnabled)
137151
} else {
138-
Ok(std::slice::from_raw_parts(embedding, self.model.n_embd() as usize))
152+
Ok(slice::from_raw_parts(embedding, n_embd))
139153
}
140154
}
141155
}

llama-cpp-2/src/context/kv_cache.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ impl LlamaContext<'_> {
6767
unsafe { llama_cpp_sys_2::llama_kv_cache_seq_keep(self.context.as_ptr(), seq_id) }
6868
}
6969

70+
#[allow(clippy::doc_markdown)]
7071
/// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
7172
/// If the KV cache is RoPEd, the KV data is updated accordingly:
7273
/// - lazily on next [`LlamaContext::decode`]
@@ -212,9 +213,9 @@ impl<'a> KVCacheView<'a> {
212213
}
213214

214215
/// Information for individual cells.
215-
///
216+
///
216217
/// # Panics
217-
///
218+
///
218219
/// - if `n_cells` does not fit into usize.
219220
pub fn cells(&self) -> impl Iterator<Item = KVCacheViewCell> {
220221
unsafe {
@@ -228,9 +229,9 @@ impl<'a> KVCacheView<'a> {
228229
}
229230

230231
/// The sequences for each cell. There will be `n_max_seq` items per cell.
231-
///
232+
///
232233
/// # Panics
233-
///
234+
///
234235
/// - if `n_cells * n_max_seq` does not fit into usize.
235236
/// - if `n_max_seq` does not fit into usize.
236237
pub fn cells_sequences(&self) -> impl Iterator<Item = &[llama_cpp_sys_2::llama_seq_id]> {

llama-cpp-2/src/context/sample.rs

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,24 @@ use crate::context::LlamaContext;
44
use crate::grammar::LlamaGrammar;
55
use crate::token::data_array::LlamaTokenDataArray;
66
use crate::token::LlamaToken;
7-
use llama_cpp_sys_2::llama_context;
87

98
/// struct to hold params for sampling
109
#[derive(Debug)]
11-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
10+
#[deprecated(
11+
since = "0.1.32",
12+
note = "this does not scale well with many params and does not allow for changing of orders."
13+
)]
1214
pub struct Sampler<'grammar> {
1315
token_data_array: LlamaTokenDataArray,
1416
grammar: Option<&'grammar mut LlamaGrammar>,
1517
temperature: Option<f32>,
1618
}
1719

1820
impl<'grammar> Sampler<'grammar> {
19-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
21+
#[deprecated(
22+
since = "0.1.32",
23+
note = "this does not scale well with many params and does not allow for changing of orders."
24+
)]
2025
fn sample(self, llama_context: &mut LlamaContext) -> LlamaToken {
2126
match self {
2227
Sampler {
@@ -60,7 +65,10 @@ impl<'grammar> Sampler<'grammar> {
6065

6166
/// Create a new sampler.
6267
#[must_use]
63-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
68+
#[deprecated(
69+
since = "0.1.32",
70+
note = "this does not scale well with many params and does not allow for changing of orders."
71+
)]
6472
pub fn new(llama_token_data_array: LlamaTokenDataArray) -> Self {
6573
Self {
6674
token_data_array: llama_token_data_array,
@@ -71,7 +79,10 @@ impl<'grammar> Sampler<'grammar> {
7179

7280
/// Set the grammar for sampling.
7381
#[must_use]
74-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
82+
#[deprecated(
83+
since = "0.1.32",
84+
note = "this does not scale well with many params and does not allow for changing of orders."
85+
)]
7586
pub fn with_grammar(mut self, grammar: &'grammar mut LlamaGrammar) -> Self {
7687
self.grammar = Some(grammar);
7788
self
@@ -91,7 +102,10 @@ impl<'grammar> Sampler<'grammar> {
91102
/// .with_temperature(0.5);
92103
/// ```
93104
#[must_use]
94-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
105+
#[deprecated(
106+
since = "0.1.32",
107+
note = "this does not scale well with many params and does not allow for changing of orders."
108+
)]
95109
pub fn with_temperature(mut self, temperature: f32) -> Self {
96110
if temperature == 0.0 {
97111
return self;
@@ -107,7 +121,10 @@ impl LlamaContext<'_> {
107121
/// # Panics
108122
///
109123
/// - sampler contains no tokens
110-
#[deprecated(since = "0.1.32", note = "this does not scale well with many params and does not allow for changing of orders.")]
124+
#[deprecated(
125+
since = "0.1.32",
126+
note = "this does not scale well with many params and does not allow for changing of orders."
127+
)]
111128
pub fn sample(&mut self, sampler: Sampler) -> LlamaToken {
112129
sampler.sample(self)
113130
}
@@ -157,7 +174,7 @@ impl LlamaContext<'_> {
157174
if temperature == 0.0 {
158175
return;
159176
}
160-
let ctx: *mut llama_context = self.context.as_ptr();
177+
let ctx: *mut llama_cpp_sys_2::llama_context = self.context.as_ptr();
161178
unsafe {
162179
token_data.modify_as_c_llama_token_data_array(|c_llama_token_data_array| {
163180
llama_cpp_sys_2::llama_sample_temp(ctx, c_llama_token_data_array, temperature);
@@ -254,4 +271,24 @@ impl LlamaContext<'_> {
254271
});
255272
}
256273
}
274+
275+
/// See [`LlamaTokenDataArray::sample_repetition_penalty`]
276+
pub fn sample_repetition_penalty(
277+
&mut self,
278+
token_data: &mut LlamaTokenDataArray,
279+
last_tokens: &[LlamaToken],
280+
penalty_last_n: usize,
281+
penalty_repeat: f32,
282+
penalty_freq: f32,
283+
penalty_present: f32,
284+
) {
285+
token_data.sample_repetition_penalty(
286+
Some(self),
287+
last_tokens,
288+
penalty_last_n,
289+
penalty_repeat,
290+
penalty_freq,
291+
penalty_present,
292+
);
293+
}
257294
}

llama-cpp-2/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ pub enum LLamaCppError {
5252
/// There was an error adding a token to a batch.
5353
#[error["{0}"]]
5454
BatchAddError(#[from] BatchAddError),
55+
/// see [`EmbeddingsError`]
5556
#[error(transparent)]
5657
EmbeddingError(#[from] EmbeddingsError),
5758
}
@@ -81,10 +82,13 @@ pub enum DecodeError {
8182
/// When embedding related functions fail
8283
#[derive(Debug, Eq, PartialEq, thiserror::Error)]
8384
pub enum EmbeddingsError {
85+
/// Embeddings weren't enabled in the context options
8486
#[error("Embeddings weren't enabled in the context options")]
8587
NotEnabled,
88+
/// Logits weren't enabled for the given token
8689
#[error("Logits were not enabled for the given token")]
8790
LogitsNotEnabled,
91+
/// The given sequence index exceeds the max sequence id
8892
#[error("Can't use sequence embeddings with a model supporting only LLAMA_POOLING_TYPE_NONE")]
8993
NonePoolType,
9094
}

llama-cpp-2/src/llama_backend.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
//! Representation of an initialized llama backend
22
33
use crate::LLamaCppError;
4+
use llama_cpp_sys_2::ggml_log_level;
45
use std::sync::atomic::AtomicBool;
56
use std::sync::atomic::Ordering::SeqCst;
6-
use llama_cpp_sys_2::ggml_log_level;
77

88
/// Representation of an initialized llama backend
99
/// This is required as a parameter for most llama functions as the backend must be initialized
@@ -76,10 +76,11 @@ impl LlamaBackend {
7676
_level: ggml_log_level,
7777
_text: *const ::std::os::raw::c_char,
7878
_user_data: *mut ::std::os::raw::c_void,
79-
) {}
79+
) {
80+
}
8081

8182
unsafe {
82-
llama_cpp_sys_2::llama_log_set(Some(void_log), std::ptr::null_mut())
83+
llama_cpp_sys_2::llama_log_set(Some(void_log), std::ptr::null_mut());
8384
}
8485
}
8586
}

0 commit comments

Comments (0)