
Commit 4e179bf

Merge pull request #101 from utilityai/move-simple-to-bin
moved simple to its own binary for easier use + faster compile times
2 parents 75b9a3b + c18e5ba commit 4e179bf

File tree: 8 files changed, +240 −22 lines changed


Cargo.lock

Lines changed: 20 additions & 2 deletions
Generated file; diff not rendered.

Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -3,16 +3,22 @@ resolver = "2"
 members = [
     "llama-cpp-sys-2",
     "llama-cpp-2",
+    "simple",
 ]

 [workspace.dependencies]
+# core library deps
 thiserror = "1"
 tracing = "0.1"
+
+# examples and benchmarks
 hf-hub = { version = "0.3.2" }
 criterion = "0.5.1"
 pprof = "0.13.0"
 bindgen = "0.69.4"
 cc = "1.0.83"
+anyhow = "1.0.80"
+clap = "4.5.1"

 [workspace.lints.rust]
 missing_docs = { level = "warn" }
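
With `simple` registered as a workspace member, the example now builds and runs as a normal binary from the repo root. A minimal smoke test, assuming the submodules are cloned and the workspace builds (clap derives the `--help` output automatically):

```bash
# Build the new workspace member and print its CLI help.
# The `--` separates cargo's own flags from the binary's flags.
cargo run --bin simple -- --help
```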

README.md

Lines changed: 141 additions & 2 deletions
@@ -1,20 +1,159 @@
 # 🦙 [llama-cpp-rs][readme]   [![Docs]][docs.rs] [![Latest Version]][crates.io] [![Lisence]][crates.io]

 [Docs]: https://img.shields.io/docsrs/llama-cpp-2.svg
+
 [Latest Version]: https://img.shields.io/crates/v/llama-cpp-2.svg
+
 [crates.io]: https://crates.io/crates/llama-cpp-2
+
 [docs.rs]: https://docs.rs/llama-cpp-2
+
 [Lisence]: https://img.shields.io/crates/l/llama-cpp-2.svg
+
 [llama-cpp-sys]: https://crates.io/crates/llama-cpp-sys-2
+
 [utilityai]: https://utilityai.ca
+
 [readme]: https://github.com/utilityai/llama-cpp-rs/tree/main/llama-cpp-2

-This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated regularly and in sync with [llama-cpp-2][crates.io].
+This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated regularly
+and in sync with [llama-cpp-2][crates.io].

-This project was created with the explict goal of staying as up to date as possible with llama.cpp, as a result it is dead simple, very close to raw bindings, and does not follow semver meaningfully.
+This project was created with the explict goal of staying as up to date as possible with llama.cpp, as a result it is
+dead simple, very close to raw bindings, and does not follow semver meaningfully.

 Check out the [docs.rs] for crate documentation or the [readme] for high level information about the project.

+## Try it out!
+
+Clone the repo
+
+```bash
+git clone --recursive https://github.com/utilityai/llama-cpp-rs
+```
+
+Enter the directory
+
+```bash
+cd llama-cpp-rs
+```
+
+Run the simple example
+
+```bash
+cargo run --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf
+```
+
+Or if you have a GPU and want to use it
+
+```bash
+cargo run --features cublas --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf
+```
+
+<details>
+<summary>Output</summary>
+<pre>
+ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no
+ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
+ggml_init_cublas: found 1 CUDA devices:
+Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+llama_model_params { n_gpu_layers: 1000, split_mode: 1, main_gpu: 0, tensor_split: 0x0, progress_callback: None, progress_callback_user_data: 0x0, kv_overrides: 0x0, vocab_only: false, use_mmap: true, use_mlock: false }
+llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf (version GGUF V2)
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv 0: general.architecture str = llama
+llama_model_loader: - kv 1: general.name str = LLaMA v2
+llama_model_loader: - kv 2: llama.context_length u32 = 4096
+llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
+llama_model_loader: - kv 4: llama.block_count u32 = 32
+llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008
+llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
+llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
+llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32
+llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
+llama_model_loader: - kv 10: general.file_type u32 = 15
+llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
+llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
+llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
+llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1
+llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2
+llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0
+llama_model_loader: - kv 18: general.quantization_version u32 = 2
+llama_model_loader: - type f32: 65 tensors
+llama_model_loader: - type q4_K: 193 tensors
+llama_model_loader: - type q6_K: 33 tensors
+llm_load_vocab: special tokens definition check successful ( 259/32000 ).
+llm_load_print_meta: format = GGUF V2
+llm_load_print_meta: arch = llama
+llm_load_print_meta: vocab type = SPM
+llm_load_print_meta: n_vocab = 32000
+llm_load_print_meta: n_merges = 0
+llm_load_print_meta: n_ctx_train = 4096
+llm_load_print_meta: n_embd = 4096
+llm_load_print_meta: n_head = 32
+llm_load_print_meta: n_head_kv = 32
+llm_load_print_meta: n_layer = 32
+llm_load_print_meta: n_rot = 128
+llm_load_print_meta: n_embd_head_k = 128
+llm_load_print_meta: n_embd_head_v = 128
+llm_load_print_meta: n_gqa = 1
+llm_load_print_meta: n_embd_k_gqa = 4096
+llm_load_print_meta: n_embd_v_gqa = 4096
+llm_load_print_meta: f_norm_eps = 0.0e+00
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: f_clamp_kqv = 0.0e+00
+llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+llm_load_print_meta: n_ff = 11008
+llm_load_print_meta: n_expert = 0
+llm_load_print_meta: n_expert_used = 0
+llm_load_print_meta: rope scaling = linear
+llm_load_print_meta: freq_base_train = 10000.0
+llm_load_print_meta: freq_scale_train = 1
+llm_load_print_meta: n_yarn_orig_ctx = 4096
+llm_load_print_meta: rope_finetuned = unknown
+llm_load_print_meta: model type = 7B
+llm_load_print_meta: model ftype = Q4_K - Medium
+llm_load_print_meta: model params = 6.74 B
+llm_load_print_meta: model size = 3.80 GiB (4.84 BPW)
+llm_load_print_meta: general.name = LLaMA v2
+llm_load_print_meta: BOS token = 1 '<s>'
+llm_load_print_meta: EOS token = 2 '</s>'
+llm_load_print_meta: UNK token = 0 '<unk>'
+llm_load_print_meta: LF token = 13 '<0x0A>'
+llm_load_tensors: ggml ctx size = 0.22 MiB
+llm_load_tensors: offloading 32 repeating layers to GPU
+llm_load_tensors: offloading non-repeating layers to GPU
+llm_load_tensors: offloaded 33/33 layers to GPU
+llm_load_tensors: CUDA0 buffer size = 3820.94 MiB
+llm_load_tensors: CPU buffer size = 70.31 MiB
+..................................................................................................
+Loaded "/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf"
+llama_new_context_with_model: n_ctx = 2048
+llama_new_context_with_model: freq_base = 10000.0
+llama_new_context_with_model: freq_scale = 1
+llama_kv_cache_init: CUDA0 KV buffer size = 1024.00 MiB
+llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB
+llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB
+ggml_gallocr_reserve_n: reallocating CUDA0 buffer from size 0.00 MiB to 164.01 MiB
+ggml_gallocr_reserve_n: reallocating CUDA_Host buffer from size 0.00 MiB to 8.00 MiB
+llama_new_context_with_model: CUDA0 compute buffer size = 164.01 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 8.00 MiB
+llama_new_context_with_model: graph splits (measure): 3
+n_len = 32, n_ctx = 2048, k_kv_req = 32
+
+The way to kill a linux process is to send it a SIGKILL signal.
+The way to kill a windows process is to send it a S
+
+decoded 24 tokens in 0.23 s, speed 105.65 t/s
+
+load time = 727.50 ms
+sample time = 0.46 ms / 24 runs (0.02 ms per token, 51835.85 tokens per second)
+prompt eval time = 68.52 ms / 9 tokens (7.61 ms per token, 131.35 tokens per second)
+eval time = 225.70 ms / 24 runs (9.40 ms per token, 106.34 tokens per second)
+total time = 954.18 ms
+</pre>
+</details>
+
 ## Hacking

 Ensure that when you clone this project you also clone the submodules. This can be done with the following command:

llama-cpp-2/Cargo.toml

Lines changed: 0 additions & 4 deletions
@@ -18,10 +18,6 @@ hf-hub = { workspace = true }
 criterion = { workspace = true }
 pprof = { workspace = true, features = ["criterion", "flamegraph"] }

-# used in examples
-clap = { version = "4.5.1", features = ["derive"] }
-anyhow = "1.0.80"
-
 [[bench]]
 name = "grammar_bias"
 harness = false
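
With `clap` and `anyhow` dropped from the library crate's dev-dependencies, a library-only build pulls in fewer crates. A hedged way to check this locally (run from the workspace root; output not shown):

```bash
# Build only the library crate, then confirm clap and anyhow no longer
# appear anywhere in its dependency tree.
cargo build -p llama-cpp-2
cargo tree -p llama-cpp-2 | grep -E 'clap|anyhow' || echo "not present"
```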

llama-cpp-sys-2/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ include = [

 [build-dependencies]
 bindgen = { workspace = true }
-cc = { workspace = true }
+cc = { workspace = true, features = ["parallel"] }

 [features]
 cublas = []
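
The `parallel` feature of the `cc` crate lets the build script compile llama.cpp's C/C++ sources across cores, which presumably contributes to the "faster compile times" mentioned in the commit message. A rough, hedged way to see the effect on your machine (timings will vary):

```bash
# Time a clean release build of the sys crate; with cc's parallel feature
# the C/C++ objects should be compiled concurrently.
cargo clean -p llama-cpp-sys-2
time cargo build -p llama-cpp-sys-2 --release
```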

simple/Cargo.toml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+[package]
+name = "simple"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+llama-cpp-2 = { path = "../llama-cpp-2", version = "0.1.28" }
+hf-hub = { workspace = true }
+clap = { workspace = true , features = ["derive"] }
+anyhow = { workspace = true }
+
+[features]
+cublas = ["llama-cpp-2/cublas"]
+
+[lints]
+workspace = true
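
Note that the binary inherits its dependency versions from `[workspace.dependencies]` via `workspace = true`, and forwards its `cublas` feature to `llama-cpp-2`. A hedged example of building just this package with GPU support enabled:

```bash
# Enabling cublas on the simple package turns on llama-cpp-2/cublas,
# as declared in the [features] table above.
cargo build -p simple --release --features cublas
```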

llama-cpp-2/examples/simple.rs renamed to simple/src/main.rs

Lines changed: 52 additions & 11 deletions
@@ -1,4 +1,4 @@
-//! This is an translation of simple.cpp in llama.cpp using llama-cpp-2.
+//! This is a translation of simple.cpp in llama.cpp using llama-cpp-2.
 #![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]

 use anyhow::{bail, Context, Result};
@@ -15,42 +15,79 @@ use std::io::Write;
 use std::num::NonZeroU32;
 use std::path::PathBuf;
 use std::time::Duration;
+use hf_hub::api::sync::ApiBuilder;

-#[derive(clap::Parser)]
+#[derive(clap::Parser, Debug, Clone)]
 struct Args {
     /// The path to the model
-    model_path: PathBuf,
+    #[command(subcommand)]
+    model: Model,
     /// The prompt
     #[clap(default_value = "Hello my name is")]
     prompt: String,
+    /// set the length of the prompt + output in tokens
+    #[arg(long, default_value_t = 32)]
+    n_len: i32,
     /// Disable offloading layers to the gpu
     #[cfg(feature = "cublas")]
     #[clap(long)]
     disable_gpu: bool,
 }

+#[derive(clap::Subcommand, Debug, Clone)]
+enum Model {
+    /// Use an already downloaded model
+    Local {
+        /// The path to the model. e.g. `/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/blobs/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa`
+        path: PathBuf,
+    },
+    /// Download a model from huggingface (or use a cached version)
+    #[clap(name = "hf-model")]
+    HuggingFace {
+        /// the repo containing the model. e.g. `TheBloke/Llama-2-7B-Chat-GGUF`
+        repo: String,
+        /// the model name. e.g. `llama-2-7b-chat.Q4_K_M.gguf`
+        model: String,
+    },
+}
+
+impl Model {
+    /// Convert the model to a path - may download from huggingface
+    fn to_path(self) -> Result<PathBuf> {
+        match self {
+            Model::Local { path } => Ok(path),
+            Model::HuggingFace { model, repo } => ApiBuilder::new()
+                .with_progress(true)
+                .build()
+                .with_context(|| "unable to create huggingface api")?
+                .model(repo)
+                .get(&model)
+                .with_context(|| "unable to download model")
+        }
+    }
+}
+
 fn main() -> Result<()> {
-    let params = Args::parse();
+    let Args { n_len, model, prompt, #[cfg(feature = "cublas")] disable_gpu } = Args::parse();

     // init LLM
     let backend = LlamaBackend::init()?;

-    // total length of the sequence including the prompt
-    let n_len: i32 = 32;
-
     // offload all layers to the gpu
     let model_params = {
         #[cfg(feature = "cublas")]
-        if !params.disable_gpu {
+        if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
         } else {
             LlamaModelParams::default()
         }
         #[cfg(not(feature = "cublas"))]
         LlamaModelParams::default()
     };
+
+    let model_path = model.to_path().with_context(|| "failed to get model from args")?;

-    let model = LlamaModel::load_from_file(&backend, params.model_path, &model_params)
+    let model = LlamaModel::load_from_file(&backend, model_path, &model_params)
         .with_context(|| "unable to load model")?;

     // initialize the context
@@ -65,8 +102,8 @@ fn main() -> Result<()> {
     // tokenize the prompt

     let tokens_list = model
-        .str_to_token(&params.prompt, AddBos::Always)
-        .with_context(|| format!("failed to tokenize {}", params.prompt))?;
+        .str_to_token(&prompt, AddBos::Always)
+        .with_context(|| format!("failed to tokenize {}", prompt))?;

     let n_cxt = ctx.n_ctx() as i32;
     let n_kv_req = tokens_list.len() as i32 + (n_len - tokens_list.len() as i32);
@@ -81,6 +118,10 @@ either reduce n_len or increase n_ctx"
         )
     }

+    if tokens_list.len() >= usize::try_from(n_len)? {
+        bail!("the prompt is too long, it has more tokens than n_len")
+    }
+
     // print the prompt token-by-token
     eprintln!();


Comments
 (0)
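
The new `Model` subcommand means the binary accepts either a local GGUF path or a Hugging Face repo; the README above only demonstrates `hf-model`. A hedged invocation of the local variant (the `local` subcommand name and flag placement are inferred from the clap derive above; the model path is hypothetical):

```bash
# Run against an already-downloaded GGUF file; --n-len caps prompt + output tokens.
# The `--` keeps cargo from parsing --n-len itself.
cargo run --release --bin simple -- \
  --n-len 64 \
  "The way to kill a linux process is" \
  local /path/to/llama-2-7b.Q4_K_M.gguf
```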