
Commit 4e179bf

Merge pull request #101 from utilityai/move-simple-to-bin
moved simple to its own binary for easier use + faster compile times
2 parents 75b9a3b + c18e5ba commit 4e179bf

File tree: 8 files changed, +240 −22 lines changed


Cargo.lock

Lines changed: 20 additions & 2 deletions
Generated file; diff not rendered.

Cargo.toml

Lines changed: 6 additions & 0 deletions
@@ -3,16 +3,22 @@ resolver = "2"
 members = [
     "llama-cpp-sys-2",
     "llama-cpp-2",
+    "simple",
 ]

 [workspace.dependencies]
+# core library deps
 thiserror = "1"
 tracing = "0.1"
+
+# examples and benchmarks
 hf-hub = { version = "0.3.2" }
 criterion = "0.5.1"
 pprof = "0.13.0"
 bindgen = "0.69.4"
 cc = "1.0.83"
+anyhow = "1.0.80"
+clap = "4.5.1"

 [workspace.lints.rust]
 missing_docs = { level = "warn" }
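
With `simple` registered as a workspace member, the example now builds and runs as a normal binary from the repo root. A minimal smoke test, assuming the submodules are cloned and the workspace builds (clap derives the `--help` output automatically):

```bash
# Build the new workspace member and print its CLI help.
# The `--` separates cargo's own flags from the binary's flags.
cargo run --bin simple -- --help
```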

README.md

Lines changed: 141 additions & 2 deletions
@@ -1,20 +1,159 @@
 # 🦙 [llama-cpp-rs][readme]   [![Docs]][docs.rs] [![Latest Version]][crates.io] [![Lisence]][crates.io]

 [Docs]: https://img.shields.io/docsrs/llama-cpp-2.svg
+
 [Latest Version]: https://img.shields.io/crates/v/llama-cpp-2.svg
+
 [crates.io]: https://crates.io/crates/llama-cpp-2
+
 [docs.rs]: https://docs.rs/llama-cpp-2
+
 [Lisence]: https://img.shields.io/crates/l/llama-cpp-2.svg
+
 [llama-cpp-sys]: https://crates.io/crates/llama-cpp-sys-2
+
 [utilityai]: https://utilityai.ca
+
 [readme]: https://github.com/utilityai/llama-cpp-rs/tree/main/llama-cpp-2

-This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated regularly and in sync with [llama-cpp-2][crates.io].
+This is the home for [llama-cpp-2][crates.io]. It also contains the [llama-cpp-sys] bindings which are updated regularly
+and in sync with [llama-cpp-2][crates.io].

-This project was created with the explict goal of staying as up to date as possible with llama.cpp, as a result it is dead simple, very close to raw bindings, and does not follow semver meaningfully.
+This project was created with the explict goal of staying as up to date as possible with llama.cpp, as a result it is
+dead simple, very close to raw bindings, and does not follow semver meaningfully.

 Check out the [docs.rs] for crate documentation or the [readme] for high level information about the project.

+## Try it out!
+
+Clone the repo
+
+```bash
+git clone --recursive https://github.com/utilityai/llama-cpp-rs
+```
+
+Enter the directory
+
+```bash
+cd llama-cpp-rs
+```
+
+Run the simple example
+
+```bash
+cargo run --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf
+```
+
+Or if you have a GPU and want to use it
+
+```bash
+cargo run --features cublas --release --bin simple "The way to kill a linux process is" hf-model TheBloke/Llama-2-7B-GGUF llama-2-7b.Q4_K_M.gguf
+```
+
+<details>
+<summary>Output</summary>
+<pre>
+ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no
+ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
+ggml_init_cublas: found 1 CUDA devices:
+Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+llama_model_params { n_gpu_layers: 1000, split_mode: 1, main_gpu: 0, tensor_split: 0x0, progress_callback: None, progress_callback_user_data: 0x0, kv_overrides: 0x0, vocab_only: false, use_mmap: true, use_mlock: false }
+llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf (version GGUF V2)
+llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+llama_model_loader: - kv 0: general.architecture str = llama
+llama_model_loader: - kv 1: general.name str = LLaMA v2
+llama_model_loader: - kv 2: llama.context_length u32 = 4096
+llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
+llama_model_loader: - kv 4: llama.block_count u32 = 32
+llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008
+llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
+llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
+llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32
+llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
+llama_model_loader: - kv 10: general.file_type u32 = 15
+llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
+llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
+llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
+llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
+llama_model_loader: - kv 15: tokenizer.ggml.bos_token_id u32 = 1
+llama_model_loader: - kv 16: tokenizer.ggml.eos_token_id u32 = 2
+llama_model_loader: - kv 17: tokenizer.ggml.unknown_token_id u32 = 0
+llama_model_loader: - kv 18: general.quantization_version u32 = 2
+llama_model_loader: - type f32: 65 tensors
+llama_model_loader: - type q4_K: 193 tensors
+llama_model_loader: - type q6_K: 33 tensors
+llm_load_vocab: special tokens definition check successful ( 259/32000 ).
+llm_load_print_meta: format = GGUF V2
+llm_load_print_meta: arch = llama
+llm_load_print_meta: vocab type = SPM
+llm_load_print_meta: n_vocab = 32000
+llm_load_print_meta: n_merges = 0
+llm_load_print_meta: n_ctx_train = 4096
+llm_load_print_meta: n_embd = 4096
+llm_load_print_meta: n_head = 32
+llm_load_print_meta: n_head_kv = 32
+llm_load_print_meta: n_layer = 32
+llm_load_print_meta: n_rot = 128
+llm_load_print_meta: n_embd_head_k = 128
+llm_load_print_meta: n_embd_head_v = 128
+llm_load_print_meta: n_gqa = 1
+llm_load_print_meta: n_embd_k_gqa = 4096
+llm_load_print_meta: n_embd_v_gqa = 4096
+llm_load_print_meta: f_norm_eps = 0.0e+00
+llm_load_print_meta: f_norm_rms_eps = 1.0e-05
+llm_load_print_meta: f_clamp_kqv = 0.0e+00
+llm_load_print_meta: f_max_alibi_bias = 0.0e+00
+llm_load_print_meta: n_ff = 11008
+llm_load_print_meta: n_expert = 0
+llm_load_print_meta: n_expert_used = 0
+llm_load_print_meta: rope scaling = linear
+llm_load_print_meta: freq_base_train = 10000.0
+llm_load_print_meta: freq_scale_train = 1
+llm_load_print_meta: n_yarn_orig_ctx = 4096
+llm_load_print_meta: rope_finetuned = unknown
+llm_load_print_meta: model type = 7B
+llm_load_print_meta: model ftype = Q4_K - Medium
+llm_load_print_meta: model params = 6.74 B
+llm_load_print_meta: model size = 3.80 GiB (4.84 BPW)
+llm_load_print_meta: general.name = LLaMA v2
+llm_load_print_meta: BOS token = 1 '<s>'
+llm_load_print_meta: EOS token = 2 '</s>'
+llm_load_print_meta: UNK token = 0 '<unk>'
+llm_load_print_meta: LF token = 13 '<0x0A>'
+llm_load_tensors: ggml ctx size = 0.22 MiB
+llm_load_tensors: offloading 32 repeating layers to GPU
+llm_load_tensors: offloading non-repeating layers to GPU
+llm_load_tensors: offloaded 33/33 layers to GPU
+llm_load_tensors: CUDA0 buffer size = 3820.94 MiB
+llm_load_tensors: CPU buffer size = 70.31 MiB
+..................................................................................................
+Loaded "/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-GGUF/snapshots/b4e04e128f421c93a5f1e34ac4d7ca9b0af47b80/llama-2-7b.Q4_K_M.gguf"
+llama_new_context_with_model: n_ctx = 2048
+llama_new_context_with_model: freq_base = 10000.0
+llama_new_context_with_model: freq_scale = 1
+llama_kv_cache_init: CUDA0 KV buffer size = 1024.00 MiB
+llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB
+llama_new_context_with_model: CUDA_Host input buffer size = 13.02 MiB
+ggml_gallocr_reserve_n: reallocating CUDA0 buffer from size 0.00 MiB to 164.01 MiB
+ggml_gallocr_reserve_n: reallocating CUDA_Host buffer from size 0.00 MiB to 8.00 MiB
+llama_new_context_with_model: CUDA0 compute buffer size = 164.01 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 8.00 MiB
+llama_new_context_with_model: graph splits (measure): 3
+n_len = 32, n_ctx = 2048, k_kv_req = 32
+
+The way to kill a linux process is to send it a SIGKILL signal.
+The way to kill a windows process is to send it a S
+
+decoded 24 tokens in 0.23 s, speed 105.65 t/s
+
+load time = 727.50 ms
+sample time = 0.46 ms / 24 runs (0.02 ms per token, 51835.85 tokens per second)
+prompt eval time = 68.52 ms / 9 tokens (7.61 ms per token, 131.35 tokens per second)
+eval time = 225.70 ms / 24 runs (9.40 ms per token, 106.34 tokens per second)
+total time = 954.18 ms
+</pre>
+</details>
+
 ## Hacking

 Ensure that when you clone this project you also clone the submodules. This can be done with the following command:

llama-cpp-2/Cargo.toml

Lines changed: 0 additions & 4 deletions
@@ -18,10 +18,6 @@ hf-hub = { workspace = true }
 criterion = { workspace = true }
 pprof = { workspace = true, features = ["criterion", "flamegraph"] }

-# used in examples
-clap = { version = "4.5.1", features = ["derive"] }
-anyhow = "1.0.80"
-
 [[bench]]
 name = "grammar_bias"
 harness = false
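
With `clap` and `anyhow` dropped from the library crate's dev-dependencies, a library-only build pulls in fewer crates. A hedged way to check this locally (run from the workspace root; output not shown):

```bash
# Build only the library crate, then confirm clap and anyhow no longer
# appear anywhere in its dependency tree.
cargo build -p llama-cpp-2
cargo tree -p llama-cpp-2 | grep -E 'clap|anyhow' || echo "not present"
```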

llama-cpp-sys-2/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ include = [

 [build-dependencies]
 bindgen = { workspace = true }
-cc = { workspace = true }
+cc = { workspace = true, features = ["parallel"] }

 [features]
 cublas = []
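
The `parallel` feature of the `cc` crate lets the build script compile llama.cpp's C/C++ sources across cores, which presumably contributes to the "faster compile times" mentioned in the commit message. A rough, hedged way to see the effect on your machine (timings will vary):

```bash
# Time a clean release build of the sys crate; with cc's parallel feature
# the C/C++ objects should be compiled concurrently.
cargo clean -p llama-cpp-sys-2
time cargo build -p llama-cpp-sys-2 --release
```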

simple/Cargo.toml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+[package]
+name = "simple"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+llama-cpp-2 = { path = "../llama-cpp-2", version = "0.1.28" }
+hf-hub = { workspace = true }
+clap = { workspace = true , features = ["derive"] }
+anyhow = { workspace = true }
+
+[features]
+cublas = ["llama-cpp-2/cublas"]
+
+[lints]
+workspace = true
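
Note that the binary inherits its dependency versions from `[workspace.dependencies]` via `workspace = true`, and forwards its `cublas` feature to `llama-cpp-2`. A hedged example of building just this package with GPU support enabled:

```bash
# Enabling cublas on the simple package turns on llama-cpp-2/cublas,
# as declared in the [features] table above.
cargo build -p simple --release --features cublas
```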

llama-cpp-2/examples/simple.rs renamed to simple/src/main.rs

Lines changed: 52 additions & 11 deletions
@@ -1,4 +1,4 @@
-//! This is an translation of simple.cpp in llama.cpp using llama-cpp-2.
+//! This is a translation of simple.cpp in llama.cpp using llama-cpp-2.
 #![allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]

 use anyhow::{bail, Context, Result};
@@ -15,42 +15,79 @@ use std::io::Write;
 use std::num::NonZeroU32;
 use std::path::PathBuf;
 use std::time::Duration;
+use hf_hub::api::sync::ApiBuilder;

-#[derive(clap::Parser)]
+#[derive(clap::Parser, Debug, Clone)]
 struct Args {
     /// The path to the model
-    model_path: PathBuf,
+    #[command(subcommand)]
+    model: Model,
     /// The prompt
     #[clap(default_value = "Hello my name is")]
     prompt: String,
+    /// set the length of the prompt + output in tokens
+    #[arg(long, default_value_t = 32)]
+    n_len: i32,
     /// Disable offloading layers to the gpu
     #[cfg(feature = "cublas")]
     #[clap(long)]
     disable_gpu: bool,
 }

+#[derive(clap::Subcommand, Debug, Clone)]
+enum Model {
+    /// Use an already downloaded model
+    Local {
+        /// The path to the model. e.g. `/home/marcus/.cache/huggingface/hub/models--TheBloke--Llama-2-7B-Chat-GGUF/blobs/08a5566d61d7cb6b420c3e4387a39e0078e1f2fe5f055f3a03887385304d4bfa`
+        path: PathBuf,
+    },
+    /// Download a model from huggingface (or use a cached version)
+    #[clap(name = "hf-model")]
+    HuggingFace {
+        /// the repo containing the model. e.g. `TheBloke/Llama-2-7B-Chat-GGUF`
+        repo: String,
+        /// the model name. e.g. `llama-2-7b-chat.Q4_K_M.gguf`
+        model: String,
+    },
+}
+
+impl Model {
+    /// Convert the model to a path - may download from huggingface
+    fn to_path(self) -> Result<PathBuf> {
+        match self {
+            Model::Local { path } => Ok(path),
+            Model::HuggingFace { model, repo } => ApiBuilder::new()
+                .with_progress(true)
+                .build()
+                .with_context(|| "unable to create huggingface api")?
+                .model(repo)
+                .get(&model)
+                .with_context(|| "unable to download model")
+        }
+    }
+}
+
 fn main() -> Result<()> {
-    let params = Args::parse();
+    let Args { n_len, model, prompt, #[cfg(feature = "cublas")] disable_gpu } = Args::parse();

     // init LLM
     let backend = LlamaBackend::init()?;

-    // total length of the sequence including the prompt
-    let n_len: i32 = 32;
-
     // offload all layers to the gpu
     let model_params = {
         #[cfg(feature = "cublas")]
-        if !params.disable_gpu {
+        if !disable_gpu {
             LlamaModelParams::default().with_n_gpu_layers(1000)
         } else {
             LlamaModelParams::default()
         }
         #[cfg(not(feature = "cublas"))]
         LlamaModelParams::default()
     };
+
+    let model_path = model.to_path().with_context(|| "failed to get model from args")?;

-    let model = LlamaModel::load_from_file(&backend, params.model_path, &model_params)
+    let model = LlamaModel::load_from_file(&backend, model_path, &model_params)
         .with_context(|| "unable to load model")?;

     // initialize the context
@@ -65,8 +102,8 @@ fn main() -> Result<()> {
     // tokenize the prompt

     let tokens_list = model
-        .str_to_token(&params.prompt, AddBos::Always)
-        .with_context(|| format!("failed to tokenize {}", params.prompt))?;
+        .str_to_token(&prompt, AddBos::Always)
+        .with_context(|| format!("failed to tokenize {}", prompt))?;

     let n_cxt = ctx.n_ctx() as i32;
     let n_kv_req = tokens_list.len() as i32 + (n_len - tokens_list.len() as i32);
@@ -81,6 +118,10 @@ either reduce n_len or increase n_ctx"
         )
     }

+    if tokens_list.len() >= usize::try_from(n_len)? {
+        bail!("the prompt is too long, it has more tokens than n_len")
+    }
+
     // print the prompt token-by-token
     eprintln!();


Comments
 (0)
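
The new `Model` subcommand means the binary accepts either a local GGUF path or a Hugging Face repo; the README above only demonstrates `hf-model`. A hedged invocation of the local variant (the `local` subcommand name and flag placement are inferred from the clap derive above; the model path is hypothetical):

```bash
# Run against an already-downloaded GGUF file; --n-len caps prompt + output tokens.
# The `--` keeps cargo from parsing --n-len itself.
cargo run --release --bin simple -- \
  --n-len 64 \
  "The way to kill a linux process is" \
  local /path/to/llama-2-7b.Q4_K_M.gguf
```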