Commit a459ddc

Merge pull request #217 from menloresearch/update-dev-from-master-2025-08-25-00-12
Sync master with upstream release b6264
2 parents 6abf9ba + 3fa428a commit a459ddc


77 files changed (+5192 −1565 lines)

.devops/vulkan.Dockerfile

Lines changed: 23 additions & 7 deletions
@@ -2,14 +2,30 @@ ARG UBUNTU_VERSION=24.04
 
 FROM ubuntu:$UBUNTU_VERSION AS build
 
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
 
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget xz-utils
+
+# Install Vulkan SDK
+ARG VULKAN_VERSION=1.4.321.1
+RUN ARCH=$(uname -m) && \
+    wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
+    mkdir -p /opt/vulkan && \
+    tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
+    mv /tmp/${ARCH}/* /opt/vulkan/ && \
+    rm -rf /tmp/*
+
+# Install cURL and Vulkan SDK dependencies
+RUN apt install -y libcurl4-openssl-dev curl \
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
+
+# Set environment variables
+ENV VULKAN_SDK=/opt/vulkan
+ENV PATH=$VULKAN_SDK/bin:$PATH
+ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
+ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
+ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
 
 # Build it
 WORKDIR /app

README.md

Lines changed: 1 addition & 0 deletions
@@ -151,6 +151,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
 - [x] [GLM-EDGE](https://huggingface.co/models?search=glm-edge)
 - [x] [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d)
+- [x] [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)
 
 </details>
 

common/arg.cpp

Lines changed: 4 additions & 2 deletions
@@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"-dt", "--defrag-thold"}, "N",
-        string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold),
+        string_format("KV cache defragmentation threshold (DEPRECATED)"),
         [](common_params & params, const std::string & value) {
-            params.defrag_thold = std::stof(value);
+            GGML_UNUSED(params);
+            GGML_UNUSED(value);
+            LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n");
         }
     ).set_env("LLAMA_ARG_DEFRAG_THOLD"));
     add_opt(common_arg(

common/chat.cpp

Lines changed: 21 additions & 1 deletion
@@ -1361,6 +1361,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
         "<|end|>",
     };
 
+    if (!inputs.json_schema.is_null()) {
+        data.grammar_lazy = false;
+        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+            auto schema = inputs.json_schema;
+            builder.resolve_refs(schema);
+
+            auto not_end = builder.add_rule("not-end",
+                "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]");
+            auto analysis = builder.add_rule("analysis",
+                "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\"");
+            auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+");
+            auto final = builder.add_rule("final",
+                "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " +
+                builder.add_schema("response", schema)
+            );
+
+            builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? " + final);
+        });
+    }
+
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
         data.grammar = build_grammar([&](const common_grammar_builder & builder) {

@@ -2121,7 +2141,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }
 
     // GPT-OSS
-    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+    if (src.find("<|channel|>") != std::string::npos) {
         return common_chat_params_init_gpt_oss(tmpl, params);
     }
 
common/common.cpp

Lines changed: 0 additions & 1 deletion
@@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.attention_type    = params.attention_type;
-    cparams.defrag_thold      = params.defrag_thold;
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;

common/common.h

Lines changed: 0 additions & 1 deletion
@@ -288,7 +288,6 @@ struct common_params {
     float   yarn_beta_fast = 32.0f; // YaRN low correction dim
     float   yarn_beta_slow = 1.0f;  // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;     // YaRN original context length
-    float   defrag_thold   = 0.1f;  // KV cache defragmentation threshold
 
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

convert_hf_to_gguf.py

Lines changed: 5 additions & 0 deletions
@@ -5854,6 +5854,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("SeedOssForCausalLM")
+class SeedOssModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.SEED_OSS
+
+
 @ModelBase.register("Olmo2ForCausalLM")
 class Olmo2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.OLMO2

docs/build-s390x.md

Lines changed: 4 additions & 3 deletions
@@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 | BF16  | 🚫 | 🚫 | ❓ | ❓ |
 | Q4_0  | ✅ | ✅ | ❓ | ❓ |
 | Q4_1  | ✅ | ✅ | ❓ | ❓ |
-| Q5_0  | 🚫 | 🚫 | ❓ | ❓ |
-| Q5_1  | 🚫 | 🚫 | ❓ | ❓ |
+| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
+| Q5_0  | ✅ | ✅ | ❓ | ❓ |
+| Q5_1  | ✅ | ✅ | ❓ | ❓ |
 | Q8_0  | ✅ | ✅ | ❓ | ❓ |
 | Q2_K  | 🚫 | 🚫 | ❓ | ❓ |
 | Q3_K  | ✅ | ✅ | ❓ | ❓ |

@@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
 
-Last Updated by **Aaron Teo ([email protected])** on July 31, 2025.
+Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.

examples/llama.vim

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 "
 " start the llama.cpp server with a FIM-compatible model. for example:
 "
-"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256
+"   $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256
 "
 "   --batch-size [512, model max context]
 "

ggml/include/ggml.h

Lines changed: 18 additions & 0 deletions
@@ -512,6 +512,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,

@@ -1940,6 +1941,23 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // kernel [KW, KH, KD, IC * OC]
+            struct ggml_tensor  * b,   // input  [W, H, D, C * N]
+            int                   s0,  // stride
+            int                   s1,
+            int                   s2,
+            int                   p0,  // padding
+            int                   p1,
+            int                   p2,
+            int                   d0,  // dilation
+            int                   d1,
+            int                   d2,
+            int                   n_channels,
+            int                   n_batch,
+            int                   n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
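
For reference, a minimal usage sketch of the new ggml_conv_3d API, based only on the declaration above. The context size, tensor shapes, and the stride/padding/dilation values are illustrative assumptions and are not part of this commit.

// Sketch (illustrative only): build a 3D convolution node with ggml_conv_3d.
// Kernel and input layouts follow the header comments above.
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 64 * 1024 * 1024,  // assumed scratch size for this sketch
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int IC = 4, OC = 8, N = 1;  // illustrative channel/batch counts

    // kernel: [KW, KH, KD, IC * OC]
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 3, 3, 3, IC * OC);
    // input:  [W, H, D, C * N]
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 32, 32, 16, IC * N);

    // stride, padding, and dilation of 1 in each of the three spatial dimensions
    struct ggml_tensor * out = ggml_conv_3d(ctx, kernel, input,
                                            1, 1, 1,   // s0, s1, s2
                                            1, 1, 1,   // p0, p1, p2
                                            1, 1, 1,   // d0, d1, d2
                                            IC, N, OC);
    (void) out; // evaluate via a graph and backend as usual (e.g. ggml_build_forward_expand)

    ggml_free(ctx);
    return 0;
}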
