
Commit 186415d

ggml-cpu: drop support for nnpa intrinsics (#15821)
1 parent fd62188 commit 186415d

File tree

8 files changed: +32 −144 lines

docs/build-s390x.md

Lines changed: 32 additions & 52 deletions
@@ -42,18 +42,6 @@ cmake --build build --config Release -j $(nproc)
   cmake --build build --config Release -j $(nproc)
   ```
 
-- By default, NNPA is disabled by default. To enable it:
-
-  ```bash
-  cmake -S . -B build \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DGGML_BLAS=ON \
-      -DGGML_BLAS_VENDOR=OpenBLAS \
-      -DGGML_NNPA=ON
-
-  cmake --build build --config Release -j $(nproc)
-  ```
-
 - For debug builds:
 
   ```bash
@@ -164,15 +152,11 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
 
 Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
 
-### 2. NNPA Vector Intrinsics Acceleration
-
-Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
-
-### 3. zDNN Accelerator (WIP)
+### 2. zDNN Accelerator (WIP)
 
 Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
 
-### 4. Spyre Accelerator
+### 3. Spyre Accelerator
 
 _Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
 
@@ -230,10 +214,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
    CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
    ```
 
-5. `-DGGML_NNPA=ON` generates gibberish output
-
-   Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
-
 ## Getting Help on IBM Z & LinuxONE
 
 1. **Bugs, Feature Requests**
@@ -258,38 +238,38 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 
 ## Appendix B: SIMD Support Matrix
 
-|            | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
-| ---------- | ----------- | ---- | ---- | ----- |
-| FP32       |             |      |      |       |
-| FP16       |             |      |      |       |
-| BF16       | 🚫          | 🚫   |      |       |
-| Q4_0       |             |      |      |       |
-| Q4_1       |             |      |      |       |
-| MXFP4      | 🚫          | 🚫   |      |       |
-| Q5_0       |             |      |      |       |
-| Q5_1       |             |      |      |       |
-| Q8_0       |             |      |      |       |
-| Q2_K       | 🚫          | 🚫   |      |       |
-| Q3_K       |             |      |      |       |
-| Q4_K       |             |      |      |       |
-| Q5_K       |             |      |      |       |
-| Q6_K       |             |      |      |       |
-| TQ1_0      | 🚫          | 🚫   |      |       |
-| TQ2_0      | 🚫          | 🚫   |      |       |
-| IQ2_XXS    | 🚫          | 🚫   |      |       |
-| IQ2_XS     | 🚫          | 🚫   |      |       |
-| IQ2_S      | 🚫          | 🚫   |      |       |
-| IQ3_XXS    | 🚫          | 🚫   |      |       |
-| IQ3_S      | 🚫          | 🚫   |      |       |
-| IQ1_S      | 🚫          | 🚫   |      |       |
-| IQ1_M      | 🚫          | 🚫   |      |       |
-| IQ4_NL     |             |      |      |       |
-| IQ4_XS     |             |      |      |       |
-| FP32->FP16 | 🚫          |      |      |       |
-| FP16->FP32 | 🚫          |      |      |       |
+|            | VX/VXE/VXE2 | zDNN | Spyre |
+|------------|-------------|------|-------|
+| FP32       |             |      |       |
+| FP16       |             |      |       |
+| BF16       | 🚫          |      |       |
+| Q4_0       |             |      |       |
+| Q4_1       |             |      |       |
+| MXFP4      | 🚫          |      |       |
+| Q5_0       |             |      |       |
+| Q5_1       |             |      |       |
+| Q8_0       |             |      |       |
+| Q2_K       | 🚫          |      |       |
+| Q3_K       |             |      |       |
+| Q4_K       |             |      |       |
+| Q5_K       |             |      |       |
+| Q6_K       |             |      |       |
+| TQ1_0      | 🚫          |      |       |
+| TQ2_0      | 🚫          |      |       |
+| IQ2_XXS    | 🚫          |      |       |
+| IQ2_XS     | 🚫          |      |       |
+| IQ2_S      | 🚫          |      |       |
+| IQ3_XXS    | 🚫          |      |       |
+| IQ3_S      | 🚫          |      |       |
+| IQ1_S      | 🚫          |      |       |
+| IQ1_M      | 🚫          |      |       |
+| IQ4_NL     |             |      |       |
+| IQ4_XS     |             |      |       |
+| FP32->FP16 | 🚫          |      |       |
+| FP16->FP32 | 🚫          |      |       |
 
 - ✅ - acceleration available
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
 
-Last Updated by **Aaron Teo ([email protected])** on Aug 22, 2025.
+Last Updated by **Aaron Teo ([email protected])** on Sep 6, 2025.

ggml/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -134,7 +134,6 @@ option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
 option(GGML_RV_ZICBOP    "ggml: enable riscv zicbop"    ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector"    OFF)
 option(GGML_VXE          "ggml: enable vxe"             ON)
-option(GGML_NNPA         "ggml: enable nnpa"            OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

ggml/include/ggml-cpu.h

Lines changed: 0 additions & 1 deletion
@@ -101,7 +101,6 @@ extern "C" {
     GGML_BACKEND_API int ggml_cpu_has_riscv_v   (void);
     GGML_BACKEND_API int ggml_cpu_has_vsx       (void);
     GGML_BACKEND_API int ggml_cpu_has_vxe       (void);
-    GGML_BACKEND_API int ggml_cpu_has_nnpa      (void);
     GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
     GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
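Downstream code that previously probed `ggml_cpu_has_nnpa()` can simply drop that call; the remaining query functions are unchanged. Below is a minimal sketch of runtime feature probing after this commit — an illustration, not part of the change — assuming the program links against the ggml CPU backend and uses only functions declared in this header:

```c
#include <stdio.h>
#include "ggml-cpu.h"

int main(void) {
    // ggml_cpu_has_nnpa() no longer exists; on s390x, VXE is the only SIMD
    // capability the CPU backend still reports.
    printf("VXE      : %d\n", ggml_cpu_has_vxe());
    printf("VSX      : %d\n", ggml_cpu_has_vsx());
    printf("RISC-V V : %d\n", ggml_cpu_has_riscv_v());
    return 0;
}
```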

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 0 additions & 6 deletions
@@ -457,7 +457,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 
         # TODO: Separation to determine activation of VX/VXE/VXE2
         if (${S390X_M} MATCHES "8561|8562")
-            set(GGML_NNPA OFF)
             message(STATUS "z15 target")
             list(APPEND ARCH_FLAGS -march=z15)
         elseif (${S390X_M} MATCHES "3931")
@@ -479,11 +478,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             list(APPEND ARCH_FLAGS -mvx -mzvector)
             list(APPEND ARCH_DEFINITIONS GGML_VXE)
         endif()
-
-        if (GGML_NNPA)
-            message(STATUS "NNPA enabled")
-            list(APPEND ARCH_DEFINITIONS GGML_NNPA)
-        endif()
     elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
         message(STATUS "Wasm detected")
         list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)

ggml/src/ggml-cpu/ggml-cpu-impl.h

Lines changed: 0 additions & 6 deletions
@@ -68,12 +68,6 @@ struct ggml_compute_params {
 #endif // __VXE2__
 #endif // __s390x__ && __VEC__
 
-#if defined(__s390x__) && defined(GGML_NNPA)
-#ifndef __NNPA__
-#define __NNPA__
-#endif // __NNPA__
-#endif // __s390x__ && GGML_NNPA
-
 #if defined(__ARM_FEATURE_SVE)
 #include <sys/prctl.h>
 #endif

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 0 additions & 38 deletions
@@ -3211,21 +3211,6 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
         __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
         _mm_storel_epi64((__m128i *)(y + i), y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
-        float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
-        uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
-    for (; i + 3 < n; i += 4) {
-        float32x4_t v_x = vec_xl(0, (const float *)(x + i));
-        float32x4_t v_zero = vec_splats(0.0f);
-        uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
-        uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
-        vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
-    }
 #elif defined(__riscv_zvfh)
     for (int vl; i < n; i += vl) {
         vl = __riscv_vsetvl_e32m2(n - i);
@@ -3259,21 +3244,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__NNPA__)
-    for (; i + 7 < n; i += 8) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i + 0));
-        vec_xst(v_yl, 0, (float *)(y + i + 4));
-    }
-    for (; i + 3 < n; i += 4) {
-        uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
-        uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
-        float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
-        vec_xst(v_yh, 0, (float *)(y + i));
-    }
 #endif
 
     for (; i < n; ++i) {
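With the `__NNPA__` branches gone, s390x builds fall through to the scalar tail loops that remain after each removed block. The following is a minimal sketch of that per-element fallback, written against the public `ggml_fp32_to_fp16()` / `ggml_fp16_to_fp32()` helpers from `ggml.h` — an illustration of the resulting behavior, not code from this commit (inside `ggml-cpu.c` the tail presumably goes through the `GGML_CPU_FP32_TO_FP16` / `GGML_CPU_FP16_TO_FP32` macros from `simd-mappings.h`):

```c
#include <stdint.h>
#include "ggml.h"  // ggml_fp16_t, ggml_fp32_to_fp16(), ggml_fp16_to_fp32()

// Element-wise conversion, no SIMD: equivalent in effect to the scalar tail
// that now handles every element on s390x.
static void fp32_to_fp16_scalar(const float * x, ggml_fp16_t * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = ggml_fp32_to_fp16(x[i]);
    }
}

static void fp16_to_fp32_scalar(const ggml_fp16_t * x, float * y, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        y[i] = ggml_fp16_to_fp32(x[i]);
    }
}
```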
@@ -3477,14 +3447,6 @@ int ggml_cpu_has_vxe(void) {
 #endif
 }
 
-int ggml_cpu_has_nnpa(void) {
-#if defined(GGML_NNPA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
 int ggml_cpu_has_neon(void) {
 #if defined(__ARM_ARCH) && defined(__ARM_NEON)
     return 1;

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 0 additions & 3 deletions
@@ -576,9 +576,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
         if (ggml_cpu_has_vxe()) {
             features.push_back({ "VXE", "1" });
         }
-        if (ggml_cpu_has_nnpa()) {
-            features.push_back({ "NNPA", "1" });
-        }
         if (ggml_cpu_has_wasm_simd()) {
             features.push_back({ "WASM_SIMD", "1" });
         }

ggml/src/ggml-cpu/simd-mappings.h

Lines changed: 0 additions & 37 deletions
@@ -114,26 +114,6 @@ extern "C" {
 #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
 #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
 #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-#elif defined(__NNPA__)
-#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
-#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
-
-#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
-#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
-
-static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
-    uint16x8_t v_h = vec_splats(h);
-    uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
-    return vec_extend_to_fp32_hi(v_hd, 0)[0];
-}
-
-static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
-    float32x4_t v_f = vec_splats(f);
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
-    uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
-    return vec_extract(v_h, 0);
-}
 #endif
 
 // precomputed f32 table for f16 (256 KB)
@@ -1156,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 #define GGML_F16_EPR GGML_F32_EPR
 
 static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
-#if defined(__NNPA__)
-    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
-    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
-    return vec_extend_to_fp32_hi(v_xd, 0);
-#else
     float tmp[4];
 
     for (int i = 0; i < 4; i++) {
@@ -1170,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
     // note: keep type-cast here to prevent compiler bugs
     // see: https://github.com/ggml-org/llama.cpp/issues/12846
     return vec_xl(0, (const float *)(tmp));
-#endif
 }
 
 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
-#if defined(__NNPA__)
-    float32x4_t v_zero = vec_splats(0.0f);
-    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
-    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
-
-    x[0] = vec_extract(v_x, 0);
-    x[1] = vec_extract(v_x, 1);
-    x[2] = vec_extract(v_x, 2);
-    x[3] = vec_extract(v_x, 3);
-#else
     float arr[4];
 
     // note: keep type-cast here to prevent compiler bugs
@@ -1193,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
     for (int i = 0; i < 4; i++) {
         x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
-#endif
 }
 
 #define GGML_F16_VEC GGML_F32x4
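After this removal, the s390x `__lzs_f16cx4_load` / `__lzs_f16cx4_store` helpers always take the element-wise path shown in the context lines above. Below is a condensed sketch of the load side, restating the retained code for clarity; it assumes an s390x build with `-mzvector`, where `float32x4_t`, `vec_xl`, and the `GGML_CPU_FP16_TO_FP32` macro are available. The store helper mirrors it in reverse through `GGML_CPU_FP32_TO_FP16`.

```c
// Each of the four half-precision lanes is widened one element at a time and
// then loaded as a single float32x4_t from a small stack buffer.
static inline float32x4_t lzs_f16cx4_load_sketch(const ggml_fp16_t * x) {
    float tmp[4];

    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }

    // type-cast kept to avoid the compiler bug referenced in issue #12846
    return vec_xl(0, (const float *)(tmp));
}
```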
