Skip to content

Commit bad4232

Browse files
authored
FEAT: Auto ngl for llama.cpp backend (#3518)
1 parent 813cfb3 commit bad4232

File tree

11 files changed

+598
-7
lines changed

11 files changed

+598
-7
lines changed

.github/workflows/python.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ jobs:
125125
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
126126
fi
127127
pip install -e ".[dev]"
128-
pip install "xllamacpp>=0.1.16"
128+
pip install "xllamacpp>=0.1.18" gguf
129129
if [ "$MODULE" == "metal" ]; then
130130
conda install -c conda-forge "ffmpeg<7"
131131
pip install "mlx>=0.22.0"
@@ -167,6 +167,7 @@ jobs:
167167
${{ env.SELF_HOST_PYTHON }} -m pip install -U -e ".[audio]"
168168
${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1"
169169
${{ env.SELF_HOST_PYTHON }} -m pip install -U modelscope
170+
${{ env.SELF_HOST_PYTHON }} -m pip install -U gguf
170171
${{ env.SELF_HOST_PYTHON }} -m pip install -U sse_starlette
171172
${{ env.SELF_HOST_PYTHON }} -m pip install -U xoscar
172173
${{ env.SELF_HOST_PYTHON }} -m pip install -U "python-jose[cryptography]"

setup.cfg

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ intel =
9494
torch==2.1.0a0
9595
intel_extension_for_pytorch==2.1.10+xpu
9696
llama_cpp =
97-
xllamacpp>=0.1.16
97+
xllamacpp>=0.1.18
98+
gguf
9899
transformers =
99100
transformers>=4.46.0
100101
torch

xinference/deploy/docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ RUN pip install --upgrade -i "$PIP_INDEX" pip setuptools wheel&& \
4040
git restore . && \
4141
pip install -i "$PIP_INDEX" --no-deps "." && \
4242
pip uninstall xllamacpp -y && \
43-
pip install "xllamacpp>=0.1.16" --index-url https://xorbitsai.github.io/xllamacpp/whl/cu124 && \
43+
pip install "xllamacpp>=0.1.18" --index-url https://xorbitsai.github.io/xllamacpp/whl/cu124 && \
4444
# clean packages
4545
pip cache purge
4646

xinference/deploy/docker/cpu.Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ RUN python -m pip install --upgrade -i "$PIP_INDEX" pip && \
2727
python setup.py build_web && \
2828
git restore . && \
2929
pip install -i "$PIP_INDEX" --no-deps "." && \
30-
pip install -i "$PIP_INDEX" "xllamacpp>=0.1.16" && \
30+
pip install -i "$PIP_INDEX" "xllamacpp>=0.1.18" && \
3131
# clean packages
3232
pip cache purge
3333

xinference/deploy/docker/requirements-base.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ async-timeout
2222
peft
2323
opencv-contrib-python-headless
2424
setproctitle
25+
gguf

xinference/deploy/docker/requirements_cpu-base.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@ ormsgpack # For Fish Speech
2828
cachetools # For Fish Speech
2929
imageio-ffmpeg # For video
3030
opencv-contrib-python-headless
31+
gguf

xinference/model/audio/tests/test_cosyvoice.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020

2121
@pytest.mark.parametrize("model_name", ["CosyVoice-300M-SFT", "CosyVoice2-0.5B"])
22+
@pytest.mark.skip(reason="The diffusers on the GPU CI action is not compatible.")
2223
def test_cosyvoice_sft(setup, model_name):
2324
endpoint, _ = setup
2425
from ....client import Client
@@ -72,6 +73,7 @@ def test_cosyvoice_sft(setup, model_name):
7273

7374

7475
@pytest.mark.parametrize("model_name", ["CosyVoice-300M", "CosyVoice2-0.5B"])
76+
@pytest.mark.skip(reason="The diffusers on the GPU CI action is not compatible.")
7577
def test_cosyvoice(setup, model_name):
7678
endpoint, _ = setup
7779
from ....client import Client
@@ -122,6 +124,7 @@ def test_cosyvoice(setup, model_name):
122124

123125

124126
@pytest.mark.parametrize("model_name", ["CosyVoice-300M-Instruct", "CosyVoice2-0.5B"])
127+
@pytest.mark.skip(reason="The diffusers on the GPU CI action is not compatible.")
125128
def test_cosyvoice_instruct(setup, model_name):
126129
endpoint, _ = setup
127130
from ....client import Client
@@ -154,7 +157,8 @@ def test_cosyvoice_instruct(setup, model_name):
154157
else:
155158
# inference without instruction
156159
response = model.speech(
157-
"在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。", voice="中文男"
160+
"在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。",
161+
voice="中文男",
158162
)
159163
assert type(response) is bytes
160164
assert len(response) > 0

xinference/model/llm/llama_cpp/core.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import importlib.util
1616
import logging
1717
import os
18+
import pprint
1819
import queue
1920
from typing import Iterator, List, Optional, Union
2021

@@ -24,6 +25,7 @@
2425
from ..core import LLM
2526
from ..llm_family import LLMFamilyV1, LLMSpecV1
2627
from ..utils import ChatModelMixin
28+
from .memory import estimate_gpu_layers
2729

2830
logger = logging.getLogger(__name__)
2931

@@ -95,7 +97,12 @@ def match_json(
9597

9698
def load(self):
9799
try:
98-
from xllamacpp import CommonParams, Server
100+
from xllamacpp import (
101+
CommonParams,
102+
Server,
103+
get_device_info,
104+
ggml_backend_dev_type,
105+
)
99106
except ImportError:
100107
error_message = "Failed to import module 'xllamacpp'"
101108
installation_guide = ["Please make sure 'xllamacpp' is installed. "]
@@ -175,6 +182,41 @@ def load(self):
175182
# Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
176183
# 0x7FFFFFFF is INT32 max, will be auto set to all layers
177184
params.n_gpu_layers = 0x7FFFFFFF
185+
try:
186+
device_info = get_device_info()
187+
gpus = [
188+
info
189+
for info in device_info
190+
if info["type"]
191+
== ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
192+
]
193+
if gpus:
194+
logger.info(
195+
"Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
196+
params.n_ctx,
197+
params.n_batch,
198+
params.n_parallel,
199+
pprint.pformat(gpus),
200+
)
201+
estimate = estimate_gpu_layers(
202+
gpus=gpus,
203+
model_path=model_path,
204+
projectors=[mmproj] if mmproj else [],
205+
context_length=params.n_ctx,
206+
batch_size=params.n_batch,
207+
num_parallel=params.n_parallel,
208+
kv_cache_type="",
209+
)
210+
logger.info("Estimate num gpu layers: %s", estimate)
211+
if estimate.tensor_split:
212+
params.tensor_split = estimate.tensor_split
213+
else:
214+
params.n_gpu_layers = estimate.layers
215+
except Exception as e:
216+
logger.exception(
217+
"Estimate num gpu layers for llama.cpp backend failed: %s", e
218+
)
219+
178220
self._llm = Server(params)
179221
self._executor = concurrent.futures.ThreadPoolExecutor(
180222
max_workers=max(10, n_threads)

0 commit comments

Comments
 (0)