 15 |  15 | import importlib.util
 16 |  16 | import logging
 17 |  17 | import os
    |  18 | +import pprint
 18 |  19 | import queue
 19 |  20 | from typing import Iterator, List, Optional, Union
 20 |  21 |

 24 |  25 | from ..core import LLM
 25 |  26 | from ..llm_family import LLMFamilyV1, LLMSpecV1
 26 |  27 | from ..utils import ChatModelMixin
    |  28 | +from .memory import estimate_gpu_layers
 27 |  29 |
 28 |  30 | logger = logging.getLogger(__name__)
 29 |  31 |

@@ -95,7 +97,12 @@ def match_json(

 95 |  97 |
 96 |  98 |     def load(self):
 97 |  99 |         try:
 98 |     | -            from xllamacpp import CommonParams, Server
    | 100 | +            from xllamacpp import (
    | 101 | +                CommonParams,
    | 102 | +                Server,
    | 103 | +                get_device_info,
    | 104 | +                ggml_backend_dev_type,
    | 105 | +            )
 99 | 106 |         except ImportError:
100 | 107 |             error_message = "Failed to import module 'xllamacpp'"
101 | 108 |             installation_guide = ["Please make sure 'xllamacpp' is installed. "]

@@ -175,6 +182,41 @@ def load(self):

175 | 182 |         # Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
176 | 183 |         # 0x7FFFFFFF is INT32 max, will be auto set to all layers
177 | 184 |         params.n_gpu_layers = 0x7FFFFFFF
    | 185 | +        try:
    | 186 | +            device_info = get_device_info()
    | 187 | +            gpus = [
    | 188 | +                info
    | 189 | +                for info in device_info
    | 190 | +                if info["type"]
    | 191 | +                == ggml_backend_dev_type.GGML_BACKEND_DEVICE_TYPE_GPU
    | 192 | +            ]
    | 193 | +            if gpus:
    | 194 | +                logger.info(
    | 195 | +                    "Try to estimate num gpu layers, n_ctx: %s, n_batch: %s, n_parallel: %s, gpus:\n%s",
    | 196 | +                    params.n_ctx,
    | 197 | +                    params.n_batch,
    | 198 | +                    params.n_parallel,
    | 199 | +                    pprint.pformat(gpus),
    | 200 | +                )
    | 201 | +                estimate = estimate_gpu_layers(
    | 202 | +                    gpus=gpus,
    | 203 | +                    model_path=model_path,
    | 204 | +                    projectors=[mmproj] if mmproj else [],
    | 205 | +                    context_length=params.n_ctx,
    | 206 | +                    batch_size=params.n_batch,
    | 207 | +                    num_parallel=params.n_parallel,
    | 208 | +                    kv_cache_type="",
    | 209 | +                )
    | 210 | +                logger.info("Estimate num gpu layers: %s", estimate)
    | 211 | +                if estimate.tensor_split:
    | 212 | +                    params.tensor_split = estimate.tensor_split
    | 213 | +                else:
    | 214 | +                    params.n_gpu_layers = estimate.layers
    | 215 | +        except Exception as e:
    | 216 | +            logger.exception(
    | 217 | +                "Estimate num gpu layers for llama.cpp backend failed: %s", e
    | 218 | +            )
    | 219 | +
178 | 220 |         self._llm = Server(params)
179 | 221 |         self._executor = concurrent.futures.ThreadPoolExecutor(
180 | 222 |             max_workers=max(10, n_threads)
|