@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
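For intuition, here is a minimal standalone sketch of the KV-cache estimate computed above, using illustrative values (batch 1, 32 layers, 32 heads, head_dim 128, 4k context, fp16) rather than a real llm_config:

    # Standalone sketch of the KV-cache formula above; all numbers are illustrative.
    batch_size, num_hidden_layers, num_attention_heads = 1, 32, 32
    seq_len, head_dim, kv_cache_dtype_bytes = 4096, 128, 2  # fp16/bf16 -> 2 bytes per element

    # batch * layers * 2 (K and V) * heads * seq_len * head_dim * bytes per element
    total_bytes = (
        batch_size * num_hidden_layers * 2 * num_attention_heads
        * seq_len * head_dim * kv_cache_dtype_bytes
    )
    print(total_bytes / 1e9)  # ~2.15 GB for these numbers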
@@ -69,15 +69,17 @@ def model_memory(self) -> float:

         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = (
+            12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
+        )  # GPT-style
         num_params = layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def total_memory(self) -> float:
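The 12 * num_hidden_layers * hidden_size**2 term is the usual decoder-only rule of thumb (4·h² for attention plus 8·h² for the MLP per layer). As a sanity check, a sketch with illustrative GPT-2-small-like numbers (hypothetical, not read from any config):

    # Rough decoder-only parameter count: embeddings + 12 * layers * hidden_size^2 (illustrative values).
    vocab_size, hidden_size, num_hidden_layers = 50257, 768, 12
    tie_word_embeddings = True
    bytes_per_parameter = 2  # fp16/bf16

    embedding_count = 1 if tie_word_embeddings else 2
    embedding_params = embedding_count * vocab_size * hidden_size
    layer_params = 12 * num_hidden_layers * hidden_size**2
    num_params = layer_params + embedding_params
    print(num_params / 1e6, num_params * bytes_per_parameter / 1e9)  # ~124M params, ~0.25 GB in fp16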
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
         -------
             str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))

         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if (
+            not llm_config.quantization
+            and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
+        ):
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])

+        # add trust-remote-code if custom modules are specified
+        if llm_config.trust_remote_code:
+            params.append(VLLM_PARAMS["trust_remote_code"])
+
         params = " ".join(params) if params else ""
         return params

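The VLLM_PARAMS lookup itself is defined elsewhere in the module; as an assumption, it is taken here to map onto standard vLLM server flags such as --max-model-len and --trust-remote-code. A hedged sketch of how the assembled string might look:

    # Hypothetical stand-ins for VLLM_PARAMS; the real mapping lives elsewhere in this codebase.
    VLLM_PARAMS = {
        "max_model_len": "--max-model-len",
        "trust_remote_code": "--trust-remote-code",
    }
    params = [VLLM_PARAMS["max_model_len"], str(4096), VLLM_PARAMS["trust_remote_code"]]
    print(" ".join(params))  # --max-model-len 4096 --trust-remote-code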
@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config

         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization

         advice = []

@@ -246,7 +255,7 @@ def limiting_factor(
             )
         else:
             advice = (
-                f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
+                f"Model fits well within the allowed compute shape "
                 f"({required:.1f} GB used / {allowed_gpu_memory:.1f} GB allowed)."
             )
         return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         embedding_params, attn_params = self._calc_attn_embed_params()

         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj

         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def kv_cache_memory(self) -> float:
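For the gate/up/down MLP term above, a quick sketch with illustrative Llama-7B-like sizes (hypothetical values, not taken from a real config):

    # Llama-style MLP parameters per layer: gate, up, and down projections (illustrative sizes).
    hidden_size, intermediate_size = 4096, 11008
    gate_proj = hidden_size * intermediate_size
    up_proj = hidden_size * intermediate_size
    down_proj = intermediate_size * hidden_size
    print(gate_proj + up_proj + down_proj)  # ~135M MLP parameters per layer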
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, where groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
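To see why GQA shrinks the cache, a sketch comparing the two KV-cache formulas under illustrative (hypothetical) head counts:

    # Full multi-head attention vs. GQA KV cache, same model otherwise (illustrative values).
    batch_size, num_hidden_layers, seq_len, head_dim, dtype_bytes = 1, 32, 4096, 128, 2
    num_attention_heads, num_key_value_heads = 32, 8  # four query heads share each KV head

    mha_gb = batch_size * num_hidden_layers * 2 * num_attention_heads * seq_len * head_dim * dtype_bytes / 1e9
    gqa_gb = batch_size * num_hidden_layers * 2 * num_key_value_heads * seq_len * head_dim * dtype_bytes / 1e9
    print(mha_gb, gqa_gb)  # ~2.15 GB vs ~0.54 GB: the cache shrinks by num_attention_heads / num_key_value_heads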
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
313322 """
314323 Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
315324 """
316- c = self .llm_config
325+ llm_config = self .llm_config
317326
318327 # Embedding parameters
319328 # assume tied embeddings unless tie_word_embeddings = False
320- embedding_count = 1 if getattr (c , "tie_word_embeddings" , True ) else 2
321- embedding_params = embedding_count * c .vocab_size * c .hidden_size
329+ embedding_count = 1 if llm_config .tie_word_embeddings else 2
330+ embedding_params = (
331+ embedding_count * llm_config .vocab_size * llm_config .hidden_size
332+ )
322333
323- q_proj = c .hidden_size * c .hidden_size
324- k_proj = c .hidden_size * (c .num_key_value_heads * c .head_dim )
325- v_proj = c .hidden_size * (c .num_key_value_heads * c .head_dim )
326- o_proj = c .hidden_size * c .hidden_size
334+ q_proj = llm_config .hidden_size * llm_config .hidden_size
335+ k_proj = llm_config .hidden_size * (
336+ llm_config .num_key_value_heads * llm_config .head_dim
337+ )
338+ v_proj = llm_config .hidden_size * (
339+ llm_config .num_key_value_heads * llm_config .head_dim
340+ )
341+ o_proj = llm_config .hidden_size * llm_config .hidden_size
327342 attn_params = q_proj + k_proj + v_proj + o_proj
328343
329344 return embedding_params , attn_params
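A worked example of the projection counts above, with illustrative (hypothetical) GQA dimensions:

    # Attention projection parameters under GQA (illustrative sizes).
    hidden_size, head_dim, num_key_value_heads = 4096, 128, 8

    q_proj = hidden_size * hidden_size                       # 16,777,216
    k_proj = hidden_size * (num_key_value_heads * head_dim)  # 4,194,304
    v_proj = hidden_size * (num_key_value_heads * head_dim)  # 4,194,304
    o_proj = hidden_size * hidden_size                       # 16,777,216
    print(q_proj + k_proj + v_proj + o_proj)  # 41,943,040 attention parameters per layer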
@@ -342,21 +357,24 @@ def model_memory(self) -> float:

         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()

         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts
+            * 3
+            * llm_config.hidden_size
+            * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )

         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9


 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
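Note that the MoE estimate above counts the weights of all local experts (three projections each), not just the experts routed to per token. A sketch with illustrative Mixtral-like numbers (hypothetical) shows the scale:

    # MoE parameter estimate: attention + num_local_experts * 3 * hidden * intermediate per layer (illustrative values).
    hidden_size, intermediate_size = 4096, 14336
    num_hidden_layers, num_local_experts = 32, 8
    attn_params = 2 * hidden_size * hidden_size + 2 * hidden_size * (8 * 128)  # q/o full-size, k/v with 8 KV heads of dim 128
    embedding_params = 32000 * hidden_size  # tied embeddings assumed

    moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
    total_params = num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
    print(total_params / 1e9)  # ~46.6 billion parameters for these numbers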