
Commit 648ed5b
committed: moved over code
1 parent 054b2fc

20 files changed: +2151 -221 lines

ads/aqua/shaperecommend/constants.py

Lines changed: 17 additions & 0 deletions
@@ -38,6 +38,14 @@
     "4bit": ["No smaller quantization available"],
 }
 
+RUNTIME_WEIGHTS = {
+    "use_bfloat16": "bfloat16",
+    "use_fp16": "float16",
+    "use_fp32": "float32",
+    "use_int8": "int8",
+    "use_int4": "int4",
+    "use_bfloat32": "bfloat32",
+}
 
 TEXT_GENERATION = "text_generation"
 SAFETENSORS = "safetensors"
@@ -78,14 +86,23 @@
 
 IN_FLIGHT_QUANTIZATION = {"4bit"}  # vLLM only supports 4bit in-flight-quantization
 
+VLLM_PARAMS = "VLLM_PARAMS"
+VLLM_ENV = "VLLM"
+
+QUANT_FLAG = "--quantization"
+WEIGHT_DTYPE_FLAG = "--dtype"
+MAX_MODEL_LEN_FLAG = "--max-model-len"
+
 TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. "
 
 VLLM_PARAMS = {
     "max_model_len": "--max-model-len",
     "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes",
+    "trust_remote_code": "--trust-remote-code",
 }
 
 DEFAULT_WEIGHT_SIZE = "float32"
+DEFAULT_MAX_SEQ_LEN = 4096
 
 BITS_AND_BYTES_8BIT = "8bit"
 BITS_AND_BYTES_4BIT = "4bit"
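
Note: RUNTIME_WEIGHTS maps boolean runtime-weight flags to dtype names, with DEFAULT_WEIGHT_SIZE as the fallback. A minimal sketch of the kind of lookup this table enables follows; the raw_config dict and the detect_runtime_dtype helper are hypothetical illustrations, not part of this commit.

# Hypothetical sketch: resolve a weight dtype from runtime flags in a raw
# model config, falling back to DEFAULT_WEIGHT_SIZE. Not part of this commit.
RUNTIME_WEIGHTS = {
    "use_bfloat16": "bfloat16",
    "use_fp16": "float16",
    "use_fp32": "float32",
    "use_int8": "int8",
    "use_int4": "int4",
    "use_bfloat32": "bfloat32",
}
DEFAULT_WEIGHT_SIZE = "float32"

def detect_runtime_dtype(raw_config: dict) -> str:
    # Return the dtype for the first runtime flag set to True, else the default.
    for flag, dtype in RUNTIME_WEIGHTS.items():
        if raw_config.get(flag):
            return dtype
    return DEFAULT_WEIGHT_SIZE

print(detect_runtime_dtype({"use_bfloat16": True}))  # -> "bfloat16"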

ads/aqua/shaperecommend/estimator.py

Lines changed: 57 additions & 39 deletions
@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache
 
         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
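
Plugging representative numbers into the formula above gives a feel for the scale. The shape values below (a 7B-class MHA config with a float16 KV cache) are illustrative and not taken from this commit.

# Illustrative KV-cache estimate for an MHA model (no GQA), float16 KV cache.
batch_size = 1
num_hidden_layers = 32
num_attention_heads = 32
seq_len = 4096
head_dim = 128
kv_cache_dtype_bytes = 2  # float16

total_bytes = (
    batch_size
    * num_hidden_layers
    * 2                      # one K and one V tensor per layer
    * num_attention_heads
    * seq_len
    * head_dim
    * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)     # ~2.15 GB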
@@ -69,15 +69,17 @@ def model_memory(self) -> float:
 
         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = (
+            12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)
+        )  # GPT-style
         num_params = layer_params + embedding_params
 
-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9
 
     @property
     def total_memory(self) -> float:
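
In the GPT-style estimate, the 12 * num_hidden_layers * hidden_size**2 term dominates and the embeddings add a small correction. A worked example with illustrative values (assumed, not read from the commit):

# Illustrative GPT-style parameter estimate at float16.
num_hidden_layers = 32
hidden_size = 4096
vocab_size = 50_000
tie_word_embeddings = False   # untied: count input and output embeddings
bytes_per_parameter = 2       # float16

embedding_count = 1 if tie_word_embeddings else 2
embedding_params = embedding_count * vocab_size * hidden_size    # ~0.41B
layer_params = 12 * num_hidden_layers * hidden_size**2           # ~6.44B
num_params = layer_params + embedding_params                     # ~6.85B
print(num_params * bytes_per_parameter / 1e9)                    # ~13.7 GB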
@@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str:
         -------
         str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))
 
         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if (
+            not llm_config.quantization
+            and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION
+        ):
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])
 
+        # add trust-remote-code if custom modules are specified
+        if llm_config.trust_remote_code:
+            params.append(VLLM_PARAMS["trust_remote_code"])
+
         params = " ".join(params) if params else ""
         return params
 
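
With the VLLM_PARAMS table from constants.py, an unquantized model that requests 4-bit in-flight quantization, a shortened context of 2048 tokens, and custom modules would get a parameter string roughly like the following (the 2048 value is illustrative):

--max-model-len 2048 --quantization bitsandbytes --load-format bitsandbytes --trust-remote-code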
@@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config
 
         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization
 
         advice = []
 
@@ -246,7 +255,7 @@ def limiting_factor(
             )
         else:
             advice = (
-                f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape "
+                f"Model fits well within the allowed compute shape "
                 f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)."
             )
         return advice
@@ -268,22 +277,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
 
         embedding_params, attn_params = self._calc_attn_embed_params()
 
         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj
 
         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params
 
-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9
 
     @property
     def kv_cache_memory(self) -> float:
@@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads
 
         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
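
Compared with the MHA estimate earlier in this file, swapping num_attention_heads for num_key_value_heads shrinks the KV cache by the grouping factor. Illustrative numbers (same 7B-class shape as before, now with 8 KV heads; values assumed, not from the commit):

# Illustrative GQA KV-cache estimate: 8 KV heads instead of 32 attention heads.
batch_size, num_hidden_layers = 1, 32
num_key_value_heads = 8
seq_len, head_dim = 4096, 128
kv_cache_dtype_bytes = 2  # float16

total_bytes = (
    batch_size * num_hidden_layers * 2 * num_key_value_heads
    * seq_len * head_dim * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~0.54 GB, 4x smaller than the MHA estimate above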
@@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple:
         """
         Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
 
         # Embedding parameters
         # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = (
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
+        )
 
-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        v_proj = llm_config.hidden_size * (
+            llm_config.num_key_value_heads * llm_config.head_dim
+        )
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
         attn_params = q_proj + k_proj + v_proj + o_proj
 
         return embedding_params, attn_params
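
Combining _calc_attn_embed_params with the Llama-style MLP terms in model_memory reproduces a familiar total. A worked example with Llama-2-7B-like shapes (the values are illustrative approximations, not read from this commit):

# Illustrative Llama-2-7B-like estimate: 32 layers, hidden=4096,
# intermediate=11008, 32 KV heads, head_dim=128, vocab=32000, untied embeddings.
hidden_size, intermediate_size = 4096, 11008
num_hidden_layers, num_key_value_heads, head_dim = 32, 32, 128
vocab_size, bytes_per_parameter = 32000, 2  # float16

embedding_params = 2 * vocab_size * hidden_size
q_proj = o_proj = hidden_size * hidden_size
k_proj = v_proj = hidden_size * (num_key_value_heads * head_dim)
attn_params = q_proj + k_proj + v_proj + o_proj
mlp_params = 3 * hidden_size * intermediate_size        # gate + up + down
num_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params

print(num_params / 1e9)                                 # ~6.74B parameters
print(num_params * bytes_per_parameter / 1e9)           # ~13.5 GB at float16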
@@ -342,21 +357,24 @@ def model_memory(self) -> float:
 
         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()
 
         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts
+            * 3
+            * llm_config.hidden_size
+            * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )
 
         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9
 
 
 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
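
For the MoE estimator, the expert MLPs dominate the count; a Mixtral-8x7B-like configuration is a handy sanity check (values illustrative, not taken from this commit):

# Illustrative Mixtral-8x7B-like estimate: 8 experts, 32 layers, hidden=4096,
# intermediate=14336, 8 KV heads, head_dim=128, vocab=32000, untied embeddings.
hidden_size, intermediate_size, num_hidden_layers = 4096, 14336, 32
num_local_experts, num_key_value_heads, head_dim = 8, 8, 128
vocab_size, bytes_per_parameter = 32000, 2  # float16

embedding_params = 2 * vocab_size * hidden_size
attn_params = (
    2 * hidden_size * hidden_size                        # q_proj + o_proj
    + 2 * hidden_size * num_key_value_heads * head_dim   # k_proj + v_proj
)
moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
total_params = (
    num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
)
print(total_params / 1e9)                                # ~46.7B parameters
print(total_params * bytes_per_parameter / 1e9)          # ~93.4 GB at float16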
