@@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float:
         Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation.
         """
         seq_len = self.seq_len or self.llm_config.max_seq_len
-        c = self.llm_config
+        llm_config = self.llm_config
         kv_cache_dtype_bytes = QUANT_MAPPING.get(
-            c.weight_dtype, 2
+            llm_config.weight_dtype, 2
         )  # vLLM uses model's weight applied to KV cache

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
-            * c.num_attention_heads
+            * llm_config.num_attention_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
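
For a sense of scale, here is a minimal sketch of the same KV-cache arithmetic evaluated by hand. The 7B-class numbers below (32 layers, 32 heads, head_dim 128, fp16, 4k context) are illustrative assumptions, not values taken from this diff.

```python
# Hypothetical 7B-class config; all values are assumptions for illustration.
batch_size = 1
num_hidden_layers = 32
num_attention_heads = 32   # no GQA: every attention head keeps its own K and V
head_dim = 128
seq_len = 4096
kv_cache_dtype_bytes = 2   # fp16 / bf16

# The factor of 2 accounts for storing both K and V per head per token.
total_bytes = (
    batch_size
    * num_hidden_layers
    * 2
    * num_attention_heads
    * seq_len
    * head_dim
    * kv_cache_dtype_bytes
)
print(total_bytes / 1e9)  # ~2.15 GB for a single 4k-token sequence
```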
@@ -69,15 +69,15 @@ def model_memory(self) -> float:

         Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible.
         """
-        c = self.llm_config
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
+        llm_config = self.llm_config
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
         embedding_params = (
-            embedding_count * c.vocab_size * c.hidden_size
+            embedding_count * llm_config.vocab_size * llm_config.hidden_size
         )  # input and output untied
-        layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2)  # GPT-style
+        layer_params = 12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2)  # GPT-style
         num_params = layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def total_memory(self) -> float:
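
The 12 * num_hidden_layers * hidden_size**2 term is the usual GPT-style per-layer estimate: roughly 4*h^2 for the Q, K, V, and O projections plus 8*h^2 for an MLP with a 4*h intermediate size. A short sketch with assumed GPT-2 XL-like numbers shows how close it lands:

```python
# Assumed GPT-2 XL-like config, for illustration only.
hidden_size = 1600
num_hidden_layers = 48
vocab_size = 50257
bytes_per_parameter = 2  # fp16

# Per layer: ~4*h^2 attention (Q, K, V, O) + ~8*h^2 MLP (4*h intermediate, up + down).
layer_params = 12 * num_hidden_layers * hidden_size**2  # ~1.47B
embedding_params = 1 * vocab_size * hidden_size         # tied embeddings, ~80M

num_params = layer_params + embedding_params    # ~1.56B, close to GPT-2 XL's 1.5B
print(num_params * bytes_per_parameter / 1e9)   # ~3.1 GB of weights in fp16
```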
@@ -120,19 +120,19 @@ def construct_deployment_params(self) -> str:
         -------
         str: Parameter string for model deployment.
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         params = []
-        if self.seq_len < c.max_seq_len:
+        if self.seq_len < llm_config.max_seq_len:
             params.append(VLLM_PARAMS["max_model_len"])
             params.append(str(self.seq_len))

         # Only suggest in-flight quantization for unquantized models when such quantization is requested
-        if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
+        if not llm_config.quantization and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION:
             # vLLM only supports 4bit in-flight quantization
             params.append(VLLM_PARAMS["in_flight_quant"])

         # add trust-remote-code if custom modules are specified
-        if c.trust_remote_code:
+        if llm_config.trust_remote_code:
             params.append(VLLM_PARAMS["trust_remote_code"])

         params = " ".join(params) if params else ""
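
The VLLM_PARAMS mapping itself does not appear in this diff; assuming it resolves to the usual vLLM server flags, the assembled string would look roughly like the sketch below. The flag spellings are my assumption, not something this change confirms.

```python
# Assumed stand-in for the real VLLM_PARAMS mapping defined elsewhere in the repo.
VLLM_PARAMS = {
    "max_model_len": "--max-model-len",
    "in_flight_quant": "--quantization bitsandbytes",  # 4-bit in-flight quantization
    "trust_remote_code": "--trust-remote-code",
}

params = []
seq_len, max_seq_len = 4096, 8192
if seq_len < max_seq_len:
    params.append(VLLM_PARAMS["max_model_len"])
    params.append(str(seq_len))
params.append(VLLM_PARAMS["trust_remote_code"])

print(" ".join(params))  # --max-model-len 4096 --trust-remote-code
```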
@@ -158,12 +158,12 @@ def suggest_param_advice(self, allowed: float) -> str:
         wt_gb = self.model_memory
         batch_size = self.batch_size
         seq_len = self.seq_len
-        weight_size = getattr(self.llm_config, "weight_dtype", "unknown")
+        weight_size = self.llm_config.weight_dtype
         config = self.llm_config

         suggested_quant_msg = None
         quant_advice = ", ".join(config.suggested_quantizations)
-        quantization = getattr(config, "quantization", None)
+        quantization = config.quantization

         advice = []

@@ -272,22 +272,22 @@ def model_memory(self) -> float:
         Returns estimated model parameter memory (in GB), accurately accounting
         for Llama-style attention and MLP, and tied or untied embeddings.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         embedding_params, attn_params = self._calc_attn_embed_params()

         # MLP params
-        gate_proj = c.hidden_size * c.intermediate_size
-        up_proj = c.hidden_size * c.intermediate_size
-        down_proj = c.intermediate_size * c.hidden_size
+        gate_proj = llm_config.hidden_size * llm_config.intermediate_size
+        up_proj = llm_config.hidden_size * llm_config.intermediate_size
+        down_proj = llm_config.intermediate_size * llm_config.hidden_size
         mlp_params = gate_proj + up_proj + down_proj

         # Total per-layer
         layer_params = attn_params + mlp_params
         # Total params
-        num_params = c.num_hidden_layers * layer_params + embedding_params
+        num_params = llm_config.num_hidden_layers * layer_params + embedding_params

-        return num_params * c.bytes_per_parameter / 1e9
+        return num_params * llm_config.bytes_per_parameter / 1e9

     @property
     def kv_cache_memory(self) -> float:
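
Plugging a Llama-2-7B-like config into this estimator (assumed values: hidden 4096, intermediate 11008, 32 layers, 32 KV heads, head_dim 128, vocab 32000, untied embeddings) lands within about one percent of the model's actual 6.7B parameters:

```python
# Llama-2-7B-like config; values are assumptions, not read from this repo.
hidden_size, intermediate_size = 4096, 11008
num_hidden_layers, num_key_value_heads, head_dim = 32, 32, 128
vocab_size, bytes_per_parameter = 32000, 2  # fp16

# Attention: Q and O are h*h; K and V scale with the KV head count (here 32, i.e. no GQA).
attn_params = (
    hidden_size * hidden_size                          # q_proj
    + hidden_size * (num_key_value_heads * head_dim)   # k_proj
    + hidden_size * (num_key_value_heads * head_dim)   # v_proj
    + hidden_size * hidden_size                        # o_proj
)
mlp_params = 3 * hidden_size * intermediate_size        # gate, up, down projections
embedding_params = 2 * vocab_size * hidden_size         # untied input/output embeddings

num_params = num_hidden_layers * (attn_params + mlp_params) + embedding_params
print(num_params / 1e9)                        # ~6.74B parameters
print(num_params * bytes_per_parameter / 1e9)  # ~13.5 GB of weights in fp16
```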
@@ -297,18 +297,18 @@ def kv_cache_memory(self) -> float:
         Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection.
         num_key_value_heads < num_attention_heads, which reduces the KV Cache size.
         """
-        c = self.llm_config
-        seq_len = self.seq_len or getattr(c, "max_seq_len", 2048)
-        kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2)
-        kv_heads = c.num_key_value_heads
+        llm_config = self.llm_config
+        seq_len = self.seq_len or llm_config.max_seq_len
+        kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2)
+        kv_heads = llm_config.num_key_value_heads

         total_bytes = (
             self.batch_size
-            * c.num_hidden_layers
+            * llm_config.num_hidden_layers
             * 2
             * kv_heads
             * seq_len
-            * c.head_dim
+            * llm_config.head_dim
             * kv_cache_dtype_bytes
         )
         return total_bytes / 1e9
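
The only difference from the non-GQA formula is that num_key_value_heads replaces num_attention_heads, so the cache shrinks by the ratio of query heads to KV heads. A quick sketch with assumed Llama-3-8B-like numbers (32 layers, 32 query heads, 8 KV heads, head_dim 128, fp16, 8k context):

```python
# Assumed Llama-3-8B-like values, for illustration only.
batch_size, num_hidden_layers, head_dim = 1, 32, 128
num_attention_heads, num_key_value_heads = 32, 8
seq_len, kv_cache_dtype_bytes = 8192, 2  # fp16

def kv_cache_gb(kv_heads: int) -> float:
    """KV cache size in GB for a given number of K/V heads."""
    total_bytes = (
        batch_size * num_hidden_layers * 2 * kv_heads
        * seq_len * head_dim * kv_cache_dtype_bytes
    )
    return total_bytes / 1e9

print(kv_cache_gb(num_attention_heads))  # ~4.29 GB without GQA
print(kv_cache_gb(num_key_value_heads))  # ~1.07 GB with 8 KV heads, a 4x reduction
```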
@@ -317,17 +317,17 @@ def _calc_attn_embed_params(self) -> tuple:
         """
         Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models.
         """
-        c = self.llm_config
+        llm_config = self.llm_config

         # Embedding parameters
         # assume tied embeddings unless tie_word_embeddings = False
-        embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2
-        embedding_params = embedding_count * c.vocab_size * c.hidden_size
+        embedding_count = 1 if llm_config.tie_word_embeddings else 2
+        embedding_params = embedding_count * llm_config.vocab_size * llm_config.hidden_size

-        q_proj = c.hidden_size * c.hidden_size
-        k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim)
-        o_proj = c.hidden_size * c.hidden_size
+        q_proj = llm_config.hidden_size * llm_config.hidden_size
+        k_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        v_proj = llm_config.hidden_size * (llm_config.num_key_value_heads * llm_config.head_dim)
+        o_proj = llm_config.hidden_size * llm_config.hidden_size
         attn_params = q_proj + k_proj + v_proj + o_proj

         return embedding_params, attn_params
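
The same KV-head ratio also shrinks the attention weights themselves: q_proj and o_proj stay at hidden_size squared, while k_proj and v_proj scale with num_key_value_heads * head_dim. A tiny check with assumed shapes (hidden 4096, 8 KV heads, head_dim 128):

```python
# Assumed GQA shapes, for illustration.
hidden_size, head_dim = 4096, 128
num_key_value_heads = 8

q_proj = hidden_size * hidden_size                        # ~16.8M
k_proj = hidden_size * (num_key_value_heads * head_dim)   # ~4.2M (vs ~16.8M without GQA)
v_proj = hidden_size * (num_key_value_heads * head_dim)   # ~4.2M
o_proj = hidden_size * hidden_size                        # ~16.8M

print((q_proj + k_proj + v_proj + o_proj) / 1e6)  # ~41.9M vs ~67.1M for full MHA
```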
@@ -346,21 +346,21 @@ def model_memory(self) -> float:

         Returns the estimated memory size of the MoE Model (in GB).
         """
-        c = self.llm_config
+        llm_config = self.llm_config
         # Attention parameter count (Llama-style)
         embedding_params, attn_params = self._calc_attn_embed_params()

         # MoE MLP params per layer
         moe_params_per_layer = (
-            c.num_local_experts * 3 * c.hidden_size * c.intermediate_size
+            llm_config.num_local_experts * 3 * llm_config.hidden_size * llm_config.intermediate_size
         )
         total_params = (
-            c.num_hidden_layers * (attn_params + moe_params_per_layer)
+            llm_config.num_hidden_layers * (attn_params + moe_params_per_layer)
             + embedding_params
         )

         # Convert to GB
-        return total_params * c.bytes_per_parameter / 1e9
+        return total_params * llm_config.bytes_per_parameter / 1e9


 def get_estimator(llm_config, **kwargs) -> MemoryEstimator:
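
As a sanity check on the MoE arithmetic, a Mixtral-8x7B-like config (assumed values: 8 experts, hidden 4096, intermediate 14336, 32 layers, 8 KV heads, head_dim 128, vocab 32000, untied embeddings) lands close to that model's roughly 46.7B total parameters. The estimate covers the full weight footprint, since every expert must be resident even though only a few are active per token; the tiny per-layer router weights are ignored.

```python
# Mixtral-8x7B-like config; values are assumptions, for illustration only.
hidden_size, intermediate_size = 4096, 14336
num_hidden_layers, num_local_experts = 32, 8
num_key_value_heads, head_dim = 8, 128
vocab_size, bytes_per_parameter = 32000, 2  # bf16

attn_params = (
    2 * hidden_size * hidden_size                         # q_proj + o_proj
    + 2 * hidden_size * (num_key_value_heads * head_dim)  # k_proj + v_proj
)
moe_params_per_layer = num_local_experts * 3 * hidden_size * intermediate_size
embedding_params = 2 * vocab_size * hidden_size

total_params = (
    num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params
)
print(total_params / 1e9)                        # ~46.7B parameters
print(total_params * bytes_per_parameter / 1e9)  # ~93 GB of weights in bf16
```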