diff --git a/ads/aqua/modeldeployment/deployment.py b/ads/aqua/modeldeployment/deployment.py
index 2710534bb..e6ba00a93 100644
--- a/ads/aqua/modeldeployment/deployment.py
+++ b/ads/aqua/modeldeployment/deployment.py
@@ -1288,35 +1288,42 @@ def validate_deployment_params(
     def recommend_shape(self, **kwargs) -> Union[Table, ShapeRecommendationReport]:
         """
-        For the CLI (set generate_table = True), generates the table (in rich diff) with valid
+        For the CLI (generate_table = True, the default), generates the table (in rich diff) with valid
         GPU deployment shapes for the provided model and configuration.
 
         For the API (set generate_table = False), generates the JSON with valid GPU deployment shapes
         for the provided model and configuration.
 
-        Validates if recommendations are generated, calls method to construct the rich diff
-        table with the recommendation data.
+        Validates the input and determines whether recommendations are available.
 
         Parameters
         ----------
-        model_ocid : str
-            OCID of the model to recommend feasible compute shapes.
+        **kwargs
+            model_id : str
+                (Required) The OCID of the model to recommend feasible compute shapes for.
+            generate_table : bool, optional
+                If True, generate and return a rich-diff table; if False, return a JSON response (defaults to True).
+            compartment_id : str, optional
+                The OCID of the user's compartment to use for the recommendation.
 
         Returns
        -------
         Table (generate_table = True)
-            A table format for the recommendation report with compatible deployment shapes
-            or troubleshooting info citing the largest shapes if no shape is suitable.
+            If `generate_table` is True, a table displaying the recommendation report with compatible deployment shapes,
+            or troubleshooting info if no shape is suitable.
         ShapeRecommendationReport (generate_table = False)
-            A recommendation report with compatible deployment shapes, or troubleshooting info
-            citing the largest shapes if no shape is suitable.
+            If `generate_table` is False, a structured recommendation report with compatible deployment shapes,
+            or troubleshooting info citing the largest shapes if no shape is suitable.
 
         Raises
         ------
         AquaValueError
-            If model type is unsupported by tool (no recommendation report generated)
+            If the model type is unsupported and no recommendation report can be generated.
""" + deployment_config = self.get_deployment_config(model_id=kwargs.get("model_id")) + kwargs["deployment_config"] = deployment_config + try: request = RequestRecommend(**kwargs) except ValidationError as e: diff --git a/ads/aqua/shaperecommend/constants.py b/ads/aqua/shaperecommend/constants.py index 08f2f2133..608b5b1ea 100644 --- a/ads/aqua/shaperecommend/constants.py +++ b/ads/aqua/shaperecommend/constants.py @@ -38,6 +38,14 @@ "4bit": ["No smaller quantization available"], } +RUNTIME_WEIGHTS = { + "use_bfloat16": "bfloat16", + "use_fp16": "float16", + "use_fp32": "float32", + "use_int8": "int8", + "use_int4": "int4", + "use_bfloat32": "bfloat32", +} TEXT_GENERATION = "text_generation" SAFETENSORS = "safetensors" @@ -78,14 +86,23 @@ IN_FLIGHT_QUANTIZATION = {"4bit"} # vLLM only supports 4bit in-flight-quantization +VLLM_PARAMS_FAMILY = "VLLM_PARAMS" +VLLM_ENV = "VLLM" + +QUANT_FLAG = "--quantization" +WEIGHT_DTYPE_FLAG = "--dtype" +MAX_MODEL_LEN_FLAG = "--max-model-len" + TROUBLESHOOT_MSG = "The selected model is too large to fit on standard GPU shapes with the current configuration.\nAs troubleshooting, we have suggested the two largest available GPU shapes using the smallest quantization level ('4bit') to maximize chances of fitting the model. " VLLM_PARAMS = { "max_model_len": "--max-model-len", "in_flight_quant": "--quantization bitsandbytes --load-format bitsandbytes", + "trust_remote_code": "--trust-remote-code", } DEFAULT_WEIGHT_SIZE = "float32" +DEFAULT_MAX_SEQ_LEN = 4096 BITS_AND_BYTES_8BIT = "8bit" BITS_AND_BYTES_4BIT = "4bit" diff --git a/ads/aqua/shaperecommend/estimator.py b/ads/aqua/shaperecommend/estimator.py index 3c26c498d..4975a56b6 100644 --- a/ads/aqua/shaperecommend/estimator.py +++ b/ads/aqua/shaperecommend/estimator.py @@ -46,18 +46,18 @@ def kv_cache_memory(self) -> float: Uses num_attention_heads (assumes no GQA, each attention head has its own query, key, value) for estimation. """ seq_len = self.seq_len or self.llm_config.max_seq_len - c = self.llm_config + llm_config = self.llm_config kv_cache_dtype_bytes = QUANT_MAPPING.get( - c.weight_dtype, 2 + llm_config.weight_dtype, 2 ) # vLLM uses model's weight applied to KV cache total_bytes = ( self.batch_size - * c.num_hidden_layers + * llm_config.num_hidden_layers * 2 - * c.num_attention_heads + * llm_config.num_attention_heads * seq_len - * c.head_dim + * llm_config.head_dim * kv_cache_dtype_bytes ) return total_bytes / 1e9 @@ -69,15 +69,17 @@ def model_memory(self) -> float: Model Parameter estimation: Standard decoder-only, untied/tied embeddings possible. """ - c = self.llm_config - embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2 + llm_config = self.llm_config + embedding_count = 1 if llm_config.tie_word_embeddings else 2 embedding_params = ( - embedding_count * c.vocab_size * c.hidden_size + embedding_count * llm_config.vocab_size * llm_config.hidden_size ) # input and output untied - layer_params = 12 * c.num_hidden_layers * (c.hidden_size**2) # GPT-style + layer_params = ( + 12 * llm_config.num_hidden_layers * (llm_config.hidden_size**2) + ) # GPT-style num_params = layer_params + embedding_params - return num_params * c.bytes_per_parameter / 1e9 + return num_params * llm_config.bytes_per_parameter / 1e9 @property def total_memory(self) -> float: @@ -120,17 +122,24 @@ def construct_deployment_params(self) -> str: ------- str: Parameter string for model deployment. 
""" - c = self.llm_config + llm_config = self.llm_config params = [] - if self.seq_len < c.max_seq_len: + if self.seq_len < llm_config.max_seq_len: params.append(VLLM_PARAMS["max_model_len"]) params.append(str(self.seq_len)) # Only suggest in-flight quantization for unquantized models when such quantization is requested - if not c.quantization and c.in_flight_quantization in IN_FLIGHT_QUANTIZATION: + if ( + not llm_config.quantization + and llm_config.in_flight_quantization in IN_FLIGHT_QUANTIZATION + ): # vLLM only supports 4bit in-flight quantization params.append(VLLM_PARAMS["in_flight_quant"]) + # add trust-remote-code if custom modules are specified + if llm_config.trust_remote_code: + params.append(VLLM_PARAMS["trust_remote_code"]) + params = " ".join(params) if params else "" return params @@ -154,12 +163,12 @@ def suggest_param_advice(self, allowed: float) -> str: wt_gb = self.model_memory batch_size = self.batch_size seq_len = self.seq_len - weight_size = getattr(self.llm_config, "weight_dtype", "unknown") + weight_size = self.llm_config.weight_dtype config = self.llm_config suggested_quant_msg = None quant_advice = ", ".join(config.suggested_quantizations) - quantization = getattr(config, "quantization", None) + quantization = config.quantization advice = [] @@ -246,7 +255,7 @@ def limiting_factor( ) else: advice = ( - f"No override PARAMS needed. \n\nModel fits well within the allowed compute shape " + f"Model fits well within the allowed compute shape " f"({required:.1f}GB used / {allowed_gpu_memory:.1f}GB allowed)." ) return advice @@ -268,22 +277,22 @@ def model_memory(self) -> float: Returns estimated model parameter memory (in GB), accurately accounting for Llama-style attention and MLP, and tied or untied embeddings. """ - c = self.llm_config + llm_config = self.llm_config embedding_params, attn_params = self._calc_attn_embed_params() # MLP params - gate_proj = c.hidden_size * c.intermediate_size - up_proj = c.hidden_size * c.intermediate_size - down_proj = c.intermediate_size * c.hidden_size + gate_proj = llm_config.hidden_size * llm_config.intermediate_size + up_proj = llm_config.hidden_size * llm_config.intermediate_size + down_proj = llm_config.intermediate_size * llm_config.hidden_size mlp_params = gate_proj + up_proj + down_proj # Total per-layer layer_params = attn_params + mlp_params # Total params - num_params = c.num_hidden_layers * layer_params + embedding_params + num_params = llm_config.num_hidden_layers * layer_params + embedding_params - return num_params * c.bytes_per_parameter / 1e9 + return num_params * llm_config.bytes_per_parameter / 1e9 @property def kv_cache_memory(self) -> float: @@ -293,18 +302,18 @@ def kv_cache_memory(self) -> float: Grouped Query Attention uses num_key_value_heads, which groups of Q heads share a K and V projection. num_key_value_heads < num_attention_heads, which reduces the KV Cache size. 
""" - c = self.llm_config - seq_len = self.seq_len or getattr(c, "max_seq_len", 2048) - kv_cache_dtype_bytes = QUANT_MAPPING.get(c.weight_dtype, 2) - kv_heads = c.num_key_value_heads + llm_config = self.llm_config + seq_len = self.seq_len or llm_config.max_seq_len + kv_cache_dtype_bytes = QUANT_MAPPING.get(llm_config.weight_dtype, 2) + kv_heads = llm_config.num_key_value_heads total_bytes = ( self.batch_size - * c.num_hidden_layers + * llm_config.num_hidden_layers * 2 * kv_heads * seq_len - * c.head_dim + * llm_config.head_dim * kv_cache_dtype_bytes ) return total_bytes / 1e9 @@ -313,17 +322,23 @@ def _calc_attn_embed_params(self) -> tuple: """ Returns the embedding parameter count and attention parameter count for Llama-family (GQA) models. """ - c = self.llm_config + llm_config = self.llm_config # Embedding parameters # assume tied embeddings unless tie_word_embeddings = False - embedding_count = 1 if getattr(c, "tie_word_embeddings", True) else 2 - embedding_params = embedding_count * c.vocab_size * c.hidden_size + embedding_count = 1 if llm_config.tie_word_embeddings else 2 + embedding_params = ( + embedding_count * llm_config.vocab_size * llm_config.hidden_size + ) - q_proj = c.hidden_size * c.hidden_size - k_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim) - v_proj = c.hidden_size * (c.num_key_value_heads * c.head_dim) - o_proj = c.hidden_size * c.hidden_size + q_proj = llm_config.hidden_size * llm_config.hidden_size + k_proj = llm_config.hidden_size * ( + llm_config.num_key_value_heads * llm_config.head_dim + ) + v_proj = llm_config.hidden_size * ( + llm_config.num_key_value_heads * llm_config.head_dim + ) + o_proj = llm_config.hidden_size * llm_config.hidden_size attn_params = q_proj + k_proj + v_proj + o_proj return embedding_params, attn_params @@ -342,21 +357,24 @@ def model_memory(self) -> float: Returns the estimated memory size of the MoE Model (in GB). """ - c = self.llm_config + llm_config = self.llm_config # Attention parameter count (Llama-style) embedding_params, attn_params = self._calc_attn_embed_params() # MoE MLP params per layer moe_params_per_layer = ( - c.num_local_experts * 3 * c.hidden_size * c.intermediate_size + llm_config.num_local_experts + * 3 + * llm_config.hidden_size + * llm_config.intermediate_size ) total_params = ( - c.num_hidden_layers * (attn_params + moe_params_per_layer) + llm_config.num_hidden_layers * (attn_params + moe_params_per_layer) + embedding_params ) # Convert to GB - return total_params * c.bytes_per_parameter / 1e9 + return total_params * llm_config.bytes_per_parameter / 1e9 def get_estimator(llm_config, **kwargs) -> MemoryEstimator: diff --git a/ads/aqua/shaperecommend/llm_config.py b/ads/aqua/shaperecommend/llm_config.py index 8116b91c4..9c5c00f13 100644 --- a/ads/aqua/shaperecommend/llm_config.py +++ b/ads/aqua/shaperecommend/llm_config.py @@ -11,19 +11,17 @@ from ads.aqua.shaperecommend.constants import ( BITS_AND_BYTES_4BIT, BITS_AND_BYTES_8BIT, + DEFAULT_MAX_SEQ_LEN, DEFAULT_WEIGHT_SIZE, NEXT_QUANT, QUANT_MAPPING, QUANT_METHODS, + RUNTIME_WEIGHTS, ) +from ads.common.utils import parse_bool -class LLMConfig(BaseModel): - """ - Standardized configuration object for evaluating the size of Large Language Models (LLMs) - based on their architecture and quantization. 
- """ - +class GeneralConfig(BaseModel): num_hidden_layers: int = Field( ..., description="Number of transformer blocks (layers) in the model’s neural network stack.", @@ -31,23 +29,6 @@ class LLMConfig(BaseModel): hidden_size: int = Field( ..., description="Embedding dimension or hidden size of each layer." ) - vocab_size: int = Field(..., description="Vocabulary size for input/output tokens.") - num_attention_heads: int = Field( - ..., - description="Number of attention heads (used for queries and to determine head_dim).", - ) - - head_dim: int = Field( - ..., - description="Dimension of each attention head. Typically hidden_size // num_attention_heads.", - ) - max_seq_len: Optional[int] = Field( - 4096, description="Maximum input sequence length (context window)." - ) - weight_dtype: Optional[str] = Field( - DEFAULT_WEIGHT_SIZE, - description="Parameter data type: 'float32', 'float16', etc.", - ) quantization: Optional[str] = Field( None, description="Quantization weight (e.g., '8bit', '4bit') or None if unquantized.", @@ -56,25 +37,36 @@ class LLMConfig(BaseModel): None, description="Quantization method (e.g., '8bit', '4bit', 'gptq', 'awq') or None if unquantized.", ) - in_flight_quantization: Optional[str] = Field( None, description="By setting this, enables recalculation of model footprint using 4bit in-flight quantization", ) - - num_key_value_heads: Optional[int] = Field( - None, - description="Number of key/value heads (for GQA architectures: Llama, Mistral, Falcon, Qwen, etc.). Used to determine KV cache size", - ) - - num_local_experts: Optional[int] = Field( - None, description="For MoE architectures, the number of experts per MoE layer" - ) - intermediate_size: Optional[int] = Field( - None, description="For MoE architectures, size of the MLP activation layer." + weight_dtype: Optional[str] = Field( + DEFAULT_WEIGHT_SIZE, + description="Parameter data type: 'float32', 'float16', etc.", ) - tie_word_embeddings: Optional[bool] = Field(None) + @classmethod + def get_weight_dtype(cls, raw: dict) -> str: + # some configs use a different weight dtype at runtime + # for runtime weight keys, see RUNTIME_WEIGHTS + runtime_flags = False + for flag, dtype in RUNTIME_WEIGHTS.items(): + value = raw.get(flag) + # only permit use_bfloat16 : true + if value is True or (isinstance(value, str) and value.lower() == "true"): + return dtype + if value is False or (isinstance(value, str) and value.lower() == "false"): + runtime_flags = True + + # Fallback to torch_dtype if present & no runtime weight dtype + if not runtime_flags: + torch_dtype = raw.get("torch_dtype") + if torch_dtype: + return str(torch_dtype).lower() + + # if runtime flag present (ex. use_bfloat16: false) or torch_dtype not present + return DEFAULT_WEIGHT_SIZE @property def bytes_per_parameter(self) -> float: @@ -148,6 +140,136 @@ def suggested_quantizations(self): ).lower() return NEXT_QUANT.get(key, []) + +class VisionConfig(GeneralConfig): + """ + For transformer-based vision encoder models (part of the image-text-to-text task models), + parses the module responsible for the vision model. + """ + + mlp_dim: int = Field( + None, + description="Size of the MLP/feedforward sub-block in each transformer layer.", + ) + patch_size: int = ( + Field( + None, + description="Image is divided into (patch_size x patch_size) pixel squares.", + ), + ) + num_hidden_layers: int = (Field(...),) + hidden_size: int = Field(...) 
+class VisionConfig(GeneralConfig):
+    """
+    For transformer-based vision encoders (the vision half of image-text-to-text models),
+    parses the config section that describes the vision model.
+    """
+
+    mlp_dim: Optional[int] = Field(
+        None,
+        description="Size of the MLP/feedforward sub-block in each transformer layer.",
+    )
+    patch_size: Optional[int] = Field(
+        None,
+        description="Image is divided into (patch_size x patch_size) pixel squares.",
+    )
+    num_hidden_layers: int = Field(...)
+    hidden_size: int = Field(...)
+    image_size: Optional[int] = Field(
+        None,
+        description="Input image resolution, affects memory consumption in KV cache.",
+    )
+    num_attention_heads: Optional[int] = Field(
+        None,
+        description="Number of attention heads, impacts the size of attention parameters (model size).",
+    )
+
+    @classmethod
+    def from_raw_config(cls, vision_section: dict) -> "VisionConfig":
+        num_layers = (
+            vision_section.get("num_layers")
+            or vision_section.get("vision_layers")
+            or vision_section.get("num_hidden_layers")
+            or vision_section.get("n_layer")
+        )
+
+        hidden_size = vision_section.get("hidden_size") or vision_section.get(
+            "embed_dim"
+        )
+
+        mlp_dim = vision_section.get("mlp_dim") or vision_section.get(
+            "intermediate_size"
+        )
+
+        num_attention_heads = (
+            vision_section.get("num_attention_heads")
+            or vision_section.get("vision_num_attention_heads")
+            or vision_section.get("n_head")
+        )
+
+        image_size = vision_section.get("image_size") or vision_section.get(
+            "image_resolution"
+        )
+
+        patch_size = vision_section.get("patch_size")
+        weight_dtype = str(cls.get_weight_dtype(vision_section))
+
+        return cls(
+            num_hidden_layers=int(num_layers),
+            hidden_size=int(hidden_size),
+            mlp_dim=int(mlp_dim),
+            patch_size=int(patch_size),
+            num_attention_heads=int(num_attention_heads)
+            if num_attention_heads
+            else None,
+            weight_dtype=weight_dtype,
+            image_size=int(image_size) if image_size else None,
+        )
+
+
+class LLMConfig(GeneralConfig):
+    """
+    Standardized configuration object for evaluating the size of Large Language Models (LLMs)
+    based on their architecture and quantization.
+    """
+
+    vocab_size: int = Field(..., description="Vocabulary size for input/output tokens.")
+    num_attention_heads: int = Field(
+        ...,
+        description="Number of attention heads (used for queries and to determine head_dim).",
+    )
+    num_hidden_layers: int = Field(...)
+    hidden_size: int = Field(...)
+
+    head_dim: int = Field(
+        ...,
+        description="Dimension of each attention head. Typically hidden_size // num_attention_heads.",
+    )
+    max_seq_len: Optional[int] = Field(
+        DEFAULT_MAX_SEQ_LEN,
+        description="Maximum input sequence length (context window).",
+    )
+    weight_dtype: Optional[str] = Field(
+        DEFAULT_WEIGHT_SIZE,
+        description="Parameter data type: 'float32', 'float16', etc.",
+    )
+    quantization: Optional[str] = Field(
+        None,
+        description="Quantization weight (e.g., '8bit', '4bit') or None if unquantized.",
+    )
+    quantization_type: Optional[str] = Field(
+        None,
+        description="Quantization method (e.g., '8bit', '4bit', 'gptq', 'awq') or None if unquantized.",
+    )
+
+    num_key_value_heads: Optional[int] = Field(
+        None,
+        description="Number of key/value heads (for GQA architectures: Llama, Mistral, Falcon, Qwen, etc.). Used to determine KV cache size",
+    )
+
+    num_local_experts: Optional[int] = Field(
+        None, description="For MoE architectures, the number of experts per MoE layer"
+    )
+    intermediate_size: Optional[int] = Field(
+        None, description="For MoE architectures, size of the MLP activation layer."
+    )
+
+    tie_word_embeddings: Optional[bool] = Field(
+        True,
+        description="If True, input and output embedding matrices share the same parameters in memory.",
+    )
+
+    trust_remote_code: Optional[bool] = Field(
+        False, description="If True, the model requires custom code to operate."
+    )
+
     def calculate_possible_seq_len(self, min_len=2048):
         """
         Calculates a list of possible sequence lengths (in tokens).
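A hypothetical CLIP-style vision section run through VisionConfig.from_raw_config, showing
the fallback field names handled above (a sketch, not part of this patch):

    vision_section = {
        "num_hidden_layers": 24, "hidden_size": 1024, "intermediate_size": 4096,
        "num_attention_heads": 16, "patch_size": 14, "image_size": 336,
        "torch_dtype": "float16",
    }
    vc = VisionConfig.from_raw_config(vision_section)
    # vc.mlp_dim == 4096 (falls back to "intermediate_size"); vc.weight_dtype == "float16"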
@@ -219,9 +341,10 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
         num_hidden_layers = (
             raw.get("num_hidden_layers") or raw.get("n_layer") or raw.get("num_layers")
         )
+        weight_dtype = cls.get_weight_dtype(raw)
+
         hidden_size = raw.get("hidden_size") or raw.get("n_embd") or raw.get("d_model")
         vocab_size = raw.get("vocab_size")
-        weight_dtype = str(raw.get("torch_dtype", DEFAULT_WEIGHT_SIZE))
         quantization = cls.detect_quantization_bits(raw)
         quantization_type = cls.detect_quantization_type(raw)
 
@@ -257,15 +380,12 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             "intermediate_size"
         )
 
-        # Type safety: minimal assertion
-        if None in [
-            num_hidden_layers,
-            hidden_size,
-            vocab_size,
-            num_attention_heads,
-            head_dim,
-        ]:
-            raise ValueError("Missing required value in model config.")
+        raw_tie_word_embeddings = raw.get("tie_word_embeddings", True)
+        tie_word_embeddings = parse_bool(raw_tie_word_embeddings)
+
+        trust_remote_code = (
+            "auto_map" in raw
+        )  # trust-remote-code is always needed when this key is present
 
         return cls(
             num_hidden_layers=int(num_hidden_layers),
@@ -280,4 +400,115 @@ def from_raw_config(cls, raw: dict) -> "LLMConfig":
             max_seq_len=int(max_seq_len),
             num_local_experts=num_local_experts,
             intermediate_size=intermediate_size,
+            tie_word_embeddings=tie_word_embeddings,
+            trust_remote_code=trust_remote_code,
         )
+
+
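A hypothetical flat Llama-style config.json run through LLMConfig.from_raw_config
(a sketch, not part of this patch; assumes the section carries the usual fallback fields):

    raw = {
        "num_hidden_layers": 32, "hidden_size": 4096, "vocab_size": 32000,
        "num_attention_heads": 32, "num_key_value_heads": 8,
        "torch_dtype": "bfloat16", "tie_word_embeddings": False,
        "auto_map": {"AutoModelForCausalLM": "modeling_custom.MyModel"},
    }
    cfg = LLMConfig.from_raw_config(raw)
    # cfg.weight_dtype == "bfloat16"; cfg.tie_word_embeddings is False;
    # cfg.trust_remote_code is True, because an "auto_map" entry means the
    # model ships custom code.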
+ """ + # Sectioned/nested search for text + text_section = ( + raw.get("text_config") + or raw.get("llm_config") + or raw.get("language_model") + or raw.get("language_model_config") + or raw.get("decoder_config") + or raw.get("model_config") + or raw.get("base_model") + or raw.get("gpt_config") + or next( + ( + v + for k, v in raw.items() + if ("text" in k or "llm" in k or "gpt" in k) and isinstance(v, dict) + ), + None, + ) + ) + + # Sectioned/nested search for vision + vision_section = ( + raw.get("vision_config") + or raw.get("vision_encoder_config") + or next( + (v for k, v in raw.items() if "vision" in k and isinstance(v, dict)), + None, + ) + ) + + # Both configs found => multimodal + if vision_section and text_section: + llm_config = LLMConfig.from_raw_config(text_section) + vision_config = VisionConfig.from_raw_config(vision_section) + return cls(llm_config=llm_config, vision_config=vision_config) + + # Vision config (sectioned or flat) + if vision_section or "patch_size" in raw or "image_size" in raw: + if vision_section: + vision_config = VisionConfig.from_raw_config(vision_section) + else: # flat case + vision_config = VisionConfig.from_raw_config(raw) + return cls(vision_config=vision_config) + + # Text config (sectioned or flat) + if text_section or "vocab_size" in raw or "tie_word_embeddings" in raw: + if text_section: + llm_config = LLMConfig.from_raw_config(text_section) + else: # flat case + llm_config = LLMConfig.from_raw_config(raw) + return cls(llm_config=llm_config) + + # Neither found -- explicit failure + raise AquaRecommendationError( + "Config could not be parsed as either text, vision, or multimodal model. Check your fields/structure." ) diff --git a/ads/aqua/shaperecommend/recommend.py b/ads/aqua/shaperecommend/recommend.py index 1b93598e0..1ef38a173 100644 --- a/ads/aqua/shaperecommend/recommend.py +++ b/ads/aqua/shaperecommend/recommend.py @@ -59,49 +59,57 @@ def which_shapes( """ Lists valid GPU deployment shapes for the provided model and configuration. - Validates input, retrieves the model configuration, checks the requested sequence length, + This method validates input, retrieves the model configuration, checks the requested sequence length, identifies available and valid compute shapes, and summarizes which shapes are compatible with the current model settings. Parameters ---------- - ocid : str - OCID of the model to recommend feasible compute shapes. - - available_shapes : List[ComputeShapeSummary] - List of available shapes to recommend - - generate_table : bool - whether to generate a rich diff Table or ShapeRecommendationReport (see Returns section) + request : RequestRecommend + The request object with all needed recommendation fields: + model_id : str + OCID of the model to recommend feasible compute shapes for. + generate_table : bool, optional + If True (default), generate a rich diff table as output. If False, return a ShapeRecommendationReport object. + deployment_config : Optional[AquaDeploymentConfig] + Deployment configuration for the model (used for service models only). + compartment_id : str, optional + The OCID of the user's compartment (needed if shape availability is compartment-specific). Returns ------- - Table (generate_table = True) - A table format for the recommendation report with compatible deployment shapes - or troubleshooting info citing the largest shapes if no shape is suitable. 
diff --git a/ads/aqua/shaperecommend/recommend.py b/ads/aqua/shaperecommend/recommend.py
index 1b93598e0..1ef38a173 100644
--- a/ads/aqua/shaperecommend/recommend.py
+++ b/ads/aqua/shaperecommend/recommend.py
@@ -59,49 +59,57 @@ def which_shapes(
         """
         Lists valid GPU deployment shapes for the provided model and configuration.
 
-        Validates input, retrieves the model configuration, checks the requested sequence length,
+        This method validates input, retrieves the model configuration, checks the requested sequence length,
         identifies available and valid compute shapes, and summarizes which shapes
         are compatible with the current model settings.
 
         Parameters
         ----------
-        ocid : str
-            OCID of the model to recommend feasible compute shapes.
-
-        available_shapes : List[ComputeShapeSummary]
-            List of available shapes to recommend
-
-        generate_table : bool
-            whether to generate a rich diff Table or ShapeRecommendationReport (see Returns section)
+        request : RequestRecommend
+            The request object with all needed recommendation fields:
+            model_id : str
+                OCID of the model to recommend feasible compute shapes for.
+            generate_table : bool, optional
+                If True (default), generate a rich diff table as output. If False, return a ShapeRecommendationReport object.
+            deployment_config : Optional[AquaDeploymentConfig]
+                Deployment configuration for the model (used for service models only).
+            compartment_id : str, optional
+                The OCID of the user's compartment (needed if shape availability is compartment-specific).
 
         Returns
         -------
-        Table (generate_table = True)
-            A table format for the recommendation report with compatible deployment shapes
-            or troubleshooting info citing the largest shapes if no shape is suitable.
-
-        ShapeRecommendationReport (generate_table = False)
-            A recommendation report with compatible deployment shapes, or troubleshooting info
-            citing the largest shapes if no shape is suitable.
+        Table
+            If `generate_table` is True, a table with the recommendation report, listing compatible
+            deployment shapes or troubleshooting info citing the largest shapes if no shape is suitable.
+        ShapeRecommendationReport
+            If `generate_table` is False, a recommendation report with compatible deployment shapes,
+            or troubleshooting info if no shape is suitable.
 
         Raises
         ------
         AquaValueError
-            If parameters are missing or invalid, or if no valid sequence length is requested.
+            If required parameters are missing or invalid, or if no valid sequence length is available.
         """
         try:
             shapes = self.valid_compute_shapes(compartment_id=request.compartment_id)
-            ds_model = self._validate_model_ocid(request.model_id)
-            data = self._get_model_config(ds_model)
-
-            llm_config = LLMConfig.from_raw_config(data)
+            ds_model = self._get_data_science_model(request.model_id)
 
             model_name = ds_model.display_name if ds_model.display_name else ""
 
-            shape_recommendation_report = self._summarize_shapes_for_seq_lens(
-                llm_config, shapes, model_name
-            )
+            if request.deployment_config:
+                shape_recommendation_report = (
+                    ShapeRecommendationReport.from_deployment_config(
+                        request.deployment_config, model_name, shapes
+                    )
+                )
+
+            else:
+                data = self._get_model_config(ds_model)
+
+                llm_config = LLMConfig.from_raw_config(data)
+
+                shape_recommendation_report = self._summarize_shapes_for_seq_lens(
+                    llm_config, shapes, model_name
+                )
 
             if request.generate_table and shape_recommendation_report.recommendations:
                 shape_recommendation_report = self._rich_diff_table(
@@ -248,14 +256,21 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
             else:
                 total_memory = f"CPU: {str(shape.memory_in_gbs)}"
 
+            model_size = str(model.total_model_gb) if model else "-"
+            quantization = deploy.quantization or deploy.weight_dtype or "-"
+
             table.add_row(
                 shape.name,
                 str(shape.available),
                 str(shape.shape_series),
                 str(gpu.gpu_count),
                 total_memory,
-                str(model.total_model_gb),
-                deploy.quantization,
+                model_size,
+                quantization,
                 recommendation,
             )
 
@@ -263,9 +278,10 @@ def _rich_diff_table(shape_report: ShapeRecommendationReport) -> Table:
         return table
 
     @staticmethod
-    def _validate_model_ocid(ocid: str) -> DataScienceModel:
+    def _get_data_science_model(ocid: str) -> DataScienceModel:
         """
         Ensures the OCID passed is valid for referencing a DataScienceModel resource.
+        If the OCID is valid, returns the corresponding DataScienceModel.
         """
 
         resource_type = get_resource_type(ocid)
@@ -276,8 +292,7 @@ def _validate_model_ocid(ocid: str) -> DataScienceModel:
                 "Tip: Data Science model OCIDs typically start with 'ocid1.datasciencemodel...'."
             )
 
-        model = DataScienceModel.from_id(ocid)
-        return model
+        return DataScienceModel.from_id(ocid)
 
     @staticmethod
     def _get_model_config(model: DataScienceModel):
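A hedged sketch (not part of this patch) of the two paths which_shapes now takes; the OCID
is a placeholder:

    request = RequestRecommend(
        model_id="ocid1.datasciencemodel.oc1..<unique_id>", generate_table=False
    )
    # Service models: request.deployment_config is populated, so the report comes
    # directly from ShapeRecommendationReport.from_deployment_config(...).
    # All other models: the raw config.json is fetched and estimated via
    # LLMConfig.from_raw_config(...) and _summarize_shapes_for_seq_lens(...).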
diff --git a/ads/aqua/shaperecommend/shape_report.py b/ads/aqua/shaperecommend/shape_report.py
index e3d9854f0..c9555c4cf 100644
--- a/ads/aqua/shaperecommend/shape_report.py
+++ b/ads/aqua/shaperecommend/shape_report.py
@@ -1,13 +1,23 @@
 #!/usr/bin/env python
 # Copyright (c) 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
+import argparse
+import json
 from typing import List, Optional
 
 from pydantic import BaseModel, Field
 
 from ads.aqua.common.entities import ComputeShapeSummary
-from ads.aqua.shaperecommend.constants import QUANT_MAPPING
+from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig
+from ads.aqua.shaperecommend.constants import (
+    MAX_MODEL_LEN_FLAG,
+    QUANT_FLAG,
+    QUANT_MAPPING,
+    VLLM_ENV,
+    VLLM_PARAMS_FAMILY,
+    WEIGHT_DTYPE_FLAG,
+)
 from ads.aqua.shaperecommend.estimator import MemoryEstimator
 from ads.config import COMPARTMENT_OCID
 
@@ -30,6 +41,11 @@ class RequestRecommend(BaseModel):
         COMPARTMENT_OCID, description="The OCID of user's compartment"
     )
 
+    deployment_config: Optional[AquaDeploymentConfig] = Field(
+        None,
+        description="The deployment configuration for the model (only available for service models).",
+    )
+
     class Config:
         protected_namespaces = ()
 
@@ -39,12 +55,20 @@ class DeploymentParams(BaseModel):  # noqa: N801
     Recommended parameters for deployment and model inferencing (specific to compute shape & model).
     """
 
+    params: str = Field(
+        ..., description="Runtime parameters for deployment with vLLM, etc."
+    )
     quantization: Optional[str] = Field(
         None, description="Type of quantization (e.g. 4bit)."
     )
-    max_model_len: int = Field(..., description="Maximum length of input sequence.")
-    params: str = Field(
-        ..., description="Runtime parameters for deployment with vLLM, etc."
+    weight_dtype: Optional[str] = Field(
+        None, description="Data type of the model weights (e.g., 'bfloat16')."
+    )
+    max_model_len: Optional[int] = Field(
+        None, description="Maximum length of input sequence."
+    )
+    env_var: Optional[dict] = Field(
+        None, description="Global environment variables needed for deployment."
     )
 
 
@@ -68,11 +92,16 @@ class ModelConfig(BaseModel):
     The configuration for a model based on specific set of deployment parameters and memory capacity of shape.
     """
 
-    model_details: ModelDetail = Field(..., description="Details about the model.")
     deployment_params: DeploymentParams = Field(
         ..., description="Parameters for deployment."
     )
-    recommendation: str = Field(..., description="GPU recommendation for the model.")
+    model_details: Optional[ModelDetail] = Field(
+        None, description="Details about the model."
+    )
+
+    recommendation: Optional[str] = Field(
+        "", description="GPU recommendation for the model."
+    )
 
     class Config:
         protected_namespaces = ()
 
@@ -231,3 +260,112 @@ class ShapeRecommendationReport(BaseModel):
         None,
         description="Details for troubleshooting if no shapes fit the current model.",
     )
+
+    @classmethod
+    def create_deployment_config_from_params_string(
+        cls, config_params: str, config_env: dict
+    ) -> DeploymentParams:
+        """
+        Parse a vLLM parameter string and create a DeploymentParams object.
+
+        Parameters
+        ----------
+        config_params : str
+            A space-separated string of deployment parameters
+            (e.g., '--quantization mxfp4 --dtype fp16 --max-model-len 4096').
+            If None or empty, default parameter values are used.
+        config_env : dict
+            Environment variables for the configuration; carried through unchanged
+            into the resulting DeploymentParams.
+
+        Returns
+        -------
+        DeploymentParams
+            A DeploymentParams object populated with parsed or default values.
+ """ + parser = argparse.ArgumentParser() + parser.add_argument(QUANT_FLAG, type=str, default=None) + parser.add_argument( + WEIGHT_DTYPE_FLAG, dest="weight_dtype", type=str, default=None + ) + parser.add_argument( + MAX_MODEL_LEN_FLAG, dest="max_model_len", type=int, default=None + ) + + # Use parse_known_args to gracefully handle unexpected arguments + args, _ = parser.parse_known_args( + config_params.split() if config_params else [] + ) + + return DeploymentParams( + quantization=args.quantization, + weight_dtype=args.weight_dtype, + max_model_len=args.max_model_len, + params=config_params or "", + env_var=config_env, + ) + + @classmethod + def from_deployment_config( + cls, + deployment_config: AquaDeploymentConfig, + model_name: str, + valid_shapes: List[ComputeShapeSummary], + ) -> "ShapeRecommendationReport": + """ + Creates a ShapeRecommendationReport from an AquaDeploymentConfig, extracting recommended + model configurations for each valid compute shape. + + Parameters + ---------- + deployment_config : AquaDeploymentConfig + The object containing per-shape deployment configurations. + model_name : str + The name of the model for which to generate recommendations. + valid_shapes : list of ComputeShapeSummary + List of compute shapes to evaluate and recommend deployment configurations for. + + Returns + ------- + ShapeRecommendationReport + Report containing recommendations for each valid compute shape. + + Notes + ----- + For service models, this method interprets pre-set deployment configurations to derive + recommendations for each allowed compute shape, including environment variables, quantization, + and maximum model length parameters. + """ + + recs = [] + for shape in valid_shapes: + current_config = deployment_config.configuration.get(shape.name) + if not current_config: + continue + + recommendation = "" + current_params = current_config.parameters.get(VLLM_PARAMS_FAMILY) + current_env = current_config.env.get(VLLM_ENV) + + deployment_params = cls.create_deployment_config_from_params_string( + current_params, current_env + ) + + if current_env: + recommendation += f"ENV: {json.dumps(current_env)}\n\n" + + if ( + not current_params and not current_env + ): # model works with default params and no extra env variables + recommendation += "No override PARAMS and ENV variables needed. \n\n" + + recommendation += "Model fits well within the allowed compute shape." 
+
+    @classmethod
+    def from_deployment_config(
+        cls,
+        deployment_config: AquaDeploymentConfig,
+        model_name: str,
+        valid_shapes: List[ComputeShapeSummary],
+    ) -> "ShapeRecommendationReport":
+        """
+        Creates a ShapeRecommendationReport from an AquaDeploymentConfig, extracting recommended
+        model configurations for each valid compute shape.
+
+        Parameters
+        ----------
+        deployment_config : AquaDeploymentConfig
+            The object containing per-shape deployment configurations.
+        model_name : str
+            The name of the model for which to generate recommendations.
+        valid_shapes : list of ComputeShapeSummary
+            List of compute shapes to evaluate and recommend deployment configurations for.
+
+        Returns
+        -------
+        ShapeRecommendationReport
+            Report containing recommendations for each valid compute shape.
+
+        Notes
+        -----
+        For service models, this method interprets pre-set deployment configurations to derive
+        recommendations for each allowed compute shape, including environment variables, quantization,
+        and maximum model length parameters.
+        """
+
+        recs = []
+        for shape in valid_shapes:
+            current_config = deployment_config.configuration.get(shape.name)
+            if not current_config:
+                continue
+
+            recommendation = ""
+            current_params = current_config.parameters.get(VLLM_PARAMS_FAMILY)
+            current_env = current_config.env.get(VLLM_ENV)
+
+            deployment_params = cls.create_deployment_config_from_params_string(
+                current_params, current_env
+            )
+
+            if current_env:
+                recommendation += f"ENV: {json.dumps(current_env)}\n\n"
+
+            if (
+                not current_params and not current_env
+            ):  # model works with default params and no extra env variables
+                recommendation += "No override PARAMS and ENV variables needed. \n\n"
+
+            recommendation += "Model fits well within the allowed compute shape."
+
+            # need to adjust for multiple configs per shape
+            configuration = [
+                ModelConfig(
+                    deployment_params=deployment_params,
+                    recommendation=recommendation,
+                )
+            ]
+
+            recs.append(ShapeReport(shape_details=shape, configurations=configuration))
+
+        return ShapeRecommendationReport(display_name=model_name, recommendations=recs)
diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/Devstral-Small-2507-GQA.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Devstral-Small-2507-GQA.json similarity index 99% rename from tests/unitary/with_extras/aqua/test_data/recommend/Devstral-Small-2507-GQA.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Devstral-Small-2507-GQA.json index a7119b3a2..123ff1ed8 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/Devstral-Small-2507-GQA.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Devstral-Small-2507-GQA.json @@ -5,7 +5,6 @@ "attention_dropout": 0.0, "bos_token_id": 1, "eos_token_id": 2, - "pad_token_id": 11, "head_dim": 128, "hidden_act": "silu", "hidden_size": 5120, @@ -16,6 +15,7 @@ "num_attention_heads": 32, "num_hidden_layers": 40, "num_key_value_heads": 8, + "pad_token_id": 11, "rms_norm_eps": 1e-05, "rope_theta": 1000000000.0, "sliding_window": null, @@ -24,4 +24,4 @@ "transformers_version": "4.53.1", "use_cache": true, "vocab_size": 131072 -} \ No newline at end of file +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/Kimi-K2-Instruct-MOE.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Kimi-K2-Instruct-MOE.json similarity index 99% rename from tests/unitary/with_extras/aqua/test_data/recommend/Kimi-K2-Instruct-MOE.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Kimi-K2-Instruct-MOE.json index da5e6d57d..d41050b82 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/Kimi-K2-Instruct-MOE.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Kimi-K2-Instruct-MOE.json @@ -45,8 +45,6 @@ ] }, "rms_norm_eps": 1e-06, - "rope_theta": 50000.0, - "routed_scaling_factor": 2.827, "rope_scaling": { "beta_fast": 1.0, "beta_slow": 1.0, "factor": 32.0, "mscale": 1.0, "mscale_all_dim": 1.0, "original_max_position_embeddings": 4096, "type": "yarn" }, @@ -56,6 +54,8 @@ "original_max_position_embeddings": 4096, "type": "yarn" }, + "rope_theta": 50000.0, + "routed_scaling_factor": 2.827, "scoring_func": "sigmoid", "seq_aux": true, "tie_word_embeddings": false, "topk_group": 1, "topk_method": "noaux_tc", "torch_dtype": "bfloat16", "transformers_version": "4.48.3", "use_cache": true, "v_head_dim": 128, "vocab_size": 163840 -} \ No newline at end of file +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/Qwen3-235B-A22B-Instruct-2507-FP8.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json similarity index 99% rename from tests/unitary/with_extras/aqua/test_data/recommend/Qwen3-235B-A22B-Instruct-2507-FP8.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json index b3567f0f8..2e8ef2ef7 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/Qwen3-235B-A22B-Instruct-2507-FP8.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json @@ -24,19 +24,9 @@ "num_hidden_layers": 94, "num_key_value_heads": 4, "output_router_logits": false, - "rms_norm_eps": 1e-06, - "rope_scaling": null, - "rope_theta": 5000000, - "router_aux_loss_coef": 0.001, - "sliding_window": null, - "tie_word_embeddings": false, - "torch_dtype": "bfloat16", -
"transformers_version": "4.51.0", - "use_cache": true, - "use_sliding_window": false, - "vocab_size": 151936, "quantization_config": { "activation_scheme": "dynamic", + "fmt": "e4m3", "modules_to_not_convert": [ "lm_head", "model.layers.0.input_layernorm", @@ -322,11 +312,21 @@ "model.layers.93.mlp.gate", "model.layers.93.post_attention_layernorm" ], - "fmt": "e4m3", "quant_method": "fp8", "weight_block_size": [ 128, 128 ] - } -} \ No newline at end of file + }, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 5000000, + "router_aux_loss_coef": 0.001, + "sliding_window": null, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.51.0", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/result-Devstral-Small-2507-GQA.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Devstral-Small-2507-GQA.json similarity index 82% rename from tests/unitary/with_extras/aqua/test_data/recommend/result-Devstral-Small-2507-GQA.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Devstral-Small-2507-GQA.json index 87fe896c9..45dde5161 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/result-Devstral-Small-2507-GQA.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Devstral-Small-2507-GQA.json @@ -1,20 +1,22 @@ { - "display_name": "Devstral-Small-2507-GQA", + "display_name": "config-json-files/Devstral-Small-2507-GQA", "recommendations": [ { "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 96.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 96.0GB allowed)." } ], "shape_details": { @@ -51,16 +53,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1440.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 1440.0GB allowed)." } ], "shape_details": { @@ -95,16 +99,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 768.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 768.0GB allowed)." 
} ], "shape_details": { @@ -140,16 +146,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1128.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 1128.0GB allowed)." } ], "shape_details": { @@ -187,16 +195,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 192.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 192.0GB allowed)." } ], "shape_details": { @@ -234,16 +244,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 192.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 192.0GB allowed)." } ], "shape_details": { @@ -281,16 +293,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 1536.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 1536.0GB allowed)." } ], "shape_details": { @@ -320,16 +334,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "", - "quantization": "bfloat16" + "quantization": "bfloat16", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 47.98, "total_model_gb": 69.46 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (69.5GB used / 320.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (69.5GB used / 320.0GB allowed)." } ], "shape_details": { @@ -361,16 +377,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 32768, "params": "--max-model-len 32768 --quantization bitsandbytes --load-format bitsandbytes", - "quantization": "4bit" + "quantization": "4bit", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 5.37, "model_size_gb": 12.0, "total_model_gb": 17.36 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (17.4GB used / 24.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (17.4GB used / 24.0GB allowed)." 
} ], "shape_details": { @@ -407,16 +425,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 131072, "params": "--quantization bitsandbytes --load-format bitsandbytes", - "quantization": "4bit" + "quantization": "4bit", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 21.47, "model_size_gb": 12.0, "total_model_gb": 33.47 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (33.5GB used / 48.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (33.5GB used / 48.0GB allowed)." } ], "shape_details": { diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/result-Kimi-K2-Instruct-MOE.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Kimi-K2-Instruct-MOE.json similarity index 75% rename from tests/unitary/with_extras/aqua/test_data/recommend/result-Kimi-K2-Instruct-MOE.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Kimi-K2-Instruct-MOE.json index c5b7c5a46..99b8410f3 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/result-Kimi-K2-Instruct-MOE.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Kimi-K2-Instruct-MOE.json @@ -1,20 +1,22 @@ { - "display_name": "Kimi-K2-Instruct-MOE", + "display_name": "config-json-files/Kimi-K2-Instruct-MOE", "recommendations": [ { "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, - "params": "--max-model-len 2048", - "quantization": "fp8" + "params": "--max-model-len 2048 --trust-remote-code", + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 3.58, "model_size_gb": 1046.48, "total_model_gb": 1050.06 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (1050.1GB used / 1440.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (1050.1GB used / 1440.0GB allowed)." } ], "shape_details": { @@ -49,16 +51,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, - "params": "--max-model-len 2048", - "quantization": "fp8" + "params": "--max-model-len 2048 --trust-remote-code", + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 3.58, "model_size_gb": 1046.48, "total_model_gb": 1050.06 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (1050.1GB used / 1536.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (1050.1GB used / 1536.0GB allowed)." 
} ], "shape_details": { diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/result-Qwen3-235B-A22B-Instruct-2507-FP8.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Qwen3-235B-A22B-Instruct-2507-FP8.json similarity index 81% rename from tests/unitary/with_extras/aqua/test_data/recommend/result-Qwen3-235B-A22B-Instruct-2507-FP8.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Qwen3-235B-A22B-Instruct-2507-FP8.json index dfb7ec7c2..fd25f263d 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/result-Qwen3-235B-A22B-Instruct-2507-FP8.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/result-Qwen3-235B-A22B-Instruct-2507-FP8.json @@ -1,20 +1,22 @@ { - "display_name": "Qwen3-235B-A22B-Instruct-2507-FP8", + "display_name": "config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8", "recommendations": [ { "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, "params": "--max-model-len 2048", - "quantization": "fp8" + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 0.39, "model_size_gb": 231.89, "total_model_gb": 232.28 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (232.3GB used / 1440.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (232.3GB used / 1440.0GB allowed)." } ], "shape_details": { @@ -49,16 +51,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, "params": "--max-model-len 2048", - "quantization": "fp8" + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 0.39, "model_size_gb": 231.89, "total_model_gb": 232.28 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (232.3GB used / 768.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (232.3GB used / 768.0GB allowed)." } ], "shape_details": { @@ -94,16 +98,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, "params": "--max-model-len 2048", - "quantization": "fp8" + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 0.39, "model_size_gb": 231.89, "total_model_gb": 232.28 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (232.3GB used / 1128.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (232.3GB used / 1128.0GB allowed)." } ], "shape_details": { @@ -141,16 +147,18 @@ "configurations": [ { "deployment_params": { + "env_var": null, "max_model_len": 2048, "params": "--max-model-len 2048", - "quantization": "fp8" + "quantization": "fp8", + "weight_dtype": null }, "model_details": { "kv_cache_size_gb": 0.39, "model_size_gb": 231.89, "total_model_gb": 232.28 }, - "recommendation": "No override PARAMS needed. \n\nModel fits well within the allowed compute shape (232.3GB used / 1536.0GB allowed)." + "recommendation": "Model fits well within the allowed compute shape (232.3GB used / 1536.0GB allowed)." 
} ], "shape_details": { diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/t5gemma-ml-ml-prefixlm.json b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/t5gemma-ml-ml-prefixlm.json similarity index 99% rename from tests/unitary/with_extras/aqua/test_data/recommend/t5gemma-ml-ml-prefixlm.json rename to tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/t5gemma-ml-ml-prefixlm.json index 15ab1cd80..b7d9b33c2 100644 --- a/tests/unitary/with_extras/aqua/test_data/recommend/t5gemma-ml-ml-prefixlm.json +++ b/tests/unitary/with_extras/aqua/test_data/recommend/config-json-files/t5gemma-ml-ml-prefixlm.json @@ -123,4 +123,4 @@ "torch_dtype": "bfloat16", "transformers_version": "4.53.0.dev0", "use_cache": true -} \ No newline at end of file +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/gpu-index/gpu-index.json b/tests/unitary/with_extras/aqua/test_data/recommend/gpu-index/gpu-index.json new file mode 100644 index 000000000..f4765ede6 --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/gpu-index/gpu-index.json @@ -0,0 +1,383 @@ +{ + "shapes": { + "BM.GPU.A10.4": { + "cpu_count": 64, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 96, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 50, + "performance": 50 + } + }, + "BM.GPU.A100-V2.8": { + "cpu_count": 128, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 640, + "gpu_type": "A100", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 80, + "performance": 70 + } + }, + "BM.GPU.B200.8": { + "cpu_count": 128, + "cpu_memory_in_gbs": 4096, + "gpu_count": 8, + "gpu_memory_in_gbs": 1440, + "gpu_type": "B200", + "quantization": [ + "fp4", + "fp8", + "fp16", + "bf16", + "tf32", + "int8", + "fp64" + ], + "ranking": { + "cost": 120, + "performance": 130 + } + }, + "BM.GPU.B4.8": { + "cpu_count": 64, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 70, + "performance": 60 + } + }, + "BM.GPU.GB200.4": { + "cpu_count": 144, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 768, + "gpu_type": "GB200", + "quantization": [ + "fp4", + "fp8", + "fp6", + "int8", + "fp16", + "bf16", + "tf32", + "fp64" + ], + "ranking": { + "cost": 110, + "performance": 120 + } + }, + "BM.GPU.H100.8": { + "cpu_count": 112, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 640, + "gpu_type": "H100", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 100 + } + }, + "BM.GPU.H200.8": { + "cpu_count": 112, + "cpu_memory_in_gbs": 3072, + "gpu_count": 8, + "gpu_memory_in_gbs": 1128, + "gpu_type": "H200", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 110 + } + }, + "BM.GPU.L40S-NC.4": { + "cpu_count": 112, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 192, + "gpu_type": "L40S", + "quantization": 
[ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 60, + "performance": 80 + } + }, + "BM.GPU.L40S.4": { + "cpu_count": 112, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 192, + "gpu_type": "L40S", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 60, + "performance": 80 + } + }, + "BM.GPU.MI300X.8": { + "cpu_count": 112, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 1536, + "gpu_type": "MI300X", + "quantization": [ + "fp8", + "gguf" + ], + "ranking": { + "cost": 90, + "performance": 90 + } + }, + "BM.GPU2.2": { + "cpu_count": 28, + "cpu_memory_in_gbs": 192, + "gpu_count": 2, + "gpu_memory_in_gbs": 32, + "gpu_type": "P100", + "quantization": [ + "fp16" + ], + "ranking": { + "cost": 30, + "performance": 20 + } + }, + "BM.GPU4.8": { + "cpu_count": 64, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": [ + "int8", + "fp16", + "bf16", + "tf32" + ], + "ranking": { + "cost": 57, + "performance": 65 + } + }, + "VM.GPU.A10.1": { + "cpu_count": 15, + "cpu_memory_in_gbs": 240, + "gpu_count": 1, + "gpu_memory_in_gbs": 24, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 20, + "performance": 30 + } + }, + "VM.GPU.A10.2": { + "cpu_count": 30, + "cpu_memory_in_gbs": 480, + "gpu_count": 2, + "gpu_memory_in_gbs": 48, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 40, + "performance": 40 + } + }, + "VM.GPU2.1": { + "cpu_count": 12, + "cpu_memory_in_gbs": 72, + "gpu_count": 1, + "gpu_memory_in_gbs": 16, + "gpu_type": "P100", + "quantization": [ + "fp16" + ], + "ranking": { + "cost": 10, + "performance": 10 + } + }, + "VM.GPU3.1": { + "cpu_count": 6, + "cpu_memory_in_gbs": 90, + "gpu_count": 1, + "gpu_memory_in_gbs": 16, + "gpu_type": "V100", + "quantization": [ + "gptq", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 35, + "performance": 10 + } + }, + "VM.GPU3.2": { + "cpu_count": 12, + "cpu_memory_in_gbs": 180, + "gpu_count": 2, + "gpu_memory_in_gbs": 32, + "gpu_type": "V100", + "quantization": [ + "gptq", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 45, + "performance": 20 + } + }, + "VM.GPU3.4": { + "cpu_count": 24, + "cpu_memory_in_gbs": 360, + "gpu_count": 4, + "gpu_memory_in_gbs": 64, + "gpu_type": "V100", + "quantization": [ + "gptq", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 55, + "performance": 45 + } + }, + "VM.GPU3.8": { + "cpu_count": 24, + "cpu_memory_in_gbs": 768, + "gpu_count": 8, + "gpu_memory_in_gbs": 128, + "gpu_type": "V100", + "quantization": [ + "gptq", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 56, + "performance": 46 + } + } + } +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-1.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-1.json new file mode 100644 index 000000000..e2162bd10 --- /dev/null +++ 
b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-1.json @@ -0,0 +1,51 @@ +{ + "configuration": { + "BM.GPU.A10.4": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--max-model-len 65536" + } + }, + { + "gpu_count": 2 + } + ] + }, + "BM.GPU.L40S-NC.4": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--max-model-len 65536" + } + }, + { + "gpu_count": 2 + } + ] + }, + "VM.GPU.A10.1": { + "parameters": { + "VLLM_PARAMS": "--max-model-len 65536" + } + }, + "VM.GPU.A10.2": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--max-model-len 65536" + } + } + ] + } + }, + "shape": [ + "VM.GPU.A10.1", + "VM.GPU.A10.2", + "BM.GPU.A10.4", + "BM.GPU.L40S-NC.4" + ] +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-2.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-2.json new file mode 100644 index 000000000..b0bd8700d --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-2.json @@ -0,0 +1,193 @@ +{ + "configuration": { + "BM.GPU.A10.4": { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "multi_model_deployment": [ + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 2048 --quantization mxfp4 --tensor-parallel-size 1" + } + }, + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 2, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 2" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16" + } + }, + "BM.GPU.A100-v2.8": { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --dtype bfloat16" + } + }, + "BM.GPU.B4.8": { + "multi_model_deployment": [ + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 2, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --dtype bfloat16 --tensor-parallel-size 2" + } + }, + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 4, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --dtype bfloat16 --tensor-parallel-size 4" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16" + } + }, + "BM.GPU.H100.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 1" + } + }, + { + "gpu_count": 2, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 2" + } + }, + { + "gpu_count": 
4, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 4" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4" + } + }, + "BM.GPU.H200.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 1" + } + }, + { + "gpu_count": 2, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 2" + } + }, + { + "gpu_count": 4, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4 --tensor-parallel-size 4" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4" + } + }, + "BM.GPU4.8": { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "multi_model_deployment": [ + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 2, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16 --tensor-parallel-size 2" + } + }, + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 4, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --dtype bfloat16 --tensor-parallel-size 4" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16" + } + }, + "VM.GPU.A10.2": { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "multi_model_deployment": [ + { + "env": { + "VLLM": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + } + }, + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 2048 --quantization mxfp4 --tensor-parallel-size 1" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 5 --max-model-len 8192 --dtype bfloat16" + } + } + }, + "shape": [ + "BM.GPU.A10.4", + "VM.GPU.A10.2", + "BM.GPU.H100.8", + "BM.GPU.H200.8", + "BM.GPU.A100-v2.8", + "BM.GPU.B4.8", + "BM.GPU4.8" + ] +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-3.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-3.json new file mode 100644 index 000000000..cc271ea3b --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/example-3.json @@ -0,0 +1,92 @@ +{ + "configuration": { + "BM.GPU.A10.4": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + "BM.GPU.A100-v2.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + 
"BM.GPU.H100.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + "BM.GPU.H200.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + "BM.GPU.L40S-NC.4": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + "BM.GPU4.8": { + "multi_model_deployment": [ + { + "gpu_count": 1, + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + } + ], + "parameters": { + "VLLM_PARAMS": "--trust-remote-code" + } + }, + "VM.GPU.A10.1": { + "parameters": { + "VLLM_PARAMS": "--gpu-memory-utilization 0.98 --trust-remote-code --enforce-eager --max-num-seqs 32 --max-model-len 6000 --dtype auto --enable-lora --max-lora-rank 320 --lora-extra-vocab-size 512 --limit-mm-per-prompt audio=1,image=1 --max-loras 2 --lora-modules speech=/opt/ds/model/deployed_model/service_models/Phi-4-multimodal-instruct/0af439b/artifact/speech-lora vision=/opt/ds/model/deployed_model/service_models/Phi-4-multimodal-instruct/0af439b/artifact/vision-lora" + } + } + }, + "shape": [ + "VM.GPU.A10.1", + "BM.GPU.H100.8", + "BM.GPU.H200.8" + ] +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-1.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-1.json new file mode 100644 index 000000000..0488bf5d5 --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-1.json @@ -0,0 +1,183 @@ +{ + "display_name": "service-config/example-1", + "recommendations": [ + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "No override PARAMS and ENV variables needed. \n\nModel fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 96, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 50, + "performance": 50 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.A10.4", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "No override PARAMS and ENV variables needed. \n\nModel fits well within the allowed compute shape." 
+ } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 192, + "gpu_type": "L40S", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 60, + "performance": 80 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.L40S-NC.4", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": 65536, + "params": "--max-model-len 65536", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 15, + "cpu_memory_in_gbs": 240, + "gpu_count": 1, + "gpu_memory_in_gbs": 24, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 20, + "performance": 30 + } + }, + "memory_in_gbs": null, + "name": "VM.GPU.A10.1", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "No override PARAMS and ENV variables needed. \n\nModel fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 30, + "cpu_memory_in_gbs": 480, + "gpu_count": 2, + "gpu_memory_in_gbs": 48, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 40, + "performance": 40 + } + }, + "memory_in_gbs": null, + "name": "VM.GPU.A10.2", + "shape_series": "GPU" + } + } + ], + "troubleshoot": null +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-2.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-2.json new file mode 100644 index 000000000..cfc6e1a23 --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-2.json @@ -0,0 +1,273 @@ +{ + "display_name": "service-config/example-2", + "recommendations": [ + { + "configurations": [ + { + "deployment_params": { + "env_var": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + }, + "max_model_len": 130000, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16", + "quantization": null, + "weight_dtype": "bfloat16" + }, + "model_details": null, + "recommendation": "ENV: {\"VLLM_ATTENTION_BACKEND\": \"TRITON_ATTN_VLLM_V1\"}\n\nModel fits well within the allowed compute shape." 
+ } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 96, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 50, + "performance": 50 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.A10.4", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": 130000, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16", + "quantization": null, + "weight_dtype": "bfloat16" + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 70, + "performance": 60 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.B4.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": 130000, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4", + "quantization": "mxfp4", + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 640, + "gpu_type": "H100", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 100 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.H100.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": 130000, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 64 --max-model-len 130000 --quantization mxfp4", + "quantization": "mxfp4", + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." 
+ } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 3072, + "gpu_count": 8, + "gpu_memory_in_gbs": 1128, + "gpu_type": "H200", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 110 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.H200.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + }, + "max_model_len": 130000, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 32 --max-model-len 130000 --dtype bfloat16", + "quantization": null, + "weight_dtype": "bfloat16" + }, + "model_details": null, + "recommendation": "ENV: {\"VLLM_ATTENTION_BACKEND\": \"TRITON_ATTN_VLLM_V1\"}\n\nModel fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": [ + "int8", + "fp16", + "bf16", + "tf32" + ], + "ranking": { + "cost": 57, + "performance": 65 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU4.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": { + "VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1" + }, + "max_model_len": 8192, + "params": "--trust-remote-code --gpu-memory-utilization 0.90 --max-num-seqs 5 --max-model-len 8192 --dtype bfloat16", + "quantization": null, + "weight_dtype": "bfloat16" + }, + "model_details": null, + "recommendation": "ENV: {\"VLLM_ATTENTION_BACKEND\": \"TRITON_ATTN_VLLM_V1\"}\n\nModel fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 30, + "cpu_memory_in_gbs": 480, + "gpu_count": 2, + "gpu_memory_in_gbs": 48, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 40, + "performance": 40 + } + }, + "memory_in_gbs": null, + "name": "VM.GPU.A10.2", + "shape_series": "GPU" + } + } + ], + "troubleshoot": null +} diff --git a/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-3.json b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-3.json new file mode 100644 index 000000000..7d9ff87bc --- /dev/null +++ b/tests/unitary/with_extras/aqua/test_data/recommend/service-config/result-example-3.json @@ -0,0 +1,268 @@ +{ + "display_name": "service-config/example-3", + "recommendations": [ + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "--trust-remote-code", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." 
+ } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 96, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 50, + "performance": 50 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.A10.4", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "--trust-remote-code", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 640, + "gpu_type": "H100", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 100 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.H100.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "--trust-remote-code", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 3072, + "gpu_count": 8, + "gpu_memory_in_gbs": 1128, + "gpu_type": "H200", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 100, + "performance": 110 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.H200.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "--trust-remote-code", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 112, + "cpu_memory_in_gbs": 1024, + "gpu_count": 4, + "gpu_memory_in_gbs": 192, + "gpu_type": "L40S", + "quantization": [ + "awq", + "gptq", + "marlin", + "fp8", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 60, + "performance": 80 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU.L40S-NC.4", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": null, + "params": "--trust-remote-code", + "quantization": null, + "weight_dtype": null + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." 
+ } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 64, + "cpu_memory_in_gbs": 2048, + "gpu_count": 8, + "gpu_memory_in_gbs": 320, + "gpu_type": "A100", + "quantization": [ + "int8", + "fp16", + "bf16", + "tf32" + ], + "ranking": { + "cost": 57, + "performance": 65 + } + }, + "memory_in_gbs": null, + "name": "BM.GPU4.8", + "shape_series": "GPU" + } + }, + { + "configurations": [ + { + "deployment_params": { + "env_var": null, + "max_model_len": 6000, + "params": "--gpu-memory-utilization 0.98 --trust-remote-code --enforce-eager --max-num-seqs 32 --max-model-len 6000 --dtype auto --enable-lora --max-lora-rank 320 --lora-extra-vocab-size 512 --limit-mm-per-prompt audio=1,image=1 --max-loras 2 --lora-modules speech=/opt/ds/model/deployed_model/service_models/Phi-4-multimodal-instruct/0af439b/artifact/speech-lora vision=/opt/ds/model/deployed_model/service_models/Phi-4-multimodal-instruct/0af439b/artifact/vision-lora", + "quantization": null, + "weight_dtype": "auto" + }, + "model_details": null, + "recommendation": "Model fits well within the allowed compute shape." + } + ], + "shape_details": { + "available": false, + "core_count": null, + "gpu_specs": { + "cpu_count": 15, + "cpu_memory_in_gbs": 240, + "gpu_count": 1, + "gpu_memory_in_gbs": 24, + "gpu_type": "A10", + "quantization": [ + "awq", + "gptq", + "marlin", + "int8", + "bitblas", + "aqlm", + "bitsandbytes", + "deepspeedfp", + "gguf" + ], + "ranking": { + "cost": 20, + "performance": 30 + } + }, + "memory_in_gbs": null, + "name": "VM.GPU.A10.1", + "shape_series": "GPU" + } + } + ], + "troubleshoot": null +} diff --git a/tests/unitary/with_extras/aqua/test_recommend.py b/tests/unitary/with_extras/aqua/test_recommend.py index cb61dae86..4fcd8c669 100644 --- a/tests/unitary/with_extras/aqua/test_recommend.py +++ b/tests/unitary/with_extras/aqua/test_recommend.py @@ -13,6 +13,7 @@ from ads.aqua.common.entities import ComputeShapeSummary from ads.aqua.common.errors import AquaRecommendationError +from ads.aqua.modeldeployment.config_loader import AquaDeploymentConfig from ads.aqua.shaperecommend.estimator import ( LlamaMemoryEstimator, MemoryEstimator, @@ -31,9 +32,7 @@ ) from ads.model.model_metadata import ModelCustomMetadata, ModelProvenanceMetadata -CONFIG_ROOT = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "test_data/recommend/" -) +CONFIG_ROOT = os.path.join(os.path.dirname(__file__), "test_data/recommend/") def load_config(filename): @@ -89,10 +88,13 @@ def test_get_estimator_llama_and_moe_fields(self): @pytest.mark.parametrize( "config_file,should_raise", [ - ("Devstral-Small-2507-GQA.json", False), - ("Kimi-K2-Instruct-MOE.json", False), - ("Qwen3-235B-A22B-Instruct-2507-FP8.json", False), - ("t5gemma-ml-ml-prefixlm.json", True), # This one is expected to raise + ("config-json-files/Devstral-Small-2507-GQA.json", False), + ("config-json-files/Kimi-K2-Instruct-MOE.json", False), + ("config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json", False), + ( + "config-json-files/t5gemma-ml-ml-prefixlm.json", + True, + ), # This one is expected to raise ], ) def test_memory_estimator_properties_from_file(self, config_file, should_raise): @@ -113,9 +115,12 @@ def test_memory_estimator_properties_from_file(self, config_file, should_raise): @pytest.mark.parametrize( "config_file, expected_estimator_cls", [ - ("Devstral-Small-2507-GQA.json", LlamaMemoryEstimator), - ("Kimi-K2-Instruct-MOE.json", MixtureMemoryEstimator), - ("Qwen3-235B-A22B-Instruct-2507-FP8.json", 
MixtureMemoryEstimator), + ("config-json-files/Devstral-Small-2507-GQA.json", LlamaMemoryEstimator), + ("config-json-files/Kimi-K2-Instruct-MOE.json", MixtureMemoryEstimator), + ( + "config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json", + MixtureMemoryEstimator, + ), ], ) def test_get_estimator_types_from_config_file( @@ -149,7 +154,7 @@ def test_llm_config_from_raw_config(self): "config_file, expected_hidden_size, expected_max_seq_len, expected_dtype, exp_num_key_value_heads, exp_num_local_experts, expected_head_dim, expected_quant", [ ( - "Devstral-Small-2507-GQA.json", + "config-json-files/Devstral-Small-2507-GQA.json", 5120, 131072, "bfloat16", @@ -159,7 +164,7 @@ def test_llm_config_from_raw_config(self): None, ), ( - "Kimi-K2-Instruct-MOE.json", + "config-json-files/Kimi-K2-Instruct-MOE.json", 7168, 131072, "bfloat16", @@ -169,7 +174,7 @@ def test_llm_config_from_raw_config(self): "fp8", ), ( - "Qwen3-235B-A22B-Instruct-2507-FP8.json", + "config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json", 4096, 262144, "bfloat16", @@ -217,9 +222,9 @@ def test_suggested_quantizations(self): @pytest.mark.parametrize( "config_file, expected_quantizations", [ - ("Devstral-Small-2507-GQA.json", {"4bit"}), - ("Kimi-K2-Instruct-MOE.json", {"4bit"}), - ("Qwen3-235B-A22B-Instruct-2507-FP8.json", {"4bit"}), + ("config-json-files/Devstral-Small-2507-GQA.json", {"4bit"}), + ("config-json-files/Kimi-K2-Instruct-MOE.json", {"4bit"}), + ("config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json", {"4bit"}), ], ) def test_suggested_quantizations_from_file( @@ -234,15 +239,10 @@ def test_suggested_quantizations_from_file( # --- Tests for recommend.py --- class GPUShapesIndexMock: def __init__(self): - local_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), - "../../../../ads/aqua/resources", - "gpu_shapes_index.json", - ) - # local_path = "ads/aqua/resources/gpu_shapes_index.json" - with open(local_path) as f: + # update gpu-index.json whenever accelerated-data-science/ads/aqua/resources/gpu_shapes_index.json changes + local_path = "gpu-index/gpu-index.json" + with open(os.path.join(CONFIG_ROOT, local_path)) as f: local_data = json.load(f) - local_shapes = local_data.get("shapes", {}) self.shapes = local_shapes @@ -339,39 +339,82 @@ def test_which_shapes_valid( ) @pytest.mark.parametrize( - "config_file, result_file", - [ - ("Devstral-Small-2507-GQA.json", "result-Devstral-Small-2507-GQA.json"), - ("Kimi-K2-Instruct-MOE.json", "result-Kimi-K2-Instruct-MOE.json"), + "config_file, result_file, service_managed_model", + [ # config.json cases + ( + "config-json-files/Devstral-Small-2507-GQA.json", + "config-json-files/result-Devstral-Small-2507-GQA.json", + False, + ), + ( + "config-json-files/Kimi-K2-Instruct-MOE.json", + "config-json-files/result-Kimi-K2-Instruct-MOE.json", + False, + ), + ( + "config-json-files/Qwen3-235B-A22B-Instruct-2507-FP8.json", + "config-json-files/result-Qwen3-235B-A22B-Instruct-2507-FP8.json", + False, + ), + # service-managed model (SMM) config cases + ( + "service-config/example-1.json", + "service-config/result-example-1.json", + True, + ), ( - "Qwen3-235B-A22B-Instruct-2507-FP8.json", - "result-Qwen3-235B-A22B-Instruct-2507-FP8.json", + "service-config/example-2.json", + "service-config/result-example-2.json", + True, + ), + ( + "service-config/example-3.json", + "service-config/result-example-3.json", + True, ), ], ) def test_which_shapes_valid_from_file( - self, monkeypatch, config_file, result_file,
service_managed_model, **kwargs ): - raw = load_config(config_file) app = AquaShapeRecommend() mock_model = MockDataScienceModel.create(config_file) monkeypatch.setattr( "ads.aqua.app.DataScienceModel.from_id", lambda _: mock_model ) - monkeypatch.setattr(app, "_get_model_config", lambda _: raw) shapes_index = GPUShapesIndexMock() real_shapes = [ ComputeShapeSummary(name=name, shape_series="GPU", gpu_specs=spec) for name, spec in shapes_index.shapes.items() ] + monkeypatch.setattr( - app, "valid_compute_shapes", lambda *args, **kwargs: real_shapes + app, + "valid_compute_shapes", + lambda *args, **kwargs: real_shapes, + ) - request = RequestRecommend( - model_id="ocid1.datasciencemodel.oc1.TEST", generate_table=False - ) + raw = load_config(config_file) + + if service_managed_model: + # SMM fixtures are deployment configs, so pass them to the request directly + config = AquaDeploymentConfig(**raw) + + request = RequestRecommend( + model_id="ocid1.datasciencemodel.oc1.TEST", + generate_table=False, + deployment_config=config, + ) + else: + # config.json fixtures are raw model configs, so mock the loader instead + monkeypatch.setattr(app, "_get_model_config", lambda _: raw) + + request = RequestRecommend( + model_id="ocid1.datasciencemodel.oc1.TEST", generate_table=False + ) + result = app.which_shapes(request=request) expected_result = load_config(result_file)