diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 57f0e49aa..a0bf5b233 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1100,8 +1100,8 @@ jobs: run: node scripts/generate-bundle-report.cjs frontend/dist - name: Upload bundle report - uses: actions/upload-artifact@v7 if: always() && needs.changes.outputs.frontend == 'true' + uses: actions/upload-artifact@v7 with: name: bundle-report path: frontend/dist/bundle-report.json diff --git a/ai-engine/utils/self_hosted_inference.py b/ai-engine/utils/self_hosted_inference.py index 43fba70f2..687acb91a 100644 --- a/ai-engine/utils/self_hosted_inference.py +++ b/ai-engine/utils/self_hosted_inference.py @@ -7,13 +7,11 @@ Phase 3: SGLang vs vLLM benchmark (PortKit prompt shapes) Issue: #1203 - Self-hosted LLM inference deployment -Issue: #1320 - Enforce Q5_K_M minimum quantization for production inference """ import asyncio import logging import os -import re import time from dataclasses import dataclass, field from enum import Enum @@ -22,71 +20,6 @@ logger = logging.getLogger(__name__) -MIN_QUANT_BITS_GGUF = 5 -MIN_QUANT_BITS_AWQ = 4 -MIN_AWQ_GROUP_SIZE = 128 - -QUANT_BIT_ORDER = ["Q2_K", "Q3_K", "Q4_K", "Q4_0", "Q5_K", "Q5_K_M", "Q6_K", "Q8_0"] - - -def _parse_quant_bits(model_name: str) -> Optional[int]: - """Extract quantization bit depth from a model filename or identifier.""" - pattern = re.compile(r"Q([0-9]+)_?K?|Q([0-9]+)\.") - for match in pattern.finditer(model_name): - bits = match.group(1) or match.group(2) - if bits: - try: - return int(bits) - except ValueError: - pass - return None - - -def check_quantization_floor( - model_name: str, - quant_type: str = "gguf", - awq_group_size: Optional[int] = None, -) -> tuple[bool, str]: - """ - Check if a model meets the minimum quantization floor. - - For GGUF: minimum Q5_K_M (5-bit) - For AWQ/EXL2: minimum 4-bit with group_size ≤ 128 - - Returns (passes, detail_str). - """ - if quant_type in ("gguf", "llama"): - bits = _parse_quant_bits(model_name) - if bits is None: - return True, "quantization bit depth unknown (GGUF)" - if bits < MIN_QUANT_BITS_GGUF: - return False, ( - f"model is {bits}-bit; Q5_K_M (5-bit) is the minimum floor for GGUF. " - f"Models below Q5_K_M produce syntax errors in code generation." - ) - detail = f"GGUF {bits}-bit (meets Q5_K_M floor)" - return True, detail - - elif quant_type in ("awq", "exl2", "gptq"): - bits = _parse_quant_bits(model_name) - if bits is None: - return True, "quantization bit depth unknown (AWQ/EXL2)" - if bits < MIN_QUANT_BITS_AWQ: - return False, ( - f"model is {bits}-bit; AWQ/EXL2 requires 4-bit minimum. " - f"Use AWQ 4-bit with group_size ≤ {MIN_AWQ_GROUP_SIZE}." - ) - if awq_group_size is not None and awq_group_size > MIN_AWQ_GROUP_SIZE: - return False, ( - f"AWQ group_size={awq_group_size} exceeds maximum {MIN_AWQ_GROUP_SIZE}. " - f"For reliable code generation, use group_size ≤ {MIN_AWQ_GROUP_SIZE}." - ) - detail = f"AWQ/EXL2 {bits}-bit group_size={awq_group_size or 'default'} (meets floor)" - return True, detail - - return True, "quantization type unrecognized, skipping check" - - class InferenceProvider(str, Enum): """Supported inference providers""" @@ -131,10 +64,6 @@ class InferenceConfig: # vLLM specific vllm_url: Optional[str] = None - # Quantization metadata (used for floor validation) - model_quant_type: str = "gguf" - awq_group_size: Optional[int] = None - # Performance tuning max_tokens: int = 4096 temperature: float = 0.1 @@ -149,14 +78,6 @@ class InferenceConfig: warmup_requests: int = 1 keep_alive: int = 300 # seconds - def validate_quantization(self) -> tuple[bool, str]: - """Validate that the configured model meets the quantization floor.""" - return check_quantization_floor( - self.model_name, - quant_type=self.model_quant_type, - awq_group_size=self.awq_group_size, - ) - @dataclass class InferenceResult: @@ -221,8 +142,6 @@ def _load_config_from_env(self) -> InferenceConfig: runpod_api_key=os.getenv("RUNPOD_API_KEY"), sglang_url=os.getenv("SGLANG_URL"), vllm_url=os.getenv("VLLM_URL"), - model_quant_type=os.getenv("MODEL_QUANT_TYPE", "gguf").lower(), - awq_group_size=int(os.getenv("AWQ_GROUP_SIZE", "128")), max_tokens=int(os.getenv("MAX_TOKENS", "4096")), temperature=float(os.getenv("LLM_TEMPERATURE", "0.1")), timeout=int(os.getenv("INFERENCE_TIMEOUT", "120")), @@ -231,14 +150,6 @@ def _load_config_from_env(self) -> InferenceConfig: def _initialize_client(self): """Initialize the appropriate HTTP client based on provider""" - passes, detail = self.config.validate_quantization() - if not passes: - logger.warning( - f"QUANTIZATION FLOOR WARNING for model '{self.config.model_name}': {detail}" - ) - else: - logger.info(f"Quantization check for '{self.config.model_name}': {detail}") - if self.config.endpoint_url: try: from openai import OpenAI diff --git a/ai_engine/mmsd/TRAINING_REPORT.md b/ai_engine/mmsd/TRAINING_REPORT.md index 03e32b5f5..f4285ccdd 100644 --- a/ai_engine/mmsd/TRAINING_REPORT.md +++ b/ai_engine/mmsd/TRAINING_REPORT.md @@ -145,74 +145,7 @@ python3 ai_engine/mmsd/train_portkit_coder.py --- -## 4. Training Recipe (Catastrophic Forgetting Mitigation) - -Fine-tuning exclusively on MMSD domain-specific pairs risks **catastrophic forgetting**: the model overwrites general Java/JS knowledge with Minecraft-specific patterns. The fix is a **general programming data mix** (12% of training tokens). - -### Why 12%? - -- At `r=64` (QLoRA rank), many weights are updated → high risk of forgetting -- 5–15% is the standard range cited in fine-tuning literature -- 12% preserves general reasoning while allowing MMSD specialization - -### Mixing Procedure - -```python -from datasets import load_dataset, concatenate_datasets - -# 1. Load MMSD (validated_pairs.jsonl) -mmsd = load_dataset("json", data_files="validated_pairs.jsonl")["train"] # 1,400 pairs - -# 2. Load general code dataset — filter to Java + JavaScript -general = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction", split="train") -general_java_js = general.filter(lambda x: x["lang"] in ["java", "javascript"]) - -# 3. Sample ~200 general pairs, shuffle deterministically -general_sample = general_java_js.shuffle(seed=42).select(range(200)) - -# 4. Format general examples to match Stage A prompt template -# (system prompt + user instruction + assistant code response) - -# 5. Mix to achieve ~12% general / ~88% MMSD by token count -mixed = concatenate_datasets([mmsd_formatted, general_formatted]) -mixed_token_ratio = min(general_tokens / (mmsd_tokens + general_tokens), 0.12) - -# 6. Shuffle and split 90/10 -mixed = mixed.shuffle(seed=42) -``` - -### General Code Dataset - -| Property | Value | -|----------|-------| -| Dataset | `m-a-p/CodeFeedback-Filtered-Instruction` | -| Languages | Java, JavaScript | -| Sample size | ~200 instruction pairs | -| Prompt template | General code assistant (not PortKit-specific) | -| Caching | `/tmp/portkit_general_code/general_code_sample.jsonl` | - -### Expected Effects - -| Metric | Without Mix | With Mix (12%) | -|--------|------------|----------------| -| General Java/JS tasks | Degraded | ≤ 2% regression | -| MMSD task quality | Baseline | Improved consistency | -| Edge cases (abstract classes, generics, lambdas) | May degrade | Better handling | - -### Verification - -To evaluate the effect of the mix on general code tasks: -```bash -python3 ai_engine/mmsd/evaluate.py \ - --model alexchapin/portkit-coder-7b-merged \ - --baseline Qwen/Qwen2.5-Coder-7B-Instruct \ - --eval-data ai_engine/mmsd/data/processed/validated_pairs.jsonl \ - --output evaluation_results.json -``` - ---- - -## 5. Evaluation +## 4. Evaluation ### Evaluation Script ```bash @@ -238,7 +171,7 @@ python3 ai_engine/mmsd/evaluate.py \ --- -## 6. Hugging Face Hub Repositories +## 5. Hugging Face Hub Repositories | Repository | Description | URL | |------------|-------------|-----| @@ -249,7 +182,7 @@ Both repos are set to **private** visibility. --- -## 7. Pipeline Verification +## 6. Pipeline Verification The training pipeline was verified end-to-end using `Qwen/Qwen2.5-Coder-0.5B` on CPU: diff --git a/ai_engine/mmsd/premium_client.py b/ai_engine/mmsd/premium_client.py index 56fd10043..df3b7273b 100644 --- a/ai_engine/mmsd/premium_client.py +++ b/ai_engine/mmsd/premium_client.py @@ -14,11 +14,12 @@ """ import os +import json import re import time import logging from typing import Optional -from dataclasses import dataclass +from dataclasses import dataclass, field import httpx diff --git a/ai_engine/mmsd/train_portkit_coder.py b/ai_engine/mmsd/train_portkit_coder.py index 732471025..0b3a2606a 100644 --- a/ai_engine/mmsd/train_portkit_coder.py +++ b/ai_engine/mmsd/train_portkit_coder.py @@ -1,18 +1,15 @@ #!/usr/bin/env python3 """ PortKit Coder Fine-Tuning — Stage A (Reasoning + Code Generation) -Fine-tunes Qwen2.5-Coder-7B-Instruct with QLoRA on MMSD synthesis pairs, -mixed with general Java/JS code data (12% ratio) to mitigate catastrophic forgetting. +Fine-tunes Qwen2.5-Coder-7B-Instruct with QLoRA on MMSD synthesis pairs. Data pipeline: 1. git clone portkit repo (sparse) + git lfs pull for synthesis_pairs.jsonl 2. Run structural validation → validated_pairs.jsonl -3. Download and sample general Java/JS code pairs from HuggingFace (~200 examples) -4. Format as ChatML conversations (system + user + assistant) -5. Mix datasets: ~12% general / ~88% MMSD by token count -6. 90/10 train/eval split (no shuffle, deterministic) -7. QLoRA fine-tuning with SFTTrainer -8. Push LoRA adapter + merged model to HF Hub +3. Format as ChatML conversations (system + user + assistant) +4. 90/10 train/eval split (no shuffle, deterministic) +5. QLoRA fine-tuning with SFTTrainer +6. Push LoRA adapter + merged model to HF Hub """ import os @@ -56,12 +53,6 @@ SEED = 42 TRAIN_RATIO = 0.9 -# Catastrophic forgetting mitigation: general code mix -GENERAL_CODE_DATASET = "m-a-p/CodeFeedback-Filtered-Instruction" -GENERAL_CODE_LANGUAGES = ["java", "javascript"] -GENERAL_CODE_SAMPLE_SIZE = 200 -MIX_RATIO = 0.12 # ~12% of training tokens from general code - SYSTEM_PROMPT = ( "You are PortKit, an expert at converting Minecraft Java Edition mods (Forge) " "to Bedrock Edition Add-ons. Given a mod description and Java source code, " @@ -225,148 +216,6 @@ def format_stage_a(example: dict) -> dict: } -GENERAL_SYSTEM_PROMPT = ( - "You are a general-purpose code assistant. Provide clear, correct code solutions " - "with concise explanations when helpful." -) - - -def format_general_code(example: dict) -> dict: - instruction = example.get("instruction", example.get("input", "")) - response = example.get("output", example.get("response", "")) - - if not instruction or not response: - return None - - return { - "messages": [ - {"role": "system", "content": GENERAL_SYSTEM_PROMPT}, - { - "role": "user", - "content": f"Write code for: {instruction}", - }, - { - "role": "assistant", - "content": response, - }, - ] - } - - -def load_general_code_dataset() -> list: - """Download and sample general Java/JS code pairs for catastrophic forgetting mitigation.""" - try: - from datasets import load_dataset - except ImportError: - print("[general] datasets not installed, skipping general code mix") - return [] - - cache_dir = Path("/tmp/portkit_general_code") - cache_dir.mkdir(exist_ok=True) - - sample_file = cache_dir / "general_code_sample.jsonl" - if sample_file.exists() and sample_file.stat().st_size > 1000: - print(f"[general] Using cached sample from {sample_file}") - examples = [] - with open(sample_file) as f: - for line in f: - if line.strip(): - examples.append(json.loads(line)) - return examples - - print(f"[general] Loading {GENERAL_CODE_DATASET}...") - try: - dataset = load_dataset( - GENERAL_CODE_DATASET, - split="train", - trust_remote_code=True, - cache_dir=str(cache_dir), - ) - except Exception as e: - print(f"[general] Failed to load dataset: {e}") - return [] - - lang_field = None - for candidate in ["lang", "language", " Programming_Language"]: - if candidate in dataset.column_names: - lang_field = candidate - break - - if lang_field is None: - print(f"[general] No language column found. Columns: {dataset.column_names}") - return [] - - print(f"[general] Filtering to {GENERAL_CODE_LANGUAGES}...") - filtered = dataset.filter(lambda x: x.get(lang_field) in GENERAL_CODE_LANGUAGES) - - if len(filtered) == 0: - print("[general] No examples found after filtering, skipping mix") - return [] - - sample_size = min(GENERAL_CODE_SAMPLE_SIZE, len(filtered)) - print(f"[general] Sampling {sample_size} examples from {len(filtered)} filtered") - - try: - sampled = filtered.shuffle(seed=SEED).select(range(sample_size)) - except Exception as e: - print(f"[general] Shuffle/select failed: {e}") - sampled = filtered.select(range(min(sample_size, len(filtered)))) - - examples = [] - for item in sampled: - formatted = format_general_code(item) - if formatted is not None: - examples.append(formatted) - - with open(sample_file, "w") as f: - for ex in examples: - f.write(json.dumps(ex) + "\n") - - print(f"[general] Saved {len(examples)} formatted examples") - return examples - - -def count_tokens(messages: list, tokenizer) -> int: - """Rough token count for a messages list using the tokenizer.""" - text = "" - for msg in messages: - text += msg["role"] + ": " + msg["content"] + "\n" - return len(tokenizer.encode(text)) - - -def mix_datasets(mmsd_examples: list, general_examples: list, target_ratio: float = MIX_RATIO) -> list: - """Mix MMSD and general code examples to achieve target token ratio (~12%).""" - if not general_examples: - return mmsd_examples - - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - mmsd_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in mmsd_examples) - general_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in general_examples) - - print(f"[mix] MMSD tokens: {mmsd_tokens:,}, General tokens: {general_tokens:,}") - - target_general_tokens = int(mmsd_tokens * target_ratio / (1 - target_ratio)) - general_count = general_examples - current_general_tokens = general_tokens - - if current_general_tokens > target_general_tokens: - scale = target_general_tokens / current_general_tokens - n = max(1, int(len(general_examples) * scale)) - general_count = general_examples[:n] - current_general_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in general_count) - print(f"[mix] Scaled general sample to {len(general_count)} examples to hit target ratio") - - actual_ratio = current_general_tokens / (mmsd_tokens + current_general_tokens) - print(f"[mix] Target ratio: {target_ratio:.1%}, Actual ratio: {actual_ratio:.1%}") - - mixed = mmsd_examples + general_count - import random - random.seed(SEED) - random.shuffle(mixed) - return mixed - - # ── Main ─────────────────────────────────────────────────────────────────── @@ -393,17 +242,9 @@ def main(): n = len(pairs) split = int(n * TRAIN_RATIO) - mmsd_train = [format_stage_a(p) for p in pairs[:split]] + train_ds = Dataset.from_list([format_stage_a(p) for p in pairs[:split]]) eval_ds = Dataset.from_list([format_stage_a(p) for p in pairs[split:]]) - - general_examples = load_general_code_dataset() - if general_examples: - mixed_train = mix_datasets(mmsd_train, general_examples, target_ratio=MIX_RATIO) - train_ds = Dataset.from_list(mixed_train) - print(f"Data: {n} total → {len(train_ds)} train (mixed, {len(general_examples)} general), {len(eval_ds)} eval") - else: - train_ds = Dataset.from_list(mmsd_train) - print(f"Data: {n} total → {len(train_ds)} train, {len(eval_ds)} eval") + print(f"Data: {n} total → {len(train_ds)} train, {len(eval_ds)} eval") # ── Model ─────────────────────────────────────────────────────────────── print("\nLoading model...") @@ -533,17 +374,7 @@ def main(): "warmup": WARMUP, "scheduler": SCHEDULER, }, - "data": { - "total": n, - "train": len(train_ds), - "eval": len(eval_ds), - "general_code_mix": { - "dataset": GENERAL_CODE_DATASET, - "sample_size": GENERAL_CODE_SAMPLE_SIZE, - "target_ratio": MIX_RATIO, - "languages": GENERAL_CODE_LANGUAGES, - }, - }, + "data": {"total": n, "train": len(train_ds), "eval": len(eval_ds)}, "results": { "train_loss": train_metrics.get("train_loss"), "eval_loss": eval_metrics.get("eval_loss"), diff --git a/ai_engine/tests/test_premium_client.py b/ai_engine/tests/test_premium_client.py index 3b47e3863..0ff9bb340 100644 --- a/ai_engine/tests/test_premium_client.py +++ b/ai_engine/tests/test_premium_client.py @@ -5,7 +5,8 @@ """ import pytest -from unittest.mock import patch +from unittest.mock import patch, MagicMock +import httpx class TestConversionResult: @@ -65,7 +66,7 @@ def test_default_fallback_order(self): assert model in MODEL_CONFIGS, f"{model} not in MODEL_CONFIGS" -class TestFewShotExamples: +class TestFEW_SHOT_EXAMPLES: """Tests for few-shot examples.""" def test_few_shot_examples_exist(self): @@ -126,7 +127,8 @@ def test_init_custom_fallback_models(self): from ai_engine.mmsd.premium_client import PortKitPremium client = PortKitPremium( - api_key="sk-test-key", fallback_models=["kimi-k2", "deepseek-v4-pro"] + api_key="sk-test-key", + fallback_models=["kimi-k2", "deepseek-v4-pro"] ) assert client.fallback_models == ["kimi-k2", "deepseek-v4-pro"] client.close() @@ -156,7 +158,10 @@ def test_estimate_cost(self): from ai_engine.mmsd.premium_client import PortKitPremium client = PortKitPremium(api_key="sk-test-key") - cost = client.estimate_cost(instruction="Test mod", java_source="public class Test {}") + cost = client.estimate_cost( + instruction="Test mod", + java_source="public class Test {}" + ) assert "model" in cost assert "input_tokens_est" in cost @@ -170,7 +175,9 @@ def test_estimate_cost_with_specific_model(self): client = PortKitPremium(api_key="sk-test-key") cost = client.estimate_cost( - instruction="Test mod", java_source="public class Test {}", model="kimi-k2" + instruction="Test mod", + java_source="public class Test {}", + model="kimi-k2" ) assert cost["model"] == "kimi-k2" @@ -229,6 +236,8 @@ def test_parse_output_extracts_reasoning_and_manifest(self): client.close() def test_parse_output_handles_missing_sections(self): + from ai_engine.mmsd.premium_client import ConversionResult + from ai_engine.mmsd.premium_client import PortKitPremium client = PortKitPremium(api_key="sk-test-key") diff --git a/backend/src/tests/integration/test_performance_integration.py b/backend/src/tests/integration/test_performance_integration.py index 86d0a2208..0a708115e 100644 --- a/backend/src/tests/integration/test_performance_integration.py +++ b/backend/src/tests/integration/test_performance_integration.py @@ -13,14 +13,18 @@ class TestPerformanceIntegration: def setup_method(self): """Setup test environment.""" + # Import fresh to avoid module-level state issues + from api.performance import mock_benchmark_runs, mock_benchmark_reports, mock_scenarios + self.client = TestClient(app) - # Clear mock data + # Clear mock data - ensure clean state for each test mock_benchmark_runs.clear() mock_benchmark_reports.clear() mock_scenarios.clear() - # Ensure baseline scenario exists + # Ensure baseline scenario exists - use unique ID to avoid cross-test pollution + mock_scenarios.clear() mock_scenarios["baseline_idle_001"] = { "scenario_id": "baseline_idle_001", "scenario_name": "Idle Performance", diff --git a/backend/src/tests/unit/test_api_conversions_targeted.py b/backend/src/tests/unit/test_api_conversions_targeted.py index f510930f2..048001b27 100644 --- a/backend/src/tests/unit/test_api_conversions_targeted.py +++ b/backend/src/tests/unit/test_api_conversions_targeted.py @@ -51,8 +51,12 @@ class TestConversionsAPITargeted: @patch("api.conversions.os.makedirs") @patch("api.conversions.shutil.copyfileobj") @patch("builtins.open", new_callable=mock_open) + @patch("api.conversions.cache") + @patch("api.conversions.get_celery_monitor") async def test_create_conversion_success( self, + mock_get_celery_monitor, + mock_cache, mock_file_open, mock_copyfileobj, mock_makedirs, @@ -64,10 +68,17 @@ async def test_create_conversion_success( client, mock_security_scanner, ): - # Setup mocks mock_get_db.return_value = AsyncMock() mock_get_scanner.return_value = mock_security_scanner + mock_cache.set_job_status = AsyncMock() + mock_cache.set_progress = AsyncMock() + mock_cache.get_job_status = AsyncMock(return_value=None) + + mock_monitor = MagicMock() + mock_monitor.check_queue_health.return_value = {"healthy": True, "alerts": []} + mock_get_celery_monitor.return_value = mock_monitor + mock_conv_service = MagicMock() mock_get_conversion_service.return_value = mock_conv_service @@ -77,18 +88,13 @@ async def test_create_conversion_success( mock_job.status = "queued" mock_job.created_at = datetime.now(timezone.utc) - # Make create_job an AsyncMock mock_crud.create_job = AsyncMock(return_value=mock_job) - # Test data file_content = b"fake jar content" files = {"file": ("test.jar", file_content, "application/java-archive")} options = json.dumps({"assumptions": "aggressive", "target_version": "1.21.0"}) data = {"options": options} - # We need to bypass some async file operations or mock them - # Since we're using TestClient, it's synchronous but the endpoint is async - with patch("api.conversions.validate_file_size", return_value=(True, "")): response = client.post("/api/v1/conversions", files=files, data=data) diff --git a/docs/INFERENCE_DEPLOYMENT.md b/docs/INFERENCE_DEPLOYMENT.md index a709da896..c873dab53 100644 --- a/docs/INFERENCE_DEPLOYMENT.md +++ b/docs/INFERENCE_DEPLOYMENT.md @@ -178,59 +178,6 @@ config = InferenceConfig( ) ``` -## Quantization Standards (#1320) - -**Critical for code generation quality.** Models below the quantization floor produce syntax errors (mismatched brackets, truncated JSON, invalid JavaScript) that break Bedrock compilation. - -### Minimum Quantization Floor - -| Format | Minimum | Notes | -|--------|---------|-------| -| **GGUF** | **Q5_K_M** (5-bit) | Q4 and below introduce measurable syntax errors in code generation | -| **AWQ / EXL2** | **4-bit, group_size ≤ 128** | group_size 128 preserves more precision than default 128 | -| **vLLM / SGLang** | Same as above | Both use the same model files; enforce via quantization type | - -### Why Q5_K_M? - -> "Q5_K_M (5-bit) quantization is the absolute minimum threshold for reliable code generation; 4-bit and below often introduce syntax errors (e.g., mismatched brackets) that break compilation." - -Bedrock Add-on output is structured code: `manifest.json` must be valid JSON with exact field names, and `.js` files must parse without syntax errors. Q4 quantization artifacts (off-by-one token predictions, truncated outputs) silently produce invalid output that fails downstream validation. - -### Configuration - -Set quantization metadata via environment variables: - -```bash -# Model file name includes quantization, e.g.: -# Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf -# portkit-coder-7b-merged-Q5_K_M.gguf - -# Quantization type (gguf | awq | exl2 | gptq) -MODEL_QUANT_TYPE=gguf - -# For AWQ only: group size (default 128) -AWQ_GROUP_SIZE=128 -``` - -The inference client validates the quantization floor on startup and logs a warning if the model is below threshold. - -### Selecting Quantized Models - -For Qwen2.5-Coder-7B on HuggingFace, Bartowski's quantized variants: - -| Quantization | File Pattern | Bedrock Quality | -|--------------|-------------|-----------------| -| Q6_K | `*Q6_K*` | Best; ~7.5GB | -| Q5_K_M | `*Q5_K_M*` | Recommended minimum; ~4.8GB | -| Q4_K_M | `*Q4_K_M*` | Avoid — syntax errors in code generation | - -```bash -# Example: Download Q5_K_M GGUF via huggingface-cli -huggingface-cli download Bartowski/Qwen2.5-Coder-7B-Instruct-GGUF \ - Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf \ - --local-dir ./models -``` - ## Phase 3: SGLang Benchmark **When**: Post-beta, after Phase 2 is stable