diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 57f0e49aa..a0bf5b233 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1100,8 +1100,8 @@ jobs:
         run: node scripts/generate-bundle-report.cjs frontend/dist
 
       - name: Upload bundle report
-        uses: actions/upload-artifact@v7
         if: always() && needs.changes.outputs.frontend == 'true'
+        uses: actions/upload-artifact@v7
         with:
           name: bundle-report
           path: frontend/dist/bundle-report.json
diff --git a/ai-engine/utils/self_hosted_inference.py b/ai-engine/utils/self_hosted_inference.py
index 43fba70f2..687acb91a 100644
--- a/ai-engine/utils/self_hosted_inference.py
+++ b/ai-engine/utils/self_hosted_inference.py
@@ -7,13 +7,11 @@
 Phase 3: SGLang vs vLLM benchmark (PortKit prompt shapes)
 
 Issue: #1203 - Self-hosted LLM inference deployment
-Issue: #1320 - Enforce Q5_K_M minimum quantization for production inference
 """
 
 import asyncio
 import logging
 import os
-import re
 import time
 from dataclasses import dataclass, field
 from enum import Enum
@@ -22,71 +20,6 @@
 logger = logging.getLogger(__name__)
 
 
-MIN_QUANT_BITS_GGUF = 5
-MIN_QUANT_BITS_AWQ = 4
-MIN_AWQ_GROUP_SIZE = 128
-
-QUANT_BIT_ORDER = ["Q2_K", "Q3_K", "Q4_K", "Q4_0", "Q5_K", "Q5_K_M", "Q6_K", "Q8_0"]
-
-
-def _parse_quant_bits(model_name: str) -> Optional[int]:
-    """Extract quantization bit depth from a model filename or identifier."""
-    pattern = re.compile(r"Q([0-9]+)_?K?|Q([0-9]+)\.")
-    for match in pattern.finditer(model_name):
-        bits = match.group(1) or match.group(2)
-        if bits:
-            try:
-                return int(bits)
-            except ValueError:
-                pass
-    return None
-
-
-def check_quantization_floor(
-    model_name: str,
-    quant_type: str = "gguf",
-    awq_group_size: Optional[int] = None,
-) -> tuple[bool, str]:
-    """
-    Check if a model meets the minimum quantization floor.
-
-    For GGUF: minimum Q5_K_M (5-bit)
-    For AWQ/EXL2: minimum 4-bit with group_size ≤ 128
-
-    Returns (passes, detail_str).
-    """
-    if quant_type in ("gguf", "llama"):
-        bits = _parse_quant_bits(model_name)
-        if bits is None:
-            return True, "quantization bit depth unknown (GGUF)"
-        if bits < MIN_QUANT_BITS_GGUF:
-            return False, (
-                f"model is {bits}-bit; Q5_K_M (5-bit) is the minimum floor for GGUF. "
-                f"Models below Q5_K_M produce syntax errors in code generation."
-            )
-        detail = f"GGUF {bits}-bit (meets Q5_K_M floor)"
-        return True, detail
-
-    elif quant_type in ("awq", "exl2", "gptq"):
-        bits = _parse_quant_bits(model_name)
-        if bits is None:
-            return True, "quantization bit depth unknown (AWQ/EXL2)"
-        if bits < MIN_QUANT_BITS_AWQ:
-            return False, (
-                f"model is {bits}-bit; AWQ/EXL2 requires 4-bit minimum. "
-                f"Use AWQ 4-bit with group_size ≤ {MIN_AWQ_GROUP_SIZE}."
-            )
-        if awq_group_size is not None and awq_group_size > MIN_AWQ_GROUP_SIZE:
-            return False, (
-                f"AWQ group_size={awq_group_size} exceeds maximum {MIN_AWQ_GROUP_SIZE}. "
-                f"For reliable code generation, use group_size ≤ {MIN_AWQ_GROUP_SIZE}."
-            )
-        detail = f"AWQ/EXL2 {bits}-bit group_size={awq_group_size or 'default'} (meets floor)"
-        return True, detail
-
-    return True, "quantization type unrecognized, skipping check"
-
-
 class InferenceProvider(str, Enum):
     """Supported inference providers"""
 
@@ -131,10 +64,6 @@ class InferenceConfig:
     # vLLM specific
     vllm_url: Optional[str] = None
 
-    # Quantization metadata (used for floor validation)
-    model_quant_type: str = "gguf"
-    awq_group_size: Optional[int] = None
-
     # Performance tuning
     max_tokens: int = 4096
     temperature: float = 0.1
@@ -149,14 +78,6 @@ class InferenceConfig:
     warmup_requests: int = 1
     keep_alive: int = 300  # seconds
 
-    def validate_quantization(self) -> tuple[bool, str]:
-        """Validate that the configured model meets the quantization floor."""
-        return check_quantization_floor(
-            self.model_name,
-            quant_type=self.model_quant_type,
-            awq_group_size=self.awq_group_size,
-        )
-
 
 @dataclass
 class InferenceResult:
@@ -221,8 +142,6 @@ def _load_config_from_env(self) -> InferenceConfig:
             runpod_api_key=os.getenv("RUNPOD_API_KEY"),
             sglang_url=os.getenv("SGLANG_URL"),
             vllm_url=os.getenv("VLLM_URL"),
-            model_quant_type=os.getenv("MODEL_QUANT_TYPE", "gguf").lower(),
-            awq_group_size=int(os.getenv("AWQ_GROUP_SIZE", "128")),
             max_tokens=int(os.getenv("MAX_TOKENS", "4096")),
             temperature=float(os.getenv("LLM_TEMPERATURE", "0.1")),
             timeout=int(os.getenv("INFERENCE_TIMEOUT", "120")),
@@ -231,14 +150,6 @@ def _load_config_from_env(self) -> InferenceConfig:
 
     def _initialize_client(self):
         """Initialize the appropriate HTTP client based on provider"""
-        passes, detail = self.config.validate_quantization()
-        if not passes:
-            logger.warning(
-                f"QUANTIZATION FLOOR WARNING for model '{self.config.model_name}': {detail}"
-            )
-        else:
-            logger.info(f"Quantization check for '{self.config.model_name}': {detail}")
-
         if self.config.endpoint_url:
             try:
                 from openai import OpenAI
diff --git a/ai_engine/mmsd/TRAINING_REPORT.md b/ai_engine/mmsd/TRAINING_REPORT.md
index 03e32b5f5..f4285ccdd 100644
--- a/ai_engine/mmsd/TRAINING_REPORT.md
+++ b/ai_engine/mmsd/TRAINING_REPORT.md
@@ -145,74 +145,7 @@ python3 ai_engine/mmsd/train_portkit_coder.py
 
 ---
 
-## 4. Training Recipe (Catastrophic Forgetting Mitigation)
-
-Fine-tuning exclusively on MMSD domain-specific pairs risks **catastrophic forgetting**: the model overwrites general Java/JS knowledge with Minecraft-specific patterns. The fix is a **general programming data mix** (12% of training tokens).
-
-### Why 12%?
-
-- At `r=64` (QLoRA rank), many weights are updated → high risk of forgetting
-- 5–15% is the standard range cited in fine-tuning literature
-- 12% preserves general reasoning while allowing MMSD specialization
-
-### Mixing Procedure
-
-```python
-from datasets import load_dataset, concatenate_datasets
-
-# 1. Load MMSD (validated_pairs.jsonl)
-mmsd = load_dataset("json", data_files="validated_pairs.jsonl")["train"]  # 1,400 pairs
-
-# 2. Load general code dataset — filter to Java + JavaScript
-general = load_dataset("m-a-p/CodeFeedback-Filtered-Instruction", split="train")
-general_java_js = general.filter(lambda x: x["lang"] in ["java", "javascript"])
-
-# 3. Sample ~200 general pairs, shuffle deterministically
-general_sample = general_java_js.shuffle(seed=42).select(range(200))
-
-# 4. Format general examples to match Stage A prompt template
-# (system prompt + user instruction + assistant code response)
-
-# 5. Mix to achieve ~12% general / ~88% MMSD by token count
-mixed = concatenate_datasets([mmsd_formatted, general_formatted])
-mixed_token_ratio = min(general_tokens / (mmsd_tokens + general_tokens), 0.12)
-
-# 6. Shuffle and split 90/10
-mixed = mixed.shuffle(seed=42)
-```
-
-### General Code Dataset
-
-| Property | Value |
-|----------|-------|
-| Dataset | `m-a-p/CodeFeedback-Filtered-Instruction` |
-| Languages | Java, JavaScript |
-| Sample size | ~200 instruction pairs |
-| Prompt template | General code assistant (not PortKit-specific) |
-| Caching | `/tmp/portkit_general_code/general_code_sample.jsonl` |
-
-### Expected Effects
-
-| Metric | Without Mix | With Mix (12%) |
-|--------|------------|----------------|
-| General Java/JS tasks | Degraded | ≤ 2% regression |
-| MMSD task quality | Baseline | Improved consistency |
-| Edge cases (abstract classes, generics, lambdas) | May degrade | Better handling |
-
-### Verification
-
-To evaluate the effect of the mix on general code tasks:
-```bash
-python3 ai_engine/mmsd/evaluate.py \
-    --model alexchapin/portkit-coder-7b-merged \
-    --baseline Qwen/Qwen2.5-Coder-7B-Instruct \
-    --eval-data ai_engine/mmsd/data/processed/validated_pairs.jsonl \
-    --output evaluation_results.json
-```
-
----
-
-## 5. Evaluation
+## 4. Evaluation
 
 ### Evaluation Script
 ```bash
@@ -238,7 +171,7 @@ python3 ai_engine/mmsd/evaluate.py \
 
 ---
 
-## 6. Hugging Face Hub Repositories
+## 5. Hugging Face Hub Repositories
 
 | Repository | Description | URL |
 |------------|-------------|-----|
@@ -249,7 +182,7 @@ Both repos are set to **private** visibility.
 
 ---
 
-## 7. Pipeline Verification
+## 6. Pipeline Verification
 
 The training pipeline was verified end-to-end using `Qwen/Qwen2.5-Coder-0.5B` on CPU:
 
diff --git a/ai_engine/mmsd/premium_client.py b/ai_engine/mmsd/premium_client.py
index 56fd10043..df3b7273b 100644
--- a/ai_engine/mmsd/premium_client.py
+++ b/ai_engine/mmsd/premium_client.py
@@ -14,11 +14,12 @@
 """
 
 import os
+import json
 import re
 import time
 import logging
 from typing import Optional
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import httpx
 
diff --git a/ai_engine/mmsd/train_portkit_coder.py b/ai_engine/mmsd/train_portkit_coder.py
index 732471025..0b3a2606a 100644
--- a/ai_engine/mmsd/train_portkit_coder.py
+++ b/ai_engine/mmsd/train_portkit_coder.py
@@ -1,18 +1,15 @@
 #!/usr/bin/env python3
 """
 PortKit Coder Fine-Tuning — Stage A (Reasoning + Code Generation)
-Fine-tunes Qwen2.5-Coder-7B-Instruct with QLoRA on MMSD synthesis pairs,
-mixed with general Java/JS code data (12% ratio) to mitigate catastrophic forgetting.
+Fine-tunes Qwen2.5-Coder-7B-Instruct with QLoRA on MMSD synthesis pairs.
 
 Data pipeline:
 1. git clone portkit repo (sparse) + git lfs pull for synthesis_pairs.jsonl
 2. Run structural validation → validated_pairs.jsonl
-3. Download and sample general Java/JS code pairs from HuggingFace (~200 examples)
-4. Format as ChatML conversations (system + user + assistant)
-5. Mix datasets: ~12% general / ~88% MMSD by token count
-6. 90/10 train/eval split (no shuffle, deterministic)
-7. QLoRA fine-tuning with SFTTrainer
-8. Push LoRA adapter + merged model to HF Hub
+3. Format as ChatML conversations (system + user + assistant)
+4. 90/10 train/eval split (no shuffle, deterministic)
+5. QLoRA fine-tuning with SFTTrainer
+6. Push LoRA adapter + merged model to HF Hub
 """
 
 import os
@@ -56,12 +53,6 @@
 SEED = 42
 TRAIN_RATIO = 0.9
 
-# Catastrophic forgetting mitigation: general code mix
-GENERAL_CODE_DATASET = "m-a-p/CodeFeedback-Filtered-Instruction"
-GENERAL_CODE_LANGUAGES = ["java", "javascript"]
-GENERAL_CODE_SAMPLE_SIZE = 200
-MIX_RATIO = 0.12  # ~12% of training tokens from general code
-
 SYSTEM_PROMPT = (
     "You are PortKit, an expert at converting Minecraft Java Edition mods (Forge) "
     "to Bedrock Edition Add-ons. Given a mod description and Java source code, "
@@ -225,148 +216,6 @@ def format_stage_a(example: dict) -> dict:
     }
 
 
-GENERAL_SYSTEM_PROMPT = (
-    "You are a general-purpose code assistant. Provide clear, correct code solutions "
-    "with concise explanations when helpful."
-)
-
-
-def format_general_code(example: dict) -> dict:
-    instruction = example.get("instruction", example.get("input", ""))
-    response = example.get("output", example.get("response", ""))
-
-    if not instruction or not response:
-        return None
-
-    return {
-        "messages": [
-            {"role": "system", "content": GENERAL_SYSTEM_PROMPT},
-            {
-                "role": "user",
-                "content": f"Write code for: {instruction}",
-            },
-            {
-                "role": "assistant",
-                "content": response,
-            },
-        ]
-    }
-
-
-def load_general_code_dataset() -> list:
-    """Download and sample general Java/JS code pairs for catastrophic forgetting mitigation."""
-    try:
-        from datasets import load_dataset
-    except ImportError:
-        print("[general] datasets not installed, skipping general code mix")
-        return []
-
-    cache_dir = Path("/tmp/portkit_general_code")
-    cache_dir.mkdir(exist_ok=True)
-
-    sample_file = cache_dir / "general_code_sample.jsonl"
-    if sample_file.exists() and sample_file.stat().st_size > 1000:
-        print(f"[general] Using cached sample from {sample_file}")
-        examples = []
-        with open(sample_file) as f:
-            for line in f:
-                if line.strip():
-                    examples.append(json.loads(line))
-        return examples
-
-    print(f"[general] Loading {GENERAL_CODE_DATASET}...")
-    try:
-        dataset = load_dataset(
-            GENERAL_CODE_DATASET,
-            split="train",
-            trust_remote_code=True,
-            cache_dir=str(cache_dir),
-        )
-    except Exception as e:
-        print(f"[general] Failed to load dataset: {e}")
-        return []
-
-    lang_field = None
-    for candidate in ["lang", "language", " Programming_Language"]:
-        if candidate in dataset.column_names:
-            lang_field = candidate
-            break
-
-    if lang_field is None:
-        print(f"[general] No language column found. Columns: {dataset.column_names}")
-        return []
-
-    print(f"[general] Filtering to {GENERAL_CODE_LANGUAGES}...")
-    filtered = dataset.filter(lambda x: x.get(lang_field) in GENERAL_CODE_LANGUAGES)
-
-    if len(filtered) == 0:
-        print("[general] No examples found after filtering, skipping mix")
-        return []
-
-    sample_size = min(GENERAL_CODE_SAMPLE_SIZE, len(filtered))
-    print(f"[general] Sampling {sample_size} examples from {len(filtered)} filtered")
-
-    try:
-        sampled = filtered.shuffle(seed=SEED).select(range(sample_size))
-    except Exception as e:
-        print(f"[general] Shuffle/select failed: {e}")
-        sampled = filtered.select(range(min(sample_size, len(filtered))))
-
-    examples = []
-    for item in sampled:
-        formatted = format_general_code(item)
-        if formatted is not None:
-            examples.append(formatted)
-
-    with open(sample_file, "w") as f:
-        for ex in examples:
-            f.write(json.dumps(ex) + "\n")
-
-    print(f"[general] Saved {len(examples)} formatted examples")
-    return examples
-
-
-def count_tokens(messages: list, tokenizer) -> int:
-    """Rough token count for a messages list using the tokenizer."""
-    text = ""
-    for msg in messages:
-        text += msg["role"] + ": " + msg["content"] + "\n"
-    return len(tokenizer.encode(text))
-
-
-def mix_datasets(mmsd_examples: list, general_examples: list, target_ratio: float = MIX_RATIO) -> list:
-    """Mix MMSD and general code examples to achieve target token ratio (~12%)."""
-    if not general_examples:
-        return mmsd_examples
-
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-    mmsd_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in mmsd_examples)
-    general_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in general_examples)
-
-    print(f"[mix] MMSD tokens: {mmsd_tokens:,}, General tokens: {general_tokens:,}")
-
-    target_general_tokens = int(mmsd_tokens * target_ratio / (1 - target_ratio))
-    general_count = general_examples
-    current_general_tokens = general_tokens
-
-    if current_general_tokens > target_general_tokens:
-        scale = target_general_tokens / current_general_tokens
-        n = max(1, int(len(general_examples) * scale))
-        general_count = general_examples[:n]
-        current_general_tokens = sum(count_tokens(ex["messages"], tokenizer) for ex in general_count)
-        print(f"[mix] Scaled general sample to {len(general_count)} examples to hit target ratio")
-
-    actual_ratio = current_general_tokens / (mmsd_tokens + current_general_tokens)
-    print(f"[mix] Target ratio: {target_ratio:.1%}, Actual ratio: {actual_ratio:.1%}")
-
-    mixed = mmsd_examples + general_count
-    import random
-    random.seed(SEED)
-    random.shuffle(mixed)
-    return mixed
-
-
 # ── Main ───────────────────────────────────────────────────────────────────
 
 
@@ -393,17 +242,9 @@ def main():
 
     n = len(pairs)
     split = int(n * TRAIN_RATIO)
-    mmsd_train = [format_stage_a(p) for p in pairs[:split]]
+    train_ds = Dataset.from_list([format_stage_a(p) for p in pairs[:split]])
     eval_ds = Dataset.from_list([format_stage_a(p) for p in pairs[split:]])
-
-    general_examples = load_general_code_dataset()
-    if general_examples:
-        mixed_train = mix_datasets(mmsd_train, general_examples, target_ratio=MIX_RATIO)
-        train_ds = Dataset.from_list(mixed_train)
-        print(f"Data: {n} total → {len(train_ds)} train (mixed, {len(general_examples)} general), {len(eval_ds)} eval")
-    else:
-        train_ds = Dataset.from_list(mmsd_train)
-        print(f"Data: {n} total → {len(train_ds)} train, {len(eval_ds)} eval")
+    print(f"Data: {n} total → {len(train_ds)} train, {len(eval_ds)} eval")
 
     # ── Model ───────────────────────────────────────────────────────────────
     print("\nLoading model...")
@@ -533,17 +374,7 @@ def main():
             "warmup": WARMUP,
             "scheduler": SCHEDULER,
         },
-        "data": {
-            "total": n,
-            "train": len(train_ds),
-            "eval": len(eval_ds),
-            "general_code_mix": {
-                "dataset": GENERAL_CODE_DATASET,
-                "sample_size": GENERAL_CODE_SAMPLE_SIZE,
-                "target_ratio": MIX_RATIO,
-                "languages": GENERAL_CODE_LANGUAGES,
-            },
-        },
+        "data": {"total": n, "train": len(train_ds), "eval": len(eval_ds)},
         "results": {
             "train_loss": train_metrics.get("train_loss"),
             "eval_loss": eval_metrics.get("eval_loss"),
diff --git a/ai_engine/tests/test_premium_client.py b/ai_engine/tests/test_premium_client.py
index 3b47e3863..0ff9bb340 100644
--- a/ai_engine/tests/test_premium_client.py
+++ b/ai_engine/tests/test_premium_client.py
@@ -5,7 +5,8 @@
 """
 
 import pytest
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
+import httpx
 
 
 class TestConversionResult:
@@ -65,7 +66,7 @@ def test_default_fallback_order(self):
             assert model in MODEL_CONFIGS, f"{model} not in MODEL_CONFIGS"
 
 
-class TestFewShotExamples:
+class TestFEW_SHOT_EXAMPLES:
     """Tests for few-shot examples."""
 
     def test_few_shot_examples_exist(self):
@@ -126,7 +127,8 @@ def test_init_custom_fallback_models(self):
         from ai_engine.mmsd.premium_client import PortKitPremium
 
         client = PortKitPremium(
-            api_key="sk-test-key", fallback_models=["kimi-k2", "deepseek-v4-pro"]
+            api_key="sk-test-key",
+            fallback_models=["kimi-k2", "deepseek-v4-pro"]
         )
         assert client.fallback_models == ["kimi-k2", "deepseek-v4-pro"]
         client.close()
@@ -156,7 +158,10 @@ def test_estimate_cost(self):
         from ai_engine.mmsd.premium_client import PortKitPremium
 
         client = PortKitPremium(api_key="sk-test-key")
-        cost = client.estimate_cost(instruction="Test mod", java_source="public class Test {}")
+        cost = client.estimate_cost(
+            instruction="Test mod",
+            java_source="public class Test {}"
+        )
 
         assert "model" in cost
         assert "input_tokens_est" in cost
@@ -170,7 +175,9 @@ def test_estimate_cost_with_specific_model(self):
 
         client = PortKitPremium(api_key="sk-test-key")
         cost = client.estimate_cost(
-            instruction="Test mod", java_source="public class Test {}", model="kimi-k2"
+            instruction="Test mod",
+            java_source="public class Test {}",
+            model="kimi-k2"
         )
 
         assert cost["model"] == "kimi-k2"
@@ -229,6 +236,8 @@ def test_parse_output_extracts_reasoning_and_manifest(self):
         client.close()
 
     def test_parse_output_handles_missing_sections(self):
+        from ai_engine.mmsd.premium_client import ConversionResult
+
         from ai_engine.mmsd.premium_client import PortKitPremium
 
         client = PortKitPremium(api_key="sk-test-key")
diff --git a/backend/src/tests/integration/test_performance_integration.py b/backend/src/tests/integration/test_performance_integration.py
index 86d0a2208..0a708115e 100644
--- a/backend/src/tests/integration/test_performance_integration.py
+++ b/backend/src/tests/integration/test_performance_integration.py
@@ -13,14 +13,18 @@ class TestPerformanceIntegration:
 
     def setup_method(self):
         """Setup test environment."""
+        # Local import of the module-level mock stores; imports are cached, so state is reset by the clear() calls below
+        from api.performance import mock_benchmark_runs, mock_benchmark_reports, mock_scenarios
+
         self.client = TestClient(app)
 
-        # Clear mock data
+        # Clear mock data - ensure clean state for each test
         mock_benchmark_runs.clear()
         mock_benchmark_reports.clear()
         mock_scenarios.clear()
 
-        # Ensure baseline scenario exists
+        # Reset scenarios and re-seed the fixed baseline entry so every test starts from the same state
+        mock_scenarios.clear()
         mock_scenarios["baseline_idle_001"] = {
             "scenario_id": "baseline_idle_001",
             "scenario_name": "Idle Performance",
diff --git a/backend/src/tests/unit/test_api_conversions_targeted.py b/backend/src/tests/unit/test_api_conversions_targeted.py
index f510930f2..048001b27 100644
--- a/backend/src/tests/unit/test_api_conversions_targeted.py
+++ b/backend/src/tests/unit/test_api_conversions_targeted.py
@@ -51,8 +51,12 @@ class TestConversionsAPITargeted:
     @patch("api.conversions.os.makedirs")
     @patch("api.conversions.shutil.copyfileobj")
     @patch("builtins.open", new_callable=mock_open)
+    @patch("api.conversions.cache")
+    @patch("api.conversions.get_celery_monitor")
     async def test_create_conversion_success(
         self,
+        mock_get_celery_monitor,
+        mock_cache,
         mock_file_open,
         mock_copyfileobj,
         mock_makedirs,
@@ -64,10 +68,17 @@ async def test_create_conversion_success(
         client,
         mock_security_scanner,
     ):
-        # Setup mocks
         mock_get_db.return_value = AsyncMock()
         mock_get_scanner.return_value = mock_security_scanner
 
+        mock_cache.set_job_status = AsyncMock()
+        mock_cache.set_progress = AsyncMock()
+        mock_cache.get_job_status = AsyncMock(return_value=None)
+
+        mock_monitor = MagicMock()
+        mock_monitor.check_queue_health.return_value = {"healthy": True, "alerts": []}
+        mock_get_celery_monitor.return_value = mock_monitor
+
         mock_conv_service = MagicMock()
         mock_get_conversion_service.return_value = mock_conv_service
 
@@ -77,18 +88,13 @@ async def test_create_conversion_success(
         mock_job.status = "queued"
         mock_job.created_at = datetime.now(timezone.utc)
 
-        # Make create_job an AsyncMock
         mock_crud.create_job = AsyncMock(return_value=mock_job)
 
-        # Test data
         file_content = b"fake jar content"
         files = {"file": ("test.jar", file_content, "application/java-archive")}
         options = json.dumps({"assumptions": "aggressive", "target_version": "1.21.0"})
         data = {"options": options}
 
-        # We need to bypass some async file operations or mock them
-        # Since we're using TestClient, it's synchronous but the endpoint is async
-
         with patch("api.conversions.validate_file_size", return_value=(True, "")):
             response = client.post("/api/v1/conversions", files=files, data=data)
 
diff --git a/docs/INFERENCE_DEPLOYMENT.md b/docs/INFERENCE_DEPLOYMENT.md
index a709da896..c873dab53 100644
--- a/docs/INFERENCE_DEPLOYMENT.md
+++ b/docs/INFERENCE_DEPLOYMENT.md
@@ -178,59 +178,6 @@ config = InferenceConfig(
 )
 ```
 
-## Quantization Standards (#1320)
-
-**Critical for code generation quality.** Models below the quantization floor produce syntax errors (mismatched brackets, truncated JSON, invalid JavaScript) that break Bedrock compilation.
-
-### Minimum Quantization Floor
-
-| Format | Minimum | Notes |
-|--------|---------|-------|
-| **GGUF** | **Q5_K_M** (5-bit) | Q4 and below introduce measurable syntax errors in code generation |
-| **AWQ / EXL2** | **4-bit, group_size ≤ 128** | group_size 128 preserves more precision than default 128 |
-| **vLLM / SGLang** | Same as above | Both use the same model files; enforce via quantization type |
-
-### Why Q5_K_M?
-
-> "Q5_K_M (5-bit) quantization is the absolute minimum threshold for reliable code generation; 4-bit and below often introduce syntax errors (e.g., mismatched brackets) that break compilation."
-
-Bedrock Add-on output is structured code: `manifest.json` must be valid JSON with exact field names, and `.js` files must parse without syntax errors. Q4 quantization artifacts (off-by-one token predictions, truncated outputs) silently produce invalid output that fails downstream validation.
-
-### Configuration
-
-Set quantization metadata via environment variables:
-
-```bash
-# Model file name includes quantization, e.g.:
-#   Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf
-#   portkit-coder-7b-merged-Q5_K_M.gguf
-
-# Quantization type (gguf | awq | exl2 | gptq)
-MODEL_QUANT_TYPE=gguf
-
-# For AWQ only: group size (default 128)
-AWQ_GROUP_SIZE=128
-```
-
-The inference client validates the quantization floor on startup and logs a warning if the model is below threshold.
-
-### Selecting Quantized Models
-
-For Qwen2.5-Coder-7B on HuggingFace, Bartowski's quantized variants:
-
-| Quantization | File Pattern | Bedrock Quality |
-|--------------|-------------|-----------------|
-| Q6_K | `*Q6_K*` | Best; ~7.5GB |
-| Q5_K_M | `*Q5_K_M*` | Recommended minimum; ~4.8GB |
-| Q4_K_M | `*Q4_K_M*` | Avoid — syntax errors in code generation |
-
-```bash
-# Example: Download Q5_K_M GGUF via huggingface-cli
-huggingface-cli download Bartowski/Qwen2.5-Coder-7B-Instruct-GGUF \
-  Qwen2.5-Coder-7B-Instruct-Q5_K_M.gguf \
-  --local-dir ./models
-```
-
 ## Phase 3: SGLang Benchmark
 
 **When**: Post-beta, after Phase 2 is stable