Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions demo_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
# TurboQuant+ End-to-End Demo
# Hardware: Apple Silicon Mac with Metal
# Reproduces: Python prototype + llama.cpp inference + perplexity validation
#
# Prerequisites:
#   - Python venv at <repo>/.venv with dev dependencies installed
#   - llama-cpp-turboquant built at ~/llama-cpp-turboquant (see README)
#   - Qwen2.5-1.5B-Instruct Q8_0 GGUF at ~/models
#   - Optional: ~/models/wikitext-2-raw.txt (Step 4 is skipped if absent)

# -e: stop on error; -u: error on unset vars; pipefail: a failing pytest/bench
# stage is no longer masked by the tail/grep at the end of the pipeline.
set -euo pipefail

REPO_DIR="$(cd "$(dirname "$0")" && pwd)"
LLAMA_DIR="$HOME/llama-cpp-turboquant"
MODEL="$HOME/models/qwen2.5-1.5b-instruct-q8_0.gguf"
WIKI="$HOME/models/wikitext-2-raw.txt"
BENCH="$LLAMA_DIR/build/bin/llama-bench"
PPL="$LLAMA_DIR/build/bin/llama-perplexity"

# die MSG... — print each message line to stderr and exit non-zero.
die() { printf '%s\n' "$@" >&2; exit 1; }

# run_bench K_TYPE V_TYPE — one llama-bench run (pp512 + tg128, 3 reps),
# printing only the markdown result rows ("| ... t/s").
run_bench() {
  # '|| true': an empty grep match must not abort the demo under set -e/pipefail.
  "$BENCH" -m "$MODEL" -ngl 99 -fa 1 -ctk "$1" -ctv "$2" -p 512 -n 128 -r 3 2>&1 \
    | grep -E '^\|.*t/s' || true
}

# run_ppl K_TYPE V_TYPE — one llama-perplexity run over $WIKI,
# emitting only the "Final estimate" line.
run_ppl() {
  "$PPL" -m "$MODEL" -ngl 99 -fa 1 -ctk "$1" -ctv "$2" -c 512 --chunks 10 -f "$WIKI" 2>&1 \
    | grep "Final estimate" || true
}

echo "============================================================"
echo "TurboQuant+ Demo — $(date)"
echo "============================================================"
echo ""

# --- Step 1: Python prototype ---
echo ">>> Step 1: Python tests (551 tests)"
cd "$REPO_DIR"   # pytest resolves tests/ relative to the repo root, not caller cwd
# shellcheck disable=SC1091 — venv is created by the user, not shipped in-repo
source "$REPO_DIR/.venv/bin/activate"
python3 -m pytest tests/ -q 2>&1 | tail -3
echo ""

# --- Step 2: Compression demo ---
echo ">>> Step 2: Compression demo"
python3 benchmarks/demo.py 2>&1 \
  | grep -E "bit TurboQuant|MSE:|Cosine|Compression|demos complete" || true
echo ""

# --- Step 3: llama.cpp benchmarks ---
[ -x "$BENCH" ] || die \
  "ERROR: llama-bench not found at $BENCH" \
  "Build llama-cpp-turboquant first (see README)."
[ -f "$MODEL" ] || die \
  "ERROR: Model not found at $MODEL" \
  "Download: hf download Qwen/Qwen2.5-1.5B-Instruct-GGUF qwen2.5-1.5b-instruct-q8_0.gguf --local-dir ~/models"

echo ">>> Step 3: llama-bench speed comparison (pp512 + tg128, 3 runs each)"
echo ""
echo "--- q8_0 baseline ---"
run_bench q8_0 q8_0
echo ""
echo "--- turbo4 symmetric ---"
run_bench turbo4 turbo4
echo ""
echo "--- q8_0-K + turbo4-V (asymmetric, recommended) ---"
run_bench q8_0 turbo4
echo ""

# --- Step 4: Perplexity ---
if [ ! -f "$WIKI" ]; then
  echo "SKIP: wikitext-2-raw.txt not found, skipping perplexity"
else
  echo ">>> Step 4: Perplexity comparison (wikitext-2, 512 ctx, 10 chunks)"
  echo ""
  PPL_Q8=$(run_ppl q8_0 q8_0)
  PPL_T4=$(run_ppl turbo4 turbo4)
  PPL_ASYM=$(run_ppl q8_0 turbo4)

  echo " q8_0 baseline: $PPL_Q8"
  echo " turbo4 symmetric: $PPL_T4"
  echo " q8_0-K + turbo4-V: $PPL_ASYM"
fi

echo ""
echo "============================================================"
echo "Demo complete."
echo "============================================================"
60 changes: 60 additions & 0 deletions docs/community-hardware/m4-pro-48gb.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Community Hardware: Apple M4 Pro 48GB

**Date**: 2026-04-13
**Hardware**: Apple M4 Pro, 48GB unified memory, macOS Darwin 25.3.0
**GPU Family**: MTLGPUFamilyApple9 (1009), Metal4 (5002)
**Build**: llama.cpp feature/turboquant-kv-cache (8590cbff9, b8814)
**Model**: Qwen2.5-1.5B-Instruct Q8_0 (1.76 GiB, 1.78B params)
**Auto-detected**: 4-mag LUT (pre-M5 hardware), sparse V dequant enabled

## Speed (llama-bench, pp512 + tg128, 3 runs)

| K | V | Prefill t/s | Decode t/s | Prefill vs q8_0 | Decode vs q8_0 |
|---|---|------------|-----------|----------------|---------------|
| q8_0 | q8_0 | 2325.95 ± 8.16 | 111.58 ± 2.33 | — | — |
| turbo4 | turbo4 | 2245.42 ± 3.03 | 75.81 ± 0.55 | 0.97x | 0.68x |
| turbo3 | turbo3 | 2242.51 ± 6.70 | 72.26 ± 1.70 | 0.96x | 0.65x |
| q8_0 | turbo4 | 2270.84 ± 9.83 | 89.77 ± 1.25 | 0.98x | 0.80x |

## Perplexity (wikitext-2, 512 ctx, 10 chunks)

| K | V | PPL | vs q8_0 |
|---|---|-----|---------|
| q8_0 | q8_0 | 11.9174 ± 0.651 | baseline |
| turbo4 | turbo4 | 6921.08 ± 521.4 | catastrophic |
| q8_0 | turbo4 | 12.0483 ± 0.659 | +1.1% |

## Python Prototype

- 551 passed, 6 skipped, 0 failed (16.39s)
- Coverage: 95% (894 statements, 43 missed)
- Core modules (codebook, kv_cache, polar_quant, qjl, rotation, turboquant, utils): 100%
- Real model validation (Qwen3-1.7B): K kurtosis 918 → post-rotation Gaussian confirmed

## Key Findings

- **Asymmetric q8_0-K + turbo4-V works on M4 Pro** — +1.1% PPL, 0.80x decode, 0.98x prefill
- **Symmetric turbo4 is catastrophic on Qwen2.5-1.5B Q8_0** — consistent with documented sensitivity on small models
- **Prefill near-parity** across all configs (96–98% of q8_0)
- **Decode regression** on M4 Pro is between M1 Max and M5 Max results, as expected for pre-M5 hardware
- **4-mag LUT auto-detected** — no manual configuration needed

## Reproduction

```bash
git clone https://github.com/TheTom/turboquant_plus.git
cd turboquant_plus
python3 -m venv .venv && source .venv/bin/activate
pip install -e ".[dev]"
python3 -m pytest tests/ -v # 551 pass

# llama.cpp
git clone https://github.com/TheTom/llama-cpp-turboquant.git
cd llama-cpp-turboquant && git checkout feature/turboquant-kv-cache
cmake -B build -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build -j

# benchmark (download model first)
./build/bin/llama-bench -m <model.gguf> -ngl 99 -fa 1 -ctk q8_0 -ctv q8_0 -p 512 -n 128 -r 3
./build/bin/llama-bench -m <model.gguf> -ngl 99 -fa 1 -ctk q8_0 -ctv turbo4 -p 512 -n 128 -r 3
```