Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions demo_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash
# TurboQuant+ End-to-End Demo
# Hardware: Apple Silicon Mac with Metal
# Reproduces: Python prototype + llama.cpp inference + perplexity validation
#
# Prerequisites:
#   - Python venv at <repo>/.venv with dev dependencies installed
#   - llama-cpp-turboquant built at ~/llama-cpp-turboquant (see README)
#   - Qwen2.5-1.5B-Instruct Q8_0 GGUF at ~/models
#   - Optional: ~/models/wikitext-2-raw.txt (Step 4 is skipped if absent)

# -e: stop on error; -u: error on unset vars; pipefail: a failing pytest/bench
# stage is no longer masked by the tail/grep at the end of the pipeline.
set -euo pipefail

REPO_DIR="$(cd "$(dirname "$0")" && pwd)"
LLAMA_DIR="$HOME/llama-cpp-turboquant"
MODEL="$HOME/models/qwen2.5-1.5b-instruct-q8_0.gguf"
WIKI="$HOME/models/wikitext-2-raw.txt"
BENCH="$LLAMA_DIR/build/bin/llama-bench"
PPL="$LLAMA_DIR/build/bin/llama-perplexity"

# die MSG... — print each message line to stderr and exit non-zero.
die() { printf '%s\n' "$@" >&2; exit 1; }

# run_bench K_TYPE V_TYPE — one llama-bench run (pp512 + tg128, 3 reps),
# printing only the markdown result rows ("| ... t/s").
run_bench() {
  # '|| true': an empty grep match must not abort the demo under set -e/pipefail.
  "$BENCH" -m "$MODEL" -ngl 99 -fa 1 -ctk "$1" -ctv "$2" -p 512 -n 128 -r 3 2>&1 \
    | grep -E '^\|.*t/s' || true
}

# run_ppl K_TYPE V_TYPE — one llama-perplexity run over $WIKI,
# emitting only the "Final estimate" line.
run_ppl() {
  "$PPL" -m "$MODEL" -ngl 99 -fa 1 -ctk "$1" -ctv "$2" -c 512 --chunks 10 -f "$WIKI" 2>&1 \
    | grep "Final estimate" || true
}

echo "============================================================"
echo "TurboQuant+ Demo — $(date)"
echo "============================================================"
echo ""

# --- Step 1: Python prototype ---
echo ">>> Step 1: Python tests (551 tests)"
cd "$REPO_DIR"   # pytest resolves tests/ relative to the repo root, not caller cwd
# shellcheck disable=SC1091 — venv is created by the user, not shipped in-repo
source "$REPO_DIR/.venv/bin/activate"
python3 -m pytest tests/ -q 2>&1 | tail -3
echo ""

# --- Step 2: Compression demo ---
echo ">>> Step 2: Compression demo"
python3 benchmarks/demo.py 2>&1 \
  | grep -E "bit TurboQuant|MSE:|Cosine|Compression|demos complete" || true
echo ""

# --- Step 3: llama.cpp benchmarks ---
[ -x "$BENCH" ] || die \
  "ERROR: llama-bench not found at $BENCH" \
  "Build llama-cpp-turboquant first (see README)."
[ -f "$MODEL" ] || die \
  "ERROR: Model not found at $MODEL" \
  "Download: hf download Qwen/Qwen2.5-1.5B-Instruct-GGUF qwen2.5-1.5b-instruct-q8_0.gguf --local-dir ~/models"

echo ">>> Step 3: llama-bench speed comparison (pp512 + tg128, 3 runs each)"
echo ""
echo "--- q8_0 baseline ---"
run_bench q8_0 q8_0
echo ""
echo "--- turbo4 symmetric ---"
run_bench turbo4 turbo4
echo ""
echo "--- q8_0-K + turbo4-V (asymmetric, recommended) ---"
run_bench q8_0 turbo4
echo ""

# --- Step 4: Perplexity ---
if [ ! -f "$WIKI" ]; then
  echo "SKIP: wikitext-2-raw.txt not found, skipping perplexity"
else
  echo ">>> Step 4: Perplexity comparison (wikitext-2, 512 ctx, 10 chunks)"
  echo ""
  PPL_Q8=$(run_ppl q8_0 q8_0)
  PPL_T4=$(run_ppl turbo4 turbo4)
  PPL_ASYM=$(run_ppl q8_0 turbo4)

  echo " q8_0 baseline: $PPL_Q8"
  echo " turbo4 symmetric: $PPL_T4"
  echo " q8_0-K + turbo4-V: $PPL_ASYM"
fi

echo ""
echo "============================================================"
echo "Demo complete."
echo "============================================================"
60 changes: 60 additions & 0 deletions docs/community-hardware/m4-pro-48gb.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Community Hardware: Apple M4 Pro 48GB

**Date**: 2026-04-13
**Hardware**: Apple M4 Pro, 48GB unified memory, macOS Darwin 25.3.0
**GPU Family**: MTLGPUFamilyApple9 (1009), Metal4 (5002)
**Build**: llama.cpp feature/turboquant-kv-cache (8590cbff9, b8814)
**Model**: Qwen2.5-1.5B-Instruct Q8_0 (1.76 GiB, 1.78B params)
**Auto-detected**: 4-mag LUT (pre-M5 hardware), sparse V dequant enabled

## Speed (llama-bench, pp512 + tg128, 3 runs)

| K | V | Prefill t/s | Decode t/s | Prefill vs q8_0 | Decode vs q8_0 |
|---|---|------------|-----------|----------------|---------------|
| q8_0 | q8_0 | 2325.95 ± 8.16 | 111.58 ± 2.33 | — | — |
| turbo4 | turbo4 | 2245.42 ± 3.03 | 75.81 ± 0.55 | 0.97x | 0.68x |
| turbo3 | turbo3 | 2242.51 ± 6.70 | 72.26 ± 1.70 | 0.96x | 0.65x |
| q8_0 | turbo4 | 2270.84 ± 9.83 | 89.77 ± 1.25 | 0.98x | 0.80x |

## Perplexity (wikitext-2, 512 ctx, 10 chunks)

| K | V | PPL | vs q8_0 |
|---|---|-----|---------|
| q8_0 | q8_0 | 11.9174 ± 0.651 | baseline |
| turbo4 | turbo4 | 6921.08 ± 521.4 | catastrophic |
| q8_0 | turbo4 | 12.0483 ± 0.659 | +1.1% |

## Python Prototype

- 551 passed, 6 skipped, 0 failed (16.39s)
- Coverage: 95% (894 statements, 43 missed)
- Core modules (codebook, kv_cache, polar_quant, qjl, rotation, turboquant, utils): 100%
- Real model validation (Qwen3-1.7B): K kurtosis 918 → post-rotation Gaussian confirmed

## Key Findings

- **Asymmetric q8_0-K + turbo4-V works on M4 Pro** — +1.1% PPL, 0.80x decode, 0.98x prefill
- **Symmetric turbo4 is catastrophic on Qwen2.5-1.5B Q8_0** — consistent with documented sensitivity on small models
- **Prefill near-parity** across all configs (96–98% of q8_0)
- **Decode regression** on M4 Pro is between M1 Max and M5 Max results, as expected for pre-M5 hardware
- **4-mag LUT auto-detected** — no manual configuration needed

## Reproduction

```bash
git clone https://github.com/TheTom/turboquant_plus.git
cd turboquant_plus
python3 -m venv .venv && source .venv/bin/activate
pip install -e ".[dev]"
python3 -m pytest tests/ -v # 551 pass

# llama.cpp
git clone https://github.com/TheTom/llama-cpp-turboquant.git
cd llama-cpp-turboquant && git checkout feature/turboquant-kv-cache
cmake -B build -DGGML_METAL=ON -DGGML_METAL_EMBED_LIBRARY=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build -j

# benchmark (download model first)
./build/bin/llama-bench -m <model.gguf> -ngl 99 -fa 1 -ctk q8_0 -ctv q8_0 -p 512 -n 128 -r 3
./build/bin/llama-bench -m <model.gguf> -ngl 99 -fa 1 -ctk q8_0 -ctv turbo4 -p 512 -n 128 -r 3
```