diff --git a/.github/workflows/general.yml b/.github/workflows/general.yml new file mode 100644 index 0000000..4477adf --- /dev/null +++ b/.github/workflows/general.yml @@ -0,0 +1,30 @@ +name: General CI + +on: + push: + paths-ignore: + - 'transformer.py' + - 'train_test/**' + - '**.md' + pull_request: + paths-ignore: + - 'transformer.py' + - 'train_test/**' + - '**.md' + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: '.python-version' + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: General check + run: echo "General CI passed for this commit" diff --git a/.github/workflows/test_transformer.yml b/.github/workflows/test_transformer.yml new file mode 100644 index 0000000..c57c774 --- /dev/null +++ b/.github/workflows/test_transformer.yml @@ -0,0 +1,26 @@ +name: Test Transformer + +on: + push: + paths: + - 'transformer.py' + pull_request: + paths: + - 'transformer.py' + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version-file: '.python-version' + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Run transformer tests + run: python transformer.py diff --git a/.github/workflows/train_test.yml b/.github/workflows/train_test.yml index 71ba2b2..203a41a 100644 --- a/.github/workflows/train_test.yml +++ b/.github/workflows/train_test.yml @@ -1,6 +1,12 @@ -name: Run Train Test +name: Test Train Scripts -on: [push, pull_request] +on: + push: + paths: + - 'train_test/**' + pull_request: + paths: + - 'train_test/**' jobs: test: @@ -11,13 +17,10 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version-file: '.python-version' + python-version-file: '.python-version' - name: Install dependencies run: pip install -r requirements.txt - - name: Run transformer 
tests - run: python transformer.py - - name: Run train_test scripts - run: python train_test/ \ No newline at end of file + run: python -m train_test diff --git a/.gitignore b/.gitignore index 59ba071..ab59c09 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,14 @@ -# Virtual environments .venv eamon_env __pycache__ best_model.pt .ipynb_checkpoints .exe - - +checkpoints/*.pt +checkpoints/*.pth +checkpoints/*.ckpt +!checkpoints/.gitkeep +!checkpoints/README.md +!logs/.gitkeep +!logs/README.md +!logs/train_loss.csv \ No newline at end of file diff --git a/GPU train/image.png b/GPU train/image.png new file mode 100644 index 0000000..d4898ed Binary files /dev/null and b/GPU train/image.png differ diff --git a/README.md b/README.md index b481125..b96afe4 100644 --- a/README.md +++ b/README.md @@ -1,325 +1,238 @@ -# Transformer Language Model [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Zs84ZQf-0VPbQxHce1mlSMD-Jr22xJqZ#scrollTo=VdohdZ8imygv) ![GitHub](https://img.shields.io/github/license/Eamon2009/Transformer-language-model) -image +# Quadtrix [![License](https://img.shields.io/github/license/Eamon2009/Transformer-language-model)](LICENSE) +image +image -A character-level GPT transformer built from scratch in PyTorch, trained on children's stories to generate simple English narrative text character by character. No pre-trained weights. No fine-tuning. Pure architecture and training from zero. -> **Latest run:** 1.99M parameter model trained on Tesla T4 GPU — val loss **0.9250** in just **6.1 minutes.** -Screenshot 2026-04-10 172442 +A minimal, educational GPT-style transformer trained character-by-character on children's stories. No pre-trained weights. No fine-tuning. Just raw PyTorch, from init to generation. +Character-level Transformer is a sequence-to-sequence architecture that operates on the granularity of individual characters rather than words or subword tokens. 
By utilizing a self-attention mechanism over long sequences of characters, the model learns to construct internal representations of morphology, syntax, and semantics from the ground up, effectively eliminating "out-of-vocabulary" (OOV) issues. While this approach allows for high fidelity in modeling rare words, spelling variations, and creative linguistics, it significantly increases the computational complexity—typically $O(L^2)$ where $L$ is the sequence length—as a single sentence requires many more steps than its word-level equivalent. Consequently, character-level Transformers often require deeper architectures or auxiliary losses to capture the long-range dependencies necessary to match the semantic performance of traditional token-based models. - -# GPU RUN-2 -Screenshot 2026-03-22 215122 - -## CPU RUN-1 -Screenshot 2026-03-21 001248 +> **The goal**: Understand how language models learn patterns. See it happen on your machine. Train in minutes on a GPU. --- -## Table of Contents - -1. [What This Project Does](#what-this-project-does) -2. [Project Structure](#project-structure) -3. [How It Works](#how-it-works) -4. [Setup & Requirements](#setup--requirements) -5. [How to Run](#how-to-run) -6. [Configuration](#configuration) -7. [Training Runs — All Results](#training-runs--all-results) -8. [Head-to-Head Comparison](#head-to-head-comparison) -9. [Model Output Comparison](#model-output-comparison) -10. [Loss Curve Analysis](#loss-curve-analysis) -11. [Overfitting Analysis](#overfitting-analysis) -12. [Scaling Laws — And Where Your Model Sits](#scaling-laws--and-where-your-model-sits) -13. [How Weights Produce Output](#how-weights-produce-output) -14.
[Known Limitations](#known-limitations) +## Quick Start ---- +```bash +# Install PyTorch +pip install torch + +# Run training +python transformer.py -## What This Project Does +# The script will train, save best weights, then generate text forever +# Press Ctrl+C to stop +``` -This project trains a small GPT-style transformer model on children's stories and then generates new story-like text character by character. It is a learning project — the goal is not to produce publishable stories, but to understand how language models learn patterns from text and to see that process happen live on your own machine and cloud GPU. +**That's it.** No complex setup, no data pipelines, no credentials. --- -## How It Works +## What Is This? -The model is a **character-level transformer**. This means: +Quadtrix is a learning project. It trains a tiny transformer on text—character by character—and learns which characters tend to follow others. At generation time, it predicts the next character, feeds it back in, and repeats. -- It reads your text file one character at a time -- It learns which characters tend to follow other characters in which contexts -- At generation time it predicts the next character, then the next, then the next — forever +**It's the same architecture as GPT**, just much smaller (1M–11M parameters instead of 175B) and trained on much less data (thousands of stories instead of the internet). -It is the same core architecture as GPT, just much smaller and trained on much less data. +The magic: in just 6 minutes on a Tesla T4 GPU, a 2M-parameter model learns meaningful patterns about narrative structure, dialogue, and storytelling. The output isn't Shakespeare, but it *is* recognizable as a story. 
-**The pipeline in order:** +--- -``` -data.txt (children's stories) - ↓ -Characters encoded as integers (vocab size varies by run) - ↓ -Model trains on sequences of tokens at a time - ↓ -Every N steps: loss is measured and printed - ↓ -Best weights saved to best_model.pt whenever val loss improves - ↓ -After training: text generation begins - ↓ -Press Ctrl+C to stop -``` +## How It Works ---- +### The Pipeline -## Setup & Requirements +``` +1. Load children's stories from disk + ↓ +2. Encode each character as an integer (vocab size: 28–110) + ↓ +3. Split into train/val chunks (90/10) + ↓ +4. Build a GPT-style transformer with: + - Embedding layer (character → vector) + - Transformer blocks (self-attention + feedforward) + - Output projection (vector → logits over vocab) + ↓ +5. Train for N steps, measuring loss every eval_interval + ↓ +6. Save best weights whenever validation loss improves + ↓ +7. Load best model and generate text forever + ↓ +8. Press Ctrl+C to stop +``` -**Python version:** 3.8 or higher +### What the Model Actually Does -**Install dependencies:** +Each forward pass: +1. Takes a sequence of character indices (e.g., "Once upon...") +2. Embeds each into a dense vector +3. Passes through transformer layers (multi-head attention learns which characters to attend to) +4. Outputs logits (scores) for every possible next character +5. Samples the next character according to those probabilities +6. Feeds it back in and repeats -```bash -pip install torch -``` +**Loss function**: Cross-entropy. The model minimizes surprise on unseen text. -No other dependencies needed. The project uses only PyTorch and Python standard library modules. +**Training**: Adam optimizer with a learning rate schedule. Dropout prevents overfitting. --- -## How to Run +## Project Structure -```bash -python transformer.py +``` +transformer.py ← Everything. One file.
+best_model.pt ← Saved weights (after first training run) +data.txt ← Your text file (any UTF-8 file works) ``` -The script will: - -1. Print a startup banner with device and timestamp -2. Load and report stats on your dataset -3. Build the model and print parameter count -4. Train for the configured number of steps, printing progress at each eval interval -5. Save best weights to `best_model.pt` automatically -6. Start text generation when done +That's all. No fancy folder structure. No config files. Edit hyperparameters directly in the script. --- -## Configuration +## Configuration & Hardware -### Run 3 — GPU Configuration (Tesla T4, Latest) ⭐ +Quadtrix is designed to work anywhere: laptop CPU to cloud GPU. Edit these hyperparameters in `transformer.py`: ```python -batch_size = 64 # Sequences trained on at once -block_size = 128 # Context window (tokens) -max_iters = 5000 # Total training steps -eval_interval = 200 # Print progress every N steps +# ============================================================================ +# Hyperparameters +# ============================================================================ +batch_size = 64 # Sequences per batch +block_size = 128 # Context window (tokens) +max_iters = 5000 # Total training steps +eval_interval = 200 # Print loss every N steps learning_rate = 3e-4 -n_embd = 200 # Size of internal representations -n_head = 4 # Number of attention heads -n_layer = 4 # Number of transformer blocks -dropout = 0.2 +n_embd = 200 # Embedding dimension +n_head = 4 # Attention heads per layer +n_layer = 4 # Number of transformer blocks +dropout = 0.2 # Regularization ``` -**Parameter count: 1.99M parameters** - -### Run 2 — GPU Configuration (Google Colab) +### Three Pre-Tuned Configurations +**CPU (Laptop) — Fast Feedback** ```python -batch_size = 64 -block_size = 256 -max_iters = 5000 -eval_interval = 250 -learning_rate = 3e-4 -n_embd = 384 -n_head = 6 -n_layer = 6 -dropout = 0.2 +batch_size, block_size, max_iters = 16, 
128, 3000 +n_embd, n_head, n_layer = 128, 4, 4 +# ~0.82M parameters +# Trains in ~40 min on AMD Ryzen ``` -**Parameter count: 10.82M parameters** - -### Run 1 — CPU Configuration (Laptop, Minimal Setup) - +**GPU (Google Colab) — Best Quality** ```python -batch_size = 16 -block_size = 128 -max_iters = 3000 -eval_interval = 200 -learning_rate = 3e-4 -n_embd = 128 -n_head = 4 -n_layer = 4 -dropout = 0.2 +batch_size, block_size, max_iters = 64, 256, 5000 +n_embd, n_head, n_layer = 384, 6, 6 +# ~10.82M parameters +# Trains in ~60 min on Colab GPU +# Best output quality ``` -**Parameter count: 0.82M parameters** +**GPU (Tesla T4) — Efficient** +```python +batch_size, block_size, max_iters = 64, 128, 5000 +n_embd, n_head, n_layer = 200, 4, 4 +# ~1.99M parameters +# Trains in **6.1 minutes** ← Fastest +# Best parameter/data balance +``` --- -## Training Runs — All Results +## Training Results -### Run 3 — GPU (Tesla T4, 1.99M Parameters) ⭐ Latest +Three runs. Three different hardware setups. All converged well. -| Field | Value | -|---|---| -| Device | Tesla T4 (CUDA 13.0, Driver 580.82.07) | -| Dataset | ~31.4M characters | -| Vocab size | 100 | -| Train tokens | 28,274,093 | -| Val tokens | 3,141,566 | -| Parameters | 1.99M | -| Architecture | 4 layers × 4 heads × 200 embd dim | -| Training time | **6.1 minutes (367s)** | -| Best val loss | **0.9250** | -| Final train loss | 0.9307 | -| Overfitting | None — `best!` at most checkpoints | +### Run 3 — Tesla T4 (Latest) ⭐ -**Full training log:** +**Configuration**: 4 layers × 4 heads × 200 dim = **1.99M params** -``` -[ 0/5000] train=4.6207 val=4.6202 elapsed=2s ETA=0s << best! -[ 200/5000] train=2.2058 val=2.1986 elapsed=17s ETA=405s << best! -[ 400/5000] train=1.6111 val=1.6039 elapsed=32s ETA=367s << best! -[ 600/5000] train=1.4109 val=1.4183 elapsed=47s ETA=342s << best! -[ 800/5000] train=1.3230 val=1.3231 elapsed=61s ETA=322s << best! -[ 1000/5000] train=1.2495 val=1.2567 elapsed=76s ETA=303s << best! 
-[ 1200/5000] train=1.1960 val=1.1948 elapsed=90s ETA=286s << best! -[ 1400/5000] train=1.1569 val=1.1642 elapsed=105s ETA=270s << best! -[ 1600/5000] train=1.1283 val=1.1283 elapsed=120s ETA=254s << best! -[ 1800/5000] train=1.0894 val=1.1023 elapsed=134s ETA=238s << best! -[ 2000/5000] train=1.0731 val=1.0765 elapsed=149s ETA=223s << best! -[ 2200/5000] train=1.0584 val=1.0550 elapsed=163s ETA=208s << best! -[ 2400/5000] train=1.0415 val=1.0346 elapsed=178s ETA=192s << best! -[ 2600/5000] train=1.0261 val=1.0199 elapsed=192s ETA=177s << best! -[ 2800/5000] train=1.0106 val=1.0117 elapsed=207s ETA=162s << best! -[ 3000/5000] train=1.0000 val=0.9956 elapsed=221s ETA=148s << best! -[ 3200/5000] train=0.9913 val=0.9924 elapsed=236s ETA=133s << best! -[ 3400/5000] train=0.9727 val=0.9782 elapsed=251s ETA=118s << best! -[ 3600/5000] train=0.9656 val=0.9720 elapsed=265s ETA=103s << best! -[ 3800/5000] train=0.9685 val=0.9632 elapsed=280s ETA=88s << best! -[ 4000/5000] train=0.9601 val=0.9642 elapsed=294s ETA=74s -[ 4200/5000] train=0.9515 val=0.9489 elapsed=309s ETA=59s << best! -[ 4400/5000] train=0.9433 val=0.9431 elapsed=323s ETA=44s << best! -[ 4600/5000] train=0.9384 val=0.9459 elapsed=338s ETA=29s -[ 4800/5000] train=0.9331 val=0.9250 elapsed=353s ETA=15s << best! -[ 4999/5000] train=0.9307 val=0.9430 elapsed=367s ETA=0s +| Metric | Value | +|--------|-------| +| Device | Tesla T4 (CUDA 13.0) | +| Dataset | ~31.4M characters (children's stories) | +| Train tokens | 28.3M | +| Val tokens | 3.1M | +| Training time | **6.1 minutes** | +| Best val loss | **0.9250** | +| Final train loss | 0.9307 | +| Overfitting | None detected | + +**Training curve** (every 200 steps): +``` +[ 0/5000] train=4.6207 val=4.6202 elapsed=2s << best! +[ 200/5000] train=2.2058 val=2.1986 elapsed=17s << best! +[ 400/5000] train=1.6111 val=1.6039 elapsed=32s << best! +[ 1000/5000] train=1.2495 val=1.2567 elapsed=76s << best! +[ 2000/5000] train=1.0731 val=1.0765 elapsed=149s << best! 
+[ 3000/5000] train=1.0000 val=0.9956 elapsed=221s << best! +[ 4000/5000] train=0.9601 val=0.9642 elapsed=294s +[ 4200/5000] train=0.9515 val=0.9489 elapsed=309s << best! +[ 4400/5000] train=0.9433 val=0.9431 elapsed=323s << best! +[ 4800/5000] train=0.9331 val=0.9250 elapsed=353s << best! +[ 4999/5000] train=0.9307 val=0.9430 elapsed=367s [DONE] Training finished in 367.0s (6.1 min) | Best val loss: 0.9250 ``` +**Key insight**: This run hit the **sweet spot**—large enough to learn coherent patterns, small enough to train fast. It's the reference configuration. + --- -### Run 2 — GPU (Google Colab, 10.82M Parameters) +### Run 2 — Google Colab (10.82M Parameters) -| Field | Value | -|---|---| +**Configuration**: 6 layers × 6 heads × 384 dim + +| Metric | Value | +|--------|-------| | Device | CUDA (Google Colab GPU) | -| Dataset | 88,406,739 characters | -| Vocab size | 110 | -| Train tokens | 79,566,065 | -| Val tokens | 8,840,674 | -| Parameters | 10.82M (10,823,534) | -| Architecture | 6 layers × 6 heads × 384 embd dim | -| Training time | **61.3 minutes** | +| Dataset | ~88.4M characters | +| Parameters | 10.82M | +| Training time | 61.3 minutes | | Best val loss | **0.7176** | -| Final train loss | 0.7259 | -| Overfitting | None — `best!` at every checkpoint | - -**Full training log:** +| Overfitting | None | -``` -[ 0/5000] 0.0% train=4.9244 val=4.9262 elapsed=31s ETA=0s best! -[ 250/5000] 5.0% train=2.1218 val=2.1169 elapsed=206s ETA=3901s best! -[ 500/5000] 10.0% train=1.3606 val=1.3500 elapsed=391s ETA=3510s best! -[ 750/5000] 15.0% train=1.1540 val=1.1411 elapsed=575s ETA=3250s best! -[ 1000/5000] 20.0% train=1.0332 val=1.0296 elapsed=757s ETA=3024s best! -[ 1250/5000] 25.0% train=0.9657 val=0.9556 elapsed=941s ETA=2819s best! -[ 1500/5000] 30.0% train=0.9305 val=0.9189 elapsed=1124s ETA=2619s best! -[ 1750/5000] 35.0% train=0.8935 val=0.8853 elapsed=1306s ETA=2424s best! -[ 2000/5000] 40.0% train=0.8673 val=0.8602 elapsed=1490s ETA=2233s best! 
-[ 2250/5000] 45.0% train=0.8413 val=0.8367 elapsed=1672s ETA=2042s best! -[ 2500/5000] 50.0% train=0.8162 val=0.8141 elapsed=1855s ETA=1854s best! -[ 2750/5000] 55.0% train=0.8058 val=0.7995 elapsed=2038s ETA=1666s best! -[ 3000/5000] 60.0% train=0.7888 val=0.7803 elapsed=2221s ETA=1479s best! -[ 3250/5000] 65.0% train=0.7798 val=0.7730 elapsed=2403s ETA=1293s best! -[ 3500/5000] 70.0% train=0.7634 val=0.7551 elapsed=2585s ETA=1107s best! -[ 3750/5000] 75.0% train=0.7588 val=0.7528 elapsed=2768s ETA=922s best! -[ 4000/5000] 80.0% train=0.7480 val=0.7434 elapsed=2951s ETA=737s best! -[ 4250/5000] 85.0% train=0.7381 val=0.7351 elapsed=3134s ETA=552s best! -[ 4500/5000] 90.0% train=0.7371 val=0.7314 elapsed=3316s ETA=368s best! -[ 4750/5000] 95.0% train=0.7282 val=0.7239 elapsed=3498s ETA=183s best! -[ 4999/5000] 100.0% train=0.7259 val=0.7176 elapsed=3680s ETA=0s best! - -[DONE] Training finished in 3680.1s (61.3 min) -[DONE] Best val loss: 0.7176 -[SAVE] Best weights saved to: /content/best_model.pt -``` +**Result**: Larger model, more data = **best output quality**. Slower to train but produces recognizable narratives. --- -### Run 1 — CPU (Laptop, 0.82M Parameters) +### Run 1 — CPU Laptop (0.82M Parameters) + +**Configuration**: 4 layers × 4 heads × 128 dim -| Field | Value | -|---|---| +| Metric | Value | +|--------|-------| | Device | AMD Ryzen 5 PRO 3500U (CPU only) | -| Dataset | 201,570 characters | -| Vocab size | 28 | | Parameters | 0.82M | -| Architecture | 4 layers × 4 heads × 128 embd dim | -| Training time | **39.4 minutes** | -| Best val loss | **1.3145** | -| Final train loss | 1.3191 | -| Overfitting | None — `best!` at every checkpoint | - -**Full training log:** +| Dataset | ~201K characters | +| Training time | 39.4 minutes | +| Best val loss | 1.3145 | +| Overfitting | None | -``` -[ 0/3000] 0.0% train=3.2961 val=3.2981 elapsed=12s ETA=0s best! -[ 200/3000] 6.7% train=2.3038 val=2.2490 elapsed=141s ETA=1959s best! 
-[ 400/3000] 13.3% train=2.2469 val=2.1950 elapsed=292s ETA=1891s best! -[ 600/3000] 20.0% train=2.1842 val=2.1318 elapsed=436s ETA=1739s best! -[ 800/3000] 26.7% train=1.9742 val=1.9103 elapsed=581s ETA=1594s best! -[ 1000/3000] 33.3% train=1.7628 val=1.7002 elapsed=723s ETA=1443s best! -[ 1200/3000] 40.0% train=1.6714 val=1.6040 elapsed=863s ETA=1293s best! -[ 1400/3000] 46.7% train=1.5889 val=1.5360 elapsed=1015s ETA=1158s best! -[ 1600/3000] 53.3% train=1.5375 val=1.4723 elapsed=1166s ETA=1019s best! -[ 1800/3000] 60.0% train=1.4847 val=1.4525 elapsed=1320s ETA=879s best! -[ 2000/3000] 66.7% train=1.4604 val=1.4081 elapsed=1472s ETA=735s best! -[ 2200/3000] 73.3% train=1.4113 val=1.3857 elapsed=1653s ETA=600s best! -[ 2400/3000] 80.0% train=1.3923 val=1.3725 elapsed=1820s ETA=454s best! -[ 2600/3000] 86.7% train=1.3501 val=1.3446 elapsed=1998s ETA=307s best! -[ 2800/3000] 93.3% train=1.3336 val=1.3334 elapsed=2174s ETA=154s best! -[ 2999/3000] 100.0% train=1.3191 val=1.3145 elapsed=2363s ETA=0s best! - -[DONE] Training finished in 2364.1s (39.4 min) -[DONE] Best val loss: 1.3145 -[SAVE] Best weights saved to best_model.pt -``` +**Result**: Smallest model, tiniest dataset. Trains fastest on CPU. Output is fragmented but shows the model learned *something*. 
--- ## Head-to-Head Comparison -| Metric | Run 1 — CPU Laptop | Run 2 — GPU Colab | Run 3 — Tesla T4 ⭐ | -|---|---|---|---| -| **Device** | AMD Ryzen 5 CPU | CUDA GPU (Colab) | Tesla T4 (CUDA 13.0) | -| **Parameters** | 0.82M | 10.82M | **1.99M** | -| **Architecture** | 4L × 4H × 128d | 6L × 6H × 384d | 4L × 4H × 200d | -| **Dataset size** | 201,570 chars | 88,406,739 chars | ~31.4M chars | -| **Vocab size** | 28 | 110 | **100** | -| **Block size** | 128 tokens | 256 tokens | 128 tokens | -| **Batch size** | 16 | 64 | 64 | -| **Training steps** | 3,000 | 5,000 | 5,000 | +| Metric | Run 1 — CPU | Run 2 — Colab | Run 3 — T4 ⭐ | +|--------|-------------|--------------|------------| +| **Parameters** | 0.82M | 10.82M | 1.99M | +| **Training data** | 200K chars | 88.4M chars | 31.4M chars | | **Training time** | 39.4 min | 61.3 min | **6.1 min** | | **Best val loss** | 1.3145 | **0.7176** | 0.9250 | -| **Overfitting** | None | None | None | -| **Still improving at end?** | Yes | Yes | Yes | +| **Output coherence** | Fragmented | Coherent paragraphs | Basic sentences | +| **Overfitting** | ✓ None | ✓ None | ✓ None | +| **Still improving?** | Yes | Yes | Yes | -> **Key insight on Run 3:** A 1.99M parameter model on a Tesla T4 reached val loss 0.9250 in just 6.1 minutes — faster than any previous run by a large margin. This shows GPU acceleration pays off even for small models. Run 2 still holds the best quality due to its larger size and more data, but Run 3 shows what efficient GPU use looks like. +> **Observation**: All three were still improving at the final checkpoint. More training steps = better loss. --- -## Model Output Comparison +## Example Outputs -### Run 2 — GPU (10.82M params, val loss 0.7176) +### Run 2 Output (10.82M params) — Best Quality ``` Upon a time, there were two friends, Jack and Tom. They had a cold doll in @@ -331,223 +244,261 @@ to share his happy with them. Nack knew it was feeling important to his passion in their rooms. 
He knew that night, he had never seen a small boy just soon could drink. - -He kept helping her passion and seing this boy. As he kept walking, he saw -a girl. ``` -### Run 1 — CPU (0.82M params, val loss 1.3145) - -``` -when years me told be found a big ea reak abig driendly they named not she -rabbit smiled by aded he what in again -one smiled the mushrought boy -one day and was arroom him that she rabbing animals the dreezed at neard had -to there man owl them with o box and said you s mom that je animable went her -somethings of they ballike i wanted a big taught jill hone was and -he rabbit to havin after the but help and nelpft but it was surpring take to -``` - -### Output Quality Analysis - -| Quality Dimension | Run 1 (CPU, 0.82M) | Run 2 (GPU, 10.82M) | Run 3 (T4, 1.99M) | -|---|---|---|---| -| **Sentence structure** | Fragmented | Full sentences | Partial sentences | -| **Story arc** | Weak | Clear narrative flow | Basic flow | -| **Character names** | Inconsistent | Consistent | Moderate | -| **Spelling** | Many errors | Mostly correct | Mostly correct | -| **Word spacing** | Mostly correct | Correct | Correct | -| **Coherence** | Low | Moderate | Low-moderate | -| **Story phrases** | Partial | Natural | Present | -| **Paragraph breaks** | None | Present | Partial | +**Analysis**: Clear sentence structure. Named characters. Logical progression. Some linguistic oddities ("felt dizzy and wanted to share his happy") but unmistakably a story. 
--- -## Loss Curve Analysis - -All three runs showed the same characteristic loss curve shape: +### Run 3 Output (1.99M params, Tesla T4) — Efficient ``` -Phase 1 — Rapid Drop (0–20% of training): - Run 1 (CPU): 3.30 → 1.70 (model learns basic structure fast) - Run 2 (Colab): 4.92 → 1.03 (steeper — larger model, more to learn) - Run 3 (T4): 4.62 → 1.25 (fast drop in just seconds on GPU) - -Phase 2 — Steady Descent (20–80%): - Run 1: 1.70 → 1.39 - Run 2: 1.03 → 0.74 - Run 3: 1.25 → 0.96 (consistent improvement throughout) - -Phase 3 — Diminishing Returns (80–100%): - Run 1: 1.39 → 1.31 - Run 2: 0.74 → 0.72 - Run 3: 0.96 → 0.93 (still falling — more steps would help) +Timmy and elsed him to tell being jumping things. They were tired and making +some pinkets and help paper me. They had to see them, drain and ran ar her +mommy. They also fast with the stretch and sacks the changer. + +Lily's truck laughed and saw a rock. She said, "You can't here some wet +sicks. You have something new favorite toys, I do yours." ``` -All three models were **still improving at the final checkpoint**. None hit a plateau. This means more training steps would reduce loss further in every run. +**Analysis**: Narrative present. Dialogue structure intact. Characters named. Some word-order errors and made-up words, but the *shape* of a story is clear. --- -## Overfitting Analysis - -**No run showed any overfitting.** - -In all three cases, val loss tracked train loss closely and improved monotonically across most checkpoints. 
+### Run 1 Output (0.82M params) — Minimal ``` -Healthy training (all runs): - train loss ↓ and val loss ↓ together → model is generalizing - -Overfitting would look like: - train loss ↓ but val loss ↑ → model is memorizing +when years me told be found a big ea reak abig driendly they named not she +rabbit smiled by aded he what in again one smiled the mushrought boy one day +and was arroom him that she rabbing animals the dreezed at neard had to there +man owl them with o box ``` -The train/val gap at the end of each run: +**Analysis**: Word boundaries mostly intact. Character names present (rabbit, owl). But syntax falls apart—the model is struggling to hold sentence structure. This is a 0.82M model trained on tiny data; it's learning *something* but hasn't converged to coherent text. + +--- +## Project Structure + +``` +. +├── .github/ +│ ├── ISSUE_TEMPLATE/ # GitHub issue templates +│ └── workflows/ # CI/CD workflows +├── .vscode/ # VS Code configuration +├── GPU train/ # GPU training scripts and configurations +├── assets/ # Images, diagrams, and other assets +├── checkpoints/ # Saved model checkpoints +├── config/ # Configuration files +├── data_set/ # Training and validation datasets +├── evaluate/ # Evaluation scripts and metrics +├── generate/ # Text generation scripts +├── logs/ # Training logs and tensorboard files +├── train_test/ # Training and testing utilities +├── .gitattributes # Git attributes configuration +├── .gitignore # Git ignore rules +├── .python-version # Python version specification +├── LICENSE # MIT License +├── README.md # This file +├── cleaned.txt # Cleaned training data +├── gpt-from-scratch.ipynb # Jupyter notebook implementation +├── requirements.txt # Python dependencies +├── traindata.txt # Raw training data +└── transformer.py # Main transformer model implementation +``` + +## Loss Curves & Training Dynamics + +All three runs showed the classic learning curve: + +``` +Phase 1: Rapid drop (0–20% of training) + - Model learns 
basic character transitions + - Loss halves or better + +Phase 2: Steady descent (20–80%) + - Model learns longer-range patterns + - Character names, sentence boundaries + - Loss continues down but more gradually + +Phase 3: Diminishing returns (80–100%) + - Model learning slows + - Val loss still improving but incremental + - More data or capacity would help here +``` + +**Train/Val Gap Analysis** (indicator of overfitting): | Run | Final Train | Final Val | Gap | -|---|---|---|---| -| Run 1 (CPU, 0.82M) | 1.3191 | 1.3145 | 0.0046 | -| Run 2 (Colab, 10.82M) | 0.7259 | 0.7176 | 0.0083 | -| Run 3 (T4, 1.99M) | 0.9307 | 0.9250 | 0.0057 | +|-----|-------------|-----------|-----| +| CPU | 1.3191 | 1.3145 | 0.0046 | +| Colab | 0.7259 | 0.7176 | 0.0083 | +| T4 | 0.9307 | 0.9430 | 0.0123 | -All gaps are small and healthy. Run 3 sits between Run 1 and Run 2, which matches its middle-sized architecture. +All gaps are tiny. **No overfitting detected.** The model is generalizing well to unseen text. --- -## Scaling Laws — And Where Your Model Sits +## Scaling Laws: Where Quadtrix Sits +Screenshot 2026-03-17 171921 -Screenshot 2026-03-17 171921 -### What Are Scaling Laws? +The Chinchilla (2022) scaling law suggests: **~20 tokens of training data per parameter is optimal**. -Scaling laws describe a predictable relationship between model size, dataset size, compute, and output quality: +Let's see how our runs align: -> The more parameters, the more data, and the more compute you use — the better the model gets. And this follows a consistent, measurable curve.
+| Model | Parameters | Training Data | Optimal (20×) | Coverage | +|-------|------------|---------------|--------------|----------| +| Run 1 | 0.82M | 200K tokens | 16.4M | **1.2%** | +| Run 3 | 1.99M | 28.3M tokens | 39.8M | **71.1%** ← Best balanced | +| Run 2 | 10.82M | 79.6M tokens | 216M | **36.8%** | +| GPT-2 Small | 117M | 40B tokens | 2.3B | ~1700% | +| GPT-3 | 175B | 600B tokens | 3.5T | ~17% | -The key finding (Chinchilla, 2022) is that the optimal ratio is roughly **20 tokens of training data per parameter.** +**Insight**: Run 3 is the most *balanced*—the model size and data quantity are well-matched. Run 2 has the largest model but is only at 37% optimal data coverage, meaning it would benefit more from adding data than adding parameters. -### The Three Axes of Scaling +**Next steps for any run**: +1. **More training steps** — All three were still falling at the final checkpoint +2. **More data** — Run 3 is closest to optimal; Run 2 would benefit most +3. **Larger model** — Only worth doing once data coverage exceeds 50% -``` -Parameters (N) → How much the model can remember -Data (D) → How much it has learned from -Compute (C) → Parameters × Data × Training steps -``` +--- -### Where All Three Runs Sit +## How Generation Works -| Model | Parameters | Training Data | Optimal Data (20× rule) | Data Coverage | -|---|---|---|---|---| -| Run 1 — CPU | 0.82M | ~200K tokens | ~16.4M tokens | **1.2%** | -| Run 3 — T4 | 1.99M | ~28.3M tokens | ~39.8M tokens | **71.1%** | -| Run 2 — Colab | 10.82M | ~79.6M tokens | ~216M tokens | **36.8%** | -| GPT-2 Small | 117M | ~40B tokens | ~2.3B tokens | ~1700% | -| GPT-3 | 175B | ~600B tokens | ~3.5T tokens | ~17% | +Once training is done, `best_model.pt` contains frozen weights. Generation is simple: -> **Run 3 stands out here.** At 71.1% of optimal data coverage it is the best-balanced run so far — the model size and dataset size are close to the ideal Chinchilla ratio. 
Run 2 has the most parameters but sits at only 36.8% coverage, meaning it would benefit more from additional data than additional capacity. +```python +# Pseudocode for generation +seed = torch.tensor([[start_token]]) # e.g., start_token = 0 + +for _ in range(num_chars_to_generate): + # Forward pass through all layers + logits = model(seed)[:, -1, :] # Get last token's logits + + # Convert logits to probabilities + probs = softmax(logits / temperature) + + # Sample next token + next_token = sample(probs) + + # Append and continue + seed = torch.cat([seed, next_token], dim=-1) + + # Trim to context window if needed + seed = seed[:, -block_size:] +``` + +**Why output differs each run**: The sampling step is random. Same weights, different random seeds = different output. Add `torch.manual_seed(42)` for deterministic output. -### Full Model Landscape +--- -``` -Model Parameters Data Val Loss Output Quality -────────────────────────────────────────────────────────────────────────────────────── -Run 1 — CPU (this repo) 0.82M ~200K tokens 1.3145 Word fragments -Run 3 — T4 (this repo) 1.99M ~28.3M tokens 0.9250 Basic sentences ← Efficient run -Run 2 — Colab (this repo)10.82M ~79.6M tokens 0.7176 Story sentences ← Best quality -GPT-2 Small 117M ~40B tokens ~3.0* Coherent English -GPT-2 Large 774M ~40B tokens ~2.5* Strong English -GPT-3 175B ~600B tokens — Near-human text - -* GPT-2 losses are on a different (larger) vocabulary and not directly comparable. -``` +## Known Limitations -### What This Tells Us +1. **Character-level learning** — the model learns characters, not words. It cannot reliably spell or track meaning across paragraphs. -Run 3 proves that a small, well-balanced model on a fast GPU can converge in minutes. The next logical steps across any run: +2. **Output coherence** — especially on smaller runs. Sentences drift logically. Names disappear. Tense breaks. This is expected at this scale. -``` -1.
More training steps → all three were still falling at the final checkpoint -2. More data → Run 3 is closest to optimal ratio; Run 2 benefits most -3. Larger model → only worth it once data coverage is above 50% -``` +3. **All models undertrained** — validation loss was still improving at iteration N. More training steps would help all three. ---- +4. **Limited data** — Run 2 is only at 37% optimal data coverage. A larger story corpus would meaningfully improve output quality. -## How Weights Produce Output +5. **No long-range memory** — transformer context window is fixed (128 or 256 tokens). The model cannot reference events from 10 paragraphs ago. -After training, the model is frozen. The weights file (`best_model.pt`) contains all the numbers that encode everything the model learned from your children's stories. +--- -**The generation loop:** +## Technical Details -``` -Step 1 — Start with a seed token (start of text) - ↓ -Step 2 — Feed it through all transformer layers - Each layer does matrix multiplications - using the saved weight numbers - ↓ -Step 3 — Output is N numbers (one per vocab character) - Each number = probability of that character being next - e.g. 
'e' = 0.18 't' = 0.14 'a' = 0.12 - ↓ -Step 4 — Sample randomly from those probabilities - ↓ -Step 5 — That character becomes the new input - Go back to Step 2 - ↓ -Step 6 — Repeat forever -``` +### Model Architecture -**Why output is different every run:** +```python +class GPTModel(nn.Module): + def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout): + super().__init__() + self.token_embedding = nn.Embedding(vocab_size, n_embd) + self.position_embedding = nn.Embedding(block_size, n_embd) + self.transformer = nn.Sequential( + *[TransformerBlock(n_embd, n_head, dropout) for _ in range(n_layer)] + ) + self.ln_final = nn.LayerNorm(n_embd) + self.lm_head = nn.Linear(n_embd, vocab_size) + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + B, T = x.shape + tok_emb = self.token_embedding(x) # (B, T, n_embd) + pos_emb = self.position_embedding(torch.arange(T, device=x.device)) + x = self.dropout(tok_emb + pos_emb) + x = self.transformer(x) + x = self.ln_final(x) + logits = self.lm_head(x) + return logits +``` + +### Transformer Block + +Each block contains: +- **Multi-head self-attention**: `(B, T, n_embd) → (B, T, n_embd)` +- **Feedforward network**: Two linear layers with ReLU +- **Layer normalization & residual connections** +- **Dropout for regularization** + +### Training Loop -The sampling step picks randomly from the probabilities. Same weights, different random draws = different output each time. To get deterministic output, add `torch.manual_seed(42)` before generation.
+```python +for step in range(max_iters): + # Batch a random chunk of training data + batch_x, batch_y = get_batch('train') + + # Forward pass + logits = model(batch_x) + loss = F.cross_entropy(logits.view(-1, vocab_size), batch_y.view(-1)) + + # Backward pass + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + + # Evaluate and save + if step % eval_interval == 0: + val_loss = estimate_loss('val') + print(f"train={train_loss:.4f} val={val_loss:.4f}") + + if val_loss < best_val_loss: + best_val_loss = val_loss + torch.save(model.state_dict(), 'best_model.pt') +``` --- -## Known Limitations +## Why This Project Matters -- **Character-level only** — the model learns characters not words, so it cannot spell reliably or track meaning across sentences -- **Output will not be fully coherent** — story fragments are recognizable in Run 2 but still logically drift across paragraphs -- **All models undertrained** — val loss was still falling at the final checkpoint in all three runs; more iterations would help -- **Run 2 still at ~37% optimal data** — a larger story dataset would meaningfully improve output quality -- **No memory between runs** — each generation starts from scratch with no prior context +1. **Educational**: See exactly how a language model learns. No black boxes. ---- +2. **Verifiable**: You can trace loss, inspect weights, understand every line of code. -## Real Model Output (Run 1 — CPU) +3. **Fast**: Even on CPU, training finishes in under an hour. On GPU, minutes. -``` -when years me told be found a big ea reak abig driendly they named not she -rabbit smiled by aded he what in again -one smiled the mushrought boy -one day and was arroom him that she rabbing animals the dreezed at neard had -to there man owl them with o box and said you s mom that je animable went her -``` +4. **Runnable**: One script, one dependency (PyTorch). No cloud account required. 
-## Real Model Output (Run 2 — GPU Colab) +Quadtrix is to GPT what nanoGPT is to transformers: a stripped-down, readable, educational version that teaches the core ideas. -``` -Upon a time, there were two friends, Jack and Tom. They had a cold doll in -the sunshine. +--- -One day, Jack saw that he was universed. He used the sky at past it to march -around the garden. He had a small ball on his face. He felt dizzy and wanted -to share his happy with them. +## Getting Started -Nack knew it was feeling important to his passion in their rooms. He knew -that night, he had never seen a small boy just soon could drink. +1. **Clone or download** `transformer.py` +2. **Install PyTorch**: `pip install torch` +3. **Add your data**: Place a UTF-8 text file at `data.txt` (or edit the filename in the script) +4. **Run**: `python transformer.py` +5. **Watch**: Loss decreases. Weights save. Text generates. +6. **Tweak**: Edit hyperparameters and re-run to see how each affects training speed and output quality. -``` -## Real Model Output (Run 3 — GPU ) - -``` -Timmy and elsed him to tell being jumping things. They were tired and making some pinkets and help paper me. They had to see them, drain and ran ar her mommy. They also fast with the stretch and sacks the changer. They play and them together or day. They tlike to need to stay and cut fun and have to catch him. But the bird is pretty. You have to make your legs and it's some people truck in it." +--- -Lily's truck laughed and saw a rock. She said, "You can't here some wet sicks. You have something new favorite toys, I do yours. All of fun!" From that day on, Lily always callimbed the slide, and Tom were playing with the surprise, loved to play in the park. One day, they went to the park with most in the bathting to girl and dinner. One day, the family went off another floor and quickly made Jack the toys far away. +## License -In a weath came and dancelet every day out for righting. 
It was a lot of big ball towers and eggs and make him lots of them. The man is a perfect of the bad lettersser on him. -``` +MIT --- -*Built with PyTorch. — [https://github.com/Eamon2009/Transformer-language-model]* +*Built with PyTorch. | [GitHub](https://github.com/Eamon2009/Transformer-language-model)* diff --git a/checkpoints/.gitkeep b/checkpoints/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/checkpoints/read.md b/checkpoints/read.md new file mode 100644 index 0000000..d4ade29 --- /dev/null +++ b/checkpoints/read.md @@ -0,0 +1,62 @@ +# Checkpoints + +This folder stores saved model weights during and after training. + +## Structure + +``` +checkpoints/ +├── best_model.pt ← best checkpoint by validation loss +├── latest_model.pt ← most recent checkpoint +└── epoch_XX_loss_X.XX.pt ← epoch-specific saves +``` + +## How to Save (add to your transformer.py / train script) + +```python +import torch +import os + +def save_checkpoint(model, optimizer, epoch, loss, path="checkpoints/"): + os.makedirs(path, exist_ok=True) + checkpoint = { + "epoch": epoch, + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + "loss": loss, + } + # Save latest + torch.save(checkpoint, os.path.join(path, "latest_model.pt")) + + # Save best if loss improved + best_path = os.path.join(path, "best_model.pt") + if not os.path.exists(best_path): + torch.save(checkpoint, best_path) + else: + best = torch.load(best_path) + if loss < best["loss"]: + torch.save(checkpoint, best_path) + print(f" New best model saved at epoch {epoch} with loss {loss:.4f}") + + # Save per epoch (optional) + torch.save(checkpoint, os.path.join(path, f"epoch_{epoch:03d}_loss_{loss:.4f}.pt")) +``` + +## How to Load + +```python +def load_checkpoint(model, optimizer, path="checkpoints/latest_model.pt"): + checkpoint = torch.load(path) + model.load_state_dict(checkpoint["model_state_dict"]) + optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) + epoch = 
checkpoint["epoch"] + loss = checkpoint["loss"] + print(f" Loaded checkpoint from epoch {epoch}, loss: {loss:.4f}") + return model, optimizer, epoch, loss +``` + +## Notes + +- `.pt` files are listed in `.gitignore` — they are too large for GitHub +- Push only this README and `.gitkeep` to keep the folder tracked by git +- For long GPU runs, save every N epochs so you can resume if it crashes \ No newline at end of file diff --git a/generate/CMakeLists.txt b/generate/CMakeLists.txt deleted file mode 100644 index 392332e..0000000 --- a/generate/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -cmake_minimum_required(VERSION 3.18) -project(gpt_generate) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_PREFIX_PATH "C:/libtorch") - -find_package(Torch REQUIRED) - -add_executable(generate main.cpp) -target_link_libraries(generate ${TORCH_LIBRARIES}) - -if(WIN32) - file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll") - add_custom_command(TARGET generate POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - ${TORCH_DLLS} $) -endif() \ No newline at end of file diff --git a/logs/read.md b/logs/read.md new file mode 100644 index 0000000..3458e38 --- /dev/null +++ b/logs/read.md @@ -0,0 +1,85 @@ +# Logs + +This folder stores training logs, loss curves, and run history. 
+ +## Structure + +``` +logs/ +├── train_loss.csv ← loss per epoch (easy to plot) +├── train_run_latest.log ← full console output of latest run +└── run_YYYYMMDD_HHMM.log ← timestamped logs per run +``` + +## How to Log (add to your transformer.py / train script) + +```python +import csv +import os +import logging +from datetime import datetime + +# --- Setup --- +os.makedirs("logs", exist_ok=True) +timestamp = datetime.now().strftime("%Y%m%d_%H%M") + +# Console + file logger +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s | %(message)s", + handlers=[ + logging.FileHandler(f"logs/run_{timestamp}.log"), + logging.FileHandler("logs/train_run_latest.log", mode="w"), + logging.StreamHandler() # still prints to terminal + ] +) +logger = logging.getLogger(__name__) + +# CSV loss tracker +loss_csv = "logs/train_loss.csv" +if not os.path.exists(loss_csv): + with open(loss_csv, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["epoch", "train_loss", "val_loss", "lr"]) + +def log_epoch(epoch, train_loss, val_loss=None, lr=None): + logger.info(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f}" + + (f" | Val Loss: {val_loss:.4f}" if val_loss is not None else "") + + (f" | LR: {lr:.6f}" if lr is not None else "")) + with open(loss_csv, "a", newline="") as f: + writer = csv.writer(f) + writer.writerow([epoch, train_loss, "" if val_loss is None else val_loss, "" if lr is None else lr]) +``` + +## Usage in Training Loop + +```python +for epoch in range(num_epochs): + train_loss = train_one_epoch(model, optimizer) + log_epoch(epoch, train_loss, lr=scheduler.get_last_lr()[0]) + save_checkpoint(model, optimizer, epoch, train_loss) +``` + +## Plotting Loss Curve + +```python +import pandas as pd +import matplotlib.pyplot as plt + +df = pd.read_csv("logs/train_loss.csv") +plt.plot(df["epoch"], df["train_loss"], label="Train Loss") +if "val_loss" in df and df["val_loss"].notna().any(): + plt.plot(df["epoch"], df["val_loss"], label="Val Loss") +plt.xlabel("Epoch") +plt.ylabel("Loss") +plt.title("GPT Training
Loss") +plt.legend() +plt.savefig("logs/loss_curve.png") +plt.show() +``` + +## Notes + +- `.log` files and `loss_curve.png` can be pushed to GitHub (they're small) +- `train_loss.csv` is very useful to track across GPU runs +- Timestamped logs help you compare different training runs \ No newline at end of file diff --git a/logs/run1.log b/logs/run1.log new file mode 100644 index 0000000..347e456 --- /dev/null +++ b/logs/run1.log @@ -0,0 +1,16 @@ +[ 0/3000] 0.0% train=3.2961 val=3.2981 elapsed=12s ETA=0s best! +[ 200/3000] 6.7% train=2.3038 val=2.2490 elapsed=141s ETA=1959s best! +[ 400/3000] 13.3% train=2.2469 val=2.1950 elapsed=292s ETA=1891s best! +[ 600/3000] 20.0% train=2.1842 val=2.1318 elapsed=436s ETA=1739s best! +[ 800/3000] 26.7% train=1.9742 val=1.9103 elapsed=581s ETA=1594s best! +[ 1000/3000] 33.3% train=1.7628 val=1.7002 elapsed=723s ETA=1443s best! +[ 1200/3000] 40.0% train=1.6714 val=1.6040 elapsed=863s ETA=1293s best! +[ 1400/3000] 46.7% train=1.5889 val=1.5360 elapsed=1015s ETA=1158s best! +[ 1600/3000] 53.3% train=1.5375 val=1.4723 elapsed=1166s ETA=1019s best! +[ 1800/3000] 60.0% train=1.4847 val=1.4525 elapsed=1320s ETA=879s best! +[ 2000/3000] 66.7% train=1.4604 val=1.4081 elapsed=1472s ETA=735s best! +[ 2200/3000] 73.3% train=1.4113 val=1.3857 elapsed=1653s ETA=600s best! +[ 2400/3000] 80.0% train=1.3923 val=1.3725 elapsed=1820s ETA=454s best! +[ 2600/3000] 86.7% train=1.3501 val=1.3446 elapsed=1998s ETA=307s best! +[ 2800/3000] 93.3% train=1.3336 val=1.3334 elapsed=2174s ETA=154s best! +[ 2999/3000] 100.0% train=1.3191 val=1.3145 elapsed=2363s ETA=0s best! diff --git a/logs/run2.log b/logs/run2.log new file mode 100644 index 0000000..c836ca8 --- /dev/null +++ b/logs/run2.log @@ -0,0 +1,21 @@ +[ 0/5000] 0.0% train=4.9244 val=4.9262 elapsed=31s ETA=0s best! +[ 250/5000] 5.0% train=2.1218 val=2.1169 elapsed=206s ETA=3901s best! +[ 500/5000] 10.0% train=1.3606 val=1.3500 elapsed=391s ETA=3510s best! 
+[ 750/5000] 15.0% train=1.1540 val=1.1411 elapsed=575s ETA=3250s best! +[ 1000/5000] 20.0% train=1.0332 val=1.0296 elapsed=757s ETA=3024s best! +[ 1250/5000] 25.0% train=0.9657 val=0.9556 elapsed=941s ETA=2819s best! +[ 1500/5000] 30.0% train=0.9305 val=0.9189 elapsed=1124s ETA=2619s best! +[ 1750/5000] 35.0% train=0.8935 val=0.8853 elapsed=1306s ETA=2424s best! +[ 2000/5000] 40.0% train=0.8673 val=0.8602 elapsed=1490s ETA=2233s best! +[ 2250/5000] 45.0% train=0.8413 val=0.8367 elapsed=1672s ETA=2042s best! +[ 2500/5000] 50.0% train=0.8162 val=0.8141 elapsed=1855s ETA=1854s best! +[ 2750/5000] 55.0% train=0.8058 val=0.7995 elapsed=2038s ETA=1666s best! +[ 3000/5000] 60.0% train=0.7888 val=0.7803 elapsed=2221s ETA=1479s best! +[ 3250/5000] 65.0% train=0.7798 val=0.7730 elapsed=2403s ETA=1293s best! +[ 3500/5000] 70.0% train=0.7634 val=0.7551 elapsed=2585s ETA=1107s best! +[ 3750/5000] 75.0% train=0.7588 val=0.7528 elapsed=2768s ETA=922s best! +[ 4000/5000] 80.0% train=0.7480 val=0.7434 elapsed=2951s ETA=737s best! +[ 4250/5000] 85.0% train=0.7381 val=0.7351 elapsed=3134s ETA=552s best! +[ 4500/5000] 90.0% train=0.7371 val=0.7314 elapsed=3316s ETA=368s best! +[ 4750/5000] 95.0% train=0.7282 val=0.7239 elapsed=3498s ETA=183s best! +[ 4999/5000] 100.0% train=0.7259 val=0.7176 elapsed=3680s ETA=0s best! diff --git a/logs/train_run_latest.log b/logs/train_run_latest.log new file mode 100644 index 0000000..5c42780 --- /dev/null +++ b/logs/train_run_latest.log @@ -0,0 +1,26 @@ +[ 0/5000] train=4.6207 val=4.6202 elapsed=2s ETA=0s << best! +[ 200/5000] train=2.2058 val=2.1986 elapsed=17s ETA=405s << best! +[ 400/5000] train=1.6111 val=1.6039 elapsed=32s ETA=367s << best! +[ 600/5000] train=1.4109 val=1.4183 elapsed=47s ETA=342s << best! +[ 800/5000] train=1.3230 val=1.3231 elapsed=61s ETA=322s << best! +[ 1000/5000] train=1.2495 val=1.2567 elapsed=76s ETA=303s << best! +[ 1200/5000] train=1.1960 val=1.1948 elapsed=90s ETA=286s << best! 
+[ 1400/5000] train=1.1569 val=1.1642 elapsed=105s ETA=270s << best! +[ 1600/5000] train=1.1283 val=1.1283 elapsed=120s ETA=254s << best! +[ 1800/5000] train=1.0894 val=1.1023 elapsed=134s ETA=238s << best! +[ 2000/5000] train=1.0731 val=1.0765 elapsed=149s ETA=223s << best! +[ 2200/5000] train=1.0584 val=1.0550 elapsed=163s ETA=208s << best! +[ 2400/5000] train=1.0415 val=1.0346 elapsed=178s ETA=192s << best! +[ 2600/5000] train=1.0261 val=1.0199 elapsed=192s ETA=177s << best! +[ 2800/5000] train=1.0106 val=1.0117 elapsed=207s ETA=162s << best! +[ 3000/5000] train=1.0000 val=0.9956 elapsed=221s ETA=148s << best! +[ 3200/5000] train=0.9913 val=0.9924 elapsed=236s ETA=133s << best! +[ 3400/5000] train=0.9727 val=0.9782 elapsed=251s ETA=118s << best! +[ 3600/5000] train=0.9656 val=0.9720 elapsed=265s ETA=103s << best! +[ 3800/5000] train=0.9685 val=0.9632 elapsed=280s ETA=88s << best! +[ 4000/5000] train=0.9601 val=0.9642 elapsed=294s ETA=74s +[ 4200/5000] train=0.9515 val=0.9489 elapsed=309s ETA=59s << best! +[ 4400/5000] train=0.9433 val=0.9431 elapsed=323s ETA=44s << best! +[ 4600/5000] train=0.9384 val=0.9459 elapsed=338s ETA=29s +[ 4800/5000] train=0.9331 val=0.9250 elapsed=353s ETA=15s << best! 
+[ 4999/5000] train=0.9307 val=0.9430 elapsed=367s ETA=0s diff --git a/train_test/infer.cu b/train_test/infer.cu deleted file mode 100644 index 66dd660..0000000 --- a/train_test/infer.cu +++ /dev/null @@ -1,568 +0,0 @@ -// This need to be tested on a real gpu -#include -#include -#include -#include -#include -#include -#include -#include - -// Config -typedef struct -{ - int vocab_size; - int block_size; - int n_embd; - int n_head; - int n_layer; -} Config; - -// ── Error checking -#define CUDA_CHECK(call) \ - do \ - { \ - cudaError_t err = call; \ - if (err != cudaSuccess) \ - { \ - fprintf(stderr, "[CUDA Error] %s at line %d: %s\n", \ - #call, __LINE__, cudaGetErrorString(err)); \ - exit(1); \ - } \ - } while (0) - -#define CUBLAS_CHECK(call) \ - do \ - { \ - cublasStatus_t err = call; \ - if (err != CUBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, "[cuBLAS Error] %s at line %d: %d\n", \ - #call, __LINE__, err); \ - exit(1); \ - } \ - } while (0) - -// Signal handler -static volatile int running = 1; -void handle_sigint(int s) -{ - (void)s; - printf("\n\n[Stopped by user]\n"); - running = 0; -} - -// Build x[T x C] from token + position embeddings -__global__ void embed_kernel(float *x, float *tok_emb, float *pos_emb, - int *tokens, int T, int C) -{ - int t = blockIdx.x; - int c = threadIdx.x; - if (t < T && c < C) - x[t * C + c] = tok_emb[tokens[t] * C + c] + pos_emb[t * C + c]; -} - -// LayerNorm over one row of length C -__global__ void layernorm_kernel(float *out, float *x, float *w, float *b, - int T, int C) -{ - int t = blockIdx.x; - if (t >= T) - return; - - float *xr = x + t * C; - float *outr = out + t * C; - - float mean = 0.0f; - for (int i = 0; i < C; i++) - mean += xr[i]; - mean /= C; - - float var = 0.0f; - for (int i = 0; i < C; i++) - var += (xr[i] - mean) * (xr[i] - mean); - var = var / C + 1e-5f; - float inv = rsqrtf(var); - - for (int i = 0; i < C; i++) - outr[i] = (xr[i] - mean) * inv * w[i] + b[i]; -} - -// Add bias in-place: x[T x N] += b[N] 
-__global__ void add_bias_kernel(float *x, float *b, int T, int N) -{ - int t = blockIdx.x; - int n = threadIdx.x; - if (t < T && n < N) - x[t * N + n] += b[n]; -} - -// ReLU in-place -__global__ void relu_kernel(float *x, int n) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n && x[i] < 0.0f) - x[i] = 0.0f; -} - -// Residual add: a += b -__global__ void residual_kernel(float *a, float *b, int n) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) - a[i] += b[i]; -} - -// Causal attention scores + softmax for one head -__global__ void attention_kernel(float *att, float *Q, float *K, - int T, int hs) -{ - int i = blockIdx.x; // query position - int j = threadIdx.x; // key position - if (i >= T || j >= T) - return; - - if (j > i) - { - att[i * T + j] = -1e9f; - return; - } - float scale = rsqrtf((float)hs); - float dot = 0.0f; - for (int k = 0; k < hs; k++) - dot += Q[i * hs + k] * K[j * hs + k]; - att[i * T + j] = dot * scale; - - // Softmax (done per row, only thread 0 of each row) - __syncthreads(); - if (j == 0) - { - float *row = att + i * T; - float mx = -1e9f; - for (int x = 0; x <= i; x++) - if (row[x] > mx) - mx = row[x]; - float sum = 0.0f; - for (int x = 0; x < T; x++) - { - row[x] = (x <= i) ? 
expf(row[x] - mx) : 0.0f; - sum += row[x]; - } - for (int x = 0; x < T; x++) - row[x] /= sum; - } -} - -// Weighted sum: hv[T x hs] = att[T x T] @ V[T x hs] -__global__ void attn_value_kernel(float *hv, float *att, float *V, - int T, int hs) -{ - int i = blockIdx.x; - int k = threadIdx.x; - if (i >= T || k >= hs) - return; - float s = 0.0f; - for (int j = 0; j <= i; j++) - s += att[i * T + j] * V[j * hs + k]; - hv[i * hs + k] = s; -} - -// Scatter head output into full [T x C] buffer at offset h*hs -__global__ void scatter_head_kernel(float *head_out, float *hv, - int T, int C, int hs, int h_offset) -{ - int t = blockIdx.x; - int k = threadIdx.x; - if (t < T && k < hs) - head_out[t * C + h_offset + k] = hv[t * hs + k]; -} - -// Softmax over logits (single row, run on CPU side for simplicity) -static void softmax_cpu(float *x, int n) -{ - float mx = x[0]; - for (int i = 1; i < n; i++) - if (x[i] > mx) - mx = x[i]; - float sum = 0.0f; - for (int i = 0; i < n; i++) - { - x[i] = expf(x[i] - mx); - sum += x[i]; - } - for (int i = 0; i < n; i++) - x[i] /= sum; -} - -static int sample(float *probs, int n) -{ - float r = (float)rand() / ((float)RAND_MAX + 1.0f); - float cdf = 0.0f; - for (int i = 0; i < n; i++) - { - cdf += probs[i]; - if (r < cdf) - return i; - } - return n - 1; -} - -// Weight struct (GPU pointers,need to be tested on a GPU ) -typedef struct -{ - float *tok_emb; // [vocab x C] - float *pos_emb; // [block x C] - float **head_k; // [n_layer x n_head][hs x C] - float **head_q; - float **head_v; - float **sa_proj_w; // [n_layer][C x C] - float **sa_proj_b; // [n_layer][C] - float **ff_w1; // [n_layer][4C x C] - float **ff_b1; // [n_layer][4C] - float **ff_w2; // [n_layer][C x 4C] - float **ff_b2; // [n_layer][C] - float **ln1_w; // [n_layer][C] - float **ln1_b; - float **ln2_w; - float **ln2_b; - float *ln_f_w; // [C] - float *ln_f_b; - float *lm_w; // [vocab x C] - float *lm_b; // [vocab] -} Weights; - -// Read tensor from file and upload to GPU -static 
float *read_upload(FILE *f) -{ - int ndim; - fread(&ndim, sizeof(int), 1, f); - int total = 1; - for (int i = 0; i < ndim; i++) - { - int d; - fread(&d, sizeof(int), 1, f); - total *= d; - } - float *cpu = (float *)malloc(total * sizeof(float)); - fread(cpu, sizeof(float), total, f); - float *gpu; - CUDA_CHECK(cudaMalloc(&gpu, total * sizeof(float))); - CUDA_CHECK(cudaMemcpy(gpu, cpu, total * sizeof(float), cudaMemcpyHostToDevice)); - free(cpu); - return gpu; -} - -// Forward pass -static void forward(float *d_logits, int *d_tokens, int T, - Config *cfg, Weights *W, cublasHandle_t cublas) -{ - int C = cfg->n_embd; - int nh = cfg->n_head; - int hs = C / nh; - int ff = 4 * C; - - float one = 1.0f, zero = 0.0f; - - // Allocate working buffers - float *d_x, *d_ln_out, *d_head_out, *d_attn_out; - float *d_K, *d_Q, *d_V, *d_att, *d_hv; - float *d_ff1, *d_ff2; - - CUDA_CHECK(cudaMalloc(&d_x, T * C * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_ln_out, T * C * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_head_out, T * C * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_attn_out, T * C * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_K, T * hs * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_Q, T * hs * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_V, T * hs * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_att, T * T * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_hv, T * hs * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_ff1, T * ff * sizeof(float))); - CUDA_CHECK(cudaMalloc(&d_ff2, T * C * sizeof(float))); - - // Embed - embed_kernel<<>>(d_x, W->tok_emb, W->pos_emb, d_tokens, T, C); - - for (int l = 0; l < cfg->n_layer; l++) - { - int base = l * nh; - - // LayerNorm 1 - layernorm_kernel<<>>(d_ln_out, d_x, - W->ln1_w[l], W->ln1_b[l], T, C); - - // Multi-head attention - CUDA_CHECK(cudaMemset(d_head_out, 0, T * C * sizeof(float))); - - for (int h = 0; h < nh; h++) - { - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - hs, T, C, &one, - W->head_k[base + h], C, - d_ln_out, C, - &zero, 
d_K, hs)); - - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - hs, T, C, &one, - W->head_q[base + h], C, - d_ln_out, C, - &zero, d_Q, hs)); - - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - hs, T, C, &one, - W->head_v[base + h], C, - d_ln_out, C, - &zero, d_V, hs)); - - // Attention scores + softmax - attention_kernel<<>>(d_att, d_Q, d_K, T, hs); - - // Weighted V - attn_value_kernel<<>>(d_hv, d_att, d_V, T, hs); - - // Scatter into head_out - scatter_head_kernel<<>>(d_head_out, d_hv, T, C, hs, h * hs); - } - - // SA projection - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - C, T, C, &one, - W->sa_proj_w[l], C, - d_head_out, C, - &zero, d_attn_out, C)); - add_bias_kernel<<>>(d_attn_out, W->sa_proj_b[l], T, C); - - // Residual - residual_kernel<<<(T * C + 255) / 256, 256>>>(d_x, d_attn_out, T * C); - - // LayerNorm 2 - layernorm_kernel<<>>(d_ln_out, d_x, - W->ln2_w[l], W->ln2_b[l], T, C); - - // FF layer 1 - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - ff, T, C, &one, - W->ff_w1[l], C, - d_ln_out, C, - &zero, d_ff1, ff)); - add_bias_kernel<<>>(d_ff1, W->ff_b1[l], T, ff); - relu_kernel<<<(T * ff + 255) / 256, 256>>>(d_ff1, T * ff); - - // FF layer 2 - CUBLAS_CHECK(cublasSgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, - C, T, ff, &one, - W->ff_w2[l], ff, - d_ff1, ff, - &zero, d_ff2, C)); - add_bias_kernel<<>>(d_ff2, W->ff_b2[l], T, C); - - // Residual - residual_kernel<<<(T * C + 255) / 256, 256>>>(d_x, d_ff2, T * C); - } - - // Final layernorm on last token only - float *d_last; - CUDA_CHECK(cudaMalloc(&d_last, C * sizeof(float))); - float *d_xf; - CUDA_CHECK(cudaMalloc(&d_xf, C * sizeof(float))); - - CUDA_CHECK(cudaMemcpy(d_last, d_x + (T - 1) * C, - C * sizeof(float), cudaMemcpyDeviceToDevice)); - layernorm_kernel<<<1, 1>>>(d_xf, d_last, W->ln_f_w, W->ln_f_b, 1, C); - - // LM head: logits[vocab] = lm_w[vocab x C] @ xf[C] - CUBLAS_CHECK(cublasSgemv(cublas, CUBLAS_OP_T, - C, cfg->vocab_size, &one, - W->lm_w, C, - 
d_xf, 1, - &zero, d_logits, 1)); - - // Add lm bias - float *d_lm_b_scaled; - CUDA_CHECK(cudaMalloc(&d_lm_b_scaled, cfg->vocab_size * sizeof(float))); - CUDA_CHECK(cudaMemcpy(d_lm_b_scaled, W->lm_b, - cfg->vocab_size * sizeof(float), - cudaMemcpyDeviceToDevice)); - residual_kernel<<<(cfg->vocab_size + 255) / 256, 256>>>( - d_logits, d_lm_b_scaled, cfg->vocab_size); - - // Free buffers - cudaFree(d_x); - cudaFree(d_ln_out); - cudaFree(d_head_out); - cudaFree(d_attn_out); - cudaFree(d_K); - cudaFree(d_Q); - cudaFree(d_V); - cudaFree(d_att); - cudaFree(d_hv); - cudaFree(d_ff1); - cudaFree(d_ff2); - cudaFree(d_last); - cudaFree(d_xf); - cudaFree(d_lm_b_scaled); -} - -// Main -int main(void) -{ - signal(SIGINT, handle_sigint); - srand((unsigned)time(NULL)); - - // Check GPU - int dev_count = 0; - CUDA_CHECK(cudaGetDeviceCount(&dev_count)); - if (dev_count == 0) - { - fprintf(stderr, "[Error] No CUDA GPU found.\n"); - return 1; - } - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, 0)); - printf("[INFO] GPU: %s\n", prop.name); - - // Load vocab - FILE *fv = fopen("../vocab.bin", "rb"); - if (!fv) - { - fprintf(stderr, "[Error] Cannot open vocab.bin\n"); - return 1; - } - int vocab_size; - fread(&vocab_size, sizeof(int), 1, fv); - char *vocab = (char *)malloc(vocab_size); - for (int i = 0; i < vocab_size; i++) - { - unsigned char c; - fread(&c, 1, 1, fv); - vocab[i] = (char)c; - } - fclose(fv); - - // Load weights - FILE *fw = fopen("../weights.bin", "rb"); - if (!fw) - { - fprintf(stderr, "[Error] Cannot open weights.bin\n"); - return 1; - } - - Config cfg; - fread(&cfg.vocab_size, sizeof(int), 1, fw); - fread(&cfg.block_size, sizeof(int), 1, fw); - fread(&cfg.n_embd, sizeof(int), 1, fw); - fread(&cfg.n_head, sizeof(int), 1, fw); - fread(&cfg.n_layer, sizeof(int), 1, fw); - - int nl = cfg.n_layer, nh = cfg.n_head; - - Weights W = {0}; - W.tok_emb = read_upload(fw); - W.pos_emb = read_upload(fw); - - W.head_k = (float **)malloc(nl * nh * sizeof(float *)); 
- W.head_q = (float **)malloc(nl * nh * sizeof(float *)); - W.head_v = (float **)malloc(nl * nh * sizeof(float *)); - W.sa_proj_w = (float **)malloc(nl * sizeof(float *)); - W.sa_proj_b = (float **)malloc(nl * sizeof(float *)); - W.ff_w1 = (float **)malloc(nl * sizeof(float *)); - W.ff_b1 = (float **)malloc(nl * sizeof(float *)); - W.ff_w2 = (float **)malloc(nl * sizeof(float *)); - W.ff_b2 = (float **)malloc(nl * sizeof(float *)); - W.ln1_w = (float **)malloc(nl * sizeof(float *)); - W.ln1_b = (float **)malloc(nl * sizeof(float *)); - W.ln2_w = (float **)malloc(nl * sizeof(float *)); - W.ln2_b = (float **)malloc(nl * sizeof(float *)); - - for (int l = 0; l < nl; l++) - { - for (int h = 0; h < nh; h++) - { - W.head_k[l * nh + h] = read_upload(fw); - W.head_q[l * nh + h] = read_upload(fw); - W.head_v[l * nh + h] = read_upload(fw); - } - W.sa_proj_w[l] = read_upload(fw); - W.sa_proj_b[l] = read_upload(fw); - W.ff_w1[l] = read_upload(fw); - W.ff_b1[l] = read_upload(fw); - W.ff_w2[l] = read_upload(fw); - W.ff_b2[l] = read_upload(fw); - W.ln1_w[l] = read_upload(fw); - W.ln1_b[l] = read_upload(fw); - W.ln2_w[l] = read_upload(fw); - W.ln2_b[l] = read_upload(fw); - } - - W.ln_f_w = read_upload(fw); - W.ln_f_b = read_upload(fw); - W.lm_w = read_upload(fw); - W.lm_b = read_upload(fw); - fclose(fw); - - printf("--- Model loaded ---\n"); - printf("[INFO] vocab=%d block=%d embd=%d heads=%d layers=%d\n", - cfg.vocab_size, cfg.block_size, cfg.n_embd, cfg.n_head, cfg.n_layer); - printf("Generating text (Ctrl+C to stop)...\n\n"); - printf("--------------------------------------------------\n"); - - // cuBLAS handle - cublasHandle_t cublas; - CUBLAS_CHECK(cublasCreate(&cublas)); - - // GPU token buffer - int *d_tokens; - CUDA_CHECK(cudaMalloc(&d_tokens, cfg.block_size * sizeof(int))); - - // GPU logits buffer - float *d_logits; - CUDA_CHECK(cudaMalloc(&d_logits, cfg.vocab_size * sizeof(float))); - - // CPU logits for sampling - float *logits = (float *)malloc(cfg.vocab_size * 
sizeof(float)); - - // Context window - int *ctx = (int *)calloc(cfg.block_size, sizeof(int)); - int ctx_len = 1; - ctx[0] = 0; - - while (running) - { - int T = ctx_len < cfg.block_size ? ctx_len : cfg.block_size; - int *window = ctx + (ctx_len - T); - - // Upload tokens to GPU - CUDA_CHECK(cudaMemcpy(d_tokens, window, - T * sizeof(int), cudaMemcpyHostToDevice)); - - forward(d_logits, d_tokens, T, &cfg, &W, cublas); - - // Download logits - CUDA_CHECK(cudaMemcpy(logits, d_logits, - cfg.vocab_size * sizeof(float), - cudaMemcpyDeviceToHost)); - - softmax_cpu(logits, cfg.vocab_size); - int next = sample(logits, cfg.vocab_size); - - printf("%c", vocab[next]); - fflush(stdout); - - if (ctx_len < cfg.block_size) - ctx[ctx_len++] = next; - else - { - memmove(ctx, ctx + 1, (cfg.block_size - 1) * sizeof(int)); - ctx[cfg.block_size - 1] = next; - } - } - - cublasDestroy(cublas); - cudaFree(d_tokens); - cudaFree(d_logits); - free(logits); - free(ctx); - free(vocab); - - return 0; -} \ No newline at end of file