diff --git a/.gitignore b/.gitignore index 223c5cd..691ef30 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ picoclaw/ # Internal dev docs picoclaw/PICOLM_INTEGRATION.md +.aider* diff --git a/README.md b/README.md index 1d9ad7b..32853e1 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ The model file (638MB) stays on disk. PicoLM **memory-maps** it and streams one | **FP16 KV Cache** | Halves KV cache memory (44MB vs 88MB for 2048 context) | | **Flash Attention** | Online softmax — no O(seq_len) attention buffer needed | | **Pre-computed RoPE** | cos/sin lookup tables eliminate transcendentals from hot loop | -| **SIMD Acceleration** | ARM NEON (Pi 3/4/5) and x86 SSE2 (Intel/AMD) auto-detected | +| **SIMD Acceleration** | ARM NEON (Pi 3/4/5), x86 SSE2/SSE3, and AVX — auto-detected at compile time | | **Fused Dot Products** | Dequantize + dot-product in one pass — no intermediate buffer | | **Multi-threaded matmul** | Parallel matrix-vector multiply across CPU cores | | **Grammar-Constrained JSON** | `--json` flag forces valid JSON output (for tool calling) | @@ -242,6 +242,10 @@ picolm.exe model.gguf -p "Hello world" -n 50 ```bash make native # x86/ARM auto-detect (recommended for local machine) +make x86 # x86-64 safe default (SSE2 only — runs on any x86-64) +make sse2 # x86-64 SSE2 only (same as x86) +make sse3 # x86-64 SSE2+SSE3+SSSE3 (AMD Phenom/Athlon, older Intel) +make avx # x86-64 AVX (Sandy Bridge+, Bulldozer+ — wider SIMD, faster) make pi # Raspberry Pi 3/4/5 (64-bit ARM + NEON SIMD) make pi-arm32 # Pi Zero / Pi 1 (32-bit ARM) make cross-pi # Cross-compile for Pi from x86 (static binary) @@ -268,6 +272,9 @@ Generation options: -c Context length override -j Number of threads (default: 4) +Performance options: + --mem Load model into RAM instead of mmap (consistent latency, more RAM) + Advanced options: --json Grammar-constrained JSON output mode --cache KV cache file (saves/loads prompt state) @@ -348,7 +355,7 @@ Measured on TinyLlama 
1.1B Q4_K_M (638 MB model): + FP16 KV cache █████████████████░░░ (halve memory bandwidth) + Pre-computed RoPE ██████████████████░░ (no sin/cos in hot loop) + Flash attention ██████████████████░░ (no O(n) attention alloc) - + NEON/SSE2 SIMD ███████████████████░ (4-wide vector ops) + + NEON/SSE2/AVX SIMD ███████████████████░ (4-wide to 8-wide vector ops) + KV cache persistence ████████████████████ (skip prefill entirely) ``` @@ -477,9 +484,13 @@ PicoLM implements 9 optimizations that brought generation speed from **1.6 tok/s 4-wide float vector operations for all hot paths. Example: dequantizing Q4_K nibbles with `vmovl_u8` → `vmovl_u16` → `vcvtq_f32_u32`, and RoPE with interleaved `vld2q_f32` / `vst2q_f32`. -### 2. x86 SSE2 SIMD +### 2. x86 SIMD (SSE2 / SSE3 / AVX) + +Three compile-time tiers for Intel/AMD: -Auto-detected on Intel/AMD. 4-wide `__m128` operations for dot products, RMSNorm, and vector operations. +- **SSE2** (`make sse2` or `make x86`): 4-wide `__m128` operations for dot products, RMSNorm, softmax, RoPE, and element-wise ops. Safe baseline for all x86-64 CPUs. +- **SSE3** (`make sse3`): adds `_mm_addsub_ps` for a cleaner RoPE rotation kernel (no sign-mask workaround needed). +- **AVX** (`make avx`): 8-wide `__m256` float accumulators for all ops. Q4_K and Q6_K dot products widen the float accumulation stage while keeping integer nibble extraction at 128-bit (no AVX2 required). RoPE processes 4 complex pairs per iteration with `_mm256_addsub_ps`. ### 3. FP16 KV Cache @@ -636,7 +647,7 @@ A: llama.cpp is excellent but requires ~200MB+ for the runtime on small models, A: TinyLlama 1.1B is a small model — it handles simple tasks (Q&A, summarization, basic reasoning, JSON generation) well. It won't match GPT-4, but it runs on a $10 board with no internet. For structured output, the `--json` grammar mode guarantees valid JSON regardless of model quality. **Q: What about GPU acceleration?** -A: PicoLM is CPU-only by design. 
The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2) provides meaningful speedup. +A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2/AVX) provides meaningful speedup. **Q: Can I use a different model?** A: Any LLaMA-architecture GGUF model works. Download from [HuggingFace](https://huggingface.co/models?search=gguf) and point PicoLM at it. Recommended quantizations: Q4_K_M (best quality/size balance) or Q2_K (smallest, lower quality). @@ -645,7 +656,8 @@ A: Any LLaMA-architecture GGUF model works. Download from [HuggingFace](https:// ## Roadmap -- [ ] AVX2/AVX-512 kernels for x86 (2-4x generation speed on modern CPUs) +- [x] AVX kernels for x86 (`make avx` — 8-wide float ops, ~2x vs SSE2) +- [ ] AVX2/AVX-512 kernels for x86 (256-bit integer ops for quantized paths) - [ ] Speculative decoding with a draft model - [ ] Context sliding window (infinite generation beyond max_seq_len) - [ ] Weight pruning for further memory reduction diff --git a/picolm/Makefile b/picolm/Makefile index 4fd3c7a..d643cb4 100644 --- a/picolm/Makefile +++ b/picolm/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -O2 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic +CFLAGS = -O3 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic LDFLAGS = -lm -lpthread SRCS = picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c TARGET = picolm @@ -11,11 +11,26 @@ MODEL_DIR ?= /opt/picolm/models native: CFLAGS += -march=native native: $(TARGET) +# --- x86-64 default (SSE2 only, safe for all x86-64) --- +x86: sse2 + +# --- x86-64 with SSE2 only --- +sse2: CFLAGS += -msse2 +sse2: $(TARGET) + +# --- x86-64 with SSE2+SSE3+SSSE3 (covers AMD Phenom/Athlon and similar without AVX) --- +sse3: CFLAGS += -msse2 -msse3 -mssse3 -mpopcnt +sse3: $(TARGET) + +# --- x86-64 with AVX (Sandy Bridge and newer Intel; Bulldozer and newer AMD) --- +avx: CFLAGS += -mavx -mpopcnt +avx: $(TARGET) + $(TARGET): $(SRCS) $(CC) 
$(CFLAGS) -o $@ $^ $(LDFLAGS) # --- Static build for single-binary deployment --- -static: CFLAGS += -march=native +static: CFLAGS += -msse2 static: LDFLAGS += -static static: $(TARGET) @@ -70,4 +85,4 @@ model: clean: rm -f $(TARGET) $(TARGET).exe *.obj *.o -.PHONY: native static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean +.PHONY: native x86 sse2 sse3 avx static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean diff --git a/picolm/model.c b/picolm/model.c index 4b4040d..b3c9efc 100644 --- a/picolm/model.c +++ b/picolm/model.c @@ -187,16 +187,53 @@ static int mmap_file(model_t *m, const char *path) { return 0; } +static int load_file_into_ram(model_t *m, const char *path) { + FILE *f = fopen(path, "rb"); + if (!f) { + fprintf(stderr, "Cannot open file: %s\n", path); + return -1; + } + + fseek(f, 0, SEEK_END); + long size = ftell(f); + fseek(f, 0, SEEK_SET); + + m->mmap_addr = malloc(size); + if (!m->mmap_addr) { + fprintf(stderr, "OOM: cannot allocate %ld bytes\n", size); + fclose(f); + return -1; + } + + if (fread(m->mmap_addr, 1, (size_t)size, f) != (size_t)size) { + fprintf(stderr, "Failed to read file\n"); + free(m->mmap_addr); + m->mmap_addr = NULL; + fclose(f); + return -1; + } + + fclose(f); + m->mmap_size = (size_t)size; + m->use_ram = 1; + return 0; +} + static void munmap_file(model_t *m) { if (!m->mmap_addr) return; + + if (m->use_ram) { + free(m->mmap_addr); + } else { #ifdef _WIN32 - UnmapViewOfFile(m->mmap_addr); - CloseHandle(m->map_handle); - CloseHandle(m->file_handle); + UnmapViewOfFile(m->mmap_addr); + CloseHandle(m->map_handle); + CloseHandle(m->file_handle); #else - munmap(m->mmap_addr, m->mmap_size); - close(m->fd); + munmap(m->mmap_addr, m->mmap_size); + close(m->fd); #endif + } m->mmap_addr = NULL; } @@ -406,6 +443,7 @@ static int parse_gguf(model_t *m, int max_seq_len) { fprintf(stderr, " n_layers=%d, vocab_size=%d, max_seq=%d\n", cfg->n_layers, cfg->vocab_size, cfg->max_seq_len); fprintf(stderr, " 
head_dim=%d, rope_base=%.1f\n", cfg->head_dim, cfg->rope_freq_base); + fprintf(stderr, " Loading mode: %s\n", m->use_ram ? "RAM" : "mmap"); free(tinfos); return 0; @@ -535,10 +573,16 @@ static int allocate_run_state(model_t *m) { /* ---- Public API ---- */ -int model_load(model_t *m, const char *path, int max_seq_len) { +int model_load(model_t *m, const char *path, int max_seq_len, int use_ram) { memset(m, 0, sizeof(*m)); + m->use_ram = use_ram; - if (mmap_file(m, path) != 0) return -1; + if (use_ram) { + if (load_file_into_ram(m, path) != 0) return -1; + } else { + if (mmap_file(m, path) != 0) return -1; + } + if (parse_gguf(m, max_seq_len) != 0) return -1; if (allocate_run_state(m) != 0) return -1; diff --git a/picolm/model.h b/picolm/model.h index 2a751e9..340669a 100644 --- a/picolm/model.h +++ b/picolm/model.h @@ -108,6 +108,7 @@ typedef struct { /* mmap bookkeeping */ void *mmap_addr; size_t mmap_size; + int use_ram; /* flag: 1 = load into RAM, 0 = use mmap */ #ifdef _WIN32 void *file_handle; void *map_handle; @@ -125,7 +126,7 @@ typedef struct { } model_t; /* Load a GGUF model file. Returns 0 on success. */ -int model_load(model_t *m, const char *path, int max_seq_len); +int model_load(model_t *m, const char *path, int max_seq_len, int use_ram); /* Run one forward pass. Returns pointer to logits[vocab_size]. 
*/ float *model_forward(model_t *m, int token, int pos); diff --git a/picolm/picolm.c b/picolm/picolm.c index c2f1624..5ef0ef0 100644 --- a/picolm/picolm.c +++ b/picolm/picolm.c @@ -40,6 +40,7 @@ static void usage(const char *prog) { fprintf(stderr, "\nAdvanced options:\n"); fprintf(stderr, " --json Grammar-constrained JSON output mode\n"); fprintf(stderr, " --cache KV cache file (saves/loads prompt state)\n"); + fprintf(stderr, " --mem Load model into RAM instead of memory-mapping\n"); } static char *read_stdin(void) { @@ -77,6 +78,7 @@ int main(int argc, char **argv) { int num_threads = 4; int json_mode = 0; const char *cache_file = NULL; + int use_ram = 0; /* 0 = mmap (default), 1 = load into RAM */ /* Parse arguments */ for (int i = 2; i < argc; i++) { @@ -98,6 +100,8 @@ int main(int argc, char **argv) { json_mode = 1; } else if (strcmp(argv[i], "--cache") == 0 && i + 1 < argc) { cache_file = argv[++i]; + } else if (strcmp(argv[i], "--mem") == 0) { + use_ram = 1; } else { fprintf(stderr, "Unknown option: %s\n", argv[i]); usage(argv[0]); @@ -131,8 +135,10 @@ int main(int argc, char **argv) { /* Load model */ fprintf(stderr, "Loading model: %s\n", model_path); + fprintf(stderr, "Loading mode: %s\n", use_ram ? 
"RAM" : "mmap"); + model_t model; - if (model_load(&model, model_path, context_override) != 0) { + if (model_load(&model, model_path, context_override, use_ram) != 0) { fprintf(stderr, "Failed to load model\n"); return 1; } diff --git a/picolm/quant.c b/picolm/quant.c index 5a92998..7c3e1e4 100644 --- a/picolm/quant.c +++ b/picolm/quant.c @@ -352,6 +352,18 @@ float vec_dot_f32_f32(const void *src, const float *x, int n) { for (; i < n; i++) sum += w[i] * x[i]; return sum; +#elif defined(PICOLM_AVX) + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + int i = 0; + for (; i + 15 < n; i += 16) { + acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i), _mm256_loadu_ps(x + i))); + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_loadu_ps(w + i + 8), _mm256_loadu_ps(x + i + 8))); + } + float sum = hsum_avx(_mm256_add_ps(acc0, acc1)); + for (; i < n; i++) sum += w[i] * x[i]; + return sum; + #elif defined(PICOLM_SSE2) __m128 acc0 = _mm_setzero_ps(); __m128 acc1 = _mm_setzero_ps(); @@ -440,6 +452,85 @@ float vec_dot_q4_K_f32(const void *src, const float *x, int n) { float sum_x1 = vaddvq_f32_compat(sum_x1_v); float sum_qx2 = vaddvq_f32_compat(sum_qx2_v); float sum_x2 = vaddvq_f32_compat(sum_x2_v); +#elif defined(PICOLM_AVX) + /* AVX: nibble extraction stays 128-bit (no AVX2 int), float + * accumulators widen to 256-bit (8 floats) halving float ops. 
*/ + __m256 sum_qx1_v = _mm256_setzero_ps(); + __m256 sum_x1_v = _mm256_setzero_ps(); + __m256 sum_qx2_v = _mm256_setzero_ps(); + __m256 sum_x2_v = _mm256_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + + /* Combine two __m128 → one __m256 of 8 floats */ + __m256 qf_lo = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i))); + __m256 qf_hi = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i))); + + __m256 xv_lo = _mm256_loadu_ps(xp + l); + __m256 xv_hi = _mm256_loadu_ps(xp + l + 32); + + sum_qx1_v = _mm256_add_ps(sum_qx1_v, _mm256_mul_ps(qf_lo, xv_lo)); + sum_x1_v = _mm256_add_ps(sum_x1_v, xv_lo); + sum_qx2_v = _mm256_add_ps(sum_qx2_v, _mm256_mul_ps(qf_hi, xv_hi)); + sum_x2_v = _mm256_add_ps(sum_x2_v, xv_hi); + } + float sum_qx1 = hsum_avx(sum_qx1_v); + float sum_x1 = hsum_avx(sum_x1_v); + float sum_qx2 = hsum_avx(sum_qx2_v); + float sum_x2 = hsum_avx(sum_x2_v); +#elif defined(PICOLM_SSE2) + /* SSE2: process 8 quantized bytes per iteration. + * Each byte holds two nibbles: lo=q[l]&0xF for first group, + * hi=q[l]>>4 for second group (offset by 32 in xp). 
*/ + __m128 sum_qx1_v = _mm_setzero_ps(); + __m128 sum_x1_v = _mm_setzero_ps(); + __m128 sum_qx2_v = _mm_setzero_ps(); + __m128 sum_x2_v = _mm_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + /* Load 8 quantized bytes -> 8 lo + 8 hi nibbles */ + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + /* Widen uint8 nibbles -> int32 -> float (8 values each) */ + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + __m128 qf_lo0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i)); + __m128 qf_lo1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)); + __m128 qf_hi0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i)); + __m128 qf_hi1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)); + + __m128 xv_lo0 = _mm_loadu_ps(xp + l); + __m128 xv_lo1 = _mm_loadu_ps(xp + l + 4); + __m128 xv_hi0 = _mm_loadu_ps(xp + l + 32); + __m128 xv_hi1 = _mm_loadu_ps(xp + l + 36); + + sum_qx1_v = _mm_add_ps(sum_qx1_v, + _mm_add_ps(_mm_mul_ps(qf_lo0, xv_lo0), _mm_mul_ps(qf_lo1, xv_lo1))); + sum_x1_v = _mm_add_ps(sum_x1_v, _mm_add_ps(xv_lo0, xv_lo1)); + sum_qx2_v = _mm_add_ps(sum_qx2_v, + _mm_add_ps(_mm_mul_ps(qf_hi0, xv_hi0), _mm_mul_ps(qf_hi1, xv_hi1))); + sum_x2_v = _mm_add_ps(sum_x2_v, _mm_add_ps(xv_hi0, xv_hi1)); + } + float sum_qx1 = hsum_sse(sum_qx1_v); + float sum_x1 = hsum_sse(sum_x1_v); + float sum_qx2 = hsum_sse(sum_qx2_v); + float sum_x2 = hsum_sse(sum_x2_v); #else float sum_qx1 = 0.0f, sum_x1 = 0.0f; float sum_qx2 = 0.0f, sum_x2 = 0.0f; @@ -476,9 +567,147 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { const int8_t *sc = blocks[i].scales; const float *xp = x + i * 256; - /* Accumulate per-scale-group sums: 16 groups of 16 elements each */ float sums[16] = {0}; +/* Shared macro: sign-extend int8 → float via 128-bit 
SSE2 ops. + * Used in both the AVX and SSE2 paths (integer extraction is always 128-bit). */ +#define Q6K_CONV(qi8, fa, fb) do { \ + __m128i w16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero_i, qi8), 8); \ + fa = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero_i, w16), 16)); \ + fb = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero_i, w16), 16)); \ +} while (0) + +#ifdef PICOLM_AVX + /* AVX: same 128-bit integer extraction as SSE2, but float + * accumulators are 256-bit — halves the number of float instructions. */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 
= _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + /* Merge two __m128 → one __m256, load 8 floats at once */ + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_set_m128(qf1b, qf1a), _mm256_loadu_ps(xp_c + l))); + acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_set_m128(qf2b, qf2a), _mm256_loadu_ps(xp_c + l + 32))); + acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_set_m128(qf3b, qf3a), _mm256_loadu_ps(xp_c + l + 64))); + acc4 = _mm256_add_ps(acc4, _mm256_mul_ps(_mm256_set_m128(qf4b, qf4a), _mm256_loadu_ps(xp_c + l + 96))); + } + sums[sidx + 0] += hsum_avx(acc1); + sums[sidx + 2] += hsum_avx(acc2); + sums[sidx + 4] += hsum_avx(acc3); + sums[sidx + 6] += hsum_avx(acc4); + } + } +#elif defined(PICOLM_SSE2) + /* SSE2: process 8 elements per inner iteration. + * Each 6-bit value: low 4 bits from ql, high 2 bits from qh. + * Values are biased by 32 (range -32..31). 
*/ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + /* half=0 -> l=0..15 -> sums[is+0,2,4,6] + * half=1 -> l=16..31 -> sums[is+1,3,5,7] */ + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m128 acc1a = _mm_setzero_ps(), acc1b = _mm_setzero_ps(); + __m128 acc2a = _mm_setzero_ps(), acc2b = _mm_setzero_ps(); + __m128 acc3a = _mm_setzero_ps(), acc3b = _mm_setzero_ps(); + __m128 acc4a = _mm_setzero_ps(), acc4b = _mm_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + /* Load 8 bytes each of ql and qh */ + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + /* Extract nibbles from ql */ + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + /* Extract 2-bit groups from qh (byte-level shifts via epi16 + mask) */ + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + /* Build 6-bit values as signed int8, range -32..31. 
*/ + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + acc1a = _mm_add_ps(acc1a, _mm_mul_ps(qf1a, _mm_loadu_ps(xp_c + l))); + acc1b = _mm_add_ps(acc1b, _mm_mul_ps(qf1b, _mm_loadu_ps(xp_c + l + 4))); + acc2a = _mm_add_ps(acc2a, _mm_mul_ps(qf2a, _mm_loadu_ps(xp_c + l + 32))); + acc2b = _mm_add_ps(acc2b, _mm_mul_ps(qf2b, _mm_loadu_ps(xp_c + l + 36))); + acc3a = _mm_add_ps(acc3a, _mm_mul_ps(qf3a, _mm_loadu_ps(xp_c + l + 64))); + acc3b = _mm_add_ps(acc3b, _mm_mul_ps(qf3b, _mm_loadu_ps(xp_c + l + 68))); + acc4a = _mm_add_ps(acc4a, _mm_mul_ps(qf4a, _mm_loadu_ps(xp_c + l + 96))); + acc4b = _mm_add_ps(acc4b, _mm_mul_ps(qf4b, _mm_loadu_ps(xp_c + l + 100))); + } + sums[sidx + 0] += hsum_sse(_mm_add_ps(acc1a, acc1b)); + sums[sidx + 2] += hsum_sse(_mm_add_ps(acc2a, acc2b)); + sums[sidx + 4] += hsum_sse(_mm_add_ps(acc3a, acc3b)); + sums[sidx + 6] += hsum_sse(_mm_add_ps(acc4a, acc4b)); + } + } +#else for (int chunk = 0; chunk < 2; chunk++) { int is = chunk * 8; const uint8_t *ql_c = ql + chunk * 64; @@ -506,6 +735,9 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { sums[is + 7] += (float)q4 * xp_c[l + 96]; } } +#endif + +#undef Q6K_CONV for (int j = 0; j < 16; j++) { sumf += d * (float)sc[j] * sums[j]; diff --git a/picolm/quant.h b/picolm/quant.h index e35095c..d9a89fc 100644 --- a/picolm/quant.h +++ b/picolm/quant.h @@ -30,6 +30,21 @@ static inline float hsum_sse(__m128 v) { } #endif +#if defined(PICOLM_SSE2) && defined(__SSE3__) +#define PICOLM_SSE3 1 +#include <pmmintrin.h> +#endif + +#if defined(PICOLM_SSE2) && 
defined(__AVX__) +#define PICOLM_AVX 1 +/* immintrin.h already included above; AVX intrinsics are available */ +static inline float hsum_avx(__m256 v) { + __m128 lo = _mm256_castps256_ps128(v); + __m128 hi = _mm256_extractf128_ps(v, 1); + return hsum_sse(_mm_add_ps(lo, hi)); +} +#endif + /* GGUF tensor data types */ typedef enum { GGUF_TYPE_F32 = 0, diff --git a/picolm/tensor.c b/picolm/tensor.c index 59a68e6..801076e 100644 --- a/picolm/tensor.c +++ b/picolm/tensor.c @@ -137,6 +137,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { } ss = vaddvq_f32_compat(acc); for (; i < size; i++) ss += x[i] * x[i]; +#elif defined(PICOLM_AVX) + __m256 acc = _mm256_setzero_ps(); + int i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + acc = _mm256_add_ps(acc, _mm256_mul_ps(v, v)); + } + ss = hsum_avx(acc); + for (; i < size; i++) ss += x[i] * x[i]; #elif defined(PICOLM_SSE2) __m128 acc = _mm_setzero_ps(); int i = 0; @@ -161,6 +170,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { vst1q_f32(out + i, vmulq_f32(vmulq_f32(v, scale), w)); } for (; i < size; i++) out[i] = x[i] * ss * weight[i]; +#elif defined(PICOLM_AVX) + __m256 scale = _mm256_set1_ps(ss); + i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + __m256 w = _mm256_loadu_ps(weight + i); + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_mul_ps(v, scale), w)); + } + for (; i < size; i++) out[i] = x[i] * ss * weight[i]; #elif defined(PICOLM_SSE2) __m128 scale = _mm_set1_ps(ss); i = 0; @@ -194,6 +212,13 @@ void softmax(float *x, int size) { vst1q_f32(x + i, vmulq_f32(vld1q_f32(x + i), inv_v)); } for (; i < size; i++) x[i] *= inv; +#elif defined(PICOLM_AVX) + __m256 inv_v = _mm256_set1_ps(inv); + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(x + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), inv_v)); + } + for (; i < size; i++) x[i] *= inv; #elif defined(PICOLM_SSE2) __m128 inv_v = _mm_set1_ps(inv); int i 
= 0; @@ -206,6 +231,76 @@ void softmax(float *x, int size) { #endif } +/* ---- AVX RoPE helper: 4 complex pairs per iteration ---- */ +#ifdef PICOLM_AVX +static void rope_avx(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; + /* Process 4 complex pairs at a time: [r0,i0,r1,i1,r2,i2,r3,i3] + * Broadcast cos/sin: [c0,c0,c1,c1,c2,c2,c3,c3] via unpacklo/hi + set_m128. + * Swap pairs with permute_ps(0xB1) within each 128-bit lane. + * _mm256_addsub_ps: subtract even lanes, add odd lanes. */ + for (; i + 3 < half; i += 4) { + __m256 v = _mm256_loadu_ps(h + i * 2); + __m128 c4 = _mm_loadu_ps(cos_pos + i); + __m128 s4 = _mm_loadu_ps(sin_pos + i); + __m256 cv = _mm256_set_m128(_mm_unpackhi_ps(c4, c4), _mm_unpacklo_ps(c4, c4)); + __m256 sv = _mm256_set_m128(_mm_unpackhi_ps(s4, s4), _mm_unpacklo_ps(s4, s4)); + __m256 sw = _mm256_permute_ps(v, 0xB1); /* swap r,i within each pair */ + _mm256_storeu_ps(h + i * 2, + _mm256_addsub_ps(_mm256_mul_ps(v, cv), _mm256_mul_ps(sw, sv))); + } + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + +/* ---- SSE2 RoPE helper: apply rotation to one head ---- */ +#if defined(PICOLM_SSE2) && !defined(PICOLM_AVX) +static void rope_sse(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; + /* Process 2 complex pairs at a time: [r0, i0, r1, i1] + * r_new = r*cos - i*sin, i_new = r*sin + i*cos + * + * With SSE: swap pairs to get [i0, r0, i1, r1], broadcast cos/sin, + * then use addsub (SSE3) or sign-mask trick (SSE2). 
+ * a = v * cv = [r0*c0, i0*c0, r1*c1, i1*c1] + * b = sw * sv = [i0*s0, r0*s0, i1*s1, r1*s1] + * result[even] = a - b, result[odd] = a + b */ +#ifdef PICOLM_SSE3 + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + _mm_storeu_ps(h + i * 2, _mm_addsub_ps(_mm_mul_ps(v, cv), _mm_mul_ps(sw, sv))); + } +#else + const __m128 sign = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + __m128 a = _mm_mul_ps(v, cv); + __m128 b = _mm_mul_ps(_mm_mul_ps(sign, sw), sv); + _mm_storeu_ps(h + i * 2, _mm_add_ps(a, b)); + } +#endif + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + /* Rotary position encoding using pre-computed cos/sin tables */ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, const float *cos_pos, const float *sin_pos) { @@ -233,6 +328,10 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, qh[i * 2] = q0 * cos_pos[i] - q1 * sin_pos[i]; qh[i * 2 + 1] = q0 * sin_pos[i] + q1 * cos_pos[i]; } +#elif defined(PICOLM_AVX) + rope_avx(qh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(qh, half, cos_pos, sin_pos); 
#else for (int i = 0; i < half; i++) { float q0 = qh[i * 2]; @@ -246,12 +345,18 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, /* Apply RoPE to all KV heads */ for (int h = 0; h < n_kv_heads; h++) { float *kh = k + h * head_dim; +#ifdef PICOLM_AVX + rope_avx(kh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(kh, half, cos_pos, sin_pos); +#else for (int i = 0; i < half; i++) { float k0 = kh[i * 2]; float k1 = kh[i * 2 + 1]; kh[i * 2] = k0 * cos_pos[i] - k1 * sin_pos[i]; kh[i * 2 + 1] = k0 * sin_pos[i] + k1 * cos_pos[i]; } +#endif } } @@ -268,6 +373,12 @@ void elemwise_mul(float *out, const float *a, const float *b, int size) { vst1q_f32(out + i, vmulq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) out[i] = a[i] * b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) out[i] = a[i] * b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) { @@ -286,6 +397,12 @@ void vec_add(float *a, const float *b, int size) { vst1q_f32(a + i, vaddq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) a[i] += b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(a + i, _mm256_add_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) a[i] += b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) {