RightNow-AI · cstroie · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -50,3 +50,4 @@ picoclaw/
 
 # Internal dev docs
 picoclaw/PICOLM_INTEGRATION.md
+.aider*
diff --git a/README.md b/README.md
@@ -183,7 +183,7 @@ The model file (638MB) stays on disk. PicoLM **memory-maps** it and streams one
 | **FP16 KV Cache** | Halves KV cache memory (44MB vs 88MB for 2048 context) |
 | **Flash Attention** | Online softmax — no O(seq_len) attention buffer needed |
 | **Pre-computed RoPE** | cos/sin lookup tables eliminate transcendentals from hot loop |
-| **SIMD Acceleration** | ARM NEON (Pi 3/4/5) and x86 SSE2 (Intel/AMD) auto-detected |
+| **SIMD Acceleration** | ARM NEON (Pi 3/4/5), x86 SSE2/SSE3, and AVX — auto-detected at compile time |
 | **Fused Dot Products** | Dequantize + dot-product in one pass — no intermediate buffer |
 | **Multi-threaded matmul** | Parallel matrix-vector multiply across CPU cores |
 | **Grammar-Constrained JSON** | `--json` flag forces valid JSON output (for tool calling) |
@@ -242,6 +242,10 @@ picolm.exe model.gguf -p "Hello world" -n 50
 
 ```bash
 make native      # x86/ARM auto-detect (recommended for local machine)
+make x86         # x86-64 safe default (SSE2 only — runs on any x86-64)
+make sse2        # x86-64 SSE2 only (same as x86)
+make sse3        # x86-64 SSE2+SSE3+SSSE3 (AMD Phenom/Athlon, older Intel)
+make avx         # x86-64 AVX (Sandy Bridge+, Bulldozer+ — wider SIMD, faster)
 make pi          # Raspberry Pi 3/4/5 (64-bit ARM + NEON SIMD)
 make pi-arm32    # Pi Zero / Pi 1 (32-bit ARM)
 make cross-pi    # Cross-compile for Pi from x86 (static binary)
@@ -268,6 +272,9 @@ Generation options:
   -c <int>       Context length override
   -j <int>       Number of threads (default: 4)
 
+Performance options:
+  --mem          Load model into RAM instead of mmap (consistent latency, more RAM)
+
 Advanced options:
   --json         Grammar-constrained JSON output mode
   --cache <file> KV cache file (saves/loads prompt state)
@@ -348,7 +355,7 @@ Measured on TinyLlama 1.1B Q4_K_M (638 MB model):
  + FP16 KV cache          █████████████████░░░  (halve memory bandwidth)
  + Pre-computed RoPE      ██████████████████░░  (no sin/cos in hot loop)
  + Flash attention        ██████████████████░░  (no O(n) attention alloc)
- + NEON/SSE2 SIMD         ███████████████████░  (4-wide vector ops)
+ + NEON/SSE2/AVX SIMD     ███████████████████░  (4-wide to 8-wide vector ops)
  + KV cache persistence   ████████████████████  (skip prefill entirely)
 ```
 
@@ -477,9 +484,13 @@ PicoLM implements 9 optimizations that brought generation speed from **1.6 tok/s
 
 4-wide float vector operations for all hot paths. Example: dequantizing Q4_K nibbles with `vmovl_u8` → `vmovl_u16` → `vcvtq_f32_u32`, and RoPE with interleaved `vld2q_f32` / `vst2q_f32`.
 
-### 2. x86 SSE2 SIMD
+### 2. x86 SIMD (SSE2 / SSE3 / AVX)
+
+Three compile-time tiers for Intel/AMD:
 
-Auto-detected on Intel/AMD. 4-wide `__m128` operations for dot products, RMSNorm, and vector operations.
+- **SSE2** (`make sse2` or `make x86`): 4-wide `__m128` operations for dot products, RMSNorm, softmax, RoPE, and element-wise ops. Safe baseline for all x86-64 CPUs.
+- **SSE3** (`make sse3`): adds `_mm_addsub_ps` for a cleaner RoPE rotation kernel (no sign-mask workaround needed).
+- **AVX** (`make avx`): 8-wide `__m256` float accumulators for all ops. Q4_K and Q6_K dot products widen the float accumulation stage while keeping integer nibble extraction at 128-bit (no AVX2 required). RoPE processes 4 complex pairs per iteration with `_mm256_addsub_ps`.
 
 ### 3. FP16 KV Cache
 
@@ -636,7 +647,7 @@ A: llama.cpp is excellent but requires ~200MB+ for the runtime on small models,
 A: TinyLlama 1.1B is a small model — it handles simple tasks (Q&A, summarization, basic reasoning, JSON generation) well. It won't match GPT-4, but it runs on a $10 board with no internet. For structured output, the `--json` grammar mode guarantees valid JSON regardless of model quality.
 
 **Q: What about GPU acceleration?**
-A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2) provides meaningful speedup.
+A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2/AVX) provides meaningful speedup.
 
 **Q: Can I use a different model?**
 A: Any LLaMA-architecture GGUF model works. Download from [HuggingFace](https://huggingface.co/models?search=gguf) and point PicoLM at it. Recommended quantizations: Q4_K_M (best quality/size balance) or Q2_K (smallest, lower quality).
@@ -645,7 +656,8 @@ A: Any LLaMA-architecture GGUF model works. Download from [HuggingFace](https://
 
 ## Roadmap
 
-- [ ] AVX2/AVX-512 kernels for x86 (2-4x generation speed on modern CPUs)
+- [x] AVX kernels for x86 (`make avx` — 8-wide float ops, ~2x vs SSE2)
+- [ ] AVX2/AVX-512 kernels for x86 (256-bit integer ops for quantized paths)
 - [ ] Speculative decoding with a draft model
 - [ ] Context sliding window (infinite generation beyond max_seq_len)
 - [ ] Weight pruning for further memory reduction

diff --git a/picolm/Makefile b/picolm/Makefile
@@ -1,5 +1,5 @@
 CC      = gcc
-CFLAGS  = -O2 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic
+CFLAGS  = -O3 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic
 LDFLAGS = -lm -lpthread
 SRCS    = picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c
 TARGET  = picolm
@@ -11,11 +11,26 @@ MODEL_DIR ?= /opt/picolm/models
 native: CFLAGS += -march=native
 native: $(TARGET)
 
+# --- x86-64 default (SSE2 only, safe for all x86-64) ---
+x86: sse2
+
+# --- x86-64 with SSE2 only ---
+sse2: CFLAGS += -msse2
+sse2: $(TARGET)
+
+# --- x86-64 with SSE2+SSE3+SSSE3 (covers AMD Phenom/Athlon and similar without AVX) ---
+sse3: CFLAGS += -msse2 -msse3 -mssse3 -mpopcnt
+sse3: $(TARGET)
+
+# --- x86-64 with AVX (Sandy Bridge and newer Intel; Bulldozer and newer AMD) ---
+avx: CFLAGS += -mavx -mpopcnt
+avx: $(TARGET)
+
 $(TARGET): $(SRCS)
 	$(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS)
 
 # --- Static build for single-binary deployment ---
-static: CFLAGS += -march=native
+static: CFLAGS += -msse2
 static: LDFLAGS += -static
 static: $(TARGET)
 
@@ -70,4 +85,4 @@ model:
 clean:
 	rm -f $(TARGET) $(TARGET).exe *.obj *.o
 
-.PHONY: native static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean
+.PHONY: native x86 sse2 sse3 avx static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean
diff --git a/picolm/model.c b/picolm/model.c
@@ -187,16 +187,53 @@ static int mmap_file(model_t *m, const char *path) {
     return 0;
 }
 
+static int load_file_into_ram(model_t *m, const char *path) {
+    FILE *f = fopen(path, "rb");
+    if (!f) {
+        fprintf(stderr, "Cannot open file: %s\n", path);
+        return -1;
+    }
+
+    fseek(f, 0, SEEK_END);
+    long size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+
+    m->mmap_addr = malloc(size);
+    if (!m->mmap_addr) {
+        fprintf(stderr, "OOM: cannot allocate %ld bytes\n", size);
+        fclose(f);
+        return -1;
+    }
+
+    if (fread(m->mmap_addr, 1, (size_t)size, f) != (size_t)size) {
+        fprintf(stderr, "Failed to read file\n");
+        free(m->mmap_addr);
+        m->mmap_addr = NULL;
+        fclose(f);
+        return -1;
+    }
+
+    fclose(f);
+    m->mmap_size = (size_t)size;
+    m->use_ram = 1;
+    return 0;
+}
+
 static void munmap_file(model_t *m) {
     if (!m->mmap_addr) return;
+
+    if (m->use_ram) {
+        free(m->mmap_addr);
+    } else {
 #ifdef _WIN32
-    UnmapViewOfFile(m->mmap_addr);
-    CloseHandle(m->map_handle);
-    CloseHandle(m->file_handle);
+        UnmapViewOfFile(m->mmap_addr);
+        CloseHandle(m->map_handle);
+        CloseHandle(m->file_handle);
 #else
-    munmap(m->mmap_addr, m->mmap_size);
-    close(m->fd);
+        munmap(m->mmap_addr, m->mmap_size);
+        close(m->fd);
 #endif
+    }
     m->mmap_addr = NULL;
 }
 
@@ -406,6 +443,7 @@ static int parse_gguf(model_t *m, int max_seq_len) {
     fprintf(stderr, "  n_layers=%d, vocab_size=%d, max_seq=%d\n",
             cfg->n_layers, cfg->vocab_size, cfg->max_seq_len);
     fprintf(stderr, "  head_dim=%d, rope_base=%.1f\n", cfg->head_dim, cfg->rope_freq_base);
+    fprintf(stderr, "  Loading mode: %s\n", m->use_ram ? "RAM" : "mmap");
 
     free(tinfos);
     return 0;
@@ -535,10 +573,16 @@ static int allocate_run_state(model_t *m) {
 
 /* ---- Public API ---- */
 
-int model_load(model_t *m, const char *path, int max_seq_len) {
+int model_load(model_t *m, const char *path, int max_seq_len, int use_ram) {
     memset(m, 0, sizeof(*m));
+    m->use_ram = use_ram;
 
-    if (mmap_file(m, path) != 0) return -1;
+    if (use_ram) {
+        if (load_file_into_ram(m, path) != 0) return -1;
+    } else {
+        if (mmap_file(m, path) != 0) return -1;
+    }
+
     if (parse_gguf(m, max_seq_len) != 0) return -1;
     if (allocate_run_state(m) != 0) return -1;
 

diff --git a/picolm/model.h b/picolm/model.h
@@ -108,6 +108,7 @@ typedef struct {
     /* mmap bookkeeping */
     void  *mmap_addr;
     size_t mmap_size;
+    int    use_ram;      /* flag: 1 = load into RAM, 0 = use mmap */
 #ifdef _WIN32
     void  *file_handle;
     void  *map_handle;
@@ -125,7 +126,7 @@ typedef struct {
 } model_t;
 
 /* Load a GGUF model file. Returns 0 on success. */
-int model_load(model_t *m, const char *path, int max_seq_len);
+int model_load(model_t *m, const char *path, int max_seq_len, int use_ram);
 
 /* Run one forward pass. Returns pointer to logits[vocab_size]. */
 float *model_forward(model_t *m, int token, int pos);

diff --git a/picolm/picolm.c b/picolm/picolm.c
@@ -40,6 +40,7 @@ static void usage(const char *prog) {
     fprintf(stderr, "\nAdvanced options:\n");
     fprintf(stderr, "  --json         Grammar-constrained JSON output mode\n");
     fprintf(stderr, "  --cache <file> KV cache file (saves/loads prompt state)\n");
+    fprintf(stderr, "  --mem          Load model into RAM instead of memory-mapping\n");
 }
 
 static char *read_stdin(void) {
@@ -77,6 +78,7 @@ int main(int argc, char **argv) {
     int    num_threads = 4;
     int    json_mode = 0;
     const char *cache_file = NULL;
+    int    use_ram = 0; /* 0 = mmap (default), 1 = load into RAM */
 
     /* Parse arguments */
     for (int i = 2; i < argc; i++) {
@@ -98,6 +100,8 @@ int main(int argc, char **argv) {
             json_mode = 1;
         } else if (strcmp(argv[i], "--cache") == 0 && i + 1 < argc) {
             cache_file = argv[++i];
+        } else if (strcmp(argv[i], "--mem") == 0) {
+            use_ram = 1;
         } else {
             fprintf(stderr, "Unknown option: %s\n", argv[i]);
             usage(argv[0]);
@@ -131,8 +135,10 @@ int main(int argc, char **argv) {
 
     /* Load model */
     fprintf(stderr, "Loading model: %s\n", model_path);
+    fprintf(stderr, "Loading mode: %s\n", use_ram ? "RAM" : "mmap");
+
     model_t model;
-    if (model_load(&model, model_path, context_override) != 0) {
+    if (model_load(&model, model_path, context_override, use_ram) != 0) {
         fprintf(stderr, "Failed to load model\n");
         return 1;
     }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -50,3 +50,4 @@ picoclaw/

		# Internal dev docs
		picoclaw/PICOLM_INTEGRATION.md
		.aider*