beepboop2025 · beepboop2025 · Jun 5, 2026
diff --git a/.env.example b/.env.example
@@ -32,6 +32,15 @@ TOR_PROXY=socks5://127.0.0.1:9050
 ANTHROPIC_API_KEY=
 OLLAMA_URL=http://localhost:11434
 
+# Free LLM router — adds an LLM sentiment tier above FinBERT/VADER.
+# Set FREE_LLM_ENABLED=false to disable. Router uses only providers with a key set.
+FREE_LLM_ENABLED=true
+GROQ_API_KEY=
+CEREBRAS_API_KEY=
+GOOGLE_AI_STUDIO_API_KEY=
+MISTRAL_API_KEY=
+OPENROUTER_API_KEY=
+
 # ── Destination: DragonScope ──────────────────────────────
 DRAGONSCOPE_REDIS_URL=redis://localhost:6379/1
 DRAGONSCOPE_API_URL=http://localhost:3456

diff --git a/free_llm_router/__init__.py b/free_llm_router/__init__.py
@@ -0,0 +1,30 @@
+"""free-llm-router: failover across perpetually-free, OpenAI-compatible LLM APIs."""
+
+from .health import CircuitBreaker, State
+from .providers import Provider, REGISTRY, available_providers
+from .ratelimit import TokenBucket
+from .router import (
+    AllProvidersFailed,
+    FreeLLMRouter,
+    OrderFn,
+    ProviderStats,
+    TASK_TIER,
+    default_order,
+)
+
+__all__ = [
+    "FreeLLMRouter",
+    "Provider",
+    "ProviderStats",
+    "OrderFn",
+    "default_order",
+    "AllProvidersFailed",
+    "TASK_TIER",
+    "REGISTRY",
+    "available_providers",
+    "TokenBucket",
+    "CircuitBreaker",
+    "State",
+]
+
+__version__ = "0.1.0"
diff --git a/free_llm_router/health.py b/free_llm_router/health.py
@@ -0,0 +1,85 @@
+"""
+Per-provider circuit breaker.
+
+A free provider that starts 500-ing or timing out should be taken out of the
+rotation quickly and probed cautiously — otherwise every request pays the full
+timeout before failing over, and a flapping provider gets hammered.
+
+States:
+  closed     – normal; requests flow.
+  open       – too many recent failures; reject fast until cooldown elapses.
+  half_open  – cooldown elapsed; allow EXACTLY ONE probe. If it succeeds we
+               close; if it fails we re-open. Letting many probes through at
+               once was a real bug — they all rush the still-broken provider
+               and the breaker oscillates.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from enum import Enum
+
+
+class State(str, Enum):  # str mixin: each member also compares equal to its string value
+    CLOSED = "closed"  # normal; requests flow
+    OPEN = "open"  # too many recent failures; reject fast until the cooldown elapses
+    HALF_OPEN = "half_open"  # cooldown elapsed; exactly one probe request is admitted
+
+
+class CircuitBreaker:  # closed → open → half_open state machine, one instance per provider
+    def __init__(
+        self,
+        *,
+        monotonic,  # zero-argument clock callable; its units must match cooldown_sec
+        failure_threshold: int = 3,  # failures since the last success that open the circuit
+        cooldown_sec: float = 30.0,  # time spent OPEN before a probe is admitted
+    ) -> None:
+        self._monotonic = monotonic
+        self._failure_threshold = failure_threshold
+        self._cooldown_sec = cooldown_sec
+        self._state = State.CLOSED
+        self._failures = 0  # failures recorded since the last record_success()
+        self._opened_at = 0.0  # clock reading when the circuit last moved to OPEN
+        self._probe_in_flight = False  # True from admitting a probe until a result is recorded
+        self._lock = asyncio.Lock()  # serializes allow() and both record_* methods
+
+    async def allow(self) -> bool:
+        """Return True if a request may proceed now; may move OPEN → HALF_OPEN as a side effect."""
+        async with self._lock:
+            if self._state is State.CLOSED:
+                return True
+            if self._state is State.OPEN:
+                if self._monotonic() - self._opened_at >= self._cooldown_sec:
+                    # cooldown elapsed → permit a single probe
+                    self._state = State.HALF_OPEN
+                    self._probe_in_flight = True
+                    return True
+                return False
+            # HALF_OPEN: admit a probe only when none is already in flight
+            if not self._probe_in_flight:  # defensive: both record_* methods leave HALF_OPEN
+                self._probe_in_flight = True
+                return True
+            return False  # NOTE(review): stays False until a record_* call — confirm callers always report
+
+    async def record_success(self) -> None:  # any success closes the circuit, whatever the state
+        async with self._lock:
+            self._failures = 0
+            self._probe_in_flight = False
+            self._state = State.CLOSED
+
+    async def record_failure(self) -> None:  # count a failure; open the circuit at the threshold
+        async with self._lock:
+            self._probe_in_flight = False
+            if self._state is State.HALF_OPEN:
+                # probe failed → straight back to open, restart cooldown
+                self._state = State.OPEN
+                self._opened_at = self._monotonic()
+                return
+            self._failures += 1
+            if self._failures >= self._failure_threshold:
+                self._state = State.OPEN  # a late failure while already OPEN lands here too
+                self._opened_at = self._monotonic()  # ...and so restarts the cooldown
+
+    @property
+    def state(self) -> State:  # plain read of the current state; does not take the lock
+        return self._state
diff --git a/free_llm_router/policy.py b/free_llm_router/policy.py
@@ -0,0 +1,65 @@
+"""
+Provider ordering policy — YOUR decision point.
+
+The router calls an ``OrderFn`` before every request to decide which provider to
+try first, second, third… Given a live snapshot of each provider's state, return
+the providers in the order you want them attempted.
+
+The default policy (``free_llm_router.router.default_order``) sorts by static
+``priority`` only. That's fine until reality intrudes:
+  * The top-priority provider is rate-limited *this minute* — trying it first just
+    wastes a failover hop (it'll be skipped, but it's still first in line).
+  * A provider has burned 49/50 of its daily quota — maybe save it for last.
+  * One provider has been consistently slow (high ``last_latency_ms``).
+  * A provider's circuit is half_open — risky; maybe deprioritize.
+
+`ProviderStats` gives you, per provider:
+    .provider.priority      static rank (lower = preferred)
+    .circuit_state          "closed" | "open" | "half_open"
+    .tokens_available       bool — has an RPM token to spend right now
+    .day_count / .day_limit requests spent today / documented daily cap (cap may be None)
+    .last_latency_ms        most recent successful round-trip, 0.0 if never called
+
+Tradeoffs to weigh:
+  - Latency-first ordering gets fast answers but can stampede one provider until
+    it rate-limits, then thrash.
+  - Quota-preserving ordering (spread load, save scarce daily quotas for last)
+    is gentler on the free tiers — which is the whole point of not getting banned.
+  - Health-first ordering avoids dead providers but a pure "closed-circuits-first"
+    sort ignores speed and quota entirely.
+
+There is no single right answer — it depends on whether you optimize for speed,
+for staying under the free caps, or for resilience. That's why it's yours.
+"""
+
+from __future__ import annotations
+
+from typing import List
+
+from .router import ProviderStats, default_order
+from .providers import Provider
+
+
+def smart_order(stats: List[ProviderStats]) -> List[Provider]:
+    """
+    TODO(you): Rank providers for the next request.
+
+    Return a list[Provider] in the order they should be tried. You don't have to
+    include every provider, but anything you drop simply won't be attempted this
+    call (the router still skips rate-limited / open-circuit ones defensively, so
+    dropping them is optional).
+
+    Suggested shape — sort ascending by a tuple of keys, most important key first, e.g.:
+
+        def rank(s: ProviderStats):
+            return (
+                0 if s.circuit_state == "closed" else 1,   # healthy first
+                0 if s.tokens_available else 1,             # ready-now first
+                ???,                                        # your quota / latency call
+                s.provider.priority,                        # static tie-break
+            )
+        return [s.provider for s in sorted(stats, key=rank)]
+
+    Replace the line below with your implementation.
+    """
+    return default_order(stats)  # placeholder — static priority only, ignores the live stats
diff --git a/free_llm_router/providers.py b/free_llm_router/providers.py
@@ -0,0 +1,111 @@
+"""
+Registry of perpetually-free, OpenAI-compatible LLM providers.
+
+Every provider here exposes a ``POST {base_url}/chat/completions`` endpoint that
+accepts the OpenAI request schema. That uniformity is what lets a single client
+body talk to all of them — only ``base_url``, the API key, and the model id change.
+
+We model two logical *tiers* instead of hard-coding model names at call sites:
+
+    "fast"  – small, low-latency model for classification / bulk work
+    "smart" – larger model for drafting / reasoning / summarization
+
+Each provider maps the tiers to a concrete model it offers for free. Callers ask
+for a tier; the router resolves it per-provider during failover.
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+Tier = str  # "fast" | "smart"
+
+
+@dataclass(frozen=True)  # NOTE(review): dict fields stay mutable and make hash() raise TypeError
+class Provider:
+    """A single free LLM provider and its free-tier characteristics."""
+
+    name: str                  # short identifier, e.g. "groq"
+    base_url: str              # API root; "/chat/completions" is appended to it
+    api_key_env: str           # env var holding the key
+    models: Dict[Tier, str]    # tier -> concrete model id offered for free
+    rpm: int                   # documented free-tier requests per minute
+    rpd: Optional[int]         # documented free-tier requests per day (None = unknown)
+    priority: int              # tie-breaker; lower = generally preferred
+    referer: str = ""          # OpenRouter wants HTTP-Referer/X-Title for free tier
+    extra_headers: Dict[str, str] = field(default_factory=dict)  # presumably extra HTTP headers — confirm in router
+
+    @property
+    def api_key(self) -> Optional[str]:  # read from the environment on every access
+        return os.environ.get(self.api_key_env) or None  # unset and empty string both yield None
+
+    def model_for(self, tier: Tier) -> Optional[str]:  # None when no model is mapped to `tier`
+        return self.models.get(tier)
+
+
+# ── The registry (perpetually-free tiers only — no trial-credit providers) ──────
+#
+# Limits are the documented free-tier numbers at time of writing; they drift, so
+# treat them as hints for the rate limiter rather than guarantees. Sources:
+# github.com/cheahjs/free-llm-api-resources
+
+REGISTRY: List[Provider] = [  # entries are listed in ascending `priority` order
+    Provider(
+        name="groq",
+        base_url="https://api.groq.com/openai/v1",
+        api_key_env="GROQ_API_KEY",
+        models={"fast": "llama-3.1-8b-instant", "smart": "llama-3.3-70b-versatile"},
+        rpm=30,
+        rpd=14_400,
+        priority=10,  # fastest inference of the free tiers
+    ),
+    Provider(
+        name="cerebras",
+        base_url="https://api.cerebras.ai/v1",
+        api_key_env="CEREBRAS_API_KEY",
+        models={"fast": "llama3.1-8b", "smart": "llama-3.3-70b"},
+        rpm=30,
+        rpd=14_400,
+        priority=20,
+    ),
+    Provider(
+        name="google_ai_studio",
+        # Google exposes an OpenAI-compatible shim under /v1beta/openai
+        base_url="https://generativelanguage.googleapis.com/v1beta/openai",
+        api_key_env="GOOGLE_AI_STUDIO_API_KEY",
+        models={"fast": "gemini-2.0-flash-lite", "smart": "gemini-2.0-flash"},
+        rpm=15,
+        rpd=1_500,
+        priority=30,  # generous token quota, strong quality
+    ),
+    Provider(
+        name="mistral",
+        base_url="https://api.mistral.ai/v1",
+        api_key_env="MISTRAL_API_KEY",
+        models={"fast": "open-mistral-nemo", "smart": "mistral-small-latest"},
+        rpm=60,
+        rpd=None,
+        priority=40,
+    ),
+    Provider(
+        name="openrouter",
+        base_url="https://openrouter.ai/api/v1",
+        api_key_env="OPENROUTER_API_KEY",
+        # ":free" suffixed models cost nothing on OpenRouter
+        models={
+            "fast": "meta-llama/llama-3.3-70b-instruct:free",
+            "smart": "deepseek/deepseek-r1:free",
+        },
+        rpm=20,
+        rpd=50,  # 1000/day if the account has ever topped up $10
+        priority=50,  # widest model catalog, but tightest free request cap
+        referer="https://github.com/cheahjs/free-llm-api-resources",  # NOTE(review): third-party repo URL — confirm it should identify this app
+    ),
+]
+
+
+def available_providers() -> List[Provider]:
+    """Return REGISTRY entries, in registry order, whose API-key env var is set and non-empty."""
+    return [p for p in REGISTRY if p.api_key]
diff --git a/free_llm_router/ratelimit.py b/free_llm_router/ratelimit.py
@@ -0,0 +1,64 @@
+"""
+Per-provider rate limiting.
+
+Free tiers police two axes simultaneously: requests-per-minute (burst) and
+requests-per-day (quota). We enforce both:
+
+  * RPM via a classic token bucket (smooth refill, allows short bursts).
+  * RPD via a simple daily counter: the bucket only counts; the caller checks the cap and resets it out-of-band.
+
+Async-safe. A subtle TOCTOU bug bit an earlier project: refilling tokens only
+in ``acquire`` let two coroutines both see "1 token left" before either consumed.
+Here refill happens under the same lock that does the consume, so check-and-take
+is atomic.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+
+class TokenBucket:
+    """Async token bucket: ``rpm`` tokens, refilled continuously."""
+
+    def __init__(self, rpm: int, *, monotonic) -> None:
+        # `monotonic` is injected (time.monotonic) so tests can supply a fake clock.
+        self._capacity = float(max(rpm, 1))  # rpm below 1 is clamped to 1, so the refill rate is never zero
+        self._tokens = float(max(rpm, 1))  # starts full: an initial burst up to capacity is allowed
+        self._refill_per_sec = max(rpm, 1) / 60.0
+        self._monotonic = monotonic
+        self._last = monotonic()  # clock reading at the most recent refill
+        self._lock = asyncio.Lock()
+        self._day_count = 0  # tokens handed out since the last reset_day(); never capped here
+
+    def _refill(self) -> None:  # called only with self._lock held
+        now = self._monotonic()
+        elapsed = now - self._last
+        if elapsed > 0:  # a clock that stood still or went backwards adds nothing
+            self._tokens = min(self._capacity, self._tokens + elapsed * self._refill_per_sec)
+            self._last = now
+
+    async def try_acquire(self) -> bool:
+        """Take one token if available. Returns False instead of blocking."""
+        async with self._lock:
+            self._refill()  # refill INSIDE the lock — atomic with the consume below
+            if self._tokens >= 1.0:
+                self._tokens -= 1.0
+                self._day_count += 1  # counted only when a token is granted
+                return True
+            return False
+
+    async def seconds_until_token(self) -> float:
+        """How long until at least one token is available (for backoff hints)."""
+        async with self._lock:
+            self._refill()
+            if self._tokens >= 1.0:
+                return 0.0
+            return (1.0 - self._tokens) / self._refill_per_sec  # missing fraction ÷ refill rate
+
+    @property
+    def day_count(self) -> int:  # tokens granted since the last reset_day()
+        return self._day_count
+
+    def reset_day(self) -> None:  # zeroes the daily counter; does not take self._lock
+        self._day_count = 0