Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ TOR_PROXY=socks5://127.0.0.1:9050
ANTHROPIC_API_KEY=
OLLAMA_URL=http://localhost:11434

# Free LLM router — adds an LLM sentiment tier above FinBERT/VADER.
# Set FREE_LLM_ENABLED=false to disable. Router uses only providers with a key set.
FREE_LLM_ENABLED=true
GROQ_API_KEY=
CEREBRAS_API_KEY=
GOOGLE_AI_STUDIO_API_KEY=
MISTRAL_API_KEY=
OPENROUTER_API_KEY=

# ── Destination: DragonScope ──────────────────────────────
DRAGONSCOPE_REDIS_URL=redis://localhost:6379/1
DRAGONSCOPE_API_URL=http://localhost:3456
Expand Down
30 changes: 30 additions & 0 deletions free_llm_router/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""free-llm-router: failover across perpetually-free, OpenAI-compatible LLM APIs."""

from .health import CircuitBreaker, State
from .providers import Provider, REGISTRY, available_providers
from .ratelimit import TokenBucket
from .router import (
AllProvidersFailed,
FreeLLMRouter,
OrderFn,
ProviderStats,
TASK_TIER,
default_order,
)

# Public API of the package. Every name listed here is imported at the top of
# this module from .health, .providers, .ratelimit or .router.
__all__ = [
    # router
    "FreeLLMRouter",
    "Provider",
    "ProviderStats",
    "OrderFn",
    "default_order",
    "AllProvidersFailed",
    "TASK_TIER",
    # provider registry
    "REGISTRY",
    "available_providers",
    # rate limiting and health primitives
    "TokenBucket",
    "CircuitBreaker",
    "State",
]

# Package version string.
__version__ = "0.1.0"
85 changes: 85 additions & 0 deletions free_llm_router/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
Per-provider circuit breaker.

A free provider that starts 500-ing or timing out should be taken out of the
rotation quickly and probed cautiously — otherwise every request pays the full
timeout before failing over, and a flapping provider gets hammered.

States:
closed – normal; requests flow.
open – too many recent failures; reject fast until cooldown elapses.
half_open – cooldown elapsed; allow EXACTLY ONE probe. If it succeeds we
close; if it fails we re-open. Letting many probes through at
once was a real bug — they all rush the still-broken provider
and the breaker oscillates.
"""

from __future__ import annotations

import asyncio
from enum import Enum


class State(str, Enum):
    """Lifecycle states of a :class:`CircuitBreaker`.

    The ``str`` mixin makes every member compare equal to its plain string
    value (``State.OPEN == "open"``).
    """

    CLOSED = "closed"  # normal; requests flow
    OPEN = "open"  # tripped; reject fast until the cooldown elapses
    HALF_OPEN = "half_open"  # cooldown elapsed; a single probe is admitted


class CircuitBreaker:
    """Per-provider circuit breaker with a single-probe half-open state.

    Callers ask :meth:`allow` before each request and report the outcome with
    :meth:`record_success` or :meth:`record_failure`.

    Args:
        monotonic: Zero-argument callable returning a monotonically increasing
            time in seconds (normally ``time.monotonic``); injected so tests
            can supply a fake clock.
        failure_threshold: Number of failures recorded without an intervening
            success that trips the breaker from CLOSED to OPEN.
        cooldown_sec: How long the breaker stays OPEN before a probe is admitted.
        probe_timeout_sec: How long an admitted probe may stay unreported before
            it is considered abandoned and a replacement probe is admitted.
            Defaults to ``cooldown_sec``.
    """

    def __init__(
        self,
        *,
        monotonic,
        failure_threshold: int = 3,
        cooldown_sec: float = 30.0,
        probe_timeout_sec: float | None = None,
    ) -> None:
        self._monotonic = monotonic
        self._failure_threshold = failure_threshold
        self._cooldown_sec = cooldown_sec
        self._probe_timeout_sec = (
            cooldown_sec if probe_timeout_sec is None else probe_timeout_sec
        )
        self._state = State.CLOSED
        self._failures = 0
        self._opened_at = 0.0
        self._probe_in_flight = False
        self._probe_started_at = 0.0
        self._lock = asyncio.Lock()

    def _start_probe(self, now: float) -> bool:
        """Enter HALF_OPEN and mark a probe as in flight since ``now``."""
        self._state = State.HALF_OPEN
        self._probe_in_flight = True
        self._probe_started_at = now
        return True

    def _trip(self) -> None:
        """Enter OPEN and start a fresh cooldown from the current time."""
        self._state = State.OPEN
        self._opened_at = self._monotonic()
        self._probe_in_flight = False

    async def allow(self) -> bool:
        """Return whether a request may proceed right now.

        CLOSED always admits. OPEN rejects until ``cooldown_sec`` has elapsed
        since the breaker opened, then admits one probe and moves to HALF_OPEN.
        HALF_OPEN rejects while that probe is outstanding.
        """
        async with self._lock:
            if self._state is State.CLOSED:
                return True
            now = self._monotonic()
            if self._state is State.OPEN:
                if now - self._opened_at < self._cooldown_sec:
                    return False
                # cooldown elapsed -> permit a single probe
                return self._start_probe(now)
            # HALF_OPEN: only the one in-flight probe is allowed. A probe whose
            # outcome was never reported (e.g. its task was cancelled) must not
            # pin the breaker here forever, so it expires after the timeout.
            probe_pending = (
                self._probe_in_flight
                and now - self._probe_started_at < self._probe_timeout_sec
            )
            if probe_pending:
                return False
            return self._start_probe(now)

    async def record_success(self) -> None:
        """Report a successful request: clear the failure count and close."""
        async with self._lock:
            self._failures = 0
            self._probe_in_flight = False
            self._state = State.CLOSED

    async def record_failure(self) -> None:
        """Report a failed request.

        In CLOSED the failure is counted and the breaker trips once the count
        reaches ``failure_threshold``. In HALF_OPEN the failed probe re-opens
        the breaker and restarts the cooldown. In OPEN the failure is ignored.
        """
        async with self._lock:
            if self._state is State.OPEN:
                # Nothing is admitted while OPEN, so this is a straggler from a
                # request admitted before the trip; it must not extend the
                # cooldown that is already running.
                return
            if self._state is State.HALF_OPEN:
                # probe failed -> straight back to open, restart cooldown
                self._trip()
                return
            self._failures += 1
            if self._failures >= self._failure_threshold:
                self._trip()

    @property
    def state(self) -> State:
        """Current state as last recorded.

        The OPEN -> HALF_OPEN transition happens inside :meth:`allow`, so this
        can still read OPEN after the cooldown has elapsed.
        """
        return self._state
65 changes: 65 additions & 0 deletions free_llm_router/policy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
"""
Provider ordering policy — YOUR decision point.

The router calls an ``OrderFn`` before every request to decide which provider to
try first, second, third… Given a live snapshot of each provider's state, return
the providers in the order you want them attempted.

The default policy (``free_llm_router.router.default_order``) sorts by static
``priority`` only. That's fine until reality intrudes:
* The top-priority provider is rate-limited *this minute* — trying it first just
wastes a failover hop (it'll be skipped, but it's still first in line).
* A provider has burned 49/50 of its daily quota — maybe save it for last.
* One provider has been consistently slow (high ``last_latency_ms``).
* A provider's circuit is half_open — risky; maybe deprioritize.

`ProviderStats` gives you, per provider:
.provider.priority static rank (lower = preferred)
.circuit_state "closed" | "open" | "half_open"
.tokens_available bool — has an RPM token to spend right now
.day_count / .day_limit requests spent today / documented daily cap (cap may be None)
.last_latency_ms most recent successful round-trip, 0.0 if never called

Tradeoffs to weigh:
- Latency-first ordering gets fast answers but can stampede one provider until
it rate-limits, then thrash.
- Quota-preserving ordering (spread load, save scarce daily quotas for last)
is gentler on the free tiers — which is the whole point of not getting banned.
- Health-first ordering avoids dead providers but a pure "closed-circuits-first"
sort ignores speed and quota entirely.

There is no single right answer — it depends on whether you optimize for speed,
for staying under the free caps, or for resilience. That's why it's yours.
"""

from __future__ import annotations

from typing import List

from .router import ProviderStats, default_order
from .providers import Provider


def smart_order(stats: List[ProviderStats]) -> List[Provider]:
    """
    TODO(you): Decide the order in which providers are attempted.

    Takes one ``ProviderStats`` snapshot per provider and returns a
    ``list[Provider]`` in attempt order. Leaving a provider out is allowed; it
    simply won't be attempted for this call. (The router still skips
    rate-limited / open-circuit providers defensively, so dropping them here
    is optional.)

    One workable shape is a tuple sort key, cheapest-to-violate first, e.g.:

        def rank(s: ProviderStats):
            return (
                0 if s.circuit_state == "closed" else 1,  # healthy first
                0 if s.tokens_available else 1,           # ready-now first
                ???,                                      # your quota / latency call
                s.provider.priority,                      # static tie-break
            )
        return [s.provider for s in sorted(stats, key=rank)]

    Until that is written, this function is a placeholder that hands the
    decision to ``default_order``.
    """
    # placeholder — delegates to static priority
    ordered = default_order(stats)
    return ordered
111 changes: 111 additions & 0 deletions free_llm_router/providers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Registry of perpetually-free, OpenAI-compatible LLM providers.

Every provider here exposes a ``POST {base_url}/chat/completions`` endpoint that
accepts the OpenAI request schema. That uniformity is what lets a single client
body talk to all of them — only ``base_url``, the API key, and the model id change.

We model two logical *tiers* instead of hard-coding model names at call sites:

"fast" – small, low-latency model for classification / bulk work
"smart" – larger model for drafting / reasoning / summarization

Each provider maps the tiers to a concrete model it offers for free. Callers ask
for a tier; the router resolves it per-provider during failover.
"""

from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional

Tier = str  # "fast" | "smart"


@dataclass(frozen=True)
class Provider:
    """A single free LLM provider and its free-tier characteristics.

    Instances are immutable and hashable, so they can be used as dict keys or
    set members. The two dict-valued fields are excluded from the hash (a dict
    is unhashable and would make ``hash(provider)`` raise ``TypeError``) but
    still take part in equality.
    """

    name: str
    base_url: str
    api_key_env: str  # env var holding the key
    # tier -> concrete model id offered for free
    models: Dict[Tier, str] = field(hash=False)
    rpm: int  # documented free-tier requests per minute
    rpd: Optional[int]  # documented free-tier requests per day (None = unknown)
    priority: int  # tie-breaker; lower = generally preferred
    referer: str = ""  # OpenRouter wants HTTP-Referer/X-Title for free tier
    extra_headers: Dict[str, str] = field(default_factory=dict, hash=False)

    @property
    def api_key(self) -> Optional[str]:
        """API key read from ``api_key_env``, or ``None`` when not usable.

        The value is looked up on every access. Surrounding whitespace is
        stripped, so an unset, empty, or whitespace-only variable yields
        ``None`` rather than a key that can never authenticate.
        """
        raw = os.environ.get(self.api_key_env) or ""
        return raw.strip() or None

    def model_for(self, tier: Tier) -> Optional[str]:
        """Return the model id mapped to ``tier``, or ``None`` if unmapped."""
        return self.models.get(tier)


# ── The registry (perpetually-free tiers only — no trial-credit providers) ──────
#
# Limits are the documented free-tier numbers at time of writing; they drift, so
# treat them as hints for the rate limiter rather than guarantees. Sources:
# github.com/cheahjs/free-llm-api-resources

# Entries are listed in ascending ``priority`` (10 → 50), i.e. most-preferred
# first. Each maps the "fast" and "smart" tiers to a concrete model id.
REGISTRY: List[Provider] = [
    Provider(
        name="groq",
        base_url="https://api.groq.com/openai/v1",
        api_key_env="GROQ_API_KEY",
        models={"fast": "llama-3.1-8b-instant", "smart": "llama-3.3-70b-versatile"},
        rpm=30,
        rpd=14_400,
        priority=10,  # fastest inference of the free tiers
    ),
    Provider(
        name="cerebras",
        base_url="https://api.cerebras.ai/v1",
        api_key_env="CEREBRAS_API_KEY",
        models={"fast": "llama3.1-8b", "smart": "llama-3.3-70b"},
        rpm=30,
        rpd=14_400,
        priority=20,
    ),
    Provider(
        name="google_ai_studio",
        # Google exposes an OpenAI-compatible shim under /v1beta/openai
        base_url="https://generativelanguage.googleapis.com/v1beta/openai",
        api_key_env="GOOGLE_AI_STUDIO_API_KEY",
        models={"fast": "gemini-2.0-flash-lite", "smart": "gemini-2.0-flash"},
        rpm=15,
        rpd=1_500,
        priority=30,  # generous token quota, strong quality
    ),
    Provider(
        name="mistral",
        base_url="https://api.mistral.ai/v1",
        api_key_env="MISTRAL_API_KEY",
        models={"fast": "open-mistral-nemo", "smart": "mistral-small-latest"},
        rpm=60,
        rpd=None,  # None = daily cap unknown
        priority=40,
    ),
    Provider(
        name="openrouter",
        base_url="https://openrouter.ai/api/v1",
        api_key_env="OPENROUTER_API_KEY",
        # ":free" suffixed models cost nothing on OpenRouter
        models={
            "fast": "meta-llama/llama-3.3-70b-instruct:free",
            "smart": "deepseek/deepseek-r1:free",
        },
        rpm=20,
        rpd=50,  # 1000/day if the account has ever topped up $10
        priority=50,  # widest model catalog, but tightest free request cap
        # NOTE(review): this is the URL of the limits source list, not of this
        # project — confirm it is the referer you want sent to OpenRouter.
        referer="https://github.com/cheahjs/free-llm-api-resources",
    ),
]


def available_providers() -> List[Provider]:
    """Return the REGISTRY entries whose ``api_key`` is currently truthy.

    The environment is consulted on every call (via ``Provider.api_key``), and
    registry order is preserved in the result.
    """
    usable: List[Provider] = []
    for candidate in REGISTRY:
        if candidate.api_key:
            usable.append(candidate)
    return usable
64 changes: 64 additions & 0 deletions free_llm_router/ratelimit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""
Per-provider rate limiting.

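Free tiers police two axes simultaneously: requests-per-minute (burst) and
requests-per-day (quota). This module handles them differently:

* RPM is enforced by a classic token bucket (smooth refill, allows short bursts).
* RPD is only counted: every granted token bumps a simple daily counter.
Comparing that counter against a provider's daily cap, and resetting it
out-of-band, is left to the caller.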

Async-safe. A subtle TOCTOU bug bit an earlier project: refilling tokens only
in ``acquire`` let two coroutines both see "1 token left" before either consumed.
Here refill happens under the same lock that does the consume, so check-and-take
is atomic.
"""

from __future__ import annotations

import asyncio


class TokenBucket:
    """Non-blocking async token bucket holding at most ``rpm`` tokens.

    Tokens are restored continuously at ``rpm / 60`` per second. Each granted
    token also increments a running tally exposed as ``day_count``; the bucket
    itself never compares that tally against any cap.
    """

    def __init__(self, rpm: int, *, monotonic) -> None:
        # The clock is injected (normally time.monotonic) so tests can drive time.
        effective_rpm = max(rpm, 1)  # a non-positive rpm is clamped to 1
        self._monotonic = monotonic
        self._capacity = float(effective_rpm)
        self._tokens = float(effective_rpm)  # the bucket starts full
        self._refill_per_sec = effective_rpm / 60.0
        self._day_count = 0
        self._last = monotonic()
        self._lock = asyncio.Lock()

    def _refill(self) -> None:
        """Credit tokens for the time elapsed since the last refill, capped at capacity."""
        current = self._monotonic()
        gap = current - self._last
        if gap > 0:
            topped_up = self._tokens + gap * self._refill_per_sec
            self._tokens = min(self._capacity, topped_up)
            self._last = current

    async def try_acquire(self) -> bool:
        """Consume one token if one is available; never waits for a refill.

        Returns ``True`` (and bumps ``day_count``) when a token was taken,
        ``False`` otherwise.
        """
        async with self._lock:
            # Refill under the same lock as the consume so check-and-take is atomic.
            self._refill()
            if self._tokens < 1.0:
                return False
            self._tokens -= 1.0
            self._day_count += 1
            return True

    async def seconds_until_token(self) -> float:
        """Seconds until a whole token is available; ``0.0`` if one already is."""
        async with self._lock:
            self._refill()
            if self._tokens < 1.0:
                return (1.0 - self._tokens) / self._refill_per_sec
            return 0.0

    @property
    def day_count(self) -> int:
        """Tokens granted since construction or the last ``reset_day()``."""
        return self._day_count

    def reset_day(self) -> None:
        """Zero the daily tally."""
        self._day_count = 0
Loading