Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
144c5b4
Add canonical engineering glossary (GLOSSARY.md) (#102)
scion-gteam[bot] May 31, 2026
198c424
P0-1: switch Postgres driver from lib/pq to pgx/v5 stdlib
Jun 1, 2026
c63c1bb
P0-2: add connection pool config to DatabaseConfig
Jun 1, 2026
1161069
P0-3/P0-4: CRUD-parity test harness + spec-driven fixture generator
Jun 1, 2026
01d126d
feat(ent): add 23 new Ent schemas for full table parity (P1-2 + P1-3)
Jun 1, 2026
f815897
P2: port notification + gcp/github/token domains to Ent entadapter
Jun 1, 2026
ca32619
P2: port schedule, maintenance, message domains to Ent entadapter
Jun 1, 2026
a5b83e3
feat(entadapter): port user + allowlist/invite domains to Ent (P2)
Jun 1, 2026
7f331f1
P2: port secret/env_var + template/harness_config domains to Ent
Jun 1, 2026
7c1b9d1
P2: port project/broker + brokersecret domains to Ent
Jun 1, 2026
59925ae
P2: port agent domain to Ent entadapter (XL)
Jun 1, 2026
cd651a7
chore(ent): regenerate Ent code for all 30 entity schemas
Jun 1, 2026
95e82cb
P2-collapse: collapse dual-DB into single Ent store
Jun 1, 2026
bcf83cf
P2-delete: remove raw-SQL store implementation
Jun 1, 2026
2191179
test: compile-migrate downstream suites to Ent store + fix signing-ke…
Jun 1, 2026
30d575b
test(hub): map non-UUID fixture IDs to UUIDs via tid() helper
Jun 1, 2026
84ac1c3
fix(store): seed maintenance ops in Migrate; initStore uses Migrate
Jun 1, 2026
64dc4d1
test(hub): satisfy Ent NotEmpty validators in fixtures
Jun 1, 2026
1736385
fix(entadapter): Get-by-id returns ErrNotFound for non-UUID identifiers
Jun 1, 2026
4f410f0
test(hub): fix store-less id wraps and project-route URL paths
Jun 1, 2026
94086ab
test(hub): unwrap projectIDFromServiceAccountEmail expectation
Jun 1, 2026
bf517d1
fix(ent): GCPServiceAccount.project_id is a string, not a UUID
Jun 1, 2026
9c20a56
test(hub): fix GCP SA project-id assertion and project-settings id
Jun 1, 2026
d389987
test(hub): revert tid() over-wraps in store-less events_test
Jun 1, 2026
ce646f3
test(hub): fix maintenance-run path and notifications agentId queries
Jun 1, 2026
04e2a45
test(hub): wrap remaining fixture IDs revealed after panic-cascade cl…
Jun 1, 2026
405c394
test(hub): unwrap tid() in scheduler_test (mock store, raw ids)
Jun 1, 2026
a027921
fix(ent): Template.harness may be empty (raw-store parity)
Jun 1, 2026
19de7cf
test(hub): wrap dynamic fixture IDs in wake/workspace/signing-key tests
Jun 1, 2026
85c0125
test(hub): convert raw-id URL path segments to tid()
Jun 1, 2026
7709924
fix(hub): seed creator users for agent-created agents; cascade-delete…
Jun 1, 2026
b0557d7
test(hub): seed broker slug/name in dispatcher and project_cache fixt…
Jun 1, 2026
d2a3e66
fix(entadapter): cascade-delete agents on project delete (raw-store p…
Jun 1, 2026
be00089
test(hub): MaxOpenConns=1 for SQLite test store (serialize writes); t…
Jun 1, 2026
2da26c4
test(hub): unwrap over-wrapped tid() in unit tests (workspace/logfilt…
Jun 1, 2026
aafcf1e
fix(ent): allow empty display_name (raw-store NOT NULL parity, email …
Jun 1, 2026
c9a3454
feat(migrate): add Migration β tool (Ent-SQLite → Ent-Postgres)
Jun 1, 2026
7ff0b6f
feat(concurrency): dialect-aware multi-replica primitives for Postgre…
Jun 1, 2026
2593665
feat(hub): widen events to EventPublisher interface + Postgres LISTEN…
Jun 1, 2026
802830e
test(store): parameterize store suites over {sqlite, postgres} (P3-2)
Jun 1, 2026
b49a014
fix(hub): harden Postgres event publish + verify wiring; lower PG poo…
Jun 2, 2026
7c65f54
test(store): add Postgres stress/integration suite (contention, isola…
Jun 2, 2026
27d0897
fix(db): recycle stale conns + keepalives; skip singleton tick on loc…
Jun 2, 2026
5d3184b
fix(store): bound advisory-lock conn checkout + unlock with short tim…
Jun 2, 2026
738c8b0
feat(migrate): in-process migration α (legacy raw-SQL hub.db → Ent)
Jun 2, 2026
bdcf5b6
fix(config): apply real Postgres pool size (leaked SQLite default of …
Jun 2, 2026
b0efe91
feat(hub): per-process instanceID on Server (B1-1)
Jun 3, 2026
7820f03
feat(schema): affinity columns on runtime_brokers (B1-2)
Jun 3, 2026
a8e75a4
feat(store): Claim/Release runtime-broker affinity CAS methods (B1-3)
Jun 3, 2026
28dd5c8
fix(hub): thread sessionID through connect + fix onDisconnect clobber…
Jun 3, 2026
c92f8dd
feat(schema): broker_dispatch intent table + messages dispatch-state …
Jun 3, 2026
9b7f908
feat(hub): PostgresCommandBus LISTEN/NOTIFY signal listener on scion_…
Jun 3, 2026
1f42d82
feat(store): BrokerDispatch store methods + message dispatch CAS (B2-3)
Jun 3, 2026
604429a
feat(hub): reconcile-on-connect drain wired to bus + markBrokerOnline…
Jun 3, 2026
54c6a40
feat(hub): route() decision in HybridBrokerClient (B3-1)
Jun 3, 2026
d8df0c6
feat(hub): cross-node message dispatch via route()+intent+signal+owne…
Jun 3, 2026
9ca8b8f
feat(hub): lifecycle dispatch (rolling-timeout wait + cross-node star…
Jun 3, 2026
cecaae2
feat(hub): wire originator-side cross-node lifecycle dispatch (B4-2 c…
Jun 3, 2026
530f9a1
fix(hub): make web session replica-portable to fix OAuth state_mismatch
Jun 3, 2026
40ca14c
feat(hub): cross-node delete + create-time data ops dispatch (B4-3, B…
Jun 3, 2026
883c0e7
feat(hub): stale-affinity + stuck-dispatch reaper singleton (B5-1)
Jun 3, 2026
1baca1d
feat(hub): pending-message sweep + dispatch metrics (B5-2)
Jun 3, 2026
032ff9c
fix(hub): derive JWT signing keys from shared SESSION_SECRET to fix c…
Jun 3, 2026
4f75aef
docs: project log for B5-3 chaos gate — GB5 PASSED (GA gate for broke…
Jun 3, 2026
46bb997
fix(hub): align fakeHTTPClient.CleanupProject with interface (3 param…
Jun 4, 2026
5743503
fix(hub): address PR #305 review feedback
ptone Jun 5, 2026
203a1a7
docs: add project log for PR #305 review feedback fixes
ptone Jun 5, 2026
f8914bd
fix(hub): prevent duplicate message delivery, guard dispatch state tr…
ptone Jun 5, 2026
ffe5373
fix(hub): reconcile broker→eventbus and hub-native→hub-managed rename…
ptone Jun 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .design/project-log/2026-06-03-b5-3-chaos-gate.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# B5-3 Chaos Gate — GB5 PASSED

**Date:** 2026-06-03
**Agent:** qa-agent
**Branch:** `postgres/wave-b-integration` @ `62186381`
**Gate:** GB5 (GA gate for broker dispatch)

## Result: PASS

All five chaos scenarios completed against a two-VM + CloudSQL topology.
Full results at `/scion-volumes/scratchpad/B5-3-CHAOS-GATE-RESULTS.md`.

| Scenario | Result |
|----------|--------|
| A: Kill owning hub mid-start | **PASS** — Hub B claimed and completed dispatch in 1.3s; no double-execution; `state=done, attempts=0` |
| B: Broker flap A→B | **PARTIAL/PASS** — Co-located topology prevents literal A→B flap; CAS claim, reconcile drain, and reaper all verified via equivalent tests |
| C: Pool saturation during PublishTx | **PASS** — Message dispatched 60ms post-creation despite external pool pressure; no corruption, no orphaned pending rows |
| D: Command-bus listener drop | **PASS** — Reconnected in ~280ms; cross-node dispatch succeeded immediately after |
| E: Reaper correctness | **PASS** — Stuck `in_progress` dispatch re-driven within 1 min of threshold; stale `connected_hub_id` cleared within 1 min of stale window |

## Key Timing Evidence (Scenario E)

- Hub killed: 22:48:02; last heartbeat: 22:49:38
- Dispatch requeued (`in_progress→pending`): 22:50:26 — within 1 min of `dispatchStuckAge`
- Affinity cleared: 22:53:26 — within 1 min of `affinityStaleAge` (180s from last heartbeat)

## Notes

- Scenario B limitation: in this deployment brokers are co-located with their hubs (same process). A cross-hub broker reconnect can't be manually induced. The mechanisms that handle it (CAS claim on reconnect, reconcile drain, reaper) were each independently verified.
- The `ConnectionMaxIdleTime` fix (from LIVE-RETEST-RESULTS.md §7) is not yet implemented. No stall was observed during chaos recovery — the command-bus reconnect was clean. Recommended as a follow-up hardening item, not a blocker.
- VMs left running `62186381` on Postgres (healthy) after gate.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Fix: cross-replica login loop (`session_expired`) after cookie-store fix

**Date:** 2026-06-03
**Branch:** postgres/wave-b-integration
**Symptom:** After OAuth login the dashboard flashes, then the browser is
redirected to `/login?error=session_expired&returnTo=/`, repeatedly.

## Background

Commit `0515e2a8` replaced the per-replica gorilla `FilesystemStore` with an
encrypted+signed `CookieStore` whose keys derive from the shared
`SESSION_SECRET`, so the whole web session (OAuth state + Hub JWTs) rides in the
client cookie and any replica can read it. That fixed the OAuth `state_mismatch`
and made the *session container* replica-portable.

## Root cause (one layer deeper)

The cookie is portable, but the **Hub JWT inside it is signed with a per-replica
key**. Signing keys are resolved by `ensureSigningKey()` scoped to
`(scope=hub, scope_id=hubID)`, and `hubID = sha256(hostname)[:12]`
(`DefaultHubID`). The integration deployment runs **two replicas of one logical
hub** behind a single LB (`multi.demo.scion-ai.dev`), sharing one Postgres DB
and one `SESSION_SECRET`, but with different hostnames:

| Replica | hub_id | user_signing_key fp |
|---|---|---|
| scion-integration | `ca39430276ee` | `9a35ae24cfeedba0` |
| scion-integration2 | `9662ebe99da4` | `97d3f30a36554d7a` |

So each replica minted/validated user JWTs with a *different* HS256 key. When a
post-login request landed on the replica that did **not** mint the token,
`ValidateUserToken` failed (`go-jose: error in cryptographic primitive`),
refresh failed too (the refresh token is signed with the same foreign key), and
`sessionToBearerMiddleware` declared the session "irrecoverably invalid",
**deleted the cookie** (`MaxAge=-1`) and returned `session_expired`. The cookie
deletion is what turns it into a loop. Logs show the same user alternating
between "User authenticated" and "Hub token irrecoverably invalid, clearing
session" depending on which replica served the request.

## Fix

Extend the `0515e2a8` philosophy from the cookie to the keys inside it: derive
the agent and user JWT signing keys deterministically from the shared
`SESSION_SECRET`.

- `ServerConfig.SharedSigningSecret` (new field).
- `ensureSigningKey()`: when `SharedSigningSecret != ""`, return
`deriveSharedSigningKey(secret, keyName)` (domain-separated by key name),
bypassing per-host secret-backend storage. Empty secret → unchanged per-hub
behavior (no regression for single-node/local dev).
- `cmd/server_foreground.go`: new `resolveSessionSecret()` helper feeds the same
value into both the web cookie store and `hubCfg.SharedSigningSecret`.

Now every replica with the same `SESSION_SECRET` agrees on the signing keys,
regardless of hostname/hubID — no operator coordination (matching HubID) needed.

## Tests

`pkg/hub/signing_key_shared_test.go`:
- derivation is deterministic, 32 bytes, domain-separated, secret-sensitive;
- two servers with **different hubID, same secret** derive identical keys and a
token minted on one validates on the other; a different secret cannot;
- an explicit pre-configured key still wins over derivation.

## Deploy note

Rolling out the new binary changes the signing keys (they now derive from
`SESSION_SECRET` instead of the stored per-host keys), so existing web sessions
and CLI tokens are invalidated **once** — users log in again, CLI/agents
re-auth. Both replicas already share `SESSION_SECRET`, so no config change is
required. (Faster stopgap without a rebuild: pin the same
`SCION_SERVER_HUB_HUBID` on both VMs to an existing hub ID so they share the
already-stored keys.)
32 changes: 32 additions & 0 deletions .design/project-log/2026-06-05-pr305-review-feedback.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# PR #305 Review Feedback — First Fix Round

**Date:** 2026-06-05
**PR:** #305 — feat(hub): multi-node broker dispatch
**Branch:** pr/broker-dispatch
**Commit:** c5f8b3c

## Summary

Addressed all 6 review comments from gemini-code-assist on PR #305.

### HIGH Priority Fixes

1. **server_migrate.go — nil-checked deferred close**: Changed `defer src.Close()` to a nil-checked closure so the source DB can be manually closed and set to nil before `dropSQLiteFile`, preventing Windows sharing violations.

2. **server_migrate.go — close before drop**: Added explicit `src.Close()` + `src = nil` before the `dropSQLiteFile` call in the `migrateDropSource` path.

3. **server_foreground.go — stale closure capture**: Moved `mgr := hubSrv.GetControlChannelManager()` inside the `ownsLocally` closure. Previously it was captured once at closure creation time, so if the manager was nil at that point but initialized later, `ownsLocally` would permanently return false.

### MEDIUM Priority Fixes

4. **server_migrate.go — file:// prefix handling**: Added a `file://` case before the `file:` case in `parseSQLiteSourceDSN` so that `file:///tmp/hub.db` correctly resolves to `/tmp/hub.db` instead of `//tmp/hub.db`.

5. **server_migrate_test.go — triple-slash test**: Added a test case verifying `file:///tmp/hub.db` is parsed correctly.

6. **server_test.go — subtest name sanitization**: Used `strings.ReplaceAll(t.Name(), "/", "_")` in `newTestStore` to prevent SQLite from interpreting subtest slashes as directory paths.

## Verification

- `gofmt` clean on all changed files
- `go vet ./cmd/` passes
- All relevant tests pass including the new `file_url_with_triple_slashes` test case
1 change: 1 addition & 0 deletions .scion/project-id
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
c7c7775e-e3a0-43de-9d26-274688d467d0
2 changes: 1 addition & 1 deletion GLOSSARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ The three run modes at a glance — distinguish them by whether a server runs an
| Mode | Server | Tenancy | State & isolation | Canonical use |
|------|--------|---------|-------------------|----------------|
| **Local mode** | None | Single user | Local machine; isolation via git worktrees | Agents launched directly via the `scion` CLI, no server |
| **Workstation mode** | Combo server (Hub + Runtime Broker + Web) on loopback | Single-tenant | Local machine; single-tenant state | The hosted experience locally, on your own machine |
| **Workstation mode** | Combo server (Hub + Runtime Broker + Web) on loopback | Single-tenant | That machine | The hosted experience locally, on your own machine |
| **Hosted mode** | Multi-user server deployment | Multi-user | Hub-coordinated across brokers | Coordinating state across users, projects, and runtime brokers |

**Local mode**:
Expand Down
7 changes: 7 additions & 0 deletions agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,13 @@ All icons in the web frontend use the Shoelace `<sl-icon>` component (Bootstrap

> **Canonical engineering glossary:** See [`GLOSSARY.md`](./GLOSSARY.md) at the repo root for the canonical, opinionated terminology used throughout the codebase — the preferred term for each concept and the synonyms to avoid. Prefer these terms in new code, comments, and docs.

These terms may be used as shorthand in prompts:

- **hub-broker, combo server** Refers to running the server command with both the hub function and the broker function in the same invocation.
- **hub-native, hub-project** A special variant of a project/project space that is created on a hub server for use by agents dispatched from clients. These live in `~/.scion/projects/<hub-project-name>` on any broker that is a provider to the hub project. This is in contrast to the arbitrary local path on a broker for a linked project.
- **agent-home** The directory that gets mounted as the home folder of the container user in the agent container.
- **linked-project** A project and project folder that pre-existed on a broker machine and is linked as a hub resource project for visibility, metadata, and agent management across other brokers that may have such a linked project. The link may be based on name or git URI.

## Project use of the scion cli itself
Do not commit changes in the project's own `.scion` folder to git as part of committing progress on code and docs. These are managed and committed manually when template defaults are intentionally updated.

Expand Down
27 changes: 27 additions & 0 deletions cmd/server_foreground.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,10 @@ func runServerStart(cmd *cobra.Command, args []string) error {
log.Fatalf("Hub server failed to start: %v", hubInitErr)
}

// Wire command bus for cross-node dispatch (B2-4).
cmdBus := newCommandBus(ctx, cfg, hubSrv)
hubSrv.SetCommandBus(cmdBus)

if !enableWeb {
// Hub runs its own HTTP server (standalone mode).
eventPub := newEventPublisher(ctx, cfg)
Expand Down Expand Up @@ -1083,6 +1087,29 @@ func newEventPublisher(ctx context.Context, cfg *config.GlobalConfig) hub.EventP
return hub.NewChannelEventPublisher()
}

// newCommandBus picks the command bus implementation for this process.
//
// For any database driver other than Postgres it hands back a no-op bus
// (single-process SQLite always owns all brokers locally). For Postgres it
// starts a PostgresCommandBus (LISTEN/NOTIFY on scion_broker_cmd). If that
// bus cannot be started, a warning is logged and the no-op bus is returned
// instead, so the server still comes up but cross-replica dispatch signals
// are not delivered.
func newCommandBus(ctx context.Context, cfg *config.GlobalConfig, hubSrv *hub.Server) hub.CommandBus {
	usingPostgres := strings.EqualFold(cfg.Database.Driver, "postgres")
	if !usingPostgres {
		return hub.NoopCommandBus{}
	}

	// The control-channel manager is fetched on every call rather than
	// captured once, so a manager that only becomes available after this
	// closure is created is still observed.
	isLocalOwner := func(brokerID string) bool {
		if manager := hubSrv.GetControlChannelManager(); manager != nil {
			return manager.IsConnected(brokerID)
		}
		return false
	}

	pgBus, startErr := hub.NewPostgresCommandBus(
		ctx,
		cfg.Database.URL,
		isLocalOwner,
		hubSrv.ReconcileBroker,
		logging.Subsystem("hub.commandbus"),
	)
	if startErr != nil {
		log.Printf("WARNING: failed to start Postgres command bus (%v); falling back to no-op. Cross-replica dispatch signals will not work.", startErr)
		return hub.NoopCommandBus{}
	}

	log.Printf("Using Postgres command bus on channel scion_broker_cmd")
	return pgBus
}

// initWebServer creates and configures the Web server. The provided context is
// threaded to the event publisher so that the Postgres LISTEN/NOTIFY goroutine
// is cancelled cleanly on shutdown, preventing connection leaks.
Expand Down
9 changes: 7 additions & 2 deletions cmd/server_migrate.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ func runServerMigrate(cmd *cobra.Command, _ []string) error {
if err != nil {
return fmt.Errorf("opening source sqlite: %w", err)
}
defer src.Close()
defer func() {
if src != nil {
_ = src.Close()
}
}()

fmt.Fprintln(out, "Opening destination PostgreSQL")
dst, err := entc.OpenPostgres(dstDSN, entc.PoolConfig{MaxOpenConns: 10, MaxIdleConns: 5})
Expand Down Expand Up @@ -133,6 +137,8 @@ func runServerMigrate(cmd *cobra.Command, _ []string) error {
len(report.Entities), total, report.ChildGroupEdgs)

if migrateDropSource {
_ = src.Close()
src = nil
fmt.Fprintf(out, "Dropping source SQLite file: %s\n", srcPath)
if err := dropSQLiteFile(srcPath); err != nil {
return fmt.Errorf("dropping source: %w", err)
Expand Down Expand Up @@ -164,7 +170,6 @@ func parseSQLiteSourceDSN(raw string) (dsn, path string, err error) {
path = strings.TrimPrefix(raw, "sqlite:")
case strings.HasPrefix(raw, "file://"):
path = strings.TrimPrefix(raw, "file://")
// file:///abs -> "/abs"; the third slash begins the absolute path.
if i := strings.IndexByte(path, '?'); i >= 0 {
path = path[:i]
}
Expand Down
20 changes: 4 additions & 16 deletions cmd/server_migrate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,14 @@ func TestParseSQLiteSourceDSN(t *testing.T) {
wantPath: "/tmp/hub.db",
},
{
name: "file triple-slash absolute url",
in: "file:///var/lib/scion/hub.db",
wantDSN: "file:/var/lib/scion/hub.db?cache=shared",
wantPath: "/var/lib/scion/hub.db",
},
{
name: "file double-slash relative url",
in: "file://data/hub.db",
wantDSN: "file:data/hub.db?cache=shared",
wantPath: "data/hub.db",
},
{
name: "file triple-slash with query",
in: "file:///tmp/hub.db?mode=ro",
name: "file url with query",
in: "file:/tmp/hub.db?cache=shared",
wantDSN: "file:/tmp/hub.db?cache=shared",
wantPath: "/tmp/hub.db",
},
{
name: "file url with query",
in: "file:/tmp/hub.db?cache=shared",
name: "file url with triple slashes",
in: "file:///tmp/hub.db",
wantDSN: "file:/tmp/hub.db?cache=shared",
wantPath: "/tmp/hub.db",
},
Expand Down
4 changes: 3 additions & 1 deletion cmd/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package cmd

import (
"context"
"strings"
"testing"

"github.com/GoogleCloudPlatform/scion/pkg/config"
Expand All @@ -30,7 +31,8 @@ import (

func newTestStore(t *testing.T) store.Store {
t.Helper()
client, err := entc.OpenSQLite("file:"+t.Name()+"?mode=memory&cache=shared", entc.PoolConfig{})
dbName := strings.ReplaceAll(t.Name(), "/", "_")
client, err := entc.OpenSQLite("file:"+dbName+"?mode=memory&cache=shared", entc.PoolConfig{})
require.NoError(t, err)
require.NoError(t, entc.AutoMigrate(context.Background(), client))
s := entadapter.NewCompositeStore(client)
Expand Down
Loading