Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .design/project-log/pr321-review-feedback.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# PR #321 Review Feedback — Multi-Node Session Fixes

**Date:** 2026-06-06
**PR:** GoogleCloudPlatform/scion#321
**Branch:** postgres/delta-fixes
**Commit:** a1e715f

## Summary

Addressed 3 review comments from Gemini Code Assist on the multi-node session fixes PR.

## Changes

### 1. HIGH: SharedSigningSecret bypasses storage (pkg/hub/server.go)

**Problem:** When `SharedSigningSecret` is configured, `ensureSigningKey` derives keys deterministically but returns immediately without persisting them to the secret backend. External consumers (e.g., `scion-chat-app`) rely on label-based auto-discovery from GCP Secret Manager to find signing keys.

**Fix:** After deriving the key, call `syncSigningKeyToBackend()` to persist the derived key to the secret backend. This is a best-effort sync (warning on failure, non-fatal) since the key can always be re-derived. The sync uses the existing `syncSigningKeyToBackend` function which handles both the backend Set and the SQLite backup.

### 2. MEDIUM: Missing session secret warning in hosted mode (cmd/server_foreground.go)

**Problem:** In hosted mode, running without a session secret means each replica generates its own ephemeral key, completely breaking cross-replica sessions.

**Fix:** Added a `log.Println("WARNING: ...")` at startup when `hostedMode && hubCfg.SharedSigningSecret == ""`. Chose a warning over a hard failure to avoid breaking existing single-node hosted deployments that may not have configured a session secret yet.

### 3. MEDIUM: Nil guard in test (pkg/hub/web_test.go)

**Problem:** `TestSessionStore_DifferentSecretCannotDecode` accessed `sessC.Values` without checking `sessC != nil`.

**Fix:** Added `require.NotNil(t, sessC, ...)` before the `Values` access.

## Observations

- The `make ci` target shows pre-existing vet errors in `command_bus_test.go` (undefined `recExec`) and `broker_affinity_test.go` (undefined `newBroker`). These are cross-file test helper references that work with `go test` but fail `go vet` individually. Not related to this PR.
- The repo has many `gofmt` alignment diffs from the grove-to-project rename. These show up in `git diff` but were not included in this commit.
5 changes: 5 additions & 0 deletions cmd/sciontool/commands/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,11 @@ func handleLimitsExceeded(sup *supervisor.Supervisor, limitType, message string)
func handleAuthReset(hubClient *hub.Client, tokenRefreshCancel *context.CancelFunc, tokenRefreshDone *<-chan struct{}, statusHandler *handlers.StatusHandler, targetUID, targetGID int) {
log.TaggedInfo("AUTH_RESET", "Received SIGUSR2: auth reset requested")

if hubClient == nil {
log.Error("AUTH_RESET: Hub client is not configured, cannot reset auth")
return
}

newToken := hub.ReadTokenFile()
if newToken == "" {
log.Error("AUTH_RESET: Token file is empty after SIGUSR2, cannot reset auth")
Expand Down
35 changes: 29 additions & 6 deletions cmd/server_foreground.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ import (
"syscall"
"time"

"github.com/google/uuid"

"github.com/GoogleCloudPlatform/scion/pkg/agent"
"github.com/GoogleCloudPlatform/scion/pkg/api"
"github.com/GoogleCloudPlatform/scion/pkg/apiclient"
Expand Down Expand Up @@ -328,7 +330,11 @@ func runServerStart(cmd *cobra.Command, args []string) error {
// can authenticate back to the Hub API. Self-managed plugins
// handle their own credential lifecycle.
if !pluginMgr.IsSelfManaged(scionplugin.PluginTypeBroker, bt) && hubSrv != nil && s != nil {
brokerID := "plugin-broker-" + bt
// Use the same deterministic UUIDv5 as the α migration so the
// broker entity created here matches the migrated ID.
pluginBrokerNS := uuid.MustParse("5c104390-a1d0-5e9a-9b1e-5c104390a1d0")
legacyID := "plugin-broker-" + bt
brokerID := uuid.NewSHA1(pluginBrokerNS, []byte(legacyID)).String()
if authSvc := hubSrv.GetBrokerAuthService(); authSvc != nil {
// Ensure the runtime broker entity exists (required by
// the broker_secrets foreign key constraint).
Expand Down Expand Up @@ -861,13 +867,21 @@ func parseAdminEmails(cfg *config.GlobalConfig) []string {

// resolveSessionSecret resolves the deployment-wide session secret from the
// --session-secret flag, falling back to the SCION_SERVER_SESSION_SECRET env
// var. The same value backs both the web session cookie store and the hub JWT
// signing keys so that all replicas behind the load balancer agree.
// var (then SESSION_SECRET for compatibility). The same value backs both the
// web session cookie store and the hub JWT signing keys so that all replicas
// behind the load balancer agree.
func resolveSessionSecret() string {
if webSessionSecret != "" {
return webSessionSecret
secret := webSessionSecret
if secret == "" {
secret = os.Getenv("SCION_SERVER_SESSION_SECRET")
}
if secret == "" {
secret = os.Getenv("SESSION_SECRET")
}
if secret == "" && hostedMode {
slog.Warn("No session secret configured in hosted mode! Replicas will not be able to share sessions or agree on JWT signing keys, leading to login loops.")
}
return os.Getenv("SCION_SERVER_SESSION_SECRET")
return secret
}

// initHubServer creates and configures the Hub server.
Expand Down Expand Up @@ -949,6 +963,15 @@ func initHubServer(ctx context.Context, cfg *config.GlobalConfig, s store.Store,
SharedSigningSecret: resolveSessionSecret(),
}

// In hosted mode every replica must share the same session secret for
// cookies and JWT signing keys to work across the load balancer. Running
// without one means each replica generates its own ephemeral key, which
// breaks session persistence and causes login loops.
if hostedMode && hubCfg.SharedSigningSecret == "" {
log.Println("WARNING: hosted mode is enabled but no session secret is configured. " +
"Set --session-secret or SCION_SERVER_SESSION_SECRET to avoid cross-replica session failures.")
}

// Construct proxy authenticator when auth mode is "proxy"
if cfg.Auth.Mode == "proxy" && cfg.Auth.Proxy != nil {
switch cfg.Auth.Proxy.Provider {
Expand Down
26 changes: 13 additions & 13 deletions pkg/config/settings_v1.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,13 @@ type V1ServerConfig struct {
// Mode selects the server operating mode: "workstation" (default) or "hosted".
// When set to "hosted", the server behaves as if --hosted were passed.
// The legacy value "production" is also accepted for backward compatibility.
Mode string `json:"mode,omitempty" yaml:"mode,omitempty" koanf:"mode"`
Env string `json:"env,omitempty" yaml:"env,omitempty" koanf:"env"`
Hub *V1ServerHubConfig `json:"hub,omitempty" yaml:"hub,omitempty" koanf:"hub"`
Broker *V1BrokerConfig `json:"broker,omitempty" yaml:"broker,omitempty" koanf:"broker"`
Database *V1DatabaseConfig `json:"database,omitempty" yaml:"database,omitempty" koanf:"database"`
Auth *V1AuthConfig `json:"auth,omitempty" yaml:"auth,omitempty" koanf:"auth"`
OAuth *V1OAuthConfig `json:"oauth,omitempty" yaml:"oauth,omitempty" koanf:"oauth"`
Mode string `json:"mode,omitempty" yaml:"mode,omitempty" koanf:"mode"`
Env string `json:"env,omitempty" yaml:"env,omitempty" koanf:"env"`
Hub *V1ServerHubConfig `json:"hub,omitempty" yaml:"hub,omitempty" koanf:"hub"`
Broker *V1BrokerConfig `json:"broker,omitempty" yaml:"broker,omitempty" koanf:"broker"`
Database *V1DatabaseConfig `json:"database,omitempty" yaml:"database,omitempty" koanf:"database"`
Auth *V1AuthConfig `json:"auth,omitempty" yaml:"auth,omitempty" koanf:"auth"`
OAuth *V1OAuthConfig `json:"oauth,omitempty" yaml:"oauth,omitempty" koanf:"oauth"`
Storage *V1StorageConfig `json:"storage,omitempty" yaml:"storage,omitempty" koanf:"storage"`
WorkspaceStorage *V1WorkspaceStorageConfig `json:"workspace_storage,omitempty" yaml:"workspace_storage,omitempty" koanf:"workspace_storage"`
Secrets *V1SecretsConfig `json:"secrets,omitempty" yaml:"secrets,omitempty" koanf:"secrets"`
Expand Down Expand Up @@ -388,12 +388,12 @@ type V1DatabaseConfig struct {
type V1AuthConfig struct {
// Mode selects the exclusive human auth mode: "oauth" (default), "proxy", or "dev".
// In proxy mode, OAuth handlers are disabled; in dev mode, dev token auth is used.
Mode string `json:"mode,omitempty" yaml:"mode,omitempty" koanf:"mode"`
DevMode bool `json:"dev_mode,omitempty" yaml:"dev_mode,omitempty" koanf:"dev_mode"`
DevToken string `json:"dev_token,omitempty" yaml:"dev_token,omitempty" koanf:"dev_token"`
DevTokenFile string `json:"dev_token_file,omitempty" yaml:"dev_token_file,omitempty" koanf:"dev_token_file"`
AuthorizedDomains []string `json:"authorized_domains,omitempty" yaml:"authorized_domains,omitempty" koanf:"authorized_domains"`
UserAccessMode string `json:"user_access_mode,omitempty" yaml:"user_access_mode,omitempty" koanf:"user_access_mode"`
Mode string `json:"mode,omitempty" yaml:"mode,omitempty" koanf:"mode"`
DevMode bool `json:"dev_mode,omitempty" yaml:"dev_mode,omitempty" koanf:"dev_mode"`
DevToken string `json:"dev_token,omitempty" yaml:"dev_token,omitempty" koanf:"dev_token"`
DevTokenFile string `json:"dev_token_file,omitempty" yaml:"dev_token_file,omitempty" koanf:"dev_token_file"`
AuthorizedDomains []string `json:"authorized_domains,omitempty" yaml:"authorized_domains,omitempty" koanf:"authorized_domains"`
UserAccessMode string `json:"user_access_mode,omitempty" yaml:"user_access_mode,omitempty" koanf:"user_access_mode"`
Proxy *V1ProxyConfig `json:"proxy,omitempty" yaml:"proxy,omitempty" koanf:"proxy"`
Transport *V1TransportConfig `json:"transport,omitempty" yaml:"transport,omitempty" koanf:"transport"`
}
Expand Down
30 changes: 24 additions & 6 deletions pkg/hub/admin_reset_auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,34 @@ func (s *Server) handleAdminResetAuthAll(w http.ResponseWriter, r *http.Request)
Error string `json:"error,omitempty"`
}

var succeeded []agentResult
var failed []agentResult
// Dispatch concurrently with a bounded worker pool to avoid timeouts
// when many agents are running across slow or unreachable brokers.
results := make(chan agentResult, len(agents.Items))
sem := make(chan struct{}, 20)

for _, agent := range agents.Items {
a := agent
if err := s.dispatcher.DispatchAgentResetAuth(ctx, &a); err != nil {
slog.Error("Bulk reset-auth failed for agent", "agent_id", a.ID, "error", err)
failed = append(failed, agentResult{ID: a.ID, Name: a.Name, Error: err.Error()})
go func() {
sem <- struct{}{}
defer func() { <-sem }()

res := agentResult{ID: a.ID, Name: a.Name}
if err := s.dispatcher.DispatchAgentResetAuth(ctx, &a); err != nil {
slog.Error("Bulk reset-auth failed for agent", "agent_id", a.ID, "error", err)
res.Error = err.Error()
}
results <- res
}()
}

var succeeded []agentResult
var failed []agentResult
for range agents.Items {
res := <-results
if res.Error != "" {
failed = append(failed, res)
} else {
succeeded = append(succeeded, agentResult{ID: a.ID, Name: a.Name})
succeeded = append(succeeded, res)
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/hub/command_bus.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,5 +333,5 @@ type NoopCommandBus struct{}
var _ CommandBus = NoopCommandBus{}

func (NoopCommandBus) NotifyBrokerCmd(context.Context, pgExecutor, string) error { return nil }
func (NoopCommandBus) SignalBrokerCmd(context.Context, string) error { return nil }
func (NoopCommandBus) SignalBrokerCmd(context.Context, string) error { return nil }
func (NoopCommandBus) Close() {}
3 changes: 2 additions & 1 deletion pkg/hub/command_bus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !no_sqlite

package hub

import (
Expand Down Expand Up @@ -473,4 +475,3 @@ func TestCommandBusIntegration_CloseIsIdempotent(t *testing.T) {
bus.Close()
bus.Close() // must not panic
}

30 changes: 16 additions & 14 deletions pkg/hub/dispatch_exec_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,22 @@ import (
// lifecycleTestDispatcher captures which lifecycle op was called and with
// what args, so we can verify executeDispatch routes correctly.
type lifecycleTestDispatcher struct {
startCalled atomic.Int32
stopCalled atomic.Int32
restartCalled atomic.Int32
deleteCalled atomic.Int32
checkPromptCalled atomic.Int32
finalizeEnvCalled atomic.Int32
createCalled atomic.Int32
lastTask string
checkPromptResult bool
lastDeleteFiles bool
lastFinalizeEnv map[string]string
}

func (d *lifecycleTestDispatcher) DispatchAgentCreate(context.Context, *store.Agent) error { return nil }
startCalled atomic.Int32
stopCalled atomic.Int32
restartCalled atomic.Int32
deleteCalled atomic.Int32
checkPromptCalled atomic.Int32
finalizeEnvCalled atomic.Int32
createCalled atomic.Int32
lastTask string
checkPromptResult bool
lastDeleteFiles bool
lastFinalizeEnv map[string]string
}

func (d *lifecycleTestDispatcher) DispatchAgentCreate(context.Context, *store.Agent) error {
return nil
}
func (d *lifecycleTestDispatcher) DispatchAgentProvision(context.Context, *store.Agent) error {
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/hub/events_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func (noopDispatcher) DispatchAgentProvision(_ context.Context, _ *store.Agent)
func (noopDispatcher) DispatchAgentStart(_ context.Context, _ *store.Agent, _ string) error {
return nil
}
func (noopDispatcher) DispatchAgentStop(_ context.Context, _ *store.Agent) error { return nil }
func (noopDispatcher) DispatchAgentStop(_ context.Context, _ *store.Agent) error { return nil }
func (noopDispatcher) DispatchAgentRestart(_ context.Context, _ *store.Agent) error { return nil }
func (noopDispatcher) DispatchAgentResetAuth(_ context.Context, _ *store.Agent) error { return nil }
func (noopDispatcher) DispatchAgentDelete(_ context.Context, _ *store.Agent, _, _, _ bool, _ time.Time) error {
Expand Down
7 changes: 4 additions & 3 deletions pkg/hub/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -2578,9 +2578,9 @@ type GroupMessageRecipientResult struct {

// GroupMessageResponse is the JSON response for a set[] message delivery.
type GroupMessageResponse struct {
GroupID string `json:"group_id"`
Delivered int `json:"delivered"`
Failed int `json:"failed"`
GroupID string `json:"group_id"`
Delivered int `json:"delivered"`
Failed int `json:"failed"`
Results []GroupMessageRecipientResult `json:"results"`
}

Expand Down Expand Up @@ -9450,6 +9450,7 @@ func (s *Server) handleProjectImportTemplates(w http.ResponseWriter, r *http.Req
Count: len(imported),
})
}

// ImportResourcesRequest is the body for the unified import endpoint
// (POST /api/v1/resources/import). It imports a single kind of resource from a
// remote source URL into the given scope.
Expand Down
18 changes: 9 additions & 9 deletions pkg/hub/httpdispatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ import (
"github.com/GoogleCloudPlatform/scion/pkg/observability/dispatchmetrics"
"github.com/GoogleCloudPlatform/scion/pkg/secret"
"github.com/GoogleCloudPlatform/scion/pkg/store"
"go.opentelemetry.io/otel/attribute"
"github.com/go-jose/go-jose/v4/jwt"
"github.com/google/uuid"
"go.opentelemetry.io/otel/attribute"
)

// HTTPRuntimeBrokerClient is an HTTP-based implementation of RuntimeBrokerClient.
Expand Down Expand Up @@ -130,14 +130,14 @@ type GitHubAppTokenMinter interface {
// It looks up the runtime broker endpoint from the store and uses HTTPRuntimeBrokerClient
// to make the actual API calls.
type HTTPAgentDispatcher struct {
store store.Store
client RuntimeBrokerClient
tokenGenerator AgentTokenGenerator
secretBackend secret.SecretBackend
authzService *AuthzService // Optional authz service for progeny secret verification
githubAppMinter GitHubAppTokenMinter // Optional GitHub App token minter
hubEndpoint string // Hub endpoint URL for agents to call back
hubID string // Hub instance ID for hub-scoped queries
store store.Store
client RuntimeBrokerClient
tokenGenerator AgentTokenGenerator
secretBackend secret.SecretBackend
authzService *AuthzService // Optional authz service for progeny secret verification
githubAppMinter GitHubAppTokenMinter // Optional GitHub App token minter
hubEndpoint string // Hub endpoint URL for agents to call back
hubID string // Hub instance ID for hub-scoped queries
devAuthToken string // Dev auth token to inject into agent env (dev-auth mode only)
transportMinter TransportTokenMinter // Optional transport token minter for OIDC dispatch
transportAudience string // OIDC audience for transport tokens
Expand Down
2 changes: 1 addition & 1 deletion pkg/hub/messagebroker.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ type MessageBrokerProxy struct {
mu sync.Mutex
subscriptions map[string][]eventbus.Subscription // projectID -> active subscriptions
pluginSubscriptions map[string]eventbus.Subscription // pattern -> plugin-initiated subscription
subscribedTopics map[string]bool // dedup guard for project-level subscriptions
subscribedTopics map[string]bool // dedup guard for project-level subscriptions
stopCh chan struct{}
stopOnce sync.Once
wg sync.WaitGroup
Expand Down
2 changes: 1 addition & 1 deletion pkg/hub/notifications.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (
type NotificationDispatcher struct {
store store.Store
events EventPublisher
getDispatcher func() AgentDispatcher // lazy getter; dispatcher may be set after startup
getDispatcher func() AgentDispatcher // lazy getter; dispatcher may be set after startup
signalDeferred func(ctx context.Context, brokerID, agentID string) // NOTIFY wakeup for deferred messages
log *slog.Logger
messageLog *slog.Logger // dedicated message audit logger (nil = disabled)
Expand Down
6 changes: 3 additions & 3 deletions pkg/hub/proxyauth.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,9 @@ type jwksCache struct {

mu sync.RWMutex
keys map[string]jose.JSONWebKey // kid -> key
lastFetched time.Time // last successful fetch
lastAttempted time.Time // last fetch attempt (success or failure), for stampede prevention
refreshing bool // true while a refresh is in-flight
lastFetched time.Time // last successful fetch
lastAttempted time.Time // last fetch attempt (success or failure), for stampede prevention
refreshing bool // true while a refresh is in-flight
}

// GetKey returns the public key for the given kid. If the kid is not found
Expand Down
4 changes: 2 additions & 2 deletions pkg/hub/reaper.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ import (
// gives the rolling-timeout wait ample time to fail organically before the
// reaper force-transitions the row.
const (
affinityStaleAge = 2 * defaultAffinityFreshness // 180s
dispatchStuckAge = 3 * dispatchRollingTimeout // 270s
affinityStaleAge = 2 * defaultAffinityFreshness // 180s
dispatchStuckAge = 3 * dispatchRollingTimeout // 270s
dispatchMaxRetries = 3
)

Expand Down
Loading