diff --git a/internal/mcp/tools.go b/internal/mcp/tools.go index 6ab777d66..c537dd460 100644 --- a/internal/mcp/tools.go +++ b/internal/mcp/tools.go @@ -62,6 +62,19 @@ func registerTools(server *mcp.Server) { Annotations: readOnly, }, logToolCall("get_topology", handleGetTopology)) + mcp.AddTool(server, &mcp.Tool{ + Name: "get_neighborhood", + Description: "Get the BFS-expanded neighborhood of a specific resource — the slice " + + "of the topology graph immediately relevant to one root. Cheaper and more " + + "focused than get_topology when you already know which resource you care " + + "about. Profile is 'auto' (default — picks a bounded edge set from the root " + + "kind) or 'all' (every edge type). Hops controls BFS depth (default 1, max " + + "2). Nodes are RBAC-filtered against the caller; dropped neighbors are " + + "listed in `omitted` with reason=rbac_denied. If max_nodes is exceeded " + + "mid-expansion, truncated=true is set and a partial subgraph is returned.", + Annotations: readOnly, + }, logToolCall("get_neighborhood", handleGetNeighborhood)) + mcp.AddTool(server, &mcp.Tool{ Name: "get_events", Description: "Get recent Kubernetes warning events, deduplicated and sorted by recency. " + @@ -831,50 +844,16 @@ func handleGetTopology(ctx context.Context, req *mcp.CallToolRequest, input topo return toJSONResult(topo) } -// clusterScopedTopologyKinds maps topology NodeKinds for cluster-scoped -// resources to the (group, resource) tuple a SAR needs. Topology pulls -// these from the SA-populated cache regardless of the caller's namespace -// scope, so callers without per-kind RBAC must have them stripped. -// -// This is a denylist: it must enumerate every cluster-scoped kind the -// topology builder creates. Drift here = silent leak. See the checklist -// comment on NodeKind in pkg/topology/types.go and the planned scope- -// driven follow-up that removes the central table. -// -// KindNamespace is intentionally excluded — handled by per-user filter -// upstream. KindNodeClass has multiple entries (one per cloud provider) -// because the topology builder iterates EC2NodeClass / AKSNodeClass / -// GCPNodeClass under the same NodeKind label; a denial on any provider -// strips all NodeClass nodes. canReadClusterScopedKind's unknown-kind -// passthrough makes providers absent from the cluster's discovery -// non-blocking. -var clusterScopedTopologyKinds = []struct { - kind topology.NodeKind - group string - resource string -}{ - {topology.KindNode, "", "nodes"}, - {topology.KindNodePool, "karpenter.sh", "nodepools"}, - {topology.KindNodeClaim, "karpenter.sh", "nodeclaims"}, - {topology.KindNodeClass, "karpenter.k8s.aws", "ec2nodeclasses"}, - {topology.KindNodeClass, "karpenter.azure.com", "aksnodeclasses"}, - {topology.KindNodeClass, "karpenter.k8s.gcp", "gcpnodeclasses"}, - {topology.KindGatewayClass, "gateway.networking.k8s.io", "gatewayclasses"}, - {topology.KindPV, "", "persistentvolumes"}, - {topology.KindStorageClass, "storage.k8s.io", "storageclasses"}, - {topology.KindCiliumClusterwideNetworkPolicy, "cilium.io", "ciliumclusterwidenetworkpolicies"}, - {topology.KindClusterNetworkPolicy, "policy.networking.k8s.io", "clusternetworkpolicies"}, -} - // deniedClusterScopedTopoKinds returns the set of cluster-scoped topology -// NodeKinds the calling user cannot list. Reuses canReadClusterScopedKind's -// per-user canI cache so subsequent topology calls within the same TTL -// don't re-SAR. +// NodeKinds the calling user cannot list. Walks topology.ClusterScopedKinds +// (centralized table — see pkg/topology/cluster_scoped_kinds.go). Reuses +// canReadClusterScopedKind's per-user canI cache so subsequent topology +// calls within the same TTL don't re-SAR. func deniedClusterScopedTopoKinds(ctx context.Context) map[topology.NodeKind]bool { deny := make(map[topology.NodeKind]bool) - for _, ck := range clusterScopedTopologyKinds { - if !canReadClusterScopedKind(ctx, ck.resource, ck.group, "list") { - deny[ck.kind] = true + for _, ck := range topology.ClusterScopedKinds { + if !canReadClusterScopedKind(ctx, ck.Resource, ck.Group, "list") { + deny[ck.Kind] = true } } return deny diff --git a/internal/mcp/tools_neighborhood.go b/internal/mcp/tools_neighborhood.go new file mode 100644 index 000000000..a913267a5 --- /dev/null +++ b/internal/mcp/tools_neighborhood.go @@ -0,0 +1,278 @@ +package mcp + +import ( + "context" + "fmt" + "strings" + + "github.com/modelcontextprotocol/go-sdk/mcp" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// Neighborhood tool input. +type getNeighborhoodInput struct { + Kind string `json:"kind" jsonschema:"resource kind: pod, deployment, service, application, etc."` + Group string `json:"group,omitempty" jsonschema:"API group required to disambiguate kinds that collide across groups. Examples: serving.knative.dev for KNative Service (vs core/v1 Service), cluster.x-k8s.io for CAPI Cluster (vs CNPG Cluster), networking.istio.io for Istio Gateway (vs gateway.networking.k8s.io Gateway). Omit for kinds with no known collisions."` + Namespace string `json:"namespace,omitempty" jsonschema:"resource namespace; omit for cluster-scoped kinds"` + Name string `json:"name" jsonschema:"resource name"` + Profile string `json:"profile,omitempty" jsonschema:"neighborhood breadth: auto or all. Default: auto (picks a bounded edge set from the root kind)."` + Hops int `json:"hops,omitempty" jsonschema:"BFS depth. Default 1, max 2."` + MaxNodes int `json:"max_nodes,omitempty" jsonschema:"node-budget cap. Default 25. When the cap is hit mid-expansion, truncated=true is set and the partial subgraph is returned."` +} + +// neighborhoodResult is the MCP wire shape. Matches the REST envelope so +// agents that consume both surfaces parse identically. +type neighborhoodResult struct { + Root topology.ResourceRef `json:"root"` + Subgraph neighborhoodSubgraphMCP `json:"subgraph"` + Truncated bool `json:"truncated"` + Omitted []resourcecontext.OmittedField `json:"omitted,omitempty"` +} + +type neighborhoodSubgraphMCP struct { + Nodes []topology.Node `json:"nodes"` + Edges []topology.Edge `json:"edges"` +} + +func handleGetNeighborhood(ctx context.Context, req *mcp.CallToolRequest, input getNeighborhoodInput) (*mcp.CallToolResult, any, error) { + cache := k8s.GetResourceCache() + if cache == nil { + return nil, nil, fmt.Errorf("not connected to cluster") + } + if input.Kind == "" || input.Name == "" { + return nil, nil, fmt.Errorf("kind and name are required") + } + + // RBAC for the root. Topology pseudo-kinds (NodeClass, NodePool, NodeClaim, + // …) FIRST: ClassifyKindScope doesn't recognize them ("nodeclass" isn't a + // real K8s kind — the variants are EC2NodeClass / AKSNodeClass / GCPNodeClass). + // Without this branch we fall into the namespaced arm below and reject as + // "namespace is required" even though the agent sees these kinds in + // get_topology output. topology.RBACTuplesForKind returns the per-variant + // SAR tuples — we iterate through canReadInNamespace and allow on any + // pass, matching the per-node gate's first-success semantics. + if pseudoTuples, tracked, fallthroughAllow := topology.RBACTuplesForKind(input.Kind, input.Group, pseudoKindDiscoveryLookupMCP()); tracked { + if !allowPseudoKindTuplesMCP(ctx, pseudoTuples, fallthroughAllow) { + return nil, nil, fmt.Errorf("forbidden: %s requires explicit cluster-scoped RBAC", input.Kind) + } + } else if clusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(input.Kind, input.Group); clusterScoped { + if !canReadClusterScopedKind(ctx, gvrResource, gvrGroup, "get") { + return nil, nil, fmt.Errorf("forbidden: %s requires explicit cluster-scoped RBAC", input.Kind) + } + } else { + if input.Namespace == "" { + return nil, nil, fmt.Errorf("namespace is required for namespaced kinds") + } + if !checkNamespaceAccess(ctx, input.Namespace) { + return nil, nil, fmt.Errorf("forbidden: no access to namespace %q", input.Namespace) + } + } + + opts := topology.NeighborhoodOptions{ + Profile: resolveProfile(input.Profile), + Hops: input.Hops, + MaxNodes: input.MaxNodes, + } + if opts.Hops <= 0 { + opts.Hops = 1 + } + if opts.MaxNodes <= 0 { + opts.MaxNodes = 25 + } + // Top-end clamps symmetric with REST. BFS clamps Hops internally + // (neighborhoodMaxHops) but doing it here too means opts.Hops is + // correct if anything inspects/logs it before BFS. + if opts.Hops > 2 { + opts.Hops = 2 + } + if opts.MaxNodes > 200 { + opts.MaxNodes = 200 + } + + // Build the full topology and slice via BFS. The MCP server doesn't own + // a topology memoizer (the REST server does), so we accept the per-call + // rebuild cost here — neighborhood is a low-frequency tool. + // + // dp is captured once and threaded into both Builder and BuildNeighborhoodWithIndex + // so root-ID construction can resolve CRD plurals correctly (without it, + // buildNodeID falls back to the static kindMap which only covers built-in kinds). + dp := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + buildOpts := topology.DefaultBuildOptions() + buildOpts.IncludeReplicaSets = true + buildOpts.ForRelationshipCache = true + // Override DefaultBuildOptions' Secret-elision: with IncludeSecrets=false, + // root lookup for kind=secret produces an empty subgraph and the handler + // returns "resource not found" even for authorized users. The Allow gate + // below applies the per-namespace `get secrets` SAR per node, so + // unauthorized users still get the same "not found" via the empty-subgraph + // path — existence-hiding preserved. + buildOpts.IncludeSecrets = true + topo, err := topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). + WithDynamic(dp). + Build(buildOpts) + if err != nil { + return nil, nil, fmt.Errorf("failed to build topology: %w", err) + } + // Build the inverted index once and reuse it across BFS expansion plus + // any per-resource relationship lookups downstream. Without a memoizer + // the cost is paid every call, but it's still cheaper than scanning + // topo.Edges from inside the BFS loop (O(E) per hop level). + idx := topology.IndexByResource(topo) + + root := topology.ResourceRef{ + Kind: displayKindForMCP(input.Kind), + Namespace: input.Namespace, + Name: input.Name, + Group: input.Group, + } + // Pre-filter RBAC into BFS so forbidden nodes can't shape the visible + // graph (path-fragment effects, budget consumption). See the matching + // REST handler for the rationale. + opts.Allow = func(n *topology.Node) bool { + return canReadNeighborhoodNodeMCP(ctx, n) + } + + sub := topology.BuildNeighborhoodWithIndex(topo, root, opts, idx, dp) + if sub.AmbiguousRoot { + return nil, nil, fmt.Errorf("resource kind is ambiguous for %s/%s/%s; provide group", input.Kind, input.Namespace, input.Name) + } + if len(sub.Nodes) == 0 { + return nil, nil, fmt.Errorf("resource not found in topology: %s/%s/%s", input.Kind, input.Namespace, input.Name) + } + + // Use the resolved root node's Kind for the response. displayKindForMCP + // only normalizes built-in kinds (Pod, Deployment, …); pseudo-kinds + // like "nodeclass"/"nodepool" pass through lowercase while subgraph + // nodes carry display-form NodeKind ("NodeClass"). Without this rewrite + // the response's root.kind would diverge from subgraph.nodes[0].kind + // within the same payload. Matches the REST handler's identical fix. + rootResp := root + rootResp.Kind = string(sub.Nodes[0].Kind) + + result := neighborhoodResult{ + Root: rootResp, + Subgraph: neighborhoodSubgraphMCP{ + Nodes: sub.Nodes, + Edges: sub.Edges, + }, + Truncated: sub.Truncated, + } + if sub.RBACDenied > 0 { + // Aggregated rather than per-node — denied node refs would + // re-leak existence info the Allow gate exists to hide. + result.Omitted = append(result.Omitted, resourcecontext.OmittedField{ + Field: "subgraph.nodes", + Reason: resourcecontext.OmittedRBACDenied, + }) + } + return toJSONResult(result) +} + +// resolveProfile is retained as a thin shim around topology.ResolveProfile +// so the local call sites in this file don't need updating. New callers +// should use topology.ResolveProfile directly. +func resolveProfile(s string) topology.Profile { + return topology.ResolveProfile(s) +} + +// displayKindForMCP normalizes a lowercased / plural kind into the +// display-form used by topology nodes. MCP inputs are lowercase by +// convention; the topology graph uses display forms (Pod, Deployment, …). +func displayKindForMCP(kind string) string { + return normalizeDisplayKind(strings.ToLower(kind)) +} + +// canReadNeighborhoodNodeMCP is the MCP-side per-node RBAC gate. Mirrors +// the REST canReadNeighborhoodNode — same decision tree, different per-user +// check function. Tuple-selection logic lives in topology.RBACTuplesForNode +// so both surfaces stay in lockstep when the pseudo-kind table or Secret- +// tightening rules evolve. +// +// See REST canReadNeighborhoodNode for the Secret SAR rationale — namespace +// access alone leaks Secrets when the cache SA has cluster-wide secrets RBAC +// the calling user doesn't. +func canReadNeighborhoodNodeMCP(ctx context.Context, n *topology.Node) bool { + // Namespace-list gate is protocol-specific; apply it here for namespaced + // nodes BEFORE consulting the shared helper. + if n != nil && n.Data != nil { + if ns, ok := n.Data["namespace"].(string); ok && ns != "" { + if !checkNamespaceAccess(ctx, ns) { + return false + } + } + } + + decision, tuples := topology.RBACTuplesForNode(n, pseudoKindDiscoveryLookupMCP()) + switch decision { + case topology.NodeRBACAllow: + return true + case topology.NodeRBACDeny: + return false + case topology.NodeRBACCheckTuples: + for _, t := range tuples { + if canReadInNamespace(ctx, t.Group, t.Resource, t.Namespace, "get") { + return true + } + } + return false + case topology.NodeRBACConsultClassifyKindScope: + // Cluster-scoped node that isn't a tracked pseudo-kind. Fall back to + // the regular static-catalogue / discovery path. Unclassified kinds + // allow-through: the topology graph wouldn't have surfaced the node + // for an unprivileged SA either. + group := "" + if n.Data != nil { + if v, ok := n.Data["apiVersion"].(string); ok { + group = topology.APIVersionGroup(v) + } + } + clusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(string(n.Kind), group) + if !clusterScoped { + return true + } + return canReadClusterScopedKind(ctx, gvrResource, gvrGroup, "get") + default: + // New decision values must be handled explicitly; default-deny. + return false + } +} + +// pseudoKindDiscoveryLookupMCP returns the function form of the discovery +// singleton that topology.RBACTuplesForKind / RBACTuplesForNode expect, or +// nil when discovery isn't initialised (test envs). See the doc on +// topology.PseudoKindDiscoveryLookup for why this is a function rather than +// an interface (typed-nil-into-interface gotcha). +func pseudoKindDiscoveryLookupMCP() topology.PseudoKindDiscoveryLookup { + disc := k8s.GetResourceDiscovery() + if disc == nil { + return nil + } + return disc.GetResourceWithGroup +} + +// allowPseudoKindTuplesMCP authorizes a list of per-variant SAR tuples +// returned by topology.RBACTuplesForKind for the root-preflight path. +// Iterates each tuple through canReadInNamespace and allows on the first +// pass; if the helper returned zero tuples + fallthroughAllow=true (every +// variant was filtered out by discovery), allow — matches the pre-existing +// "over-include on absent provider variants" behavior. +// +// We use canReadInNamespace(group, resource, "", "get") directly rather than +// canReadClusterScopedKind: canReadClusterScopedKind re-resolves the +// resource via ClassifyKindScope's discovery, which over-broadens +// (passthrough-allow) when the CRD is missing. The table is the source of +// truth for "this is cluster-scoped" — no need for discovery to re-confirm. +func allowPseudoKindTuplesMCP(ctx context.Context, tuples []topology.SARTuple, fallthroughAllow bool) bool { + if len(tuples) == 0 { + return fallthroughAllow + } + for _, t := range tuples { + if canReadInNamespace(ctx, t.Group, t.Resource, t.Namespace, "get") { + return true + } + } + return false +} diff --git a/internal/mcp/tools_neighborhood_test.go b/internal/mcp/tools_neighborhood_test.go new file mode 100644 index 000000000..003668787 --- /dev/null +++ b/internal/mcp/tools_neighborhood_test.go @@ -0,0 +1,470 @@ +package mcp + +import ( + "encoding/json" + "strings" + "testing" + + "github.com/modelcontextprotocol/go-sdk/mcp" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + + "github.com/skyhook-io/radar/internal/k8s" + pkgauth "github.com/skyhook-io/radar/pkg/auth" + "github.com/skyhook-io/radar/pkg/topology" +) + +// canReadNeighborhoodNodeMCP must apply per-kind Secret RBAC inside an allowed +// namespace. Mirrors handleGetResource: namespace access alone is NOT a +// sufficient gate for Secrets because the SA backing the cache may carry +// cluster-wide secrets RBAC (Helm release visibility) the calling user lacks. + +func makeSecretNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "secret/" + ns + "/" + name, + Kind: topology.KindSecret, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "v1", + }, + } +} + +func makeConfigMapNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "configmap/" + ns + "/" + name, + Kind: topology.KindConfigMap, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "v1", + }, + } +} + +// TestCanReadNeighborhoodNodeMCP_SecretRequiresPerKindRBAC pins the gate: +// a user with namespace access but no per-namespace `get secrets` SAR must be +// denied. The cache may hold Secrets the user can't read directly — the +// neighborhood graph must not leak them. +func TestCanReadNeighborhoodNodeMCP_SecretRequiresPerKindRBAC(t *testing.T) { + ctx := withTestUserPerms(t, "alice", nil, []string{"default"}) + // alice has namespace access to default but explicit deny on secrets-get. + perms := getPermCache().Get("alice") + perms.SetCanI("get", "", "secrets", "default", false) + + secret := makeSecretNode("default", "nginx-tls") + if canReadNeighborhoodNodeMCP(ctx, secret) { + t.Error("Secret node leaked through MCP neighborhood gate: namespace access without per-kind RBAC must deny") + } +} + +// Counterpart: user WITH namespace access AND per-namespace secrets-get +// must be allowed. Locks down the positive path so the gate isn't blanket-deny. +func TestCanReadNeighborhoodNodeMCP_SecretAllowedWithPerKindRBAC(t *testing.T) { + ctx := withTestUserPerms(t, "bob", nil, []string{"default"}) + perms := getPermCache().Get("bob") + perms.SetCanI("get", "", "secrets", "default", true) + + secret := makeSecretNode("default", "nginx-tls") + if !canReadNeighborhoodNodeMCP(ctx, secret) { + t.Error("authorized user denied: namespace access + per-kind RBAC should pass") + } +} + +// Sanity: non-Secret namespaced kinds (e.g. ConfigMap) ride on the namespace +// gate alone. The Secret-specific tightening must not regress that. +func TestCanReadNeighborhoodNodeMCP_ConfigMapStaysOnNamespaceGate(t *testing.T) { + ctx := withTestUserPerms(t, "alice", nil, []string{"default"}) + // No configmap-specific SAR seeded — namespace access alone should pass + // because we deliberately do NOT tighten that for ConfigMap. + cm := makeConfigMapNode("default", "nginx-conf") + if !canReadNeighborhoodNodeMCP(ctx, cm) { + t.Error("ConfigMap node denied: namespace access should be sufficient (no per-kind tightening for ConfigMap)") + } +} + +// Smoke: no-auth callers (no user in context) pass through. Matches the +// passthrough behavior of every per-namespace RBAC helper in this package. +func TestCanReadNeighborhoodNodeMCP_NoAuthPassthrough(t *testing.T) { + // Empty context — no user attached. Helpers should not deny. + secret := makeSecretNode("default", "nginx-tls") + if !canReadNeighborhoodNodeMCP(pkgauth.ContextWithUser(t.Context(), nil), secret) { + t.Error("no-auth caller denied — Secret gate must not fail-closed when auth is disabled") + } +} + +// makeNodeClassNode builds a topology pseudo-kind NodeClass node. The Kind is +// the synthesized topology label ("NodeClass"), not a real K8s resource — the +// actual variants are EC2NodeClass / AKSNodeClass / GCPNodeClass. +func makeNodeClassNode(name string) *topology.Node { + return &topology.Node{ + ID: "nodeclass/" + name, + Kind: topology.KindNodeClass, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + // No namespace — NodeClass is cluster-scoped. + "apiVersion": "karpenter.k8s.aws/v1", + }, + } +} + +func makeKnativeServiceNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "knativeservice/" + ns + "/" + name, + Kind: topology.KindKnativeService, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "serving.knative.dev/v1", + }, + } +} + +// TestCanReadNeighborhoodNodeMCP_NodeClassRequiresPerProviderSAR pins the +// pseudo-kind cluster-scoped fix: NodeClass is a topology-only label that +// ClassifyKindScope doesn't recognize. Without the clusterScopedTopologyKinds +// lookup, NodeClass nodes hit the unclassified+empty-namespace allow branch +// and surface to users without provider-specific RBAC. +func TestCanReadNeighborhoodNodeMCP_NodeClassDeniedWithoutSAR(t *testing.T) { + ctx := withTestUserPerms(t, "alice", nil, nil) + perms := getPermCache().Get("alice") + // Deny all NodeClass variants. The helper iterates the table; without + // discovery only ec2 enters the SAR loop (group != "" check is harmless + // — the discovery filter is skip-when-missing). + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + + n := makeNodeClassNode("default-class") + if canReadNeighborhoodNodeMCP(ctx, n) { + t.Error("NodeClass pseudo-kind leaked to user without any provider get-SAR") + } +} + +// Counterpart: user with one provider's get-SAR sees NodeClass nodes. Mirrors +// the topology-strip semantics — denial requires ALL discovery-present +// providers to fail. +func TestCanReadNeighborhoodNodeMCP_NodeClassAllowedWithProviderSAR(t *testing.T) { + ctx := withTestUserPerms(t, "bob", nil, nil) + perms := getPermCache().Get("bob") + // Bob has EC2 access only — should still pass for NodeClass. + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + + n := makeNodeClassNode("default-class") + if !canReadNeighborhoodNodeMCP(ctx, n) { + t.Error("NodeClass denied for user with EC2 get-SAR — single-provider RBAC must allow") + } +} + +// TestCanReadNeighborhoodNodeMCP_NodeClassPerVariantDeniesWrongProvider +// pins the per-variant authorization fix on the MCP side: a user with +// EC2 RBAC must not see AKS NodeClass nodes. Mirrors the REST test. +func TestCanReadNeighborhoodNodeMCP_NodeClassPerVariantDeniesWrongProvider(t *testing.T) { + ctx := withTestUserPerms(t, "bob", nil, nil) + perms := getPermCache().Get("bob") + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + + aks := &topology.Node{ + ID: "nodeclass/aks-default", + Kind: topology.KindNodeClass, + Name: "aks-default", + Status: topology.StatusHealthy, + Data: map[string]any{ + "apiVersion": "karpenter.azure.com/v1beta1", + }, + } + if canReadNeighborhoodNodeMCP(ctx, aks) { + t.Error("AKS NodeClass leaked to user with EC2-only RBAC — per-variant gate must deny cross-provider") + } + + ec2 := &topology.Node{ + ID: "nodeclass/ec2-default", + Kind: topology.KindNodeClass, + Name: "ec2-default", + Status: topology.StatusHealthy, + Data: map[string]any{ + "apiVersion": "karpenter.k8s.aws/v1", + }, + } + if !canReadNeighborhoodNodeMCP(ctx, ec2) { + t.Error("EC2 NodeClass denied to user with EC2 RBAC — per-variant gate must allow the matching provider") + } +} + +// KnativeService is a namespaced pseudo-kind. The cluster-scoped table +// shouldn't match it; the helper must fall through to the namespaced branch +// and ride on namespace access alone (no per-kind tightening for Knative). +func TestCanReadNeighborhoodNodeMCP_KnativeServiceUsesNamespaceGate(t *testing.T) { + ctx := withTestUserPerms(t, "alice", nil, []string{"prod"}) + n := makeKnativeServiceNode("prod", "api") + if !canReadNeighborhoodNodeMCP(ctx, n) { + t.Error("namespaced pseudo-kind KnativeService denied — namespace access should be sufficient") + } + + // User without namespace access → denied. + ctxDenied := withTestUserPerms(t, "carol", nil, []string{"staging"}) + if canReadNeighborhoodNodeMCP(ctxDenied, n) { + t.Error("KnativeService allowed for user without namespace access — namespace gate must apply") + } +} + +// pseudoKindTuplesForTestMCP mirrors the inline topology.RBACTuplesForKind +// call in handleGetNeighborhood. disc=nil in tests so no rows are filtered. +func pseudoKindTuplesForTestMCP(kind, group string) (tuples []topology.SARTuple, fallthroughAllow bool) { + t, _, fa := topology.RBACTuplesForKind(kind, group, nil) + return t, fa +} + +// TestAllowPseudoKindTuplesMCP_NodeClass pins the MCP-side root-preflight +// helper: kind-only lookup (no node yet) iterates every table row under that +// kind and allows on any pass. Mirrors the REST test of the same name. +func TestAllowPseudoKindTuplesMCP_NodeClass_DeniedWithoutSAR(t *testing.T) { + ctx := withTestUserPerms(t, "alice", nil, nil) + perms := getPermCache().Get("alice") + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + + tuples, fallthroughAllow := pseudoKindTuplesForTestMCP("nodeclass", "") + if len(tuples) == 0 { + t.Fatal("RBACTuplesForKind returned 0 tuples for nodeclass — table wiring is broken") + } + if allowPseudoKindTuplesMCP(ctx, tuples, fallthroughAllow) { + t.Error("nodeclass root preflight allowed user without any provider get-SAR — must deny") + } +} + +func TestAllowPseudoKindTuplesMCP_NodeClass_AllowedWithProviderSAR(t *testing.T) { + ctx := withTestUserPerms(t, "bob", nil, nil) + perms := getPermCache().Get("bob") + // EC2 only — single-provider grant must be enough for the kind-level gate. + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + + tuples, fallthroughAllow := pseudoKindTuplesForTestMCP("nodeclass", "") + if !allowPseudoKindTuplesMCP(ctx, tuples, fallthroughAllow) { + t.Error("nodeclass root preflight denied user with EC2 get-SAR — single-provider RBAC must pass") + } +} + +// TestHandleGetNeighborhoodMCP_NodeClassRootNotNamespaceRequired pins the +// integration fix on the MCP surface: an agent calling get_neighborhood with +// kind="nodeclass" and Namespace="" (which is what get_topology output +// suggests) must NOT receive "namespace is required" — that's the bug we're +// closing. +// +// Two cases: +// - Unauthorized user (no provider SAR): error must mention "forbidden", +// not "namespace is required". +// - Authorized user (EC2 SAR): preflight passes, BFS runs against the +// seeded cache (no NodeClass nodes there), root lookup misses → "not +// found" error, also not "namespace is required". +func TestHandleGetNeighborhoodMCP_NodeClassRootNotNamespaceRequired(t *testing.T) { + setupSecretRefCacheMCP(t) + + // Unauthorized + ctxDeny := withTestUserPerms(t, "alice", nil, nil) + denyPerms := getPermCache().Get("alice") + denyPerms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + denyPerms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + denyPerms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + _, _, err := handleGetNeighborhood(ctxDeny, nil, getNeighborhoodInput{ + Kind: "nodeclass", + Name: "foo", + }) + if err == nil { + t.Fatal("unauthorized nodeclass call succeeded — preflight must deny") + } + if strings.Contains(err.Error(), "namespace is required") { + t.Errorf("unauthorized nodeclass returned namespace error %q — regression: pseudo-kind misclassified as namespaced", err) + } + if !strings.Contains(err.Error(), "forbidden") { + t.Errorf("unauthorized nodeclass error = %q, want 'forbidden' substring", err) + } + + // Authorized + ctxAllow := withTestUserPerms(t, "bob", nil, nil) + allowPerms := getPermCache().Get("bob") + allowPerms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + allowPerms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + allowPerms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + _, _, err2 := handleGetNeighborhood(ctxAllow, nil, getNeighborhoodInput{ + Kind: "nodeclass", + Name: "foo", + }) + if err2 == nil { + t.Fatal("authorized nodeclass call succeeded for nonexistent node — expected 'not found' error from empty subgraph") + } + if strings.Contains(err2.Error(), "namespace is required") { + t.Errorf("authorized nodeclass returned namespace error %q even WITH EC2 SAR — regression", err2) + } + if !strings.Contains(err2.Error(), "not found") { + t.Errorf("authorized nodeclass error = %q, want 'not found' substring (preflight passed, BFS empty)", err2) + } +} + +// Same shape for NodePool — sanity that the fix covers every cluster-scoped +// pseudo-kind in the table, not just NodeClass. +func TestHandleGetNeighborhoodMCP_NodePoolRootNotNamespaceRequired(t *testing.T) { + setupSecretRefCacheMCP(t) + + ctxDeny := withTestUserPerms(t, "alice", nil, nil) + denyPerms := getPermCache().Get("alice") + denyPerms.SetCanI("get", "karpenter.sh", "nodepools", "", false) + _, _, err := handleGetNeighborhood(ctxDeny, nil, getNeighborhoodInput{ + Kind: "nodepool", + Name: "foo", + }) + if err == nil { + t.Fatal("unauthorized nodepool call succeeded — preflight must deny") + } + if strings.Contains(err.Error(), "namespace is required") { + t.Errorf("unauthorized nodepool returned namespace error %q — pseudo-kind misclassified", err) + } + if !strings.Contains(err.Error(), "forbidden") { + t.Errorf("unauthorized nodepool error = %q, want 'forbidden' substring", err) + } + + ctxAllow := withTestUserPerms(t, "bob", nil, nil) + allowPerms := getPermCache().Get("bob") + allowPerms.SetCanI("get", "karpenter.sh", "nodepools", "", true) + _, _, err2 := handleGetNeighborhood(ctxAllow, nil, getNeighborhoodInput{ + Kind: "nodepool", + Name: "foo", + }) + if err2 == nil { + t.Fatal("authorized nodepool call succeeded for nonexistent node — expected 'not found'") + } + if strings.Contains(err2.Error(), "namespace is required") { + t.Errorf("authorized nodepool returned namespace error %q even with SAR — regression", err2) + } + if !strings.Contains(err2.Error(), "not found") { + t.Errorf("authorized nodepool error = %q, want 'not found' substring", err2) + } +} + +// setupSecretRefCacheMCP seeds a fake cache with a Deployment that references +// a Secret via Volumes — the only shape the topology builder uses to decide +// whether to surface the Secret node. Without this reference, IncludeSecrets= +// true alone wouldn't make the Secret appear (see pkg/topology/builder.go's +// "isReferenced" guard in section 9). +func setupSecretRefCacheMCP(t *testing.T) { + t.Helper() + replicas := int32(1) + fakeClient := fake.NewClientset( + &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "default"}, + Status: corev1.NamespaceStatus{Phase: corev1.NamespaceActive}, + }, + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx-tls", Namespace: "default"}, + Type: corev1.SecretTypeOpaque, + }, + &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "nginx", Namespace: "default"}, + Spec: appsv1.DeploymentSpec{ + Replicas: &replicas, + Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "nginx"}}, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "nginx"}}, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "nginx", Image: "nginx:1.25"}}, + Volumes: []corev1.Volume{{ + Name: "tls", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "nginx-tls"}, + }, + }}, + }, + }, + }, + }, + ) + if err := k8s.InitTestResourceCache(fakeClient); err != nil { + t.Fatalf("InitTestResourceCache: %v", err) + } + k8s.SetConnectionStatus(k8s.ConnectionStatus{State: k8s.StateConnected, Context: "fake-test"}) + t.Cleanup(func() { + k8s.ResetTestState() + getPermCache().Invalidate() + }) +} + +// TestNeighborhoodMCP_SecretRootIncluded pins the IncludeSecrets fix on the +// MCP side. DefaultBuildOptions sets IncludeSecrets=false, so the Secret +// node isn't in the topology and the root lookup returns "resource not +// found" even for authorized users. The handler must override +// IncludeSecrets=true so authorized callers find the root, while the +// per-namespace `get secrets` Allow gate still denies unauthorized callers +// via the empty-subgraph path (preserving existence-hiding). +func TestNeighborhoodMCP_SecretRootIncluded(t *testing.T) { + setupSecretRefCacheMCP(t) + + // Authorized user: namespace access to default + per-namespace + // `get secrets`. Both Allow checks must pass; root resolves; result + // includes the Secret node. + ctx := withTestUserPerms(t, "bob", nil, []string{"default"}) + bobPerms := getPermCache().Get("bob") + bobPerms.SetCanI("get", "", "secrets", "default", true) + + call, _, err := handleGetNeighborhood(ctx, nil, getNeighborhoodInput{ + Kind: "secret", + Namespace: "default", + Name: "nginx-tls", + }) + if err != nil { + t.Fatalf("authorized user got error for Secret root: %v — IncludeSecrets override must surface Secret nodes in topology", err) + } + if call == nil || len(call.Content) == 0 { + t.Fatal("expected MCP CallToolResult content for authorized Secret root") + } + tc, ok := call.Content[0].(*mcp.TextContent) + if !ok { + t.Fatalf("unexpected content type: %T", call.Content[0]) + } + var result neighborhoodResult + if jsonErr := json.Unmarshal([]byte(tc.Text), &result); jsonErr != nil { + t.Fatalf("decode MCP result: %v (raw=%q)", jsonErr, tc.Text) + } + foundSecret := false + for _, n := range result.Subgraph.Nodes { + if n.Kind == topology.KindSecret && n.Name == "nginx-tls" { + foundSecret = true + break + } + } + if !foundSecret { + t.Errorf("authorized user's response missing Secret node — IncludeSecrets override should have surfaced it (nodes=%+v)", result.Subgraph.Nodes) + } + + // Unauthorized user: namespace access but NO per-namespace `get secrets` + // SAR. Allow rejects the root → empty subgraph → "not found" error. + // Existence-hiding preserved: same 404-shape result the + // IncludeSecrets=false path produced, but driven by RBAC. + ctxDenied := withTestUserPerms(t, "alice", nil, []string{"default"}) + alicePerms := getPermCache().Get("alice") + alicePerms.SetCanI("get", "", "secrets", "default", false) + + _, _, err2 := handleGetNeighborhood(ctxDenied, nil, getNeighborhoodInput{ + Kind: "secret", + Namespace: "default", + Name: "nginx-tls", + }) + if err2 == nil { + t.Error("unauthorized user got success for Secret root — Allow gate must produce not-found via empty subgraph (existence-hiding)") + } else if !strings.Contains(err2.Error(), "not found") { + t.Errorf("unauthorized user got unexpected error %v — expected 'not found' shape to mirror existence-hiding", err2) + } +} diff --git a/internal/server/neighborhood_handler.go b/internal/server/neighborhood_handler.go new file mode 100644 index 000000000..5bbb87b4d --- /dev/null +++ b/internal/server/neighborhood_handler.go @@ -0,0 +1,339 @@ +package server + +import ( + "log" + "net/http" + "strconv" + + "github.com/go-chi/chi/v5" + + "github.com/skyhook-io/radar/internal/k8s" + "github.com/skyhook-io/radar/pkg/resourcecontext" + "github.com/skyhook-io/radar/pkg/topology" +) + +// neighborhoodResponse is the wire shape returned by GET /api/ai/neighborhood. +// It deliberately differs from topology.Subgraph in two ways: +// +// - root + truncated are lifted to top-level for easy parsing +// - omitted carries resourcecontext-style RBAC drop records so agents can +// tell when authorized context is incomplete +type neighborhoodResponse struct { + Root topology.ResourceRef `json:"root"` + Subgraph neighborhoodSubgraph `json:"subgraph"` + Truncated bool `json:"truncated"` + Omitted []resourcecontext.OmittedField `json:"omitted,omitempty"` +} + +type neighborhoodSubgraph struct { + Nodes []topology.Node `json:"nodes"` + Edges []topology.Edge `json:"edges"` +} + +// handleAINeighborhood returns the BFS-expanded neighborhood of a root +// resource. See pkg/topology.BuildNeighborhood for the graph semantics. +// +// GET /api/ai/neighborhood/{kind}/{namespace}/{name} +// +// ?profile=auto|all (default: auto) +// ?hops=1|2 (default: 1) +// ?max_nodes=25 (default: 25) +// +// Cluster-scoped roots use "_" as the namespace placeholder (same convention +// as handleAIGetResource). +func (s *Server) handleAINeighborhood(w http.ResponseWriter, r *http.Request) { + if !s.requireConnected(w) { + return + } + + rawKind := chi.URLParam(r, "kind") + namespace := chi.URLParam(r, "namespace") + name := chi.URLParam(r, "name") + if namespace == "_" { + namespace = "" + } + if rawKind == "" || name == "" { + s.writeError(w, http.StatusBadRequest, "kind and name are required") + return + } + + // RBAC for the root. + group := r.URL.Query().Get("group") + // Topology pseudo-kinds (NodeClass, NodePool, NodeClaim, …) FIRST: these + // are synthesized labels that ClassifyKindScope doesn't recognize ("nodeclass" + // isn't a real K8s kind — the variants are EC2NodeClass / AKSNodeClass / + // GCPNodeClass). Without this branch the call falls into the namespaced + // arm below and 400s with "namespace is required" even though "_" was + // supplied (URL → namespace == ""). topology.RBACTuplesForKind returns the + // per-variant SAR tuples — we iterate through s.canRead and allow on any + // pass, matching the per-node gate's first-success semantics. + if pseudoTuples, tracked, fallthroughAllow := topology.RBACTuplesForKind(rawKind, group, pseudoKindDiscoveryLookup()); tracked { + if !s.allowPseudoKindTuples(r, pseudoTuples, fallthroughAllow) { + s.writeError(w, http.StatusForbidden, "insufficient permissions for cluster-scoped "+rawKind) + return + } + } else if rootClusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(rawKind, group); rootClusterScoped { + if !s.canRead(r, gvrGroup, gvrResource, "", "get") { + s.writeError(w, http.StatusForbidden, "insufficient permissions for cluster-scoped "+rawKind) + return + } + } else { + if namespace == "" { + s.writeError(w, http.StatusBadRequest, "namespace is required for namespaced kinds (use '_' for cluster-scoped)") + return + } + allowed := s.getUserNamespaces(r, []string{namespace}) + if noNamespaceAccess(allowed) { + s.writeError(w, http.StatusForbidden, "no access to namespace "+namespace) + return + } + } + + opts := parseNeighborhoodOptions(r) + + cache := k8s.GetResourceCache() + if cache == nil { + s.writeError(w, http.StatusServiceUnavailable, "Resource cache not available") + return + } + + // Build the full topology (memoized) and let BuildNeighborhood do the + // BFS slice. Cheaper than reaching into builder internals; topoMemo + // dedupes concurrent calls. We also fetch the cached RelationshipsIndex + // so the BFS expansion uses O(degree) edge lookups instead of paying + // an O(E) adjacency-build per request. + // + // dp is captured once and threaded into both Builder and BuildNeighborhoodWithIndex + // so root-ID construction can resolve CRD plurals correctly (without it, + // buildNodeID falls back to the static kindMap which only covers built-in kinds). + dp := k8s.NewTopologyDynamicProvider(k8s.GetDynamicResourceCache(), k8s.GetResourceDiscovery()) + buildOpts := topology.DefaultBuildOptions() + buildOpts.IncludeReplicaSets = true + buildOpts.ForRelationshipCache = true + // Override the DefaultBuildOptions Secret-elision: without this a request + // for kind=secret resolves to an empty subgraph (root missing in topology) + // and 404s even for authorized users. The Allow gate below applies the + // per-namespace `get secrets` SAR per node, so unauthorized users still + // get a 404 via the empty-subgraph path — existence-hiding preserved. + // + // The Memoizer keys on a hash that includes IncludeSecrets (see + // pkg/topology/memo.go memoKey), so this lives in a separate cache slot + // from the IncludeSecrets=false topology used elsewhere. + buildOpts.IncludeSecrets = true + build := func() (*topology.Topology, error) { + return topology.NewBuilder(k8s.NewTopologyResourceProvider(cache)). + WithDynamic(dp). + Build(buildOpts) + } + topo, err := s.topoMemo.Get(buildOpts, build) + if err != nil { + log.Printf("[neighborhood] Failed to build topology for %s %s/%s: %v", rawKind, namespace, name, err) + s.writeError(w, http.StatusInternalServerError, err.Error()) + return + } + // GetIndex piggybacks on the memo entry just populated by Get above — + // the index is computed once per topology refresh and reused across + // requests, matching the relationships hot path. + idx, err := s.topoMemo.GetIndex(buildOpts, build) + if err != nil { + log.Printf("[neighborhood] Failed to fetch topology index for %s %s/%s: %v", rawKind, namespace, name, err) + s.writeError(w, http.StatusInternalServerError, err.Error()) + return + } + + root := topology.ResourceRef{ + Kind: normalizeKind(rawKind), + Namespace: namespace, + Name: name, + Group: group, + } + + // Push RBAC into the BFS expansion itself. If we filtered post-hoc, + // a forbidden node could still influence which allowed nodes surface + // (acting as a path-fragment between two readable endpoints) and + // could consume the MaxNodes truncation budget before being dropped. + // Skipping during traversal keeps the visible graph independent of + // hidden nodes — both for security and for predictable truncation. + opts.Allow = func(n *topology.Node) bool { + return s.canReadNeighborhoodNode(r, n) + } + + sub := topology.BuildNeighborhoodWithIndex(topo, root, opts, idx, dp) + if sub.AmbiguousRoot { + s.writeError(w, http.StatusBadRequest, "resource kind is ambiguous; provide group") + return + } + if len(sub.Nodes) == 0 { + s.writeError(w, http.StatusNotFound, "resource not found in topology") + return + } + + // Use the resolved root node's Kind for the response, not the + // URL-derived (lowercase) form. Subgraph nodes carry display-form + // NodeKind values ("Pod", "KnativeService") — without this rewrite, + // the response's root.kind would be lowercase while + // subgraph.nodes[0].kind is display-form, breaking case-sensitive + // within-response matching and diverging from MCP's shape despite + // the header comment claiming both surfaces "parse identically". + rootResp := root + rootResp.Kind = string(sub.Nodes[0].Kind) + + resp := neighborhoodResponse{ + Root: rootResp, + Subgraph: neighborhoodSubgraph{ + Nodes: sub.Nodes, + Edges: sub.Edges, + }, + Truncated: sub.Truncated, + } + if sub.RBACDenied > 0 { + // Single aggregated omission rather than per-node entries — + // surfacing the specific names of denied nodes would defeat the + // existence-hiding guarantee the pre-filter provides. + resp.Omitted = append(resp.Omitted, resourcecontext.OmittedField{ + Field: "subgraph.nodes", + Reason: resourcecontext.OmittedRBACDenied, + }) + } + + s.writeJSON(w, resp) +} + +// parseNeighborhoodOptions reads the query string into NeighborhoodOptions +// with defaults (auto, hops=1, max_nodes=25) and clamps (hops max 2, max_nodes +// floor 1 / ceiling 200) applied. +func parseNeighborhoodOptions(r *http.Request) topology.NeighborhoodOptions { + q := r.URL.Query() + opts := topology.NeighborhoodOptions{ + Profile: topology.ProfileAuto, + Hops: 1, + MaxNodes: 25, + } + if p := q.Get("profile"); p != "" { + // Mirror MCP via topology.ResolveProfile so both surfaces normalize + // identically. Unknown values fall back to auto instead of silently + // broadening traversal to all edges. + opts.Profile = topology.ResolveProfile(p) + } + if h := q.Get("hops"); h != "" { + if n, err := strconv.Atoi(h); err == nil && n > 0 { + opts.Hops = n + } + } + if m := q.Get("max_nodes"); m != "" { + if n, err := strconv.Atoi(m); err == nil && n > 0 { + opts.MaxNodes = n + } + } + // Top-end clamps on both Hops and MaxNodes — keep responses bounded for + // agent contexts regardless of what the caller asks for. BFS also clamps + // internally (neighborhoodMaxHops), but doing it here too matches the + // doc above, keeps the two budget fields symmetric, and means + // opts.Hops is correct if anything inspects/logs it before BFS. + if opts.Hops > 2 { + opts.Hops = 2 + } + if opts.MaxNodes > 200 { + opts.MaxNodes = 200 + } + return opts +} + +// canReadNeighborhoodNode is the REST-side per-node RBAC gate. Mirrors the +// MCP equivalent canReadNeighborhoodNodeMCP — same decision tree, different +// per-user check function. The tuple-selection logic itself lives in +// topology.RBACTuplesForNode so the two surfaces stay in lockstep when the +// pseudo-kind table or Secret-tightening rules evolve. +// +// Decision dispatch (see topology.NodeRBACDecision for the four cases): +// +// - Namespaced node: run protocol-specific namespace-list gate first, then +// dispatch on the topology decision. Allow + CheckTuples both require +// the namespace gate to have passed; Deny short-circuits before it. +// - Cluster-scoped node: namespace-list gate doesn't apply; dispatch +// directly. +// +// Secret nodes get an additional per-kind SAR inside the namespace: namespace +// access (e.g. "user can list pods in team-a") is NOT a sufficient signal for +// reading Secrets, because the SA the cache runs under may have cluster-wide +// secrets RBAC (Helm release visibility) while the user does not. This mirrors +// the same gate handleGetResource applies — without it, the neighborhood graph +// would leak Secret existence + names to users who can't fetch them directly. +func (s *Server) canReadNeighborhoodNode(r *http.Request, n *topology.Node) bool { + // Namespace-list gate is protocol-specific (per-user state); apply it + // here for namespaced nodes BEFORE consulting the shared helper. + if n != nil && n.Data != nil { + if ns, ok := n.Data["namespace"].(string); ok && ns != "" { + allowed := s.getUserNamespaces(r, []string{ns}) + if noNamespaceAccess(allowed) { + return false + } + } + } + + decision, tuples := topology.RBACTuplesForNode(n, pseudoKindDiscoveryLookup()) + switch decision { + case topology.NodeRBACAllow: + return true + case topology.NodeRBACDeny: + return false + case topology.NodeRBACCheckTuples: + for _, t := range tuples { + if s.canRead(r, t.Group, t.Resource, t.Namespace, "get") { + return true + } + } + return false + case topology.NodeRBACConsultClassifyKindScope: + // Cluster-scoped node that isn't a tracked pseudo-kind. Fall back + // to the regular static-catalogue / discovery path. Unclassified + // kinds (returns false) allow-through: the topology graph wouldn't + // have surfaced the node for an unprivileged SA either. + group := "" + if n.Data != nil { + if v, ok := n.Data["apiVersion"].(string); ok { + group = topology.APIVersionGroup(v) + } + } + clusterScoped, gvrGroup, gvrResource := k8s.ClassifyKindScope(string(n.Kind), group) + if !clusterScoped { + return true + } + return s.canRead(r, gvrGroup, gvrResource, "", "get") + default: + // New decision values must be handled explicitly; default-deny is + // the security-safe fallthrough. + return false + } +} + +// pseudoKindDiscoveryLookup returns the function form of the discovery +// singleton that topology.RBACTuplesForKind / RBACTuplesForNode expect, or +// nil when discovery isn't initialised (test envs). Centralised here to +// sidestep the typed-nil-into-interface gotcha — see the doc on +// topology.PseudoKindDiscoveryLookup for the full rationale. +func pseudoKindDiscoveryLookup() topology.PseudoKindDiscoveryLookup { + disc := k8s.GetResourceDiscovery() + if disc == nil { + return nil + } + return disc.GetResourceWithGroup +} + +// allowPseudoKindTuples authorizes a list of per-variant SAR tuples returned +// by topology.RBACTuplesForKind for the root-preflight path. Iterates each +// tuple through s.canRead and allows on the first pass; if the helper +// returned zero tuples + fallthroughAllow=true (every variant was filtered +// out by discovery), allow — matches the pre-existing "over-include on +// absent provider variants" behavior. +func (s *Server) allowPseudoKindTuples(r *http.Request, tuples []topology.SARTuple, fallthroughAllow bool) bool { + if len(tuples) == 0 { + return fallthroughAllow + } + for _, t := range tuples { + if s.canRead(r, t.Group, t.Resource, t.Namespace, "get") { + return true + } + } + return false +} diff --git a/internal/server/neighborhood_handler_test.go b/internal/server/neighborhood_handler_test.go new file mode 100644 index 000000000..db3e7f561 --- /dev/null +++ b/internal/server/neighborhood_handler_test.go @@ -0,0 +1,97 @@ +package server + +import ( + "net/http/httptest" + "testing" + + "github.com/skyhook-io/radar/pkg/topology" +) + +func TestParseNeighborhoodOptions_Defaults(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart", nil) + opts := parseNeighborhoodOptions(r) + if opts.Profile != topology.ProfileAuto { + t.Errorf("default profile = %q, want auto", opts.Profile) + } + if opts.Hops != 1 { + t.Errorf("default hops = %d, want 1", opts.Hops) + } + if opts.MaxNodes != 25 { + t.Errorf("default max_nodes = %d, want 25", opts.MaxNodes) + } +} + +func TestParseNeighborhoodOptions_Custom(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart?profile=all&hops=2&max_nodes=10", nil) + opts := parseNeighborhoodOptions(r) + if opts.Profile != topology.ProfileAll { + t.Errorf("profile = %q, want all", opts.Profile) + } + if opts.Hops != 2 { + t.Errorf("hops = %d, want 2", opts.Hops) + } + if opts.MaxNodes != 10 { + t.Errorf("max_nodes = %d, want 10", opts.MaxNodes) + } +} + +func TestParseNeighborhoodOptions_MaxNodesClamp(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart?max_nodes=99999", nil) + opts := parseNeighborhoodOptions(r) + if opts.MaxNodes != 200 { + t.Errorf("max_nodes clamp = %d, want 200", opts.MaxNodes) + } +} + +// TestParseNeighborhoodOptions_HopsClamp pins that REST applies the hops=2 +// clamp at the handler level too, matching MaxNodes. BFS clamps internally, +// but the handler-level clamp keeps opts.Hops correct if anything inspects +// or logs it before BFS, and matches the doc on parseNeighborhoodOptions. +func TestParseNeighborhoodOptions_HopsClamp(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart?hops=99", nil) + opts := parseNeighborhoodOptions(r) + if opts.Hops != 2 { + t.Errorf("hops clamp = %d, want 2", opts.Hops) + } +} + +func TestParseNeighborhoodOptions_InvalidValues(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart?hops=abc&max_nodes=-5", nil) + opts := parseNeighborhoodOptions(r) + if opts.Hops != 1 { + t.Errorf("invalid hops should fall back to default 1, got %d", opts.Hops) + } + if opts.MaxNodes != 25 { + t.Errorf("invalid max_nodes should fall back to default 25, got %d", opts.MaxNodes) + } +} + +// TestParseNeighborhoodOptions_ProfileNormalization pins that REST exposes +// only auto/all. Unknown semantic bucket names fall back to ProfileAuto +// rather than silently expanding to all edge types. +func TestParseNeighborhoodOptions_ProfileNormalization(t *testing.T) { + cases := []struct { + query string + want topology.Profile + }{ + {"profile=all", topology.ProfileAll}, + {"profile=All", topology.ProfileAll}, // case-insensitive + {"profile=%20%20all%20%20", topology.ProfileAll}, // whitespace trim + {"profile=management", topology.ProfileAuto}, // unsupported bucket → auto + {"profile=networking", topology.ProfileAuto}, // unsupported bucket → auto + {"profile=policy", topology.ProfileAuto}, // unsupported bucket → auto + {"profile=security", topology.ProfileAuto}, // unsupported bucket → auto + {"profile=garbage", topology.ProfileAuto}, // unknown → auto + {"profile=", topology.ProfileAuto}, // empty → auto + {"", topology.ProfileAuto}, // missing → auto + } + for _, c := range cases { + t.Run(c.query, func(t *testing.T) { + r := httptest.NewRequest("GET", "/api/ai/neighborhood/Pod/prod/cart?"+c.query, nil) + opts := parseNeighborhoodOptions(r) + if opts.Profile != c.want { + t.Errorf("profile = %q, want %q", opts.Profile, c.want) + } + }) + } +} diff --git a/internal/server/neighborhood_rbac_test.go b/internal/server/neighborhood_rbac_test.go new file mode 100644 index 000000000..6b95eedc4 --- /dev/null +++ b/internal/server/neighborhood_rbac_test.go @@ -0,0 +1,450 @@ +package server + +import ( + "net/http" + "testing" + + "github.com/skyhook-io/radar/internal/auth" + "github.com/skyhook-io/radar/pkg/topology" +) + +// canReadNeighborhoodNode must apply per-kind Secret RBAC inside an allowed +// namespace. Mirrors the gate that handleGetResource already applies: namespace +// access alone is NOT sufficient — the SA backing the cache may have cluster- +// wide secrets RBAC (Helm release visibility) the calling user does not. If the +// per-kind SAR is missing, Secret nodes would leak through the neighborhood BFS +// to users who can't read them directly. + +func makeSecretNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "secret/" + ns + "/" + name, + Kind: topology.KindSecret, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "v1", + }, + } +} + +func makeConfigMapNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "configmap/" + ns + "/" + name, + Kind: topology.KindConfigMap, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "v1", + }, + } +} + +// TestCanReadNeighborhoodNode_SecretRequiresPerKindRBAC pins the new gate: +// a user with namespace access but no per-namespace `get secrets` SAR must be +// denied. Same setup as the existing handleGetResource secrets RBAC tests. +func TestCanReadNeighborhoodNode_SecretRequiresPerKindRBAC(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + s.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + // alice has namespace access to default but NOT per-namespace secrets-get. + perms := s.permCache.Get("alice") + perms.SetCanI("get", "", "secrets", "default", false) + + r := requestWithUser("GET", "/api/ai/neighborhood/pod/default/anything", &auth.User{ + Username: "alice", + }) + secret := makeSecretNode("default", "nginx-tls") + + if s.canReadNeighborhoodNode(r, secret) { + t.Error("Secret node leaked through neighborhood gate: namespace access without per-kind RBAC must deny") + } +} + +// Counterpart: a user WITH namespace access AND per-namespace secrets-get +// must be allowed. Locks down the positive path so the gate isn't blanket-deny. +func TestCanReadNeighborhoodNode_SecretAllowedWithPerKindRBAC(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + s.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + perms := s.permCache.Get("bob") + perms.SetCanI("get", "", "secrets", "default", true) + + r := requestWithUser("GET", "/api/ai/neighborhood/pod/default/anything", &auth.User{ + Username: "bob", + }) + secret := makeSecretNode("default", "nginx-tls") + + if !s.canReadNeighborhoodNode(r, secret) { + t.Error("authorized user denied: namespace access + per-kind RBAC should pass") + } +} + +// Sanity: non-Secret namespaced kinds (e.g. ConfigMap) ride on the namespace +// gate alone — adding the Secret-specific tightening must not regress that. +func TestCanReadNeighborhoodNode_ConfigMapStaysOnNamespaceGate(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + s.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + // alice has no configmap-specific SAR seeded — namespace access alone + // should pass for ConfigMap because we deliberately do NOT tighten that. + r := requestWithUser("GET", "/api/ai/neighborhood/pod/default/anything", &auth.User{ + Username: "alice", + }) + cm := makeConfigMapNode("default", "nginx-conf") + + if !s.canReadNeighborhoodNode(r, cm) { + t.Error("ConfigMap node denied: namespace access should be sufficient (no per-kind tightening for ConfigMap)") + } +} + +// makeNodeClassNode builds a topology pseudo-kind NodeClass node. The Kind is +// the synthesized topology label ("NodeClass"), not a real K8s resource — the +// actual variants are EC2NodeClass / AKSNodeClass / GCPNodeClass. +func makeNodeClassNode(name string) *topology.Node { + return &topology.Node{ + ID: "nodeclass/" + name, + Kind: topology.KindNodeClass, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + // No namespace — NodeClass is cluster-scoped. + "apiVersion": "karpenter.k8s.aws/v1", + }, + } +} + +func makeKnativeServiceNode(ns, name string) *topology.Node { + return &topology.Node{ + ID: "knativeservice/" + ns + "/" + name, + Kind: topology.KindKnativeService, + Name: name, + Status: topology.StatusHealthy, + Data: map[string]any{ + "namespace": ns, + "apiVersion": "serving.knative.dev/v1", + }, + } +} + +// TestCanReadNeighborhoodNode_NodeClassRequiresPerProviderSAR pins the +// pseudo-kind cluster-scoped fix: NodeClass is a topology-only label that +// ClassifyKindScope doesn't recognize. Without the clusterScopedTopologyKinds +// lookup, NodeClass nodes hit the unclassified+empty-namespace allow branch +// and leak to users without provider-specific RBAC. +func TestCanReadNeighborhoodNode_NodeClassDeniedWithoutSAR(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + // Deny all NodeClass variants. The helper iterates the table; entries + // not in discovery (typical for the test env) are skipped by the + // discovery filter — so the SARs only fire on what discovery has. + // disc=nil in tests → no entries filtered out → all 3 SARs run. + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + s.permCache.Set("alice", perms) + + r := requestWithUser("GET", "/api/ai/neighborhood/nodeclass/_/x", &auth.User{Username: "alice"}) + n := makeNodeClassNode("default-class") + if s.canReadNeighborhoodNode(r, n) { + t.Error("NodeClass pseudo-kind leaked to user without any provider get-SAR") + } +} + +// Counterpart: user with one provider's get-SAR sees NodeClass nodes. Mirrors +// the topology-strip semantics — denial requires ALL discovery-present +// providers to fail; a single allow is sufficient. +func TestCanReadNeighborhoodNode_NodeClassAllowedWithProviderSAR(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + // Bob has EC2 access only — should still pass for NodeClass. + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + s.permCache.Set("bob", perms) + + r := requestWithUser("GET", "/api/ai/neighborhood/nodeclass/_/x", &auth.User{Username: "bob"}) + n := makeNodeClassNode("default-class") + if !s.canReadNeighborhoodNode(r, n) { + t.Error("NodeClass denied for user with EC2 get-SAR — single-provider RBAC must allow") + } +} + +// TestCanReadNeighborhoodNode_NodeClassPerVariantSAR pins the per-variant +// authorization fix: NodeClass has 3 entries in ClusterScopedKinds (EC2 / +// AKS / GCP). Before the fix, the helper iterated ALL entries and returned +// true on the FIRST passing SAR — so a user with EC2 RBAC saw AKS and GCP +// NodeClass nodes too. The fix matches the table row by BOTH Kind and the +// node's apiVersion-group, so an AKS NodeClass node is SARed against the +// AKS row only. +// +// Setup: Bob has EC2 RBAC only. An AKS NodeClass node (apiVersion= +// karpenter.azure.com/v1beta1) must be denied — his EC2 grant must not +// leak across providers. +func TestCanReadNeighborhoodNode_NodeClassPerVariantDeniesWrongProvider(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + s.permCache.Set("bob", perms) + + r := requestWithUser("GET", "/api/ai/neighborhood/nodeclass/_/x", &auth.User{Username: "bob"}) + + // AKS NodeClass node — apiVersion-group is karpenter.azure.com. The + // per-variant lookup must SAR ONLY the AKS row, which is denied. + aks := &topology.Node{ + ID: "nodeclass/aks-default", + Kind: topology.KindNodeClass, + Name: "aks-default", + Status: topology.StatusHealthy, + Data: map[string]any{ + "apiVersion": "karpenter.azure.com/v1beta1", + }, + } + if s.canReadNeighborhoodNode(r, aks) { + t.Error("AKS NodeClass leaked to user with EC2-only RBAC — per-variant gate must deny cross-provider") + } + + // Sanity counterpart: same user, EC2 NodeClass — must allow. + ec2 := &topology.Node{ + ID: "nodeclass/ec2-default", + Kind: topology.KindNodeClass, + Name: "ec2-default", + Status: topology.StatusHealthy, + Data: map[string]any{ + "apiVersion": "karpenter.k8s.aws/v1", + }, + } + if !s.canReadNeighborhoodNode(r, ec2) { + t.Error("EC2 NodeClass denied to user with EC2 RBAC — per-variant gate must allow the matching provider") + } +} + +// KnativeService is a namespaced pseudo-kind. The cluster-scoped table +// shouldn't match it; the helper must fall through to the namespaced branch +// and ride on namespace access alone (no per-kind tightening for Knative). +func TestCanReadNeighborhoodNode_KnativeServiceUsesNamespaceGate(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + s.permCache.Set("alice", &auth.UserPermissions{AllowedNamespaces: []string{"prod"}}) + + r := requestWithUser("GET", "/api/ai/neighborhood/service/prod/api", &auth.User{Username: "alice"}) + n := makeKnativeServiceNode("prod", "api") + if !s.canReadNeighborhoodNode(r, n) { + t.Error("namespaced pseudo-kind KnativeService denied — namespace access should be sufficient") + } + + // Sanity: user without namespace access → denied. + s.permCache.Set("carol", &auth.UserPermissions{AllowedNamespaces: []string{"staging"}}) + r2 := requestWithUser("GET", "/api/ai/neighborhood/service/prod/api", &auth.User{Username: "carol"}) + if s.canReadNeighborhoodNode(r2, n) { + t.Error("KnativeService allowed for user without namespace access — namespace gate must apply") + } +} + +// pseudoKindTuplesForTest is the test analogue of the inline +// topology.RBACTuplesForKind call in handleAINeighborhood — disc=nil in +// tests, so no rows are filtered, exactly matching the production path with +// discovery offline. +func pseudoKindTuplesForTest(kind, group string) (tuples []topology.SARTuple, fallthroughAllow bool) { + t, _, fa := topology.RBACTuplesForKind(kind, group, nil) + return t, fa +} + +// TestAllowPseudoKindTuples_NodeClass pins the root-preflight helper: a user +// without ANY provider get-SAR for NodeClass must be denied, while a user +// with EC2 RBAC must be allowed (single-provider grant is sufficient, +// matching the per-node gate). This is the kind-only variant used at root +// preflight before the topology has resolved a concrete node with an +// apiVersion — so we iterate every table row under that kind and allow on +// any pass. +func TestAllowPseudoKindTuples_NodeClass_DeniedWithoutSAR(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + s.permCache.Set("alice", perms) + + r := requestWithUser("GET", "/api/ai/neighborhood/nodeclass/_/x", &auth.User{Username: "alice"}) + tuples, fallthroughAllow := pseudoKindTuplesForTest("nodeclass", "") + if len(tuples) == 0 { + t.Fatal("RBACTuplesForKind returned 0 tuples for nodeclass — table wiring is broken") + } + if s.allowPseudoKindTuples(r, tuples, fallthroughAllow) { + t.Error("nodeclass root preflight allowed user without any provider get-SAR — must deny") + } +} + +func TestAllowPseudoKindTuples_NodeClass_AllowedWithProviderSAR(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + perms := &auth.UserPermissions{AllowedNamespaces: nil} + // Bob has EC2 RBAC only. Root preflight without a known provider group + // must still allow — the per-node Allow gate will then drop AKS/GCP + // variants. This mirrors topology-strip semantics: a single provider + // grant is sufficient for the kind-level gate. + perms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + perms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + perms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + s.permCache.Set("bob", perms) + + r := requestWithUser("GET", "/api/ai/neighborhood/nodeclass/_/x", &auth.User{Username: "bob"}) + tuples, fallthroughAllow := pseudoKindTuplesForTest("nodeclass", "") + if !s.allowPseudoKindTuples(r, tuples, fallthroughAllow) { + t.Error("nodeclass root preflight denied user with EC2 get-SAR — single-provider RBAC must pass") + } +} + +// Same shape for NodePool, which has a SINGLE row in the table (karpenter.sh). +// Pin that the kind-only path works for single-entry pseudo-kinds too. +func TestAllowPseudoKindTuples_NodePool(t *testing.T) { + s := newAuthServer(auth.Config{Mode: "proxy"}) + denyPerms := &auth.UserPermissions{AllowedNamespaces: nil} + denyPerms.SetCanI("get", "karpenter.sh", "nodepools", "", false) + s.permCache.Set("alice", denyPerms) + + allowPerms := &auth.UserPermissions{AllowedNamespaces: nil} + allowPerms.SetCanI("get", "karpenter.sh", "nodepools", "", true) + s.permCache.Set("bob", allowPerms) + + tuples, fallthroughAllow := pseudoKindTuplesForTest("nodepool", "") + if len(tuples) != 1 { + t.Fatalf("expected 1 nodepool tuple, got %d", len(tuples)) + } + rDeny := requestWithUser("GET", "/api/ai/neighborhood/nodepool/_/x", &auth.User{Username: "alice"}) + if s.allowPseudoKindTuples(rDeny, tuples, fallthroughAllow) { + t.Error("nodepool root preflight allowed user without karpenter.sh/nodepools get-SAR") + } + rAllow := requestWithUser("GET", "/api/ai/neighborhood/nodepool/_/x", &auth.User{Username: "bob"}) + if !s.allowPseudoKindTuples(rAllow, tuples, fallthroughAllow) { + t.Error("nodepool root preflight denied user WITH karpenter.sh/nodepools get-SAR") + } +} + +// TestNeighborhood_NodeClassRootPreflightNotBadRequest is the integration +// pin: the URL /api/ai/neighborhood/nodeclass/_/foo must NOT return 400 +// "namespace is required" — that's the regression we're fixing. +// +// Two assertions: +// - Unauthorized user (no provider get-SAR): preflight must reject as 403, +// not 400. +// - Authorized user (EC2 get-SAR): preflight passes, BFS runs against the +// test cache (which has no NodeClass nodes), root lookup misses → 404, +// also not 400. Both prove the preflight is no longer rejecting on +// namespace. +func TestNeighborhood_NodeClassRootPreflightNotBadRequest(t *testing.T) { + env := newAuthTestServer(t) + + // Unauthorized: all provider SARs denied → 403, not 400. + denyPerms := &auth.UserPermissions{AllowedNamespaces: nil} + denyPerms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", false) + denyPerms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + denyPerms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + env.srv.permCache.Set("alice", denyPerms) + + resp := env.authGet(t, "/api/ai/neighborhood/nodeclass/_/foo", "alice", "") + resp.Body.Close() + if resp.StatusCode == http.StatusBadRequest { + t.Errorf("nodeclass root preflight returned 400 — regression: pseudo-kind classified as namespaced") + } + if resp.StatusCode != http.StatusForbidden { + t.Errorf("unauthorized user got %d for nodeclass — want 403 (cluster-scoped pseudo-kind gate)", resp.StatusCode) + } + + // Authorized: EC2 get-SAR allowed → preflight passes; BFS finds no + // NodeClass node in the seeded cache → 404, not 400. + allowPerms := &auth.UserPermissions{AllowedNamespaces: nil} + allowPerms.SetCanI("get", "karpenter.k8s.aws", "ec2nodeclasses", "", true) + allowPerms.SetCanI("get", "karpenter.azure.com", "aksnodeclasses", "", false) + allowPerms.SetCanI("get", "karpenter.k8s.gcp", "gcpnodeclasses", "", false) + env.srv.permCache.Set("bob", allowPerms) + + resp2 := env.authGet(t, "/api/ai/neighborhood/nodeclass/_/foo", "bob", "") + resp2.Body.Close() + if resp2.StatusCode == http.StatusBadRequest { + t.Errorf("nodeclass root preflight returned 400 even with EC2 SAR — regression") + } + if resp2.StatusCode != http.StatusNotFound { + t.Errorf("authorized user got %d for nonexistent nodeclass — want 404 (preflight passed, BFS empty)", resp2.StatusCode) + } +} + +// Same for NodePool — sanity that the fix covers all pseudo-kinds the +// agent might request, not just NodeClass. +func TestNeighborhood_NodePoolRootPreflightNotBadRequest(t *testing.T) { + env := newAuthTestServer(t) + + denyPerms := &auth.UserPermissions{AllowedNamespaces: nil} + denyPerms.SetCanI("get", "karpenter.sh", "nodepools", "", false) + env.srv.permCache.Set("alice", denyPerms) + resp := env.authGet(t, "/api/ai/neighborhood/nodepool/_/foo", "alice", "") + resp.Body.Close() + if resp.StatusCode == http.StatusBadRequest { + t.Errorf("nodepool root preflight returned 400 — pseudo-kind misclassified as namespaced") + } + if resp.StatusCode != http.StatusForbidden { + t.Errorf("unauthorized user got %d for nodepool — want 403", resp.StatusCode) + } + + allowPerms := &auth.UserPermissions{AllowedNamespaces: nil} + allowPerms.SetCanI("get", "karpenter.sh", "nodepools", "", true) + env.srv.permCache.Set("bob", allowPerms) + resp2 := env.authGet(t, "/api/ai/neighborhood/nodepool/_/foo", "bob", "") + resp2.Body.Close() + if resp2.StatusCode == http.StatusBadRequest { + t.Errorf("nodepool root preflight returned 400 even with karpenter.sh SAR") + } + if resp2.StatusCode != http.StatusNotFound { + t.Errorf("authorized user got %d for nonexistent nodepool — want 404", resp2.StatusCode) + } +} + +// TestNeighborhood_SecretRootIncluded pins the IncludeSecrets fix at the +// REST handler boundary. DefaultBuildOptions sets IncludeSecrets=false, so +// Secret nodes don't enter the topology and root lookup for kind=secret +// always returns "resource not found in topology" → 404, even for users +// authorized to read the Secret. The handler must override +// IncludeSecrets=true so authorized callers find the root, while the +// per-namespace `get secrets` Allow gate still 404s unauthorized callers +// via the empty-subgraph path. +// +// The seeded fake client (server_smoke_test.go TestMain) has a Secret +// "nginx-tls" in "default". This test depends on that seed. +func TestNeighborhood_SecretRootIncluded(t *testing.T) { + env := newAuthTestServer(t) + + // Authorized user: namespace access to default + per-namespace + // `get secrets` SAR. Both Allow checks (namespace + per-kind Secret) + // must pass. + env.srv.permCache.Set("bob", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + bobPerms := env.srv.permCache.Get("bob") + bobPerms.SetCanI("get", "", "secrets", "default", true) + + resp := env.authGet(t, "/api/ai/neighborhood/secret/default/nginx-tls", "bob", "") + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Errorf("authorized user got %d for Secret root — IncludeSecrets override must surface Secret nodes in topology", resp.StatusCode) + } + + // Unauthorized user: namespace access to default but NO per-namespace + // `get secrets` SAR. Allow rejects the root → empty subgraph → 404. + // Existence-hiding preserved: same 404 the IncludeSecrets=false path + // produced, but now driven by RBAC instead of topology elision. + env.srv.permCache.Set("alice", &auth.UserPermissions{ + AllowedNamespaces: []string{"default"}, + }) + alicePerms := env.srv.permCache.Get("alice") + alicePerms.SetCanI("get", "", "secrets", "default", false) + + resp2 := env.authGet(t, "/api/ai/neighborhood/secret/default/nginx-tls", "alice", "") + resp2.Body.Close() + if resp2.StatusCode != http.StatusNotFound { + t.Errorf("unauthorized user got %d for Secret root — Allow gate must produce 404 via empty subgraph (existence-hiding)", resp2.StatusCode) + } +} diff --git a/internal/server/server.go b/internal/server/server.go index 76923e0dd..055be36e2 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -378,6 +378,7 @@ func (s *Server) setupRoutes() { r.Use(aiAgentLogMiddleware) r.Get("/ai/resources/{kind}", s.handleAIListResources) r.Get("/ai/resources/{kind}/{namespace}/{name}", s.handleAIGetResource) + r.Get("/ai/neighborhood/{kind}/{namespace}/{name}", s.handleAINeighborhood) }) // Debug routes (for event pipeline diagnostics) @@ -805,43 +806,11 @@ func (s *Server) filterNamespacesByCanRead(r *http.Request, group, resource, ver return out } -// clusterScopedTopologyKinds maps topology NodeKinds for cluster-scoped -// resources to the (group, resource) tuple a SAR needs. Mirrors the MCP -// table — kept in sync manually since each side gates with its own helper. -// -// This is a denylist: it must enumerate every cluster-scoped kind the -// topology builder creates. Drift here = silent leak. The follow-up plan -// (per-node GVR/scope metadata + scope-driven strip) removes the central -// table; until then, the checklist comment on NodeKind in -// pkg/topology/types.go is the maintenance signal for new kinds. -// -// KindNamespace is intentionally excluded — handled by per-user filter -// upstream. KindNodeClass has multiple entries (one per cloud provider) -// because the topology builder iterates EC2NodeClass / AKSNodeClass / -// GCPNodeClass under the same NodeKind label; a denial on any provider -// strips all NodeClass nodes. canRead's unknown-kind passthrough makes -// providers absent from the cluster's discovery non-blocking. -var clusterScopedTopologyKinds = []struct { - kind topology.NodeKind - group string - resource string -}{ - {topology.KindNode, "", "nodes"}, - {topology.KindNodePool, "karpenter.sh", "nodepools"}, - {topology.KindNodeClaim, "karpenter.sh", "nodeclaims"}, - {topology.KindNodeClass, "karpenter.k8s.aws", "ec2nodeclasses"}, - {topology.KindNodeClass, "karpenter.azure.com", "aksnodeclasses"}, - {topology.KindNodeClass, "karpenter.k8s.gcp", "gcpnodeclasses"}, - {topology.KindGatewayClass, "gateway.networking.k8s.io", "gatewayclasses"}, - {topology.KindPV, "", "persistentvolumes"}, - {topology.KindStorageClass, "storage.k8s.io", "storageclasses"}, - {topology.KindCiliumClusterwideNetworkPolicy, "cilium.io", "ciliumclusterwidenetworkpolicies"}, - {topology.KindClusterNetworkPolicy, "policy.networking.k8s.io", "clusternetworkpolicies"}, -} - // deniedClusterScopedTopoKinds returns the set of cluster-scoped topology -// NodeKinds the calling user cannot list. Reuses canRead's per-user canI -// cache so subsequent topology calls within the TTL don't re-SAR. +// NodeKinds the calling user cannot list. Walks topology.ClusterScopedKinds +// (centralized table — see pkg/topology/cluster_scoped_kinds.go). Reuses +// canRead's per-user canI cache so subsequent topology calls within the +// TTL don't re-SAR. // // Skips CRDs not present in discovery (e.g. AKSNodeClass on an EKS cluster): // SARing a non-existent resource returns false because no RBAC rule covers @@ -851,14 +820,14 @@ var clusterScopedTopologyKinds = []struct { func (s *Server) deniedClusterScopedTopoKinds(r *http.Request) map[topology.NodeKind]bool { deny := make(map[topology.NodeKind]bool) disc := k8s.GetResourceDiscovery() - for _, ck := range clusterScopedTopologyKinds { - if ck.group != "" && disc != nil { - if _, ok := disc.GetResourceWithGroup(ck.resource, ck.group); !ok { + for _, ck := range topology.ClusterScopedKinds { + if ck.Group != "" && disc != nil { + if _, ok := disc.GetResourceWithGroup(ck.Resource, ck.Group); !ok { continue } } - if !s.canRead(r, ck.group, ck.resource, "", "list") { - deny[ck.kind] = true + if !s.canRead(r, ck.Group, ck.Resource, "", "list") { + deny[ck.Kind] = true } } return deny diff --git a/internal/server/server_smoke_test.go b/internal/server/server_smoke_test.go index 51da8590a..d0cb0d054 100644 --- a/internal/server/server_smoke_test.go +++ b/internal/server/server_smoke_test.go @@ -47,7 +47,20 @@ func TestMain(m *testing.M) { }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "nginx"}}, - Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: "nginx", Image: "nginx:1.25"}}}, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "nginx", Image: "nginx:1.25"}}, + // Reference nginx-tls so the topology builder includes + // the Secret node when IncludeSecrets=true. The + // neighborhood handler's Secret-root tests depend on + // this — without a reference the Secret would be + // elided regardless of IncludeSecrets. + Volumes: []corev1.Volume{{ + Name: "tls", + VolumeSource: corev1.VolumeSource{ + Secret: &corev1.SecretVolumeSource{SecretName: "nginx-tls"}, + }, + }}, + }, }, }, Status: appsv1.DeploymentStatus{ diff --git a/pkg/resourcecontext/types.go b/pkg/resourcecontext/types.go index ce229a84f..15f5cfee7 100644 --- a/pkg/resourcecontext/types.go +++ b/pkg/resourcecontext/types.go @@ -37,7 +37,6 @@ type ResourceContext struct { PolicySummary *PolicySummary `json:"policySummary,omitempty"` Hints []string `json:"hints,omitempty"` Omitted []OmittedField `json:"omitted,omitempty"` - Truncated bool `json:"truncated,omitempty"` } // ContextTier signals how much enrichment is included. "basic" is the @@ -52,51 +51,21 @@ const ( // ContextRef is a typed pointer to another Kubernetes object that the // subject relates to. Group is omitted for core/v1 kinds; Namespace is -// omitted for cluster-scoped objects. +// omitted for cluster-scoped objects. The structural reason for the link +// is implied by the parent field name (selectedBy → selector match, +// runsOn → node binding, etc.) rather than re-encoded per-ref. type ContextRef struct { - Kind string `json:"kind"` - Group string `json:"group,omitempty"` - Namespace string `json:"namespace,omitempty"` - Name string `json:"name"` - Reason RefReason `json:"reason,omitempty"` - Source RefSource `json:"source,omitempty"` - Confidence string `json:"confidence,omitempty"` // reserved; not populated in v1 + Kind string `json:"kind"` + Group string `json:"group,omitempty"` + Namespace string `json:"namespace,omitempty"` + Name string `json:"name"` } -// RefReason describes WHY a ContextRef is being emitted — the structural -// link between subject and target. -type RefReason string - -const ( - ReasonOwnerReference RefReason = "owner_reference" - ReasonLabelSelector RefReason = "label_selector" - ReasonPodSelector RefReason = "pod_selector_match" - ReasonPolicyReportSubj RefReason = "policy_report_subject" - ReasonVolumeMount RefReason = "volume_mount_ref" - ReasonEnvVarRef RefReason = "env_var_ref" - ReasonScaleTargetRef RefReason = "scale_target_ref" - ReasonClaimRef RefReason = "claim_ref" - ReasonNodeName RefReason = "node_name" - ReasonSAName RefReason = "service_account_name" -) - -// RefSource describes WHERE the link came from — which subsystem produced -// the ref. Useful for debugging and for filtering on the agent side. -type RefSource string - -const ( - SourceTopology RefSource = "topology" - SourceOwnerChain RefSource = "owner_chain" - SourcePolicyReport RefSource = "policy_report" - SourceAuditEngine RefSource = "audit_engine" - SourceK8sSpec RefSource = "k8s_spec" -) - // ManagedByRef is the compact form of a "managed-by" pointer used in // SummaryContext (list/search rows). Carries Kind alongside Source so // consumers can distinguish e.g. a Flux Kustomization from a Flux // HelmRelease without re-parsing the Source string. Intentionally lacks -// Group, Reason, and Confidence to keep per-row bytes minimal. +// Group to keep per-row bytes minimal. type ManagedByRef struct { Kind string `json:"kind"` // "Application" | "Kustomization" | "HelmRelease" | "Deployment" | "DaemonSet" | "StatefulSet" | "Rollout" | … Source string `json:"source"` // "argocd" | "flux" | "helm" | "native" @@ -177,10 +146,8 @@ type OmittedField struct { type OmittedReason string const ( - OmittedRBACDenied OmittedReason = "rbac_denied" - OmittedBudgetExceeded OmittedReason = "budget_exceeded" - OmittedCacheCold OmittedReason = "cache_cold" - OmittedNotInstalled OmittedReason = "not_installed" - OmittedKindUnsupported OmittedReason = "kind_unsupported" - OmittedProviderDisabled OmittedReason = "provider_disabled" + OmittedRBACDenied OmittedReason = "rbac_denied" + OmittedBudgetExceeded OmittedReason = "budget_exceeded" + OmittedCacheCold OmittedReason = "cache_cold" + OmittedNotInstalled OmittedReason = "not_installed" ) diff --git a/pkg/resourcecontext/types_test.go b/pkg/resourcecontext/types_test.go index 799579a85..2e891594c 100644 --- a/pkg/resourcecontext/types_test.go +++ b/pkg/resourcecontext/types_test.go @@ -7,96 +7,6 @@ import ( "testing" ) -// TestRefReasonMarshalsSnakeCase asserts every RefReason constant -// marshals to its locked snake_case literal — the wire contract. -func TestRefReasonMarshalsSnakeCase(t *testing.T) { - cases := map[RefReason]string{ - ReasonOwnerReference: "owner_reference", - ReasonLabelSelector: "label_selector", - ReasonPodSelector: "pod_selector_match", - ReasonPolicyReportSubj: "policy_report_subject", - ReasonVolumeMount: "volume_mount_ref", - ReasonEnvVarRef: "env_var_ref", - ReasonScaleTargetRef: "scale_target_ref", - ReasonClaimRef: "claim_ref", - ReasonNodeName: "node_name", - ReasonSAName: "service_account_name", - } - for c, want := range cases { - got, err := json.Marshal(c) - if err != nil { - t.Fatalf("marshal RefReason %q: %v", c, err) - } - if string(got) != `"`+want+`"` { - t.Errorf("RefReason marshal: got %s, want %q", got, want) - } - } -} - -// TestRefReasonUnmarshalsSnakeCase asserts each locked snake_case literal -// round-trips into the matching constant. -func TestRefReasonUnmarshalsSnakeCase(t *testing.T) { - cases := map[string]RefReason{ - "owner_reference": ReasonOwnerReference, - "label_selector": ReasonLabelSelector, - "pod_selector_match": ReasonPodSelector, - "policy_report_subject": ReasonPolicyReportSubj, - "volume_mount_ref": ReasonVolumeMount, - "env_var_ref": ReasonEnvVarRef, - "scale_target_ref": ReasonScaleTargetRef, - "claim_ref": ReasonClaimRef, - "node_name": ReasonNodeName, - "service_account_name": ReasonSAName, - } - for in, want := range cases { - var got RefReason - if err := json.Unmarshal([]byte(`"`+in+`"`), &got); err != nil { - t.Fatalf("unmarshal RefReason %q: %v", in, err) - } - if got != want { - t.Errorf("RefReason unmarshal %q: got %q, want %q", in, got, want) - } - } -} - -func TestRefSourceMarshalsSnakeCase(t *testing.T) { - cases := map[RefSource]string{ - SourceTopology: "topology", - SourceOwnerChain: "owner_chain", - SourcePolicyReport: "policy_report", - SourceAuditEngine: "audit_engine", - SourceK8sSpec: "k8s_spec", - } - for c, want := range cases { - got, err := json.Marshal(c) - if err != nil { - t.Fatalf("marshal RefSource %q: %v", c, err) - } - if string(got) != `"`+want+`"` { - t.Errorf("RefSource marshal: got %s, want %q", got, want) - } - } -} - -func TestRefSourceUnmarshalsSnakeCase(t *testing.T) { - cases := map[string]RefSource{ - "topology": SourceTopology, - "owner_chain": SourceOwnerChain, - "policy_report": SourcePolicyReport, - "audit_engine": SourceAuditEngine, - "k8s_spec": SourceK8sSpec, - } - for in, want := range cases { - var got RefSource - if err := json.Unmarshal([]byte(`"`+in+`"`), &got); err != nil { - t.Fatalf("unmarshal RefSource %q: %v", in, err) - } - if got != want { - t.Errorf("RefSource unmarshal %q: got %q, want %q", in, got, want) - } - } -} - func TestContextTierMarshalsSnakeCase(t *testing.T) { cases := map[ContextTier]string{ TierBasic: "basic", @@ -131,12 +41,10 @@ func TestContextTierUnmarshalsSnakeCase(t *testing.T) { func TestOmittedReasonMarshalsSnakeCase(t *testing.T) { cases := map[OmittedReason]string{ - OmittedRBACDenied: "rbac_denied", - OmittedBudgetExceeded: "budget_exceeded", - OmittedCacheCold: "cache_cold", - OmittedNotInstalled: "not_installed", - OmittedKindUnsupported: "kind_unsupported", - OmittedProviderDisabled: "provider_disabled", + OmittedRBACDenied: "rbac_denied", + OmittedBudgetExceeded: "budget_exceeded", + OmittedCacheCold: "cache_cold", + OmittedNotInstalled: "not_installed", } for c, want := range cases { got, err := json.Marshal(c) @@ -151,12 +59,10 @@ func TestOmittedReasonMarshalsSnakeCase(t *testing.T) { func TestOmittedReasonUnmarshalsSnakeCase(t *testing.T) { cases := map[string]OmittedReason{ - "rbac_denied": OmittedRBACDenied, - "budget_exceeded": OmittedBudgetExceeded, - "cache_cold": OmittedCacheCold, - "not_installed": OmittedNotInstalled, - "kind_unsupported": OmittedKindUnsupported, - "provider_disabled": OmittedProviderDisabled, + "rbac_denied": OmittedRBACDenied, + "budget_exceeded": OmittedBudgetExceeded, + "cache_cold": OmittedCacheCold, + "not_installed": OmittedNotInstalled, } for in, want := range cases { var got OmittedReason @@ -201,7 +107,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { PolicySummary: &PolicySummary{}, Hints: []string{"hint"}, Omitted: []OmittedField{{Field: "selectedBy", Reason: OmittedRBACDenied}}, - Truncated: true, } b, err := json.Marshal(ac) if err != nil { @@ -221,7 +126,6 @@ func TestResourceContextFieldOrdering(t *testing.T) { `"policySummary"`, `"hints"`, `"omitted"`, - `"truncated"`, } prev := -1 for _, key := range wantOrder { @@ -247,40 +151,29 @@ func TestResourceContextRoundTrip(t *testing.T) { Group: "apps", Namespace: "prod", Name: "api", - Reason: ReasonOwnerReference, - Source: SourceOwnerChain, }}, Exposes: []ContextRef{{ Kind: "Service", Namespace: "prod", Name: "api", - Reason: ReasonLabelSelector, - Source: SourceTopology, }}, SelectedBy: []ContextRef{{ Kind: "NetworkPolicy", Group: "networking.k8s.io", Namespace: "prod", Name: "default-deny", - Reason: ReasonPodSelector, - Source: SourceTopology, }}, Uses: &UsesBlock{ - ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "api-config", Reason: ReasonEnvVarRef, Source: SourceK8sSpec}}, - Secrets: []ContextRef{{Kind: "Secret", Name: "api-creds", Reason: ReasonVolumeMount, Source: SourceK8sSpec}}, - ServiceAccount: &ContextRef{ - Kind: "ServiceAccount", Name: "api-sa", - Reason: ReasonSAName, Source: SourceK8sSpec, - }, - PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data", Reason: ReasonClaimRef, Source: SourceK8sSpec}}, + ConfigMaps: []ContextRef{{Kind: "ConfigMap", Name: "api-config"}}, + Secrets: []ContextRef{{Kind: "Secret", Name: "api-creds"}}, + ServiceAccount: &ContextRef{Kind: "ServiceAccount", Name: "api-sa"}, + PVCs: []ContextRef{{Kind: "PersistentVolumeClaim", Name: "data"}}, }, - RunsOn: &ContextRef{Kind: "Node", Name: "node-1", Reason: ReasonNodeName, Source: SourceK8sSpec}, + RunsOn: &ContextRef{Kind: "Node", Name: "node-1"}, ScaledBy: []ContextRef{{ - Kind: "HorizontalPodAutoscaler", - Group: "autoscaling", - Name: "api-hpa", - Reason: ReasonScaleTargetRef, - Source: SourceTopology, + Kind: "HorizontalPodAutoscaler", + Group: "autoscaling", + Name: "api-hpa", }}, IssueSummary: &IssueSummary{ Count: 3, @@ -309,7 +202,6 @@ func TestResourceContextRoundTrip(t *testing.T) { {Field: "selectedBy.networkPolicies", Reason: OmittedRBACDenied}, {Field: "policySummary.kyverno", Reason: OmittedNotInstalled}, }, - Truncated: true, } b, err := json.Marshal(orig) @@ -355,7 +247,7 @@ func TestSummaryContextRoundTrip(t *testing.T) { t.Errorf("SummaryContext JSON missing %s: %s", sub, s) } } - for _, forbidden := range []string{`"group"`, `"reason"`, `"confidence"`} { + for _, forbidden := range []string{`"group"`} { if strings.Contains(s, forbidden) { t.Errorf("SummaryContext JSON leaks %s: %s", forbidden, s) } @@ -382,15 +274,3 @@ func TestManagedByRefDistinguishesFluxKinds(t *testing.T) { } } -// TestConfidenceOmittedWhenEmpty pins the contract that the reserved -// Confidence field is dropped from the wire when not populated. -func TestConfidenceOmittedWhenEmpty(t *testing.T) { - ref := ContextRef{Kind: "Pod", Name: "p"} - b, err := json.Marshal(ref) - if err != nil { - t.Fatalf("marshal: %v", err) - } - if strings.Contains(string(b), "confidence") { - t.Errorf("expected confidence omitted when empty, got %s", b) - } -} diff --git a/pkg/topology/cluster_scoped_kinds.go b/pkg/topology/cluster_scoped_kinds.go new file mode 100644 index 000000000..49852271f --- /dev/null +++ b/pkg/topology/cluster_scoped_kinds.go @@ -0,0 +1,266 @@ +package topology + +import ( + "strings" + + k8score "github.com/skyhook-io/radar/pkg/k8score" +) + +// ClusterScopedKindEntry maps a topology NodeKind for a cluster-scoped +// resource to the (group, resource) tuple a SubjectAccessReview needs to +// authorize a list/get. +type ClusterScopedKindEntry struct { + Kind NodeKind + Group string + Resource string +} + +// ClusterScopedKinds is the central denylist of cluster-scoped kinds the +// topology builder synthesizes. Both the topology-strip path (REST + MCP) +// and the neighborhood per-node RBAC gate iterate this table; centralising +// it removes the historical "update BOTH internal/server and internal/mcp" +// drift hazard that the checklist on NodeKind warned about. +// +// KindNamespace is intentionally excluded — handled by per-user filter +// upstream. KindNodeClass has one entry per cloud provider (EC2 / AKS / +// GCP) because the topology builder iterates them under the same NodeKind +// label; callers should treat ALL three as a single SAR-anchored gate so +// providers absent from cluster discovery don't over-deny. +var ClusterScopedKinds = []ClusterScopedKindEntry{ + {KindNode, "", "nodes"}, + {KindNodePool, "karpenter.sh", "nodepools"}, + {KindNodeClaim, "karpenter.sh", "nodeclaims"}, + {KindNodeClass, "karpenter.k8s.aws", "ec2nodeclasses"}, + {KindNodeClass, "karpenter.azure.com", "aksnodeclasses"}, + {KindNodeClass, "karpenter.k8s.gcp", "gcpnodeclasses"}, + {KindGatewayClass, "gateway.networking.k8s.io", "gatewayclasses"}, + {KindPV, "", "persistentvolumes"}, + {KindStorageClass, "storage.k8s.io", "storageclasses"}, + {KindCiliumClusterwideNetworkPolicy, "cilium.io", "ciliumclusterwidenetworkpolicies"}, + {KindClusterNetworkPolicy, "policy.networking.k8s.io", "clusternetworkpolicies"}, +} + +// SARTuple is a (group, resource, namespace) triple suitable for handing to +// a SubjectAccessReview. Used as the return type of the neighborhood RBAC +// tuple-selection helpers below so REST + MCP can iterate the same set of +// candidates through their respective per-user check function without +// duplicating the table-walking + discovery-filter logic. +type SARTuple struct { + Group string + Resource string + Namespace string +} + +// PseudoKindDiscoveryLookup is the minimal discovery surface +// RBACTuplesForKind uses to drop pseudo-kind variants whose CRD has been +// removed mid-build (or never installed). Pass nil to skip the filter — the +// helper then returns every (kind, group)-matched row unfiltered, matching +// the over-include-on-missing-discovery behavior the REST + MCP per-node +// gates already used before this extraction. +// +// A function type (rather than an interface) sidesteps the Go nil-interface +// gotcha: callers commonly write `disc := k8s.GetResourceDiscovery()` which +// can return a typed-nil pointer that lifts into a non-nil interface — the +// in-helper `disc != nil` check then passes but the method call panics on +// the autogenerated forwarder for the embedded discovery pointer. With a +// function type, callers can pass the bound method (`disc.GetResourceWithGroup`) +// when the pointer is concrete and nil when it isn't, with a single `if`. +type PseudoKindDiscoveryLookup func(resource, group string) (k8score.APIResource, bool) + +// RBACTuplesForKind returns the SAR tuples authorizing a read of the given +// cluster-scoped topology pseudo-kind. Centralises the table-walking + +// discovery-filter logic that previously lived in four parallel helpers +// (canReadClusterScopedTopoKind{,MCP} + canReadClusterScopedTopoKindByName{,MCP}). +// +// `group` semantics: +// - Non-empty (per-node path: pass the node's apiVersion group) → returns +// ONLY the row matching that group. For pseudo-kinds like NodeClass this +// disambiguates EC2 / AKS / GCP variants so a user with one provider's +// RBAC doesn't pick up another provider's nodes. +// - Empty (root-preflight path: caller has URL kind only) → returns ALL +// rows under the kind so caller can SAR each and allow on first pass. +// +// Return semantics — three states, in order of caller decision: +// +// - tracked=false: kind isn't in ClusterScopedKinds. Caller should fall +// back to k8s.ClassifyKindScope (the regular static-catalogue + +// discovery path). Both surfaces already do this. +// - tracked=true + len(tuples) > 0: caller iterates tuples through its +// protocol-specific per-user check (REST: s.canRead; MCP: +// canReadInNamespace) and ALLOWS on the first pass. +// - tracked=true + len(tuples) == 0 + fallthroughAllow=true: every +// candidate row was eliminated by the discovery filter (CRD removed +// mid-build, or no provider variant installed on this cluster). Caller +// should ALLOW — matches the pre-existing "over-include rather than +// silently hide a node the cluster admin can see" behavior the per-node +// and by-name helpers both used. +// - tracked=true + len(tuples) == 0 + fallthroughAllow=false: kind IS +// tracked but no row matched the supplied group. Caller should DENY — +// this is the per-variant safety the helpers exist to enforce. Only +// possible when group is non-empty (root-preflight passes empty group +// and gets every row). +// +// Discovery filter: rows with a non-empty Group whose (Resource, Group) is +// MISSING from disc are skipped. Rows with empty Group (core/v1 kinds like +// Node, PV) are kept regardless — those don't ride on CRD discovery. nil +// disc disables the filter and returns every matching row. +// +// The returned slice is a fresh allocation — callers may mutate. +func RBACTuplesForKind(kind, group string, disc PseudoKindDiscoveryLookup) (tuples []SARTuple, tracked bool, fallthroughAllow bool) { + // Resolve pseudo-kinds (e.g. KnativeService → ServingService) so the + // caller-supplied (kind, group) lines up with the ClusterScopedKinds + // table's Pascal-cased rows. ClusterScopedKinds contains only + // cluster-scoped entries; namespaced pseudo-kinds (KnativeService, + // CAPICluster, …) correctly fail the lookup and the caller falls back. + resolved := pseudoKindFor(kind, group) + + // tracked := the kind appears in ClusterScopedKinds at all. We need this + // signal separately from "matched a row under the supplied group" so the + // per-variant deny case (tracked + supplied group doesn't match any row) + // is distinguishable from the not-tracked case (caller should fall + // through to ClassifyKindScope). + for _, ck := range ClusterScopedKinds { + if strings.EqualFold(string(ck.Kind), resolved) { + tracked = true + break + } + } + if !tracked { + return nil, false, false + } + + // Caller-supplied group selects a specific variant; empty group returns + // every variant under the kind. + for _, ck := range ClusterScopedKinds { + if !strings.EqualFold(string(ck.Kind), resolved) { + continue + } + if group != "" && ck.Group != group { + continue + } + if ck.Group != "" && disc != nil { + if _, ok := disc(ck.Resource, ck.Group); !ok { + continue + } + } + tuples = append(tuples, SARTuple{Group: ck.Group, Resource: ck.Resource, Namespace: ""}) + } + if len(tuples) > 0 { + return tuples, true, false + } + + // Tracked + zero tuples: distinguish "every row was filtered by + // discovery" (fallthroughAllow=true) from "no row matched supplied + // group" (fallthroughAllow=false, per-variant deny). + if group == "" { + // Empty group means we walked every row under the kind; if none + // survived the filter, the disc lookup dropped them all → fallthrough-allow. + return nil, true, true + } + // group != "" → re-walk WITHOUT the disc filter. If a row exists + // pre-filter, the filter ate it → fallthrough-allow. Else no row + // matched group → per-variant deny. + for _, ck := range ClusterScopedKinds { + if !strings.EqualFold(string(ck.Kind), resolved) { + continue + } + if ck.Group == group { + // Pre-filter row exists for the requested group → disc filter + // removed it → over-include the same way the existing per-node + // gate did when its single targeted row failed discovery. + return nil, true, true + } + } + // No row exists for (kind, group) — deny. + return nil, true, false +} + +// NodeRBACDecision is the verdict RBACTuplesForNode returns to the caller. +// Four states cover every code path the per-node gate used to handle inline; +// the caller dispatches on Decision and (when applicable) iterates Tuples +// through its protocol-specific per-user check. +type NodeRBACDecision int + +const ( + // NodeRBACAllow: the caller should ALLOW the read without testing any + // SAR tuple. Tuples is empty. Cases: + // - Namespaced node whose namespace-list gate is a sufficient signal + // (ordinary kinds — Pod, ConfigMap, …). + // - Cluster-scoped pseudo-kind whose only matching variant was filtered + // out by discovery (CRD removed mid-build / not installed). + NodeRBACAllow NodeRBACDecision = iota + + // NodeRBACDeny: the caller should DENY the read without testing any + // SAR tuple. Tuples is empty. Single case today: + // - Cluster-scoped pseudo-kind whose node carries an apiVersion-group + // not in the per-variant table — over-deny rather than leak an + // unmapped variant. + NodeRBACDeny + + // NodeRBACCheckTuples: the caller should iterate Tuples through its + // per-user SAR function and allow on the FIRST passing tuple. Cases: + // - Namespaced Secret node (Tuples = [{"", "secrets", ns}]). + // - Cluster-scoped pseudo-kind node with a matching variant row + // (Tuples = [{table.Group, table.Resource, ""}]). + NodeRBACCheckTuples + + // NodeRBACConsultClassifyKindScope: the caller should fall back to the + // regular k8s.ClassifyKindScope path. Tuples is empty. Single case: + // - Cluster-scoped node that is NOT a tracked pseudo-kind. The + // helper deliberately doesn't call into the k8s package (which is + // internal and would create an import cycle); the caller already + // has ClassifyKindScope in scope. + NodeRBACConsultClassifyKindScope +) + +// RBACTuplesForNode is the per-node equivalent of RBACTuplesForKind. Given +// a topology node, classifies the read into one of four decisions and (when +// applicable) returns the SAR tuples to test. Mirrors the now-collapsed +// canReadNeighborhoodNode{,MCP} table-selection logic so REST + MCP share +// one source of truth. +// +// CALLER CONTRACT: when n has a non-empty namespace, the caller must first +// run its protocol-specific namespace-list gate (REST: getUserNamespaces; +// MCP: checkNamespaceAccess). RBACTuplesForNode does NOT encode that gate +// — it's per-user state the helper doesn't see. The returned decision +// applies AFTER the namespace gate has passed. +// +// Decision summary: +// +// - Namespaced, n.Kind == KindSecret → CheckTuples with the per-namespace +// secrets tuple. Secret nodes need a per-user secrets SAR because the +// cache SA may have cluster-wide secrets RBAC the calling user doesn't. +// - Namespaced, other kinds → Allow. +// - Cluster-scoped, tracked pseudo-kind, variant matches → CheckTuples +// with the per-variant cluster-scope tuple. +// - Cluster-scoped, tracked pseudo-kind, disc filter ate the row → Allow. +// - Cluster-scoped, tracked pseudo-kind, unmapped variant → Deny. +// - Cluster-scoped, NOT a tracked pseudo-kind → ConsultClassifyKindScope. +func RBACTuplesForNode(n *Node, disc PseudoKindDiscoveryLookup) (decision NodeRBACDecision, tuples []SARTuple) { + if n == nil { + // Defensive: nil node can't be authorized. + return NodeRBACDeny, nil + } + ns := nodeNamespaceFromData(n) + if ns != "" { + if n.Kind == KindSecret { + return NodeRBACCheckTuples, []SARTuple{{Group: "", Resource: "secrets", Namespace: ns}} + } + return NodeRBACAllow, nil + } + + // Cluster-scoped. Try the pseudo-kind table first. + group := nodeAPIGroupFromData(n) + pseudoTuples, tracked, fallthroughAllow := RBACTuplesForKind(string(n.Kind), group, disc) + if !tracked { + return NodeRBACConsultClassifyKindScope, nil + } + if len(pseudoTuples) > 0 { + return NodeRBACCheckTuples, pseudoTuples + } + if fallthroughAllow { + return NodeRBACAllow, nil + } + return NodeRBACDeny, nil +} + diff --git a/pkg/topology/neighborhood.go b/pkg/topology/neighborhood.go new file mode 100644 index 000000000..f7339a3ff --- /dev/null +++ b/pkg/topology/neighborhood.go @@ -0,0 +1,565 @@ +package topology + +import ( + "strings" +) + +// Profile controls how broadly a neighborhood expands. The public surface is +// intentionally small for agent ergonomics: auto picks a bounded edge set from +// the root kind, while all is the explicit escape hatch. +type Profile string + +const ( + // ProfileAll walks every edge type. + ProfileAll Profile = "all" + // ProfileAuto picks an edge-type set based on the root kind. See + // edgeTypesForAuto for the per-kind mapping. + ProfileAuto Profile = "auto" +) + +// ResolveProfile normalizes a user-supplied profile string to a Profile +// constant. Empty, whitespace, or unrecognized values fall back to +// ProfileAuto. Callers wanting the broadest expansion must pass all +// explicitly; unknown strings never silently broaden traversal. +func ResolveProfile(s string) Profile { + switch strings.ToLower(strings.TrimSpace(s)) { + case "", string(ProfileAuto): + return ProfileAuto + case string(ProfileAll): + return ProfileAll + default: + return ProfileAuto + } +} + +// NeighborhoodOptions configures a BuildNeighborhood call. Zero values are +// replaced with sensible defaults: Profile=auto, Hops=1, MaxNodes=25. +type NeighborhoodOptions struct { + Profile Profile + Hops int + MaxNodes int + + // Allow gates each candidate node during BFS expansion. nil means + // allow all. When non-nil, BFS skips nodes for which Allow returns + // false — they don't consume the MaxNodes budget, can't appear as + // path-fragments between two allowed nodes, and bump + // Subgraph.RBACDenied so callers can report omissions to the user. + // The root node is also checked so callers can hide roots that pass + // coarse preflight checks but fail per-kind gates such as Secrets. + // + // This is the security boundary: forbidden nodes must not influence + // the visible graph shape. Post-hoc filtering would let an + // unauthorized node act as a bridge in the BFS frontier (or consume + // truncation budget) before being dropped, leaking its existence + // indirectly. + Allow func(*Node) bool +} + +// Subgraph is the BFS-expanded neighborhood of a root resource. Nodes are +// the existing topology Node shape (already summary-minified — id / kind / +// name / status / small data map). Edges are filtered by the profile and +// only included when both endpoints are present in Nodes. +type Subgraph struct { + Root ResourceRef `json:"root"` + Nodes []Node `json:"nodes"` + Edges []Edge `json:"edges"` + Truncated bool `json:"truncated,omitempty"` + + // RBACDenied counts nodes skipped during BFS because Allow returned + // false. Reported as a single aggregated omission by the caller; we + // intentionally do NOT track which specific nodes were denied, since + // surfacing those names would re-introduce the existence-leak the + // Allow gate exists to prevent. + RBACDenied int `json:"-"` + + // AmbiguousRoot is set when the root lookup matched more than one API + // group and the caller omitted ResourceRef.Group. Callers should ask the + // user/agent to provide group instead of returning an arbitrary first match. + AmbiguousRoot bool `json:"-"` +} + +// neighborhoodMaxHops is the hard ceiling on BFS depth. Two hops is enough +// to reach grandparents (Pod → ReplicaSet → Deployment) without exploding +// into the whole namespace. +const neighborhoodMaxHops = 2 + +// neighborhoodDefaultHops is the default BFS depth when Hops is unset. +const neighborhoodDefaultHops = 1 + +// neighborhoodDefaultMaxNodes is the default node-budget when MaxNodes is +// unset. 25 fits roughly a workload + its owners + selecting services + +// attached policies without spilling into cluster-level dependencies. +const neighborhoodDefaultMaxNodes = 25 + +// BuildNeighborhood returns the BFS expansion of root in t, filtered by +// opts.Profile. Returns a non-nil Subgraph even when the root is missing +// from the topology — callers can check len(s.Nodes) == 0 for that case. +// +// Thin shim over BuildNeighborhoodWithIndex(nil, nil) — every per-step neighbor +// enumeration falls back to a linear scan over t.Edges, and root resolution +// uses the lowercase-kind ID heuristic without CRD plural lookup. Hot callers +// that already hold a RelationshipsIndex + DynamicProvider (REST + MCP +// handlers) should call BuildNeighborhoodWithIndex directly to skip the index +// rebuild and to enable group-aware root ID construction for CRDs. +func BuildNeighborhood(t *Topology, root ResourceRef, opts NeighborhoodOptions) *Subgraph { + return BuildNeighborhoodWithIndex(t, root, opts, nil, nil) +} + +// BuildNeighborhoodWithIndex is the indexed variant of BuildNeighborhood. +// When idx is non-nil, per-node edge lookups go through idx.EdgesFor in O(1) +// (well, O(degree)) instead of pre-computing an adjacency map from t.Edges. +// When idx is nil, behavior matches BuildNeighborhood exactly: the BFS pays +// a single O(E) adjacency build up front, which is fine for one-shot library +// callers and tests. +// +// dp threads the topology's DynamicProvider into root ID construction so +// CRD plurals resolve correctly (e.g. "certificaterequests" → "certificaterequest"). +// Without dp, buildNodeID's static kindMap covers the built-in kinds only — +// fine for tests but insufficient for ambiguous CRDs (CAPI vs CNPG Cluster). +// When root.Group is non-empty it's also used by findNodeByRef to disambiguate +// kind-collisions across API groups. +// +// Hot callers — REST handleAINeighborhood, MCP handleGetNeighborhood — fetch +// idx from Memoizer.GetIndex so the topology-wide inverted index is shared +// with GetRelationshipsWithIndex / SynthesizeManagedBy on the same request, +// and pass the same dp they used to build the topology. +func BuildNeighborhoodWithIndex(t *Topology, root ResourceRef, opts NeighborhoodOptions, idx *RelationshipsIndex, dp DynamicProvider) *Subgraph { + hops := opts.Hops + if hops <= 0 { + hops = neighborhoodDefaultHops + } + if hops > neighborhoodMaxHops { + hops = neighborhoodMaxHops + } + maxNodes := opts.MaxNodes + if maxNodes <= 0 { + maxNodes = neighborhoodDefaultMaxNodes + } + + sub := &Subgraph{ + Root: root, + Nodes: []Node{}, + Edges: []Edge{}, + } + if t == nil { + return sub + } + + nodeByID := make(map[string]*Node, len(t.Nodes)) + for i := range t.Nodes { + nodeByID[t.Nodes[i].ID] = &t.Nodes[i] + } + + // Use the same dp the topology was built with so CRD plurals resolve + // (e.g. "certificaterequests" → "certificaterequest"). Without it, + // buildNodeID's static map can't construct the right ID for arbitrary CRDs. + rootID := buildNodeID(root.Kind, root.Namespace, root.Name, dp) + rootNode, ok := nodeByID[rootID] + // When root.Group is set, the rootID match alone isn't enough: two CRDs + // sharing the same lowercase plural collide on the ID (e.g. CAPI + // cluster.x-k8s.io/Cluster vs CNPG postgresql.cnpg.io/Cluster — whichever + // was inserted last wins in nodeByID). Verify the candidate's apiGroup + // matches; otherwise fall through to findNodeByRef which does the + // group-aware tuple match. + if ok && root.Group != "" && nodeAPIGroupFromData(rootNode) != root.Group { + ok = false + } + if !ok { + // Fallback: try matching by (kind, namespace, name [+ group]) tuple. + // Mostly for CRDs whose topology node ID uses a different prefix than + // the lowercase kind (e.g. "knativeservice/"). When root.Group is set, + // findNodeByRef also disambiguates kind-collisions across API groups + // (CAPI cluster.x-k8s.io/Cluster vs Fleet cluster.fleet.io/Cluster). + matched, ambiguous := findNodeByRef(t.Nodes, root) + if ambiguous { + sub.AmbiguousRoot = true + return sub + } + if matched == nil { + return sub + } + rootNode = matched + rootID = rootNode.ID + } + + // Apply the Allow gate to the root itself. Callers' upfront RBAC checks + // (REST: kind+namespace, MCP: kind+namespace) authorize the kind class but + // don't catch per-kind tightening inside an allowed namespace — a Secret + // root in a namespace the user can list still needs `get secrets` for + // THAT namespace. Without this gate the root Secret leaks (name+status+ + // data preview) even though the matching neighbor-Secret would be hidden. + // + // Empty subgraph + RBACDenied=1 lets callers translate to 404 (same as + // "root not in topology") so the response doesn't act as an existence + // oracle: a user with namespace-list access but no `get secrets` can't + // distinguish "Secret doesn't exist" from "Secret exists, you can't read". + if opts.Allow != nil && !opts.Allow(rootNode) { + sub.RBACDenied = 1 + return sub + } + + allowedEdges := edgeTypesForProfile(opts.Profile, rootNode.Kind) + + // neighbors yields every edge touching id that passes the profile filter, + // preferring the precomputed index when supplied. The undirected walk + // matters: a Pod is the target of Service→Pod (exposes), but the agent + // wants to find the Service from the Pod just as much as vice versa. + // + // Without idx we walk t.Edges once per BFS step. That's O(E) per step + // instead of O(degree), but the index-free path is reserved for tests + // and library callers without a Memoizer — hot REST/MCP traffic always + // supplies idx. + neighbors := func(id string) []Edge { + if idx != nil { + incoming, outgoing := idx.EdgesFor(id) + // Filter both sides by allowed edge types. Concatenate into a + // fresh slice so the caller can't accidentally mutate the + // index's internal storage. + out := make([]Edge, 0, len(incoming)+len(outgoing)) + for _, e := range outgoing { + if allowedEdges[e.Type] { + out = append(out, e) + } + } + for _, e := range incoming { + if allowedEdges[e.Type] { + out = append(out, e) + } + } + return out + } + var out []Edge + for _, e := range t.Edges { + if !allowedEdges[e.Type] { + continue + } + if e.Source == id || e.Target == id { + out = append(out, e) + } + } + return out + } + + // BFS by hop level. visited[id] = hop at which the node entered the + // frontier, so we can stop when we'd exceed hops. RBAC-denied nodes + // are skipped here (not post-filtered) so they neither consume budget + // nor act as path-fragments to allowed nodes downstream. + included := map[string]bool{rootID: true} + order := []string{rootID} + frontier := []string{rootID} + truncated := false + rbacDenied := 0 + deniedIDs := make(map[string]bool) // dedupe — same denied node may surface via multiple edges + + for hop := 0; hop < hops; hop++ { + var next []string + for _, id := range frontier { + for _, e := range neighbors(id) { + other := e.Source + if other == id { + other = e.Target + } + if included[other] { + continue + } + if deniedIDs[other] { + // Same denied node may surface via multiple edges; skip + // re-evaluating Allow (SAR cache hit, but still wasted work). + continue + } + candidate, exists := nodeByID[other] + if !exists { + // Edge dangles off a node that isn't in the topology. + continue + } + if opts.Allow != nil && !opts.Allow(candidate) { + deniedIDs[other] = true + rbacDenied++ + continue + } + if len(included) >= maxNodes { + truncated = true + continue + } + included[other] = true + order = append(order, other) + next = append(next, other) + } + if truncated { + break + } + } + if truncated || len(next) == 0 { + break + } + frontier = next + } + + sub.Truncated = truncated + sub.RBACDenied = rbacDenied + + // Materialize nodes in BFS order so the root is always first and the + // rest follow predictable expansion order. + sub.Nodes = make([]Node, 0, len(order)) + for _, id := range order { + if n, ok := nodeByID[id]; ok { + sub.Nodes = append(sub.Nodes, *n) + } + } + + // Edges: include only edges whose type is allowed AND both endpoints + // are in the included set. Walking t.Edges once here keeps the output + // edge order stable regardless of whether we used the index or not — + // tests and clients depend on that ordering. + for _, e := range t.Edges { + if !allowedEdges[e.Type] { + continue + } + if !included[e.Source] || !included[e.Target] { + continue + } + sub.Edges = append(sub.Edges, e) + } + + return sub +} + +// edgeTypesForProfile returns the set of edge types a profile traverses. +// rootKind is used for ProfileAuto only. Unknown profile values normalize to +// auto rather than all, so typos do not silently broaden traversal. +func edgeTypesForProfile(p Profile, rootKind NodeKind) map[EdgeType]bool { + switch p { + case ProfileAll: + return allEdgeTypes() + case ProfileAuto, "": + // Empty falls through here so direct lib callers that leave + // Profile unset get the documented default (auto), matching what + // the REST and MCP handlers already produce. + return edgeTypesForAuto(rootKind) + default: + // Unknown profile string (typo, future profile name from an old + // client): pick auto rather than the broadest possible expansion. + return edgeTypesForAuto(rootKind) + } +} + +func managementEdgeTypes() map[EdgeType]bool { + return map[EdgeType]bool{EdgeManages: true} +} + +func networkingEdgeTypes() map[EdgeType]bool { + return map[EdgeType]bool{EdgeRoutesTo: true, EdgeExposes: true} +} + +func policyEdgeTypes() map[EdgeType]bool { + return map[EdgeType]bool{EdgeProtects: true} +} + +// allEdgeTypes returns the universal set. Kept centralized so adding an +// EdgeType updates ProfileAll automatically. +func allEdgeTypes() map[EdgeType]bool { + return map[EdgeType]bool{ + EdgeManages: true, + EdgeRoutesTo: true, + EdgeExposes: true, + EdgeUses: true, + EdgeProtects: true, + EdgeConfigures: true, + } +} + +// edgeTypesForAuto picks profile edge-types based on the root's kind. The +// goal is "the agent asked about a workload — show the management chain +// plus the network and policy attachments, not the whole graph." +func edgeTypesForAuto(rootKind NodeKind) map[EdgeType]bool { + switch rootKind { + // Workloads / pods: management chain + network exposure + protection. + case KindPod, KindPodGroup, KindDeployment, KindStatefulSet, KindDaemonSet, + KindReplicaSet, KindRollout, KindJob, KindCronJob, + KindKnativeService, KindKnativeRevision, KindKnativeConfiguration: + return map[EdgeType]bool{ + EdgeManages: true, + EdgeRoutesTo: true, + EdgeExposes: true, + EdgeProtects: true, + } + // GitOps controllers: just the management chain (what they own). + case KindApplication, KindKustomization, KindHelmRelease, KindGitRepository: + return managementEdgeTypes() + // Network-shaped resources: routing topology. + case KindService, KindIngress, KindGateway, KindGatewayClass, + KindHTTPRoute, KindGRPCRoute, KindTCPRoute, KindTLSRoute, + KindVirtualService, KindIstioGateway, KindHTTPProxy, + KindIngressRoute, KindIngressRouteTCP, KindIngressRouteUDP: + return networkingEdgeTypes() + // Policies / protectors: who they attach to. + case KindNetworkPolicy, KindCiliumNetworkPolicy, + KindCiliumClusterwideNetworkPolicy, KindClusterNetworkPolicy, + KindPDB, KindMachineHealthCheck, + KindPeerAuthentication, KindAuthorizationPolicy: + return policyEdgeTypes() + // Nodes / node pools: hosted workloads via management chain. + case KindNode, KindNodePool, KindNodeClaim, KindNodeClass: + return managementEdgeTypes() + default: + return allEdgeTypes() + } +} + +// findNodeByRef looks up a node by (kind, namespace, name [+ group]). Used as +// a fallback when buildNodeID's lowercase-kind heuristic produces an ID that +// doesn't match a CRD-prefixed node ID (e.g. "knativeservice/"). +// +// When ref.Group is non-empty, the candidate node's apiGroup (derived from +// Data["apiVersion"] via apiVersionGroupFromData) must match — this is how +// callers disambiguate kind-collisions across API groups, e.g. asking for the +// CAPI cluster.x-k8s.io/Cluster vs the KubeFleet cluster.fleet.io/Cluster +// when both share the same kind+namespace+name. When ref.Group is empty, +// multiple matching API groups are reported as ambiguous instead of choosing +// an arbitrary first match; callers should ask for an explicit group. +// +// Pseudo-kinds: some CRDs are stored in the topology under a synthesized kind +// distinct from their API kind (KnativeService for serving.knative.dev/Service, +// CAPICluster for cluster.x-k8s.io/Cluster, …). We map (ref.Kind, ref.Group) +// through pseudoKindFor BEFORE the direct kind comparison so a caller asking +// for kind=Service&group=serving.knative.dev finds the KnativeService topology +// node. Without this, the comparison Node.Kind="KnativeService" vs +// ref.Kind="Service" never matches and the root lookup silently fails. +func findNodeByRef(nodes []Node, ref ResourceRef) (*Node, bool) { + // Resolve the caller-facing (kind, group) tuple to the kind the topology + // builder uses on Node.Kind. Falls back to ref.Kind unchanged when there's + // no pseudo-kind mapping for this (kind, group). + wantKind := pseudoKindFor(ref.Kind, ref.Group) + var match *Node + matchGroup := "" + for i := range nodes { + n := &nodes[i] + // Compare against the resolved pseudo-kind first; if that misses fall + // back to the original ref.Kind so callers that don't supply a group, + // or kinds that aren't pseudo-mapped, still work when unambiguous. + if !strings.EqualFold(string(n.Kind), wantKind) && !strings.EqualFold(string(n.Kind), ref.Kind) { + continue + } + if n.Name != ref.Name { + continue + } + ns := nodeNamespaceFromData(n) + if ns != ref.Namespace { + continue + } + if ref.Group != "" { + if nodeAPIGroupFromData(n) != ref.Group { + continue + } + } + group := nodeAPIGroupFromData(n) + if match != nil && group != matchGroup { + return nil, true + } + match = n + matchGroup = group + } + return match, false +} + +// pseudoKindFor maps an (API kind, API group) to the synthesized topology +// kind the builder assigns when both differ. For (kind, group) pairs that +// aren't pseudo-mapped (or when group is empty), returns kind unchanged. +// +// Examples: +// +// ("Service", "serving.knative.dev") → "KnativeService" +// ("Configuration", "serving.knative.dev") → "KnativeConfiguration" +// ("Cluster", "cluster.x-k8s.io") → "CAPICluster" +// ("Cluster", "postgresql.cnpg.io") → "Cluster" (no pseudo-mapping) +// +// Keep this table in sync with the kinds the topology builder synthesizes — +// see the NodeKind constants prefixed Knative*/CAPI* in types.go. Missing an +// entry here means the caller-supplied API kind never matches the topology +// node, so the neighborhood root lookup silently returns "not found." +func pseudoKindFor(kind, group string) string { + if group == "" { + return kind + } + // Accept both singular and plural lowercase forms because REST hits + // this with URL-path kinds (plural — "services") after normalizeKind + // lowercases them, while MCP arrives in Pascal-case singular + // ("Service"). Without case + plurality insensitivity, REST root + // lookups silently 404 for any cross-group CRD whose plural collides + // with a built-in (e.g. serving.knative.dev/Service vs core/v1 + // Service, cluster.x-k8s.io/Cluster vs other CRDs). + switch group { + case "serving.knative.dev": + switch strings.ToLower(kind) { + case "service", "services": + return string(KindKnativeService) + case "configuration", "configurations": + return string(KindKnativeConfiguration) + case "revision", "revisions": + return string(KindKnativeRevision) + case "route", "routes": + return string(KindKnativeRoute) + } + case "cluster.x-k8s.io": + switch strings.ToLower(kind) { + case "cluster", "clusters": + return string(KindCAPICluster) + } + case "networking.istio.io": + // Istio Gateway lives under IstioGateway to avoid collision with + // gateway.networking.k8s.io/Gateway. + switch strings.ToLower(kind) { + case "gateway", "gateways": + return string(KindIstioGateway) + } + } + return kind +} + +// nodeNamespaceFromData reads the namespace from a Node's Data map. Mirrors +// the convention used by the builder ("namespace" → string). +func nodeNamespaceFromData(n *Node) string { + if n.Data == nil { + return "" + } + if ns, ok := n.Data["namespace"].(string); ok { + return ns + } + return "" +} + +// APIVersionGroup extracts the API group from a Kubernetes apiVersion +// string by splitting on the first '/'. The core group has no slash and +// returns "" — matches K8s convention. +// +// "v1" → "" +// "apps/v1" → "apps" +// "serving.knative.dev/v1" → "serving.knative.dev" +// "" → "" +// +// Exported so REST + MCP handlers share one implementation; previously +// internal/server, internal/mcp, and this package each carried their own +// copy of the same split-on-first-slash logic. +func APIVersionGroup(apiVersion string) string { + for i := 0; i < len(apiVersion); i++ { + if apiVersion[i] == '/' { + return apiVersion[:i] + } + } + return "" +} + +// nodeAPIGroupFromData extracts the API group from a Node's Data map. +// Returns "" when no apiVersion is set on the node. Thin wrapper around +// APIVersionGroup so callers walking topology nodes don't have to read +// the Data map themselves. +func nodeAPIGroupFromData(n *Node) string { + if n.Data == nil { + return "" + } + v, ok := n.Data["apiVersion"].(string) + if !ok { + return "" + } + return APIVersionGroup(v) +} diff --git a/pkg/topology/neighborhood_test.go b/pkg/topology/neighborhood_test.go new file mode 100644 index 000000000..2b38fa728 --- /dev/null +++ b/pkg/topology/neighborhood_test.go @@ -0,0 +1,928 @@ +package topology + +import ( + "sort" + "testing" +) + +// makeNode is a tiny helper for the BFS tests. It assembles a Node with a +// matching ID and namespace data so BuildNeighborhood's lookups work. +func makeNode(kind NodeKind, ns, name string) Node { + idKind := normalizeIDKind(kind) + id := idKind + "/" + ns + "/" + name + return Node{ + ID: id, + Kind: kind, + Name: name, + Status: StatusHealthy, + Data: map[string]any{"namespace": ns}, + } +} + +func normalizeIDKind(kind NodeKind) string { + // Mirror buildNodeID's lowercase-kind convention. For the kinds these + // tests use, the lowercase form is the ID prefix. + switch kind { + case KindPod: + return "pod" + case KindService: + return "service" + case KindDeployment: + return "deployment" + case KindReplicaSet: + return "replicaset" + case KindIngress: + return "ingress" + case KindHTTPRoute: + return "httproute" + case KindPDB: + return "poddisruptionbudget" + case KindNetworkPolicy: + return "networkpolicy" + case KindHPA: + return "horizontalpodautoscaler" + case KindConfigMap: + return "configmap" + case KindSecret: + return "secret" + case KindApplication: + return "application" + case KindNode: + return "node" + } + return string(kind) +} + +func makeEdge(typ EdgeType, source, target string) Edge { + return Edge{ + ID: source + "-" + string(typ) + "-" + target, + Source: source, + Target: target, + Type: typ, + } +} + +// podNeighborhood builds a representative topology for a Pod with surrounding +// owner chain, exposing service + ingress, attached PDB and NetworkPolicy. +// +// Ingress → Service → Pod +// Deployment → ReplicaSet → Pod +// PDB → Pod, NetworkPolicy → Pod (EdgeProtects) +// Pod uses ConfigMap (EdgeConfigures) +func podNeighborhood() *Topology { + pod := makeNode(KindPod, "prod", "cart-xyz") + rs := makeNode(KindReplicaSet, "prod", "cart-rs") + dep := makeNode(KindDeployment, "prod", "cart") + svc := makeNode(KindService, "prod", "cart") + ing := makeNode(KindIngress, "prod", "cart") + pdb := makeNode(KindPDB, "prod", "cart-pdb") + np := makeNode(KindNetworkPolicy, "prod", "cart-allow") + cm := makeNode(KindConfigMap, "prod", "cart-config") + + return &Topology{ + Nodes: []Node{pod, rs, dep, svc, ing, pdb, np, cm}, + Edges: []Edge{ + makeEdge(EdgeManages, dep.ID, rs.ID), + makeEdge(EdgeManages, rs.ID, pod.ID), + makeEdge(EdgeExposes, svc.ID, pod.ID), + makeEdge(EdgeRoutesTo, ing.ID, svc.ID), + makeEdge(EdgeProtects, pdb.ID, pod.ID), + makeEdge(EdgeProtects, np.ID, pod.ID), + makeEdge(EdgeConfigures, cm.ID, pod.ID), + }, + } +} + +func nodeIDs(s *Subgraph) []string { + ids := make([]string, 0, len(s.Nodes)) + for _, n := range s.Nodes { + ids = append(ids, n.ID) + } + sort.Strings(ids) + return ids +} + +func edgeIDs(s *Subgraph) []string { + ids := make([]string, 0, len(s.Edges)) + for _, e := range s.Edges { + ids = append(ids, e.ID) + } + sort.Strings(ids) + return ids +} + +func TestBuildNeighborhood_AutoForPod(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 1, + }) + // Auto for Pod = management + networking + policy. ReplicaSet (manages), + // Service (exposes), PDB + NP (protects) all reachable in 1 hop. The + // ConfigMap is via EdgeConfigures and should NOT appear under auto. + got := nodeIDs(sub) + want := []string{ + "networkpolicy/prod/cart-allow", + "pod/prod/cart-xyz", + "poddisruptionbudget/prod/cart-pdb", + "replicaset/prod/cart-rs", + "service/prod/cart", + } + if !equalStrings(got, want) { + t.Errorf("auto pod 1-hop nodes = %v, want %v", got, want) + } + // EdgeConfigures should not be present even though ConfigMap is excluded. + for _, e := range sub.Edges { + if e.Type == EdgeConfigures { + t.Errorf("auto profile for Pod must not include EdgeConfigures: %+v", e) + } + } +} + +func TestBuildNeighborhood_AutoForApplication(t *testing.T) { + // Application → Deployment via EdgeManages. + app := makeNode(KindApplication, "argocd", "cart") + dep := makeNode(KindDeployment, "prod", "cart") + svc := makeNode(KindService, "prod", "cart") + topo := &Topology{ + Nodes: []Node{app, dep, svc}, + Edges: []Edge{ + makeEdge(EdgeManages, app.ID, dep.ID), + makeEdge(EdgeExposes, svc.ID, dep.ID), + }, + } + + sub := BuildNeighborhood(topo, ResourceRef{Kind: "Application", Namespace: "argocd", Name: "cart"}, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 1, + }) + got := nodeIDs(sub) + want := []string{"application/argocd/cart", "deployment/prod/cart"} + if !equalStrings(got, want) { + t.Errorf("auto Application 1-hop nodes = %v, want %v", got, want) + } +} + +func TestBuildNeighborhood_AutoForService(t *testing.T) { + topo := podNeighborhood() + sub := BuildNeighborhood(topo, ResourceRef{Kind: "Service", Namespace: "prod", Name: "cart"}, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 1, + }) + got := nodeIDs(sub) + // Service auto profile = networking. Should walk to Pod (it exposes) + // and Ingress (routes-to Service). Should NOT walk to ReplicaSet + // (manages) even though it's adjacent. + want := []string{ + "ingress/prod/cart", + "pod/prod/cart-xyz", + "service/prod/cart", + } + if !equalStrings(got, want) { + t.Errorf("auto Service 1-hop nodes = %v, want %v", got, want) + } +} + +func TestBuildNeighborhood_AutoForPDB(t *testing.T) { + topo := podNeighborhood() + sub := BuildNeighborhood(topo, ResourceRef{Kind: "PodDisruptionBudget", Namespace: "prod", Name: "cart-pdb"}, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 1, + }) + got := nodeIDs(sub) + want := []string{ + "pod/prod/cart-xyz", + "poddisruptionbudget/prod/cart-pdb", + } + if !equalStrings(got, want) { + t.Errorf("auto PDB 1-hop nodes = %v, want %v", got, want) + } +} + +func TestBuildNeighborhood_MaxNodesTriggersTruncation(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + // MaxNodes=2 — only the root and one neighbor will fit, even though many + // are reachable in 1 hop under ProfileAll. + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 2, + MaxNodes: 2, + }) + if !sub.Truncated { + t.Errorf("expected Truncated=true, got false") + } + if len(sub.Nodes) != 2 { + t.Errorf("expected exactly 2 nodes under MaxNodes=2, got %d (%v)", len(sub.Nodes), nodeIDs(sub)) + } + if sub.Nodes[0].ID != "pod/prod/cart-xyz" { + t.Errorf("expected root first, got %s", sub.Nodes[0].ID) + } +} + +func TestBuildNeighborhood_DefaultsApplied(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{}) + if len(sub.Nodes) == 0 { + t.Fatalf("expected non-empty neighborhood under default options") + } + // Default hops=1 — must include root and at least one neighbor; should + // NOT include the Deployment (which is 2 hops away). + if sub.Nodes[0].ID != "pod/prod/cart-xyz" { + t.Errorf("expected root first, got %s", sub.Nodes[0].ID) + } + for _, n := range sub.Nodes { + if n.Kind == KindDeployment { + t.Errorf("default hops=1 should not include Deployment (2 hops away)") + } + } +} + +func TestBuildNeighborhood_HopsClampedToMax(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + // Hops=99 should be clamped to neighborhoodMaxHops=2. Under auto for + // a Pod we reach the management chain plus adjacent networking/policy + // nodes, but still exclude config/configure edges. + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 99, + }) + got := nodeIDs(sub) + want := []string{ + "deployment/prod/cart", + "ingress/prod/cart", + "networkpolicy/prod/cart-allow", + "pod/prod/cart-xyz", + "poddisruptionbudget/prod/cart-pdb", + "replicaset/prod/cart-rs", + "service/prod/cart", + } + if !equalStrings(got, want) { + t.Errorf("hops clamp: got %v, want %v", got, want) + } +} + +func TestBuildNeighborhood_RootMissingReturnsEmpty(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "does-not-exist"} + + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{Profile: ProfileAuto}) + if sub == nil { + t.Fatal("expected non-nil subgraph for missing root") + } + if len(sub.Nodes) != 0 { + t.Errorf("expected empty nodes for missing root, got %v", nodeIDs(sub)) + } + if sub.Root.Name != "does-not-exist" { + t.Errorf("subgraph.Root should echo the requested root") + } +} + +// TestBuildNeighborhood_UnknownProfileDefaultsToAuto verifies that an +// unrecognized profile string (typo, future profile name from an older +// client) is normalized to ProfileAuto rather than silently expanding +// to ProfileAll. Empty profile gets the same treatment so direct lib +// callers that don't pre-default through a handler see the documented +// "Profile=auto" default. +func TestBuildNeighborhood_UnknownProfileDefaultsToAuto(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + // Reference: ProfileAuto for a Pod traverses management + networking + policy. + auto := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 2, + }) + autoNodeIDs := nodeIDs(auto) + + cases := []struct { + name string + profile Profile + }{ + {"empty profile", ""}, + {"unknown profile string", "banana"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: c.profile, + Hops: 2, + }) + if !equalStrings(nodeIDs(got), autoNodeIDs) { + t.Errorf("%q profile should match ProfileAuto:\n got: %v\n want: %v", c.profile, nodeIDs(got), autoNodeIDs) + } + }) + } + + // Sanity: ProfileAll is broader than auto for a Pod (auto excludes + // EdgeUses + EdgeConfigures). Confirms our "unknown→auto" guard + // actually narrows the expansion vs. the previous "unknown→all" + // behavior. + all := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 2, + }) + if len(nodeIDs(all)) <= len(autoNodeIDs) { + t.Errorf("expected ProfileAll to cover at least as many nodes as ProfileAuto for a Pod root; got all=%d auto=%d", len(nodeIDs(all)), len(autoNodeIDs)) + } +} + +// TestBuildNeighborhood_AllowSkipsForbidden verifies that nodes for which +// Allow returns false are skipped during BFS — they don't appear in the +// output AND don't consume the MaxNodes budget. This is the security +// boundary: forbidden nodes must not influence the visible graph. +func TestBuildNeighborhood_AllowSkipsForbidden(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + // Deny ReplicaSet — forbidden during BFS. + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 2, + Allow: func(n *Node) bool { + return n.Kind != KindReplicaSet + }, + }) + + for _, n := range sub.Nodes { + if n.Kind == KindReplicaSet { + t.Errorf("BFS surfaced a forbidden ReplicaSet node: %s", n.ID) + } + } + if sub.RBACDenied != 1 { + t.Errorf("expected RBACDenied=1 (the single ReplicaSet), got %d", sub.RBACDenied) + } +} + +// TestBuildNeighborhood_AllowPreventsPathFragments verifies the path-fragment +// guarantee: a forbidden node cannot serve as a bridge between two allowed +// nodes the user reaches via BFS. Without pre-filtering, BFS would traverse +// through the forbidden node and surface its downstream allowed neighbors +// (leaking that the forbidden node connects them). +// +// Topology: Pod → ReplicaSet → Deployment (management chain). With +// ReplicaSet forbidden, BFS from Pod must NOT reach Deployment. +func TestBuildNeighborhood_AllowPreventsPathFragments(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 2, + Allow: func(n *Node) bool { + return n.Kind != KindReplicaSet + }, + }) + + for _, n := range sub.Nodes { + if n.Kind == KindDeployment { + t.Errorf("BFS reached Deployment through a forbidden ReplicaSet — path-fragment leak: %s", n.ID) + } + } +} + +// TestBuildNeighborhood_AllowProtectsBudget verifies that forbidden nodes +// don't consume the MaxNodes truncation budget. Without pre-filtering, a +// run of forbidden nodes near the root could exhaust the budget and cause +// allowed nodes further out to be truncated — a side-channel leak. +func TestBuildNeighborhood_AllowProtectsBudget(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + // With ReplicaSet denied AND MaxNodes=2, BFS should NOT count the + // denied node toward the budget. We should still fit the root + + // another allowed node, instead of root + denied node (which would + // trip truncation with zero allowed neighbors). + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 2, + MaxNodes: 2, + Allow: func(n *Node) bool { + return n.Kind != KindReplicaSet + }, + }) + + // At least the root must be present. + if len(sub.Nodes) == 0 { + t.Fatal("expected non-empty subgraph; got zero nodes") + } + if sub.Nodes[0].Kind != "Pod" { + t.Errorf("expected Pod root first, got %s", sub.Nodes[0].Kind) + } + // Denied node must not appear. + for _, n := range sub.Nodes { + if n.Kind == KindReplicaSet { + t.Errorf("denied node consumed budget and appeared: %s", n.ID) + } + } +} + +func TestBuildNeighborhood_EdgesOnlyBetweenIncludedNodes(t *testing.T) { + topo := podNeighborhood() + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + + sub := BuildNeighborhood(topo, root, NeighborhoodOptions{ + Profile: ProfileAuto, + Hops: 1, + }) + gotEdges := edgeIDs(sub) + wantEdges := []string{ + "networkpolicy/prod/cart-allow-protects-pod/prod/cart-xyz", + "poddisruptionbudget/prod/cart-pdb-protects-pod/prod/cart-xyz", + "replicaset/prod/cart-rs-manages-pod/prod/cart-xyz", + "service/prod/cart-exposes-pod/prod/cart-xyz", + } + if !equalStrings(gotEdges, wantEdges) { + t.Errorf("auto 1-hop edges = %v, want %v", gotEdges, wantEdges) + } +} + +func TestBuildNeighborhood_NilTopologyReturnsEmpty(t *testing.T) { + root := ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"} + sub := BuildNeighborhood(nil, root, NeighborhoodOptions{}) + if sub == nil { + t.Fatal("expected non-nil subgraph") + } + if len(sub.Nodes) != 0 || len(sub.Edges) != 0 { + t.Errorf("expected empty subgraph for nil topology, got %+v", sub) + } +} + +// TestBuildNeighborhood_GroupCollisionRoot pins the group-aware root lookup. +// When two nodes share kind+namespace+name but come from different API +// groups (a real collision: CAPI cluster.x-k8s.io/Cluster vs the hypothetical +// KubeFleet cluster.fleet.io/Cluster — same kind, different groups), the +// caller must be able to disambiguate by passing ResourceRef.Group. +func TestBuildNeighborhood_GroupCollisionRoot(t *testing.T) { + // Two "Cluster" nodes in the same namespace, same name, different groups. + // Each has a distinct neighbor so we can verify which root was selected. + capi := Node{ + ID: "cluster.x-k8s.io/cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "cluster.x-k8s.io/v1beta1", + }, + } + capiMachine := makeNode("Machine", "fleet", "prod-md-0") + capiMachine.Data["apiVersion"] = "cluster.x-k8s.io/v1beta1" + + fleet := Node{ + ID: "cluster.fleet.io/cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "cluster.fleet.io/v1alpha1", + }, + } + fleetGroup := makeNode("ClusterGroup", "fleet", "prod-grp") + fleetGroup.Data["apiVersion"] = "cluster.fleet.io/v1alpha1" + + topo := &Topology{ + Nodes: []Node{capi, capiMachine, fleet, fleetGroup}, + Edges: []Edge{ + makeEdge(EdgeManages, capi.ID, capiMachine.ID), + makeEdge(EdgeManages, fleet.ID, fleetGroup.ID), + }, + } + + // Explicit group=cluster.x-k8s.io must pick the CAPI Cluster and surface + // its Machine neighbor, NOT the Fleet ClusterGroup. + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Cluster", Namespace: "fleet", Name: "prod", Group: "cluster.x-k8s.io"}, + NeighborhoodOptions{Profile: ProfileAll, Hops: 1}, + nil, nil, + ) + if sub.Nodes[0].ID != capi.ID { + t.Fatalf("expected CAPI root (id=%s), got %s", capi.ID, sub.Nodes[0].ID) + } + got := nodeIDs(sub) + want := []string{capiMachine.ID, capi.ID} + sort.Strings(want) + if !equalStrings(got, want) { + t.Errorf("group=cluster.x-k8s.io root neighborhood = %v, want %v", got, want) + } +} + +// TestBuildNeighborhood_GroupEmptyRootAmbiguous verifies that omitted group +// does not silently pick an arbitrary node when multiple API groups share the +// same kind+namespace+name. Callers must pass ResourceRef.Group to resolve the +// collision. +func TestBuildNeighborhood_GroupEmptyRootAmbiguous(t *testing.T) { + capi := Node{ + ID: "cluster.x-k8s.io/cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "cluster.x-k8s.io/v1beta1", + }, + } + fleet := Node{ + ID: "cluster.fleet.io/cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "cluster.fleet.io/v1alpha1", + }, + } + topo := &Topology{ + Nodes: []Node{capi, fleet}, + Edges: []Edge{}, + } + + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Cluster", Namespace: "fleet", Name: "prod", Group: ""}, + NeighborhoodOptions{Profile: ProfileAll, Hops: 1}, + nil, nil, + ) + if !sub.AmbiguousRoot { + t.Fatal("expected ambiguous root when group is omitted for colliding resources") + } + if len(sub.Nodes) != 0 || len(sub.Edges) != 0 { + t.Errorf("expected empty subgraph for ambiguous root, got nodes=%v edges=%v", nodeIDs(sub), edgeIDs(sub)) + } +} + +// TestBuildNeighborhood_AllowDeniesRoot verifies the root is gated by Allow, +// not just the BFS frontier. A Secret root with namespace access but no +// per-namespace `get secrets` SAR is the load-bearing case: callers' upfront +// RBAC check (kind+namespace) doesn't catch per-kind tightening inside a +// namespace, so the root Allow gate is the only place that denial surfaces. +// +// Empty subgraph + RBACDenied=1 lets handlers translate to 404 (matching the +// "root not in topology" path), preserving existence-hiding — a user without +// `get secrets` can't distinguish "Secret doesn't exist" from "Secret exists, +// you can't read it." +func TestBuildNeighborhood_AllowDeniesRoot(t *testing.T) { + pod := makeNode(KindPod, "prod", "cart-xyz") + secret := makeNode(KindSecret, "prod", "api-tls") + topo := &Topology{ + Nodes: []Node{pod, secret}, + Edges: []Edge{ + makeEdge(EdgeConfigures, secret.ID, pod.ID), + }, + } + + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Secret", Namespace: "prod", Name: "api-tls"}, + NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 1, + // Simulate "user has namespace access but no get-secrets" — the + // root Secret is rejected before BFS starts. + Allow: func(n *Node) bool { return n.Kind != KindSecret }, + }, + nil, nil, + ) + + if len(sub.Nodes) != 0 { + t.Errorf("expected empty subgraph when root Allow denies; got %v", nodeIDs(sub)) + } + if len(sub.Edges) != 0 { + t.Errorf("expected no edges when root denied; got %d", len(sub.Edges)) + } + if sub.RBACDenied != 1 { + t.Errorf("expected RBACDenied=1 for denied root, got %d", sub.RBACDenied) + } +} + +// TestBuildNeighborhood_AllowAllowsRoot is the positive counterpart: a Secret +// root passes the Allow gate when the caller's predicate accepts it, and the +// expansion proceeds normally. +func TestBuildNeighborhood_AllowAllowsRoot(t *testing.T) { + pod := makeNode(KindPod, "prod", "cart-xyz") + secret := makeNode(KindSecret, "prod", "api-tls") + topo := &Topology{ + Nodes: []Node{pod, secret}, + Edges: []Edge{ + makeEdge(EdgeConfigures, secret.ID, pod.ID), + }, + } + + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Secret", Namespace: "prod", Name: "api-tls"}, + NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 1, + Allow: func(n *Node) bool { return true }, + }, + nil, nil, + ) + if len(sub.Nodes) == 0 { + t.Fatalf("expected non-empty subgraph when Allow accepts root") + } + if sub.Nodes[0].ID != secret.ID { + t.Errorf("expected Secret root first, got %s", sub.Nodes[0].ID) + } + if sub.RBACDenied != 0 { + t.Errorf("expected RBACDenied=0 when nothing was denied, got %d", sub.RBACDenied) + } +} + +// TestBuildNeighborhood_GroupAwareRootIDMatch pins the group-aware check +// applied even when buildNodeID's direct ID match hits. The default +// buildNodeID heuristic (lowercase kind / namespace / name) collides when +// two CRDs share a plural — without the apiVersion-group validation, the +// caller-supplied group is silently ignored on the direct-match path. +// +// Setup: a CNPG Cluster occupies the lowercase-kind ID "cluster/fleet/prod" +// (what buildNodeID produces for kind=Cluster). The CAPI Cluster lives under +// a distinct ID to avoid the collision. A caller asking for +// group=cluster.x-k8s.io must NOT silently get the CNPG node back just +// because the direct ID lookup hit. +func TestBuildNeighborhood_GroupAwareRootIDMatch(t *testing.T) { + // CNPG Cluster: occupies the lowercase-kind ID buildNodeID produces. + cnpg := Node{ + ID: "cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "postgresql.cnpg.io/v1", + }, + } + // CAPI Cluster: same kind+namespace+name but distinct ID (group-prefixed + // to mirror real-world collision-avoidance). The caller asks for it by + // group; the direct-ID path must reject CNPG, then findNodeByRef must + // surface the CAPI variant. + capi := Node{ + ID: "cluster.x-k8s.io/cluster/fleet/prod", + Kind: "Cluster", + Name: "prod", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "fleet", + "apiVersion": "cluster.x-k8s.io/v1beta1", + }, + } + topo := &Topology{Nodes: []Node{cnpg, capi}} + + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Cluster", Namespace: "fleet", Name: "prod", Group: "cluster.x-k8s.io"}, + NeighborhoodOptions{Profile: ProfileAll, Hops: 1}, + nil, nil, + ) + if len(sub.Nodes) == 0 { + t.Fatal("expected non-empty neighborhood for group-disambiguated root") + } + got := nodeAPIGroupFromData(&sub.Nodes[0]) + if got != "cluster.x-k8s.io" { + t.Errorf("group-aware lookup picked apiGroup=%q (id=%s), expected cluster.x-k8s.io (CAPI)", got, sub.Nodes[0].ID) + } + + // Sanity counterpart: caller asking for the CNPG group must get CNPG via + // the direct-ID path with the apiGroup matching. + sub2 := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Cluster", Namespace: "fleet", Name: "prod", Group: "postgresql.cnpg.io"}, + NeighborhoodOptions{Profile: ProfileAll, Hops: 1}, + nil, nil, + ) + if len(sub2.Nodes) == 0 { + t.Fatal("expected non-empty neighborhood for CNPG-disambiguated root") + } + got2 := nodeAPIGroupFromData(&sub2.Nodes[0]) + if got2 != "postgresql.cnpg.io" { + t.Errorf("group=postgresql.cnpg.io picked apiGroup=%q, expected postgresql.cnpg.io", got2) + } +} + +// TestBuildNeighborhood_PseudoKindRootLookup pins the pseudo-kind mapping in +// findNodeByRef. KNative serving.knative.dev/Service is stored in the topology +// under NodeKind="KnativeService" (a synthesized label, not a real K8s kind). +// A caller asking for kind=Service&group=serving.knative.dev must find the +// KnativeService node — without the (kind, group) → pseudo-kind translation, +// the direct kind comparison ("Service" vs "KnativeService") never matches. +// +// Also pins the disambiguation: a core Service of the same name in the same +// namespace must NOT be returned when group=serving.knative.dev is supplied. +func TestBuildNeighborhood_PseudoKindRootLookup(t *testing.T) { + knsvc := Node{ + ID: "knativeservice/prod/api", + Kind: KindKnativeService, + Name: "api", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "prod", + "apiVersion": "serving.knative.dev/v1", + }, + } + coreSvc := Node{ + ID: "service/prod/api", + Kind: KindService, + Name: "api", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "prod", + "apiVersion": "v1", + }, + } + pod := makeNode(KindPod, "prod", "api-xyz") + topo := &Topology{ + Nodes: []Node{knsvc, coreSvc, pod}, + Edges: []Edge{ + makeEdge(EdgeExposes, knsvc.ID, pod.ID), + }, + } + + // Both Pascal-case ("Service" — MCP path) and lowercase ("service" — REST + // path after normalizeKind) must resolve to the KnativeService pseudo-kind. + // REST's lowercasing previously slipped past pseudoKindFor's Pascal-only + // switch and returned 404 for legitimate cross-group root lookups. + // REST normalizes URL paths to lowercase plural ("services"); MCP arrives + // in Pascal singular ("Service"). Both must resolve. + for _, kind := range []string{"Service", "service", "services"} { + t.Run(kind, func(t *testing.T) { + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: kind, Namespace: "prod", Name: "api", Group: "serving.knative.dev"}, + NeighborhoodOptions{Profile: ProfileAuto, Hops: 1}, + nil, nil, + ) + if len(sub.Nodes) == 0 { + t.Fatalf("expected non-empty neighborhood for KnativeService pseudo-kind root with kind=%q", kind) + } + if sub.Nodes[0].ID != knsvc.ID { + t.Errorf("pseudo-kind root lookup with kind=%q returned %s, expected KnativeService %s", kind, sub.Nodes[0].ID, knsvc.ID) + } + if sub.Nodes[0].Kind != KindKnativeService { + t.Errorf("expected NodeKind=KnativeService for kind=%q, got %s", kind, sub.Nodes[0].Kind) + } + }) + } +} + +// TestBuildNeighborhood_SecretFilteredByAllow verifies the secret-leak fix at +// the topology layer: a Secret node in the BFS frontier is dropped when the +// caller's Allow predicate rejects it, matching the per-kind RBAC gate the +// REST and MCP handlers apply for Secret reads. The Secret must not appear in +// the output and the RBACDenied counter must reflect the skip. +func TestBuildNeighborhood_SecretFilteredByAllow(t *testing.T) { + pod := makeNode(KindPod, "prod", "cart-xyz") + secret := makeNode(KindSecret, "prod", "db-creds") + topo := &Topology{ + Nodes: []Node{pod, secret}, + Edges: []Edge{ + makeEdge(EdgeConfigures, secret.ID, pod.ID), + }, + } + + // Caller drops the Secret — simulates the REST/MCP per-kind SAR refusing + // the user even though they have namespace access. + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "Pod", Namespace: "prod", Name: "cart-xyz"}, + NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 1, + Allow: func(n *Node) bool { return n.Kind != KindSecret }, + }, + nil, nil, + ) + for _, n := range sub.Nodes { + if n.Kind == KindSecret { + t.Errorf("Secret leaked through BFS despite Allow=false: %s", n.ID) + } + } + if sub.RBACDenied != 1 { + t.Errorf("expected RBACDenied=1 for the denied Secret, got %d", sub.RBACDenied) + } +} + +// TestBuildNeighborhood_NodeClassPerVariantSAR pins the per-variant +// authorization at the topology layer: a single NodeKind (NodeClass) has +// three discriminated variants (EC2NodeClass / AKSNodeClass / GCPNodeClass) +// keyed by apiVersion-group. The caller's Allow predicate must be able to +// pass one variant while denying another so a user with EC2-only RBAC +// doesn't see AKS or GCP NodeClass nodes on a multi-provider cluster. +// +// Setup: a NodePool root with two child NodeClass nodes — one EC2 +// (karpenter.k8s.aws) and one AKS (karpenter.azure.com). Allow returns +// true for EC2 group, false for AKS group. The BFS expansion must surface +// EC2, drop AKS, and bump RBACDenied for the dropped AKS node. +func TestBuildNeighborhood_NodeClassPerVariantSAR(t *testing.T) { + np := Node{ + ID: "nodepool/_/karpenter-default", + Kind: KindNodePool, + Name: "karpenter-default", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "", + "apiVersion": "karpenter.sh/v1", + }, + } + ec2 := Node{ + ID: "nodeclass/_/ec2-default", + Kind: KindNodeClass, + Name: "ec2-default", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "", + "apiVersion": "karpenter.k8s.aws/v1", + }, + } + aks := Node{ + ID: "nodeclass/_/aks-default", + Kind: KindNodeClass, + Name: "aks-default", + Status: StatusHealthy, + Data: map[string]any{ + "namespace": "", + "apiVersion": "karpenter.azure.com/v1beta1", + }, + } + topo := &Topology{ + Nodes: []Node{np, ec2, aks}, + Edges: []Edge{ + makeEdge(EdgeConfigures, np.ID, ec2.ID), + makeEdge(EdgeConfigures, np.ID, aks.ID), + }, + } + + // Allow returns true for NodePool + EC2 NodeClass, false for AKS + // NodeClass — keyed on the node's apiVersion group, which is exactly + // the discriminator the handler-side fix relies on. + sub := BuildNeighborhoodWithIndex(topo, + ResourceRef{Kind: "NodePool", Namespace: "", Name: "karpenter-default", Group: "karpenter.sh"}, + NeighborhoodOptions{ + Profile: ProfileAll, + Hops: 1, + Allow: func(n *Node) bool { + if n.Kind != KindNodeClass { + return true + } + return nodeAPIGroupFromData(n) == "karpenter.k8s.aws" + }, + }, + nil, nil, + ) + + sawEC2 := false + sawAKS := false + for _, n := range sub.Nodes { + switch n.ID { + case ec2.ID: + sawEC2 = true + case aks.ID: + sawAKS = true + } + } + if !sawEC2 { + t.Errorf("EC2 NodeClass denied — per-variant Allow with karpenter.k8s.aws=true must surface it (got nodes: %v)", nodeIDs(sub)) + } + if sawAKS { + t.Errorf("AKS NodeClass leaked despite Allow=false for karpenter.azure.com — per-variant gate must drop it") + } + if sub.RBACDenied != 1 { + t.Errorf("expected RBACDenied=1 for the dropped AKS NodeClass, got %d", sub.RBACDenied) + } +} + +// TestAPIVersionGroup pins the canonical split-on-first-slash extraction. +// Previously this function existed three times (REST, MCP, and inside +// topology); the test now lives next to the single shared implementation. +func TestAPIVersionGroup(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"v1", ""}, + {"apps/v1", "apps"}, + {"argoproj.io/v1alpha1", "argoproj.io"}, + {"networking.k8s.io/v1", "networking.k8s.io"}, + {"serving.knative.dev/v1", "serving.knative.dev"}, + {"", ""}, + {"/v1", ""}, // leading slash → empty group + {"apps/v1/extra", "apps"}, // multi-slash → split on FIRST + } + for _, tc := range cases { + if got := APIVersionGroup(tc.in); got != tc.want { + t.Errorf("APIVersionGroup(%q) = %q, want %q", tc.in, got, tc.want) + } + } +} + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/pkg/topology/types.go b/pkg/topology/types.go index f2fa46ea9..8cde8bc39 100644 --- a/pkg/topology/types.go +++ b/pkg/topology/types.go @@ -25,11 +25,11 @@ import ( // - dashboard.go: resource counting (if applicable) // - capabilities.go: ResourcePermissions struct + permCheck array (if needs RBAC) // - dynamic_cache.go: warmup list (if CRD) -// - if the kind is cluster-scoped: add an entry to clusterScopedTopologyKinds -// in BOTH internal/server/server.go and internal/mcp/tools.go so the -// topology strip helpers can SAR-gate it. Missing the strip table leaks -// the cluster-scoped node to namespace-restricted users via /api/topology -// and the get_topology MCP tool. +// - if the kind is cluster-scoped: add an entry to topology.ClusterScopedKinds +// in pkg/topology/cluster_scoped_kinds.go so the topology strip helpers +// AND neighborhood per-node gates can SAR-check it. Missing the entry +// leaks the cluster-scoped node to namespace-restricted users via +// /api/topology, get_topology MCP, AND get_neighborhood MCP/REST. type NodeKind string const (