Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmd/explorer/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ func main() {
prometheusURL := flag.String("prometheus-url", "", "Manual Prometheus/VictoriaMetrics URL (skips auto-discovery)")
// MCP server
noMCP := flag.Bool("no-mcp", false, "Disable MCP (Model Context Protocol) server for AI tools")
// AI investigation
aiProvider := flag.String("ai-provider", "", "AI provider for investigations: openai or anthropic")
aiAPIKey := flag.String("ai-api-key", "", "API key for the AI provider")
aiBaseURL := flag.String("ai-base-url", "", "Base URL for OpenAI-compatible endpoints (e.g. http://localhost:11434/v1 for Ollama)")
aiModel := flag.String("ai-model", "", "Model override for AI provider (default: provider-specific)")
flag.Parse()

if *showVersion {
Expand Down Expand Up @@ -80,6 +85,10 @@ func main() {
PrometheusURL: *prometheusURL,
MCPEnabled: !*noMCP,
Version: version,
AIProvider: *aiProvider,
AIAPIKey: *aiAPIKey,
AIBaseURL: *aiBaseURL,
AIModel: *aiModel,
}

// Set global flags
Expand Down
6 changes: 6 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ require (
github.com/Masterminds/goutils v1.1.1 // indirect
github.com/Masterminds/sprig/v3 v3.3.0 // indirect
github.com/Masterminds/squirrel v1.5.4 // indirect
github.com/anthropics/anthropic-sdk-go v1.26.0 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/bep/debounce v1.2.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
Expand Down Expand Up @@ -115,6 +116,7 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/openai/openai-go/v3 v3.24.0 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/opencontainers/image-spec v1.1.1 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
Expand All @@ -134,6 +136,10 @@ require (
github.com/spf13/cast v1.10.0 // indirect
github.com/spf13/cobra v1.10.2 // indirect
github.com/spf13/pflag v1.0.10 // indirect
github.com/tidwall/gjson v1.18.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/sjson v1.2.5 // indirect
github.com/tkrajina/go-reflector v0.5.8 // indirect
github.com/valyala/bytebufferpool v1.0.0 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
Expand Down
14 changes: 14 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ github.com/Masterminds/sprig/v3 v3.3.0 h1:mQh0Yrg1XPo6vjYXgtf5OtijNAKJRNcTdOOGZe
github.com/Masterminds/sprig/v3 v3.3.0/go.mod h1:Zy1iXRYNqNLUolqCpL4uhk6SHUMAOSCzdgBfDb35Lz0=
github.com/Masterminds/squirrel v1.5.4 h1:uUcX/aBc8O7Fg9kaISIUsHXdKuqehiXAMQTYX8afzqM=
github.com/Masterminds/squirrel v1.5.4/go.mod h1:NNaOrjSoIDfDA40n7sr2tPNZRfjzjA400rg+riTZj10=
github.com/anthropics/anthropic-sdk-go v1.26.0 h1:oUTzFaUpAevfuELAP1sjL6CQJ9HHAfT7CoSYSac11PY=
github.com/anthropics/anthropic-sdk-go v1.26.0/go.mod h1:qUKmaW+uuPB64iy1l+4kOSvaLqPXnHTTBKH6RVZ7q5Q=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
Expand Down Expand Up @@ -283,6 +285,8 @@ github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns
github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
github.com/openai/openai-go/v3 v3.24.0 h1:08x6GnYiB+AAejTo6yzPY8RkZMJQ8NpreiOyM5QfyYU=
github.com/openai/openai-go/v3 v3.24.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
Expand Down Expand Up @@ -355,6 +359,16 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
github.com/tkrajina/go-reflector v0.5.8 h1:yPADHrwmUbMq4RGEyaOUpz2H90sRsETNVpjzo3DLVQQ=
github.com/tkrajina/go-reflector v0.5.8/go.mod h1:ECbqLgccecY5kPmPmXg1MrHW585yMcDkVl6IvJe64T4=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
Expand Down
233 changes: 233 additions & 0 deletions internal/ai/investigate/engine.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
package investigate

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"strings"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/labels"

	aicontext "github.com/skyhook-io/radar/internal/ai/context"
	"github.com/skyhook-io/radar/internal/ai/llm"
	"github.com/skyhook-io/radar/internal/k8s"
	"github.com/skyhook-io/radar/internal/timeline"
)

// Engine orchestrates AI-powered investigations. It holds the LLM provider
// that Investigate hands the assembled prompts and tool set to.
type Engine struct {
// provider is the LLM backend that Investigate calls to run the investigation.
provider llm.Provider
}

// NewEngine returns an Engine that runs its investigations through provider.
// The provider is stored as given; no validation is performed here.
func NewEngine(provider llm.Provider) *Engine {
	engine := new(Engine)
	engine.provider = provider
	return engine
}

// InvestigateParams identifies the resource to investigate and carries an
// optional free-form question from the user.
type InvestigateParams struct {
// Kind is the resource kind. It is lower-cased before the resource lookup
// and compared case-insensitively against event involved-object kinds.
Kind string `json:"kind"`
// Namespace is the resource namespace. When empty, events are listed across
// all namespaces and the recent-changes query is not namespace-restricted.
Namespace string `json:"namespace"`
// Name is the resource name.
Name string `json:"name"`
// Question is optional extra context from the user; when non-empty it is
// appended to the user prompt.
Question string `json:"question,omitempty"`
}

// Event represents a streamed investigation progress event delivered to the
// onEvent callback of Engine.Investigate.
type Event struct {
Type string `json:"type"` // "status", "step_start", "tool_call", "tool_result", "analysis", "error", "done"
Content string `json:"content"` // human-readable content; empty for "step_start" and "done"
Tool string `json:"tool,omitempty"` // tool name for tool_call and tool_result events
Args string `json:"args,omitempty"` // tool args for tool_call events
ToolCallID string `json:"toolCallId,omitempty"` // unique ID for correlating tool calls with results
}

// Investigate runs an AI investigation of the resource identified by params
// and reports progress through onEvent.
//
// It first emits a "status" event, assembles the initial resource context,
// and then hands the system prompt, user prompt and tool set to the LLM
// provider. Provider stream events are translated for the caller as follows:
// "step_start", "tool_call", "tool_result" and "error" keep their type,
// "text" becomes "analysis", "thinking" becomes "status", and the provider's
// own "done" (like any unrecognised type) is dropped. A single "done" event
// is emitted here once the provider returns successfully.
//
// A context-assembly failure is returned without emitting an event. A
// provider failure is both emitted as an "error" event and returned wrapped.
// onEvent must be non-nil; it is called unconditionally.
func (e *Engine) Investigate(ctx context.Context, params InvestigateParams, onEvent func(Event)) error {
	onEvent(Event{Type: "status", Content: "Assembling resource context..."})

	// Seed the conversation with what Radar already knows about the resource.
	initialContext, err := assembleInitialContext(ctx, params)
	if err != nil {
		return fmt.Errorf("failed to assemble context: %w", err)
	}

	req := llm.InvestigateRequest{
		SystemPrompt: systemPrompt,
		UserPrompt:   buildUserPrompt(params.Kind, params.Namespace, params.Name, initialContext, params.Question),
		Tools:        buildTools(),
	}

	// relay converts one provider stream event into the caller-facing Event
	// and forwards it; event types without a mapping are dropped.
	relay := func(ev llm.StreamEvent) {
		var out Event
		switch ev.Type {
		case "step_start":
			out.Type = "step_start"
		case "tool_call":
			out.Type = "tool_call"
			out.Content = fmt.Sprintf("Calling %s", ev.Tool)
			out.Tool = ev.Tool
			out.Args = ev.Args
			out.ToolCallID = ev.ToolCallID
		case "tool_result":
			out.Type = "tool_result"
			out.Content = ev.Content
			out.Tool = ev.Tool
			out.ToolCallID = ev.ToolCallID
		case "text":
			out.Type = "analysis"
			out.Content = ev.Content
		case "thinking":
			out.Type = "status"
			out.Content = ev.Content
		case "error":
			out.Type = "error"
			out.Content = ev.Content
		default:
			// Includes the provider's "done": the engine emits its own below.
			return
		}
		onEvent(out)
	}

	if _, err := e.provider.Investigate(ctx, req, relay); err != nil {
		onEvent(Event{Type: "error", Content: err.Error()})
		return fmt.Errorf("investigation failed: %w", err)
	}

	onEvent(Event{Type: "done"})
	return nil
}

// assembleInitialContext gathers the data handed to the model as the starting
// context for an investigation and returns it as one assembled string.
//
// Four sections are collected: the minified resource, its Warning events
// (deduplicated, at most 10), the last 100 log lines when the resource is a
// pod, and up to 10 "workloads" timeline changes from the past hour. Every
// section is best-effort: a failure is logged and that section is left empty.
// The only error returned is when no resource cache is available, i.e. Radar
// is not connected to a cluster.
func assembleInitialContext(ctx context.Context, params InvestigateParams) (string, error) {
	cache := k8s.GetResourceCache()
	if cache == nil {
		return "", fmt.Errorf("not connected to cluster")
	}

	kind := strings.ToLower(params.Kind)
	sections := aicontext.ContextSections{
		ResourceKind:      params.Kind,
		ResourceNamespace: params.Namespace,
		ResourceName:      params.Name,
	}

	// 1. Minified resource. Kinds the typed cache does not know fall back to
	// the dynamic lookup. errors.Is keeps that fallback working even if the
	// sentinel is ever returned wrapped.
	obj, err := k8s.FetchResource(cache, kind, params.Namespace, params.Name)
	switch {
	case err == nil:
		k8s.SetTypeMeta(obj)
		minified, minErr := aicontext.Minify(obj, aicontext.LevelDetail)
		if minErr != nil {
			log.Printf("[ai] Failed to minify %s %s/%s: %v", kind, params.Namespace, params.Name, minErr)
		} else {
			sections.MinifiedResource = initialContextJSON("resource", minified)
		}
	case errors.Is(err, k8s.ErrUnknownKind):
		u, dynErr := cache.GetDynamicWithGroup(ctx, kind, params.Namespace, params.Name, "")
		if dynErr != nil {
			log.Printf("[ai] Failed to fetch %s %s/%s dynamically: %v", kind, params.Namespace, params.Name, dynErr)
		} else {
			sections.MinifiedResource = initialContextJSON("resource", aicontext.MinifyUnstructured(u, aicontext.LevelDetail))
		}
	default:
		log.Printf("[ai] Failed to fetch %s %s/%s: %v", kind, params.Namespace, params.Name, err)
	}

	// 2. Events for this resource
	if eventLister := cache.Events(); eventLister != nil {
		var (
			events  []*corev1.Event
			listErr error
		)
		if params.Namespace != "" {
			events, listErr = eventLister.Events(params.Namespace).List(labels.Everything())
		} else {
			events, listErr = eventLister.List(labels.Everything())
		}
		if listErr != nil {
			log.Printf("[ai] Failed to list events for %s %s/%s: %v", kind, params.Namespace, params.Name, listErr)
		}
		if matched := matchWarningEvents(events, params.Kind, params.Name); len(matched) > 0 {
			deduplicated := aicontext.DeduplicateEvents(matched)
			if len(deduplicated) > 10 {
				deduplicated = deduplicated[:10]
			}
			sections.Events = initialContextJSON("events", deduplicated)
		}
	}

	// 3. Logs (if pod)
	if isPodKind(kind) {
		sections.Logs = fetchPodLogTail(ctx, params.Namespace, params.Name)
	}

	// 4. Recent changes
	sections.Metrics = summarizeRecentChanges(ctx, params.Namespace) // Reuse metrics slot for changes in initial context

	assembled := aicontext.AssembleContext(sections, aicontext.BudgetCloud)
	log.Printf("[ai] Assembled initial context: %d chars", len(assembled))
	return assembled, nil
}

// initialContextJSON renders v as indented JSON for one context section. An
// encoding failure is logged and yields an empty section.
func initialContextJSON(section string, v any) string {
	data, err := json.MarshalIndent(v, "", " ")
	if err != nil {
		log.Printf("[ai] Failed to encode %s for initial context: %v", section, err)
		return ""
	}
	return string(data)
}

// matchWarningEvents returns copies of the Warning events whose involved
// object has the given name and a kind matching the requested kind.
func matchWarningEvents(events []*corev1.Event, kind, name string) []corev1.Event {
	var matched []corev1.Event
	for _, ev := range events {
		if ev.Type != "Warning" || ev.InvolvedObject.Name != name {
			continue
		}
		if eventKindMatches(ev.InvolvedObject.Kind, kind) {
			matched = append(matched, *ev)
		}
	}
	return matched
}

// eventKindMatches reports whether an event's involved-object kind refers to
// the requested kind, ignoring case. The requested kind may be a simple plural
// ("pods", "ingresses", "networkpolicies"): isPodKind accepts "pods", so plural
// input is expected, and an exact comparison against the singular kind would
// silently drop every event for it.
func eventKindMatches(involvedKind, requested string) bool {
	if strings.EqualFold(involvedKind, requested) {
		return true
	}
	singular := strings.ToLower(involvedKind)
	want := strings.ToLower(requested)
	if want == singular+"s" || want == singular+"es" {
		return true
	}
	return strings.HasSuffix(singular, "y") && want == strings.TrimSuffix(singular, "y")+"ies"
}

// fetchPodLogTail returns the filtered last 100 log lines of the pod as JSON,
// or "" when no client is available or the logs cannot be read. The stream is
// closed before returning so it is not held open while the rest of the context
// is assembled.
func fetchPodLogTail(ctx context.Context, namespace, name string) string {
	client := k8s.GetClient()
	if client == nil {
		return ""
	}

	tailLines := int64(100)
	opts := &corev1.PodLogOptions{TailLines: &tailLines}
	stream, err := client.CoreV1().Pods(namespace).GetLogs(name, opts).Stream(ctx)
	if err != nil {
		log.Printf("[ai] Failed to open logs for pod %s/%s: %v", namespace, name, err)
		return ""
	}
	defer stream.Close()

	data, err := io.ReadAll(stream)
	if err != nil {
		log.Printf("[ai] Failed to read logs for pod %s/%s: %v", namespace, name, err)
		return ""
	}
	return initialContextJSON("logs", aicontext.FilterLogs(string(data)))
}

// summarizeRecentChanges returns up to 10 "workloads" timeline changes from
// the past hour as JSON, limited to namespace when it is non-empty. It returns
// "" when there is no timeline store, the query fails, or nothing changed.
func summarizeRecentChanges(ctx context.Context, namespace string) string {
	store := timeline.GetStore()
	if store == nil {
		return ""
	}

	queryOpts := timeline.QueryOptions{
		Since:        time.Now().Add(-1 * time.Hour),
		FilterPreset: "workloads",
		Limit:        10,
	}
	if namespace != "" {
		queryOpts.Namespaces = []string{namespace}
	}

	changes, err := store.Query(ctx, queryOpts)
	if err != nil {
		log.Printf("[ai] Failed to query recent changes: %v", err)
		return ""
	}
	if len(changes) == 0 {
		return ""
	}

	type change struct {
		Kind       string `json:"kind"`
		Name       string `json:"name"`
		ChangeType string `json:"changeType"`
		Summary    string `json:"summary"`
		Timestamp  string `json:"timestamp"`
	}
	summaries := make([]change, 0, len(changes))
	for _, c := range changes {
		// Prefer the diff summary; fall back to a truncated message.
		summary := ""
		if c.Diff != nil && c.Diff.Summary != "" {
			summary = c.Diff.Summary
		} else if c.Message != "" {
			summary = k8s.Truncate(c.Message, 100)
		}
		summaries = append(summaries, change{
			Kind:       c.Kind,
			Name:       c.Name,
			ChangeType: string(c.EventType),
			Summary:    summary,
			Timestamp:  c.Timestamp.Format(time.RFC3339),
		})
	}
	return initialContextJSON("recent changes", summaries)
}

// isPodKind reports whether kind names the pod resource, in either its
// singular or plural lower-case form. The comparison is case-sensitive, so
// callers are expected to pass an already lower-cased kind.
func isPodKind(kind string) bool {
	switch kind {
	case "pod", "pods":
		return true
	default:
		return false
	}
}
44 changes: 44 additions & 0 deletions internal/ai/investigate/prompt.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package investigate

// systemPrompt is the system message sent with every investigation (it is the
// SystemPrompt of the request built by Engine.Investigate). It sets the SRE
// persona, the communication style, the order in which evidence should be
// checked, and the markdown layout expected for the final analysis.
const systemPrompt = `You are an expert Kubernetes SRE investigating a problem in a live cluster. Your goal is to identify the root cause and suggest actionable fixes.

## Communication style
- Think out loud. Before calling a tool, briefly say what you're checking and why.
- After getting results, share what you learned in 1-2 sentences before moving on.
- Use markdown: ## for section headers, **bold** for emphasis, ` + "`" + `code` + "`" + ` for resource names.
- Sound like a helpful colleague, not a report generator.

## Investigation approach
1. Review the resource context provided — status, conditions, obvious issues
2. Check events for warnings or errors
3. If Pod-related, check logs for error patterns
4. Check recent changes that correlate with the problem
5. Check related resources for upstream issues

## Final analysis format
When you have enough evidence, summarize with:

## Root cause
Clear statement of what went wrong, with evidence.

## Why this happened
Underlying cause explanation.

## Recommended fix
Specific actionable steps. Mention Radar actions (restart, rollback, scale) when applicable.

## Guidelines
- Don't repeat raw JSON data — summarize what you found in plain language.
- Stop investigating when you have enough evidence — don't make unnecessary tool calls.
- If the resource looks healthy, say so and check if the problem has self-resolved.`

// buildUserPrompt composes the opening user message for an investigation: a
// fixed instruction line followed by initialContext and, when question is
// non-empty, the user's additional context.
//
// kind, namespace and name are accepted but not currently used; the resource
// is identified only through whatever initialContext contains.
func buildUserPrompt(kind, namespace, name string, initialContext string, question string) string {
	const header = "Investigate this Kubernetes resource that appears to have a problem:\n\n"

	if question == "" {
		return header + initialContext
	}
	return header + initialContext + "\n\nAdditional context from the user: " + question
}
Loading
Loading