Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions docs/otel-sentry.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ In Sentry's trace detail view, expand a span and look under **Tags & Attributes*
| Attribute | Type | Description | Appears On |
|-----------|------|-------------|------------|
| `gen_ai.tool.name` | string | Tool name (e.g., `search_code`, `get_file_contents`) | `mcp.tool_call`, `gateway.backend.execute`, `proxy.difc_pipeline`, `proxy.backend.forward` |
| `gen_ai.operation.name` | string | GenAI operation name (`execute_tool`) | `mcp.tool_call` |
| `gen_ai.agent.name` | string | Gateway GenAI agent name (`mcp-gateway`) | `mcp.tool_call` |
| `gen_ai.agent.id` | string | Backend MCP server ID (e.g., `github`, `slack`) | `mcp.tool_call`, `gateway.backend.execute` |
| `gen_ai.conversation.id` | string | Truncated session ID for correlation | `gateway.request` |

Expand Down Expand Up @@ -73,11 +75,11 @@ In Sentry's trace detail view, expand a span and look under **Tags & Attributes*

| Attribute | Type | Description | Appears On |
|-----------|------|-------------|------------|
| `rate_limit.hit` | boolean | Whether a rate limit was triggered | `gateway.backend.execute` |
| `rate_limit.hit` | boolean | Whether a rate limit was triggered | `mcp.tool_call`, `gateway.backend.execute`, `proxy.backend.forward` |

### Span Events

For `gateway.backend.execute`, backend transport errors are recorded via `span.RecordError()` with a generic message (`"tool execution failed"`) to avoid leaking internal details to trace backends. Other spans, including `mcp.tool_call`, may record more specific error details (for example, access-denied or guard-failure conditions).
`mcp.tool_call` and `proxy.difc_pipeline` include milestone events such as `difc.pre_phases_complete`, `difc.access_denied`, and `rate_limit.detected`. For `gateway.backend.execute`, backend transport errors are recorded via `span.RecordError()` with a generic message (`"tool execution failed"`) to avoid leaking internal details to trace backends.

## Important Sentry Behavior

Expand Down
32 changes: 25 additions & 7 deletions internal/proxy/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@ import (
"io"
"net/http"
"strconv"
"time"

"go.opentelemetry.io/otel/attribute"
semconv "go.opentelemetry.io/otel/semconv/v1.41.0"
oteltrace "go.opentelemetry.io/otel/trace"

"github.com/github/gh-aw-mcpg/internal/difc"
"github.com/github/gh-aw-mcpg/internal/guard"
Expand Down Expand Up @@ -195,6 +198,9 @@ func (h *proxyHandler) handleWithDIFC(w http.ResponseWriter, r *http.Request, pa
if denied, _ := guard.HandlePrePhaseError(err); denied != nil {
logHandler.Printf("[DIFC] Phase 2: BLOCKED %s %s — %s", r.Method, path, denied.EvalResult.Reason)
deniedErr := fmt.Errorf("DIFC policy violation: %s", denied.EvalResult.Reason)
difcSpan.AddEvent("difc.access_denied", oteltrace.WithAttributes(
attribute.String("reason", denied.EvalResult.Reason),
))
tracing.RecordSpanError(difcSpan, deniedErr, "access denied: "+denied.EvalResult.Reason)
writeDIFCForbidden(w, deniedErr.Error())
return
Expand All @@ -204,6 +210,7 @@ func (h *proxyHandler) handleWithDIFC(w http.ResponseWriter, r *http.Request, pa
httputil.WriteErrorResponse(w, http.StatusBadGateway, "bad_gateway", "resource labeling failed")
return
}
difcSpan.AddEvent("difc.pre_phases_complete")

// **Phase 3: Forward to upstream GitHub API**
clientAuth := r.Header.Get("Authorization")
Expand All @@ -219,6 +226,14 @@ func (h *proxyHandler) handleWithDIFC(w http.ResponseWriter, r *http.Request, pa
}
if resp != nil {
fwdSpan.SetAttributes(semconv.HTTPResponseStatusCodeKey.Int(resp.StatusCode))
if rateLimited, resetHeader := rateLimitSignal(resp); rateLimited {
fwdSpan.SetAttributes(tracing.RateLimitHit.Bool(true))
eventAttrs := []attribute.KeyValue{}
if resetAt := httputil.ParseRateLimitResetHeader(resetHeader); !resetAt.IsZero() {
eventAttrs = append(eventAttrs, attribute.String("reset_at", resetAt.UTC().Format(time.RFC3339)))
}
difcSpan.AddEvent("rate_limit.detected", oteltrace.WithAttributes(eventAttrs...))
}
}
if resp == nil {
tracing.RecordSpanErrorOnAll(errors.New("upstream request failed"), "upstream request failed", fwdSpan, difcSpan)
Expand Down Expand Up @@ -442,16 +457,11 @@ func copyResponseHeaders(w http.ResponseWriter, resp *http.Response) {
// 1. Injects a Retry-After header so the client knows when to retry.
// 2. Logs the event at ERROR level so operators can monitor rate-limit incidents.
func injectRetryAfterIfRateLimited(w http.ResponseWriter, resp *http.Response) {
is429 := resp.StatusCode == http.StatusTooManyRequests
// Use Go's canonical header key form (textproto.CanonicalMIMEHeaderKey produces
// "X-Ratelimit-Remaining", matching GitHub's actual response headers).
remaining := resp.Header.Get("X-Ratelimit-Remaining")
resetHeader := resp.Header.Get("X-Ratelimit-Reset")

isRateLimited := is429 || remaining == "0"
isRateLimited, resetHeader := rateLimitSignal(resp)
if !isRateLimited {
return
}
remaining := resp.Header.Get("X-Ratelimit-Remaining")

resetAt := httputil.ParseRateLimitResetHeader(resetHeader)
retryAfter := httputil.ComputeRetryAfter(resetAt)
Expand All @@ -463,6 +473,14 @@ func injectRetryAfterIfRateLimited(w http.ResponseWriter, resp *http.Response) {
resp.StatusCode, remaining, resetHeader, retryAfter)
}

func rateLimitSignal(resp *http.Response) (bool, string) {
is429 := resp.StatusCode == http.StatusTooManyRequests
// Use Go's canonical header key form (textproto.CanonicalMIMEHeaderKey produces
// "X-Ratelimit-Remaining", matching GitHub's actual response headers).
remaining := resp.Header.Get("X-Ratelimit-Remaining")
return is429 || remaining == "0", resp.Header.Get("X-Ratelimit-Reset")
}

var metadataPassthrough = map[string]bool{
"/meta": true,
"/rate_limit": true,
Expand Down
35 changes: 35 additions & 0 deletions internal/proxy/rate_limit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,38 @@ func TestComputeRetryAfter(t *testing.T) {
assert.Equal(t, 3600, httputil.ComputeRetryAfter(farFuture))
})
}

func TestRateLimitSignal(t *testing.T) {
t.Parallel()

t.Run("429 status is rate limited", func(t *testing.T) {
t.Parallel()
resp := &http.Response{
StatusCode: http.StatusTooManyRequests,
Header: http.Header{"X-Ratelimit-Reset": []string{"12345"}},
}
limited, reset := rateLimitSignal(resp)
assert.True(t, limited)
assert.Equal(t, "12345", reset)
})

t.Run("remaining zero is rate limited", func(t *testing.T) {
t.Parallel()
resp := &http.Response{
StatusCode: http.StatusOK,
Header: http.Header{"X-Ratelimit-Remaining": []string{"0"}},
}
limited, _ := rateLimitSignal(resp)
assert.True(t, limited)
})

t.Run("non-rate-limited response", func(t *testing.T) {
t.Parallel()
resp := &http.Response{
StatusCode: http.StatusOK,
Header: http.Header{"X-Ratelimit-Remaining": []string{"100"}},
}
limited, _ := rateLimitSignal(resp)
assert.False(t, limited)
})
}
11 changes: 11 additions & 0 deletions internal/server/unified.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
"github.com/github/gh-aw-mcpg/internal/mcp"
"github.com/github/gh-aw-mcpg/internal/tracing"
sdk "github.com/modelcontextprotocol/go-sdk/mcp"
"go.opentelemetry.io/otel/attribute"
oteltrace "go.opentelemetry.io/otel/trace"
)

var logUnified = logger.New("server:unified")
Expand Down Expand Up @@ -431,6 +433,9 @@ func (us *UnifiedServer) callBackendTool(ctx context.Context, serverID, toolName
if denied, detailedErr := guard.HandlePrePhaseError(err); denied != nil {
logger.LogWarn("difc", "Access DENIED for agent %s to %s: %s",
agentID, denied.Resource.Description, denied.EvalResult.Reason)
toolSpan.AddEvent("difc.access_denied", oteltrace.WithAttributes(
attribute.String("reason", denied.EvalResult.Reason),
))
tracing.RecordSpanError(toolSpan, detailedErr, "access denied: "+denied.EvalResult.Reason)
httpStatusCode = 403
return mcp.NewErrorCallToolResult(detailedErr)
Expand All @@ -439,6 +444,7 @@ func (us *UnifiedServer) callBackendTool(ctx context.Context, serverID, toolName
httpStatusCode = 500
return mcp.NewErrorCallToolResult(fmt.Errorf("guard labeling failed: %w", err))
}
toolSpan.AddEvent("difc.pre_phases_complete")

// Add agent tags snapshot to context for enriched MCP backend logging (Phase 3).
ctx = context.WithValue(ctx, mcp.AgentTagsSnapshotContextKey, &mcp.AgentTagsSnapshot{
Expand Down Expand Up @@ -475,6 +481,11 @@ func (us *UnifiedServer) callBackendTool(ctx context.Context, serverID, toolName
cb.RecordRateLimit(resetAt)
execSpan.SetAttributes(tracing.RateLimitHit.Bool(true))
toolSpan.SetAttributes(tracing.RateLimitHit.Bool(true))
eventAttrs := []attribute.KeyValue{}
if !resetAt.IsZero() {
eventAttrs = append(eventAttrs, attribute.String("reset_at", resetAt.UTC().Format(time.RFC3339)))
}
toolSpan.AddEvent("rate_limit.detected", oteltrace.WithAttributes(eventAttrs...))
tracing.RecordSpanErrorOnAll(errRateLimitExceeded, rateLimitExceededStatus, execSpan, toolSpan)
httpStatusCode = 429
// Preserve the original backend error text so the agent sees the actual upstream
Expand Down
Loading