From 573d6cb5b2418aa805b297d58799bfd3859de5a4 Mon Sep 17 00:00:00 2001 From: "Jonathan A. Sternberg" Date: Fri, 17 Jan 2025 09:36:31 -0600 Subject: [PATCH] cache: add tracing to prune Adds tracing to the cache manager prune operation. The attribute will show both explicit prunes from a user and background prunes initiated after solve requests are processed. Signed-off-by: Jonathan A. Sternberg --- cache/manager.go | 53 +++++++++++++++++++++++- cmd/buildkitd/main.go | 7 +++- cmd/buildkitd/main_containerd_worker.go | 1 + cmd/buildkitd/main_oci_worker.go | 1 + util/tracing/tracing.go | 55 ++++++++++++++++++++++--- worker/base/worker.go | 3 ++ 6 files changed, 111 insertions(+), 9 deletions(-) diff --git a/cache/manager.go b/cache/manager.go index f5af2c6a97d2..bb9702ddb606 100644 --- a/cache/manager.go +++ b/cache/manager.go @@ -25,10 +25,13 @@ import ( "github.com/moby/buildkit/util/disk" "github.com/moby/buildkit/util/flightcontrol" "github.com/moby/buildkit/util/progress" + "github.com/moby/buildkit/util/tracing" digest "github.com/opencontainers/go-digest" imagespecidentity "github.com/opencontainers/image-spec/identity" ocispecs "github.com/opencontainers/image-spec/specs-go/v1" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" ) @@ -38,6 +41,17 @@ var ( errInvalid = errors.New("invalid") ) +var ( + PruneFilterAttribute = attribute.Key("moby.buildkit.prune.filter") + PruneAllAttribute = attribute.Key("moby.buildkit.prune.all") + PruneGCPolicyKeepDuration = attribute.Key("moby.buildkit.prune.gcpolicy.keepduration") + PruneGCPolicyReservedSpace = attribute.Key("moby.buildkit.prune.gcpolicy.reservedspace") + PruneGCPolicyMaxUsedSpace = attribute.Key("moby.buildkit.prune.gcpolicy.maxusedspace") + PruneGCPolicyMinFreeSpace = attribute.Key("moby.buildkit.prune.gcpolicy.minfreespace") + PruneDeleteSizeAttribute = attribute.Key("moby.buildkit.prune.delete.size") + PruneDeleteCountAttribute = attribute.Key("moby.buildkit.prune.delete.count") +) + const maxPruneBatch = 10 // maximum number of refs to prune while holding the manager lock type ManagerOpt struct { @@ -51,6 +65,7 @@ type ManagerOpt struct { MetadataStore *metadata.Store Root string MountPoolRoot string + TracerProvider trace.TracerProvider } type Accessor interface { @@ -95,7 +110,8 @@ type cacheManager struct { Differ diff.Comparer MetadataStore *metadata.Store - root string + root string + tracer trace.Tracer mountPool sharableMountPool @@ -114,6 +130,7 @@ func NewManager(opt ManagerOpt) (Manager, error) { Differ: opt.Differ, MetadataStore: opt.MetadataStore, root: opt.Root, + tracer: tracing.Tracer(opt.TracerProvider), records: make(map[string]*cacheRecord), } @@ -1009,6 +1026,9 @@ func (cm *cacheManager) createDiffRef(ctx context.Context, parents parentRefs, d } func (cm *cacheManager) Prune(ctx context.Context, ch chan client.UsageInfo, opts ...client.PruneInfo) error { + ctx, span := cm.tracer.Start(ctx, "(*cacheManager).Prune") + defer span.End() + cm.muPrune.Lock() for _, opt := range opts { @@ -1030,6 +1050,18 @@ func (cm *cacheManager) Prune(ctx context.Context, ch chan client.UsageInfo, opt } func (cm *cacheManager) prune(ctx context.Context, ch chan client.UsageInfo, opt client.PruneInfo) error { + ctx, span := cm.tracer.Start(ctx, "(*cacheManager).prune", + trace.WithAttributes( + PruneFilterAttribute.StringSlice(opt.Filter), + PruneAllAttribute.Bool(opt.All), + PruneGCPolicyKeepDuration.Int64(int64(opt.KeepDuration)), + PruneGCPolicyReservedSpace.Int64(opt.ReservedSpace), + PruneGCPolicyMaxUsedSpace.Int64(opt.MaxUsedSpace), + PruneGCPolicyMinFreeSpace.Int64(opt.MinFreeSpace), + ), + ) + defer span.End() + filter, err := filters.ParseAll(opt.Filter...) if err != nil { return errors.Wrapf(err, "failed to parse prune filters %v", opt.Filter) @@ -1066,6 +1098,17 @@ func (cm *cacheManager) prune(ctx context.Context, ch chan client.UsageInfo, opt } } + var ( + totalReleasedSize int64 + totalReleasedCount int64 + ) + defer func() { + span.SetAttributes( + PruneDeleteSizeAttribute.Int64(totalReleasedSize), + PruneDeleteCountAttribute.Int64(totalReleasedCount), + ) + }() + popt := pruneOpt{ filter: filter, all: opt.All, @@ -1076,9 +1119,15 @@ func (cm *cacheManager) prune(ctx context.Context, ch chan client.UsageInfo, opt } for { releasedSize, releasedCount, err := cm.pruneOnce(ctx, ch, popt) - if err != nil || releasedCount == 0 { + if err != nil { return err } + totalReleasedSize += releasedSize + totalReleasedCount += releasedCount + + if releasedCount == 0 { + return nil + } popt.totalSize -= releasedSize } } diff --git a/cmd/buildkitd/main.go b/cmd/buildkitd/main.go index cdfe9a948a0e..7b71f46c3a6b 100644 --- a/cmd/buildkitd/main.go +++ b/cmd/buildkitd/main.go @@ -68,6 +68,7 @@ import ( "go.opentelemetry.io/otel/propagation" sdkmetric "go.opentelemetry.io/otel/sdk/metric" sdktrace "go.opentelemetry.io/otel/sdk/trace" + "go.opentelemetry.io/otel/trace" tracev1 "go.opentelemetry.io/proto/otlp/collector/trace/v1" "golang.org/x/sync/errgroup" "google.golang.org/grpc" @@ -90,6 +91,7 @@ type workerInitializerOpt struct { config *config.Config sessionManager *session.Manager traceSocket string + tracerProvider trace.TracerProvider } type workerInitializer struct { @@ -336,7 +338,7 @@ func main() { return err } - controller, err := newController(ctx, c, &cfg) + controller, err := newController(ctx, c, &cfg, tp) if err != nil { return err } @@ -747,7 +749,7 @@ func serverCredentials(cfg config.TLSConfig) (*tls.Config, error) { return tlsConf, nil } -func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*control.Controller, error) { +func newController(ctx context.Context, c *cli.Context, cfg *config.Config, tp trace.TracerProvider) (*control.Controller, error) { sessionManager, err := session.NewManager() if err != nil { return nil, err @@ -776,6 +778,7 @@ func newController(ctx context.Context, c *cli.Context, cfg *config.Config) (*co config: cfg, sessionManager: sessionManager, traceSocket: traceSocket, + tracerProvider: tp, }) if err != nil { return nil, err diff --git a/cmd/buildkitd/main_containerd_worker.go b/cmd/buildkitd/main_containerd_worker.go index d07bc99ec2bf..946debf77099 100644 --- a/cmd/buildkitd/main_containerd_worker.go +++ b/cmd/buildkitd/main_containerd_worker.go @@ -348,6 +348,7 @@ func containerdWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([ opt.GCPolicy = getGCPolicy(cfg.GCConfig, common.config.Root) opt.BuildkitVersion = getBuildkitVersion() opt.RegistryHosts = resolverFunc(common.config) + opt.TracerProvider = common.tracerProvider if platformsStr := cfg.Platforms; len(platformsStr) != 0 { platforms, err := parsePlatforms(platformsStr) diff --git a/cmd/buildkitd/main_oci_worker.go b/cmd/buildkitd/main_oci_worker.go index a8a27ed4f61a..a57525dad50b 100644 --- a/cmd/buildkitd/main_oci_worker.go +++ b/cmd/buildkitd/main_oci_worker.go @@ -322,6 +322,7 @@ func ociWorkerInitializer(c *cli.Context, common workerInitializerOpt) ([]worker opt.GCPolicy = getGCPolicy(cfg.GCConfig, common.config.Root) opt.BuildkitVersion = getBuildkitVersion() opt.RegistryHosts = hosts + opt.TracerProvider = common.tracerProvider if platformsStr := cfg.Platforms; len(platformsStr) != 0 { platforms, err := parsePlatforms(platformsStr) diff --git a/util/tracing/tracing.go b/util/tracing/tracing.go index 2df1c4627387..416ad96bbee5 100644 --- a/util/tracing/tracing.go +++ b/util/tracing/tracing.go @@ -5,7 +5,13 @@ import ( "fmt" "net/http" "net/http/httptrace" + "runtime" + "strings" + "sync" + "github.com/moby/buildkit/util/bklog" + "github.com/moby/buildkit/util/stack" + "github.com/pkg/errors" "go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace" "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" "go.opentelemetry.io/otel/attribute" @@ -14,18 +20,57 @@ import ( semconv "go.opentelemetry.io/otel/semconv/v1.26.0" "go.opentelemetry.io/otel/trace" "go.opentelemetry.io/otel/trace/noop" +) - "github.com/pkg/errors" +var noopTracer = sync.OnceValue(func() trace.TracerProvider { + return noop.NewTracerProvider() +}) - "github.com/moby/buildkit/util/bklog" - "github.com/moby/buildkit/util/stack" -) +// Tracer is a utility function for creating a tracer. It will create +// a tracer with the name matching the package of the caller with the given options. +// +// This function isn't meant to be called in a tight loop. It is intended to be +// called on initialization and the tracer is supposed to be retained for future use. +func Tracer(tp trace.TracerProvider, options ...trace.TracerOption) trace.Tracer { + if tp == nil { + // Potentially consider using otel.GetTracerProvider here, but + // we know of some issues where the default tracer provider can cause + // memory leaks if it is never initialized so just being cautious + // and using the noop tracer because buildkit itself doesn't use the global + // tracer provider. + return noopTracer().Tracer("", options...) + } + + var ( + name string + callers [1]uintptr + ) + + if runtime.Callers(2, callers[:]) > 0 { + frames := runtime.CallersFrames(callers[:]) + frame, _ := frames.Next() + + if frame.Function != "" { + lastSlash := strings.LastIndex(frame.Function, "/") + if lastSlash < 0 { + lastSlash = 0 + } + + if funcStart := strings.Index(frame.Function[lastSlash:], "."); funcStart >= 0 { + name = frame.Function[:lastSlash+funcStart] + } else { + name = frame.Function + } + } + } + return tp.Tracer(name, options...) +} // StartSpan starts a new span as a child of the span in context. // If there is no span in context then this is a no-op. func StartSpan(ctx context.Context, operationName string, opts ...trace.SpanStartOption) (trace.Span, context.Context) { parent := trace.SpanFromContext(ctx) - tracer := noop.NewTracerProvider().Tracer("") + tracer := noopTracer().Tracer("") if parent != nil && parent.SpanContext().IsValid() { tracer = parent.TracerProvider().Tracer("") } diff --git a/worker/base/worker.go b/worker/base/worker.go index ffe78f6ec6b1..fee4fea45160 100644 --- a/worker/base/worker.go +++ b/worker/base/worker.go @@ -50,6 +50,7 @@ import ( digest "github.com/opencontainers/go-digest" ocispecs "github.com/opencontainers/image-spec/specs-go/v1" "github.com/pkg/errors" + "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" "golang.org/x/sync/semaphore" ) @@ -82,6 +83,7 @@ type WorkerOpt struct { MetadataStore *metadata.Store MountPoolRoot string ResourceMonitor *resources.Monitor + TracerProvider trace.TracerProvider } // Worker is a local worker instance with dedicated snapshotter, cache, and so on. @@ -113,6 +115,7 @@ func NewWorker(ctx context.Context, opt WorkerOpt) (*Worker, error) { MetadataStore: opt.MetadataStore, Root: opt.Root, MountPoolRoot: opt.MountPoolRoot, + TracerProvider: opt.TracerProvider, }) if err != nil { return nil, err