diff --git a/fdbkubernetesmonitor/api/annotations.go b/fdbkubernetesmonitor/api/annotations.go index e18aac57f1b..d7017d61789 100644 --- a/fdbkubernetesmonitor/api/annotations.go +++ b/fdbkubernetesmonitor/api/annotations.go @@ -20,28 +20,34 @@ package api const ( + // FoundationDBPrefix represents the prefix for all FoundationDB related annotations. + FoundationDBPrefix = "foundationdb.org/" + // CurrentConfigurationAnnotation is the annotation we use to store the // latest configuration. - CurrentConfigurationAnnotation = "foundationdb.org/launcher-current-configuration" + CurrentConfigurationAnnotation = FoundationDBPrefix + "launcher-current-configuration" // EnvironmentAnnotation is the annotation we use to store the environment // variables. - EnvironmentAnnotation = "foundationdb.org/launcher-environment" + EnvironmentAnnotation = FoundationDBPrefix + "launcher-environment" // OutdatedConfigMapAnnotation is the annotation we read to get notified of // outdated configuration. - OutdatedConfigMapAnnotation = "foundationdb.org/outdated-config-map-seen" + OutdatedConfigMapAnnotation = FoundationDBPrefix + "outdated-config-map-seen" // DelayShutdownAnnotation defines how long the FDB Kubernetes monitor process should sleep before shutting itself down. // The FDB Kubernetes monitor will always shutdown all fdbserver processes, independent of this setting. // The value of this annotation must be a duration like "60s". - DelayShutdownAnnotation = "foundationdb.org/delay-shutdown" + DelayShutdownAnnotation = FoundationDBPrefix + "delay-shutdown" // ClusterFileChangeDetectedAnnotation is the annotation that will be updated if the fdb.cluster file is updated. - ClusterFileChangeDetectedAnnotation = "foundationdb.org/cluster-file-change" + ClusterFileChangeDetectedAnnotation = FoundationDBPrefix + "cluster-file-change" // IsolateProcessGroupAnnotation is the annotation that defines if the current Pod should be isolated. Isolated // process groups will shutdown the fdbserver instance but keep the Pod and other Kubernetes resources running // for debugging purpose. - IsolateProcessGroupAnnotation = "foundationdb.org/isolate-process-group" + IsolateProcessGroupAnnotation = FoundationDBPrefix + "isolate-process-group" + + // AvailableBinariesAnnotation is the annotation we use to store the available binaries on this Pod. + AvailableBinariesAnnotation = FoundationDBPrefix + "available-binaries" ) diff --git a/fdbkubernetesmonitor/go.mod b/fdbkubernetesmonitor/go.mod index 3a469e57561..081fcdf214d 100644 --- a/fdbkubernetesmonitor/go.mod +++ b/fdbkubernetesmonitor/go.mod @@ -52,6 +52,7 @@ require ( github.com/go-openapi/swag v0.23.1 // indirect github.com/go-task/slim-sprig/v3 v3.0.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect + github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a // indirect @@ -72,6 +73,7 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect golang.org/x/net v0.41.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sync v0.15.0 // indirect golang.org/x/sys v0.33.0 // indirect golang.org/x/term v0.32.0 // indirect golang.org/x/text v0.26.0 // indirect diff --git a/fdbkubernetesmonitor/go.sum b/fdbkubernetesmonitor/go.sum index 62f66222201..7372e10fa9d 100644 --- a/fdbkubernetesmonitor/go.sum +++ b/fdbkubernetesmonitor/go.sum @@ -9,6 +9,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= @@ -29,6 +31,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= @@ -124,6 +128,8 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/fdbkubernetesmonitor/kubernetes.go b/fdbkubernetesmonitor/kubernetes.go index f82e00def97..8fe9313b06b 100644 --- a/fdbkubernetesmonitor/kubernetes.go +++ b/fdbkubernetesmonitor/kubernetes.go @@ -225,9 +225,28 @@ func (podClient *kubernetesClient) updateAnnotations(monitor *monitor) error { return err } + availableBinaries, err := json.Marshal(monitor.availableBinaries) + if err != nil { + return err + } + return podClient.updateAnnotationsOnPod(map[string]string{ api.CurrentConfigurationAnnotation: string(monitor.activeConfigurationBytes), api.EnvironmentAnnotation: string(jsonEnvironment), + api.AvailableBinariesAnnotation: string(availableBinaries), + }) +} + +// updateAvailableBinariesAnnotation updates the api.AvailableBinariesAnnotation annotation on the pod +// after a new fdbserver binary was copied into the shared directory. +func (podClient *kubernetesClient) updateAvailableBinariesAnnotation(currentBinaries map[string]struct{}) error { + availableBinaries, err := json.Marshal(currentBinaries) + if err != nil { + return err + } + + return podClient.updateAnnotationsOnPod(map[string]string{ + api.AvailableBinariesAnnotation: string(availableBinaries), }) } diff --git a/fdbkubernetesmonitor/main.go b/fdbkubernetesmonitor/main.go index f161ed1acfb..07f4697bcbd 100644 --- a/fdbkubernetesmonitor/main.go +++ b/fdbkubernetesmonitor/main.go @@ -31,13 +31,13 @@ import ( "syscall" "github.com/apple/foundationdb/fdbkubernetesmonitor/api" - "github.com/go-logr/logr" "github.com/go-logr/zapr" "github.com/spf13/pflag" "go.uber.org/zap" "go.uber.org/zap/zapcore" "gopkg.in/natefinch/lumberjack.v2" + ctrl "sigs.k8s.io/controller-runtime" ) var ( @@ -174,6 +174,9 @@ func main() { logger.Error(err, "Error parsing container version", "currentContainerVersion", currentContainerVersion) os.Exit(1) } + + // Update the logger for the controller-runtime. + ctrl.SetLogger(logger) startMonitor(context.Background(), logger, path.Join(inputDir, monitorConfFile), customEnvironment, processCount, promConfig, enablePprof, parsedVersion, enableNodeWatch) case executionModeInit: err = copyFiles(logger, outputDir, copyDetails, requiredCopies) diff --git a/fdbkubernetesmonitor/monitor.go b/fdbkubernetesmonitor/monitor.go index 0f45aff47d6..5ed3be80d29 100644 --- a/fdbkubernetesmonitor/monitor.go +++ b/fdbkubernetesmonitor/monitor.go @@ -33,6 +33,7 @@ import ( "os/exec" "os/signal" "path" + "path/filepath" "strconv" "strings" "sync" @@ -107,6 +108,15 @@ type monitor struct { // metrics represents the prometheus monitor metrics. metrics *metrics + + // runVersionCommand when set to false the monitor will not try to run the fdbserver --version command to ensure + // that the binary is executable. + runVersionCommand bool + + // availableBinaries represents all available binaries in the sharedBinaryDir. Most of the time this map will be + // empty but during version incompatible upgrades, this information will be used to signal the operator that + // the new fdbserver binary is present and executable in the sharedBinaryDir. + availableBinaries map[string]struct{} } type httpConfig struct { @@ -129,6 +139,8 @@ func startMonitor(ctx context.Context, logger logr.Logger, configFile string, cu processCount: processCount, processIDs: make([]int, processCount+1), currentContainerVersion: currentContainerVersion, + runVersionCommand: true, + availableBinaries: map[string]struct{}{}, } go func() { mon.watchPodTimestamps() }() @@ -253,7 +265,8 @@ func (monitor *monitor) readConfiguration() (*api.ProcessConfiguration, []byte) configuration.BinaryPath = path.Join(sharedBinaryDir, configuration.Version.String(), "fdbserver") } - err = checkOwnerExecutable(configuration.BinaryPath) + // TODO (johscheuer): Should we run this check every time? + err = checkOwnerExecutable(configuration.BinaryPath, monitor.runVersionCommand) if err != nil { monitor.logger.Error(err, "Error with binary path for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath) return nil, nil @@ -295,7 +308,7 @@ func (monitor *monitor) loadConfiguration() { // checkOwnerExecutable validates that a path is a file that exists and is // executable by its owner. -func checkOwnerExecutable(path string) error { +func checkOwnerExecutable(path string, runVersionCommand bool) error { binaryStat, err := os.Stat(path) if err != nil { return err @@ -303,6 +316,18 @@ func checkOwnerExecutable(path string) error { if binaryStat.Mode()&0o100 == 0 { return fmt.Errorf("binary is not executable") } + + if !runVersionCommand { + return nil + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + cmd := exec.CommandContext(ctx, path, "--version") + if err = cmd.Run(); err != nil { + return fmt.Errorf("could not run the version command with binary: %s, error: %w", path, err) + } + return nil } @@ -523,7 +548,7 @@ func (monitor *monitor) watchConfiguration(watcher *fsnotify.Watcher) { return } - monitor.logger.Info("Detected event on monitor conf file or cluster file", "event", event) + monitor.logger.Info("Detected event on monitor conf file, cluster file or shared binaries", "event", event) if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create { monitor.handleFileChange(event.Name) } else if event.Op&fsnotify.Remove == fsnotify.Remove { @@ -542,6 +567,59 @@ func (monitor *monitor) watchConfiguration(watcher *fsnotify.Watcher) { } } +// getBinariesFromSharedBinaryDir returns all fdbserver binaries that are found in the shared binary directory. +func (monitor *monitor) waitForSharedBinariesAndUpdateAnnotation(dir string) error { + var fdbserverBinaries []string + + startTime := time.Now() + for len(fdbserverBinaries) == 0 { + // If after 5 minutes the new fdbserver binary was not copied, something is probably wrong. + if time.Since(startTime) > 5*time.Minute { + return fmt.Errorf("could not find fdbserver binary in shared binary dir after more than 2 minutes") + } + + monitor.logger.Info("Checking shared binary dir for new fdbserver binary", "sharedBinaryDir", sharedBinaryDir, "fdbserverBinaries", fdbserverBinaries) + err := filepath.Walk(dir, + func(currentPath string, info os.FileInfo, err error) error { + if err != nil { + return err + } + + if info.IsDir() { + return nil + } + + if path.Base(currentPath) != "fdbserver" { + return nil + } + + monitor.logger.Info("found new fdbserver binary in shared binary dir", "sharedBinaryDir", sharedBinaryDir, "currentPath", currentPath) + fdbserverBinaries = append(fdbserverBinaries, currentPath) + + return nil + }) + + if err != nil { + monitor.logger.Error(err, "Error getting binaries from sharedBinaryDir", "sharedBinaryDir", sharedBinaryDir) + } + + time.Sleep(1 * time.Second) + } + + for _, binary := range fdbserverBinaries { + err := checkOwnerExecutable(binary, monitor.runVersionCommand) + if err != nil { + monitor.logger.Error(err, "Error with binary in shared binary directory", "sharedBinaryDir", sharedBinaryDir, "binary", binary) + continue + } + + monitor.availableBinaries[binary] = struct{}{} + monitor.logger.Info("Adding new binary to available binaries", "sharedBinaryDir", sharedBinaryDir, "binary", binary) + } + + return monitor.podClient.updateAvailableBinariesAnnotation(monitor.availableBinaries) +} + // handleFileChange will perform the required action based on the changed/modified file. func (monitor *monitor) handleFileChange(changedFile string) { if changedFile == fdbClusterFilePath { @@ -552,6 +630,18 @@ func (monitor *monitor) handleFileChange(changedFile string) { return } + // If the changed file is in the shared binary path, check if the binary can be executed. If the binary is + // executable then we can add it to the available binaries. + if strings.HasPrefix(changedFile, sharedBinaryDir) { + go func(dir string) { + err := monitor.waitForSharedBinariesAndUpdateAnnotation(sharedBinaryDir) + if err != nil { + monitor.logger.Error(err, "Error getting binaries from sharedBinaryDir", "sharedBinaryDir", sharedBinaryDir, "changedFile", changedFile) + return + } + }(sharedBinaryDir) + } + monitor.loadConfiguration() } @@ -615,6 +705,13 @@ func (monitor *monitor) run() { panic(err) } + // Create a watcher for the sharedBinaryDir, this watcher will update the available binaries during an upgrade. + monitor.logger.Info("adding watch for shared binary path", "path", path.Dir(sharedBinaryDir)) + err = watcher.Add(path.Dir(sharedBinaryDir)) + if err != nil { + panic(err) + } + defer func(watcher *fsnotify.Watcher) { err := watcher.Close() if err != nil {