@@ -5,12 +5,14 @@ import (
55 "context"
66 "crypto/tls"
77 "crypto/x509"
8+ "errors"
89 "fmt"
910 "net/http"
1011 "os"
1112 "path/filepath"
1213 "strconv"
1314 "strings"
15+ "sync/atomic"
1416 "time"
1517
1618 "github.com/gorilla/mux"
@@ -25,7 +27,10 @@ import (
2527 corev1 "k8s.io/api/core/v1"
2628 apierrors "k8s.io/apimachinery/pkg/api/errors"
2729 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+ "k8s.io/client-go/kubernetes"
2831 "k8s.io/client-go/rest"
32+ "k8s.io/client-go/tools/leaderelection"
33+ "k8s.io/client-go/tools/leaderelection/resourcelock"
2934)
3035
3136const (
@@ -47,6 +52,10 @@ const (
4752
4853var caFile = filepath .Join (os .TempDir (), "k8s-webhook-server" , "client-ca" , "ca.crt" )
4954
55+ // leaderFlag indicates whether this process is the elected leader.
56+ // Gate config mutation work on this to avoid concurrent writers.
57+ var leaderFlag atomic.Bool
58+
5059// tlsOpt option function applied to all webhook servers.
5160var tlsOpt = func (config * tls.Config ) {
5261 config .MinVersion = tls .VersionTLS12
@@ -85,6 +94,51 @@ func ListenAndServe(ctx context.Context, cfg *rest.Config, mcmEnabled bool) erro
8594 return err
8695 }
8796
97+ k8sClient , err := kubernetes .NewForConfig (cfg )
98+ if err != nil {
99+ return fmt .Errorf ("failed to create kubernetes clientset: %w" , err )
100+ }
101+ id , err := os .Hostname ()
102+ if err != nil || id == "" {
103+ id = fmt .Sprintf ("rancher-webhook-%d" , time .Now ().UnixNano ())
104+ }
105+
106+ lock := & resourcelock.LeaseLock {
107+ LeaseMeta : metav1.ObjectMeta {
108+ Name : "rancher-webhook-leader" ,
109+ Namespace : namespace ,
110+ },
111+ Client : k8sClient .CoordinationV1 (),
112+ LockConfig : resourcelock.ResourceLockConfig {
113+ Identity : id ,
114+ },
115+ }
116+
117+ go leaderelection .RunOrDie (ctx , leaderelection.LeaderElectionConfig {
118+ Lock : lock ,
119+ LeaseDuration : 15 * time .Second ,
120+ RenewDeadline : 10 * time .Second ,
121+ RetryPeriod : 2 * time .Second ,
122+ ReleaseOnCancel : true ,
123+ Callbacks : leaderelection.LeaderCallbacks {
124+ OnStartedLeading : func (_ context.Context ) {
125+ leaderFlag .Store (true )
126+ logrus .Infof ("[%s] elected leader: will manage webhook configurations" , id )
127+ },
128+ OnStoppedLeading : func () {
129+ leaderFlag .Store (false )
130+ logrus .Infof ("[%s] lost leadership: will stop managing webhook configurations" , id )
131+ },
132+ OnNewLeader : func (identity string ) {
133+ if identity == id {
134+ logrus .Infof ("[%s] I am the new leader" , id )
135+ } else {
136+ logrus .Infof ("[%s] observed new leader: %s" , id , identity )
137+ }
138+ },
139+ },
140+ })
141+
88142 if err = listenAndServe (ctx , clients , validators , mutators ); err != nil {
89143 return err
90144 }
@@ -111,6 +165,7 @@ func listenAndServe(ctx context.Context, clients *clients.Clients, validators []
111165 router := mux .NewRouter ()
112166 errChecker := health .NewErrorChecker ("Config Applied" )
113167 health .RegisterHealthCheckers (router , errChecker )
168+ errChecker .Store (errors .New ("webhook configuration not yet applied" ))
114169 router .Use (certAuth ())
115170
116171 logrus .Debug ("Creating Webhook routes" )
@@ -178,14 +233,20 @@ type secretHandler struct {
178233}
179234
180235// sync updates the validating admission configuration whenever the TLS cert changes.
236+ // Only the elected leader performs the updates, followers are a no-op.
181237func (s * secretHandler ) sync (_ string , secret * corev1.Secret ) (* corev1.Secret , error ) {
238+ // The leader is responsible for applying the webhook configuration.
239+ // Follower pods are only responsible for serving traffic and can be marked as healthy once the certificates are generated.
240+ if ! leaderFlag .Load () {
241+ s .errChecker .Store (nil )
242+ return nil , nil
243+ }
244+
182245 if secret == nil || secret .Name != caName || secret .Namespace != namespace || len (secret .Data [corev1 .TLSCertKey ]) == 0 {
183246 return nil , nil
184247 }
185248
186- logrus .Info ("Sleeping for 15 seconds then applying webhook config" )
187- // Sleep here to make sure server is listening and all caches are primed
188- time .Sleep (15 * time .Second )
249+ logrus .Info ("Applying webhook config" )
189250
190251 validationClientConfig := v1.WebhookClientConfig {
191252 Service : & v1.ServiceReference {
0 commit comments