Skip to content

Commit b750c33

Browse files
committed
Allow webhook pods to have leader election to govern which pod updates the config
1 parent 2684311 commit b750c33

File tree

1 file changed

+64
-3
lines changed

1 file changed

+64
-3
lines changed

pkg/server/server.go

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,14 @@ import (
55
"context"
66
"crypto/tls"
77
"crypto/x509"
8+
"errors"
89
"fmt"
910
"net/http"
1011
"os"
1112
"path/filepath"
1213
"strconv"
1314
"strings"
15+
"sync/atomic"
1416
"time"
1517

1618
"github.com/gorilla/mux"
@@ -25,7 +27,10 @@ import (
2527
corev1 "k8s.io/api/core/v1"
2628
apierrors "k8s.io/apimachinery/pkg/api/errors"
2729
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/client-go/kubernetes"
2831
"k8s.io/client-go/rest"
32+
"k8s.io/client-go/tools/leaderelection"
33+
"k8s.io/client-go/tools/leaderelection/resourcelock"
2934
)
3035

3136
const (
@@ -47,6 +52,10 @@ const (
4752

4853
var caFile = filepath.Join(os.TempDir(), "k8s-webhook-server", "client-ca", "ca.crt")
4954

55+
// leaderFlag indicates whether this process is the elected leader.
56+
// Gate config mutation work on this to avoid concurrent writers.
57+
var leaderFlag atomic.Bool
58+
5059
// tlsOpt option function applied to all webhook servers.
5160
var tlsOpt = func(config *tls.Config) {
5261
config.MinVersion = tls.VersionTLS12
@@ -85,6 +94,51 @@ func ListenAndServe(ctx context.Context, cfg *rest.Config, mcmEnabled bool) erro
8594
return err
8695
}
8796

97+
k8sClient, err := kubernetes.NewForConfig(cfg)
98+
if err != nil {
99+
return fmt.Errorf("failed to create kubernetes clientset: %w", err)
100+
}
101+
id, err := os.Hostname()
102+
if err != nil || id == "" {
103+
id = fmt.Sprintf("rancher-webhook-%d", time.Now().UnixNano())
104+
}
105+
106+
lock := &resourcelock.LeaseLock{
107+
LeaseMeta: metav1.ObjectMeta{
108+
Name: "rancher-webhook-leader",
109+
Namespace: namespace,
110+
},
111+
Client: k8sClient.CoordinationV1(),
112+
LockConfig: resourcelock.ResourceLockConfig{
113+
Identity: id,
114+
},
115+
}
116+
117+
go leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
118+
Lock: lock,
119+
LeaseDuration: 15 * time.Second,
120+
RenewDeadline: 10 * time.Second,
121+
RetryPeriod: 2 * time.Second,
122+
ReleaseOnCancel: true,
123+
Callbacks: leaderelection.LeaderCallbacks{
124+
OnStartedLeading: func(_ context.Context) {
125+
leaderFlag.Store(true)
126+
logrus.Infof("[%s] elected leader: will manage webhook configurations", id)
127+
},
128+
OnStoppedLeading: func() {
129+
leaderFlag.Store(false)
130+
logrus.Infof("[%s] lost leadership: will stop managing webhook configurations", id)
131+
},
132+
OnNewLeader: func(identity string) {
133+
if identity == id {
134+
logrus.Infof("[%s] I am the new leader", id)
135+
} else {
136+
logrus.Infof("[%s] observed new leader: %s", id, identity)
137+
}
138+
},
139+
},
140+
})
141+
88142
if err = listenAndServe(ctx, clients, validators, mutators); err != nil {
89143
return err
90144
}
@@ -111,6 +165,7 @@ func listenAndServe(ctx context.Context, clients *clients.Clients, validators []
111165
router := mux.NewRouter()
112166
errChecker := health.NewErrorChecker("Config Applied")
113167
health.RegisterHealthCheckers(router, errChecker)
168+
errChecker.Store(errors.New("webhook configuration not yet applied"))
114169
router.Use(certAuth())
115170

116171
logrus.Debug("Creating Webhook routes")
@@ -178,14 +233,20 @@ type secretHandler struct {
178233
}
179234

180235
// sync updates the validating admission configuration whenever the TLS cert changes.
236+
// Only the elected leader performs the updates, followers are a no-op.
181237
func (s *secretHandler) sync(_ string, secret *corev1.Secret) (*corev1.Secret, error) {
238+
// The leader is responsible for applying the webhook configuration.
239+
// Follower pods are only responsible for serving traffic and can be marked as healthy once the certificates are generated.
240+
if !leaderFlag.Load() {
241+
s.errChecker.Store(nil)
242+
return nil, nil
243+
}
244+
182245
if secret == nil || secret.Name != caName || secret.Namespace != namespace || len(secret.Data[corev1.TLSCertKey]) == 0 {
183246
return nil, nil
184247
}
185248

186-
logrus.Info("Sleeping for 15 seconds then applying webhook config")
187-
// Sleep here to make sure server is listening and all caches are primed
188-
time.Sleep(15 * time.Second)
249+
logrus.Info("Applying webhook config")
189250

190251
validationClientConfig := v1.WebhookClientConfig{
191252
Service: &v1.ServiceReference{

0 commit comments

Comments
 (0)