Skip to content

Commit

Permalink
clustermesh: Print errors in interval while waiting
Browse files Browse the repository at this point in the history
This assists in troubleshooting as we wait for a condition to become
valid.

Signed-off-by: Thomas Graf <[email protected]>
  • Loading branch information
tgraf committed Mar 3, 2021
1 parent 475af86 commit edf7995
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 46 deletions.
103 changes: 58 additions & 45 deletions clustermesh/clustermesh.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cilium/cilium-cli/defaults"
"github.com/cilium/cilium-cli/internal/certs"
"github.com/cilium/cilium-cli/internal/k8s"
"github.com/cilium/cilium-cli/internal/utils"
"github.com/cilium/cilium-cli/status"

"github.com/cilium/cilium/api/v1/models"
Expand All @@ -52,8 +53,6 @@ var (
deploymentMaxSurge = intstr.FromInt(1)
deploymentMaxUnavailable = intstr.FromInt(1)
secretDefaultMode = int32(420)

retryInterval = 2 * time.Second
)

var clusterRole = &rbacv1.ClusterRole{
Expand Down Expand Up @@ -884,35 +883,41 @@ type Status struct {
Connectivity *ConnectivityStatus
}

func (k *K8sClusterMesh) statusAccessInformation(ctx context.Context) (*accessInformation, error) {
retry:
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
func (k *K8sClusterMesh) statusAccessInformation(ctx context.Context, log bool) (*accessInformation, error) {
w := utils.NewWaitObserver(ctx, utils.WaitParameters{Log: func(err error, wait string) {
if log {
k.Log("⌛ Waiting (%s) for access information: %s", wait, err)
}
}})
defer w.Cancel()

retry:
ai, err := k.extractAccessInformation(ctx, k.client, []string{}, false)
if err != nil && k.params.Wait {
time.Sleep(retryInterval)
if err := w.Retry(err); err != nil {
return nil, err
}
goto retry
}

return ai, err
}

func (k *K8sClusterMesh) statusService(ctx context.Context) (*corev1.Service, error) {
retry:
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
func (k *K8sClusterMesh) statusService(ctx context.Context, log bool) (*corev1.Service, error) {
w := utils.NewWaitObserver(ctx, utils.WaitParameters{Log: func(err error, wait string) {
if log {
k.Log("⌛ Waiting (%s) for ClusterMesh service to be available: %s", wait, err)
}
}})
defer w.Cancel()

retry:
svc, err := k.client.GetService(ctx, k.params.Namespace, defaults.ClusterMeshServiceName, metav1.GetOptions{})
if err != nil {
if k.params.Wait {
time.Sleep(retryInterval)
if err := w.Retry(err); err != nil {
return nil, err
}
goto retry
}

Expand Down Expand Up @@ -1001,38 +1006,53 @@ func (c *ConnectivityStatus) parseAgentStatus(name string, s *status.ClusterMesh
c.Connected.Avg += float64(ready)
}

func (k *K8sClusterMesh) statusConnectivity(ctx context.Context) (*ConnectivityStatus, error) {
func (k *K8sClusterMesh) statusConnectivity(ctx context.Context, log bool) (*ConnectivityStatus, error) {
w := utils.NewWaitObserver(ctx, utils.WaitParameters{Log: func(err error, wait string) {
if log {
k.Log("⌛ Waiting (%s) for clusters to be connected: %s", wait, err)
}
}})
defer w.Cancel()

retry:
status, err := k.determineStatusConnectivity(ctx)
if k.params.Wait {
if err == nil {
if status.NotReady > 0 {
err = fmt.Errorf("%d clusters not ready", status.NotReady)
}
if len(status.Errors) > 0 {
err = fmt.Errorf("%d clusters have errors", len(status.Errors))
}
}

if err != nil {
if err := w.Retry(err); err != nil {
return nil, err
}
goto retry
}
}

return status, err
}

func (k *K8sClusterMesh) determineStatusConnectivity(ctx context.Context) (*ConnectivityStatus, error) {
status := &ConnectivityStatus{
GlobalServices: StatisticalStatus{Min: -1},
Connected: StatisticalStatus{Min: -1},
Errors: status.ErrorCountMapMap{},
Clusters: map[string]*ClusterStats{},
}
retry:
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}

pods, err := k.client.ListPods(ctx, k.params.Namespace, metav1.ListOptions{LabelSelector: "k8s-app=cilium"})
if err != nil {
if k.params.Wait {
time.Sleep(retryInterval)
goto retry
}

return nil, fmt.Errorf("unable to list cilium pods: %w", err)
}

for _, pod := range pods.Items {
s, err := k.statusCollector.ClusterMeshConnectivity(ctx, pod.Name)
if err != nil {
if k.params.Wait {
time.Sleep(retryInterval)
goto retry
}

return nil, fmt.Errorf("unable to determine status of cilium pod %q: %w", pod.Name, err)
}

Expand All @@ -1042,13 +1062,6 @@ retry:
status.GlobalServices.Avg /= float64(len(pods.Items))
status.Connected.Avg /= float64(len(pods.Items))

if k.params.Wait {
if status.NotReady > 0 || len(status.Errors) > 0 {
time.Sleep(retryInterval)
goto retry
}
}

return status, nil
}

Expand All @@ -1070,7 +1083,7 @@ func (k *K8sClusterMesh) Status(ctx context.Context, log bool) (*Status, error)
ctx, cancel := context.WithTimeout(ctx, k.params.waitTimeout())
defer cancel()

s.AccessInformation, err = k.statusAccessInformation(ctx)
s.AccessInformation, err = k.statusAccessInformation(ctx, log)
if err != nil {
return nil, err
}
Expand All @@ -1082,7 +1095,7 @@ func (k *K8sClusterMesh) Status(ctx context.Context, log bool) (*Status, error)
}
}

s.Service, err = k.statusService(ctx)
s.Service, err = k.statusService(ctx, log)
if err != nil {
return nil, err
}
Expand All @@ -1100,7 +1113,7 @@ func (k *K8sClusterMesh) Status(ctx context.Context, log bool) (*Status, error)
}
}

s.Connectivity, err = k.statusConnectivity(ctx)
s.Connectivity, err = k.statusConnectivity(ctx, log)

if log && s.Connectivity != nil {
if s.Connectivity.NotReady > 0 {
Expand Down
5 changes: 5 additions & 0 deletions defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

package defaults

import "time"

const (
AgentServiceAccountName = "cilium"
AgentClusterRoleName = "cilium"
Expand Down Expand Up @@ -77,6 +79,9 @@ const (
Version = "v1.9.4"

TunnelType = "vxlan"

WaitRetryInterval = 2 * time.Second
WaitWarningInterval = 10 * time.Second
)

var OperatorLabels = map[string]string{
Expand Down
80 changes: 79 additions & 1 deletion internal/utils/utils.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright 2020 Authors of Cilium
// Copyright 2020-2021 Authors of Cilium
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,14 @@

package utils

import (
"context"
"fmt"
"time"

"github.com/cilium/cilium-cli/defaults"
)

func BuildImagePath(userImage, defaultImage, userVersion, defaultVersion string) string {
if userImage == "" {
userImage = defaultImage
Expand All @@ -25,3 +33,73 @@ func BuildImagePath(userImage, defaultImage, userVersion, defaultVersion string)

return userImage + ":" + userVersion
}

type LogFunc func(err error, waitTime string)

type WaitParameters struct {
RetryInterval time.Duration
WarningInterval time.Duration
Timeout time.Duration
Log LogFunc
}

func (w WaitParameters) retryInterval() time.Duration {
if w.RetryInterval != time.Duration(0) {
return w.RetryInterval
}

return defaults.WaitRetryInterval
}

func (w WaitParameters) warningInterval() time.Duration {
if w.WarningInterval != time.Duration(0) {
return w.WarningInterval
}

return defaults.WaitWarningInterval
}

type WaitObserver struct {
ctx context.Context
params WaitParameters
lastWarning time.Time
waitStarted time.Time
cancel context.CancelFunc
}

func NewWaitObserver(ctx context.Context, p WaitParameters) *WaitObserver {
w := &WaitObserver{
ctx: ctx,
params: p,
lastWarning: time.Now(),
waitStarted: time.Now(),
}

if p.Timeout != time.Duration(0) {
w.ctx, w.cancel = context.WithTimeout(ctx, p.Timeout)
}

return w
}

func (w *WaitObserver) Cancel() {
if w.cancel != nil {
w.cancel()
}
}

func (w *WaitObserver) Retry(err error) error {
if w.params.Log != nil && time.Since(w.lastWarning) > w.params.warningInterval() {
waitString := time.Since(w.waitStarted).Truncate(time.Second).String()
w.params.Log(err, waitString)
w.lastWarning = time.Now()
}

select {
case <-w.ctx.Done():
return fmt.Errorf("timeout while waiting for condition, last error: %s", err)
case <-time.After(w.params.retryInterval()):
}

return nil
}

0 comments on commit edf7995

Please sign in to comment.