Skip to content

Commit 7399d95

Browse files
committed
[gateway/xds] Backoff on xDS NACKs
1 parent 8b9a057 commit 7399d95

File tree

1 file changed

+67
-3
lines changed

1 file changed

+67
-3
lines changed

pkg/gateway/xds/cache/snapshotcache.go

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
// and is provided here subject to the following:
1111
// Copyright Project Contour Authors
1212
// SPDX-License-Identifier: Apache-2.0
13-
1413
package cache
1514

1615
import (
@@ -20,12 +19,14 @@ import (
2019
"math"
2120
"strconv"
2221
"sync"
22+
"time"
2323

2424
corev3 "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
2525
discoveryv3 "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
2626
cachev3 "github.com/envoyproxy/go-control-plane/pkg/cache/v3"
2727
envoylog "github.com/envoyproxy/go-control-plane/pkg/log"
2828
serverv3 "github.com/envoyproxy/go-control-plane/pkg/server/v3"
29+
"k8s.io/apimachinery/pkg/util/wait"
2930

3031
"github.com/apoxy-dev/apoxy-cli/pkg/gateway/xds/types"
3132
"github.com/apoxy-dev/apoxy-cli/pkg/log"
@@ -52,13 +53,21 @@ type snapshotMap map[string]*cachev3.Snapshot
5253

5354
type nodeInfoMap map[int64]*corev3.Node
5455

56+
type nodeBackoff struct { // Backoff bookkeeping kept for one node after it NACKs an update.
57+
backoff wait.Backoff // Supplies the next sleep duration via Step(); replaced with a fresh value when the gap since lastAttempt exceeds its Cap.
58+
lastAttempt time.Time // Set to time.Now() immediately before each backoff sleep.
59+
}
60+
61+
type backoffMap map[string]*nodeBackoff // Indexed by nodeID in the stream request callbacks.
62+
5563
type snapshotCache struct { // Wraps the go-control-plane snapshot cache with extra bookkeeping.
5664
cachev3.SnapshotCache // Embedded cache; NewSnapshotCache fills it with cachev3.NewSnapshotCache.
5765
snapshotVersion int64
5866
lastSnapshot snapshotMap // map[string]*cachev3.Snapshot; presumably keyed by node ID — TODO confirm.
5967

6068
mu sync.Mutex // NOTE(review): presumably guards the maps below — confirm against the methods that lock it.
6169
streamIDNodeInfo nodeInfoMap // map[int64]*corev3.Node; presumably keyed by stream ID — TODO confirm.
70+
nodeBackoffs backoffMap // NACK backoff state, looked up by nodeID in OnStreamRequest and OnStreamDeltaRequest.
6271
}
6372

6473
// GenerateNewSnapshot takes a table of resources (the output from the IR->xDS
@@ -128,6 +137,7 @@ func NewSnapshotCache(ads bool, logger *slog.Logger) SnapshotCacheWithCallbacks
128137
SnapshotCache: cachev3.NewSnapshotCache(ads, &Hash, l),
129138
lastSnapshot: make(snapshotMap),
130139
streamIDNodeInfo: make(nodeInfoMap),
140+
nodeBackoffs: make(backoffMap),
131141
}
132142
}
133143

@@ -215,9 +225,36 @@ func (s *snapshotCache) OnStreamRequest(streamID int64, req *discoveryv3.Discove
215225

216226
if status := req.ErrorDetail; status != nil {
217227
// If Envoy rejected the last update (NACK), record the details and back off before continuing.
218-
// TODO(youngnick): Handle NACK properly
219228
errorCode = status.Code
220229
errorMessage = status.Message
230+
log.Warnf("NACK received: code %d, message %s", errorCode, errorMessage)
231+
232+
if s.nodeBackoffs[nodeID] == nil {
233+
s.nodeBackoffs[nodeID] = &nodeBackoff{
234+
backoff: wait.Backoff{
235+
Duration: time.Second,
236+
Factor: 2,
237+
Steps: 5,
238+
Cap: 30 * time.Second,
239+
},
240+
}
241+
}
242+
243+
// Check if enough time has passed since last backoff and reset the backoff if so.
244+
if time.Since(s.nodeBackoffs[nodeID].lastAttempt) > s.nodeBackoffs[nodeID].backoff.Cap {
245+
s.nodeBackoffs[nodeID].backoff = wait.Backoff{
246+
Duration: time.Second,
247+
Factor: 2,
248+
Steps: 5,
249+
Cap: 30 * time.Second,
250+
}
251+
}
252+
253+
// Back off for a bit before retrying.
254+
s.nodeBackoffs[nodeID].lastAttempt = time.Now()
255+
delay := s.nodeBackoffs[nodeID].backoff.Step()
256+
log.Warnf("Backing off for retry after NACK for node %s", nodeID)
257+
time.Sleep(delay)
221258
}
222259

223260
log.Debugf("handling v3 xDS resource request, version_info %s, response_nonce %s, nodeID %s, node_version %s, resource_names %v, type_url %s, errorCode %d, errorMessage %s",
@@ -312,9 +349,36 @@ func (s *snapshotCache) OnStreamDeltaRequest(streamID int64, req *discoveryv3.De
312349
req.ResponseNonce, nodeID, nodeVersion)
313350
if status := req.ErrorDetail; status != nil {
314351
// If Envoy rejected the last update (NACK), record the details and back off before continuing.
315-
// TODO(youngnick): Handle NACK properly
316352
errorCode = status.Code
317353
errorMessage = status.Message
354+
log.Warnf("NACK received: code %d, message %s", errorCode, errorMessage)
355+
356+
if s.nodeBackoffs[nodeID] == nil {
357+
s.nodeBackoffs[nodeID] = &nodeBackoff{
358+
backoff: wait.Backoff{
359+
Duration: time.Second,
360+
Factor: 2,
361+
Steps: 5,
362+
Cap: 30 * time.Second,
363+
},
364+
}
365+
}
366+
367+
// Check if enough time has passed since last backoff and reset the backoff if so.
368+
if time.Since(s.nodeBackoffs[nodeID].lastAttempt) > s.nodeBackoffs[nodeID].backoff.Cap {
369+
s.nodeBackoffs[nodeID].backoff = wait.Backoff{
370+
Duration: time.Second,
371+
Factor: 2,
372+
Steps: 5,
373+
Cap: 30 * time.Second,
374+
}
375+
}
376+
377+
// Back off for a bit before retrying.
378+
s.nodeBackoffs[nodeID].lastAttempt = time.Now()
379+
delay := s.nodeBackoffs[nodeID].backoff.Step()
380+
log.Warnf("Backing off for retry after NACK for node %s", nodeID)
381+
time.Sleep(delay)
318382
}
319383
log.Debugf("handling v3 xDS resource request, response_nonce %s, nodeID %s, node_version %s, resource_names_subscribe %v, resource_names_unsubscribe %v, type_url %s, errorCode %d, errorMessage %s",
320384
req.ResponseNonce,

0 commit comments

Comments
 (0)