@@ -7,10 +7,13 @@ import (
77 "regexp"
88 "strconv"
99 "strings"
10+ "time"
1011
1112 csi "github.com/container-storage-interface/spec/lib/go/csi"
1213 fsnotify "github.com/fsnotify/fsnotify"
14+ v1 "k8s.io/api/core/v1"
1315 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
16+ "k8s.io/apimachinery/pkg/util/wait"
1417 "k8s.io/client-go/kubernetes"
1518 "k8s.io/client-go/rest"
1619 "k8s.io/klog/v2"
@@ -242,18 +245,15 @@ func ValidateDataCacheConfig(dataCacheMode string, dataCacheSize string, ctx con
242245
243246func GetDataCacheCountFromNodeLabel (ctx context.Context , nodeName string ) (int , error ) {
244247 cfg , err := rest .InClusterConfig ()
245- // We want to capture API errors with node label fetching, so return -1
246- // in those cases instead of 0.
247248 if err != nil {
248249 return 0 , err
249250 }
250251 kubeClient , err := kubernetes .NewForConfig (cfg )
251252 if err != nil {
252253 return 0 , err
253254 }
254- node , err := kubeClient . CoreV1 (). Nodes (). Get ( ctx , nodeName , metav1. GetOptions {} )
255+ node , err := getNodeWithRetry ( ctx , kubeClient , nodeName )
255256 if err != nil {
256- // We could retry, but this error will also crashloop the driver which may be as good a way to retry as any.
257257 return 0 , err
258258 }
259259 if val , found := node .GetLabels ()[fmt .Sprintf (common .NodeLabelPrefix , common .DataCacheLssdCountLabel )]; found {
@@ -264,10 +264,33 @@ func GetDataCacheCountFromNodeLabel(ctx context.Context, nodeName string) (int,
264264 klog .V (4 ).Infof ("Number of local SSDs requested for Data Cache: %v" , dataCacheCount )
265265 return dataCacheCount , nil
266266 }
267- // This will be returned for a non-Data-Cache node pool
268267 return 0 , nil
269268}
270269
270+ func getNodeWithRetry (ctx context.Context , kubeClient * kubernetes.Clientset , nodeName string ) (* v1.Node , error ) {
271+ var nodeObj * v1.Node
272+ backoff := wait.Backoff {
273+ Duration : 1 * time .Second ,
274+ Factor : 2.0 ,
275+ Steps : 5 ,
276+ }
277+ err := wait .ExponentialBackoffWithContext (ctx , backoff , func () (bool , error ) {
278+ node , err := kubeClient .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
279+ if err != nil {
280+ klog .Warningf ("Error getting node %s: %v, retrying...\n " , nodeName , err )
281+ return false , nil
282+ }
283+ nodeObj = node
284+ klog .V (4 ).Infof ("Successfully retrieved node info %s\n " , nodeName )
285+ return true , nil
286+ })
287+
288+ if err != nil {
289+ klog .Errorf ("Failed to get node %s after retries: %v\n " , nodeName , err )
290+ }
291+ return nodeObj , err
292+ }
293+
271294func FetchRaidedLssdCountForDatacache () (int , error ) {
272295 raidedPath , err := fetchRAIDedLocalSsdPath ()
273296 if err != nil {
0 commit comments