Skip to content

Commit 48fdd09

Browse files
committed
DRA kubelet: refactor gRPC call timeouts
Some of the E2E node tests were flaky. Their timeout apparently was chosen under the assumption that kubelet would retry immediately after a failed gRPC call, with a factor of 2 as safety margin. But according to kubernetes@0449cef, kubelet has a different, higher retry period of 90 seconds, which was exactly the test timeout. The test timeout has to be higher than that. As the tests don't use the gRPC call timeout anymore, it can be made private. While at it, the name and documentation gets updated.
1 parent 9cc11ab commit 48fdd09

File tree

5 files changed

+66
-65
lines changed

5 files changed

+66
-65
lines changed

pkg/kubelet/cm/dra/plugin/client.go

+3-5
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ import (
3333
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
3434
)
3535

36-
const PluginClientTimeout = 45 * time.Second
37-
3836
// NewDRAPluginClient returns a wrapper around those gRPC methods of a DRA
3937
// driver kubelet plugin which need to be called by kubelet. The wrapper
4038
// handles gRPC connection management and logging. Connections are reused
@@ -60,7 +58,7 @@ type Plugin struct {
6058
conn *grpc.ClientConn
6159
endpoint string
6260
highestSupportedVersion *utilversion.Version
63-
clientTimeout time.Duration
61+
clientCallTimeout time.Duration
6462
}
6563

6664
func (p *Plugin) getOrCreateGRPCConn() (*grpc.ClientConn, error) {
@@ -116,7 +114,7 @@ func (p *Plugin) NodePrepareResources(
116114
return nil, err
117115
}
118116

119-
ctx, cancel := context.WithTimeout(ctx, p.clientTimeout)
117+
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
120118
defer cancel()
121119

122120
nodeClient := drapb.NewNodeClient(conn)
@@ -138,7 +136,7 @@ func (p *Plugin) NodeUnprepareResources(
138136
return nil, err
139137
}
140138

141-
ctx, cancel := context.WithTimeout(ctx, p.clientTimeout)
139+
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
142140
defer cancel()
143141

144142
nodeClient := drapb.NewNodeClient(conn)

pkg/kubelet/cm/dra/plugin/client_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -243,9 +243,9 @@ func TestNodeUnprepareResources(t *testing.T) {
243243
defer teardown()
244244

245245
p := &Plugin{
246-
backgroundCtx: ctx,
247-
endpoint: addr,
248-
clientTimeout: PluginClientTimeout,
246+
backgroundCtx: ctx,
247+
endpoint: addr,
248+
clientCallTimeout: PluginClientTimeout,
249249
}
250250

251251
conn, err := p.getOrCreateGRPCConn()

pkg/kubelet/cm/dra/plugin/plugin.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ import (
3535
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
3636
)
3737

38+
// defaultClientCallTimeout is the default amount of time that a DRA driver has
39+
// to respond to any of the gRPC calls. kubelet uses this value by passing nil
40+
// to RegisterPlugin. Some tests use a different, usually shorter timeout to
41+
// speed up testing.
42+
//
43+
// This is half of the kubelet retry period (according to
44+
// https://github.com/kubernetes/kubernetes/commit/0449cef8fd5217d394c5cd331d852bd50983e6b3).
45+
const defaultClientCallTimeout = 45 * time.Second
46+
3847
// RegistrationHandler is the handler which is fed to the pluginwatcher API.
3948
type RegistrationHandler struct {
4049
// backgroundCtx is used for all future activities of the handler.
@@ -144,7 +153,7 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
144153

145154
var timeout time.Duration
146155
if pluginClientTimeout == nil {
147-
timeout = PluginClientTimeout
156+
timeout = defaultClientCallTimeout
148157
} else {
149158
timeout = *pluginClientTimeout
150159
}
@@ -157,7 +166,7 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
157166
conn: nil,
158167
endpoint: endpoint,
159168
highestSupportedVersion: highestSupportedVersion,
160-
clientTimeout: timeout,
169+
clientCallTimeout: timeout,
161170
}
162171

163172
// Storing endpoint of newly registered DRA Plugin into the map, where plugin name will be the key

staging/src/k8s.io/dynamic-resource-allocation/structured/allocator_test.go

-20
Original file line numberDiff line numberDiff line change
@@ -133,26 +133,6 @@ func classWithConfig(name, driver, attribute string) *resourceapi.DeviceClass {
133133
return class
134134
}
135135

136-
// generate a DeviceClass object with the given name and the node selector
137-
// that selects nodes with the region label set to either "west" or "east".
138-
func classWithSuitableNodes(name, driver string) *resourceapi.DeviceClass {
139-
class := class(name, driver)
140-
class.Spec.SuitableNodes = &v1.NodeSelector{
141-
NodeSelectorTerms: []v1.NodeSelectorTerm{
142-
{
143-
MatchExpressions: []v1.NodeSelectorRequirement{
144-
{
145-
Key: regionKey,
146-
Operator: v1.NodeSelectorOpIn,
147-
Values: []string{region1, region2},
148-
},
149-
},
150-
},
151-
},
152-
}
153-
return class
154-
}
155-
156136
// generate a ResourceClaim object with the given name and device requests.
157137
func claimWithRequests(name string, constraints []resourceapi.DeviceConstraint, requests ...resourceapi.DeviceRequest) *resourceapi.ResourceClaim {
158138
return &resourceapi.ResourceClaim{

0 commit comments

Comments
 (0)