This repository was archived by the owner on Jan 30, 2020. It is now read-only.

Commit 2b2eda2
Author: Dongsu Park
2 parents e98df4c + f382b13

Merge pull request #1698 from endocode/dongsu/engine-fix-replace-reschedule

agent,engine: fix bugs in rescheduling for replaced units

9 files changed: +254 -102 lines

agent/reconcile_test.go (+15 -15)

@@ -275,48 +275,48 @@ func TestAbleToRun(t *testing.T) {
 	tests := []struct {
 		dState *AgentState
 		job    *job.Job
-		want   bool
+		want   job.JobAction
 	}{
 		// nothing to worry about
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123"}),
 			job:    &job.Job{Name: "easy-street.service", Unit: unit.UnitFile{}},
-			want:   true,
+			want:   job.JobActionSchedule,
 		},

 		// match MachineID
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "XYZ"}),
 			job:    newTestJobWithXFleetValues(t, "MachineID=XYZ"),
-			want:   true,
+			want:   job.JobActionSchedule,
 		},

 		// mismatch MachineID
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123"}),
 			job:    newTestJobWithXFleetValues(t, "MachineID=XYZ"),
-			want:   false,
+			want:   job.JobActionUnschedule,
 		},

 		// match MachineMetadata
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123", Metadata: map[string]string{"region": "us-west"}}),
 			job:    newTestJobWithXFleetValues(t, "MachineMetadata=region=us-west"),
-			want:   true,
+			want:   job.JobActionSchedule,
 		},

 		// Machine metadata ignored when no MachineMetadata in Job
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123", Metadata: map[string]string{"region": "us-west"}}),
 			job:    &job.Job{Name: "easy-street.service", Unit: unit.UnitFile{}},
-			want:   true,
+			want:   job.JobActionSchedule,
 		},

 		// mismatch MachineMetadata
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123", Metadata: map[string]string{"region": "us-west"}}),
 			job:    newTestJobWithXFleetValues(t, "MachineMetadata=region=us-east"),
-			want:   false,
+			want:   job.JobActionUnschedule,
 		},

 		// peer scheduled locally
@@ -328,7 +328,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "MachineOf=pong.service"),
-			want: true,
+			want: job.JobActionSchedule,
 		},

 		// multiple peers scheduled locally
@@ -341,14 +341,14 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "MachineOf=pong.service\nMachineOf=ping.service"),
-			want: true,
+			want: job.JobActionSchedule,
 		},

 		// peer not scheduled locally
 		{
 			dState: NewAgentState(&machine.MachineState{ID: "123"}),
 			job:    newTestJobWithXFleetValues(t, "MachineOf=ping.service"),
-			want:   false,
+			want:   job.JobActionUnschedule,
 		},

 		// one of multiple peers not scheduled locally
@@ -360,7 +360,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "MachineOf=pong.service\nMachineOf=ping.service"),
-			want: false,
+			want: job.JobActionUnschedule,
 		},

 		// no conflicts found
@@ -372,7 +372,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "Conflicts=pong.service"),
-			want: true,
+			want: job.JobActionSchedule,
 		},

 		// conflicts found
@@ -384,7 +384,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "Conflicts=ping.service"),
-			want: false,
+			want: job.JobActionUnschedule,
 		},

 		// no replaces found
@@ -396,7 +396,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "Replaces=pong.service"),
-			want: true,
+			want: job.JobActionSchedule,
 		},

 		// replaces found
@@ -408,7 +408,7 @@ func TestAbleToRun(t *testing.T) {
 				},
 			},
 			job:  newTestJobWithXFleetValues(t, "Replaces=ping.service"),
-			want: false,
+			want: job.JobActionReschedule,
 		},
 	}
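The table above now expects a three-valued job.JobAction instead of a bool. The JobAction type itself is defined in the job package, in one of the changed files not shown in this excerpt; the following is a minimal sketch consistent with the identifiers used in the tests. The underlying representation and string values are assumptions, not taken from this commit:

	// Sketch only: names match this commit's diffs; values are assumed.
	package job

	type JobAction string

	const (
		JobActionSchedule   JobAction = "schedule"   // unit may run on the agent
		JobActionUnschedule JobAction = "unschedule" // unit must not run on the agent
		JobActionReschedule JobAction = "reschedule" // unit replaces another and triggers a move
	)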

agent/state.go (+17 -10)

@@ -124,36 +124,43 @@ func globMatches(pattern, target string) bool {
 // - Agent must have all required Peers of the Job scheduled locally (if any)
 // - Job must not conflict with any other Units scheduled to the agent
 // - Job must specially handle replaced units to be rescheduled
-func (as *AgentState) AbleToRun(j *job.Job) (bool, string) {
+func (as *AgentState) AbleToRun(j *job.Job) (jobAction job.JobAction, errstr string) {
 	if tgt, ok := j.RequiredTarget(); ok && !as.MState.MatchID(tgt) {
-		return false, fmt.Sprintf("agent ID %q does not match required %q", as.MState.ID, tgt)
+		return job.JobActionUnschedule, fmt.Sprintf("agent ID %q does not match required %q", as.MState.ID, tgt)
 	}

 	metadata := j.RequiredTargetMetadata()
 	if len(metadata) != 0 {
 		if !machine.HasMetadata(as.MState, metadata) {
-			return false, "local Machine metadata insufficient"
+			return job.JobActionUnschedule, "local Machine metadata insufficient"
 		}
 	}

 	peers := j.Peers()
 	if len(peers) != 0 {
 		for _, peer := range peers {
 			if !as.unitScheduled(peer) {
-				return false, fmt.Sprintf("required peer Unit(%s) is not scheduled locally", peer)
+				return job.JobActionUnschedule, fmt.Sprintf("required peer Unit(%s) is not scheduled locally", peer)
 			}
 		}
 	}

 	if cExists, cJobName := as.HasConflict(j.Name, j.Conflicts()); cExists {
-		return false, fmt.Sprintf("found conflict with locally-scheduled Unit(%s)", cJobName)
+		return job.JobActionUnschedule, fmt.Sprintf("found conflict with locally-scheduled Unit(%s)", cJobName)
 	}

-	// Handle Replace option specially, by returning a special string
-	// "jobreschedule" as reason.
-	if cExists, _ := as.hasReplace(j.Name, j.Replaces()); cExists {
-		return false, job.JobReschedule
+	// Handle the Replaces option specially, so that the replaced unit
+	// gets rescheduled instead of being silently unscheduled.
+	if cExists, cJobName := as.hasReplace(j.Name, j.Replaces()); cExists {
+		return job.JobActionReschedule, fmt.Sprintf("found replace with locally-scheduled Unit(%s)", cJobName)
 	}

-	return true, ""
+	return job.JobActionSchedule, ""
+}
+
+// GetReplacedUnit returns the name of the locally-scheduled unit that
+// Job j replaces, or an error if there is none.
+func (as *AgentState) GetReplacedUnit(j *job.Job) (string, error) {
+	cExists, replaced := as.hasReplace(j.Name, j.Replaces())
+	if !cExists {
+		return "", fmt.Errorf("cannot find units to be replaced for Unit(%s)", j.Name)
+	}
+	return replaced, nil
 }
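With the new signature, callers of AbleToRun branch three ways instead of two. A hedged sketch of a caller, assuming fleet's log package is in scope; the dispatch helper is hypothetical and not part of this commit:

	func dispatch(as *AgentState, j *job.Job) {
		act, reason := as.AbleToRun(j)
		switch act {
		case job.JobActionSchedule:
			// the unit may be scheduled to this agent
		case job.JobActionReschedule:
			// a locally-scheduled unit is being replaced; look it up so
			// the engine can move it to another machine
			if replaced, err := as.GetReplacedUnit(j); err == nil {
				log.Debugf("Unit(%s) to be rescheduled: %s", replaced, reason)
			}
		case job.JobActionUnschedule:
			log.Debugf("Unit(%s) cannot run here: %s", j.Name, reason)
		}
	}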

engine/reconciler.go (+88 -33)

@@ -84,56 +84,111 @@ func (r *Reconciler) calculateClusterTasks(clust *clusterState, stopchan chan st
 		return true
 	}

-	go func() {
-		defer close(taskchan)
+	decide := func(j *job.Job) (jobAction job.JobAction, reason string) {
+		if j.TargetState == job.JobStateInactive {
+			return job.JobActionUnschedule, "target state inactive"
+		}

 		agents := clust.agents()

+		as, ok := agents[j.TargetMachineID]
+		if !ok {
+			metrics.ReportEngineReconcileFailure(metrics.MachineAway)
+			return job.JobActionUnschedule, fmt.Sprintf("target Machine(%s) went away", j.TargetMachineID)
+		}
+
+		if act, ableReason := as.AbleToRun(j); act != job.JobActionSchedule {
+			metrics.ReportEngineReconcileFailure(metrics.RunFailure)
+			return act, fmt.Sprintf("target Machine(%s) unable to run unit: %v",
+				j.TargetMachineID, ableReason)
+		}
+
+		return job.JobActionSchedule, ""
+	}
+
+	handle_reschedule := func(j *job.Job, reason string) bool {
+		isRescheduled := false
+
+		agents := clust.agents()
+
+		as, ok := agents[j.TargetMachineID]
+		if !ok {
+			metrics.ReportEngineReconcileFailure(metrics.MachineAway)
+			return false
+		}
+
+		for _, cj := range clust.jobs {
+			if !cj.Scheduled() {
+				continue
+			}
+			if j.Name != cj.Name {
+				continue
+			}
+
+			replacedUnit, err := as.GetReplacedUnit(j)
+			if err != nil {
+				log.Debugf("No unit to reschedule: %v", err)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
+				continue
+			}
+
+			if !send(taskTypeUnscheduleUnit, reason, replacedUnit, j.TargetMachineID) {
+				log.Infof("Job(%s) unschedule send failed", replacedUnit)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
+				continue
+			}
+
+			dec, err := r.sched.DecideReschedule(clust, j)
+			if err != nil {
+				log.Debugf("Unable to schedule Job(%s): %v", j.Name, err)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
+				continue
+			}
+
+			if !send(taskTypeAttemptScheduleUnit, reason, replacedUnit, dec.machineID) {
+				log.Infof("Job(%s) attemptschedule send failed", replacedUnit)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
+				continue
+			}
+			clust.schedule(replacedUnit, dec.machineID)
+			log.Debugf("rescheduling unit %s to machine %s", replacedUnit, dec.machineID)
+
+			clust.schedule(j.Name, j.TargetMachineID)
+			log.Debugf("scheduling unit %s to machine %s", j.Name, j.TargetMachineID)
+
+			isRescheduled = true
+		}
+
+		return isRescheduled
+	}
+
+	go func() {
+		defer close(taskchan)
+
 		for _, j := range clust.jobs {
 			if !j.Scheduled() {
 				continue
 			}

-			decide := func() (unschedule bool, reason string) {
-				if j.TargetState == job.JobStateInactive {
-					unschedule = true
-					reason = "target state inactive"
-					return
-				}
-
-				as, ok := agents[j.TargetMachineID]
-				if !ok {
-					unschedule = true
-					reason = fmt.Sprintf("target Machine(%s) went away", j.TargetMachineID)
-					metrics.ReportEngineReconcileFailure(metrics.MachineAway)
-					return
-				}
-
-				var able bool
-				var ableReason string
-				if able, ableReason = as.AbleToRun(j); !able {
-					unschedule = true
-					if ableReason == job.JobReschedule {
-						reason = ableReason
-					} else {
-						reason = fmt.Sprintf("target Machine(%s) unable to run unit", j.TargetMachineID)
-						metrics.ReportEngineReconcileFailure(metrics.RunFailure)
-					}
-					return
-				}
-
-				return
+			act, reason := decide(j)
+			if act == job.JobActionReschedule && handle_reschedule(j, reason) {
+				log.Debugf("Job(%s) is rescheduled: %v", j.Name, reason)
+				continue
 			}

-			unschedule, reason := decide()
-			if !unschedule {
+			if act != job.JobActionUnschedule {
+				log.Debugf("Job(%s) is not to be unscheduled, reason: %v", j.Name, reason)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
 				continue
 			}

 			if !send(taskTypeUnscheduleUnit, reason, j.Name, j.TargetMachineID) {
+				log.Infof("Job(%s) send failed.", j.Name)
+				metrics.ReportEngineReconcileFailure(metrics.ScheduleFailure)
 				return
 			}

+			log.Debugf("Job(%s) unscheduling.", j.Name)
 			clust.unschedule(j.Name)
 		}
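Taken together, the reconciler's new flow is: decide classifies each scheduled job as schedule, unschedule, or reschedule. For a reschedule, handle_reschedule emits taskTypeUnscheduleUnit for the replaced unit on the job's current target machine, asks the scheduler's DecideReschedule for a different machine, emits taskTypeAttemptScheduleUnit for the replaced unit there, and records both placements in the cluster state. If any of those steps fails, the job is simply skipped for this reconciliation pass; only jobs classified as unschedule reach the plain unschedule task at the bottom of the loop.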

engine/scheduler.go (+38 -1)

@@ -28,6 +28,7 @@ type decision struct {

 type Scheduler interface {
 	Decide(*clusterState, *job.Job) (*decision, error)
+	DecideReschedule(*clusterState, *job.Job) (*decision, error)
 }

 type leastLoadedScheduler struct{}
@@ -41,7 +42,7 @@ func (lls *leastLoadedScheduler) Decide(clust *clusterState, j *job.Job) (*decis

 	var target *agent.AgentState
 	for _, as := range agents {
-		if able, _ := as.AbleToRun(j); !able {
+		if act, _ := as.AbleToRun(j); act == job.JobActionUnschedule {
 			continue
 		}

@@ -61,6 +62,42 @@ func (lls *leastLoadedScheduler) Decide(clust *clusterState, j *job.Job) (*decis
 	return &dec, nil
 }

+// DecideReschedule decides scheduling in a much simpler way than Decide():
+// it just picks another free machine, excluding the job's current target
+// machine. It does not need to call as.AbleToRun(), because the job's
+// action has already been decided before this function is reached.
+func (lls *leastLoadedScheduler) DecideReschedule(clust *clusterState, j *job.Job) (*decision, error) {
+	agents := lls.sortedAgents(clust)
+
+	if len(agents) == 0 {
+		return nil, fmt.Errorf("zero agents available")
+	}
+
+	found := false
+	var target *agent.AgentState
+	for _, as := range agents {
+		if as.MState.ID == j.TargetMachineID {
+			continue
+		}
+
+		as := as
+		target = as
+		found = true
+		break
+	}
+
+	if !found {
+		return nil, fmt.Errorf("no agents able to run job")
+	}
+
+	dec := decision{
+		machineID: target.MState.ID,
+	}
+
+	return &dec, nil
+}
+
 // sortedAgents returns a list of AgentState objects sorted ascending
 // by the number of scheduled units
 func (lls *leastLoadedScheduler) sortedAgents(clust *clusterState) []*agent.AgentState {
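Because DecideReschedule is added to the Scheduler interface itself, every implementation, including test fakes, must now provide it. A minimal hypothetical stub (not part of this commit) that pins every decision to one fixed machine:

	type stubScheduler struct{ machID string }

	func (s *stubScheduler) Decide(clust *clusterState, j *job.Job) (*decision, error) {
		return &decision{machineID: s.machID}, nil
	}

	// Refuse to "reschedule" onto the job's current target machine,
	// mirroring the skip in leastLoadedScheduler.DecideReschedule above.
	func (s *stubScheduler) DecideReschedule(clust *clusterState, j *job.Job) (*decision, error) {
		if s.machID == j.TargetMachineID {
			return nil, fmt.Errorf("no machine other than current target %s", j.TargetMachineID)
		}
		return &decision{machineID: s.machID}, nil
	}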
New test unit file (+8 -0)

@@ -0,0 +1,8 @@
+[Unit]
+Description=Test Unit
+
+[Service]
+ExecStart=/bin/bash -c "while true; do echo Hello, World!; sleep 1; done"
+
+[X-Fleet]
+Replaces=replace.0.service
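This fixture exercises the Replaces= option end to end: when a unit carrying Replaces=replace.0.service is scheduled, the agent reports job.JobActionReschedule for the machine currently running replace.0.service, and the engine then unschedules replace.0.service there and attempts to place it on another machine via DecideReschedule.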
