Skip to content

Commit 7570776

Browse files
yesnault authored and sguiheux committed
fix (hatchery:swarm): clean networks & check ratio (#1277)
1 parent 69634bd commit 7570776

File tree

3 files changed

+50
-34
lines changed

3 files changed

+50
-34
lines changed

engine/hatchery/swarm/swarm.go

+30-18
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ var containersCache = struct {
121121
func (h *HatcherySwarm) getContainers() ([]docker.APIContainers, error) {
122122
t := time.Now()
123123

124-
defer log.Debug("getContainers() : %d s", time.Since(t).Seconds())
124+
defer log.Debug("getContainers() : %f s", time.Since(t).Seconds())
125125

126126
containersCache.mu.RLock()
127127
nbServers := len(containersCache.list)
@@ -137,6 +137,11 @@ func (h *HatcherySwarm) getContainers() ([]docker.APIContainers, error) {
137137
containersCache.mu.Lock()
138138
containersCache.list = s
139139
containersCache.mu.Unlock()
140+
141+
log.Debug("getContainers> %d containers on this host", len(s))
142+
for _, v := range s {
143+
log.Debug("getContainers> container ID:%s names:%+v image:%s created:%d state:%s, status:%s", v.ID, v.Names, v.Image, v.Created, v.State, v.Status)
144+
}
140145
//Remove data from the cache after 2 seconds
141146
go func() {
142147
time.Sleep(2 * time.Second)
@@ -193,18 +198,19 @@ func (h *HatcherySwarm) killAndRemove(ID string) {
193198
return
194199
}
195200

196-
network, err := h.dockerClient.NetworkInfo(container.NetworkSettings.NetworkID)
197-
if err != nil {
198-
log.Info("killAndRemove> cannot NetworkInfo: %v", err)
199-
h.killAndRemoveContainer(ID)
200-
return
201-
}
202-
203-
// If we succeed to get the network, kill and remove all the container on the network
204-
if netname, ok := network.Labels["worker_net"]; ok {
205-
log.Info("killAndRemove> Remove network %s", netname)
206-
for id := range network.Containers {
207-
h.killAndRemoveContainer(id)
201+
for _, cnetwork := range container.NetworkSettings.Networks {
202+
network, err := h.dockerClient.NetworkInfo(cnetwork.NetworkID)
203+
if err != nil {
204+
log.Info("killAndRemove> cannot NetworkInfo: %v", err)
205+
h.killAndRemoveContainer(ID)
206+
return
207+
}
208+
// If we succeed in getting the network, kill and remove all the containers on the network
209+
if netname, ok := network.Labels["worker_net"]; ok {
210+
log.Info("killAndRemove> Remove network %s", netname)
211+
for id := range network.Containers {
212+
h.killAndRemoveContainer(id)
213+
}
208214
}
209215
}
210216
}
@@ -421,14 +427,20 @@ func (h *HatcherySwarm) CanSpawn(model *sdk.Model, jobID int64, requirements []s
421427
}
422428
}
423429

424-
// hatcherySwarm.ratioService: Percent reserved for spwaning worker with service requirement
430+
// hatcherySwarm.ratioService: Percent reserved for spawning worker with service requirement
425431
// if no link -> we need to check ratioService
426-
if len(links) == 0 && len(cs) > 0 {
427-
percentFree := 100 - (100 * len(cs) / h.Config.MaxContainers)
428-
if percentFree <= h.Config.RatioService {
429-
log.Info("CanSpawn> ratio reached. percentFree:%d ratioService:%d", percentFree, h.Config.RatioService)
432+
if len(links) == 0 {
433+
if h.Config.RatioService >= 100 {
434+
log.Debug("CanSpawn> ratioService 100 by conf - no spawn worker without CDS Service")
430435
return false
431436
}
437+
if len(cs) > 0 {
438+
percentFree := 100 - (100 * len(cs) / h.Config.MaxContainers)
439+
if percentFree <= h.Config.RatioService {
440+
log.Debug("CanSpawn> ratio reached. percentFree:%d ratioService:%d", percentFree, h.Config.RatioService)
441+
return false
442+
}
443+
}
432444
}
433445

434446
log.Debug("CanSpawn> %s need %v", model.Name, links)

sdk/hatchery/hatchery.go

+8-7
Original file line numberDiff line numberDiff line change
@@ -138,24 +138,25 @@ func receiveJob(h Interface, isWorkflowJob bool, execGroups []sdk.Group, jobID i
138138

139139
atomic.AddInt64(nRoutines, 1)
140140
defer atomic.AddInt64(nRoutines, -1)
141-
if errR := routine(h, isWorkflowJob, models, execGroups, jobID, requirements, hostname, time.Now().Unix()); errR != nil {
141+
isSpawned, errR := routine(h, isWorkflowJob, models, execGroups, jobID, requirements, hostname, time.Now().Unix())
142+
if errR != nil {
142143
log.Warning("Error on routine: %s", errR)
143144
return false
144145
}
145-
return true
146+
return isSpawned
146147
}
147148

148-
func routine(h Interface, isWorkflowJob bool, models []sdk.Model, execGroups []sdk.Group, jobID int64, requirements []sdk.Requirement, hostname string, timestamp int64) error {
149+
func routine(h Interface, isWorkflowJob bool, models []sdk.Model, execGroups []sdk.Group, jobID int64, requirements []sdk.Requirement, hostname string, timestamp int64) (bool, error) {
149150
defer logTime(h, fmt.Sprintf("routine> %d", timestamp), time.Now())
150151
log.Debug("routine> %d enter", timestamp)
151152

152153
if h.Hatchery() == nil || h.Hatchery().ID == 0 {
153154
log.Debug("Create> continue")
154-
return nil
155+
return false, nil
155156
}
156157

157158
if len(models) == 0 {
158-
return fmt.Errorf("routine> %d - No model returned by CDS api", timestamp)
159+
return false, fmt.Errorf("routine> %d - No model returned by CDS api", timestamp)
159160
}
160161
log.Debug("routine> %d - models received: %d", timestamp, len(models))
161162

@@ -205,11 +206,11 @@ func routine(h Interface, isWorkflowJob bool, models []sdk.Model, execGroups []s
205206
if err := h.Client().QueueJobSendSpawnInfo(isWorkflowJob, jobID, infos); err != nil {
206207
log.Warning("routine> %d - cannot client.QueueJobSendSpawnInfo for job %d: %s", timestamp, jobID, err)
207208
}
208-
break // ok for this job
209+
return true, nil // ok for this job
209210
}
210211
}
211212

212-
return nil
213+
return false, nil
213214
}
214215

215216
func provisioning(h Interface, provisionDisabled bool, models []sdk.Model) {

sdk/hatchery/register.go

+12-9
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ func Create(h Interface) {
7272
tickerCountWorkersStarted := time.NewTicker(time.Duration(2 * time.Second))
7373
tickerGetModels := time.NewTicker(time.Duration(3 * time.Second))
7474

75-
var maxWorkersReached bool
7675
var models []sdk.Model
7776

7877
// Call WorkerModel Enabled first
@@ -96,9 +95,6 @@ func Create(h Interface) {
9695
workersStarted = int64(h.WorkersStarted())
9796
if workersStarted > int64(h.Configuration().Provision.MaxWorker) {
9897
log.Info("max workers reached. current:%d max:%d", workersStarted, int64(h.Configuration().Provision.MaxWorker))
99-
maxWorkersReached = true
100-
} else {
101-
maxWorkersReached = false
10298
}
10399
log.Debug("workers already started:%d", workersStarted)
104100
case <-tickerGetModels.C:
@@ -108,25 +104,32 @@ func Create(h Interface) {
108104
log.Error("error on h.Client().WorkerModelsEnabled(): %v", errwm)
109105
}
110106
case j := <-pbjobs:
111-
if maxWorkersReached {
112-
log.Debug("maxWorkerReached:%d", workersStarted)
107+
if workersStarted > int64(h.Configuration().Provision.MaxWorker) {
108+
log.Debug("maxWorkersReached:%d", workersStarted)
113109
continue
114110
}
115111
go func(job sdk.PipelineBuildJob) {
112+
atomic.AddInt64(&workersStarted, 1)
116113
if isRun := receiveJob(h, false, job.ExecGroups, job.ID, job.QueuedSeconds, job.BookedBy, job.Job.Action.Requirements, models, &nRoutines, spawnIDs, hostname); isRun {
117-
atomic.AddInt64(&workersStarted, 1)
118114
spawnIDs.SetDefault(string(job.ID), job.ID)
115+
} else {
116+
atomic.AddInt64(&workersStarted, -1)
119117
}
120118
}(j)
121119
case j := <-wjobs:
122-
if maxWorkersReached {
123-
log.Debug("maxWorkerReached:%d", workersStarted)
120+
if workersStarted > int64(h.Configuration().Provision.MaxWorker) {
121+
log.Debug("maxWorkersReached:%d", workersStarted)
124122
continue
125123
}
126124
go func(job sdk.WorkflowNodeJobRun) {
125+
// Count +1 here, and subtract 1 again if the worker is not started;
126+
// this avoids spawning too many workers compared to the configured maximum.
127+
atomic.AddInt64(&workersStarted, 1)
127128
if isRun := receiveJob(h, true, nil, job.ID, job.QueuedSeconds, job.BookedBy, job.Job.Action.Requirements, models, &nRoutines, spawnIDs, hostname); isRun {
128129
atomic.AddInt64(&workersStarted, 1)
129130
spawnIDs.SetDefault(string(job.ID), job.ID)
131+
} else {
132+
atomic.AddInt64(&workersStarted, -1)
130133
}
131134
}(j)
132135
case err := <-errs:

0 commit comments

Comments
 (0)