Skip to content

Commit 1b91a3a

Browse files
committed
watchdog: restart worker if failing
Workaround for #5. This commit will need to be reverted once the real watchdog is unblocked.
1 parent fb0f13c commit 1b91a3a

File tree

7 files changed

+73
-4
lines changed

7 files changed

+73
-4
lines changed

defaults/main.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ concourse_group: "{{ concourse_user }}"
1717
concourse_gid: "{{ concourse_uid }}"
1818
concourse_force_restart: no
1919
concourse_service_enabled: yes
20-
20+
concourse_service_watchdog_enabled: yes
2121

2222
# Concourse source variables
2323

tasks/install-worker.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,21 @@
2424
dest: "{{ concourse_worker_launcher_path }}"
2525
- src: concourse-retire-worker.j2
2626
dest: "{{ concourse_retire_worker_path }}"
27+
- src: concourse-worker-watchdog.j2
28+
dest: "{{ concourse_install_dir }}/concourse-worker-watchdog"
2729

2830
- name: create worker service | concourse
2931
template:
30-
src: concourse-worker.service.j2
31-
dest: /etc/systemd/system/concourse-worker.service
32+
src: "{{ item['src'] }}"
33+
dest: "{{ item['dest'] }}"
3234
owner: root
3335
force: yes
3436
become: yes
3537
become_user: root
38+
with_items:
39+
- src: concourse-worker.service.j2
40+
dest: /etc/systemd/system/concourse-worker.service
41+
- src: concourse-worker-watchdog.service.j2
42+
dest: /etc/systemd/system/concourse-worker-watchdog.service
3643
notify:
3744
- restart concourse worker

tasks/start.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@
1212
become: yes
1313
when: concourse_worker
1414

15+
- name: configure worker watchdog service | concourse
16+
service:
17+
name: concourse-worker-watchdog
18+
enabled: "{{ concourse_service_watchdog_enabled }}"
19+
become: yes
20+
when: concourse_worker
21+
1522
- name: start web service | concourse
1623
service:
1724
name: concourse-web
@@ -25,3 +32,10 @@
2532
state: started
2633
become: yes
2734
when: concourse_worker and concourse_service_enabled
35+
36+
- name: start worker watchdog service | concourse
37+
service:
38+
name: concourse-worker-watchdog
39+
state: started
40+
become: yes
41+
when: concourse_worker and concourse_service_enabled

templates/concourse-retire-worker.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ export {{ key }}="{{ value }}"
99
# If $1 PID of concourse worker is provided, do a kill instead of an api call
1010
# Mostly used by systemd for concourse compatibility issues https://github.com/concourse/concourse/pull/3929
1111

12-
until ! curl --fail 127.0.0.1:7777/ping; do
12+
until ! curl --silent --fail 127.0.0.1:7777/ping; do
1313

1414
if [[ -z "$1" ]]; then
1515
{{ concourse_binary_path }} retire-worker \
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash
2+
3+
watchdog() {
4+
#WORKER_PID=$1
5+
6+
RETRY=3
7+
while(true); do
8+
FAIL=0
9+
10+
curl --silent 127.0.0.1:8888 || FAIL=1
11+
12+
#if [[ $FAIL -eq 0 ]]; then
13+
if [[ $FAIL -eq 1 ]]; then
14+
if [[ $RETRY -ne 0 ]]; then
15+
echo "retry $RETRY"
16+
((RETRY=RETRY-1))
17+
else
18+
echo "restart worker"
19+
#/bin/systemd-notify --pid=$WORKER_PID "WATCHDOG=1";
20+
/bin/systemctl restart concourse-worker
21+
RETRY=3
22+
fi
23+
else
24+
echo "watchdog: concourse-worker healthcheck ok"
25+
#sleep 1
26+
RETRY=3
27+
fi
28+
sleep 15
29+
done
30+
}
31+
32+
watchdog
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# {{ ansible_managed }}
2+
3+
[Unit]
4+
Description=concourse-worker-watchdog
5+
Requires=network-online.target
6+
After=concourse-worker.service
7+
8+
[Service]
9+
ExecStart={{ concourse_install_dir }}/concourse-worker-watchdog
10+
ExecStop=/bin/kill $MAINPID
11+
ExecReload=/bin/kill -HUP $MAINPID
12+
Restart=on-failure
13+
14+
[Install]
15+
WantedBy=multi-user.target

templates/concourse-worker.service.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
Description=concourse-worker
55
Requires=network-online.target
66
After=network-online.target
7+
Before=concourse-worker-watchdog.service
78

89
[Service]
910
ExecStart={{ concourse_worker_launcher_path }}

0 commit comments

Comments
 (0)