-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdq-churn
executable file
·87 lines (75 loc) · 2.24 KB
/
dq-churn
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/bash
#######################################
# Expand a comma-separated list of numbers and ranges into single numbers.
#
# Examples:
#   expand "1,5-7,13-18,22"  ->  1 5 6 7 13 14 15 16 17 18 22
#   expand "3-1"             ->  3 2 1        (descending range)
#   expand "2,4-5" -         ->  -2-4-5-
#
# Arguments:
#   $1 - list such as "1,5-7,13-18,22"; items may be separated by commas
#        and/or whitespace.
#   $2 - separator written before, between and after the numbers
#        (default: a single space).
# Outputs:
#   The expanded list on stdout. Runs of whitespace in the result are
#   collapsed to one space and leading/trailing whitespace is dropped, so
#   with the default separator the output is a plain space-separated list.
#######################################
expand() {
  local input="$1"
  local sep="${2:- }"
  local result_str="$sep"
  local num left right rest res
  local -a tokens=() words=()

  # Split on commas/whitespace into an array. Unlike an unquoted $(...),
  # this never glob-expands an item. read reports non-zero when it hits
  # end of input before the NUL delimiter; the array is filled regardless.
  read -r -d '' -a tokens <<< "${input//,/ }" || true

  for num in "${tokens[@]}"; do
    if [[ "$num" == *-* ]]; then
      # First and second '-'-separated fields of the item.
      left=${num%%-*}
      rest=${num#*-}
      right=${rest%%-*}
      # The "0" prefix keeps the test well-formed when a bound is empty.
      # `[` is used deliberately: it compares in decimal, whereas `[[`/`((`
      # would treat a zero-padded bound such as 08 as invalid octal.
      # ${x:+"$x"} drops an empty bound from seq's argument list instead of
      # passing an empty string.
      if [ "0$left" -lt "0$right" ]; then
        res=$(seq -s "$sep" ${left:+"$left"} ${right:+"$right"})
      else
        res=$(seq -s "$sep" ${left:+"$left"} -1 ${right:+"$right"})
      fi
    else
      res="$num"
    fi
    result_str+="$res$sep"
  done

  # Normalise whitespace (split into words, re-join with single spaces)
  # without subjecting the result to pathname expansion.
  read -r -d '' -a words <<< "$result_str" || true
  printf '%s\n' "${words[*]}"
}
# --- Scheduler configuration and queue-directory bootstrap ---------------

# Highest host number. When a job sets no DQ_RANGE, hosts are tried from
# MAX_HOST down to 1.
MAX_HOST=24
# GPU count used to bound the per-host GPU index scan.
GPU_CNT=4
# Host-name prefix: this machine's short host name with all digits removed
# (e.g. "node12" -> "node"). Candidate hosts are named "$CLUSTER<number>".
CLUSTER=$(hostname -s | sed 's/[0-9]//g')
# Absolute path of the directory containing this script. Abort if it cannot
# be resolved, otherwise QDIR would silently become "/jobs/queue".
D=$(cd -- "$(dirname -- "$0")" && pwd) || {
  echo "Cannot determine the script directory" >&2
  exit 1
}
# ssh command line used to start jobs; kept as one string and expanded
# unquoted at the call site so it splits into separate options.
SSH="ssh -q -x -o BatchMode=yes -o ConnectTimeout=10 -o ServerAliveInterval=3"
# Directory holding one sub-directory per queued job.
QDIR="$D/jobs/queue"
mkdir -p -- "$QDIR" && touch -- "$QDIR"
# Lock file used to serialise scheduling passes.
touch -- "$QDIR/.lock"
# Scheduling pass. Runs in a subshell whose fd 200 is opened on the queue
# lock file, so flock can serialise concurrent invocations; the descriptor
# only lives as long as the subshell. `exit` below leaves the subshell, and
# because the subshell is the script's last command its status becomes the
# script's exit status.
(
  echo -n "Locking..."
  # Wait at most 10 seconds for an exclusive lock on fd 200.
  flock -xw 10 200 || {
    echo " Timeout!"
    echo "Please inspect system with 'dq-status' 'dq-free' etc."
    exit 1
  }
  echo " locked."

  # Queue entries are the names in $QDIR that contain a '-', sorted
  # numerically with '-' as the field separator.
  JOBS=$(ls -1 "$QDIR" | grep -F -e - | sort -n -k1 -t-)
  if [[ -z "$JOBS" ]]; then
    echo "No job requests to schedule"
    exit 0
  fi

  # Intentionally unquoted: one job id per whitespace-separated word.
  for JOBID in $JOBS; do
    export KRB5CCNAME="$QDIR/$JOBID/ticket.krb5"
    USER=$(cat -- "$QDIR/$JOBID/user.txt")
    export USER
    HOST=
    GPU=
    # Re-queried for every job so a GPU handed out in an earlier iteration
    # is no longer offered. NOTE(review): the match below assumes entries
    # of the form "<host>: GPU-<index>" - confirm against gpu-free-list.
    ALL_FREE=$("$D/dq-on-all" shell -pc gpu-free-list)
    DQ_RANGE=$(grep DQ_RANGE "$QDIR/$JOBID/env.txt" | cut -f2 -d=)
    DQ_EXCLUDE=$(grep DQ_EXCLUDE "$QDIR/$JOBID/env.txt" | cut -f2 -d=)
    # Default range "MAX_HOST-1" expands to MAX_HOST, MAX_HOST-1, ..., 1.
    RANGE=$(expand "${DQ_RANGE:-$MAX_HOST-1}")
    # Expanded with '-' as separator ("-3-4-5-") so that a host number can
    # be matched unambiguously as "-<n>-".
    EXCLUDE=$(expand "$DQ_EXCLUDE" -)

    # Intentionally unquoted: RANGE is a space-separated list.
    for i in $RANGE; do
      if [[ "$EXCLUDE" == *-"$i"-* ]]; then
        echo "Excluding host $i"
        continue
      fi
      # GPU indices are zero-based (the chosen index is passed on as
      # CUDA_VISIBLE_DEVICES), so the valid indices are 0 .. GPU_CNT-1.
      for ((gpu = 0; gpu < GPU_CNT; gpu++)); do
        if [[ "$ALL_FREE" == *"$CLUSTER$i: GPU-$gpu"* ]]; then
          HOST="$CLUSTER$i"
          GPU=$gpu
          break
        fi
      done
      if [[ -n "$HOST" ]]; then
        break
      fi
    done

    if [[ -z "$HOST" ]]; then
      echo "No machines free for job $JOBID"
      # Stops the whole pass: jobs queued behind this one are not
      # considered (presumably to keep queue order - confirm).
      break
    fi

    echo "Executing '$JOBID' on $HOST/$GPU as $USER..."
    # $SSH is deliberately unquoted so its options are split into words.
    # shellcheck disable=SC2086
    $SSH "$USER@$HOST" "CUDA_VISIBLE_DEVICES=$GPU" "$D/dq-submit-backend" "$JOBID"
  done
) 200>"$QDIR/.lock"