#!/bin/bash
# General resource requests.
#--------------------------------------------
#---> COMMON OPTIONS
#--------------------------------------------
#SBATCH --time=24:00:00 # Job duration (72h is the limit).
#SBATCH --ntasks=1 # Number of tasks.
# #SBATCH --mem=0 # Real memory required per node.
#SBATCH --gres=gpu:1 # The specified resources will be allocated to the job on each node.
# #SBATCH --cpus-per-task=4 # Number of cpu-cores per task (>1 if multi-threaded tasks).
# #SBATCH --exclusive # The job can not share nodes with other running jobs.
#--------------------------------------------
#---> TURGALIUM
#--------------------------------------------
#SBATCH --partition=volta # Request specific partition.
# #SBATCH --exclude=acp[02],aap[01-04] # Explicitly exclude certain nodes from the resources granted to the job.
#--------------------------------------------
#---> NGPU.URG
#--------------------------------------------
# #SBATCH --partition=dios # Request specific partition (dios, dgx).
#--------------------------------------------
#---> UNUSED OPTIONS
#--------------------------------------------
# #SBATCH --nodes=1 # Number of nodes.
# #SBATCH --gpus-per-node=2 # Specify the number of GPUs required for the job on each node.
# Current exp configuration --> Imbalanced/Balanced
#--------------------------------------------
# --> INFO: Specific configurations for the experiments.
#--------------------------------------------
# * RayTune: --ntasks=1, --gpus-per-node=2/4, --exclusive
# * DDP-4GPUs: --ntasks=4, --gpus-per-node=4, --exclusive
# * Imbalanced (1-GPU): --ntasks=1, --gpus-per-node=2/4
# * Balanced (1-GPU):   --ntasks=1, --gpus-per-node=2/4
#   (an example of selecting one of these at submit time is sketched below)
#--------------------------------------------
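#--------------------------------------------
# Hedged sketch (not part of the original workflow): any of the configurations
# above could also be selected at submit time without editing this header,
# e.g. for the DDP-4GPUs setup:
#   sbatch --ntasks=4 --gpus-per-node=4 --exclusive finetuning_slurm.sh
#--------------------------------------------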
# Troubleshooting.
export LOGLEVEL=INFO
export NCCL_DEBUG=INFO
# Torchrun configuration for Slurm.
nodes_array=( $(scontrol show hostnames "$SLURM_JOB_NODELIST") )
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Nodes array: ${nodes_array[@]}"
echo "Head node: $head_node"
echo "Head node IP: $head_node_ip"
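# Hedged sketch (assumption, not the original launch path): the head-node IP
# gathered above is typically consumed as a torchrun rendezvous endpoint inside
# the launched script, along the lines of:
#   torchrun --nnodes="$SLURM_NNODES" --nproc_per_node=1 \
#            --rdzv_backend=c10d --rdzv_endpoint="$head_node_ip:29500" train.py
# Here train.py and port 29500 are placeholders; the actual entry point is
# invoked from finetuning_run_localhost.sh.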
# Load virtual environment (turgalium).
source ~/anaconda3/etc/profile.d/conda.sh
conda activate ssl-bsu-conda
# Load virtual environment (ngpu.ugr).
# export PATH="/opt/anaconda/anaconda3/bin:$PATH"
# export PATH="/opt/anaconda/bin:$PATH"
# eval "$(conda shell.bash hook)"
# conda activate /mnt/homeGPU/asanchez/ssl-conda
# export TFHUB_CACHE_DIR=.
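# Optional sanity check (hedged sketch, assumes PyTorch is installed in the
# activated environment): confirm the GPU is visible before launching, e.g.
#   python -c "import torch; print(torch.__version__, torch.cuda.is_available())"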
# Define the general settings.
command="./finetuning_run_localhost.sh"
# Show the chosen options.
echo "---------------------"
echo "Command executed: >> srun $command"
echo "---------------------"
# Run.
mail -s "Sbatch $model began" [email protected] <<< "Starting..."
srun $command
mail -s "Sbatch $model ended" [email protected] <<< "Completed!"