-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathpretrain.sh
More file actions
81 lines (71 loc) · 2.09 KB
/
pretrain.sh
File metadata and controls
81 lines (71 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/bin/bash
set -x -e
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=18000000
export NNODES=4
export num_gpus=8
export CPUS_PER_TASK=16
MASTER_ADDR=`scontrol show hostname $SLURM_JOB_NODELIST | head -n1`
MASTER_PORT=$((RANDOM % 101 + 25199))
export MASTER_ADDR=$MASTER_ADDR
export MASTER_PORT=$MASTER_PORT
echo $MASTER_ADDR
echo $MASTER_PORT
job_id=${SLURM_JOB_ID}
name="g2vlm_pretrain_${WORLD_SIZE}g_${job_id}"
export MODEL_PATH="InternRobotics/G2VLM-Qwen2-VL-2B"
export output_dir="./checkpoints/${name}/"
mkdir -p ${output_dir}
export checkpoint_dir="./checkpoints/${name}"
mkdir -p ${checkpoint_dir}
# export WANDB_MODE=offline
# export WANDB_API_KEY="your key"
export current_time=$(date +%Y%m%d_%H%M%S)
export wandb_name=$name
torchrun \
--nnodes $NNODES \
--nproc_per_node 8 \
--node_rank="${SLURM_NODEID}" \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
train/pretrain_unified_model.py \
--dataset_config_file data/configs/pretrain.yaml \
--layer_module Qwen2VLMoTDecoderLayer \
--vit_path ${MODEL_PATH} \
--dino_path facebook/dinov2-with-registers-large \
--llm_path ${MODEL_PATH} \
--model_path ${MODEL_PATH} \
--use_flex True \
--expected_num_tokens 25600 \
--max_num_tokens 25600 \
--max_num_tokens_per_sample 25600 \
--wandb_project G2VLM \
--wandb_name ${wandb_name} \
--wandb_offline True \
--wandb_resume allow \
--checkpoint_dir ${checkpoint_dir} \
--llm_qk_norm True \
--finetune_from_hf True \
--auto_resume False \
--resume-model-only True \
--finetune-from-ema True \
--resume_from ${MODEL_PATH} \
--finetune_dino_from_hf False \
--copy_init_moe False \
--visual_und False \
--visual_recon True \
--pretrain_train_recon True \
--enable_ema_model False \
--freeze_dino True \
--freeze_vit True \
--freeze_und True \
--results_dir $output_dir \
--save_every 2000 \
--total_steps 100000 \
--warmup_steps 5000 \
--log_every 1 \
--num_shard 8 \
--num_replicate 4 \
--lr 2e-4 \
--lr_scheduler cosine \
--num_workers 4