From ccbbdab894b30c7ddf135b889a7dba26b1f813ae Mon Sep 17 00:00:00 2001
From: Wu
Date: Thu, 6 Jun 2019 14:14:36 -0700
Subject: [PATCH] minor change for EFA running

---
 infra/ami/train_efa.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/infra/ami/train_efa.sh b/infra/ami/train_efa.sh
index 86eb342e..a3c6135c 100644
--- a/infra/ami/train_efa.sh
+++ b/infra/ami/train_efa.sh
@@ -27,18 +27,18 @@ mpirun -np ${NUM_GPU} \
 -x LD_LIBRARY_PATH \
 -x PATH \
 -x NCCL_SOCKET_IFNAME=^docker0,lo \
--x NCCL_MIN_NRINGS=8 \
+-x NCCL_MIN_NRINGS=13 \
 -x NCCL_DEBUG=INFO \
 -x TENSORPACK_FP16=1 \
 -x HOROVOD_CYCLE_TIME=0.5 \
 -x HOROVOD_FUSION_THRESHOLD=67108864 \
-python3 tensorpack-mask-rcnn/MaskRCNN/train.py \
+python3 /home/ec2-user/tensorpack-mask-rcnn/MaskRCNN/train.py \
 --fp16 \
 --throughput_log_freq ${THROUGHPUT_LOG_FREQ} \
 --config \
 MODE_MASK=True \
 MODE_FPN=True \
-DATA.BASEDIR=/data \
+DATA.BASEDIR=/home/ec2-user/data \
 DATA.TRAIN='["train2017"]' \
 DATA.VAL='("val2017",)' \
 TRAIN.BATCH_SIZE_PER_GPU=${BATCH_SIZE_PER_GPU} \
@@ -46,7 +46,7 @@ TRAIN.LR_EPOCH_SCHEDULE='[(8, 0.1), (10, 0.01), (12, None)]' \
 TRAIN.EVAL_PERIOD=12 \
 RPN.TOPK_PER_IMAGE=True \
 PREPROC.PREDEFINED_PADDING=True \
-BACKBONE.WEIGHTS=/data/pretrained-models/ImageNet-R50-AlignPadding.npz \
+BACKBONE.WEIGHTS=/home/ec2-user/data/pretrained-models/ImageNet-R50-AlignPadding.npz \
 BACKBONE.NORM=FreezeBN \
 TRAINER=horovod
 #For 32x4