From a5e57e4a9a263056cf7bc65b4987adf2ae489e01 Mon Sep 17 00:00:00 2001
From: Andrew Gu
Date: Tue, 22 Oct 2024 13:04:38 -0700
Subject: [PATCH] Use expandable segments in `run_llama_train.sh`

ghstack-source-id: e8adf491c3a7d2da4a8674bb78b18e793b75a934
Pull Request resolved: https://github.com/pytorch/torchtitan/pull/643
---
 run_llama_train.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/run_llama_train.sh b/run_llama_train.sh
index a4107806..a69c967a 100755
--- a/run_llama_train.sh
+++ b/run_llama_train.sh
@@ -19,6 +19,7 @@ if [ $# -ne 0 ]; then
     overrides="$*"
 fi
 
+PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
 torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
 --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
 train.py --job.config_file ${CONFIG_FILE} $overrides