Skip to content

Install and Run on DeltaAI

Cameron Smith edited this page Oct 28, 2025 · 1 revision

create working dir

# -p makes the step safe to repeat; && avoids continuing in the wrong
# directory if the working dir could not be created.
mkdir -p mlReconnection && cd mlReconnection

allocate a bunch of cores for the python install

# Request an interactive Slurm allocation: 1 node (-N 1), 30 tasks (-n 30),
# a 120-minute time limit (-t120), and 1 GPU per node on the ghx4 partition.
salloc -N 1 -n 30 -t120 --account=bfim-dtai-gh --gpus-per-node=1 --partition=ghx4

create pgkyl install script

# Write the postgkyl install script. The heredoc delimiter is quoted so the
# script text is written verbatim, with no expansion by the current shell.
cat << 'EOF' > installPgkyl.sh
#!/bin/bash
# Installs postgkyl (editable mode) into the ./pgkyl virtual environment.
# Stop at the first failing step so pip never runs in the wrong directory.
set -e
module use /sw/user/modules/python
module load python/miniforge3_pytorch
# Create the virtual environment on first use; reuse it if it already exists.
# NOTE(review): add --system-site-packages if the packages shipped with the
# loaded python module should be visible inside the venv — confirm.
[ -d pgkyl ] || python -m venv pgkyl
source pgkyl/bin/activate
git clone https://github.com/ammarhakim/postgkyl.git
cd postgkyl/
# Quoted so the shell cannot treat [adios,test] as a glob pattern.
pip install -e '.[adios,test]'
EOF

chmod +x installPgkyl.sh
./installPgkyl.sh

clone pgkylFrontEnd

# NOTE(review): the host part of the SSH URL was obfuscated in the page
# export; git@github.com is assumed — confirm.
git clone -b cws/scorec git@github.com:scorec/pgkylFrontEnd.git

create an environment file for python

# Generate envPython.sh from inside the mlReconnection working directory.
# $PWD is expanded NOW, so root records this directory (the one holding
# pgkyl/ and pgkylFrontEnd/). Every \$ expansion is escaped so it is written
# literally and only evaluated when envPython.sh is sourced.
cat << EOF > envPython.sh
root="$PWD"
module use /sw/user/modules/python
module load python/miniforge3_pytorch
export PYTHONPATH="\${PYTHONPATH:+\$PYTHONPATH:}\$root/pgkylFrontEnd"
source "\$root/pgkyl/bin/activate"
EOF

enable the environment

do this every time you begin work (run it from the directory that contains envPython.sh)

source envPython.sh

install pytorch packages

see https://docs.ncsa.illinois.edu/systems/deltaai/en/latest/user-guide/python/pytorch.html#pip-install-into-a-conda-or-venv-environment

pip install --upgrade pip
pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126

clone and test reconClassifier

# Clone the classifier and run a short smoke test: training frames 1-2,
# validation frames 2-3, 2 epochs.
# NOTE(review): the host part of the SSH URL was obfuscated in the page
# export; git@github.com is assumed — confirm.
git clone git@github.com:SCOREC/reconClassifier
data=/work/nvme/bfim/cwsmith/mlReconnection2025/1024Res_v0
python reconClassifier/XPointMLTest.py \
  --paramFile="$data/pkpm_2d_turb_p2-params.txt" \
  --xptCacheDir="$data/cache" \
  --trainFrameFirst 1 --trainFrameLast 2 \
  --validationFrameFirst 2 --validationFrameLast 3 \
  --epochs 2 --minTrainingLoss 0

run benchmark

# Copy the benchmark inputs to a per-user directory under /dev/shm
# (requires ~10GB), then point the run at that copy.
nvme=/work/nvme/bfim/cwsmith/mlReconnection2025/1024Res_v0
ramdisk="/dev/shm/$(whoami)"
mkdir -p "$ramdisk"
time cp -r "$nvme/cache04082025" "$ramdisk/."
time cp "$nvme/pkpm_2d_turb_p2-params.txt" "$ramdisk/."
data="$ramdisk"

# OpenMP runtime settings for the benchmark run.
export OMP_NUM_THREADS=10
export OMP_SCHEDULE=STATIC
export OMP_PROC_BIND=CLOSE

# -u keeps Python's stdout/stderr unbuffered so progress appears immediately.
python -u reconClassifier/XPointMLTest.py \
  --paramFile="$data/pkpm_2d_turb_p2-params.txt" \
  --xptCacheDir="$data/cache04082025" \
  --use-amp \
  --benchmark \
  --epochs 5 \
  --batchSize 16 \
  --learningRate 1e-5