Commit e676aa3

DeepNVMe benchmarks (#991)
* ds_io sweep scripts
* Use accelerator pin memory
* Credit
* Add README

Signed-off-by: Olatunji Ruwase <[email protected]>
Co-authored-by: Olatunji Ruwase <[email protected]>
1 parent ad2a4bd commit e676aa3

File tree: 7 files changed, +209 −3 lines changed

deepnvme/README.md

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
This folder contains performance micro-benchmarks that use DeepNVMe for various I/O data transfer scenarios. `GDS` mode transfers are supported on compatible platforms.

ds_io: Sweep scripts for the `ds_io` I/O performance utility. `ds_io` is a `fio`-like utility for measuring read and write performance of raw I/O transfers involving CPU or GPU buffers.

file_access: Scripts for measuring file accesses involving CPU or GPU buffers.

zero_inference: Weight-offloading for LLM inference.
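
For orientation, the sweep scripts added below each take a transfer mode (cpu, gpu, or gds), an NVMe mount prefix, and an output log directory as positional arguments. A minimal invocation sketch with placeholder paths (the prefix is globbed, so /mnt/nvme is expected to match drive folders such as /mnt/nvme0, /mnt/nvme1, ...):

    # Placeholder paths; substitute your own NVMe mount prefix and log folders.
    ./ds_io_read_sweep.sh gds /mnt/nvme /tmp/ds_io_read_logs
    ./ds_io_sweep.sh write gpu /mnt/nvme /tmp/ds_io_write_logs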

deepnvme/ds_io/ds_io_read_sweep.sh

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
#!/bin/bash

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <xfer [cpu|gpu|gds]> <nvme mount> <output log dir>"
    exit 1
fi

XFER=$1
NVME_DIR=$2
LOG_DIR=$3

./ds_io_sweep.sh "read" ${XFER} ${NVME_DIR} ${LOG_DIR}

deepnvme/ds_io/ds_io_sweep.sh

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
#!/bin/bash
# set -x
if [[ $# -lt 4 ]]; then
    echo "Usage: $0 <op [read|write]> <xfer [gpu|cpu|gds]> <nvme mount> <output log dir>"
    exit 1
fi

IO_OP=$1
XFER=$2
NVME_DIR=$3
LOG_DIR=$4


if [[ ${IO_OP} == "read" ]]; then
    io_op_opt="--read"
elif [[ ${IO_OP} == "write" ]]; then
    io_op_opt=""
else
    echo "Error: ${IO_OP} is an invalid op. Valid ops are [read, write]"
    exit 1
fi

if [[ ${XFER} == "cpu" ]]; then
    xfer_opt=""
elif [[ ${XFER} == "gpu" ]]; then
    xfer_opt="--gpu --use_accelerator_pin_memory"
elif [[ ${XFER} == "gds" ]]; then
    xfer_opt="--gpu --use_gds"
else
    echo "Error: ${XFER} is an invalid xfer. Valid xfers are [cpu, gpu, gds]"
    exit 1
fi

NUM_DRIVES=$(ls -d ${NVME_DIR}* | wc -l)
if [[ $NUM_DRIVES -lt 1 ]]; then
    echo "Error: Found less than 1 folder in ${NVME_DIR}"
    exit 1
fi


mkdir -p ${LOG_DIR}
IO_SIZE=1G

for numjobs in 1 4 8; do
    # Skip configurations with fewer jobs than drive folders.
    if ((numjobs < NUM_DRIVES)); then
        continue
    fi
    # Spread the jobs evenly across the drive folders: <NVME_DIR><drive>:<job>.
    FTD_OPT="--folder_to_device_mapping "
    drive_num=0
    jobs_per_drive=$((numjobs/NUM_DRIVES))
    if ((jobs_per_drive == 0)); then
        jobs_per_drive=1
    fi
    for (( i=0; i<${numjobs}; i++ )); do
        FTD_OPT="${FTD_OPT} ${NVME_DIR}${drive_num}:${i}"
        if (( (i+1) % jobs_per_drive == 0 )); then
            drive_num=$((drive_num+1))
        fi
    done
    # echo ${FTD_OPT}
    COMMON_OPTS="--io_size ${IO_SIZE} ${io_op_opt} ${xfer_opt} ${FTD_OPT}"
    for ov in overlap sequential; do
        if [[ ${ov} == "sequential" ]]; then
            ov_opt="--sequential_requests"
        else
            ov_opt=""
        fi
        for sub in single block; do
            if [[ ${sub} == "single" ]]; then
                sub_opt="--single_submit"
            else
                sub_opt=""
            fi
            for io_para in 1 2 4 8; do
                io_para_opt="--io_parallel ${io_para}"
                for bs in 1M 2M; do
                    bs_opt="--block_size ${bs}"
                    for qd in 128; do
                        qd_opt="--queue_depth ${qd}"
                        RUN_OPTS="${ov_opt} ${sub_opt} ${io_para_opt} ${bs_opt} ${qd_opt}"
                        LOG="${LOG_DIR}/${IO_OP}_${sub}_${ov}_t${io_para}_p${numjobs}_d${qd}_bs${bs}.txt"
                        cmd="ds_io ${COMMON_OPTS} ${RUN_OPTS} &> ${LOG}"
                        echo ${cmd}
                        eval ${cmd}
                    done
                done
            done
        done
    done
done
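
For a concrete sense of what the sweep emits: assuming a single drive folder at /mnt/nvme0 and a log directory /tmp/logs (both placeholder paths) and a `gds` read sweep, one of the generated command lines would look roughly like:

    ds_io --io_size 1G --read --gpu --use_gds --folder_to_device_mapping /mnt/nvme0:0 --io_parallel 1 --block_size 1M --queue_depth 128 &> /tmp/logs/read_block_overlap_t1_p1_d128_bs1M.txt

Every option here is assembled by the script above; only the paths are illustrative.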
deepnvme/ds_io/ds_io_write_sweep.sh

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
#!/bin/bash

if [[ $# -lt 3 ]]; then
    echo "Usage: $0 <xfer [cpu|gpu|gds]> <nvme mount> <output log dir>"
    exit 1
fi

XFER=$1
NVME_DIR=$2
LOG_DIR=$3

./ds_io_sweep.sh "write" ${XFER} ${NVME_DIR} ${LOG_DIR}
exit

deepnvme/model_checkpoint/save_model_utils.py

Lines changed: 12 additions & 2 deletions
@@ -1,6 +1,6 @@
 import argparse
 import os
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from transformers import T5ForConditionalGeneration
 from torch_save_utils import PINNED_BUFFER_MB

@@ -23,10 +23,13 @@ def _get_hf_model(tag):
     model_name = HF_MODELS_DICT[tag]
     if tag == TINY_T5:
         model = T5ForConditionalGeneration.from_pretrained(model_name)
+
     else:
         model = AutoModelForCausalLM.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+

-    return model, model_name, tag
+    return model, tokenizer, model_name, tag

 def get_model(model_tag):
     return _get_hf_model(model_tag)

@@ -108,6 +111,13 @@ def parse_arguments():
                         action='store_true',
                         help='Disable double buffering of i/o buffer.')

+    parser.add_argument('--safetensors',
+                        action='store_true',
+                        help='Use safetensors load/save.')
+
+    parser.add_argument('--regular_torch_save',
+                        action='store_true',
+                        help='Use vanilla torch.save.')

     #parser.add_argument('--single_writer', action='store_true', help='Disable parallel rank writes of data parallel (replicated) state')

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
# Credit https://github.com/sayakpaul
from save_model_utils import get_model, validate_arguments, parse_arguments
from torch_save_utils import load_io_ops, _test_ds_fast_save, test_save
import safetensors.torch
import os
import time
import torch

def test_sft_save(file, buffer, args):
    st = time.time()
    safetensors.torch.save_file(filename=file, tensors=buffer)
    return time.time() - st

def main():
    print(
        f'Performance test of torch.save() integration of fast model checkpointing.'
    )
    print(f'torch version = {torch.__version__}')
    torch.manual_seed(42)

    args = parse_arguments()
    if not validate_arguments(args):
        quit()
    load_io_ops(args)
    model, tokenizer, model_name, ckpt_name = get_model(args.model)

    inputs = tokenizer("I am good", return_tensors="pt").to("cuda")

    if args.half:
        model = model.half()
    if args.gpu:
        model = model.to("cuda")

    with torch.no_grad():
        model.eval()
        pre_logits = model(**inputs).logits

    if not args.safetensors:
        file = os.path.join(args.folder, f'{ckpt_name}.pt')
    else:
        file = os.path.join(args.folder, f'{ckpt_name}.safetensors')
    if os.path.exists(file):
        os.remove(file)
    if not args.regular_torch_save and not args.safetensors:
        write_sec = _test_ds_fast_save(file, model.state_dict(), args, False)
    elif args.regular_torch_save:
        write_sec = test_save(file, model.state_dict(), args)
    else:
        write_sec = test_sft_save(file, model.state_dict(), args)
    ckpt_size = os.path.getsize(file)
    gb_size = ckpt_size / (1024**3)
    gb_per_sec = gb_size / write_sec
    print(
        f'{gb_size:5.2f} GB, {write_sec:5.2f} secs, {gb_per_sec:5.2f} GB/s'
    )
    st = time.time()
    if args.safetensors:
        loaded_sd = safetensors.torch.load_file(file, device="cuda")
    else:
        loaded_sd = torch.load(file, weights_only=True, map_location="cuda")
    load_sec = time.time() - st
    print(f"Loaded in {load_sec:5.2f} seconds.")
    model.load_state_dict(loaded_sd)
    with torch.no_grad():
        model.eval()
        post_logits = model(**inputs).logits

    assert torch.allclose(pre_logits, post_logits, atol=1e-3, rtol=1e-3)
    os.remove(file)


if __name__ == "__main__":
    main()
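
The name of this new driver file is not shown in this view, so the sketch below uses a placeholder script name; the model tag and checkpoint folder are also placeholders, and the flag spellings are assumed from the args attributes the code reads (args.model, args.folder, args.gpu, args.half, args.safetensors, args.regular_torch_save):

    # <script>.py stands in for the new driver above; tag and folder are illustrative.
    python <script>.py --model <model_tag> --folder /mnt/nvme0/checkpoints --gpu --half --safetensors
    python <script>.py --model <model_tag> --folder /mnt/nvme0/checkpoints --gpu --half --regular_torch_save

With neither switch, the script falls back to the DeepNVMe fast-save path (_test_ds_fast_save).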

deepnvme/model_checkpoint/torch_save_model.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def main():
     if not validate_arguments(args):
         quit()
     load_io_ops(args)
-    model, model_name, ckpt_name = get_model(args.model)
+    model, tokenizer, model_name, ckpt_name = get_model(args.model)
     if args.half:
         model = model.half()
     if args.gpu:
