@@ -261,6 +261,7 @@ struct cmd_params {
     std::vector<bool>        use_mmap;
     std::vector<bool>        embeddings;
     std::vector<bool>        no_op_offload;
+    std::vector<bool>        graph_reuse;
     ggml_numa_strategy       numa;
     int                      reps;
     ggml_sched_priority      prio;
@@ -298,6 +299,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap       */ { true },
     /* embeddings     */ { false },
     /* no_op_offload  */ { false },
+    /* graph_reuse    */ { false },
     /* numa           */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps           */ 5,
     /* prio           */ GGML_SCHED_PRIO_NORMAL,
@@ -377,6 +379,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
     printf("                                            (default: disabled)\n");
     printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
+    printf("  -gr, --graph-reuse <0|1>                  (default: 0)\n");
     printf("\n");
     printf(
         "Multiple values can be given for each parameter by separating them with ','\n"
@@ -620,6 +623,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "-gr" || arg == "--graph-reuse") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.graph_reuse.insert(params.graph_reuse.end(), p.begin(), p.end());
         } else if (arg == "--numa") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -885,6 +895,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_op_offload.empty()) {
         params.no_op_offload = cmd_params_defaults.no_op_offload;
     }
+    if (params.graph_reuse.empty()) {
+        params.graph_reuse = cmd_params_defaults.graph_reuse;
+    }
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
@@ -926,6 +939,7 @@ struct cmd_params_instance {
     bool               use_mmap;
     bool               embeddings;
     bool               no_op_offload;
+    bool               graph_reuse;

     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -998,6 +1012,7 @@ struct cmd_params_instance {
         cparams.embeddings  = embeddings;
         cparams.op_offload  = !no_op_offload;
         cparams.swa_full    = false;
+        cparams.graph_reuse = graph_reuse;

         return cparams;
     }
@@ -1018,6 +1033,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
     for (const auto & nopo : params.no_op_offload)
+    for (const auto & gr : params.graph_reuse)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
@@ -1059,6 +1075,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .graph_reuse   = */ gr,
             };
             instances.push_back(instance);
         }
@@ -1092,6 +1109,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .graph_reuse   = */ gr,
             };
             instances.push_back(instance);
         }
@@ -1125,6 +1143,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
                 /* .no_op_offload = */ nopo,
+                /* .graph_reuse   = */ gr,
             };
             instances.push_back(instance);
         }
@@ -1162,6 +1181,7 @@ struct test {
     bool          use_mmap;
     bool          embeddings;
     bool          no_op_offload;
+    bool          graph_reuse;
     int           n_prompt;
     int           n_gen;
     int           n_depth;
@@ -1197,6 +1217,7 @@ struct test {
         use_mmap       = inst.use_mmap;
         embeddings     = inst.embeddings;
         no_op_offload  = inst.no_op_offload;
+        graph_reuse    = inst.graph_reuse;
         n_prompt       = inst.n_prompt;
         n_gen          = inst.n_gen;
         n_depth        = inst.n_depth;
@@ -1243,8 +1264,8 @@ struct test {
             "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
             "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
             "defrag_thold",
-            "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
-            "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
+            "use_mmap", "embeddings", "no_op_offload", "graph_reuse", "n_prompt", "n_gen", "n_depth",
+            "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -1259,7 +1280,7 @@ struct test {
            return INT;
        }
        if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
-           field == "use_mmap" || field == "embeddings") {
+           field == "use_mmap" || field == "embeddings" || field == "graph_reuse") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
@@ -1333,6 +1354,7 @@ struct test {
             std::to_string(use_mmap),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
+            std::to_string(graph_reuse),
             std::to_string(n_prompt),
             std::to_string(n_gen),
             std::to_string(n_depth),
@@ -1518,6 +1540,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return 4;
         }
+        if (field == "graph_reuse") {
+            return 4;
+        }

         int width = std::max((int) field.length(), 10);

@@ -1552,6 +1577,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "graph_reuse") {
+            return "gr";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1626,6 +1654,9 @@ struct markdown_printer : public printer {
         if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
             fields.emplace_back("no_op_offload");
         }
+        if (params.graph_reuse.size() > 1 || params.graph_reuse != cmd_params_defaults.graph_reuse) {
+            fields.emplace_back("graph_reuse");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");
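Taken together, these hunks add `graph_reuse` as one more boolean sweep axis in llama-bench: it is parsed from `-gr`/`--graph-reuse`, defaults to `{ false }`, is forwarded to the context parameters as `cparams.graph_reuse`, and is printed as a `gr` column in the markdown output whenever more than one value is given or the value differs from the default. As with the other boolean parameters, multiple values can be supplied comma-separated, so a hypothetical invocation such as `llama-bench -m model.gguf -gr 0,1` (model path made up for illustration) would benchmark with graph reuse off and on and include the `gr` column in the results table.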