Commit e83d57f

vulkan: Reuse conversion results in prealloc_y
Cache the pipeline and tensor that were most recently used to fill prealloc_y, and skip the conversion if the current pipeline/tensor match.
1 parent 21c17b5 commit e83d57f
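In essence, the commit memoizes the most recent conversion into the shared prealloc_y staging buffer. A minimal sketch of the pattern (not the committed code; vk_ctx_sketch and run_conversion are hypothetical stand-ins): before converting src1 (copy-to-contiguous fp16, or quantize-to-q8_1), check whether prealloc_y already holds exactly that tensor converted by exactly that pipeline, and dispatch the conversion only on a miss.

    // Hedged sketch of the cache-and-skip pattern applied at each conversion site below.
    // vk_pipeline is a shared-pointer-like handle in ggml-vulkan.cpp; an empty handle
    // marks the cache as invalid.
    struct vk_ctx_sketch {
        vk_pipeline         prealloc_y_last_pipeline_used;           // empty => no valid cache
        const ggml_tensor * prealloc_y_last_tensor_used = nullptr;
    };

    static void convert_src1_cached(vk_ctx_sketch * ctx, const vk_pipeline & pipeline, const ggml_tensor * src1) {
        if (ctx->prealloc_y_last_pipeline_used == pipeline &&
            ctx->prealloc_y_last_tensor_used   == src1) {
            return; // prealloc_y already holds src1 converted with this pipeline
        }
        run_conversion(pipeline, src1); // stand-in for ggml_vk_cpy_to_contiguous / ggml_vk_quantize_q8_1
        ctx->prealloc_y_last_pipeline_used = pipeline;
        ctx->prealloc_y_last_tensor_used   = src1;
    }

The new tests added below build graphs in which several matmuls consume the same src1, which is exactly the case this cache accelerates.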

2 files changed: +94 -23 lines

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 58 additions & 17 deletions
@@ -1193,6 +1193,10 @@ struct ggml_backend_vk_context {
     vk::Fence fence, almost_ready_fence;
     bool almost_ready_fence_pending {};
 
+    // Cache the most recent tensor that was converted into prealloc_y, and the pipeline that was used to convert it.
+    vk_pipeline prealloc_y_last_pipeline_used;
+    const ggml_tensor *prealloc_y_last_tensor_used;
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
     vk_context_ref compute_ctx;
@@ -5651,10 +5655,20 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx
         ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
     if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+        if (ctx->prealloc_y_last_pipeline_used != to_q8_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+            ctx->prealloc_y_last_pipeline_used = to_q8_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_x = ne00*ne01;
@@ -5829,7 +5843,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride
@@ -6259,7 +6278,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx
             { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_x = ne00*ne01;
@@ -6447,7 +6471,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx
     }
     if (y_non_contig) {
         GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
-        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+        if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1 ||
+            ctx->prealloc_y_last_tensor_used != src1) {
+            ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
+            ctx->prealloc_y_last_pipeline_used = to_fp16_vk_1;
+            ctx->prealloc_y_last_tensor_used = src1;
+        }
     }
 
     uint32_t stride_batch_y = ne10*ne11;
@@ -6491,22 +6520,29 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
     GGML_ASSERT(nei0 <= 4096);
     const uint32_t split_size = std::min(nei1, 4096u / nei0);
 
-    ggml_tensor src1_copy = *src1;
-    ggml_tensor src2_copy = *src2;
-    ggml_tensor dst_copy = *dst;
+    if (split_size == nei1) {
+        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
+    } else {
+        ggml_tensor src1_copy = *src1;
+        ggml_tensor src2_copy = *src2;
+        ggml_tensor dst_copy = *dst;
 
-    for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
-        const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
+        for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
+            const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
 
-        src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
-        src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
-        dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
+            src1_copy.view_offs = src1->view_offs + token_start * src1_copy.nb[2];
+            src2_copy.view_offs = src2->view_offs + token_start * src2_copy.nb[1];
+            dst_copy.view_offs = dst->view_offs + token_start * dst_copy.nb[2];
 
-        src1_copy.ne[2] = n_tokens;
-        src2_copy.ne[1] = n_tokens;
-        dst_copy.ne[2] = n_tokens;
+            src1_copy.ne[2] = n_tokens;
+            src2_copy.ne[1] = n_tokens;
+            dst_copy.ne[2] = n_tokens;
 
-        ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+            ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
+            // invalidate cached prealloc_y, can't cache based on the copy of the ggml_tensor
+            ctx->prealloc_y_last_pipeline_used = {};
+            ctx->prealloc_y_last_tensor_used = nullptr;
+        }
     }
 }
 }
@@ -10311,6 +10347,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
         ggml_vk_pool_free(ctx, buffer);
     }
     ctx->gc.temp_buffers.clear();
+    ctx->prealloc_y_last_pipeline_used = {};
 
     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
@@ -10346,6 +10383,7 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ggml_vk_destroy_buffer(ctx->prealloc_x);
     ggml_vk_destroy_buffer(ctx->prealloc_y);
    ggml_vk_destroy_buffer(ctx->prealloc_split_k);
+    ctx->prealloc_y_last_pipeline_used = {};
 
     for (auto& buffer : ctx->buffer_pool) {
         ggml_vk_destroy_buffer(buffer);
@@ -10894,6 +10932,9 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
         compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
     }
 
+    ctx->prealloc_y_last_pipeline_used = {};
+    ctx->prealloc_y_last_tensor_used = nullptr;
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
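Because the cache keys on a raw ggml_tensor pointer, the hunks above also reset it wherever that pointer could go stale: at the start of each ggml_backend_vk_graph_compute, in both cleanup paths, and after each ggml_vk_mul_mat_id_q_f16 dispatch that goes through stack-local tensor copies, whose addresses cannot serve as cache keys. A hedged sketch of that reset, reusing the hypothetical names from the sketch above:

    // Sketch: drop the cache whenever the recorded tensor pointer can no longer be trusted.
    static void invalidate_prealloc_y_cache(vk_ctx_sketch * ctx) {
        ctx->prealloc_y_last_pipeline_used = {};      // clear the pipeline handle
        ctx->prealloc_y_last_tensor_used   = nullptr;
    }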

tests/test-backend-ops.cpp

Lines changed: 36 additions & 6 deletions
@@ -3098,9 +3098,10 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
     const bool v; // whether a and b are non-contiguous views
+    const uint32_t o; // number of outputs
 
     std::string vars() override {
-        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
+        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
     }
 
     double max_nmse_err() override {
@@ -3121,8 +3122,8 @@ struct test_mul_mat : public test_case {
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
             std::array<int64_t, 4> per = {0, 1, 2, 3},
-            bool v = false)
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}
+            bool v = false, uint32_t o = 1)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3186,9 +3187,21 @@ struct test_mul_mat : public test_case {
 
         ggml_tensor * out = ggml_mul_mat(ctx, a, b);
         ggml_set_name(out, "out");
+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * out2 = ggml_mul_mat(ctx, a, b);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
 
         return out;
     }
+
+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT);
+    }
 };
 
 // GGML_OP_MUL_MAT_ID
@@ -3201,9 +3214,10 @@ struct test_mul_mat_id : public test_case {
     const int64_t m;
     const int64_t n;
     const int64_t k;
+    const uint32_t o; // number of outputs
 
     std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
+        return VARS_TO_STR9(type_a, type_b, n_mats, n_used, b, m, n, k, o);
     }
 
     double max_nmse_err() override {
@@ -3217,9 +3231,9 @@ struct test_mul_mat_id : public test_case {
 
     test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
             int n_mats = 8, int n_used = 2, bool b = false,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32)
+            int64_t m = 32, int64_t n = 32, int64_t k = 32, uint32_t o = 1)
         : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
-          m(m), n(n), k(k) {
+          m(m), n(n), k(k), o(o) {
         GGML_ASSERT(n_used <= n_mats);
     }
 
@@ -3241,6 +3255,13 @@ struct test_mul_mat_id : public test_case {
         ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
         ggml_set_name(out, "out");
 
+        for (uint32_t i = 1; i < o; ++i) {
+            ggml_tensor * a2 = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
+            ggml_tensor * out2 = ggml_mul_mat_id(ctx, a2, b, ids);
+            ggml_set_name(out2, "out2");
+            out = ggml_add(ctx, out, out2);
+        }
+
         return out;
     }
 
@@ -3264,6 +3285,13 @@ struct test_mul_mat_id : public test_case {
             }
         }
     }
+
+    bool run_whole_graph() override { return o > 1; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return ggml_op_name(GGML_OP_MUL_MAT_ID);
+    }
 };
 
 // GGML_OP_OUT_PROD
@@ -5798,6 +5826,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
 
     for (auto bs2 : {1,3}) {
         for (auto bs : {1,2,4,8}) {
@@ -5826,6 +5855,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     }
 
     test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 1, 1, false, 8, 16, 1));
+    test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, false, 32, 32, 32, 3));
 
     for (ggml_type type_a : base_types) {
         for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
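The new o ("number of outputs") parameter makes a test build o matmuls that all consume the same B tensor and sum their results, and run_whole_graph() forces the harness to evaluate them within a single graph, which is the shape that exercises the new prealloc_y cache. Roughly, the added cases construct a graph like the following sketch, written against the public ggml API (illustrative, not the test code itself):

    #include "ggml.h"

    // Two MUL_MATs sharing one B tensor: a backend that caches the converted copy
    // of B, as the Vulkan prealloc_y cache now does, converts it once and reuses it.
    static ggml_tensor * build_shared_b_graph(ggml_context * ctx) {
        ggml_tensor * a0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 32, 16); // (k = 32, m = 16)
        ggml_tensor * a1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 32, 16);
        ggml_tensor * b  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 32); // (k = 32, n = 32)

        ggml_tensor * out = ggml_mul_mat(ctx, a0, b);        // converts b on first use
        out = ggml_add(ctx, out, ggml_mul_mat(ctx, a1, b));  // same b, same pipeline: cache hit
        return out;
    }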
