@@ -434,6 +434,7 @@ struct vk_device_struct {

    vk_pipeline pipeline_matmul_split_k_reduce;
    vk_pipeline pipeline_quantize_q8_1;
+    vk_pipeline pipeline_quantize_q8_1_x4;

    vk_pipeline pipeline_dequant[GGML_TYPE_COUNT];
    vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols];
@@ -2934,8 +2935,10 @@ static void ggml_vk_load_shaders(vk_device& device) {

    if (device->subgroup_clustered && device->subgroup_require_full_support) {
        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_subgroup_len, quantize_q8_1_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_subgroup_len, quantize_q8_1_x4_subgroup_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1, true, true);
    } else {
        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
+        ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1_x4, "quantize_q8_1_x4", quantize_q8_1_x4_len, quantize_q8_1_x4_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
    }

    for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
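
Both the plain quantize_q8_1 pipeline and the new quantize_q8_1_x4 variant above are registered with the same push-constant size and the same workgroup width of 32 * device->subgroup_size / 8 invocations. A minimal sketch of that arithmetic, assuming a subgroup size of 32 (an assumed value for illustration, not something this patch fixes):

    // Illustration only: the subgroup size below is assumed; in practice it is queried from the device.
    const uint32_t subgroup_size = 32;
    const uint32_t wg_x = 32 * subgroup_size / 8;   // 128 invocations per workgroup in x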
@@ -5440,20 +5443,20 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
}

-static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
+static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type, bool use_x4_blocks) {
    switch(type) {
        case GGML_TYPE_Q8_1:
-            return ctx->device->pipeline_quantize_q8_1;
+            return use_x4_blocks ? ctx->device->pipeline_quantize_q8_1_x4 : ctx->device->pipeline_quantize_q8_1;
        default:
            std::cerr << "Missing quantize pipeline for type: " << ggml_type_name(type) << std::endl;
            GGML_ABORT("fatal error");
    }
}

-static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer&& in, vk_subbuffer&& out, uint32_t ne) {
+static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer&& in, vk_subbuffer&& out, uint32_t ne, bool use_x4_blocks = false) {
    VK_LOG_DEBUG("ggml_vk_quantize_q8_1(" << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ", " << ne << ")");

-    vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
+    vk_pipeline pipeline = use_x4_blocks ? ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true) : ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, false);

    ggml_vk_sync_buffers(subctx);
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
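
The new use_x4_blocks parameter defaults to false, so existing callers keep the plain Q8_1 path while the matrix-vector route further down opts in explicitly. A minimal usage sketch mirroring the call sites later in this diff (the src_buf/dst_buf subbuffers are placeholders; the real code passes d_Qy and d_Y):

    // Default path, dispatches pipeline_quantize_q8_1:
    ggml_vk_quantize_q8_1(ctx, subctx, { src_buf, 0, VK_WHOLE_SIZE }, { dst_buf, 0, VK_WHOLE_SIZE }, ne);
    // Opt-in path, dispatches pipeline_quantize_q8_1_x4:
    ggml_vk_quantize_q8_1(ctx, subctx, { src_buf, 0, VK_WHOLE_SIZE }, { dst_buf, 0, VK_WHOLE_SIZE }, ne, true);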
@@ -5573,7 +5576,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT

    if (quantize_y) {
-        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
+        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, false);
    }

    if (dryrun) {
@@ -5741,16 +5744,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
    const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);

    const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
-
-    const uint64_t x_ne = ne01 * ne00;
-    const uint64_t y_ne = ne11 * ne10;
-    const uint64_t d_ne = ne11 * ne01;
-
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
-    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
-    const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
-    const uint64_t d_sz = sizeof(float) * d_ne;
+    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0;

    vk_pipeline to_fp16_vk_0 = nullptr;
    vk_pipeline to_fp16_vk_1 = nullptr;
@@ -5763,8 +5757,6 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
    }

-    bool quantize_y = ctx->device->integer_dot_product && src1->type == GGML_TYPE_F32 && ggml_is_contiguous(src1) && (ne11 * ne10) % 4 == 0;
-
    // Check for mmq first
    vk_pipeline dmmv = quantize_y ? ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, GGML_TYPE_Q8_1, ne11) : nullptr;
    vk_pipeline to_q8_1 = nullptr;
@@ -5776,7 +5768,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
    }

    if (quantize_y) {
-        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
+        to_q8_1 = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1, true);
    }

    const bool qx_needs_dequant = x_non_contig;
@@ -5789,6 +5781,16 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
    GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT
    GGML_ASSERT(dmmv != nullptr);

+    const uint64_t x_ne = ne01 * ne00;
+    const uint64_t y_ne = ne11 * ne10;
+    const uint64_t d_ne = ne11 * ne01;
+
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
+    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
+    const uint64_t y_sz = quantize_y ? (y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1)) : (f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne);
+    const uint64_t d_sz = sizeof(float) * d_ne;
+
    if (dryrun) {
        const uint64_t x_sz_upd = x_sz * ne02 * ne03;
        const uint64_t y_sz_upd = y_sz * ne12 * ne13;
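
When quantize_y is set, y_sz above switches from the float/fp16 size to the Q8_1 size, y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1). As a rough sanity check, assuming the usual ggml Q8_1 layout of 32 int8 values plus two fp16 scalars per block (36 bytes per 32 elements, stated from memory rather than from this diff):

    // Illustration only; assumes ggml_type_size(GGML_TYPE_Q8_1) == 36 and ggml_blck_size(GGML_TYPE_Q8_1) == 32.
    const uint64_t y_ne_example = 4096;                      // hypothetical element count
    const uint64_t y_sz_example = y_ne_example * 36 / 32;    // 4608 bytes of Q8_1 data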
@@ -5801,7 +5803,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
            ctx->prealloc_size_x = x_sz_upd;
        }
        if ((qy_needs_dequant || quantize_y) && ctx->prealloc_size_y < y_sz_upd) {
-            ctx->prealloc_size_y = y_sz_upd;
+            ctx->prealloc_size_y = CEIL_DIV(y_sz_upd, 128) * 128;
        }

        // Request descriptor sets
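
The preallocation for the quantized y buffer is now rounded up to a multiple of 128 bytes, presumably to give the x4 quantize shader a safe write granularity at the end of the buffer. A quick sketch of the rounding, with the CEIL_DIV helper reproduced from memory (it is defined elsewhere in ggml-vulkan.cpp):

    #define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
    // Example: y_sz_upd = 4700 bytes -> CEIL_DIV(4700, 128) = 37 -> 37 * 128 = 4736 bytes reserved.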
@@ -5846,7 +5848,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        d_Y = ctx->prealloc_y;
    } else if (quantize_y) {
        d_Y = ctx->prealloc_y;
-        GGML_ASSERT(d_Y->size >= y_ne * ggml_type_size(GGML_TYPE_Q8_1) / ggml_blck_size(GGML_TYPE_Q8_1));
+        GGML_ASSERT(d_Y->size >= CEIL_DIV(y_sz * ne12 * ne13, 128) * 128);
    } else {
        d_Y = d_Qy;
        y_buf_offset = qy_buf_offset;
@@ -5862,7 +5864,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
        ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
    }
    if (quantize_y) {
-        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13);
+        ggml_vk_quantize_q8_1(ctx, subctx, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }, y_ne * ne12 * ne13, true);
    }

    // For batch_n, the A matrix is the same for each batch, and B/D use the row stride as the batch stride