File tree 2 files changed +11
-0
lines changed
2 files changed +11
-0
lines changed Original file line number Diff line number Diff line change @@ -16,6 +16,7 @@ class DlinferAttentionMetadata(AttentionMetadata):
16
16
max_q_seq_len : int = 1
17
17
max_kv_seq_len : int = 1
18
18
quant_meta : Dict = None
19
+ cu_seq_lens_kv : Optional [Tensor ] = None
19
20
20
21
21
22
class DlinferAttentionImpl (AttentionImpl [DlinferAttentionMetadata ]):
@@ -79,6 +80,8 @@ def forward(
79
80
max_q_seq_len = attn_metadata .max_q_seq_len
80
81
max_kv_seq_len = attn_metadata .max_kv_seq_len
81
82
quant_bits = attn_metadata .quant_policy
83
+ cu_seq_lens_kv = attn_metadata .cu_seq_lens_kv
84
+
82
85
if attn_metadata .quant_meta is not None :
83
86
k_scales_zeros = [
84
87
next (attn_metadata .quant_meta ['k_scales' ]),
@@ -128,6 +131,7 @@ def forward(
128
131
q_start_loc = q_start_loc ,
129
132
q_seqlens = q_seqlens ,
130
133
kv_seqlens = kv_seqlens ,
134
+ cu_seq_lens_kv = cu_seq_lens_kv ,
131
135
max_q_seq_len = max_q_seq_len ,
132
136
max_kv_seq_len = max_kv_seq_len ,
133
137
is_decoding = is_decoding ,
Original file line number Diff line number Diff line change @@ -15,7 +15,9 @@ def prefill_attention(
15
15
q_start_loc : Tensor ,
16
16
q_seq_len : Tensor ,
17
17
kv_seq_len : Tensor ,
18
+ cu_seq_lens_kv : Tensor ,
18
19
max_q_seq_len : int ,
20
+ max_kv_seq_len : int ,
19
21
block_size : int ,
20
22
attn_mask : Sequence [Optional [Tensor ]],
21
23
is_unpaged_prefill : Optional [bool ],
@@ -51,7 +53,9 @@ def prefill_attention(
51
53
q_start_loc ,
52
54
q_seq_len ,
53
55
kv_seq_len ,
56
+ cu_seq_lens_kv ,
54
57
max_q_seq_len ,
58
+ max_kv_seq_len ,
55
59
num_q_heads ,
56
60
num_kv_heads ,
57
61
attn_mask ,
@@ -105,6 +109,7 @@ def paged_attention_fwd(
105
109
q_start_loc : Tensor ,
106
110
q_seqlens : Tensor ,
107
111
kv_seqlens : Tensor ,
112
+ cu_seq_lens_kv : Tensor ,
108
113
max_q_seq_len : int ,
109
114
max_kv_seq_len : int ,
110
115
is_decoding : bool ,
@@ -127,7 +132,9 @@ def paged_attention_fwd(
127
132
q_start_loc ,
128
133
q_seqlens ,
129
134
kv_seqlens ,
135
+ cu_seq_lens_kv ,
130
136
max_q_seq_len ,
137
+ max_kv_seq_len ,
131
138
block_size ,
132
139
attn_mask ,
133
140
is_unpaged_prefill ,
You can’t perform that action at this time.
0 commit comments