@@ -1,9 +1,12 @@
 #include "ggml-metal-common.h"
 
 #include "ggml-impl.h"
+#include "ggml-backend-impl.h"
 
 #include <vector>
 
+// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb)
+// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it)
 struct ggml_mem_range {
     uint64_t pb; // buffer id
 
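To make the terminology concrete, here is a standalone toy (invented names, not the ggml API) showing how a single op maps onto such ranges: each input contributes a src range and the output a dst range.

// standalone toy (invented names, not the ggml API): an op reading two inputs and
// writing one output contributes two src ranges and one dst range
#include <cstdint>
#include <cstdio>

enum toy_range_type { TOY_RANGE_SRC, TOY_RANGE_DST };

struct toy_range {
    uint64_t       pb; // buffer id
    uint64_t       p0; // start of the interval
    uint64_t       p1; // end of the interval (exclusive)
    toy_range_type pt; // src: the op reads from it, dst: the op writes to it
};

static toy_range make_range(uint64_t buf, uint64_t offs, uint64_t size, toy_range_type pt) {
    return { buf, offs, offs + size, pt };
}

int main() {
    // "dst = add(a, b)" with all three tensors living in buffer 7
    const toy_range ranges[] = {
        make_range(7,    0, 4096, TOY_RANGE_SRC), // a
        make_range(7, 4096, 4096, TOY_RANGE_SRC), // b
        make_range(7, 8192, 4096, TOY_RANGE_DST), // dst
    };

    for (const auto & r : ranges) {
        printf("buf=%llu [%llu, %llu) %s\n",
               (unsigned long long) r.pb, (unsigned long long) r.p0, (unsigned long long) r.p1,
               r.pt == TOY_RANGE_SRC ? "src" : "dst");
    }
    return 0;
}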
@@ -36,8 +39,8 @@ void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) {
     mrs->ranges.clear();
 }
 
-static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) {
-    mrs->ranges.push_back(mrp);
+static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mr) {
+    mrs->ranges.push_back(mr);
 
     return true;
 }
@@ -48,28 +51,32 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm
 
     GGML_ASSERT(!tensor->view_src);
 
-    ggml_mem_range mrp;
+    ggml_mem_range mr;
 
     if (tensor->buffer) {
-        // when the tensor is allocated, use the actual memory address range of the buffer
-        mrp = {
+        // when the tensor is allocated, use the actual memory address range in the buffer
+        //
+        // take the actual allocated size with ggml_backend_buft_get_alloc_size()
+        // this can be larger than the tensor size if the buffer type allocates extra memory
+        // ref: https://github.com/ggml-org/llama.cpp/pull/15966
+        mr = {
             /*.pb =*/ (uint64_t) tensor->buffer,
             /*.p0 =*/ (uint64_t) tensor->data,
-            /*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor),
+            /*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor),
             /*.pt =*/ pt,
         };
     } else {
-        // otherwise, the tensor ptr is used as an unique id of the memory ranges
+        // otherwise, the pointer address is used as an unique id of the memory ranges
         // that the tensor will be using when it is allocated
-        mrp = {
+        mr = {
             /*.pb =*/ (uint64_t) tensor,
             /*.p0 =*/ 0,    //
             /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used
             /*.pt =*/ pt,
        };
    };
 
-    return mrp;
+    return mr;
 }
 
 static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) {
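Why the range ends at the allocated size rather than at ggml_nbytes() can be illustrated with made-up numbers; the padding rule below is an assumption for the example, not the behavior of any particular ggml buffer type.

// illustrative only: a hypothetical buffer type that rounds allocations up to 256 bytes
#include <cstdint>
#include <cstdio>

static uint64_t toy_alloc_size(uint64_t nbytes) {
    return (nbytes + 255) / 256 * 256;
}

int main() {
    const uint64_t data   = 0;                      // tensor start address inside the buffer
    const uint64_t nbytes = 1000;                   // logical size (what ggml_nbytes would report)
    const uint64_t alloc  = toy_alloc_size(nbytes); // what the buffer actually reserves: 1024

    // an op producing this tensor may touch the whole allocated region, including the
    // padding bytes in [1000, 1024); a range that ends at data + nbytes would not cover them
    printf("range from nbytes: [%llu, %llu), range from alloc size: [%llu, %llu)\n",
           (unsigned long long) data, (unsigned long long) (data + nbytes),
           (unsigned long long) data, (unsigned long long) (data + alloc));
    return 0;
}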
@@ -83,25 +90,25 @@ static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor)
 static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
     GGML_ASSERT(tensor);
 
-    ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+    ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);
 
     if (mrs->debug > 2) {
-        GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
+        GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
     }
 
-    return ggml_mem_ranges_add(mrs, mrp);
+    return ggml_mem_ranges_add(mrs, mr);
 }
 
 static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
     GGML_ASSERT(tensor);
 
-    ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+    ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);
 
     if (mrs->debug > 2) {
-        GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1);
+        GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1);
     }
 
-    return ggml_mem_ranges_add(mrs, mrp);
+    return ggml_mem_ranges_add(mrs, mr);
 }
 
 bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
@@ -114,24 +121,26 @@ bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
     return ggml_mem_ranges_add_dst(mrs, tensor);
 }
 
-static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) {
+static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr) {
     for (size_t i = 0; i < mrs->ranges.size(); i++) {
         const auto & cmp = mrs->ranges[i];
 
-        if (mrp.pb != cmp.pb) {
+        // two memory ranges cannot intersect if they are in different buffers
+        if (mr.pb != cmp.pb) {
             continue;
         }
 
-        if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
+        // intersecting source ranges are allowed
+        if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) {
             continue;
         }
 
-        if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) {
+        if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) {
             if (mrs->debug > 2) {
                 GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n",
                         __func__,
-                        mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
-                        mrp.pb, mrp.p0, mrp.p1,
+                        mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
+                        mr.pb, mr.p0, mr.p1,
                         cmp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst",
                         cmp.pb, cmp.p0, cmp.p1);
             }
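As a quick sanity check of the interval condition used above, here is a standalone snippet (illustrative only) with concrete numbers.

// quick check of the interval test with concrete numbers; the ranges are half-open [p0, p1)
#include <cassert>
#include <cstdint>

// same condition as in ggml_mem_ranges_check, with (p0, p1) playing the role of mr
// and (q0, q1) the role of cmp
static bool overlaps(uint64_t p0, uint64_t p1, uint64_t q0, uint64_t q1) {
    return p0 < q1 && p1 >= q0;
}

int main() {
    assert( overlaps( 64, 256,   0, 128)); // [64, 256) intersects [0, 128)
    assert(!overlaps(256, 512,   0, 128)); // disjoint ranges in the same buffer
    assert( overlaps(  0, 128, 128, 256)); // a range ending exactly where another begins is
                                           // also flagged by the >= (a conservative outcome:
                                           // it only costs concurrency, never correctness)
    return 0;
}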
@@ -146,19 +155,19 @@ static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr
 static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
     GGML_ASSERT(tensor);
 
-    ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor);
+    ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor);
 
-    const bool res = ggml_mem_ranges_check(mrs, mrp);
+    const bool res = ggml_mem_ranges_check(mrs, mr);
 
     return res;
 }
 
 static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) {
     GGML_ASSERT(tensor);
 
-    ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor);
+    ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor);
 
-    const bool res = ggml_mem_ranges_check(mrs, mrp);
+    const bool res = ggml_mem_ranges_check(mrs, mr);
 
     return res;
 }
@@ -222,6 +231,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
             }
         }
 
+        // keep track of the sources of the fused nodes as well
        for (const auto * fused : node.fused) {
            for (int i = 0; i < GGML_MAX_SRC; i++) {
                if (fused->src[i]) {
@@ -290,7 +300,10 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
 
     std::vector<bool> used(n, false);
 
+    // the memory ranges for the set of currently concurrent nodes
     ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0);
+
+    // the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder
     ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0);
 
     for (int i0 = 0; i0 < n; i0++) {
@@ -329,7 +342,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
 
             const bool is_empty = node1.is_empty();
 
-            // to add a concurrent node, it has to be:
+            // to reorder a node and add it to the concurrent set, it has to be:
             // + empty or concurrent with all nodes in the existing concurrent set (mrs0)
             // + concurrent with all nodes prior to it that haven't been processed yet (mrs1)
             if ((is_empty || h_check(mrs0, node1)) && h_check(mrs1, node1)) {
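The two trackers drive a greedy reordering pass. The following standalone toy (invented types and names; the real pass tracks byte ranges per buffer and also handles empty and fused nodes) shows the shape of that loop: a node may be pulled forward into the current concurrent set only if it conflicts neither with that set (the role of mrs0) nor with any earlier unprocessed node it would jump over (the role of mrs1).

// toy sketch of the greedy reordering idea (illustrative only)
#include <cstdio>
#include <set>
#include <vector>

struct toy_node {
    std::set<int> reads;
    std::set<int> writes;
};

// hazard: write/read, read/write or write/write on the same resource
static bool conflicts(const toy_node & a, const toy_node & b) {
    for (int w : a.writes) {
        if (b.reads.count(w) || b.writes.count(w)) return true;
    }
    for (int w : b.writes) {
        if (a.reads.count(w)) return true;
    }
    return false;
}

static bool conflicts_any(const std::vector<toy_node> & group, const toy_node & n) {
    for (const auto & m : group) {
        if (conflicts(m, n)) return true;
    }
    return false;
}

int main() {
    std::vector<toy_node> nodes = {
        { {0},    {1} },  // 0: reads 0, writes 1
        { {1},    {2} },  // 1: depends on 0
        { {0},    {3} },  // 2: independent of 0 and 1 -> can be moved next to 0
        { {2, 3}, {4} },  // 3: depends on 1 and 2
    };

    const int n = (int) nodes.size();

    std::vector<int>  order;
    std::vector<bool> used(n, false);

    for (int i0 = 0; i0 < n; i0++) {
        if (used[i0]) continue;

        // start a new concurrent set with node i0
        std::vector<toy_node> cur  = { nodes[i0] }; // plays the role of mrs0
        std::vector<toy_node> rest;                 // plays the role of mrs1
        order.push_back(i0);
        used[i0] = true;

        // look forward for nodes that can join the current concurrent set
        for (int i1 = i0 + 1; i1 < n; i1++) {
            if (used[i1]) continue;

            // the candidate must be compatible with the concurrent set (mrs0) and with
            // every earlier unprocessed node it would jump over (mrs1)
            if (!conflicts_any(cur, nodes[i1]) && !conflicts_any(rest, nodes[i1])) {
                cur.push_back(nodes[i1]);
                order.push_back(i1);
                used[i1] = true;
            } else {
                rest.push_back(nodes[i1]);
            }
        }
    }

    for (int i : order) printf("%d ", i); // prints: 0 2 1 3
    printf("\n");
}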
@@ -419,8 +432,8 @@ void ggml_metal_graph_optimize(ggml_cgraph * gf) {
         nodes.push_back(std::move(node));
     }
 
-    // reorder to improve concurrency
 #if 1
+    // reorder to improve concurrency
     const auto order = ggml_metal_graph_optimize_reorder(nodes);
 #else
     std::vector<int> order(nodes.size());