sched : copy only the used experts when offloading prompt processing #15346

Merged
merged 2 commits into from Aug 20, 2025
96 changes: 87 additions & 9 deletions ggml/src/ggml-backend.cpp
@@ -19,9 +19,8 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <vector>

#ifdef __APPLE__
#include <sys/types.h>
@@ -1352,6 +1351,10 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
struct ggml_backend_sched_split * splits = sched->splits;

ggml_tensor * prev_ids_tensor = nullptr;
std::vector<int32_t> ids;
std::vector<ggml_bitset_t> used_ids;

for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &splits[i];
int split_backend_id = split->backend_id;
@@ -1378,16 +1381,91 @@
} else {
ggml_backend_synchronize(split_backend);
}

// when offloading MoE weights, we can reduce the amount of data copied by copying only the experts that are used
ggml_tensor * node = split->graph.nodes[0];
if (split->graph.n_nodes > 0 &&
ggml_backend_buffer_get_usage(input->buffer) == GGML_BACKEND_BUFFER_USAGE_WEIGHTS &&
ggml_backend_buffer_is_host(input->buffer) && (
(node->src[0] == input_cpy && node->op == GGML_OP_MUL_MAT_ID)
//|| (node->src[1] == input_cpy && node->op == GGML_OP_ADD_ID) /* GGML_OP_ADD_ID weights are small and not worth splitting */
)) {

const int64_t n_expert = node->op == GGML_OP_MUL_MAT_ID ? input->ne[2] : input->ne[1];
const size_t expert_size = node->op == GGML_OP_MUL_MAT_ID ? input->nb[2] : input->nb[1];

ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}

// get the ids
ggml_tensor * ids_tensor = node->src[2];
if (ids_tensor != prev_ids_tensor) {
ids.resize(ggml_nbytes(ids_tensor) / sizeof(int32_t));
ggml_backend_tensor_get_async(split_backend, ids_tensor, ids.data(), 0, ggml_nbytes(ids_tensor));
ggml_backend_synchronize(split_backend);

// find the used experts
used_ids.clear();
used_ids.resize(ggml_bitset_size(n_expert));
for (int64_t i1 = 0; i1 < ids_tensor->ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ids_tensor->ne[0]; i0++) {
int32_t id = ids[i1 * ids_tensor->nb[1]/sizeof(int32_t) + i0 * ids_tensor->nb[0]/sizeof(int32_t)];
ggml_bitset_set(used_ids.data(), id);
}
}

prev_ids_tensor = ids_tensor;
}

// group consecutive experts and copy them together
auto copy_experts = [&](int32_t first_id, int32_t last_id) {
const size_t expert_offset = first_id * expert_size;
const size_t expert_size_copy = (last_id - first_id + 1) * expert_size;
const size_t padding = std::min<size_t>(expert_size, 512);
const size_t padding_end = last_id < n_expert - 1 ? padding : 0;

ggml_backend_tensor_set_async(split_backend,
input_cpy,
(const uint8_t *)input->data + expert_offset, expert_offset,
// copy a bit extra at the end to ensure there are no NaNs in the padding of the last expert
// this is necessary for MMQ in the CUDA backend
expert_size_copy + padding_end);
};

int id = 0;
while (!ggml_bitset_get(used_ids.data(), id)) {
id++;
}
int32_t first_id = id;
int32_t last_id = first_id;

for (++id; id < n_expert; ++id) {
if (!ggml_bitset_get(used_ids.data(), id)) {
continue;
}

if (id == last_id + 1) {
last_id = id;
continue;
}

copy_experts(first_id, last_id);

first_id = id;
last_id = id;
}
copy_experts(first_id, last_id);
} else {
// try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
// TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
ggml_backend_synchronize(input_backend);
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
}
ggml_backend_tensor_copy(input, input_cpy);
}
}
}
}
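
For readers following the new branch above, here is a minimal standalone sketch of its two steps: scan the ids tensor for the experts that are actually routed to, then group consecutive used experts into contiguous ranges so each range needs only one host-to-device copy, padding every range that does not reach the last expert by up to 512 bytes (the NaN-padding precaution noted in the diff). The helper names (copy_range, find_used_experts, plan_expert_copies) are invented for this illustration, std::vector<char> stands in for the ggml_bitset_t helpers, and the planner merely records the offsets and sizes that a call like ggml_backend_tensor_set_async would be given; it is a sketch of the technique under those assumptions, not the scheduler code itself.

#include <cstdint>
#include <cstddef>
#include <algorithm>
#include <vector>

// One contiguous block of experts to transfer.
struct copy_range {
    size_t offset; // byte offset of the first expert in the range
    size_t size;   // bytes to copy, including any trailing padding
};

// Mark which expert ids appear in an ids matrix of ne1 rows x ne0 columns,
// addressed with ggml-style byte strides nb0/nb1 (same arithmetic as the
// loop over ids_tensor in the diff); ids are assumed to lie in [0, n_expert).
static std::vector<char> find_used_experts(const std::vector<int32_t> & ids,
        int64_t ne0, int64_t ne1, size_t nb0, size_t nb1, int64_t n_expert) {
    std::vector<char> used(n_expert, 0);
    for (int64_t i1 = 0; i1 < ne1; i1++) {
        for (int64_t i0 = 0; i0 < ne0; i0++) {
            const int32_t id = ids[i1*nb1/sizeof(int32_t) + i0*nb0/sizeof(int32_t)];
            used[id] = 1;
        }
    }
    return used;
}

// Group consecutive used experts into contiguous ranges; every range that
// does not end at the last expert gets up to 512 bytes of extra padding,
// mirroring copy_experts() in the diff.
static std::vector<copy_range> plan_expert_copies(const std::vector<char> & used,
        int64_t n_expert, size_t expert_size) {
    std::vector<copy_range> ranges;
    const size_t padding = std::min<size_t>(expert_size, 512);

    auto flush = [&](int32_t first_id, int32_t last_id) {
        const size_t padding_end = last_id < n_expert - 1 ? padding : 0;
        ranges.push_back({ first_id * expert_size,
                           (last_id - first_id + 1) * expert_size + padding_end });
    };

    // find the first used expert
    int64_t id = 0;
    while (id < n_expert && !used[id]) {
        id++;
    }
    if (id == n_expert) {
        return ranges; // nothing routed to any expert
    }
    int32_t first_id = (int32_t) id;
    int32_t last_id  = first_id;

    // extend the current range while ids stay consecutive, otherwise flush it
    for (++id; id < n_expert; ++id) {
        if (!used[id]) {
            continue;
        }
        if (id == last_id + 1) {
            last_id = (int32_t) id;
            continue;
        }
        flush(first_id, last_id);
        first_id = last_id = (int32_t) id;
    }
    flush(first_id, last_id);
    return ranges;
}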
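
A hypothetical usage of the sketch above (same translation unit as the helpers; the expert count, expert size and routing ids are made up): with 8 experts of 1 MiB each and a small batch that routes only to experts 2, 3 and 6, the planner produces two ranges totalling roughly 3 MiB instead of the full 8 MiB tensor, which is the saving the PR is after during prompt processing.

#include <cstdio>

int main() {
    const int64_t n_expert    = 8;
    const size_t  expert_size = 1024 * 1024; // 1 MiB per expert (made up)

    // 2 tokens x 2 selected experts per token, contiguous int32_t ids
    std::vector<int32_t> ids = { 2, 6,
                                 3, 2 };
    const int64_t ne0 = 2, ne1 = 2;
    const size_t  nb0 = sizeof(int32_t), nb1 = ne0 * sizeof(int32_t);

    std::vector<char>       used = find_used_experts(ids, ne0, ne1, nb0, nb1, n_expert);
    std::vector<copy_range> plan = plan_expert_copies(used, n_expert, expert_size);

    for (const copy_range & r : plan) {
        std::printf("copy %zu bytes at offset %zu\n", r.size, r.offset);
    }
    // expected: one range for experts 2-3 (2 MiB + 512 B padding)
    //           and one range for expert 6 (1 MiB + 512 B padding)
    return 0;
}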