From 3975be0712a455d325a4162f671ecc50253b4bdd Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:43 -0700
Subject: [PATCH 01/28] cover

Signed-off-by: Roman Gushchin
---
 cover.txt | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 cover.txt

diff --git a/cover.txt b/cover.txt
new file mode 100644
index 0000000000000..149b0cf659b79
--- /dev/null
+++ b/cover.txt
@@ -0,0 +1,82 @@
+mm: BPF OOM
+
+This patchset adds the ability to customize out-of-memory handling
+using bpf.
+
+It focuses on two parts:
+1) OOM handling policy,
+2) PSI-based OOM invocation.
+
+The idea to use bpf for customizing OOM handling is not new, but
+unlike the previous proposal [1], which augmented the existing task
+ranking policy, this one tries to be as generic as possible and
+leverage the full power of modern bpf.
+
+It provides a generic interface which is called before the existing OOM
+killer code and allows implementing any policy, e.g. picking a victim
+task or memory cgroup, or potentially even releasing memory in other
+ways, e.g. deleting tmpfs files (the last one might require some
+additional but relatively simple changes).
+
+The past attempt to implement a memory-cgroup aware policy [2] showed
+that there are multiple opinions on what the best policy is. As it's
+highly workload-dependent and specific to a concrete way of organizing
+workloads, the structure of the cgroup tree etc, a customizable
+bpf-based implementation is preferable over an in-kernel implementation
+with a dozen sysctls.
+
+The second part is related to the fundamental question of when to
+declare the OOM event. It's a trade-off between the risk of
+unnecessary OOM kills and the associated work losses and the risk of
+infinite thrashing and effective soft lockups. In the last few years
+several PSI-based userspace solutions were developed (e.g. OOMd [3] or
+systemd-OOMd [4]). The common idea was to use userspace daemons to
+implement custom OOM logic as well as rely on PSI monitoring to avoid
+stalls. In this scenario the userspace daemon was supposed to handle
+the majority of OOMs, while the in-kernel OOM killer worked as the
+last-resort measure to guarantee that the system would never deadlock
+on memory. But this approach creates additional infrastructure churn:
+a userspace OOM daemon is a separate entity which needs to be
+deployed, updated and monitored. A completely different pipeline needs
+to be built to monitor both types of OOM events and collect the
+associated logs. A userspace daemon is also more restricted in terms
+of what data is available to it. Finally, implementing a daemon which
+can work reliably under heavy memory pressure is tricky.
+
+This patchset includes the code, tests and many ideas from the patchset
+of JP Kobryn, which implemented bpf kfuncs to provide a faster method
+to access memcg data [5].
+
+[1]: https://lwn.net/ml/linux-kernel/20230810081319.65668-1-zhouchuyi@bytedance.com/
+[2]: https://lore.kernel.org/lkml/20171130152824.1591-1-guro@fb.com/
+[3]: https://github.com/facebookincubator/oomd
+[4]: https://www.freedesktop.org/software/systemd/man/latest/systemd-oomd.service.html
+[5]: https://lkml.org/lkml/2025/10/15/1554
+
+----
+v2:
+ 1) A single bpf_oom can be attached system-wide and a single bpf_oom per memcg.
+    (by Alexei Starovoitov)
+ 2) Initial support for attaching struct ops to cgroups (Martin KaFai Lau,
+    Andrii Nakryiko and others)
+ 3) bpf memcontrol kfuncs enhancements and tests (co-developed by JP Kobryn)
+ 4) Many small-ish fixes and cleanups (suggested by Andrew Morton, Suren Baghdasaryan,
+    Andrii Nakryiko and Kumar Kartikeya Dwivedi)
+ 5) bpf_out_of_memory() takes a u64 flags argument instead of a bool wait_on_oom_lock
+    (suggested by Kumar Kartikeya Dwivedi)
+ 6) bpf_get_mem_cgroup() got the KF_RCU flag (suggested by Kumar Kartikeya Dwivedi)
+ 7) cgroup online and offline callbacks for bpf_psi, cgroup offline for bpf_oom
+
+v1:
+ 1) Both OOM and PSI parts are now implemented using bpf struct ops,
+    providing a path for future extensions (suggested by Kumar Kartikeya Dwivedi,
+    Song Liu and Matt Bobrowski)
+ 2) It's possible to create PSI triggers from BPF, no need for an additional
+    userspace agent. (suggested by Suren Baghdasaryan)
+    Also there is now a callback for the cgroup release event.
+ 3) Added an ability to block on oom_lock instead of bailing out (suggested by Michal Hocko)
+ 4) Added bpf_task_is_oom_victim (suggested by Michal Hocko)
+ 5) PSI callbacks are scheduled using a separate workqueue (suggested by Suren Baghdasaryan)
+
+RFC:
+ https://lwn.net/ml/all/20250428033617.3797686-1-roman.gushchin@linux.dev/

From 3bdd2c00b75828bc6dc621ab4e08edf41825fabc Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 10 Oct 2025 16:21:01 -0700
Subject: [PATCH 02/28] bpf: move bpf_struct_ops_link into bpf.h

Move struct bpf_struct_ops_link's definition into bpf.h, where the
definitions of other custom bpf links are. It's necessary to access its
members from outside of the generic bpf_struct_ops implementation,
which will be done by the following patches in the series.

Signed-off-by: Roman Gushchin
---
 include/linux/bpf.h         | 6 ++++++
 kernel/bpf/bpf_struct_ops.c | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e53cda0aabb68..01f71059f152e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1837,6 +1837,12 @@ struct bpf_raw_tp_link {
 	u64 cookie;
 };
 
+struct bpf_struct_ops_link {
+	struct bpf_link link;
+	struct bpf_map __rcu *map;
+	wait_queue_head_t wait_hup;
+};
+
 struct bpf_link_primer {
 	struct bpf_link *link;
 	struct file *file;
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index a41e6730edcf3..45cc5ee19dc24 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -55,12 +55,6 @@ struct bpf_struct_ops_map {
 	struct bpf_struct_ops_value kvalue;
 };
 
-struct bpf_struct_ops_link {
-	struct bpf_link link;
-	struct bpf_map __rcu *map;
-	wait_queue_head_t wait_hup;
-};
-
 static DEFINE_MUTEX(update_mutex);
 
 #define VALUE_PREFIX "bpf_struct_ops_"

From 31cc1aee92637095ea68570573a3f598549a00ff Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 10 Oct 2025 16:28:41 -0700
Subject: [PATCH 03/28] bpf: initial support for attaching struct ops to
 cgroups

When a struct ops is being attached and a bpf link is created, allow
passing a cgroup fd using bpf attr, so that the struct ops can be
attached to a cgroup instead of globally. An attached struct ops
doesn't hold a reference to the cgroup, it only preserves the cgroup
id.
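For illustration, a userspace sketch of creating such a link
(hypothetical code, not part of this patch; it assumes an already
loaded struct ops map and uses the raw bpf() syscall directly):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/bpf.h>
	#include <sys/syscall.h>

	/* Attach a loaded struct ops map to a cgroup by passing the cgroup
	 * fd in bpf_attr's link_create.cgroup.relative_fd, as this patch
	 * expects. The kernel resolves the fd to a cgroup id and doesn't
	 * keep the fd, so it can be closed right after the call.
	 */
	static int struct_ops_attach_to_cgroup(int map_fd, const char *cgroup_path)
	{
		union bpf_attr attr;
		int cgroup_fd, link_fd;

		cgroup_fd = open(cgroup_path, O_RDONLY);
		if (cgroup_fd < 0)
			return -1;

		memset(&attr, 0, sizeof(attr));
		attr.link_create.map_fd = map_fd;	/* struct ops map */
		attr.link_create.attach_type = BPF_STRUCT_OPS;
		attr.link_create.cgroup.relative_fd = cgroup_fd;

		link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));
		close(cgroup_fd);
		return link_fd;
	}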
Signed-off-by: Roman Gushchin
---
 include/linux/bpf.h         |  1 +
 kernel/bpf/bpf_struct_ops.c | 13 +++++++++++++
 2 files changed, 14 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 01f71059f152e..4abef08b3ed90 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1841,6 +1841,7 @@ struct bpf_struct_ops_link {
 	struct bpf_link link;
 	struct bpf_map __rcu *map;
 	wait_queue_head_t wait_hup;
+	u64 cgroup_id;
 };
 
 struct bpf_link_primer {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 45cc5ee19dc24..58664779a2b6f 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 
 struct bpf_struct_ops_value {
 	struct bpf_struct_ops_common_value common;
@@ -1359,6 +1360,18 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	}
 	bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL,
 		      attr->link_create.attach_type);
+#ifdef CONFIG_CGROUPS
+	if (attr->link_create.cgroup.relative_fd) {
+		struct cgroup *cgrp;
+
+		cgrp = cgroup_get_from_fd(attr->link_create.cgroup.relative_fd);
+		if (IS_ERR(cgrp))
+			return PTR_ERR(cgrp);
+
+		link->cgroup_id = cgroup_id(cgrp);
+		cgroup_put(cgrp);
+	}
+#endif /* CONFIG_CGROUPS */
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err)

From 3b1a9c85a21339154016c5a7027ddab4483eaac9 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:43 -0700
Subject: [PATCH 04/28] bpf: mark struct oom_control's memcg field as
 TRUSTED_OR_NULL

Struct oom_control is used to describe the OOM context. Its memcg field
defines the scope of the OOM: it's NULL for global OOMs and a valid
memcg pointer for memcg-scoped OOMs.

Teach the bpf verifier to recognize it as a trusted-or-NULL pointer. It
will provide the bpf OOM handler a trusted memcg pointer, which for
example is required for iterating the memcg's subtree.

Signed-off-by: Roman Gushchin
Acked-by: Kumar Kartikeya Dwivedi
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6d175849e57ac..7ef954760078d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7101,6 +7101,10 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
 	struct file *vm_file;
 };
 
+BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control) {
+	struct mem_cgroup *memcg;
+};
+
 static bool type_is_rcu(struct bpf_verifier_env *env,
 			struct bpf_reg_state *reg,
 			const char *field_name, u32 btf_id)
@@ -7143,6 +7147,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env,
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry));
 	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct));
+	BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct oom_control));
 
 	return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id,
 					  "__safe_trusted_or_null");

From 5ea3213fa29bd6c865d83346d012996e4748f423 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 10 Oct 2025 16:22:18 -0700
Subject: [PATCH 05/28] mm: define mem_cgroup_get_from_ino() outside of
 CONFIG_SHRINKER_DEBUG

mem_cgroup_get_from_ino() can be reused by the BPF OOM implementation,
but currently it depends on CONFIG_SHRINKER_DEBUG. Remove this
dependency.
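For context, this is roughly how later patches in this series use it (a
sketch; on cgroup2 the cgroup id is the inode number of the cgroup
directory):

	struct mem_cgroup *memcg;

	/* Resolve a cgroup id to a memory cgroup reference. */
	memcg = mem_cgroup_get_from_ino(cgroup_id);
	if (IS_ERR_OR_NULL(memcg))
		return -ENOENT;

	/* ... use the memcg ... */

	mem_cgroup_put(memcg);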
Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d9..9af9ae28afe75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -832,9 +832,9 @@ static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return memcg ? cgroup_ino(memcg->css.cgroup) : 0; } +#endif struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { @@ -1331,12 +1331,12 @@ static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) { return 0; } +#endif static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { return NULL; } -#endif static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4deda33625f41..5d27cd5372aa0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3618,7 +3618,6 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return xa_load(&mem_cgroup_ids, id); } -#ifdef CONFIG_SHRINKER_DEBUG struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) { struct cgroup *cgrp; @@ -3639,7 +3638,6 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) return memcg; } -#endif static void free_mem_cgroup_per_node_info(struct mem_cgroup_per_node *pn) { From 60ff610735a1a7e5b0106bd6294425b01b2f6bee Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 17 Oct 2025 11:47:24 -0700 Subject: [PATCH 06/28] mm: declare memcg_page_state_output() in memcontrol.h To use memcg_page_state_output() in bpf_memcontrol.c move the declaration from v1-specific memcontrol-v1.h to memcontrol.h. Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 1 + mm/memcontrol-v1.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9af9ae28afe75..50d851ff3f27d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -949,6 +949,7 @@ static inline void mod_memcg_page_state(struct page *page, } unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); +unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx); unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx); diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index 6358464bb4160..a304ad418cdfe 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -27,7 +27,6 @@ unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); void drain_all_stock(struct mem_cgroup *root_memcg); unsigned long memcg_events(struct mem_cgroup *memcg, int event); -unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item); int memory_stat_show(struct seq_file *m, void *v); void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n); From e70cb51b60e62a94b18bfc7fa711f3f3afd0a7c0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Aug 2025 18:05:43 -0700 Subject: [PATCH 07/28] mm: introduce BPF struct ops for OOM handling Introduce a bpf struct ops for implementing custom OOM handling policies. It's possible to load one bpf_oom_ops for the system and one bpf_oom_ops for every memory cgroup. 
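A minimal policy implementing this struct ops might look as follows (an
illustrative sketch, not part of this patch; it assumes vmlinux.h
exposes struct oom_control and struct bpf_oom_ctx, and uses the
bpf_oom_kill_process() kfunc added later in this series; the semantics
of the callbacks and fields referenced here are described below):

	// SPDX-License-Identifier: GPL-2.0
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	char _license[] SEC("license") = "GPL";

	extern int bpf_oom_kill_process(struct oom_control *oc,
					struct task_struct *task,
					const char *message__str) __ksym;

	SEC("struct_ops.s/handle_out_of_memory")
	int BPF_PROG(handle_out_of_memory, struct oom_control *oc,
		     struct bpf_oom_ctx *ctx)
	{
		/* Trivial policy: try to kill the allocating task. */
		if (!bpf_oom_kill_process(oc, bpf_get_current_task_btf(),
					  "killed by bpf_test_policy"))
			return 1;	/* some memory was freed */

		return 0;		/* defer to the next OOM handler */
	}

	SEC(".struct_ops.link")
	struct bpf_oom_ops test_oom_ops = {
		.name			= "bpf_test_policy",
		.handle_out_of_memory	= (void *)handle_out_of_memory,
	};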
In case of a memcg OOM, the cgroup tree is traversed from the OOM'ing
memcg up to the root and the corresponding BPF OOM handlers are
executed until some memory is freed. If no memory is freed, the kernel
OOM killer is invoked.

The struct ops provides the bpf_handle_out_of_memory() callback, which
is expected to return 1 if it was able to free some memory and 0
otherwise. If 1 is returned, the kernel also checks the
bpf_memory_freed field of the oom_control structure, which is expected
to be set by kfuncs suitable for releasing memory. If both are set, the
OOM is considered handled, otherwise the next OOM handler in the chain
(e.g. the BPF OOM attached to the parent cgroup or the in-kernel OOM
killer) is executed.

The bpf_handle_out_of_memory() callback program is sleepable to enable
using iterators, e.g. cgroup iterators. The callback receives struct
oom_control as an argument, so it can determine the scope of the OOM
event: whether this is a memcg-wide or system-wide OOM.

The callback is executed just before the kernel victim task selection
algorithm, so all heuristics and sysctls like panic on oom and
sysctl_oom_kill_allocating_task are respected.

BPF OOM struct ops provides the handle_cgroup_offline() callback,
which is useful for releasing the struct ops when the corresponding
cgroup is deleted.

The struct ops also has the name field, which allows defining a custom
name for the implemented policy. It's printed in the OOM report in the
oom_policy=<name> format. "default" is printed if bpf is not used or
the policy name is not specified.

[  112.696676] test_progs invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=0, oom_score_adj=0 oom_policy=bpf_test_policy
[  112.698160] CPU: 1 UID: 0 PID: 660 Comm: test_progs Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full)
[  112.698165] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014
[  112.698167] Call Trace:
[  112.698177]
[  112.698182]  dump_stack_lvl+0x4d/0x70
[  112.698192]  dump_header+0x59/0x1c6
[  112.698199]  oom_kill_process.cold+0x8/0xef
[  112.698206]  bpf_oom_kill_process+0x59/0xb0
[  112.698216]  bpf_prog_7ecad0f36a167fd7_test_out_of_memory+0x2be/0x313
[  112.698229]  bpf__bpf_oom_ops_handle_out_of_memory+0x47/0xaf
[  112.698236]  ? srso_alias_return_thunk+0x5/0xfbef5
[  112.698240]  bpf_handle_oom+0x11a/0x1e0
[  112.698250]  out_of_memory+0xab/0x5c0
[  112.698258]  mem_cgroup_out_of_memory+0xbc/0x110
[  112.698274]  try_charge_memcg+0x4b5/0x7e0
[  112.698288]  charge_memcg+0x2f/0xc0
[  112.698293]  __mem_cgroup_charge+0x30/0xc0
[  112.698299]  do_anonymous_page+0x40f/0xa50
[  112.698311]  __handle_mm_fault+0xbba/0x1140
[  112.698317]  ?
srso_alias_return_thunk+0x5/0xfbef5 [ 112.698335] handle_mm_fault+0xe6/0x370 [ 112.698343] do_user_addr_fault+0x211/0x6a0 [ 112.698354] exc_page_fault+0x75/0x1d0 [ 112.698363] asm_exc_page_fault+0x26/0x30 [ 112.698366] RIP: 0033:0x7fa97236db00 Signed-off-by: Roman Gushchin --- include/linux/bpf_oom.h | 73 +++++++++++ include/linux/memcontrol.h | 5 + include/linux/oom.h | 8 ++ mm/Makefile | 3 + mm/bpf_oom.c | 246 +++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 2 + mm/oom_kill.c | 22 +++- 7 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 include/linux/bpf_oom.h create mode 100644 mm/bpf_oom.c diff --git a/include/linux/bpf_oom.h b/include/linux/bpf_oom.h new file mode 100644 index 0000000000000..d93dba501a006 --- /dev/null +++ b/include/linux/bpf_oom.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ + +#ifndef __BPF_OOM_H +#define __BPF_OOM_H + +struct oom_control; + +#define BPF_OOM_NAME_MAX_LEN 64 + +struct bpf_oom_ctx { + /* + * If bpf_oom_ops is attached to a cgroup, id of this cgroup. + * 0 otherwise. + */ + u64 cgroup_id; +}; + +struct bpf_oom_ops { + /** + * @handle_out_of_memory: Out of memory bpf handler, called before + * the in-kernel OOM killer. + * @oc: OOM control structure + * @ctx: Execution context + * + * Should return 1 if some memory was freed up, otherwise + * the in-kernel OOM killer is invoked. + */ + int (*handle_out_of_memory)(struct oom_control *oc, struct bpf_oom_ctx *ctx); + + /** + * @handle_cgroup_offline: Cgroup offline callback + * @cgroup_id: Id of deleted cgroup + * + * Called if the cgroup with the attached bpf_oom_ops is deleted. + */ + void (*handle_cgroup_offline)(u64 cgroup_id, struct bpf_oom_ctx *ctx); + + /** + * @name: BPF OOM policy name + */ + char name[BPF_OOM_NAME_MAX_LEN]; +}; + +#ifdef CONFIG_BPF_SYSCALL +/** + * @bpf_handle_oom: handle out of memory condition using bpf + * @oc: OOM control structure + * + * Returns true if some memory was freed. + */ +bool bpf_handle_oom(struct oom_control *oc); + + +/** + * @bpf_oom_memcg_offline: handle memcg offlining + * @memcg: Memory cgroup is offlined + * + * When a memory cgroup is about to be deleted and there is an + * attached BPF OOM structure, it has to be detached. + */ +void bpf_oom_memcg_offline(struct mem_cgroup *memcg); + +#else /* CONFIG_BPF_SYSCALL */ +static inline bool bpf_handle_oom(struct oom_control *oc) +{ + return false; +} + +static inline void bpf_oom_memcg_offline(struct mem_cgroup *memcg) {} + +#endif /* CONFIG_BPF_SYSCALL */ + +#endif /* __BPF_OOM_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 50d851ff3f27d..39a6c7c8735ba 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,6 +29,7 @@ struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; +struct bpf_oom_ops; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { @@ -226,6 +227,10 @@ struct mem_cgroup { */ bool oom_group; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_oom_ops *bpf_oom; +#endif + int swappiness; /* memory.events and memory.events.local */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 7b02bc1d0a7ea..721087952d043 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -51,6 +51,14 @@ struct oom_control { /* Used to print the constraint info. 
*/ enum oom_constraint constraint; + +#ifdef CONFIG_BPF_SYSCALL + /* Used by the bpf oom implementation to mark the forward progress */ + bool bpf_memory_freed; + + /* Policy name */ + const char *bpf_policy_name; +#endif }; extern struct mutex oom_lock; diff --git a/mm/Makefile b/mm/Makefile index 21abb33535501..051e88c699afb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,6 +105,9 @@ obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o ifdef CONFIG_SWAP obj-$(CONFIG_MEMCG) += swap_cgroup.o endif +ifdef CONFIG_BPF_SYSCALL +obj-y += bpf_oom.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_DMAPOOL_TEST) += dmapool_test.o diff --git a/mm/bpf_oom.c b/mm/bpf_oom.c new file mode 100644 index 0000000000000..a7e021c9db44b --- /dev/null +++ b/mm/bpf_oom.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BPF-driven OOM killer customization + * + * Author: Roman Gushchin + */ + +#include +#include +#include +#include +#include +#include + +DEFINE_STATIC_SRCU(bpf_oom_srcu); +static struct bpf_oom_ops *system_bpf_oom; + +static int bpf_ops_handle_oom(struct bpf_oom_ops *bpf_oom_ops, + struct mem_cgroup *memcg, + struct oom_control *oc) +{ + struct bpf_oom_ctx exec_ctx; + int ret; + + if (memcg) + exec_ctx.cgroup_id = cgroup_id(memcg->css.cgroup); + else + exec_ctx.cgroup_id = 0; + + oc->bpf_policy_name = &bpf_oom_ops->name[0]; + oc->bpf_memory_freed = false; + ret = bpf_oom_ops->handle_out_of_memory(oc, &exec_ctx); + oc->bpf_policy_name = NULL; + + return ret; +} + +bool bpf_handle_oom(struct oom_control *oc) +{ + struct bpf_oom_ops *bpf_oom_ops = NULL; + struct mem_cgroup *memcg; + int idx, ret = 0; + + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ + idx = srcu_read_lock(&bpf_oom_srcu); + + /* Find the nearest bpf_oom_ops traversing the cgroup tree upwards */ + for (memcg = oc->memcg; memcg; memcg = parent_mem_cgroup(memcg)) { + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); + if (!bpf_oom_ops) + continue; + + /* Call BPF OOM handler */ + ret = bpf_ops_handle_oom(bpf_oom_ops, memcg, oc); + if (ret && oc->bpf_memory_freed) + goto exit; + } + /* + * System-wide OOM or per-memcg BPF OOM handler wasn't successful? + * Try system_bpf_oom. 
+ */ + bpf_oom_ops = READ_ONCE(system_bpf_oom); + if (!bpf_oom_ops) + goto exit; + + /* Call BPF OOM handler */ + ret = bpf_ops_handle_oom(bpf_oom_ops, NULL, oc); +exit: + srcu_read_unlock(&bpf_oom_srcu, idx); + return ret && oc->bpf_memory_freed; +} + +static int __handle_out_of_memory(struct oom_control *oc, + struct bpf_oom_ctx *exec_ctx) +{ + return 0; +} + +static void __handle_cgroup_offline(u64 cgroup_id, struct bpf_oom_ctx *exec_ctx) +{ +} + +static struct bpf_oom_ops __bpf_oom_ops = { + .handle_out_of_memory = __handle_out_of_memory, + .handle_cgroup_offline = __handle_cgroup_offline, +}; + +static const struct bpf_func_proto * +bpf_oom_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return tracing_prog_func_proto(func_id, prog); +} + +static bool bpf_oom_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_verifier_ops bpf_oom_verifier_ops = { + .get_func_proto = bpf_oom_func_proto, + .is_valid_access = bpf_oom_ops_is_valid_access, +}; + +static int bpf_oom_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; + struct bpf_oom_ops *bpf_oom_ops = kdata; + struct mem_cgroup *memcg = NULL; + int err = 0; + + if (ops_link->cgroup_id) { + /* Attach to a memory cgroup? */ + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + return PTR_ERR(memcg); + bpf_oom_ops_ptr = &memcg->bpf_oom; + } else { + /* System-wide OOM handler */ + bpf_oom_ops_ptr = &system_bpf_oom; + } + + /* Another struct ops attached? */ + if (READ_ONCE(*bpf_oom_ops_ptr)) { + err = -EBUSY; + goto exit; + } + + /* Expose bpf_oom_ops structure */ + WRITE_ONCE(*bpf_oom_ops_ptr, bpf_oom_ops); +exit: + mem_cgroup_put(memcg); + return err; +} + +static void bpf_oom_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_struct_ops_link *ops_link = container_of(link, struct bpf_struct_ops_link, link); + struct bpf_oom_ops **bpf_oom_ops_ptr = NULL; + struct bpf_oom_ops *bpf_oom_ops = kdata; + struct mem_cgroup *memcg = NULL; + + if (ops_link->cgroup_id) { + /* Detach from a memory cgroup? 
*/ + memcg = mem_cgroup_get_from_ino(ops_link->cgroup_id); + if (IS_ERR_OR_NULL(memcg)) + goto exit; + bpf_oom_ops_ptr = &memcg->bpf_oom; + } else { + /* System-wide OOM handler */ + bpf_oom_ops_ptr = &system_bpf_oom; + } + + /* Hide bpf_oom_ops from new callers */ + if (!WARN_ON(READ_ONCE(*bpf_oom_ops_ptr) != bpf_oom_ops)) + WRITE_ONCE(*bpf_oom_ops_ptr, NULL); + + mem_cgroup_put(memcg); + +exit: + /* Release bpf_oom_ops after a srcu grace period */ + synchronize_srcu(&bpf_oom_srcu); +} + +void bpf_oom_memcg_offline(struct mem_cgroup *memcg) +{ + struct bpf_oom_ops *bpf_oom_ops; + struct bpf_oom_ctx exec_ctx; + u64 cgrp_id; + int idx; + + /* All bpf_oom_ops structures are protected using bpf_oom_srcu */ + idx = srcu_read_lock(&bpf_oom_srcu); + + bpf_oom_ops = READ_ONCE(memcg->bpf_oom); + WRITE_ONCE(memcg->bpf_oom, NULL); + + if (bpf_oom_ops && bpf_oom_ops->handle_cgroup_offline) { + cgrp_id = cgroup_id(memcg->css.cgroup); + exec_ctx.cgroup_id = cgrp_id; + bpf_oom_ops->handle_cgroup_offline(cgrp_id, &exec_ctx); + } + + srcu_read_unlock(&bpf_oom_srcu, idx); +} + +static int bpf_oom_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_oom_ops, handle_out_of_memory): + if (!prog) + return -EINVAL; + break; + } + + return 0; +} + +static int bpf_oom_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct bpf_oom_ops *uops = udata; + struct bpf_oom_ops *ops = kdata; + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_oom_ops, name): + if (uops->name[0]) + strscpy_pad(ops->name, uops->name, sizeof(ops->name)); + else + strscpy_pad(ops->name, "bpf_defined_policy"); + return 1; + } + return 0; +} + +static int bpf_oom_ops_init(struct btf *btf) +{ + return 0; +} + +static struct bpf_struct_ops bpf_oom_bpf_ops = { + .verifier_ops = &bpf_oom_verifier_ops, + .reg = bpf_oom_ops_reg, + .unreg = bpf_oom_ops_unreg, + .check_member = bpf_oom_ops_check_member, + .init_member = bpf_oom_ops_init_member, + .init = bpf_oom_ops_init, + .name = "bpf_oom_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_oom_ops +}; + +static int __init bpf_oom_struct_ops_init(void) +{ + return register_bpf_struct_ops(&bpf_oom_bpf_ops, bpf_oom_ops); +} +late_initcall(bpf_oom_struct_ops_init); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5d27cd5372aa0..d44c1f293e168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -3885,6 +3886,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) zswap_memcg_offline_cleanup(memcg); + bpf_oom_memcg_offline(memcg); memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c145b0feecc1f..d05ec0f840879 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "internal.h" @@ -246,6 +247,15 @@ static const char * const oom_constraint_text[] = { [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", }; +static const char *oom_policy_name(struct oom_control *oc) +{ +#ifdef CONFIG_BPF_SYSCALL + if (oc->bpf_policy_name) + return oc->bpf_policy_name; +#endif + return "default"; +} + /* * Determine the type of allocation constraint. 
 */
@@ -458,9 +468,10 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim)
 
 static void dump_header(struct oom_control *oc)
 {
-	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd oom_policy=%s\n",
 		current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
-		current->signal->oom_score_adj);
+		current->signal->oom_score_adj,
+		oom_policy_name(oc));
 
 	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
 		pr_warn("COMPACTION is disabled!!!\n");
@@ -1167,6 +1178,13 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * Let bpf handle the OOM first. If it was able to free up some memory,
+	 * bail out. Otherwise fall back to the kernel OOM killer.
+	 */
+	if (bpf_handle_oom(oc))
+		return true;
+
 	select_bad_process(oc);
 	/* Found nothing?!?! */
 	if (!oc->chosen) {

From 30da7527483572eb7041667d6790816039c903b3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:43 -0700
Subject: [PATCH 08/28] mm: introduce bpf_oom_kill_process() bpf kfunc

Introduce the bpf_oom_kill_process() bpf kfunc, which is supposed to be
used by BPF OOM programs. It allows killing a process in exactly the
same way the OOM killer does: using the OOM reaper, bumping the
corresponding memcg and global statistics, respecting memory.oom.group
etc.

On success, it sets oom_control's bpf_memory_freed field to true,
enabling the bpf program to bypass the kernel OOM killer.

Signed-off-by: Roman Gushchin
---
 mm/oom_kill.c | 67 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d05ec0f840879..3c86cd7553718 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1288,3 +1288,70 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
 	return -ENOSYS;
 #endif /* CONFIG_MMU */
 }
+
+#ifdef CONFIG_BPF_SYSCALL
+
+__bpf_kfunc_start_defs();
+/**
+ * bpf_oom_kill_process - Kill a process as OOM killer
+ * @oc: pointer to oom_control structure, describes OOM context
+ * @task: task to be killed
+ * @message__str: message to print in dmesg
+ *
+ * Kill a process in a way similar to the kernel OOM killer.
+ * This means dump the necessary information to dmesg, adjust memcg
+ * statistics, leverage the oom reaper, respect memory.oom.group etc.
+ *
+ * bpf_oom_kill_process() marks the forward progress by setting
+ * oc->bpf_memory_freed. If progress was made, the bpf program
+ * is free to decide if the kernel oom killer should be invoked.
+ * Otherwise it's enforced, so that a bad bpf program can't
+ * deadlock the machine on memory.
+ */ +__bpf_kfunc int bpf_oom_kill_process(struct oom_control *oc, + struct task_struct *task, + const char *message__str) +{ + if (oom_unkillable_task(task)) + return -EPERM; + + /* paired with put_task_struct() in oom_kill_process() */ + task = tryget_task_struct(task); + if (!task) + return -EINVAL; + + oc->chosen = task; + + oom_kill_process(oc, message__str); + + oc->chosen = NULL; + oc->bpf_memory_freed = true; + + return 0; +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_oom_kfuncs) +BTF_ID_FLAGS(func, bpf_oom_kill_process, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_oom_kfuncs) + +static const struct btf_kfunc_id_set bpf_oom_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_oom_kfuncs, +}; + +static int __init bpf_oom_init(void) +{ + int err; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_oom_kfunc_set); + if (err) + pr_warn("error while registering bpf oom kfuncs: %d", err); + + return err; +} +late_initcall(bpf_oom_init); + +#endif From e072ce3ce6413d6d3cce840844e90b802472fed1 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Aug 2025 18:05:43 -0700 Subject: [PATCH 09/28] mm: introduce BPF kfuncs to deal with memcg pointers To effectively operate with memory cgroups in BPF there is a need to convert css pointers to memcg pointers. A simple container_of cast which is used in the kernel code can't be used in BPF because from the verifier's point of view that's a out-of-bounds memory access. Introduce helper get/put kfuncs which can be used to get a refcounted memcg pointer from the css pointer: - bpf_get_mem_cgroup, - bpf_put_mem_cgroup. bpf_get_mem_cgroup() can take both memcg's css and the corresponding cgroup's "self" css. It allows it to be used with the existing cgroup iterator which iterates over cgroup tree, not memcg tree. Signed-off-by: Roman Gushchin --- mm/Makefile | 1 + mm/bpf_memcontrol.c | 88 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 mm/bpf_memcontrol.c diff --git a/mm/Makefile b/mm/Makefile index 051e88c699afb..2d8f9beb3c710 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_MEMCG) += swap_cgroup.o endif ifdef CONFIG_BPF_SYSCALL obj-y += bpf_oom.o +obj-$(CONFIG_MEMCG) += bpf_memcontrol.o endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c new file mode 100644 index 0000000000000..1e46097745cfe --- /dev/null +++ b/mm/bpf_memcontrol.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Memory Controller-related BPF kfuncs and auxiliary code + * + * Author: Roman Gushchin + */ + +#include +#include + +__bpf_kfunc_start_defs(); + +/** + * bpf_get_mem_cgroup - Get a reference to a memory cgroup + * @css: pointer to the css structure + * + * Returns a pointer to a mem_cgroup structure after bumping + * the corresponding css's reference counter. + * + * It's fine to pass a css which belongs to any cgroup controller, + * e.g. unified hierarchy's main css. + * + * Implements KF_ACQUIRE semantics. 
+ */ +__bpf_kfunc struct mem_cgroup * +bpf_get_mem_cgroup(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = NULL; + bool rcu_unlock = false; + + if (!root_mem_cgroup) + return NULL; + + if (root_mem_cgroup->css.ss != css->ss) { + struct cgroup *cgroup = css->cgroup; + int ssid = root_mem_cgroup->css.ss->id; + + rcu_read_lock(); + rcu_unlock = true; + css = rcu_dereference_raw(cgroup->subsys[ssid]); + } + + if (css && css_tryget(css)) + memcg = container_of(css, struct mem_cgroup, css); + + if (rcu_unlock) + rcu_read_unlock(); + + return memcg; +} + +/** + * bpf_put_mem_cgroup - Put a reference to a memory cgroup + * @memcg: memory cgroup to release + * + * Releases a previously acquired memcg reference. + * Implements KF_RELEASE semantics. + */ +__bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg) +{ + css_put(&memcg->css); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(bpf_memcontrol_kfuncs) +BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) +BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) + +BTF_KFUNCS_END(bpf_memcontrol_kfuncs) + +static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { + .owner = THIS_MODULE, + .set = &bpf_memcontrol_kfuncs, +}; + +static int __init bpf_memcontrol_init(void) +{ + int err; + + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + &bpf_memcontrol_kfunc_set); + if (err) + pr_warn("error while registering bpf memcontrol kfuncs: %d", err); + + return err; +} +late_initcall(bpf_memcontrol_init); From 9ec213ffe408607e8fb5e3423bea72d786b5e0cc Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Aug 2025 18:05:43 -0700 Subject: [PATCH 10/28] mm: introduce bpf_get_root_mem_cgroup() BPF kfunc Introduce a BPF kfunc to get a trusted pointer to the root memory cgroup. It's very handy to traverse the full memcg tree, e.g. for handling a system-wide OOM. It's possible to obtain this pointer by traversing the memcg tree up from any known memcg, but it's sub-optimal and makes BPF programs more complex and less efficient. bpf_get_root_mem_cgroup() has a KF_ACQUIRE | KF_RET_NULL semantics, however in reality it's not necessarily to bump the corresponding reference counter - root memory cgroup is immortal, reference counting is skipped, see css_get(). Once set, root_mem_cgroup is always a valid memcg pointer. It's safe to call bpf_put_mem_cgroup() for the pointer obtained with bpf_get_root_mem_cgroup(), it's effectively a no-op. Signed-off-by: Roman Gushchin --- mm/bpf_memcontrol.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 1e46097745cfe..76c342318256e 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -10,6 +10,20 @@ __bpf_kfunc_start_defs(); +/** + * bpf_get_root_mem_cgroup - Returns a pointer to the root memory cgroup + * + * The function has KF_ACQUIRE semantics, even though the root memory + * cgroup is never destroyed after being created and doesn't require + * reference counting. 
+ * bpf_put_mem_cgroup().
+ */
+__bpf_kfunc struct mem_cgroup *bpf_get_root_mem_cgroup(void)
+{
+	/* css_get() is not needed */
+	return root_mem_cgroup;
+}
+
 /**
  * bpf_get_mem_cgroup - Get a reference to a memory cgroup
  * @css: pointer to the css structure
@@ -64,6 +78,7 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_memcontrol_kfuncs)
+BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
 BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE)
 

From 9940d32e76bfd5db021df57fd0dcf1c0ffe770dd Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 17 Oct 2025 12:10:43 -0700
Subject: [PATCH 11/28] mm: introduce BPF kfuncs to access memcg statistics
 and events

Introduce BPF kfuncs to conveniently access memcg data:
  - bpf_mem_cgroup_vm_events(),
  - bpf_mem_cgroup_usage(),
  - bpf_mem_cgroup_page_state(),
  - bpf_mem_cgroup_flush_stats().

These functions are useful for implementing BPF OOM policies, but can
also be used to accelerate access to the memcg data. Reading it
through cgroupfs is much more expensive, roughly 5x, mostly because of
the need to convert the data into text and back.

Signed-off-by: Roman Gushchin
Co-developed-by: JP Kobryn
Signed-off-by: JP Kobryn
---
 include/linux/memcontrol.h |  2 ++
 mm/bpf_memcontrol.c        | 57 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 39a6c7c8735ba..b9e08dddd7ada 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -953,6 +953,8 @@ static inline void mod_memcg_page_state(struct page *page,
 	rcu_read_unlock();
 }
 
+unsigned long memcg_events(struct mem_cgroup *memcg, int event);
+unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
 unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item);
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
 unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx);
diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c
index 76c342318256e..387255b8ab88c 100644
--- a/mm/bpf_memcontrol.c
+++ b/mm/bpf_memcontrol.c
@@ -75,6 +75,56 @@ __bpf_kfunc void bpf_put_mem_cgroup(struct mem_cgroup *memcg)
 	css_put(&memcg->css);
 }
 
+/**
+ * bpf_mem_cgroup_vm_events - Read memory cgroup's vm event counter
+ * @memcg: memory cgroup
+ * @event: event id
+ *
+ * Allows reading memory cgroup event counters.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_vm_events(struct mem_cgroup *memcg,
+						   enum vm_event_item event)
+{
+	return memcg_events(memcg, event);
+}
+
+/**
+ * bpf_mem_cgroup_usage - Read memory cgroup's usage
+ * @memcg: memory cgroup
+ *
+ * Returns the current memory cgroup usage in pages.
+ */
+__bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg)
+{
+	return page_counter_read(&memcg->memory);
+}
+
+/**
+ * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter
+ * @memcg: memory cgroup
+ * @idx: counter idx
+ *
+ * Allows reading memory cgroup statistics. The output is in bytes.
+ */ +__bpf_kfunc unsigned long bpf_mem_cgroup_page_state(struct mem_cgroup *memcg, int idx) +{ + if (idx < 0 || idx >= MEMCG_NR_STAT) + return (unsigned long)-1; + + return memcg_page_state_output(memcg, idx); +} + +/** + * bpf_mem_cgroup_flush_stats - Flush memory cgroup's statistics + * @memcg: memory cgroup + * + * Propagate memory cgroup's statistics up the cgroup tree. + */ +__bpf_kfunc void bpf_mem_cgroup_flush_stats(struct mem_cgroup *memcg) +{ + mem_cgroup_flush_stats(memcg); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_memcontrol_kfuncs) @@ -82,6 +132,11 @@ BTF_ID_FLAGS(func, bpf_get_root_mem_cgroup, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) + BTF_KFUNCS_END(bpf_memcontrol_kfuncs) static const struct btf_kfunc_id_set bpf_memcontrol_kfunc_set = { @@ -93,7 +148,7 @@ static int __init bpf_memcontrol_init(void) { int err; - err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, + err = register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &bpf_memcontrol_kfunc_set); if (err) pr_warn("error while registering bpf memcontrol kfuncs: %d", err); From fc6b5ccf2e4347fb720b44922a99c7028a2eef7d Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 22 Oct 2025 15:56:07 -0700 Subject: [PATCH 12/28] mm: introduce BPF kfunc to access memory events Introduce BPF kfunc to access memory events, e.g.: MEMCG_LOW, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL etc. Signed-off-by: JP Kobryn --- mm/bpf_memcontrol.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/bpf_memcontrol.c b/mm/bpf_memcontrol.c index 387255b8ab88c..458ad022b036f 100644 --- a/mm/bpf_memcontrol.c +++ b/mm/bpf_memcontrol.c @@ -99,6 +99,23 @@ __bpf_kfunc unsigned long bpf_mem_cgroup_usage(struct mem_cgroup *memcg) return page_counter_read(&memcg->memory); } +/** + * bpf_mem_cgroup_events - Read memory cgroup's page state counter + * bpf_mem_cgroup_memory_events - Read memory cgroup's memory event value + * @memcg: memory cgroup + * @event: memory event id + * + * Returns current memory event count. + */ +__bpf_kfunc unsigned long bpf_mem_cgroup_memory_events(struct mem_cgroup *memcg, + enum memcg_memory_event event) +{ + if (event >= MEMCG_NR_MEMORY_EVENTS) + return (unsigned long)-1; + + return atomic_long_read(&memcg->memory_events[event]); +} + /** * bpf_mem_cgroup_page_state - Read memory cgroup's page state counter * @memcg: memory cgroup @@ -133,6 +150,7 @@ BTF_ID_FLAGS(func, bpf_get_mem_cgroup, KF_ACQUIRE | KF_RET_NULL | KF_RCU) BTF_ID_FLAGS(func, bpf_put_mem_cgroup, KF_RELEASE) BTF_ID_FLAGS(func, bpf_mem_cgroup_vm_events, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_mem_cgroup_memory_events, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_mem_cgroup_usage, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_mem_cgroup_page_state, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_mem_cgroup_flush_stats, KF_TRUSTED_ARGS | KF_SLEEPABLE) From 44c82ea29d584d252ffe2e722b479ead25b9f1e3 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 8 Oct 2025 17:12:40 -0700 Subject: [PATCH 13/28] bpf: selftests: selftests for memcg stat kfuncs Add test coverage for the kfuncs that fetch memcg stats. 
Using some common stats, add test scenarios ensuring that the given
stat increases by some arbitrary amount. The stats selected cover the
three categories represented by the enums: node_stat_item,
memcg_stat_item, vm_event_item.

Since only a subset of all stats is queried, use a static struct made
up of fields for each stat. Write to the struct with the fetched
values when the bpf program is invoked and read the fields in the user
mode program for verification.

Signed-off-by: JP Kobryn
---
 .../testing/selftests/bpf/cgroup_iter_memcg.h |  18 ++
 .../bpf/prog_tests/cgroup_iter_memcg.c        | 225 ++++++++++++++++++
 .../selftests/bpf/progs/cgroup_iter_memcg.c   |  42 ++++
 3 files changed, 285 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/cgroup_iter_memcg.h
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
 create mode 100644 tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c

diff --git a/tools/testing/selftests/bpf/cgroup_iter_memcg.h b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
new file mode 100644
index 0000000000000..3f59b127943ba
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_iter_memcg.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#ifndef __CGROUP_ITER_MEMCG_H
+#define __CGROUP_ITER_MEMCG_H
+
+struct memcg_query {
+	/* some node_stat_item's */
+	unsigned long nr_anon_mapped;
+	unsigned long nr_shmem;
+	unsigned long nr_file_pages;
+	unsigned long nr_file_mapped;
+	/* some memcg_stat_item */
+	unsigned long memcg_kmem;
+	/* some vm_event_item */
+	unsigned long pgfault;
+};
+
+#endif /* __CGROUP_ITER_MEMCG_H */
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
new file mode 100644
index 0000000000000..86f558a096143
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cgroup_helpers.h"
+#include "cgroup_iter_memcg.h"
+#include "cgroup_iter_memcg.skel.h"
+
+static int read_stats(struct bpf_link *link)
+{
+	int fd, ret = 0;
+	ssize_t bytes;
+
+	fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_OK_FD(fd, "bpf_iter_create"))
+		return 1;
+
+	/*
+	 * Invoke the iter program by reading from its fd. We're not expecting
+	 * any data to be written by the bpf program, so the result should be
+	 * zero. Results will be read directly through the custom data section
+	 * accessible through skel->data_query->memcg_query.
+	 */
+	bytes = read(fd, NULL, 0);
+	if (!ASSERT_EQ(bytes, 0, "read fd"))
+		ret = 1;
+
+	close(fd);
+	return ret;
+}
+
+static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query)
+{
+	void *map;
+	size_t len;
+
+	len = sysconf(_SC_PAGESIZE) * 1024;
+
+	/*
+	 * Increase memcg anon usage by mapping and writing
+	 * to a new anon region.
+ */ + map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_anon_mapped, 0, "final anon mapped val"); + +cleanup: + munmap(map, len); +} + +static void test_file(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + FILE *f; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg file usage by creating and writing + * to a temoprary mapped file. + */ + f = tmpfile(); + if (!ASSERT_OK_PTR(f, "tmpfile")) + return; + fd = fileno(f); + if (!ASSERT_OK_FD(fd, "open fd")) + return; + if (!ASSERT_OK(ftruncate(fd, len), "ftruncate")) + goto cleanup_fd; + + map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file")) + goto cleanup_fd; + + memset(map, 1, len); + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup_map; + + ASSERT_GT(memcg_query->nr_file_pages, 0, "final file value"); + ASSERT_GT(memcg_query->nr_file_mapped, 0, "final file mapped value"); + +cleanup_map: + munmap(map, len); +cleanup_fd: + close(fd); +} + +static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + size_t len; + int fd; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* + * Increase memcg shmem usage by creating and writing + * to a shmem object. + */ + fd = shm_open("/tmp_shmem", O_CREAT | O_RDWR, 0644); + if (!ASSERT_OK_FD(fd, "shm_open")) + return; + + if (!ASSERT_OK(fallocate(fd, 0, 0, len), "fallocate")) + goto cleanup; + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->nr_shmem, 0, "final shmem value"); + +cleanup: + close(fd); + shm_unlink("/tmp_shmem"); +} + +#define NR_PIPES 2 +static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query) +{ + int fds[NR_PIPES][2], i; + + /* + * Increase kmem value by creating pipes which will allocate some + * kernel buffers. + */ + for (i = 0; i < NR_PIPES; i++) { + if (!ASSERT_OK(pipe(fds[i]), "pipe")) + goto cleanup; + } + + if (!ASSERT_OK(read_stats(link), "read stats")) + goto cleanup; + + ASSERT_GT(memcg_query->memcg_kmem, 0, "kmem value"); + +cleanup: + for (i = 0; i < NR_PIPES; i++) { + close(fds[i][0]); + close(fds[i][1]); + } +} + +static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query) +{ + void *map; + size_t len; + + len = sysconf(_SC_PAGESIZE) * 1024; + + /* Create region to use for triggering a page fault. */ + map = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon")) + return; + + /* Trigger page fault. 
+	memset(map, 1, len);
+
+	if (!ASSERT_OK(read_stats(link), "read stats"))
+		goto cleanup;
+
+	ASSERT_GT(memcg_query->pgfault, 0, "final pgfault val");
+
+cleanup:
+	munmap(map, len);
+}
+
+void test_cgroup_iter_memcg(void)
+{
+	char *cgroup_rel_path = "/cgroup_iter_memcg_test";
+	struct cgroup_iter_memcg *skel;
+	struct bpf_link *link;
+	int cgroup_fd;
+
+	cgroup_fd = cgroup_setup_and_join(cgroup_rel_path);
+	if (!ASSERT_OK_FD(cgroup_fd, "cgroup_setup_and_join"))
+		return;
+
+	skel = cgroup_iter_memcg__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "cgroup_iter_memcg__open_and_load"))
+		goto cleanup_cgroup_fd;
+
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo = {
+		.cgroup.cgroup_fd = cgroup_fd,
+		.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY,
+	};
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(skel->progs.cgroup_memcg_query, &opts);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter"))
+		goto cleanup_skel;
+
+	if (test__start_subtest("cgroup_iter_memcg__anon"))
+		test_anon(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__shmem"))
+		test_shmem(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__file"))
+		test_file(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__kmem"))
+		test_kmem(link, &skel->data_query->memcg_query);
+	if (test__start_subtest("cgroup_iter_memcg__pgfault"))
+		test_pgfault(link, &skel->data_query->memcg_query);
+
+	bpf_link__destroy(link);
+cleanup_skel:
+	cgroup_iter_memcg__destroy(skel);
+cleanup_cgroup_fd:
+	close(cgroup_fd);
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
new file mode 100644
index 0000000000000..92db5fd11391d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_iter_memcg.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#include
+#include
+#include "cgroup_iter_memcg.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* The latest values read are stored here. */
+struct memcg_query memcg_query SEC(".data.query");
+
+SEC("iter.s/cgroup")
+int cgroup_memcg_query(struct bpf_iter__cgroup *ctx)
+{
+	struct cgroup *cgrp = ctx->cgroup;
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
+
+	if (!cgrp)
+		return 1;
+
+	css = container_of(cgrp, struct cgroup_subsys_state, cgroup);
+	if (!css)
+		return 1;
+
+	memcg = bpf_get_mem_cgroup(css);
+	if (!memcg)
+		return 1;
+
+	bpf_mem_cgroup_flush_stats(memcg);
+
+	memcg_query.nr_anon_mapped = bpf_mem_cgroup_page_state(memcg, NR_ANON_MAPPED);
+	memcg_query.nr_shmem = bpf_mem_cgroup_page_state(memcg, NR_SHMEM);
+	memcg_query.nr_file_pages = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES);
+	memcg_query.nr_file_mapped = bpf_mem_cgroup_page_state(memcg, NR_FILE_MAPPED);
+	memcg_query.memcg_kmem = bpf_mem_cgroup_page_state(memcg, MEMCG_KMEM);
+	memcg_query.pgfault = bpf_mem_cgroup_vm_events(memcg, PGFAULT);
+
+	bpf_put_mem_cgroup(memcg);
+
+	return 0;
+}

From 03e37b738cb9d35f6a0bfb78bfc75ae76c1e52e5 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:43 -0700
Subject: [PATCH 14/28] mm: introduce bpf_out_of_memory() BPF kfunc

Introduce the bpf_out_of_memory() bpf kfunc, which allows declaring an
out of memory event and triggering the corresponding kernel OOM
handling mechanism.
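As a quick illustration, a call from a sleepable BPF program might look
like this (a hypothetical snippet using the final kfunc signature,
including the constraint name argument added by a following patch):

	extern int bpf_out_of_memory(struct mem_cgroup *memcg__nullable,
				     int order, u64 flags,
				     const char *constraint_text__nullable) __ksym;

	/* Declare a memcg-scoped OOM, blocking on oom_lock as advised
	 * for memcg OOMs.
	 */
	static int declare_memcg_oom(struct mem_cgroup *memcg)
	{
		return bpf_out_of_memory(memcg, 0, BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK,
					 "CONSTRAINT_BPF_PSI_MEM");
	}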
The kfunc takes a trusted memcg pointer (or NULL for system-wide OOMs)
as an argument, as well as the page order.

If the BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK flag is not set, only one OOM
can be declared and handled in the system at once, so if the function
is called in parallel to another OOM handling, it bails out with
-EBUSY. This mode is suited for global OOMs: any concurrent OOM will
likely do the job and release some memory. In the blocking mode (which
is suited for memcg OOMs) the execution will wait on the oom_lock
mutex.

The function is declared sleepable, which guarantees that it won't be
called from an atomic context. It's required by the OOM handling code,
which shouldn't be called from a non-blocking context. Handling of a
memcg OOM almost always requires taking the css_set_lock spinlock. The
fact that bpf_out_of_memory() is sleepable also guarantees that it
can't be called with the css_set_lock acquired, so the kernel can't
deadlock on it.

Note that this function is inaccessible as of now. Calling
bpf_out_of_memory() from a random context is dangerous because e.g.
it's easy to deadlock the system on oom_lock. The following commit in
the series will provide one safe context where this kfunc can be used.

Signed-off-by: Roman Gushchin
---
 include/linux/oom.h |  5 ++++
 mm/oom_kill.c       | 63 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 721087952d043..3cbdcd0132741 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -21,6 +21,11 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+enum bpf_oom_flags {
+	BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK = 1 << 0,
+	BPF_OOM_FLAGS_LAST = 1 << 1,
+};
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3c86cd7553718..d7fca4bf575bf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1330,15 +1330,78 @@ __bpf_kfunc int bpf_oom_kill_process(struct oom_control *oc,
 	return 0;
 }
 
+/**
+ * bpf_out_of_memory - declare Out Of Memory state and invoke OOM killer
+ * @memcg__nullable: memcg or NULL for system-wide OOMs
+ * @order: order of page which wasn't allocated
+ * @flags: flags
+ * @constraint_text__nullable: custom constraint description for the OOM report
+ *
+ * Declares the Out Of Memory state and invokes the OOM killer.
+ *
+ * OOM handlers are synchronized using the oom_lock mutex. If the
+ * BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK flag is set, the function will wait
+ * on it. Otherwise it bails out with -EBUSY if oom_lock is contended.
+ *
+ * Generally it's advised to not set BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK
+ * for global OOMs and to set it for memcg-scoped OOMs.
+ *
+ * Returns 1 if forward progress was achieved and some memory was freed.
+ * Returns a negative value if an error occurred.
+ */ +__bpf_kfunc int bpf_out_of_memory(struct mem_cgroup *memcg__nullable, + int order, u64 flags) +{ + struct oom_control oc = { + .memcg = memcg__nullable, + .order = order, + }; + int ret; + + if (flags & ~(BPF_OOM_FLAGS_LAST - 1)) + return -EINVAL; + + if (oc.order < 0 || oc.order > MAX_PAGE_ORDER) + return -EINVAL; + + if (flags & BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK) { + ret = mutex_lock_killable(&oom_lock); + if (ret) + return ret; + } else if (!mutex_trylock(&oom_lock)) + return -EBUSY; + + ret = out_of_memory(&oc); + + mutex_unlock(&oom_lock); + return ret; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_oom_kfuncs) BTF_ID_FLAGS(func, bpf_oom_kill_process, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_out_of_memory, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_oom_kfuncs) +BTF_SET_START(bpf_oom_declare_oom_kfuncs) +BTF_ID(func, bpf_out_of_memory) +BTF_SET_END(bpf_oom_declare_oom_kfuncs) + +extern struct bpf_struct_ops bpf_psi_bpf_ops; + +static int bpf_oom_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + if (!btf_id_set_contains(&bpf_oom_declare_oom_kfuncs, kfunc_id)) + return 0; + + return -EACCES; +} + static const struct btf_kfunc_id_set bpf_oom_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_oom_kfuncs, + .filter = bpf_oom_kfunc_filter, }; static int __init bpf_oom_init(void) From f7316fb03b3a2040f1ad0d960bbc21c284b4be81 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 11 Aug 2025 18:05:43 -0700 Subject: [PATCH 15/28] mm: allow specifying custom oom constraint for BPF triggers Currently there is a hard-coded list of possible oom constraints: NONE, CPUSET, MEMORY_POLICY & MEMCG. Add a new one: CONSTRAINT_BPF. Also, add an ability to specify a custom constraint name when calling bpf_out_of_memory(). If an empty string is passed as an argument, CONSTRAINT_BPF is displayed. The resulting output in dmesg will look like this: [ 315.224875] kworker/u17:0 invoked oom-killer: gfp_mask=0x0(), order=0, oom_score_adj=0 oom_policy=default [ 315.226532] CPU: 1 UID: 0 PID: 74 Comm: kworker/u17:0 Not tainted 6.16.0-00015-gf09eb0d6badc #102 PREEMPT(full) [ 315.226534] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-5.fc42 04/01/2014 [ 315.226536] Workqueue: bpf_psi_wq bpf_psi_handle_event_fn [ 315.226542] Call Trace: [ 315.226545] [ 315.226548] dump_stack_lvl+0x4d/0x70 [ 315.226555] dump_header+0x59/0x1c6 [ 315.226561] oom_kill_process.cold+0x8/0xef [ 315.226565] out_of_memory+0x111/0x5c0 [ 315.226577] bpf_out_of_memory+0x6f/0xd0 [ 315.226580] ? srso_alias_return_thunk+0x5/0xfbef5 [ 315.226589] bpf_prog_3018b0cf55d2c6bb_handle_psi_event+0x5d/0x76 [ 315.226594] bpf__bpf_psi_ops_handle_psi_event+0x47/0xa7 [ 315.226599] bpf_psi_handle_event_fn+0x63/0xb0 [ 315.226604] process_one_work+0x1fc/0x580 [ 315.226616] ? srso_alias_return_thunk+0x5/0xfbef5 [ 315.226624] worker_thread+0x1d9/0x3b0 [ 315.226629] ? __pfx_worker_thread+0x10/0x10 [ 315.226632] kthread+0x128/0x270 [ 315.226637] ? lock_release+0xd4/0x2d0 [ 315.226645] ? __pfx_kthread+0x10/0x10 [ 315.226649] ret_from_fork+0x81/0xd0 [ 315.226652] ? __pfx_kthread+0x10/0x10 [ 315.226655] ret_from_fork_asm+0x1a/0x30 [ 315.226667] [ 315.239745] memory: usage 42240kB, limit 9007199254740988kB, failcnt 0 [ 315.240231] swap: usage 0kB, limit 0kB, failcnt 0 [ 315.240585] Memory cgroup stats for /cgroup-test-work-dir673/oom_test/cg2: [ 315.240603] anon 42897408 [ 315.241317] file 0 [ 315.241493] kernel 98304 ... 
[ 315.255946] Tasks state (memory values in pages): [ 315.256292] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name [ 315.257107] [ 675] 0 675 162013 10969 10712 257 0 155648 0 0 test_progs [ 315.257927] oom-kill:constraint=CONSTRAINT_BPF_PSI_MEM,nodemask=(null),cpuset=/,mems_allowed=0,oom_memcg=/cgroup-test-work-dir673/oom_test/cg2,task_memcg=/cgroup-test-work-dir673/oom_test/cg2,task=test_progs,pid=675,uid=0 [ 315.259371] Memory cgroup out of memory: Killed process 675 (test_progs) total-vm:648052kB, anon-rss:42848kB, file-rss:1028kB, shmem-rss:0kB, UID:0 pgtables:152kB oom_score_adj:0 Signed-off-by: Roman Gushchin --- include/linux/oom.h | 4 ++++ mm/oom_kill.c | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 3cbdcd0132741..704fc0e786c62 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -19,6 +19,7 @@ enum oom_constraint { CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, CONSTRAINT_MEMCG, + CONSTRAINT_BPF, }; enum bpf_oom_flags { @@ -63,6 +64,9 @@ struct oom_control { /* Policy name */ const char *bpf_policy_name; + + /* BPF-specific constraint name */ + const char *bpf_constraint; #endif }; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d7fca4bf575bf..72a346261c793 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -240,13 +240,6 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) return points; } -static const char * const oom_constraint_text[] = { - [CONSTRAINT_NONE] = "CONSTRAINT_NONE", - [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET", - [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY", - [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG", -}; - static const char *oom_policy_name(struct oom_control *oc) { #ifdef CONFIG_BPF_SYSCALL @@ -256,6 +249,27 @@ static const char *oom_policy_name(struct oom_control *oc) return "default"; } +static const char *oom_constraint_text(struct oom_control *oc) +{ + switch (oc->constraint) { + case CONSTRAINT_NONE: + return "CONSTRAINT_NONE"; + case CONSTRAINT_CPUSET: + return "CONSTRAINT_CPUSET"; + case CONSTRAINT_MEMORY_POLICY: + return "CONSTRAINT_MEMORY_POLICY"; + case CONSTRAINT_MEMCG: + return "CONSTRAINT_MEMCG"; +#ifdef CONFIG_BPF_SYSCALL + case CONSTRAINT_BPF: + return oc->bpf_constraint ? : "CONSTRAINT_BPF"; +#endif + default: + WARN_ON_ONCE(1); + return ""; + } +} + /* * Determine the type of allocation constraint. */ @@ -267,6 +281,9 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) bool cpuset_limited = false; int nid; + if (oc->constraint == CONSTRAINT_BPF) + return CONSTRAINT_BPF; + if (is_memcg_oom(oc)) { oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; @@ -458,7 +475,7 @@ static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) { /* one line summary of the oom killer context. */ pr_info("oom-kill:constraint=%s,nodemask=%*pbl", - oom_constraint_text[oc->constraint], + oom_constraint_text(oc), nodemask_pr_args(oc->nodemask)); cpuset_print_current_mems_allowed(); mem_cgroup_print_oom_context(oc->memcg, victim); @@ -1350,11 +1367,14 @@ __bpf_kfunc int bpf_oom_kill_process(struct oom_control *oc, * Returns a negative value if an error occurred. 
 */
 __bpf_kfunc int bpf_out_of_memory(struct mem_cgroup *memcg__nullable,
-				  int order, u64 flags)
+				  int order, u64 flags,
+				  const char *constraint_text__nullable)
 {
 	struct oom_control oc = {
 		.memcg = memcg__nullable,
 		.order = order,
+		.constraint = CONSTRAINT_BPF,
+		.bpf_constraint = constraint_text__nullable,
 	};
 	int ret;
 

From 69df552bde9d621074f5c05bd1ed18d67b0f1c98 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 16/28] mm: introduce bpf_task_is_oom_victim() kfunc

Export the tsk_is_oom_victim() helper as a BPF kfunc. It's useful
for avoiding redundant OOM kills.

Signed-off-by: Roman Gushchin
---
 mm/oom_kill.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 72a346261c793..90bb86dee3cf2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1397,11 +1397,25 @@ __bpf_kfunc int bpf_out_of_memory(struct mem_cgroup *memcg__nullable,
 	return ret;
 }
 
+/**
+ * bpf_task_is_oom_victim - Check if the task has been marked as an OOM victim
+ * @task: task to check
+ *
+ * Returns true if the task has been previously selected by the OOM killer
+ * to be killed. It's expected that the task will be destroyed soon and some
+ * memory will be freed, so additional actions might not be required.
+ */
+__bpf_kfunc bool bpf_task_is_oom_victim(struct task_struct *task)
+{
+	return tsk_is_oom_victim(task);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(bpf_oom_kfuncs)
 BTF_ID_FLAGS(func, bpf_oom_kill_process, KF_SLEEPABLE | KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_out_of_memory, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_is_oom_victim, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(bpf_oom_kfuncs)
 
 BTF_SET_START(bpf_oom_declare_oom_kfuncs)

From bc792dcd7d228c96f93bb61b825b153f375d6c48 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Fri, 10 Oct 2025 17:07:41 -0700
Subject: [PATCH 17/28] libbpf: introduce bpf_map__attach_struct_ops_opts()

Introduce bpf_map__attach_struct_ops_opts(), an extended version of
bpf_map__attach_struct_ops(), which takes an additional
struct bpf_struct_ops_opts argument.

struct bpf_struct_ops_opts has the relative_fd member, which allows
passing an additional file descriptor. It can be used to attach
struct ops maps to cgroups.
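For example, attaching a struct ops map to a cgroup could look like
this (a minimal sketch based on the selftest added later in the
series; the skeleton, map name and cgroup fd are placeholders):

	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
	struct bpf_link *link;

	opts.relative_fd = cgroup_fd;	/* 0 attaches system-wide */
	link = bpf_map__attach_struct_ops_opts(skel->maps.my_ops, &opts);
	if (!link)
		/* libbpf returns NULL and sets errno on failure */
		return -errno;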
Signed-off-by: Roman Gushchin
---
 tools/lib/bpf/bpf.c      |  8 ++++++++
 tools/lib/bpf/libbpf.c   | 19 +++++++++++++++++--
 tools/lib/bpf/libbpf.h   | 14 ++++++++++++++
 tools/lib/bpf/libbpf.map |  1 +
 4 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 339b197972374..4c8944f8d6ba5 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -883,6 +883,14 @@ int bpf_link_create(int prog_fd, int target_fd,
 		if (!OPTS_ZEROED(opts, cgroup))
 			return libbpf_err(-EINVAL);
 		break;
+	case BPF_STRUCT_OPS:
+		relative_fd = OPTS_GET(opts, cgroup.relative_fd, 0);
+		attr.link_create.cgroup.relative_fd = relative_fd;
+		attr.link_create.cgroup.expected_revision =
+			OPTS_GET(opts, cgroup.expected_revision, 0);
+		if (!OPTS_ZEROED(opts, cgroup))
+			return libbpf_err(-EINVAL);
+		break;
 	default:
 		if (!OPTS_ZEROED(opts, flags))
 			return libbpf_err(-EINVAL);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b90574f39d1c7..be56a5dee5050 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -13196,12 +13196,19 @@ static int bpf_link__detach_struct_ops(struct bpf_link *link)
 	return close(link->fd);
 }
 
-struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
+struct bpf_link *bpf_map__attach_struct_ops_opts(const struct bpf_map *map,
+						 const struct bpf_struct_ops_opts *opts)
 {
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts);
 	struct bpf_link_struct_ops *link;
 	__u32 zero = 0;
 	int err, fd;
 
+	if (!OPTS_VALID(opts, bpf_struct_ops_opts)) {
+		pr_warn("map '%s': invalid opts\n", map->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
 	if (!bpf_map__is_struct_ops(map)) {
 		pr_warn("map '%s': can't attach non-struct_ops map\n", map->name);
 		return libbpf_err_ptr(-EINVAL);
@@ -13237,7 +13244,10 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
 		return &link->link;
 	}
 
-	fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL);
+	link_opts.cgroup.relative_fd = OPTS_GET(opts, relative_fd, 0);
+	link_opts.cgroup.expected_revision = OPTS_GET(opts, expected_revision, 0);
+
+	fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, &link_opts);
 	if (fd < 0) {
 		free(link);
 		return libbpf_err_ptr(fd);
@@ -13249,6 +13259,11 @@ struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
 	return &link->link;
 }
 
+struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map)
+{
+	return bpf_map__attach_struct_ops_opts(map, NULL);
+}
+
 /*
  * Swap the back struct_ops of a link with a new struct_ops map.
  */
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 5118d0a90e243..dc84898715cfc 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -922,6 +922,20 @@ bpf_program__attach_cgroup_opts(const struct bpf_program *prog, int cgroup_fd,
 
 struct bpf_map;
 
 LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map);
+
+struct bpf_struct_ops_opts {
+	/* size of this struct, for forward/backward compatibility */
+	size_t sz;
+	__u32 flags;
+	__u32 relative_fd;
+	__u64 expected_revision;
+	size_t :0;
+};
+#define bpf_struct_ops_opts__last_field expected_revision
+
+LIBBPF_API struct bpf_link *
+bpf_map__attach_struct_ops_opts(const struct bpf_map *map,
+				const struct bpf_struct_ops_opts *opts);
 LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map);
 
 struct bpf_iter_attach_opts {
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 8ed8749907d47..bc00089343ce4 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -451,4 +451,5 @@ LIBBPF_1.7.0 {
 	global:
 		bpf_map__set_exclusive_program;
 		bpf_map__exclusive_program;
+		bpf_map__attach_struct_ops_opts;
 } LIBBPF_1.6.0;

From 7996b9b1a43f3fc319b55b020e736ed1dc9507b8 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 18/28] bpf: selftests: introduce read_cgroup_file() helper

Implement a read_cgroup_file() helper to read from cgroup control
files, e.g. statistics.

Signed-off-by: Roman Gushchin
---
 tools/testing/selftests/bpf/cgroup_helpers.c | 39 ++++++++++++++++++++
 tools/testing/selftests/bpf/cgroup_helpers.h |  2 +
 2 files changed, 41 insertions(+)

diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
index 20cede4db3cee..8fb02fe4c4aaa 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.c
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -126,6 +126,45 @@ int enable_controllers(const char *relative_path, const char *controllers)
 	return __enable_controllers(cgroup_path, controllers);
 }
 
+static ssize_t __read_cgroup_file(const char *cgroup_path, const char *file,
+				  char *buf, size_t size)
+{
+	char file_path[PATH_MAX + 1];
+	ssize_t ret;
+	int fd;
+
+	snprintf(file_path, sizeof(file_path), "%s/%s", cgroup_path, file);
+	fd = open(file_path, O_RDONLY);
+	if (fd < 0) {
+		log_err("Opening %s", file_path);
+		return -1;
+	}
+
+	ret = read(fd, buf, size);
+	close(fd);
+	return ret;
+}
+
+/**
+ * read_cgroup_file() - Read from a cgroup file
+ * @relative_path: The cgroup path, relative to the workdir
+ * @file: The name of the file in cgroupfs to read from
+ * @buf: Buffer to store the data read from the file
+ * @size: Size of the buffer
+ *
+ * Read from a file in the given cgroup's directory.
+ *
+ * If successful, the number of read bytes is returned.
+ */
+ssize_t read_cgroup_file(const char *relative_path, const char *file,
+			 char *buf, size_t size)
+{
+	char cgroup_path[PATH_MAX - 24];
+
+	format_cgroup_path(cgroup_path, relative_path);
+	return __read_cgroup_file(cgroup_path, file, buf, size);
+}
+
 static int __write_cgroup_file(const char *cgroup_path, const char *file,
 			       const char *buf)
 {
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h
index 3857304be8741..9f9bb6b5d9928 100644
--- a/tools/testing/selftests/bpf/cgroup_helpers.h
+++ b/tools/testing/selftests/bpf/cgroup_helpers.h
@@ -11,6 +11,8 @@
 
 /* cgroupv2 related */
 int enable_controllers(const char *relative_path, const char *controllers);
+ssize_t read_cgroup_file(const char *relative_path, const char *file,
+			 char *buf, size_t size);
 int write_cgroup_file(const char *relative_path, const char *file,
 		      const char *buf);
 int write_cgroup_file_parent(const char *relative_path, const char *file,

From f2fa8ad2c4b50542896cfa7a889682ca62a8cf5b Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 19/28] bpf: selftests: BPF OOM handler test

Implement a pseudo-realistic test for the OOM handling functionality.

The OOM handling policy implemented in BPF kills all tasks belonging
to the biggest leaf cgroup that doesn't contain unkillable tasks
(tasks with oom_score_adj set to -1000). Pagecache size is excluded
from the accounting.

The test creates a hierarchy of memory cgroups, causes an OOM at the
top level, checks that the expected process is killed and checks
memcg's oom statistics.

Signed-off-by: Roman Gushchin
---
 .../selftests/bpf/prog_tests/test_oom.c      | 247 ++++++++++++++++++
 tools/testing/selftests/bpf/progs/test_oom.c | 118 +++++++++
 2 files changed, 365 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/test_oom.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_oom.c

diff --git a/tools/testing/selftests/bpf/prog_tests/test_oom.c b/tools/testing/selftests/bpf/prog_tests/test_oom.c
new file mode 100644
index 0000000000000..6126d961aba3d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_oom.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include 
+#include 
+#include 
+
+#include "cgroup_helpers.h"
+#include "test_oom.skel.h"
+
+struct cgroup_desc {
+	const char *path;
+	int fd;
+	unsigned long long id;
+	int pid;
+	size_t target;
+	size_t max;
+	int oom_score_adj;
+	bool victim;
+};
+
+#define MB (1024 * 1024)
+#define OOM_SCORE_ADJ_MIN (-1000)
+#define OOM_SCORE_ADJ_MAX 1000
+
+static struct cgroup_desc cgroups[] = {
+	{ .path = "/oom_test", .max = 80 * MB},
+	{ .path = "/oom_test/cg1", .target = 10 * MB,
+	  .oom_score_adj = OOM_SCORE_ADJ_MAX },
+	{ .path = "/oom_test/cg2", .target = 40 * MB,
+	  .oom_score_adj = OOM_SCORE_ADJ_MIN },
+	{ .path = "/oom_test/cg3" },
+	{ .path = "/oom_test/cg3/cg4", .target = 30 * MB,
+	  .victim = true },
+	{ .path = "/oom_test/cg3/cg5", .target = 20 * MB },
+};
+
+static int spawn_task(struct cgroup_desc *desc)
+{
+	char *ptr;
+	int pid;
+
+	pid = fork();
+	if (pid < 0)
+		return pid;
+
+	if (pid > 0) {
+		/* parent */
+		desc->pid = pid;
+		return 0;
+	}
+
+	/* child */
+	if (desc->oom_score_adj) {
+		char buf[64];
+		int fd = open("/proc/self/oom_score_adj", O_WRONLY);
+
+		if (fd < 0)
+			return -1;
+
+		snprintf(buf, sizeof(buf), "%d", desc->oom_score_adj);
+		write(fd, buf, strlen(buf));
+		close(fd);
+	}
+
+	ptr = (char *)malloc(desc->target);
+	if (!ptr)
+		return -ENOMEM;
+
+	memset(ptr, 'a', desc->target);
+
+	while (1)
+		sleep(1000);
+
+	return 0;
+}
+
+static void setup_environment(void)
+{
+	int i, err;
+
+	err = setup_cgroup_environment();
+	if (!ASSERT_OK(err, "setup_cgroup_environment"))
+		goto cleanup;
+
+	for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+		cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
+		if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
+			goto cleanup;
+
+		cgroups[i].id = get_cgroup_id(cgroups[i].path);
+		if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
+			goto cleanup;
+
+		/* Freeze the top-level cgroup */
+		if (i == 0) {
+			err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
+			if (!ASSERT_OK(err, "freeze cgroup"))
+				goto cleanup;
+		}
+
+		/* Recursively enable the memory controller */
+		if (!cgroups[i].target) {
+			err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
+						"+memory");
+			if (!ASSERT_OK(err, "enable memory controller"))
+				goto cleanup;
+		}
+
+		/* Set memory.max */
+		if (cgroups[i].max) {
+			char buf[256];
+
+			snprintf(buf, sizeof(buf), "%lu", cgroups[i].max);
+			err = write_cgroup_file(cgroups[i].path, "memory.max", buf);
+			if (!ASSERT_OK(err, "set memory.max"))
+				goto cleanup;
+
+			snprintf(buf, sizeof(buf), "0");
+			write_cgroup_file(cgroups[i].path, "memory.swap.max", buf);
+		}
+
+		/* Spawn tasks creating memory pressure */
+		if (cgroups[i].target) {
+			char buf[256];
+
+			err = spawn_task(&cgroups[i]);
+			if (!ASSERT_OK(err, "spawn task"))
+				goto cleanup;
+
+			snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
+			err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
+			if (!ASSERT_OK(err, "put child into a cgroup"))
+				goto cleanup;
+		}
+	}
+
+	return;
+
+cleanup:
+	cleanup_cgroup_environment();
+}
+
+static int run_and_wait_for_oom(void)
+{
+	int ret = -1;
+	bool first = true;
+	char buf[4096] = {};
+	ssize_t size;
+
+	/* Unfreeze the top-level cgroup */
+	ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
+	if (!ASSERT_OK(ret, "unfreeze cgroup"))
+		return -1;
+
+	for (;;) {
+		int i, status;
+		pid_t pid = wait(&status);
+
+		if (pid == -1) {
+			if (errno == EINTR)
+				continue;
+			/* ECHILD */
+			break;
+		}
+
+		if (!first)
+			continue;
+
+		first = false;
+
+		/* Check which process was terminated first */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+			if (!ASSERT_OK(cgroups[i].victim !=
+				       (pid == cgroups[i].pid),
+				       "correct process was killed")) {
+				ret = -1;
+				break;
+			}
+
+			if (!cgroups[i].victim)
+				continue;
+
+			/* Check the memcg oom counter */
+			size = read_cgroup_file(cgroups[i].path,
+						"memory.events",
+						buf, sizeof(buf));
+			if (!ASSERT_OK(size <= 0, "read memory.events")) {
+				ret = -1;
+				break;
+			}
+
+			if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
+				       "oom_kill count check")) {
+				ret = -1;
+				break;
+			}
+		}
+
+		/* Kill all remaining tasks */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++)
+			if (cgroups[i].pid && cgroups[i].pid != pid)
+				kill(cgroups[i].pid, SIGKILL);
+	}
+
+	return ret;
+}
+
+void test_oom(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_struct_ops_opts, opts);
+	struct test_oom *skel;
+	struct bpf_link *link1, *link2;
+	int err = 0;
+
+	setup_environment();
+
+	skel = test_oom__open_and_load();
+	if (!skel) {
+		err = -errno;
+		CHECK_FAIL(err);
+		goto cleanup;
+	}
+
+	opts.relative_fd = cgroups[0].fd;
+	link1 = bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts);
+	if (!link1) {
+		err = -errno;
+		CHECK_FAIL(err);
+		goto cleanup;
+	}
+
+	opts.relative_fd = 0; /* attach system-wide */
+	link2 =
bpf_map__attach_struct_ops_opts(skel->maps.test_bpf_oom, &opts); + if (!link2) { + err = -errno; + CHECK_FAIL(err); + goto cleanup; + } + + /* Unfreeze all child tasks and create the memory pressure */ + err = run_and_wait_for_oom(); + CHECK_FAIL(err); + +cleanup: + cleanup_cgroup_environment(); + test_oom__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_oom.c b/tools/testing/selftests/bpf/progs/test_oom.c new file mode 100644 index 0000000000000..352b522ae584c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_oom.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +#define OOM_SCORE_ADJ_MIN (-1000) + +static bool mem_cgroup_killable(struct mem_cgroup *memcg) +{ + struct task_struct *task; + bool ret = true; + + bpf_for_each(css_task, task, &memcg->css, CSS_TASK_ITER_PROCS) + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + return false; + + return ret; +} + +/* + * Find the largest leaf cgroup (ignoring page cache) without unkillable tasks + * and kill all belonging tasks. + */ +SEC("struct_ops.s/handle_out_of_memory") +int BPF_PROG(test_out_of_memory, struct oom_control *oc, struct bpf_oom_ctx *exec_ctx) +{ + struct task_struct *task; + struct mem_cgroup *root_memcg = oc->memcg; + struct mem_cgroup *memcg, *victim = NULL; + struct cgroup_subsys_state *css_pos; + unsigned long usage, max_usage = 0; + unsigned long pagecache = 0; + int ret = 0; + + /* Pass to the system-level bpf_oom ops */ + if (exec_ctx->cgroup_id) + return 0; + + if (root_memcg) + root_memcg = bpf_get_mem_cgroup(&root_memcg->css); + else + root_memcg = bpf_get_root_mem_cgroup(); + + if (!root_memcg) + return 0; + + bpf_rcu_read_lock(); + bpf_for_each(css, css_pos, &root_memcg->css, BPF_CGROUP_ITER_DESCENDANTS_POST) { + if (css_pos->cgroup->nr_descendants + css_pos->cgroup->nr_dying_descendants) + continue; + + memcg = bpf_get_mem_cgroup(css_pos); + if (!memcg) + continue; + + usage = bpf_mem_cgroup_usage(memcg); + pagecache = bpf_mem_cgroup_page_state(memcg, NR_FILE_PAGES); + + if (usage > pagecache) + usage -= pagecache; + else + usage = 0; + + if ((usage > max_usage) && mem_cgroup_killable(memcg)) { + max_usage = usage; + if (victim) + bpf_put_mem_cgroup(victim); + victim = bpf_get_mem_cgroup(&memcg->css); + } + + bpf_put_mem_cgroup(memcg); + } + bpf_rcu_read_unlock(); + + if (!victim) + goto exit; + + bpf_for_each(css_task, task, &victim->css, CSS_TASK_ITER_PROCS) { + struct task_struct *t = bpf_task_acquire(task); + + if (t) { + /* + * If the task is already an OOM victim, it will + * quit soon and release some memory. 
+			 */
+			if (bpf_task_is_oom_victim(task)) {
+				bpf_task_release(t);
+				ret = 1;
+				break;
+			}
+
+			bpf_oom_kill_process(oc, task, "bpf oom test");
+			bpf_task_release(t);
+			ret = 1;
+		}
+	}
+
+	bpf_put_mem_cgroup(victim);
+exit:
+	bpf_put_mem_cgroup(root_memcg);
+
+	return ret;
+}
+
+SEC("struct_ops.s/handle_cgroup_offline")
+int BPF_PROG(test_cgroup_offline, u64 cgroup_id, struct bpf_oom_ctx *exec_ctx)
+{
+	return 0;
+}
+
+SEC(".struct_ops.link")
+struct bpf_oom_ops test_bpf_oom = {
+	.name = "bpf_test_policy",
+	.handle_out_of_memory = (void *)test_out_of_memory,
+	.handle_cgroup_offline = (void *)test_cgroup_offline,
+};

From 5c683a3ccafd212f8e606bf2e2c2d78dd10ba7b3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 20/28] sched: psi: refactor psi_trigger_create()

Currently psi_trigger_create() does a lot of things: parses the user
text input, allocates and initializes the psi_trigger structure and
turns on the trigger. It does this slightly differently for the two
existing types of psi triggers: system-wide and cgroup-wide.

In order to support a new type of PSI trigger, which will be owned by
a BPF program and won't have a user's text description, let's
refactor psi_trigger_create().

1. Introduce psi_trigger_type enum: currently PSI_SYSTEM and
   PSI_CGROUP are valid values.
2. Introduce psi_trigger_params structure to avoid passing a large
   number of parameters to psi_trigger_create().
3. Move out the user's input parsing into the new psi_trigger_parse()
   helper.
4. Move out the capabilities check into the new psi_file_privileged()
   helper.
5. Stop relying on t->of for detecting trigger type.

This commit is a pure refactoring and doesn't bring any functional
changes.

Signed-off-by: Roman Gushchin
---
 include/linux/psi.h       | 15 +++++--
 include/linux/psi_types.h | 33 ++++++++++++++-
 kernel/cgroup/cgroup.c    | 14 ++++++-
 kernel/sched/psi.c        | 88 +++++++++++++++++++++++++--------------
 4 files changed, 113 insertions(+), 37 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index e0745873e3f26..8178e998d94b9 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -23,14 +23,23 @@ void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
-struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
-				       enum psi_res res, struct file *file,
-				       struct kernfs_open_file *of);
+int psi_trigger_parse(struct psi_trigger_params *params, const char *buf);
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+				       const struct psi_trigger_params *params);
 void psi_trigger_destroy(struct psi_trigger *t);
 
 __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			  poll_table *wait);
 
+static inline bool psi_file_privileged(struct file *file)
+{
+	/*
+	 * Checking the privilege here on file->f_cred implies that a privileged user
+	 * could open the file and delegate the write to an unprivileged one.
+ */ + return cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); +} + #ifdef CONFIG_CGROUPS static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index dd10c22299ab8..aa5ed39592cb3 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -119,7 +119,38 @@ struct psi_window { u64 prev_growth; }; +enum psi_trigger_type { + PSI_SYSTEM, + PSI_CGROUP, +}; + +struct psi_trigger_params { + /* Trigger type */ + enum psi_trigger_type type; + + /* Resource to be monitored */ + enum psi_res res; + + /* True if all threads should be stalled to trigger */ + bool full; + + /* Threshold in us */ + u32 threshold_us; + + /* Window in us */ + u32 window_us; + + /* Privileged triggers are treated differently */ + bool privileged; + + /* Link to kernfs open file, only for PSI_CGROUP */ + struct kernfs_open_file *of; +}; + struct psi_trigger { + /* Trigger type */ + enum psi_trigger_type type; + /* PSI state being monitored by the trigger */ enum psi_states state; @@ -135,7 +166,7 @@ struct psi_trigger { /* Wait queue for polling */ wait_queue_head_t event_wait; - /* Kernfs file for cgroup triggers */ + /* Kernfs file for PSI_CGROUP triggers */ struct kernfs_open_file *of; /* Pending event flag */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e3..836b28676abcb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4000,6 +4000,12 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; + struct psi_trigger_params params; + int err; + + err = psi_trigger_parse(¶ms, buf); + if (err) + return err; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -4015,7 +4021,13 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res, of->file, of); + + params.type = PSI_CGROUP; + params.res = res; + params.privileged = psi_file_privileged(of->file); + params.of = of; + + new = psi_trigger_create(psi, ¶ms); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 59fdb7ebbf22a..73fdc79b56022 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -511,7 +511,7 @@ static void update_triggers(struct psi_group *group, u64 now, /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) { - if (t->of) + if (t->type == PSI_CGROUP) kernfs_notify(t->of->kn); else wake_up_interruptible(&t->event_wait); @@ -1292,74 +1292,88 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, - enum psi_res res, struct file *file, - struct kernfs_open_file *of) +int psi_trigger_parse(struct psi_trigger_params *params, const char *buf) { - struct psi_trigger *t; - enum psi_states state; - u32 threshold_us; - bool privileged; - u32 window_us; + u32 threshold_us, window_us; if (static_branch_likely(&psi_disabled)) - return ERR_PTR(-EOPNOTSUPP); - - /* - * Checking the privilege here on file->f_cred implies that a privileged user - * could open the file and delegate the write to an unprivileged one. 
- */ - privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE); + return -EOPNOTSUPP; if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) - state = PSI_IO_SOME + res * 2; + params->full = false; else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) - state = PSI_IO_FULL + res * 2; + params->full = true; else - return ERR_PTR(-EINVAL); + return -EINVAL; + + params->threshold_us = threshold_us; + params->window_us = window_us; + return 0; +} + +struct psi_trigger *psi_trigger_create(struct psi_group *group, + const struct psi_trigger_params *params) +{ + struct psi_trigger *t; + enum psi_states state; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + state = params->full ? PSI_IO_FULL : PSI_IO_SOME; + state += params->res * 2; #ifdef CONFIG_IRQ_TIME_ACCOUNTING - if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + if (params->res == PSI_IRQ && --state != PSI_IRQ_FULL) return ERR_PTR(-EINVAL); #endif if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); - if (window_us == 0 || window_us > WINDOW_MAX_US) + if (params->window_us == 0 || params->window_us > WINDOW_MAX_US) return ERR_PTR(-EINVAL); /* * Unprivileged users can only use 2s windows so that averages aggregation * work is used, and no RT threads need to be spawned. */ - if (!privileged && window_us % 2000000) + if (!params->privileged && params->window_us % 2000000) return ERR_PTR(-EINVAL); /* Check threshold */ - if (threshold_us == 0 || threshold_us > window_us) + if (params->threshold_us == 0 || params->threshold_us > params->window_us) return ERR_PTR(-EINVAL); t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) return ERR_PTR(-ENOMEM); + t->type = params->type; t->group = group; t->state = state; - t->threshold = threshold_us * NSEC_PER_USEC; - t->win.size = window_us * NSEC_PER_USEC; + t->threshold = params->threshold_us * NSEC_PER_USEC; + t->win.size = params->window_us * NSEC_PER_USEC; window_reset(&t->win, sched_clock(), group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; - t->of = of; - if (!of) + + switch (params->type) { + case PSI_SYSTEM: init_waitqueue_head(&t->event_wait); + t->of = NULL; + break; + case PSI_CGROUP: + t->of = params->of; + break; + } + t->pending_event = false; - t->aggregator = privileged ? PSI_POLL : PSI_AVGS; + t->aggregator = params->privileged ? PSI_POLL : PSI_AVGS; - if (privileged) { + if (params->privileged) { mutex_lock(&group->rtpoll_trigger_lock); if (!rcu_access_pointer(group->rtpoll_task)) { @@ -1412,7 +1426,7 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. 
 */
-	if (t->of)
+	if (t->type == PSI_CGROUP)
 		kernfs_notify(t->of->kn);
 	else
 		wake_up_interruptible(&t->event_wait);
@@ -1492,7 +1506,7 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	if (!t)
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-	if (t->of)
+	if (t->type == PSI_CGROUP)
 		kernfs_generic_poll(t->of, wait);
 	else
 		poll_wait(file, &t->event_wait, wait);
@@ -1541,6 +1555,8 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 	size_t buf_size;
 	struct seq_file *seq;
 	struct psi_trigger *new;
+	struct psi_trigger_params params;
+	int err;
 
 	if (static_branch_likely(&psi_disabled))
 		return -EOPNOTSUPP;
@@ -1554,6 +1570,10 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 
 	buf[buf_size - 1] = '\0';
 
+	err = psi_trigger_parse(&params, buf);
+	if (err)
+		return err;
+
 	seq = file->private_data;
 
 	/* Take seq->lock to protect seq->private from concurrent writes */
@@ -1565,7 +1585,11 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
 		return -EBUSY;
 	}
 
-	new = psi_trigger_create(&psi_system, buf, res, file, NULL);
+	params.type = PSI_SYSTEM;
+	params.res = res;
+	params.privileged = psi_file_privileged(file);
+
+	new = psi_trigger_create(&psi_system, &params);
 	if (IS_ERR(new)) {
 		mutex_unlock(&seq->lock);
 		return PTR_ERR(new);

From 41aecb3ed363446c3b31f5ca0cdfd6f9506bd823 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 21/28] sched: psi: implement bpf_psi struct ops

This patch implements a BPF struct ops-based mechanism to create PSI
triggers, attach them to cgroups or system-wide and handle PSI events
in BPF.

The struct ops provides 4 callbacks:
  - init() called once at load, handy for creating PSI triggers
  - handle_psi_event() called every time a PSI trigger fires
  - handle_cgroup_online() called when a new cgroup is created
  - handle_cgroup_offline() called if a cgroup with an attached
    trigger is deleted

A single struct ops can create a number of PSI triggers, both
cgroup-scoped and system-wide.

All 4 struct ops callbacks can be sleepable. handle_psi_event()
handlers are executed using a separate workqueue, so they won't
affect the latency of other PSI triggers.

Signed-off-by: Roman Gushchin
---
 include/linux/bpf_psi.h      |  87 ++++++++++
 include/linux/psi_types.h    |  43 ++++-
 kernel/bpf/cgroup.c          |   3 +
 kernel/sched/bpf_psi.c       | 298 +++++++++++++++++++++++++++++++++++
 kernel/sched/build_utility.c |   4 +
 kernel/sched/psi.c           |  48 ++++--
 mm/oom_kill.c                |   3 +
 7 files changed, 474 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/bpf_psi.h
 create mode 100644 kernel/sched/bpf_psi.c

diff --git a/include/linux/bpf_psi.h b/include/linux/bpf_psi.h
new file mode 100644
index 0000000000000..df00778e474ee
--- /dev/null
+++ b/include/linux/bpf_psi.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+#ifndef __BPF_PSI_H
+#define __BPF_PSI_H
+
+#include 
+#include 
+#include 
+#include 
+
+struct cgroup;
+struct bpf_psi;
+struct psi_trigger;
+struct psi_trigger_params;
+
+#define BPF_PSI_FULL 0x80000000
+
+struct bpf_psi_ops {
+	/**
+	 * @init: Initialization callback, suited for creating psi triggers.
+	 * @bpf_psi: bpf_psi pointer, can be passed to bpf_psi_create_trigger().
+	 *
+	 * A non-0 return value means the initialization has failed.
+ */ + int (*init)(struct bpf_psi *bpf_psi); + + /** + * @handle_psi_event: PSI event callback + * @t: psi_trigger pointer + */ + void (*handle_psi_event)(struct psi_trigger *t); + + /** + * @handle_cgroup_online: Cgroup online callback + * @cgroup_id: Id of the new cgroup + * + * Called every time a new cgroup is created. Can be used + * to create new psi triggers. + */ + void (*handle_cgroup_online)(u64 cgroup_id); + + /** + * @handle_cgroup_offline: Cgroup offline callback + * @cgroup_id: Id of offlined cgroup + * + * Called every time a cgroup with an attached bpf psi trigger is + * offlined. + */ + void (*handle_cgroup_offline)(u64 cgroup_id); + + /* private */ + struct bpf_psi *bpf_psi; +}; + +struct bpf_psi { + spinlock_t lock; + struct list_head triggers; + struct bpf_psi_ops *ops; + struct srcu_struct srcu; + struct list_head node; /* Protected by bpf_psi_lock */ +}; + +#ifdef CONFIG_BPF_SYSCALL +void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params); +void bpf_psi_remove_trigger(struct psi_trigger *t); +void bpf_psi_handle_event(struct psi_trigger *t); + +#else /* CONFIG_BPF_SYSCALL */ +static inline void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params) {} +static inline void bpf_psi_remove_trigger(struct psi_trigger *t) {} +static inline void bpf_psi_handle_event(struct psi_trigger *t) {} + +#endif /* CONFIG_BPF_SYSCALL */ + +#if (defined(CONFIG_CGROUPS) && defined(CONFIG_PSI) && defined(CONFIG_BPF_SYSCALL)) +void bpf_psi_cgroup_online(struct cgroup *cgroup); +void bpf_psi_cgroup_offline(struct cgroup *cgroup); + +#else /* CONFIG_CGROUPS && CONFIG_PSI && CONFIG_BPF_SYSCALL */ +static inline void bpf_psi_cgroup_online(struct cgroup *cgroup) {} +static inline void bpf_psi_cgroup_offline(struct cgroup *cgroup) {} + +#endif /* CONFIG_CGROUPS && CONFIG_PSI && CONFIG_BPF_SYSCALL */ + +#endif /* __BPF_PSI_H */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index aa5ed39592cb3..e551df9d6336c 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -122,6 +122,7 @@ struct psi_window { enum psi_trigger_type { PSI_SYSTEM, PSI_CGROUP, + PSI_BPF, }; struct psi_trigger_params { @@ -143,8 +144,15 @@ struct psi_trigger_params { /* Privileged triggers are treated differently */ bool privileged; - /* Link to kernfs open file, only for PSI_CGROUP */ - struct kernfs_open_file *of; + union { + /* Link to kernfs open file, only for PSI_CGROUP */ + struct kernfs_open_file *of; + +#ifdef CONFIG_BPF_SYSCALL + /* Link to bpf_psi structure, only for BPF_PSI */ + struct bpf_psi *bpf_psi; +#endif + }; }; struct psi_trigger { @@ -186,6 +194,31 @@ struct psi_trigger { /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */ enum psi_aggregators aggregator; + +#ifdef CONFIG_BPF_SYSCALL + /* Fields specific to PSI_BPF triggers */ + + /* Bpf psi structure for events handling */ + struct bpf_psi *bpf_psi; + + /* List node inside bpf_psi->triggers list */ + struct list_head bpf_psi_node; + + /* List node inside group->bpf_triggers list */ + struct list_head bpf_group_node; + + /* Work structure, used to execute event handlers */ + struct work_struct bpf_work; + + /* + * Whether the trigger is being pinned in memory. + * Protected by group->bpf_triggers_lock. 
+ */ + bool pinned; + + /* Cgroup Id */ + u64 cgroup_id; +#endif }; struct psi_group { @@ -234,6 +267,12 @@ struct psi_group { u64 rtpoll_total[NR_PSI_STATES - 1]; u64 rtpoll_next_update; u64 rtpoll_until; + +#ifdef CONFIG_BPF_SYSCALL + /* List of triggers owned by bpf and corresponding lock */ + spinlock_t bpf_triggers_lock; + struct list_head bpf_triggers; +#endif }; #else /* CONFIG_PSI */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 248f517d66d04..4df4c49ba1793 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -557,9 +558,11 @@ static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, switch (action) { case CGROUP_LIFETIME_ONLINE: + bpf_psi_cgroup_online(cgrp); ret = cgroup_bpf_inherit(cgrp); break; case CGROUP_LIFETIME_OFFLINE: + bpf_psi_cgroup_offline(cgrp); cgroup_bpf_offline(cgrp); break; } diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c new file mode 100644 index 0000000000000..527761d31717c --- /dev/null +++ b/kernel/sched/bpf_psi.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BPF PSI event handlers + * + * Author: Roman Gushchin + */ + +#include +#include + +static struct workqueue_struct *bpf_psi_wq; + +static DEFINE_MUTEX(bpf_psi_lock); +static LIST_HEAD(bpf_psi_notify_list); +static DEFINE_STATIC_KEY_FALSE(bpf_psi_notify_key); + +static struct bpf_psi *bpf_psi_create(struct bpf_psi_ops *ops) +{ + struct bpf_psi *bpf_psi; + + bpf_psi = kzalloc(sizeof(*bpf_psi), GFP_KERNEL); + if (!bpf_psi) + return NULL; + + if (init_srcu_struct(&bpf_psi->srcu)) { + kfree(bpf_psi); + return NULL; + } + + spin_lock_init(&bpf_psi->lock); + bpf_psi->ops = ops; + INIT_LIST_HEAD(&bpf_psi->triggers); + ops->bpf_psi = bpf_psi; + + if (ops->handle_cgroup_online) { + mutex_lock(&bpf_psi_lock); + list_add(&bpf_psi->node, &bpf_psi_notify_list); + mutex_unlock(&bpf_psi_lock); + static_branch_inc(&bpf_psi_notify_key); + } else { + INIT_LIST_HEAD(&bpf_psi->node); + } + + return bpf_psi; +} + +static void bpf_psi_handle_event_fn(struct work_struct *work) +{ + struct psi_trigger *t; + struct bpf_psi *bpf_psi; + int idx; + + t = container_of(work, struct psi_trigger, bpf_work); + bpf_psi = READ_ONCE(t->bpf_psi); + + if (likely(bpf_psi)) { + idx = srcu_read_lock(&bpf_psi->srcu); + bpf_psi->ops->handle_psi_event(t); + srcu_read_unlock(&bpf_psi->srcu, idx); + } +} + +void bpf_psi_add_trigger(struct psi_trigger *t, + const struct psi_trigger_params *params) +{ + t->bpf_psi = params->bpf_psi; + t->pinned = false; + INIT_WORK(&t->bpf_work, bpf_psi_handle_event_fn); + + spin_lock(&t->bpf_psi->lock); + list_add(&t->bpf_psi_node, &t->bpf_psi->triggers); + spin_unlock(&t->bpf_psi->lock); + + spin_lock(&t->group->bpf_triggers_lock); + list_add(&t->bpf_group_node, &t->group->bpf_triggers); + spin_unlock(&t->group->bpf_triggers_lock); +} + +void bpf_psi_remove_trigger(struct psi_trigger *t) +{ + spin_lock(&t->group->bpf_triggers_lock); + list_del(&t->bpf_group_node); + spin_unlock(&t->group->bpf_triggers_lock); + + spin_lock(&t->bpf_psi->lock); + list_del(&t->bpf_psi_node); + spin_unlock(&t->bpf_psi->lock); +} + +#ifdef CONFIG_CGROUPS +void bpf_psi_cgroup_online(struct cgroup *cgroup) +{ + struct bpf_psi *bpf_psi; + int idx; + + if (!static_branch_likely(&bpf_psi_notify_key)) + return; + + mutex_lock(&bpf_psi_lock); + list_for_each_entry(bpf_psi, &bpf_psi_notify_list, node) { + idx = srcu_read_lock(&bpf_psi->srcu); + if (bpf_psi->ops->handle_cgroup_online) + 
bpf_psi->ops->handle_cgroup_online(cgroup_id(cgroup)); + srcu_read_unlock(&bpf_psi->srcu, idx); + } + mutex_unlock(&bpf_psi_lock); +} + +void bpf_psi_cgroup_offline(struct cgroup *cgroup) +{ + struct psi_group *group = cgroup->psi; + u64 cgrp_id = cgroup_id(cgroup); + struct psi_trigger *t, *p; + struct bpf_psi *bpf_psi; + LIST_HEAD(to_destroy); + int idx; + + spin_lock(&group->bpf_triggers_lock); + list_for_each_entry_safe(t, p, &group->bpf_triggers, bpf_group_node) { + if (!t->pinned) { + t->pinned = true; + list_move(&t->bpf_group_node, &to_destroy); + } + } + spin_unlock(&group->bpf_triggers_lock); + + list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node) { + bpf_psi = READ_ONCE(t->bpf_psi); + + idx = srcu_read_lock(&bpf_psi->srcu); + if (bpf_psi->ops->handle_cgroup_offline) + bpf_psi->ops->handle_cgroup_offline(cgrp_id); + srcu_read_unlock(&bpf_psi->srcu, idx); + + spin_lock(&bpf_psi->lock); + list_del(&t->bpf_psi_node); + spin_unlock(&bpf_psi->lock); + + WRITE_ONCE(t->bpf_psi, NULL); + flush_workqueue(bpf_psi_wq); + synchronize_srcu(&bpf_psi->srcu); + psi_trigger_destroy(t); + } +} +#endif + +void bpf_psi_handle_event(struct psi_trigger *t) +{ + queue_work(bpf_psi_wq, &t->bpf_work); +} + +/* BPF struct ops */ + +static int __bpf_psi_init(struct bpf_psi *bpf_psi) { return 0; } +static void __bpf_psi_handle_psi_event(struct psi_trigger *t) {} +static void __bpf_psi_handle_cgroup_online(u64 cgroup_id) {} +static void __bpf_psi_handle_cgroup_offline(u64 cgroup_id) {} + +static struct bpf_psi_ops __bpf_psi_ops = { + .init = __bpf_psi_init, + .handle_psi_event = __bpf_psi_handle_psi_event, + .handle_cgroup_online = __bpf_psi_handle_cgroup_online, + .handle_cgroup_offline = __bpf_psi_handle_cgroup_offline, +}; + +static const struct bpf_func_proto * +bpf_psi_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return tracing_prog_func_proto(func_id, prog); +} + +static bool bpf_psi_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_verifier_ops bpf_psi_verifier_ops = { + .get_func_proto = bpf_psi_func_proto, + .is_valid_access = bpf_psi_ops_is_valid_access, +}; + +static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link) +{ + struct bpf_psi_ops *ops = kdata; + struct bpf_psi *bpf_psi; + + bpf_psi = bpf_psi_create(ops); + if (!bpf_psi) + return -ENOMEM; + + return ops->init(bpf_psi); +} + +static void bpf_psi_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct bpf_psi_ops *ops = kdata; + struct bpf_psi *bpf_psi = ops->bpf_psi; + struct psi_trigger *t, *p; + LIST_HEAD(to_destroy); + + spin_lock(&bpf_psi->lock); + list_for_each_entry_safe(t, p, &bpf_psi->triggers, bpf_psi_node) { + spin_lock(&t->group->bpf_triggers_lock); + if (!t->pinned) { + t->pinned = true; + list_move(&t->bpf_group_node, &to_destroy); + list_del(&t->bpf_psi_node); + + WRITE_ONCE(t->bpf_psi, NULL); + } + spin_unlock(&t->group->bpf_triggers_lock); + } + spin_unlock(&bpf_psi->lock); + + flush_workqueue(bpf_psi_wq); + synchronize_srcu(&bpf_psi->srcu); + + list_for_each_entry_safe(t, p, &to_destroy, bpf_group_node) + psi_trigger_destroy(t); + + if (!list_empty(&bpf_psi->node)) { + mutex_lock(&bpf_psi_lock); + list_del(&bpf_psi->node); + mutex_unlock(&bpf_psi_lock); + static_branch_dec(&bpf_psi_notify_key); + } + + cleanup_srcu_struct(&bpf_psi->srcu); + kfree(bpf_psi); +} + +static int 
bpf_psi_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct bpf_psi_ops, init): + fallthrough; + case offsetof(struct bpf_psi_ops, handle_psi_event): + if (!prog) + return -EINVAL; + break; + } + + return 0; +} + +static int bpf_psi_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return 0; +} + +static int bpf_psi_ops_init(struct btf *btf) +{ + return 0; +} + +struct bpf_struct_ops bpf_psi_bpf_ops = { + .verifier_ops = &bpf_psi_verifier_ops, + .reg = bpf_psi_ops_reg, + .unreg = bpf_psi_ops_unreg, + .check_member = bpf_psi_ops_check_member, + .init_member = bpf_psi_ops_init_member, + .init = bpf_psi_ops_init, + .name = "bpf_psi_ops", + .owner = THIS_MODULE, + .cfi_stubs = &__bpf_psi_ops +}; + +static int __init bpf_psi_struct_ops_init(void) +{ + int wq_flags = WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI; + int err; + + bpf_psi_wq = alloc_workqueue("bpf_psi_wq", wq_flags, 0); + if (!bpf_psi_wq) + return -ENOMEM; + + err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops); + if (err) { + pr_warn("error while registering bpf psi struct ops: %d", err); + goto err; + } + + return 0; + +err: + destroy_workqueue(bpf_psi_wq); + return err; +} +late_initcall(bpf_psi_struct_ops_init); diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index e2cf3b08d4e95..1f90781781a12 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -91,6 +92,9 @@ #ifdef CONFIG_PSI # include "psi.c" +# ifdef CONFIG_BPF_SYSCALL +# include "bpf_psi.c" +# endif #endif #ifdef CONFIG_MEMBARRIER diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 73fdc79b56022..26de772750e82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -223,6 +223,10 @@ static void group_init(struct psi_group *group) init_waitqueue_head(&group->rtpoll_wait); timer_setup(&group->rtpoll_timer, poll_timer_fn, 0); rcu_assign_pointer(group->rtpoll_task, NULL); +#ifdef CONFIG_BPF_SYSCALL + spin_lock_init(&group->bpf_triggers_lock); + INIT_LIST_HEAD(&group->bpf_triggers); +#endif } void __init psi_init(void) @@ -511,10 +515,17 @@ static void update_triggers(struct psi_group *group, u64 now, /* Generate an event */ if (cmpxchg(&t->event, 0, 1) == 0) { - if (t->type == PSI_CGROUP) - kernfs_notify(t->of->kn); - else + switch (t->type) { + case PSI_SYSTEM: wake_up_interruptible(&t->event_wait); + break; + case PSI_CGROUP: + kernfs_notify(t->of->kn); + break; + case PSI_BPF: + bpf_psi_handle_event(t); + break; + } } t->last_event_time = now; /* Reset threshold breach flag once event got generated */ @@ -1368,6 +1379,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, case PSI_CGROUP: t->of = params->of; break; + case PSI_BPF: + bpf_psi_add_trigger(t, params); + break; } t->pending_event = false; @@ -1381,8 +1395,10 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, task = kthread_create(psi_rtpoll_worker, group, "psimon"); if (IS_ERR(task)) { - kfree(t); mutex_unlock(&group->rtpoll_trigger_lock); + if (t->type == PSI_BPF) + bpf_psi_remove_trigger(t); + kfree(t); return ERR_CAST(task); } atomic_set(&group->rtpoll_wakeup, 0); @@ -1426,10 +1442,16 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. 
 */
-	if (t->type == PSI_CGROUP)
-		kernfs_notify(t->of->kn);
-	else
+	switch (t->type) {
+	case PSI_SYSTEM:
 		wake_up_interruptible(&t->event_wait);
+		break;
+	case PSI_CGROUP:
+		kernfs_notify(t->of->kn);
+		break;
+	case PSI_BPF:
+		break;
+	}
 
 	if (t->aggregator == PSI_AVGS) {
 		mutex_lock(&group->avgs_lock);
@@ -1506,10 +1528,16 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
 	if (!t)
 		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
 
-	if (t->type == PSI_CGROUP)
-		kernfs_generic_poll(t->of, wait);
-	else
+	switch (t->type) {
+	case PSI_SYSTEM:
 		poll_wait(file, &t->event_wait, wait);
+		break;
+	case PSI_CGROUP:
+		kernfs_generic_poll(t->of, wait);
+		break;
+	case PSI_BPF:
+		break;
+	}
 
 	if (cmpxchg(&t->event, 1, 0) == 1)
 		ret |= EPOLLPRI;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 90bb86dee3cf2..65a3b4c1fc725 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1429,6 +1429,9 @@ static int bpf_oom_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
 	if (!btf_id_set_contains(&bpf_oom_declare_oom_kfuncs, kfunc_id))
 		return 0;
 
+	if (IS_ENABLED(CONFIG_PSI) && prog->aux->st_ops == &bpf_psi_bpf_ops)
+		return 0;
+
 	return -EACCES;
 }
 

From 25cc885da28aee1e06c5b5ea98461b412e8b5bf1 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 22/28] sched: psi: implement bpf_psi_create_trigger() kfunc

Implement a new bpf_psi_create_trigger() BPF kfunc, which allows
creating new PSI triggers and attaching them either to cgroups or
system-wide.

Created triggers will exist until the struct ops is unloaded and, if
they are attached to a cgroup, as long as the cgroup exists.

Due to the limitation of 5 kfunc arguments, the resource type and the
"full" bit are squeezed into a single u32.

Signed-off-by: Roman Gushchin
---
 include/linux/cgroup.h |  4 ++
 include/linux/psi.h    |  6 +++
 kernel/sched/bpf_psi.c | 94 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6ed477338b166..1a99da44999ed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -707,6 +707,10 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 {}
+
+static inline struct cgroup *cgroup_get_from_id(u64 id)
+{
+	return NULL;
+}
 #endif /* !CONFIG_CGROUPS */
 
 #ifdef CONFIG_CGROUPS
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 8178e998d94b9..8ffe84cd8571a 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -50,6 +50,12 @@ int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
 void psi_cgroup_restart(struct psi_group *group);
+
+#else
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+	return &psi_system;
+}
 #endif
 
 #else /* CONFIG_PSI */
diff --git a/kernel/sched/bpf_psi.c b/kernel/sched/bpf_psi.c
index 527761d31717c..952c7bd3ff3d2 100644
--- a/kernel/sched/bpf_psi.c
+++ b/kernel/sched/bpf_psi.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 
+struct bpf_struct_ops bpf_psi_bpf_ops;
 static struct workqueue_struct *bpf_psi_wq;
 
 static DEFINE_MUTEX(bpf_psi_lock);
@@ -182,6 +183,92 @@ static const struct bpf_verifier_ops bpf_psi_verifier_ops = {
 	.is_valid_access = bpf_psi_ops_is_valid_access,
 };
 
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_psi_create_trigger - Create a PSI trigger
+ * @bpf_psi: bpf_psi struct to attach the trigger to
+ * @cgroup_id: cgroup Id to attach the trigger; 0 for system-wide scope
+ * @resource: resource to monitor (PSI_MEM, PSI_IO, etc) and the full bit
+ * @threshold_us: threshold in us
+ * @window_us: window in us
+ *
+ * Creates a PSI trigger and attaches it to bpf_psi. The trigger will
+ * stay active until the bpf struct ops is unloaded or the corresponding
+ * cgroup is deleted.
+ *
+ * Resource's most significant bit encodes whether "some" or "full"
+ * PSI state should be tracked.
+ *
+ * Returns 0 on success and the error code on failure.
+ */
+__bpf_kfunc int bpf_psi_create_trigger(struct bpf_psi *bpf_psi,
+				       u64 cgroup_id, u32 resource,
+				       u32 threshold_us, u32 window_us)
+{
+	enum psi_res res = resource & ~BPF_PSI_FULL;
+	bool full = resource & BPF_PSI_FULL;
+	struct psi_trigger_params params;
+	struct cgroup *cgroup __maybe_unused = NULL;
+	struct psi_group *group;
+	struct psi_trigger *t;
+	int ret = 0;
+
+	if (res >= NR_PSI_RESOURCES)
+		return -EINVAL;
+
+	if (IS_ENABLED(CONFIG_CGROUPS) && cgroup_id) {
+		cgroup = cgroup_get_from_id(cgroup_id);
+		if (IS_ERR_OR_NULL(cgroup))
+			return PTR_ERR(cgroup);
+
+		group = cgroup_psi(cgroup);
+	} else {
+		group = &psi_system;
+	}
+
+	params.type = PSI_BPF;
+	params.bpf_psi = bpf_psi;
+	params.privileged = capable(CAP_SYS_RESOURCE);
+	params.res = res;
+	params.full = full;
+	params.threshold_us = threshold_us;
+	params.window_us = window_us;
+
+	t = psi_trigger_create(group, &params);
+	if (IS_ERR(t))
+		ret = PTR_ERR(t);
+	else
+		t->cgroup_id = cgroup_id;
+
+#ifdef CONFIG_CGROUPS
+	if (cgroup)
+		cgroup_put(cgroup);
+#endif
+
+	return ret;
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_psi_kfuncs)
+BTF_ID_FLAGS(func, bpf_psi_create_trigger, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_psi_kfuncs)
+
+static int bpf_psi_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+	if (btf_id_set8_contains(&bpf_psi_kfuncs, kfunc_id) &&
+	    prog->aux->st_ops != &bpf_psi_bpf_ops)
+		return -EACCES;
+
+	return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_psi_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &bpf_psi_kfuncs,
+	.filter = bpf_psi_kfunc_filter,
+};
+
 static int bpf_psi_ops_reg(void *kdata, struct bpf_link *link)
 {
 	struct bpf_psi_ops *ops = kdata;
@@ -283,6 +370,13 @@ static int __init bpf_psi_struct_ops_init(void)
 	if (!bpf_psi_wq)
 		return -ENOMEM;
 
+	err = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+					&bpf_psi_kfunc_set);
+	if (err) {
+		pr_warn("error while registering bpf psi kfuncs: %d\n", err);
+		goto err;
+	}
+
 	err = register_bpf_struct_ops(&bpf_psi_bpf_ops, bpf_psi_ops);
 	if (err) {
 		pr_warn("error while registering bpf psi struct ops: %d", err);

From 194ef7577149bb29fcf3275cc48daca598608313 Mon Sep 17 00:00:00 2001
From: Roman Gushchin
Date: Mon, 11 Aug 2025 18:05:44 -0700
Subject: [PATCH 23/28] bpf: selftests: PSI struct ops test

Add a PSI struct ops test.

The test creates a cgroup with two child sub-cgroups, sets up
memory.high for one of those and puts a memory hungry process there
(initially frozen). Then it creates 2 PSI triggers from within the
init() BPF callback and attaches them to these cgroups. Then it
deletes the first cgroup and runs the memory hungry task. The task
creates high memory pressure, which triggers the PSI event. The PSI
BPF handler declares a memcg OOM in the corresponding cgroup.

Finally it checks that both handle_cgroup_offline() and
handle_psi_event() handlers were executed, the correct process was
killed and the oom counters were updated.
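The BPF side creates its triggers from the init() callback; a
condensed sketch of the program added below (PSI_FULL mirrors the
kernel's BPF_PSI_FULL bit):

	SEC("struct_ops.s/init")
	int BPF_PROG(psi_init, struct bpf_psi *bpf_psi)
	{
		/* "full" memory pressure: 100ms stall within a 1s window */
		return bpf_psi_create_trigger(bpf_psi, high_pressure_cgroup_id,
					      PSI_MEM | PSI_FULL, 100000, 1000000);
	}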
Signed-off-by: Roman Gushchin --- .../selftests/bpf/prog_tests/test_psi.c | 238 ++++++++++++++++++ tools/testing/selftests/bpf/progs/test_psi.c | 80 ++++++ 2 files changed, 318 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/test_psi.c create mode 100644 tools/testing/selftests/bpf/progs/test_psi.c diff --git a/tools/testing/selftests/bpf/prog_tests/test_psi.c b/tools/testing/selftests/bpf/prog_tests/test_psi.c new file mode 100644 index 0000000000000..b294cea0a6fe2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/test_psi.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include + +#include "cgroup_helpers.h" +#include "test_psi.skel.h" + +enum psi_res { + PSI_IO, + PSI_MEM, + PSI_CPU, + PSI_IRQ, + NR_PSI_RESOURCES, +}; + +struct cgroup_desc { + const char *path; + unsigned long long id; + int pid; + int fd; + size_t target; + size_t high; + bool victim; +}; + +#define MB (1024 * 1024) + +static struct cgroup_desc cgroups[] = { + { .path = "/psi_test" }, + { .path = "/psi_test/cg1" }, + { .path = "/psi_test/cg2", .target = 500 * MB, + .high = 40 * MB, .victim = true }, +}; + +static int spawn_task(struct cgroup_desc *desc) +{ + char *ptr; + int pid; + + pid = fork(); + if (pid < 0) + return pid; + + if (pid > 0) { + /* parent */ + desc->pid = pid; + return 0; + } + + /* child */ + ptr = (char *)malloc(desc->target); + if (!ptr) + return -ENOMEM; + + memset(ptr, 'a', desc->target); + + while (1) + sleep(1000); + + return 0; +} + +static void setup_environment(void) +{ + int i, err; + + err = setup_cgroup_environment(); + if (!ASSERT_OK(err, "setup_cgroup_environment")) + goto cleanup; + + for (i = 0; i < ARRAY_SIZE(cgroups); i++) { + cgroups[i].fd = create_and_get_cgroup(cgroups[i].path); + if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup")) + goto cleanup; + + cgroups[i].id = get_cgroup_id(cgroups[i].path); + if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id")) + goto cleanup; + + /* Freeze the top-level cgroup and enable the memory controller */ + if (i == 0) { + err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1"); + if (!ASSERT_OK(err, "freeze cgroup")) + goto cleanup; + + err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control", + "+memory"); + if (!ASSERT_OK(err, "enable memory controller")) + goto cleanup; + } + + /* Set memory.high */ + if (cgroups[i].high) { + char buf[256]; + + snprintf(buf, sizeof(buf), "%lu", cgroups[i].high); + err = write_cgroup_file(cgroups[i].path, "memory.high", buf); + if (!ASSERT_OK(err, "set memory.high")) + goto cleanup; + + snprintf(buf, sizeof(buf), "0"); + write_cgroup_file(cgroups[i].path, "memory.swap.max", buf); + } + + /* Spawn tasks creating memory pressure */ + if (cgroups[i].target) { + char buf[256]; + + err = spawn_task(&cgroups[i]); + if (!ASSERT_OK(err, "spawn task")) + goto cleanup; + + snprintf(buf, sizeof(buf), "%d", cgroups[i].pid); + err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf); + if (!ASSERT_OK(err, "put child into a cgroup")) + goto cleanup; + } + } + + return; + +cleanup: + cleanup_cgroup_environment(); +} + +static int run_and_wait_for_oom(void) +{ + int ret = -1; + bool first = true; + char buf[4096] = {}; + size_t size; + + /* Unfreeze the top-level cgroup */ + ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0"); + if (!ASSERT_OK(ret, "unfreeze cgroup")) + return -1; + + for (;;) { + int i, status; + pid_t pid = wait(&status); + + if (pid == -1) { + if (errno == EINTR) + continue; + /* ECHILD */ + break; + } + 
+		if (!first)
+			continue;
+		first = false;
+
+		/* Check that the intended victim was terminated first */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
+			if (!ASSERT_EQ(cgroups[i].victim,
+				       pid == cgroups[i].pid,
+				       "correct process was killed")) {
+				ret = -1;
+				break;
+			}
+
+			if (!cgroups[i].victim)
+				continue;
+
+			/* Check the memcg oom_kill counter */
+			size = read_cgroup_file(cgroups[i].path, "memory.events",
+						buf, sizeof(buf));
+			if (!ASSERT_GT(size, 0, "read memory.events")) {
+				ret = -1;
+				break;
+			}
+
+			if (!ASSERT_NEQ(strstr(buf, "oom_kill 1"), NULL,
+					"oom_kill count check")) {
+				ret = -1;
+				break;
+			}
+		}
+
+		/* Kill all remaining tasks */
+		for (i = 0; i < ARRAY_SIZE(cgroups); i++)
+			if (cgroups[i].pid && cgroups[i].pid != pid)
+				kill(cgroups[i].pid, SIGKILL);
+	}
+
+	return ret;
+}
+
+void test_psi(void)
+{
+	struct test_psi *skel;
+	__u64 deleted_cgroup_id;
+	int new_cgroup_fd;
+	__u64 new_cgroup_id;
+	int err;
+
+	setup_environment();
+
+	skel = test_psi__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "test_psi__open_and_load"))
+		goto cleanup;
+
+	skel->bss->deleted_cgroup_id = cgroups[1].id;
+	skel->bss->high_pressure_cgroup_id = cgroups[2].id;
+
+	err = test_psi__attach(skel);
+	if (!ASSERT_OK(err, "test_psi__attach"))
+		goto cleanup;
+
+	/* Delete the first child cgroup, triggering handle_cgroup_offline() */
+	remove_cgroup(cgroups[1].path);
+
+	new_cgroup_fd = create_and_get_cgroup("/psi_test_new");
+	if (!ASSERT_GE(new_cgroup_fd, 0, "create_and_get_cgroup"))
+		goto cleanup;
+
+	new_cgroup_id = get_cgroup_id("/psi_test_new");
+	if (!ASSERT_GT(new_cgroup_id, 0, "get_cgroup_id"))
+		goto cleanup;
+
+	/* Unfreeze all child tasks and create the memory pressure */
+	err = run_and_wait_for_oom();
+	ASSERT_OK(err, "run_and_wait_for_oom");
+
+	/* Check the result of the handle_cgroup_offline() handler */
+	deleted_cgroup_id = skel->bss->deleted_cgroup_id;
+	ASSERT_EQ(deleted_cgroup_id, cgroups[1].id, "deleted cgroup id");
+
+	/* Check the result of the handle_cgroup_online() handler */
+	ASSERT_EQ(skel->bss->new_cgroup_id, new_cgroup_id,
+		  "new cgroup id");
+
+cleanup:
+	cleanup_cgroup_environment();
+	test_psi__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_psi.c b/tools/testing/selftests/bpf/progs/test_psi.c
new file mode 100644
index 0000000000000..4e5cdb5242d1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_psi.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define PSI_FULL 0x80000000
+
+/* cgroup which will experience the high memory pressure */
+u64 high_pressure_cgroup_id;
+
+/* cgroup which will be deleted; rewritten by handle_cgroup_offline() */
+u64 deleted_cgroup_id;
+
+/* cgroup which will be created; set by handle_cgroup_online() */
+u64 new_cgroup_id;
+
+char constraint_name[] = "CONSTRAINT_BPF_PSI_MEM";
+
+SEC("struct_ops.s/init")
+int BPF_PROG(psi_init, struct bpf_psi *bpf_psi)
+{
+	int ret;
+
+	ret = bpf_psi_create_trigger(bpf_psi, high_pressure_cgroup_id,
+				     PSI_MEM | PSI_FULL, 100000, 1000000);
+	if (ret)
+		return ret;
+
+	return bpf_psi_create_trigger(bpf_psi, deleted_cgroup_id,
+				      PSI_IO, 100000, 1000000);
+}
+
+SEC("struct_ops.s/handle_psi_event")
+void BPF_PROG(handle_psi_event, struct psi_trigger *t)
+{
+	u64 cgroup_id = t->cgroup_id;
+	struct mem_cgroup *memcg;
+	struct cgroup *cgroup;
+
+	cgroup = bpf_cgroup_from_id(cgroup_id);
+	if (!cgroup)
+		return;
+
+	memcg = bpf_get_mem_cgroup(&cgroup->self);
+	if (!memcg) {
+		bpf_cgroup_release(cgroup);
+		return;
+	}
+
+	bpf_out_of_memory(memcg, 0,
+			  BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK,
+			  constraint_name);
+
+	bpf_put_mem_cgroup(memcg);
+	bpf_cgroup_release(cgroup);
+}
+
+SEC("struct_ops.s/handle_cgroup_online")
+void BPF_PROG(handle_cgroup_online, u64 cgroup_id)
+{
+	new_cgroup_id = cgroup_id;
+}
+
+SEC("struct_ops.s/handle_cgroup_offline")
+void BPF_PROG(handle_cgroup_offline, u64 cgroup_id)
+{
+	deleted_cgroup_id = cgroup_id;
+}
+
+SEC(".struct_ops.link")
+struct bpf_psi_ops test_bpf_psi = {
+	.init = (void *)psi_init,
+	.handle_psi_event = (void *)handle_psi_event,
+	.handle_cgroup_online = (void *)handle_cgroup_online,
+	.handle_cgroup_offline = (void *)handle_cgroup_offline,
+};

From 0fe9540ca46f1dc6caa1389619f534aa3ed996a7 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Sun, 26 Oct 2025 09:30:01 -0700
Subject: [PATCH 24/28] fix cgroup iter memcg selftests

Bump NR_PIPES from 2 to 64 so that the kmem test allocates enough
kernel memory for the usage delta to be reliably visible in the
memcg statistics.

Signed-off-by: JP Kobryn
---
 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
index 86f558a096143..2be4344984576 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -127,7 +127,7 @@ static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query)
 	shm_unlink("/tmp_shmem");
 }
 
-#define NR_PIPES 2
+#define NR_PIPES 64
 static void test_kmem(struct bpf_link *link, struct memcg_query *memcg_query)
 {
 	int fds[NR_PIPES][2], i;

From 1baf0ad817ec1498ac63b97032f919e87021fcae Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Mon, 27 Oct 2025 11:35:03 -0700
Subject: [PATCH 25/28] bpf: selftests: add config for psi

Include CONFIG_PSI to allow dependent tests to build.

Signed-off-by: JP Kobryn
---
 tools/testing/selftests/bpf/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 70b28c1e653ea..178c840c844bc 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -110,6 +110,7 @@ CONFIG_IP6_NF_IPTABLES=y
 CONFIG_IP6_NF_FILTER=y
 CONFIG_NF_NAT=y
 CONFIG_PACKET=y
+CONFIG_PSI=y
 CONFIG_RC_CORE=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y

From 021953e79f9e2084c006e0290f7997c37d3c6099 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Mon, 27 Oct 2025 13:11:23 -0700
Subject: [PATCH 26/28] Fix issue using ftruncate() on tmpfile; use a regular
 file instead

Signed-off-by: JP Kobryn
---
 .../selftests/bpf/prog_tests/cgroup_iter_memcg.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
index 2be4344984576..f66a57b664e75 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -63,19 +63,17 @@ static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
 {
 	void *map;
 	size_t len;
-	FILE *f;
+	char *path;
 	int fd;
 
 	len = sysconf(_SC_PAGESIZE) * 1024;
+	path = "/tmp/test_cgroup_iter_memcg";
 
 	/*
 	 * Increase memcg file usage by creating and writing
-	 * to a temoprary mapped file.
+	 * to a mapped file.
 	 */
-	f = tmpfile();
-	if (!ASSERT_OK_PTR(f, "tmpfile"))
-		return;
-	fd = fileno(f);
+	fd = open(path, O_CREAT | O_WRONLY);
 	if (!ASSERT_OK_FD(fd, "open fd"))
 		return;
 	if (!ASSERT_OK(ftruncate(fd, len), "ftruncate"))
@@ -97,6 +95,7 @@ static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
 	munmap(map, len);
 cleanup_fd:
 	close(fd);
+	unlink(path);
 }
 
 static void test_shmem(struct bpf_link *link, struct memcg_query *memcg_query)

From a9e5c9d77e087c15483b091a38653380f18d1341 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Mon, 27 Oct 2025 14:47:35 -0700
Subject: [PATCH 27/28] remove unneeded PROT_READ

The tests only ever write to these mappings, so map them with
PROT_WRITE alone.

Signed-off-by: JP Kobryn
---
 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
index f66a57b664e75..e354d13693b22 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -44,7 +44,7 @@ static void test_anon(struct bpf_link *link, struct memcg_query *memcg_query)
 	 * Increase memcg anon usage by mapping and writing
 	 * to a new anon region.
 	 */
-	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
 		return;
 
@@ -79,7 +79,7 @@ static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
 	if (!ASSERT_OK(ftruncate(fd, len), "ftruncate"))
 		goto cleanup_fd;
 
-	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	map = mmap(NULL, len, PROT_WRITE, MAP_SHARED, fd, 0);
 	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap file"))
 		goto cleanup_fd;
 
@@ -160,8 +160,7 @@ static void test_pgfault(struct bpf_link *link, struct memcg_query *memcg_query)
 	len = sysconf(_SC_PAGESIZE) * 1024;
 
 	/* Create region to use for triggering a page fault. */
-	map = mmap(NULL, len, PROT_READ | PROT_WRITE,
-		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	map = mmap(NULL, len, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 	if (!ASSERT_NEQ(map, MAP_FAILED, "mmap anon"))
 		return;

From 2fe8bce47433bc4e398b61f7a1351763e31e5c28 Mon Sep 17 00:00:00 2001
From: JP Kobryn
Date: Mon, 27 Oct 2025 15:17:38 -0700
Subject: [PATCH 28/28] fix open flag

mmap() with MAP_SHARED and PROT_WRITE requires a descriptor opened
for both reading and writing, so the O_WRONLY descriptor made the
subsequent mmap() fail. Open the file with O_RDWR and pass the mode
argument that O_CREAT requires.

Signed-off-by: JP Kobryn
---
 tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
index e354d13693b22..215e4c98c76f1 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_iter_memcg.c
@@ -73,7 +73,7 @@ static void test_file(struct bpf_link *link, struct memcg_query *memcg_query)
 	 * Increase memcg file usage by creating and writing
 	 * to a mapped file.
 	 */
-	fd = open(path, O_CREAT | O_WRONLY);
+	fd = open(path, O_CREAT | O_RDWR, 0644);
 	if (!ASSERT_OK_FD(fd, "open fd"))
 		return;
 	if (!ASSERT_OK(ftruncate(fd, len), "ftruncate"))