From 7c1102b51ada74164872bccf1f7deb4cf5962e37 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen
Date: Mon, 28 Apr 2025 16:16:52 +0800
Subject: [PATCH 1/3] bpf, sockmap: Introduce a new kfunc for sockmap

Since the BPF helper set is effectively frozen and the existing helpers
cannot be extended, add a new kfunc, bpf_sk_skb_set_redirect_cpu(),
which simply records the chosen redirect CPU in the psock. The changes
conform to the kfuncs.rst documentation.
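A minimal usage sketch from a BPF_PROG_TYPE_SK_SKB verdict program
(illustrative only; the map, the section name and the CPU number are
placeholders, not part of this patch):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    extern int bpf_sk_skb_set_redirect_cpu(struct __sk_buff *skb,
                                           int redir_cpu) __ksym;

    struct {
            __uint(type, BPF_MAP_TYPE_SOCKMAP);
            __uint(max_entries, 2);
            __type(key, int);
            __type(value, int);
    } sock_map SEC(".maps");

    SEC("sk_skb/verdict")
    int skb_verdict(struct __sk_buff *skb)
    {
            /* Prefer CPU 2 for the backlog work of the redirect target. */
            bpf_sk_skb_set_redirect_cpu(skb, 2);
            return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
    }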
Signed-off-by: Jiayuan Chen
---
 Documentation/bpf/map_sockmap.rst | 14 ++++++++++++++
 include/linux/skmsg.h             |  3 +++
 kernel/bpf/btf.c                  |  3 +++
 net/core/skmsg.c                  |  1 +
 net/core/sock_map.c               | 39 +++++++++++++++++++++++++++++++
 5 files changed, 60 insertions(+)

diff --git a/Documentation/bpf/map_sockmap.rst b/Documentation/bpf/map_sockmap.rst
index 2d630686a00ba..eca3dfc1c85fe 100644
--- a/Documentation/bpf/map_sockmap.rst
+++ b/Documentation/bpf/map_sockmap.rst
@@ -212,6 +212,20 @@ following cases:
 
 Returns 0
 
+bpf_sk_skb_set_redirect_cpu()
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. code-block:: c
+
+    int bpf_sk_skb_set_redirect_cpu(struct __sk_buff *s, int redir_cpu)
+
+This kfunc ``bpf_sk_skb_set_redirect_cpu()`` is available to
+``BPF_PROG_TYPE_SK_SKB`` BPF programs. It selects the CPU on which the
+sockmap redirect backlog work for this socket preferably runs, which
+helps reduce interference between the sockmap redirect worker and other
+application threads.
+
+Returns 0 on success, or a negative error in case of failure.
+
 bpf_msg_cork_bytes()
 ^^^^^^^^^^^^^^^^^^^^^^
 .. code-block:: c
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 0b9095a281b89..b888481a845de 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -16,6 +16,8 @@
 #define MAX_MSG_FRAGS			MAX_SKB_FRAGS
 #define NR_MSG_FRAG_IDS			(MAX_MSG_FRAGS + 1)
 
+#define BPF_SK_REDIR_CPU_UNSET		-1
+
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,
@@ -86,6 +88,7 @@ struct sk_psock {
 	u32				apply_bytes;
 	u32				cork_bytes;
 	u32				eval;
+	s32				redir_cpu;
 	bool				redir_ingress; /* undefined if sk_redir is null */
 	struct sk_msg			*cork;
 	struct sk_psock_progs		progs;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index a91822bae043b..2a8f59e2c6393 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -219,6 +219,7 @@ enum btf_kfunc_hook {
 	BTF_KFUNC_HOOK_LWT,
 	BTF_KFUNC_HOOK_NETFILTER,
 	BTF_KFUNC_HOOK_KPROBE,
+	BTF_KFUNC_HOOK_SK_MSG,
 	BTF_KFUNC_HOOK_MAX,
 };
 
@@ -8649,6 +8650,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 		return BTF_KFUNC_HOOK_SCHED_ACT;
 	case BPF_PROG_TYPE_SK_SKB:
 		return BTF_KFUNC_HOOK_SK_SKB;
+	case BPF_PROG_TYPE_SK_MSG:
+		return BTF_KFUNC_HOOK_SK_MSG;
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 		return BTF_KFUNC_HOOK_SOCKET_FILTER;
 	case BPF_PROG_TYPE_LWT_OUT:
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 2769346730667..292752c783b59 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -741,6 +741,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	psock->saved_destroy = prot->destroy;
 	psock->saved_close = prot->close;
 	psock->saved_write_space = sk->sk_write_space;
+	psock->redir_cpu = BPF_SK_REDIR_CPU_UNSET;
 
 	INIT_LIST_HEAD(&psock->link);
 	spin_lock_init(&psock->link_lock);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 82a14f131d00c..9f1e531a38078 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -701,6 +701,45 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_sk_skb_set_redirect_cpu(struct __sk_buff *s,
+					    int redir_cpu)
+{
+	struct sk_buff *skb = (struct sk_buff *)s;
+	struct sock *sk = skb->sk;
+	struct sk_psock *psock;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (!sk || redir_cpu >= num_possible_cpus())
+		return -EINVAL;
+
+	psock = sk_psock(sk);
+	if (!psock)
+		return -ENOENT;
+
+	psock->redir_cpu = redir_cpu;
+	return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_sk_sockmap_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_sk_skb_set_redirect_cpu)
+BTF_KFUNCS_END(bpf_sk_sockmap_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_sk_sockmap_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_sk_sockmap_kfunc_ids,
+};
+
+static int init_sockmap_subsystem(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_sk_sockmap_kfunc_set);
+}
+
+late_initcall(init_sockmap_subsystem);
+
 struct sock_map_seq_info {
 	struct bpf_map *map;
 	struct sock *sk;

From da166b9e17615877853d660277758bf7b98f5fb1 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen
Date: Mon, 28 Apr 2025 16:16:53 +0800
Subject: [PATCH 2/3] bpf, sockmap: Affinitize workqueue to a specific CPU

Introduce a sk_psock_schedule_delayed_work() wrapper that calls
schedule_delayed_work_on() to run the psock's backlog work on the CPU
the BPF program selected via bpf_sk_skb_set_redirect_cpu(). If no CPU
has been set, it falls back to the original schedule_delayed_work()
behaviour.
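The kernel side only steers the psock backlog work; to get real
isolation, the application can in turn keep its own threads off the
chosen CPU. A userspace sketch (illustrative only, not part of this
patch; keep_off_cpu() is a made-up helper name):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>

    /* Allow 'thread' to run on every CPU except the redirect CPU. */
    static int keep_off_cpu(pthread_t thread, int redir_cpu, int nr_cpus)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            for (int cpu = 0; cpu < nr_cpus; cpu++)
                    if (cpu != redir_cpu)
                            CPU_SET(cpu, &set);

            return pthread_setaffinity_np(thread, sizeof(set), &set);
    }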
Signed-off-by: Jiayuan Chen
---
 include/linux/skmsg.h | 12 ++++++++++++
 net/core/skmsg.c      |  9 +++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index b888481a845de..21c7dd47186ff 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -396,6 +396,18 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 	sk_error_report(sk);
 }
 
+static inline void sk_psock_schedule_delayed_work(struct sk_psock *psock,
+						  int delay)
+{
+	s32 redir_cpu = psock->redir_cpu;
+
+	if (redir_cpu != BPF_SK_REDIR_CPU_UNSET)
+		schedule_delayed_work_on(redir_cpu, &psock->work,
+					 delay);
+	else
+		schedule_delayed_work(&psock->work, delay);
+}
+
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
 void sk_psock_stop(struct sk_psock *psock);
 
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 292752c783b59..af00c09263a86 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -689,7 +689,7 @@ static void sk_psock_backlog(struct work_struct *work)
 			 * other work that might be here.
 			 */
 			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-				schedule_delayed_work(&psock->work, 1);
+				sk_psock_schedule_delayed_work(psock, 1);
 			goto end;
 		}
 		/* Hard errors break pipe and stop xmit. */
@@ -940,6 +940,7 @@
 		sock_drop(from->sk, skb);
 		return -EIO;
 	}
+	psock_other->redir_cpu = from->redir_cpu;
 	spin_lock_bh(&psock_other->ingress_lock);
 	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
 		spin_unlock_bh(&psock_other->ingress_lock);
@@ -949,7 +950,7 @@
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
-	schedule_delayed_work(&psock_other->work, 0);
+	sk_psock_schedule_delayed_work(psock_other, 0);
 	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
@@ -1027,7 +1028,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 		spin_lock_bh(&psock->ingress_lock);
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 			skb_queue_tail(&psock->ingress_skb, skb);
-			schedule_delayed_work(&psock->work, 0);
+			sk_psock_schedule_delayed_work(psock, 0);
 			err = 0;
 		}
 		spin_unlock_bh(&psock->ingress_lock);
@@ -1059,7 +1060,7 @@ static void sk_psock_write_space(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_delayed_work(&psock->work, 0);
+			sk_psock_schedule_delayed_work(psock, 0);
 		write_space = psock->saved_write_space;
 	}
 	rcu_read_unlock();

From 956af6eac2b53070303999388efef2f561dd1ce4 Mon Sep 17 00:00:00 2001
From: Jiayuan Chen
Date: Mon, 28 Apr 2025 16:16:54 +0800
Subject: [PATCH 3/3] selftest/bpf/benchs: Add cpu-affinity for sockmap bench

Add a --cpu-affinity option to the sockmap benchmark, which steers the
sockmap backlog work to a dedicated CPU via
bpf_sk_skb_set_redirect_cpu(). Also add a --no-verify option that skips
data validation so it does not dominate the measurement.
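Example invocation (the benchmark selector is a placeholder for
whichever sockmap bench mode is being measured; '-c 2 -p 1 -a' is
required by validate()):

    ./bench <sockmap-bench> --cpu-affinity --no-verify -c 2 -p 1 -a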
Signed-off-by: Jiayuan Chen
---
 .../selftests/bpf/benchs/bench_sockmap.c      | 35 ++++++++++++++++---
 tools/testing/selftests/bpf/bpf_kfuncs.h      |  6 ++++
 .../selftests/bpf/progs/bench_sockmap_prog.c  |  7 +++++
 3 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/benchs/bench_sockmap.c b/tools/testing/selftests/bpf/benchs/bench_sockmap.c
index 8ebf563a67a2b..e004a618822a6 100644
--- a/tools/testing/selftests/bpf/benchs/bench_sockmap.c
+++ b/tools/testing/selftests/bpf/benchs/bench_sockmap.c
@@ -43,6 +43,8 @@ enum SOCKMAP_ARG_FLAG {
 	ARG_FW_TX_VERDICT_INGRESS,
 	ARG_FW_TX_VERDICT_EGRESS,
 	ARG_CTL_RX_STRP,
+	ARG_CTL_CPU_AFFINITY,
+	ARG_CTL_NO_VERIFY,
 	ARG_CONSUMER_DELAY_TIME,
 	ARG_PRODUCER_DURATION,
 };
@@ -109,6 +111,8 @@ static struct socmap_ctx {
 	int		delay_consumer;
 	int		prod_run_time;
 	int		strp_size;
+	int		cpu_affinity;
+	int		skip_verify;
 } ctx = {
 	.prod_send	= 0,
 	.user_read	= 0,
@@ -118,6 +122,8 @@ static struct socmap_ctx {
 	.delay_consumer = 0,
 	.prod_run_time	= 0,
 	.strp_size	= 0,
+	.cpu_affinity	= 0,
+	.skip_verify	= 0,
 };
 
 static void bench_sockmap_prog_destroy(void)
@@ -235,11 +241,18 @@ static int create_sockets(void)
 static void validate(void)
 {
 	if (env.consumer_cnt != 2 || env.producer_cnt != 1 ||
-	    !env.affinity)
+	    !env.affinity) {
+		fprintf(stderr, "argument '-c 2 -p 1 -a' is necessary\n");
 		goto err;
+	}
+
+	if (ctx.cpu_affinity && env.nr_cpus < 4) {
+		fprintf(stderr, "4 CPUs are needed to test cpu-affinity\n");
+		goto err;
+	}
+
 	return;
 err:
-	fprintf(stderr, "argument '-c 2 -p 1 -a' is necessary");
 	exit(1);
 }
@@ -327,6 +340,9 @@ static void setup(void)
 		exit(1);
 	}
 
+	if (ctx.cpu_affinity)
+		ctx.skel->data->redir_cpu = 3;
+
 	if (create_sockets()) {
 		fprintf(stderr, "create_net_mode error\n");
 		goto err;
@@ -367,9 +383,12 @@ static void measure(struct bench_res *res)
 
 static void verify_data(int *check_pos, char *buf, int rcv)
 {
+	if (ctx.skip_verify)
+		return;
+
 	for (int i = 0 ; i < rcv; i++) {
 		if (buf[i] != snd_data[(*check_pos) % DATA_REPEAT_SIZE]) {
-			fprintf(stderr, "verify data fail");
+			fprintf(stderr, "verify data fail\n");
 			exit(1);
 		}
 		(*check_pos)++;
@@ -553,6 +572,10 @@ static const struct argp_option opts[] = {
 		"delay consumer start"},
 	{ "producer-duration", ARG_PRODUCER_DURATION, "SEC", 0,
 		"producer duration"},
+	{ "cpu-affinity", ARG_CTL_CPU_AFFINITY, NULL, 0,
+		"set cpu-affinity for sockmap backlog thread"},
+	{ "no-verify", ARG_CTL_NO_VERIFY, NULL, 0,
+		"skip data validation for performance enhancements"},
 	{},
 };
 
@@ -571,6 +594,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case ARG_CTL_RX_STRP:
 		ctx.strp_size = strtol(arg, NULL, 10);
 		break;
+	case ARG_CTL_CPU_AFFINITY:
+		ctx.cpu_affinity = 1;
+		break;
+	case ARG_CTL_NO_VERIFY:
+		ctx.skip_verify = 1;
+		break;
 	default:
 		return ARGP_ERR_UNKNOWN;
 	}
diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
index 8215c9b3115e4..173329c5d034b 100644
--- a/tools/testing/selftests/bpf/bpf_kfuncs.h
+++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
@@ -92,4 +92,10 @@ extern int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
 				const struct bpf_dynptr *value_p, int flags) __ksym __weak;
 extern int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str) __ksym __weak;
 
+/* Description
+ *	Set the CPU on which the sockmap backlog work for this socket runs
+ * Returns
+ *	0 on success, or a negative error in case of failure
+ */
+extern int bpf_sk_skb_set_redirect_cpu(struct __sk_buff *skb, int redir_cpu) __ksym;
 #endif
diff --git a/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c
index 079bf3794b3a7..dd1a11cb4f488 100644
--- a/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c
+++ b/tools/testing/selftests/bpf/progs/bench_sockmap_prog.c
@@ -2,11 +2,15 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_kfuncs.h"
 
 long process_byte = 0;
 int verdict_dir = 0;
 int dropped = 0;
 int pkt_size = 0;
+int redir_cpu = -1;
+
 struct {
 	__uint(type, BPF_MAP_TYPE_SOCKMAP);
 	__uint(max_entries, 20);
@@ -33,6 +37,9 @@ int prog_skb_verdict(struct __sk_buff *skb)
 	int one = 1;
 	int ret = bpf_sk_redirect_map(skb, &sock_map_rx, one, verdict_dir);
 
+	if (redir_cpu != -1)
+		bpf_sk_skb_set_redirect_cpu(skb, redir_cpu);
+
 	if (ret == SK_DROP)
 		dropped++;
 	__sync_fetch_and_add(&process_byte, skb->len);