From 261ffd53cc8e91e6484a3170a1ddf59a16696667 Mon Sep 17 00:00:00 2001
From: Nuno Das Neves
Date: Tue, 1 Apr 2025 10:32:17 -0700
Subject: [PATCH 001/770] Drivers: hv: Fix bad pointer dereference in hv_get_partition_id

'output' is already a pointer to the output argument; it should be
passed directly to hv_do_hypercall() without the '&' operator.

Fixes: e96204e5e96e ("hyperv: Move hv_current_partition_id to arch-generic code")
Signed-off-by: Nuno Das Neves
Reviewed-by: Michael Kelley
Link: https://lore.kernel.org/r/1743528737-20310-1-git-send-email-nunodasneves@linux.microsoft.com
Signed-off-by: Wei Liu
Message-ID: <1743528737-20310-1-git-send-email-nunodasneves@linux.microsoft.com>
---
 drivers/hv/hv_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index b3b11be116500..a7d7494feaca1 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -307,7 +307,7 @@ void __init hv_get_partition_id(void)
 	local_irq_save(flags);
 	output = *this_cpu_ptr(hyperv_pcpu_input_arg);
 
-	status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, &output);
+	status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output);
 	pt_id = output->partition_id;
 
 	local_irq_restore(flags);

From 06eaa824fd239edd1eab2754f29b2d03da313003 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 18 Mar 2025 07:19:46 +0000
Subject: [PATCH 002/770] mm/memblock: pass size instead of end to memblock_set_node()

The second parameter of memblock_set_node() is a size, not an end
address. Since the loop iterates from lower to higher addresses, the
node ids end up correct once it finishes, but some of them are wrong
while it is in progress. Pass the size instead of the end address.

Fixes: 61167ad5fecd ("mm: pass nid to reserve_bootmem_region()")
Signed-off-by: Wei Yang
CC: Mike Rapoport
CC: Yajun Deng
CC: stable@vger.kernel.org
Reviewed-by: Anshuman Khandual
Link: https://lore.kernel.org/r/20250318071948.23854-2-richard.weiyang@gmail.com
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 0a53db4d9f7be..9639f04b4fdf9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2196,7 +2196,7 @@ static void __init memmap_init_reserved_pages(void)
 		if (memblock_is_nomap(region))
 			reserve_bootmem_region(start, end, nid);
 
-		memblock_set_node(start, end, &memblock.reserved, nid);
+		memblock_set_node(start, region->size, &memblock.reserved, nid);
 	}
 
 	/*

From eac8ea8736ccc09513152d970eb2a42ed78e87e8 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 18 Mar 2025 07:19:47 +0000
Subject: [PATCH 003/770] mm/memblock: repeat setting reserved region nid if array is doubled

Commit 61167ad5fecd ("mm: pass nid to reserve_bootmem_region()")
introduced a way to set the nid on all reserved regions, but there is a
corner case in which it leaves some regions with an invalid nid. When
memblock_set_node() doubles the array of memblock.reserved, it may
insert a new reserved region before the current iteration position. The
new region will be left with an invalid node id. Repeat the process
when this is detected.
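A standalone userspace sketch (made-up names and data, not the memblock
code) of the same re-scan pattern: if processing an element can grow the
array and shift entries to higher indices, snapshot the capacity and
restart the walk whenever it changed:

  #include <stdio.h>

  #define CAP0 4

  static struct { int tagged; } items[64];
  static int cnt = 3, max = CAP0;

  /* Tag one element; processing may insert a new element at a lower index. */
  static void process(int i)
  {
      if (i == 1 && max == CAP0) {        /* simulate the array doubling */
          for (int j = cnt; j > 0; j--)
              items[j] = items[j - 1];
          items[0].tagged = 0;            /* new, untagged entry at index 0 */
          cnt++;
          max *= 2;
      }
      items[i].tagged = 1;
  }

  int main(void)
  {
      int snapshot;
  repeat:
      snapshot = max;
      for (int i = 0; i < cnt; i++)
          process(i);
      if (snapshot != max)                /* grew: earlier slots may be new */
          goto repeat;

      for (int i = 0; i < cnt; i++)
          printf("item %d tagged=%d\n", i, items[i].tagged);
      return 0;
  }

Without the final capacity check, the entry inserted at index 0 during
the first pass would never be tagged, which is exactly the class of bug
fixed here.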
Fixes: 61167ad5fecd ("mm: pass nid to reserve_bootmem_region()")
Signed-off-by: Wei Yang
CC: Mike Rapoport
CC: Yajun Deng
CC: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250318071948.23854-3-richard.weiyang@gmail.com
Signed-off-by: Mike Rapoport (Microsoft)
---
 mm/memblock.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/mm/memblock.c b/mm/memblock.c
index 9639f04b4fdf9..d3509414b8c3a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2183,11 +2183,14 @@ static void __init memmap_init_reserved_pages(void)
 	struct memblock_region *region;
 	phys_addr_t start, end;
 	int nid;
+	unsigned long max_reserved;
 
 	/*
 	 * set nid on all reserved pages and also treat struct
 	 * pages for the NOMAP regions as PageReserved
 	 */
+repeat:
+	max_reserved = memblock.reserved.max;
 	for_each_mem_region(region) {
 		nid = memblock_get_region_node(region);
 		start = region->base;
@@ -2198,6 +2201,13 @@
 		memblock_set_node(start, region->size, &memblock.reserved, nid);
 	}
 
+	/*
+	 * A changed 'max' means memblock.reserved has doubled its array,
+	 * which may result in a new reserved region before the current
+	 * 'start'. Repeat the procedure to set its node id.
+	 */
+	if (max_reserved != memblock.reserved.max)
+		goto repeat;
 
 	/*
 	 * initialize struct pages for reserved regions that don't have

From 3b394dff15e14550a26b133fc7b556b5b526f6a5 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 18 Mar 2025 07:19:48 +0000
Subject: [PATCH 004/770] memblock tests: add test for memblock_set_node

Add a test to check memblock_set_node() behavior. Create a corner case
in which the memblock.reserved array is doubled during
memblock_set_node(), and finally make sure all regions in
memblock.reserved have a valid node id.
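For intuition on why memblock_set_node() can grow the array at all,
here is a toy userspace model (hypothetical, far simpler than memblock's
real splitting logic): isolating a sub-range turns one region into up to
three, so the region count can exceed the array capacity and force a
doubling:

  #include <stdio.h>

  struct rgn { long base, size; int nid; };

  static struct rgn r[16];
  static int cnt = 1;

  /* Toy model: handles only the middle-split case, one region -> three. */
  static void set_node(long base, long size, int nid)
  {
      for (int i = 0; i < cnt; i++) {
          long rend = r[i].base + r[i].size;
          long end = base + size;

          if (base <= r[i].base || end >= rend || r[i].nid == nid)
              continue;
          for (int j = cnt + 1; j > i + 2; j--)    /* shift the tail */
              r[j] = r[j - 2];
          r[i + 1] = (struct rgn){ base, size, nid };
          r[i + 2] = (struct rgn){ end, rend - end, r[i].nid };
          r[i].size = base - r[i].base;
          cnt += 2;    /* may exceed capacity: the kernel then doubles */
          return;
      }
  }

  int main(void)
  {
      r[0] = (struct rgn){ 0, 100, 0 };
      set_node(40, 20, 1);
      for (int i = 0; i < cnt; i++)
          printf("[%ld,%ld) nid=%d\n",
                 r[i].base, r[i].base + r[i].size, r[i].nid);
      return 0;
  }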
Signed-off-by: Wei Yang
CC: Mike Rapoport
CC: Yajun Deng
Link: https://lore.kernel.org/r/20250318071948.23854-4-richard.weiyang@gmail.com
Signed-off-by: Mike Rapoport (Microsoft)
---
 tools/testing/memblock/tests/basic_api.c | 102 +++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/tools/testing/memblock/tests/basic_api.c b/tools/testing/memblock/tests/basic_api.c
index 67503089e6a0a..01e836fba4888 100644
--- a/tools/testing/memblock/tests/basic_api.c
+++ b/tools/testing/memblock/tests/basic_api.c
@@ -2434,6 +2434,107 @@ static int memblock_overlaps_region_checks(void)
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+static int memblock_set_node_check(void)
+{
+	unsigned long i, max_reserved;
+	struct memblock_region *rgn;
+	void *orig_region;
+
+	PREFIX_PUSH();
+
+	reset_memblock_regions();
+	memblock_allow_resize();
+
+	dummy_physical_memory_init();
+	memblock_add(dummy_physical_memory_base(), MEM_SIZE);
+	orig_region = memblock.reserved.regions;
+
+	/* Equally split the range to node 0 and 1 */
+	memblock_set_node(memblock_start_of_DRAM(),
+			  memblock_phys_mem_size() / 2, &memblock.memory, 0);
+	memblock_set_node(memblock_start_of_DRAM() + memblock_phys_mem_size() / 2,
+			  memblock_phys_mem_size() / 2, &memblock.memory, 1);
+
+	ASSERT_EQ(memblock.memory.cnt, 2);
+	rgn = &memblock.memory.regions[0];
+	ASSERT_EQ(rgn->base, memblock_start_of_DRAM());
+	ASSERT_EQ(rgn->size, memblock_phys_mem_size() / 2);
+	ASSERT_EQ(memblock_get_region_node(rgn), 0);
+	rgn = &memblock.memory.regions[1];
+	ASSERT_EQ(rgn->base, memblock_start_of_DRAM() + memblock_phys_mem_size() / 2);
+	ASSERT_EQ(rgn->size, memblock_phys_mem_size() / 2);
+	ASSERT_EQ(memblock_get_region_node(rgn), 1);
+
+	/* Reserve 126 regions with the last one across the node boundary */
+	for (i = 0; i < 125; i++)
+		memblock_reserve(memblock_start_of_DRAM() + SZ_16 * i, SZ_8);
+
+	memblock_reserve(memblock_start_of_DRAM() + memblock_phys_mem_size() / 2 - SZ_8,
+			 SZ_16);
+
+	/*
+	 * Commit 61167ad5fecd ("mm: pass nid to reserve_bootmem_region()")
+	 * does the following process to set the nid on each memblock.reserved
+	 * region, but it may miss some regions if memblock_set_node() doubles
+	 * the array.
+	 *
+	 * By checking 'max', we make sure every region's nid is set properly.
+	 */
+repeat:
+	max_reserved = memblock.reserved.max;
+	for_each_mem_region(rgn) {
+		int nid = memblock_get_region_node(rgn);
+
+		memblock_set_node(rgn->base, rgn->size, &memblock.reserved, nid);
+	}
+	if (max_reserved != memblock.reserved.max)
+		goto repeat;
+
+	/* Confirm each region has a valid node set */
+	for_each_reserved_mem_region(rgn) {
+		ASSERT_TRUE(numa_valid_node(memblock_get_region_node(rgn)));
+		if (rgn == (memblock.reserved.regions + memblock.reserved.cnt - 1))
+			ASSERT_EQ(1, memblock_get_region_node(rgn));
+		else
+			ASSERT_EQ(0, memblock_get_region_node(rgn));
+	}
+
+	dummy_physical_memory_cleanup();
+
+	/*
+	 * The current reserved.regions occupies a range of memory allocated
+	 * by dummy_physical_memory_init(). After the memory is freed, we
+	 * must not use it. So restore the original memory region to make
+	 * sure the tests run as normal and are not affected by the doubled
+	 * array.
+	 */
+	memblock.reserved.regions = orig_region;
+	memblock.reserved.cnt = INIT_MEMBLOCK_RESERVED_REGIONS;
+
+	test_pass_pop();
+
+	return 0;
+}
+
+static int memblock_set_node_checks(void)
+{
+	prefix_reset();
+	prefix_push("memblock_set_node");
+	test_print("Running memblock_set_node tests...\n");
+
+	memblock_set_node_check();
+
+	prefix_pop();
+
+	return 0;
+}
+#else
+static int memblock_set_node_checks(void)
+{
+	return 0;
+}
+#endif
+
 int memblock_basic_checks(void)
 {
 	memblock_initialization_check();
@@ -2444,6 +2545,7 @@ int memblock_basic_checks(void)
 	memblock_bottom_up_checks();
 	memblock_trim_memory_checks();
 	memblock_overlaps_region_checks();
+	memblock_set_node_checks();
 
 	return 0;
 }

From 649b50a82f09fa44c2f7a65618e4584072145ab7 Mon Sep 17 00:00:00 2001
From: Ruslan Piasetskyi
Date: Wed, 26 Mar 2025 23:06:38 +0100
Subject: [PATCH 005/770] mmc: renesas_sdhi: Fix error handling in renesas_sdhi_probe

After moving tmio_mmc_host_probe down, error handling has to be
adjusted.

Fixes: 74f45de394d9 ("mmc: renesas_sdhi: register irqs before registering controller")
Reviewed-by: Ihar Salauyou
Signed-off-by: Ruslan Piasetskyi
Reviewed-by: Geert Uytterhoeven
Reviewed-by: Wolfram Sang
Tested-by: Wolfram Sang
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250326220638.460083-1-ruslan.piasetskyi@gmail.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/renesas_sdhi_core.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c
index fa6526be36381..cea6af5daf993 100644
--- a/drivers/mmc/host/renesas_sdhi_core.c
+++ b/drivers/mmc/host/renesas_sdhi_core.c
@@ -1243,26 +1243,26 @@ int renesas_sdhi_probe(struct platform_device *pdev,
 	num_irqs = platform_irq_count(pdev);
 	if (num_irqs < 0) {
 		ret = num_irqs;
-		goto eirq;
+		goto edisclk;
 	}
 
 	/* There must be at least one IRQ source */
 	if (!num_irqs) {
 		ret = -ENXIO;
-		goto eirq;
+		goto edisclk;
 	}
 
 	for (i = 0; i < num_irqs; i++) {
 		irq = platform_get_irq(pdev, i);
 		if (irq < 0) {
 			ret = irq;
-			goto eirq;
+			goto edisclk;
 		}
 
 		ret = devm_request_irq(&pdev->dev, irq, tmio_mmc_irq, 0,
 				       dev_name(&pdev->dev), host);
 		if (ret)
-			goto eirq;
+			goto edisclk;
 	}
 
 	ret = tmio_mmc_host_probe(host);
@@ -1274,8 +1274,6 @@ int renesas_sdhi_probe(struct platform_device *pdev,
 
 	return ret;
 
-eirq:
-	tmio_mmc_host_remove(host);
 edisclk:
 	renesas_sdhi_clk_disable(host);
 efree:

From 9078f01fec1275a1974a01a64a5a495d72898c60 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Sat, 29 Mar 2025 17:41:26 +0100
Subject: [PATCH 006/770] mmc: renesas_sdhi: add regulator dependency

The driver started using the regulator subsystem and fails to build
without a dependency on CONFIG_REGULATOR:

ERROR: modpost: "rdev_get_drvdata" [drivers/mmc/host/renesas_sdhi_core.ko] undefined!
ERROR: modpost: "devm_regulator_register" [drivers/mmc/host/renesas_sdhi_core.ko] undefined!

The 'select RESET_CONTROLLER' needs to either go away or get changed to
a dependency in order to avoid Kconfig dependency loops here. It also
turns out that the superh version needs neither RESET_CONTROLLER nor
REGULATOR, and this works because CONFIG_OF is not set there.

Change both to a 'depends on', but add '|| !OF' for the superh case.
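Read as a plain boolean expression, the resulting dependency behaves as
follows (a hypothetical standalone sketch, not kernel code):

  #include <stdbool.h>
  #include <stdio.h>

  static bool mmc_sdhi_dep_ok(bool of, bool reset_ctrl, bool regulator)
  {
      /* depends on (RESET_CONTROLLER && REGULATOR) || !OF */
      return (reset_ctrl && regulator) || !of;
  }

  int main(void)
  {
      /* superh: CONFIG_OF is not set, so neither subsystem is needed */
      printf("superh, nothing enabled: %d\n",
             mmc_sdhi_dep_ok(false, false, false));   /* prints 1 */
      /* DT-based Renesas SoC: both subsystems must be enabled */
      printf("OF, both enabled:       %d\n",
             mmc_sdhi_dep_ok(true, true, true));      /* prints 1 */
      printf("OF, regulator missing:  %d\n",
             mmc_sdhi_dep_ok(true, true, false));     /* prints 0 */
      return 0;
  }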
Fixes: fae80a99dc03 ("mmc: renesas_sdhi: Add support for RZ/G3E SoC")
Tested-by: Biju Das
Signed-off-by: Arnd Bergmann
Reviewed-by: Wolfram Sang
Reviewed-by: Geert Uytterhoeven
Link: https://lore.kernel.org/r/20250329164145.3194284-1-arnd@kernel.org
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 6824131b69b18..264e11fa58eaf 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -691,8 +691,8 @@ config MMC_TMIO_CORE
 config MMC_SDHI
 	tristate "Renesas SDHI SD/SDIO controller support"
 	depends on SUPERH || ARCH_RENESAS || COMPILE_TEST
+	depends on (RESET_CONTROLLER && REGULATOR) || !OF
 	select MMC_TMIO_CORE
-	select RESET_CONTROLLER if ARCH_RENESAS
 	help
 	  This provides support for the SDHI SD/SDIO controller found in
 	  Renesas SuperH, ARM and ARM64 based SoCs

From 77183db6b8dbd8c352816030b328dd55993dc330 Mon Sep 17 00:00:00 2001
From: Wolfram Sang
Date: Mon, 31 Mar 2025 00:17:32 +0200
Subject: [PATCH 007/770] mmc: renesas_sdhi: disable clocks if registering regulator failed

Because the clocks were just enabled, bail out to the proper target if
there are problems with the regulator.

Fixes: fae80a99dc03 ("mmc: renesas_sdhi: Add support for RZ/G3E SoC")
Signed-off-by: Wolfram Sang
Reviewed-by: Biju Das
Tested-by: Biju Das
Reviewed-by: Geert Uytterhoeven
Link: https://lore.kernel.org/r/20250330221732.56072-2-wsa+renesas@sang-engineering.com
Signed-off-by: Ulf Hansson
---
 drivers/mmc/host/renesas_sdhi_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c
index cea6af5daf993..8c83e203c5167 100644
--- a/drivers/mmc/host/renesas_sdhi_core.c
+++ b/drivers/mmc/host/renesas_sdhi_core.c
@@ -1179,7 +1179,7 @@ int renesas_sdhi_probe(struct platform_device *pdev,
 		if (IS_ERR(rdev)) {
 			dev_err(dev, "regulator register failed err=%ld", PTR_ERR(rdev));
 			ret = PTR_ERR(rdev);
-			goto efree;
+			goto edisclk;
 		}
 		priv->rdev = rdev;
 	}

From 48afd55505247df3b8e00a15a0b25b58618bc3a0 Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Mon, 7 Apr 2025 10:26:07 +0200
Subject: [PATCH 008/770] hamradio: Remove unnecessary strscpy_pad() size arguments

If the destination buffer has a fixed length, strscpy_pad()
automatically determines its size using sizeof() when the argument is
omitted. This makes the explicit sizeof() calls unnecessary - remove
them.

No functional changes intended.
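The size-inference idiom can be sketched in plain C (a hypothetical
userspace macro, not the kernel's strscpy_pad() implementation). It only
works when the destination is a true array, since a pointer would make
sizeof() yield the pointer size; also, unlike the kernel helper, this
sketch truncates silently instead of returning -E2BIG:

  #include <stdio.h>
  #include <string.h>

  /* copy + NUL-pad into a fixed-size array, size inferred by the macro */
  #define COPY_PAD(dst, src) copy_pad((dst), (src), sizeof(dst))

  static void copy_pad(char *dst, const char *src, size_t size)
  {
      size_t len = strnlen(src, size - 1);

      memcpy(dst, src, len);
      memset(dst + len, 0, size - len);    /* pad the remainder with NULs */
  }

  int main(void)
  {
      char drivername[16];

      COPY_PAD(drivername, "baycom_epp");  /* no explicit size argument */
      printf("%s\n", drivername);
      return 0;
  }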
Signed-off-by: Thorsten Blum
Link: https://patch.msgid.link/20250407082607.741919-2-thorsten.blum@linux.dev
Signed-off-by: Paolo Abeni
---
 drivers/net/hamradio/baycom_epp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 9e366f275406d..5fda7a0fcce06 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -1074,7 +1074,7 @@ static int baycom_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 		return 0;
 
 	case HDLCDRVCTL_DRIVERNAME:
-		strscpy_pad(hi.data.drivername, "baycom_epp", sizeof(hi.data.drivername));
+		strscpy_pad(hi.data.drivername, "baycom_epp");
 		break;
 
 	case HDLCDRVCTL_GETMODE:
@@ -1091,8 +1091,7 @@ static int baycom_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 		return baycom_setmode(bc, hi.data.modename);
 
 	case HDLCDRVCTL_MODELIST:
-		strscpy_pad(hi.data.modename, "intclk,extclk,intmodem,extmodem,divider=x",
-			    sizeof(hi.data.modename));
+		strscpy_pad(hi.data.modename, "intclk,extclk,intmodem,extmodem,divider=x");
 		break;
 
 	case HDLCDRVCTL_MODEMPARMASK:

From fc2e4f4f7b5f24327113297e624f13f2847cc6ea Mon Sep 17 00:00:00 2001
From: Julian Vetter
Date: Mon, 7 Apr 2025 10:33:06 +0200
Subject: [PATCH 009/770] eth: nfp: remove __get_unaligned_cpu32 from netronome drivers

The __get_unaligned_cpu32 function is deprecated. So, replace it with
the more generic get_unaligned and just cast the input parameter.

Signed-off-by: Julian Vetter
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250407083306.1553921-1-julian@outer-limits.org
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/netronome/nfp/nfd3/dp.c | 2 +-
 drivers/net/ethernet/netronome/nfp/nfdk/dp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
index f1c6c47564b17..08086eb76996f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfd3/dp.c
@@ -779,7 +779,7 @@ nfp_nfd3_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
 		case NFP_NET_META_CSUM:
 			meta->csum_type = CHECKSUM_COMPLETE;
 			meta->csum =
-				(__force __wsum)__get_unaligned_cpu32(data);
+				(__force __wsum)get_unaligned((u32 *)data);
 			data += 4;
 			break;
 		case NFP_NET_META_RESYNC_INFO:
diff --git a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
index ebeb6ab4465c6..ab3cd06ed63ed 100644
--- a/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
+++ b/drivers/net/ethernet/netronome/nfp/nfdk/dp.c
@@ -779,7 +779,7 @@ nfp_nfdk_parse_meta(struct net_device *netdev, struct nfp_meta_parsed *meta,
 		case NFP_NET_META_CSUM:
 			meta->csum_type = CHECKSUM_COMPLETE;
 			meta->csum =
-				(__force __wsum)__get_unaligned_cpu32(data);
+				(__force __wsum)get_unaligned((u32 *)data);
 			data += 4;
 			break;
 		case NFP_NET_META_RESYNC_INFO:

From 4acdd3de31c878804c6be826cb8c508f18962d0e Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Mon, 7 Apr 2025 11:14:42 +0200
Subject: [PATCH 010/770] rocker: Simplify if condition in ofdpa_port_fdb()

Remove the double negation and simplify the if condition.

No functional changes intended.
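A quick standalone check (hypothetical test harness, not the rocker
driver) that the rewrite preserves behavior: with 'removing' a
normalized bool, '!found != !removing' and '!found == removing' agree on
every combination:

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
      void *obj = &(int){0};    /* any non-NULL pointer */

      for (int f = 0; f < 2; f++) {
          for (int r = 0; r < 2; r++) {
              void *found = f ? obj : NULL;
              bool removing = r;

              bool old_cond = !found != !removing;  /* boolean XOR */
              bool new_cond = !found == removing;

              printf("found=%d removing=%d old=%d new=%d\n",
                     f, r, old_cond, new_cond);     /* old == new always */
          }
      }
      return 0;
  }

The equivalence relies on 'removing' being a genuine bool (0 or 1); for
an unnormalized integer flag, only the original double-negated form
would be safe.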
Signed-off-by: Thorsten Blum
Reviewed-by: Jiri Pirko
Link: https://patch.msgid.link/20250407091442.743478-1-thorsten.blum@linux.dev
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 826990459fa44..8832bfdd88330 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1933,7 +1933,7 @@ static int ofdpa_port_fdb(struct ofdpa_port *ofdpa_port,
 	spin_unlock_irqrestore(&ofdpa->fdb_tbl_lock, lock_flags);
 
 	/* Check if adding and already exists, or removing and can't find */
-	if (!found != !removing) {
+	if (!found == removing) {
 		kfree(fdb);
 		if (!found && removing)
 			return 0;

From 34a07c5b257453b5fcadc2408719c7b075844014 Mon Sep 17 00:00:00 2001
From: Raju Rangoju
Date: Mon, 7 Apr 2025 15:59:13 +0530
Subject: [PATCH 011/770] amd-xgbe: Convert to SPDX identifier

Use SPDX-License-Identifier across all the files of the xgbe driver to
ensure compliance with Linux kernel standards, thus removing the
boilerplate template license text.

Signed-off-by: Raju Rangoju
Acked-by: Shyam Sundar S K
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250407102913.3063691-1-Raju.Rangoju@amd.com
Signed-off-by: Paolo Abeni
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h   | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-dcb.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c  | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-desc.c     | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c  | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-i2c.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-main.c     | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c     | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c   | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c   | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-platform.c | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe-ptp.c      | 117 +-----------------
 drivers/net/ethernet/amd/xgbe/xgbe.h          | 117 +-----------------
 16 files changed, 64 insertions(+), 1808 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 3b70f67376331..e3d33f5b96427 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1,117 +1,8 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause)
 /*
- * AMD 10Gb Ethernet driver
- *
- * This file is available to you under your choice of the following two
- * licenses:
- *
- * License 1: GPLv2
- *
- * Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
- *
- * This file is free software; you may copy, redistribute and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 2 of the License, or (at
- * your option) any later version.
- *
- * This file is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #ifndef __XGBE_COMMON_H__ diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c b/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c index c68ace804e376..1474df5544fab 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dcb.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c index b35808d3d07f7..d9157c4acde93 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 230726d7b74f6..7dba849bcd6b0 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include "xgbe.h" diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index f393228d41c7b..b51a3666dddb2 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2014-2025, Advanced Micro Devices, Inc.
+ * Copyright (c) 2014, Synopsys, Inc.
+ * All rights reserved
  */
 
 #include
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 5475867708f42..30822f45df6b3 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1,117 +1,8 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause)
 /*
- * AMD 10Gb Ethernet driver
- *
- * This file is available to you under your choice of the following two
- * licenses:
- *
- * License 1: GPLv2
- *
- * Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
- *
- * This file is free software; you may copy, redistribute and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 2 of the License, or (at
- * your option) any later version.
- *
- * This file is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * License 2: Modified BSD
- *
- * Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2014-2025, Advanced Micro Devices, Inc.
+ * Copyright (c) 2014, Synopsys, Inc.
+ * All rights reserved
  */
 
 #include
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 4431ab1c18b32..be0d2c7d08dc2 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -1,117 +1,8 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause)
 /*
- * AMD 10Gb Ethernet driver
- *
- * This file is available to you under your choice of the following two
- * licenses:
- *
- * License 1: GPLv2
- *
- * Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
- *
- * This file is free software; you may copy, redistribute and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 2 of the License, or (at
- * your option) any later version.
- *
- * This file is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * License 2: Modified BSD
- *
- * Copyright (c) 2014-2016 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
+ * Copyright (c) 2014-2025, Advanced Micro Devices, Inc.
+ * Copyright (c) 2014, Synopsys, Inc.
+ * All rights reserved
  */
 
 #include
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c
index 7a833894f52ad..d40011e8ddf2b 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-i2c.c
@@ -1,117 +1,8 @@
+// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause)
 /*
- * AMD 10Gb Ethernet driver
- *
- * This file is available to you under your choice of the following two
- * licenses:
- *
- * License 1: GPLv2
- *
- * Copyright (c) 2016 Advanced Micro Devices, Inc.
- *
- * This file is free software; you may copy, redistribute and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 2 of the License, or (at
- * your option) any later version.
- *
- * This file is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * This file incorporates work covered by the following copyright and
- * permission notice:
- * The Synopsys DWC ETHER XGMAC Software Driver and documentation
- * (hereinafter "Software") is an unsupported proprietary work of Synopsys,
- * Inc. unless otherwise expressly agreed to in writing between Synopsys
- * and you.
- *
- * The Software IS NOT an item of Licensed Software or Licensed Product
- * under any End User Software License Agreement or Agreement for Licensed
- * Product with Synopsys or any supplement thereto. Permission is hereby
- * granted, free of charge, to any person obtaining a copy of this software
- * annotated with this license and the Software, to deal in the Software
- * without restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is furnished
- * to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS"
- * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
- * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
- * THE POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * License 2: Modified BSD
- *
- * Copyright (c) 2016 Advanced Micro Devices, Inc.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Advanced Micro Devices, Inc. nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c index 0e8698928e4d7..4ebdd123c4355 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c index 07f4f3418d018..71449edbb76de 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c index c636999a6a849..3e9f31256dce4 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c index d16eae415f721..2e6b8ffe785cb 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c index 268399dfcf22f..7a4dfa4e19c73 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-platform.c b/drivers/net/ethernet/amd/xgbe/xgbe-platform.c index 4365bd62942cf..47d53e59ccf63 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-platform.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-platform.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c index 7051bd7cf6dc3..978c4dd01fa0a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-ptp.c @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #include diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index d85386cac8d16..e5f5104342aad 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -1,117 +1,8 @@ +// SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-3-Clause) /* - * AMD 10Gb Ethernet driver - * - * This file is available to you under your choice of the following two - * licenses: - * - * License 1: GPLv2 - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * - * This file is free software; you may copy, redistribute and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or (at - * your option) any later version. - * - * This file is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. - * - * - * License 2: Modified BSD - * - * Copyright (c) 2014-2016 Advanced Micro Devices, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Advanced Micro Devices, Inc. nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF - * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * This file incorporates work covered by the following copyright and - * permission notice: - * The Synopsys DWC ETHER XGMAC Software Driver and documentation - * (hereinafter "Software") is an unsupported proprietary work of Synopsys, - * Inc. unless otherwise expressly agreed to in writing between Synopsys - * and you. - * - * The Software IS NOT an item of Licensed Software or Licensed Product - * under any End User Software License Agreement or Agreement for Licensed - * Product with Synopsys or any supplement thereto. Permission is hereby - * granted, free of charge, to any person obtaining a copy of this software - * annotated with this license and the Software, to deal in the Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is furnished - * to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included - * in all copies or substantial portions of the Software. - * - * THIS SOFTWARE IS BEING DISTRIBUTED BY SYNOPSYS SOLELY ON AN "AS IS" - * BASIS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE HEREBY DISCLAIMED. IN NO EVENT SHALL SYNOPSYS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF - * THE POSSIBILITY OF SUCH DAMAGE. + * Copyright (c) 2014-2025, Advanced Micro Devices, Inc. + * Copyright (c) 2014, Synopsys, Inc. + * All rights reserved */ #ifndef __XGBE_H__ From c3025e94daa9ce84ef0e53df983b5bf2fbd76ea6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:35:59 +0000 Subject: [PATCH 012/770] net: rps: change skb_flow_limit() hash function As explained in commit f3483c8e1da6 ("net: rfs: hash function change"), masking low order bits of skb_get_hash(skb) has low entropy. A NIC with 32 RX queues uses the 5 low order bits of rss key to select a queue. This means all packets landing to a given queue share the same 5 low order bits. Switch to hash_32() to reduce hash collisions. 
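As a rough illustration, assume a NIC with 32 RX queues and a 1024-bucket flow limit table (both values are hypothetical, chosen only for this example):

	u32 hash = skb_get_hash(skb);

	/* Before: keep the low 10 bits. The low 5 bits are identical for
	 * every packet landing on a given RX queue, so each queue can only
	 * ever hit 2^(10 - 5) = 32 of the 1024 buckets.
	 */
	new_flow = hash & (1024 - 1);

	/* After: hash_32() multiplies by GOLDEN_RATIO_32 and keeps the top
	 * 10 bits, so the queue-invariant low bits still contribute and
	 * the buckets are used evenly.
	 */
	new_flow = hash_32(hash, 10);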
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/dev.h | 2 +- net/core/sysctl_net_core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0608605cfc242..f674236f34beb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5038,7 +5038,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; diff --git a/net/core/dev.h b/net/core/dev.h index 7ee203395d8e2..b1cd44b5f009b 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -16,7 +16,7 @@ struct cpumask; #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { u64 count; - unsigned int num_buckets; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c7769ee0d9c55..5cfe76ede5237 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -248,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } From 7b6f0a852da34379a304d7020d70049ba5d1f0f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:00 +0000 Subject: [PATCH 013/770] net: rps: annotate data-races around (struct sd_flow_limit)->count softnet_seq_show() can read fl->count while another cpu updates this field from skb_flow_limit(). Make this field an 'unsigned int', as its only consumer deals with 32-bit values anyway.
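The annotation pattern, reduced to its essentials (an illustrative sketch; the lockless increment can still lose an occasional update, which is acceptable for a statistics counter):

	/* writer side, e.g. skb_flow_limit() */
	WRITE_ONCE(fl->count, fl->count + 1);

	/* concurrent lockless reader, e.g. softnet_seq_show() */
	unsigned int snapshot = READ_ONCE(fl->count);

A plain fl->count++ would let the compiler tear, fuse or re-load the accesses, and KCSAN would flag the reader/writer pair as a data race.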
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 ++- net/core/dev.h | 2 +- net/core/net-procfs.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index f674236f34beb..969883173182b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5049,7 +5049,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } diff --git a/net/core/dev.h b/net/core/dev.h index b1cd44b5f009b..e855e1cb43fd1 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,7 +15,7 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; + unsigned int count; u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 3e92bf0f9060b..69782d62fbe19 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif From 22d046a778e4344437fd49bb0995e315ec3fddcf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:01 +0000 Subject: [PATCH 014/770] net: add data-race annotations in softnet_seq_show() softnet_seq_show() reads several fields that might be updated concurrently. Add READ_ONCE() and WRITE_ONCE() annotations. 
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ++++-- net/core/net-procfs.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 969883173182b..4ccc6dc5303e8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4953,7 +4953,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -7523,7 +7524,8 @@ static __latent_entropy void net_rx_action(void) */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 69782d62fbe19..4f0f0709a1cbc 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, atomic_read(&sd->dropped), - sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; From 0a7de4a8f898c480ffafe024c4a0a8b8819597f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:02 +0000 Subject: [PATCH 015/770] net: rps: remove kfree_rcu_mightsleep() use Add an rcu_head to sd_flow_limit and rps_sock_flow_table structs to use the more conventional and predictable k[v]free_rcu(). Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 5 +++-- net/core/dev.h | 1 + net/core/sysctl_net_core.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f275..507f4aa5d39b2 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. 
*/ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) diff --git a/net/core/dev.h b/net/core/dev.h index e855e1cb43fd1..710abc05ebdb0 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,6 +15,7 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { + struct rcu_head rcu; unsigned int count; u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5cfe76ede5237..5dbb2c6f371de 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -201,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -239,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); From 0f681b0ecd190fb4516bb34cec227296b10533d1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 7 Apr 2025 09:47:59 -0700 Subject: [PATCH 016/770] net: ena: Support persistent per-NAPI config. Let's pass the queue index to netif_napi_add_config() to preserve per-NAPI config. Test: Set defer-hard-irqs to 100 (the default is 0) and check the value after link down & up.
$ cat /sys/class/net/enp39s0/napi_defer_hard_irqs 0 $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ --dump napi-get --json='{"ifindex": 2}' [{'defer-hard-irqs': 0, 'gro-flush-timeout': 0, 'id': 65, 'ifindex': 2, 'irq': 29, 'irq-suspend-timeout': 0}] $ sudo ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ --do napi-set --json='{"id": 65, "defer-hard-irqs": 100}' $ sudo ip link set enp39s0 down && sudo ip link set enp39s0 up Without patch: $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ --dump napi-get --json='{"ifindex": 2}' [{'defer-hard-irqs': 0, <------------------- Reset to 0 'gro-flush-timeout': 0, 'id': 66, <------------------------------- New ID 'ifindex': 2, 'irq': 29, 'irq-suspend-timeout': 0}] With patch: $ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ --dump napi-get --json='{"ifindex": 2}' [{'defer-hard-irqs': 100, <--------------+-- Preserved 'gro-flush-timeout': 0, | 'id': 65, <----------------------------' 'ifindex': 2, 'irq': 29, 'irq-suspend-timeout': 0}] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Joe Damato Reviewed-by: Arthur Kiyanovski Link: https://patch.msgid.link/20250407164802.25184-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 70fa3adb49342..aa4a17edd98f8 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -1777,7 +1777,7 @@ static void ena_init_napi_in_range(struct ena_adapter *adapter, if (ENA_IS_XDP_INDEX(adapter, i)) napi_handler = ena_xdp_io_poll; - netif_napi_add(adapter->netdev, &napi->napi, napi_handler); + netif_napi_add_config(adapter->netdev, &napi->napi, napi_handler, i); if (!ENA_IS_XDP_INDEX(adapter, i)) napi->rx_ring = rx_ring; From 5ac40e6b5b0bac3f72020a48d01ada23e8450d0c Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Mon, 7 Apr 2025 18:56:56 -0300 Subject: [PATCH 017/770] selftests: tc-testing: Pre-load IFE action and its submodules Recently we had some issues in parallel TDC where some of the IFE tests were failing due to some of IFE's submodules (like act_meta_skbtcindex and act_meta_skbprio) taking too long to load [1]. To avoid that issue, pre-load IFE and all its submodules before running any of the tests in tdc.sh. [1] https://lore.kernel.org/netdev/e909b2a0-244e-4141-9fa9-1b7d96ab7d71@mojatatu.com/T/#u Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250407215656.2535990-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/tdc.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tdc.sh b/tools/testing/selftests/tc-testing/tdc.sh index cddff1772e104..589b18ed758a6 100755 --- a/tools/testing/selftests/tc-testing/tdc.sh +++ b/tools/testing/selftests/tc-testing/tdc.sh @@ -31,6 +31,10 @@ try_modprobe act_skbedit try_modprobe act_skbmod try_modprobe act_tunnel_key try_modprobe act_vlan +try_modprobe act_ife +try_modprobe act_meta_mark +try_modprobe act_meta_skbtcindex +try_modprobe act_meta_skbprio try_modprobe cls_basic try_modprobe cls_bpf try_modprobe cls_cgroup From a36283e2b683f172aa1760c77325e50b16c0f792 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:41 +0200 Subject: [PATCH 018/770] udp_tunnel: create a fastpath GRO lookup.
Most UDP tunnels bind a socket to a local port, with ANY address, no peer and no interface index specified. Additionally, it's quite common to have a single tunnel device per namespace. Track in each namespace the UDP tunnel socket respecting the above. When only a single one is present, store a reference in the netns. When such a reference is not NULL, the UDP tunnel GRO lookup just needs to match the incoming packet destination port vs the socket local port. The tunnel socket never sets the reuse[port] flag[s]. When bound to no address and interface, no other socket can exist in the same netns matching the specified local port. Matching packets with non-local destination addresses will be aggregated, and eventually segmented as needed - no behavior changes intended. Restrict the optimization to kernel sockets only: it covers all the relevant use-cases, and user-space owned sockets could be disconnected and rebound after setup_udp_tunnel_sock(), breaking the uniqueness assumption. Note that the UDP tunnel socket reference is stored into struct netns_ipv4 for both IPv4 and IPv6 tunnels. That is intentional to keep all the fastpath-related netns fields in the same struct and allow cacheline-based optimization. Currently both the IPv4 and IPv6 socket pointers share the same cacheline as the `udp_table` field. Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/41d16bc8d1257d567f9344c445b4ae0b4a91ede4.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 16 ++++++++++++++++ include/net/netns/ipv4.h | 11 +++++++++++ include/net/udp.h | 1 + include/net/udp_tunnel.h | 12 ++++++++++++ net/ipv4/udp.c | 13 ++++++++++++- net/ipv4/udp_offload.c | 37 +++++++++++++++++++++++++++++++++++++ net/ipv4/udp_tunnel_core.c | 13 +++++++++++++ net/ipv6/udp.c | 2 ++ net/ipv6/udp_offload.c | 5 +++++ 9 files changed, 109 insertions(+), 1 deletion(-) diff --git a/include/linux/udp.h b/include/linux/udp.h index 0807e21cfec95..895240177f4f4 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -101,6 +101,13 @@ struct udp_sock { /* Cache friendly copy of sk->sk_peek_off >= 0 */ bool peeking_with_offset; + + /* + * Accounting for the tunnel GRO fastpath. + * Unprotected by compilers guard, as it uses space available in + * the last UDP socket cacheline. + */ + struct hlist_node tunnel_list; }; #define udp_test_bit(nr, sk) \ @@ -219,4 +226,13 @@ static inline void udp_allow_gso(struct sock *sk) #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) +static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) +{ +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + return rcu_dereference(net->ipv4.udp_tunnel_gro[is_ipv6].sk); +#else + return NULL; +#endif +} + #endif /* _LINUX_UDP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 650b2dc9199f4..6373e3f17da84 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst.
@@ -85,6 +90,11 @@ struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -277,4 +287,5 @@ struct netns_ipv4 { struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100dc..a772510b2aa58 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -290,6 +290,7 @@ static inline void udp_lib_init_sock(struct sock *sk) struct udp_sock *up = udp_sk(sk); skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323e..1bb2b852e90eb 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -191,6 +191,18 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2742cc7602bb5..04cd3353f1d46 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2897,8 +2897,10 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } @@ -3810,6 +3812,15 @@ static void __net_init udp_set_table(struct net *net) static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be39..69fc4eae82509 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,38 @@ #include #include #include +#include + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + 
spin_unlock(&udp_tunnel_gro_lock); } EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); #endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -635,8 +667,13 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct iphdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672da..3d1214e6df0ea 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 024458ef163c9..7317f8e053f1c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -1825,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 404212dfc99ab..d8445ac1b2e43 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -118,8 +118,13 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, { const struct ipv6hdr *iph = skb_gro_network_header(skb); struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, From 5d7f5b2f6b935517ee5fd8058dc32342a5cba3e1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Apr 2025 17:45:42 +0200 Subject: [PATCH 019/770] udp_tunnel: use static call for GRO hooks when possible It's quite common to have a single UDP tunnel type active in the whole system. In such a case we can replace the indirect call for the UDP tunnel GRO callback with a static call. Add the related accounting in the control path and switch to static call when possible. To keep the code simple, use a static array for the registered tunnel types, and size the array based on the kernel config. Note that there are valid kernel configurations leading to UDP_MAX_TUNNEL_TYPES == 0 even with IS_ENABLED(CONFIG_NET_UDP_TUNNEL). Explicitly skip the accounting in such a case, to avoid a compile warning when accessing "udp_tunnel_gro_types".
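For reference, the static call plumbing added below follows the usual shape (a condensed sketch of this patch, not the complete listing):

	DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);

	/* control path, under udp_tunnel_gro_type_lock: exactly one
	 * tunnel type is registered, patch in its callback
	 */
	static_call_update(udp_tunnel_gro_rcv,
			   udp_tunnel_gro_types[0].gro_receive);
	static_branch_enable(&udp_tunnel_static_call);

	/* fast path: a direct call instead of an indirect branch */
	pp = static_call(udp_tunnel_gro_rcv)(sk, head, skb);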
Signed-off-by: Paolo Abeni Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/53d156cdfddcc9678449e873cc83e68fa1582653.1744040675.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 4 ++ net/ipv4/udp_offload.c | 135 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 1bb2b852e90eb..288f06f23a804 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -193,13 +193,16 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); #else static inline void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} #endif static inline void udp_tunnel_cleanup_gro(struct sock *sk) { + udp_tunnel_update_gro_rcv(sk, false); udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); } @@ -212,6 +215,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif + udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 69fc4eae82509..787b2947d3a04 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -15,6 +15,38 @@ #include #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. + */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2 + \ + IS_ENABLED(CONFIG_XFRM) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; static DEFINE_SPINLOCK(udp_tunnel_gro_lock); void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) @@ -43,6 +75,105 @@ void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) spin_unlock(&udp_tunnel_gro_lock); } EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + + /* Check if the static call is permanently disabled. 
*/ + if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES) + goto out; + + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1; + } else { + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. + */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* Avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + #endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, @@ -654,7 +785,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -804,5 +935,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } From 420aabef3ab5fa743afb4d3d391f03ef0e777ca8 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 7 Apr 2025 21:01:02 +0200 Subject: [PATCH 020/770] net: Drop unused @sk of __skb_try_recv_from_queue() __skb_try_recv_from_queue() deals with a queue, @sk is not used since commit e427cad6eee4 ("net: datagram: drop 'destructor' argument from several helpers"). Remove sk from function parameters, adapt callers. No functional change intended. 
Signed-off-by: Michal Luczaj Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250407-cleanup-drop-param-sk-v1-1-cd076979afac@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 3 +-- net/core/datagram.c | 5 ++--- net/ipv4/udp.c | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b974a277975a8..f1381aff0f896 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4105,8 +4105,7 @@ static inline void skb_frag_list_init(struct sk_buff *skb) int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue, int *err, long *timeo_p, const struct sk_buff *skb); -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last); diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece4..f0634f0cb8346 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -163,8 +163,7 @@ static struct sk_buff *skb_set_peeked(struct sk_buff *skb) return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +260,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 04cd3353f1d46..8867fe687888c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1942,8 +1942,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1964,8 +1964,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); From 7bd47be16108e55e6bc85bdd3cae5c9a2bc98a89 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Apr 2025 10:21:26 +0300 Subject: [PATCH 021/770] dm table: Fix W=1 build warning when mempool_needs_integrity is unused The mempool_needs_integrity variable is set but never used. This, in particular, prevents kernel builds with Clang, `make W=1` and CONFIG_WERROR=y: drivers/md/dm-table.c:1052:7: error: variable 'mempool_needs_integrity' set but not used [-Werror,-Wunused-but-set-variable] 1052 | bool mempool_needs_integrity = t->integrity_supported; | ^ Fix this by removing the leftover.
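The warning class boils down to the following (a minimal, hypothetical example, unrelated to the dm code):

	void f(int x)
	{
		int v = x;	/* set ... */

		v |= 2;		/* ... set again ... */
	}			/* ... never read: -Wunused-but-set-variable */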
Fixes: 105ca2a2c2ff ("block: split struct bio_integrity_payload") Signed-off-by: Andy Shevchenko Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 35100a435c88b..53759dbbe9d60 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1049,7 +1049,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; unsigned int bioset_flags = 0; - bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1074,8 +1073,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); - - mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, From a82dc19db13649aa4232ce37cb6f4ceff851e2fe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:48 -0700 Subject: [PATCH 022/770] net: avoid potential race between netdev_get_by_index_lock() and netns switch netdev_get_by_index_lock() performs the following steps: rcu_lock(); dev = lookup(netns, ifindex); dev_get(dev); rcu_unlock(); [... lock & validate the dev ...] return dev Validation right now only checks if the device is registered, but since the lookup is netns-aware we must also protect against the device switching netns right after we dropped the RCU lock. Otherwise the caller in netns1 may get a pointer to a device which has just switched to netns2. We can't hold the lock for the entire netns change process (because of the NETDEV_UNREGISTER notifier), and there's no existing marking to indicate that the device is unlisted because of a netns move, so add one. AFAIU none of the existing netdev_get_by_index_lock() callers can suffer from this problem (NAPI code double checks the netns membership and other callers are either under rtnl_lock or not ns-sensitive), so this patch does not have to be treated as a fix.
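Condensed from __netdev_put_lock() as modified below, the post-lookup validation becomes:

	netdev_lock(dev);
	if (dev->reg_state > NETREG_REGISTERED ||
	    dev->moving_ns || !net_eq(dev_net(dev), net)) {
		/* unregistering, mid netns move, or already moved */
		netdev_unlock(dev);
		dev_put(dev);
		return NULL;
	}
	dev_put(dev);	/* caller relies on the lock, not the ref, from here */
	return dev;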
Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- net/core/dev.c | 25 ++++++++++++++++++------- net/core/dev.h | 2 +- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cf3b6445817bb..8e9be80bc1670 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1952,6 +1952,7 @@ enum netdev_reg_state { * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside + * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type @@ -2359,6 +2360,9 @@ struct net_device { bool dismantle; + /** @moving_ns: device is changing netns, protected by @lock */ + bool moving_ns; + enum { RTNL_LINK_INITIALIZED, RTNL_LINK_INITIALIZING, @@ -2521,7 +2525,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up + * @up, @moving_ns, @nd_net * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/net/core/dev.c b/net/core/dev.c index 4ccc6dc5303e8..8c13e5339cc98 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -828,7 +828,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (!dev) return NULL; @@ -1039,10 +1039,11 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) * This helper is intended for locking net_device after it has been looked up * using a lockless lookup helper. Lock prevents the instance from going away. */ -struct net_device *__netdev_put_lock(struct net_device *dev) +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) { netdev_lock(dev); - if (dev->reg_state > NETREG_REGISTERED) { + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { netdev_unlock(dev); dev_put(dev); return NULL; @@ -1070,7 +1071,7 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) if (!dev) return NULL; - return __netdev_put_lock(dev); + return __netdev_put_lock(dev, net); } struct net_device * @@ -1090,7 +1091,7 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, dev_hold(dev); rcu_read_unlock(); - dev = __netdev_put_lock(dev); + dev = __netdev_put_lock(dev, net); if (dev) return dev; @@ -12157,7 +12158,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); - netdev_unlock_ops(dev); + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); synchronize_net(); @@ -12193,7 +12198,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -12219,7 +12226,11 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); - netdev_lock_ops(dev); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); /* Notify protocols, that a new device appeared. 
*/ diff --git a/net/core/dev.h b/net/core/dev.h index e7bf21f52fc76..e93f36b7ddf36 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -30,7 +30,7 @@ netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); -struct net_device *__netdev_put_lock(struct net_device *dev); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index); From 606048cbd8346e616cfaee01b0143d072534136d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:49 -0700 Subject: [PATCH 023/770] net: designate XSK pool pointers in queues as "ops protected" Read accesses go via xsk_get_pool_from_qid(); the calls coming from the core and gve look safe (other "ops locked" drivers don't support XSK). Write accesses go via xsk_reg_pool_at_qid() and xsk_clear_pool_at_qid(). The former is already under the ops lock, the latter is not (it is reached both from the workqueue via xp_clear_dev() and from NETDEV_UNREGISTER via xsk_notifier()). Acked-by: Stanislav Fomichev Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/netdev_rx_queue.h | 6 +++--- net/xdp/xsk_buff_pool.c | 6 +++++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8e9be80bc1670..7242fb8a22fc5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -688,6 +688,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS + /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index b2238b551dced..8cdcd138b33f2 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -20,12 +20,12 @@ struct netdev_rx_queue { struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * "ops protected", see comment about net_device::lock - */ struct napi_struct *napi; struct pp_memory_provider_params mp_params; } ____cacheline_aligned_in_smp; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 25a76c5ce0f12..cbf2129e808b3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -266,13 +266,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) From 4ec9031cbeb73a66979560bbb6d355329be762de Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:50 -0700 Subject: [PATCH 024/770] netdev: add "ops compat locking" helpers Add helpers to "lock a netdev in a backward-compatible way", which for ops-locked netdevs will mean taking the instance lock.
For drivers which haven't opted into the ops locking we'll take rtnl_lock. The scoped foreach is dropping and re-taking the lock for each device, even if prev and next are both under rtnl_lock. I hope that's fine since we expect netdev nl to be mostly supported by modern drivers, and modern drivers should also opt into the instance locking. Note that these helpers are mostly needed for queue-related state, because drivers modify queue config in their ops in a non-atomic way. Or differently put, queue changes don't have a clear-cut API like NAPI configuration. Any state that can should just use the instance lock directly, not the "compat" hacks. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netdev_lock.h | 16 ++++++++++++ net/core/dev.c | 51 +++++++++++++++++++++++++++++++++++++++ net/core/dev.h | 15 ++++++++++++ 3 files changed, 82 insertions(+) diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index c316b551df8d6..5706835a660c1 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -64,6 +64,22 @@ netdev_ops_assert_locked_or_invisible(const struct net_device *dev) netdev_ops_assert_locked(dev); } +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { diff --git a/net/core/dev.c b/net/core/dev.c index 8c13e5339cc98..b52efa4cec564 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1052,6 +1052,20 @@ struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) return dev; } +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + /** * netdev_get_by_index_lock() - find a device by its ifindex * @net: the applicable net namespace @@ -1074,6 +1088,18 @@ struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) return __netdev_put_lock(dev, net); } +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, unsigned long *index) @@ -1099,6 +1125,31 @@ netdev_xa_find_lock(struct net *net, struct net_device *dev, } while (true); } +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + static DEFINE_SEQLOCK(netdev_rename_lock); void netdev_copy_name(struct net_device *dev, char *name) diff --git a/net/core/dev.h b/net/core/dev.h
index e7bf21f52fc76..e93f36b7ddf36 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -42,6 +42,21 @@ DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ ifindex++) +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else From d02e3b388221562cc7acabe700e3d3a29e51ad1d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:51 -0700 Subject: [PATCH 025/770] netdev: don't hold rtnl_lock over nl queue info get when possible Netdev queue dump accesses: NAPI, memory providers, XSk pointers. All three are "ops protected" now, switch to the op compat locking. rtnl lock does not have to be taken for "ops locked" devices. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5d7af50fe7027..7ef9b01919364 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -481,18 +481,15 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } - rtnl_unlock(); - if (err) goto err_free_msg; @@ -541,17 +538,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = netdev_get_by_index_lock(net, ifindex); + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); - netdev_unlock(netdev); + netdev_unlock_ops_compat(netdev); } else { err = -ENODEV; } } else { - for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; @@ -559,7 +556,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ctx->txq_idx = 0; } } - rtnl_unlock(); return err; } From 03df156dd3a6d5992f17682cd5c3b11e5ffdae02 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:52 -0700 Subject: [PATCH 026/770] xdp: double protect netdev->xdp_flags with netdev->lock Protect xdp_features with netdev->lock. This way pure readers no longer have to take rtnl_lock to access the field. This includes calling NETDEV_XDP_FEAT_CHANGE under the lock. 
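As a sketch of the convention this creates (the wrapper function and its bool parameter are hypothetical; the two setters are the ones added below):

/* Sketch: choosing between the locked and unlocked setter. */
static void sketch_set_xdp_caps(struct net_device *dev, bool have_lock)
{
        xdp_features_t val = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;

        if (have_lock)
                /* caller already holds the instance lock, or the
                 * device is not yet visible (e.g. a reset path,
                 * see gve below)
                 */
                xdp_set_features_flag_locked(dev, val);
        else
                /* helper takes and releases netdev->lock itself */
                xdp_set_features_flag(dev, val);
}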
Looks like that's fine for bonding, the only "real" listener, it's the same as ethtool feature change. In terms of normal drivers - only GVE need special consideration (other drivers don't use instance lock or don't support XDP). It calls xdp_set_features_flag() helper from gve_init_priv() which in turn is called from gve_reset_recovery() (locked), or prior to netdev registration. So switch to _locked. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Acked-by: Harshitha Ramamurthy Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250408195956.412733-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 1 + drivers/net/ethernet/google/gve/gve_main.c | 2 +- include/linux/netdevice.h | 2 +- include/net/xdp.h | 1 + net/core/lock_debug.c | 2 +- net/core/xdp.c | 12 +++++++++++- 6 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 6c2d8945f5979..d6357472d3f19 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -354,6 +354,7 @@ For devices with locked ops, currently only the following notifiers are running under the lock: * ``NETDEV_REGISTER`` * ``NETDEV_UP`` +* ``NETDEV_XDP_FEAT_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index f9a73c9568614..7a249baee316a 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2185,7 +2185,7 @@ static void gve_set_netdev_xdp_features(struct gve_priv *priv) xdp_features = 0; } - xdp_set_features_flag(priv->dev, xdp_features); + xdp_set_features_flag_locked(priv->dev, xdp_features); } static int gve_init_priv(struct gve_priv *priv, bool skip_describe_device) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7242fb8a22fc5..dece2ae396a1e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2526,7 +2526,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net + * @up, @moving_ns, @nd_net, @xdp_flags * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/xdp.h b/include/net/xdp.h index 48efacbaa35da..20e41b5ff319a 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -616,6 +616,7 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index b7f22dc92a6f3..598c443ef2f39 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -20,6 +20,7 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, switch (cmd) { case NETDEV_REGISTER: case NETDEV_UP: + case NETDEV_XDP_FEAT_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: @@ -58,7 +59,6 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; diff 
--git a/net/core/xdp.c b/net/core/xdp.c index f86eedad586a7..3cd0db9c9d2d3 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include /* struct xdp_mem_allocator */ #include @@ -991,17 +992,26 @@ static int __init xdp_metadata_init(void) } late_initcall(xdp_metadata_init); -void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val) { val &= NETDEV_XDP_ACT_MASK; if (dev->xdp_features == val) return; + netdev_assert_locked_or_invisible(dev); dev->xdp_features = val; if (dev->reg_state == NETREG_REGISTERED) call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev); } +EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked); + +void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) +{ + netdev_lock(dev); + xdp_set_features_flag_locked(dev, val); + netdev_unlock(dev); +} EXPORT_SYMBOL_GPL(xdp_set_features_flag); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) From 99e44f39a8f7138f8b9d2bd87b17fceb483f8998 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:53 -0700 Subject: [PATCH 027/770] netdev: depend on netdev->lock for xdp features Writes to XDP features are now protected by netdev->lock. Other things we report are based on ops which don't change once device has been registered. It is safe to stop taking rtnl_lock, and depend on netdev->lock instead. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 7ef9b01919364..8c58261de9691 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -38,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, u64 xdp_rx_meta = 0; void *hdr; + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */ + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -122,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info); - else + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; + goto err_free_msg; + } - rtnl_unlock(); + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); if (err) goto err_free_msg; @@ -146,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); - struct net_device *netdev; - int err = 0; + int err; - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) - break; + return err; } - rtnl_unlock(); - return err; + return 0; } static int From 87eba404f2e1785d410b51a31ff030932194e1c6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:54 -0700 Subject: [PATCH 028/770] docs: netdev: break down the instance locking info per ops struct Explicitly list all the ops structs and what locking they provide. Use "ops locked" as a term for drivers which have ops called under the instance lock. 
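As a reminder of how a driver ends up in the "ops locked" class in the first place, the opt-in is a single flag set before registration (a sketch; the probe function is hypothetical, the field is the ``request_ops_lock`` member referenced in the documentation below):

static int sketch_probe(struct net_device *dev)
{
        /* Request "ops locking": the core will now invoke this
         * driver's ops under the netdev instance lock, per the
         * tables added to netdevices.rst below.
         */
        dev->request_ops_lock = true;

        return register_netdev(dev);
}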
Acked-by: Stanislav Fomichev Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250408195956.412733-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 54 +++++++++++++++++++------ 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index d6357472d3f19..7ae28c5fb835c 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -314,13 +314,8 @@ napi->poll: softirq will be called with interrupts disabled by netconsole. -struct netdev_queue_mgmt_ops synchronization rules -================================================== - -All queue management ndo callbacks are holding netdev instance lock. - -RTNL and netdev instance lock -============================= +netdev instance lock +==================== Historically, all networking control operations were protected by a single global lock known as ``rtnl_lock``. There is an ongoing effort to replace this @@ -328,10 +323,13 @@ global lock with separate locks for each network namespace. Additionally, properties of individual netdev are increasingly protected by per-netdev locks. For device drivers that implement shaping or queue management APIs, all control -operations will be performed under the netdev instance lock. Currently, this -instance lock is acquired within the context of ``rtnl_lock``. The drivers -can also explicitly request instance lock to be acquired via -``request_ops_lock``. In the future, there will be an option for individual +operations will be performed under the netdev instance lock. +Drivers can also explicitly request instance lock to be held during ops +by setting ``request_ops_lock`` to true. Code comments and docs refer +to drivers which have ops called under the instance lock as "ops locked". +See also the documentation of the ``lock`` member of struct net_device. + +In the future, there will be an option for individual drivers to opt out of using ``rtnl_lock`` and instead perform their control operations directly under the netdev instance lock. @@ -343,8 +341,40 @@ there are two sets of interfaces: ``dev_xxx`` and ``netif_xxx`` (e.g., acquiring the instance lock themselves, while the ``netif_xxx`` functions assume that the driver has already acquired the instance lock. +struct net_device_ops +--------------------- + +``ndos`` are called without holding the instance lock for most drivers. + +"Ops locked" drivers will have most of the ``ndos`` invoked under +the instance lock. + +struct ethtool_ops +------------------ + +Similarly to ``ndos`` the instance lock is only held for select drivers. +For "ops locked" drivers all ethtool ops without exceptions should +be called under the instance lock. + +struct net_shaper_ops +--------------------- + +All net shaper callbacks are invoked while holding the netdev instance +lock. ``rtnl_lock`` may or may not be held. + +Note that supporting net shapers automatically enables "ops locking". + +struct netdev_queue_mgmt_ops +---------------------------- + +All queue management callbacks are invoked while holding the netdev instance +lock. ``rtnl_lock`` may or may not be held. + +Note that supporting struct netdev_queue_mgmt_ops automatically enables +"ops locking". 
+ Notifiers and netdev instance lock -================================== +---------------------------------- For device drivers that implement shaping or queue management APIs, some of the notifiers (``enum netdev_cmd``) are running under the netdev From ce7b14947484e6190372f2c3dbfb69aafbc4c0fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Apr 2025 12:59:55 -0700 Subject: [PATCH 029/770] netdev: depend on netdev->lock for qstats in ops locked drivers We mostly needed rtnl_lock in qstat to make sure the queue count is stable while we work. For "ops locked" drivers the instance lock protects the queue count, so we don't have to take rtnl_lock. For currently ops-locked drivers: netdevsim and bnxt need the protection from netdev going down while we dump, which instance lock provides. gve doesn't care. Reviewed-by: Joe Damato Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250408195956.412733-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 6 +++++ include/net/netdev_queues.h | 4 +++- net/core/netdev-genl.c | 29 +++++++++++++++---------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 7ae28c5fb835c..0ccc7dcf4390c 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -356,6 +356,12 @@ Similarly to ``ndos`` the instance lock is only held for select drivers. For "ops locked" drivers all ethtool ops without exceptions should be called under the instance lock. +struct netdev_stat_ops +---------------------- + +"qstat" ops are invoked under the instance lock for "ops locked" drivers, +and under rtnl_lock for all other drivers. + struct net_shaper_ops --------------------- diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 825141d675e58..ea709b59d8270 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -85,9 +85,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 8c58261de9691..b64c614a00c47 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -795,26 +795,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev && netdev->stat_ops) { - err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, - info, ctx); - } else { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); - err = netdev ? 
-EOPNOTSUPP : -ENODEV; + return -ENODEV; } - } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); - if (err < 0) - break; + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - rtnl_unlock(); return err; } From e4cb911780231bb5bee35cd164a24bee8a3ef6a4 Mon Sep 17 00:00:00 2001 From: Julian Vetter Date: Tue, 8 Apr 2025 11:15:48 +0200 Subject: [PATCH 030/770] net: remove __get_unaligned_cpu32 from macvlan driver The __get_unaligned_cpu32 function is deprecated. So, replace it with the more generic get_unaligned and just cast the input parameter. Signed-off-by: Julian Vetter Link: https://patch.msgid.link/20250408091548.2263911-1-julian@outer-limits.org Signed-off-by: Jakub Kicinski --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d0dfa6bca6cc2..7045b1d587541 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -254,7 +254,7 @@ static u32 macvlan_hash_mix(const struct macvlan_dev *vlan) static unsigned int mc_hash(const struct macvlan_dev *vlan, const unsigned char *addr) { - u32 val = __get_unaligned_cpu32(addr + 2); + u32 val = get_unaligned((u32 *)(addr + 2)); val ^= macvlan_hash_mix(vlan); return hash_32(val, MACVLAN_MC_FILTER_BITS); From 1635eecdd298e46c9abce66695e55587d3957393 Mon Sep 17 00:00:00 2001 From: Julian Vetter Date: Tue, 8 Apr 2025 11:19:46 +0200 Subject: [PATCH 031/770] net: ipvlan: remove __get_unaligned_cpu32 from ipvlan driver The __get_unaligned_cpu32 function is deprecated. So, replace it with the more generic get_unaligned and just cast the input parameter. Signed-off-by: Julian Vetter Link: https://patch.msgid.link/20250408091946.2266271-1-julian@outer-limits.org Signed-off-by: Jakub Kicinski --- drivers/net/ipvlan/ipvlan_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index ca62188a317ad..e3e65772c5991 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -219,7 +219,7 @@ void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type) unsigned int ipvlan_mac_hash(const unsigned char *addr) { - u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), + u32 hash = jhash_1word(get_unaligned((u32 *)(addr + 2)), ipvlan_jhash_secret); return hash & IPVLAN_MAC_FILTER_MASK; From 311920774c408ce3366888d3817e7e648e9ccf1c Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 2 Apr 2025 10:23:05 -0700 Subject: [PATCH 032/770] configs/debug: run and debug PREEMPT Recent change [0] resulted in a "BUG: using __this_cpu_read() in preemptible" splat [1]. PREEMPT kernels have additional requirements on what can and cannot run with/without preemption enabled. Expose those constraints in the debug kernels.
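For readers who have not seen this class of splat, a minimal sketch of the bug pattern these options are meant to catch (the per-CPU variable and functions here are made up for illustration):

DEFINE_PER_CPU(int, sketch_counter);

static int sketch_read_buggy(void)
{
        /* Wrong on PREEMPT: the task may migrate CPUs around the raw
         * access; CONFIG_DEBUG_PREEMPT turns this into
         * "BUG: using __this_cpu_read() in preemptible".
         */
        return __this_cpu_read(sketch_counter);
}

static int sketch_read_fixed(void)
{
        /* this_cpu_read() disables preemption around the access */
        return this_cpu_read(sketch_counter);
}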
0: https://lore.kernel.org/netdev/20250314120048.12569-2-justin.iurman@uliege.be/ 1: https://lore.kernel.org/netdev/20250402094458.006ba2a7@kernel.org/T/#mbf72641e9d7d274daee9003ef5edf6833201f1bc Signed-off-by: Stanislav Fomichev Reviewed-by: Simon Horman Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250402172305.1775226-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- kernel/configs/debug.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index 8aafd050b754b..e81327d2cd639 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -112,3 +112,8 @@ CONFIG_BRANCH_PROFILE_NONE=y CONFIG_DYNAMIC_FTRACE=y CONFIG_FTRACE=y CONFIG_FUNCTION_TRACER=y +# +# Preemption +# +CONFIG_DEBUG_PREEMPT=y +CONFIG_PREEMPT=y From 9c056ec6dd1654b1420dafbbe2a69718850e6ff2 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Tue, 8 Apr 2025 11:26:02 +0800 Subject: [PATCH 033/770] octeontx2-pf: Add error log for cn10k_map_unmap_rq_policer() cn10k_free_matchall_ipolicer() calls cn10k_map_unmap_rq_policer() for each queue in a for loop without checking for any errors. Check the return value of cn10k_map_unmap_rq_policer() on each iteration, and report a warning if the function fails. Signed-off-by: Wentao Liang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250408032602.2909-1-vulab@iscas.ac.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c index c3b6e0f60a799..7f6a435ac6806 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -357,9 +357,12 @@ int cn10k_free_matchall_ipolicer(struct otx2_nic *pfvf) mutex_lock(&pfvf->mbox.lock); /* Remove RQ's policer mapping */ - for (qidx = 0; qidx < hw->rx_queues; qidx++) - cn10k_map_unmap_rq_policer(pfvf, qidx, - hw->matchall_ipolicer, false); + for (qidx = 0; qidx < hw->rx_queues; qidx++) { + rc = cn10k_map_unmap_rq_policer(pfvf, qidx, hw->matchall_ipolicer, false); + if (rc) + dev_warn(pfvf->dev, "Failed to unmap RQ %d's policer (error %d).", + qidx, rc); + } rc = cn10k_free_leaf_profile(pfvf, hw->matchall_ipolicer); From 229671ac60e298b85c2644f52d7e487e9f487d06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Apr 2025 20:27:42 +0000 Subject: [PATCH 034/770] net: remove cpu stall in txq_trans_update() txq_trans_update() currently uses txq->xmit_lock_owner to conditionally update txq->trans_start. For regular devices, txq->xmit_lock_owner is updated from HARD_TX_LOCK() and HARD_TX_UNLOCK(), and this apparently causes cpu stalls. Using dev->lltx, which sits in a read-mostly cache-line and is already used in HARD_TX_LOCK() and HARD_TX_UNLOCK(), helps cpu prediction. On an AMD EPYC 7B12 dual socket server, tcp_rr with 128 threads and 30,000 flows gets a 5% increase in throughput. As explained in commit 95ecba62e2fd ("net: fix races in netdev_tx_sent_queue()/dev_watchdog()") I am planning to no longer update txq->trans_start in the fast path in a followup patch.
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250408202742.2145516-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- include/linux/netdevice.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index c9fd34787c998..e78de79a5d78c 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -427,7 +427,7 @@ static void am65_cpsw_nuss_ndo_host_tx_timeout(struct net_device *ndev, if (netif_tx_queue_stopped(netif_txq)) { /* try recover if stopped by us */ - txq_trans_update(netif_txq); + txq_trans_update(ndev, netif_txq); netif_tx_wake_queue(netif_txq); } } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dece2ae396a1e..a28a08046615f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4693,9 +4693,10 @@ static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) /* * txq->trans_start can be read locklessly from dev_watchdog() */ -static inline void txq_trans_update(struct netdev_queue *txq) +static inline void txq_trans_update(const struct net_device *dev, + struct netdev_queue *txq) { - if (txq->xmit_lock_owner != -1) + if (!dev->lltx) WRITE_ONCE(txq->trans_start, jiffies); } @@ -5214,7 +5215,7 @@ static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_devi rc = __netdev_start_xmit(ops, skb, dev, more); if (rc == NETDEV_TX_OK) - txq_trans_update(txq); + txq_trans_update(dev, txq); return rc; } From 827b2ac8e7968e59b6e6893f6b1d43dec99ef2de Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Tue, 8 Apr 2025 17:40:23 +0200 Subject: [PATCH 035/770] net: bridge: Prevent unicast ARP/NS packets from being suppressed by bridge When Proxy ARP or ARP/ND suppression is enabled, ARP/NS packets can be handled by the bridge in br_do_proxy_suppress_arp()/br_do_suppress_nd(). Broadcast packets are answered by the bridge and are then not flooded. Currently, unicast packets are also answered by the bridge when suppression is enabled, and they are forwarded as well, which results in two replicas of the ARP reply/NA - one from the bridge and a second from the target. RFC 1122 describes a use case for unicast ARP packets - "unicast poll" - actively poll the remote host by periodically sending a point-to-point ARP request to it, and delete the entry if no ARP reply is received from N successive polls. The purpose of ARP/ND suppression is to reduce flooding in the broadcast domain. If a host is sending a unicast ARP/NS, then it means it already knows the address, the switches probably know it as well, and there will not be any flooding. In addition, the use case of unicast ARP/NS is to poll a specific host, so it does not make sense to have the switch answer on behalf of the host. According to RFC 9161: "A PE SHOULD reply to broadcast/multicast address resolution messages, i.e., ARP Requests, ARP probes, NS messages, as well as DAD NS messages. An ARP probe is an ARP Request constructed with an all-zero sender IP address that may be used by hosts for IPv4 Address Conflict Detection as specified in [RFC5227]. A PE SHOULD NOT reply to unicast address resolution requests (for instance, NUD NS messages)." Forward such requests and prevent the bridge from replying to them.
Reported-by: Denis Yulevych Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/6bf745a149ddfe5e6be8da684a63aa574a326f8d.1744123493.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_arp_nd_proxy.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 115a23054a58d..1e2b51769eec8 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -160,6 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + parp->ar_op == htons(ARPOP_REQUEST)) + return; if (parp->ar_op != htons(ARPOP_RREQUEST) && parp->ar_op != htons(ARPOP_RREPLY) && (ipv4_is_zeronet(sip) || sip == tip)) { @@ -410,6 +413,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) + return; + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ From 0ffb594212a0175259084b6bdaf62181d42c0491 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Tue, 8 Apr 2025 17:40:24 +0200 Subject: [PATCH 036/770] selftests: test_bridge_neigh_suppress: Test unicast ARP/NS with suppression Add test cases to check that unicast ARP/NS packets are replied once, even if ARP/ND suppression is enabled. Without the previous patch: $ ./test_bridge_neigh_suppress.sh ... Unicast ARP, per-port ARP suppression - VLAN 10 ----------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast ARP, suppression on, h1 filter [FAIL] TEST: Unicast ARP, suppression on, h2 filter [ OK ] Unicast ARP, per-port ARP suppression - VLAN 20 ----------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast ARP, suppression on, h1 filter [FAIL] TEST: Unicast ARP, suppression on, h2 filter [ OK ] ... Unicast NS, per-port NS suppression - VLAN 10 --------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast NS, suppression on, h1 filter [FAIL] TEST: Unicast NS, suppression on, h2 filter [ OK ] Unicast NS, per-port NS suppression - VLAN 20 --------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast NS, suppression on, h1 filter [FAIL] TEST: Unicast NS, suppression on, h2 filter [ OK ] ... Tests passed: 156 Tests failed: 4 With the previous patch: $ ./test_bridge_neigh_suppress.sh ... Unicast ARP, per-port ARP suppression - VLAN 10 ----------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast ARP, suppression on, h1 filter [ OK ] TEST: Unicast ARP, suppression on, h2 filter [ OK ] Unicast ARP, per-port ARP suppression - VLAN 20 ----------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast ARP, suppression on, h1 filter [ OK ] TEST: Unicast ARP, suppression on, h2 filter [ OK ] ... 
Unicast NS, per-port NS suppression - VLAN 10 --------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast NS, suppression on, h1 filter [ OK ] TEST: Unicast NS, suppression on, h2 filter [ OK ] Unicast NS, per-port NS suppression - VLAN 20 --------------------------------------------- TEST: "neigh_suppress" is on [ OK ] TEST: Unicast NS, suppression on, h1 filter [ OK ] TEST: Unicast NS, suppression on, h2 filter [ OK ] ... Tests passed: 160 Tests failed: 0 Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/dc240b9649b31278295189f412223f320432c5f2.1744123493.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/test_bridge_neigh_suppress.sh | 125 ++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh index 02b986c9c247d..9067197c90550 100755 --- a/tools/testing/selftests/net/test_bridge_neigh_suppress.sh +++ b/tools/testing/selftests/net/test_bridge_neigh_suppress.sh @@ -51,7 +51,9 @@ ret=0 # All tests in this script. Can be overridden with -t option. TESTS=" neigh_suppress_arp + neigh_suppress_uc_arp neigh_suppress_ns + neigh_suppress_uc_ns neigh_vlan_suppress_arp neigh_vlan_suppress_ns " @@ -388,6 +390,52 @@ neigh_suppress_arp() neigh_suppress_arp_common $vid $sip $tip } +neigh_suppress_uc_arp_common() +{ + local vid=$1; shift + local sip=$1; shift + local tip=$1; shift + local tmac + + echo + echo "Unicast ARP, per-port ARP suppression - VLAN $vid" + echo "-----------------------------------------------" + + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 neigh replace $tip lladdr $tmac nud permanent dev br0.$vid" + + run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto arp flower arp_sip $tip arp_op reply action pass" + + run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto arp flower arp_tip $sip arp_op reply action pass" + + run_cmd "ip netns exec $h1 mausezahn eth0.$vid -c 1 -a own -b $tmac -t arp 'request sip=$sip, tip=$tip, tmac=$tmac' -q" + tc_check_packets $h1 "dev eth0.$vid ingress" 101 1 + log_test $? 0 "Unicast ARP, suppression on, h1 filter" + tc_check_packets $h2 "dev eth0.$vid egress" 101 1 + log_test $? 
0 "Unicast ARP, suppression on, h2 filter" +} + +neigh_suppress_uc_arp() +{ + local vid=10 + local sip=192.0.2.1 + local tip=192.0.2.2 + + neigh_suppress_uc_arp_common $vid $sip $tip + + vid=20 + sip=192.0.2.17 + tip=192.0.2.18 + neigh_suppress_uc_arp_common $vid $sip $tip +} + neigh_suppress_ns_common() { local vid=$1; shift @@ -494,6 +542,78 @@ neigh_suppress_ns() neigh_suppress_ns_common $vid $saddr $daddr $maddr } +icmpv6_header_get() +{ + local csum=$1; shift + local tip=$1; shift + local type + local p + + # Type 135 (Neighbor Solicitation), hex format + type="87" + p=$(: + )"$type:"$( : ICMPv6.type + )"00:"$( : ICMPv6.code + )"$csum:"$( : ICMPv6.checksum + )"00:00:00:00:"$( : Reserved + )"$tip:"$( : Target Address + ) + echo $p +} + +neigh_suppress_uc_ns_common() +{ + local vid=$1; shift + local sip=$1; shift + local dip=$1; shift + local full_dip=$1; shift + local csum=$1; shift + local tmac + + echo + echo "Unicast NS, per-port NS suppression - VLAN $vid" + echo "---------------------------------------------" + + run_cmd "bridge -n $sw1 link set dev vx0 neigh_suppress on" + run_cmd "bridge -n $sw1 -d link show dev vx0 | grep \"neigh_suppress on\"" + log_test $? 0 "\"neigh_suppress\" is on" + + tmac=$(ip -n $h2 -j -p link show eth0.$vid | jq -r '.[]["address"]') + run_cmd "bridge -n $sw1 fdb replace $tmac dev vx0 master static vlan $vid" + run_cmd "ip -n $sw1 -6 neigh replace $dip lladdr $tmac nud permanent dev br0.$vid" + + run_cmd "tc -n $h1 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h1 filter replace dev eth0.$vid ingress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 src_ip $dip type 136 code 0 action pass" + + run_cmd "tc -n $h2 qdisc replace dev eth0.$vid clsact" + run_cmd "tc -n $h2 filter replace dev eth0.$vid egress pref 1 handle 101 proto ipv6 flower ip_proto icmpv6 dst_ip $sip type 136 code 0 action pass" + + run_cmd "ip netns exec $h1 mausezahn -6 eth0.$vid -c 1 -a own -b $tmac -A $sip -B $dip -t ip hop=255,next=58,payload=$(icmpv6_header_get $csum $full_dip) -q" + tc_check_packets $h1 "dev eth0.$vid ingress" 101 1 + log_test $? 0 "Unicast NS, suppression on, h1 filter" + tc_check_packets $h2 "dev eth0.$vid egress" 101 1 + log_test $? 0 "Unicast NS, suppression on, h2 filter" +} + +neigh_suppress_uc_ns() +{ + local vid=10 + local saddr=2001:db8:1::1 + local daddr=2001:db8:1::2 + local full_daddr=20:01:0d:b8:00:01:00:00:00:00:00:00:00:00:00:02 + local csum="ef:79" + + neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum + + vid=20 + saddr=2001:db8:2::1 + daddr=2001:db8:2::2 + full_daddr=20:01:0d:b8:00:02:00:00:00:00:00:00:00:00:00:02 + csum="ef:76" + + neigh_suppress_uc_ns_common $vid $saddr $daddr $full_daddr $csum +} + neigh_vlan_suppress_arp() { local vid1=10 @@ -825,6 +945,11 @@ if [ ! -x "$(command -v jq)" ]; then exit $ksft_skip fi +if [ ! -x "$(command -v mausezahn)" ]; then + echo "SKIP: Could not run test without mausezahn tool" + exit $ksft_skip +fi + bridge link help 2>&1 | grep -q "neigh_vlan_suppress" if [ $? -ne 0 ]; then echo "SKIP: iproute2 bridge too old, missing per-VLAN neighbor suppression support" From 6a07e3af4973402fa199a80036c10060b922c92c Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 8 Apr 2025 11:26:58 +0200 Subject: [PATCH 037/770] net: ethernet: cortina: Use TOE/TSO on all TCP It is desireable to push the hardware accelerator to also process non-segmented TCP frames: we pass the skb->len to the "TOE/TSO" offloader and it will handle them. 
Without this quirk the driver becomes unstable and locks up and crashes. I do not know exactly why, but it is probably due to the TOE (TCP offload engine) feature that is coupled with the segmentation feature - it is not possible to turn one part off and not the other, either both TOE and TSO are active, or neither of them. Not having the TOE part active seems detrimental, as if that hardware feature is not really supposed to be turned off. The datasheet says: "Based on packet parsing and TCP connection/NAT table lookup results, the NetEngine puts the packets belonging to the same TCP connection to the same queue for the software to process. The NetEngine puts incoming packets to the buffer or series of buffers for a jumbo packet. With this hardware acceleration, IP/TCP header parsing, checksum validation and connection lookup are offloaded from the software processing." After numerous iperf3 tests, with the hardware locking up after anything between minutes and hours depending on load, I have concluded this is necessary to stabilize the hardware. Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20250408-gemini-ethernet-tso-always-v1-1-e669f932359c@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cortina/gemini.c | 37 +++++++++++++++++++++------ 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/cortina/gemini.c b/drivers/net/ethernet/cortina/gemini.c index 517a15904fb08..6a2004bbe87f9 100644 --- a/drivers/net/ethernet/cortina/gemini.c +++ b/drivers/net/ethernet/cortina/gemini.c @@ -1144,6 +1144,7 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, struct gmac_txdesc *txd; skb_frag_t *skb_frag; dma_addr_t mapping; + bool tcp = false; void *buffer; u16 mss; int ret; @@ -1151,6 +1152,13 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, word1 = skb->len; word3 = SOF_BIT; + /* Determine if we are doing TCP */ + if (skb->protocol == htons(ETH_P_IP)) + tcp = (ip_hdr(skb)->protocol == IPPROTO_TCP); + else + /* IPv6 */ + tcp = (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP); + mss = skb_shinfo(skb)->gso_size; if (mss) { /* This means we are dealing with TCP and skb->len is the @@ -1163,8 +1171,26 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, mss, skb->len); word1 |= TSS_MTU_ENABLE_BIT; word3 |= mss; + } else if (tcp) { + /* Even if we are not using TSO, use the hardware offloader + * for transferring the TCP frame: this hardware has partial + * TCP awareness (called TOE - TCP Offload Engine) and will + * according to the datasheet put packets belonging to the + * same TCP connection in the same queue for the TOE/TSO + * engine to process. The engine will deal with chopping + * up frames that exceed ETH_DATA_LEN which the + * checksumming engine cannot handle (see below) into + * manageable chunks. It flawlessly deals with quite big + * frames and frames containing custom DSA EtherTypes. + */ + mss = netdev->mtu + skb_tcp_all_headers(skb); + mss = min(mss, skb->len); + netdev_dbg(netdev, "TOE/TSO len %04x mtu %04x mss %04x\n", + skb->len, netdev->mtu, mss); + word1 |= TSS_MTU_ENABLE_BIT; + word3 |= mss; } else if (skb->len >= ETH_FRAME_LEN) { - /* Hardware offloaded checksumming isn't working on frames + /* Hardware offloaded checksumming isn't working on non-TCP frames * bigger than 1514 bytes.
A hypothesis about this is that the * checksum buffer is only 1518 bytes, so when the frames get * bigger they get truncated, or the last few bytes get @@ -1181,21 +1207,16 @@ static int gmac_map_tx_bufs(struct net_device *netdev, struct sk_buff *skb, } if (skb->ip_summed == CHECKSUM_PARTIAL) { - int tcp = 0; - /* We do not switch off the checksumming on non TCP/UDP * frames: as is shown from tests, the checksumming engine * is smart enough to see that a frame is not actually TCP * or UDP and then just pass it through without any changes * to the frame. */ - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) word1 |= TSS_IP_CHKSUM_BIT; - tcp = ip_hdr(skb)->protocol == IPPROTO_TCP; - } else { /* IPv6 */ + else word1 |= TSS_IPV6_ENABLE_BIT; - tcp = ipv6_hdr(skb)->nexthdr == IPPROTO_TCP; - } word1 |= tcp ? TSS_TCP_CHKSUM_BIT : TSS_UDP_CHKSUM_BIT; } From 29264a372da9a3d2a087953756cf3a28e79f7317 Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:51 +0800 Subject: [PATCH 038/770] net: libwx: Add mailbox api for wangxun pf drivers Implements the mailbox interfaces for wangxun pf drivers ngbe and txgbe. Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/70017BD4D67614A4+20250408091556.9640-2-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/Makefile | 2 +- drivers/net/ethernet/wangxun/libwx/wx_mbx.c | 176 +++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_mbx.h | 32 ++++ drivers/net/ethernet/wangxun/libwx/wx_type.h | 8 + 4 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/wangxun/libwx/wx_mbx.c create mode 100644 drivers/net/ethernet/wangxun/libwx/wx_mbx.h diff --git a/drivers/net/ethernet/wangxun/libwx/Makefile b/drivers/net/ethernet/wangxun/libwx/Makefile index e9f0f1f2309bf..cd1675ef32531 100644 --- a/drivers/net/ethernet/wangxun/libwx/Makefile +++ b/drivers/net/ethernet/wangxun/libwx/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_LIBWX) += libwx.o -libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o +libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o wx_mbx.o diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.c b/drivers/net/ethernet/wangxun/libwx/wx_mbx.c new file mode 100644 index 0000000000000..73af5f11c3bdc --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. 
*/ + +#include +#include "wx_type.h" +#include "wx_mbx.h" + +/** + * wx_obtain_mbx_lock_pf - obtain mailbox lock + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +static int wx_obtain_mbx_lock_pf(struct wx *wx, u16 vf) +{ + int count = 5; + u32 mailbox; + + while (count--) { + /* Take ownership of the buffer */ + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_PFU); + + /* reserve mailbox for vf use */ + mailbox = rd32(wx, WX_PXMAILBOX(vf)); + if (mailbox & WX_PXMAILBOX_PFU) + return 0; + else if (count) + udelay(10); + } + wx_err(wx, "Failed to obtain mailbox lock for PF%d", vf); + + return -EBUSY; +} + +static int wx_check_for_bit_pf(struct wx *wx, u32 mask, int index) +{ + u32 mbvficr = rd32(wx, WX_MBVFICR(index)); + + if (!(mbvficr & mask)) + return -EBUSY; + wr32(wx, WX_MBVFICR(index), mask); + + return 0; +} + +/** + * wx_check_for_ack_pf - checks to see if the VF has acked + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 if the VF has set the status bit or else -EBUSY + **/ +int wx_check_for_ack_pf(struct wx *wx, u16 vf) +{ + u32 index = vf / 16, vf_bit = vf % 16; + + return wx_check_for_bit_pf(wx, + FIELD_PREP(WX_MBVFICR_VFACK_MASK, + BIT(vf_bit)), + index); +} + +/** + * wx_check_for_msg_pf - checks to see if the VF has sent mail + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 if the VF has got req bit or else -EBUSY + **/ +int wx_check_for_msg_pf(struct wx *wx, u16 vf) +{ + u32 index = vf / 16, vf_bit = vf % 16; + + return wx_check_for_bit_pf(wx, + FIELD_PREP(WX_MBVFICR_VFREQ_MASK, + BIT(vf_bit)), + index); +} + +/** + * wx_write_mbx_pf - Places a message in the mailbox + * @wx: pointer to the HW structure + * @msg: The message buffer + * @size: Length of buffer + * @vf: the VF index + * + * Return: return 0 on success and -EINVAL/-EBUSY on failure + **/ +int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf) +{ + struct wx_mbx_info *mbx = &wx->mbx; + int ret, i; + + /* mbx->size is up to 15 */ + if (size > mbx->size) { + wx_err(wx, "Invalid mailbox message size %d", size); + return -EINVAL; + } + + /* lock the mailbox to prevent pf/vf race condition */ + ret = wx_obtain_mbx_lock_pf(wx, vf); + if (ret) + return ret; + + /* flush msg and acks as we are overwriting the message buffer */ + wx_check_for_msg_pf(wx, vf); + wx_check_for_ack_pf(wx, vf); + + /* copy the caller specified message to the mailbox memory buffer */ + for (i = 0; i < size; i++) + wr32a(wx, WX_PXMBMEM(vf), i, msg[i]); + + /* Interrupt VF to tell it a message has been sent and release buffer */ + /* set mirrored mailbox flags */ + wr32a(wx, WX_PXMBMEM(vf), WX_VXMAILBOX_SIZE, WX_PXMAILBOX_STS); + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_STS); + + return 0; +} + +/** + * wx_read_mbx_pf - Read a message from the mailbox + * @wx: pointer to the HW structure + * @msg: The message buffer + * @size: Length of buffer + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +int wx_read_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf) +{ + struct wx_mbx_info *mbx = &wx->mbx; + int ret; + u16 i; + + /* limit read to size of mailbox and mbx->size is up to 15 */ + if (size > mbx->size) + size = mbx->size; + + /* lock the mailbox to prevent pf/vf race condition */ + ret = wx_obtain_mbx_lock_pf(wx, vf); + if (ret) + return ret; + + for (i = 0; i < size; i++) + msg[i] = rd32a(wx, WX_PXMBMEM(vf), i); + + /* Acknowledge the message and release buffer */ + /* 
set mirrored mailbox flags */ + wr32a(wx, WX_PXMBMEM(vf), WX_VXMAILBOX_SIZE, WX_PXMAILBOX_ACK); + wr32(wx, WX_PXMAILBOX(vf), WX_PXMAILBOX_ACK); + + return 0; +} + +/** + * wx_check_for_rst_pf - checks to see if the VF has reset + * @wx: pointer to the HW structure + * @vf: the VF index + * + * Return: return 0 on success and -EBUSY on failure + **/ +int wx_check_for_rst_pf(struct wx *wx, u16 vf) +{ + u32 reg_offset = WX_VF_REG_OFFSET(vf); + u32 vf_shift = WX_VF_IND_SHIFT(vf); + u32 vflre = 0; + + vflre = rd32(wx, WX_VFLRE(reg_offset)); + if (!(vflre & BIT(vf_shift))) + return -EBUSY; + wr32(wx, WX_VFLREC(reg_offset), BIT(vf_shift)); + + return 0; +} diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h new file mode 100644 index 0000000000000..c09b25f3e43d9 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ +#ifndef _WX_MBX_H_ +#define _WX_MBX_H_ + +#define WX_VXMAILBOX_SIZE 15 + +/* PF Registers */ +#define WX_PXMAILBOX(i) (0x600 + (4 * (i))) /* i=[0,63] */ +#define WX_PXMAILBOX_STS BIT(0) /* Initiate message send to VF */ +#define WX_PXMAILBOX_ACK BIT(1) /* Ack message recv'd from VF */ +#define WX_PXMAILBOX_PFU BIT(3) /* PF owns the mailbox buffer */ + +#define WX_PXMBMEM(i) (0x5000 + (64 * (i))) /* i=[0,63] */ + +#define WX_VFLRE(i) (0x4A0 + (4 * (i))) /* i=[0,1] */ +#define WX_VFLREC(i) (0x4A8 + (4 * (i))) /* i=[0,1] */ + +/* SR-IOV specific macros */ +#define WX_MBVFICR(i) (0x480 + (4 * (i))) /* i=[0,3] */ +#define WX_MBVFICR_VFREQ_MASK GENMASK(15, 0) +#define WX_MBVFICR_VFACK_MASK GENMASK(31, 16) + +#define WX_VT_MSGINFO_MASK GENMASK(23, 16) + +int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); +int wx_read_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); +int wx_check_for_rst_pf(struct wx *wx, u16 mbx_id); +int wx_check_for_msg_pf(struct wx *wx, u16 mbx_id); +int wx_check_for_ack_pf(struct wx *wx, u16 mbx_id); + +#endif /* _WX_MBX_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 5b230ecbbabb5..18287b0ee0401 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -22,6 +22,9 @@ #define WX_PCI_LINK_STATUS 0xB2 /**************** Global Registers ****************************/ +#define WX_VF_REG_OFFSET(_v) FIELD_GET(GENMASK(15, 5), (_v)) +#define WX_VF_IND_SHIFT(_v) FIELD_GET(GENMASK(4, 0), (_v)) + /* chip control Registers */ #define WX_MIS_PWR 0x10000 #define WX_MIS_RST 0x1000C @@ -779,6 +782,10 @@ struct wx_bus_info { u16 device; }; +struct wx_mbx_info { + u16 size; +}; + struct wx_thermal_sensor_data { s16 temp; s16 alarm_thresh; @@ -1129,6 +1136,7 @@ struct wx { struct pci_dev *pdev; struct net_device *netdev; struct wx_bus_info bus; + struct wx_mbx_info mbx; struct wx_mac_info mac; enum em_mac_type mac_type; enum sp_media_type media_type; From 9bfd65980f8d9786cf4159ef1cbdfd7e43ec39f4 Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:52 +0800 Subject: [PATCH 039/770] net: libwx: Add sriov api for wangxun nics Implement sriov_configure interface for wangxun nics in libwx. Enable VT mode and initialize vf control structure, when sriov is enabled. Do not be allowed to disable sriov when vfs are assigned. 
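To show where the new entry point plugs in, a PF driver would typically export it as its PCI sriov_configure callback (a sketch; the driver struct and the other callbacks here are hypothetical, only wx_pci_sriov_configure comes from this patch):

static struct pci_driver sketch_wx_pf_driver = {
        .name            = "sketch-wx-pf",
        .id_table        = sketch_pci_tbl,
        .probe           = sketch_probe,
        .remove          = sketch_remove,
        /* reached when userspace writes N to the device's
         * sriov_numvfs sysfs attribute; returns the number of VFs
         * enabled, 0 on disable, or a negative errno
         */
        .sriov_configure = wx_pci_sriov_configure,
};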
Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/81EA45C21B0A98B0+20250408091556.9640-3-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/Makefile | 2 +- drivers/net/ethernet/wangxun/libwx/wx_mbx.h | 4 + drivers/net/ethernet/wangxun/libwx/wx_sriov.c | 200 ++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_sriov.h | 14 ++ drivers/net/ethernet/wangxun/libwx/wx_type.h | 31 +++ 5 files changed, 250 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/wangxun/libwx/wx_sriov.c create mode 100644 drivers/net/ethernet/wangxun/libwx/wx_sriov.h diff --git a/drivers/net/ethernet/wangxun/libwx/Makefile b/drivers/net/ethernet/wangxun/libwx/Makefile index cd1675ef32531..9b78b604a94e4 100644 --- a/drivers/net/ethernet/wangxun/libwx/Makefile +++ b/drivers/net/ethernet/wangxun/libwx/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_LIBWX) += libwx.o -libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o wx_mbx.o +libwx-objs := wx_hw.o wx_lib.o wx_ethtool.o wx_ptp.o wx_mbx.o wx_sriov.o diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h index c09b25f3e43d9..172b46b9c14dc 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h @@ -23,6 +23,10 @@ #define WX_VT_MSGINFO_MASK GENMASK(23, 16) +enum wxvf_xcast_modes { + WXVF_XCAST_MODE_NONE = 0, +}; + int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); int wx_read_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); int wx_check_for_rst_pf(struct wx *wx, u16 mbx_id); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c new file mode 100644 index 0000000000000..ce7469e1cf3e1 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. 
*/ + +#include +#include + +#include "wx_type.h" +#include "wx_mbx.h" +#include "wx_sriov.h" + +static void wx_vf_configuration(struct pci_dev *pdev, int event_mask) +{ + bool enable = !!WX_VF_ENABLE_CHECK(event_mask); + struct wx *wx = pci_get_drvdata(pdev); + u32 vfn = WX_VF_NUM_GET(event_mask); + + if (enable) + eth_zero_addr(wx->vfinfo[vfn].vf_mac_addr); +} + +static int wx_alloc_vf_macvlans(struct wx *wx, u8 num_vfs) +{ + struct vf_macvlans *mv_list; + int num_vf_macvlans, i; + + /* Initialize list of VF macvlans */ + INIT_LIST_HEAD(&wx->vf_mvs.mvlist); + + num_vf_macvlans = wx->mac.num_rar_entries - + (WX_MAX_PF_MACVLANS + 1 + num_vfs); + if (!num_vf_macvlans) + return -EINVAL; + + mv_list = kcalloc(num_vf_macvlans, sizeof(struct vf_macvlans), + GFP_KERNEL); + if (!mv_list) + return -ENOMEM; + + for (i = 0; i < num_vf_macvlans; i++) { + mv_list[i].vf = -1; + mv_list[i].free = true; + list_add(&mv_list[i].mvlist, &wx->vf_mvs.mvlist); + } + wx->mv_list = mv_list; + + return 0; +} + +static void wx_sriov_clear_data(struct wx *wx) +{ + /* set num VFs to 0 to prevent access to vfinfo */ + wx->num_vfs = 0; + + /* free VF control structures */ + kfree(wx->vfinfo); + wx->vfinfo = NULL; + + /* free macvlan list */ + kfree(wx->mv_list); + wx->mv_list = NULL; + + /* set default pool back to 0 */ + wr32m(wx, WX_PSR_VM_CTL, WX_PSR_VM_CTL_POOL_MASK, 0); + wx->ring_feature[RING_F_VMDQ].offset = 0; + + clear_bit(WX_FLAG_SRIOV_ENABLED, wx->flags); + /* Disable VMDq flag so device will be set in NM mode */ + if (wx->ring_feature[RING_F_VMDQ].limit == 1) + clear_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); +} + +static int __wx_enable_sriov(struct wx *wx, u8 num_vfs) +{ + int i, ret = 0; + u32 value = 0; + + set_bit(WX_FLAG_SRIOV_ENABLED, wx->flags); + wx_err(wx, "SR-IOV enabled with %d VFs\n", num_vfs); + + /* Enable VMDq flag so device will be set in VM mode */ + set_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); + if (!wx->ring_feature[RING_F_VMDQ].limit) + wx->ring_feature[RING_F_VMDQ].limit = 1; + wx->ring_feature[RING_F_VMDQ].offset = num_vfs; + + wx->vfinfo = kcalloc(num_vfs, sizeof(struct vf_data_storage), + GFP_KERNEL); + if (!wx->vfinfo) + return -ENOMEM; + + ret = wx_alloc_vf_macvlans(wx, num_vfs); + if (ret) + return ret; + + /* Initialize default switching mode VEB */ + wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_SW_EN, WX_PSR_CTL_SW_EN); + + for (i = 0; i < num_vfs; i++) { + /* enable spoof checking for all VFs */ + wx->vfinfo[i].spoofchk_enabled = true; + wx->vfinfo[i].link_enable = true; + /* untrust all VFs */ + wx->vfinfo[i].trusted = false; + /* set the default xcast mode */ + wx->vfinfo[i].xcast_mode = WXVF_XCAST_MODE_NONE; + } + + if (wx->mac.type == wx_mac_em) { + value = WX_CFG_PORT_CTL_NUM_VT_8; + } else { + if (num_vfs < 32) + value = WX_CFG_PORT_CTL_NUM_VT_32; + else + value = WX_CFG_PORT_CTL_NUM_VT_64; + } + wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_NUM_VT_MASK, + value); + + return ret; +} + +static void wx_sriov_reinit(struct wx *wx) +{ + rtnl_lock(); + wx->setup_tc(wx->netdev, netdev_get_num_tc(wx->netdev)); + rtnl_unlock(); +} + +void wx_disable_sriov(struct wx *wx) +{ + if (!pci_vfs_assigned(wx->pdev)) + pci_disable_sriov(wx->pdev); + else + wx_err(wx, "Unloading driver while VFs are assigned.\n"); + + /* clear flags and free allloced data */ + wx_sriov_clear_data(wx); +} +EXPORT_SYMBOL(wx_disable_sriov); + +static int wx_pci_sriov_enable(struct pci_dev *dev, + int num_vfs) +{ + struct wx *wx = pci_get_drvdata(dev); + int err = 0, i; + + err = __wx_enable_sriov(wx, num_vfs); + if (err) + return 
err; + + wx->num_vfs = num_vfs; + for (i = 0; i < wx->num_vfs; i++) + wx_vf_configuration(dev, (i | WX_VF_ENABLE)); + + /* reset before enabling SRIOV to avoid mailbox issues */ + wx_sriov_reinit(wx); + + err = pci_enable_sriov(dev, num_vfs); + if (err) { + wx_err(wx, "Failed to enable PCI sriov: %d\n", err); + goto err_out; + } + + return num_vfs; +err_out: + wx_sriov_clear_data(wx); + return err; +} + +static void wx_pci_sriov_disable(struct pci_dev *dev) +{ + struct wx *wx = pci_get_drvdata(dev); + + wx_disable_sriov(wx); + wx_sriov_reinit(wx); +} + +int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) +{ + struct wx *wx = pci_get_drvdata(pdev); + int err; + + if (!num_vfs) { + if (!pci_vfs_assigned(pdev)) { + wx_pci_sriov_disable(pdev); + return 0; + } + + wx_err(wx, "can't free VFs because some are assigned to VMs.\n"); + return -EBUSY; + } + + err = wx_pci_sriov_enable(pdev, num_vfs); + if (err) + return err; + + return num_vfs; +} +EXPORT_SYMBOL(wx_pci_sriov_configure); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h new file mode 100644 index 0000000000000..e5fd96b7c5984 --- /dev/null +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2015 - 2025 Beijing WangXun Technology Co., Ltd. */ + +#ifndef _WX_SRIOV_H_ +#define _WX_SRIOV_H_ + +#define WX_VF_ENABLE_CHECK(_m) FIELD_GET(BIT(31), (_m)) +#define WX_VF_NUM_GET(_m) FIELD_GET(GENMASK(5, 0), (_m)) +#define WX_VF_ENABLE BIT(31) + +void wx_disable_sriov(struct wx *wx); +int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs); + +#endif /* _WX_SRIOV_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 18287b0ee0401..26776dee64903 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -20,6 +20,7 @@ /* MSI-X capability fields masks */ #define WX_PCIE_MSIX_TBL_SZ_MASK 0x7FF #define WX_PCI_LINK_STATUS 0xB2 +#define WX_MAX_PF_MACVLANS 15 /**************** Global Registers ****************************/ #define WX_VF_REG_OFFSET(_v) FIELD_GET(GENMASK(15, 5), (_v)) @@ -93,6 +94,9 @@ #define WX_CFG_TAG_TPID(_i) (0x14430 + ((_i) * 4)) #define WX_CFG_PORT_CTL_NUM_VT_MASK GENMASK(13, 12) /* number of TVs */ +#define WX_CFG_PORT_CTL_NUM_VT_8 FIELD_PREP(GENMASK(13, 12), 1) +#define WX_CFG_PORT_CTL_NUM_VT_32 FIELD_PREP(GENMASK(13, 12), 2) +#define WX_CFG_PORT_CTL_NUM_VT_64 FIELD_PREP(GENMASK(13, 12), 3) /* GPIO Registers */ #define WX_GPIO_DR 0x14800 @@ -168,6 +172,7 @@ /******************************* PSR Registers *******************************/ /* psr control */ #define WX_PSR_CTL 0x15000 +#define WX_PSR_VM_CTL 0x151B0 /* Header split receive */ #define WX_PSR_CTL_SW_EN BIT(18) #define WX_PSR_CTL_RSC_ACK BIT(17) @@ -205,6 +210,7 @@ /* mcasst/ucast overflow tbl */ #define WX_PSR_MC_TBL(_i) (0x15200 + ((_i) * 4)) #define WX_PSR_UC_TBL(_i) (0x15400 + ((_i) * 4)) +#define WX_PSR_VM_CTL_POOL_MASK GENMASK(12, 7) /* VM L2 contorl */ #define WX_PSR_VM_L2CTL(_i) (0x15600 + ((_i) * 4)) @@ -1059,6 +1065,7 @@ struct wx_ring_feature { enum wx_ring_f_enum { RING_F_NONE = 0, + RING_F_VMDQ, RING_F_RSS, RING_F_FDIR, RING_F_ARRAY_SIZE /* must be last in enum set */ @@ -1114,8 +1121,27 @@ enum wx_state { WX_STATE_NBITS /* must be last */ }; +struct vf_data_storage { + struct pci_dev *vfdev; + unsigned char vf_mac_addr[ETH_ALEN]; + bool spoofchk_enabled; + bool link_enable; + bool trusted; + int 
xcast_mode; +}; + +struct vf_macvlans { + struct list_head mvlist; + int vf; + bool free; + bool is_macvlan; + u8 vf_macvlan[ETH_ALEN]; +}; + enum wx_pf_flags { WX_FLAG_SWFW_RING, + WX_FLAG_VMDQ_ENABLED, + WX_FLAG_SRIOV_ENABLED, WX_FLAG_FDIR_CAPABLE, WX_FLAG_FDIR_HASH, WX_FLAG_FDIR_PERFECT, @@ -1220,10 +1246,15 @@ struct wx { u64 hw_csum_rx_good; u64 hw_csum_rx_error; u64 alloc_rx_buff_failed; + unsigned int num_vfs; + struct vf_data_storage *vfinfo; + struct vf_macvlans vf_mvs; + struct vf_macvlans *mv_list; u32 atr_sample_rate; void (*atr)(struct wx_ring *ring, struct wx_tx_buffer *first, u8 ptype); void (*configure_fdir)(struct wx *wx); + int (*setup_tc)(struct net_device *netdev, u8 tc); void (*do_reset)(struct net_device *netdev); int (*ptp_setup_sdp)(struct wx *wx); From c52d4b898901f697c3b0b1bc2c61480b70dcc976 Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:53 +0800 Subject: [PATCH 040/770] net: libwx: Redesign flow when sriov is enabled Reallocate queue and int resources when sriov is enabled. Redefine macro VMDQ to make it work in VT mode. Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/64B616774ABE3C5A+20250408091556.9640-4-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 296 ++++++++++++++++++- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 128 +++++++- drivers/net/ethernet/wangxun/libwx/wx_type.h | 34 ++- 3 files changed, 441 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index aed45abafb1b7..3d2ebbaa729d7 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -10,6 +10,7 @@ #include "wx_type.h" #include "wx_lib.h" +#include "wx_sriov.h" #include "wx_hw.h" static int wx_phy_read_reg_mdi(struct mii_bus *bus, int phy_addr, int devnum, int regnum) @@ -964,11 +965,28 @@ static void wx_sync_mac_table(struct wx *wx) } } +static void wx_full_sync_mac_table(struct wx *wx) +{ + int i; + + for (i = 0; i < wx->mac.num_rar_entries; i++) { + if (wx->mac_table[i].state & WX_MAC_STATE_IN_USE) { + wx_set_rar(wx, i, + wx->mac_table[i].addr, + wx->mac_table[i].pools, + WX_PSR_MAC_SWC_AD_H_AV); + } else { + wx_clear_rar(wx, i); + } + wx->mac_table[i].state &= ~(WX_MAC_STATE_MODIFIED); + } +} + /* this function destroys the first RAR entry */ void wx_mac_set_default_filter(struct wx *wx, u8 *addr) { memcpy(&wx->mac_table[0].addr, addr, ETH_ALEN); - wx->mac_table[0].pools = 1ULL; + wx->mac_table[0].pools = BIT(VMDQ_P(0)); wx->mac_table[0].state = (WX_MAC_STATE_DEFAULT | WX_MAC_STATE_IN_USE); wx_set_rar(wx, 0, wx->mac_table[0].addr, wx->mac_table[0].pools, @@ -1206,6 +1224,35 @@ static void wx_update_mc_addr_list(struct wx *wx, struct net_device *netdev) wx_dbg(wx, "Update mc addr list Complete\n"); } +static void wx_restore_vf_multicasts(struct wx *wx) +{ + u32 i, j, vector_bit, vector_reg; + struct vf_data_storage *vfinfo; + + for (i = 0; i < wx->num_vfs; i++) { + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(i)); + + vfinfo = &wx->vfinfo[i]; + for (j = 0; j < vfinfo->num_vf_mc_hashes; j++) { + wx->addr_ctrl.mta_in_use++; + vector_reg = WX_PSR_MC_TBL_REG(vfinfo->vf_mc_hashes[j]); + vector_bit = WX_PSR_MC_TBL_BIT(vfinfo->vf_mc_hashes[j]); + wr32m(wx, WX_PSR_MC_TBL(vector_reg), + BIT(vector_bit), BIT(vector_bit)); + /* errata 5: maintain a copy of the reg table conf */ + wx->mac.mta_shadow[vector_reg] |= BIT(vector_bit); + } + if (vfinfo->num_vf_mc_hashes) + vmolr |= 
WX_PSR_VM_L2CTL_ROMPE; + else + vmolr &= ~WX_PSR_VM_L2CTL_ROMPE; + wr32(wx, WX_PSR_VM_L2CTL(i), vmolr); + } + + /* Restore any VF macvlans */ + wx_full_sync_mac_table(wx); +} + /** * wx_write_mc_addr_list - write multicast addresses to MTA * @netdev: network interface device structure @@ -1223,6 +1270,9 @@ static int wx_write_mc_addr_list(struct net_device *netdev) wx_update_mc_addr_list(wx, netdev); + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + wx_restore_vf_multicasts(wx); + return netdev_mc_count(netdev); } @@ -1243,7 +1293,7 @@ int wx_set_mac(struct net_device *netdev, void *p) if (retval) return retval; - wx_del_mac_filter(wx, wx->mac.addr, 0); + wx_del_mac_filter(wx, wx->mac.addr, VMDQ_P(0)); eth_hw_addr_set(netdev, addr->sa_data); memcpy(wx->mac.addr, addr->sa_data, netdev->addr_len); @@ -1345,6 +1395,10 @@ static int wx_hpbthresh(struct wx *wx) /* Calculate delay value for device */ dv_id = WX_DV(link, tc); + /* Loopback switch introduces additional latency */ + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + dv_id += WX_B2BT(tc); + /* Delay value is calculated in bit times convert to KB */ kb = WX_BT2KB(dv_id); rx_pba = rd32(wx, WX_RDB_PB_SZ(0)) >> WX_RDB_PB_SZ_SHIFT; @@ -1400,12 +1454,107 @@ static void wx_pbthresh_setup(struct wx *wx) wx->fc.low_water = 0; } +static void wx_set_ethertype_anti_spoofing(struct wx *wx, bool enable, int vf) +{ + u32 pfvfspoof, reg_offset, vf_shift; + + vf_shift = WX_VF_IND_SHIFT(vf); + reg_offset = WX_VF_REG_OFFSET(vf); + + pfvfspoof = rd32(wx, WX_TDM_ETYPE_AS(reg_offset)); + if (enable) + pfvfspoof |= BIT(vf_shift); + else + pfvfspoof &= ~BIT(vf_shift); + wr32(wx, WX_TDM_ETYPE_AS(reg_offset), pfvfspoof); +} + +static int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + struct wx *wx = netdev_priv(netdev); + u32 regval; + + if (vf >= wx->num_vfs) + return -EINVAL; + + wx->vfinfo[vf].spoofchk_enabled = setting; + + regval = (setting << vf_bit); + wr32m(wx, WX_TDM_MAC_AS(index), regval | BIT(vf_bit), regval); + + if (wx->vfinfo[vf].vlan_count) + wr32m(wx, WX_TDM_VLAN_AS(index), regval | BIT(vf_bit), regval); + + return 0; +} + +static void wx_configure_virtualization(struct wx *wx) +{ + u16 pool = wx->num_rx_pools; + u32 reg_offset, vf_shift; + u32 i; + + if (!test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + return; + + wr32m(wx, WX_PSR_VM_CTL, + WX_PSR_VM_CTL_POOL_MASK | WX_PSR_VM_CTL_REPLEN, + FIELD_PREP(WX_PSR_VM_CTL_POOL_MASK, VMDQ_P(0)) | + WX_PSR_VM_CTL_REPLEN); + while (pool--) + wr32m(wx, WX_PSR_VM_L2CTL(pool), + WX_PSR_VM_L2CTL_AUPE, WX_PSR_VM_L2CTL_AUPE); + + if (wx->mac.type == wx_mac_em) { + vf_shift = BIT(VMDQ_P(0)); + /* Enable only the PF pools for Tx/Rx */ + wr32(wx, WX_RDM_VF_RE(0), vf_shift); + wr32(wx, WX_TDM_VF_TE(0), vf_shift); + } else { + vf_shift = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_offset = WX_VF_REG_OFFSET(VMDQ_P(0)); + + /* Enable only the PF pools for Tx/Rx */ + wr32(wx, WX_RDM_VF_RE(reg_offset), GENMASK(31, vf_shift)); + wr32(wx, WX_RDM_VF_RE(reg_offset ^ 1), reg_offset - 1); + wr32(wx, WX_TDM_VF_TE(reg_offset), GENMASK(31, vf_shift)); + wr32(wx, WX_TDM_VF_TE(reg_offset ^ 1), reg_offset - 1); + } + + /* clear VLAN promisc flag so VFTA will be updated if necessary */ + clear_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + + for (i = 0; i < wx->num_vfs; i++) { + if (!wx->vfinfo[i].spoofchk_enabled) + wx_set_vf_spoofchk(wx->netdev, i, false); + /* enable ethertype anti spoofing if hw supports it */ + wx_set_ethertype_anti_spoofing(wx, true, 
i); + } +} + static void wx_configure_port(struct wx *wx) { u32 value, i; - value = WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ; + if (wx->mac.type == wx_mac_em) { + value = (wx->num_vfs == 0) ? + WX_CFG_PORT_CTL_NUM_VT_NONE : + WX_CFG_PORT_CTL_NUM_VT_8; + } else { + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) { + if (wx->ring_feature[RING_F_RSS].indices == 4) + value = WX_CFG_PORT_CTL_NUM_VT_32; + else + value = WX_CFG_PORT_CTL_NUM_VT_64; + } else { + value = 0; + } + } + + value |= WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ; wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_NUM_VT_MASK | WX_CFG_PORT_CTL_D_VLAN | WX_CFG_PORT_CTL_QINQ, value); @@ -1466,6 +1615,83 @@ static void wx_vlan_strip_control(struct wx *wx, bool enable) } } +static void wx_vlan_promisc_enable(struct wx *wx) +{ + u32 vlnctrl, i, vind, bits, reg_idx; + + vlnctrl = rd32(wx, WX_PSR_VLAN_CTL); + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) { + /* we need to keep the VLAN filter on in SRIOV */ + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + } else { + vlnctrl &= ~WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + return; + } + /* We are already in VLAN promisc, nothing to do */ + if (test_bit(WX_FLAG_VLAN_PROMISC, wx->flags)) + return; + /* Set flag so we don't redo unnecessary work */ + set_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + /* Add PF to all active pools */ + for (i = WX_PSR_VLAN_SWC_ENTRIES; --i;) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, i); + vind = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_idx = WX_VF_REG_OFFSET(VMDQ_P(0)); + bits = rd32(wx, WX_PSR_VLAN_SWC_VM(reg_idx)); + bits |= BIT(vind); + wr32(wx, WX_PSR_VLAN_SWC_VM(reg_idx), bits); + } + /* Set all bits in the VLAN filter table array */ + for (i = 0; i < wx->mac.vft_size; i++) + wr32(wx, WX_PSR_VLAN_TBL(i), U32_MAX); +} + +static void wx_scrub_vfta(struct wx *wx) +{ + u32 i, vid, bits, vfta, vind, vlvf, reg_idx; + + for (i = WX_PSR_VLAN_SWC_ENTRIES; --i;) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, i); + vlvf = rd32(wx, WX_PSR_VLAN_SWC_IDX); + /* pull VLAN ID from VLVF */ + vid = vlvf & ~WX_PSR_VLAN_SWC_VIEN; + if (vlvf & WX_PSR_VLAN_SWC_VIEN) { + /* if PF is part of this then continue */ + if (test_bit(vid, wx->active_vlans)) + continue; + } + /* remove PF from the pool */ + vind = WX_VF_IND_SHIFT(VMDQ_P(0)); + reg_idx = WX_VF_REG_OFFSET(VMDQ_P(0)); + bits = rd32(wx, WX_PSR_VLAN_SWC_VM(reg_idx)); + bits &= ~BIT(vind); + wr32(wx, WX_PSR_VLAN_SWC_VM(reg_idx), bits); + } + /* extract values from vft_shadow and write back to VFTA */ + for (i = 0; i < wx->mac.vft_size; i++) { + vfta = wx->mac.vft_shadow[i]; + wr32(wx, WX_PSR_VLAN_TBL(i), vfta); + } +} + +static void wx_vlan_promisc_disable(struct wx *wx) +{ + u32 vlnctrl; + + /* configure vlan filtering */ + vlnctrl = rd32(wx, WX_PSR_VLAN_CTL); + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); + /* We are not in VLAN promisc, nothing to do */ + if (!test_bit(WX_FLAG_VLAN_PROMISC, wx->flags)) + return; + /* Set flag so we don't redo unnecessary work */ + clear_bit(WX_FLAG_VLAN_PROMISC, wx->flags); + wx_scrub_vfta(wx); +} + void wx_set_rx_mode(struct net_device *netdev) { struct wx *wx = netdev_priv(netdev); @@ -1478,7 +1704,7 @@ void wx_set_rx_mode(struct net_device *netdev) /* Check for Promiscuous and All Multicast modes */ fctrl = rd32(wx, WX_PSR_CTL); fctrl &= ~(WX_PSR_CTL_UPE | WX_PSR_CTL_MPE); - vmolr = rd32(wx, WX_PSR_VM_L2CTL(0)); + vmolr = rd32(wx, WX_PSR_VM_L2CTL(VMDQ_P(0))); vmolr &= ~(WX_PSR_VM_L2CTL_UPE | WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_ROPE | @@ -1499,7 
+1725,10 @@ void wx_set_rx_mode(struct net_device *netdev) fctrl |= WX_PSR_CTL_UPE | WX_PSR_CTL_MPE; /* pf don't want packets routing to vf, so clear UPE */ vmolr |= WX_PSR_VM_L2CTL_MPE; - vlnctrl &= ~WX_PSR_VLAN_CTL_VFE; + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags) && + test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) + vlnctrl |= WX_PSR_VLAN_CTL_VFE; + features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; } if (netdev->flags & IFF_ALLMULTI) { @@ -1522,7 +1751,7 @@ void wx_set_rx_mode(struct net_device *netdev) * sufficient space to store all the addresses then enable * unicast promiscuous mode */ - count = wx_write_uc_addr_list(netdev, 0); + count = wx_write_uc_addr_list(netdev, VMDQ_P(0)); if (count < 0) { vmolr &= ~WX_PSR_VM_L2CTL_ROPE; vmolr |= WX_PSR_VM_L2CTL_UPE; @@ -1540,7 +1769,7 @@ void wx_set_rx_mode(struct net_device *netdev) wr32(wx, WX_PSR_VLAN_CTL, vlnctrl); wr32(wx, WX_PSR_CTL, fctrl); - wr32(wx, WX_PSR_VM_L2CTL(0), vmolr); + wr32(wx, WX_PSR_VM_L2CTL(VMDQ_P(0)), vmolr); if ((features & NETIF_F_HW_VLAN_CTAG_RX) && (features & NETIF_F_HW_VLAN_STAG_RX)) @@ -1548,6 +1777,10 @@ void wx_set_rx_mode(struct net_device *netdev) else wx_vlan_strip_control(wx, false); + if (features & NETIF_F_HW_VLAN_CTAG_FILTER) + wx_vlan_promisc_disable(wx); + else + wx_vlan_promisc_enable(wx); } EXPORT_SYMBOL(wx_set_rx_mode); @@ -1797,6 +2030,13 @@ static void wx_setup_reta(struct wx *wx) u32 random_key_size = WX_RSS_KEY_SIZE / 4; u32 i, j; + if (test_bit(WX_FLAG_SRIOV_ENABLED, wx->flags)) { + if (wx->mac.type == wx_mac_em) + rss_i = 1; + else + rss_i = rss_i < 4 ? 4 : rss_i; + } + /* Fill out hash function seeds */ for (i = 0; i < random_key_size; i++) wr32(wx, WX_RDB_RSSRK(i), wx->rss_key[i]); @@ -1814,10 +2054,42 @@ static void wx_setup_reta(struct wx *wx) wx_store_reta(wx); } +#define WX_RDB_RSS_PL_2 FIELD_PREP(GENMASK(31, 29), 1) +#define WX_RDB_RSS_PL_4 FIELD_PREP(GENMASK(31, 29), 2) +static void wx_setup_psrtype(struct wx *wx) +{ + int rss_i = wx->ring_feature[RING_F_RSS].indices; + u32 psrtype; + int pool; + + psrtype = WX_RDB_PL_CFG_L4HDR | + WX_RDB_PL_CFG_L3HDR | + WX_RDB_PL_CFG_L2HDR | + WX_RDB_PL_CFG_TUN_OUTL2HDR | + WX_RDB_PL_CFG_TUN_TUNHDR; + + if (wx->mac.type == wx_mac_em) { + for_each_set_bit(pool, &wx->fwd_bitmask, 8) + wr32(wx, WX_RDB_PL_CFG(VMDQ_P(pool)), psrtype); + } else { + if (rss_i > 3) + psrtype |= WX_RDB_RSS_PL_4; + else if (rss_i > 1) + psrtype |= WX_RDB_RSS_PL_2; + + for_each_set_bit(pool, &wx->fwd_bitmask, 32) + wr32(wx, WX_RDB_PL_CFG(VMDQ_P(pool)), psrtype); + } +} + static void wx_setup_mrqc(struct wx *wx) { u32 rss_field = 0; + /* VT, and RSS do not coexist at the same time */ + if (test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return; + /* Disable indicating checksum in descriptor, enables RSS hash */ wr32m(wx, WX_PSR_CTL, WX_PSR_CTL_PCSD, WX_PSR_CTL_PCSD); @@ -1847,16 +2119,11 @@ static void wx_setup_mrqc(struct wx *wx) **/ void wx_configure_rx(struct wx *wx) { - u32 psrtype, i; int ret; + u32 i; wx_disable_rx(wx); - - psrtype = WX_RDB_PL_CFG_L4HDR | - WX_RDB_PL_CFG_L3HDR | - WX_RDB_PL_CFG_L2HDR | - WX_RDB_PL_CFG_TUN_TUNHDR; - wr32(wx, WX_RDB_PL_CFG(0), psrtype); + wx_setup_psrtype(wx); /* enable hw crc stripping */ wr32m(wx, WX_RSC_CTL, WX_RSC_CTL_CRC_STRIP, WX_RSC_CTL_CRC_STRIP); @@ -1904,6 +2171,7 @@ void wx_configure(struct wx *wx) { wx_set_rxpba(wx); wx_pbthresh_setup(wx); + wx_configure_virtualization(wx); wx_configure_port(wx); wx_set_rx_mode(wx->netdev); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 
00b0b318df27e..59d904f237640 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -1618,6 +1618,65 @@ void wx_napi_disable_all(struct wx *wx) } EXPORT_SYMBOL(wx_napi_disable_all); +static bool wx_set_vmdq_queues(struct wx *wx) +{ + u16 vmdq_i = wx->ring_feature[RING_F_VMDQ].limit; + u16 rss_i = wx->ring_feature[RING_F_RSS].limit; + u16 rss_m = WX_RSS_DISABLED_MASK; + u16 vmdq_m = 0; + + /* only proceed if VMDq is enabled */ + if (!test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return false; + /* Add starting offset to total pool count */ + vmdq_i += wx->ring_feature[RING_F_VMDQ].offset; + + if (wx->mac.type == wx_mac_sp) { + /* double check we are limited to maximum pools */ + vmdq_i = min_t(u16, 64, vmdq_i); + + /* 64 pool mode with 2 queues per pool, or + * 16/32/64 pool mode with 1 queue per pool + */ + if (vmdq_i > 32 || rss_i < 4) { + vmdq_m = WX_VMDQ_2Q_MASK; + rss_m = WX_RSS_2Q_MASK; + rss_i = min_t(u16, rss_i, 2); + /* 32 pool mode with 4 queues per pool */ + } else { + vmdq_m = WX_VMDQ_4Q_MASK; + rss_m = WX_RSS_4Q_MASK; + rss_i = 4; + } + } else { + /* double check we are limited to maximum pools */ + vmdq_i = min_t(u16, 8, vmdq_i); + + /* when VMDQ on, disable RSS */ + rss_i = 1; + } + + /* remove the starting offset from the pool count */ + vmdq_i -= wx->ring_feature[RING_F_VMDQ].offset; + + /* save features for later use */ + wx->ring_feature[RING_F_VMDQ].indices = vmdq_i; + wx->ring_feature[RING_F_VMDQ].mask = vmdq_m; + + /* limit RSS based on user input and save for later use */ + wx->ring_feature[RING_F_RSS].indices = rss_i; + wx->ring_feature[RING_F_RSS].mask = rss_m; + + wx->queues_per_pool = rss_i;/*maybe same to num_rx_queues_per_pool*/ + wx->num_rx_pools = vmdq_i; + wx->num_rx_queues_per_pool = rss_i; + + wx->num_rx_queues = vmdq_i * rss_i; + wx->num_tx_queues = vmdq_i * rss_i; + + return true; +} + /** * wx_set_rss_queues: Allocate queues for RSS * @wx: board private structure to initialize @@ -1632,6 +1691,10 @@ static void wx_set_rss_queues(struct wx *wx) /* set mask for 16 queue limit of RSS */ f = &wx->ring_feature[RING_F_RSS]; + if (wx->mac.type == wx_mac_sp) + f->mask = WX_RSS_64Q_MASK; + else + f->mask = WX_RSS_8Q_MASK; f->indices = f->limit; if (!(test_bit(WX_FLAG_FDIR_CAPABLE, wx->flags))) @@ -1664,6 +1727,9 @@ static void wx_set_num_queues(struct wx *wx) wx->num_tx_queues = 1; wx->queues_per_pool = 1; + if (wx_set_vmdq_queues(wx)) + return; + wx_set_rss_queues(wx); } @@ -1744,6 +1810,10 @@ static int wx_set_interrupt_capability(struct wx *wx) if (ret == 0 || (ret == -ENOMEM)) return ret; + /* Disable VMDq support */ + dev_warn(&wx->pdev->dev, "Disabling VMQQ support\n"); + clear_bit(WX_FLAG_VMDQ_ENABLED, wx->flags); + /* Disable RSS */ dev_warn(&wx->pdev->dev, "Disabling RSS support\n"); wx->ring_feature[RING_F_RSS].limit = 1; @@ -1770,6 +1840,49 @@ static int wx_set_interrupt_capability(struct wx *wx) return 0; } +static bool wx_cache_ring_vmdq(struct wx *wx) +{ + struct wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + struct wx_ring_feature *rss = &wx->ring_feature[RING_F_RSS]; + u16 reg_idx; + int i; + + /* only proceed if VMDq is enabled */ + if (!test_bit(WX_FLAG_VMDQ_ENABLED, wx->flags)) + return false; + + if (wx->mac.type == wx_mac_sp) { + /* start at VMDq register offset for SR-IOV enabled setups */ + reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask); + for (i = 0; i < wx->num_rx_queues; i++, reg_idx++) { + /* If we are greater than indices move to next pool */ + if ((reg_idx & 
~vmdq->mask) >= rss->indices) + reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask); + wx->rx_ring[i]->reg_idx = reg_idx; + } + reg_idx = vmdq->offset * __ALIGN_MASK(1, ~vmdq->mask); + for (i = 0; i < wx->num_tx_queues; i++, reg_idx++) { + /* If we are greater than indices move to next pool */ + if ((reg_idx & rss->mask) >= rss->indices) + reg_idx = __ALIGN_MASK(reg_idx, ~vmdq->mask); + wx->tx_ring[i]->reg_idx = reg_idx; + } + } else { + /* start at VMDq register offset for SR-IOV enabled setups */ + reg_idx = vmdq->offset; + for (i = 0; i < wx->num_rx_queues; i++) + /* If we are greater than indices move to next pool */ + wx->rx_ring[i]->reg_idx = reg_idx + i; + + reg_idx = vmdq->offset; + for (i = 0; i < wx->num_tx_queues; i++) + /* If we are greater than indices move to next pool */ + wx->tx_ring[i]->reg_idx = reg_idx + i; + } + + return true; +} + /** * wx_cache_ring_rss - Descriptor ring to register mapping for RSS * @wx: board private structure to initialize @@ -1781,6 +1894,9 @@ static void wx_cache_ring_rss(struct wx *wx) { u16 i; + if (wx_cache_ring_vmdq(wx)) + return; + for (i = 0; i < wx->num_rx_queues; i++) wx->rx_ring[i]->reg_idx = i; @@ -2179,7 +2295,8 @@ static void wx_set_ivar(struct wx *wx, s8 direction, wr32(wx, WX_PX_MISC_IVAR, ivar); } else { /* tx or rx causes */ - msix_vector += 1; /* offset for queue vectors */ + if (!(wx->mac.type == wx_mac_em && wx->num_vfs == 7)) + msix_vector += 1; /* offset for queue vectors */ msix_vector |= WX_PX_IVAR_ALLOC_VAL; index = ((16 * (queue & 1)) + (8 * direction)); ivar = rd32(wx, WX_PX_IVAR(queue >> 1)); @@ -2231,10 +2348,17 @@ void wx_configure_vectors(struct wx *wx) { struct pci_dev *pdev = wx->pdev; u32 eitrsel = 0; - u16 v_idx; + u16 v_idx, i; if (pdev->msix_enabled) { /* Populate MSIX to EITR Select */ + if (wx->mac.type == wx_mac_sp) { + if (wx->num_vfs >= 32) + eitrsel = BIT(wx->num_vfs % 32) - 1; + } else if (wx->mac.type == wx_mac_em) { + for (i = 0; i < wx->num_vfs; i++) + eitrsel |= BIT(i); + } wr32(wx, WX_PX_ITRSEL, eitrsel); /* use EIAM to auto-mask when MSI-X interrupt is asserted * this saves a register write for every interrupt diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 26776dee64903..be9463f8b4087 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -21,6 +21,7 @@ #define WX_PCIE_MSIX_TBL_SZ_MASK 0x7FF #define WX_PCI_LINK_STATUS 0xB2 #define WX_MAX_PF_MACVLANS 15 +#define WX_MAX_VF_MC_ENTRIES 30 /**************** Global Registers ****************************/ #define WX_VF_REG_OFFSET(_v) FIELD_GET(GENMASK(15, 5), (_v)) @@ -80,6 +81,7 @@ #define WX_MAC_LXONOFFRXC 0x11E0C /*********************** Receive DMA registers **************************/ +#define WX_RDM_VF_RE(_i) (0x12004 + ((_i) * 4)) #define WX_RDM_DRP_PKT 0x12500 #define WX_RDM_PKT_CNT 0x12504 #define WX_RDM_BYTE_CNT_LSB 0x12508 @@ -94,6 +96,7 @@ #define WX_CFG_TAG_TPID(_i) (0x14430 + ((_i) * 4)) #define WX_CFG_PORT_CTL_NUM_VT_MASK GENMASK(13, 12) /* number of TVs */ +#define WX_CFG_PORT_CTL_NUM_VT_NONE 0 #define WX_CFG_PORT_CTL_NUM_VT_8 FIELD_PREP(GENMASK(13, 12), 1) #define WX_CFG_PORT_CTL_NUM_VT_32 FIELD_PREP(GENMASK(13, 12), 2) #define WX_CFG_PORT_CTL_NUM_VT_64 FIELD_PREP(GENMASK(13, 12), 3) @@ -119,6 +122,10 @@ /*********************** Transmit DMA registers **************************/ /* transmit global control */ #define WX_TDM_CTL 0x18000 +#define WX_TDM_VF_TE(_i) (0x18004 + ((_i) * 4)) +#define WX_TDM_MAC_AS(_i) (0x18060 + ((_i) 
* 4)) +#define WX_TDM_VLAN_AS(_i) (0x18070 + ((_i) * 4)) + /* TDM CTL BIT */ #define WX_TDM_CTL_TE BIT(0) /* Transmit Enable */ #define WX_TDM_PB_THRE(_i) (0x18020 + ((_i) * 4)) @@ -209,7 +216,10 @@ #define WX_PSR_1588_CTL_VALID BIT(0) /* mcasst/ucast overflow tbl */ #define WX_PSR_MC_TBL(_i) (0x15200 + ((_i) * 4)) +#define WX_PSR_MC_TBL_REG(_i) FIELD_GET(GENMASK(11, 5), (_i)) +#define WX_PSR_MC_TBL_BIT(_i) FIELD_GET(GENMASK(4, 0), (_i)) #define WX_PSR_UC_TBL(_i) (0x15400 + ((_i) * 4)) +#define WX_PSR_VM_CTL_REPLEN BIT(30) /* replication enabled */ #define WX_PSR_VM_CTL_POOL_MASK GENMASK(12, 7) /* VM L2 contorl */ @@ -254,6 +264,7 @@ #define WX_PSR_VLAN_SWC 0x16220 #define WX_PSR_VLAN_SWC_VM_L 0x16224 #define WX_PSR_VLAN_SWC_VM_H 0x16228 +#define WX_PSR_VLAN_SWC_VM(_i) (0x16224 + ((_i) * 4)) #define WX_PSR_VLAN_SWC_IDX 0x16230 /* 64 vlan entries */ /* VLAN pool filtering masks */ #define WX_PSR_VLAN_SWC_VIEN BIT(31) /* filter is valid */ @@ -268,6 +279,10 @@ #define WX_RSC_ST 0x17004 #define WX_RSC_ST_RSEC_RDY BIT(0) +/*********************** Transmit DMA registers **************************/ +/* transmit global control */ +#define WX_TDM_ETYPE_AS(_i) (0x18058 + ((_i) * 4)) + /****************************** TDB ******************************************/ #define WX_TDB_PB_SZ(_i) (0x1CC00 + ((_i) * 4)) #define WX_TXPKT_SIZE_MAX 0xA /* Max Tx Packet size */ @@ -426,6 +441,15 @@ enum WX_MSCA_CMD_value { /* Number of 80 microseconds we wait for PCI Express master disable */ #define WX_PCI_MASTER_DISABLE_TIMEOUT 80000 +#define WX_RSS_64Q_MASK 0x3F +#define WX_RSS_8Q_MASK 0x7 +#define WX_RSS_4Q_MASK 0x3 +#define WX_RSS_2Q_MASK 0x1 +#define WX_RSS_DISABLED_MASK 0x0 + +#define WX_VMDQ_4Q_MASK 0x7C +#define WX_VMDQ_2Q_MASK 0x7E + /****************** Manageablility Host Interface defines ********************/ #define WX_HI_MAX_BLOCK_BYTE_LENGTH 256 /* Num of bytes in range */ #define WX_HI_COMMAND_TIMEOUT 1000 /* Process HI command limit */ @@ -493,7 +517,7 @@ enum WX_MSCA_CMD_value { #define WX_REQ_TX_DESCRIPTOR_MULTIPLE 128 #define WX_MAX_JUMBO_FRAME_SIZE 9432 /* max payload 9414 */ -#define VMDQ_P(p) p +#define VMDQ_P(p) ((p) + wx->ring_feature[RING_F_VMDQ].offset) /* Supported Rx Buffer Sizes */ #define WX_RXBUFFER_256 256 /* Used for skb receive header */ @@ -1128,6 +1152,10 @@ struct vf_data_storage { bool link_enable; bool trusted; int xcast_mode; + + u16 vf_mc_hashes[WX_MAX_VF_MC_ENTRIES]; + u16 num_vf_mc_hashes; + u16 vlan_count; }; struct vf_macvlans { @@ -1141,6 +1169,7 @@ struct vf_macvlans { enum wx_pf_flags { WX_FLAG_SWFW_RING, WX_FLAG_VMDQ_ENABLED, + WX_FLAG_VLAN_PROMISC, WX_FLAG_SRIOV_ENABLED, WX_FLAG_FDIR_CAPABLE, WX_FLAG_FDIR_HASH, @@ -1217,6 +1246,8 @@ struct wx { struct wx_ring *tx_ring[64] ____cacheline_aligned_in_smp; struct wx_ring *rx_ring[64]; struct wx_q_vector *q_vector[64]; + int num_rx_pools; + int num_rx_queues_per_pool; unsigned int queues_per_pool; struct msix_entry *msix_q_entries; @@ -1250,6 +1281,7 @@ struct wx { struct vf_data_storage *vfinfo; struct vf_macvlans vf_mvs; struct vf_macvlans *mv_list; + unsigned long fwd_bitmask; u32 atr_sample_rate; void (*atr)(struct wx_ring *ring, struct wx_tx_buffer *first, u8 ptype); From 359e41f63155408e16588c810145def7801ea3ac Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:54 +0800 Subject: [PATCH 041/770] net: libwx: Add msg task func Implement wx_msg_task which is used to process mailbox messages sent by vf. 
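
A minimal, self-contained user-space model of the dispatch pattern may
help review. This is illustrative only, not driver code: handle_msg()
and the VT_/VF_ constants are simplified stand-ins for the driver's
WX_* mailbox definitions and helpers.

  /* Illustrative model of the PF-side message dispatch, not driver code. */
  #include <stdint.h>
  #include <stdio.h>

  #define VT_MSGTYPE_ACK  (1u << 31) /* stand-in for WX_VT_MSGTYPE_ACK */
  #define VT_MSGTYPE_NACK (1u << 30) /* stand-in for WX_VT_MSGTYPE_NACK */
  #define VT_MSGTYPE_CTS  (1u << 29) /* stand-in for WX_VT_MSGTYPE_CTS */
  #define VF_SET_MAC_ADDR 0x02       /* stand-in for WX_VF_SET_MAC_ADDR */

  /* hypothetical handler: 0 on success, negative on a bad request */
  static int handle_msg(uint32_t *msg)
  {
          switch (msg[0] & 0xffff) { /* opcode lives in the low 16 bits */
          case VF_SET_MAC_ADDR:
                  return 0;          /* the real PF programs a MAC filter */
          default:
                  return -1;
          }
  }

  static void dispatch(uint32_t *msg, int vf)
  {
          /* messages we already answered carry ACK/NACK; skip them */
          if (msg[0] & (VT_MSGTYPE_ACK | VT_MSGTYPE_NACK))
                  return;

          msg[0] |= handle_msg(msg) ? VT_MSGTYPE_NACK : VT_MSGTYPE_ACK;
          msg[0] |= VT_MSGTYPE_CTS; /* VF may continue configuring */
          printf("reply to vf %d: 0x%08x\n", vf, (unsigned int)msg[0]);
  }

  int main(void)
  {
          uint32_t msg[4] = { VF_SET_MAC_ADDR, 0, 0, 0 };

          dispatch(msg, 0);
          return 0;
  }

Each pending message is answered exactly once: the PF ORs ACK or NACK
into the first word, sets CTS so the VF knows it may keep configuring,
and writes the buffer back through the mailbox.
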
Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/8257B39B95CDB469+20250408091556.9640-5-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 8 +- drivers/net/ethernet/wangxun/libwx/wx_hw.h | 4 + drivers/net/ethernet/wangxun/libwx/wx_mbx.h | 41 ++ drivers/net/ethernet/wangxun/libwx/wx_sriov.c | 636 ++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_sriov.h | 1 + drivers/net/ethernet/wangxun/libwx/wx_type.h | 17 + 6 files changed, 703 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 3d2ebbaa729d7..3c3aa5f4ebbff 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1011,7 +1011,7 @@ void wx_flush_sw_mac_table(struct wx *wx) } EXPORT_SYMBOL(wx_flush_sw_mac_table); -static int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) +int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) { u32 i; @@ -1042,7 +1042,7 @@ static int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool) return -ENOMEM; } -static int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool) +int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool) { u32 i; @@ -1469,7 +1469,7 @@ static void wx_set_ethertype_anti_spoofing(struct wx *wx, bool enable, int vf) wr32(wx, WX_TDM_ETYPE_AS(reg_offset), pfvfspoof); } -static int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) +int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting) { u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); struct wx *wx = netdev_priv(netdev); @@ -2530,7 +2530,7 @@ static int wx_set_vlvf(struct wx *wx, u32 vlan, u32 vind, bool vlan_on, * * Turn on/off specified VLAN in the VLAN filter table. 
**/ -static int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on) +int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on) { u32 bitindex, vfta, targetbit; bool vfta_changed = false; diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.h b/drivers/net/ethernet/wangxun/libwx/wx_hw.h index b883342bb5767..91c1d6135045b 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.h @@ -26,9 +26,12 @@ void wx_init_eeprom_params(struct wx *wx); void wx_get_mac_addr(struct wx *wx, u8 *mac_addr); void wx_init_rx_addrs(struct wx *wx); void wx_mac_set_default_filter(struct wx *wx, u8 *addr); +int wx_add_mac_filter(struct wx *wx, u8 *addr, u16 pool); +int wx_del_mac_filter(struct wx *wx, u8 *addr, u16 pool); void wx_flush_sw_mac_table(struct wx *wx); int wx_set_mac(struct net_device *netdev, void *p); void wx_disable_rx(struct wx *wx); +int wx_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting); int wx_disable_sec_rx_path(struct wx *wx); void wx_enable_sec_rx_path(struct wx *wx); void wx_set_rx_mode(struct net_device *netdev); @@ -42,6 +45,7 @@ int wx_stop_adapter(struct wx *wx); void wx_reset_misc(struct wx *wx); int wx_get_pcie_msix_counts(struct wx *wx, u16 *msix_count, u16 max_msix_count); int wx_sw_init(struct wx *wx); +int wx_set_vfta(struct wx *wx, u32 vlan, u32 vind, bool vlan_on); int wx_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid); int wx_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid); int wx_fc_enable(struct wx *wx, bool tx_pause, bool rx_pause); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h index 172b46b9c14dc..05aae138dbc31 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_mbx.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_mbx.h @@ -21,10 +21,51 @@ #define WX_MBVFICR_VFREQ_MASK GENMASK(15, 0) #define WX_MBVFICR_VFACK_MASK GENMASK(31, 16) +#define WX_VT_MSGTYPE_ACK BIT(31) +#define WX_VT_MSGTYPE_NACK BIT(30) +#define WX_VT_MSGTYPE_CTS BIT(29) +#define WX_VT_MSGINFO_SHIFT 16 #define WX_VT_MSGINFO_MASK GENMASK(23, 16) +enum wx_pfvf_api_rev { + wx_mbox_api_null, + wx_mbox_api_13 = 4, /* API version 1.3 */ + wx_mbox_api_unknown, /* indicates that API version is not known */ +}; + +/* mailbox API */ +#define WX_VF_RESET 0x01 /* VF requests reset */ +#define WX_VF_SET_MAC_ADDR 0x02 /* VF requests PF to set MAC addr */ +#define WX_VF_SET_MULTICAST 0x03 /* VF requests PF to set MC addr */ +#define WX_VF_SET_VLAN 0x04 /* VF requests PF to set VLAN */ +#define WX_VF_SET_LPE 0x05 /* VF requests PF to set VMOLR.LPE */ +#define WX_VF_SET_MACVLAN 0x06 /* VF requests PF unicast filter */ +#define WX_VF_API_NEGOTIATE 0x08 /* negotiate API version */ +#define WX_VF_GET_QUEUES 0x09 /* get queue configuration */ +#define WX_VF_GET_RETA 0x0a /* VF request for RETA */ +#define WX_VF_GET_RSS_KEY 0x0b /* get RSS key */ +#define WX_VF_UPDATE_XCAST_MODE 0x0c +#define WX_VF_GET_LINK_STATE 0x10 /* get vf link state */ +#define WX_VF_GET_FW_VERSION 0x11 /* get fw version */ + +#define WX_VF_BACKUP 0x8001 /* VF requests backup */ + +#define WX_PF_CONTROL_MSG BIT(8) /* PF control message */ +#define WX_PF_NOFITY_VF_LINK_STATUS 0x1 +#define WX_PF_NOFITY_VF_NET_NOT_RUNNING BIT(31) + +#define WX_VF_TX_QUEUES 1 /* number of Tx queues supported */ +#define WX_VF_RX_QUEUES 2 /* number of Rx queues supported */ +#define WX_VF_TRANS_VLAN 3 /* Indication of port vlan */ +#define WX_VF_DEF_QUEUE 4 /* Default queue offset */ + +#define WX_VF_PERMADDR_MSG_LEN 
4 + enum wxvf_xcast_modes { WXVF_XCAST_MODE_NONE = 0, + WXVF_XCAST_MODE_MULTI, + WXVF_XCAST_MODE_ALLMULTI, + WXVF_XCAST_MODE_PROMISC, }; int wx_write_mbx_pf(struct wx *wx, u32 *msg, u16 size, u16 vf); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c index ce7469e1cf3e1..1a7cadbf7234d 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c @@ -5,6 +5,7 @@ #include #include "wx_type.h" +#include "wx_hw.h" #include "wx_mbx.h" #include "wx_sriov.h" @@ -198,3 +199,638 @@ int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs) return num_vfs; } EXPORT_SYMBOL(wx_pci_sriov_configure); + +static int wx_set_vf_mac(struct wx *wx, u16 vf, const u8 *mac_addr) +{ + u8 hw_addr[ETH_ALEN]; + int ret = 0; + + ether_addr_copy(hw_addr, mac_addr); + wx_del_mac_filter(wx, wx->vfinfo[vf].vf_mac_addr, vf); + ret = wx_add_mac_filter(wx, hw_addr, vf); + if (ret >= 0) + ether_addr_copy(wx->vfinfo[vf].vf_mac_addr, mac_addr); + else + eth_zero_addr(wx->vfinfo[vf].vf_mac_addr); + + return ret; +} + +static void wx_set_vmolr(struct wx *wx, u16 vf, bool aupe) +{ + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + + vmolr |= WX_PSR_VM_L2CTL_BAM; + if (aupe) + vmolr |= WX_PSR_VM_L2CTL_AUPE; + else + vmolr &= ~WX_PSR_VM_L2CTL_AUPE; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); +} + +static void wx_set_vmvir(struct wx *wx, u16 vid, u16 qos, u16 vf) +{ + u32 vmvir = vid | (qos << VLAN_PRIO_SHIFT) | + WX_TDM_VLAN_INS_VLANA_DEFAULT; + + wr32(wx, WX_TDM_VLAN_INS(vf), vmvir); +} + +static int wx_set_vf_vlan(struct wx *wx, int add, int vid, u16 vf) +{ + if (!vid && !add) + return 0; + + return wx_set_vfta(wx, vid, vf, (bool)add); +} + +static void wx_set_vlan_anti_spoofing(struct wx *wx, bool enable, int vf) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + u32 pfvfspoof; + + pfvfspoof = rd32(wx, WX_TDM_VLAN_AS(index)); + if (enable) + pfvfspoof |= BIT(vf_bit); + else + pfvfspoof &= ~BIT(vf_bit); + wr32(wx, WX_TDM_VLAN_AS(index), pfvfspoof); +} + +static void wx_write_qde(struct wx *wx, u32 vf, u32 qde) +{ + struct wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + u32 q_per_pool = __ALIGN_MASK(1, ~vmdq->mask); + u32 reg = 0, n = vf * q_per_pool / 32; + u32 i = vf * q_per_pool; + + reg = rd32(wx, WX_RDM_PF_QDE(n)); + for (i = (vf * q_per_pool - n * 32); + i < ((vf + 1) * q_per_pool - n * 32); + i++) { + if (qde == 1) + reg |= qde << i; + else + reg &= qde << i; + } + + wr32(wx, WX_RDM_PF_QDE(n), reg); +} + +static void wx_clear_vmvir(struct wx *wx, u32 vf) +{ + wr32(wx, WX_TDM_VLAN_INS(vf), 0); +} + +static void wx_set_vf_rx_tx(struct wx *wx, int vf) +{ + u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); + u32 reg_cur_tx, reg_cur_rx, reg_req_tx, reg_req_rx; + + reg_cur_tx = rd32(wx, WX_TDM_VF_TE(index)); + reg_cur_rx = rd32(wx, WX_RDM_VF_RE(index)); + + if (wx->vfinfo[vf].link_enable) { + reg_req_tx = reg_cur_tx | BIT(vf_bit); + reg_req_rx = reg_cur_rx | BIT(vf_bit); + /* Enable particular VF */ + if (reg_cur_tx != reg_req_tx) + wr32(wx, WX_TDM_VF_TE(index), reg_req_tx); + if (reg_cur_rx != reg_req_rx) + wr32(wx, WX_RDM_VF_RE(index), reg_req_rx); + } else { + reg_req_tx = BIT(vf_bit); + reg_req_rx = BIT(vf_bit); + /* Disable particular VF */ + if (reg_cur_tx & reg_req_tx) + wr32(wx, WX_TDM_VFTE_CLR(index), reg_req_tx); + if (reg_cur_rx & reg_req_rx) + wr32(wx, WX_RDM_VFRE_CLR(index), reg_req_rx); + } +} + +static int wx_get_vf_queues(struct wx *wx, u32 *msgbuf, u32 vf) +{ + struct 
wx_ring_feature *vmdq = &wx->ring_feature[RING_F_VMDQ]; + unsigned int default_tc = 0; + + msgbuf[WX_VF_TX_QUEUES] = __ALIGN_MASK(1, ~vmdq->mask); + msgbuf[WX_VF_RX_QUEUES] = __ALIGN_MASK(1, ~vmdq->mask); + + if (wx->vfinfo[vf].pf_vlan || wx->vfinfo[vf].pf_qos) + msgbuf[WX_VF_TRANS_VLAN] = 1; + else + msgbuf[WX_VF_TRANS_VLAN] = 0; + + /* notify VF of default queue */ + msgbuf[WX_VF_DEF_QUEUE] = default_tc; + + return 0; +} + +static void wx_vf_reset_event(struct wx *wx, u16 vf) +{ + struct vf_data_storage *vfinfo = &wx->vfinfo[vf]; + u8 num_tcs = netdev_get_num_tc(wx->netdev); + + /* add PF assigned VLAN */ + wx_set_vf_vlan(wx, true, vfinfo->pf_vlan, vf); + + /* reset offloads to defaults */ + wx_set_vmolr(wx, vf, !vfinfo->pf_vlan); + + /* set outgoing tags for VFs */ + if (!vfinfo->pf_vlan && !vfinfo->pf_qos && !num_tcs) { + wx_clear_vmvir(wx, vf); + } else { + if (vfinfo->pf_qos || !num_tcs) + wx_set_vmvir(wx, vfinfo->pf_vlan, + vfinfo->pf_qos, vf); + else + wx_set_vmvir(wx, vfinfo->pf_vlan, + wx->default_up, vf); + } + + /* reset multicast table array for vf */ + wx->vfinfo[vf].num_vf_mc_hashes = 0; + + /* Flush and reset the mta with the new values */ + wx_set_rx_mode(wx->netdev); + + wx_del_mac_filter(wx, wx->vfinfo[vf].vf_mac_addr, vf); + /* reset VF api back to unknown */ + wx->vfinfo[vf].vf_api = wx_mbox_api_null; +} + +static void wx_vf_reset_msg(struct wx *wx, u16 vf) +{ + const u8 *vf_mac = wx->vfinfo[vf].vf_mac_addr; + struct net_device *dev = wx->netdev; + u32 msgbuf[5] = {0, 0, 0, 0, 0}; + u8 *addr = (u8 *)(&msgbuf[1]); + u32 reg = 0, index, vf_bit; + int pf_max_frame; + + /* reset the filters for the device */ + wx_vf_reset_event(wx, vf); + + /* set vf mac address */ + if (!is_zero_ether_addr(vf_mac)) + wx_set_vf_mac(wx, vf, vf_mac); + + index = WX_VF_REG_OFFSET(vf); + vf_bit = WX_VF_IND_SHIFT(vf); + + /* force drop enable for all VF Rx queues */ + wx_write_qde(wx, vf, 1); + + /* set transmit and receive for vf */ + wx_set_vf_rx_tx(wx, vf); + + pf_max_frame = dev->mtu + ETH_HLEN; + + if (pf_max_frame > ETH_FRAME_LEN) + reg = BIT(vf_bit); + wr32(wx, WX_RDM_VFRE_CLR(index), reg); + + /* enable VF mailbox for further messages */ + wx->vfinfo[vf].clear_to_send = true; + + /* reply to reset with ack and vf mac address */ + msgbuf[0] = WX_VF_RESET; + if (!is_zero_ether_addr(vf_mac)) { + msgbuf[0] |= WX_VT_MSGTYPE_ACK; + memcpy(addr, vf_mac, ETH_ALEN); + } else { + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + wx_err(wx, "VF %d has no MAC address assigned", vf); + } + + msgbuf[3] = wx->mac.mc_filter_type; + wx_write_mbx_pf(wx, msgbuf, WX_VF_PERMADDR_MSG_LEN, vf); +} + +static int wx_set_vf_mac_addr(struct wx *wx, u32 *msgbuf, u16 vf) +{ + const u8 *new_mac = ((u8 *)(&msgbuf[1])); + int ret; + + if (!is_valid_ether_addr(new_mac)) { + wx_err(wx, "VF %d attempted to set invalid mac\n", vf); + return -EINVAL; + } + + if (wx->vfinfo[vf].pf_set_mac && + memcmp(wx->vfinfo[vf].vf_mac_addr, new_mac, ETH_ALEN)) { + wx_err(wx, + "VF %d attempt to set a MAC but it already had a MAC.", + vf); + return -EBUSY; + } + + ret = wx_set_vf_mac(wx, vf, new_mac); + if (ret < 0) + return ret; + + return 0; +} + +static void wx_set_vf_multicasts(struct wx *wx, u32 *msgbuf, u32 vf) +{ + struct vf_data_storage *vfinfo = &wx->vfinfo[vf]; + u16 entries = (msgbuf[0] & WX_VT_MSGINFO_MASK) + >> WX_VT_MSGINFO_SHIFT; + u32 vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + u32 vector_bit, vector_reg, mta_reg, i; + u16 *hash_list = (u16 *)&msgbuf[1]; + + /* only so many hash values supported */ + entries = min_t(u16, entries, 
WX_MAX_VF_MC_ENTRIES); + vfinfo->num_vf_mc_hashes = entries; + + for (i = 0; i < entries; i++) + vfinfo->vf_mc_hashes[i] = hash_list[i]; + + for (i = 0; i < vfinfo->num_vf_mc_hashes; i++) { + vector_reg = WX_PSR_MC_TBL_REG(vfinfo->vf_mc_hashes[i]); + vector_bit = WX_PSR_MC_TBL_BIT(vfinfo->vf_mc_hashes[i]); + mta_reg = wx->mac.mta_shadow[vector_reg]; + mta_reg |= BIT(vector_bit); + wx->mac.mta_shadow[vector_reg] = mta_reg; + wr32(wx, WX_PSR_MC_TBL(vector_reg), mta_reg); + } + vmolr |= WX_PSR_VM_L2CTL_ROMPE; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); +} + +static void wx_set_vf_lpe(struct wx *wx, u32 max_frame, u32 vf) +{ + u32 index, vf_bit, vfre; + u32 max_frs, reg_val; + + /* determine VF receive enable location */ + index = WX_VF_REG_OFFSET(vf); + vf_bit = WX_VF_IND_SHIFT(vf); + + vfre = rd32(wx, WX_RDM_VF_RE(index)); + vfre |= BIT(vf_bit); + wr32(wx, WX_RDM_VF_RE(index), vfre); + + /* pull current max frame size from hardware */ + max_frs = DIV_ROUND_UP(max_frame, 1024); + reg_val = rd32(wx, WX_MAC_WDG_TIMEOUT) & WX_MAC_WDG_TIMEOUT_WTO_MASK; + if (max_frs > (reg_val + WX_MAC_WDG_TIMEOUT_WTO_DELTA)) + wr32(wx, WX_MAC_WDG_TIMEOUT, + max_frs - WX_MAC_WDG_TIMEOUT_WTO_DELTA); +} + +static int wx_find_vlvf_entry(struct wx *wx, u32 vlan) +{ + int regindex; + u32 vlvf; + + /* short cut the special case */ + if (vlan == 0) + return 0; + + /* Search for the vlan id in the VLVF entries */ + for (regindex = 1; regindex < WX_PSR_VLAN_SWC_ENTRIES; regindex++) { + wr32(wx, WX_PSR_VLAN_SWC_IDX, regindex); + vlvf = rd32(wx, WX_PSR_VLAN_SWC); + if ((vlvf & VLAN_VID_MASK) == vlan) + break; + } + + /* Return a negative value if not found */ + if (regindex >= WX_PSR_VLAN_SWC_ENTRIES) + regindex = -EINVAL; + + return regindex; +} + +static int wx_set_vf_macvlan(struct wx *wx, + u16 vf, int index, unsigned char *mac_addr) +{ + struct vf_macvlans *entry; + struct list_head *pos; + int retval = 0; + + if (index <= 1) { + list_for_each(pos, &wx->vf_mvs.mvlist) { + entry = list_entry(pos, struct vf_macvlans, mvlist); + if (entry->vf == vf) { + entry->vf = -1; + entry->free = true; + entry->is_macvlan = false; + wx_del_mac_filter(wx, entry->vf_macvlan, vf); + } + } + } + + if (!index) + return 0; + + entry = NULL; + list_for_each(pos, &wx->vf_mvs.mvlist) { + entry = list_entry(pos, struct vf_macvlans, mvlist); + if (entry->free) + break; + } + + if (!entry || !entry->free) + return -ENOSPC; + + retval = wx_add_mac_filter(wx, mac_addr, vf); + if (retval >= 0) { + entry->free = false; + entry->is_macvlan = true; + entry->vf = vf; + memcpy(entry->vf_macvlan, mac_addr, ETH_ALEN); + } + + return retval; +} + +static int wx_set_vf_vlan_msg(struct wx *wx, u32 *msgbuf, u16 vf) +{ + int add = (msgbuf[0] & WX_VT_MSGINFO_MASK) >> WX_VT_MSGINFO_SHIFT; + int vid = (msgbuf[1] & WX_PSR_VLAN_SWC_VLANID_MASK); + int ret; + + if (add) + wx->vfinfo[vf].vlan_count++; + else if (wx->vfinfo[vf].vlan_count) + wx->vfinfo[vf].vlan_count--; + + /* in case of promiscuous mode any VLAN filter set for a VF must + * also have the PF pool added to it. + */ + if (add && wx->netdev->flags & IFF_PROMISC) + wx_set_vf_vlan(wx, add, vid, VMDQ_P(0)); + + ret = wx_set_vf_vlan(wx, add, vid, vf); + if (!ret && wx->vfinfo[vf].spoofchk_enabled) + wx_set_vlan_anti_spoofing(wx, true, vf); + + /* Go through all the checks to see if the VLAN filter should + * be wiped completely. 
+ */ + if (!add && wx->netdev->flags & IFF_PROMISC) { + u32 bits = 0, vlvf; + int reg_ndx; + + reg_ndx = wx_find_vlvf_entry(wx, vid); + if (reg_ndx < 0) + return -ENOSPC; + wr32(wx, WX_PSR_VLAN_SWC_IDX, reg_ndx); + vlvf = rd32(wx, WX_PSR_VLAN_SWC); + /* See if any other pools are set for this VLAN filter + * entry other than the PF. + */ + if (VMDQ_P(0) < 32) { + bits = rd32(wx, WX_PSR_VLAN_SWC_VM_L); + bits &= ~BIT(VMDQ_P(0)); + if (wx->mac.type != wx_mac_em) + bits |= rd32(wx, WX_PSR_VLAN_SWC_VM_H); + } else { + if (wx->mac.type != wx_mac_em) + bits = rd32(wx, WX_PSR_VLAN_SWC_VM_H); + bits &= ~BIT(VMDQ_P(0) % 32); + bits |= rd32(wx, WX_PSR_VLAN_SWC_VM_L); + } + /* If the filter was removed then ensure PF pool bit + * is cleared if the PF only added itself to the pool + * because the PF is in promiscuous mode. + */ + if ((vlvf & VLAN_VID_MASK) == vid && !bits) + wx_set_vf_vlan(wx, add, vid, VMDQ_P(0)); + } + + return 0; +} + +static int wx_set_vf_macvlan_msg(struct wx *wx, u32 *msgbuf, u16 vf) +{ + int index = (msgbuf[0] & WX_VT_MSGINFO_MASK) >> + WX_VT_MSGINFO_SHIFT; + u8 *new_mac = ((u8 *)(&msgbuf[1])); + int err; + + if (wx->vfinfo[vf].pf_set_mac && index > 0) { + wx_err(wx, "VF %d request MACVLAN filter but is denied\n", vf); + return -EINVAL; + } + + /* An non-zero index indicates the VF is setting a filter */ + if (index) { + if (!is_valid_ether_addr(new_mac)) { + wx_err(wx, "VF %d attempted to set invalid mac\n", vf); + return -EINVAL; + } + /* If the VF is allowed to set MAC filters then turn off + * anti-spoofing to avoid false positives. + */ + if (wx->vfinfo[vf].spoofchk_enabled) + wx_set_vf_spoofchk(wx->netdev, vf, false); + } + + err = wx_set_vf_macvlan(wx, vf, index, new_mac); + if (err == -ENOSPC) + wx_err(wx, + "VF %d request MACVLAN filter but there is no space\n", + vf); + if (err < 0) + return err; + + return 0; +} + +static int wx_negotiate_vf_api(struct wx *wx, u32 *msgbuf, u32 vf) +{ + int api = msgbuf[1]; + + switch (api) { + case wx_mbox_api_13: + wx->vfinfo[vf].vf_api = api; + return 0; + default: + wx_err(wx, "VF %d requested invalid api version %u\n", vf, api); + return -EINVAL; + } +} + +static int wx_get_vf_link_state(struct wx *wx, u32 *msgbuf, u32 vf) +{ + msgbuf[1] = wx->vfinfo[vf].link_enable; + + return 0; +} + +static int wx_get_fw_version(struct wx *wx, u32 *msgbuf, u32 vf) +{ + unsigned long fw_version = 0ULL; + int ret = 0; + + ret = kstrtoul(wx->eeprom_id, 16, &fw_version); + if (ret) + return -EOPNOTSUPP; + msgbuf[1] = fw_version; + + return 0; +} + +static int wx_update_vf_xcast_mode(struct wx *wx, u32 *msgbuf, u32 vf) +{ + int xcast_mode = msgbuf[1]; + u32 vmolr, disable, enable; + + if (wx->vfinfo[vf].xcast_mode == xcast_mode) + return 0; + + switch (xcast_mode) { + case WXVF_XCAST_MODE_NONE: + disable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + enable = 0; + break; + case WXVF_XCAST_MODE_MULTI: + disable = WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE; + break; + case WXVF_XCAST_MODE_ALLMULTI: + disable = WX_PSR_VM_L2CTL_UPE | WX_PSR_VM_L2CTL_VPE; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE; + break; + case WXVF_XCAST_MODE_PROMISC: + disable = 0; + enable = WX_PSR_VM_L2CTL_BAM | WX_PSR_VM_L2CTL_ROMPE | + WX_PSR_VM_L2CTL_MPE | WX_PSR_VM_L2CTL_UPE | + WX_PSR_VM_L2CTL_VPE; + break; + default: + return -EOPNOTSUPP; + } + + vmolr = rd32(wx, WX_PSR_VM_L2CTL(vf)); + 
vmolr &= ~disable; + vmolr |= enable; + wr32(wx, WX_PSR_VM_L2CTL(vf), vmolr); + + wx->vfinfo[vf].xcast_mode = xcast_mode; + msgbuf[1] = xcast_mode; + + return 0; +} + +static void wx_rcv_msg_from_vf(struct wx *wx, u16 vf) +{ + u16 mbx_size = WX_VXMAILBOX_SIZE; + u32 msgbuf[WX_VXMAILBOX_SIZE]; + int retval; + + retval = wx_read_mbx_pf(wx, msgbuf, mbx_size, vf); + if (retval) { + wx_err(wx, "Error receiving message from VF\n"); + return; + } + + /* this is a message we already processed, do nothing */ + if (msgbuf[0] & (WX_VT_MSGTYPE_ACK | WX_VT_MSGTYPE_NACK)) + return; + + if (msgbuf[0] == WX_VF_RESET) { + wx_vf_reset_msg(wx, vf); + return; + } + + /* until the vf completes a virtual function reset it should not be + * allowed to start any configuration. + */ + if (!wx->vfinfo[vf].clear_to_send) { + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + wx_write_mbx_pf(wx, msgbuf, 1, vf); + return; + } + + switch ((msgbuf[0] & U16_MAX)) { + case WX_VF_SET_MAC_ADDR: + retval = wx_set_vf_mac_addr(wx, msgbuf, vf); + break; + case WX_VF_SET_MULTICAST: + wx_set_vf_multicasts(wx, msgbuf, vf); + retval = 0; + break; + case WX_VF_SET_VLAN: + retval = wx_set_vf_vlan_msg(wx, msgbuf, vf); + break; + case WX_VF_SET_LPE: + wx_set_vf_lpe(wx, msgbuf[1], vf); + retval = 0; + break; + case WX_VF_SET_MACVLAN: + retval = wx_set_vf_macvlan_msg(wx, msgbuf, vf); + break; + case WX_VF_API_NEGOTIATE: + retval = wx_negotiate_vf_api(wx, msgbuf, vf); + break; + case WX_VF_GET_QUEUES: + retval = wx_get_vf_queues(wx, msgbuf, vf); + break; + case WX_VF_GET_LINK_STATE: + retval = wx_get_vf_link_state(wx, msgbuf, vf); + break; + case WX_VF_GET_FW_VERSION: + retval = wx_get_fw_version(wx, msgbuf, vf); + break; + case WX_VF_UPDATE_XCAST_MODE: + retval = wx_update_vf_xcast_mode(wx, msgbuf, vf); + break; + case WX_VF_BACKUP: + break; + default: + wx_err(wx, "Unhandled Msg %8.8x\n", msgbuf[0]); + break; + } + + /* notify the VF of the results of what it sent us */ + if (retval) + msgbuf[0] |= WX_VT_MSGTYPE_NACK; + else + msgbuf[0] |= WX_VT_MSGTYPE_ACK; + + msgbuf[0] |= WX_VT_MSGTYPE_CTS; + + wx_write_mbx_pf(wx, msgbuf, mbx_size, vf); +} + +static void wx_rcv_ack_from_vf(struct wx *wx, u16 vf) +{ + u32 msg = WX_VT_MSGTYPE_NACK; + + /* if device isn't clear to send it shouldn't be reading either */ + if (!wx->vfinfo[vf].clear_to_send) + wx_write_mbx_pf(wx, &msg, 1, vf); +} + +void wx_msg_task(struct wx *wx) +{ + u16 vf; + + for (vf = 0; vf < wx->num_vfs; vf++) { + /* process any reset requests */ + if (!wx_check_for_rst_pf(wx, vf)) + wx_vf_reset_event(wx, vf); + + /* process any messages pending */ + if (!wx_check_for_msg_pf(wx, vf)) + wx_rcv_msg_from_vf(wx, vf); + + /* process any acks */ + if (!wx_check_for_ack_pf(wx, vf)) + wx_rcv_ack_from_vf(wx, vf); + } +} +EXPORT_SYMBOL(wx_msg_task); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h index e5fd96b7c5984..d3f29617c7d31 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h @@ -10,5 +10,6 @@ void wx_disable_sriov(struct wx *wx); int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs); +void wx_msg_task(struct wx *wx); #endif /* _WX_SRIOV_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index be9463f8b4087..a5ca2ca0aba79 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -82,6 +82,8 @@ /*********************** Receive DMA registers **************************/ 
#define WX_RDM_VF_RE(_i) (0x12004 + ((_i) * 4)) +#define WX_RDM_PF_QDE(_i) (0x12080 + ((_i) * 4)) +#define WX_RDM_VFRE_CLR(_i) (0x120A0 + ((_i) * 4)) #define WX_RDM_DRP_PKT 0x12500 #define WX_RDM_PKT_CNT 0x12504 #define WX_RDM_BYTE_CNT_LSB 0x12508 @@ -125,6 +127,7 @@ #define WX_TDM_VF_TE(_i) (0x18004 + ((_i) * 4)) #define WX_TDM_MAC_AS(_i) (0x18060 + ((_i) * 4)) #define WX_TDM_VLAN_AS(_i) (0x18070 + ((_i) * 4)) +#define WX_TDM_VFTE_CLR(_i) (0x180A0 + ((_i) * 4)) /* TDM CTL BIT */ #define WX_TDM_CTL_TE BIT(0) /* Transmit Enable */ @@ -226,6 +229,7 @@ #define WX_PSR_VM_L2CTL(_i) (0x15600 + ((_i) * 4)) #define WX_PSR_VM_L2CTL_UPE BIT(4) /* unicast promiscuous */ #define WX_PSR_VM_L2CTL_VACC BIT(6) /* accept nomatched vlan */ +#define WX_PSR_VM_L2CTL_VPE BIT(7) /* vlan promiscuous mode */ #define WX_PSR_VM_L2CTL_AUPE BIT(8) /* accept untagged packets */ #define WX_PSR_VM_L2CTL_ROMPE BIT(9) /* accept packets in MTA tbl */ #define WX_PSR_VM_L2CTL_ROPE BIT(10) /* accept packets in UC tbl */ @@ -269,6 +273,7 @@ /* VLAN pool filtering masks */ #define WX_PSR_VLAN_SWC_VIEN BIT(31) /* filter is valid */ #define WX_PSR_VLAN_SWC_ENTRIES 64 +#define WX_PSR_VLAN_SWC_VLANID_MASK GENMASK(11, 0) /********************************* RSEC **************************************/ /* general rsec */ @@ -282,6 +287,9 @@ /*********************** Transmit DMA registers **************************/ /* transmit global control */ #define WX_TDM_ETYPE_AS(_i) (0x18058 + ((_i) * 4)) +#define WX_TDM_VLAN_INS(_i) (0x18100 + ((_i) * 4)) +/* Per VF Port VLAN insertion rules */ +#define WX_TDM_VLAN_INS_VLANA_DEFAULT BIT(30) /* Always use default VLAN*/ /****************************** TDB ******************************************/ #define WX_TDB_PB_SZ(_i) (0x1CC00 + ((_i) * 4)) @@ -352,6 +360,9 @@ #define WX_MAC_WDG_TIMEOUT 0x1100C #define WX_MAC_RX_FLOW_CTRL 0x11090 #define WX_MAC_RX_FLOW_CTRL_RFE BIT(0) /* receive fc enable */ + +#define WX_MAC_WDG_TIMEOUT_WTO_MASK GENMASK(3, 0) +#define WX_MAC_WDG_TIMEOUT_WTO_DELTA 2 /* MDIO Registers */ #define WX_MSCA 0x11200 #define WX_MSCA_RA(v) FIELD_PREP(U16_MAX, v) @@ -1152,6 +1163,11 @@ struct vf_data_storage { bool link_enable; bool trusted; int xcast_mode; + unsigned int vf_api; + bool clear_to_send; + u16 pf_vlan; /* When set, guest VLAN config not allowed. */ + u16 pf_qos; + bool pf_set_mac; u16 vf_mc_hashes[WX_MAX_VF_MC_ENTRIES]; u16 num_vf_mc_hashes; @@ -1269,6 +1285,7 @@ struct wx { u32 wol; u16 bd_number; + bool default_up; struct wx_hw_stats stats; u64 tx_busy; From 877253d2cbf224ef5d1f239c8fd9bd2ad37d2cbe Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:55 +0800 Subject: [PATCH 042/770] net: ngbe: add sriov function support Add sriov_configure for driver ops. Add mailbox handler wx_msg_task for ngbe in the interrupt handler. Add the notification flow when the vfs exist. 
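
The notification itself is a two-word mailbox message per VF. Below is
a minimal sketch of the packing done by
wx_ping_all_vfs_with_link_status(); illustrative only, with the
unprefixed constants standing in for the WX_* definitions.

  /* Illustrative sketch of the link-notification packing, not driver code. */
  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PF_CONTROL_MSG        (1u << 8)  /* stand-in for WX_PF_CONTROL_MSG */
  #define PF_NOTIFY_LINK_STATUS 0x1u       /* stand-in for WX_PF_NOFITY_VF_LINK_STATUS */
  #define PF_NET_NOT_RUNNING    (1u << 31) /* stand-in for WX_PF_NOFITY_VF_NET_NOT_RUNNING */
  #define VT_MSGTYPE_CTS        (1u << 29) /* stand-in for WX_VT_MSGTYPE_CTS */

  static void pack_link_msg(uint32_t msg[2], uint32_t speed, bool link_up,
                            bool going_down, bool vf_clear_to_send)
  {
          msg[0] = PF_NOTIFY_LINK_STATUS | PF_CONTROL_MSG;
          if (vf_clear_to_send)
                  msg[0] |= VT_MSGTYPE_CTS;

          msg[1] = 0;
          if (link_up)
                  msg[1] = (speed << 1) | 1; /* speed in bits 31:1, link in bit 0 */
          if (going_down)
                  msg[1] |= PF_NET_NOT_RUNNING; /* PF netdev is going down */
  }

  int main(void)
  {
          uint32_t msg[2];

          pack_link_msg(msg, 1000, true, false, true);
          printf("msg = { 0x%08x, 0x%08x }\n",
                 (unsigned int)msg[0], (unsigned int)msg[1]);
          return 0;
  }

Since FIELD_PREP(GENMASK(31, 1), wx->speed) is equivalent to
wx->speed << 1 here, word 1 carries the link speed in bits 31:1 and
the link state in bit 0.
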
Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/C9A0A43732966022+20250408091556.9640-6-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_sriov.c | 31 +++++++ drivers/net/ethernet/wangxun/libwx/wx_sriov.h | 2 + drivers/net/ethernet/wangxun/libwx/wx_type.h | 2 + drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 93 +++++++++++++++++-- drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c | 5 + drivers/net/ethernet/wangxun/ngbe/ngbe_type.h | 3 + 6 files changed, 127 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c index 1a7cadbf7234d..a31b574a343e8 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c @@ -834,3 +834,34 @@ void wx_msg_task(struct wx *wx) } } EXPORT_SYMBOL(wx_msg_task); + +void wx_disable_vf_rx_tx(struct wx *wx) +{ + wr32(wx, WX_TDM_VFTE_CLR(0), U32_MAX); + wr32(wx, WX_RDM_VFRE_CLR(0), U32_MAX); + if (wx->mac.type != wx_mac_em) { + wr32(wx, WX_TDM_VFTE_CLR(1), U32_MAX); + wr32(wx, WX_RDM_VFRE_CLR(1), U32_MAX); + } +} +EXPORT_SYMBOL(wx_disable_vf_rx_tx); + +void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up) +{ + u32 msgbuf[2] = {0, 0}; + u16 i; + + if (!wx->num_vfs) + return; + msgbuf[0] = WX_PF_NOFITY_VF_LINK_STATUS | WX_PF_CONTROL_MSG; + if (link_up) + msgbuf[1] = FIELD_PREP(GENMASK(31, 1), wx->speed) | link_up; + if (wx->notify_down) + msgbuf[1] |= WX_PF_NOFITY_VF_NET_NOT_RUNNING; + for (i = 0; i < wx->num_vfs; i++) { + if (wx->vfinfo[i].clear_to_send) + msgbuf[0] |= WX_VT_MSGTYPE_CTS; + wx_write_mbx_pf(wx, msgbuf, 2, i); + } +} +EXPORT_SYMBOL(wx_ping_all_vfs_with_link_status); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h index d3f29617c7d31..376d8e0e49f38 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h @@ -11,5 +11,7 @@ void wx_disable_sriov(struct wx *wx); int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs); void wx_msg_task(struct wx *wx); +void wx_disable_vf_rx_tx(struct wx *wx); +void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up); #endif /* _WX_SRIOV_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index a5ca2ca0aba79..9b9345290594f 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -92,6 +92,7 @@ /************************* Port Registers ************************************/ /* port cfg Registers */ #define WX_CFG_PORT_CTL 0x14400 +#define WX_CFG_PORT_CTL_PFRSTD BIT(14) #define WX_CFG_PORT_CTL_DRV_LOAD BIT(3) #define WX_CFG_PORT_CTL_QINQ BIT(2) #define WX_CFG_PORT_CTL_D_VLAN BIT(0) /* double vlan*/ @@ -1231,6 +1232,7 @@ struct wx { u8 swfw_index; /* PHY stuff */ + bool notify_down; unsigned int link; int speed; int duplex; diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index a6159214ec0a9..fd102078f5c92 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -15,6 +15,8 @@ #include "../libwx/wx_hw.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" +#include "../libwx/wx_mbx.h" +#include "../libwx/wx_sriov.h" #include "ngbe_type.h" #include "ngbe_mdio.h" #include "ngbe_hw.h" @@ -129,6 +131,10 @@ static int ngbe_sw_init(struct wx *wx) wx->tx_work_limit = 
NGBE_DEFAULT_TX_WORK; wx->rx_work_limit = NGBE_DEFAULT_RX_WORK; + wx->mbx.size = WX_VXMAILBOX_SIZE; + wx->setup_tc = ngbe_setup_tc; + set_bit(0, &wx->fwd_bitmask); + return 0; } @@ -200,12 +206,10 @@ static irqreturn_t ngbe_intr(int __always_unused irq, void *data) return IRQ_HANDLED; } -static irqreturn_t ngbe_msix_other(int __always_unused irq, void *data) +static irqreturn_t __ngbe_msix_misc(struct wx *wx, u32 eicr) { - struct wx *wx = data; - u32 eicr; - - eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (eicr & NGBE_PX_MISC_IC_VF_MBOX) + wx_msg_task(wx); if (unlikely(eicr & NGBE_PX_MISC_IC_TIMESYNC)) wx_ptp_check_pps_event(wx); @@ -217,6 +221,35 @@ static irqreturn_t ngbe_msix_other(int __always_unused irq, void *data) return IRQ_HANDLED; } +static irqreturn_t ngbe_msix_misc(int __always_unused irq, void *data) +{ + struct wx *wx = data; + u32 eicr; + + eicr = wx_misc_isb(wx, WX_ISB_MISC); + + return __ngbe_msix_misc(wx, eicr); +} + +static irqreturn_t ngbe_misc_and_queue(int __always_unused irq, void *data) +{ + struct wx_q_vector *q_vector; + struct wx *wx = data; + u32 eicr; + + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) { + /* queue */ + q_vector = wx->q_vector[0]; + napi_schedule_irqoff(&q_vector->napi); + if (netif_running(wx->netdev)) + ngbe_irq_enable(wx, true); + return IRQ_HANDLED; + } + + return __ngbe_msix_misc(wx, eicr); +} + /** * ngbe_request_msix_irqs - Initialize MSI-X interrupts * @wx: board private structure @@ -249,8 +282,16 @@ static int ngbe_request_msix_irqs(struct wx *wx) } } - err = request_irq(wx->msix_entry->vector, - ngbe_msix_other, 0, netdev->name, wx); + /* Due to hardware design, when num_vfs < 7, pf can use 0 for misc and 1 + * for queue. But when num_vfs == 7, vector[1] is assigned to vf6. + * Misc and queue should reuse interrupt vector[0]. 
+ */ + if (wx->num_vfs == 7) + err = request_irq(wx->msix_entry->vector, + ngbe_misc_and_queue, 0, netdev->name, wx); + else + err = request_irq(wx->msix_entry->vector, + ngbe_msix_misc, 0, netdev->name, wx); if (err) { wx_err(wx, "request_irq for msix_other failed: %d\n", err); @@ -302,6 +343,22 @@ static void ngbe_disable_device(struct wx *wx) struct net_device *netdev = wx->netdev; u32 i; + if (wx->num_vfs) { + /* Clear EITR Select mapping */ + wr32(wx, WX_PX_ITRSEL, 0); + + /* Mark all the VFs as inactive */ + for (i = 0; i < wx->num_vfs; i++) + wx->vfinfo[i].clear_to_send = 0; + wx->notify_down = true; + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); + wx->notify_down = false; + + /* Disable all VFTE/VFRE TX/RX */ + wx_disable_vf_rx_tx(wx); + } + /* disable all enabled rx queues */ for (i = 0; i < wx->num_rx_queues; i++) /* this call also flushes the previous write */ @@ -324,12 +381,19 @@ static void ngbe_disable_device(struct wx *wx) wx_update_stats(wx); } +static void ngbe_reset(struct wx *wx) +{ + wx_flush_sw_mac_table(wx); + wx_mac_set_default_filter(wx, wx->mac.addr); + if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) + wx_ptp_reset(wx); +} + void ngbe_down(struct wx *wx) { phylink_stop(wx->phylink); ngbe_disable_device(wx); - if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) - wx_ptp_reset(wx); + ngbe_reset(wx); wx_clean_all_tx_rings(wx); wx_clean_all_rx_rings(wx); } @@ -352,6 +416,11 @@ void ngbe_up(struct wx *wx) ngbe_sfp_modules_txrx_powerctl(wx, true); phylink_start(wx->phylink); + /* Set PF Reset Done bit so PF/VF Mail Ops can work */ + wr32m(wx, WX_CFG_PORT_CTL, + WX_CFG_PORT_CTL_PFRSTD, WX_CFG_PORT_CTL_PFRSTD); + if (wx->num_vfs) + wx_ping_all_vfs_with_link_status(wx, false); } /** @@ -596,6 +665,10 @@ static int ngbe_probe(struct pci_dev *pdev, goto err_pci_release_regions; } + /* The emerald supports up to 8 VFs per pf, but physical + * function also need one pool for basic networking. 
+ */ + pci_sriov_set_totalvfs(pdev, NGBE_MAX_VFS_DRV_LIMIT); wx->driver_name = ngbe_driver_name; ngbe_set_ethtool_ops(netdev); netdev->netdev_ops = &ngbe_netdev_ops; @@ -743,6 +816,7 @@ static void ngbe_remove(struct pci_dev *pdev) struct net_device *netdev; netdev = wx->netdev; + wx_disable_sriov(wx); unregister_netdev(netdev); phylink_destroy(wx->phylink); pci_release_selected_regions(pdev, @@ -802,6 +876,7 @@ static struct pci_driver ngbe_driver = { .suspend = ngbe_suspend, .resume = ngbe_resume, .shutdown = ngbe_shutdown, + .sriov_configure = wx_pci_sriov_configure, }; module_pci_driver(ngbe_driver); diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c index ea1d7e9a91f3d..c63bb6e6f4056 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_mdio.c @@ -9,6 +9,7 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_ptp.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_sriov.h" #include "ngbe_type.h" #include "ngbe_mdio.h" @@ -70,6 +71,8 @@ static void ngbe_mac_link_down(struct phylink_config *config, wx->speed = SPEED_UNKNOWN; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); } static void ngbe_mac_link_up(struct phylink_config *config, @@ -114,6 +117,8 @@ static void ngbe_mac_link_up(struct phylink_config *config, wx->last_rx_ptp_check = jiffies; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going up */ + wx_ping_all_vfs_with_link_status(wx, true); } static const struct phylink_mac_ops ngbe_mac_ops = { diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h b/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h index 992adbb98c7d1..bb74263f04985 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_type.h @@ -73,12 +73,14 @@ #define NGBE_PX_MISC_IEN_TIMESYNC BIT(11) #define NGBE_PX_MISC_IEN_ETH_LK BIT(18) #define NGBE_PX_MISC_IEN_INT_ERR BIT(20) +#define NGBE_PX_MISC_IC_VF_MBOX BIT(23) #define NGBE_PX_MISC_IEN_GPIO BIT(26) #define NGBE_PX_MISC_IEN_MASK ( \ NGBE_PX_MISC_IEN_DEV_RST | \ NGBE_PX_MISC_IEN_TIMESYNC | \ NGBE_PX_MISC_IEN_ETH_LK | \ NGBE_PX_MISC_IEN_INT_ERR | \ + NGBE_PX_MISC_IC_VF_MBOX | \ NGBE_PX_MISC_IEN_GPIO) /* Extended Interrupt Cause Read */ @@ -134,6 +136,7 @@ #define NGBE_MAX_RXD 8192 #define NGBE_MIN_RXD 128 +#define NGBE_MAX_VFS_DRV_LIMIT 7 extern char ngbe_driver_name[]; void ngbe_down(struct wx *wx); From a9843689e2de1a3727d58b4225e4f8664937aefd Mon Sep 17 00:00:00 2001 From: Mengyuan Lou Date: Tue, 8 Apr 2025 17:15:56 +0800 Subject: [PATCH 043/770] net: txgbe: add sriov function support Add sriov_configure for driver ops. Add mailbox handler wx_msg_task for txgbe. 
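For context, the .sriov_configure callback wired up here follows the standard PCI
contract: called with the requested VF count (0 meaning "disable"), it returns the
number of VFs actually enabled or a negative errno. A minimal illustrative sketch of
that contract (hypothetical driver; the real handler in this series is
wx_pci_sriov_configure() in libwx, which also manages the shared mailbox state):

'''
#include <linux/pci.h>

/* Sketch of the .sriov_configure contract, not the wx implementation. */
static int example_pci_sriov_configure(struct pci_dev *dev, int num_vfs)
{
	int ret;

	if (num_vfs == 0) {
		pci_disable_sriov(dev);		/* tear down all VFs */
		return 0;
	}

	ret = pci_enable_sriov(dev, num_vfs);
	if (ret)
		return ret;			/* negative errno */

	return num_vfs;				/* VFs now enabled */
}
'''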
Signed-off-by: Mengyuan Lou Link: https://patch.msgid.link/ECDC57CF4F2316B9+20250408091556.9640-7-mengyuanlou@net-swift.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_sriov.c | 42 +++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_sriov.h | 1 + drivers/net/ethernet/wangxun/libwx/wx_type.h | 1 + .../net/ethernet/wangxun/txgbe/txgbe_irq.c | 21 ++++++++-- .../net/ethernet/wangxun/txgbe/txgbe_main.c | 27 ++++++++++++ .../net/ethernet/wangxun/txgbe/txgbe_phy.c | 6 +++ .../net/ethernet/wangxun/txgbe/txgbe_type.h | 7 +++- 7 files changed, 101 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c index a31b574a343e8..52e6a6faf7153 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.c @@ -282,6 +282,15 @@ static void wx_clear_vmvir(struct wx *wx, u32 vf) wr32(wx, WX_TDM_VLAN_INS(vf), 0); } +static void wx_ping_vf(struct wx *wx, int vf) +{ + u32 ping = WX_PF_CONTROL_MSG; + + if (wx->vfinfo[vf].clear_to_send) + ping |= WX_VT_MSGTYPE_CTS; + wx_write_mbx_pf(wx, &ping, 1, vf); +} + static void wx_set_vf_rx_tx(struct wx *wx, int vf) { u32 index = WX_VF_REG_OFFSET(vf), vf_bit = WX_VF_IND_SHIFT(vf); @@ -865,3 +874,36 @@ void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up) } } EXPORT_SYMBOL(wx_ping_all_vfs_with_link_status); + +static void wx_set_vf_link_state(struct wx *wx, int vf, int state) +{ + wx->vfinfo[vf].link_state = state; + switch (state) { + case IFLA_VF_LINK_STATE_AUTO: + if (netif_running(wx->netdev)) + wx->vfinfo[vf].link_enable = true; + else + wx->vfinfo[vf].link_enable = false; + break; + case IFLA_VF_LINK_STATE_ENABLE: + wx->vfinfo[vf].link_enable = true; + break; + case IFLA_VF_LINK_STATE_DISABLE: + wx->vfinfo[vf].link_enable = false; + break; + } + /* restart the VF */ + wx->vfinfo[vf].clear_to_send = false; + wx_ping_vf(wx, vf); + + wx_set_vf_rx_tx(wx, vf); +} + +void wx_set_all_vfs(struct wx *wx) +{ + int i; + + for (i = 0; i < wx->num_vfs; i++) + wx_set_vf_link_state(wx, i, wx->vfinfo[i].link_state); +} +EXPORT_SYMBOL(wx_set_all_vfs); diff --git a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h index 376d8e0e49f38..8a3a47bb58155 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_sriov.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_sriov.h @@ -13,5 +13,6 @@ int wx_pci_sriov_configure(struct pci_dev *pdev, int num_vfs); void wx_msg_task(struct wx *wx); void wx_disable_vf_rx_tx(struct wx *wx); void wx_ping_all_vfs_with_link_status(struct wx *wx, bool link_up); +void wx_set_all_vfs(struct wx *wx); #endif /* _WX_SRIOV_H_ */ diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h index 9b9345290594f..e13172c9eeedd 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_type.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h @@ -1173,6 +1173,7 @@ struct vf_data_storage { u16 vf_mc_hashes[WX_MAX_VF_MC_ENTRIES]; u16 num_vf_mc_hashes; u16 vlan_count; + int link_state; }; struct vf_macvlans { diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c index 8658a51ee8106..3b9e831cf0ef3 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_irq.c @@ -7,6 +7,7 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_sriov.h" #include "txgbe_type.h" #include 
"txgbe_phy.h" #include "txgbe_irq.h" @@ -109,8 +110,17 @@ static irqreturn_t txgbe_misc_irq_handle(int irq, void *data) struct wx *wx = txgbe->wx; u32 eicr; - if (wx->pdev->msix_enabled) + if (wx->pdev->msix_enabled) { + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) + return IRQ_NONE; + txgbe->eicr = eicr; + if (eicr & TXGBE_PX_MISC_IC_VF_MBOX) { + wx_msg_task(txgbe->wx); + wx_intr_enable(wx, TXGBE_INTR_MISC); + } return IRQ_WAKE_THREAD; + } eicr = wx_misc_isb(wx, WX_ISB_VEC0); if (!eicr) { @@ -129,6 +139,11 @@ static irqreturn_t txgbe_misc_irq_handle(int irq, void *data) q_vector = wx->q_vector[0]; napi_schedule_irqoff(&q_vector->napi); + eicr = wx_misc_isb(wx, WX_ISB_MISC); + if (!eicr) + return IRQ_NONE; + txgbe->eicr = eicr; + return IRQ_WAKE_THREAD; } @@ -140,7 +155,7 @@ static irqreturn_t txgbe_misc_irq_thread_fn(int irq, void *data) unsigned int sub_irq; u32 eicr; - eicr = wx_misc_isb(wx, WX_ISB_MISC); + eicr = txgbe->eicr; if (eicr & (TXGBE_PX_MISC_ETH_LK | TXGBE_PX_MISC_ETH_LKDN | TXGBE_PX_MISC_ETH_AN)) { sub_irq = irq_find_mapping(txgbe->misc.domain, TXGBE_IRQ_LINK); @@ -183,7 +198,7 @@ int txgbe_setup_misc_irq(struct txgbe *txgbe) if (wx->mac.type == wx_mac_aml) goto skip_sp_irq; - txgbe->misc.nirqs = 1; + txgbe->misc.nirqs = TXGBE_IRQ_MAX; txgbe->misc.domain = irq_domain_add_simple(NULL, txgbe->misc.nirqs, 0, &txgbe_misc_irq_domain_ops, txgbe); if (!txgbe->misc.domain) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index a2e245e3b0168..6d9134a3ce4df 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -15,6 +15,8 @@ #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" #include "../libwx/wx_hw.h" +#include "../libwx/wx_mbx.h" +#include "../libwx/wx_sriov.h" #include "txgbe_type.h" #include "txgbe_hw.h" #include "txgbe_phy.h" @@ -117,6 +119,12 @@ static void txgbe_up_complete(struct wx *wx) /* enable transmits */ netif_tx_start_all_queues(netdev); + + /* Set PF Reset Done bit so PF/VF Mail Ops can work */ + wr32m(wx, WX_CFG_PORT_CTL, WX_CFG_PORT_CTL_PFRSTD, + WX_CFG_PORT_CTL_PFRSTD); + /* update setting rx tx for all active vfs */ + wx_set_all_vfs(wx); } static void txgbe_reset(struct wx *wx) @@ -165,6 +173,16 @@ static void txgbe_disable_device(struct wx *wx) wx_err(wx, "%s: invalid bus lan id %d\n", __func__, wx->bus.func); + if (wx->num_vfs) { + /* Clear EITR Select mapping */ + wr32(wx, WX_PX_ITRSEL, 0); + /* Mark all the VFs as inactive */ + for (i = 0; i < wx->num_vfs; i++) + wx->vfinfo[i].clear_to_send = 0; + /* update setting rx tx for all active vfs */ + wx_set_all_vfs(wx); + } + if (!(((wx->subsystem_device_id & WX_NCSI_MASK) == WX_NCSI_SUP) || ((wx->subsystem_device_id & WX_WOL_MASK) == WX_WOL_SUP))) { /* disable mac transmiter */ @@ -307,12 +325,15 @@ static int txgbe_sw_init(struct wx *wx) /* set default ring sizes */ wx->tx_ring_count = TXGBE_DEFAULT_TXD; wx->rx_ring_count = TXGBE_DEFAULT_RXD; + wx->mbx.size = WX_VXMAILBOX_SIZE; /* set default work limits */ wx->tx_work_limit = TXGBE_DEFAULT_TX_WORK; wx->rx_work_limit = TXGBE_DEFAULT_RX_WORK; + wx->setup_tc = txgbe_setup_tc; wx->do_reset = txgbe_do_reset; + set_bit(0, &wx->fwd_bitmask); switch (wx->mac.type) { case wx_mac_sp: @@ -604,6 +625,10 @@ static int txgbe_probe(struct pci_dev *pdev, goto err_pci_release_regions; } + /* The sapphire supports up to 63 VFs per pf, but physical + * function also need one pool for basic networking. 
+ */ + pci_sriov_set_totalvfs(pdev, TXGBE_MAX_VFS_DRV_LIMIT); wx->driver_name = txgbe_driver_name; txgbe_set_ethtool_ops(netdev); netdev->netdev_ops = &txgbe_netdev_ops; @@ -794,6 +819,7 @@ static void txgbe_remove(struct pci_dev *pdev) struct net_device *netdev; netdev = wx->netdev; + wx_disable_sriov(wx); unregister_netdev(netdev); txgbe_remove_phy(txgbe); @@ -816,6 +842,7 @@ static struct pci_driver txgbe_driver = { .probe = txgbe_probe, .remove = txgbe_remove, .shutdown = txgbe_shutdown, + .sriov_configure = wx_pci_sriov_configure, }; module_pci_driver(txgbe_driver); diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c index 85f022ceef4fe..1863cfd27ee79 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_phy.c @@ -16,6 +16,8 @@ #include "../libwx/wx_type.h" #include "../libwx/wx_lib.h" #include "../libwx/wx_ptp.h" +#include "../libwx/wx_sriov.h" +#include "../libwx/wx_mbx.h" #include "../libwx/wx_hw.h" #include "txgbe_type.h" #include "txgbe_phy.h" @@ -184,6 +186,8 @@ static void txgbe_mac_link_down(struct phylink_config *config, wx->speed = SPEED_UNKNOWN; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going down */ + wx_ping_all_vfs_with_link_status(wx, false); } static void txgbe_mac_link_up(struct phylink_config *config, @@ -225,6 +229,8 @@ static void txgbe_mac_link_up(struct phylink_config *config, wx->last_rx_ptp_check = jiffies; if (test_bit(WX_STATE_PTP_RUNNING, wx->state)) wx_ptp_reset_cyclecounter(wx); + /* ping all the active vfs to let them know we are going up */ + wx_ping_all_vfs_with_link_status(wx, true); } static int txgbe_mac_prepare(struct phylink_config *config, unsigned int mode, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h index 9c1c26234cad9..5937cbc6bd057 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h @@ -77,11 +77,13 @@ #define TXGBE_PX_MISC_ETH_LK BIT(18) #define TXGBE_PX_MISC_ETH_AN BIT(19) #define TXGBE_PX_MISC_INT_ERR BIT(20) +#define TXGBE_PX_MISC_IC_VF_MBOX BIT(23) #define TXGBE_PX_MISC_GPIO BIT(26) #define TXGBE_PX_MISC_IEN_MASK \ (TXGBE_PX_MISC_ETH_LKDN | TXGBE_PX_MISC_DEV_RST | \ TXGBE_PX_MISC_ETH_EVENT | TXGBE_PX_MISC_ETH_LK | \ - TXGBE_PX_MISC_ETH_AN | TXGBE_PX_MISC_INT_ERR) + TXGBE_PX_MISC_ETH_AN | TXGBE_PX_MISC_INT_ERR | \ + TXGBE_PX_MISC_IC_VF_MBOX) /* Port cfg registers */ #define TXGBE_CFG_PORT_ST 0x14404 @@ -174,6 +176,8 @@ #define TXGBE_SP_RX_PB_SIZE 512 #define TXGBE_SP_TDB_PB_SZ (160 * 1024) /* 160KB Packet Buffer */ +#define TXGBE_MAX_VFS_DRV_LIMIT 63 + #define TXGBE_DEFAULT_ATR_SAMPLE_RATE 20 /* Software ATR hash keys */ @@ -348,6 +352,7 @@ struct txgbe { struct clk *clk; struct gpio_chip *gpio; unsigned int link_irq; + u32 eicr; /* flow director */ struct hlist_head fdir_filter_list; From 3e730fe2af867d49b8c0eed65fe824079f7f377a Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Thu, 10 Apr 2025 03:31:15 +0000 Subject: [PATCH 044/770] selftests/xsk: Add packet stream replacement function Add pkt_stream_replace_ifobject function to replace the packet stream for a given ifobject. Enable separate TX and RX packet replacement, allowing RX side packet length adjustments using bpf_xdp_adjust_tail() in the upcoming patch. 
Currently, pkt_stream_replace() works on both TX and RX packet streams, and this new function provides the ability to modify one of them. Reviewed-by: Maciej Fijalkowski Signed-off-by: Tushar Vyavahare Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250410033116.173617-2-tushar.vyavahare@intel.com --- tools/testing/selftests/bpf/xskxceiver.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index 11f047b8af752..d60ee6a31c09e 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -757,14 +757,15 @@ static struct pkt_stream *pkt_stream_clone(struct pkt_stream *pkt_stream) return pkt_stream_generate(pkt_stream->nb_pkts, pkt_stream->pkts[0].len); } -static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +static void pkt_stream_replace_ifobject(struct ifobject *ifobj, u32 nb_pkts, u32 pkt_len) { - struct pkt_stream *pkt_stream; + ifobj->xsk->pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); +} - pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); - test->ifobj_tx->xsk->pkt_stream = pkt_stream; - pkt_stream = pkt_stream_generate(nb_pkts, pkt_len); - test->ifobj_rx->xsk->pkt_stream = pkt_stream; +static void pkt_stream_replace(struct test_spec *test, u32 nb_pkts, u32 pkt_len) +{ + pkt_stream_replace_ifobject(test->ifobj_tx, nb_pkts, pkt_len); + pkt_stream_replace_ifobject(test->ifobj_rx, nb_pkts, pkt_len); } static void __pkt_stream_replace_half(struct ifobject *ifobj, u32 pkt_len, From 4b302092553c204599d02a97a10f5b9b70f2c0a0 Mon Sep 17 00:00:00 2001 From: Tushar Vyavahare Date: Thu, 10 Apr 2025 03:31:16 +0000 Subject: [PATCH 045/770] selftests/xsk: Add tail adjustment tests and support check Introduce tail adjustment functionality in xskxceiver using bpf_xdp_adjust_tail(). Add `xsk_xdp_adjust_tail` to modify packet sizes and drop unmodified packets. Implement `is_adjust_tail_supported` to check helper availability. Develop packet resizing tests, including shrinking and growing scenarios, with functions for both single-buffer and multi-buffer cases. Update the test framework to handle various scenarios and adjust MTU settings. These changes enhance the testing of packet tail adjustments, improving AF_XDP framework reliability. 
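As background for the tests below, bpf_xdp_adjust_tail() moves the end of the frame
by a signed delta: negative shrinks, positive grows, and the helper returns a
negative error when the request cannot be satisfied (for example, growth beyond the
available tailroom). A minimal standalone XDP sketch, deliberately simpler than the
xsk_xdp_adjust_tail program added in this patch:

'''
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Illustrative only: shrink every packet by 4 bytes and pass it on. */
SEC("xdp")
int shrink_by_four(struct xdp_md *ctx)
{
	if (bpf_xdp_adjust_tail(ctx, -4) < 0)
		return XDP_DROP;	/* adjustment not possible */

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";
'''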
Reviewed-by: Maciej Fijalkowski Signed-off-by: Tushar Vyavahare Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250410033116.173617-3-tushar.vyavahare@intel.com --- .../selftests/bpf/progs/xsk_xdp_progs.c | 50 +++++++++ tools/testing/selftests/bpf/xsk_xdp_common.h | 1 + tools/testing/selftests/bpf/xskxceiver.c | 105 +++++++++++++++++- tools/testing/selftests/bpf/xskxceiver.h | 2 + 4 files changed, 156 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c index ccde6a4c63196..683306db8594e 100644 --- a/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c +++ b/tools/testing/selftests/bpf/progs/xsk_xdp_progs.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include "xsk_xdp_common.h" struct { @@ -14,6 +16,7 @@ struct { } xsk SEC(".maps"); static unsigned int idx; +int adjust_value = 0; int count = 0; SEC("xdp.frags") int xsk_def_prog(struct xdp_md *xdp) @@ -70,4 +73,51 @@ SEC("xdp") int xsk_xdp_shared_umem(struct xdp_md *xdp) return bpf_redirect_map(&xsk, idx, XDP_DROP); } +SEC("xdp.frags") int xsk_xdp_adjust_tail(struct xdp_md *xdp) +{ + __u32 buff_len, curr_buff_len; + int ret; + + buff_len = bpf_xdp_get_buff_len(xdp); + if (buff_len == 0) + return XDP_DROP; + + ret = bpf_xdp_adjust_tail(xdp, adjust_value); + if (ret < 0) { + /* Handle unsupported cases */ + if (ret == -EOPNOTSUPP) { + /* Set adjust_value to -EOPNOTSUPP to indicate to userspace that this case + * is unsupported + */ + adjust_value = -EOPNOTSUPP; + return bpf_redirect_map(&xsk, 0, XDP_DROP); + } + + return XDP_DROP; + } + + curr_buff_len = bpf_xdp_get_buff_len(xdp); + if (curr_buff_len != buff_len + adjust_value) + return XDP_DROP; + + if (curr_buff_len > buff_len) { + __u32 *pkt_data = (void *)(long)xdp->data; + __u32 len, words_to_end, seq_num; + + len = curr_buff_len - PKT_HDR_ALIGN; + words_to_end = len / sizeof(*pkt_data) - 1; + seq_num = words_to_end; + + /* Convert sequence number to network byte order. Store this in the last 4 bytes of + * the packet. Use 'adjust_value' to determine the position at the end of the + * packet for storing the sequence number. 
+ */ + seq_num = __constant_htonl(words_to_end); + bpf_xdp_store_bytes(xdp, curr_buff_len - sizeof(seq_num), &seq_num, + sizeof(seq_num)); + } + + return bpf_redirect_map(&xsk, 0, XDP_DROP); +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/xsk_xdp_common.h b/tools/testing/selftests/bpf/xsk_xdp_common.h index 5a6f36f07383c..45810ff552dae 100644 --- a/tools/testing/selftests/bpf/xsk_xdp_common.h +++ b/tools/testing/selftests/bpf/xsk_xdp_common.h @@ -4,6 +4,7 @@ #define XSK_XDP_COMMON_H_ #define MAX_SOCKETS 2 +#define PKT_HDR_ALIGN (sizeof(struct ethhdr) + 2) /* Just to align the data in the packet */ struct xdp_info { __u64 count; diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c index d60ee6a31c09e..0ced4026ee443 100644 --- a/tools/testing/selftests/bpf/xskxceiver.c +++ b/tools/testing/selftests/bpf/xskxceiver.c @@ -524,6 +524,8 @@ static void __test_spec_init(struct test_spec *test, struct ifobject *ifobj_tx, test->nb_sockets = 1; test->fail = false; test->set_ring = false; + test->adjust_tail = false; + test->adjust_tail_support = false; test->mtu = MAX_ETH_PKT_SIZE; test->xdp_prog_rx = ifobj_rx->xdp_progs->progs.xsk_def_prog; test->xskmap_rx = ifobj_rx->xdp_progs->maps.xsk; @@ -992,6 +994,31 @@ static bool is_metadata_correct(struct pkt *pkt, void *buffer, u64 addr) return true; } +static bool is_adjust_tail_supported(struct xsk_xdp_progs *skel_rx) +{ + struct bpf_map *data_map; + int adjust_value = 0; + int key = 0; + int ret; + + data_map = bpf_object__find_map_by_name(skel_rx->obj, "xsk_xdp_.bss"); + if (!data_map || !bpf_map__is_internal(data_map)) { + ksft_print_msg("Error: could not find bss section of XDP program\n"); + exit_with_error(errno); + } + + ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &key, &adjust_value); + if (ret) { + ksft_print_msg("Error: bpf_map_lookup_elem failed with error %d\n", ret); + exit_with_error(errno); + } + + /* Set the 'adjust_value' variable to -EOPNOTSUPP in the XDP program if the adjust_tail + * helper is not supported. Skip the adjust_tail test case in this scenario. 
+ */ + return adjust_value != -EOPNOTSUPP; +} + static bool is_frag_valid(struct xsk_umem_info *umem, u64 addr, u32 len, u32 expected_pkt_nb, u32 bytes_processed) { @@ -1768,8 +1795,13 @@ static void *worker_testapp_validate_rx(void *arg) if (!err && ifobject->validation_func) err = ifobject->validation_func(ifobject); - if (err) - report_failure(test); + + if (err) { + if (test->adjust_tail && !is_adjust_tail_supported(ifobject->xdp_progs)) + test->adjust_tail_support = false; + else + report_failure(test); + } pthread_exit(NULL); } @@ -2516,6 +2548,71 @@ static int testapp_hw_sw_max_ring_size(struct test_spec *test) return testapp_validate_traffic(test); } +static int testapp_xdp_adjust_tail(struct test_spec *test, int adjust_value) +{ + struct xsk_xdp_progs *skel_rx = test->ifobj_rx->xdp_progs; + struct xsk_xdp_progs *skel_tx = test->ifobj_tx->xdp_progs; + + test_spec_set_xdp_prog(test, skel_rx->progs.xsk_xdp_adjust_tail, + skel_tx->progs.xsk_xdp_adjust_tail, + skel_rx->maps.xsk, skel_tx->maps.xsk); + + skel_rx->bss->adjust_value = adjust_value; + + return testapp_validate_traffic(test); +} + +static int testapp_adjust_tail(struct test_spec *test, u32 value, u32 pkt_len) +{ + int ret; + + test->adjust_tail_support = true; + test->adjust_tail = true; + test->total_steps = 1; + + pkt_stream_replace_ifobject(test->ifobj_tx, DEFAULT_BATCH_SIZE, pkt_len); + pkt_stream_replace_ifobject(test->ifobj_rx, DEFAULT_BATCH_SIZE, pkt_len + value); + + ret = testapp_xdp_adjust_tail(test, value); + if (ret) + return ret; + + if (!test->adjust_tail_support) { + ksft_test_result_skip("%s %sResize pkt with bpf_xdp_adjust_tail() not supported\n", + mode_string(test), busy_poll_string(test)); + return TEST_SKIP; + } + + return 0; +} + +static int testapp_adjust_tail_shrink(struct test_spec *test) +{ + /* Shrink by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, -4, MIN_PKT_SIZE * 2); +} + +static int testapp_adjust_tail_shrink_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Shrink by the frag size */ + return testapp_adjust_tail(test, -XSK_UMEM__MAX_FRAME_SIZE, XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + +static int testapp_adjust_tail_grow(struct test_spec *test) +{ + /* Grow by 4 bytes for testing purpose */ + return testapp_adjust_tail(test, 4, MIN_PKT_SIZE * 2); +} + +static int testapp_adjust_tail_grow_mb(struct test_spec *test) +{ + test->mtu = MAX_ETH_JUMBO_SIZE; + /* Grow by (frag_size - last_frag_Size) - 1 to stay inside the last fragment */ + return testapp_adjust_tail(test, (XSK_UMEM__MAX_FRAME_SIZE / 2) - 1, + XSK_UMEM__LARGE_FRAME_SIZE * 2); +} + static void run_pkt_test(struct test_spec *test) { int ret; @@ -2622,6 +2719,10 @@ static const struct test_spec tests[] = { {.name = "TOO_MANY_FRAGS", .test_func = testapp_too_many_frags}, {.name = "HW_SW_MIN_RING_SIZE", .test_func = testapp_hw_sw_min_ring_size}, {.name = "HW_SW_MAX_RING_SIZE", .test_func = testapp_hw_sw_max_ring_size}, + {.name = "XDP_ADJUST_TAIL_SHRINK", .test_func = testapp_adjust_tail_shrink}, + {.name = "XDP_ADJUST_TAIL_SHRINK_MULTI_BUFF", .test_func = testapp_adjust_tail_shrink_mb}, + {.name = "XDP_ADJUST_TAIL_GROW", .test_func = testapp_adjust_tail_grow}, + {.name = "XDP_ADJUST_TAIL_GROW_MULTI_BUFF", .test_func = testapp_adjust_tail_grow_mb}, }; static void print_tests(void) diff --git a/tools/testing/selftests/bpf/xskxceiver.h b/tools/testing/selftests/bpf/xskxceiver.h index e46e823f6a1aa..67fc44b2813b2 100644 --- a/tools/testing/selftests/bpf/xskxceiver.h +++ 
b/tools/testing/selftests/bpf/xskxceiver.h @@ -173,6 +173,8 @@ struct test_spec { u16 nb_sockets; bool fail; bool set_ring; + bool adjust_tail; + bool adjust_tail_support; enum test_mode mode; char name[MAX_TEST_NAME_SIZE]; }; From 709894c52c1cafa36fe2047ba5a0b83bdf398133 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 9 Apr 2025 14:50:58 +0200 Subject: [PATCH 046/770] af_unix: Remove unix_unhash() Dummy unix_unhash() was introduced for sockmap in commit 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap"), but there's no need to implement it anymore. ->unhash() is only called conditionally: in unix_shutdown() since commit d359902d5c35 ("af_unix: Fix NULL pointer bug in unix_shutdown"), and in BPF proto's sock_map_unhash() since commit 5b4a79ba65a1 ("bpf, sockmap: Don't let sock_map_{close,destroy,unhash} call itself"). Remove it. Signed-off-by: Michal Luczaj Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250409-cleanup-drop-unix-unhash-v1-1-1659e5b8ee84@rbox.co Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f9..2ab20821d6bb2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -950,13 +950,6 @@ static void unix_close(struct sock *sk, long timeout) */ } -static void unix_unhash(struct sock *sk) -{ - /* Nothing to do here, unix socket does not need a ->unhash(). - * This is merely for sockmap. - */ -} - static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { @@ -987,7 +980,6 @@ struct proto unix_stream_proto = { .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, - .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, From 04271411121a58d37f47b065bc872f333274bf1f Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:04 +0800 Subject: [PATCH 047/770] tcp: add TCP_RFC7323_TW_PAWS drop reason Devices in the networking path, such as firewalls, NATs, or routers, which can perform SNAT or DNAT, use addresses from their own limited address pools to masquerade the source address during forwarding, causing PAWS verification to fail more easily. Currently, packet loss statistics for PAWS can only be viewed through MIB, which is a global metric and cannot be precisely obtained through tracing to get the specific 4-tuple of the dropped packet. In the past, we had to use kprobe ret to retrieve relevant skb information from tcp_timewait_state_process(). We add a drop_reason pointer, similar to what previous commit does: commit e34100c2ecbb ("tcp: add a drop_reason pointer to tcp_check_req()") This commit addresses the PAWSESTABREJECTED case and also sets the corresponding drop reason. We use 'pwru' to test. Before this commit: '''' ./pwru 'port 9999' 2025/04/07 13:40:19 Listening for events.. TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_NOT_SPECIFIED) ''' After this commit: ''' ./pwru 'port 9999' 2025/04/07 13:51:34 Listening for events.. 
TUPLE FUNC 172.31.75.115:12345->172.31.75.114:9999(tcp) sk_skb_reason_drop(SKB_DROP_REASON_TCP_RFC7323_TW_PAWS) ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-2-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ include/net/tcp.h | 3 ++- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 7 +++++-- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e4fdc6b54ceff..9701d7f936f68 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,7 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ FN(TCP_RFC7323_TSECR) \ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ @@ -283,6 +284,11 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. * Corresponds to LINUX_MIB_TSECRREJECTED. diff --git a/include/net/tcp.h b/include/net/tcp.h index 4450c384ef178..5078ad868feef 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -427,7 +427,8 @@ enum tcp_tw_status { enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race, enum skb_drop_reason *drop_reason); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8cce0d5489dae..d5b5c32115d2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2417,7 +2417,8 @@ int tcp_v4_rcv(struct sk_buff *skb) goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index fb9349be36b80..27511bf58c0f5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th, u32 *tw_isn) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); @@ -245,8 +246,10 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, return TCP_TW_SYN; } - if (paws_reject) + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index b03c223eda4fa..7dcb33f879ee1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1970,7 +1970,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) goto csum_error; } - tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn); + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); switch (tw_status) { case TCP_TW_SYN: { From c449d5f3a3d70b6223af8df2cadca3ca6eacb613 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 9 Apr 2025 19:26:05 +0800 Subject: [PATCH 048/770] tcp: add LINUX_MIB_PAWS_TW_REJECTED counter When TCP is in TIME_WAIT state, PAWS verification uses LINUX_PAWSESTABREJECTED, which is ambiguous and cannot be distinguished from other PAWS verification processes. We added a new counter, like the existing PAWS_OLD_ACK one. Also we update the doc with previously missing PAWS_OLD_ACK. usage: ''' nstat -az | grep PAWSTimewait TcpExtPAWSTimewait 1 0.0 ''' Suggested-by: Eric Dumazet Signed-off-by: Jiayuan Chen Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250409112614.16153-3-jiayuan.chen@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/snmp.rst | 2 ++ include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_minisocks.c | 2 +- 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index bc96efc92cf5b..bd44b3eebbef7 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -37,6 +37,8 @@ unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED unsigned_long LINUX_MIB_TSECR_REJECTED +unsigned_long LINUX_MIB_PAWS_OLD_ACK +unsigned_long LINUX_MIB_PAWS_TW_REJECTED unsigned_long LINUX_MIB_DELAYEDACKLOST unsigned_long LINUX_MIB_LISTENOVERFLOWS unsigned_long LINUX_MIB_LISTENDROPS diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 9701d7f936f68..bea77934a235c 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -287,6 +287,7 @@ enum skb_drop_reason { /** * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. 
*/ SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, /** diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index ec47f9b68a1bf..1d234d7e18927 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -188,6 +188,7 @@ enum LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ + LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ LINUX_MIB_DELAYEDACKLOCKED, /* DelayedACKLocked */ LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 10cbeb76c2745..ea2f01584379a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -191,6 +191,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), + SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 27511bf58c0f5..43d7852ce07e0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -248,7 +248,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); } if (!th->rst) { From de64872019492e4a84ee053e635cdf8f63f853d2 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 9 Apr 2025 09:02:20 +0100 Subject: [PATCH 049/770] net: stmmac: provide stmmac_pltfr_find_clk() Provide a generic way to find a clock in the bulk data. 
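A platform glue driver would then resolve a bulk clock by its devicetree name and
treat it as optional, since the lookup returns NULL when no entry matches. A hedged
usage sketch ("ptp_ref" is a hypothetical clock name used only for illustration):

'''
/* Sketch of a caller in a glue driver's probe path. */
static int example_glue_probe_clks(struct platform_device *pdev,
				   struct plat_stmmacenet_data *plat_dat)
{
	struct clk *ptp_clk;

	/* NULL means the clock simply isn't in the bulk array */
	ptp_clk = stmmac_pltfr_find_clk(plat_dat, "ptp_ref");
	if (!ptp_clk)
		dev_dbg(&pdev->dev, "no ptp_ref clock, using defaults\n");

	return 0;
}
'''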
Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u2QO4-001Rp2-Dy@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 11 +++++++++++ drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h | 3 +++ 2 files changed, 14 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index c73eff6a56b87..43c869f64c39c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -709,6 +709,17 @@ devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) #endif /* CONFIG_OF */ EXPORT_SYMBOL_GPL(devm_stmmac_probe_config_dt); +struct clk *stmmac_pltfr_find_clk(struct plat_stmmacenet_data *plat_dat, + const char *name) +{ + for (int i = 0; i < plat_dat->num_clks; i++) + if (strcmp(plat_dat->clks[i].id, name) == 0) + return plat_dat->clks[i].clk; + + return NULL; +} +EXPORT_SYMBOL_GPL(stmmac_pltfr_find_clk); + int stmmac_get_platform_resources(struct platform_device *pdev, struct stmmac_resources *stmmac_res) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h index 72dc1a32e46d5..6e6561e29d6e3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.h @@ -14,6 +14,9 @@ struct plat_stmmacenet_data * devm_stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac); +struct clk *stmmac_pltfr_find_clk(struct plat_stmmacenet_data *plat_dat, + const char *name); + int stmmac_get_platform_resources(struct platform_device *pdev, struct stmmac_resources *stmmac_res); From 34e816acdb0db36054e26211f859129259816902 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 9 Apr 2025 09:02:25 +0100 Subject: [PATCH 050/770] net: stmmac: dwc-qos: use stmmac_pltfr_find_clk() Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u2QO9-001Rp8-Ii@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-dwc-qos-eth.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index cd431f84f34f6..5db318327d332 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -34,16 +34,6 @@ struct tegra_eqos { struct gpio_desc *reset; }; -static struct clk *dwc_eth_find_clk(struct plat_stmmacenet_data *plat_dat, - const char *name) -{ - for (int i = 0; i < plat_dat->num_clks; i++) - if (strcmp(plat_dat->clks[i].id, name) == 0) - return plat_dat->clks[i].clk; - - return NULL; -} - static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) { @@ -132,7 +122,7 @@ static int dwc_qos_probe(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *stmmac_res) { - plat_dat->pclk = dwc_eth_find_clk(plat_dat, "phy_ref_clk"); + plat_dat->pclk = stmmac_pltfr_find_clk(plat_dat, "phy_ref_clk"); return 0; } @@ -242,7 +232,7 @@ static int tegra_eqos_probe(struct platform_device *pdev, if (!is_of_node(dev->fwnode)) goto bypass_clk_reset_gpio; - plat_dat->clk_tx_i = dwc_eth_find_clk(plat_dat, "tx"); + plat_dat->clk_tx_i = 
stmmac_pltfr_find_clk(plat_dat, "tx"); eqos->reset = devm_gpiod_get(&pdev->dev, "phy-reset", GPIOD_OUT_HIGH); if (IS_ERR(eqos->reset)) { @@ -362,8 +352,8 @@ static int dwc_eth_dwmac_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "Failed to enable clocks\n"); - plat_dat->stmmac_clk = dwc_eth_find_clk(plat_dat, - data->stmmac_clk_name); + plat_dat->stmmac_clk = stmmac_pltfr_find_clk(plat_dat, + data->stmmac_clk_name); if (data->probe) ret = data->probe(pdev, plat_dat, &stmmac_res); From b1e904999542ad6764eafa54545f1c55776006d1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:01 -0700 Subject: [PATCH 051/770] net: pass const to msg_data_left() The msg_data_left() function doesn't modify the struct msghdr parameter, so mark it as const. This allows the function to be used with const references, improving type safety and making the API more flexible. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-1-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index c3322eb3d6865..3b262487ec060 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } From 0f08335ade71273f89d19412268b48b55f3e3726 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 11:32:02 -0700 Subject: [PATCH 052/770] trace: tcp: Add tracepoint for tcp_sendmsg_locked() Add a tracepoint to monitor TCP send operations, enabling detailed visibility into TCP message transmission. Create a new tracepoint within the tcp_sendmsg_locked function, capturing traditional fields along with size_goal, which indicates the optimal data size for a single TCP segment. Additionally, a reference to the struct sock sk is passed, allowing direct access for BPF programs. The implementation is largely based on David's patch[1] and suggestions. Link: https://lore.kernel.org/all/70168c8f-bf52-4279-b4c4-be64527aa1ac@kernel.org/ [1] Signed-off-by: Breno Leitao Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250408-tcpsendmsg-v3-2-208b87064c28@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 24 ++++++++++++++++++++++++ kernel/bpf/btf.c | 1 + net/ipv4/tcp.c | 2 ++ 3 files changed, 27 insertions(+) diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 1a40c41ff8c30..75d3d53a3832c 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -259,6 +259,30 @@ TRACE_EVENT(tcp_retransmit_synack, __entry->saddr_v6, __entry->daddr_v6) ); +TRACE_EVENT(tcp_sendmsg_locked, + TP_PROTO(const struct sock *sk, const struct msghdr *msg, + const struct sk_buff *skb, int size_goal), + + TP_ARGS(sk, msg, skb, size_goal), + + TP_STRUCT__entry( + __field(const void *, skb_addr) + __field(int, skb_len) + __field(int, msg_left) + __field(int, size_goal) + ), + + TP_fast_assign( + __entry->skb_addr = skb; + __entry->skb_len = skb ? 
skb->len : 0; + __entry->msg_left = msg_data_left(msg); + __entry->size_goal = size_goal; + ), + + TP_printk("skb_addr %p skb_len %d msg_left %d size_goal %d", + __entry->skb_addr, __entry->skb_len, __entry->msg_left, + __entry->size_goal)); + DECLARE_TRACE(tcp_cwnd_reduction_tp, TP_PROTO(const struct sock *sk, int newly_acked_sacked, int newly_lost, int flag), diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfab..24a26b4bb0b8a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6541,6 +6541,7 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { { "xprt_put_cong", 0x10 }, /* tcp */ { "tcp_send_reset", 0x11 }, + { "tcp_sendmsg_locked", 0x100 }, /* tegra_apb_dma */ { "tegra_dma_tx_status", 0x100 }, /* timer_migration */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6edc441b37023..e0e96f8fd47cb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1160,6 +1160,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (skb) copy = size_goal - skb->len; + trace_tcp_sendmsg_locked(sk, msg, skb, size_goal); + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; From 4145f00227ee80f21ab274e9cd9c09758e9bcf3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Krzysztof=20Ha=C5=82asa?= Date: Tue, 8 Apr 2025 13:59:41 +0200 Subject: [PATCH 053/770] usbnet: asix AX88772: leave the carrier control to phylink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASIX AX88772B based USB 10/100 Ethernet adapter doesn't come up ("carrier off"), despite the built-in 100BASE-FX PHY positive link indication. The internal PHY is configured (using EEPROM) in fixed 100 Mbps full duplex mode. The primary problem appears to be using carrier_netif_{on,off}() while, at the same time, delegating carrier management to phylink. Use only the latter and remove "manual control" in the asix driver. I don't have any other AX88772 board here, but the problem doesn't seem specific to a particular board or settings - it's probably timing-dependent. Remove unused asix_adjust_link() as well. 
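The underlying rule: once phylink is attached, the core toggles the netif carrier
from its own resolve worker, so the MAC callbacks should only program hardware. A
minimal sketch of that division of labour (illustrative, not the asix code):

'''
/* phylink calls netif_carrier_on()/netif_carrier_off() around these
 * callbacks itself; a driver that also toggles the carrier (as asix
 * did via usbnet_link_change()) fights the core's state machine.
 */
static void example_mac_link_up(struct phylink_config *config,
				struct phy_device *phy, unsigned int mode,
				phy_interface_t interface, int speed,
				int duplex, bool tx_pause, bool rx_pause)
{
	/* write speed/duplex/pause to MAC registers only */
}
'''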
Signed-off-by: Krzysztof Hałasa Tested-by: Oleksij Rempel Link: https://patch.msgid.link/m3plhmdfte.fsf_-_@t19.piap.pl Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix.h | 1 - drivers/net/usb/asix_common.c | 22 ---------------------- drivers/net/usb/asix_devices.c | 17 ++++------------- 3 files changed, 4 insertions(+), 36 deletions(-) diff --git a/drivers/net/usb/asix.h b/drivers/net/usb/asix.h index 74162190bccc1..8531b804021aa 100644 --- a/drivers/net/usb/asix.h +++ b/drivers/net/usb/asix.h @@ -224,7 +224,6 @@ int asix_write_rx_ctl(struct usbnet *dev, u16 mode, int in_pm); u16 asix_read_medium_status(struct usbnet *dev, int in_pm); int asix_write_medium_mode(struct usbnet *dev, u16 mode, int in_pm); -void asix_adjust_link(struct net_device *netdev); int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm); diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 72ffc89b477ad..7fd763917ae2c 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -414,28 +414,6 @@ int asix_write_medium_mode(struct usbnet *dev, u16 mode, int in_pm) return ret; } -/* set MAC link settings according to information from phylib */ -void asix_adjust_link(struct net_device *netdev) -{ - struct phy_device *phydev = netdev->phydev; - struct usbnet *dev = netdev_priv(netdev); - u16 mode = 0; - - if (phydev->link) { - mode = AX88772_MEDIUM_DEFAULT; - - if (phydev->duplex == DUPLEX_HALF) - mode &= ~AX_MEDIUM_FD; - - if (phydev->speed != SPEED_100) - mode &= ~AX_MEDIUM_PS; - } - - asix_write_medium_mode(dev, mode, 0); - phy_print_status(phydev); - usbnet_link_change(dev, phydev->link, 0); -} - int asix_write_gpio(struct usbnet *dev, u16 value, int sleep, int in_pm) { int ret; diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index da24941a6e444..9b0318fb50b55 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -752,7 +752,6 @@ static void ax88772_mac_link_down(struct phylink_config *config, struct usbnet *dev = netdev_priv(to_net_dev(config->dev)); asix_write_medium_mode(dev, 0, 0); - usbnet_link_change(dev, false, false); } static void ax88772_mac_link_up(struct phylink_config *config, @@ -783,7 +782,6 @@ static void ax88772_mac_link_up(struct phylink_config *config, m |= AX_MEDIUM_RFC; asix_write_medium_mode(dev, m, 0); - usbnet_link_change(dev, true, false); } static const struct phylink_mac_ops ax88772_phylink_mac_ops = { @@ -1350,10 +1348,9 @@ static const struct driver_info ax88772_info = { .description = "ASIX AX88772 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, }; @@ -1362,11 +1359,9 @@ static const struct driver_info ax88772b_info = { .description = "ASIX AX88772B USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, @@ -1376,11 +1371,9 @@ static const struct driver_info lxausb_t1l_info = { .description = "Linux Automation GmbH USB 10Base-T1L", .bind = ax88772_bind, .unbind = 
ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, .stop = ax88772_stop, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, @@ -1412,10 +1405,8 @@ static const struct driver_info hg20f9_info = { .description = "HG20F9 USB 2.0 Ethernet", .bind = ax88772_bind, .unbind = ax88772_unbind, - .status = asix_status, .reset = ax88772_reset, - .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_LINK_INTR | - FLAG_MULTI_PACKET, + .flags = FLAG_ETHER | FLAG_FRAMING_AX | FLAG_MULTI_PACKET, .rx_fixup = asix_rx_fixup_common, .tx_fixup = asix_tx_fixup, .data = FLAG_EEPROM_MAC, From cd5e64fb959a98e2d3122c7e944f17ffa6d0448e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:46 -0700 Subject: [PATCH 054/770] netlink: specs: rename rtnetlink specs in accordance with family name The rtnetlink family names are set to rt-$name within the YAML but the files are called rt_$name. C codegen assumes that the generated file name will match the family. The use of dashes is in line with our general expectation that name properties in the spec use dashes not underscores (even tho, as Donald points out most genl families use underscores in the name). We have 3 un-ideal options to choose from: - accept the slight inconsistency with old families using _, or - accept the slight annoyance with all languages having to do s/-/_/ when looking up family ID, or - accept the inconsistency with all name properties in new YAML spec being separated with - and just the family name always using _. Pick option 1 and rename the rtnl spec files. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/{rt_addr.yaml => rt-addr.yaml} | 0 Documentation/netlink/specs/{rt_link.yaml => rt-link.yaml} | 0 Documentation/netlink/specs/{rt_neigh.yaml => rt-neigh.yaml} | 0 Documentation/netlink/specs/{rt_route.yaml => rt-route.yaml} | 0 Documentation/netlink/specs/{rt_rule.yaml => rt-rule.yaml} | 0 Documentation/userspace-api/netlink/netlink-raw.rst | 2 +- tools/testing/selftests/net/lib/py/ynl.py | 4 ++-- 7 files changed, 3 insertions(+), 3 deletions(-) rename Documentation/netlink/specs/{rt_addr.yaml => rt-addr.yaml} (100%) rename Documentation/netlink/specs/{rt_link.yaml => rt-link.yaml} (100%) rename Documentation/netlink/specs/{rt_neigh.yaml => rt-neigh.yaml} (100%) rename Documentation/netlink/specs/{rt_route.yaml => rt-route.yaml} (100%) rename Documentation/netlink/specs/{rt_rule.yaml => rt-rule.yaml} (100%) diff --git a/Documentation/netlink/specs/rt_addr.yaml b/Documentation/netlink/specs/rt-addr.yaml similarity index 100% rename from Documentation/netlink/specs/rt_addr.yaml rename to Documentation/netlink/specs/rt-addr.yaml diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt-link.yaml similarity index 100% rename from Documentation/netlink/specs/rt_link.yaml rename to Documentation/netlink/specs/rt-link.yaml diff --git a/Documentation/netlink/specs/rt_neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml similarity index 100% rename from Documentation/netlink/specs/rt_neigh.yaml rename to Documentation/netlink/specs/rt-neigh.yaml diff --git a/Documentation/netlink/specs/rt_route.yaml b/Documentation/netlink/specs/rt-route.yaml similarity index 100% rename from 
Documentation/netlink/specs/rt_route.yaml rename to Documentation/netlink/specs/rt-route.yaml diff --git a/Documentation/netlink/specs/rt_rule.yaml b/Documentation/netlink/specs/rt-rule.yaml similarity index 100% rename from Documentation/netlink/specs/rt_rule.yaml rename to Documentation/netlink/specs/rt-rule.yaml diff --git a/Documentation/userspace-api/netlink/netlink-raw.rst b/Documentation/userspace-api/netlink/netlink-raw.rst index 1990eea772d08..31fc91020eb34 100644 --- a/Documentation/userspace-api/netlink/netlink-raw.rst +++ b/Documentation/userspace-api/netlink/netlink-raw.rst @@ -62,7 +62,7 @@ Sub-messages ------------ Several raw netlink families such as -:doc:`rt_link<../../networking/netlink_spec/rt_link>` and +:doc:`rt-link<../../networking/netlink_spec/rt-link>` and :doc:`tc<../../networking/netlink_spec/tc>` use attribute nesting as an abstraction to carry module specific information. diff --git a/tools/testing/selftests/net/lib/py/ynl.py b/tools/testing/selftests/net/lib/py/ynl.py index 8986c584cb371..6329ae805abff 100644 --- a/tools/testing/selftests/net/lib/py/ynl.py +++ b/tools/testing/selftests/net/lib/py/ynl.py @@ -39,12 +39,12 @@ def __init__(self, recv_size=0): class RtnlFamily(YnlFamily): def __init__(self, recv_size=0): - super().__init__((SPEC_PATH / Path('rt_link.yaml')).as_posix(), + super().__init__((SPEC_PATH / Path('rt-link.yaml')).as_posix(), schema='', recv_size=recv_size) class RtnlAddrFamily(YnlFamily): def __init__(self, recv_size=0): - super().__init__((SPEC_PATH / Path('rt_addr.yaml')).as_posix(), + super().__init__((SPEC_PATH / Path('rt-addr.yaml')).as_posix(), schema='', recv_size=recv_size) class NetdevFamily(YnlFamily): From 97a33caa90715f90b93610a1fc6e5deb4c21e8d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:47 -0700 Subject: [PATCH 055/770] netlink: specs: rt-route: specify fixed-header at operations level The C codegen currently stores the fixed-header as part of family info, so it only supports one fixed-header type per spec. Luckily all rtm route message have the same fixed header so just move it up to the higher level. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-route.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/rt-route.yaml b/Documentation/netlink/specs/rt-route.yaml index 292469c7d4b9f..6fa3fa24305ed 100644 --- a/Documentation/netlink/specs/rt-route.yaml +++ b/Documentation/netlink/specs/rt-route.yaml @@ -245,12 +245,12 @@ attribute-sets: operations: enum-model: directional + fixed-header: rtmsg list: - name: getroute doc: Dump route information. attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 26 @@ -320,7 +320,6 @@ operations: name: newroute doc: Create a new route attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 24 @@ -329,7 +328,6 @@ operations: name: delroute doc: Delete an existing route attribute-set: route-attrs - fixed-header: rtmsg do: request: value: 25 From d460016e7bca47b25951853c5f74c1632ccddb3b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:48 -0700 Subject: [PATCH 056/770] netlink: specs: rt-addr: remove the fixed members from attrs The purpose of the attribute list is to list the attributes which will be included in a given message to shrink the objects for families with huge attr spaces. 
Fixed headers are always present in their entirety (between netlink header and the attrs) so there's no point in listing their members. Current C codegen doesn't expect them and tries to look them up in the attribute space. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-addr.yaml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/Documentation/netlink/specs/rt-addr.yaml b/Documentation/netlink/specs/rt-addr.yaml index df6b23f06a228..0488ce87506ca 100644 --- a/Documentation/netlink/specs/rt-addr.yaml +++ b/Documentation/netlink/specs/rt-addr.yaml @@ -133,11 +133,6 @@ operations: request: value: 20 attributes: &ifaddr-all - - ifa-family - - ifa-flags - - ifa-prefixlen - - ifa-scope - - ifa-index - address - label - local @@ -150,11 +145,6 @@ operations: request: value: 21 attributes: - - ifa-family - - ifa-flags - - ifa-prefixlen - - ifa-scope - - ifa-index - address - local - @@ -164,8 +154,7 @@ operations: dump: request: value: 22 - attributes: - - ifa-index + attributes: [] reply: value: 20 attributes: *ifaddr-all @@ -177,9 +166,7 @@ operations: do: request: value: 58 - attributes: - - ifa-family - - ifa-index + attributes: [] reply: value: 58 attributes: &mcaddr-attrs @@ -188,8 +175,7 @@ operations: dump: request: value: 58 - attributes: - - ifa-family + attributes: [] reply: value: 58 attributes: *mcaddr-attrs From 295ff1e9520185de63e747aec63949e54f883c2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:49 -0700 Subject: [PATCH 057/770] netlink: specs: rt-route: remove the fixed members from attrs The purpose of the attribute list is to list the attributes which will be included in a given message to shrink the objects for families with huge attr spaces. Fixed headers are always present in their entirety (between netlink header and the attrs) so there's no point in listing their members. Current C codegen doesn't expect them and tries to look them up in the attribute space. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-route.yaml | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/Documentation/netlink/specs/rt-route.yaml b/Documentation/netlink/specs/rt-route.yaml index 6fa3fa24305ed..c7c6f776ab2f1 100644 --- a/Documentation/netlink/specs/rt-route.yaml +++ b/Documentation/netlink/specs/rt-route.yaml @@ -255,11 +255,8 @@ operations: request: value: 26 attributes: - - rtm-family - src - - rtm-src-len - dst - - rtm-dst-len - iif - oif - ip-proto @@ -271,15 +268,6 @@ operations: reply: value: 24 attributes: &all-route-attrs - - rtm-family - - rtm-dst-len - - rtm-src-len - - rtm-tos - - rtm-table - - rtm-protocol - - rtm-scope - - rtm-type - - rtm-flags - dst - src - iif @@ -311,8 +299,7 @@ operations: dump: request: value: 26 - attributes: - - rtm-family + attributes: [] reply: value: 24 attributes: *all-route-attrs From 52d062362c05b17f86485ae206420140bbe0f649 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:50 -0700 Subject: [PATCH 058/770] netlink: specs: rt-addr: add C naming info Add properties needed for C codegen to match names with uAPI headers. 
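Concretely, these properties let the codegen emit identifiers that already exist in
the uAPI header instead of inventing new ones; for instance, with "name-prefix:
ifa-f-" the spec's "secondary" flag entry resolves to the existing constant. Sketch
of the correspondence (constants as defined in linux/if_addr.h):

'''
/* What the ifa-flags entries map onto in the existing uAPI header: */
#define IFA_F_SECONDARY	0x01
#define IFA_F_TEMPORARY	IFA_F_SECONDARY
'''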
Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-addr.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/netlink/specs/rt-addr.yaml b/Documentation/netlink/specs/rt-addr.yaml index 0488ce87506ca..4f86aa1075da7 100644 --- a/Documentation/netlink/specs/rt-addr.yaml +++ b/Documentation/netlink/specs/rt-addr.yaml @@ -2,6 +2,7 @@ name: rt-addr protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -49,6 +50,8 @@ definitions: - name: ifa-flags type: flags + name-prefix: ifa-f- + enum-name: entries: - name: secondary @@ -124,6 +127,7 @@ attribute-sets: operations: fixed-header: ifaddrmsg enum-model: directional + name-prefix: rtm- list: - name: newaddr From 1652e1f35dfb434862a595224ab2c4aa0244951f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:51 -0700 Subject: [PATCH 059/770] netlink: specs: rt-route: add C naming info Add properties needed for C codegen to match names with uAPI headers. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-route.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/netlink/specs/rt-route.yaml b/Documentation/netlink/specs/rt-route.yaml index c7c6f776ab2f1..800f3a823d473 100644 --- a/Documentation/netlink/specs/rt-route.yaml +++ b/Documentation/netlink/specs/rt-route.yaml @@ -2,6 +2,7 @@ name: rt-route protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -11,6 +12,7 @@ definitions: - name: rtm-type name-prefix: rtn- + enum-name: type: enum entries: - unspec @@ -246,6 +248,7 @@ attribute-sets: operations: enum-model: directional fixed-header: rtmsg + name-prefix: rtm- list: - name: getroute From 17b3ce292dcbf1aaface3d42859cff0e1dfd3bb6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:52 -0700 Subject: [PATCH 060/770] tools: ynl: support creating non-genl sockets Classic netlink has static family IDs specified in YAML, there is no family name -> ID lookup. Support providing the ID info to the library via the generated struct and make library use it. Since NETLINK_ROUTE is ID 0 we need an extra boolean to indicate classic_id is to be used. Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl.c | 51 +++++++++++++++++++++----------- tools/net/ynl/lib/ynl.h | 3 ++ tools/net/ynl/pyynl/ynl_gen_c.py | 9 ++++-- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index ce32cb35007d6..b9fda1a99453a 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -663,6 +663,7 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) struct sockaddr_nl addr; struct ynl_sock *ys; socklen_t addrlen; + int sock_type; int one = 1; ys = malloc(sizeof(*ys) + 2 * YNL_SOCKET_BUFFER_SIZE); @@ -675,7 +676,9 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) ys->rx_buf = &ys->raw_buf[YNL_SOCKET_BUFFER_SIZE]; ys->ntf_last_next = &ys->ntf_first; - ys->socket = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + sock_type = yf->is_classic ? 
yf->classic_id : NETLINK_GENERIC; + + ys->socket = socket(AF_NETLINK, SOCK_RAW, sock_type); if (ys->socket < 0) { __perr(yse, "failed to create a netlink socket"); goto err_free_sock; @@ -708,8 +711,9 @@ ynl_sock_create(const struct ynl_family *yf, struct ynl_error *yse) ys->portid = addr.nl_pid; ys->seq = random(); - - if (ynl_sock_read_family(ys, yf->name)) { + if (yf->is_classic) { + ys->family_id = yf->classic_id; + } else if (ynl_sock_read_family(ys, yf->name)) { if (yse) memcpy(yse, &ys->err, sizeof(*yse)); goto err_close_sock; @@ -791,13 +795,21 @@ static int ynl_ntf_parse(struct ynl_sock *ys, const struct nlmsghdr *nlh) struct ynl_parse_arg yarg = { .ys = ys, }; const struct ynl_ntf_info *info; struct ynl_ntf_base_type *rsp; - struct genlmsghdr *gehdr; + __u32 cmd; int ret; - gehdr = ynl_nlmsg_data(nlh); - if (gehdr->cmd >= ys->family->ntf_info_size) + if (ys->family->is_classic) { + cmd = nlh->nlmsg_type; + } else { + struct genlmsghdr *gehdr; + + gehdr = ynl_nlmsg_data(nlh); + cmd = gehdr->cmd; + } + + if (cmd >= ys->family->ntf_info_size) return YNL_PARSE_CB_ERROR; - info = &ys->family->ntf_info[gehdr->cmd]; + info = &ys->family->ntf_info[cmd]; if (!info->cb) return YNL_PARSE_CB_ERROR; @@ -811,7 +823,7 @@ static int ynl_ntf_parse(struct ynl_sock *ys, const struct nlmsghdr *nlh) goto err_free; rsp->family = nlh->nlmsg_type; - rsp->cmd = gehdr->cmd; + rsp->cmd = cmd; *ys->ntf_last_next = rsp; ys->ntf_last_next = &rsp->next; @@ -863,17 +875,22 @@ int ynl_error_parse(struct ynl_parse_arg *yarg, const char *msg) static int ynl_check_alien(struct ynl_sock *ys, const struct nlmsghdr *nlh, __u32 rsp_cmd) { - struct genlmsghdr *gehdr; + if (ys->family->is_classic) { + if (nlh->nlmsg_type != rsp_cmd) + return ynl_ntf_parse(ys, nlh); + } else { + struct genlmsghdr *gehdr; - if (ynl_nlmsg_data_len(nlh) < sizeof(*gehdr)) { - yerr(ys, YNL_ERROR_INV_RESP, - "Kernel responded with truncated message"); - return -1; - } + if (ynl_nlmsg_data_len(nlh) < sizeof(*gehdr)) { + yerr(ys, YNL_ERROR_INV_RESP, + "Kernel responded with truncated message"); + return -1; + } - gehdr = ynl_nlmsg_data(nlh); - if (gehdr->cmd != rsp_cmd) - return ynl_ntf_parse(ys, nlh); + gehdr = ynl_nlmsg_data(nlh); + if (gehdr->cmd != rsp_cmd) + return ynl_ntf_parse(ys, nlh); + } return 0; } diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 6cd570b283eab..59256e258130b 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -2,6 +2,7 @@ #ifndef __YNL_C_H #define __YNL_C_H 1 +#include #include #include #include @@ -48,6 +49,8 @@ struct ynl_family { /* private: */ const char *name; size_t hdr_len; + bool is_classic; + __u16 classic_id; const struct ynl_ntf_info *ntf_info; unsigned int ntf_info_size; }; diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index a1427c5370302..9e00aac4801c2 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -971,9 +971,6 @@ def __init__(self, file_name, exclude_ops): def resolve(self): self.resolve_up(super()) - if self.yaml.get('protocol', 'genetlink') not in {'genetlink', 'genetlink-c', 'genetlink-legacy'}: - raise Exception("Codegen only supported for genetlink") - self.c_name = c_lower(self.ident_name) if 'name-prefix' in self.yaml['operations']: self.op_prefix = c_upper(self.yaml['operations']['name-prefix']) @@ -1020,6 +1017,9 @@ def new_attr_set(self, elem): def new_operation(self, elem, req_value, rsp_value): return Operation(self, elem, req_value, rsp_value) + def is_classic(self): + return 
self.proto == 'netlink-raw' + def _mark_notify(self): for op in self.msgs.values(): if 'notify' in op: @@ -2730,6 +2730,9 @@ def render_user_family(family, cw, prototype): cw.block_start(f'{symbol} = ') cw.p(f'.name\t\t= "{family.c_name}",') + if family.is_classic(): + cw.p(f'.is_classic\t= true,') + cw.p(f'.classic_id\t= {family.get("protonum")},') if family.fixed_header: cw.p(f'.hdr_len\t= sizeof(struct genlmsghdr) + sizeof(struct {c_lower(family.fixed_header)}),') else: From e0a7903c323f37a71dcdafafa749dfb3beedff3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:53 -0700 Subject: [PATCH 061/770] tools: ynl-gen: don't consider requests with fixed hdr empty C codegen skips generating the structs if request/reply has no attrs. In such cases the request op takes no argument and return int (rather than response struct). In case of classic netlink a lot of information gets passed using the fixed struct, however, so adjust the logic to consider a request empty only if it has no attrs _and_ no fixed struct. Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250410014658.782120-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 9e00aac4801c2..04f1ac62cb01b 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1247,6 +1247,9 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): if op_mode == 'event': self.struct['reply'] = Struct(family, self.attr_set, type_list=op['event']['attributes']) + def type_empty(self, key): + return len(self.struct[key].attr_list) == 0 and self.fixed_hdr is None + class CodeWriter: def __init__(self, nlib, out_file=None, overwrite=True): @@ -2034,7 +2037,7 @@ def print_type_helpers(ri, direction, deref=False): def print_req_type_helpers(ri): - if len(ri.struct["request"].attr_list) == 0: + if ri.type_empty("request"): return print_alloc_wrapper(ri, "request") print_type_helpers(ri, "request") @@ -2057,7 +2060,7 @@ def print_parse_prototype(ri, direction, terminate=True): def print_req_type(ri): - if len(ri.struct["request"].attr_list) == 0: + if ri.type_empty("request"): return print_type(ri, "request") From 7e8ba0c7de2b3a5b90bc591177a3926b8c0e022d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:54 -0700 Subject: [PATCH 062/770] tools: ynl: don't use genlmsghdr in classic netlink Make sure the codegen calls the right YNL lib helper to start the request based on family type. Classic netlink request must not include the genl header. Conversely don't expect genl headers in the responses. 
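As a rough sketch (not the actual generated code), the request setup now
differs per family type, using the helpers from ynl-priv.h; here
family_is_classic and cmd are stand-ins, and RTM_GETADDR stands in for
whatever message type the operation resolves to:

	struct nlmsghdr *nlh;

	if (family_is_classic)
		/* classic netlink: attrs follow nlmsghdr + fixed header */
		nlh = ynl_msg_start_req(ys, RTM_GETADDR);
	else
		/* generic netlink: a genlmsghdr sits after the nlmsghdr */
		nlh = ynl_gemsg_start_req(ys, ys->family_id, cmd, 1);
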
Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/lib/ynl-priv.h | 3 +++ tools/net/ynl/lib/ynl.c | 8 ++++---- tools/net/ynl/pyynl/ynl_gen_c.py | 19 +++++++++++++++---- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/tools/net/ynl/lib/ynl-priv.h b/tools/net/ynl/lib/ynl-priv.h index 3c09a7bbfba59..634eb16548b93 100644 --- a/tools/net/ynl/lib/ynl-priv.h +++ b/tools/net/ynl/lib/ynl-priv.h @@ -94,6 +94,9 @@ struct ynl_ntf_base_type { unsigned char data[] __attribute__((aligned(8))); }; +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id); +struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id); + struct nlmsghdr * ynl_gemsg_start_req(struct ynl_sock *ys, __u32 id, __u8 cmd, __u8 version); struct nlmsghdr * diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index b9fda1a99453a..70f899a540076 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -451,14 +451,14 @@ ynl_gemsg_start(struct ynl_sock *ys, __u32 id, __u16 flags, return nlh; } -void ynl_msg_start_req(struct ynl_sock *ys, __u32 id) +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id) { - ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK); + return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK); } -void ynl_msg_start_dump(struct ynl_sock *ys, __u32 id) +struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id) { - ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP); + return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP); } struct nlmsghdr * diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 04f1ac62cb01b..b0b47a493a864 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1710,7 +1710,10 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p(f'dst->{arg} = {arg};') if ri.fixed_hdr: - ri.cw.p('hdr = ynl_nlmsg_data_offset(nlh, sizeof(struct genlmsghdr));') + if ri.family.is_classic(): + ri.cw.p('hdr = ynl_nlmsg_data(nlh);') + else: + ri.cw.p('hdr = ynl_nlmsg_data_offset(nlh, sizeof(struct genlmsghdr));') ri.cw.p(f"memcpy(&dst->_hdr, hdr, sizeof({ri.fixed_hdr}));") for anest in sorted(all_multi): aspec = struct[anest] @@ -1857,7 +1860,10 @@ def print_req(ri): ri.cw.block_start() ri.cw.write_func_lvar(local_vars) - ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") + if ri.family.is_classic(): + ri.cw.p(f"nlh = ynl_msg_start_req(ys, {ri.op.enum_name});") + else: + ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;") if 'reply' in ri.op[ri.op_mode]: @@ -1926,7 +1932,10 @@ def print_dump(ri): else: ri.cw.p(f'yds.rsp_cmd = {ri.op.rsp_value};') ri.cw.nl() - ri.cw.p(f"nlh = ynl_gemsg_start_dump(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") + if ri.family.is_classic(): + ri.cw.p(f"nlh = ynl_msg_start_dump(ys, {ri.op.enum_name});") + else: + ri.cw.p(f"nlh = ynl_gemsg_start_dump(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") if ri.fixed_hdr: ri.cw.p("hdr_len = sizeof(req->_hdr);") @@ -2736,7 +2745,9 @@ def render_user_family(family, cw, prototype): if family.is_classic(): cw.p(f'.is_classic\t= true,') cw.p(f'.classic_id\t= {family.get("protonum")},') - if family.fixed_header: + if family.is_classic(): + cw.p(f'.hdr_len\t= sizeof(struct {c_lower(family.fixed_header)}),') + 
elif family.fixed_header: cw.p(f'.hdr_len\t= sizeof(struct genlmsghdr) + sizeof(struct {c_lower(family.fixed_header)}),') else: cw.p('.hdr_len\t= sizeof(struct genlmsghdr),') From e8025e72aad6d74504c4918c81b532e1fb219b91 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:55 -0700 Subject: [PATCH 063/770] tools: ynl-gen: consider dump ops without a do "type-consistent" If the type for the response to do and dump are the same we don't generate it twice. This is called "type_consistent" in the generator. Consider operations which only have dump to also be consistent. This removes unnecessary "_dump" from the names. There's a number of GET ops in classic Netlink which only have dump handlers. Make sure we output the "onesided" types, normally if the type is consistent we only output it when we render the do structures. Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250410014658.782120-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index b0b47a493a864..efed498ccb35c 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1212,6 +1212,7 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): # 'do' and 'dump' response parsing is identical self.type_consistent = True + self.type_oneside = False if op_mode != 'do' and 'dump' in op: if 'do' in op: if ('reply' in op['do']) != ('reply' in op["dump"]): @@ -1219,7 +1220,8 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): elif 'reply' in op['do'] and op["do"]["reply"] != op["dump"]["reply"]: self.type_consistent = False else: - self.type_consistent = False + self.type_consistent = True + self.type_oneside = True self.attr_set = attr_set if not self.attr_set: @@ -1516,7 +1518,9 @@ def op_prefix(ri, direction, deref=False): suffix += f"{direction_to_suffix[direction]}" else: if direction == 'request': - suffix += '_req_dump' + suffix += '_req' + if not ri.type_oneside: + suffix += '_dump' else: if ri.type_consistent: if deref: @@ -1995,7 +1999,7 @@ def _print_type(ri, direction, struct): if not direction and ri.type_name_conflict: suffix += '_' - if ri.op_mode == 'dump': + if ri.op_mode == 'dump' and not ri.type_oneside: suffix += '_dump' ri.cw.block_start(line=f"struct {ri.family.c_name}{suffix}") @@ -2979,7 +2983,7 @@ def main(): ri = RenderInfo(cw, parsed, args.mode, op, 'dump') print_req_type(ri) print_req_type_helpers(ri) - if not ri.type_consistent: + if not ri.type_consistent or ri.type_oneside: print_rsp_type(ri) print_wrapped_type(ri) print_dump_prototype(ri) @@ -3057,7 +3061,7 @@ def main(): if 'dump' in op: cw.p(f"/* {op.enum_name} - dump */") ri = RenderInfo(cw, parsed, args.mode, op, "dump") - if not ri.type_consistent: + if not ri.type_consistent or ri.type_oneside: parse_rsp_msg(ri, deref=True) print_req_free(ri) print_dump_type_free(ri) From 882e7b1365cedae2bb956e17a65bb697cde35e84 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:56 -0700 Subject: [PATCH 064/770] tools: ynl-gen: use family c-name in notifications Family names may include dashes. Fix notification handling code gen to the c-compatible name. 
Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-12-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index efed498ccb35c..a9966ff2a1432 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -2726,7 +2726,7 @@ def render_user_family(family, cw, prototype): return if family.ntfs: - cw.block_start(line=f"static const struct ynl_ntf_info {family['name']}_ntf_info[] = ") + cw.block_start(line=f"static const struct ynl_ntf_info {family.c_name}_ntf_info[] = ") for ntf_op_name, ntf_op in family.ntfs.items(): if 'notify' in ntf_op: op = family.ops[ntf_op['notify']] @@ -2756,8 +2756,8 @@ def render_user_family(family, cw, prototype): else: cw.p('.hdr_len\t= sizeof(struct genlmsghdr),') if family.ntfs: - cw.p(f".ntf_info\t= {family['name']}_ntf_info,") - cw.p(f".ntf_info_size\t= YNL_ARRAY_SIZE({family['name']}_ntf_info),") + cw.p(f".ntf_info\t= {family.c_name}_ntf_info,") + cw.p(f".ntf_info_size\t= YNL_ARRAY_SIZE({family.c_name}_ntf_info),") cw.block_end(line=';') From 29d34a4d785bbf389d57bfdafe2a19dad6ced3a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:57 -0700 Subject: [PATCH 065/770] tools: ynl: generate code for rt-addr and add a sample YNL C can now generate code for simple classic netlink families. Include rt-addr in the Makefile for generation and add a sample. $ ./tools/net/ynl/samples/rt-addr lo: 127.0.0.1 wlp0s20f3: 192.168.1.101 lo: :: wlp0s20f3: fe80::6385:be6:746e:8116 vpn0: fe80::3597:d353:b5a7:66dd Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-13-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 1 + tools/net/ynl/generated/Makefile | 2 +- tools/net/ynl/samples/.gitignore | 3 +- tools/net/ynl/samples/rt-addr.c | 80 ++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 tools/net/ynl/samples/rt-addr.c diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index f3269ce39e5b7..e55d94211df6c 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -29,4 +29,5 @@ CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) CFLAGS_tcp_metrics:=$(call get_hdr_inc,_LINUX_TCP_METRICS_H,tcp_metrics.h) diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile index 21f9e299dc759..67ce3b8988ef0 100644 --- a/tools/net/ynl/generated/Makefile +++ b/tools/net/ynl/generated/Makefile @@ -25,7 +25,7 @@ SPECS_DIR:=../../../../Documentation/netlink/specs GENS_PATHS=$(shell grep -nrI --files-without-match \ 'protocol: netlink' \ $(SPECS_DIR)) -GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) +GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) rt-addr SRCS=$(patsubst %,%-user.c,${GENS}) HDRS=$(patsubst %,%-user.h,${GENS}) OBJS=$(patsubst %,%-user.o,${GENS}) diff --git a/tools/net/ynl/samples/.gitignore b/tools/net/ynl/samples/.gitignore index dda6686257a7c..2bc8721d61441 100644 --- a/tools/net/ynl/samples/.gitignore +++ 
b/tools/net/ynl/samples/.gitignore @@ -2,4 +2,5 @@ ethtool devlink netdev ovs -page-pool \ No newline at end of file +page-pool +rt-addr diff --git a/tools/net/ynl/samples/rt-addr.c b/tools/net/ynl/samples/rt-addr.c new file mode 100644 index 0000000000000..0f4851b4ec57b --- /dev/null +++ b/tools/net/ynl/samples/rt-addr.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include +#include + +#include "rt-addr-user.h" + +static void rt_addr_print(struct rt_addr_getaddr_rsp *a) +{ + char ifname[IF_NAMESIZE]; + char addr_str[64]; + const char *addr; + const char *name; + + name = if_indextoname(a->_hdr.ifa_index, ifname); + if (name) + printf("%16s: ", name); + + switch (a->_present.address_len) { + case 4: + addr = inet_ntop(AF_INET, a->address, + addr_str, sizeof(addr_str)); + break; + case 16: + addr = inet_ntop(AF_INET6, a->address, + addr_str, sizeof(addr_str)); + break; + default: + addr = NULL; + break; + } + if (addr) + printf("%s", addr); + else + printf("[%d]", a->_present.address_len); + + printf("\n"); +} + +int main(int argc, char **argv) +{ + struct rt_addr_getaddr_list *rsp; + struct rt_addr_getaddr_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_addr_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return 1; + } + + req = rt_addr_getaddr_req_alloc(); + if (!req) + goto err_destroy; + + rsp = rt_addr_getaddr_dump(ys, req); + rt_addr_getaddr_req_free(req); + if (!rsp) + goto err_close; + + if (ynl_dump_empty(rsp)) + fprintf(stderr, "Error: no addresses reported\n"); + ynl_dump_foreach(rsp, addr) + rt_addr_print(addr); + rt_addr_getaddr_list_free(rsp); + + ynl_sock_destroy(ys); + return 0; + +err_close: + fprintf(stderr, "YNL: %s\n", ys->err.msg); +err_destroy: + ynl_sock_destroy(ys); + return 2; +} From 54d790856c73d71240835e7929235bdf710b57f4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:46:58 -0700 Subject: [PATCH 066/770] tools: ynl: generate code for rt-route and add a sample YNL C can now generate code for simple classic netlink families. Include rt-route in the Makefile for generation and add a sample. 
$ ./tools/net/ynl/samples/rt-route oif: wlp0s20f3 gateway: 192.168.1.1 oif: wlp0s20f3 dst: 192.168.1.0/24 oif: vpn0 dst: fe80::/64 oif: wlp0s20f3 dst: fe80::/64 oif: wlp0s20f3 gateway: fe80::200:5eff:fe00:201 Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250410014658.782120-14-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 1 + tools/net/ynl/generated/Makefile | 2 +- tools/net/ynl/samples/.gitignore | 1 + tools/net/ynl/samples/rt-route.c | 80 ++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 tools/net/ynl/samples/rt-route.c diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index e55d94211df6c..385783489f849 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -30,4 +30,5 @@ CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) +CFLAGS_rt-route:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) CFLAGS_tcp_metrics:=$(call get_hdr_inc,_LINUX_TCP_METRICS_H,tcp_metrics.h) diff --git a/tools/net/ynl/generated/Makefile b/tools/net/ynl/generated/Makefile index 67ce3b8988ef0..6603ad8d4ce11 100644 --- a/tools/net/ynl/generated/Makefile +++ b/tools/net/ynl/generated/Makefile @@ -25,7 +25,7 @@ SPECS_DIR:=../../../../Documentation/netlink/specs GENS_PATHS=$(shell grep -nrI --files-without-match \ 'protocol: netlink' \ $(SPECS_DIR)) -GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) rt-addr +GENS=$(patsubst $(SPECS_DIR)/%.yaml,%,${GENS_PATHS}) rt-addr rt-route SRCS=$(patsubst %,%-user.c,${GENS}) HDRS=$(patsubst %,%-user.h,${GENS}) OBJS=$(patsubst %,%-user.o,${GENS}) diff --git a/tools/net/ynl/samples/.gitignore b/tools/net/ynl/samples/.gitignore index 2bc8721d61441..7f9781cf532fd 100644 --- a/tools/net/ynl/samples/.gitignore +++ b/tools/net/ynl/samples/.gitignore @@ -4,3 +4,4 @@ netdev ovs page-pool rt-addr +rt-route diff --git a/tools/net/ynl/samples/rt-route.c b/tools/net/ynl/samples/rt-route.c new file mode 100644 index 0000000000000..9d9c868f88733 --- /dev/null +++ b/tools/net/ynl/samples/rt-route.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include + +#include +#include + +#include "rt-route-user.h" + +static void rt_route_print(struct rt_route_getroute_rsp *r) +{ + char ifname[IF_NAMESIZE]; + char route_str[64]; + const char *route; + const char *name; + + /* Ignore local */ + if (r->_hdr.rtm_table == RT_TABLE_LOCAL) + return; + + if (r->_present.oif) { + name = if_indextoname(r->oif, ifname); + if (name) + printf("oif: %-16s ", name); + } + + if (r->_present.dst_len) { + route = inet_ntop(r->_hdr.rtm_family, r->dst, + route_str, sizeof(route_str)); + printf("dst: %s/%d", route, r->_hdr.rtm_dst_len); + } + + if (r->_present.gateway_len) { + route = inet_ntop(r->_hdr.rtm_family, r->gateway, + route_str, sizeof(route_str)); + printf("gateway: %s ", route); + } + + printf("\n"); +} + +int main(int argc, char **argv) +{ + struct rt_route_getroute_req_dump *req; + struct rt_route_getroute_list *rsp; + struct ynl_error yerr; + struct ynl_sock *ys; + + ys = ynl_sock_create(&ynl_rt_route_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return 1; + } + + req = rt_route_getroute_req_dump_alloc(); + if (!req) + goto err_destroy; + + rsp = 
rt_route_getroute_dump(ys, req);
+	rt_route_getroute_req_dump_free(req);
+	if (!rsp)
+		goto err_close;
+
+	if (ynl_dump_empty(rsp))
+		fprintf(stderr, "Error: no routes reported\n");
+	ynl_dump_foreach(rsp, route)
+		rt_route_print(route);
+	rt_route_getroute_list_free(rsp);
+
+	ynl_sock_destroy(ys);
+	return 0;
+
+err_close:
+	fprintf(stderr, "YNL: %s\n", ys->err.msg);
+err_destroy:
+	ynl_sock_destroy(ys);
+	return 2;
+}

From 3b4f78f9ad292ad57e3db483332ce7bc799d0d49 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao
Date: Wed, 9 Apr 2025 11:33:21 +0800
Subject: [PATCH 067/770] ipv4: remove unnecessary judgment in
 ip_route_output_key_hash_rcu

In the ip_route_output_key_hash_rcu function, the input fl4 member saddr
is first checked to be non-zero before entering the multicast, broadcast
and zeronet address checks. However, a non-zero saddr already rules out
the all-zero (zeronet) address, so the ipv4_is_zeronet() check is
redundant; remove it.

Signed-off-by: Zhengchao Shao
Reviewed-by: David Ahern
Link: https://patch.msgid.link/20250409033321.108244-1-shaozhengchao@163.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/route.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 753704f75b2c6..22dfc971aab4b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2699,8 +2699,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
 
 	if (fl4->saddr) {
 		if (ipv4_is_multicast(fl4->saddr) ||
-		    ipv4_is_lbcast(fl4->saddr) ||
-		    ipv4_is_zeronet(fl4->saddr)) {
+		    ipv4_is_lbcast(fl4->saddr)) {
 			rth = ERR_PTR(-EINVAL);
 			goto out;
 		}

From 8c40d99e5f43e0545a3f4fea9156313847e2eb79 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Wed, 9 Apr 2025 21:05:37 +0200
Subject: [PATCH 068/770] r8169: add helper rtl_csi_mod for accessing extended
 config space

Add a helper for the Realtek-specific mechanism for accessing extended
config space if native access isn't possible. This avoids code
duplication.
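The effect at a call site, taking the ASPM entry latency path from the
diff below as an example:

	/* before: open-coded read-modify-write */
	csi = rtl_csi_read(tp, 0x070c) & 0x00ffffff;
	rtl_csi_write(tp, 0x070c, csi | val << 24);

	/* after: one call, clear the mask then set the new bits */
	rtl_csi_mod(tp, 0x070c, 0xff000000, val << 24);
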
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/b368fd91-57d7-4cb5-9342-98b4d8fe9aea@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 26 ++++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4eebd9cb40a37..8e62b109518b2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2852,10 +2852,23 @@ static u32 rtl_csi_read(struct rtl8169_private *tp, int addr) RTL_R32(tp, CSIDR) : ~0; } +static void rtl_csi_mod(struct rtl8169_private *tp, int addr, + u32 mask, u32 set) +{ + u32 val; + + WARN(addr % 4, "Invalid CSI address %#x\n", addr); + + netdev_notice_once(tp->dev, + "No native access to PCI extended config space, falling back to CSI\n"); + + val = rtl_csi_read(tp, addr); + rtl_csi_write(tp, addr, (val & ~mask) | set); +} + static void rtl_disable_zrxdc_timeout(struct rtl8169_private *tp) { struct pci_dev *pdev = tp->pci_dev; - u32 csi; int rc; u8 val; @@ -2872,16 +2885,12 @@ static void rtl_disable_zrxdc_timeout(struct rtl8169_private *tp) } } - netdev_notice_once(tp->dev, - "No native access to PCI extended config space, falling back to CSI\n"); - csi = rtl_csi_read(tp, RTL_GEN3_RELATED_OFF); - rtl_csi_write(tp, RTL_GEN3_RELATED_OFF, csi & ~RTL_GEN3_ZRXDC_NONCOMPL); + rtl_csi_mod(tp, RTL_GEN3_RELATED_OFF, RTL_GEN3_ZRXDC_NONCOMPL, 0); } static void rtl_set_aspm_entry_latency(struct rtl8169_private *tp, u8 val) { struct pci_dev *pdev = tp->pci_dev; - u32 csi; /* According to Realtek the value at config space address 0x070f * controls the L0s/L1 entrance latency. We try standard ECAM access @@ -2893,10 +2902,7 @@ static void rtl_set_aspm_entry_latency(struct rtl8169_private *tp, u8 val) pci_write_config_byte(pdev, 0x070f, val) == PCIBIOS_SUCCESSFUL) return; - netdev_notice_once(tp->dev, - "No native access to PCI extended config space, falling back to CSI\n"); - csi = rtl_csi_read(tp, 0x070c) & 0x00ffffff; - rtl_csi_write(tp, 0x070c, csi | val << 24); + rtl_csi_mod(tp, 0x070c, 0xff000000, val << 24); } static void rtl_set_def_aspm_entry_latency(struct rtl8169_private *tp) From 0c49baf099ba2147a6ff3bbdc3197c6ddbee5469 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 9 Apr 2025 21:14:47 +0200 Subject: [PATCH 069/770] r8169: add helper rtl8125_phy_param The integrated PHY's of RTL8125/8126 have an own mechanism to access PHY parameters, similar to what r8168g_phy_param does on earlier PHY versions. Add helper rtl8125_phy_param to simplify the code. 
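The effect at a call site, taking one sequence from the diff below as an
example:

	/* before: page select, parameter select, modify, page restore */
	phy_write(phydev, 0x1f, 0x0b87);
	phy_write(phydev, 0x16, 0x8088);
	phy_modify(phydev, 0x17, 0xff00, 0x9000);
	phy_write(phydev, 0x1f, 0x0000);

	/* after */
	rtl8125_phy_param(phydev, 0x8088, 0xff00, 0x9000);
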
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/847b7356-12d6-441b-ade9-4b6e1539b84a@gmail.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/realtek/r8169_phy_config.c | 36 +++++++++---------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index cf95e579c65d7..748ca8b212ba1 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -50,6 +50,15 @@ static void r8168g_phy_param(struct phy_device *phydev, u16 parm, phy_restore_page(phydev, oldpage, 0); } +static void rtl8125_phy_param(struct phy_device *phydev, u16 parm, + u16 mask, u16 val) +{ + phy_lock_mdio_bus(phydev); + __phy_write_mmd(phydev, MDIO_MMD_VEND2, 0xb87c, parm); + __phy_modify_mmd(phydev, MDIO_MMD_VEND2, 0xb87e, mask, val); + phy_unlock_mdio_bus(phydev); +} + struct phy_reg { u16 reg; u16 val; @@ -1004,12 +1013,8 @@ static void rtl8125a_2_hw_phy_config(struct rtl8169_private *tp, phy_write_paged(phydev, 0xac5, 0x16, 0x01ff); phy_modify_paged(phydev, 0xac8, 0x15, 0x00f0, 0x0030); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x80a2); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x16, 0x809c); - phy_write(phydev, 0x17, 0x0153); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x80a2, 0xffff, 0x0153); + rtl8125_phy_param(phydev, 0x809c, 0xffff, 0x0153); phy_write(phydev, 0x1f, 0x0a43); phy_write(phydev, 0x13, 0x81B3); @@ -1061,14 +1066,9 @@ static void rtl8125b_hw_phy_config(struct rtl8169_private *tp, phy_modify_paged(phydev, 0xac4, 0x13, 0x00f0, 0x0090); phy_modify_paged(phydev, 0xad3, 0x10, 0x0003, 0x0001); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x80f5); - phy_write(phydev, 0x17, 0x760e); - phy_write(phydev, 0x16, 0x8107); - phy_write(phydev, 0x17, 0x360e); - phy_write(phydev, 0x16, 0x8551); - phy_modify(phydev, 0x17, 0xff00, 0x0800); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x80f5, 0xffff, 0x760e); + rtl8125_phy_param(phydev, 0x8107, 0xffff, 0x360e); + rtl8125_phy_param(phydev, 0x8551, 0xff00, 0x0800); phy_modify_paged(phydev, 0xbf0, 0x10, 0xe000, 0xa000); phy_modify_paged(phydev, 0xbf4, 0x13, 0x0f00, 0x0300); @@ -1110,12 +1110,8 @@ static void rtl8125bp_hw_phy_config(struct rtl8169_private *tp, r8168g_phy_param(phydev, 0x8010, 0x0800, 0x0000); - phy_write(phydev, 0x1f, 0x0b87); - phy_write(phydev, 0x16, 0x8088); - phy_modify(phydev, 0x17, 0xff00, 0x9000); - phy_write(phydev, 0x16, 0x808f); - phy_modify(phydev, 0x17, 0xff00, 0x9000); - phy_write(phydev, 0x1f, 0x0000); + rtl8125_phy_param(phydev, 0x8088, 0xff00, 0x9000); + rtl8125_phy_param(phydev, 0x808f, 0xff00, 0x9000); r8168g_phy_param(phydev, 0x8174, 0x2000, 0x1800); From 61499764e5cc5918c9f63026d3b7a34c8668d4b8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 7 Apr 2025 20:15:35 +0100 Subject: [PATCH 070/770] net: stmmac: stm32: simplify clock handling Some stm32 implementations need the receive clock running in suspend, as indicated by dwmac->ops->clk_rx_enable_in_suspend. The existing code achieved this in a rather complex way, by passing a flag around. However, the clk API prepare/enables are counted - which means that a clock won't be stopped as long as there are more prepare and enables than disables and unprepares, just like a reference count. 
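For illustration, with a single struct clk:

	clk_prepare_enable(clk);	/* count 0 -> 1, clock starts */
	clk_prepare_enable(clk);	/* count 1 -> 2 */
	clk_disable_unprepare(clk);	/* count 2 -> 1, clock stays on */
	clk_disable_unprepare(clk);	/* count 1 -> 0, clock stops */
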
Therefore, we can simplify this logic by calling clk_prepare_enable() an additional time in the probe function if this flag is set, and then balancing that at remove time. With this, we can avoid passing a "are we suspending" and "are we resuming" flag to various functions in the driver. Signed-off-by: Russell King (Oracle) Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../net/ethernet/stmicro/stmmac/dwmac-stm32.c | 57 ++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c index c3d321192581f..1eb16eec9c0d2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c @@ -119,7 +119,7 @@ struct stm32_ops { u32 syscfg_clr_off; }; -static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) +static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac) { int ret; @@ -127,11 +127,9 @@ static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) if (ret) goto err_clk_tx; - if (!dwmac->ops->clk_rx_enable_in_suspend || !resume) { - ret = clk_prepare_enable(dwmac->clk_rx); - if (ret) - goto err_clk_rx; - } + ret = clk_prepare_enable(dwmac->clk_rx); + if (ret) + goto err_clk_rx; ret = clk_prepare_enable(dwmac->syscfg_clk); if (ret) @@ -148,15 +146,14 @@ static int stm32_dwmac_clk_enable(struct stm32_dwmac *dwmac, bool resume) err_clk_eth_ck: clk_disable_unprepare(dwmac->syscfg_clk); err_syscfg_clk: - if (!dwmac->ops->clk_rx_enable_in_suspend || !resume) - clk_disable_unprepare(dwmac->clk_rx); + clk_disable_unprepare(dwmac->clk_rx); err_clk_rx: clk_disable_unprepare(dwmac->clk_tx); err_clk_tx: return ret; } -static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat, bool resume) +static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat) { struct stm32_dwmac *dwmac = plat_dat->bsp_priv; int ret; @@ -167,7 +164,7 @@ static int stm32_dwmac_init(struct plat_stmmacenet_data *plat_dat, bool resume) return ret; } - return stm32_dwmac_clk_enable(dwmac, resume); + return stm32_dwmac_clk_enable(dwmac); } static int stm32mp1_select_ethck_external(struct plat_stmmacenet_data *plat_dat) @@ -382,12 +379,10 @@ static int stm32mcu_set_mode(struct plat_stmmacenet_data *plat_dat) SYSCFG_MCU_ETH_MASK, val << 23); } -static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac, bool suspend) +static void stm32_dwmac_clk_disable(struct stm32_dwmac *dwmac) { clk_disable_unprepare(dwmac->clk_tx); - if (!dwmac->ops->clk_rx_enable_in_suspend || !suspend) - clk_disable_unprepare(dwmac->clk_rx); - + clk_disable_unprepare(dwmac->clk_rx); clk_disable_unprepare(dwmac->syscfg_clk); if (dwmac->enable_eth_ck) clk_disable_unprepare(dwmac->clk_eth_ck); @@ -541,18 +536,32 @@ static int stm32_dwmac_probe(struct platform_device *pdev) plat_dat->flags |= STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP; plat_dat->bsp_priv = dwmac; - ret = stm32_dwmac_init(plat_dat, false); + ret = stm32_dwmac_init(plat_dat); if (ret) return ret; + /* If this platform requires the clock to be running in suspend, + * prepare and enable the receive clock an additional time to keep + * it running. 
+ */ + if (dwmac->ops->clk_rx_enable_in_suspend) { + ret = clk_prepare_enable(dwmac->clk_rx); + if (ret) + goto err_clk_disable; + } + ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) - goto err_clk_disable; + goto err_clk_disable_suspend; return 0; +err_clk_disable_suspend: + if (dwmac->ops->clk_rx_enable_in_suspend) + clk_disable_unprepare(dwmac->clk_rx); + err_clk_disable: - stm32_dwmac_clk_disable(dwmac, false); + stm32_dwmac_clk_disable(dwmac); return ret; } @@ -565,7 +574,15 @@ static void stm32_dwmac_remove(struct platform_device *pdev) stmmac_dvr_remove(&pdev->dev); - stm32_dwmac_clk_disable(dwmac, false); + /* If this platform requires the clock to be running in suspend, + * we need to disable and unprepare the receive clock an additional + * time to balance the extra clk_prepare_enable() in the probe + * function. + */ + if (dwmac->ops->clk_rx_enable_in_suspend) + clk_disable_unprepare(dwmac->clk_rx); + + stm32_dwmac_clk_disable(dwmac); if (dwmac->irq_pwr_wakeup >= 0) { dev_pm_clear_wake_irq(&pdev->dev); @@ -596,7 +613,7 @@ static int stm32_dwmac_suspend(struct device *dev) if (ret) return ret; - stm32_dwmac_clk_disable(dwmac, true); + stm32_dwmac_clk_disable(dwmac); if (dwmac->ops->suspend) ret = dwmac->ops->suspend(dwmac); @@ -614,7 +631,7 @@ static int stm32_dwmac_resume(struct device *dev) if (dwmac->ops->resume) dwmac->ops->resume(dwmac); - ret = stm32_dwmac_init(priv->plat, true); + ret = stm32_dwmac_init(priv->plat); if (ret) return ret; From a808691df39b52cd9db861b118e88e18b63e2299 Mon Sep 17 00:00:00 2001 From: Mateusz Pacuszka Date: Fri, 14 Feb 2025 09:50:35 +0100 Subject: [PATCH 071/770] ice: fix check for existing switch rule In case the rule already exists and another VSI wants to subscribe to it new VSI list is being created and both VSIs are moved to it. Currently, the check for already existing VSI with the same rule is done based on fdw_id.hw_vsi_id, which applies only to LOOKUP_RX flag. Change it to vsi_handle. This is software VSI ID, but it can be applied here, because vsi_map itself is also based on it. Additionally change return status in case the VSI already exists in the VSI map to "Already exists". Such case should be handled by the caller. 
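A sketch of the intended handling at the call site (the argument names
other than hw are assumptions, not the exact code):

	status = ice_adv_add_update_vsi_list(hw, m_entry, cur_fltr, new_fltr);
	if (status == -EEXIST)
		status = 0;	/* the VSI was already subscribed to this rule */
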
Signed-off-by: Mateusz Pacuszka Reviewed-by: Przemek Kitszel Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 4a91e0aaf0a5e..9d9a7edd3618a 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -3146,7 +3146,7 @@ ice_add_update_vsi_list(struct ice_hw *hw, u16 vsi_handle_arr[2]; /* A rule already exists with the new VSI being added */ - if (cur_fltr->fwd_id.hw_vsi_id == new_fltr->fwd_id.hw_vsi_id) + if (cur_fltr->vsi_handle == new_fltr->vsi_handle) return -EEXIST; vsi_handle_arr[0] = cur_fltr->vsi_handle; @@ -5978,7 +5978,7 @@ ice_adv_add_update_vsi_list(struct ice_hw *hw, /* A rule already exists with the new VSI being added */ if (test_bit(vsi_handle, m_entry->vsi_list_info->vsi_map)) - return 0; + return -EEXIST; /* Update the previously created VSI list set with * the new VSI ID passed in From 4d5a1c4e6d49e1a521c97c991d1aa81702f4c3d9 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 14 Feb 2025 09:50:36 +0100 Subject: [PATCH 072/770] ice: do not add LLDP-specific filter if not necessary Commit 34295a3696fb ("ice: implement new LLDP filter command") introduced the ability to use LLDP-specific filter that directs all LLDP traffic to a single VSI. However, current goal is for all trusted VFs to be able to see LLDP neighbors, which is impossible to do with the special filter. Make using the generic filter the default choice and fall back to special one only if a generic filter cannot be added. That way setups with "NVMs where an already existent LLDP filter is blocking the creation of a filter to allow LLDP packets" will still be able to configure software Rx LLDP on PF only, while all other setups would be able to forward them to VFs too. 
Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_common.c | 14 +++++++++---- drivers/net/ethernet/intel/ice/ice_common.h | 3 +-- drivers/net/ethernet/intel/ice/ice_lib.c | 23 ++++++++++++++------- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index fd083647c14ac..2694951a0b1d6 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -515,6 +515,7 @@ enum ice_pf_flags { ICE_FLAG_MTU_CHANGED, ICE_FLAG_GNSS, /* GNSS successfully initialized */ ICE_FLAG_DPLL, /* SyncE/PTP dplls initialized */ + ICE_FLAG_LLDP_AQ_FLTR, ICE_PF_FLAGS_NBITS /* must be last */ }; diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 59df31c2c83f7..e725b785d0932 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -6011,15 +6011,21 @@ bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw) /** * ice_lldp_fltr_add_remove - add or remove a LLDP Rx switch filter * @hw: pointer to HW struct - * @vsi_num: absolute HW index for VSI + * @vsi: VSI to add the filter to * @add: boolean for if adding or removing a filter + * + * Return: 0 on success, -EOPNOTSUPP if the operation cannot be performed + * with this HW or VSI, otherwise an error corresponding to + * the AQ transaction result. */ -int -ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) +int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add) { struct ice_aqc_lldp_filter_ctrl *cmd; struct ice_aq_desc desc; + if (vsi->type != ICE_VSI_PF || !ice_fw_supports_lldp_fltr_ctrl(hw)) + return -EOPNOTSUPP; + cmd = &desc.params.lldp_filter_ctrl; ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_lldp_filter_ctrl); @@ -6029,7 +6035,7 @@ ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add) else cmd->cmd_flags = ICE_AQC_LLDP_FILTER_ACTION_DELETE; - cmd->vsi_num = cpu_to_le16(vsi_num); + cmd->vsi_num = cpu_to_le16(vsi->vsi_num); return ice_aq_send_cmd(hw, &desc, NULL, 0, NULL); } diff --git a/drivers/net/ethernet/intel/ice/ice_common.h b/drivers/net/ethernet/intel/ice/ice_common.h index 9b00aa0ddf10e..64c530b391917 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.h +++ b/drivers/net/ethernet/intel/ice/ice_common.h @@ -290,8 +290,7 @@ int ice_aq_set_lldp_mib(struct ice_hw *hw, u8 mib_type, void *buf, u16 buf_size, struct ice_sq_cd *cd); bool ice_fw_supports_lldp_fltr_ctrl(struct ice_hw *hw); -int -ice_lldp_fltr_add_remove(struct ice_hw *hw, u16 vsi_num, bool add); +int ice_lldp_fltr_add_remove(struct ice_hw *hw, struct ice_vsi *vsi, bool add); int ice_lldp_execute_pending_mib(struct ice_hw *hw); int ice_aq_read_i2c(struct ice_hw *hw, struct ice_aqc_link_topo_addr topo_addr, diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 0bcf9d127ac9e..542b76b5707a0 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2085,19 +2085,28 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_TX, ICE_DROP_PACKET); } else { - if (ice_fw_supports_lldp_fltr_ctrl(&pf->hw)) { - status = ice_lldp_fltr_add_remove(&pf->hw, vsi->vsi_num, - create); - } else { + if 
(!test_bit(ICE_FLAG_LLDP_AQ_FLTR, pf->flags)) { status = eth_fltr(vsi, ETH_P_LLDP, ICE_FLTR_RX, ICE_FWD_TO_VSI); + if (!status || !create) + goto report; + + dev_info(dev, + "Failed to add generic LLDP Rx filter on VSI %i error: %d, falling back to specialized AQ control\n", + vsi->vsi_num, status); } + + status = ice_lldp_fltr_add_remove(&pf->hw, vsi, create); + if (!status) + set_bit(ICE_FLAG_LLDP_AQ_FLTR, pf->flags); + } +report: if (status) - dev_dbg(dev, "Fail %s %s LLDP rule on VSI %i error: %d\n", - create ? "adding" : "removing", tx ? "TX" : "RX", - vsi->vsi_num, status); + dev_warn(dev, "Failed to %s %s LLDP rule on VSI %i error: %d\n", + create ? "add" : "remove", tx ? "Tx" : "Rx", + vsi->vsi_num, status); } /** From 2296345416b02aa9a7216bfec2725948f034feff Mon Sep 17 00:00:00 2001 From: Mateusz Pacuszka Date: Fri, 14 Feb 2025 09:50:37 +0100 Subject: [PATCH 073/770] ice: receive LLDP on trusted VFs When a trusted VF tries to configure an LLDP multicast address, configure a rule that would mirror the traffic to this VF, untrusted VFs are not allowed to receive LLDP at all, so the request to add LLDP MAC address will always fail for them. Add a forwarding LLDP filter to a trusted VF when it tries to add an LLDP multicast MAC address. The MAC address has to be added after enabling trust (through restarting the LLDP service). Signed-off-by: Mateusz Pacuszka Co-developed-by: Larysa Zaremba Signed-off-by: Larysa Zaremba Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_dcb_lib.c | 2 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 2 +- drivers/net/ethernet/intel/ice/ice_lib.c | 48 ++++++++++++++--- drivers/net/ethernet/intel/ice/ice_lib.h | 3 +- drivers/net/ethernet/intel/ice/ice_sriov.c | 4 ++ drivers/net/ethernet/intel/ice/ice_vf_lib.c | 26 +++++++++ drivers/net/ethernet/intel/ice/ice_vf_lib.h | 8 +++ drivers/net/ethernet/intel/ice/ice_virtchnl.c | 53 +++++++++++++++++-- 8 files changed, 134 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c index a7c5108328240..67988c7ab08e6 100644 --- a/drivers/net/ethernet/intel/ice/ice_dcb_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_dcb_lib.c @@ -846,7 +846,7 @@ int ice_init_pf_dcb(struct ice_pf *pf, bool locked) goto dcb_init_err; } - ice_cfg_sw_lldp(pf_vsi, false, true); + ice_cfg_sw_rx_lldp(pf, true); pf->dcbx_cap = ice_dcb_get_mode(port_info, true); return 0; diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 7c2dc347e4e57..648815170477c 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -1818,7 +1818,7 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) /* Remove rule to direct LLDP packets to default VSI. * The FW LLDP engine will now be consuming them. 
*/ - ice_cfg_sw_lldp(vsi, false, false); + ice_cfg_sw_rx_lldp(vsi->back, false); /* AQ command to start FW LLDP agent will return an * error if the agent is already started diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 542b76b5707a0..03bb16191237e 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2065,12 +2065,15 @@ static void ice_vsi_set_tc_cfg(struct ice_vsi *vsi) } /** - * ice_cfg_sw_lldp - Config switch rules for LLDP packet handling + * ice_vsi_cfg_sw_lldp - Config switch rules for LLDP packet handling * @vsi: the VSI being configured * @tx: bool to determine Tx or Rx rule * @create: bool to determine create or remove Rule + * + * Adding an ethtype Tx rule to the uplink VSI results in it being applied + * to the whole port, so LLDP transmission for VFs will be blocked too. */ -void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) +void ice_vsi_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) { int (*eth_fltr)(struct ice_vsi *v, u16 type, u16 flag, enum ice_sw_fwd_act_type act); @@ -2109,6 +2112,37 @@ void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create) vsi->vsi_num, status); } +/** + * ice_cfg_sw_rx_lldp - Enable/disable software handling of LLDP + * @pf: the PF being configured + * @enable: enable or disable + * + * Configure switch rules to enable/disable LLDP handling by software + * across PF. + */ +void ice_cfg_sw_rx_lldp(struct ice_pf *pf, bool enable) +{ + struct ice_vsi *vsi; + struct ice_vf *vf; + unsigned int bkt; + + vsi = ice_get_main_vsi(pf); + ice_vsi_cfg_sw_lldp(vsi, false, enable); + + if (!test_bit(ICE_FLAG_SRIOV_ENA, pf->flags)) + return; + + ice_for_each_vf(pf, bkt, vf) { + vsi = ice_get_vf_vsi(vf); + + if (WARN_ON(!vsi)) + continue; + + if (ice_vf_is_lldp_ena(vf)) + ice_vsi_cfg_sw_lldp(vsi, false, enable); + } +} + /** * ice_set_agg_vsi - sets up scheduler aggregator node and move VSI into it * @vsi: pointer to the VSI @@ -2537,7 +2571,7 @@ ice_vsi_setup(struct ice_pf *pf, struct ice_vsi_cfg_params *params) if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF) { ice_fltr_add_eth(vsi, ETH_P_PAUSE, ICE_FLTR_TX, ICE_DROP_PACKET); - ice_cfg_sw_lldp(vsi, true, true); + ice_vsi_cfg_sw_lldp(vsi, true, true); } if (!vsi->agg_node) @@ -2834,9 +2868,11 @@ int ice_vsi_release(struct ice_vsi *vsi) /* The Rx rule will only exist to remove if the LLDP FW * engine is currently stopped */ - if (!ice_is_safe_mode(pf) && vsi->type == ICE_VSI_PF && - !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags)) - ice_cfg_sw_lldp(vsi, false, false); + if (!ice_is_safe_mode(pf) && + !test_bit(ICE_FLAG_FW_LLDP_AGENT, pf->flags) && + (vsi->type == ICE_VSI_PF || (vsi->type == ICE_VSI_VF && + ice_vf_is_lldp_ena(vsi->vf)))) + ice_vsi_cfg_sw_lldp(vsi, false, false); ice_vsi_decfg(vsi); diff --git a/drivers/net/ethernet/intel/ice/ice_lib.h b/drivers/net/ethernet/intel/ice/ice_lib.h index b4c9cb28a016e..654516c5fc3ed 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_lib.h @@ -29,7 +29,8 @@ ice_vsi_stop_lan_tx_rings(struct ice_vsi *vsi, enum ice_disq_rst_src rst_src, int ice_vsi_stop_xdp_tx_rings(struct ice_vsi *vsi); -void ice_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); +void ice_vsi_cfg_sw_lldp(struct ice_vsi *vsi, bool tx, bool create); +void ice_cfg_sw_rx_lldp(struct ice_pf *pf, bool enable); int ice_set_link(struct ice_vsi *vsi, bool ena); diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c 
b/drivers/net/ethernet/intel/ice/ice_sriov.c index f1648cf103b74..0e4dc1a5cff0f 100644 --- a/drivers/net/ethernet/intel/ice/ice_sriov.c +++ b/drivers/net/ethernet/intel/ice/ice_sriov.c @@ -63,6 +63,7 @@ static void ice_free_vf_res(struct ice_vf *vf) if (vf->lan_vsi_idx != ICE_NO_VSI) { ice_vf_vsi_release(vf); vf->num_mac = 0; + vf->num_mac_lldp = 0; } last_vector_idx = vf->first_vector_idx + vf->num_msix - 1; @@ -1402,6 +1403,9 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted) mutex_lock(&vf->cfg_lock); + while (!trusted && vf->num_mac_lldp) + ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf), false); + vf->trusted = trusted; ice_reset_vf(vf, ICE_VF_RESET_NOTIFY); dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n", diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.c b/drivers/net/ethernet/intel/ice/ice_vf_lib.c index 815ad0bfe8326..48cd533e93b74 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.c @@ -226,6 +226,7 @@ static void ice_vf_clear_counters(struct ice_vf *vf) vsi->num_vlan = 0; vf->num_mac = 0; + vf->num_mac_lldp = 0; memset(&vf->mdd_tx_events, 0, sizeof(vf->mdd_tx_events)); memset(&vf->mdd_rx_events, 0, sizeof(vf->mdd_rx_events)); } @@ -1401,3 +1402,28 @@ struct ice_vsi *ice_get_vf_ctrl_vsi(struct ice_pf *pf, struct ice_vsi *vsi) rcu_read_unlock(); return ctrl_vsi; } + +/** + * ice_vf_update_mac_lldp_num - update the VF's number of LLDP addresses + * @vf: a VF to add the address to + * @vsi: the corresponding VSI + * @incr: is the rule added or removed + */ +void ice_vf_update_mac_lldp_num(struct ice_vf *vf, struct ice_vsi *vsi, + bool incr) +{ + bool lldp_by_fw = test_bit(ICE_FLAG_FW_LLDP_AGENT, vsi->back->flags); + bool was_ena = ice_vf_is_lldp_ena(vf) && !lldp_by_fw; + bool is_ena; + + if (WARN_ON(!vsi)) { + vf->num_mac_lldp = 0; + return; + } + + vf->num_mac_lldp += incr ? 1 : -1; + is_ena = ice_vf_is_lldp_ena(vf) && !lldp_by_fw; + + if (was_ena != is_ena) + ice_vsi_cfg_sw_lldp(vsi, false, is_ena); +} diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h index 799b2c1f11844..f4c9ca1f51ce2 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h @@ -134,6 +134,7 @@ struct ice_vf { unsigned long vf_caps; /* VF's adv. capabilities */ u8 num_req_qs; /* num of queue pairs requested by VF */ u16 num_mac; + u16 num_mac_lldp; u16 num_vf_qs; /* num of queue configured per VF */ u8 vlan_strip_ena; /* Outer and Inner VLAN strip enable */ #define ICE_INNER_VLAN_STRIP_ENA BIT(0) @@ -180,6 +181,11 @@ static inline u16 ice_vf_get_port_vlan_tpid(struct ice_vf *vf) return vf->port_vlan_info.tpid; } +static inline bool ice_vf_is_lldp_ena(struct ice_vf *vf) +{ + return vf->num_mac_lldp && vf->trusted; +} + /* VF Hash Table access functions * * These functions provide abstraction for interacting with the VF hash table. 
@@ -245,6 +251,8 @@ ice_vf_clear_vsi_promisc(struct ice_vf *vf, struct ice_vsi *vsi, u8 promisc_m); int ice_reset_vf(struct ice_vf *vf, u32 flags); void ice_reset_all_vfs(struct ice_pf *pf); struct ice_vsi *ice_get_vf_ctrl_vsi(struct ice_pf *pf, struct ice_vsi *vsi); +void ice_vf_update_mac_lldp_num(struct ice_vf *vf, struct ice_vsi *vsi, + bool incr); #else /* CONFIG_PCI_IOV */ static inline struct ice_vf *ice_get_vf_by_id(struct ice_pf *pf, u16 vf_id) { diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 7c3006eb68dd0..b147f6cf26151 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -2265,6 +2265,51 @@ ice_vfhw_mac_add(struct ice_vf *vf, struct virtchnl_ether_addr *vc_ether_addr) } } +/** + * ice_is_mc_lldp_eth_addr - check if the given MAC is a multicast LLDP address + * @mac: address to check + * + * Return: true if the address is one of the three possible LLDP multicast + * addresses, false otherwise. + */ +static bool ice_is_mc_lldp_eth_addr(const u8 *mac) +{ + const u8 lldp_mac_base[] = {0x01, 0x80, 0xc2, 0x00, 0x00}; + + if (memcmp(mac, lldp_mac_base, sizeof(lldp_mac_base))) + return false; + + return (mac[5] == 0x0e || mac[5] == 0x03 || mac[5] == 0x00); +} + +/** + * ice_vc_can_add_mac - check if the VF is allowed to add a given MAC + * @vf: a VF to add the address to + * @mac: address to check + * + * Return: true if the VF is allowed to add such MAC address, false otherwise. + */ +static bool ice_vc_can_add_mac(const struct ice_vf *vf, const u8 *mac) +{ + struct device *dev = ice_pf_to_dev(vf->pf); + + if (is_unicast_ether_addr(mac) && + !ice_can_vf_change_mac((struct ice_vf *)vf)) { + dev_err(dev, + "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + return false; + } + + if (!vf->trusted && ice_is_mc_lldp_eth_addr(mac)) { + dev_warn(dev, + "An untrusted VF %u is attempting to configure an LLDP multicast address\n", + vf->vf_id); + return false; + } + + return true; +} + /** * ice_vc_add_mac_addr - attempt to add the MAC address passed in * @vf: pointer to the VF info @@ -2283,10 +2328,8 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, if (ether_addr_equal(mac_addr, vf->dev_lan_addr)) return 0; - if (is_unicast_ether_addr(mac_addr) && !ice_can_vf_change_mac(vf)) { - dev_err(dev, "VF attempting to override administratively set MAC address, bring down and up the VF interface to resume normal operation\n"); + if (!ice_vc_can_add_mac(vf, mac_addr)) return -EPERM; - } ret = ice_fltr_add_mac(vsi, mac_addr, ICE_FWD_TO_VSI); if (ret == -EEXIST) { @@ -2301,6 +2344,8 @@ ice_vc_add_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, return ret; } else { vf->num_mac++; + if (ice_is_mc_lldp_eth_addr(mac_addr)) + ice_vf_update_mac_lldp_num(vf, vsi, true); } ice_vfhw_mac_add(vf, vc_ether_addr); @@ -2395,6 +2440,8 @@ ice_vc_del_mac_addr(struct ice_vf *vf, struct ice_vsi *vsi, ice_vfhw_mac_del(vf, vc_ether_addr); vf->num_mac--; + if (ice_is_mc_lldp_eth_addr(mac_addr)) + ice_vf_update_mac_lldp_num(vf, vsi, false); return 0; } From 5787179c5183c99d9d2d49818186d43c49d6cc04 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 14 Feb 2025 09:50:38 +0100 Subject: [PATCH 074/770] ice: remove headers argument from ice_tc_count_lkups Remove the headers argument from the ice_tc_count_lkups() function, because it is not used anywhere. 
Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index ea39b999a0d00..d8d28d74222a6 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -12,14 +12,11 @@ /** * ice_tc_count_lkups - determine lookup count for switch filter * @flags: TC-flower flags - * @headers: Pointer to TC flower filter header structure * @fltr: Pointer to outer TC filter structure * - * Determine lookup count based on TC flower input for switch filter. + * Return: lookup count based on TC flower input for a switch filter. */ -static int -ice_tc_count_lkups(u32 flags, struct ice_tc_flower_lyr_2_4_hdrs *headers, - struct ice_tc_flower_fltr *fltr) +static int ice_tc_count_lkups(u32 flags, struct ice_tc_flower_fltr *fltr) { int lkups_cnt = 1; /* 0th lookup is metadata */ @@ -770,7 +767,6 @@ static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, static int ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) { - struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; struct ice_adv_rule_info rule_info = { 0 }; struct ice_rule_query_data rule_added; struct ice_hw *hw = &vsi->back->hw; @@ -785,7 +781,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) return -EOPNOTSUPP; } - lkups_cnt = ice_tc_count_lkups(flags, headers, fltr); + lkups_cnt = ice_tc_count_lkups(flags, fltr); list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); if (!list) return -ENOMEM; @@ -985,7 +981,6 @@ static int ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *tc_fltr) { - struct ice_tc_flower_lyr_2_4_hdrs *headers = &tc_fltr->outer_headers; struct ice_adv_rule_info rule_info = {0}; struct ice_rule_query_data rule_added; struct ice_adv_lkup_elem *list; @@ -1021,7 +1016,7 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, return PTR_ERR(dest_vsi); } - lkups_cnt = ice_tc_count_lkups(flags, headers, tc_fltr); + lkups_cnt = ice_tc_count_lkups(flags, tc_fltr); list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); if (!list) return -ENOMEM; From 40f42dc1cbb6b5841eb350fc81a5c26f9aa5b420 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 14 Feb 2025 09:50:39 +0100 Subject: [PATCH 075/770] ice: support egress drop rules on PF tc clsact qdisc allows us to add offloaded egress rules with commands such as the following one: tc filter add dev $PF egress protocol lldp flower skip_sw action drop Support the egress rule drop action when added to the PF, with a few caveats: * in switchdev mode, all PF traffic has to go to the uplink, with an exception for LLDP, which can be delegated to a single VSI at a time * in legacy mode, we cannot delegate LLDP functionality to another VSI, so such packets from the PF should not be blocked. Also, simplify the rule direction logic: it was previously derived from actions, but can actually be inherited from the tc block (and flipped in case of port representors).
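Condensed, the inherited-direction logic described above comes down to the following (taken from the ice_parse_cls_flower() hunk further below):

    /* The direction is inherited from the tc block; an ingress filter
     * on a port representor results in an egress filter in HW, and
     * vice versa.
     */
    ingress = ice_is_port_repr_netdev(filter_dev) ? !ingress : ingress;
    fltr->direction = ingress ? ICE_ESWITCH_FLTR_INGRESS :
                                ICE_ESWITCH_FLTR_EGRESS;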
Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 4 + drivers/net/ethernet/intel/ice/ice_main.c | 63 ++++++++++++-- drivers/net/ethernet/intel/ice/ice_repr.c | 3 +- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 86 +++++++++++++++----- drivers/net/ethernet/intel/ice/ice_tc_lib.h | 9 +- drivers/net/ethernet/intel/ice/ice_txrx.c | 17 ++-- 6 files changed, 138 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index ed21d7f55ac11..206a7d839aeff 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -245,6 +245,10 @@ ice_eswitch_set_target_vsi(struct sk_buff *skb, u64 cd_cmd, dst_vsi; if (!dst) { + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + + if (unlikely(eth->h_proto == htons(ETH_P_LLDP))) + return; cd_cmd = ICE_TX_CTX_DESC_SWTCH_UPLINK << ICE_TXD_CTX_QW1_CMD_S; off->cd_qw1 |= (cd_cmd | ICE_TX_DESC_DTYPE_CTX); } else { diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index d390157b59fe1..1fbe13ee93a8d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -8330,11 +8330,16 @@ void ice_tx_timeout(struct net_device *netdev, unsigned int txqueue) * @np: net device to configure * @filter_dev: device on which filter is added * @cls_flower: offload data + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was successfully added or deleted, + * negative error code otherwise. */ static int ice_setup_tc_cls_flower(struct ice_netdev_priv *np, struct net_device *filter_dev, - struct flow_cls_offload *cls_flower) + struct flow_cls_offload *cls_flower, + bool ingress) { struct ice_vsi *vsi = np->vsi; @@ -8343,7 +8348,7 @@ ice_setup_tc_cls_flower(struct ice_netdev_priv *np, switch (cls_flower->command) { case FLOW_CLS_REPLACE: - return ice_add_cls_flower(filter_dev, vsi, cls_flower); + return ice_add_cls_flower(filter_dev, vsi, cls_flower, ingress); case FLOW_CLS_DESTROY: return ice_del_cls_flower(vsi, cls_flower); default: @@ -8352,20 +8357,46 @@ ice_setup_tc_cls_flower(struct ice_netdev_priv *np, } /** - * ice_setup_tc_block_cb - callback handler registered for TC block + * ice_setup_tc_block_cb_ingress - callback handler for ingress TC block * @type: TC SETUP type * @type_data: TC flower offload data that contains user input * @cb_priv: netdev private data + * + * Return: 0 if the setup was successful, negative error code otherwise. */ static int -ice_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv) +ice_setup_tc_block_cb_ingress(enum tc_setup_type type, void *type_data, + void *cb_priv) { struct ice_netdev_priv *np = cb_priv; switch (type) { case TC_SETUP_CLSFLOWER: return ice_setup_tc_cls_flower(np, np->vsi->netdev, - type_data); + type_data, true); + default: + return -EOPNOTSUPP; + } +} + +/** + * ice_setup_tc_block_cb_egress - callback handler for egress TC block + * @type: TC SETUP type + * @type_data: TC flower offload data that contains user input + * @cb_priv: netdev private data + * + * Return: 0 if the setup was successful, negative error code otherwise. 
+ */ +static int +ice_setup_tc_block_cb_egress(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct ice_netdev_priv *np = cb_priv; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return ice_setup_tc_cls_flower(np, np->vsi->netdev, + type_data, false); default: return -EOPNOTSUPP; } @@ -9310,16 +9341,32 @@ ice_setup_tc(struct net_device *netdev, enum tc_setup_type type, void *type_data) { struct ice_netdev_priv *np = netdev_priv(netdev); + enum flow_block_binder_type binder_type; struct ice_pf *pf = np->vsi->back; + flow_setup_cb_t *flower_handler; bool locked = false; int err; switch (type) { case TC_SETUP_BLOCK: + binder_type = + ((struct flow_block_offload *)type_data)->binder_type; + + switch (binder_type) { + case FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS: + flower_handler = ice_setup_tc_block_cb_ingress; + break; + case FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS: + flower_handler = ice_setup_tc_block_cb_egress; + break; + default: + return -EOPNOTSUPP; + } + return flow_block_cb_setup_simple(type_data, &ice_block_cb_list, - ice_setup_tc_block_cb, - np, np, true); + flower_handler, + np, np, false); case TC_SETUP_QDISC_MQPRIO: if (ice_is_eswitch_mode_switchdev(pf)) { netdev_err(netdev, "TC MQPRIO offload not supported, switchdev is enabled\n"); @@ -9380,7 +9427,7 @@ ice_indr_setup_block_cb(enum tc_setup_type type, void *type_data, case TC_SETUP_CLSFLOWER: return ice_setup_tc_cls_flower(np, priv->netdev, (struct flow_cls_offload *) - type_data); + type_data, false); default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index fb7a1b9a43139..f81bf60f83651 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -219,7 +219,8 @@ ice_repr_setup_tc_cls_flower(struct ice_repr *repr, { switch (flower->command) { case FLOW_CLS_REPLACE: - return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower); + return ice_add_cls_flower(repr->netdev, repr->src_vsi, flower, + true); case FLOW_CLS_DESTROY: return ice_del_cls_flower(repr->src_vsi, flower); default: diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index d8d28d74222a6..229cd29ff92a4 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -681,26 +681,26 @@ static int ice_tc_setup_action(struct net_device *filter_dev, fltr->action.fltr_act = action; if (ice_is_port_repr_netdev(filter_dev) && - ice_is_port_repr_netdev(target_dev)) { + ice_is_port_repr_netdev(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { repr = ice_netdev_to_repr(target_dev); fltr->dest_vsi = repr->src_vsi; - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; } else if (ice_is_port_repr_netdev(filter_dev) && - ice_tc_is_dev_uplink(target_dev)) { + ice_tc_is_dev_uplink(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { repr = ice_netdev_to_repr(filter_dev); fltr->dest_vsi = repr->src_vsi->back->eswitch.uplink_vsi; - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; } else if (ice_tc_is_dev_uplink(filter_dev) && - ice_is_port_repr_netdev(target_dev)) { + ice_is_port_repr_netdev(target_dev) && + fltr->direction == ICE_ESWITCH_FLTR_INGRESS) { repr = ice_netdev_to_repr(target_dev); fltr->dest_vsi = repr->src_vsi; - fltr->direction = ICE_ESWITCH_FLTR_INGRESS; } else { NL_SET_ERR_MSG_MOD(fltr->extack, - "Unsupported netdevice in switchdev mode"); + "The action is not supported for this netdevice"); return -EINVAL; } @@ -713,13 
+713,11 @@ ice_tc_setup_drop_action(struct net_device *filter_dev, { fltr->action.fltr_act = ICE_DROP_PACKET; - if (ice_is_port_repr_netdev(filter_dev)) { - fltr->direction = ICE_ESWITCH_FLTR_EGRESS; - } else if (ice_tc_is_dev_uplink(filter_dev)) { - fltr->direction = ICE_ESWITCH_FLTR_INGRESS; - } else { + if (!ice_tc_is_dev_uplink(filter_dev) && + !(ice_is_port_repr_netdev(filter_dev) && + fltr->direction == ICE_ESWITCH_FLTR_INGRESS)) { NL_SET_ERR_MSG_MOD(fltr->extack, - "Unsupported netdevice in switchdev mode"); + "The action is not supported for this netdevice"); return -EINVAL; } @@ -809,6 +807,11 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) rule_info.sw_act.flag |= ICE_FLTR_RX; rule_info.sw_act.src = hw->pf_id; rule_info.flags_info.act = ICE_SINGLE_ACT_LB_ENABLE; + } else if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + !fltr->dest_vsi && vsi == vsi->back->eswitch.uplink_vsi) { + /* PF to Uplink */ + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; } else if (fltr->direction == ICE_ESWITCH_FLTR_EGRESS && fltr->dest_vsi == vsi->back->eswitch.uplink_vsi) { /* VF to Uplink */ @@ -1051,8 +1054,13 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, tc_fltr->action.fwd.q.hw_queue, lkups_cnt); break; case ICE_DROP_PACKET: - rule_info.sw_act.flag |= ICE_FLTR_RX; - rule_info.sw_act.src = hw->pf_id; + if (tc_fltr->direction == ICE_ESWITCH_FLTR_EGRESS) { + rule_info.sw_act.flag |= ICE_FLTR_TX; + rule_info.sw_act.src = vsi->idx; + } else { + rule_info.sw_act.flag |= ICE_FLTR_RX; + rule_info.sw_act.src = hw->pf_id; + } rule_info.priority = ICE_SWITCH_FLTR_PRIO_VSI; break; default: @@ -1458,11 +1466,16 @@ ice_parse_tunnel_attr(struct net_device *dev, struct flow_rule *rule, * @filter_dev: Pointer to device on which filter is being added * @f: Pointer to struct flow_cls_offload * @fltr: Pointer to filter structure + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was parsed successfully, -EINVAL if the flower + * cannot be parsed, -EOPNOTSUPP if such filter cannot be configured + * for the given VSI. */ static int ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, struct flow_cls_offload *f, - struct ice_tc_flower_fltr *fltr) + struct ice_tc_flower_fltr *fltr, bool ingress) { struct ice_tc_flower_lyr_2_4_hdrs *headers = &fltr->outer_headers; struct flow_rule *rule = flow_cls_offload_flow_rule(f); @@ -1546,6 +1559,20 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, fltr->flags |= ICE_TC_FLWR_FIELD_ETH_TYPE_ID; } + if (!ingress) { + bool switchdev = + ice_is_eswitch_mode_switchdev(vsi->back); + + if (switchdev != (n_proto_key == ETH_P_LLDP)) { + NL_SET_ERR_MSG_FMT_MOD(fltr->extack, + "%sLLDP filtering is not supported on egress in %s mode", + switchdev ? "Non-" : "", + switchdev ? "switchdev" : + "legacy"); + return -EOPNOTSUPP; + } + } + headers->l2_key.n_proto = cpu_to_be16(n_proto_key); headers->l2_mask.n_proto = cpu_to_be16(n_proto_mask); headers->l3_key.ip_proto = match.key->ip_proto; @@ -1721,6 +1748,14 @@ ice_parse_cls_flower(struct net_device *filter_dev, struct ice_vsi *vsi, return -EINVAL; } } + + /* Ingress filter on representor results in an egress filter in HW + * and vice versa + */ + ingress = ice_is_port_repr_netdev(filter_dev) ? !ingress : ingress; + fltr->direction = ingress ? 
ICE_ESWITCH_FLTR_INGRESS : + ICE_ESWITCH_FLTR_EGRESS; + return 0; } @@ -1970,14 +2005,18 @@ static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) * @vsi: Pointer to VSI * @f: Pointer to flower offload structure * @__fltr: Pointer to struct ice_tc_flower_fltr + * @ingress: if the rule is added to an ingress block * * This function parses TC-flower input fields, parses action, * and adds a filter. + * + * Return: 0 if the filter was successfully added, + * negative error code otherwise. */ static int ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, struct flow_cls_offload *f, - struct ice_tc_flower_fltr **__fltr) + struct ice_tc_flower_fltr **__fltr, bool ingress) { struct ice_tc_flower_fltr *fltr; int err; @@ -1994,7 +2033,7 @@ ice_add_tc_fltr(struct net_device *netdev, struct ice_vsi *vsi, fltr->src_vsi = vsi; INIT_HLIST_NODE(&fltr->tc_flower_node); - err = ice_parse_cls_flower(netdev, vsi, f, fltr); + err = ice_parse_cls_flower(netdev, vsi, f, fltr, ingress); if (err < 0) goto err; @@ -2037,10 +2076,13 @@ ice_find_tc_flower_fltr(struct ice_pf *pf, unsigned long cookie) * @netdev: Pointer to filter device * @vsi: Pointer to VSI * @cls_flower: Pointer to flower offload structure + * @ingress: if the rule is added to an ingress block + * + * Return: 0 if the flower was successfully added, + * negative error code otherwise. */ -int -ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, - struct flow_cls_offload *cls_flower) +int ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, bool ingress) { struct netlink_ext_ack *extack = cls_flower->common.extack; struct net_device *vsi_netdev = vsi->netdev; @@ -2075,7 +2117,7 @@ ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, } /* prep and add TC-flower filter in HW */ - err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr); + err = ice_add_tc_fltr(netdev, vsi, cls_flower, &fltr, ingress); if (err) return err; diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h index d84f153517ec5..df9f90f793b99 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -211,11 +211,10 @@ static inline int ice_chnl_dmac_fltr_cnt(struct ice_pf *pf) } struct ice_vsi *ice_locate_vsi_using_queue(struct ice_vsi *vsi, int queue); -int -ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, - struct flow_cls_offload *cls_flower); -int -ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); +int ice_add_cls_flower(struct net_device *netdev, struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower, bool ingress); +int ice_del_cls_flower(struct ice_vsi *vsi, + struct flow_cls_offload *cls_flower); void ice_replay_tc_fltrs(struct ice_pf *pf); bool ice_is_tunnel_supported(struct net_device *dev); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 1e4f6f6ee449c..0e5107fe62ad5 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -2440,19 +2440,20 @@ ice_xmit_frame_ring(struct sk_buff *skb, struct ice_tx_ring *tx_ring) /* allow CONTROL frames egress from main VSI if FW LLDP disabled */ eth = (struct ethhdr *)skb_mac_header(skb); - if (unlikely((skb->priority == TC_PRIO_CONTROL || - eth->h_proto == htons(ETH_P_LLDP)) && - vsi->type == ICE_VSI_PF && - vsi->port_info->qos_cfg.is_sw_lldp)) - offload.cd_qw1 |= 
(u64)(ICE_TX_DESC_DTYPE_CTX | - ICE_TX_CTX_DESC_SWTCH_UPLINK << - ICE_TXD_CTX_QW1_CMD_S); - ice_tstamp(tx_ring, skb, first, &offload); if ((ice_is_switchdev_running(vsi->back) || ice_lag_is_switchdev_running(vsi->back)) && vsi->type != ICE_VSI_SF) ice_eswitch_set_target_vsi(skb, &offload); + else if (unlikely((skb->priority == TC_PRIO_CONTROL || + eth->h_proto == htons(ETH_P_LLDP)) && + vsi->type == ICE_VSI_PF && + vsi->port_info->qos_cfg.is_sw_lldp)) + offload.cd_qw1 |= (u64)(ICE_TX_DESC_DTYPE_CTX | + ICE_TX_CTX_DESC_SWTCH_UPLINK << + ICE_TXD_CTX_QW1_CMD_S); + + ice_tstamp(tx_ring, skb, first, &offload); if (offload.cd_qw1 & ICE_TX_DESC_DTYPE_CTX) { struct ice_tx_ctx_desc *cdesc; From 517f7a08ca5fa82dc1d9e4c153f2dc93dc61a20f Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Fri, 14 Feb 2025 09:50:40 +0100 Subject: [PATCH 076/770] ice: enable LLDP TX for VFs through tc Only a single VSI can be in charge of sending LLDP frames; sometimes it is beneficial to assign this function to a VF, which is possible with tc capabilities in switchdev mode. It requires first blocking the PF from sending LLDP frames with the following command: tc filter add dev $PF egress protocol lldp flower skip_sw action drop Then it becomes possible to configure a forward rule from a VF port representor to the uplink instead: tc filter add dev $VF_REP ingress protocol lldp flower skip_sw action mirred egress redirect dev $PF Previously, LLDP exclusivity was enforced by a single rule that blocked LLDP traffic for the whole port, which the PF bypassed. Now, at least in switchdev mode, every separate VSI has to have its own drop rule. Another complication is that tc does not respect a driver's refusal to delete a rule, so returning an error would leave a HW rule present with no way to reference it through tc. This is addressed by allowing the PF rule to be deleted at any time, but making the VF forward rule "dormant" in that case: it is deleted from HW but stays in tc and in the driver's bookkeeping, to be restored when the drop rule is added back to the PF. Implement tc configuration handling which enables the user to transmit LLDP packets from a VF instead of the PF.
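A rough sketch of the resulting rule lifecycle, in terms of the two helpers this patch adds (vf_vsi stands for the VF's VSI; error handling and bookkeeping are elided):

    /* tc adds the VF forward rule: activate it by removing the VF's
     * own HW LLDP drop rule (fails unless the PF drop rule exists).
     */
    err = ice_pass_vf_tx_lldp(vf_vsi, false);

    /* PF drop rule deleted: make the VF forward rule dormant by
     * re-adding the VF's HW LLDP drop rule; the tc rule stays booked.
     */
    err = ice_drop_vf_tx_lldp(vf_vsi, false);

    /* PF drop rule added back: restore the dormant forward rule. */
    err = ice_pass_vf_tx_lldp(vf_vsi, true);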
Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_eswitch.c | 2 + drivers/net/ethernet/intel/ice/ice_repr.c | 7 + drivers/net/ethernet/intel/ice/ice_tc_lib.c | 160 +++++++++++++++++++ drivers/net/ethernet/intel/ice/ice_tc_lib.h | 2 + drivers/net/ethernet/intel/ice/ice_vf_lib.h | 4 + 5 files changed, 175 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_eswitch.c b/drivers/net/ethernet/intel/ice/ice_eswitch.c index 206a7d839aeff..6aae03771746d 100644 --- a/drivers/net/ethernet/intel/ice/ice_eswitch.c +++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c @@ -29,6 +29,7 @@ static int ice_eswitch_setup_env(struct ice_pf *pf) return -ENODEV; ice_remove_vsi_fltr(&pf->hw, uplink_vsi->idx); + ice_vsi_cfg_sw_lldp(uplink_vsi, true, false); netif_addr_lock_bh(netdev); __dev_uc_unsync(netdev, NULL); @@ -282,6 +283,7 @@ static void ice_eswitch_release_env(struct ice_pf *pf) ice_fltr_add_mac_and_broadcast(uplink_vsi, uplink_vsi->port_info->mac.perm_addr, ICE_FWD_TO_VSI); + ice_vsi_cfg_sw_lldp(uplink_vsi, true, true); } /** diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index f81bf60f83651..cb08746556a67 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -337,6 +337,7 @@ void ice_repr_destroy(struct ice_repr *repr) static void ice_repr_rem_vf(struct ice_repr *repr) { ice_eswitch_decfg_vsi(repr->src_vsi, repr->parent_mac); + ice_pass_vf_tx_lldp(repr->src_vsi, true); unregister_netdev(repr->netdev); ice_devlink_destroy_vf_port(repr->vf); ice_virtchnl_set_dflt_ops(repr->vf); @@ -418,6 +419,10 @@ static int ice_repr_add_vf(struct ice_repr *repr) if (err) goto err_netdev; + err = ice_drop_vf_tx_lldp(repr->src_vsi, true); + if (err) + goto err_drop_lldp; + err = ice_eswitch_cfg_vsi(repr->src_vsi, repr->parent_mac); if (err) goto err_cfg_vsi; @@ -430,6 +435,8 @@ static int ice_repr_add_vf(struct ice_repr *repr) return 0; err_cfg_vsi: + ice_pass_vf_tx_lldp(repr->src_vsi, true); +err_drop_lldp: unregister_netdev(repr->netdev); err_netdev: ice_devlink_destroy_vf_port(vf); diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 229cd29ff92a4..49b87071b7dc1 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -762,6 +762,154 @@ static int ice_eswitch_tc_parse_action(struct net_device *filter_dev, return 0; } +static bool ice_is_fltr_lldp(struct ice_tc_flower_fltr *fltr) +{ + return fltr->outer_headers.l2_key.n_proto == htons(ETH_P_LLDP); +} + +static bool ice_is_fltr_pf_tx_lldp(struct ice_tc_flower_fltr *fltr) +{ + struct ice_vsi *vsi = fltr->src_vsi, *uplink; + + if (!ice_is_switchdev_running(vsi->back)) + return false; + + uplink = vsi->back->eswitch.uplink_vsi; + return vsi == uplink && fltr->action.fltr_act == ICE_DROP_PACKET && + ice_is_fltr_lldp(fltr) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + fltr->flags == ICE_TC_FLWR_FIELD_ETH_TYPE_ID; +} + +static bool ice_is_fltr_vf_tx_lldp(struct ice_tc_flower_fltr *fltr) +{ + struct ice_vsi *vsi = fltr->src_vsi, *uplink; + + uplink = vsi->back->eswitch.uplink_vsi; + return fltr->src_vsi->type == ICE_VSI_VF && ice_is_fltr_lldp(fltr) && + fltr->direction == ICE_ESWITCH_FLTR_EGRESS && + fltr->dest_vsi == uplink; +} + +static struct ice_tc_flower_fltr * +ice_find_pf_tx_lldp_fltr(struct ice_pf *pf) +{ + struct 
ice_tc_flower_fltr *fltr; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) + if (ice_is_fltr_pf_tx_lldp(fltr)) + return fltr; + + return NULL; +} + +static bool ice_any_vf_lldp_tx_ena(struct ice_pf *pf) +{ + struct ice_vf *vf; + unsigned int bkt; + + ice_for_each_vf(pf, bkt, vf) + if (vf->lldp_tx_ena) + return true; + + return false; +} + +int ice_pass_vf_tx_lldp(struct ice_vsi *vsi, bool deinit) +{ + struct ice_rule_query_data remove_entry = { + .rid = vsi->vf->lldp_recipe_id, + .rule_id = vsi->vf->lldp_rule_id, + .vsi_handle = vsi->idx, + }; + struct ice_pf *pf = vsi->back; + int err; + + if (vsi->vf->lldp_tx_ena) + return 0; + + if (!deinit && !ice_find_pf_tx_lldp_fltr(vsi->back)) + return -EINVAL; + + if (!deinit && ice_any_vf_lldp_tx_ena(pf)) + return -EINVAL; + + err = ice_rem_adv_rule_by_id(&pf->hw, &remove_entry); + if (!err) + vsi->vf->lldp_tx_ena = true; + + return err; +} + +int ice_drop_vf_tx_lldp(struct ice_vsi *vsi, bool init) +{ + struct ice_rule_query_data rule_added; + struct ice_adv_rule_info rinfo = { + .priority = 7, + .src_vsi = vsi->idx, + .sw_act = { + .src = vsi->idx, + .flag = ICE_FLTR_TX, + .fltr_act = ICE_DROP_PACKET, + .vsi_handle = vsi->idx, + }, + .flags_info.act_valid = true, + }; + struct ice_adv_lkup_elem list[3]; + struct ice_pf *pf = vsi->back; + int err; + + if (!init && !vsi->vf->lldp_tx_ena) + return 0; + + memset(list, 0, sizeof(list)); + ice_rule_add_direction_metadata(&list[0]); + ice_rule_add_src_vsi_metadata(&list[1]); + list[2].type = ICE_ETYPE_OL; + list[2].h_u.ethertype.ethtype_id = htons(ETH_P_LLDP); + list[2].m_u.ethertype.ethtype_id = htons(0xFFFF); + + err = ice_add_adv_rule(&pf->hw, list, ARRAY_SIZE(list), &rinfo, + &rule_added); + if (err) { + dev_err(&pf->pdev->dev, + "Failed to add an LLDP rule to VSI 0x%X: %d\n", + vsi->idx, err); + } else { + vsi->vf->lldp_recipe_id = rule_added.rid; + vsi->vf->lldp_rule_id = rule_added.rule_id; + vsi->vf->lldp_tx_ena = false; + } + + return err; +} + +static void ice_handle_add_pf_lldp_drop_rule(struct ice_vsi *vsi) +{ + struct ice_tc_flower_fltr *fltr; + struct ice_pf *pf = vsi->back; + + hlist_for_each_entry(fltr, &pf->tc_flower_fltr_list, tc_flower_node) { + if (!ice_is_fltr_vf_tx_lldp(fltr)) + continue; + ice_pass_vf_tx_lldp(fltr->src_vsi, true); + break; + } +} + +static void ice_handle_del_pf_lldp_drop_rule(struct ice_pf *pf) +{ + int i; + + /* Make the VF LLDP fwd to uplink rule dormant */ + ice_for_each_vsi(pf, i) { + struct ice_vsi *vf_vsi = pf->vsi[i]; + + if (vf_vsi && vf_vsi->type == ICE_VSI_VF) + ice_drop_vf_tx_lldp(vf_vsi, false); + } +} + static int ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) { @@ -779,6 +927,9 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) return -EOPNOTSUPP; } + if (ice_is_fltr_vf_tx_lldp(fltr)) + return ice_pass_vf_tx_lldp(vsi, false); + lkups_cnt = ice_tc_count_lkups(flags, fltr); list = kcalloc(lkups_cnt, sizeof(*list), GFP_ATOMIC); if (!list) @@ -850,6 +1001,9 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) goto exit; } + if (ice_is_fltr_pf_tx_lldp(fltr)) + ice_handle_add_pf_lldp_drop_rule(vsi); + /* store the output params, which are needed later for removing * advanced switch filter */ @@ -1969,6 +2123,12 @@ static int ice_del_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) struct ice_pf *pf = vsi->back; int err; + if (ice_is_fltr_pf_tx_lldp(fltr)) + ice_handle_del_pf_lldp_drop_rule(pf); + + if (ice_is_fltr_vf_tx_lldp(fltr)) 
+ return ice_drop_vf_tx_lldp(vsi, false); + rule_rem.rid = fltr->rid; rule_rem.rule_id = fltr->rule_id; rule_rem.vsi_handle = fltr->dest_vsi_handle; diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.h b/drivers/net/ethernet/intel/ice/ice_tc_lib.h index df9f90f793b99..8a3ab2f22af9b 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.h @@ -217,6 +217,8 @@ int ice_del_cls_flower(struct ice_vsi *vsi, struct flow_cls_offload *cls_flower); void ice_replay_tc_fltrs(struct ice_pf *pf); bool ice_is_tunnel_supported(struct net_device *dev); +int ice_drop_vf_tx_lldp(struct ice_vsi *vsi, bool init); +int ice_pass_vf_tx_lldp(struct ice_vsi *vsi, bool deinit); static inline bool ice_is_forward_action(enum ice_sw_fwd_act_type fltr_act) { diff --git a/drivers/net/ethernet/intel/ice/ice_vf_lib.h b/drivers/net/ethernet/intel/ice/ice_vf_lib.h index f4c9ca1f51ce2..482f4285fd350 100644 --- a/drivers/net/ethernet/intel/ice/ice_vf_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_vf_lib.h @@ -124,6 +124,7 @@ struct ice_vf { u8 spoofchk:1; u8 link_forced:1; u8 link_up:1; /* only valid if VF link is forced */ + u8 lldp_tx_ena:1; u32 ptp_caps; @@ -150,6 +151,9 @@ struct ice_vf { /* devlink port data */ struct devlink_port devlink_port; + u16 lldp_recipe_id; + u16 lldp_rule_id; + u16 num_msix; /* num of MSI-X configured on this VF */ struct ice_vf_qs_bw qs_bw[ICE_MAX_RSS_QS_PER_VF]; }; From 1e05c5a05d0d91085f2c0cca8a038ee5251a99cc Mon Sep 17 00:00:00 2001 From: Karol Kolacinski Date: Thu, 20 Mar 2025 14:15:36 +0100 Subject: [PATCH 077/770] ice: remove SW side band access workaround for E825 Due to the bug in FW/NVM autoload mechanism (wrong default SB_REM_DEV_CTL register settings), the access to peer PHY and CGU clients was disabled by default. As the workaround solution, the register value was overwritten by the driver at the probe or reset handling. Remove workaround as it's not needed anymore. The fix in autoload procedure has been provided with NVM 3.80 version. NOTE: at the time the fix was provided in NVM, the E825C product was not officially available on the market, so it's not expected this change will cause regression when running with older driver/kernel versions. Reviewed-by: Michal Swiatkowski Reviewed-by: Przemek Kitszel Signed-off-by: Karol Kolacinski Signed-off-by: Grzegorz Nitka Reviewed-by: Simon Horman Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 23 --------------------- 1 file changed, 23 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index 89bb8461284ad..a5df081ffc190 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -2630,25 +2630,6 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) return 0; } -/** - * ice_sb_access_ena_eth56g - Enable SB devices (PHY and others) access - * @hw: pointer to HW struct - * @enable: Enable or disable access - * - * Enable sideband devices (PHY and others) access. 
- */ -static void ice_sb_access_ena_eth56g(struct ice_hw *hw, bool enable) -{ - u32 val = rd32(hw, PF_SB_REM_DEV_CTL); - - if (enable) - val |= BIT(eth56g_phy_0) | BIT(cgu) | BIT(eth56g_phy_1); - else - val &= ~(BIT(eth56g_phy_0) | BIT(cgu) | BIT(eth56g_phy_1)); - - wr32(hw, PF_SB_REM_DEV_CTL, val); -} - /** * ice_ptp_init_phc_e825 - Perform E825 specific PHC initialization * @hw: pointer to HW struct @@ -2659,8 +2640,6 @@ static void ice_sb_access_ena_eth56g(struct ice_hw *hw, bool enable) */ static int ice_ptp_init_phc_e825(struct ice_hw *hw) { - ice_sb_access_ena_eth56g(hw, true); - /* Initialize the Clock Generation Unit */ return ice_init_cgu_e82x(hw); } @@ -2747,8 +2726,6 @@ static void ice_ptp_init_phy_e825(struct ice_hw *hw) params->num_phys = 2; ptp->ports_per_phy = 4; ptp->num_lports = params->num_phys * ptp->ports_per_phy; - - ice_sb_access_ena_eth56g(hw, true); } /* E822 family functions From 1fd9c91f7e8f2e0d91043e09f3b61c6783d75e5c Mon Sep 17 00:00:00 2001 From: Karol Kolacinski Date: Thu, 20 Mar 2025 14:15:37 +0100 Subject: [PATCH 078/770] ice: refactor ice_sbq_msg_dev enum Rename ice_sbq_msg_dev to ice_sbq_dev_id to reflect the meaning of this type more precisely. This enum type describes RDA (Remote Device Access) client ids, accessible over the SB (Side Band) interface. Rename the enum elements to make the driver namespace cleaner and consistent with other definitions within SB. Remove the unused 'rmn_x' entries, specific to the unsupported E824 device. Adjust the names of clients '2' and '13' (phy_0 and phy_0_peer respectively) to be compliant with the EAS doc. According to the specification, regardless of the complex entity (single or dual), a complex always accesses its own ports as the 'phy_0' client, and refers to them as 'phy_0_peer' when handling ports connected to the other complex.
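For reference, the addressing rule described above boils down to the following on a single complex (as in the ice_ptp_get_dest_dev_e825() hunk below):

    /* A complex reaches its own ports as client phy_0 (2) and the
     * other PHY's ports as phy_0_peer (13).
     */
    if (port >= hw->ptp.ports_per_phy)
        return ice_sbq_dev_phy_0_peer;
    else
        return ice_sbq_dev_phy_0;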
Reviewed-by: Simon Horman Reviewed-by: Przemek Kitszel Signed-off-by: Karol Kolacinski Co-developed-by: Grzegorz Nitka Signed-off-by: Grzegorz Nitka Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_common.c | 2 +- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 20 ++++++++++---------- drivers/net/ethernet/intel/ice/ice_sbq_cmd.h | 11 ++++------- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index e725b785d0932..bec6b3e6b4de6 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -3434,7 +3434,7 @@ int ice_aq_get_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, msg.msg_addr_low = lower_16_bits(reg_offset); msg.msg_addr_high = receiver_id; msg.opcode = ice_sbq_msg_rd; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; err = ice_sbq_rw_reg(hw, &msg, flag); if (err) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index a5df081ffc190..eb1893dd89799 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -240,7 +240,7 @@ static int ice_read_cgu_reg_e82x(struct ice_hw *hw, u32 addr, u32 *val) { struct ice_sbq_msg_input cgu_msg = { .opcode = ice_sbq_msg_rd, - .dest_dev = cgu, + .dest_dev = ice_sbq_dev_cgu, .msg_addr_low = addr }; int err; @@ -272,7 +272,7 @@ static int ice_write_cgu_reg_e82x(struct ice_hw *hw, u32 addr, u32 val) { struct ice_sbq_msg_input cgu_msg = { .opcode = ice_sbq_msg_wr, - .dest_dev = cgu, + .dest_dev = ice_sbq_dev_cgu, .msg_addr_low = addr, .data = val }; @@ -919,16 +919,16 @@ static void ice_ptp_cfg_sync_delay(const struct ice_hw *hw, u32 delay) * * Return: destination sideband queue PHY device. */ -static enum ice_sbq_msg_dev ice_ptp_get_dest_dev_e825(struct ice_hw *hw, - u8 port) +static enum ice_sbq_dev_id ice_ptp_get_dest_dev_e825(struct ice_hw *hw, + u8 port) { /* On a single complex E825, PHY 0 is always destination device phy_0 * and PHY 1 is phy_0_peer. 
*/ if (port >= hw->ptp.ports_per_phy) - return eth56g_phy_1; + return ice_sbq_dev_phy_0_peer; else - return eth56g_phy_0; + return ice_sbq_dev_phy_0; } /** @@ -2758,7 +2758,7 @@ static void ice_fill_phy_msg_e82x(struct ice_hw *hw, msg->msg_addr_high = P_Q1_H(P_4_BASE + offset, phy_port); } - msg->dest_dev = rmn_0; + msg->dest_dev = ice_sbq_dev_phy_0; } /** @@ -3081,7 +3081,7 @@ static int ice_fill_quad_msg_e82x(struct ice_hw *hw, if (quad >= ICE_GET_QUAD_NUM(hw->ptp.num_lports)) return -EINVAL; - msg->dest_dev = rmn_0; + msg->dest_dev = ice_sbq_dev_phy_0; if (!(quad % ICE_GET_QUAD_NUM(hw->ptp.ports_per_phy))) addr = Q_0_BASE + offset; @@ -4800,7 +4800,7 @@ static int ice_read_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 *val) msg.msg_addr_low = lower_16_bits(addr); msg.msg_addr_high = upper_16_bits(addr); msg.opcode = ice_sbq_msg_rd; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD); if (err) { @@ -4830,7 +4830,7 @@ static int ice_write_phy_reg_e810(struct ice_hw *hw, u32 addr, u32 val) msg.msg_addr_low = lower_16_bits(addr); msg.msg_addr_high = upper_16_bits(addr); msg.opcode = ice_sbq_msg_wr; - msg.dest_dev = rmn_0; + msg.dest_dev = ice_sbq_dev_phy_0; msg.data = val; err = ice_sbq_rw_reg(hw, &msg, ICE_AQ_FLAG_RD); diff --git a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h index 3b0054faf70c7..183dd5457d6ae 100644 --- a/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_sbq_cmd.h @@ -46,13 +46,10 @@ struct ice_sbq_evt_desc { u8 data[24]; }; -enum ice_sbq_msg_dev { - eth56g_phy_0 = 0x02, - rmn_0 = 0x02, - rmn_1 = 0x03, - rmn_2 = 0x04, - cgu = 0x06, - eth56g_phy_1 = 0x0D, +enum ice_sbq_dev_id { + ice_sbq_dev_phy_0 = 0x02, + ice_sbq_dev_cgu = 0x06, + ice_sbq_dev_phy_0_peer = 0x0D, }; enum ice_sbq_msg_opcode { From e2193f9f9ec989d7d3e5ff1fd96f71abc4426fc5 Mon Sep 17 00:00:00 2001 From: Karol Kolacinski Date: Thu, 20 Mar 2025 14:15:38 +0100 Subject: [PATCH 079/770] ice: enable timesync operation on 2xNAC E825 devices According to the E825C specification, SBQ address for ports on a single complex is device 2 for PHY 0 and device 13 for PHY1. For accessing ports on a dual complex E825C (so called 2xNAC mode), the driver should use destination device 2 (referred as phy_0) for the current complex PHY and device 13 (referred as phy_0_peer) for peer complex PHY. Differentiate SBQ destination device by checking if current PF port number is on the same PHY as target port number. Adjust 'ice_get_lane_number' function to provide unique port number for ports from PHY1 in 'dual' mode config (by adding fixed offset for PHY1 ports). Cache this value in ice_hw struct. Introduce ice_get_primary_hw wrapper to get access to timesync register not available from second NAC. 
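Sketched from the ice_get_phy_lane_number() hunk below, the lane-number adjustment is:

    if (hw->pf_id == lport) {
        /* Offset the secondary complex so that every port gets a
         * unique driver-wide number; the result is cached as
         * hw->lane_num during ice_init_hw().
         */
        if (hw->mac_type == ICE_MAC_GENERIC_3K_E825 &&
            ice_is_dual(hw) && !ice_is_primary(hw))
            lane += ICE_PORTS_PER_QUAD;
        return lane;
    }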
Reviewed-by: Simon Horman Reviewed-by: Przemek Kitszel Signed-off-by: Karol Kolacinski Co-developed-by: Grzegorz Nitka Signed-off-by: Grzegorz Nitka Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice.h | 60 ++++++++++++++++++++- drivers/net/ethernet/intel/ice/ice_common.c | 6 ++- drivers/net/ethernet/intel/ice/ice_ptp.c | 49 ++++++++++++----- drivers/net/ethernet/intel/ice/ice_ptp_hw.c | 39 +++++++++++--- drivers/net/ethernet/intel/ice/ice_ptp_hw.h | 5 -- drivers/net/ethernet/intel/ice/ice_type.h | 1 + 6 files changed, 134 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 2694951a0b1d6..e42572ae7631c 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -193,8 +193,6 @@ #define ice_pf_to_dev(pf) (&((pf)->pdev->dev)) -#define ice_pf_src_tmr_owned(pf) ((pf)->hw.func_caps.ts_func_info.src_tmr_owned) - enum ice_feature { ICE_F_DSCP, ICE_F_PHY_RCLK, @@ -1046,4 +1044,62 @@ static inline void ice_clear_rdma_cap(struct ice_pf *pf) } extern const struct xdp_metadata_ops ice_xdp_md_ops; + +/** + * ice_is_dual - Check if given config is multi-NAC + * @hw: pointer to HW structure + * + * Return: true if the device is running in mutli-NAC (Network + * Acceleration Complex) configuration variant, false otherwise + * (always false for non-E825 devices). + */ +static inline bool ice_is_dual(struct ice_hw *hw) +{ + return hw->mac_type == ICE_MAC_GENERIC_3K_E825 && + (hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_DUAL_M); +} + +/** + * ice_is_primary - Check if given device belongs to the primary complex + * @hw: pointer to HW structure + * + * Check if given PF/HW is running on primary complex in multi-NAC + * configuration. + * + * Return: true if the device is dual, false otherwise (always true + * for non-E825 devices). + */ +static inline bool ice_is_primary(struct ice_hw *hw) +{ + return hw->mac_type != ICE_MAC_GENERIC_3K_E825 || + !ice_is_dual(hw) || + (hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_PRIMARY_M); +} + +/** + * ice_pf_src_tmr_owned - Check if a primary timer is owned by PF + * @pf: pointer to PF structure + * + * Return: true if PF owns primary timer, false otherwise. + */ +static inline bool ice_pf_src_tmr_owned(struct ice_pf *pf) +{ + return pf->hw.func_caps.ts_func_info.src_tmr_owned && + ice_is_primary(&pf->hw); +} + +/** + * ice_get_primary_hw - Get pointer to primary ice_hw structure + * @pf: pointer to PF structure + * + * Return: A pointer to ice_hw structure with access to timesync + * register space. 
+ */ +static inline struct ice_hw *ice_get_primary_hw(struct ice_pf *pf) +{ + if (!pf->adapter->ctrl_pf) + return &pf->hw; + else + return &pf->adapter->ctrl_pf->hw; +} #endif /* _ICE_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index bec6b3e6b4de6..4fedf0181c4e1 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -1135,6 +1135,8 @@ int ice_init_hw(struct ice_hw *hw) } } + hw->lane_num = ice_get_phy_lane_number(hw); + return 0; err_unroll_fltr_mgmt_struct: ice_cleanup_fltr_mgmt_struct(hw); @@ -4082,10 +4084,12 @@ int ice_get_phy_lane_number(struct ice_hw *hw) continue; if (hw->pf_id == lport) { + if (hw->mac_type == ICE_MAC_GENERIC_3K_E825 && + ice_is_dual(hw) && !ice_is_primary(hw)) + lane += ICE_PORTS_PER_QUAD; kfree(options); return lane; } - lport++; } diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 1fd1ae03eb909..0ea5e52303f22 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -305,6 +305,9 @@ u64 ice_ptp_read_src_clk_reg(struct ice_pf *pf, u32 hi, lo, lo2; u8 tmr_idx; + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + tmr_idx = ice_get_ptp_src_clock_index(hw); guard(spinlock)(&pf->adapter->ptp_gltsyn_time_lock); /* Read the system timestamp pre PHC read */ @@ -2985,6 +2988,32 @@ static void ice_ptp_periodic_work(struct kthread_work *work) msecs_to_jiffies(err ? 10 : 500)); } +/** + * ice_ptp_prepare_rebuild_sec - Prepare second NAC for PTP reset or rebuild + * @pf: Board private structure + * @rebuild: rebuild if true, prepare if false + * @reset_type: the reset type being performed + */ +static void ice_ptp_prepare_rebuild_sec(struct ice_pf *pf, bool rebuild, + enum ice_reset_req reset_type) +{ + struct list_head *entry; + + list_for_each(entry, &pf->adapter->ports.ports) { + struct ice_ptp_port *port = list_entry(entry, + struct ice_ptp_port, + list_node); + struct ice_pf *peer_pf = ptp_port_to_pf(port); + + if (!ice_is_primary(&peer_pf->hw)) { + if (rebuild) + ice_ptp_rebuild(peer_pf, reset_type); + else + ice_ptp_prepare_for_reset(peer_pf, reset_type); + } + } +} + /** * ice_ptp_prepare_for_reset - Prepare PTP for reset * @pf: Board private structure @@ -2993,6 +3022,7 @@ static void ice_ptp_periodic_work(struct kthread_work *work) void ice_ptp_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) { struct ice_ptp *ptp = &pf->ptp; + struct ice_hw *hw = &pf->hw; u8 src_tmr; if (ptp->state != ICE_PTP_READY) @@ -3008,6 +3038,9 @@ void ice_ptp_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type) if (reset_type == ICE_RESET_PFR) return; + if (ice_pf_src_tmr_owned(pf) && hw->mac_type == ICE_MAC_GENERIC_3K_E825) + ice_ptp_prepare_rebuild_sec(pf, false, reset_type); + ice_ptp_release_tx_tracker(pf, &pf->ptp.port.tx); /* Disable periodic outputs */ @@ -3129,13 +3162,6 @@ void ice_ptp_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type) dev_err(ice_pf_to_dev(pf), "PTP reset failed %d\n", err); } -static bool ice_is_primary(struct ice_hw *hw) -{ - return hw->mac_type == ICE_MAC_GENERIC_3K_E825 && ice_is_dual(hw) ? 
- !!(hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_PRIMARY_M) : - true; -} - static int ice_ptp_setup_adapter(struct ice_pf *pf) { if (!ice_pf_src_tmr_owned(pf) || !ice_is_primary(&pf->hw)) @@ -3355,17 +3381,16 @@ void ice_ptp_init(struct ice_pf *pf) { struct ice_ptp *ptp = &pf->ptp; struct ice_hw *hw = &pf->hw; - int lane_num, err; + int err; ptp->state = ICE_PTP_INITIALIZING; - lane_num = ice_get_phy_lane_number(hw); - if (lane_num < 0) { - err = lane_num; + if (hw->lane_num < 0) { + err = hw->lane_num; goto err_exit; } + ptp->port.port_num = hw->lane_num; - ptp->port.port_num = (u8)lane_num; ice_ptp_init_hw(hw); ice_ptp_init_tx_interrupt_mode(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c index eb1893dd89799..ccac84eb34c9e 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.c @@ -874,8 +874,12 @@ static u32 ice_ptp_tmr_cmd_to_port_reg(struct ice_hw *hw, */ void ice_ptp_src_cmd(struct ice_hw *hw, enum ice_ptp_tmr_cmd cmd) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u32 cmd_val = ice_ptp_tmr_cmd_to_src_reg(hw, cmd); + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + wr32(hw, GLTSYN_CMD, cmd_val); } @@ -891,6 +895,9 @@ static void ice_ptp_exec_tmr_cmd(struct ice_hw *hw) { struct ice_pf *pf = container_of(hw, struct ice_pf, hw); + if (!ice_is_primary(hw)) + hw = ice_get_primary_hw(pf); + guard(spinlock)(&pf->adapter->ptp_gltsyn_time_lock); wr32(hw, GLTSYN_CMD_SYNC, SYNC_EXEC_CMD); ice_flush(hw); @@ -922,10 +929,18 @@ static void ice_ptp_cfg_sync_delay(const struct ice_hw *hw, u32 delay) static enum ice_sbq_dev_id ice_ptp_get_dest_dev_e825(struct ice_hw *hw, u8 port) { - /* On a single complex E825, PHY 0 is always destination device phy_0 + u8 curr_phy, tgt_phy; + + tgt_phy = port >= hw->ptp.ports_per_phy; + curr_phy = hw->lane_num >= hw->ptp.ports_per_phy; + /* In the driver, lanes 4..7 are in fact 0..3 on a second PHY. + * On a single complex E825C, PHY 0 is always destination device phy_0 * and PHY 1 is phy_0_peer. + * On dual complex E825C, device phy_0 points to PHY on a current + * complex and phy_0_peer to PHY on a different complex. 
*/ - if (port >= hw->ptp.ports_per_phy) + if ((!ice_is_dual(hw) && tgt_phy == 1) || + (ice_is_dual(hw) && tgt_phy != curr_phy)) return ice_sbq_dev_phy_0_peer; else return ice_sbq_dev_phy_0; @@ -2417,6 +2432,7 @@ int ice_phy_cfg_intr_eth56g(struct ice_hw *hw, u8 port, bool ena, u8 threshold) static int ice_read_phy_and_phc_time_eth56g(struct ice_hw *hw, u8 port, u64 *phy_time, u64 *phc_time) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u64 tx_time, rx_time; u32 zo, lo; u8 tmr_idx; @@ -2436,8 +2452,13 @@ static int ice_read_phy_and_phc_time_eth56g(struct ice_hw *hw, u8 port, ice_ptp_exec_tmr_cmd(hw); /* Read the captured PHC time from the shadow time registers */ - zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); - lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + if (ice_is_primary(hw)) { + zo = rd32(hw, GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(hw, GLTSYN_SHTIME_L(tmr_idx)); + } else { + zo = rd32(ice_get_primary_hw(pf), GLTSYN_SHTIME_0(tmr_idx)); + lo = rd32(ice_get_primary_hw(pf), GLTSYN_SHTIME_L(tmr_idx)); + } *phc_time = (u64)lo << 32 | zo; /* Read the captured PHY time from the PHY shadow registers */ @@ -2574,6 +2595,7 @@ int ice_stop_phy_timer_eth56g(struct ice_hw *hw, u8 port, bool soft_reset) */ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) { + struct ice_pf *pf = container_of(hw, struct ice_pf, hw); u32 lo, hi; u64 incval; u8 tmr_idx; @@ -2599,8 +2621,13 @@ int ice_start_phy_timer_eth56g(struct ice_hw *hw, u8 port) if (err) return err; - lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx)); - hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx)); + if (ice_is_primary(hw)) { + lo = rd32(hw, GLTSYN_INCVAL_L(tmr_idx)); + hi = rd32(hw, GLTSYN_INCVAL_H(tmr_idx)); + } else { + lo = rd32(ice_get_primary_hw(pf), GLTSYN_INCVAL_L(tmr_idx)); + hi = rd32(ice_get_primary_hw(pf), GLTSYN_INCVAL_H(tmr_idx)); + } incval = (u64)hi << 32 | lo; err = ice_write_40b_ptp_reg_eth56g(hw, port, PHY_REG_TIMETUS_L, incval); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h index e5925ccc26132..83f20fa7ace74 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp_hw.h +++ b/drivers/net/ethernet/intel/ice/ice_ptp_hw.h @@ -444,11 +444,6 @@ static inline u64 ice_get_base_incval(struct ice_hw *hw) } } -static inline bool ice_is_dual(struct ice_hw *hw) -{ - return !!(hw->dev_caps.nac_topo.mode & ICE_NAC_TOPO_DUAL_M); -} - #define PFTSYN_SEM_BYTES 4 #define ICE_PTP_CLOCK_INDEX_0 0x00 diff --git a/drivers/net/ethernet/intel/ice/ice_type.h b/drivers/net/ethernet/intel/ice/ice_type.h index 0aab21113cc43..ccf53cc6403e9 100644 --- a/drivers/net/ethernet/intel/ice/ice_type.h +++ b/drivers/net/ethernet/intel/ice/ice_type.h @@ -970,6 +970,7 @@ struct ice_hw { u8 intrl_gran; struct ice_ptp_hw ptp; + s8 lane_num; /* Active package version (currently active) */ struct ice_pkg_ver active_pkg_ver; From 6cb10c063d6c2b4ae659f8221f4dc70343185319 Mon Sep 17 00:00:00 2001 From: Martyna Szapar-Mudlaw Date: Fri, 14 Mar 2025 09:11:11 +0100 Subject: [PATCH 080/770] ice: improve error message for insufficient filter space When adding a rule to switch through tc, if the operation fails due to not enough free recipes (-ENOSPC), provide a clearer error message: "Unable to add filter: insufficient space available." This improves user feedback by distinguishing space limitations from other generic failures. 
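Condensed, the resulting call-site error mapping looks like this (shape of the hunks below; surrounding branches elided):

    ret = ice_add_adv_rule(&pf->hw, list, lkups_cnt, &rule_info, &rule_added);
    if (ret == -ENOSPC) {
        NL_SET_ERR_MSG_MOD(fltr->extack,
                           "Unable to add filter: insufficient space available.");
        goto exit;
    } else if (ret) {
        NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter due to error");
        goto exit;
    }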
Reviewed-by: Michal Swiatkowski Signed-off-by: Martyna Szapar-Mudlaw Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_tc_lib.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c index 49b87071b7dc1..fb9ea7f8ef444 100644 --- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c @@ -996,6 +996,9 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr) NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter because it already exist"); ret = -EINVAL; goto exit; + } else if (ret == -ENOSPC) { + NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter: insufficient space available."); + goto exit; } else if (ret) { NL_SET_ERR_MSG_MOD(fltr->extack, "Unable to add filter due to error"); goto exit; @@ -1228,6 +1231,10 @@ ice_add_tc_flower_adv_fltr(struct ice_vsi *vsi, "Unable to add filter because it already exist"); ret = -EINVAL; goto exit; + } else if (ret == -ENOSPC) { + NL_SET_ERR_MSG_MOD(tc_fltr->extack, + "Unable to add filter: insufficient space available."); + goto exit; } else if (ret) { NL_SET_ERR_MSG_MOD(tc_fltr->extack, "Unable to add filter due to error"); From fee4a79a12240af90283034709518296544a9039 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Mar 2025 17:20:24 +0000 Subject: [PATCH 081/770] ice: make const read-only array dflt_rules static Don't populate the const read-only array dflt_rules on the stack at run time, instead make it static. Signed-off-by: Colin Ian King Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c index 1d118171de378..aceec184e89b2 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool_fdir.c @@ -1605,7 +1605,7 @@ void ice_fdir_replay_fltrs(struct ice_pf *pf) */ int ice_fdir_create_dflt_rules(struct ice_pf *pf) { - const enum ice_fltr_ptype dflt_rules[] = { + static const enum ice_fltr_ptype dflt_rules[] = { ICE_FLTR_PTYPE_NONF_IPV4_TCP, ICE_FLTR_PTYPE_NONF_IPV4_UDP, ICE_FLTR_PTYPE_NONF_IPV6_TCP, ICE_FLTR_PTYPE_NONF_IPV6_UDP, }; From 015bac5daca978448f2671478c553ce1f300c21e Mon Sep 17 00:00:00 2001 From: Kyungwook Boo Date: Tue, 11 Mar 2025 14:16:02 +0900 Subject: [PATCH 082/770] i40e: fix MMIO write access to an invalid page in i40e_clear_hw When the device sends a specific input, an integer underflow can occur, leading to MMIO write access to an invalid page. Prevent the integer underflow by changing the type of related variables. 
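To make the underflow concrete, a hedged sketch of the affected loop shape (simplified from i40e_clear_hw(); register and variable names as recalled from the i40e code, treat as illustrative):

    u32 num_pf_int;   /* parsed from a device-controlled register field;
                       * may legitimately be 0 or 1 */
    u32 i;

    for (i = 0; i < num_pf_int - 2; i++)        /* 1 - 2 wraps to 0xFFFFFFFF */
        wr32(hw, I40E_PFINT_DYN_CTLN(i), val);  /* MMIO writes far out of range */

    /* With s32 types, as in the fix below, the subtraction yields -1 and
     * the comparison is signed, so the loop body is simply never entered.
     */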
Signed-off-by: Kyungwook Boo Link: https://lore.kernel.org/lkml/ffc91764-1142-4ba2-91b6-8c773f6f7095@gmail.com/T/ Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_common.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 370b4bddee441..b11c35e307ca9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -817,10 +817,11 @@ int i40e_pf_reset(struct i40e_hw *hw) void i40e_clear_hw(struct i40e_hw *hw) { u32 num_queues, base_queue; - u32 num_pf_int; - u32 num_vf_int; + s32 num_pf_int; + s32 num_vf_int; u32 num_vfs; - u32 i, j; + s32 i; + u32 j; u32 val; u32 eol = 0x7ff; From cdcb3804eeda24d588348bbab6766cf14fddbeaa Mon Sep 17 00:00:00 2001 From: Rand Deeb Date: Thu, 6 Mar 2025 13:12:00 +0300 Subject: [PATCH 083/770] ixgbe: Fix unreachable retry logic in combined and byte I2C write functions The current implementation of `ixgbe_write_i2c_combined_generic_int` and `ixgbe_write_i2c_byte_generic_int` sets `max_retry` to `1`, which makes the condition `retry < max_retry` always evaluate to `false`. This renders the retry mechanism ineffective, as the debug message and retry logic are never executed. This patch increases `max_retry` to `3` in both functions, aligning them with the retry logic in `ixgbe_read_i2c_combined_generic_int`. This ensures that the retry mechanism functions as intended, improving robustness in case of I2C write failures. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Rand Deeb Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c index 0a03a8bb5f886..2d54828bdfbbc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c @@ -167,7 +167,7 @@ int ixgbe_write_i2c_combined_generic_int(struct ixgbe_hw *hw, u8 addr, u16 reg, u16 val, bool lock) { u32 swfw_mask = hw->phy.phy_semaphore_mask; - int max_retry = 1; + int max_retry = 3; int retry = 0; u8 reg_high; u8 csum; @@ -2285,7 +2285,7 @@ static int ixgbe_write_i2c_byte_generic_int(struct ixgbe_hw *hw, u8 byte_offset, u8 dev_addr, u8 data, bool lock) { u32 swfw_mask = hw->phy.phy_semaphore_mask; - u32 max_retry = 1; + u32 max_retry = 3; u32 retry = 0; int status; From f9c961efb0f44b4319e73a3c40e831db2de4d074 Mon Sep 17 00:00:00 2001 From: Rui Salvaterra Date: Thu, 13 Mar 2025 09:35:22 +0000 Subject: [PATCH 084/770] igc: enable HW vlan tag insertion/stripping by default This is enabled by default in other Intel drivers I've checked (e1000, e1000e, iavf, igb and ice). Fixes an out-of-the-box performance issue when running OpenWrt on typical mini-PCs with igc-supported Ethernet controllers and 802.1Q VLAN configurations, as ethtool isn't part of the default packages and sane defaults are expected. In my specific case, with an Intel N100-based machine with four I226-V Ethernet controllers, my upload performance increased from under 30 Mb/s to the expected ~1 Gb/s. 
Signed-off-by: Rui Salvaterra Reviewed-by: Simon Horman Reviewed-by: Vitaly Lifshits Reviewed-by: Kurt Kanzenbach Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index f1330379e6bbc..30c72c3432626 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7125,6 +7125,9 @@ static int igc_probe(struct pci_dev *pdev, netdev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_XSK_ZEROCOPY; + /* enable HW vlan tag insertion/stripping by default */ + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; + /* MTU range: 68 - 9216 */ netdev->min_mtu = ETH_MIN_MTU; netdev->max_mtu = MAX_STD_JUMBO_FRAME_SIZE; From 39aa687a849489897a15cd8a2d8a4b513def6237 Mon Sep 17 00:00:00 2001 From: Piotr Wejman Date: Sat, 22 Feb 2025 13:46:29 +0100 Subject: [PATCH 085/770] net: e1000e: convert to ndo_hwtstamp_get() and ndo_hwtstamp_set() Update the driver to use the new hardware timestamping API added in commit 66f7223039c0 ("net: add NDOs for configuring hardware timestamping"). Use Netlink extack for error reporting in e1000e_config_hwtstamp. Align the indentation of net_device_ops. Reviewed-by: Vadim Fedorenko Reviewed-by: Simon Horman Reviewed-by: Vitaly Lifshits Signed-off-by: Piotr Wejman Tested-by: Avigail Dahan Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/e1000.h | 2 +- drivers/net/ethernet/intel/e1000e/netdev.c | 75 +++++++++++----------- 2 files changed, 38 insertions(+), 39 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index ba9c19e6994c9..952898151565f 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -319,7 +319,7 @@ struct e1000_adapter { u16 tx_ring_count; u16 rx_ring_count; - struct hwtstamp_config hwtstamp_config; + struct kernel_hwtstamp_config hwtstamp_config; struct delayed_work systim_overflow_work; struct sk_buff *tx_hwtstamp_skb; unsigned long tx_hwtstamp_start; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 8ebcb6a7d608a..e0f492a6723fa 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -3574,6 +3574,7 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) * e1000e_config_hwtstamp - configure the hwtstamp registers and enable/disable * @adapter: board private structure * @config: timestamp configuration + * @extack: netlink extended ACK for error report * * Outgoing time stamping can be enabled and disabled. Play nice and * disable it when requested, although it shouldn't cause any overhead @@ -3587,7 +3588,8 @@ s32 e1000e_get_base_timinca(struct e1000_adapter *adapter, u32 *timinca) * exception of "all V2 events regardless of level 2 or 4". 
**/ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, - struct hwtstamp_config *config) + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct e1000_hw *hw = &adapter->hw; u32 tsync_tx_ctl = E1000_TSYNCTXCTL_ENABLED; @@ -3598,8 +3600,10 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, bool is_l2 = false; u32 regval; - if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP)) + if (!(adapter->flags & FLAG_HAS_HW_TIMESTAMP)) { + NL_SET_ERR_MSG(extack, "No HW timestamp support"); return -EINVAL; + } switch (config->tx_type) { case HWTSTAMP_TX_OFF: @@ -3608,6 +3612,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, case HWTSTAMP_TX_ON: break; default: + NL_SET_ERR_MSG(extack, "Unsupported TX HW timestamp type"); return -ERANGE; } @@ -3681,6 +3686,7 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, config->rx_filter = HWTSTAMP_FILTER_ALL; break; default: + NL_SET_ERR_MSG(extack, "Unsupported RX HW timestamp filter"); return -ERANGE; } @@ -3693,7 +3699,8 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, ew32(TSYNCTXCTL, regval); if ((er32(TSYNCTXCTL) & E1000_TSYNCTXCTL_ENABLED) != (regval & E1000_TSYNCTXCTL_ENABLED)) { - e_err("Timesync Tx Control register not set as expected\n"); + NL_SET_ERR_MSG(extack, + "Timesync Tx Control register not set as expected"); return -EAGAIN; } @@ -3706,7 +3713,8 @@ static int e1000e_config_hwtstamp(struct e1000_adapter *adapter, E1000_TSYNCRXCTL_TYPE_MASK)) != (regval & (E1000_TSYNCRXCTL_ENABLED | E1000_TSYNCRXCTL_TYPE_MASK))) { - e_err("Timesync Rx Control register not set as expected\n"); + NL_SET_ERR_MSG(extack, + "Timesync Rx Control register not set as expected"); return -EAGAIN; } @@ -3901,6 +3909,7 @@ static void e1000e_systim_reset(struct e1000_adapter *adapter) { struct ptp_clock_info *info = &adapter->ptp_clock_info; struct e1000_hw *hw = &adapter->hw; + struct netlink_ext_ack extack = {}; unsigned long flags; u32 timinca; s32 ret_val; @@ -3932,7 +3941,12 @@ static void e1000e_systim_reset(struct e1000_adapter *adapter) spin_unlock_irqrestore(&adapter->systim_lock, flags); /* restore the previous hwtstamp configuration settings */ - e1000e_config_hwtstamp(adapter, &adapter->hwtstamp_config); + ret_val = e1000e_config_hwtstamp(adapter, &adapter->hwtstamp_config, + &extack); + if (ret_val) { + if (extack._msg) + e_err("%s\n", extack._msg); + } } /** @@ -6079,8 +6093,7 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu) return 0; } -static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, - int cmd) +static int e1000_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) { struct e1000_adapter *adapter = netdev_priv(netdev); struct mii_ioctl_data *data = if_mii(ifr); @@ -6140,7 +6153,8 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, /** * e1000e_hwtstamp_set - control hardware time stamping * @netdev: network interface device structure - * @ifr: interface request + * @config: timestamp configuration + * @extack: netlink extended ACK report * * Outgoing time stamping can be enabled and disabled. Play nice and * disable it when requested, although it shouldn't cause any overhead @@ -6153,20 +6167,18 @@ static int e1000_mii_ioctl(struct net_device *netdev, struct ifreq *ifr, * specified. Matching the kind of event packet is not supported, with the * exception of "all V2 events regardless of level 2 or 4". 
**/ -static int e1000e_hwtstamp_set(struct net_device *netdev, struct ifreq *ifr) +static int e1000e_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack) { struct e1000_adapter *adapter = netdev_priv(netdev); - struct hwtstamp_config config; int ret_val; - if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) - return -EFAULT; - - ret_val = e1000e_config_hwtstamp(adapter, &config); + ret_val = e1000e_config_hwtstamp(adapter, config, extack); if (ret_val) return ret_val; - switch (config.rx_filter) { + switch (config->rx_filter) { case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: case HWTSTAMP_FILTER_PTP_V2_SYNC: @@ -6178,38 +6190,23 @@ static int e1000e_hwtstamp_set(struct net_device *netdev, struct ifreq *ifr) * by hardware so notify the caller the requested packets plus * some others are time stamped. */ - config.rx_filter = HWTSTAMP_FILTER_SOME; + config->rx_filter = HWTSTAMP_FILTER_SOME; break; default: break; } - return copy_to_user(ifr->ifr_data, &config, - sizeof(config)) ? -EFAULT : 0; + return 0; } -static int e1000e_hwtstamp_get(struct net_device *netdev, struct ifreq *ifr) +static int e1000e_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *kernel_config) { struct e1000_adapter *adapter = netdev_priv(netdev); - return copy_to_user(ifr->ifr_data, &adapter->hwtstamp_config, - sizeof(adapter->hwtstamp_config)) ? -EFAULT : 0; -} + *kernel_config = adapter->hwtstamp_config; -static int e1000_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) -{ - switch (cmd) { - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSMIIREG: - return e1000_mii_ioctl(netdev, ifr, cmd); - case SIOCSHWTSTAMP: - return e1000e_hwtstamp_set(netdev, ifr); - case SIOCGHWTSTAMP: - return e1000e_hwtstamp_get(netdev, ifr); - default: - return -EOPNOTSUPP; - } + return 0; } static int e1000_init_phy_wakeup(struct e1000_adapter *adapter, u32 wufc) @@ -7346,9 +7343,11 @@ static const struct net_device_ops e1000e_netdev_ops = { #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = e1000_netpoll, #endif - .ndo_set_features = e1000_set_features, - .ndo_fix_features = e1000_fix_features, + .ndo_set_features = e1000_set_features, + .ndo_fix_features = e1000_fix_features, .ndo_features_check = passthru_features_check, + .ndo_hwtstamp_get = e1000e_hwtstamp_get, + .ndo_hwtstamp_set = e1000e_hwtstamp_set, }; /** From faeefc173be40512341b102cf1568aa0b6571acd Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Thu, 10 Apr 2025 09:01:27 +0800 Subject: [PATCH 086/770] sock: Correct error checking condition for (assign|release)_proto_idx() (assign|release)_proto_idx() wrongly check find_first_zero_bit() failure by condition '(prot->inuse_idx == PROTO_INUSE_NR - 1)' obviously. 
Fix by correcting the condition to '(prot->inuse_idx == PROTO_INUSE_NR)' Signed-off-by: Zijun Hu Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410-fix_net-v2-1-d69e7c5739a4@quicinc.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index e54449c9ab0ba..121f640112889 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4004,7 +4004,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -4015,7 +4015,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else From ba5560e53dacefddf8c47802b7a30b2e53afdcb8 Mon Sep 17 00:00:00 2001 From: "Lucien.Jheng" Date: Wed, 9 Apr 2025 23:09:02 +0800 Subject: [PATCH 087/770] net: phy: air_en8811h: Add clk provider for CKO pin EN8811H outputs 25MHz or 50MHz clocks on CKO, selected by GPIO3. CKO clock operates continuously from power-up through md32 loading. Implement clk provider driver so we can disable the clock output in case it isn't needed, which also helps to reduce EMF noise Signed-off-by: Lucien.Jheng Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250409150902.3596-1-lucienx123@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/air_en8811h.c | 103 +++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/air_en8811h.c b/drivers/net/phy/air_en8811h.c index e9fd24cb72709..57fbd8df94388 100644 --- a/drivers/net/phy/air_en8811h.c +++ b/drivers/net/phy/air_en8811h.c @@ -11,6 +11,7 @@ * Copyright (C) 2023 Airoha Technology Corp. */ +#include #include #include #include @@ -115,6 +116,11 @@ #define EN8811H_GPIO_OUTPUT 0xcf8b8 #define EN8811H_GPIO_OUTPUT_345 (BIT(3) | BIT(4) | BIT(5)) +#define EN8811H_HWTRAP1 0xcf914 +#define EN8811H_HWTRAP1_CKO BIT(12) +#define EN8811H_CLK_CGM 0xcf958 +#define EN8811H_CLK_CGM_CKO BIT(26) + #define EN8811H_FW_CTRL_1 0x0f0018 #define EN8811H_FW_CTRL_1_START 0x0 #define EN8811H_FW_CTRL_1_FINISH 0x1 @@ -142,10 +148,15 @@ struct led { unsigned long state; }; +#define clk_hw_to_en8811h_priv(_hw) \ + container_of(_hw, struct en8811h_priv, hw) + struct en8811h_priv { - u32 firmware_version; - bool mcu_needs_restart; - struct led led[EN8811H_LED_COUNT]; + u32 firmware_version; + bool mcu_needs_restart; + struct led led[EN8811H_LED_COUNT]; + struct clk_hw hw; + struct phy_device *phydev; }; enum { @@ -806,6 +817,86 @@ static int en8811h_led_hw_is_supported(struct phy_device *phydev, u8 index, return 0; }; +static unsigned long en8811h_clk_recalc_rate(struct clk_hw *hw, + unsigned long parent) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + u32 pbus_value; + int ret; + + ret = air_buckpbus_reg_read(phydev, EN8811H_HWTRAP1, &pbus_value); + if (ret < 0) + return ret; + + return (pbus_value & EN8811H_HWTRAP1_CKO) ? 
50000000 : 25000000; +} + +static int en8811h_clk_enable(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + + return air_buckpbus_reg_modify(phydev, EN8811H_CLK_CGM, + EN8811H_CLK_CGM_CKO, + EN8811H_CLK_CGM_CKO); +} + +static void en8811h_clk_disable(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + + air_buckpbus_reg_modify(phydev, EN8811H_CLK_CGM, + EN8811H_CLK_CGM_CKO, 0); +} + +static int en8811h_clk_is_enabled(struct clk_hw *hw) +{ + struct en8811h_priv *priv = clk_hw_to_en8811h_priv(hw); + struct phy_device *phydev = priv->phydev; + u32 pbus_value; + int ret; + + ret = air_buckpbus_reg_read(phydev, EN8811H_CLK_CGM, &pbus_value); + if (ret < 0) + return ret; + + return (pbus_value & EN8811H_CLK_CGM_CKO); +} + +static const struct clk_ops en8811h_clk_ops = { + .recalc_rate = en8811h_clk_recalc_rate, + .enable = en8811h_clk_enable, + .disable = en8811h_clk_disable, + .is_enabled = en8811h_clk_is_enabled, +}; + +static int en8811h_clk_provider_setup(struct device *dev, struct clk_hw *hw) +{ + struct clk_init_data init; + int ret; + + if (!IS_ENABLED(CONFIG_COMMON_CLK)) + return 0; + + init.name = devm_kasprintf(dev, GFP_KERNEL, "%s-cko", + fwnode_get_name(dev_fwnode(dev))); + if (!init.name) + return -ENOMEM; + + init.ops = &en8811h_clk_ops; + init.flags = 0; + init.num_parents = 0; + hw->init = &init; + + ret = devm_clk_hw_register(dev, hw); + if (ret) + return ret; + + return devm_of_clk_add_hw_provider(dev, of_clk_hw_simple_get, hw); +} + static int en8811h_probe(struct phy_device *phydev) { struct en8811h_priv *priv; @@ -838,6 +929,12 @@ static int en8811h_probe(struct phy_device *phydev) return ret; } + priv->phydev = phydev; + /* Co-Clock Output */ + ret = en8811h_clk_provider_setup(&phydev->mdio.dev, &priv->hw); + if (ret) + return ret; + /* Configure led gpio pins as output */ ret = air_buckpbus_reg_modify(phydev, EN8811H_GPIO_OUTPUT, EN8811H_GPIO_OUTPUT_345, From b2bdce7adc9027ae25d3dd864a58c435bcfcabac Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:44 -0700 Subject: [PATCH 088/770] selftest: net: Remove DCCP bits. We will remove DCCP. Let's remove DCCP bits from selftest. 
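For reference, the run-time capability probe this removes boiled down
to the following (condensed from the deleted test_proto() hunk below):

        /* condensed sketch of the removed DCCP availability check */
        int fd = socket(AF_INET, SOCK_DCCP, 0);

        if (fd < 0) {
                if (errno == ESOCKTNOSUPPORT) {
                        fprintf(stderr, "DCCP not supported: skipping DCCP tests\n");
                        return;
                }
                error(1, errno, "failed to create a DCCP socket");
        }
        close(fd);

With DCCP gone there is nothing left to probe, so this check and the
SOL_DCCP/DCCP_SOCKOPT_SERVICE setup it guarded are removed as well.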
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/config | 1 - .../selftests/net/reuseport_addr_any.c | 36 +------------------ 2 files changed, 1 insertion(+), 36 deletions(-) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 130d532b7e67a..3cfef51538230 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -33,7 +33,6 @@ CONFIG_NETFILTER_ADVANCED=y CONFIG_NF_CONNTRACK=m CONFIG_IPV6_MROUTE=y CONFIG_IPV6_SIT=y -CONFIG_IP_DCCP=m CONFIG_NF_NAT=m CONFIG_IP6_NF_IPTABLES=m CONFIG_IP_NF_IPTABLES=m diff --git a/tools/testing/selftests/net/reuseport_addr_any.c b/tools/testing/selftests/net/reuseport_addr_any.c index b8475cb29be7a..1c43401a1c807 100644 --- a/tools/testing/selftests/net/reuseport_addr_any.c +++ b/tools/testing/selftests/net/reuseport_addr_any.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -21,10 +20,6 @@ #include #include -#ifndef SOL_DCCP -#define SOL_DCCP 269 -#endif - static const char *IP4_ADDR = "127.0.0.1"; static const char *IP6_ADDR = "::1"; static const char *IP4_MAPPED6 = "::ffff:127.0.0.1"; @@ -86,15 +81,6 @@ static void build_rcv_fd(int family, int proto, int *rcv_fds, int count, if (proto == SOCK_STREAM && listen(rcv_fds[i], 10)) error(1, errno, "tcp: failed to listen on receive port"); - else if (proto == SOCK_DCCP) { - if (setsockopt(rcv_fds[i], SOL_DCCP, - DCCP_SOCKOPT_SERVICE, - &(int) {htonl(42)}, sizeof(int))) - error(1, errno, "failed to setsockopt"); - - if (listen(rcv_fds[i], 10)) - error(1, errno, "dccp: failed to listen on receive port"); - } } } @@ -148,11 +134,6 @@ static int connect_and_send(int family, int proto) if (fd < 0) error(1, errno, "failed to create send socket"); - if (proto == SOCK_DCCP && - setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE, - &(int){htonl(42)}, sizeof(int))) - error(1, errno, "failed to setsockopt"); - if (bind(fd, saddr, sz)) error(1, errno, "failed to bind send socket"); @@ -175,7 +156,7 @@ static int receive_once(int epfd, int proto) if (i < 0) error(1, errno, "epoll_wait failed"); - if (proto == SOCK_STREAM || proto == SOCK_DCCP) { + if (proto == SOCK_STREAM) { fd = accept(ev.data.fd, NULL, NULL); if (fd < 0) error(1, errno, "failed to accept"); @@ -243,20 +224,6 @@ static void run_one_test(int fam_send, int fam_rcv, int proto, static void test_proto(int proto, const char *proto_str) { - if (proto == SOCK_DCCP) { - int test_fd; - - test_fd = socket(AF_INET, proto, 0); - if (test_fd < 0) { - if (errno == ESOCKTNOSUPPORT) { - fprintf(stderr, "DCCP not supported: skipping DCCP tests\n"); - return; - } else - error(1, errno, "failed to create a DCCP socket"); - } - close(test_fd); - } - fprintf(stderr, "%s IPv4 ... ", proto_str); run_one_test(AF_INET, AF_INET, proto, IP4_ADDR); @@ -271,7 +238,6 @@ int main(void) { test_proto(SOCK_DGRAM, "UDP"); test_proto(SOCK_STREAM, "TCP"); - test_proto(SOCK_DCCP, "DCCP"); fprintf(stderr, "SUCCESS\n"); return 0; From 2a63dd0edf388802074f1d4d6b588a3b4c380688 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:45 -0700 Subject: [PATCH 089/770] net: Retire DCCP socket. DCCP was orphaned in 2021 by commit 054c4610bd05 ("MAINTAINERS: dccp: move Gerrit Renker to CREDITS"), which noted that the last maintainer had been inactive for five years. 
In recent years, it has become a playground for syzbot, and most changes to DCCP have been odd bug fixes triggered by syzbot. Apart from that, the only changes have been driven by treewide or networking API updates or adjustments related to TCP. Thus, in 2023, we announced we would remove DCCP in 2025 via commit b144fcaf46d4 ("dccp: Print deprecation notice."). Since then, only one individual has contacted the netdev mailing list. [0] There is ongoing research for Multipath DCCP. The repository is hosted on GitHub [1], and development is not taking place through the upstream community. While the repository is published under the GPLv2 license, the scheduling part remains proprietary, with a LICENSE file [2] stating: "This is not Open Source software." The researcher mentioned a plan to address the licensing issue, upstream the patches, and step up as a maintainer, but there has been no further communication since then. Maintaining DCCP for a decade without any real users has become a burden. Therefore, it's time to remove it. Removing DCCP will also provide significant benefits to TCP. It allows us to freely reorganize the layout of struct inet_connection_sock, which is currently shared with DCCP, and optimize it to reduce the number of cachelines accessed in the TCP fast path. Note that we keep DCCP netfilter modules as requested. [3] Link: https://lore.kernel.org/netdev/20230710182253.81446-1-kuniyu@amazon.com/T/#u #[0] Link: https://github.com/telekom/mp-dccp #[1] Link: https://github.com/telekom/mp-dccp/blob/mpdccp_v03_k5.10/net/dccp/non_gpl_scheduler/LICENSE #[2] Link: https://lore.kernel.org/netdev/Z_VQ0KlCRkqYWXa-@calendula/ #[3] Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Moore (LSM and SELinux) Acked-by: Casey Schaufler Link: https://patch.msgid.link/20250410023921.11307-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- Documentation/admin-guide/bug-hunting.rst | 2 +- Documentation/networking/dccp.rst | 219 --- Documentation/networking/index.rst | 1 - Documentation/networking/ip-sysctl.rst | 4 +- .../zh_CN/admin-guide/bug-hunting.rst | 2 +- .../zh_TW/admin-guide/bug-hunting.rst | 2 +- MAINTAINERS | 9 - arch/m68k/configs/amiga_defconfig | 2 - arch/m68k/configs/apollo_defconfig | 2 - arch/m68k/configs/atari_defconfig | 2 - arch/m68k/configs/bvme6000_defconfig | 2 - arch/m68k/configs/hp300_defconfig | 2 - arch/m68k/configs/mac_defconfig | 2 - arch/m68k/configs/multi_defconfig | 2 - arch/m68k/configs/mvme147_defconfig | 2 - arch/m68k/configs/mvme16x_defconfig | 2 - arch/m68k/configs/q40_defconfig | 2 - arch/m68k/configs/sun3_defconfig | 2 - arch/m68k/configs/sun3x_defconfig | 2 - arch/mips/configs/bigsur_defconfig | 1 - arch/mips/configs/gpr_defconfig | 1 - arch/mips/configs/mtx1_defconfig | 1 - arch/powerpc/configs/pmac32_defconfig | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - include/linux/dccp.h | 289 --- include/linux/tfrc.h | 51 - include/net/inet_hashtables.h | 6 +- include/net/rstreason.h | 2 +- include/net/secure_seq.h | 4 - include/trace/events/sock.h | 1 - include/trace/events/sunrpc.h | 2 - net/Kconfig | 1 - net/Makefile | 1 - net/core/secure_seq.c | 42 - net/core/sock_diag.c | 2 - net/dccp/Kconfig | 46 - net/dccp/Makefile | 30 - net/dccp/ackvec.c | 403 ----- net/dccp/ackvec.h | 136 -- net/dccp/ccid.c | 219 --- net/dccp/ccid.h | 262 --- net/dccp/ccids/Kconfig | 55 - net/dccp/ccids/ccid2.c | 794 --------- net/dccp/ccids/ccid2.h | 121 -- net/dccp/ccids/ccid3.c | 866 --------- net/dccp/ccids/ccid3.h | 148 -- net/dccp/ccids/lib/loss_interval.c | 184 -- 
net/dccp/ccids/lib/loss_interval.h | 69 - net/dccp/ccids/lib/packet_history.c | 439 ----- net/dccp/ccids/lib/packet_history.h | 142 -- net/dccp/ccids/lib/tfrc.c | 46 - net/dccp/ccids/lib/tfrc.h | 73 - net/dccp/ccids/lib/tfrc_equation.c | 702 -------- net/dccp/dccp.h | 483 ----- net/dccp/diag.c | 85 - net/dccp/feat.c | 1581 ----------------- net/dccp/feat.h | 133 -- net/dccp/input.c | 739 -------- net/dccp/ipv4.c | 1101 ------------ net/dccp/ipv6.c | 1174 ------------ net/dccp/ipv6.h | 27 - net/dccp/minisocks.c | 266 --- net/dccp/options.c | 609 ------- net/dccp/output.c | 708 -------- net/dccp/proto.c | 1293 -------------- net/dccp/qpolicy.c | 136 -- net/dccp/sysctl.c | 107 -- net/dccp/timer.c | 272 --- net/dccp/trace.h | 82 - net/ipv4/Kconfig | 2 +- net/ipv4/af_inet.c | 5 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_diag.c | 2 - net/ipv6/ip6_output.c | 2 +- samples/bpf/sockex2_kern.c | 1 - scripts/checkpatch.pl | 2 +- security/lsm_audit.c | 19 - security/selinux/hooks.c | 41 +- security/selinux/include/classmap.h | 2 - security/selinux/nlmsgtab.c | 1 - security/smack/smack_lsm.c | 9 +- 81 files changed, 14 insertions(+), 14274 deletions(-) delete mode 100644 Documentation/networking/dccp.rst delete mode 100644 include/linux/tfrc.h delete mode 100644 net/dccp/Kconfig delete mode 100644 net/dccp/Makefile delete mode 100644 net/dccp/ackvec.c delete mode 100644 net/dccp/ackvec.h delete mode 100644 net/dccp/ccid.c delete mode 100644 net/dccp/ccid.h delete mode 100644 net/dccp/ccids/Kconfig delete mode 100644 net/dccp/ccids/ccid2.c delete mode 100644 net/dccp/ccids/ccid2.h delete mode 100644 net/dccp/ccids/ccid3.c delete mode 100644 net/dccp/ccids/ccid3.h delete mode 100644 net/dccp/ccids/lib/loss_interval.c delete mode 100644 net/dccp/ccids/lib/loss_interval.h delete mode 100644 net/dccp/ccids/lib/packet_history.c delete mode 100644 net/dccp/ccids/lib/packet_history.h delete mode 100644 net/dccp/ccids/lib/tfrc.c delete mode 100644 net/dccp/ccids/lib/tfrc.h delete mode 100644 net/dccp/ccids/lib/tfrc_equation.c delete mode 100644 net/dccp/dccp.h delete mode 100644 net/dccp/diag.c delete mode 100644 net/dccp/feat.c delete mode 100644 net/dccp/feat.h delete mode 100644 net/dccp/input.c delete mode 100644 net/dccp/ipv4.c delete mode 100644 net/dccp/ipv6.c delete mode 100644 net/dccp/ipv6.h delete mode 100644 net/dccp/minisocks.c delete mode 100644 net/dccp/options.c delete mode 100644 net/dccp/output.c delete mode 100644 net/dccp/proto.c delete mode 100644 net/dccp/qpolicy.c delete mode 100644 net/dccp/sysctl.c delete mode 100644 net/dccp/timer.c delete mode 100644 net/dccp/trace.h diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index ce6f4e8ca4876..30858757c9f20 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -196,7 +196,7 @@ will see the assembler code for the routine shown, but if your kernel has debug symbols the C code will also be available. (Debug symbols can be enabled in the kernel hacking menu of the menu configuration.) For example:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/networking/dccp.rst b/Documentation/networking/dccp.rst deleted file mode 100644 index 91e5c33ba3ff5..0000000000000 --- a/Documentation/networking/dccp.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -============= -DCCP protocol -============= - - -.. 
Contents - - Introduction - - Missing features - - Socket options - - Sysctl variables - - IOCTLs - - Other tunables - - Notes - - -Introduction -============ -Datagram Congestion Control Protocol (DCCP) is an unreliable, connection -oriented protocol designed to solve issues present in UDP and TCP, particularly -for real-time and multimedia (streaming) traffic. -It divides into a base protocol (RFC 4340) and pluggable congestion control -modules called CCIDs. Like pluggable TCP congestion control, at least one CCID -needs to be enabled in order for the protocol to function properly. In the Linux -implementation, this is the TCP-like CCID2 (RFC 4341). Additional CCIDs, such as -the TCP-friendly CCID3 (RFC 4342), are optional. -For a brief introduction to CCIDs and suggestions for choosing a CCID to match -given applications, see section 10 of RFC 4340. - -It has a base protocol and pluggable congestion control IDs (CCIDs). - -DCCP is a Proposed Standard (RFC 2026), and the homepage for DCCP as a protocol -is at http://www.ietf.org/html.charters/dccp-charter.html - - -Missing features -================ -The Linux DCCP implementation does not currently support all the features that are -specified in RFCs 4340...42. - -The known bugs are at: - - http://www.linuxfoundation.org/collaborate/workgroups/networking/todo#DCCP - -For more up-to-date versions of the DCCP implementation, please consider using -the experimental DCCP test tree; instructions for checking this out are on: -http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp_testing#Experimental_DCCP_source_tree - - -Socket options -============== -DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes -a policy ID as argument and can only be set before the connection (i.e. changes -during an established connection are not supported). Currently, two policies are -defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, -and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an -u32 priority value as ancillary data to sendmsg(), where higher numbers indicate -a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to -be formatted using a cmsg(3) message header filled in as follows:: - - cmsg->cmsg_level = SOL_DCCP; - cmsg->cmsg_type = DCCP_SCM_PRIORITY; - cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ - -DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero -value is always interpreted as unbounded queue length. If different from zero, -the interpretation of this parameter depends on the current dequeuing policy -(see above): the "simple" policy will enforce a fixed queue size by returning -EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the -lowest-priority packet first. The default value for this parameter is -initialised from /proc/sys/net/dccp/default/tx_qlen. - -DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of -service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, -the socket will fall back to 0 (which means that no meaningful service code -is present). On active sockets this is set before connect(); specifying more -than one code has no effect (all subsequent service codes are ignored). The -case is different for passive sockets, where multiple service codes (up to 32) -can be set before calling bind(). 
- -DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet -size (application payload size) in bytes, see RFC 4340, section 14. - -DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs -supported by the endpoint. The option value is an array of type uint8_t whose -size is passed as option length. The minimum array size is 4 elements, the -value returned in the optlen argument always reflects the true number of -built-in CCIDs. - -DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same -time, combining the operation of the next two socket options. This option is -preferable over the latter two, since often applications will use the same -type of CCID for both directions; and mixed use of CCIDs is not currently well -understood. This socket option takes as argument at least one uint8_t value, or -an array of uint8_t values, which must match available CCIDS (see above). CCIDs -must be registered on the socket before calling connect() or listen(). - -DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets -the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. -Please note that the getsockopt argument type here is ``int``, not uint8_t. - -DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. - -DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold -timewait state when closing the connection (RFC 4340, 8.3). The usual case is -that the closing server sends a CloseReq, whereupon the client holds timewait -state. When this boolean socket option is on, the server sends a Close instead -and will enter TIMEWAIT. This option must be set after accept() returns. - -DCCP_SOCKOPT_SEND_CSCOV and DCCP_SOCKOPT_RECV_CSCOV are used for setting the -partial checksum coverage (RFC 4340, sec. 9.2). The default is that checksums -always cover the entire packet and that only fully covered application data is -accepted by the receiver. Hence, when using this feature on the sender, it must -be enabled at the receiver, too with suitable choice of CsCov. - -DCCP_SOCKOPT_SEND_CSCOV sets the sender checksum coverage. Values in the - range 0..15 are acceptable. The default setting is 0 (full coverage), - values between 1..15 indicate partial coverage. - -DCCP_SOCKOPT_RECV_CSCOV is for the receiver and has a different meaning: it - sets a threshold, where again values 0..15 are acceptable. The default - of 0 means that all packets with a partial coverage will be discarded. - Values in the range 1..15 indicate that packets with minimally such a - coverage value are also acceptable. The higher the number, the more - restrictive this setting (see [RFC 4340, sec. 9.2.1]). Partial coverage - settings are inherited to the child socket after accept(). - -The following two options apply to CCID 3 exclusively and are getsockopt()-only. -In either case, a TFRC info struct (defined in ) is returned. - -DCCP_SOCKOPT_CCID_RX_INFO - Returns a ``struct tfrc_rx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_rx_info). - -DCCP_SOCKOPT_CCID_TX_INFO - Returns a ``struct tfrc_tx_info`` in optval; the buffer for optval and - optlen must be set to at least sizeof(struct tfrc_tx_info). - -On unidirectional connections it is useful to close the unused half-connection -via shutdown (SHUT_WR or SHUT_RD): this will reduce per-packet processing costs. 
- - -Sysctl variables -================ -Several DCCP default parameters can be managed by the following sysctls -(sysctl net.dccp.default or /proc/sys/net/dccp/default): - -request_retries - The number of active connection initiation retries (the number of - Requests minus one) before timing out. In addition, it also governs - the behaviour of the other, passive side: this variable also sets - the number of times DCCP repeats sending a Response when the initial - handshake does not progress from RESPOND to OPEN (i.e. when no Ack - is received after the initial Request). This value should be greater - than 0, suggested is less than 10. Analogue of tcp_syn_retries. - -retries1 - How often a DCCP Response is retransmitted until the listening DCCP - side considers its connecting peer dead. Analogue of tcp_retries1. - -retries2 - The number of times a general DCCP packet is retransmitted. This has - importance for retransmitted acknowledgments and feature negotiation, - data packets are never retransmitted. Analogue of tcp_retries2. - -tx_ccid = 2 - Default CCID for the sender-receiver half-connection. Depending on the - choice of CCID, the Send Ack Vector feature is enabled automatically. - -rx_ccid = 2 - Default CCID for the receiver-sender half-connection; see tx_ccid. - -seq_window = 100 - The initial sequence window (sec. 7.5.2) of the sender. This influences - the local ackno validity and the remote seqno validity windows (7.5.1). - Values in the range Wmin = 32 (RFC 4340, 7.5.2) up to 2^32-1 can be set. - -tx_qlen = 5 - The size of the transmit buffer in packets. A value of 0 corresponds - to an unbounded transmit buffer. - -sync_ratelimit = 125 ms - The timeout between subsequent DCCP-Sync packets sent in response to - sequence-invalid packets on the same socket (RFC 4340, 7.5.4). The unit - of this parameter is milliseconds; a value of 0 disables rate-limiting. - - -IOCTLS -====== -FIONREAD - Works as in udp(7): returns in the ``int`` argument pointer the size of - the next pending datagram in bytes, or 0 when no datagram is pending. - -SIOCOUTQ - Returns the number of unsent data bytes in the socket send queue as ``int`` - into the buffer specified by the argument pointer. - -Other tunables -============== -Per-route rto_min support - CCID-2 supports the RTAX_RTO_MIN per-route setting for the minimum value - of the RTO timer. This setting can be modified via the 'rto_min' option - of iproute2; for example:: - - > ip route change 10.0.0.0/24 rto_min 250j dev wlan0 - > ip route add 10.0.0.254/32 rto_min 800j dev wlan0 - > ip route show dev wlan0 - - CCID-3 also supports the rto_min setting: it is used to define the lower - bound for the expiry of the nofeedback timer. This can be useful on LANs - with very low RTTs (e.g., loopback, Gbit ethernet). - - -Notes -===== -DCCP does not travel through NAT successfully at present on many boxes. This is -because the checksum covers the pseudo-header as per TCP and UDP. Linux NAT -support for DCCP has been added. 
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c64133d309bfe..ac90b82f3ce95 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -48,7 +48,6 @@ Contents: ax25 bonding cdc_mbim - dccp dctcp devmem dns_resolver diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 5c63ab928b97d..b43222ee57cf9 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -37,8 +37,8 @@ ip_no_pmtu_disc - INTEGER Mode 3 is a hardened pmtu discover mode. The kernel will only accept fragmentation-needed errors if the underlying protocol can verify them besides a plain socket lookup. Current - protocols for which pmtu events will be honored are TCP, SCTP - and DCCP as they verify e.g. the sequence number or the + protocols for which pmtu events will be honored are TCP and + SCTP as they verify e.g. the sequence number or the association. This mode should not be enabled globally but is only intended to secure e.g. name servers in namespaces where TCP path mtu must still work but path MTU information of other diff --git a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst index c3f6a83294dc4..4b3432753eb9f 100644 --- a/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_CN/admin-guide/bug-hunting.rst @@ -188,7 +188,7 @@ objdump 编行。如果没有调试符号,您将看到所示例程的汇编程序代码,但是如果内核有调试 符号,C代码也将可见(调试符号可以在内核配置菜单的hacking项中启用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. note:: diff --git a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst index b25ecc44d7354..80ea5677ee524 100644 --- a/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst +++ b/Documentation/translations/zh_TW/admin-guide/bug-hunting.rst @@ -191,7 +191,7 @@ objdump 編行。如果沒有調試符號,您將看到所示例程的彙編程序代碼,但是如果內核有調試 符號,C代碼也將可見(調試符號可以在內核配置菜單的hacking項中啓用)。例如:: - $ objdump -r -S -l --disassemble net/dccp/ipv4.o + $ objdump -r -S -l --disassemble net/ipv4/tcp.o .. 
note:: diff --git a/MAINTAINERS b/MAINTAINERS index c59316109e3f8..1248443035f43 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6546,15 +6546,6 @@ S: Maintained F: Documentation/scsi/dc395x.rst F: drivers/scsi/dc395x.* -DCCP PROTOCOL -L: dccp@vger.kernel.org -S: Orphan -W: http://www.linuxfoundation.org/collaborate/workgroups/networking/dccp -F: include/linux/dccp.h -F: include/linux/tfrc.h -F: include/uapi/linux/dccp.h -F: net/dccp/ - DEBUGOBJECTS: M: Thomas Gleixner L: linux-kernel@vger.kernel.org diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 31ecb8b7b9f1e..85bd696b0e902 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -267,8 +267,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index 1f57514624d51..93003ceb51bbf 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -263,8 +263,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 02db7a48e57e7..fa6ed1bb01e3c 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -270,8 +270,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index f0e673cb17eb7..80ca001d5eab5 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e8ca5a50b86da..5e35e4b1fe163 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -262,8 +262,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index b3a270441bb17..11bc3f3c7576f 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index d215dba006ced..343f702210df1 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -281,8 +281,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set 
CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index a888ed93ff829..6104a5a245947 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -259,8 +259,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index b481782375f68..a99dd6c7c9c88 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 6eba743d8eb5c..0d87c119d4c46 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 9bdbb418ffa88..841f9615b6abc 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -256,8 +256,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index e1cf20fa53431..12c5dc92ea4c8 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -257,8 +257,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig index 8f7c36868204a..97d2cd9972850 100644 --- a/arch/mips/configs/bigsur_defconfig +++ b/arch/mips/configs/bigsur_defconfig @@ -81,7 +81,6 @@ CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m CONFIG_IP_VS_FTP=m -CONFIG_IP_DCCP=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 12f3eed8a9468..df55fe6df8bab 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -84,7 +84,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index 06b7a0b97ecae..487bd84043f7c 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -130,7 +130,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 
1bc3466bc909e..ae45f70b29f0d 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -87,7 +87,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_IP_DCCP=m CONFIG_BT=m CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index a91a766b71a44..5e0f2cd7e62b9 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -225,7 +225,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m CONFIG_TIPC=m CONFIG_ATM=m CONFIG_ATM_CLIP=m diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 325af611909f9..0b61b8b996d4e 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -2,79 +2,8 @@ #ifndef _LINUX_DCCP_H #define _LINUX_DCCP_H - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include #include -enum dccp_state { - DCCP_OPEN = TCP_ESTABLISHED, - DCCP_REQUESTING = TCP_SYN_SENT, - DCCP_LISTEN = TCP_LISTEN, - DCCP_RESPOND = TCP_SYN_RECV, - /* - * States involved in closing a DCCP connection: - * 1) ACTIVE_CLOSEREQ is entered by a server sending a CloseReq. - * - * 2) CLOSING can have three different meanings (RFC 4340, 8.3): - * a. Client has performed active-close, has sent a Close to the server - * from state OPEN or PARTOPEN, and is waiting for the final Reset - * (in this case, SOCK_DONE == 1). - * b. Client is asked to perform passive-close, by receiving a CloseReq - * in (PART)OPEN state. It sends a Close and waits for final Reset - * (in this case, SOCK_DONE == 0). - * c. Server performs an active-close as in (a), keeps TIMEWAIT state. - * - * 3) The following intermediate states are employed to give passively - * closing nodes a chance to process their unread data: - * - PASSIVE_CLOSE (from OPEN => CLOSED) and - * - PASSIVE_CLOSEREQ (from (PART)OPEN to CLOSING; case (b) above). - */ - DCCP_ACTIVE_CLOSEREQ = TCP_FIN_WAIT1, - DCCP_PASSIVE_CLOSE = TCP_CLOSE_WAIT, /* any node receiving a Close */ - DCCP_CLOSING = TCP_CLOSING, - DCCP_TIME_WAIT = TCP_TIME_WAIT, - DCCP_CLOSED = TCP_CLOSE, - DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, - DCCP_PARTOPEN = TCP_MAX_STATES, - DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ - DCCP_MAX_STATES -}; - -enum { - DCCPF_OPEN = TCPF_ESTABLISHED, - DCCPF_REQUESTING = TCPF_SYN_SENT, - DCCPF_LISTEN = TCPF_LISTEN, - DCCPF_RESPOND = TCPF_SYN_RECV, - DCCPF_ACTIVE_CLOSEREQ = TCPF_FIN_WAIT1, - DCCPF_CLOSING = TCPF_CLOSING, - DCCPF_TIME_WAIT = TCPF_TIME_WAIT, - DCCPF_CLOSED = TCPF_CLOSE, - DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, - DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), -}; - -static inline struct dccp_hdr *dccp_hdr(const struct sk_buff *skb) -{ - return (struct dccp_hdr *)skb_transport_header(skb); -} - -static inline struct dccp_hdr *dccp_zeroed_hdr(struct sk_buff *skb, int headlen) -{ - skb_push(skb, headlen); - skb_reset_transport_header(skb); - return memset(skb_transport_header(skb), 0, headlen); -} - static inline struct dccp_hdr_ext *dccp_hdrx(const struct dccp_hdr *dh) { return (struct dccp_hdr_ext *)((unsigned char *)dh + sizeof(*dh)); @@ -85,12 +14,6 @@ static inline unsigned int __dccp_basic_hdr_len(const struct dccp_hdr *dh) return sizeof(*dh) + (dh->dccph_x ? 
sizeof(struct dccp_hdr_ext) : 0); } -static inline unsigned int dccp_basic_hdr_len(const struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - return __dccp_basic_hdr_len(dh); -} - static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) { __u64 seq_nr = ntohs(dh->dccph_seq); @@ -103,222 +26,10 @@ static inline __u64 dccp_hdr_seq(const struct dccp_hdr *dh) return seq_nr; } -static inline struct dccp_hdr_request *dccp_hdr_request(struct sk_buff *skb) -{ - return (struct dccp_hdr_request *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_ack_bits *dccp_hdr_ack_bits(const struct sk_buff *skb) -{ - return (struct dccp_hdr_ack_bits *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline u64 dccp_hdr_ack_seq(const struct sk_buff *skb) -{ - const struct dccp_hdr_ack_bits *dhack = dccp_hdr_ack_bits(skb); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + ntohl(dhack->dccph_ack_nr_low); -} - -static inline struct dccp_hdr_response *dccp_hdr_response(struct sk_buff *skb) -{ - return (struct dccp_hdr_response *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - -static inline struct dccp_hdr_reset *dccp_hdr_reset(struct sk_buff *skb) -{ - return (struct dccp_hdr_reset *)(skb_transport_header(skb) + - dccp_basic_hdr_len(skb)); -} - static inline unsigned int __dccp_hdr_len(const struct dccp_hdr *dh) { return __dccp_basic_hdr_len(dh) + dccp_packet_hdr_len(dh->dccph_type); } -static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) -{ - return __dccp_hdr_len(dccp_hdr(skb)); -} - -/** - * struct dccp_request_sock - represent DCCP-specific connection request - * @dreq_inet_rsk: structure inherited from - * @dreq_iss: initial sequence number, sent on the first Response (RFC 4340, 7.1) - * @dreq_gss: greatest sequence number sent (for retransmitted Responses) - * @dreq_isr: initial sequence number received in the first Request - * @dreq_gsr: greatest sequence number received (for retransmitted Request(s)) - * @dreq_service: service code present on the Request (there is just one) - * @dreq_featneg: feature negotiation options for this connection - * The following two fields are analogous to the ones in dccp_sock: - * @dreq_timestamp_echo: last received timestamp to echo (13.1) - * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo - */ -struct dccp_request_sock { - struct inet_request_sock dreq_inet_rsk; - __u64 dreq_iss; - __u64 dreq_gss; - __u64 dreq_isr; - __u64 dreq_gsr; - __be32 dreq_service; - spinlock_t dreq_lock; - struct list_head dreq_featneg; - __u32 dreq_timestamp_echo; - __u32 dreq_timestamp_time; -}; - -static inline struct dccp_request_sock *dccp_rsk(const struct request_sock *req) -{ - return (struct dccp_request_sock *)req; -} - -extern struct inet_timewait_death_row dccp_death_row; - -extern int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb); - -struct dccp_options_received { - u64 dccpor_ndp:48; - u32 dccpor_timestamp; - u32 dccpor_timestamp_echo; - u32 dccpor_elapsed_time; -}; - -struct ccid; - -enum dccp_role { - DCCP_ROLE_UNDEFINED, - DCCP_ROLE_LISTEN, - DCCP_ROLE_CLIENT, - DCCP_ROLE_SERVER, -}; - -struct dccp_service_list { - __u32 dccpsl_nr; - __be32 dccpsl_list[]; -}; - -#define DCCP_SERVICE_INVALID_VALUE htonl((__u32)-1) -#define DCCP_SERVICE_CODE_IS_ABSENT 0 - -static inline bool dccp_list_has_service(const struct dccp_service_list *sl, - const __be32 service) -{ - if (likely(sl != NULL)) { - u32 i = 
sl->dccpsl_nr; - while (i--) - if (sl->dccpsl_list[i] == service) - return true; - } - return false; -} - -struct dccp_ackvec; - -/** - * struct dccp_sock - DCCP socket state - * - * @dccps_swl - sequence number window low - * @dccps_swh - sequence number window high - * @dccps_awl - acknowledgement number window low - * @dccps_awh - acknowledgement number window high - * @dccps_iss - initial sequence number sent - * @dccps_isr - initial sequence number received - * @dccps_osr - first OPEN sequence number received - * @dccps_gss - greatest sequence number sent - * @dccps_gsr - greatest valid sequence number received - * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss - * @dccps_service - first (passive sock) or unique (active sock) service code - * @dccps_service_list - second .. last service code on passive socket - * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo - * @dccps_l_ack_ratio - feature-local Ack Ratio - * @dccps_r_ack_ratio - feature-remote Ack Ratio - * @dccps_l_seq_win - local Sequence Window (influences ack number validity) - * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) - * @dccps_pcslen - sender partial checksum coverage (via sockopt) - * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) - * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) - * @dccps_ndp_count - number of Non Data Packets since last data packet - * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) - * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) - * @dccps_hc_rx_ackvec - rx half connection ack vector - * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) - * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) - * @dccps_options_received - parsed set of retrieved options - * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy - * @dccps_tx_qlen - maximum length of the TX queue - * @dccps_role - role of this sock, one of %dccp_role - * @dccps_hc_rx_insert_options - receiver wants to add options when acking - * @dccps_hc_tx_insert_options - sender wants to add options when sending - * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets - * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) - * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) - */ -struct dccp_sock { - /* inet_connection_sock has to be the first member of dccp_sock */ - struct inet_connection_sock dccps_inet_connection; -#define dccps_syn_rtt dccps_inet_connection.icsk_ack.lrcvtime - __u64 dccps_swl; - __u64 dccps_swh; - __u64 dccps_awl; - __u64 dccps_awh; - __u64 dccps_iss; - __u64 dccps_isr; - __u64 dccps_osr; - __u64 dccps_gss; - __u64 dccps_gsr; - __u64 dccps_gar; - __be32 dccps_service; - __u32 dccps_mss_cache; - struct dccp_service_list *dccps_service_list; - __u32 dccps_timestamp_echo; - __u32 dccps_timestamp_time; - __u16 dccps_l_ack_ratio; - __u16 dccps_r_ack_ratio; - __u64 dccps_l_seq_win:48; - __u64 dccps_r_seq_win:48; - __u8 dccps_pcslen:4; - __u8 dccps_pcrlen:4; - __u8 dccps_send_ndp_count:1; - __u64 
dccps_ndp_count:48; - unsigned long dccps_rate_last; - struct list_head dccps_featneg; - struct dccp_ackvec *dccps_hc_rx_ackvec; - struct ccid *dccps_hc_rx_ccid; - struct ccid *dccps_hc_tx_ccid; - struct dccp_options_received dccps_options_received; - __u8 dccps_qpolicy; - __u32 dccps_tx_qlen; - enum dccp_role dccps_role:2; - __u8 dccps_hc_rx_insert_options:1; - __u8 dccps_hc_tx_insert_options:1; - __u8 dccps_server_timewait:1; - __u8 dccps_sync_scheduled:1; - struct tasklet_struct dccps_xmitlet; - struct timer_list dccps_xmit_timer; -}; - -#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ - dccps_inet_connection.icsk_inet.sk) - -static inline const char *dccp_role(const struct sock *sk) -{ - switch (dccp_sk(sk)->dccps_role) { - case DCCP_ROLE_UNDEFINED: return "undefined"; - case DCCP_ROLE_LISTEN: return "listen"; - case DCCP_ROLE_SERVER: return "server"; - case DCCP_ROLE_CLIENT: return "client"; - } - return NULL; -} - -extern void dccp_syn_ack_timeout(const struct request_sock *req); - #endif /* _LINUX_DCCP_H */ diff --git a/include/linux/tfrc.h b/include/linux/tfrc.h deleted file mode 100644 index a5acc768085da..0000000000000 --- a/include/linux/tfrc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _LINUX_TFRC_H_ -#define _LINUX_TFRC_H_ -/* - * TFRC - Data Structures for the TCP-Friendly Rate Control congestion - * control mechanism as specified in RFC 3448. - * - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include - -/** tfrc_rx_info - TFRC Receiver Data Structure - * - * @tfrcrx_x_recv: receiver estimate of sending rate (3.2.2) - * @tfrcrx_rtt: round-trip-time (communicated by sender) - * @tfrcrx_p: current estimate of loss event rate (3.2.2) - */ -struct tfrc_rx_info { - __u32 tfrcrx_x_recv; - __u32 tfrcrx_rtt; - __u32 tfrcrx_p; -}; - -/** tfrc_tx_info - TFRC Sender Data Structure - * - * @tfrctx_x: computed transmit rate (4.3 (4)) - * @tfrctx_x_recv: receiver estimate of send rate (4.3) - * @tfrctx_x_calc: return value of throughput equation (3.1) - * @tfrctx_rtt: (moving average) estimate of RTT (4.3) - * @tfrctx_p: current loss event rate (5.4) - * @tfrctx_rto: estimate of RTO, equals 4*RTT (4.3) - * @tfrctx_ipi: inter-packet interval (4.6) - * - * Note: X and X_recv are both maintained in units of 64 * bytes/second. This - * enables a finer resolution of sending rates and avoids problems with - * integer arithmetic; u32 is not sufficient as scaling consumes 6 bits. - */ -struct tfrc_tx_info { - __u64 tfrctx_x; - __u64 tfrctx_x_recv; - __u32 tfrctx_x_calc; - __u32 tfrctx_rtt; - __u32 tfrctx_p; - __u32 tfrctx_rto; - __u32 tfrctx_ipi; -}; - -#endif /* _LINUX_TFRC_H_ */ diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 949641e925398..d172b64a63201 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -177,12 +177,8 @@ struct inet_hashinfo { static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? 
: - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else + /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7dae..979ac87b5d994 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a18138..cddebafb9f779 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -16,9 +16,5 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport); u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); #endif /* _NET_SECURE_SEQ */ diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h index 3836de435d9d2..b5310439536e0 100644 --- a/include/trace/events/sock.h +++ b/include/trace/events/sock.h @@ -19,7 +19,6 @@ /* The protocol traced by inet_sock_set_state */ #define inet_protocol_names \ EM(IPPROTO_TCP) \ - EM(IPPROTO_DCCP) \ EM(IPPROTO_SCTP) \ EMe(IPPROTO_MPTCP) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5d331383047b7..de214f1dea586 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -21,7 +21,6 @@ TRACE_DEFINE_ENUM(SOCK_DGRAM); TRACE_DEFINE_ENUM(SOCK_RAW); TRACE_DEFINE_ENUM(SOCK_RDM); TRACE_DEFINE_ENUM(SOCK_SEQPACKET); -TRACE_DEFINE_ENUM(SOCK_DCCP); TRACE_DEFINE_ENUM(SOCK_PACKET); #define show_socket_type(type) \ @@ -31,7 +30,6 @@ TRACE_DEFINE_ENUM(SOCK_PACKET); { SOCK_RAW, "RAW" }, \ { SOCK_RDM, "RDM" }, \ { SOCK_SEQPACKET, "SEQPACKET" }, \ - { SOCK_DCCP, "DCCP" }, \ { SOCK_PACKET, "PACKET" }) /* This list is known to be incomplete, add new enums as needed. 
*/ diff --git a/net/Kconfig b/net/Kconfig index c3fca69a7c834..202cc595e5e6f 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -245,7 +245,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" source "net/tipc/Kconfig" diff --git a/net/Makefile b/net/Makefile index 60ed5190eda8d..aac960c41db6d 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ endif -obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_RDS) += rds/ obj-$(CONFIG_WIRELESS) += wireless/ diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 568779d5a0efa..9a39656804513 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a08eed9b9142e..b23594c767f2e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig deleted file mode 100644 index 0c7d2f66ba278..0000000000000 --- a/net/dccp/Kconfig +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menuconfig IP_DCCP - tristate "The DCCP Protocol" - depends on INET - help - Datagram Congestion Control Protocol (RFC 4340) - - From https://www.ietf.org/rfc/rfc4340.txt: - - The Datagram Congestion Control Protocol (DCCP) is a transport - protocol that implements bidirectional, unicast connections of - congestion-controlled, unreliable datagrams. It should be suitable - for use by applications such as streaming media, Internet telephony, - and on-line games. - - To compile this protocol support as a module, choose M here: the - module will be called dccp. - - If in doubt, say N. - -if IP_DCCP - -config INET_DCCP_DIAG - depends on INET_DIAG - def_tristate y if (IP_DCCP = y && INET_DIAG = y) - def_tristate m - -source "net/dccp/ccids/Kconfig" - -menu "DCCP Kernel Hacking" - depends on DEBUG_KERNEL=y - -config IP_DCCP_DEBUG - bool "DCCP debug messages" - help - Only use this if you're hacking DCCP. 
- - When compiling DCCP as a module, this debugging output can be toggled - by setting the parameter dccp_debug of the `dccp' module to 0 or 1. - - Just say N. - - -endmenu - -endif # IP_DDCP diff --git a/net/dccp/Makefile b/net/dccp/Makefile deleted file mode 100644 index 5b4ff37bc8061..0000000000000 --- a/net/dccp/Makefile +++ /dev/null @@ -1,30 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o - -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \ - qpolicy.o -# -# CCID algorithms to be used by dccp.ko -# -# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency -dccp-y += ccids/ccid2.o ackvec.o -dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o -dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ - ccids/lib/tfrc_equation.o \ - ccids/lib/packet_history.o \ - ccids/lib/loss_interval.o - -dccp_ipv4-y := ipv4.o - -# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module -obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o -dccp_ipv6-y := ipv6.o - -obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o - -dccp-$(CONFIG_SYSCTL) += sysctl.o - -dccp_diag-y := diag.o - -# build with local directory for trace.h -CFLAGS_proto.o := -I$(src) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c deleted file mode 100644 index 1cba001bb4c8e..0000000000000 --- a/net/dccp/ackvec.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ackvec.c - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "dccp.h" -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/export.h> - -static struct kmem_cache *dccp_ackvec_slab; -static struct kmem_cache *dccp_ackvec_record_slab; - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; - INIT_LIST_HEAD(&av->av_records); - } - return av; -} - -static void dccp_ackvec_purge_records(struct dccp_ackvec *av) -{ - struct dccp_ackvec_record *cur, *next; - - list_for_each_entry_safe(cur, next, &av->av_records, avr_node) - kmem_cache_free(dccp_ackvec_record_slab, cur); - INIT_LIST_HEAD(&av->av_records); -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (likely(av != NULL)) { - dccp_ackvec_purge_records(av); - kmem_cache_free(dccp_ackvec_slab, av); - } -} - -/** - * dccp_ackvec_update_records - Record information about sent Ack Vectors - * @av: Ack Vector records to update - * @seqno: Sequence number of the packet carrying the Ack Vector just sent - * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector - */ -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) -{ - struct dccp_ackvec_record *avr; - - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); - if (avr == NULL) - return -ENOBUFS; - - avr->avr_ack_seqno = seqno; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce_sum; - avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); - /* - * When the buffer overflows, we keep no more than one record. This is - * the simplest way of disambiguating sender-Acks dating from before the - * overflow from sender-Acks which refer to after the overflow; a simple - * solution is preferable here since we are handling an exception. 
- */ - if (av->av_overflow) - dccp_ackvec_purge_records(av); - /* - * Since GSS is incremented for each packet, the list is automatically - * arranged in descending order of @ack_seqno. - */ - list_add(&avr->avr_node, &av->av_records); - - dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno, - avr->avr_ack_runlen); - return 0; -} - -static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; - /* - * Exploit that records are inserted in descending order of sequence - * number, start with the oldest record first. If @ackno is `before' - * the earliest ack_ackno, the packet is too old to be considered. - */ - list_for_each_entry_reverse(avr, av_list, avr_node) { - if (avr->avr_ack_seqno == ackno) - return avr; - if (before48(ackno, avr->avr_ack_seqno)) - break; - } - return NULL; -} - -/* - * Buffer index and length computation using modulo-buffersize arithmetic. - * Note that, as pointers move from right to left, head is `before' tail. - */ -static inline u16 __ackvec_idx_add(const u16 a, const u16 b) -{ - return (a + b) % DCCPAV_MAX_ACKVEC_LEN; -} - -static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) -{ - return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); -} - -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) -{ - if (unlikely(av->av_overflow)) - return DCCPAV_MAX_ACKVEC_LEN; - return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); -} - -/** - * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 - * @av: non-empty buffer to update - * @distance: negative or zero distance of @seqno from buf_ackno downward - * @seqno: the (old) sequence number whose record is to be updated - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, - u64 seqno, enum dccp_ackvec_states state) -{ - u16 ptr = av->av_buf_head; - - BUG_ON(distance > 0); - if (unlikely(dccp_ackvec_is_empty(av))) - return; - - do { - u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - - if (distance + runlen >= 0) { - /* - * Only update the state if packet has not been received - * yet. This is OK as per the second table in RFC 4340, - * 11.4.1; i.e. here we are using the following table: - * RECEIVED - * 0 1 3 - * S +---+---+---+ - * T 0 | 0 | 0 | 0 | - * O +---+---+---+ - * R 1 | 1 | 1 | 1 | - * E +---+---+---+ - * D 3 | 0 | 1 | 3 | - * +---+---+---+ - * The "Not Received" state was set by reserve_seats(). - */ - if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) - av->av_buf[ptr] = state; - else - dccp_pr_debug("Not changing %llu state to %u\n", - (unsigned long long)seqno, state); - break; - } - - distance += runlen + 1; - ptr = __ackvec_idx_add(ptr, 1); - - } while (ptr != av->av_buf_tail); -} - -/* Mark @num entries after buf_head as "Not yet received". 
*/ -static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) -{ - u16 start = __ackvec_idx_add(av->av_buf_head, 1), - len = DCCPAV_MAX_ACKVEC_LEN - start; - - /* check for buffer wrap-around */ - if (num > len) { - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); - start = 0; - num -= len; - } - if (num) - memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); -} - -/** - * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer - * @av: container of buffer to update (can be empty or non-empty) - * @num_packets: number of packets to register (must be >= 1) - * @seqno: sequence number of the first packet in @num_packets - * @state: state in which packet carrying @seqno was received - */ -static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, - u64 seqno, enum dccp_ackvec_states state) -{ - u32 num_cells = num_packets; - - if (num_packets > DCCPAV_BURST_THRESH) { - u32 lost_packets = num_packets - 1; - - DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); - /* - * We received 1 packet and have a loss of size "num_packets-1" - * which we squeeze into num_cells-1 rather than reserving an - * entire byte for each lost packet. - * The reason is that the vector grows in O(burst_length); when - * it grows too large there will no room left for the payload. - * This is a trade-off: if a few packets out of the burst show - * up later, their state will not be changed; it is simply too - * costly to reshuffle/reallocate/copy the buffer each time. - * Should such problems persist, we will need to switch to a - * different underlying data structure. - */ - for (num_packets = num_cells = 1; lost_packets; ++num_cells) { - u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN); - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); - av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; - - lost_packets -= len; - } - } - - if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { - DCCP_CRIT("Ack Vector buffer overflow: dropping old entries"); - av->av_overflow = true; - } - - av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); - if (av->av_overflow) - av->av_buf_tail = av->av_buf_head; - - av->av_buf[av->av_buf_head] = state; - av->av_buf_ackno = seqno; - - if (num_packets > 1) - dccp_ackvec_reserve_seats(av, num_packets - 1); -} - -/** - * dccp_ackvec_input - Register incoming packet in the buffer - * @av: Ack Vector to register packet to - * @skb: Packet to register - */ -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) -{ - u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; - enum dccp_ackvec_states state = DCCPAV_RECEIVED; - - if (dccp_ackvec_is_empty(av)) { - dccp_ackvec_add_new(av, 1, seqno, state); - av->av_tail_ackno = seqno; - - } else { - s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); - u8 *current_head = av->av_buf + av->av_buf_head; - - if (num_packets == 1 && - dccp_ackvec_state(current_head) == state && - dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { - - *current_head += 1; - av->av_buf_ackno = seqno; - - } else if (num_packets > 0) { - dccp_ackvec_add_new(av, num_packets, seqno, state); - } else { - dccp_ackvec_update_old(av, num_packets, seqno, state); - } - } -} - -/** - * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection - * @av: Ack Vector record to clean - * @ackno: last Ack Vector which has been acknowledged - * - * This routine is called when the peer acknowledges the receipt of Ack Vectors - * up to and including @ackno. 
While based on section A.3 of RFC 4340, here - * are additional precautions to prevent corrupted buffer state. In particular, - * we use tail_ackno to identify outdated records; it always marks the earliest - * packet of group (2) in 11.4.2. - */ -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) -{ - struct dccp_ackvec_record *avr, *next; - u8 runlen_now, eff_runlen; - s64 delta; - - avr = dccp_ackvec_lookup(&av->av_records, ackno); - if (avr == NULL) - return; - /* - * Deal with outdated acknowledgments: this arises when e.g. there are - * several old records and the acks from the peer come in slowly. In - * that case we may still have records that pre-date tail_ackno. - */ - delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); - if (delta < 0) - goto free_records; - /* - * Deal with overlapping Ack Vectors: don't subtract more than the - * number of packets between tail_ackno and ack_ackno. - */ - eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; - - runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); - /* - * The run length of Ack Vector cells does not decrease over time. If - * the run length is the same as at the time the Ack Vector was sent, we - * free the ack_ptr cell. That cell can however not be freed if the run - * length has increased: in this case we need to move the tail pointer - * backwards (towards higher indices), to its next-oldest neighbour. - */ - if (runlen_now > eff_runlen) { - - av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; - av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); - - /* This move may not have cleared the overflow flag. */ - if (av->av_overflow) - av->av_overflow = (av->av_buf_head == av->av_buf_tail); - } else { - av->av_buf_tail = avr->avr_ack_ptr; - /* - * We have made sure that avr points to a valid cell within the - * buffer. This cell is either older than head, or equals head - * (empty buffer): in both cases we no longer have any overflow. - */ - av->av_overflow = 0; - } - - /* - * The peer has acknowledged up to and including ack_ackno. Hence the - * first packet in group (2) of 11.4.2 is the successor of ack_ackno. 
- */ - av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); - -free_records: - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} - -/* - * Routines to keep track of Ack Vectors received in an skb - */ -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) -{ - struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); - - if (new == NULL) - return -ENOBUFS; - new->vec = vec; - new->len = len; - new->nonce = nonce; - - list_add_tail(&new->node, head); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); - -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) -{ - struct dccp_ackvec_parsed *cur, *next; - - list_for_each_entry_safe(cur, next, parsed_chunks, node) - kfree(cur); - INIT_LIST_HEAD(parsed_chunks); -} -EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); - -int __init dccp_ackvec_init(void) -{ - dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_slab == NULL) - goto out_err; - - dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); - if (dccp_ackvec_record_slab == NULL) - goto out_destroy_slab; - - return 0; - -out_destroy_slab: - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; -out_err: - DCCP_CRIT("Unable to create Ack Vector slab cache"); - return -ENOBUFS; -} - -void dccp_ackvec_exit(void) -{ - kmem_cache_destroy(dccp_ackvec_slab); - dccp_ackvec_slab = NULL; - kmem_cache_destroy(dccp_ackvec_record_slab); - dccp_ackvec_record_slab = NULL; -} diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h deleted file mode 100644 index d2c4220fb377b..0000000000000 --- a/net/dccp/ackvec.h +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _ACKVEC_H -#define _ACKVEC_H -/* - * net/dccp/ackvec.h - * - * An implementation of Ack Vectors for the DCCP protocol - * Copyright (c) 2007 University of Aberdeen, Scotland, UK - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include <linux/dccp.h> -#include <linux/compiler.h> -#include <linux/list.h> -#include <linux/types.h> - -/* - * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, - * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 - * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives - * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. - * The maximum value is bounded by the u16 types for indices and functions. - */ -#define DCCPAV_NUM_ACKVECS 2 -#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) - -/* Estimated minimum average Ack Vector length - used for updating MPS */ -#define DCCPAV_MIN_OPTLEN 16 - -/* Threshold for coping with large bursts of losses */ -#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) - -enum dccp_ackvec_states { - DCCPAV_RECEIVED = 0x00, - DCCPAV_ECN_MARKED = 0x40, - DCCPAV_RESERVED = 0x80, - DCCPAV_NOT_RECEIVED = 0xC0 -}; -#define DCCPAV_MAX_RUNLEN 0x3F - -static inline u8 dccp_ackvec_runlen(const u8 *cell) -{ - return *cell & DCCPAV_MAX_RUNLEN; -} - -static inline u8 dccp_ackvec_state(const u8 *cell) -{ - return *cell & ~DCCPAV_MAX_RUNLEN; -} - -/** - * struct dccp_ackvec - Ack Vector main data structure - * - * This implements a fixed-size circular buffer within an array and is largely - * based on Appendix A of RFC 4340. 
- * - * @av_buf: circular buffer storage area - * @av_buf_head: head index; begin of live portion in @av_buf - * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf - * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf - * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf - * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to - * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf - * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound - * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - */ -struct dccp_ackvec { - u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; - u16 av_buf_head; - u16 av_buf_tail; - u64 av_buf_ackno:48; - u64 av_tail_ackno:48; - bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; - u8 av_overflow:1; - struct list_head av_records; -}; - -/** - * struct dccp_ackvec_record - Records information about sent Ack Vectors - * - * These list entries define the additional information which the HC-Receiver - * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. - * - * @avr_node: the list node in @av_records - * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on - * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to - * @avr_ack_ptr: pointer into @av_buf where this record starts - * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending - * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent - * - * The list as a whole is sorted in descending order by @avr_ack_seqno. - */ -struct dccp_ackvec_record { - struct list_head avr_node; - u64 avr_ack_seqno:48; - u64 avr_ack_ackno:48; - u16 avr_ack_ptr; - u8 avr_ack_runlen; - u8 avr_ack_nonce:1; -}; - -int dccp_ackvec_init(void); -void dccp_ackvec_exit(void); - -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); -void dccp_ackvec_free(struct dccp_ackvec *av); - -void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); -int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); -void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); -u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); - -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) -{ - return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; -} - -/** - * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb - * @vec: start of vector (offset into skb) - * @len: length of @vec - * @nonce: whether @vec had an ECN nonce of 0 or 1 - * @node: FIFO - arranged in descending order of ack_ackno - * - * This structure is used by CCIDs to access Ack Vectors in a received skb. 
- */ -struct dccp_ackvec_parsed { - u8 *vec, - len, - nonce:1; - struct list_head node; -}; - -int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce); -void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); -#endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c deleted file mode 100644 index 6beac5d348e2b..0000000000000 --- a/net/dccp/ccid.c +++ /dev/null @@ -1,219 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/ccid.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include <linux/slab.h> - -#include "ccid.h" -#include "ccids/lib/tfrc.h" - -static struct ccid_operations *ccids[] = { - &ccid2_ops, -#ifdef CONFIG_IP_DCCP_CCID3 - &ccid3_ops, -#endif -}; - -static struct ccid_operations *ccid_by_number(const u8 id) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - if (ccids[i]->ccid_id == id) - return ccids[i]; - return NULL; -} - -/* check that up to @array_len members in @ccid_array are supported */ -bool ccid_support_check(u8 const *ccid_array, u8 array_len) -{ - while (array_len > 0) - if (ccid_by_number(ccid_array[--array_len]) == NULL) - return false; - return true; -} - -/** - * ccid_get_builtin_ccids - Populate a list of built-in CCIDs - * @ccid_array: pointer to copy into - * @array_len: value to return length into - * - * This function allocates memory - caller must see that it is freed after use. - */ -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) -{ - *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); - if (*ccid_array == NULL) - return -ENOBUFS; - - for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) - (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; - return 0; -} - -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *optval, int __user *optlen) -{ - u8 *ccid_array, array_len; - int err = 0; - - if (ccid_get_builtin_ccids(&ccid_array, &array_len)) - return -ENOBUFS; - - if (put_user(array_len, optlen)) - err = -EFAULT; - else if (len > 0 && copy_to_user(optval, ccid_array, - len > array_len ? array_len : len)) - err = -EFAULT; - - kfree(ccid_array); - return err; -} - -static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...) 
-{ - struct kmem_cache *slab; - va_list args; - - va_start(args, fmt); - vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args); - va_end(args); - - slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0, - SLAB_HWCACHE_ALIGN, NULL); - return slab; -} - -static void ccid_kmem_cache_destroy(struct kmem_cache *slab) -{ - kmem_cache_destroy(slab); -} - -static int __init ccid_activate(struct ccid_operations *ccid_ops) -{ - int err = -ENOBUFS; - - ccid_ops->ccid_hc_rx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size, - ccid_ops->ccid_hc_rx_slab_name, - "ccid%u_hc_rx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_rx_slab == NULL) - goto out; - - ccid_ops->ccid_hc_tx_slab = - ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size, - ccid_ops->ccid_hc_tx_slab_name, - "ccid%u_hc_tx_sock", - ccid_ops->ccid_id); - if (ccid_ops->ccid_hc_tx_slab == NULL) - goto out_free_rx_slab; - - pr_info("DCCP: Activated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); - err = 0; -out: - return err; -out_free_rx_slab: - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - goto out; -} - -static void ccid_deactivate(struct ccid_operations *ccid_ops) -{ - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); - ccid_ops->ccid_hc_tx_slab = NULL; - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); - ccid_ops->ccid_hc_rx_slab = NULL; - - pr_info("DCCP: Deactivated CCID %d (%s)\n", - ccid_ops->ccid_id, ccid_ops->ccid_name); -} - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) -{ - struct ccid_operations *ccid_ops = ccid_by_number(id); - struct ccid *ccid = NULL; - - if (ccid_ops == NULL) - goto out; - - ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, gfp_any()); - if (ccid == NULL) - goto out; - ccid->ccid_ops = ccid_ops; - if (rx) { - memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); - if (ccid->ccid_ops->ccid_hc_rx_init != NULL && - ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0) - goto out_free_ccid; - } else { - memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size); - if (ccid->ccid_ops->ccid_hc_tx_init != NULL && - ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0) - goto out_free_ccid; - } -out: - return ccid; -out_free_ccid: - kmem_cache_free(rx ? 
ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, ccid); - ccid = NULL; - goto out; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) - ccid->ccid_ops->ccid_hc_rx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); - } -} - -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) -{ - if (ccid != NULL) { - if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) - ccid->ccid_ops->ccid_hc_tx_exit(sk); - kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); - } -} - -int __init ccid_initialize_builtins(void) -{ - int i, err = tfrc_lib_init(); - - if (err) - return err; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) { - err = ccid_activate(ccids[i]); - if (err) - goto unwind_registrations; - } - return 0; - -unwind_registrations: - while(--i >= 0) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); - return err; -} - -void ccid_cleanup_builtins(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ccids); i++) - ccid_deactivate(ccids[i]); - tfrc_lib_exit(); -} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h deleted file mode 100644 index 105f3734dadbf..0000000000000 --- a/net/dccp/ccid.h +++ /dev/null @@ -1,262 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _CCID_H -#define _CCID_H -/* - * net/dccp/ccid.h - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - * - * CCID infrastructure - */ - -#include <net/sock.h> -#include <linux/compiler.h> -#include <linux/dccp.h> -#include <linux/list.h> -#include <linux/module.h> - -/* maximum value for a CCID (RFC 4340, 19.5) */ -#define CCID_MAX 255 -#define CCID_SLAB_NAME_LENGTH 32 - -struct tcp_info; - -/** - * struct ccid_operations - Interface to Congestion-Control Infrastructure - * - * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) 
- * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) - * @ccid_name: alphabetical identifier string for @ccid_id - * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection - * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket - * - * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup) - * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction) - * @ccid_hc_rx_packet_recv: implements the HC-receiver side - * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options - * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options - * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender - * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender - * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender - * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender - * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender - */ -struct ccid_operations { - unsigned char ccid_id; - __u32 ccid_ccmps; - const char *ccid_name; - struct kmem_cache *ccid_hc_rx_slab, - *ccid_hc_tx_slab; - char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH]; - char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH]; - __u32 ccid_hc_rx_obj_size, - ccid_hc_tx_obj_size; - /* Interface Routines */ - int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk); - int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk); - void (*ccid_hc_rx_exit)(struct sock *sk); - void (*ccid_hc_tx_exit)(struct sock *sk); - void (*ccid_hc_rx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_rx_insert_options)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_recv)(struct sock *sk, - struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, - u8 opt, u8 *val, u8 len); - int (*ccid_hc_tx_send_packet)(struct sock *sk, - struct sk_buff *skb); - void (*ccid_hc_tx_packet_sent)(struct sock *sk, - unsigned int len); - void (*ccid_hc_rx_get_info)(struct sock *sk, - struct tcp_info *info); - void (*ccid_hc_tx_get_info)(struct sock *sk, - struct tcp_info *info); - int (*ccid_hc_rx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); - int (*ccid_hc_tx_getsockopt)(struct sock *sk, - const int optname, int len, - u32 __user *optval, - int __user *optlen); -}; - -extern struct ccid_operations ccid2_ops; -#ifdef CONFIG_IP_DCCP_CCID3 -extern struct ccid_operations ccid3_ops; -#endif - -int ccid_initialize_builtins(void); -void ccid_cleanup_builtins(void); - -struct ccid { - struct ccid_operations *ccid_ops; - char ccid_priv[]; -}; - -static inline void *ccid_priv(const struct ccid *ccid) -{ - return (void *)ccid->ccid_priv; -} - -bool ccid_support_check(u8 const *ccid_array, u8 array_len); -int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); -int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, - char __user *, int __user *); - -struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); - -static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_rx_ccid; - - if (ccid == NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) -{ - struct ccid *ccid = dp->dccps_hc_tx_ccid; - - if (ccid 
== NULL || ccid->ccid_ops == NULL) - return -1; - return ccid->ccid_ops->ccid_id; -} - -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); - -/* - * Congestion control of queued data packets via CCID decision. - * - * The TX CCID performs its congestion-control by indicating whether and when a - * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). - * The following modes are supported via the symbolic constants below: - * - timer-based pacing (CCID returns a delay value in milliseconds); - * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). - */ - -enum ccid_dequeueing_decision { - CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ - CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ - CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ - CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ - CCID_PACKET_ERR = 0xF0000, /* error condition */ -}; - -static inline int ccid_packet_dequeue_eval(const int return_code) -{ - if (return_code < 0) - return CCID_PACKET_ERR; - if (return_code == 0) - return CCID_PACKET_SEND_AT_ONCE; - if (return_code <= CCID_PACKET_DELAY_MAX) - return CCID_PACKET_DELAY; - return return_code; -} - -static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return CCID_PACKET_SEND_AT_ONCE; -} - -static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - unsigned int len) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); -} - -static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb); -} - -static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); -} - -/** - * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver - * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) - * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) - * @val: value of @opt - * @len: length of @val in bytes - */ -static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); -} - -/** - * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender - * Arguments are analogous to ccid_hc_tx_parse_options() - */ -static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - u8 pkt, u8 opt, u8 *val, u8 len) -{ - if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options) - return 0; - return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); -} - -static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, - struct sk_buff *skb) -{ - if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL) - return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb); - return 0; -} - -static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL) - 
ccid->ccid_ops->ccid_hc_rx_get_info(sk, info); -} - -static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk, - struct tcp_info *info) -{ - if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL) - ccid->ccid_ops->ccid_hc_tx_get_info(sk, info); -} - -static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} - -static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, - const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - int rc = -ENOPROTOOPT; - if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, - optval, optlen); - return rc; -} -#endif /* _CCID_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig deleted file mode 100644 index e3d388c33d256..0000000000000 --- a/net/dccp/ccids/Kconfig +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -menu "DCCP CCIDs Configuration" - -config IP_DCCP_CCID2_DEBUG - bool "CCID-2 debugging messages" - help - Enable CCID-2 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid2_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_CCID3 - bool "CCID-3 (TCP-Friendly)" - default IP_DCCP = y || IP_DCCP = m - help - CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based - rate-controlled congestion control mechanism. TFRC is designed to - be reasonably fair when competing for bandwidth with TCP-like flows, - where a flow is "reasonably fair" if its sending rate is generally - within a factor of two of the sending rate of a TCP flow under the - same conditions. However, TFRC has a much lower variation of - throughput over time compared with TCP, which makes CCID-3 more - suitable than CCID-2 for applications such streaming media where a - relatively smooth sending rate is of importance. - - CCID-3 is further described in RFC 4342, - https://www.ietf.org/rfc/rfc4342.txt - - The TFRC congestion control algorithms were initially described in - RFC 5348. - - This text was extracted from RFC 4340 (sec. 10.2), - https://www.ietf.org/rfc/rfc4340.txt - - If in doubt, say N. - -config IP_DCCP_CCID3_DEBUG - bool "CCID-3 debugging messages" - depends on IP_DCCP_CCID3 - help - Enable CCID-3 specific debugging messages. - - The debugging output can additionally be toggled by setting the - ccid3_debug parameter to 0 or 1. - - If in doubt, say N. - -config IP_DCCP_TFRC_LIB - def_bool y if IP_DCCP_CCID3 - -config IP_DCCP_TFRC_DEBUG - def_bool y if IP_DCCP_CCID3_DEBUG -endmenu diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c deleted file mode 100644 index d6b30700af679..0000000000000 --- a/net/dccp/ccids/ccid2.c +++ /dev/null @@ -1,794 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005, 2006 Andrea Bittau - * - * Changes to meet Linux coding standards, and DCCP infrastructure fixes. - * - * Copyright (c) 2006 Arnaldo Carvalho de Melo - */ - -/* - * This implementation should follow RFC 4341 - */ -#include <linux/slab.h> -#include "../feat.h" -#include "ccid2.h" - - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -static bool ccid2_debug; -#define ccid2_pr_debug(format, a...) 
DCCP_PR_DEBUG(ccid2_debug, format, ##a) -#else -#define ccid2_pr_debug(format, a...) -#endif - -static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) -{ - struct ccid2_seq *seqp; - int i; - - /* check if we have space to preserve the pointer to the buffer */ - if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) / - sizeof(struct ccid2_seq *))) - return -ENOMEM; - - /* allocate buffer and initialize linked list */ - seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq), - gfp_any()); - if (seqp == NULL) - return -ENOMEM; - - for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) { - seqp[i].ccid2s_next = &seqp[i + 1]; - seqp[i + 1].ccid2s_prev = &seqp[i]; - } - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp; - seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - - /* This is the first allocation. Initiate the head and tail. */ - if (hc->tx_seqbufc == 0) - hc->tx_seqh = hc->tx_seqt = seqp; - else { - /* link the existing list with the one we just created */ - hc->tx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hc->tx_seqh; - - hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt; - } - - /* store the original pointer to the buffer so we can free it */ - hc->tx_seqbuf[hc->tx_seqbufc] = seqp; - hc->tx_seqbufc++; - - return 0; -} - -static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) - return CCID_PACKET_WILL_DEQUEUE_LATER; - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) -{ - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2); - - /* - * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from - * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always - * acceptable since this causes starvation/deadlock whenever cwnd < 2. - * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled). - */ - if (val == 0 || val > max_ratio) { - DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); - val = max_ratio; - } - dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, - min_t(u32, val, DCCPF_ACK_RATIO_MAX)); -} - -static void ccid2_check_l_ack_ratio(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - /* - * After a loss, idle period, application limited period, or RTO we - * need to check that the ack ratio is still less than the congestion - * window. Otherwise, we will send an entire congestion window of - * packets and got no response because we haven't sent ack ratio - * packets yet. - * If the ack ratio does need to be reduced, we reduce it to half of - * the congestion window (or 1 if that's zero) instead of to the - * congestion window. This prevents problems if one ack is lost. - */ - if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd) - ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? 
: 1U); -} - -static void ccid2_change_l_seq_window(struct sock *sk, u64 val) -{ - dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, - clamp_val(val, DCCPF_SEQ_WMIN, - DCCPF_SEQ_WMAX)); -} - -static void dccp_tasklet_schedule(struct sock *sk) -{ - struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet; - - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - sock_hold(sk); - __tasklet_schedule(t); - } -} - -static void ccid2_hc_tx_rto_expire(struct timer_list *t) -{ - struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer); - struct sock *sk = hc->sk; - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5); - goto out; - } - - ccid2_pr_debug("RTO_EXPIRE\n"); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - /* back-off timer */ - hc->tx_rto <<= 1; - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; - - /* adjust pipe, cwnd etc */ - hc->tx_ssthresh = hc->tx_cwnd / 2; - if (hc->tx_ssthresh < 2) - hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; - - /* clear state about stuff we sent */ - hc->tx_seqt = hc->tx_seqh; - hc->tx_packets_acked = 0; - - /* clear ack ratio state. */ - hc->tx_rpseq = 0; - hc->tx_rpdupack = -1; - ccid2_change_l_ack_ratio(sk, 1); - - /* if we were blocked before, we may now send cwnd=1 packet */ - if (sender_was_blocked) - dccp_tasklet_schedule(sk); - /* restart backed-off timer */ - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/* - * Congestion window validation (RFC 2861). - */ -static bool ccid2_do_cwv = true; -module_param(ccid2_do_cwv, bool, 0644); -MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation"); - -/** - * ccid2_update_used_window - Track how much of cwnd is actually used - * @hc: socket to update window - * @new_wnd: new window values to add into the filter - * - * This is done in addition to CWV. The sender needs to have an idea of how many - * packets may be in flight, to set the local Sequence Window value accordingly - * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the - * maximum-used window. We use an EWMA low-pass filter to filter out noise. 
- */ -static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) -{ - hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; -} - -/* This borrows the code of tcp_cwnd_application_limited() */ -static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* don't reduce cwnd below the initial window (IW) */ - u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), - win_used = max(hc->tx_cwnd_used, init_win); - - if (win_used < hc->tx_cwnd) { - hc->tx_ssthresh = max(hc->tx_ssthresh, - (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); - hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; - } - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - - ccid2_check_l_ack_ratio(sk); -} - -/* This borrows the code of tcp_cwnd_restart() */ -static void ccid2_cwnd_restart(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - u32 cwnd = hc->tx_cwnd, restart_cwnd, - iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); - s32 delta = now - hc->tx_lsndtime; - - hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); - - /* don't reduce cwnd below the initial window (IW) */ - restart_cwnd = min(cwnd, iwnd); - - while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) - cwnd >>= 1; - hc->tx_cwnd = max(cwnd, restart_cwnd); - hc->tx_cwnd_stamp = now; - hc->tx_cwnd_used = 0; - - ccid2_check_l_ack_ratio(sk); -} - -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_jiffies32; - struct ccid2_seq *next; - - /* slow-start after idle periods (RFC 2581, RFC 2861) */ - if (ccid2_do_cwv && !hc->tx_pipe && - (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) - ccid2_cwnd_restart(sk, now); - - hc->tx_lsndtime = now; - hc->tx_pipe += 1; - - /* see whether cwnd was fully used (RFC 2861), update expected window */ - if (ccid2_cwnd_network_limited(hc)) { - ccid2_update_used_window(hc, hc->tx_cwnd); - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - } else { - if (hc->tx_pipe > hc->tx_cwnd_used) - hc->tx_cwnd_used = hc->tx_pipe; - - ccid2_update_used_window(hc, hc->tx_cwnd_used); - - if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) - ccid2_cwnd_application_limited(sk, now); - } - - hc->tx_seqh->ccid2s_seq = dp->dccps_gss; - hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = now; - - next = hc->tx_seqh->ccid2s_next; - /* check if we need to alloc more space */ - if (next == hc->tx_seqt) { - if (ccid2_hc_tx_alloc_seq(hc)) { - DCCP_CRIT("packet history - out of memory!"); - /* FIXME: find a more graceful way to bail out */ - return; - } - next = hc->tx_seqh->ccid2s_next; - BUG_ON(next == hc->tx_seqt); - } - hc->tx_seqh = next; - - ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); - - /* - * FIXME: The code below is broken and the variables have been removed - * from the socket struct. The `ackloss' variable was always set to 0, - * and with arsent there are several problems: - * (i) it doesn't just count the number of Acks, but all sent packets; - * (ii) it is expressed in # of packets, not # of windows, so the - * comparison below uses the wrong formula: Appendix A of RFC 4341 - * comes up with the number K = cwnd / (R^2 - R) of consecutive windows - * of data with no lost or marked Ack packets. 
If arsent were the # of - * consecutive Acks received without loss, then Ack Ratio needs to be - * decreased by 1 when - * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) - * where cwnd / R is the number of Acks received per window of data - * (cf. RFC 4341, App. A). The problems are that - * - arsent counts other packets as well; - * - the comparison uses a formula different from RFC 4341; - * - computing a cubic/quadratic equation each time is too complicated. - * Hence a different algorithm is needed. - */ -#if 0 - /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hc->tx_arsent++; - /* We had an ack loss in this window... */ - if (hc->tx_ackloss) { - if (hc->tx_arsent >= hc->tx_cwnd) { - hc->tx_arsent = 0; - hc->tx_ackloss = 0; - } - } else { - /* No acks lost up to now... */ - /* decrease ack ratio if enough packets were sent */ - if (dp->dccps_l_ack_ratio > 1) { - /* XXX don't calculate denominator each time */ - int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - - dp->dccps_l_ack_ratio; - - denom = hc->tx_cwnd * hc->tx_cwnd / denom; - - if (hc->tx_arsent >= denom) { - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hc->tx_arsent = 0; - } - } else { - /* we can't increase ack ratio further [1] */ - hc->tx_arsent = 0; /* or maybe set it to cwnd*/ - } - } -#endif - - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG - do { - struct ccid2_seq *seqp = hc->tx_seqt; - - while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", - (unsigned long long)seqp->ccid2s_seq, - seqp->ccid2s_acked, seqp->ccid2s_sent); - seqp = seqp->ccid2s_next; - } - } while (0); - ccid2_pr_debug("=========\n"); -#endif -} - -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * @sk: socket to perform estimator on - * @mrtt: measured RTT - * - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. - */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hc->tx_srtt == 0) { - /* First measurement m */ - hc->tx_srtt = m << 3; - hc->tx_mdev = m << 1; - - hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); - hc->tx_rttvar = hc->tx_mdev_max; - - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hc->tx_srtt >> 3); - hc->tx_srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hc->tx_mdev >> 2); - /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). 
- */ - if (m > 0) - m >>= 3; - } else { - m -= (hc->tx_mdev >> 2); - } - hc->tx_mdev += m; - - if (hc->tx_mdev > hc->tx_mdev_max) { - hc->tx_mdev_max = hc->tx_mdev; - if (hc->tx_mdev_max > hc->tx_rttvar) - hc->tx_rttvar = hc->tx_mdev_max; - } - - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe. - * AWL is probably too low here, as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { - if (hc->tx_mdev_max < hc->tx_rttvar) - hc->tx_rttvar -= (hc->tx_rttvar - - hc->tx_mdev_max) >> 2; - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = tcp_rto_min(sk); - } - } - - /* - * Set RTO from SRTT and RTTVAR - * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. - * This agrees with RFC 4341, 5: - * "Because DCCP does not retransmit data, DCCP does not require - * TCP's recommended minimum timeout of one second". - */ - hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; -} - -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; - - if (hc->tx_cwnd < dp->dccps_l_seq_win && - r_seq_used < dp->dccps_r_seq_win) { - if (hc->tx_cwnd < hc->tx_ssthresh) { - if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { - hc->tx_cwnd += 1; - *maxincr -= 1; - hc->tx_packets_acked = 0; - } - } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { - hc->tx_cwnd += 1; - hc->tx_packets_acked = 0; - } - } - - /* - * Adjust the local sequence window and the ack ratio to allow about - * 5 times the number of packets in the network (RFC 4340 7.5.2) - */ - if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); - else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - - if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); - else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps: requires changes in other places. - */ - ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); -} - -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; - } - - hc->tx_last_cong = ccid2_jiffies32; - - hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; - hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - - ccid2_check_l_ack_ratio(sk); -} - -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); - } - return 0; -} - -static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - struct dccp_ackvec_parsed *avp; - u64 ackno, seqno; - struct ccid2_seq *seqp; - int done = 0; - unsigned int maxincr = 0; - - /* check reverse path congestion */ - seqno = DCCP_SKB_CB(skb)->dccpd_seq; - - /* XXX this whole "algorithm" is broken. Need to fix it to keep track - * of the seqnos of the dupacks so that rpseq and rpdupack are correct - * -sorbo. - */ - /* need to bootstrap */ - if (hc->tx_rpdupack == -1) { - hc->tx_rpdupack = 0; - hc->tx_rpseq = seqno; - } else { - /* check if packet is consecutive */ - if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) - hc->tx_rpseq = seqno; - /* it's a later packet */ - else if (after48(seqno, hc->tx_rpseq)) { - hc->tx_rpdupack++; - - /* check if we got enough dupacks */ - if (hc->tx_rpdupack >= NUMDUPACK) { - hc->tx_rpdupack = -1; /* XXX lame */ - hc->tx_rpseq = 0; -#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ - /* - * FIXME: Ack Congestion Control is broken; in - * the current state instabilities occurred with - * Ack Ratios greater than 1; causing hang-ups - * and long RTO timeouts. This needs to be fixed - * before opening up dynamic changes. -- gerrit - */ - ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); -#endif - } - } - } - - /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) - return; - - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) - goto done; - - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hc->tx_high_ack)) - hc->tx_high_ack = ackno; - - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, ackno)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - - /* - * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 - * packets per acknowledgement. Rounding up avoids that cwnd is not - * advanced when Ack Ratio is 1 and gives a slight edge otherwise. - */ - if (hc->tx_cwnd < hc->tx_ssthresh) - maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); - - /* go through all ack vectors */ - list_for_each_entry(avp, &hc->tx_av_chunks, node) { - /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); - - ccid2_pr_debug("ackvec %llu |%u,%u|\n", - (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); - /* if the seqno we are analyzing is larger than the - * current ackno, then move towards the tail of our - * seqnos. 
- */ - while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - /* check all seqnos in the range of the vector - * run length - */ - while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); - - /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && - !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) - ccid2_congestion_event(sk, - seqp); - else - ccid2_new_ack(sk, seqp, - &maxincr); - - seqp->ccid2s_acked = 1; - ccid2_pr_debug("Got ack for %llu\n", - (unsigned long long)seqp->ccid2s_seq); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - ackno = SUB48(ackno_end_rl, 1); - } - if (done) - break; - } - - /* The state about what is acked should be correct now - * Check for NUMDUPACK - */ - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - done = 0; - while (1) { - if (seqp->ccid2s_acked) { - done++; - if (done == NUMDUPACK) - break; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - /* If there are at least 3 acknowledgements, anything unacknowledged - * below the last sequence number is considered lost - */ - if (done == NUMDUPACK) { - struct ccid2_seq *last_acked = seqp; - - /* check for lost packets */ - while (1) { - if (!seqp->ccid2s_acked) { - ccid2_pr_debug("Packet lost: %llu\n", - (unsigned long long)seqp->ccid2s_seq); - /* XXX need to traverse from tail -> head in - * order to detect multiple congestion events in - * one ack vector. - */ - ccid2_congestion_event(sk, seqp); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - hc->tx_seqt = last_acked; - } - - /* trim acked packets in tail */ - while (hc->tx_seqt != hc->tx_seqh) { - if (!hc->tx_seqt->ccid2s_acked) - break; - - hc->tx_seqt = hc->tx_seqt->ccid2s_next; - } - - /* restart RTO timer if not all outstanding data has been acked */ - if (hc->tx_pipe == 0) - sk_stop_timer(sk, &hc->tx_rtotimer); - else - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - dccp_tasklet_schedule(sk); - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio; - - /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hc->tx_ssthresh = ~0U; - - /* Use larger initial windows (RFC 4341, section 5). */ - hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); - hc->tx_expected_wnd = hc->tx_cwnd; - - /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); - if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) - dp->dccps_l_ack_ratio = max_ratio; - - /* XXX init ~ to window size... 
*/ - if (ccid2_hc_tx_alloc_seq(hc)) - return -ENOMEM; - - hc->tx_rto = DCCP_TIMEOUT_INIT; - hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; - hc->tx_cwnd_used = 0; - hc->sk = sk; - timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); - INIT_LIST_HEAD(&hc->tx_av_chunks); - return 0; -} - -static void ccid2_hc_tx_exit(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - int i; - - sk_stop_timer(sk, &hc->tx_rtotimer); - - for (i = 0; i < hc->tx_seqbufc; i++) - kfree(hc->tx_seqbuf[i]); - hc->tx_seqbufc = 0; - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); - - if (!dccp_data_packet(skb)) - return; - - if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hc->rx_num_data_pkts = 0; - } -} - -struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, -}; - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h deleted file mode 100644 index 330c7b4ec0013..0000000000000 --- a/net/dccp/ccids/ccid2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005 Andrea Bittau - */ -#ifndef _DCCP_CCID2_H_ -#define _DCCP_CCID2_H_ - -#include -#include -#include "../ccid.h" -#include "../dccp.h" - -/* - * CCID-2 timestamping faces the same issues as TCP timestamping. - * Hence we reuse/share as much of the code as possible. - */ -#define ccid2_jiffies32 ((u32)jiffies) - -/* NUMDUPACK parameter from RFC 4341, p. 
6 */ -#define NUMDUPACK 3 - -struct ccid2_seq { - u64 ccid2s_seq; - u32 ccid2s_sent; - int ccid2s_acked; - struct ccid2_seq *ccid2s_prev; - struct ccid2_seq *ccid2s_next; -}; - -#define CCID2_SEQBUF_LEN 1024 -#define CCID2_SEQBUF_MAX 128 - -/* - * Multiple of congestion window to keep the sequence window at - * (RFC 4340 7.5.2) - */ -#define CCID2_WIN_CHANGE_FACTOR 5 - -/** - * struct ccid2_hc_tx_sock - CCID2 TX half connection - * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_srtt: smoothed RTT estimate, scaled by 2^3 - * @tx_mdev: smoothed RTT variation, scaled by 2^2 - * @tx_mdev_max: maximum of @mdev during one flight - * @tx_rttvar: moving average/maximum of @mdev_max - * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @tx_rtt_seq: to decay RTTVAR at most once per flight - * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 - * @tx_expected_wnd: moving average of @tx_cwnd_used - * @tx_cwnd_stamp: to track idle periods in CWV - * @tx_lsndtime: last time (in jiffies) a data packet was sent - * @tx_rpseq: last consecutive seqno - * @tx_rpdupack: dupacks since rpseq - * @tx_av_chunks: list of Ack Vectors received on current skb - */ -struct ccid2_hc_tx_sock { - u32 tx_cwnd; - u32 tx_ssthresh; - u32 tx_pipe; - u32 tx_packets_acked; - struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; - int tx_seqbufc; - struct ccid2_seq *tx_seqh; - struct ccid2_seq *tx_seqt; - - /* RTT measurement: variables/principles are the same as in TCP */ - u32 tx_srtt, - tx_mdev, - tx_mdev_max, - tx_rttvar, - tx_rto; - u64 tx_rtt_seq:48; - struct timer_list tx_rtotimer; - struct sock *sk; - - /* Congestion Window validation (optional, RFC 2861) */ - u32 tx_cwnd_used, - tx_expected_wnd, - tx_cwnd_stamp, - tx_lsndtime; - - u64 tx_rpseq; - int tx_rpdupack; - u32 tx_last_cong; - u64 tx_high_ack; - struct list_head tx_av_chunks; -}; - -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) -{ - return hc->tx_pipe >= hc->tx_cwnd; -} - -/* - * Convert RFC 3390 larger initial window into an equivalent number of packets. - * This is based on the numbers specified in RFC 5681, 3.1. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 smss) -{ - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); -} - -/** - * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection - * @rx_num_data_pkts: number of data packets received since last feedback - */ -struct ccid2_hc_rx_sock { - u32 rx_num_data_pkts; -}; - -static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); -} - -static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); -} -#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c deleted file mode 100644 index f349d16dd8f65..0000000000000 --- a/net/dccp/ccids/ccid3.c +++ /dev/null @@ -1,866 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
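
The packet thresholds in rfc3390_bytes_to_packets() above are RFC 5681, 3.1 restated: IW = min(4*SMSS, max(2*SMSS, 4380 bytes)). A quick self-check of the three bands (sketch):

#include <assert.h>

static unsigned iw_packets(unsigned smss)
{
	return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
}

int main(void)
{
	assert(iw_packets(536)  == 4);  /* 4 * 536  = 2144 <= 4380 */
	assert(iw_packets(1460) == 3);  /* 3 * 1460 = 4380 exactly */
	assert(iw_packets(4096) == 2);  /* 2 * 4096 = 8192 >  4380 */
	return 0;
}
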
For further information please see https://www.wand.net.nz/ - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include "../dccp.h" -#include "ccid3.h" - -#include - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static bool ccid3_debug; -#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) -#else -#define ccid3_pr_debug(format, a...) -#endif - -/* - * Transmitter Half-Connection Routines - */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static const char *const ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hc->tx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hc->tx_state = state; -} - -/* - * Compute the initial sending rate X_init in the manner of RFC 3390: - * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT - * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. - * For consistency with other parts of the code, X_init is scaled by 2^6. - */ -static inline u64 rfc3390_initial_rate(struct sock *sk) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); - - return scaled_div(w_init << 6, hc->tx_rtt); -} - -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * @hc: socket to have the send interval updated - * - * This respects the granularity of X_inst (64 * bytes/second). - */ -static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) -{ - hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - - DCCP_BUG_ON(hc->tx_t_ipi == 0); - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_s, (unsigned int)(hc->tx_x >> 6)); -} - -static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); - - return delta / hc->tx_rtt; -} - -/** - * ccid3_hc_tx_update_x - Update allowed sending rate X - * @sk: socket to be updated - * @stamp: most recent time if available - can be left NULL. - * - * This function tracks draft rfc3448bis, check there for latest details. - * - * Note: X and X_recv are both stored in units of 64 * bytes/second, to support - * fine-grained resolution of sending rates. This requires scaling by 2^6 - * throughout the code. Only X_calc is unscaled (in bytes/second). - * - */ -static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hc->tx_x_recv; - const __u64 old_x = hc->tx_x; - ktime_t now = stamp ? 
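
Worked numbers for rfc3390_initial_rate() and ccid3_update_send_interval() above, assuming scaled_div(a, b) computes a * USEC_PER_SEC / b as in the tfrc library — treat this as a hedged sketch with made-up inputs:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t s = 1460, rtt_us = 100000;            /* made-up inputs      */
	uint64_t w_init = 4380;                        /* clamp(4380, 2s, 4s) */
	uint64_t x = (w_init << 6) * 1000000 / rtt_us; /* 64 * bytes/sec      */
	uint64_t t_ipi = (s << 6) * 1000000 / x;       /* microseconds        */

	printf("X = %llu B/s, t_ipi = %llu us\n",
	       (unsigned long long)(x >> 6), (unsigned long long)t_ipi);
	/* prints X = 43800 B/s, t_ipi = 33333 us, i.e. ~30 pkts/sec */
	return 0;
}

The 2^6 scaling of X cancels out of the ratio, so t_ipi comes out directly in microseconds: 1460 bytes / 43800 bytes/sec = 33.3 ms per packet.
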
*stamp : ktime_get_real(); - - /* - * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: - * a sender is idle if it has not sent anything over a 2-RTT-period. - * For consistency with X and X_recv, min_rate is also scaled by 2^6. - */ - if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { - min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hc->tx_x_recv); - } - - if (hc->tx_p > 0) { - - hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); - hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - - } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { - - hc->tx_x = min(2 * hc->tx_x, min_rate); - hc->tx_x = max(hc->tx_x, - scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); - hc->tx_t_ld = now; - } - - if (hc->tx_x != old_x) { - ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " - "X_recv=%u\n", (unsigned int)(old_x >> 6), - (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6)); - - ccid3_update_send_interval(hc); - } -} - -/** - * ccid3_hc_tx_update_s - Track the mean packet size `s' - * @hc: socket to be updated - * @len: DCCP packet payload size in bytes - * - * cf. RFC 4342, 5.3 and RFC 3448, 4.1 - */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) -{ - const u16 old_s = hc->tx_s; - - hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); - - if (hc->tx_s != old_s) - ccid3_update_send_interval(hc); -} - -/* - * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). - */ -static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, - ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), - quarter_rtts = (4 * delta) / hc->tx_rtt; - - if (quarter_rtts > 0) { - hc->tx_t_last_win_count = now; - hc->tx_last_win_count += min(quarter_rtts, 5U); - hc->tx_last_win_count &= 0xF; /* mod 16 */ - } -} - -static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) -{ - struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); - struct sock *sk = hc->sk; - unsigned long t_nfb = USEC_PER_SEC / 5; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - /* XXX: set some sensible MIB */ - goto restart_timer; - } - - ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, - ccid3_tx_state_name(hc->tx_state)); - - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - goto out; - - /* Reset feedback state to "no feedback received" */ - if (hc->tx_state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* - * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. 
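
ccid3_hc_tx_update_s() above smooths s via tfrc_ewma(s, len, 9). A sketch of that helper as it appears in the tfrc library, where the weight is in tenths (9 keeps 90% of the old average and seeds from the first sample):

#include <stdint.h>
#include <stdio.h>

static uint32_t tfrc_ewma(uint32_t avg, uint32_t newval, uint8_t weight)
{
	return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
}

int main(void)
{
	uint32_t s = 0;
	const uint32_t lens[] = { 1460, 1460, 536, 1460 }; /* made up */

	for (int i = 0; i < 4; i++) {
		s = tfrc_ewma(s, lens[i], 9);
		printf("len=%4u -> s=%u\n", lens[i], s);
	}
	return 0;
}

A single small packet (536) only nudges the mean from 1460 to 1367, which is the intended inertia for the s/X send-interval calculation.
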
- */ - if (hc->tx_t_rto == 0 || hc->tx_p == 0) { - - /* halve send rate directly */ - hc->tx_x = max(hc->tx_x / 2, - (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - ccid3_update_send_interval(hc); - } else { - /* - * Modify the cached value of X_recv - * - * If (X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) - hc->tx_x_recv = - max(hc->tx_x_recv / 2, - (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); - else { - hc->tx_x_recv = hc->tx_x_calc; - hc->tx_x_recv <<= 4; - } - ccid3_hc_tx_update_x(sk, NULL); - } - ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hc->tx_x); - - /* - * Set new timeout for the nofeedback timer. - * See comments in packet_recv() regarding the value of t_RTO. - */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ - t_nfb = TFRC_INITIAL_TIMEOUT; - else - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - -restart_timer: - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @sk: socket to send packet from - * @skb: next packet candidate to send on @sk - * - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. - */ -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ktime_t now = ktime_get_real(); - s64 delay; - - /* - * This function is called only for Data and DataAck packets. Sending - * zero-sized Data(Ack)s is theoretically possible, but for congestion - * control this case is pathological - ignore it. - */ - if (unlikely(skb->len == 0)) - return -EBADMSG; - - if (hc->tx_state == TFRC_SSTATE_NO_SENT) { - sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + - usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hc->tx_last_win_count = 0; - hc->tx_t_last_win_count = now; - - /* Set t_0 for initial packet */ - hc->tx_t_nom = now; - - hc->tx_s = skb->len; - - /* - * Use initial RTT sample when available: recommended by erratum - * to RFC 4342. This implements the initialisation procedure of - * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. - */ - if (dp->dccps_syn_rtt) { - ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hc->tx_rtt = dp->dccps_syn_rtt; - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - } else { - /* - * Sender does not have RTT sample: - * - set fallback RTT (RFC 4340, 3.4) since a RTT value - * is needed in several parts (e.g. window counter); - * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. - */ - hc->tx_rtt = DCCP_FALLBACK_RTT; - hc->tx_x = hc->tx_s; - hc->tx_x <<= 6; - } - ccid3_update_send_interval(hc); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - } else { - delay = ktime_us_delta(hc->tx_t_nom, now); - ccid3_pr_debug("delay=%ld\n", (long)delay); - /* - * Scheduling of packet transmissions (RFC 5348, 8.3) - * - * if (t_now > t_nom - delta) - * // send the packet now - * else - * // send the packet in (t_nom - t_now) milliseconds. - */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; - - ccid3_hc_tx_update_win_count(hc, now); - } - - /* prepare to send now (add options etc.) 
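
The shift constants in the X_recv branch above fold the 2^6 scaling of X_recv into comparisons against the unscaled X_calc: X_calc > 2 * X_recv becomes X_calc > (X_recv_scaled >> 5), and X_recv = X_calc / 4 rescaled by 2^6 becomes X_calc << 4. A compile-and-run check of both equivalences (sketch, values made up):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t x_calc = 40000;          /* bytes/sec, unscaled      */
	uint64_t x_recv = 15000ULL << 6;  /* bytes/sec, scaled by 2^6 */

	/* X_calc > 2 * X_recv  <=>  X_calc > X_recv_scaled >> 5 */
	assert((x_calc > 2 * (x_recv >> 6)) == (x_calc > (x_recv >> 5)));

	/* (X_calc / 4) << 6 == X_calc << 4, exact when 4 | X_calc */
	assert(((x_calc / 4) << 6) == (x_calc << 4));
	return 0;
}
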
*/ - dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; - - /* set the nominal send time for the next following packet */ - hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - ccid3_hc_tx_update_s(hc, len); - - if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) - DCCP_CRIT("packet history - out of memory!"); -} - -static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; - ktime_t now; - unsigned long t_nfb; - u32 r_sample; - - /* we are only interested in ACKs */ - if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || - DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) - return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) - return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); - - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - - /* - * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 - */ - if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - - if (hc->tx_t_rto == 0) { - /* - * Initial feedback packet: Larger Initial Windows (4.2) - */ - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - - ccid3_update_send_interval(hc); - - goto done_computing_x; - } else if (hc->tx_p == 0) { - /* - * First feedback after nofeedback timer expiry (4.3) - */ - goto done_computing_x; - } - } - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hc->tx_p > 0) - hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); - ccid3_hc_tx_update_x(sk, &now); - -done_computing_x: - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hc->tx_rtt, r_sample, - hc->tx_s, hc->tx_p, hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6), - (unsigned int)(hc->tx_x >> 6)); - - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); - - /* - * Update timeout interval for the nofeedback timer. In order to control - * rate halving on networks with very low RTTs (<= 1 ms), use per-route - * tunable RTAX_RTO_MIN value as the lower bound. 
- */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, - USEC_PER_SEC/HZ * tcp_rto_min(sk)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -} - -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __be32 opt_val; - - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); - - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hc->tx_x_recv = opt_val; - hc->tx_x_recv <<= 6; - - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } else { - /* Update the fixpoint Loss Event Rate fraction */ - hc->tx_p = tfrc_invert_loss_event_rate(opt_val); - - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } - } - return 0; -} - -static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); - - hc->tx_state = TFRC_SSTATE_NO_SENT; - hc->tx_hist = NULL; - hc->sk = sk; - timer_setup(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, 0); - return 0; -} - -static void ccid3_hc_tx_exit(struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); -} - -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; -} - -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) - return -EINVAL; - memset(&tfrc, 0, sizeof(tfrc)); - tfrc.tfrctx_x = hc->tx_x; - tfrc.tfrctx_x_recv = hc->tx_x_recv; - tfrc.tfrctx_x_calc = hc->tx_x_calc; - tfrc.tfrctx_rtt = hc->tx_rtt; - tfrc.tfrctx_p = hc->tx_p; - tfrc.tfrctx_rto = hc->tx_t_rto; - tfrc.tfrctx_ipi = hc->tx_t_ipi; - len = sizeof(tfrc); - val = &tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -/* - * Receiver Half-Connection Routines - */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static const char *const ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hc 
= ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hc->rx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hc->rx_state = state; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk, - const struct sk_buff *skb, - enum ccid3_fback_type fbtype) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get(); - s64 delta = 0; - - switch (fbtype) { - case CCID3_FBACK_INITIAL: - hc->rx_x_recv = 0; - hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ - break; - case CCID3_FBACK_PARAM_CHANGE: - /* - * When parameters change (new loss or p > p_prev), we do not - * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. - */ - if (hc->rx_x_recv > 0) - break; - fallthrough; - case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); - break; - default: - return; - } - - ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, - hc->rx_x_recv, hc->rx_pinv); - - hc->rx_tstamp_last_feedback = now; - hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; - hc->rx_bytes_recv = 0; - - dp->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); -} - -static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - __be32 x_recv, pinv; - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return 0; - - if (dccp_packet_without_ack(skb)) - return 0; - - x_recv = htonl(hc->rx_x_recv); - pinv = htonl(hc->rx_pinv); - - if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)) || - dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv))) - return -1; - - return 0; -} - -/** - * ccid3_first_li - Implements [RFC 5348, 6.3.1] - * @sk: socket to calculate loss interval for - * - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p (scaled). - */ -static u32 ccid3_first_li(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p; - s64 delta; - u64 fval; - - if (hc->rx_rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hc->rx_rtt = DCCP_FALLBACK_RTT; - } - - delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - x_recv = scaled_div32(hc->rx_bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hc->rx_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hc->rx_x_recv; - } - - fval = scaled_div(hc->rx_s, hc->rx_rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); - - ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - - return p == 0 ? 
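
A numeric sanity check for the ccid3_first_li() inversion above, using only the simplified throughput model X = s / (R * sqrt(2p/3)); the real tfrc_calc_x_reverse_lookup() inverts the full RFC 3448 equation including the t_RTO term, so treat this as a sketch of the idea rather than the exact table lookup:

#include <stdio.h>

int main(void)
{
	double s = 1460, r = 0.1, x_recv = 43800; /* bytes, sec, bytes/s */
	double fval = s / (r * x_recv);           /* = sqrt(2p/3) approx */
	double p = 1.5 * fval * fval;             /* invert sqrt(2p/3)   */

	printf("fval=%.4f  p=%.4f  1/p=%.1f\n", fval, p, 1.0 / p);
	/* fval=0.3333  p=0.1667  1/p=6.0 */
	return 0;
}

The function returns the scaled inverse 1/p because the loss-interval database stores interval lengths, i.e. packets per loss event, not the rate itself.
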
~0U : scaled_div(1, p); -} - -static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; - const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - const bool is_data_packet = dccp_data_packet(skb); - - if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hc->rx_s = payload; - /* - * Not necessary to update rx_bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ - } - goto update_records; - } - - if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); - hc->rx_bytes_recv += payload; - } - - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) - return; /* done receiving */ - - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. 
- */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; - } - - /* - * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 - */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); -} - -static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); - - hc->rx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hc->rx_li_hist); - return tfrc_rx_hist_alloc(&hc->rx_hist); -} - -static void ccid3_hc_rx_exit(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - - tfrc_rx_hist_purge(&hc->rx_hist); - tfrc_lh_cleanup(&hc->rx_li_hist); -} - -static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; -} - -static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct tfrc_rx_info rx_info; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(rx_info)) - return -EINVAL; - rx_info.tfrcrx_x_recv = hc->rx_x_recv; - rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); - len = sizeof(rx_info); - val = &rx_info; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -struct ccid_operations ccid3_ops = { - .ccid_id = DCCPC_CCID3, - .ccid_name = "TCP-Friendly Rate Control", - .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), - .ccid_hc_tx_init = ccid3_hc_tx_init, - .ccid_hc_tx_exit = ccid3_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, - .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), - .ccid_hc_rx_init = ccid3_hc_rx_init, - .ccid_hc_rx_exit = ccid3_hc_rx_exit, - .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, - .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, - .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, - .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, - .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, - .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h deleted file mode 100644 index 02e0fc9f6334b..0000000000000 --- a/net/dccp/ccids/ccid3.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#ifndef _DCCP_CCID3_H_ -#define _DCCP_CCID3_H_ - -#include -#include -#include -#include -#include "lib/tfrc.h" -#include "../ccid.h" - -/* Two seconds as per RFC 5348, 4.2 */ -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) - -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 - -/* - * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#endif - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - -/** - * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket - * @tx_x: Current sending rate in 64 * bytes per second - * @tx_x_recv: Receive rate in 64 * bytes per second - * @tx_x_calc: Calculated rate in bytes per second - * @tx_rtt: Estimate of current round trip time in usecs - * @tx_p: Current loss event rate (0-1) scaled by 1000000 - * @tx_s: Packet size in bytes - * @tx_t_rto: Nofeedback Timer setting in usecs - * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @tx_state: Sender state, one of %ccid3_hc_tx_states - * @tx_last_win_count: Last window counter sent - * @tx_t_last_win_count: Timestamp of earliest packet - * with last_win_count value sent - * @tx_no_feedback_timer: Handle to no feedback timer - * @tx_t_ld: Time last doubled during slow start - * @tx_t_nom: Nominal send time of next packet - * @tx_hist: Packet history - */ -struct ccid3_hc_tx_sock { - u64 tx_x; - u64 tx_x_recv; - u32 tx_x_calc; - u32 tx_rtt; - u32 tx_p; - u32 tx_t_rto; - u32 tx_t_ipi; - u16 tx_s; - enum ccid3_hc_tx_states tx_state:8; - u8 tx_last_win_count; - ktime_t tx_t_last_win_count; - struct timer_list tx_no_feedback_timer; - struct sock *sk; - ktime_t tx_t_ld; - ktime_t tx_t_nom; - struct tfrc_tx_hist_entry *tx_hist; -}; - -static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -{ - struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); - BUG_ON(hctx == NULL); - return hctx; -} - -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, -}; - -/** - * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) - * @rx_state: Receiver state, one of %ccid3_hc_rx_states - * @rx_bytes_recv: Total sum of DCCP payload bytes - * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 
4.3) - * @rx_rtt: Receiver estimate of RTT - * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_hist: Packet history (loss detection + RTT sampling) - * @rx_li_hist: Loss Interval database - * @rx_s: Received packet size in bytes - * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - */ -struct ccid3_hc_rx_sock { - u8 rx_last_counter:4; - enum ccid3_hc_rx_states rx_state:8; - u32 rx_bytes_recv; - u32 rx_x_recv; - u32 rx_rtt; - ktime_t rx_tstamp_last_feedback; - struct tfrc_rx_hist rx_hist; - struct tfrc_loss_hist rx_li_hist; - u16 rx_s; -#define rx_pinv rx_li_hist.i_mean -}; - -static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) -{ - struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); - BUG_ON(hcrx == NULL); - return hcrx; -} - -#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c deleted file mode 100644 index da95319842bbe..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -static struct kmem_cache *tfrc_lh_slab __read_mostly; -/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ -static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; - -/* implements LIFO semantics on the array */ -static inline u8 LIH_INDEX(const u8 ctr) -{ - return LIH_SIZE - 1 - (ctr % LIH_SIZE); -} - -/* the `counter' index always points at the next entry to be populated */ -static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) -{ - return lh->counter ? 
lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; -} - -/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ -static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) -{ - BUG_ON(i >= lh->counter); - return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; -} - -/* - * On-demand allocation and de-allocation of entries - */ -static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) -{ - if (lh->ring[LIH_INDEX(lh->counter)] == NULL) - lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, - GFP_ATOMIC); - return lh->ring[LIH_INDEX(lh->counter)]; -} - -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) -{ - if (!tfrc_lh_is_initialised(lh)) - return; - - for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) - if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { - kmem_cache_free(tfrc_lh_slab, - lh->ring[LIH_INDEX(lh->counter)]); - lh->ring[LIH_INDEX(lh->counter)] = NULL; - } -} - -static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) -{ - u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; - int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - - if (k <= 0) - return; - - for (i = 0; i <= k; i++) { - i_i = tfrc_lh_get_interval(lh, i); - - if (i < k) { - i_tot0 += i_i * tfrc_lh_weights[i]; - w_tot += tfrc_lh_weights[i]; - } - if (i > 0) - i_tot1 += i_i * tfrc_lh_weights[i-1]; - } - - lh->i_mean = max(i_tot0, i_tot1) / w_tot; -} - -/** - * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * @lh: histogram to update - * @skb: received socket triggering loss interval update - * - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev - */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; - s64 len; - - if (cur == NULL) /* not initialised */ - return 0; - - len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; - - if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; - - if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) - /* - * Implements RFC 4342, 10.2: - * If a packet S (skb) exists whose seqno comes `after' the one - * starting the current loss interval (cur) and if the modulo-16 - * distance from C(cur) to C(S) is greater than 4, consider all - * subsequent packets as belonging to a new loss interval. This - * test is necessary since CCVal may wrap between intervals. - */ - cur->li_is_closed = 1; - - if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; - - cur->li_length = len; - tfrc_lh_calc_i_mean(lh); - - return lh->i_mean < old_i_mean; -} - -/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, - struct tfrc_rx_hist_entry *new_loss) -{ - return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && - (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); -} - -/** - * tfrc_lh_interval_add - Insert new record into the Loss Interval database - * @lh: Loss Interval database - * @rh: Receive history containing a fresh loss event - * @calc_first_li: Caller-dependent routine to compute length of first interval - * @sk: Used by @calc_first_li in caller-specific way (subtyping) - * - * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
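
A standalone numeric run of the weighted mean in tfrc_lh_calc_i_mean() above (RFC 3448, 5.4): i_tot0 weights the open interval I_0 in, i_tot1 leaves it out, and taking the max keeps a short, still-growing I_0 from inflating the loss event rate. Interval lengths here are made up:

#include <stdio.h>

int main(void)
{
	static const int w[] = { 10, 10, 10, 10, 8, 6, 4, 2 };
	static const unsigned li[] = { 50, 200, 250, 220 }; /* I_0..I_k */
	const int k = 3;
	unsigned i_tot0 = 0, i_tot1 = 0, w_tot = 0;

	for (int i = 0; i <= k; i++) {
		if (i < k) {
			i_tot0 += li[i] * w[i];
			w_tot  += w[i];
		}
		if (i > 0)
			i_tot1 += li[i] * w[i - 1];
	}
	printf("I_mean = %u\n", (i_tot0 > i_tot1 ? i_tot0 : i_tot1) / w_tot);
	/* prints I_mean = 223: the short open interval I_0 is ignored */
	return 0;
}
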
- */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - - if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; - - new = tfrc_lh_demand_next(lh); - if (unlikely(new == NULL)) { - DCCP_CRIT("Cannot allocate/add loss record."); - return 0; - } - - new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; - new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; - new->li_is_closed = 0; - - if (++lh->counter == 1) - lh->i_mean = new->li_length = (*calc_first_li)(sk); - else { - cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); - new->li_length = dccp_delta_seqno(new->li_seqno, - tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; - if (lh->counter > (2*LIH_SIZE)) - lh->counter -= LIH_SIZE; - - tfrc_lh_calc_i_mean(lh); - } - return 1; -} - -int __init tfrc_li_init(void) -{ - tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", - sizeof(struct tfrc_loss_interval), 0, - SLAB_HWCACHE_ALIGN, NULL); - return tfrc_lh_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_li_exit(void) -{ - if (tfrc_lh_slab != NULL) { - kmem_cache_destroy(tfrc_lh_slab); - tfrc_lh_slab = NULL; - } -} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h deleted file mode 100644 index c3d95f85e43b1..0000000000000 --- a/net/dccp/ccids/lib/loss_interval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _DCCP_LI_HIST_ -#define _DCCP_LI_HIST_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ -#include -#include -#include - -/* - * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than - * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. 
- */ -#define NINTERVAL 8 -#define LIH_SIZE (NINTERVAL + 1) - -/** - * tfrc_loss_interval - Loss history record for TFRC-based protocols - * @li_seqno: Highest received seqno before the start of loss - * @li_ccval: The CCVal belonging to @li_seqno - * @li_is_closed: Whether @li_seqno is older than 1 RTT - * @li_length: Loss interval sequence length - */ -struct tfrc_loss_interval { - u64 li_seqno:48, - li_ccval:4, - li_is_closed:1; - u32 li_length; -}; - -/** - * tfrc_loss_hist - Loss record database - * @ring: Circular queue managed in LIFO manner - * @counter: Current count of entries (can be more than %LIH_SIZE) - * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] - */ -struct tfrc_loss_hist { - struct tfrc_loss_interval *ring[LIH_SIZE]; - u8 counter; - u32 i_mean; -}; - -static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) -{ - memset(lh, 0, sizeof(struct tfrc_loss_hist)); -} - -static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) -{ - return lh->counter > 0; -} - -static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) -{ - return min(lh->counter, (u8)LIH_SIZE); -} - -struct tfrc_rx_hist; - -int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, - u32 (*first_li)(struct sock *), struct sock *); -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); - -#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c deleted file mode 100644 index 0cdda3c66fb5e..0000000000000 --- a/net/dccp/ccids/lib/packet_history.c +++ /dev/null @@ -1,439 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#include -#include -#include "packet_history.h" -#include "../../dccp.h" - -/* - * Transmitter History Routines - */ -static struct kmem_cache *tfrc_tx_hist_slab; - -int __init tfrc_tx_packet_history_init(void) -{ - tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", - sizeof(struct tfrc_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist_slab == NULL ? 
-ENOBUFS : 0; -} - -void tfrc_tx_packet_history_exit(void) -{ - if (tfrc_tx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_tx_hist_slab); - tfrc_tx_hist_slab = NULL; - } -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) -{ - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); - - if (entry == NULL) - return -ENOBUFS; - entry->seqno = seqno; - entry->stamp = ktime_get_real(); - entry->next = *headp; - *headp = entry; - return 0; -} - -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) -{ - struct tfrc_tx_hist_entry *head = *headp; - - while (head != NULL) { - struct tfrc_tx_hist_entry *next = head->next; - - kmem_cache_free(tfrc_tx_hist_slab, head); - head = next; - } - - *headp = NULL; -} - -/* - * Receiver History Routines - */ -static struct kmem_cache *tfrc_rx_hist_slab; - -int __init tfrc_rx_packet_history_init(void) -{ - tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", - sizeof(struct tfrc_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_rx_packet_history_exit(void) -{ - if (tfrc_rx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_rx_hist_slab); - tfrc_rx_hist_slab = NULL; - } -} - -static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, - const struct sk_buff *skb, - const u64 ndp) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->tfrchrx_ccval = dh->dccph_ccval; - entry->tfrchrx_type = dh->dccph_type; - entry->tfrchrx_ndp = ndp; - entry->tfrchrx_tstamp = ktime_get_real(); -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, - const struct sk_buff *skb, - const u64 ndp) -{ - struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); - - tfrc_rx_hist_entry_from_skb(entry, skb, ndp); -} - -/* has the packet contained in skb been seen before? */ -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) -{ - const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; - int i; - - if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) - return 1; - - for (i = 1; i <= h->loss_count; i++) - if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) - return 1; - - return 0; -} - -static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - - swap(h->ring[idx_a], h->ring[idx_b]); -} - -/* - * Private helper functions for loss detection. - * - * In the descriptions, `Si' refers to the sequence number of entry number i, - * whose NDP count is `Ni' (lower case is used for variables). - * Note: All __xxx_loss functions expect that a test against duplicates has been - * performed already: the seqno of the skb must not be less than the seqno - * of loss_prev; and it must not equal that of any valid history entry. 
- */ -static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ - h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } -} - -static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ - h->loss_count = 2; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); - return; - } - - /* S0 < S2 < S1 */ - - if (dccp_loss_free(s0, s2, n2)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s2, s1, n1)) { - /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); - } else - /* gap between S2 and S1: just update loss_prev */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); - - } else { /* gap between S0 and S2 */ - /* - * Reorder history to insert S2 between S0 and S1 - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); - h->loss_count = 2; - } -} - -/* return 1 if a new loss event has been identified */ -static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ - h->loss_count = 3; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); - return 1; - } - - /* S3 < S2 */ - - if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ - /* - * Reorder history to insert S3 between S1 and S2 - */ - tfrc_rx_hist_swap(h, 2, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); - h->loss_count = 3; - return 1; - } - - /* S0 < S3 < S1 */ - - if (dccp_loss_free(s0, s3, n3)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s3, s1, n1)) { - /* hole between S0 and S1 filled by S3 */ - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; - } else { - /* gap remains between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 1; - } - - } else /* gap exists between S3 and S1, loss_count stays at 2 */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); - - return 0; - } - - /* - * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 - * Reorder history to insert S3 between S0 and S1. - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); - h->loss_count = 3; - - return 1; -} - -/* recycle RX history records to continue loss detection if necessary */ -static void __three_after_loss(struct tfrc_rx_hist *h) -{ - /* - * At this stage we know already that there is a gap between S0 and S1 - * (since S0 was the highest sequence number received before detecting - * the loss). To recycle the loss record, it is thus only necessary to - * check for other possible gaps between S1/S2 and between S2/S3. 
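
All of the __*_after_loss() cases above hinge on one predicate: a seqno gap is loss-free when the sender's NDP (non-data packet) count accounts for every missing number. A hedged sketch of that check — dccp.h carries the real dccp_loss_free(), which computes the delta modulo 2^48:

#include <assert.h>
#include <stdint.h>

/* s2 follows s1 without data loss if at most ndp non-data packets
 * occupy the seqnos in between (simplified: no 48-bit wrap handling) */
static int loss_free(uint64_t s1, uint64_t s2, uint64_t ndp)
{
	return s2 - s1 <= ndp + 1;
}

int main(void)
{
	assert(loss_free(10, 11, 0));   /* consecutive                */
	assert(loss_free(10, 13, 2));   /* 11, 12 were non-data pkts  */
	assert(!loss_free(10, 13, 0));  /* 11, 12 genuinely missing   */
	return 0;
}

This is why the reordering cases above re-test each sub-gap after inserting a late packet: a hole only counts as loss once NDP can no longer explain it.
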
- */ - u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, - n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - - if (dccp_loss_free(s2, s3, n3)) { - /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; - } else { - /* gap between S2 and S3 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 1; - } - - } else { /* gap between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 2; - } -} - -/** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) - * - * Chooses action according to pending loss, updates LI database when a new - * loss was detected, and does required post-processing. Returns 1 when caller - * should send feedback, 0 otherwise. - * Since it also takes care of reordering during loss detection and updates the - * records accordingly, the caller should not perform any more RX history - * operations when loss_count is greater than 0 after calling this function. - */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - int is_new_loss = 0; - - if (h->loss_count == 0) { - __do_track_loss(h, skb, ndp); - } else if (h->loss_count == 1) { - __one_after_loss(h, skb, ndp); - } else if (h->loss_count != 2) { - DCCP_BUG("invalid loss_count %d", h->loss_count); - } else if (__two_after_loss(h, skb, ndp)) { - /* - * Update Loss Interval database and recycle RX records - */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); - __three_after_loss(h); - } - return is_new_loss; -} - -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} - -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; ++i) - if (h->ring[i] != NULL) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } -} - -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - -/** - * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * @h: receive histogram - * @skb: packet containing timestamp. - * - * Based on ideas presented in RFC 4342, 8.1. 
Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. - */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) -{ - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; - } - - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } - - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - return sample; -} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h deleted file mode 100644 index 159cc9326eabc..0000000000000 --- a/net/dccp/ccids/lib/packet_history.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Packet RX/TX history data structures and routines for TFRC-based protocols. - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo . 
- * - * Copyright (c) 2005 Arnaldo Carvalho de Melo - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include -#include -#include "tfrc.h" - -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); - -/* Subtraction a-b modulo-16, respects circular wrap-around */ -#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) - -/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ -#define TFRC_NDUPACK 3 - -/** - * tfrc_rx_hist_entry - Store information about a single received packet - * @tfrchrx_seqno: DCCP packet sequence number - * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) - * @tfrchrx_ndp: the NDP count (if any) of the packet - * @tfrchrx_tstamp: actual receive time of packet - */ -struct tfrc_rx_hist_entry { - u64 tfrchrx_seqno:48, - tfrchrx_ccval:4, - tfrchrx_type:4; - u64 tfrchrx_ndp:48; - ktime_t tfrchrx_tstamp; -}; - -/** - * tfrc_rx_hist - RX history structure for TFRC-based protocols - * @ring: Packet history for RTT sampling and loss detection - * @loss_count: Number of entries in circular history - * @loss_start: Movable index (for loss detection) - * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - */ -struct tfrc_rx_hist { - struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; - u8 loss_count:2, - loss_start:2; -#define rtt_sample_prev loss_start -}; - -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h->loss_start + n) & TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; -} - -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h->ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h->ring[h->loss_start]; -} - -/* indicate whether previously a packet was detected missing */ -static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h->loss_count > 0; -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, - const u64 ndp); - -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); - -struct tfrc_loss_hist; -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), struct sock *sk); -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -void 
tfrc_rx_hist_purge(struct tfrc_rx_hist *h); - -#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c deleted file mode 100644 index d7f265e1f50c8..0000000000000 --- a/net/dccp/ccids/lib/tfrc.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * TFRC library initialisation - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2007 Arnaldo Carvalho de Melo - */ -#include -#include "tfrc.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -bool tfrc_debug; -module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); -#endif - -int __init tfrc_lib_init(void) -{ - int rc = tfrc_li_init(); - - if (rc) - goto out; - - rc = tfrc_tx_packet_history_init(); - if (rc) - goto out_free_loss_intervals; - - rc = tfrc_rx_packet_history_init(); - if (rc) - goto out_free_tx_history; - return 0; - -out_free_tx_history: - tfrc_tx_packet_history_exit(); -out_free_loss_intervals: - tfrc_li_exit(); -out: - return rc; -} - -void tfrc_lib_exit(void) -{ - tfrc_rx_packet_history_exit(); - tfrc_tx_packet_history_exit(); - tfrc_li_exit(); -} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h deleted file mode 100644 index 0a63e8750cc5b..0000000000000 --- a/net/dccp/ccids/lib/tfrc.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _TFRC_H_ -#define _TFRC_H_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include -#include -#include "../../dccp.h" - -/* internal includes that this library exports: */ -#include "loss_interval.h" -#include "packet_history.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -extern bool tfrc_debug; -#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) -#else -#define tfrc_pr_debug(format, a...) -#endif - -/* integer-arithmetic divisions of type (a * 1000000)/b */ -static inline u64 scaled_div(u64 a, u64 b) -{ - BUG_ON(b == 0); - return div64_u64(a * 1000000, b); -} - -static inline u32 scaled_div32(u64 a, u64 b) -{ - u64 result = scaled_div(a, b); - - if (result > UINT_MAX) { - DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", - (unsigned long long)a, (unsigned long long)b); - return UINT_MAX; - } - return result; -} - -/** - * tfrc_ewma - Exponentially weighted moving average - * @weight: Weight to be used as damping factor, in units of 1/10 - */ -static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) -{ - return avg ? 
(weight * avg + (10 - weight) * newval) / 10 : newval; -} - -u32 tfrc_calc_x(u16 s, u32 R, u32 p); -u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); - -int tfrc_tx_packet_history_init(void); -void tfrc_tx_packet_history_exit(void); -int tfrc_rx_packet_history_init(void); -void tfrc_rx_packet_history_exit(void); - -int tfrc_li_init(void); -void tfrc_li_exit(void); - -#ifdef CONFIG_IP_DCCP_TFRC_LIB -int tfrc_lib_init(void); -void tfrc_lib_exit(void); -#else -#define tfrc_lib_init() (0) -#define tfrc_lib_exit() -#endif -#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c deleted file mode 100644 index 92a8c6bea3167..0000000000000 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ /dev/null @@ -1,702 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ - -#include -#include "../../dccp.h" -#include "tfrc.h" - -#define TFRC_CALC_X_ARRSIZE 500 -#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) - -/* - TFRC TCP Reno Throughput Equation Lookup Table for f(p) - - The following two-column lookup table implements a part of the TCP throughput - equation from [RFC 3448, sec. 3.1]: - - s - X_calc = -------------------------------------------------------------- - R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) - - Where: - X is the transmit rate in bytes/second - s is the packet size in bytes - R is the round trip time in seconds - p is the loss event rate, between 0 and 1.0, of the number of loss - events as a fraction of the number of packets transmitted - t_RTO is the TCP retransmission timeout value in seconds - b is the number of packets acknowledged by a single TCP ACK - - We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: - - s - X_calc = ------------------------------------------------------- - R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) - - which we can break down into: - - s - X_calc = --------- - R * f(p) - - where f(p) is given for 0 < p <= 1 by: - - f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) - - Since this is kernel code, floating-point arithmetic is avoided in favour of - integer arithmetic. This means that nearly all fractional parameters are - scaled by 1000000: - * the parameters p and R - * the return result f(p) - The lookup table therefore actually tabulates the following function g(q): - - g(q) = 1000000 * f(q/1000000) - - Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer - granularity for the practically more relevant case of small values of p (up to - 5%), the second column is used; the first one ranges up to 100%. This split - corresponds to the value of q = TFRC_CALC_X_SPLIT. 
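/*
 * Worked example using the constants defined above:
 * TFRC_SMALLEST_P = 50000 / 500 = 100, i.e. the finest representable
 * loss rate is p = 0.0001 (0.01%). For p = 1% the scaled value is
 * 10000, which falls in the fine-grained column at row index
 * 10000 / TFRC_SMALLEST_P - 1 = 99, so f(0.01) is read back as
 * tfrc_calc_x_lookup[99][1] / 1000000.
 */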
At the same time this also - determines the smallest resolution possible with this lookup table: - - TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE - - The entire table is generated by: - for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { - lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); - lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); - } - - With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, - lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) - lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) - lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) - - In summary, the two columns represent f(p) for the following ranges: - * The first column is for 0.002 <= p <= 1.0 - * The second column is for 0.0001 <= p <= 0.05 - Where the columns overlap, the second (finer-grained) is given preference, - i.e. the first column is used only for p >= 0.05. - */ -static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { - { 37172, 8172 }, - { 53499, 11567 }, - { 66664, 14180 }, - { 78298, 16388 }, - { 89021, 18339 }, - { 99147, 20108 }, - { 108858, 21738 }, - { 118273, 23260 }, - { 127474, 24693 }, - { 136520, 26052 }, - { 145456, 27348 }, - { 154316, 28589 }, - { 163130, 29783 }, - { 171919, 30935 }, - { 180704, 32049 }, - { 189502, 33130 }, - { 198328, 34180 }, - { 207194, 35202 }, - { 216114, 36198 }, - { 225097, 37172 }, - { 234153, 38123 }, - { 243294, 39055 }, - { 252527, 39968 }, - { 261861, 40864 }, - { 271305, 41743 }, - { 280866, 42607 }, - { 290553, 43457 }, - { 300372, 44293 }, - { 310333, 45117 }, - { 320441, 45929 }, - { 330705, 46729 }, - { 341131, 47518 }, - { 351728, 48297 }, - { 362501, 49066 }, - { 373460, 49826 }, - { 384609, 50577 }, - { 395958, 51320 }, - { 407513, 52054 }, - { 419281, 52780 }, - { 431270, 53499 }, - { 443487, 54211 }, - { 455940, 54916 }, - { 468635, 55614 }, - { 481581, 56306 }, - { 494785, 56991 }, - { 508254, 57671 }, - { 521996, 58345 }, - { 536019, 59014 }, - { 550331, 59677 }, - { 564939, 60335 }, - { 579851, 60988 }, - { 595075, 61636 }, - { 610619, 62279 }, - { 626491, 62918 }, - { 642700, 63553 }, - { 659253, 64183 }, - { 676158, 64809 }, - { 693424, 65431 }, - { 711060, 66050 }, - { 729073, 66664 }, - { 747472, 67275 }, - { 766266, 67882 }, - { 785464, 68486 }, - { 805073, 69087 }, - { 825103, 69684 }, - { 845562, 70278 }, - { 866460, 70868 }, - { 887805, 71456 }, - { 909606, 72041 }, - { 931873, 72623 }, - { 954614, 73202 }, - { 977839, 73778 }, - { 1001557, 74352 }, - { 1025777, 74923 }, - { 1050508, 75492 }, - { 1075761, 76058 }, - { 1101544, 76621 }, - { 1127867, 77183 }, - { 1154739, 77741 }, - { 1182172, 78298 }, - { 1210173, 78852 }, - { 1238753, 79405 }, - { 1267922, 79955 }, - { 1297689, 80503 }, - { 1328066, 81049 }, - { 1359060, 81593 }, - { 1390684, 82135 }, - { 1422947, 82675 }, - { 1455859, 83213 }, - { 1489430, 83750 }, - { 1523671, 84284 }, - { 1558593, 84817 }, - { 1594205, 85348 }, - { 1630518, 85878 }, - { 1667543, 86406 }, - { 1705290, 86932 }, - { 1743770, 87457 }, - { 1782994, 87980 }, - { 1822973, 88501 }, - { 1863717, 89021 }, - { 1905237, 89540 }, - { 1947545, 90057 }, - { 1990650, 90573 }, - { 2034566, 91087 }, - { 2079301, 91600 }, - { 2124869, 92111 }, - { 2171279, 92622 }, - { 2218543, 93131 }, - { 2266673, 93639 }, - { 2315680, 94145 }, - { 2365575, 94650 }, - { 2416371, 95154 }, - { 2468077, 95657 }, - { 2520707, 96159 }, - { 2574271, 96660 }, - { 2628782, 97159 }, - { 2684250, 97658 }, - { 2740689, 98155 }, - { 2798110, 
98651 }, - { 2856524, 99147 }, - { 2915944, 99641 }, - { 2976382, 100134 }, - { 3037850, 100626 }, - { 3100360, 101117 }, - { 3163924, 101608 }, - { 3228554, 102097 }, - { 3294263, 102586 }, - { 3361063, 103073 }, - { 3428966, 103560 }, - { 3497984, 104045 }, - { 3568131, 104530 }, - { 3639419, 105014 }, - { 3711860, 105498 }, - { 3785467, 105980 }, - { 3860253, 106462 }, - { 3936229, 106942 }, - { 4013410, 107422 }, - { 4091808, 107902 }, - { 4171435, 108380 }, - { 4252306, 108858 }, - { 4334431, 109335 }, - { 4417825, 109811 }, - { 4502501, 110287 }, - { 4588472, 110762 }, - { 4675750, 111236 }, - { 4764349, 111709 }, - { 4854283, 112182 }, - { 4945564, 112654 }, - { 5038206, 113126 }, - { 5132223, 113597 }, - { 5227627, 114067 }, - { 5324432, 114537 }, - { 5422652, 115006 }, - { 5522299, 115474 }, - { 5623389, 115942 }, - { 5725934, 116409 }, - { 5829948, 116876 }, - { 5935446, 117342 }, - { 6042439, 117808 }, - { 6150943, 118273 }, - { 6260972, 118738 }, - { 6372538, 119202 }, - { 6485657, 119665 }, - { 6600342, 120128 }, - { 6716607, 120591 }, - { 6834467, 121053 }, - { 6953935, 121514 }, - { 7075025, 121976 }, - { 7197752, 122436 }, - { 7322131, 122896 }, - { 7448175, 123356 }, - { 7575898, 123815 }, - { 7705316, 124274 }, - { 7836442, 124733 }, - { 7969291, 125191 }, - { 8103877, 125648 }, - { 8240216, 126105 }, - { 8378321, 126562 }, - { 8518208, 127018 }, - { 8659890, 127474 }, - { 8803384, 127930 }, - { 8948702, 128385 }, - { 9095861, 128840 }, - { 9244875, 129294 }, - { 9395760, 129748 }, - { 9548529, 130202 }, - { 9703198, 130655 }, - { 9859782, 131108 }, - { 10018296, 131561 }, - { 10178755, 132014 }, - { 10341174, 132466 }, - { 10505569, 132917 }, - { 10671954, 133369 }, - { 10840345, 133820 }, - { 11010757, 134271 }, - { 11183206, 134721 }, - { 11357706, 135171 }, - { 11534274, 135621 }, - { 11712924, 136071 }, - { 11893673, 136520 }, - { 12076536, 136969 }, - { 12261527, 137418 }, - { 12448664, 137867 }, - { 12637961, 138315 }, - { 12829435, 138763 }, - { 13023101, 139211 }, - { 13218974, 139658 }, - { 13417071, 140106 }, - { 13617407, 140553 }, - { 13819999, 140999 }, - { 14024862, 141446 }, - { 14232012, 141892 }, - { 14441465, 142339 }, - { 14653238, 142785 }, - { 14867346, 143230 }, - { 15083805, 143676 }, - { 15302632, 144121 }, - { 15523842, 144566 }, - { 15747453, 145011 }, - { 15973479, 145456 }, - { 16201939, 145900 }, - { 16432847, 146345 }, - { 16666221, 146789 }, - { 16902076, 147233 }, - { 17140429, 147677 }, - { 17381297, 148121 }, - { 17624696, 148564 }, - { 17870643, 149007 }, - { 18119154, 149451 }, - { 18370247, 149894 }, - { 18623936, 150336 }, - { 18880241, 150779 }, - { 19139176, 151222 }, - { 19400759, 151664 }, - { 19665007, 152107 }, - { 19931936, 152549 }, - { 20201564, 152991 }, - { 20473907, 153433 }, - { 20748982, 153875 }, - { 21026807, 154316 }, - { 21307399, 154758 }, - { 21590773, 155199 }, - { 21876949, 155641 }, - { 22165941, 156082 }, - { 22457769, 156523 }, - { 22752449, 156964 }, - { 23049999, 157405 }, - { 23350435, 157846 }, - { 23653774, 158287 }, - { 23960036, 158727 }, - { 24269236, 159168 }, - { 24581392, 159608 }, - { 24896521, 160049 }, - { 25214642, 160489 }, - { 25535772, 160929 }, - { 25859927, 161370 }, - { 26187127, 161810 }, - { 26517388, 162250 }, - { 26850728, 162690 }, - { 27187165, 163130 }, - { 27526716, 163569 }, - { 27869400, 164009 }, - { 28215234, 164449 }, - { 28564236, 164889 }, - { 28916423, 165328 }, - { 29271815, 165768 }, - { 29630428, 166208 }, - { 29992281, 166647 }, - { 30357392, 167087 }, - { 30725779, 
167526 }, - { 31097459, 167965 }, - { 31472452, 168405 }, - { 31850774, 168844 }, - { 32232445, 169283 }, - { 32617482, 169723 }, - { 33005904, 170162 }, - { 33397730, 170601 }, - { 33792976, 171041 }, - { 34191663, 171480 }, - { 34593807, 171919 }, - { 34999428, 172358 }, - { 35408544, 172797 }, - { 35821174, 173237 }, - { 36237335, 173676 }, - { 36657047, 174115 }, - { 37080329, 174554 }, - { 37507197, 174993 }, - { 37937673, 175433 }, - { 38371773, 175872 }, - { 38809517, 176311 }, - { 39250924, 176750 }, - { 39696012, 177190 }, - { 40144800, 177629 }, - { 40597308, 178068 }, - { 41053553, 178507 }, - { 41513554, 178947 }, - { 41977332, 179386 }, - { 42444904, 179825 }, - { 42916290, 180265 }, - { 43391509, 180704 }, - { 43870579, 181144 }, - { 44353520, 181583 }, - { 44840352, 182023 }, - { 45331092, 182462 }, - { 45825761, 182902 }, - { 46324378, 183342 }, - { 46826961, 183781 }, - { 47333531, 184221 }, - { 47844106, 184661 }, - { 48358706, 185101 }, - { 48877350, 185541 }, - { 49400058, 185981 }, - { 49926849, 186421 }, - { 50457743, 186861 }, - { 50992759, 187301 }, - { 51531916, 187741 }, - { 52075235, 188181 }, - { 52622735, 188622 }, - { 53174435, 189062 }, - { 53730355, 189502 }, - { 54290515, 189943 }, - { 54854935, 190383 }, - { 55423634, 190824 }, - { 55996633, 191265 }, - { 56573950, 191706 }, - { 57155606, 192146 }, - { 57741621, 192587 }, - { 58332014, 193028 }, - { 58926806, 193470 }, - { 59526017, 193911 }, - { 60129666, 194352 }, - { 60737774, 194793 }, - { 61350361, 195235 }, - { 61967446, 195677 }, - { 62589050, 196118 }, - { 63215194, 196560 }, - { 63845897, 197002 }, - { 64481179, 197444 }, - { 65121061, 197886 }, - { 65765563, 198328 }, - { 66414705, 198770 }, - { 67068508, 199213 }, - { 67726992, 199655 }, - { 68390177, 200098 }, - { 69058085, 200540 }, - { 69730735, 200983 }, - { 70408147, 201426 }, - { 71090343, 201869 }, - { 71777343, 202312 }, - { 72469168, 202755 }, - { 73165837, 203199 }, - { 73867373, 203642 }, - { 74573795, 204086 }, - { 75285124, 204529 }, - { 76001380, 204973 }, - { 76722586, 205417 }, - { 77448761, 205861 }, - { 78179926, 206306 }, - { 78916102, 206750 }, - { 79657310, 207194 }, - { 80403571, 207639 }, - { 81154906, 208084 }, - { 81911335, 208529 }, - { 82672880, 208974 }, - { 83439562, 209419 }, - { 84211402, 209864 }, - { 84988421, 210309 }, - { 85770640, 210755 }, - { 86558080, 211201 }, - { 87350762, 211647 }, - { 88148708, 212093 }, - { 88951938, 212539 }, - { 89760475, 212985 }, - { 90574339, 213432 }, - { 91393551, 213878 }, - { 92218133, 214325 }, - { 93048107, 214772 }, - { 93883493, 215219 }, - { 94724314, 215666 }, - { 95570590, 216114 }, - { 96422343, 216561 }, - { 97279594, 217009 }, - { 98142366, 217457 }, - { 99010679, 217905 }, - { 99884556, 218353 }, - { 100764018, 218801 }, - { 101649086, 219250 }, - { 102539782, 219698 }, - { 103436128, 220147 }, - { 104338146, 220596 }, - { 105245857, 221046 }, - { 106159284, 221495 }, - { 107078448, 221945 }, - { 108003370, 222394 }, - { 108934074, 222844 }, - { 109870580, 223294 }, - { 110812910, 223745 }, - { 111761087, 224195 }, - { 112715133, 224646 }, - { 113675069, 225097 }, - { 114640918, 225548 }, - { 115612702, 225999 }, - { 116590442, 226450 }, - { 117574162, 226902 }, - { 118563882, 227353 }, - { 119559626, 227805 }, - { 120561415, 228258 }, - { 121569272, 228710 }, - { 122583219, 229162 }, - { 123603278, 229615 }, - { 124629471, 230068 }, - { 125661822, 230521 }, - { 126700352, 230974 }, - { 127745083, 231428 }, - { 128796039, 231882 }, - { 129853241, 232336 }, - { 
130916713, 232790 }, - { 131986475, 233244 }, - { 133062553, 233699 }, - { 134144966, 234153 }, - { 135233739, 234608 }, - { 136328894, 235064 }, - { 137430453, 235519 }, - { 138538440, 235975 }, - { 139652876, 236430 }, - { 140773786, 236886 }, - { 141901190, 237343 }, - { 143035113, 237799 }, - { 144175576, 238256 }, - { 145322604, 238713 }, - { 146476218, 239170 }, - { 147636442, 239627 }, - { 148803298, 240085 }, - { 149976809, 240542 }, - { 151156999, 241000 }, - { 152343890, 241459 }, - { 153537506, 241917 }, - { 154737869, 242376 }, - { 155945002, 242835 }, - { 157158929, 243294 }, - { 158379673, 243753 }, - { 159607257, 244213 }, - { 160841704, 244673 }, - { 162083037, 245133 }, - { 163331279, 245593 }, - { 164586455, 246054 }, - { 165848586, 246514 }, - { 167117696, 246975 }, - { 168393810, 247437 }, - { 169676949, 247898 }, - { 170967138, 248360 }, - { 172264399, 248822 }, - { 173568757, 249284 }, - { 174880235, 249747 }, - { 176198856, 250209 }, - { 177524643, 250672 }, - { 178857621, 251136 }, - { 180197813, 251599 }, - { 181545242, 252063 }, - { 182899933, 252527 }, - { 184261908, 252991 }, - { 185631191, 253456 }, - { 187007807, 253920 }, - { 188391778, 254385 }, - { 189783129, 254851 }, - { 191181884, 255316 }, - { 192588065, 255782 }, - { 194001698, 256248 }, - { 195422805, 256714 }, - { 196851411, 257181 }, - { 198287540, 257648 }, - { 199731215, 258115 }, - { 201182461, 258582 }, - { 202641302, 259050 }, - { 204107760, 259518 }, - { 205581862, 259986 }, - { 207063630, 260454 }, - { 208553088, 260923 }, - { 210050262, 261392 }, - { 211555174, 261861 }, - { 213067849, 262331 }, - { 214588312, 262800 }, - { 216116586, 263270 }, - { 217652696, 263741 }, - { 219196666, 264211 }, - { 220748520, 264682 }, - { 222308282, 265153 }, - { 223875978, 265625 }, - { 225451630, 266097 }, - { 227035265, 266569 }, - { 228626905, 267041 }, - { 230226576, 267514 }, - { 231834302, 267986 }, - { 233450107, 268460 }, - { 235074016, 268933 }, - { 236706054, 269407 }, - { 238346244, 269881 }, - { 239994613, 270355 }, - { 241651183, 270830 }, - { 243315981, 271305 } -}; - -/* return largest index i such that fval <= lookup[i][small] */ -static inline u32 tfrc_binsearch(u32 fval, u8 small) -{ - u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; - - while (low < high) { - try = (low + high) / 2; - if (fval <= tfrc_calc_x_lookup[try][small]) - high = try; - else - low = try + 1; - } - return high; -} - -/** - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 - * @s: packet size in bytes - * @R: RTT scaled by 1000000 (i.e., microseconds) - * @p: loss ratio estimate scaled by 1000000 - * - * Returns X_calc in bytes per second (not scaled). - */ -u32 tfrc_calc_x(u16 s, u32 R, u32 p) -{ - u16 index; - u32 f; - u64 result; - - /* check against invalid parameters and divide-by-zero */ - BUG_ON(p > 1000000); /* p must not exceed 100% */ - BUG_ON(p == 0); /* f(0) = 0, divide by zero */ - if (R == 0) { /* possible divide by zero */ - DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); - return ~0U; - } - - if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ - if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. 
" - "Substituting %d\n", p, TFRC_SMALLEST_P); - index = 0; - } else /* 0.0001 <= p <= 0.05 */ - index = p/TFRC_SMALLEST_P - 1; - - f = tfrc_calc_x_lookup[index][1]; - - } else { /* 0.05 < p <= 1.00 */ - index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - - f = tfrc_calc_x_lookup[index][0]; - } - - /* - * Compute X = s/(R*f(p)) in bytes per second. - * Since f(p) and R are both scaled by 1000000, we need to multiply by - * 1000000^2. To avoid overflow, the result is computed in two stages. - * This works under almost all reasonable operational conditions, for a - * wide range of parameters. Yet, should some strange combination of - * parameters result in overflow, the use of scaled_div32 will catch - * this and return UINT_MAX - which is a logically adequate consequence. - */ - result = scaled_div(s, R); - return scaled_div32(result, f); -} - -/** - * tfrc_calc_x_reverse_lookup - try to find p given f(p) - * @fvalue: function value to match, scaled by 1000000 - * - * Returns closest match for p, also scaled by 1000000 - */ -u32 tfrc_calc_x_reverse_lookup(u32 fvalue) -{ - int index; - - if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ - return 0; - - /* Error cases. */ - if (fvalue < tfrc_calc_x_lookup[0][1]) { - DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); - return TFRC_SMALLEST_P; - } - if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { - DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); - return 1000000; - } - - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { - index = tfrc_binsearch(fvalue, 1); - return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - } - - /* else ... it must be in the coarse-grained column */ - index = tfrc_binsearch(fvalue, 0); - return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; -} - -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * @loss_event_rate: loss event rate to invert - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h deleted file mode 100644 index 1f748ed1279d3..0000000000000 --- a/net/dccp/dccp.h +++ /dev/null @@ -1,483 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_H -#define _DCCP_H -/* - * net/dccp/dccp.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005-6 Ian McDonald - */ - -#include -#include -#include -#include -#include -#include "ackvec.h" - -/* - * DCCP - specific warning and debugging macros. - */ -#define DCCP_WARN(fmt, ...) \ - net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) -#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ - __FILE__, __LINE__, __func__) -#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) -#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ - DCCP_BUG("\"%s\" holds (exception!)", \ - __stringify(cond)); \ - } while (0) - -#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \ - printk(fmt, ##args); \ - } while(0) -#define DCCP_PR_DEBUG(enable, fmt, a...) 
DCCP_PRINTK(enable, KERN_DEBUG \ - "%s: " fmt, __func__, ##a) - -#ifdef CONFIG_IP_DCCP_DEBUG -extern bool dccp_debug; -#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) -#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) -#else -#define dccp_pr_debug(format, a...) do {} while (0) -#define dccp_pr_debug_cat(format, a...) do {} while (0) -#define dccp_debug(format, a...) do {} while (0) -#endif - -extern struct inet_hashinfo dccp_hashinfo; - -DECLARE_PER_CPU(unsigned int, dccp_orphan_count); - -void dccp_time_wait(struct sock *sk, int state, int timeo); - -/* - * Set safe upper bounds for header and option length. Since Data Offset is 8 - * bits (RFC 4340, sec. 5.1), the total header length can never be more than - * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): - * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR - * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields - * Hence a safe upper bound for the maximum option length is 1020-28 = 992 - */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) -#define DCCP_MAX_PACKET_HDR 28 -#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) -#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) - -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - -#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT - * state, about 60 seconds */ - -/* RFC 1122, 4.2.3.1 initial RTO value */ -#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) - -/* - * The maximum back-off value for retransmissions. This is needed for - * - retransmitting client-Requests (sec. 8.1.1), - * - retransmitting Close/CloseReq when closing (sec. 8.3), - * - feature-negotiation retransmission (sec. 6.6.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5). - */ -#define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) - -/* - * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 - */ -#define DCCP_SANE_RTT_MIN 100 -#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) -#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) - -/* sysctl variables for DCCP */ -extern int sysctl_dccp_request_retries; -extern int sysctl_dccp_retries1; -extern int sysctl_dccp_retries2; -extern int sysctl_dccp_tx_qlen; -extern int sysctl_dccp_sync_ratelimit; - -/* - * 48-bit sequence number arithmetic (signed and unsigned) - */ -#define INT48_MIN 0x800000000000LL /* 2^47 */ -#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ -#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ -#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) -#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) -#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) -#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) - -static inline void dccp_inc_seqno(u64 *seqno) -{ - *seqno = ADD48(*seqno, 1); -} - -/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ -static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) -{ - u64 delta = SUB48(seqno2, seqno1); - - return TO_SIGNED48(delta); -} - -/* is seq1 < seq2 ? */ -static inline int before48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) > 0; -} - -/* is seq1 > seq2 ? */ -#define after48(seq1, seq2) before48(seq2, seq1) - -/* is seq2 <= seq1 <= seq3 ? 
*/ -static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) -{ - return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); -} - -/** - * dccp_loss_count - Approximate the number of lost data packets in a burst loss - * @s1: last known sequence number before the loss ('hole') - * @s2: first sequence number seen after the 'hole' - * @ndp: NDP count on packet with sequence number @s2 - */ -static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) -{ - s64 delta = dccp_delta_seqno(s1, s2); - - WARN_ON(delta < 0); - delta -= ndp + 1; - - return delta > 0 ? delta : 0; -} - -/** - * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 - */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) -{ - return dccp_loss_count(s1, s2, ndp) == 0; -} - -enum { - DCCP_MIB_NUM = 0, - DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ - DCCP_MIB_ESTABRESETS, /* EstabResets */ - DCCP_MIB_CURRESTAB, /* CurrEstab */ - DCCP_MIB_OUTSEGS, /* OutSegs */ - DCCP_MIB_OUTRSTS, - DCCP_MIB_ABORTONTIMEOUT, - DCCP_MIB_TIMEOUTS, - DCCP_MIB_ABORTFAILED, - DCCP_MIB_PASSIVEOPENS, - DCCP_MIB_ATTEMPTFAILS, - DCCP_MIB_OUTDATAGRAMS, - DCCP_MIB_INERRS, - DCCP_MIB_OPTMANDATORYERROR, - DCCP_MIB_INVALIDOPT, - __DCCP_MIB_MAX -}; - -#define DCCP_MIB_MAX __DCCP_MIB_MAX -struct dccp_mib { - unsigned long mibs[DCCP_MIB_MAX]; -}; - -DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) - -/* - * Checksumming routines - */ -static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) -{ - const struct dccp_hdr* dh = dccp_hdr(skb); - - if (dh->dccph_cscov == 0) - return skb->len; - return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); -} - -static inline void dccp_csum_outgoing(struct sk_buff *skb) -{ - unsigned int cov = dccp_csum_coverage(skb); - - if (cov >= skb->len) - dccp_hdr(skb)->dccph_cscov = 0; - - skb->csum = skb_checksum(skb, 0, (cov > skb->len)? 
skb->len : cov, 0); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); - -int dccp_retransmit_skb(struct sock *sk); - -void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk); - -void dccp_send_sync(struct sock *sk, const u64 seq, - const enum dccp_pkt_type pkt_type); - -/* - * TX Packet Dequeueing Interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -bool dccp_qpolicy_full(struct sock *sk); -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -struct sk_buff *dccp_qpolicy_top(struct sock *sk); -struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -void dccp_write_xmit(struct sock *sk); -void dccp_write_space(struct sock *sk); -void dccp_flush_write_queue(struct sock *sk, long *time_budget); - -void dccp_init_xmit_timers(struct sock *sk); -static inline void dccp_clear_xmit_timers(struct sock *sk) -{ - inet_csk_clear_xmit_timers(sk); -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); - -const char *dccp_packet_name(const int type); - -void dccp_set_state(struct sock *sk, const int state); -void dccp_done(struct sock *sk); - -int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req); -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len); -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len); - -void dccp_destruct_common(struct sock *sk); -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); -void dccp_destroy_sock(struct sock *sk); - -void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req); - -int dccp_connect(struct sock *sk); -int dccp_disconnect(struct sock *sk, int flags); -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, int *karg); -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); -void dccp_shutdown(struct sock *sk, int how); -int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -void dccp_req_err(struct sock *sk, u64 seq); - -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); -void 
dccp_send_close(struct sock *sk, const int active); -int dccp_invalid_packet(struct sk_buff *skb); -u32 dccp_sample_rtt(struct sock *sk, long delta); - -static inline bool dccp_bad_service_code(const struct sock *sk, - const __be32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return false; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - -/** - * dccp_skb_cb - DCCP per-packet control information - * @dccpd_type: one of %dccp_pkt_type (or unknown) - * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 - * @dccpd_reset_code: one of %dccp_reset_codes - * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) - * @dccpd_opt_len: total length of all options (5.8) in the packet - * @dccpd_seq: sequence number - * @dccpd_ack_seq: acknowledgment number subheader field value - * - * This is used for transmission as well as for reception. - */ -struct dccp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - __u8 dccpd_type:4; - __u8 dccpd_ccval:4; - __u8 dccpd_reset_code, - dccpd_reset_data[3]; - __u16 dccpd_opt_len; - __u64 dccpd_seq; - __u64 dccpd_ack_seq; -}; - -#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_non_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_ACK || - type == DCCP_PKT_CLOSE || - type == DCCP_PKT_CLOSEREQ || - type == DCCP_PKT_RESET || - type == DCCP_PKT_SYNC || - type == DCCP_PKT_SYNCACK; -} - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || - type == DCCP_PKT_DATAACK || - type == DCCP_PKT_REQUEST || - type == DCCP_PKT_RESPONSE; -} - -static inline int dccp_packet_without_ack(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; -} - -#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) - -static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) -{ - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + - sizeof(*dh)); - dh->dccph_seq2 = 0; - dh->dccph_seq = htons((gss >> 32) & 0xfffff); - dhx->dccph_seq_low = htonl(gss & 0xffffffff); -} - -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, - const u64 gsr) -{ - dhack->dccph_reserved1 = 0; - dhack->dccph_ack_nr_high = htons(gsr >> 32); - dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); -} - -static inline void dccp_update_gsr(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (after48(seq, dp->dccps_gsr)) - dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). 
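/*
 * Worked example of the window arithmetic: with GSR = 1000 and a
 * remote Sequence Window W = 100, the SWL assignment above and the
 * SWH assignment just below give
 *   SWL = (GSR + 1) - W/4   = 1001 - 25 = 976
 *   SWH =  GSR + (3*W)/4    = 1000 + 75 = 1075
 * so three quarters of the validity window lies ahead of the highest
 * sequence number received (mod-2^48 wraparound ignored here).
 */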
- * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); -} - -static inline void dccp_update_gss(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); -} - -static inline int dccp_ack_pending(const struct sock *sk) -{ - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); -} - -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -int dccp_feat_finalise_settings(struct dccp_sock *dp); -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -void dccp_feat_list_purge(struct list_head *fn_list); - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb); -int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -u32 dccp_timestamp(void); -void dccp_timestamping_init(void); -int dccp_insert_option(struct sk_buff *skb, unsigned char option, - const void *value, unsigned char len); - -#ifdef CONFIG_SYSCTL -int dccp_sysctl_init(void); -void dccp_sysctl_exit(void); -#else -static inline int dccp_sysctl_init(void) -{ - return 0; -} - -static inline void dccp_sysctl_exit(void) -{ -} -#endif - -#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c deleted file mode 100644 index f5019d95c3ae5..0000000000000 --- a/net/dccp/diag.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/diag.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - - -#include -#include - -#include "ccid.h" -#include "dccp.h" - -static void dccp_get_info(struct sock *sk, struct tcp_info *info) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - memset(info, 0, sizeof(*info)); - - info->tcpi_state = sk->sk_state; - info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = icsk->icsk_probes_out; - info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - - if (dp->dccps_hc_rx_ackvec != NULL) - info->tcpi_options |= TCPI_OPT_SACK; - - if (dp->dccps_hc_rx_ccid != NULL) - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - - if (dp->dccps_hc_tx_ccid != NULL) - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); -} - -static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - r->idiag_rqueue = r->idiag_wqueue = 0; - - if (_info != NULL) - dccp_get_info(sk, _info); -} - -static void dccp_diag_dump(struct sk_buff *skb, struct 
netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); -} - -static int dccp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); -} - -static const struct inet_diag_handler dccp_diag_handler = { - .owner = THIS_MODULE, - .dump = dccp_diag_dump, - .dump_one = dccp_diag_dump_one, - .idiag_get_info = dccp_diag_get_info, - .idiag_type = IPPROTO_DCCP, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static int __init dccp_diag_init(void) -{ - return inet_diag_register(&dccp_diag_handler); -} - -static void __exit dccp_diag_fini(void) -{ - inet_diag_unregister(&dccp_diag_handler); -} - -module_init(dccp_diag_init); -module_exit(dccp_diag_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP inet_diag handler"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */); diff --git a/net/dccp/feat.c b/net/dccp/feat.c deleted file mode 100644 index f7554dcdaaba9..0000000000000 --- a/net/dccp/feat.c +++ /dev/null @@ -1,1581 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/feat.c - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 Gerrit Renker - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau - * - * ASSUMPTIONS - * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. - * o All currently known SP features have 1-byte quantities. If in the future - * extensions of RFCs 4340..42 define features with item lengths larger than - * one byte, a feature-specific extension of the code will be required. - */ -#include -#include -#include "ccid.h" -#include "feat.h" - -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; - -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. 
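/*
 * Illustration (hypothetical, self-contained sketch, not kernel
 * code): the dispatch pattern the handlers below are used in is one
 * table row per feature, each row carrying a callback with the same
 * u64 signature, so NN values and the agreed byte of an SP preference
 * vector can be activated uniformly once negotiation settles.
 */
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>

struct feat_row {
	uint8_t feat_num;
	int (*activate)(uint64_t val, bool rx);
};

static int hdlr_seq_win(uint64_t val, bool rx)
{
	(void)val; (void)rx;	/* would update the sequence window here */
	return 0;
}

static const struct feat_row feat_table[] = {
	{ 3 /* Sequence Window */, hdlr_seq_win },
};

static int activate(uint8_t feat, uint64_t val, bool rx)
{
	for (size_t i = 0; i < sizeof(feat_table) / sizeof(feat_table[0]); i++)
		if (feat_table[i].feat_num == feat)
			return feat_table[i].activate(val, rx);
	return -1;	/* unknown feature */
}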
- */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx); - - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - } - return 0; -} - -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} - -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; - return 0; -} - -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - } - return 0; -} - -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); - return 0; -} - -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); - } - return 0; -} - -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. 
| Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * @feat_num: feature to hash, one of %dccp_feature_numbers - * - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) -{ - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; - - /* - * Other features: add cases for new feature types here after adding - * them to the above table. - */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} - -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - /* - * There are no default values for unknown features, so encountering a - * negative index here indicates a serious problem somewhere else. - */ - DCCP_BUG_ON(idx < 0); - - return idx < 0 ? 0 : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *const feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. 
Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *const dccp_feat_sname[] = { - "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", -}; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; - } - return NULL; -} - -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} - -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} - -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif - -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; - - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; - - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; - } - } else { - val = fval->nn; - } - - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? 
"" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} - -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOMEM; - } - } - return 0; -} - -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); -} - -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) -{ - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; - - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; - - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} - -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); - } -} - -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, fn_list, node) { - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - } - return NULL; -} - -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * - * This is the only constructor and serves to ensure the above invariants. 
- */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } - - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); - } - return entry; -} - -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return -ENOMEM; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = false; - new->empty_confirm = false; - new->val = *fval; - new->needs_mandatory = mandatory; - - return 0; -} - -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = true; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = false; - - return 0; -} - -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) -{ - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} - -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); - } - return 0; - -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} - -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * @feat_num: feature to return length of, one of %dccp_feature_numbers - * - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} - -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ -} - -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) -{ - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} - -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} - -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; - } - } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - - if (skb->sk->sk_state == DCCP_OPEN && - (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { - /* - * Confirms don't get retransmitted (6.6.3) once the - * connection is in state OPEN - */ - dccp_feat_list_pop(pos); - } else { - /* - * Enter CHANGING after transmitting the Change - * option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; - } - } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * - * Note that NN features are local by definition (RFC 4340, 6.3.2). 
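- *
- * A typical call (value for illustration) registers a non-default
- * Sequence Window before the handshake:
- *
- *	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, 200);
- *
- * Since 200 differs from the default of 100, this queues a Change L
- * option; registering the default value is a no-op.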
- */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; - - if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { - kfree(fval.sp.vec); - return -ENOMEM; - } - - return 0; -} - -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/** - * dccp_feat_nn_get - Query current/pending value of NN feature - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * - * For a known NN feature, returns value currently being negotiated, or - * current (confirmed) value if no negotiation is going on. - */ -u64 dccp_feat_nn_get(struct sock *sk, u8 feat) -{ - if (dccp_feat_type(feat) == FEAT_NN) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *entry; - - entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); - if (entry != NULL) - return entry->val.nn; - - switch (feat) { - case DCCPF_ACK_RATIO: - return dp->dccps_l_ack_ratio; - case DCCPF_SEQUENCE_WINDOW: - return dp->dccps_l_seq_win; - } - } - DCCP_BUG("attempt to look up unsupported feature %u", feat); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_feat_nn_get); - -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * - * This function is used to communicate NN updates out-of-band. 
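- *
- * A sketch of the intended use (value for illustration): a sender-side
- * CCID can raise Ack Ratio on a live connection via
- *
- *	rc = dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, 4);
- *
- * which schedules an Ack and queues a Change L option for the new value.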
- */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; - - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - if (nn_val == dccp_feat_nn_get(sk, feat)) - return 0; /* already set or negotiation under way */ - - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", - (unsigned long long)entry->val.nn, - (unsigned long long)nn_val); - dccp_feat_list_pop(entry); - } - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. - */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. 
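- *
- * As a worked example, the four entries below translate into the
- * options "Change R Send Ack Vector 0", "Mandatory Change R Send
- * Loss Event Rate 1", "Change L Ack Ratio 0", and "Change L Send
- * NDP Count 1".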
- */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; - } -} - -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; - - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; -} - -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * @dreq: server socket to resolve - * - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. 
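- *
- * Expected call pattern when generating the Response (sketch, label
- * name illustrative):
- *
- *	if (dccp_feat_server_ccid_dependencies(dreq))
- *		goto drop;	(could not register a dependency)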
- */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} - -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) -{ - u8 c, s; - - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} - -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * @preferred_value: entry to move to start of array - * @array: array of preferred entries - * @array_len: size of the array - * - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; - - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; - } - return does_occur; -} - -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fv: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; - - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } - - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; - - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} - -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. 
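- *
- * E.g. (values illustrative): an incoming "Change L Sequence Window
- * 200" is decoded below via dccp_decode_value_var(), validated, and
- * answered immediately with "Confirm R 200".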
- */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; - - return dccp_feat_push_confirm(fn, feat, local, &fval); - } - - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) - */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - if (dccp_feat_push_confirm(fn, feat, local, &fval)) { - kfree(fval.sp.vec); - return DCCP_RESET_CODE_TOO_BUSY; - } - - return 0; - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; - } - - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = false; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = true; - } - entry->needs_confirm = true; - entry->needs_mandatory = false; - entry->state = FEAT_STABLE; - return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } - - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; - - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; - - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } - - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; - } - - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; - } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. 
- * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). - */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; - - dccp_feat_print_opt(opt, feat, val, len, mandatory); - - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; - - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; - - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - /* - * Just ignore a value that doesn't match our current value. - * If the option changes twice within two RTTs, then at least - * one CONFIRM will be received for the old value after a - * new CHANGE was sent. - */ - if (fval.nn != entry->val.nn) - return 0; - - /* Only activate after receiving the Confirm option (6.6.1). */ - dccp_feat_activate(sk, feat, local, &fval); - - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); - - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; - } - return 0; - -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? 
&dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; - - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; - fallthrough; - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection. - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); - } - return 0; /* ignore FN options in all other states */ -} - -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * @sk: Socket to initialize. - * - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; - int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; - - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); - if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len)) - return -ENOBUFS; - if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { - kfree(tx.val); - return -ENOBUFS; - } - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); - if (rc) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); - return rc; -} - -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. 
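- *
- * Skipping the entry leaves fvals[idx] at NULL, which makes
- * __dccp_feat_activate() below fall back to the table default.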
- */ - if (cur->empty_confirm) - continue; - - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; - } - - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } - - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); - - dccp_pr_debug("Activation OK\n"); - return 0; - -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; -} diff --git a/net/dccp/feat.h b/net/dccp/feat.h deleted file mode 100644 index 57d9c026aa3f5..0000000000000 --- a/net/dccp/feat.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_FEAT_H -#define _DCCP_FEAT_H -/* - * net/dccp/feat.h - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker - * Copyright (c) 2005 Andrea Bittau - */ -#include -#include "dccp.h" - -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; - -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to perform feature negotiation - * @val: feature's current value (SP 
features may have preference list) - * @state: feature's current state - * @feat_num: one of %dccp_feature_numbers - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - dccp_feat_val val; - enum dccp_feat_state state:8; - u8 feat_num; - - bool needs_mandatory, - needs_confirm, - empty_confirm, - is_local; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) -{ - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; -} - -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -int dccp_feat_init(struct sock *sk); -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. 
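- *
- * Illustrative round trip using the helpers declared below:
- *
- *	u8 buf[DCCP_OPTVAL_MAXLEN];
- *
- *	dccp_encode_value_var(100, buf, 6);
- *	val = dccp_decode_value_var(buf, 6);	(yields 100 again)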
- */ -#define DCCP_OPTVAL_MAXLEN 6 - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -u64 dccp_decode_value_var(const u8 *bf, const u8 len); -u64 dccp_feat_nn_get(struct sock *sk, u8 feat); - -int dccp_insert_option_mandatory(struct sk_buff *skb); -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, - bool repeat_first); -#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c deleted file mode 100644 index 2cbb757a894f8..0000000000000 --- a/net/dccp/input.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/input.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ -int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; - -static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) -{ - __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk); -} - -static void dccp_fin(struct sock *sk, struct sk_buff *skb) -{ - /* - * On receiving Close/CloseReq, both RD/WR shutdown are performed. - * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after - * receiving the closing segment, but there is no guarantee that such - * data will be processed at all. - */ - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DONE); - dccp_enqueue_skb(sk, skb); -} - -static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - switch (sk->sk_state) { - /* - * We ignore Close when received in one of the following states: - * - CLOSED (may be a late or duplicate packet) - * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) - * - RESPOND (already handled by dccp_check_req) - */ - case DCCP_CLOSING: - /* - * Simultaneous-close: receiving a Close after sending one. This - * can happen if both client and server perform active-close and - * will result in an endless ping-pong of crossing and retrans- - * mitted Close packets, which only terminates when one of the - * nodes times out (min. 64 seconds). Quicker convergence can be - * achieved when one of the nodes acts as tie-breaker. - * This is ok as both ends are done with data transfer and each - * end is just waiting for the other to acknowledge termination. - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) - break; - fallthrough; - case DCCP_REQUESTING: - case DCCP_ACTIVE_CLOSEREQ: - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_done(sk); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSE); - fallthrough; - case DCCP_PASSIVE_CLOSE: - /* - * Retransmitted Close: we have already enqueued the first one. 
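- *	 Hence only re-wake the application here; the duplicate
- *	 packet is not queued a second time and will be freed by
- *	 the caller.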
- */ - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return queued; - } - - /* Step 13: process relevant Client states < CLOSEREQ */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - dccp_send_close(sk, 0); - dccp_set_state(sk, DCCP_CLOSING); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); - fallthrough; - case DCCP_PASSIVE_CLOSEREQ: - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static u16 dccp_reset_code_convert(const u8 code) -{ - static const u16 error_code[] = { - [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ - [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ - [DCCP_RESET_CODE_ABORTED] = ECONNRESET, - - [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, - [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, - [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, - [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, - - [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, - [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, - [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, - [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, - [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, - }; - - return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; -} - -static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) -{ - u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); - - sk->sk_err = err; - - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ - dccp_fin(sk, skb); - - if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - dccp_time_wait(sk, DCCP_TIME_WAIT, 0); -} - -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); -} - -static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - /* Don't deliver to RX CCID when node has shut down read end. */ - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - /* - * Until the TX queue has been drained, we can not honour SHUT_WR, since - * we need received feedback as input to adjust congestion control. - */ - if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); -} - -static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - struct dccp_sock *dp = dccp_sk(sk); - u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - - /* - * Step 5: Prepare sequence numbers for Sync - * If P.type == Sync or P.type == SyncAck, - * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, - * / * P is valid, so update sequence number variables - * accordingly. 
After this update, P will pass the tests - * in Step 6. A SyncAck is generated if necessary in - * Step 15 * / - * Update S.GSR, S.SWL, S.SWH - * Otherwise, - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_SYNC || - dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && - dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) - dccp_update_gsr(sk, seqno); - else - return -1; - } - - /* - * Step 6: Check sequence numbers - * Let LSWL = S.SWL and LAWL = S.AWL - * If P.type == CloseReq or P.type == Close or P.type == Reset, - * LSWL := S.GSR + 1, LAWL := S.GAR - * If LSWL <= P.seqno <= S.SWH - * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), - * Update S.GSR, S.SWL, S.SWH - * If P.type != Sync, - * Update S.GAR - */ - lswl = dp->dccps_swl; - lawl = dp->dccps_awl; - - if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { - lswl = ADD48(dp->dccps_gsr, 1); - lawl = dp->dccps_gar; - } - - if (between48(seqno, lswl, dp->dccps_swh) && - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(ackno, lawl, dp->dccps_awh))) { - dccp_update_gsr(sk, seqno); - - if (dh->dccph_type != DCCP_PKT_SYNC && - ackno != DCCP_PKT_WITHOUT_ACK_SEQ && - after48(ackno, dp->dccps_gar)) - dp->dccps_gar = ackno; - } else { - unsigned long now = jiffies; - /* - * Step 6: Check sequence numbers - * Otherwise, - * If P.type == Reset, - * Send Sync packet acknowledging S.GSR - * Otherwise, - * Send Sync packet acknowledging P.seqno - * Drop packet and return - * - * These Syncs are rate-limited as per RFC 4340, 7.5.4: - * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. - */ - if (time_before(now, (dp->dccps_rate_last + - sysctl_dccp_sync_ratelimit))) - return -1; - - DCCP_WARN("Step 6 failed for %s packet, " - "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " - "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " - "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - (unsigned long long) lswl, (unsigned long long) seqno, - (unsigned long long) dp->dccps_swh, - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? 
"doesn't exist" - : "exists", - (unsigned long long) lawl, (unsigned long long) ackno, - (unsigned long long) dp->dccps_awh); - - dp->dccps_rate_last = now; - - if (dh->dccph_type == DCCP_PKT_RESET) - seqno = dp->dccps_gsr; - dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); - return -1; - } - - return 0; -} - -static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - - switch (dccp_hdr(skb)->dccph_type) { - case DCCP_PKT_DATAACK: - case DCCP_PKT_DATA: - /* - * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when - * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" - * - sk_receive_queue is full, use Code 2, "Receive Buffer" - */ - dccp_enqueue_skb(sk, skb); - return 0; - case DCCP_PKT_ACK: - goto discard; - case DCCP_PKT_RESET: - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - dccp_rcv_reset(sk, skb); - return 0; - case DCCP_PKT_CLOSEREQ: - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_CLOSE: - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_REQUEST: - /* Step 7 - * or (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state >= OPEN and P.type == Request - * and P.seqno >= S.OSR) - * or (S.state >= OPEN and P.type == Response - * and P.seqno >= S.OSR) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dp->dccps_role != DCCP_ROLE_LISTEN) - goto send_sync; - goto check_seq; - case DCCP_PKT_RESPONSE: - if (dp->dccps_role != DCCP_ROLE_CLIENT) - goto send_sync; -check_seq: - if (dccp_delta_seqno(dp->dccps_osr, - DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { -send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNC); - } - break; - case DCCP_PKT_SYNC: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNCACK); - /* - * From RFC 4340, sec. 5.7 - * - * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets - * MAY have non-zero-length application data areas, whose - * contents receivers MUST ignore. 
- */ - goto discard; - } - - DCCP_INC_STATS(DCCP_MIB_INERRS); -discard: - __kfree_skb(skb); - return 0; -} - -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - - return __dccp_rcv_established(sk, skb, dh, len); -discard: - __kfree_skb(skb); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_established); - -static int dccp_rcv_request_sent_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - /* - * Step 4: Prepare sequence numbers in REQUEST - * If S.state == REQUEST, - * If (P.type == Response or P.type == Reset) - * and S.AWL <= P.ackno <= S.AWH, - * / * Set sequence number variables corresponding to the - * other endpoint, so P will pass the tests in Step 6 * / - * Set S.GSR, S.ISR, S.SWL, S.SWH - * / * Response processing continues in Step 10; Reset - * processing continues in Step 9 * / - */ - if (dh->dccph_type == DCCP_PKT_RESPONSE) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - long tstamp = dccp_timestamp(); - - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, " - "P.ackno=%llu, S.AWH=%llu\n", - (unsigned long long)dp->dccps_awl, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long)dp->dccps_awh); - goto out_invalid_packet; - } - - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - - dp->dccps_options_received.dccpor_timestamp_echo)); - - /* Stop the REQUEST timer */ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - WARN_ON(sk->sk_send_head == NULL); - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - - /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. - */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - - dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); - - /* - * Step 10: Process REQUEST state (second part) - * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the - * server (see Step 4), and we should move to - * PARTOPEN state. PARTOPEN means send an Ack, - * don't send Data packets, retransmit Acks - * periodically, and always include any Init Cookie - * from the Response * / - * S.state := PARTOPEN - * Set PARTOPEN timer - * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the - * three-way handshake * / - */ - dccp_set_state(sk, DCCP_PARTOPEN); - - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one ore more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state. 
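- *	 In that case the connection is aborted via the
- *	 unable_to_proceed path below, which closes the socket and
- *	 sets sk_err to ECOMM.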
- */ - if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } - - if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || - icsk->icsk_accept_queue.rskq_defer_accept) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * - * It may be deleted, but with this feature tcpdumps - * look so _wonderfully_ clever, that I was not able - * to stand against the temptation 8) --ANK - */ - /* - * OK, in DCCP we can as well do a similar trick, its - * even in the draft, but there is no need for us to - * schedule an ack here, as dccp_sendmsg does this for - * us, also stated in the draft. -acme - */ - __kfree_skb(skb); - return 0; - } - dccp_send_ack(sk); - return -1; - } - -out_invalid_packet: - /* dccp_v4_do_rcv will send a reset */ - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; -} - -static int dccp_rcv_respond_partopen_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; - int queued = 0; - - switch (dh->dccph_type) { - case DCCP_PKT_RESET: - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - break; - case DCCP_PKT_DATA: - if (sk->sk_state == DCCP_RESPOND) - break; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_ACK: - /* - * FIXME: we should be resetting the PARTOPEN (DELACK) timer - * here but only if we haven't used the DELACK timer for - * something else, like sending a delayed ack for a TIMESTAMP - * echo, etc, for now were not clearing it, sending an extra - * ACK when there is nothing else to do in DELACK is not a big - * deal after all. - */ - - /* Stop the PARTOPEN timer */ - if (sk->sk_state == DCCP_PARTOPEN) - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(sample)) { - long delta = dccp_timestamp() - sample; - - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); - } - - dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_set_state(sk, DCCP_OPEN); - - if (dh->dccph_type == DCCP_PKT_DATAACK || - dh->dccph_type == DCCP_PKT_DATA) { - __dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued - (by __dccp_rcv_established) */ - } - break; - } - - return queued; -} - -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int old_state = sk->sk_state; - bool acceptable; - int queued = 0; - - /* - * Step 3: Process LISTEN state - * - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. 
-	 *		 scan need only be performed if the endpoint uses Init
-	 *		 Cookies *)
-	 *	      (* Generate a new socket and switch to that socket *)
-	 *	      Set S := new socket for this port pair
-	 *	      S.state = RESPOND
-	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
-	 *	      Initialize S.GAR := S.ISS
-	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
-	 *	      Cookies Continue with S.state == RESPOND
-	 *	      (* A Response packet will be generated in Step 11 *)
-	 *	 Otherwise,
-	 *	      Generate Reset(No Connection) unless P.type == Reset
-	 *	      Drop packet and return
-	 */
-	if (sk->sk_state == DCCP_LISTEN) {
-		if (dh->dccph_type == DCCP_PKT_REQUEST) {
-			/* It is possible that we process SYN packets from backlog,
-			 * so we need to make sure to disable BH and RCU right there.
-			 */
-			rcu_read_lock();
-			local_bh_disable();
-			acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
-			local_bh_enable();
-			rcu_read_unlock();
-			if (!acceptable)
-				return 1;
-			consume_skb(skb);
-			return 0;
-		}
-		if (dh->dccph_type == DCCP_PKT_RESET)
-			goto discard;
-
-		/* Caller (dccp_v4_do_rcv) will send Reset */
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
-		return 1;
-	} else if (sk->sk_state == DCCP_CLOSED) {
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
-		return 1;
-	}
-
-	/* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
-	if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
-		goto discard;
-
-	/*
-	 *   Step 7: Check for unexpected packet types
-	 *      If (S.is_server and P.type == Response)
-	 *	    or (S.is_client and P.type == Request)
-	 *	    or (S.state == RESPOND and P.type == Data),
-	 *	  Send Sync packet acknowledging P.seqno
-	 *	  Drop packet and return
-	 */
-	if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
-	     dh->dccph_type == DCCP_PKT_RESPONSE) ||
-	    (dp->dccps_role == DCCP_ROLE_CLIENT &&
-	     dh->dccph_type == DCCP_PKT_REQUEST) ||
-	    (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
-		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
-		goto discard;
-	}
-
-	/*  Step 8: Process options */
-	if (dccp_parse_options(sk, NULL, skb))
-		return 1;
-
-	/*
-	 *  Step 9: Process Reset
-	 *	If P.type == Reset,
-	 *		Tear down connection
-	 *		S.state := TIMEWAIT
-	 *		Set TIMEWAIT timer
-	 *		Drop packet and return
-	 */
-	if (dh->dccph_type == DCCP_PKT_RESET) {
-		dccp_rcv_reset(sk, skb);
-		return 0;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {	/* Step 13 */
-		if (dccp_rcv_closereq(sk, skb))
-			return 0;
-		goto discard;
-	} else if (dh->dccph_type == DCCP_PKT_CLOSE) {		/* Step 14 */
-		if (dccp_rcv_close(sk, skb))
-			return 0;
-		goto discard;
-	}
-
-	switch (sk->sk_state) {
-	case DCCP_REQUESTING:
-		queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
-		if (queued >= 0)
-			return queued;
-
-		__kfree_skb(skb);
-		return 0;
-
-	case DCCP_PARTOPEN:
-		/* Step 8: if using Ack Vectors, mark packet acknowledgeable */
-		dccp_handle_ackvec_processing(sk, skb);
-		dccp_deliver_input_to_ccids(sk, skb);
-		fallthrough;
-	case DCCP_RESPOND:
-		queued = dccp_rcv_respond_partopen_state_process(sk, skb,
-								 dh, len);
-		break;
-	}
-
-	if (dh->dccph_type == DCCP_PKT_ACK ||
-	    dh->dccph_type == DCCP_PKT_DATAACK) {
-		switch (old_state) {
-		case DCCP_PARTOPEN:
-			sk->sk_state_change(sk);
-			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
-			break;
-		}
-	} else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
-		dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
-		goto discard;
-	}
-
-	if (!queued) {
-discard:
-		__kfree_skb(skb);
-	}
-	return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
-
-/**
- * dccp_sample_rtt  -  Validate and finalise computation of RTT sample
- * @sk: socket structure
- * @delta: number of microseconds between packet and acknowledgment
- *
- * The routine is kept generic to work in different contexts. It should be
- * called immediately when the ACK used for the RTT sample arrives.
- */
-u32 dccp_sample_rtt(struct sock *sk, long delta)
-{
-	/* dccpor_elapsed_time is either zeroed out or set and > 0 */
-	delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
-
-	if (unlikely(delta <= 0)) {
-		DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
-		return DCCP_SANE_RTT_MIN;
-	}
-	if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
-		DCCP_WARN("RTT sample %ld too large, using max\n", delta);
-		return DCCP_SANE_RTT_MAX;
-	}
-
-	return delta;
-}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
deleted file mode 100644
index 2045ddac0fe9f..0000000000000
--- a/net/dccp/ipv4.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  net/dccp/ipv4.c
- *
- *  An implementation of the DCCP protocol
- *  Arnaldo Carvalho de Melo
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct dccp_v4_pernet {
-	struct sock *v4_ctl_sk;
-};
-
-static unsigned int dccp_v4_pernet_id __read_mostly;
-
-/*
- * The per-net v4_ctl_sk socket is used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
-{
-	const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
-	struct inet_sock *inet = inet_sk(sk);
-	struct dccp_sock *dp = dccp_sk(sk);
-	__be16 orig_sport, orig_dport;
-	__be32 daddr, nexthop;
-	struct flowi4 *fl4;
-	struct rtable *rt;
-	int err;
-	struct ip_options_rcu *inet_opt;
-
-	dp->dccps_role = DCCP_ROLE_CLIENT;
-
-	if (addr_len < sizeof(struct sockaddr_in))
-		return -EINVAL;
-
-	if (usin->sin_family != AF_INET)
-		return -EAFNOSUPPORT;
-
-	nexthop = daddr = usin->sin_addr.s_addr;
-
-	inet_opt = rcu_dereference_protected(inet->inet_opt,
-					     lockdep_sock_is_held(sk));
-	if (inet_opt != NULL && inet_opt->opt.srr) {
-		if (daddr == 0)
-			return -EINVAL;
-		nexthop = inet_opt->opt.faddr;
-	}
-
-	orig_sport = inet->inet_sport;
-	orig_dport = usin->sin_port;
-	fl4 = &inet->cork.fl.u.ip4;
-	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
-			      sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport,
-			      orig_dport, sk);
-	if (IS_ERR(rt))
-		return PTR_ERR(rt);
-
-	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
-		ip_rt_put(rt);
-		return -ENETUNREACH;
-	}
-
-	if (inet_opt == NULL || !inet_opt->opt.srr)
-		daddr = fl4->daddr;
-
-	if (inet->inet_saddr == 0) {
-		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
-		if (err) {
-			ip_rt_put(rt);
-			return err;
-		}
-	} else {
-		sk_rcv_saddr_set(sk, inet->inet_saddr);
-	}
-
-	inet->inet_dport = usin->sin_port;
-	sk_daddr_set(sk, daddr);
-
-	inet_csk(sk)->icsk_ext_hdr_len = 0;
-	if (inet_opt)
-		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
-	/*
-	 * Socket identity is still unknown (sport may be zero).
-	 * However we set state to DCCP_REQUESTING and not releasing socket
-	 * lock select source port, enter ourselves into the hash tables and
-	 * complete initialization after this.
- */
-	dccp_set_state(sk, DCCP_REQUESTING);
-	err = inet_hash_connect(&dccp_death_row, sk);
-	if (err != 0)
-		goto failure;
-
-	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
-			       inet->inet_sport, inet->inet_dport, sk);
-	if (IS_ERR(rt)) {
-		err = PTR_ERR(rt);
-		rt = NULL;
-		goto failure;
-	}
-	/* OK, now commit destination to socket.  */
-	sk_setup_caps(sk, &rt->dst);
-
-	dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
-						    inet->inet_daddr,
-						    inet->inet_sport,
-						    inet->inet_dport);
-	atomic_set(&inet->inet_id, get_random_u16());
-
-	err = dccp_connect(sk);
-	rt = NULL;
-	if (err != 0)
-		goto failure;
-out:
-	return err;
-failure:
-	/*
-	 * This unhashes the socket and releases the local port, if necessary.
-	 */
-	dccp_set_state(sk, DCCP_CLOSED);
-	inet_bhash2_reset_saddr(sk);
-	ip_rt_put(rt);
-	sk->sk_route_caps = 0;
-	inet->inet_dport = 0;
-	goto out;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_connect);
-
-/*
- * This routine does path mtu discovery as defined in RFC1191.
- */
-static inline void dccp_do_pmtu_discovery(struct sock *sk,
-					  const struct iphdr *iph,
-					  u32 mtu)
-{
-	struct dst_entry *dst;
-	const struct inet_sock *inet = inet_sk(sk);
-	const struct dccp_sock *dp = dccp_sk(sk);
-
-	/* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
-	 * sent out by Linux are always < 576 bytes so they should go through
-	 * unfragmented).
-	 */
-	if (sk->sk_state == DCCP_LISTEN)
-		return;
-
-	dst = inet_csk_update_pmtu(sk, mtu);
-	if (!dst)
-		return;
-
-	/* Something is about to be wrong... Remember soft error
-	 * for the case, if this connection will not be able to recover.
-	 */
-	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
-		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
-
-	mtu = dst_mtu(dst);
-
-	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
-	    ip_sk_accept_pmtu(sk) &&
-	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
-		dccp_sync_mss(sk, mtu);
-
-		/*
-		 * From RFC 4340, sec. 14.1:
-		 *
-		 *	DCCP-Sync packets are the best choice for upward
-		 *	probing, since DCCP-Sync probes do not risk application
-		 *	data loss.
-		 */
-		dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
-	} /* else let the usual retransmit timer handle it */
-}
-
-static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
-{
-	struct dst_entry *dst = __sk_dst_check(sk, 0);
-
-	if (dst)
-		dst->ops->redirect(dst, sk, skb);
-}
-
-void dccp_req_err(struct sock *sk, u64 seq)
-{
-	struct request_sock *req = inet_reqsk(sk);
-	struct net *net = sock_net(sk);
-
-	/*
-	 * ICMPs are not backlogged, hence we cannot get an established
-	 * socket here.
-	 */
-	if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
-		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
-	} else {
-		/*
-		 * Still in RESPOND, just remove it silently.
-		 * There is no good way to pass the error to the newly
-		 * created socket, and POSIX does not want network
-		 * errors returned from accept().
-		 */
-		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
-	}
-	reqsk_put(req);
-}
-EXPORT_SYMBOL(dccp_req_err);
-
-/*
- * This routine is called by the ICMP module when it gets some sort of error
- * condition. If err < 0 then the socket should be closed and the error
- * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
- * After adjustment header points to the first 8 bytes of the tcp header. We
- * need to find the appropriate port.
- *
- * The locking strategy used here is very "optimistic". When someone else
- * accesses the socket the ICMP is just dropped and for some paths there is no
- * check at all. A more general error queue to queue errors for later handling
- * is probably better.
- */
-static int dccp_v4_err(struct sk_buff *skb, u32 info)
-{
-	const struct iphdr *iph = (struct iphdr *)skb->data;
-	const u8 offset = iph->ihl << 2;
-	const struct dccp_hdr *dh;
-	struct dccp_sock *dp;
-	const int type = icmp_hdr(skb)->type;
-	const int code = icmp_hdr(skb)->code;
-	struct sock *sk;
-	__u64 seq;
-	int err;
-	struct net *net = dev_net(skb->dev);
-
-	if (!pskb_may_pull(skb, offset + sizeof(*dh)))
-		return -EINVAL;
-	dh = (struct dccp_hdr *)(skb->data + offset);
-	if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh)))
-		return -EINVAL;
-	iph = (struct iphdr *)skb->data;
-	dh = (struct dccp_hdr *)(skb->data + offset);
-
-	sk = __inet_lookup_established(net, &dccp_hashinfo,
-				       iph->daddr, dh->dccph_dport,
-				       iph->saddr, ntohs(dh->dccph_sport),
-				       inet_iif(skb), 0);
-	if (!sk) {
-		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-		return -ENOENT;
-	}
-
-	if (sk->sk_state == DCCP_TIME_WAIT) {
-		inet_twsk_put(inet_twsk(sk));
-		return 0;
-	}
-	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
-		dccp_req_err(sk, seq);
-		return 0;
-	}
-
-	bh_lock_sock(sk);
-	/* If too many ICMPs get dropped on busy
-	 * servers this needs to be solved differently.
-	 */
-	if (sock_owned_by_user(sk))
-		__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
-	if (sk->sk_state == DCCP_CLOSED)
-		goto out;
-
-	dp = dccp_sk(sk);
-	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
-	    !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
-		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
-		goto out;
-	}
-
-	switch (type) {
-	case ICMP_REDIRECT:
-		if (!sock_owned_by_user(sk))
-			dccp_do_redirect(skb, sk);
-		goto out;
-	case ICMP_SOURCE_QUENCH:
-		/* Just silently ignore these. */
-		goto out;
-	case ICMP_PARAMETERPROB:
-		err = EPROTO;
-		break;
-	case ICMP_DEST_UNREACH:
-		if (code > NR_ICMP_UNREACH)
-			goto out;
-
-		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
-			if (!sock_owned_by_user(sk))
-				dccp_do_pmtu_discovery(sk, iph, info);
-			goto out;
-		}
-
-		err = icmp_err_convert[code].errno;
-		break;
-	case ICMP_TIME_EXCEEDED:
-		err = EHOSTUNREACH;
-		break;
-	default:
-		goto out;
-	}
-
-	switch (sk->sk_state) {
-	case DCCP_REQUESTING:
-	case DCCP_RESPOND:
-		if (!sock_owned_by_user(sk)) {
-			__DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
-			sk->sk_err = err;
-
-			sk_error_report(sk);
-
-			dccp_done(sk);
-		} else {
-			WRITE_ONCE(sk->sk_err_soft, err);
-		}
-		goto out;
-	}
-
-	/* If we've already connected we will keep trying
-	 * until we time out, or the user gives up.
-	 *
-	 * rfc1122 4.2.3.9 allows to consider as hard errors
-	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
-	 * but it is obsoleted by pmtu discovery).
-	 *
-	 * Note, that in modern internet, where routing is unreliable
-	 * and in each dark corner broken firewalls sit, sending random
-	 * errors ordered by their masters even these two messages finally lose
-	 * their original sense (even Linux sends invalid PORT_UNREACHs)
-	 *
-	 * Now we are in compliance with RFCs.
- *				--ANK (980905)
-	 */
-
-	if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) {
-		sk->sk_err = err;
-		sk_error_report(sk);
-	} else { /* Only an error on timeout */
-		WRITE_ONCE(sk->sk_err_soft, err);
-	}
-out:
-	bh_unlock_sock(sk);
-	sock_put(sk);
-	return 0;
-}
-
-static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
-					  __be32 src, __be32 dst)
-{
-	return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	struct dccp_hdr *dh = dccp_hdr(skb);
-
-	dccp_csum_outgoing(skb);
-	dh->dccph_checksum = dccp_v4_csum_finish(skb,
-						 inet->inet_saddr,
-						 inet->inet_daddr);
-}
-EXPORT_SYMBOL_GPL(dccp_v4_send_check);
-
-static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
-{
-	return secure_dccp_sequence_number(ip_hdr(skb)->daddr,
-					   ip_hdr(skb)->saddr,
-					   dccp_hdr(skb)->dccph_dport,
-					   dccp_hdr(skb)->dccph_sport);
-}
-
-/*
- * The three way handshake has completed - we got a valid ACK or DATAACK -
- * now create the new socket.
- *
- * This is the equivalent of TCP's tcp_v4_syn_recv_sock
- */
-struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
-				       struct sk_buff *skb,
-				       struct request_sock *req,
-				       struct dst_entry *dst,
-				       struct request_sock *req_unhash,
-				       bool *own_req)
-{
-	struct inet_request_sock *ireq;
-	struct inet_sock *newinet;
-	struct sock *newsk;
-
-	if (sk_acceptq_is_full(sk))
-		goto exit_overflow;
-
-	newsk = dccp_create_openreq_child(sk, req, skb);
-	if (newsk == NULL)
-		goto exit_nonewsk;
-
-	newinet		   = inet_sk(newsk);
-	ireq		   = inet_rsk(req);
-	RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
-	newinet->mc_index  = inet_iif(skb);
-	newinet->mc_ttl	   = ip_hdr(skb)->ttl;
-	atomic_set(&newinet->inet_id, get_random_u16());
-
-	if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
-		goto put_and_exit;
-
-	sk_setup_caps(newsk, dst);
-
-	dccp_sync_mss(newsk, dst_mtu(dst));
-
-	if (__inet_inherit_port(sk, newsk) < 0)
-		goto put_and_exit;
-	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
-	if (*own_req)
-		ireq->ireq_opt = NULL;
-	else
-		newinet->inet_opt = NULL;
-	return newsk;
-
-exit_overflow:
-	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-exit_nonewsk:
-	dst_release(dst);
-exit:
-	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	return NULL;
-put_and_exit:
-	newinet->inet_opt = NULL;
-	inet_csk_prepare_forced_close(newsk);
-	dccp_done(newsk);
-	goto exit;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
-
-static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
-					   struct sk_buff *skb)
-{
-	struct rtable *rt;
-	const struct iphdr *iph = ip_hdr(skb);
-	struct flowi4 fl4 = {
-		.flowi4_oif = inet_iif(skb),
-		.daddr = iph->saddr,
-		.saddr = iph->daddr,
-		.flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))),
-		.flowi4_scope = ip_sock_rt_scope(sk),
-		.flowi4_proto = sk->sk_protocol,
-		.fl4_sport = dccp_hdr(skb)->dccph_dport,
-		.fl4_dport = dccp_hdr(skb)->dccph_sport,
-	};
-
-	security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
-	rt = ip_route_output_flow(net, &fl4, sk);
-	if (IS_ERR(rt)) {
-		IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
-		return NULL;
-	}
-
-	return &rt->dst;
-}
-
-static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req)
-{
-	int err = -1;
-	struct sk_buff *skb;
-	struct dst_entry *dst;
-	struct flowi4 fl4;
-
-	dst = inet_csk_route_req(sk, &fl4, req);
-	if (dst == NULL)
-		goto out;
-
-	skb = dccp_make_response(sk, dst, req);
-	if (skb != NULL) {
-		const struct inet_request_sock *ireq = inet_rsk(req);
-		struct dccp_hdr *dh = dccp_hdr(skb);
-
-		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
-							 ireq->ir_rmt_addr);
-		rcu_read_lock();
-		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
-					    ireq->ir_rmt_addr,
-					    rcu_dereference(ireq->ireq_opt),
-					    READ_ONCE(inet_sk(sk)->tos));
-		rcu_read_unlock();
-		err = net_xmit_eval(err);
-	}
-
-out:
-	dst_release(dst);
-	return err;
-}
-
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
-				   enum sk_rst_reason reason)
-{
-	int err;
-	const struct iphdr *rxiph;
-	struct sk_buff *skb;
-	struct dst_entry *dst;
-	struct net *net = dev_net(skb_dst(rxskb)->dev);
-	struct dccp_v4_pernet *pn;
-	struct sock *ctl_sk;
-
-	/* Never send a reset in response to a reset. */
-	if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
-		return;
-
-	if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
-		return;
-
-	pn = net_generic(net, dccp_v4_pernet_id);
-	ctl_sk = pn->v4_ctl_sk;
-	dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
-	if (dst == NULL)
-		return;
-
-	skb = dccp_ctl_make_reset(ctl_sk, rxskb);
-	if (skb == NULL)
-		goto out;
-
-	rxiph = ip_hdr(rxskb);
-	dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
-							    rxiph->daddr);
-	skb_dst_set(skb, dst_clone(dst));
-
-	local_bh_disable();
-	bh_lock_sock(ctl_sk);
-	err = ip_build_and_send_pkt(skb, ctl_sk,
-				    rxiph->daddr, rxiph->saddr, NULL,
-				    inet_sk(ctl_sk)->tos);
-	bh_unlock_sock(ctl_sk);
-
-	if (net_xmit_eval(err) == 0) {
-		__DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-		__DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
-	}
-	local_bh_enable();
-out:
-	dst_release(dst);
-}
-
-static void dccp_v4_reqsk_destructor(struct request_sock *req)
-{
-	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
-	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
-}
-
-void dccp_syn_ack_timeout(const struct request_sock *req)
-{
-}
-EXPORT_SYMBOL(dccp_syn_ack_timeout);
-
-static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
-	.family		= PF_INET,
-	.obj_size	= sizeof(struct dccp_request_sock),
-	.rtx_syn_ack	= dccp_v4_send_response,
-	.send_ack	= dccp_reqsk_send_ack,
-	.destructor	= dccp_v4_reqsk_destructor,
-	.send_reset	= dccp_v4_ctl_send_reset,
-	.syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
-{
-	struct inet_request_sock *ireq;
-	struct request_sock *req;
-	struct dccp_request_sock *dreq;
-	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
-	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
-	/* Never answer to DCCP_PKT_REQUESTs sent to broadcast or multicast */
-	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
-		return 0;	/* discard, don't send a reset here */
-
-	if (dccp_bad_service_code(sk, service)) {
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
-		goto drop;
-	}
-	/*
-	 * TW buckets are converted to open requests without
-	 * limitations, they conserve resources and peer is
-	 * evidently real one.
- */
-	dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-	if (inet_csk_reqsk_queue_is_full(sk))
-		goto drop;
-
-	if (sk_acceptq_is_full(sk))
-		goto drop;
-
-	req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
-	if (req == NULL)
-		goto drop;
-
-	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
-		goto drop_and_free;
-
-	dreq = dccp_rsk(req);
-	if (dccp_parse_options(sk, dreq, skb))
-		goto drop_and_free;
-
-	ireq = inet_rsk(req);
-	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
-	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
-	ireq->ir_mark = inet_request_mark(sk, skb);
-	ireq->ireq_family = AF_INET;
-	ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if);
-
-	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
-
-	/*
-	 * Step 3: Process LISTEN state
-	 *
-	 * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
-	 *
-	 * Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
-	 */
-	dreq->dreq_isr	   = dcb->dccpd_seq;
-	dreq->dreq_gsr	   = dreq->dreq_isr;
-	dreq->dreq_iss	   = dccp_v4_init_sequence(skb);
-	dreq->dreq_gss     = dreq->dreq_iss;
-	dreq->dreq_service = service;
-
-	if (dccp_v4_send_response(sk, req))
-		goto drop_and_free;
-
-	if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT)))
-		reqsk_free(req);
-	else
-		reqsk_put(req);
-
-	return 0;
-
-drop_and_free:
-	reqsk_free(req);
-drop:
-	__DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
-	return -1;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
-	struct dccp_hdr *dh = dccp_hdr(skb);
-
-	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
-		if (dccp_rcv_established(sk, skb, dh, skb->len))
-			goto reset;
-		return 0;
-	}
-
-	/*
-	 *  Step 3: Process LISTEN state
-	 *	 If P.type == Request or P contains a valid Init Cookie option,
-	 *	      (* Must scan the packet's options to check for Init
-	 *		 Cookies.  Only Init Cookies are processed here,
-	 *		 however; other options are processed in Step 8.  This
-	 *		 scan need only be performed if the endpoint uses Init
-	 *		 Cookies *)
-	 *	      (* Generate a new socket and switch to that socket *)
-	 *	      Set S := new socket for this port pair
-	 *	      S.state = RESPOND
-	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
-	 *	      Initialize S.GAR := S.ISS
-	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
-	 *	      Continue with S.state == RESPOND
-	 *	      (* A Response packet will be generated in Step 11 *)
-	 *	 Otherwise,
-	 *	      Generate Reset(No Connection) unless P.type == Reset
-	 *	      Drop packet and return
-	 *
-	 * NOTE: the check for the packet types is done in
-	 *	 dccp_rcv_state_process
-	 */
-
-	if (dccp_rcv_state_process(sk, skb, dh, skb->len))
-		goto reset;
-	return 0;
-
-reset:
-	dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-	kfree_skb(skb);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
-
-/**
- *	dccp_invalid_packet  -  check for malformed packets
- *	@skb: Packet to validate
- *
- *	Implements RFC 4340, 8.5:  Step 1: Check header basics
- *	Packets that fail these checks are ignored and do not receive Resets.
- */
-int dccp_invalid_packet(struct sk_buff *skb)
-{
-	const struct dccp_hdr *dh;
-	unsigned int cscov;
-	u8 dccph_doff;
-
-	if (skb->pkt_type != PACKET_HOST)
-		return 1;
-
-	/* If the packet is shorter than 12 bytes, drop packet and return */
-	if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
-		DCCP_WARN("pskb_may_pull failed\n");
-		return 1;
-	}
-
-	dh = dccp_hdr(skb);
-
-	/* If P.type is not understood, drop packet and return */
-	if (dh->dccph_type >= DCCP_PKT_INVALID) {
-		DCCP_WARN("invalid packet type\n");
-		return 1;
-	}
-
-	/*
-	 * If P.Data Offset is too small for packet type, drop packet and return
-	 */
-	dccph_doff = dh->dccph_doff;
-	if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
-		DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
-		return 1;
-	}
-	/*
-	 * If P.Data Offset is too large for packet, drop packet and return
-	 */
-	if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
-		DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
-		return 1;
-	}
-	dh = dccp_hdr(skb);
-	/*
-	 * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
-	 * has short sequence numbers), drop packet and return
-	 */
-	if ((dh->dccph_type < DCCP_PKT_DATA ||
-	    dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0)  {
-		DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
-			  dccp_packet_name(dh->dccph_type));
-		return 1;
-	}
-
-	/*
-	 * If P.CsCov is too large for the packet size, drop packet and return.
-	 * This must come _before_ checksumming (not as RFC 4340 suggests).
-	 */
-	cscov = dccp_csum_coverage(skb);
-	if (cscov > skb->len) {
-		DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
-			  dh->dccph_cscov, skb->len);
-		return 1;
-	}
-
-	/* If header checksum is incorrect, drop packet and return.
-	 * (This step is completed in the AF-dependent functions.) */
-	skb->csum = skb_checksum(skb, 0, cscov, 0);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_invalid_packet);
-
-/* this is called when real data arrives */
-static int dccp_v4_rcv(struct sk_buff *skb)
-{
-	const struct dccp_hdr *dh;
-	const struct iphdr *iph;
-	bool refcounted;
-	struct sock *sk;
-	int min_cov;
-
-	/* Step 1: Check header basics */
-
-	if (dccp_invalid_packet(skb))
-		goto discard_it;
-
-	iph = ip_hdr(skb);
-	/* Step 1: If header checksum is incorrect, drop packet and return */
-	if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
-		DCCP_WARN("dropped packet with invalid checksum\n");
-		goto discard_it;
-	}
-
-	dh = dccp_hdr(skb);
-
-	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(dh);
-	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
-	dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
-		      dccp_packet_name(dh->dccph_type),
-		      &iph->saddr, ntohs(dh->dccph_sport),
-		      &iph->daddr, ntohs(dh->dccph_dport),
-		      (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
-
-	if (dccp_packet_without_ack(skb)) {
-		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
-		dccp_pr_debug_cat("\n");
-	} else {
-		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
-		dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
-				  DCCP_SKB_CB(skb)->dccpd_ack_seq);
-	}
-
-lookup:
-	sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
-			       dh->dccph_sport, dh->dccph_dport, 0, &refcounted);
-	if (!sk) {
-		dccp_pr_debug("failed to look up flow ID in table and "
-			      "get corresponding socket\n");
-		goto no_dccp_socket;
-	}
-
-	/*
-	 * Step 2:
-	 *	... or S.state == TIMEWAIT,
-	 *		Generate Reset(No Connection) unless P.type == Reset
-	 *		Drop packet and return
-	 */
-	if (sk->sk_state == DCCP_TIME_WAIT) {
-		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
-		inet_twsk_put(inet_twsk(sk));
-		goto no_dccp_socket;
-	}
-
-	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
-		struct request_sock *req = inet_reqsk(sk);
-		struct sock *nsk;
-
-		sk = req->rsk_listener;
-		if (unlikely(sk->sk_state != DCCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
-		}
-		sock_hold(sk);
-		refcounted = true;
-		nsk = dccp_check_req(sk, skb, req);
-		if (!nsk) {
-			reqsk_put(req);
-			goto discard_and_relse;
-		}
-		if (nsk == sk) {
-			reqsk_put(req);
-		} else if (dccp_child_process(sk, nsk, skb)) {
-			dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-			goto discard_and_relse;
-		} else {
-			sock_put(sk);
-			return 0;
-		}
-	}
-	/*
-	 * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
-	 *	o if MinCsCov = 0, only packets with CsCov = 0 are accepted
-	 *	o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
-	 */
-	min_cov = dccp_sk(sk)->dccps_pcrlen;
-	if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov))  {
-		dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
-			      dh->dccph_cscov, min_cov);
-		/* FIXME: "Such packets SHOULD be reported using Data Dropped
-		 *         options (Section 11.7) with Drop Code 0, Protocol
-		 *         Constraints." */
-		goto discard_and_relse;
-	}
-
-	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
-		goto discard_and_relse;
-	nf_reset_ct(skb);
-
-	return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted);
-
-no_dccp_socket:
-	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
-		goto discard_it;
-	/*
-	 * Step 2:
-	 *	If no socket ...
-	 *		Generate Reset(No Connection) unless P.type == Reset
-	 *		Drop packet and return
-	 */
-	if (dh->dccph_type != DCCP_PKT_RESET) {
-		DCCP_SKB_CB(skb)->dccpd_reset_code =
-					DCCP_RESET_CODE_NO_CONNECTION;
-		dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-	}
-
-discard_it:
-	kfree_skb(skb);
-	return 0;
-
-discard_and_relse:
-	if (refcounted)
-		sock_put(sk);
-	goto discard_it;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
-	.queue_xmit	   = ip_queue_xmit,
-	.send_check	   = dccp_v4_send_check,
-	.rebuild_header	   = inet_sk_rebuild_header,
-	.conn_request	   = dccp_v4_conn_request,
-	.syn_recv_sock	   = dccp_v4_request_recv_sock,
-	.net_header_len	   = sizeof(struct iphdr),
-	.setsockopt	   = ip_setsockopt,
-	.getsockopt	   = ip_getsockopt,
-};
-
-static int dccp_v4_init_sock(struct sock *sk)
-{
-	static __u8 dccp_v4_ctl_sock_initialized;
-	int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
-
-	if (err == 0) {
-		if (unlikely(!dccp_v4_ctl_sock_initialized))
-			dccp_v4_ctl_sock_initialized = 1;
-		inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
-	}
-
-	return err;
-}
-
-static struct timewait_sock_ops dccp_timewait_sock_ops = {
-	.twsk_obj_size	= sizeof(struct inet_timewait_sock),
-};
-
-static struct proto dccp_v4_prot = {
-	.name			= "DCCP",
-	.owner			= THIS_MODULE,
-	.close			= dccp_close,
-	.connect		= dccp_v4_connect,
-	.disconnect		= dccp_disconnect,
-	.ioctl			= dccp_ioctl,
-	.init			= dccp_v4_init_sock,
-	.setsockopt		= dccp_setsockopt,
-	.getsockopt		= dccp_getsockopt,
-	.sendmsg		= dccp_sendmsg,
-	.recvmsg		= dccp_recvmsg,
-	.backlog_rcv		= dccp_v4_do_rcv,
-	.hash			= inet_hash,
-	.unhash			= inet_unhash,
-	.accept			= inet_csk_accept,
-	.get_port		= inet_csk_get_port,
-	.shutdown		= dccp_shutdown,
-	.destroy		= dccp_destroy_sock,
-	.orphan_count		= &dccp_orphan_count,
-	.max_header		= MAX_DCCP_HEADER,
-	.obj_size		= sizeof(struct dccp_sock),
-	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
-	.rsk_prot		= &dccp_request_sock_ops,
-	.twsk_prot		= &dccp_timewait_sock_ops,
-	.h.hashinfo		= &dccp_hashinfo,
-};
-
-static const struct net_protocol dccp_v4_protocol = {
-	.handler	= dccp_v4_rcv,
-	.err_handler	= dccp_v4_err,
-	.no_policy	= 1,
-	.icmp_strict_tag_validation = 1,
-};
-
-static const struct proto_ops inet_dccp_ops = {
-	.family		   = PF_INET,
-	.owner		   = THIS_MODULE,
-	.release	   = inet_release,
-	.bind		   = inet_bind,
-	.connect	   = inet_stream_connect,
-	.socketpair	   = sock_no_socketpair,
-	.accept		   = inet_accept,
-	.getname	   = inet_getname,
-	/* FIXME: work on tcp_poll to rename it to inet_csk_poll */
-	.poll		   = dccp_poll,
-	.ioctl		   = inet_ioctl,
-	.gettstamp	   = sock_gettstamp,
-	/* FIXME: work on inet_listen to rename it to sock_common_listen */
-	.listen		   = inet_dccp_listen,
-	.shutdown	   = inet_shutdown,
-	.setsockopt	   = sock_common_setsockopt,
-	.getsockopt	   = sock_common_getsockopt,
-	.sendmsg	   = inet_sendmsg,
-	.recvmsg	   = sock_common_recvmsg,
-	.mmap		   = sock_no_mmap,
-};
-
-static struct inet_protosw dccp_v4_protosw = {
-	.type		= SOCK_DCCP,
-	.protocol	= IPPROTO_DCCP,
-	.prot		= &dccp_v4_prot,
-	.ops		= &inet_dccp_ops,
-	.flags		= INET_PROTOSW_ICSK,
-};
-
-static int __net_init dccp_v4_init_net(struct net *net)
-{
-	struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
-
-	if (dccp_hashinfo.bhash == NULL)
-		return -ESOCKTNOSUPPORT;
-
-	return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET,
-				    SOCK_DCCP, IPPROTO_DCCP, net);
-}
-
-static void __net_exit dccp_v4_exit_net(struct net *net)
-{
-	struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id);
-
-	inet_ctl_sock_destroy(pn->v4_ctl_sk);
-}
-
-static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
-{
-	inet_twsk_purge(&dccp_hashinfo);
-}
-
-static struct pernet_operations dccp_v4_ops = {
-	.init	= dccp_v4_init_net,
-	.exit	= dccp_v4_exit_net,
-	.exit_batch = dccp_v4_exit_batch,
-	.id	= &dccp_v4_pernet_id,
-	.size   = sizeof(struct dccp_v4_pernet),
-};
-
-static int __init dccp_v4_init(void)
-{
-	int err = proto_register(&dccp_v4_prot, 1);
-
-	if (err)
-		goto out;
-
-	inet_register_protosw(&dccp_v4_protosw);
-
-	err = register_pernet_subsys(&dccp_v4_ops);
-	if (err)
-		goto out_destroy_ctl_sock;
-
-	err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
-	if (err)
-		goto out_proto_unregister;
-
-out:
-	return err;
-out_proto_unregister:
-	unregister_pernet_subsys(&dccp_v4_ops);
-out_destroy_ctl_sock:
-	inet_unregister_protosw(&dccp_v4_protosw);
-	proto_unregister(&dccp_v4_prot);
-	goto out;
-}
-
-static void __exit dccp_v4_exit(void)
-{
-	inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
-	unregister_pernet_subsys(&dccp_v4_ops);
-	inet_unregister_protosw(&dccp_v4_protosw);
-	proto_unregister(&dccp_v4_prot);
-}
-
-module_init(dccp_v4_init);
-module_exit(dccp_v4_exit);
-
-/*
- * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly. Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo ");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
deleted file mode 100644
index e24dbffabfc19..0000000000000
--- a/net/dccp/ipv6.c
+++ /dev/null
@@ -1,1174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *	DCCP over IPv6
- *	Linux INET6 implementation
- *
- *	Based on net/dccp6/ipv6.c
- *
- *	Arnaldo Carvalho de Melo
- */
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "dccp.h"
-#include "ipv6.h"
-#include "feat.h"
-
-struct dccp_v6_pernet {
-	struct sock *v6_ctl_sk;
-};
-
-static unsigned int dccp_v6_pernet_id __read_mostly;
-
-/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
-
-/* add pseudo-header to DCCP checksum stored in skb->csum */
-static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
-					  const struct in6_addr *saddr,
-					  const struct in6_addr *daddr)
-{
-	return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
-{
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct dccp_hdr *dh = dccp_hdr(skb);
-
-	dccp_csum_outgoing(skb);
-	dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
-}
-
-static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
-{
-	return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
-					     ipv6_hdr(skb)->saddr.s6_addr32,
-					     dccp_hdr(skb)->dccph_dport,
-					     dccp_hdr(skb)->dccph_sport);
-
-}
-
-static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
-		       u8 type, u8 code, int offset, __be32 info)
-{
-	const struct ipv6hdr *hdr;
-	const struct dccp_hdr *dh;
-	struct dccp_sock *dp;
-	struct ipv6_pinfo *np;
-	struct sock *sk;
-	int err;
-	__u64 seq;
-	struct net *net = dev_net(skb->dev);
-
-	if (!pskb_may_pull(skb, offset + sizeof(*dh)))
-		return -EINVAL;
-	dh = (struct dccp_hdr *)(skb->data + offset);
-	if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh)))
-		return -EINVAL;
-	hdr = (const struct ipv6hdr *)skb->data;
-	dh = (struct dccp_hdr *)(skb->data + offset);
-
-	sk = __inet6_lookup_established(net, &dccp_hashinfo,
-					&hdr->daddr, dh->dccph_dport,
-					&hdr->saddr, ntohs(dh->dccph_sport),
-					inet6_iif(skb), 0);
-
-	if (!sk) {
-		__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
-				  ICMP6_MIB_INERRORS);
-		return -ENOENT;
-	}
-
-	if (sk->sk_state == DCCP_TIME_WAIT) {
-		inet_twsk_put(inet_twsk(sk));
-		return 0;
-	}
-	seq = dccp_hdr_seq(dh);
-	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
-		dccp_req_err(sk, seq);
-		return 0;
-	}
-
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
-		__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
-	if (sk->sk_state == DCCP_CLOSED)
-		goto out;
-
-	dp = dccp_sk(sk);
-	if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
-	    !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
-		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
-		goto out;
-	}
-
-	np = inet6_sk(sk);
-
-	if (type == NDISC_REDIRECT) {
-		if (!sock_owned_by_user(sk)) {
-			struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
-
-			if (dst)
-				dst->ops->redirect(dst, sk, skb);
-		}
-		goto out;
-	}
-
-	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dst_entry *dst = NULL;
-
-		if (!ip6_sk_accept_pmtu(sk))
-			goto out;
-
-		if (sock_owned_by_user(sk))
-			goto out;
-		if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
-			goto out;
-
-		dst = inet6_csk_update_pmtu(sk, ntohl(info));
-		if (!dst)
-			goto out;
-
-		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
-			dccp_sync_mss(sk, dst_mtu(dst));
-		goto out;
-	}
-
-	icmpv6_err_convert(type, code, &err);
-
-	/* Might be for a request_sock */
-	switch (sk->sk_state) {
-	case DCCP_REQUESTING:
-	case DCCP_RESPOND:  /* Cannot happen.
-			       It can, if SYNs are crossed. --ANK */
-		if (!sock_owned_by_user(sk)) {
-			__DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
-			sk->sk_err = err;
-			/*
-			 * Wake people up to see the error
-			 * (see connect in sock.c)
-			 */
-			sk_error_report(sk);
-			dccp_done(sk);
-		} else {
-			WRITE_ONCE(sk->sk_err_soft, err);
-		}
-		goto out;
-	}
-
-	if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
-		sk->sk_err = err;
-		sk_error_report(sk);
-	} else {
-		WRITE_ONCE(sk->sk_err_soft, err);
-	}
-out:
-	bh_unlock_sock(sk);
-	sock_put(sk);
-	return 0;
-}
-
-
-static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req)
-{
-	struct inet_request_sock *ireq = inet_rsk(req);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct sk_buff *skb;
-	struct in6_addr *final_p, final;
-	struct flowi6 fl6;
-	int err = -1;
-	struct dst_entry *dst;
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.flowi6_proto = IPPROTO_DCCP;
-	fl6.daddr = ireq->ir_v6_rmt_addr;
-	fl6.saddr = ireq->ir_v6_loc_addr;
-	fl6.flowlabel = 0;
-	fl6.flowi6_oif = ireq->ir_iif;
-	fl6.fl6_dport = ireq->ir_rmt_port;
-	fl6.fl6_sport = htons(ireq->ir_num);
-	security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
-
-
-	rcu_read_lock();
-	final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
-	rcu_read_unlock();
-
-	dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
-	if (IS_ERR(dst)) {
-		err = PTR_ERR(dst);
-		dst = NULL;
-		goto done;
-	}
-
-	skb = dccp_make_response(sk, dst, req);
-	if (skb != NULL) {
-		struct dccp_hdr *dh = dccp_hdr(skb);
-		struct ipv6_txoptions *opt;
-
-		dh->dccph_checksum = dccp_v6_csum_finish(skb,
-							 &ireq->ir_v6_loc_addr,
-							 &ireq->ir_v6_rmt_addr);
-		fl6.daddr = ireq->ir_v6_rmt_addr;
-		rcu_read_lock();
-		opt = ireq->ipv6_opt;
-		if (!opt)
-			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt,
-			       np->tclass, READ_ONCE(sk->sk_priority));
-		rcu_read_unlock();
-		err = net_xmit_eval(err);
-	}
-
-done:
-	dst_release(dst);
-	return err;
-}
-
-static void dccp_v6_reqsk_destructor(struct request_sock *req)
-{
-	dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
-	kfree(inet_rsk(req)->ipv6_opt);
-	kfree_skb(inet_rsk(req)->pktopts);
-}
-
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
-				   enum sk_rst_reason reason)
-{
-	const struct ipv6hdr *rxip6h;
-	struct sk_buff *skb;
-	struct flowi6 fl6;
-	struct net *net = dev_net(skb_dst(rxskb)->dev);
-	struct dccp_v6_pernet *pn;
-	struct sock *ctl_sk;
-	struct dst_entry *dst;
-
-	if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
-		return;
-
-	if (!ipv6_unicast_destination(rxskb))
-		return;
-
-	pn = net_generic(net, dccp_v6_pernet_id);
-	ctl_sk = pn->v6_ctl_sk;
-	skb = dccp_ctl_make_reset(ctl_sk, rxskb);
-	if (skb == NULL)
-		return;
-
-	rxip6h = ipv6_hdr(rxskb);
-	dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
-							    &rxip6h->daddr);
-
-	memset(&fl6, 0, sizeof(fl6));
-	fl6.daddr = rxip6h->saddr;
-	fl6.saddr = rxip6h->daddr;
-
-	fl6.flowi6_proto = IPPROTO_DCCP;
-	fl6.flowi6_oif = inet6_iif(rxskb);
-	fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
-	fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
-	security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6));
-
-	/* sk = NULL, but it is safe for now. RST socket required. */
-	dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL);
-	if (!IS_ERR(dst)) {
-		skb_dst_set(skb, dst);
-		ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0);
-		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-		DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
-		return;
-	}
-
-	kfree_skb(skb);
-}
-
-static struct request_sock_ops dccp6_request_sock_ops = {
-	.family		= AF_INET6,
-	.obj_size	= sizeof(struct dccp6_request_sock),
-	.rtx_syn_ack	= dccp_v6_send_response,
-	.send_ack	= dccp_reqsk_send_ack,
-	.destructor	= dccp_v6_reqsk_destructor,
-	.send_reset	= dccp_v6_ctl_send_reset,
-	.syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
-{
-	struct request_sock *req;
-	struct dccp_request_sock *dreq;
-	struct inet_request_sock *ireq;
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
-	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
-	if (skb->protocol == htons(ETH_P_IP))
-		return dccp_v4_conn_request(sk, skb);
-
-	if (!ipv6_unicast_destination(skb))
-		return 0;	/* discard, don't send a reset here */
-
-	if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
-		__IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
-		return 0;
-	}
-
-	if (dccp_bad_service_code(sk, service)) {
-		dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
-		goto drop;
-	}
-	/*
-	 * There are no SYN attacks on IPv6, yet...
-	 */
-	dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-	if (inet_csk_reqsk_queue_is_full(sk))
-		goto drop;
-
-	if (sk_acceptq_is_full(sk))
-		goto drop;
-
-	req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
-	if (req == NULL)
-		goto drop;
-
-	if (dccp_reqsk_init(req, dccp_sk(sk), skb))
-		goto drop_and_free;
-
-	dreq = dccp_rsk(req);
-	if (dccp_parse_options(sk, dreq, skb))
-		goto drop_and_free;
-
-	ireq = inet_rsk(req);
-	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
-	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
-	ireq->ir_rmt_addr = LOOPBACK4_IPV6;
-	ireq->ir_loc_addr = LOOPBACK4_IPV6;
-
-	ireq->ireq_family = AF_INET6;
-	ireq->ir_mark = inet_request_mark(sk, skb);
-
-	if (security_inet_conn_request(sk, skb, req))
-		goto drop_and_free;
-
-	if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
-	    np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
-	    np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
-		refcount_inc(&skb->users);
-		ireq->pktopts = skb;
-	}
-	ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if);
-
-	/* So that link locals have meaning */
-	if (!ireq->ir_iif &&
-	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
-		ireq->ir_iif = inet6_iif(skb);
-
-	/*
-	 * Step 3: Process LISTEN state
-	 *
-	 *   Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
-	 *
-	 *   Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
- */
-	dreq->dreq_isr	   = dcb->dccpd_seq;
-	dreq->dreq_gsr     = dreq->dreq_isr;
-	dreq->dreq_iss	   = dccp_v6_init_sequence(skb);
-	dreq->dreq_gss     = dreq->dreq_iss;
-	dreq->dreq_service = service;
-
-	if (dccp_v6_send_response(sk, req))
-		goto drop_and_free;
-
-	if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT)))
-		reqsk_free(req);
-	else
-		reqsk_put(req);
-
-	return 0;
-
-drop_and_free:
-	reqsk_free(req);
-drop:
-	__DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
-	return -1;
-}
-
-static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
-					      struct sk_buff *skb,
-					      struct request_sock *req,
-					      struct dst_entry *dst,
-					      struct request_sock *req_unhash,
-					      bool *own_req)
-{
-	struct inet_request_sock *ireq = inet_rsk(req);
-	struct ipv6_pinfo *newnp;
-	const struct ipv6_pinfo *np = inet6_sk(sk);
-	struct ipv6_txoptions *opt;
-	struct inet_sock *newinet;
-	struct dccp6_sock *newdp6;
-	struct sock *newsk;
-
-	if (skb->protocol == htons(ETH_P_IP)) {
-		/*
-		 *	v6 mapped
-		 */
-		newsk = dccp_v4_request_recv_sock(sk, skb, req, dst,
-						  req_unhash, own_req);
-		if (newsk == NULL)
-			return NULL;
-
-		newdp6 = (struct dccp6_sock *)newsk;
-		newinet = inet_sk(newsk);
-		newinet->pinet6 = &newdp6->inet6;
-		newnp = inet6_sk(newsk);
-
-		memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
-		newnp->saddr = newsk->sk_v6_rcv_saddr;
-
-		inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
-		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
-		newnp->pktoptions  = NULL;
-		newnp->opt	   = NULL;
-		newnp->ipv6_mc_list = NULL;
-		newnp->ipv6_ac_list = NULL;
-		newnp->ipv6_fl_list = NULL;
-		newnp->mcast_oif   = inet_iif(skb);
-		newnp->mcast_hops  = ip_hdr(skb)->ttl;
-
-		/*
-		 * No need to charge this sock to the relevant IPv6 refcnt debug socks count
-		 * here, dccp_create_openreq_child now does this for us, see the comment in
-		 * that function for the gory details. -acme
-		 */
-
-		/* It is tricky place. Until this moment IPv4 tcp
-		   worked with IPv6 icsk.icsk_af_ops.
-		   Sync it now.
-		 */
-		dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
-
-		return newsk;
-	}
-
-
-	if (sk_acceptq_is_full(sk))
-		goto out_overflow;
-
-	if (!dst) {
-		struct flowi6 fl6;
-
-		dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP);
-		if (!dst)
-			goto out;
-	}
-
-	newsk = dccp_create_openreq_child(sk, req, skb);
-	if (newsk == NULL)
-		goto out_nonewsk;
-
-	/*
-	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
-	 * count here, dccp_create_openreq_child now does this for us, see the
-	 * comment in that function for the gory details. -acme
-	 */
-
-	ip6_dst_store(newsk, dst, NULL, NULL);
-	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
-						      NETIF_F_TSO);
-	newdp6 = (struct dccp6_sock *)newsk;
-	newinet = inet_sk(newsk);
-	newinet->pinet6 = &newdp6->inet6;
-	newnp = inet6_sk(newsk);
-
-	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
-	newnp->saddr = ireq->ir_v6_loc_addr;
-
-	/* Now IPv6 options...
-
-	   First: no IPv4 options.
-	 */
-	newinet->inet_opt = NULL;
-
-	/* Clone RX bits */
-	newnp->rxopt.all = np->rxopt.all;
-
-	newnp->ipv6_mc_list = NULL;
-	newnp->ipv6_ac_list = NULL;
-	newnp->ipv6_fl_list = NULL;
-	newnp->pktoptions = NULL;
-	newnp->opt	  = NULL;
-	newnp->mcast_oif  = inet6_iif(skb);
-	newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
-
-	/*
-	 * Clone native IPv6 options from listening socket (if any)
-	 *
-	 * Yes, keeping reference count would be much more clever, but we make
-	 * one more thing there: reattach optmem to newsk.
- */
-	opt = ireq->ipv6_opt;
-	if (!opt)
-		opt = rcu_dereference(np->opt);
-	if (opt) {
-		opt = ipv6_dup_options(newsk, opt);
-		RCU_INIT_POINTER(newnp->opt, opt);
-	}
-	inet_csk(newsk)->icsk_ext_hdr_len = 0;
-	if (opt)
-		inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
-						    opt->opt_flen;
-
-	dccp_sync_mss(newsk, dst_mtu(dst));
-
-	if (__inet_inherit_port(sk, newsk) < 0) {
-		inet_csk_prepare_forced_close(newsk);
-		dccp_done(newsk);
-		goto out;
-	}
-	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL);
-	/* Clone pktoptions received with SYN, if we own the req */
-	if (*own_req && ireq->pktopts) {
-		newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
-		consume_skb(ireq->pktopts);
-		ireq->pktopts = NULL;
-	}
-
-	return newsk;
-
-out_overflow:
-	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out_nonewsk:
-	dst_release(dst);
-out:
-	__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	return NULL;
-}
-
-/* The socket must have its spinlock held when we get
- * here.
- *
- * We have a potential double-lock case here, so even when
- * doing backlog processing we use the BH locking scheme.
- * This is because we cannot sleep with the original spinlock
- * held.
- */
-static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct sk_buff *opt_skb = NULL;
-
-	/* Imagine: socket is IPv6. IPv4 packet arrives,
-	   goes to IPv4 receive handler and backlogged.
-	   From backlog it always goes here. Kerboom...
-	   Fortunately, dccp_rcv_established and rcv_established
-	   handle them correctly, but it is not case with
-	   dccp_v6_hnd_req and dccp_v6_ctl_send_reset().   --ANK
-	 */
-
-	if (skb->protocol == htons(ETH_P_IP))
-		return dccp_v4_do_rcv(sk, skb);
-
-	if (sk_filter(sk, skb))
-		goto discard;
-
-	/*
-	 * socket locking is here for SMP purposes as backlog rcv is currently
-	 * called with bh processing disabled.
-	 */
-
-	/* Do Stevens' IPV6_PKTOPTIONS.
-
-	   Yes, guys, it is the only place in our code, where we
-	   may make it not affecting IPv4.
-	   The rest of code is protocol independent,
-	   and I do not like idea to uglify IPv4.
-
-	   Actually, all the idea behind IPV6_PKTOPTIONS
-	   looks not very well thought. For now we latch
-	   options, received in the last packet, enqueued
-	   by tcp. Feel free to propose better solution.
-					       --ANK (980728)
-	 */
-	if (np->rxopt.all && sk->sk_state != DCCP_LISTEN)
-		opt_skb = skb_clone_and_charge_r(skb, sk);
-
-	if (sk->sk_state == DCCP_OPEN) { /* Fast path */
-		if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
-			goto reset;
-		if (opt_skb)
-			goto ipv6_pktoptions;
-		return 0;
-	}
-
-	/*
-	 *  Step 3: Process LISTEN state
-	 *     If S.state == LISTEN,
-	 *	 If P.type == Request or P contains a valid Init Cookie option,
-	 *	      (* Must scan the packet's options to check for Init
-	 *		 Cookies.  Only Init Cookies are processed here,
-	 *		 however; other options are processed in Step 8.  This
-	 *		 scan need only be performed if the endpoint uses Init
-	 *		 Cookies *)
-	 *	      (* Generate a new socket and switch to that socket *)
-	 *	      Set S := new socket for this port pair
-	 *	      S.state = RESPOND
-	 *	      Choose S.ISS (initial seqno) or set from Init Cookies
-	 *	      Initialize S.GAR := S.ISS
-	 *	      Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
-	 *	      Continue with S.state == RESPOND
-	 *	      (* A Response packet will be generated in Step 11 *)
-	 *	 Otherwise,
-	 *	      Generate Reset(No Connection) unless P.type == Reset
-	 *	      Drop packet and return
-	 *
-	 * NOTE: the check for the packet types is done in
-	 *	 dccp_rcv_state_process
-	 */
-
-	if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
-		goto reset;
-	if (opt_skb)
-		goto ipv6_pktoptions;
-	return 0;
-
-reset:
-	dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-discard:
-	if (opt_skb != NULL)
-		__kfree_skb(opt_skb);
-	kfree_skb(skb);
-	return 0;
-
-/* Handling IPV6_PKTOPTIONS skb the similar
- * way it's done for net/ipv6/tcp_ipv6.c
- */
-ipv6_pktoptions:
-	if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) {
-		if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
-			WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb));
-		if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
-			WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit);
-		if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
-			np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
-		if (inet6_test_bit(REPFLOW, sk))
-			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
-		if (ipv6_opt_accepted(sk, opt_skb,
-				      &DCCP_SKB_CB(opt_skb)->header.h6)) {
-			memmove(IP6CB(opt_skb),
-				&DCCP_SKB_CB(opt_skb)->header.h6,
-				sizeof(struct inet6_skb_parm));
-			opt_skb = xchg(&np->pktoptions, opt_skb);
-		} else {
-			__kfree_skb(opt_skb);
-			opt_skb = xchg(&np->pktoptions, NULL);
-		}
-	}
-
-	kfree_skb(opt_skb);
-	return 0;
-}
-
-static int dccp_v6_rcv(struct sk_buff *skb)
-{
-	const struct dccp_hdr *dh;
-	bool refcounted;
-	struct sock *sk;
-	int min_cov;
-
-	/* Step 1: Check header basics */
-
-	if (dccp_invalid_packet(skb))
-		goto discard_it;
-
-	/* Step 1: If header checksum is incorrect, drop packet and return. */
-	if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
-				     &ipv6_hdr(skb)->daddr)) {
-		DCCP_WARN("dropped packet with invalid checksum\n");
-		goto discard_it;
-	}
-
-	dh = dccp_hdr(skb);
-
-	DCCP_SKB_CB(skb)->dccpd_seq  = dccp_hdr_seq(dh);
-	DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
-	if (dccp_packet_without_ack(skb))
-		DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
-	else
-		DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
-
-lookup:
-	sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
-			        dh->dccph_sport, dh->dccph_dport,
-				inet6_iif(skb), 0, &refcounted);
-	if (!sk) {
-		dccp_pr_debug("failed to look up flow ID in table and "
-			      "get corresponding socket\n");
-		goto no_dccp_socket;
-	}
-
-	/*
-	 * Step 2:
-	 *	... or S.state == TIMEWAIT,
-	 *		Generate Reset(No Connection) unless P.type == Reset
-	 *		Drop packet and return
-	 */
-	if (sk->sk_state == DCCP_TIME_WAIT) {
-		dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
-		inet_twsk_put(inet_twsk(sk));
-		goto no_dccp_socket;
-	}
-
-	if (sk->sk_state == DCCP_NEW_SYN_RECV) {
-		struct request_sock *req = inet_reqsk(sk);
-		struct sock *nsk;
-
-		sk = req->rsk_listener;
-		if (unlikely(sk->sk_state != DCCP_LISTEN)) {
-			inet_csk_reqsk_queue_drop_and_put(sk, req);
-			goto lookup;
-		}
-		sock_hold(sk);
-		refcounted = true;
-		nsk = dccp_check_req(sk, skb, req);
-		if (!nsk) {
-			reqsk_put(req);
-			goto discard_and_relse;
-		}
-		if (nsk == sk) {
-			reqsk_put(req);
-		} else if (dccp_child_process(sk, nsk, skb)) {
-			dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-			goto discard_and_relse;
-		} else {
-			sock_put(sk);
-			return 0;
-		}
-	}
-	/*
-	 * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
-	 *	o if MinCsCov = 0, only packets with CsCov = 0 are accepted
-	 *	o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
-	 */
-	min_cov = dccp_sk(sk)->dccps_pcrlen;
-	if (dh->dccph_cscov  &&  (min_cov == 0 || dh->dccph_cscov < min_cov))  {
-		dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
-			      dh->dccph_cscov, min_cov);
-		/* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
-		goto discard_and_relse;
-	}
-
-	if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
-		goto discard_and_relse;
-	nf_reset_ct(skb);
-
-	return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4,
-				refcounted) ? -1 : 0;
-
-no_dccp_socket:
-	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
-		goto discard_it;
-	/*
-	 * Step 2:
-	 *	If no socket ...
-	 *		Generate Reset(No Connection) unless P.type == Reset
-	 *		Drop packet and return
-	 */
-	if (dh->dccph_type != DCCP_PKT_RESET) {
-		DCCP_SKB_CB(skb)->dccpd_reset_code =
-					DCCP_RESET_CODE_NO_CONNECTION;
-		dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-	}
-
-discard_it:
-	kfree_skb(skb);
-	return 0;
-
-discard_and_relse:
-	if (refcounted)
-		sock_put(sk);
-	goto discard_it;
-}
-
-static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
-			   int addr_len)
-{
-	struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct in6_addr *saddr = NULL, *final_p, final;
-	struct ipv6_txoptions *opt;
-	struct flowi6 fl6;
-	struct dst_entry *dst;
-	int addr_type;
-	int err;
-
-	dp->dccps_role = DCCP_ROLE_CLIENT;
-
-	if (addr_len < SIN6_LEN_RFC2133)
-		return -EINVAL;
-
-	if (usin->sin6_family != AF_INET6)
-		return -EAFNOSUPPORT;
-
-	memset(&fl6, 0, sizeof(fl6));
-
-	if (inet6_test_bit(SNDFLOW, sk)) {
-		fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
-		IP6_ECN_flow_init(fl6.flowlabel);
-		if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
-			struct ip6_flowlabel *flowlabel;
-			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (IS_ERR(flowlabel))
-				return -EINVAL;
-			fl6_sock_release(flowlabel);
-		}
-	}
-	/*
-	 * connect() to INADDR_ANY means loopback (BSD'ism).
-	 */
-	if (ipv6_addr_any(&usin->sin6_addr))
-		usin->sin6_addr.s6_addr[15] = 1;
-
-	addr_type = ipv6_addr_type(&usin->sin6_addr);
-
-	if (addr_type & IPV6_ADDR_MULTICAST)
-		return -ENETUNREACH;
-
-	if (addr_type & IPV6_ADDR_LINKLOCAL) {
-		if (addr_len >= sizeof(struct sockaddr_in6) &&
-		    usin->sin6_scope_id) {
-			/* If interface is set while binding, indices
-			 * must coincide.
- */
-			if (sk->sk_bound_dev_if &&
-			    sk->sk_bound_dev_if != usin->sin6_scope_id)
-				return -EINVAL;
-
-			sk->sk_bound_dev_if = usin->sin6_scope_id;
-		}
-
-		/* Connect to link-local address requires an interface */
-		if (!sk->sk_bound_dev_if)
-			return -EINVAL;
-	}
-
-	sk->sk_v6_daddr = usin->sin6_addr;
-	np->flow_label = fl6.flowlabel;
-
-	/*
-	 * DCCP over IPv4
-	 */
-	if (addr_type == IPV6_ADDR_MAPPED) {
-		u32 exthdrlen = icsk->icsk_ext_hdr_len;
-		struct sockaddr_in sin;
-
-		net_dbg_ratelimited("connect: ipv4 mapped\n");
-
-		if (ipv6_only_sock(sk))
-			return -ENETUNREACH;
-
-		sin.sin_family = AF_INET;
-		sin.sin_port = usin->sin6_port;
-		sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
-
-		icsk->icsk_af_ops = &dccp_ipv6_mapped;
-		sk->sk_backlog_rcv = dccp_v4_do_rcv;
-
-		err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
-		if (err) {
-			icsk->icsk_ext_hdr_len = exthdrlen;
-			icsk->icsk_af_ops = &dccp_ipv6_af_ops;
-			sk->sk_backlog_rcv = dccp_v6_do_rcv;
-			goto failure;
-		}
-		np->saddr = sk->sk_v6_rcv_saddr;
-		return err;
-	}
-
-	if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
-		saddr = &sk->sk_v6_rcv_saddr;
-
-	fl6.flowi6_proto = IPPROTO_DCCP;
-	fl6.daddr = sk->sk_v6_daddr;
-	fl6.saddr = saddr ? *saddr : np->saddr;
-	fl6.flowi6_oif = sk->sk_bound_dev_if;
-	fl6.fl6_dport = usin->sin6_port;
-	fl6.fl6_sport = inet->inet_sport;
-	security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
-
-	opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
-	final_p = fl6_update_dst(&fl6, opt, &final);
-
-	dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
-	if (IS_ERR(dst)) {
-		err = PTR_ERR(dst);
-		goto failure;
-	}
-
-	if (saddr == NULL) {
-		saddr = &fl6.saddr;
-
-		err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
-		if (err)
-			goto failure;
-	}
-
-	/* set the source address */
-	np->saddr = *saddr;
-	inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-
-	ip6_dst_store(sk, dst, NULL, NULL);
-
-	icsk->icsk_ext_hdr_len = 0;
-	if (opt)
-		icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
-
-	inet->inet_dport = usin->sin6_port;
-
-	dccp_set_state(sk, DCCP_REQUESTING);
-	err = inet6_hash_connect(&dccp_death_row, sk);
-	if (err)
-		goto late_failure;
-
-	dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
-						      sk->sk_v6_daddr.s6_addr32,
-						      inet->inet_sport,
-						      inet->inet_dport);
-	err = dccp_connect(sk);
-	if (err)
-		goto late_failure;
-
-	return 0;
-
-late_failure:
-	dccp_set_state(sk, DCCP_CLOSED);
-	inet_bhash2_reset_saddr(sk);
-	__sk_dst_reset(sk);
-failure:
-	inet->inet_dport = 0;
-	sk->sk_route_caps = 0;
-	return err;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
-	.queue_xmit	   = inet6_csk_xmit,
-	.send_check	   = dccp_v6_send_check,
-	.rebuild_header	   = inet6_sk_rebuild_header,
-	.conn_request	   = dccp_v6_conn_request,
-	.syn_recv_sock	   = dccp_v6_request_recv_sock,
-	.net_header_len	   = sizeof(struct ipv6hdr),
-	.setsockopt	   = ipv6_setsockopt,
-	.getsockopt	   = ipv6_getsockopt,
-};
-
-/*
- *	DCCP over IPv4 via INET6 API
- */
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
-	.queue_xmit	   = ip_queue_xmit,
-	.send_check	   = dccp_v4_send_check,
-	.rebuild_header	   = inet_sk_rebuild_header,
-	.conn_request	   = dccp_v6_conn_request,
-	.syn_recv_sock	   = dccp_v6_request_recv_sock,
-	.net_header_len	   = sizeof(struct iphdr),
-	.setsockopt	   = ipv6_setsockopt,
-	.getsockopt	   = ipv6_getsockopt,
-};
-
-static void dccp_v6_sk_destruct(struct sock *sk)
-{
-	dccp_destruct_common(sk);
-	inet6_sock_destruct(sk);
-}
-
explicitly by call to - * sk_alloc() so need not be done here. - */ -static int dccp_v6_init_sock(struct sock *sk) -{ - static __u8 dccp_v6_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v6_ctl_sock_initialized)) - dccp_v6_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_destruct = dccp_v6_sk_destruct; - } - - return err; -} - -static struct timewait_sock_ops dccp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), -}; - -static struct proto dccp_v6_prot = { - .name = "DCCPv6", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v6_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v6_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v6_do_rcv, - .hash = inet6_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp6_sock), - .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp6_request_sock_ops, - .twsk_prot = &dccp6_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct inet6_protocol dccp_v6_protocol = { - .handler = dccp_v6_rcv, - .err_handler = dccp_v6_err, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, -}; - -static const struct proto_ops inet6_dccp_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet6_getname, - .poll = dccp_poll, - .ioctl = inet6_ioctl, - .gettstamp = sock_gettstamp, - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = inet6_compat_ioctl, -#endif -}; - -static struct inet_protosw dccp_v6_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v6_prot, - .ops = &inet6_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v6_init_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v6_exit_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - inet_ctl_sock_destroy(pn->v6_ctl_sk); -} - -static struct pernet_operations dccp_v6_ops = { - .init = dccp_v6_init_net, - .exit = dccp_v6_exit_net, - .id = &dccp_v6_pernet_id, - .size = sizeof(struct dccp_v6_pernet), -}; - -static int __init dccp_v6_init(void) -{ - int err = proto_register(&dccp_v6_prot, 1); - - if (err) - goto out; - - inet6_register_protosw(&dccp_v6_protosw); - - err = register_pernet_subsys(&dccp_v6_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - if (err) - goto out_unregister_proto; - -out: - return err; -out_unregister_proto: - unregister_pernet_subsys(&dccp_v6_ops); -out_destroy_ctl_sock: - 
inet6_unregister_protosw(&dccp_v6_protosw);
-	proto_unregister(&dccp_v6_prot);
-	goto out;
-}
-
-static void __exit dccp_v6_exit(void)
-{
-	inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
-	unregister_pernet_subsys(&dccp_v6_ops);
-	inet6_unregister_protosw(&dccp_v6_protosw);
-	proto_unregister(&dccp_v6_prot);
-}
-
-module_init(dccp_v6_init);
-module_exit(dccp_v6_exit);
-
-/*
- * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly. Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo ");
-MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
deleted file mode 100644
index c5d14c48def17..0000000000000
--- a/net/dccp/ipv6.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _DCCP_IPV6_H
-#define _DCCP_IPV6_H
-/*
- *  net/dccp/ipv6.h
- *
- *  An implementation of the DCCP protocol
- *  Copyright (c) 2005 Arnaldo Carvalho de Melo
- */
-
-#include 
-#include 
-
-struct dccp6_sock {
-	struct dccp_sock  dccp;
-	struct ipv6_pinfo inet6;
-};
-
-struct dccp6_request_sock {
-	struct dccp_request_sock  dccp;
-};
-
-struct dccp6_timewait_sock {
-	struct inet_timewait_sock inet;
-};
-
-#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
deleted file mode 100644
index fecc8190064f8..0000000000000
--- a/net/dccp/minisocks.c
+++ /dev/null
@@ -1,266 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  net/dccp/minisocks.c
- *
- *  An implementation of the DCCP protocol
- *  Arnaldo Carvalho de Melo
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct inet_timewait_death_row dccp_death_row = {
-	.tw_refcount = REFCOUNT_INIT(1),
-	.sysctl_max_tw_buckets = NR_FILE * 2,
-	.hashinfo = &dccp_hashinfo,
-};
-
-EXPORT_SYMBOL_GPL(dccp_death_row);
-
-void dccp_time_wait(struct sock *sk, int state, int timeo)
-{
-	struct inet_timewait_sock *tw;
-
-	tw = inet_twsk_alloc(sk, &dccp_death_row, state);
-
-	if (tw != NULL) {
-		const struct inet_connection_sock *icsk = inet_csk(sk);
-		const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if IS_ENABLED(CONFIG_IPV6)
-		if (tw->tw_family == PF_INET6) {
-			tw->tw_v6_daddr = sk->sk_v6_daddr;
-			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
-			tw->tw_ipv6only = sk->sk_ipv6only;
-		}
-#endif
-
-		/* Get the TIME_WAIT timeout firing. */
-		if (timeo < rto)
-			timeo = rto;
-
-		if (state == DCCP_TIME_WAIT)
-			timeo = DCCP_TIMEWAIT_LEN;
-
-		/* Linkage updates.
-		 * Note that access to tw after this point is illegal.
-		 */
-		inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo);
-	} else {
-		/* Sorry, if we're out of memory, just CLOSE this
-		 * socket up.  We've got bigger problems than
-		 * non-graceful socket closings.
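For reference, the floor applied in dccp_time_wait() above, (icsk_rto << 2) - (icsk_rto >> 1), works out to 3.5 * RTO. A minimal standalone sketch of that shift arithmetic (illustrative userspace C, not kernel code; the unit is whatever icsk_rto uses, i.e. jiffies):

#include <stdio.h>

/* 4*rto - rto/2 == 3.5*rto, computed with shifts as in dccp_time_wait() */
static unsigned long timewait_floor(unsigned long rto)
{
	return (rto << 2) - (rto >> 1);
}

int main(void)
{
	printf("%lu\n", timewait_floor(200));	/* prints 700, i.e. 3.5 * 200 */
	return 0;
}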
- */ - DCCP_WARN("time wait bucket table overflow\n"); - } - - dccp_done(sk); -} - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb) -{ - /* - * Step 3: Process LISTEN state - * - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - */ - struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - - if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct dccp_sock *newdp = dccp_sk(newsk); - - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_hc_rx_ccid = NULL; - newdp->dccps_hc_tx_ccid = NULL; - newdp->dccps_service = dreq->dreq_service; - newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; - newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - - INIT_LIST_HEAD(&newdp->dccps_featneg); - /* - * Step 3: Process LISTEN state - * - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). - */ - newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gss = dreq->dreq_gss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_isr = dreq->dreq_isr; - newdp->dccps_gsr = dreq->dreq_gsr; - - /* - * Activate features: initialise CCIDs, sequence windows etc. - */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - sk_free_unlock_clone(newsk); - return NULL; - } - dccp_init_xmit_timers(newsk); - - __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); - } - return newsk; -} - -EXPORT_SYMBOL_GPL(dccp_create_openreq_child); - -/* - * Process an incoming packet for RESPOND sockets represented - * as an request_sock. - */ -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) -{ - struct sock *child = NULL; - struct dccp_request_sock *dreq = dccp_rsk(req); - bool own_req; - - /* TCP/DCCP listeners became lockless. - * DCCP stores complex state in its request_sock, so we need - * a protection for them, now this code runs without being protected - * by the parent (listener) lock. - */ - spin_lock_bh(&dreq->dreq_lock); - - /* Check for retransmitted REQUEST */ - if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) { - dccp_pr_debug("Retransmitted REQUEST\n"); - dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq; - /* - * Send another RESPONSE packet - * To protect against Request floods, increment retrans - * counter (backoff, monitored by dccp_response_timer). 
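The retransmission check above relies on after48(), DCCP's comparison over its 48-bit circular sequence space. The real definitions live in net/dccp/dccp.h; the sketch below only illustrates the underlying idea (treat the masked 48-bit difference as a signed quantity), and is not the kernel's macro:

#include <stdint.h>
#include <stdio.h>

#define SEQ48_MASK 0xFFFFFFFFFFFFULL

/* "a comes after b" in 48-bit circular space: the wrapped difference
 * is nonzero and below half the sequence space. */
static int seq48_after(uint64_t a, uint64_t b)
{
	uint64_t diff = (a - b) & SEQ48_MASK;

	return diff != 0 && diff < (1ULL << 47);
}

int main(void)
{
	printf("%d\n", seq48_after(5, 2));		/* 1 */
	printf("%d\n", seq48_after(2, SEQ48_MASK));	/* 1: wrapped around */
	return 0;
}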
- */
-		inet_rtx_syn_ack(sk, req);
-	}
-	/* Network Duplicate, discard packet */
-	goto out;
-	}
-
-	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
-
-	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
-	    dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
-		goto drop;
-
-	/* Invalid ACK */
-	if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
-				dreq->dreq_iss, dreq->dreq_gss)) {
-		dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
-			      "dreq_iss=%llu, dreq_gss=%llu\n",
-			      (unsigned long long)
-			      DCCP_SKB_CB(skb)->dccpd_ack_seq,
-			      (unsigned long long) dreq->dreq_iss,
-			      (unsigned long long) dreq->dreq_gss);
-		goto drop;
-	}
-
-	if (dccp_parse_options(sk, dreq, skb))
-		goto drop;
-
-	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
-							 req, &own_req);
-	if (child) {
-		child = inet_csk_complete_hashdance(sk, child, req, own_req);
-		goto out;
-	}
-
-	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-drop:
-	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
-		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
-
-	inet_csk_reqsk_queue_drop(sk, req);
-out:
-	spin_unlock_bh(&dreq->dreq_lock);
-	return child;
-}
-
-EXPORT_SYMBOL_GPL(dccp_check_req);
-
-/*
- * Queue segment on the new socket if the new socket is active,
- * otherwise we just short-circuit this and continue with
- * the new socket.
- */
-int dccp_child_process(struct sock *parent, struct sock *child,
-		       struct sk_buff *skb)
-	__releases(child)
-{
-	int ret = 0;
-	const int state = child->sk_state;
-
-	if (!sock_owned_by_user(child)) {
-		ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
-					     skb->len);
-
-		/* Wakeup parent, send SIGIO */
-		if (state == DCCP_RESPOND && child->sk_state != state)
-			parent->sk_data_ready(parent);
-	} else {
-		/* Alas, it is possible again, because we do a lookup
-		 * in the main socket hash table and the lock on the
-		 * listening socket no longer protects us.
- */ - __sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - sock_put(child); - return ret; -} - -EXPORT_SYMBOL_GPL(dccp_child_process); - -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk) -{ - DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); - -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) -{ - struct dccp_request_sock *dreq = dccp_rsk(req); - - spin_lock_init(&dreq->dreq_lock); - inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); - inet_rsk(req)->acked = 0; - dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c deleted file mode 100644 index db62d47670249..0000000000000 --- a/net/dccp/options.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/options.c - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Aristeu Sergio Rozanski Filho - * Copyright (c) 2005 Arnaldo Carvalho de Melo - * Copyright (c) 2005 Ian McDonald - */ -#include -#include -#include -#include -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -u64 dccp_decode_value_var(const u8 *bf, const u8 len) -{ - u64 value = 0; - - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; - if (len > 3) - value += ((u64)*bf++) << 24; - if (len > 2) - value += ((u64)*bf++) << 16; - if (len > 1) - value += ((u64)*bf++) << 8; - if (len > 0) - value += *bf; - - return value; -} - -/** - * dccp_parse_options - Parse DCCP options present in @skb - * @sk: client|server|listening dccp socket (when @dreq != NULL) - * @dreq: request socket to use during connection setup, or NULL - * @skb: frame to parse - */ -int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_hdr *dh = dccp_hdr(skb); - const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - struct dccp_options_received *opt_recv = &dp->dccps_options_received; - unsigned char opt, len; - unsigned char *value; - u32 elapsed_time; - __be32 opt_val; - int rc; - int mandatory = 0; - - memset(opt_recv, 0, sizeof(*opt_recv)); - - opt = len = 0; - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_nonsensical_length; - - len = *opt_ptr++; - if (len < 2) - goto out_nonsensical_length; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_nonsensical_length; - } - - /* - * CCID-specific options are ignored during connection setup, as - * negotiation may still be in progress (see RFC 4340, 10.3). - * The same applies to Ack Vectors, as these depend on the CCID. 
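dccp_decode_value_var() above, together with dccp_encode_value_var() further down in this file, implements the big-endian, variable-length (up to 6-byte) integers used by options such as NDP Count. A self-contained round-trip, using loop forms that are equivalent to the kernel's unrolled versions for len <= 6:

#include <stdint.h>
#include <stdio.h>

static void encode_value_var(uint64_t value, uint8_t *to, uint8_t len)
{
	while (len-- > 0)			/* most significant byte first */
		*to++ = (value >> (8 * len)) & 0xFF;
}

static uint64_t decode_value_var(const uint8_t *bf, uint8_t len)
{
	uint64_t value = 0;

	while (len-- > 0)
		value = (value << 8) | *bf++;
	return value;
}

int main(void)
{
	uint8_t buf[6];

	encode_value_var(0x0102030405ULL, buf, 5);
	/* prints 102030405: the value survives the round trip */
	printf("%llx\n", (unsigned long long)decode_value_var(buf, 5));
	return 0;
}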
- */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || - opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) - goto ignore_option; - - switch (opt) { - case DCCPO_PADDING: - break; - case DCCPO_MANDATORY: - if (mandatory) - goto out_invalid_option; - if (pkt_type != DCCP_PKT_DATA) - mandatory = 1; - break; - case DCCPO_NDP_COUNT: - if (len > 6) - goto out_invalid_option; - - opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), - (unsigned long long)opt_recv->dccpor_ndp); - break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ - break; - if (len == 0) - goto out_invalid_option; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; - break; - case DCCPO_TIMESTAMP: - if (len != 4) - goto out_invalid_option; - /* - * RFC 4340 13.1: "The precise time corresponding to - * Timestamp Value zero is not specified". We use - * zero to indicate absence of a meaningful timestamp. - */ - opt_val = get_unaligned((__be32 *)value); - if (unlikely(opt_val == 0)) { - DCCP_WARN("Timestamp with zero value\n"); - break; - } - - if (dreq != NULL) { - dreq->dreq_timestamp_echo = ntohl(opt_val); - dreq->dreq_timestamp_time = dccp_timestamp(); - } else { - opt_recv->dccpor_timestamp = - dp->dccps_timestamp_echo = ntohl(opt_val); - dp->dccps_timestamp_time = dccp_timestamp(); - } - dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), ntohl(opt_val), - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); - break; - case DCCPO_TIMESTAMP_ECHO: - if (len != 4 && len != 6 && len != 8) - goto out_invalid_option; - - opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp_echo = ntohl(opt_val); - - dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu", dccp_role(sk), - opt_recv->dccpor_timestamp_echo, - len + 2, - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - - value += 4; - - if (len == 4) { /* no elapsed time included */ - dccp_pr_debug_cat("\n"); - break; - } - - if (len == 6) { /* 2-byte elapsed time */ - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else { /* 4-byte elapsed time */ - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } - - dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); - - /* Give precedence to the biggest ELAPSED_TIME */ - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - break; - case DCCPO_ELAPSED_TIME: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ - break; - - if (len == 2) { - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else if (len == 4) { - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } else { - goto out_invalid_option; - } - - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - - dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", - dccp_role(sk), elapsed_time); - break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... 
DCCPO_MAX_RX_CCID_SPECIFIC: - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - */ - fallthrough; - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - default: - DCCP_CRIT("DCCP(%p): option %d(len=%d) not " - "implemented, ignoring", sk, opt, len); - break; - } -ignore_option: - if (opt != DCCPO_MANDATORY) - mandatory = 0; - } - - /* mandatory was the last byte in option list -> reset connection */ - if (mandatory) - goto out_invalid_option; - -out_nonsensical_length: - /* RFC 4340, 5.8: ignore option and all remaining option space */ - return 0; - -out_invalid_option: - DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; - DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; - DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; - DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; - return -1; -} - -EXPORT_SYMBOL_GPL(dccp_parse_options); - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) -{ - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; - if (len > 3) - *to++ = (value & 0xFF000000) >> 24; - if (len > 2) - *to++ = (value & 0xFF0000) >> 16; - if (len > 1) - *to++ = (value & 0xFF00) >> 8; - if (len > 0) - *to++ = (value & 0xFF); -} - -static inline u8 dccp_ndp_len(const u64 ndp) -{ - if (likely(ndp <= 0xFF)) - return 1; - return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); -} - -int dccp_insert_option(struct sk_buff *skb, const unsigned char option, - const void *value, const unsigned char len) -{ - unsigned char *to; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; - - to = skb_push(skb, len + 2); - *to++ = option; - *to++ = len + 2; - - memcpy(to, value, len); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option); - -static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - u64 ndp = dp->dccps_ndp_count; - - if (dccp_non_data_packet(skb)) - ++dp->dccps_ndp_count; - else - dp->dccps_ndp_count = 0; - - if (ndp > 0) { - unsigned char *ptr; - const int ndp_len = dccp_ndp_len(ndp); - const int len = ndp_len + 2; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - ptr = skb_push(skb, len); - *ptr++ = DCCPO_NDP_COUNT; - *ptr++ = len; - dccp_encode_value_var(ndp, ptr, ndp_len); - } - - return 0; -} - -static inline int dccp_elapsed_time_len(const u32 elapsed_time) -{ - return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 
2 : 4;
-}
-
-static int dccp_insert_option_timestamp(struct sk_buff *skb)
-{
-	__be32 now = htonl(dccp_timestamp());
-	/* yes this will overflow but that is the point as we want a
-	 * 10 usec 32 bit timer which means it wraps every 11.9 hours */
-
-	return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
-}
-
-static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
-					     struct dccp_request_sock *dreq,
-					     struct sk_buff *skb)
-{
-	__be32 tstamp_echo;
-	unsigned char *to;
-	u32 elapsed_time, elapsed_time_len, len;
-
-	if (dreq != NULL) {
-		elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
-		tstamp_echo = htonl(dreq->dreq_timestamp_echo);
-		dreq->dreq_timestamp_echo = 0;
-	} else {
-		elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
-		tstamp_echo = htonl(dp->dccps_timestamp_echo);
-		dp->dccps_timestamp_echo = 0;
-	}
-
-	elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
-	len = 6 + elapsed_time_len;
-
-	if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
-		return -1;
-
-	DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
-	to = skb_push(skb, len);
-	*to++ = DCCPO_TIMESTAMP_ECHO;
-	*to++ = len;
-
-	memcpy(to, &tstamp_echo, 4);
-	to += 4;
-
-	if (elapsed_time_len == 2) {
-		const __be16 var16 = htons((u16)elapsed_time);
-		memcpy(to, &var16, 2);
-	} else if (elapsed_time_len == 4) {
-		const __be32 var32 = htonl(elapsed_time);
-		memcpy(to, &var32, 4);
-	}
-
-	return 0;
-}
-
-static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
-{
-	struct dccp_sock *dp = dccp_sk(sk);
-	struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
-	struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-	const u16 buflen = dccp_ackvec_buflen(av);
-	/* Figure out how many options we need to represent the ackvec */
-	const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
-	u16 len = buflen + 2 * nr_opts;
-	u8 i, nonce = 0;
-	const unsigned char *tail, *from;
-	unsigned char *to;
-
-	if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
-		DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
-			  dccp_packet_name(dcb->dccpd_type));
-		return -1;
-	}
-	/*
-	 * Since Ack Vectors are variable-length, we cannot always predict
-	 * their size. To catch exception cases where the space is running out
-	 * on the skb, a separate Sync is scheduled to carry the Ack Vector.
-	 */
-	if (len > DCCPAV_MIN_OPTLEN &&
-	    len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
-		DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
-			  "MPS=%u ==> reduce payload size?\n", len, skb->len,
-			  dcb->dccpd_opt_len, dp->dccps_mss_cache);
-		dp->dccps_sync_scheduled = 1;
-		return 0;
-	}
-	dcb->dccpd_opt_len += len;
-
-	to = skb_push(skb, len);
-	len = buflen;
-	from = av->av_buf + av->av_buf_head;
-	tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-
-	for (i = 0; i < nr_opts; ++i) {
-		int copylen = len;
-
-		if (len > DCCP_SINGLE_OPT_MAXLEN)
-			copylen = DCCP_SINGLE_OPT_MAXLEN;
-
-		/*
-		 * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
-		 * its type; ack_nonce is the sum of all individual buf_nonce's.
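The sizing above (len = buflen + 2 * nr_opts) accounts for splitting the Ack Vector buffer into options of at most DCCP_SINGLE_OPT_MAXLEN payload bytes, each adding two header bytes. A sketch of that arithmetic, with 253 assumed for DCCP_SINGLE_OPT_MAXLEN (a 255-byte option minus the type and length bytes):

#include <stdio.h>

#define SINGLE_OPT_MAXLEN 253	/* assumed: 255-byte option minus 2 header bytes */

static void ackvec_sizing(unsigned int buflen)
{
	unsigned int nr_opts = (buflen + SINGLE_OPT_MAXLEN - 1) / SINGLE_OPT_MAXLEN;

	printf("buflen=%u -> %u option(s), %u bytes total\n",
	       buflen, nr_opts, buflen + 2 * nr_opts);
}

int main(void)
{
	ackvec_sizing(100);	/* 1 option,  102 bytes */
	ackvec_sizing(300);	/* 2 options, 304 bytes */
	return 0;
}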
- */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} - -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * @skb: frame into which to insert option - * - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; - return 0; -} - -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @skb: frame to insert feature negotiation option into - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) -{ - u8 tot_len, *to; - - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; - } - - if (unlikely(val == NULL || len == 0)) - len = repeat_first = false; - tot_len = 3 + repeat_first + len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; - } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); - return 0; -} - -/* The length of all options needs to be a multiple of 4 (5.8) */ -static void dccp_insert_option_padding(struct sk_buff *skb) -{ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; - - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } -} - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. 
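dccp_insert_option_padding() above enforces RFC 4340, 5.8: the options area must be a multiple of 4 bytes. The amount of zero padding it adds reduces to a small closed form, shown in this standalone sketch:

#include <stdio.h>

static unsigned int option_padding(unsigned int opt_len)
{
	/* same result as: p = opt_len % 4; p ? 4 - p : 0 */
	return (4 - opt_len % 4) % 4;
}

int main(void)
{
	unsigned int len;

	for (len = 0; len <= 5; len++)
		printf("opt_len=%u -> pad %u\n", len, option_padding(len));
	return 0;
}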
- * This is currently used for TFRC initialisation. - */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } - } - - if (dp->dccps_hc_rx_insert_options) { - if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) - return -1; - dp->dccps_hc_rx_insert_options = 0; - } - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(dp, NULL, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} - -int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) -{ - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - - /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - if (dreq->dreq_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(NULL, dreq, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} diff --git a/net/dccp/output.c b/net/dccp/output.c deleted file mode 100644 index 39cf3430177ac..0000000000000 --- a/net/dccp/output.c +++ /dev/null @@ -1,708 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/output.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include - -#include -#include - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -static inline void dccp_event_ack_sent(struct sock *sk) -{ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); -} - -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - skb_set_owner_w(skb, sk); - WARN_ON(sk->sk_send_head); - sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); -} - -/* - * All SKB's seen here are completely headerless. It is our - * job to build the DCCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - */ -static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if (likely(skb != NULL)) { - struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - struct dccp_hdr *dh; - /* XXX For now we're using only 48 bits sequence numbers */ - const u32 dccp_header_size = sizeof(*dh) + - sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); - int err, set_ack = 1; - u64 ackno = dp->dccps_gsr; - /* - * Increment GSS here already in case the option code needs it. - * Update GSS for real only if option processing below succeeds. - */ - dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); - - switch (dcb->dccpd_type) { - case DCCP_PKT_DATA: - set_ack = 0; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_RESET: - break; - - case DCCP_PKT_REQUEST: - set_ack = 0; - /* Use ISS on the first (non-retransmitted) Request. */ - if (icsk->icsk_retransmits == 0) - dcb->dccpd_seq = dp->dccps_iss; - fallthrough; - - case DCCP_PKT_SYNC: - case DCCP_PKT_SYNCACK: - ackno = dcb->dccpd_ack_seq; - fallthrough; - default: - /* - * Set owner/destructor: some skbs are allocated via - * alloc_skb (e.g. when retransmission may happen). - * Only Data, DataAck, and Reset packets should come - * through here with skb->sk set. 
- */ - WARN_ON(skb->sk); - skb_set_owner_w(skb, sk); - break; - } - - if (dccp_insert_options(sk, skb)) { - kfree_skb(skb); - return -EPROTO; - } - - - /* Build DCCP header and checksum it. */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_type = dcb->dccpd_type; - dh->dccph_sport = inet->inet_sport; - dh->dccph_dport = inet->inet_dport; - dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; - dh->dccph_ccval = dcb->dccpd_ccval; - dh->dccph_cscov = dp->dccps_pcslen; - /* XXX For now we're using only 48 bits sequence numbers */ - dh->dccph_x = 1; - - dccp_update_gss(sk, dcb->dccpd_seq); - dccp_hdr_set_seq(dh, dp->dccps_gss); - if (set_ack) - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); - - switch (dcb->dccpd_type) { - case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = - dp->dccps_service; - /* - * Limit Ack window to ISS <= P.ackno <= GSS, so that - * only Responses to Requests we sent are considered. - */ - dp->dccps_awl = dp->dccps_iss; - break; - case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = - dcb->dccpd_reset_code; - break; - } - - icsk->icsk_af_ops->send_check(sk, skb); - - if (set_ack) - dccp_event_ack_sent(sk); - - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - return net_xmit_eval(err); - } - return -ENOBUFS; -} - -/** - * dccp_determine_ccmps - Find out about CCID-specific packet-size limits - * @dp: socket to find packet size limits of - * - * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), - * since the RX CCID is restricted to feedback packets (Acks), which are small - * in comparison with the data traffic. A value of 0 means "no current CCMPS". - */ -static u32 dccp_determine_ccmps(const struct dccp_sock *dp) -{ - const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; - - if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) - return 0; - return tx_ccid->ccid_ops->ccid_ccmps; -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - - /* Account for header lengths and IPv4/v6 option overhead */ - cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + - sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); - - /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) - */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? 
DCCPAV_MIN_OPTLEN : 0), 4); - - /* And store cached results */ - icsk->icsk_pmtu_cookie = pmtu; - WRITE_ONCE(dp->dccps_mss_cache, cur_mps); - - return cur_mps; -} - -EXPORT_SYMBOL_GPL(dccp_sync_mss); - -void dccp_write_space(struct sock *sk) -{ - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible(&wq->wait); - /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); - - rcu_read_unlock(); -} - -/** - * dccp_wait_for_ccid - Await CCID send permission - * @sk: socket to wait for - * @delay: timeout in jiffies - * - * This is used by CCIDs which need to delay the send time in process context. - */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) -{ - DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); - - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk_sleep(sk), &wait); - - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * @sk: socket to send data packet on - * - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); - - if (unlikely(skb == NULL)) - return; - len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); -} - -/** - * dccp_flush_write_queue - Drain queue at end of connection - * @sk: socket to be drained - * @time_budget: time allowed to drain the queue - * - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. 
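A worked instance of the headroom roundup in dccp_sync_mss() above, assuming NDP counts are active and taking 16 for DCCPAV_MIN_OPTLEN (the estimated minimum Ack Vector option length; the value is assumed here, see net/dccp/ackvec.h):

#include <stdio.h>

#define ROUNDUP4(x) (((x) + 3U) & ~3U)

int main(void)
{
	/* 1 (Slow Receiver) + 6 (Timestamp) + 10 (Timestamp Echo)
	 * + 8 (NDP count, when active) + 6 (Data Checksum)
	 * + 16 (DCCPAV_MIN_OPTLEN, assumed) = 47 -> rounded up to 48 */
	unsigned int headroom = ROUNDUP4(1 + 6 + 10 + 1 * 8 + 6 + 16);

	printf("option headroom: %u bytes\n", headroom);	/* 48 */
	return 0;
}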
We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); - } - } -} - -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); - } - } -} - -/** - * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets - * @sk: socket to perform retransmit on - * - * There are only four retransmittable packet types in DCCP: - * - Request in client-REQUEST state (sec. 8.1.1), - * - CloseReq in server-CLOSEREQ state (sec. 8.3), - * - Close in node-CLOSING state (sec. 8.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). - * This function expects sk->sk_send_head to contain the original skb. - */ -int dccp_retransmit_skb(struct sock *sk) -{ - WARN_ON(sk->sk_send_head == NULL); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) - return -EHOSTUNREACH; /* Routing failure or similar. */ - - /* this count is used to distinguish original and retransmitted skb */ - inet_csk(sk)->icsk_retransmits++; - - return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); -} - -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req) -{ - struct dccp_hdr *dh; - struct dccp_request_sock *dreq; - const u32 dccp_header_size = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_response); - struct sk_buff *skb; - - /* sk is marked const to clearly express we dont hold socket lock. - * sock_wmalloc() will atomically change sk->sk_wmem_alloc, - * it is safe to promote sk to non const. 
- */ - skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, - GFP_ATOMIC); - if (!skb) - return NULL; - - skb_reserve(skb, MAX_DCCP_HEADER); - - skb_dst_set(skb, dst_clone(dst)); - - dreq = dccp_rsk(req); - if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ - dccp_inc_seqno(&dreq->dreq_gss); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; - DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; - - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; - - /* Build and checksum header */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - - dh->dccph_sport = htons(inet_rsk(req)->ir_num); - dh->dccph_dport = inet_rsk(req)->ir_rmt_port; - dh->dccph_doff = (dccp_header_size + - DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; - dh->dccph_type = DCCP_PKT_RESPONSE; - dh->dccph_x = 1; - dccp_hdr_set_seq(dh, dreq->dreq_gss); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); - dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; - - dccp_csum_outgoing(skb); - - /* We use `acked' to remember that a Response was already sent. */ - inet_rsk(req)->acked = 1; - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - return skb; -response_failed: - kfree_skb(skb); - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_make_response); - -/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) -{ - struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); - const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_reset); - struct dccp_hdr_reset *dhr; - struct sk_buff *skb; - - skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - skb_reserve(skb, sk->sk_prot->max_header); - - /* Swap the send and the receive. */ - dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); - dh->dccph_type = DCCP_PKT_RESET; - dh->dccph_sport = rxdh->dccph_dport; - dh->dccph_dport = rxdh->dccph_sport; - dh->dccph_doff = dccp_hdr_reset_len / 4; - dh->dccph_x = 1; - - dhr = dccp_hdr_reset(skb); - dhr->dccph_reset_code = dcb->dccpd_reset_code; - - switch (dcb->dccpd_reset_code) { - case DCCP_RESET_CODE_PACKET_ERROR: - dhr->dccph_reset_data[0] = rxdh->dccph_type; - break; - case DCCP_RESET_CODE_OPTION_ERROR: - case DCCP_RESET_CODE_MANDATORY_ERROR: - memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); - break; - } - /* - * From RFC 4340, 8.3.1: - * If P.ackno exists, set R.seqno := P.ackno + 1. - * Else set R.seqno := 0. - */ - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); - - dccp_csum_outgoing(skb); - return skb; -} - -EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); - -/* send Reset on established socket, to close or abort the connection */ -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) -{ - struct sk_buff *skb; - /* - * FIXME: what if rebuild_header fails? - * Should we be doing a rebuild_header here? - */ - int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); - - if (err != 0) - return err; - - skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOBUFS; - - /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; - DCCP_SKB_CB(skb)->dccpd_reset_code = code; - - return dccp_transmit_skb(sk, skb); -} - -/* - * Do all connect socket setups that can be done AF independent. - */ -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct dccp_sock *dp = dccp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - sk->sk_err = 0; - sock_reset_flag(sk, SOCK_DONE); - - dccp_sync_mss(sk, dst_mtu(dst)); - - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); - if (unlikely(skb == NULL)) - return -ENOBUFS; - - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); - - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); - DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); - - /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, DCCP_RTO_MAX); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_connect); - -void dccp_send_ack(struct sock *sk) -{ - /* If we have been reset, we may not send again. */ - if (sk->sk_state != DCCP_CLOSED) { - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, - GFP_ATOMIC); - - if (skb == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, - DCCP_RTO_MAX); - return; - } - - /* Reserve space for headers */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - dccp_transmit_skb(sk, skb); - } -} - -EXPORT_SYMBOL_GPL(dccp_send_ack); - -#if 0 -/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ -void dccp_send_delayed_ack(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - /* - * FIXME: tune this timer. elapsed time fixes the skew, so no problem - * with using 2s, and active senders also piggyback the ACK into a - * DATAACK packet, so this is really for quiescent senders. - */ - unsigned long timeout = jiffies + 2 * HZ; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - * - * FIXME: check the "about to expire" part - */ - if (icsk->icsk_ack.blocked) { - dccp_send_ack(sk); - return; - } - - if (!time_before(timeout, icsk_delack_timeout(icsk))) - timeout = icsk_delack_timeout(icsk); - } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); -} -#endif - -void dccp_send_sync(struct sock *sk, const u64 ackno, - const enum dccp_pkt_type pkt_type) -{ - /* - * We are not putting this on the write queue, so - * dccp_transmit_skb() will set the ownership to this - * sock. - */ - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - - if (skb == NULL) { - /* FIXME: how to make sure the sync is sent? */ - DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); - return; - } - - /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = pkt_type; - DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - - dccp_transmit_skb(sk, skb); -} - -EXPORT_SYMBOL_GPL(dccp_send_sync); - -/* - * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This - * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under - * any circumstances. - */ -void dccp_send_close(struct sock *sk, const int active) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; - - skb = alloc_skb(sk->sk_prot->max_header, prio); - if (skb == NULL) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; - else - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; - - if (active) { - skb = dccp_skb_entail(sk, skb); - /* - * Retransmission timer for active-close: RFC 4340, 8.3 requires - * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ - * state can be left. The initial timeout is 2 RTTs. - * Since RTT measurement is done by the CCIDs, there is no easy - * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 - * is too low (200ms); we use a high value to avoid unnecessary - * retransmissions when the link RTT is > 0.2 seconds. - * FIXME: Let main module sample RTTs and use that instead. - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c deleted file mode 100644 index fcc5c9d64f466..0000000000000 --- a/net/dccp/proto.c +++ /dev/null @@ -1,1293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/proto.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; - -EXPORT_SYMBOL_GPL(dccp_statistics); - -DEFINE_PER_CPU(unsigned int, dccp_orphan_count); -EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); - -struct inet_hashinfo dccp_hashinfo; -EXPORT_SYMBOL_GPL(dccp_hashinfo); - -/* the maximum queue length for tx in packets. 
0 is no limit */ -int sysctl_dccp_tx_qlen __read_mostly = 5; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} -#endif - -void dccp_set_state(struct sock *sk, const int state) -{ - const int oldstate = sk->sk_state; - - dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, - dccp_state_name(oldstate), dccp_state_name(state)); - WARN_ON(state == oldstate); - - switch (state) { - case DCCP_OPEN: - if (oldstate != DCCP_OPEN) - DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); - break; - - case DCCP_CLOSED: - if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || - oldstate == DCCP_CLOSING) - DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); - - sk->sk_prot->unhash(sk); - if (inet_csk(sk)->icsk_bind_hash != NULL && - !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - inet_put_port(sk); - fallthrough; - default: - if (oldstate == DCCP_OPEN) - DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); - } - - /* Change state AFTER socket is unhashed to avoid closed - * socket sitting in hash tables. - */ - inet_sk_set_state(sk, state); -} - -EXPORT_SYMBOL_GPL(dccp_set_state); - -static void dccp_finish_passive_close(struct sock *sk) -{ - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - /* Node (client or server) has received Close packet. */ - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_set_state(sk, DCCP_CLOSED); - break; - case DCCP_PASSIVE_CLOSEREQ: - /* - * Client received CloseReq. We set the `active' flag so that - * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. 
- */ - dccp_send_close(sk, 1); - dccp_set_state(sk, DCCP_CLOSING); - } -} - -void dccp_done(struct sock *sk) -{ - dccp_set_state(sk, DCCP_CLOSED); - dccp_clear_xmit_timers(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - else - inet_csk_destroy_sock(sk); -} - -EXPORT_SYMBOL_GPL(dccp_done); - -const char *dccp_packet_name(const int type) -{ - static const char *const dccp_packet_names[] = { - [DCCP_PKT_REQUEST] = "REQUEST", - [DCCP_PKT_RESPONSE] = "RESPONSE", - [DCCP_PKT_DATA] = "DATA", - [DCCP_PKT_ACK] = "ACK", - [DCCP_PKT_DATAACK] = "DATAACK", - [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", - [DCCP_PKT_CLOSE] = "CLOSE", - [DCCP_PKT_RESET] = "RESET", - [DCCP_PKT_SYNC] = "SYNC", - [DCCP_PKT_SYNCACK] = "SYNCACK", - }; - - if (type >= DCCP_NR_PKT_TYPES) - return "INVALID"; - else - return dccp_packet_names[type]; -} - -EXPORT_SYMBOL_GPL(dccp_packet_name); - -void dccp_destruct_common(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = NULL; -} -EXPORT_SYMBOL_GPL(dccp_destruct_common); - -static void dccp_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet_sock_destruct(sk); -} - -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - sk->sk_destruct = dccp_sk_destruct; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; - - dccp_init_xmit_timers(sk); - - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_init_sock); - -void dccp_destroy_sock(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - /* Clean up a referenced DCCP bind bucket. */ - if (inet_csk(sk)->icsk_bind_hash != NULL) - inet_put_port(sk); - - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - - if (dp->dccps_hc_rx_ackvec != NULL) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_destroy_sock); - -static inline int dccp_need_reset(int state) -{ - return state != DCCP_CLOSED && state != DCCP_LISTEN && - state != DCCP_REQUESTING; -} - -int dccp_disconnect(struct sock *sk, int flags) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - const int old_state = sk->sk_state; - - if (old_state != DCCP_CLOSED) - dccp_set_state(sk, DCCP_CLOSED); - - /* - * This corresponds to the ABORT function of RFC793, sec. 3.8 - * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". 
- */ - if (old_state == DCCP_LISTEN) { - inet_csk_listen_stop(sk); - } else if (dccp_need_reset(old_state)) { - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - sk->sk_err = ECONNRESET; - } else if (old_state == DCCP_REQUESTING) - sk->sk_err = ECONNRESET; - - dccp_clear_xmit_timers(sk); - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - __kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - inet->inet_dport = 0; - - inet_bhash2_reset_saddr(sk); - - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - - icsk->icsk_backoff = 0; - inet_csk_delack_init(sk); - __sk_dst_reset(sk); - - WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - - sk_error_report(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_disconnect); - -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - __poll_t mask; - u8 shutdown; - int state; - - sock_poll_wait(file, sock, wait); - - state = inet_sk_state_load(sk); - if (state == DCCP_LISTEN) - return inet_csk_listen_poll(sk); - - /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. - */ - - mask = 0; - if (READ_ONCE(sk->sk_err)) - mask = EPOLLERR; - shutdown = READ_ONCE(sk->sk_shutdown); - - if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) - mask |= EPOLLHUP; - if (shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - - /* Connected? */ - if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { - if (atomic_read(&sk->sk_rmem_alloc) > 0) - mask |= EPOLLIN | EPOLLRDNORM; - - if (!(shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { - mask |= EPOLLOUT | EPOLLWRNORM; - } else { /* send SIGIO later */ - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* Race breaker. If space is freed after - * wspace test but before the flags are set, - * IO signal will be lost. - */ - if (sk_stream_is_writeable(sk)) - mask |= EPOLLOUT | EPOLLWRNORM; - } - } - } - return mask; -} -EXPORT_SYMBOL_GPL(dccp_poll); - -int dccp_ioctl(struct sock *sk, int cmd, int *karg) -{ - int rc = -ENOTCONN; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) - goto out; - - switch (cmd) { - case SIOCOUTQ: { - *karg = sk_wmem_alloc_get(sk); - /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and - * always 0, comparably to UDP. - */ - - rc = 0; - } - break; - case SIOCINQ: { - struct sk_buff *skb; - *karg = 0; - - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount of this packet since - * that is all that will be read. 
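Unlike TCP's byte-count semantics, SIOCINQ here reports only the length of the packet at the head of the receive queue, since recvmsg() consumes one datagram at a time. A hypothetical userspace caller might query both queues as below (fd is assumed to be a connected DCCP socket; SIOCINQ and SIOCOUTQ come from linux/sockios.h):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static void report_queues(int fd)
{
	int inq = 0, outq = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0)	/* head packet only, not a total */
		printf("next packet: %d bytes\n", inq);
	if (ioctl(fd, SIOCOUTQ, &outq) == 0)	/* sk_wmem_alloc, as noted above */
		printf("queued for tx: %d bytes\n", outq);
}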
- */ - *karg = skb->len; - } - rc = 0; - } - break; - default: - rc = -ENOIOCTLCMD; - break; - } -out: - release_sock(sk); - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_ioctl); - -static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_service_list *sl = NULL; - - if (service == DCCP_SERVICE_INVALID_VALUE || - optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) - return -EINVAL; - - if (optlen > sizeof(service)) { - sl = kmalloc(optlen, GFP_KERNEL); - if (sl == NULL) - return -ENOMEM; - - sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_sockptr_offset(sl->dccpsl_list, optval, - sizeof(service), optlen - sizeof(service)) || - dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { - kfree(sl); - return -EFAULT; - } - } - - lock_sock(sk); - dp->dccps_service = service; - - kfree(dp->dccps_service_list); - - dp->dccps_service_list = sl; - release_sock(sk); - return 0; -} - -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) -{ - u8 *list, len; - int i, rc; - - if (cscov < 0 || cscov > 15) - return -EINVAL; - /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. - */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - sockptr_t optval, unsigned int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) - return -EINVAL; - - val = memdup_sockptr(optval, optlen); - if (IS_ERR(val)) - return PTR_ERR(val); - - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); - - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); - - kfree(val); - return rc; -} - -static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - int val, err = 0; - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - if (optname == DCCP_SOCKOPT_SERVICE) - return dccp_setsockopt_service(sk, val, optval, optlen); - - lock_sock(sk); - switch (optname) { - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - if (dp->dccps_role != DCCP_ROLE_SERVER) - err = -EOPNOTSUPP; - else - dp->dccps_server_timewait = (val != 0); - break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = 
dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) - err = -EINVAL; - else - dp->dccps_qpolicy = val; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) - err = -EINVAL; - else - dp->dccps_tx_qlen = val; - break; - default: - err = -ENOPROTOOPT; - break; - } - release_sock(sk); - - return err; -} - -int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_setsockopt); - -static int dccp_getsockopt_service(struct sock *sk, int len, - __be32 __user *optval, - int __user *optlen) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_service_list *sl; - int err = -ENOENT, slen = 0, total_len = sizeof(u32); - - lock_sock(sk); - if ((sl = dp->dccps_service_list) != NULL) { - slen = sl->dccpsl_nr * sizeof(u32); - total_len += slen; - } - - err = -EINVAL; - if (total_len > len) - goto out; - - err = 0; - if (put_user(total_len, optlen) || - put_user(dp->dccps_service, optval) || - (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) - err = -EFAULT; -out: - release_sock(sk); - return err; -} - -static int do_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct dccp_sock *dp; - int val, len; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < (int)sizeof(int)) - return -EINVAL; - - dp = dccp_sk(sk); - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_SERVICE: - return dccp_getsockopt_service(sk, len, - (__be32 __user *)optval, optlen); - case DCCP_SOCKOPT_GET_CUR_MPS: - val = READ_ONCE(dp->dccps_mss_cache); - break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - val = dp->dccps_server_timewait; - break; - case DCCP_SOCKOPT_SEND_CSCOV: - val = dp->dccps_pcslen; - break; - case DCCP_SOCKOPT_RECV_CSCOV: - val = dp->dccps_pcrlen; - break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; - case 128 ... 191: - return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - case 192 ... 
255: - return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - default: - return -ENOPROTOOPT; - } - - len = sizeof(val); - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_getsockopt); - -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg; - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const int flags = msg->msg_flags; - const int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; - int rc, size; - long timeo; - - trace_dccp_probe(sk, len); - - if (len > READ_ONCE(dp->dccps_mss_cache)) - return -EMSGSIZE; - - lock_sock(sk); - - timeo = sock_sndtimeo(sk, noblock); - - /* - * We have to use sk_stream_wait_connect here to set sk_write_pending, - * so that the trick in dccp_rcv_request_sent_state_process. - */ - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_release; - - size = sk->sk_prot->max_header + len; - release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - lock_sock(sk); - if (skb == NULL) - goto out_release; - - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_discard; - } - - if (sk->sk_state == DCCP_CLOSED) { - rc = -ENOTCONN; - goto out_discard; - } - - /* We need to check dccps_mss_cache after socket is locked. */ - if (len > dp->dccps_mss_cache) { - rc = -EMSGSIZE; - goto out_discard; - } - - skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc != 0) - goto out_discard; - - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - /* - * The xmit_timer is set if the TX CCID is rate-based and will expire - * when congestion control permits to release further packets into the - * network. Window-based CCIDs do not use this timer. - */ - if (!timer_pending(&dp->dccps_xmit_timer)) - dccp_write_xmit(sk); -out_release: - release_sock(sk); - return rc ? 
: len; -out_discard: - kfree_skb(skb); - goto out_release; -} - -EXPORT_SYMBOL_GPL(dccp_sendmsg); - -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) -{ - const struct dccp_hdr *dh; - long timeo; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) { - len = -ENOTCONN; - goto out; - } - - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - if (skb == NULL) - goto verify_sock_status; - - dh = dccp_hdr(skb); - - switch (dh->dccph_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - goto found_ok_skb; - - case DCCP_PKT_CLOSE: - case DCCP_PKT_CLOSEREQ: - if (!(flags & MSG_PEEK)) - dccp_finish_passive_close(sk); - fallthrough; - case DCCP_PKT_RESET: - dccp_pr_debug("found fin (%s) ok!\n", - dccp_packet_name(dh->dccph_type)); - len = 0; - goto found_fin_ok; - default: - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); - } -verify_sock_status: - if (sock_flag(sk, SOCK_DONE)) { - len = 0; - break; - } - - if (sk->sk_err) { - len = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - len = 0; - break; - } - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - len = -ENOTCONN; - break; - } - len = 0; - break; - } - - if (!timeo) { - len = -EAGAIN; - break; - } - - if (signal_pending(current)) { - len = sock_intr_errno(timeo); - break; - } - - sk_wait_data(sk, &timeo, NULL); - continue; - found_ok_skb: - if (len > skb->len) - len = skb->len; - else if (len < skb->len) - msg->msg_flags |= MSG_TRUNC; - - if (skb_copy_datagram_msg(skb, 0, msg, len)) { - /* Exception. Bailout! */ - len = -EFAULT; - break; - } - if (flags & MSG_TRUNC) - len = skb->len; - found_fin_ok: - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - break; - } while (1); -out: - release_sock(sk); - return len; -} - -EXPORT_SYMBOL_GPL(dccp_recvmsg); - -int inet_dccp_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) - goto out; - - WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. 
- */ - if (old_state != DCCP_LISTEN) { - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_role = DCCP_ROLE_LISTEN; - - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) { - err = -EPROTO; - goto out; - } - - err = inet_csk_listen_start(sk); - if (err) - goto out; - } - err = 0; - -out: - release_sock(sk); - return err; -} - -EXPORT_SYMBOL_GPL(inet_dccp_listen); - -static void dccp_terminate_connection(struct sock *sk) -{ - u8 next_state = DCCP_CLOSED; - - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - case DCCP_PASSIVE_CLOSEREQ: - dccp_finish_passive_close(sk); - break; - case DCCP_PARTOPEN: - dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - fallthrough; - case DCCP_OPEN: - dccp_send_close(sk, 1); - - if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && - !dccp_sk(sk)->dccps_server_timewait) - next_state = DCCP_ACTIVE_CLOSEREQ; - else - next_state = DCCP_CLOSING; - fallthrough; - default: - dccp_set_state(sk, next_state); - } -} - -void dccp_close(struct sock *sk, long timeout) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - u32 data_was_unread = 0; - int state; - - lock_sock(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (sk->sk_state == DCCP_LISTEN) { - dccp_set_state(sk, DCCP_CLOSED); - - /* Special case. */ - inet_csk_listen_stop(sk); - - goto adjudge_to_death; - } - - sk_stop_timer(sk, &dp->dccps_xmit_timer); - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - *reader process may not have drained the data yet! - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - data_was_unread += skb->len; - __kfree_skb(skb); - } - - /* If socket has been already reset kill it. */ - if (sk->sk_state == DCCP_CLOSED) - goto adjudge_to_death; - - if (data_was_unread) { - /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_set_state(sk, DCCP_CLOSED); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); - } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); - dccp_terminate_connection(sk); - } - - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - - sk_stream_wait_close(sk, timeout); - -adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - - /* - * It is the last release_sock in its life. It will remove backlog. - */ - release_sock(sk); - /* - * Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ - local_bh_disable(); - bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); - - this_cpu_inc(dccp_orphan_count); - - /* Have we already been destroyed by a softirq or backlog? 
*/ - if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) - goto out; - - if (sk->sk_state == DCCP_CLOSED) - inet_csk_destroy_sock(sk); - - /* Otherwise, socket is reprieved until protocol close. */ - -out: - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); -} - -EXPORT_SYMBOL_GPL(dccp_close); - -void dccp_shutdown(struct sock *sk, int how) -{ - dccp_pr_debug("called shutdown(%x)\n", how); -} - -EXPORT_SYMBOL_GPL(dccp_shutdown); - -static inline int __init dccp_mib_init(void) -{ - dccp_statistics = alloc_percpu(struct dccp_mib); - if (!dccp_statistics) - return -ENOMEM; - return 0; -} - -static inline void dccp_mib_exit(void) -{ - free_percpu(dccp_statistics); -} - -static int thash_entries; -module_param(thash_entries, int, 0444); -MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); - -#ifdef CONFIG_IP_DCCP_DEBUG -bool dccp_debug; -module_param(dccp_debug, bool, 0644); -MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); - -EXPORT_SYMBOL_GPL(dccp_debug); -#endif - -static int __init dccp_init(void) -{ - unsigned long goal; - unsigned long nr_pages = totalram_pages(); - int ehash_order, bhash_order, i; - int rc; - - BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - sizeof_field(struct sk_buff, cb)); - rc = inet_hashinfo2_init_mod(&dccp_hashinfo); - if (rc) - goto out_fail; - rc = -ENOBUFS; - dccp_hashinfo.bind_bucket_cachep = - kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; - - /* - * Size and allocate the main established and bind bucket - * hash tables. - * - * The methodology is similar to that of the buffer cache. 
- */ - if (nr_pages >= (128 * 1024)) - goal = nr_pages >> (21 - PAGE_SHIFT); - else - goal = nr_pages >> (23 - PAGE_SHIFT); - - if (thash_entries) - goal = (thash_entries * - sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; - for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) - ; - do { - unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / - sizeof(struct inet_ehash_bucket); - - while (hash_size & (hash_size - 1)) - hash_size--; - dccp_hashinfo.ehash_mask = hash_size - 1; - dccp_hashinfo.ehash = (struct inet_ehash_bucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); - } while (!dccp_hashinfo.ehash && --ehash_order > 0); - - if (!dccp_hashinfo.ehash) { - DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; - } - - for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) - INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); - - if (inet_ehash_locks_alloc(&dccp_hashinfo)) - goto out_free_dccp_ehash; - - bhash_order = ehash_order; - - do { - dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / - sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && - bhash_order > 0) - continue; - dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); - } while (!dccp_hashinfo.bhash && --bhash_order >= 0); - - if (!dccp_hashinfo.bhash) { - DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_locks; - } - - dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { - spin_lock_init(&dccp_hashinfo.bhash[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - spin_lock_init(&dccp_hashinfo.bhash2[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); - } - - dccp_hashinfo.pernet = false; - - rc = dccp_mib_init(); - if (rc) - goto out_free_dccp_bhash2; - - rc = dccp_ackvec_init(); - if (rc) - goto out_free_dccp_mib; - - rc = dccp_sysctl_init(); - if (rc) - goto out_ackvec_exit; - - rc = ccid_initialize_builtins(); - if (rc) - goto out_sysctl_exit; - - dccp_timestamping_init(); - - return 0; - -out_sysctl_exit: - dccp_sysctl_exit(); -out_ackvec_exit: - dccp_ackvec_exit(); -out_free_dccp_mib: - dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); -out_free_dccp_bhash: - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); -out_free_dccp_locks: - inet_ehash_locks_free(&dccp_hashinfo); -out_free_dccp_ehash: - free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); -out_free_bind_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); -out_free_hashinfo2: - inet_hashinfo2_free_mod(&dccp_hashinfo); -out_fail: - dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; - dccp_hashinfo.ehash = NULL; - dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; - return rc; -} - -static void __exit dccp_fini(void) -{ - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - - ccid_cleanup_builtins(); - dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); - 
free_pages((unsigned long)dccp_hashinfo.ehash, - get_order((dccp_hashinfo.ehash_mask + 1) * - sizeof(struct inet_ehash_bucket))); - inet_ehash_locks_free(&dccp_hashinfo); - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - dccp_ackvec_exit(); - dccp_sysctl_exit(); - inet_hashinfo2_free_mod(&dccp_hashinfo); -} - -module_init(dccp_init); -module_exit(dccp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo "); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 5ba204ec0aca4..0000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - * @params: parameter passed to policy operation - */ -struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; -}; - -static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return 
qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - if (skb != NULL) { - /* Clear any skb fields that we used internally */ - skb->priority = 0; - skb_unlink(skb, &sk->sk_write_queue); - } - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c deleted file mode 100644 index b15845fd63001..0000000000000 --- a/net/dccp/sysctl.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/sysctl.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include "dccp.h" -#include "feat.h" - -/* Boundary values */ -static int u8_max = 0xFF; -static unsigned long seqw_min = DCCPF_SEQ_WMIN, - seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ - -static struct ctl_table dccp_default_table[] = { - { - .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ - .extra2 = &seqw_max, - }, - { - .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "request_retries", - .data = &sysctl_dccp_request_retries, - .maxlen = sizeof(sysctl_dccp_request_retries), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &u8_max, - }, - { - .procname = "retries1", - .data = &sysctl_dccp_retries1, - .maxlen = sizeof(sysctl_dccp_retries1), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "retries2", - .data = &sysctl_dccp_retries2, - .maxlen = sizeof(sysctl_dccp_retries2), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "tx_qlen", - .data = &sysctl_dccp_tx_qlen, - .maxlen = sizeof(sysctl_dccp_tx_qlen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "sync_ratelimit", - .data = &sysctl_dccp_sync_ratelimit, - .maxlen = sizeof(sysctl_dccp_sync_ratelimit), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, -}; - -static struct ctl_table_header *dccp_table_header; - -int __init dccp_sysctl_init(void) -{ - dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default", - dccp_default_table); - - return dccp_table_header != NULL ? 
0 : -ENOMEM; -} - -void dccp_sysctl_exit(void) -{ - if (dccp_table_header != NULL) { - unregister_net_sysctl_table(dccp_table_header); - dccp_table_header = NULL; - } -} diff --git a/net/dccp/timer.c b/net/dccp/timer.c deleted file mode 100644 index 232ac4ae0a73f..0000000000000 --- a/net/dccp/timer.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/timer.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo - */ - -#include -#include -#include - -#include "dccp.h" - -/* sysctl variables governing numbers of retransmission attempts */ -int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; -int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; - -static void dccp_write_err(struct sock *sk) -{ - sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; - sk_error_report(sk); - - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_done(sk); - __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); -} - -/* A write timeout has occurred. Process the after effects. */ -static int dccp_write_timeout(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - int retry_until; - - if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { - if (icsk->icsk_retransmits != 0) - dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? - : sysctl_dccp_request_retries; - } else { - if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu - black hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ - - dst_negative_advice(sk); - } - - retry_until = sysctl_dccp_retries2; - /* - * FIXME: see tcp_write_timout and tcp_out_of_resources - */ - } - - if (icsk->icsk_retransmits >= retry_until) { - /* Has it gone just too far? */ - dccp_write_err(sk); - return 1; - } - return 0; -} - -/* - * The DCCP retransmit timer. - */ -static void dccp_retransmit_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - /* - * More than 4MSL (8 minutes) has passed, a RESET(aborted) was - * sent, no need to retransmit, this sock is dead. - */ - if (dccp_write_timeout(sk)) - return; - - /* - * We want to know the number of packets retransmitted, not the - * total number of retransmissions of clones of original packets. - */ - if (icsk->icsk_retransmits == 0) - __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); - - if (dccp_retransmit_skb(sk) != 0) { - /* - * Retransmission failed because of local congestion, - * do not backoff. 
- */ - if (--icsk->icsk_retransmits == 0) - icsk->icsk_retransmits = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, - TCP_RESOURCE_PROBE_INTERVAL), - DCCP_RTO_MAX); - return; - } - - icsk->icsk_backoff++; - - icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, - DCCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_dccp_retries1) - __sk_dst_reset(sk); -} - -static void dccp_write_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; - int event = 0; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - jiffies + (HZ / 20)); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) - goto out; - - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); - goto out; - } - - event = icsk->icsk_pending; - icsk->icsk_pending = 0; - - switch (event) { - case ICSK_TIME_RETRANS: - dccp_retransmit_timer(sk); - break; - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_keepalive_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - - pr_err("dccp should not use a keepalive timer !\n"); - sock_put(sk); -} - -/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); - struct sock *sk = &icsk->icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, - jiffies + TCP_DELACK_MIN); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; - if (time_after(icsk_delack_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk_delack_timeout(icsk)); - goto out; - } - - icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - - if (inet_csk_ack_scheduled(sk)) { - if (!inet_csk_in_pingpong_mode(sk)) { - /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, - icsk->icsk_rto); - } else { - /* Delayed ACK missed: leave pingpong mode and - * deflate ATO. - */ - inet_csk_exit_pingpong_mode(sk); - icsk->icsk_ack.ato = TCP_ATO_MIN; - } - dccp_send_ack(sk); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @t: pointer to the tasklet associated with this handler - * - * See the comments above %ccid_dequeueing_decision for supported modes. 
- */ -static void dccp_write_xmitlet(struct tasklet_struct *t) -{ - struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); - else - dccp_write_xmit(sk); - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_write_xmit_timer(struct timer_list *t) -{ - struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - - dccp_write_xmitlet(&dp->dccps_xmitlet); -} - -void dccp_init_xmit_timers(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); - timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); - inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, - &dccp_keepalive_timer); -} - -static ktime_t dccp_timestamp_seed; -/** - * dccp_timestamp - 10s of microseconds time source - * Returns the number of 10s of microseconds since loading DCCP. This is native - * DCCP time difference format (RFC 4340, sec. 13). - * Please note: This will wrap around about circa every 11.9 hours. - */ -u32 dccp_timestamp(void) -{ - u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - - do_div(delta, 10); - return delta; -} -EXPORT_SYMBOL_GPL(dccp_timestamp); - -void __init dccp_timestamping_init(void) -{ - dccp_timestamp_seed = ktime_get_real(); -} diff --git a/net/dccp/trace.h b/net/dccp/trace.h deleted file mode 100644 index 5a43b3508c7f1..0000000000000 --- a/net/dccp/trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM dccp - -#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_DCCP_H - -#include -#include "dccp.h" -#include "ccids/ccid3.h" -#include -#include - -TRACE_EVENT(dccp_probe, - - TP_PROTO(struct sock *sk, size_t size), - - TP_ARGS(sk, size), - - TP_STRUCT__entry( - /* sockaddr_in6 is always bigger than sockaddr_in */ - __array(__u8, saddr, sizeof(struct sockaddr_in6)) - __array(__u8, daddr, sizeof(struct sockaddr_in6)) - __field(__u16, sport) - __field(__u16, dport) - __field(__u16, size) - __field(__u16, tx_s) - __field(__u32, tx_rtt) - __field(__u32, tx_p) - __field(__u32, tx_x_calc) - __field(__u64, tx_x_recv) - __field(__u64, tx_x) - __field(__u32, tx_t_ipi) - ), - - TP_fast_assign( - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); - memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - - TP_STORE_ADDR_PORTS(__entry, inet, sk); - - /* For filtering use */ - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - __entry->size = size; - if (hc) { - __entry->tx_s = hc->tx_s; - __entry->tx_rtt = hc->tx_rtt; - __entry->tx_p = hc->tx_p; - __entry->tx_x_calc = hc->tx_x_calc; - __entry->tx_x_recv = hc->tx_x_recv >> 6; - __entry->tx_x = hc->tx_x >> 6; - __entry->tx_t_ipi = hc->tx_t_ipi; - } else { - __entry->tx_s = 0; - memset_startat(__entry, 0, tx_rtt); - } - ), - - TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " - "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", - __entry->saddr, __entry->daddr, __entry->size, - __entry->tx_s, __entry->tx_rtt, __entry->tx_p, - __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x, - __entry->tx_t_ipi) -); - -#endif /* _TRACE_TCP_H */ - -/* 
This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace -#include diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d2c97f8e9ef9..12850a277251d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,7 +425,7 @@ config INET_DIAG tristate "INET: socket monitoring interface" default y help - Support for INET (TCP, DCCP, etc) socket monitoring interface used by + Support for INET (TCP, UDP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5df1f1325259d..76e38092cd8a3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1328,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk) /* Routing failed... */ sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme - */ + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index dd5cf8914a28d..9a20b84dc0223 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1322,7 +1322,7 @@ void inet_csk_destroy_sock(struct sock *sk) EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to - * tcp/dccp_create_openreq_child(). + * tcp_create_openreq_child(). */ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index c2bb91d9e9ffd..907bad776b423 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type) switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; - case DCCPDIAG_GETSOCK: - return IPPROTO_DCCP; default: return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 581bc62890818..ef052ccd93800 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -259,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. 
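For illustration only (not part of this series): once the protocol is gone, a minimal userspace probe is expected to fail to create a DCCP socket. The SOCK_DCCP and IPPROTO_DCCP constants are stable UAPI values and remain defined, so the check still compiles:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <sys/socket.h>

    #ifndef SOCK_DCCP
    #define SOCK_DCCP 6            /* from linux/net.h */
    #endif
    #ifndef IPPROTO_DCCP
    #define IPPROTO_DCCP 33        /* IANA-assigned protocol number */
    #endif

    int main(void)
    {
            int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

            if (fd < 0)
                    printf("DCCP rejected as expected: %s\n", strerror(errno));
            else
                    printf("unexpected: SOCK_DCCP still supported\n");
            return 0;
    }

The exact errno depends on how socket creation is rejected on a given kernel, which is why the probe only prints it rather than asserting a specific value.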
diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c index b7997541f7eef..f93d9145ab8a5 100644 --- a/samples/bpf/sockex2_kern.c +++ b/samples/bpf/sockex2_kern.c @@ -31,7 +31,6 @@ static inline int proto_ports_offset(__u64 proto) switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_DCCP: case IPPROTO_ESP: case IPPROTO_SCTP: case IPPROTO_UDPLITE: diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3d22bf863eec9..298c8f8d77199 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4804,7 +4804,7 @@ sub process { } # do not use BUG() or variants - if ($line =~ /\b(?!AA_|BUILD_|DCCP_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { + if ($line =~ /\b(?!AA_|BUILD_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) { my $msg_level = \&WARN; $msg_level = \&CHK if ($file); &{$msg_level}("AVOID_BUG", diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 1b942b4908a26..7d623b00495c1 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -68,13 +67,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr *dh = dccp_hdr(skb); - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); @@ -140,17 +132,6 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb, ad->u.net->dport = uh->dest; break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e7a7dcab81db6..b2695785610ad 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -1191,8 +1190,6 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc return SECCLASS_ICMP_SOCKET; else return SECCLASS_RAWIP_SOCKET; - case SOCK_DCCP: - return SECCLASS_DCCP_SOCKET; default: return SECCLASS_RAWIP_SOCKET; } @@ -4392,22 +4389,6 @@ static int selinux_parse_skb_ipv4(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - if (ntohs(ih->frag_off) & IP_OFFSET) - break; - - offset += ihlen; - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4486,18 +4467,6 @@ static int selinux_parse_skb_ipv6(struct sk_buff *skb, break; } - case IPPROTO_DCCP: { - struct dccp_hdr _dccph, *dh; - - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh == NULL) - break; - - ad->u.net->sport = dh->dccph_sport; - ad->u.net->dport = dh->dccph_dport; - break; - } - #if IS_ENABLED(CONFIG_IP_SCTP) case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; @@ -4849,10 +4818,6 @@ static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, in node_perm = UDP_SOCKET__NODE_BIND; break; - case SECCLASS_DCCP_SOCKET: - node_perm = DCCP_SOCKET__NODE_BIND; - break; - case SECCLASS_SCTP_SOCKET: node_perm = SCTP_SOCKET__NODE_BIND; break; @@ 
-4908,11 +4873,10 @@ static int selinux_socket_connect_helper(struct socket *sock, return 0; /* - * If a TCP, DCCP or SCTP socket, check name_connect permission + * If a TCP or SCTP socket, check name_connect permission * for the port. */ if (sksec->sclass == SECCLASS_TCP_SOCKET || - sksec->sclass == SECCLASS_DCCP_SOCKET || sksec->sclass == SECCLASS_SCTP_SOCKET) { struct common_audit_data ad; struct lsm_network_audit net = {0,}; @@ -4957,9 +4921,6 @@ static int selinux_socket_connect_helper(struct socket *sock, case SECCLASS_TCP_SOCKET: perm = TCP_SOCKET__NAME_CONNECT; break; - case SECCLASS_DCCP_SOCKET: - perm = DCCP_SOCKET__NAME_CONNECT; - break; case SECCLASS_SCTP_SOCKET: perm = SCTP_SOCKET__NAME_CONNECT; break; diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index 04a9b480885eb..5665aa5e7853e 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -127,8 +127,6 @@ const struct security_class_mapping secclass_map[] = { { "key", { "view", "read", "write", "search", "link", "setattr", "create", NULL } }, - { "dccp_socket", - { COMMON_SOCK_PERMS, "node_bind", "name_connect", NULL } }, { "memprotect", { "mmap_zero", NULL } }, { "peer", { "recv", NULL } }, { "capability2", { COMMON_CAP2_PERMS, NULL } }, diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 3a95986b134f2..2c0b07f9fbbd0 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -98,7 +98,6 @@ static const struct nlmsg_perm nlmsg_route_perms[] = { static const struct nlmsg_perm nlmsg_tcpdiag_perms[] = { { TCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, - { DCCPDIAG_GETSOCK, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DIAG_BY_FAMILY, NETLINK_TCPDIAG_SOCKET__NLMSG_READ }, { SOCK_DESTROY, NETLINK_TCPDIAG_SOCKET__NLMSG_WRITE }, }; diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 99833168604ea..fc340a6f0ddea 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -4061,7 +4060,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) __be16 frag_off; struct tcphdr _tcph, *th; struct udphdr _udph, *uh; - struct dccp_hdr _dccph, *dh; sip->sin6_port = 0; @@ -4090,11 +4088,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip) if (uh != NULL) sip->sin6_port = uh->source; break; - case IPPROTO_DCCP: - dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); - if (dh != NULL) - sip->sin6_port = dh->dccph_sport; - break; } return proto; } @@ -4216,7 +4209,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) case PF_INET6: proto = smk_skb_to_addr_ipv6(skb, &sadd); if (proto != IPPROTO_UDP && proto != IPPROTO_UDPLITE && - proto != IPPROTO_TCP && proto != IPPROTO_DCCP) + proto != IPPROTO_TCP) break; #ifdef SMACK_IPV6_SECMARK_LABELING skp = smack_from_skb(skb); From 22d6c9eebf2e68e6ab831ded37daaa83daff6bb8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:46 -0700 Subject: [PATCH 090/770] net: Unexport shared functions for DCCP. DCCP was removed, so many inet functions no longer need to be exported. Let's unexport or use EXPORT_IPV6_MOD() for such functions. sk_free_unlock_clone() is inlined in sk_clone_lock() as it's the only caller. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/core/sock.c | 32 ++++++++++++++------------------ net/ipv4/inet_connection_sock.c | 12 +----------- net/ipv4/inet_hashtables.c | 16 +++++----------- net/ipv4/inet_timewait_sock.c | 4 ---- net/ipv6/af_inet6.c | 1 - net/ipv6/inet6_connection_sock.c | 2 -- 7 files changed, 20 insertions(+), 48 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 694f954258d43..bb4d6189292fe 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1781,7 +1781,6 @@ void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); diff --git a/net/core/sock.c b/net/core/sock.c index 121f640112889..b64df2463300b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2494,17 +2494,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2534,18 +2531,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 9a20b84dc0223..a195203e7eefe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -767,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -780,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -831,7 +829,6 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -898,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) req->num_retrans++; return err; } -EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct 
sock *sk_listener, @@ -1058,14 +1054,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { @@ -1209,7 +1204,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) @@ -1290,7 +1284,6 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1380,7 +1373,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1475,7 +1467,6 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1590,4 +1581,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5bf163f756e95..d1960701a3b42 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -713,7 +713,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) @@ -771,7 +771,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -782,7 +782,6 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { @@ -823,7 +822,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -982,14 +981,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. 
Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -1214,7 +1213,6 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1265,7 +1263,6 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { @@ -1305,7 +1302,6 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1341,7 +1337,6 @@ struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1352,4 +1347,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index aded4bf1bc16d..67efe95015816 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. @@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo) rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea4..85bf681d427b8 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -881,7 +881,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index dbcf556a35bbc..8f500eaf33cfc 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,7 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -137,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? 
NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); From 235bd9d21fcdf07dd125daa3e60ab64f8aefb927 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 9 Apr 2025 19:36:47 -0700 Subject: [PATCH 091/770] tcp: Rename tcp_or_dccp_get_hashinfo(). DCCP was removed, so tcp_or_dccp_get_hashinfo() should be renamed. Let's rename it to tcp_get_hashinfo(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250410023921.11307-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 +-- net/ipv4/inet_connection_sock.c | 9 +++++---- net/ipv4/inet_hashtables.c | 14 +++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index d172b64a63201..4564b5d348b1a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -175,9 +175,8 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { - /* TODO: rename function */ return sock_net(sk)->ipv4.tcp_death_row.hashinfo; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a195203e7eefe..20915895bdaaf 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -330,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); @@ -512,10 +512,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; @@ -1022,9 +1022,10 @@ static bool reqsk_queue_unlink(struct request_sock *req) bool found = false; if (sk_hashed(sk)) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); - spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); + spinlock_t *lock; + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d1960701a3b42..da85cc30e3824 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -176,7 +176,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; @@ -215,7 +215,7 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = 
tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; @@ -668,7 +668,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; @@ -740,7 +740,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -785,7 +785,7 @@ int inet_hash(struct sock *sk) void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; @@ -873,7 +873,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) @@ -901,7 +901,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk);
From b4916f67902e2ae1dc8e37dfa45e8894ad2f8921 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 9 Apr 2025 11:47:14 +0200 Subject: [PATCH 092/770] net: airoha: Add l2_flows rhashtable
Introduce an l2_flows rhashtable in the airoha_ppe struct to store L2 flows committed by upper layers of the kernel. This is a preliminary patch to allow offloading L2 traffic rules.
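As a rough sketch of the intended commit path (function and field names as added by this patch; locking and hardware programming omitted), inserting a flow whose L2 key already exists replaces the stale entry instead of failing:

	static int l2_flow_commit_sketch(struct airoha_ppe *ppe,
					 struct airoha_flow_table_entry *e)
	{
		struct airoha_flow_table_entry *prev;

		e->type = FLOW_TYPE_L2;
		/* key: the dest/src MAC pair embedded in e->data.bridge */
		prev = rhashtable_lookup_get_insert_fast(&ppe->l2_flows,
							 &e->l2_node,
							 airoha_l2_flow_table_params);
		if (!prev)
			return 0;	/* first flow with this L2 key */
		if (IS_ERR(prev))
			return PTR_ERR(prev);

		/* duplicate key: swap the stale entry for the new one */
		return rhashtable_replace_fast(&ppe->l2_flows, &prev->l2_node,
					       &e->l2_node,
					       airoha_l2_flow_table_params);
	}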
Signed-off-by: Lorenzo Bianconi Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20250409-airoha-flowtable-l2b-v2-1-4a1e3935ea92@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.h | 15 +++- drivers/net/ethernet/airoha/airoha_ppe.c | 103 ++++++++++++++++++----- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index ec8908f904c61..86e08832246df 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -422,12 +422,23 @@ struct airoha_flow_data { } pppoe; }; +enum airoha_flow_entry_type { + FLOW_TYPE_L4, + FLOW_TYPE_L2, + FLOW_TYPE_L2_SUBFLOW, +}; + struct airoha_flow_table_entry { - struct hlist_node list; + union { + struct hlist_node list; /* PPE L3 flow entry */ + struct rhash_head l2_node; /* L2 flow entry */ + }; struct airoha_foe_entry data; u32 hash; + enum airoha_flow_entry_type type; + struct rhash_head node; unsigned long cookie; }; @@ -480,6 +491,8 @@ struct airoha_ppe { void *foe; dma_addr_t foe_dma; + struct rhashtable l2_flows; + struct hlist_head *foe_flow; u16 foe_check_time[PPE_NUM_ENTRIES]; diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index f10dab935cab6..7219125ade134 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -24,6 +24,13 @@ static const struct rhashtable_params airoha_flow_table_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params airoha_l2_flow_table_params = { + .head_offset = offsetof(struct airoha_flow_table_entry, l2_node), + .key_offset = offsetof(struct airoha_flow_table_entry, data.bridge), + .key_len = 2 * ETH_ALEN, + .automatic_shrinking = true, +}; + static bool airoha_ppe2_is_enabled(struct airoha_eth *eth) { return airoha_fe_rr(eth, REG_PPE_GLO_CFG(1)) & PPE_GLO_CFG_EN_MASK; @@ -476,6 +483,43 @@ static int airoha_ppe_foe_commit_entry(struct airoha_ppe *ppe, return 0; } +static void airoha_ppe_foe_remove_flow(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + lockdep_assert_held(&ppe_lock); + + hlist_del_init(&e->list); + if (e->hash != 0xffff) { + e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; + e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, + AIROHA_FOE_STATE_INVALID); + airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); + e->hash = 0xffff; + } +} + +static void airoha_ppe_foe_remove_l2_flow(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + lockdep_assert_held(&ppe_lock); + + rhashtable_remove_fast(&ppe->l2_flows, &e->l2_node, + airoha_l2_flow_table_params); +} + +static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + spin_lock_bh(&ppe_lock); + + if (e->type == FLOW_TYPE_L2) + airoha_ppe_foe_remove_l2_flow(ppe, e); + else + airoha_ppe_foe_remove_flow(ppe, e); + + spin_unlock_bh(&ppe_lock); +} + static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) { struct airoha_flow_table_entry *e; @@ -505,11 +549,37 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) spin_unlock_bh(&ppe_lock); } +static int +airoha_ppe_foe_l2_flow_commit_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e) +{ + struct airoha_flow_table_entry *prev; + + e->type = FLOW_TYPE_L2; + prev = rhashtable_lookup_get_insert_fast(&ppe->l2_flows, &e->l2_node, + airoha_l2_flow_table_params); + if (!prev) + return 0; + + if 
(IS_ERR(prev)) + return PTR_ERR(prev); + + return rhashtable_replace_fast(&ppe->l2_flows, &prev->l2_node, + &e->l2_node, + airoha_l2_flow_table_params); +} + static int airoha_ppe_foe_flow_commit_entry(struct airoha_ppe *ppe, struct airoha_flow_table_entry *e) { - u32 hash = airoha_ppe_foe_get_entry_hash(&e->data); + int type = FIELD_GET(AIROHA_FOE_IB1_BIND_PACKET_TYPE, e->data.ib1); + u32 hash; + if (type == PPE_PKT_TYPE_BRIDGE) + return airoha_ppe_foe_l2_flow_commit_entry(ppe, e); + + hash = airoha_ppe_foe_get_entry_hash(&e->data); + e->type = FLOW_TYPE_L4; e->hash = 0xffff; spin_lock_bh(&ppe_lock); @@ -519,23 +589,6 @@ static int airoha_ppe_foe_flow_commit_entry(struct airoha_ppe *ppe, return 0; } -static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, - struct airoha_flow_table_entry *e) -{ - spin_lock_bh(&ppe_lock); - - hlist_del_init(&e->list); - if (e->hash != 0xffff) { - e->data.ib1 &= ~AIROHA_FOE_IB1_BIND_STATE; - e->data.ib1 |= FIELD_PREP(AIROHA_FOE_IB1_BIND_STATE, - AIROHA_FOE_STATE_INVALID); - airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); - e->hash = 0xffff; - } - - spin_unlock_bh(&ppe_lock); -} - static int airoha_ppe_flow_offload_replace(struct airoha_gdm_port *port, struct flow_cls_offload *f) { @@ -890,9 +943,20 @@ int airoha_ppe_init(struct airoha_eth *eth) if (err) return err; + err = rhashtable_init(&ppe->l2_flows, &airoha_l2_flow_table_params); + if (err) + goto error_flow_table_destroy; + err = airoha_ppe_debugfs_init(ppe); if (err) - rhashtable_destroy(&eth->flow_table); + goto error_l2_flow_table_destroy; + + return 0; + +error_l2_flow_table_destroy: + rhashtable_destroy(&ppe->l2_flows); +error_flow_table_destroy: + rhashtable_destroy(&eth->flow_table); return err; } @@ -909,6 +973,7 @@ void airoha_ppe_deinit(struct airoha_eth *eth) } rcu_read_unlock(); + rhashtable_destroy(&eth->ppe->l2_flows); rhashtable_destroy(&eth->flow_table); debugfs_remove(eth->ppe->debugfs_dir); }
From cd53f622611f9a6dd83b858c85448dd3568b67ec Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 9 Apr 2025 11:47:15 +0200 Subject: [PATCH 093/770] net: airoha: Add L2 hw acceleration support
Similar to the mtk driver, introduce the capability to offload L2 traffic by defining flower rules in the PSE/PPE engine available on the EN7581 SoC. Since the hw always reports L2/L3/L4 flower rules, link all L2 rules sharing the same L2 info (with different L3/L4 info) into the L2 subflows list of a given L2 PPE entry.
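The lookup side can be sketched as follows (helpers as introduced by this patch; ppe_lock and error handling omitted): when a packet hits an unbound PPE hash and no exact L3/L4 flow matches, the L2 table is consulted by MAC pair and a per-hash subflow is attached to the matching L2 entry:

	struct airoha_foe_bridge br = {};
	struct airoha_flow_table_entry *e;

	airoha_ppe_foe_set_bridge_addrs(&br, eth_hdr(skb));
	e = rhashtable_lookup_fast(&ppe->l2_flows, &br,
				   airoha_l2_flow_table_params);
	if (e)
		/* bind a hw entry to 'hash' and link it in e->l2_flows */
		airoha_ppe_foe_commit_subflow_entry(ppe, e, hash);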
Signed-off-by: Lorenzo Bianconi Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20250409-airoha-flowtable-l2b-v2-2-4a1e3935ea92@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 2 +- drivers/net/ethernet/airoha/airoha_eth.h | 9 +- drivers/net/ethernet/airoha/airoha_ppe.c | 121 ++++++++++++++++++++--- 3 files changed, 115 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index d748dc6de9236..723eba7cfa0cf 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -694,7 +694,7 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) reason = FIELD_GET(AIROHA_RXD4_PPE_CPU_REASON, msg1); if (reason == PPE_CPU_REASON_HIT_UNBIND_RATE_REACHED) - airoha_ppe_check_skb(eth->ppe, hash); + airoha_ppe_check_skb(eth->ppe, q->skb, hash); done++; napi_gro_receive(&q->napi, q->skb); diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index 86e08832246df..e82abfc1a67bd 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -431,10 +431,14 @@ enum airoha_flow_entry_type { struct airoha_flow_table_entry { union { struct hlist_node list; /* PPE L3 flow entry */ - struct rhash_head l2_node; /* L2 flow entry */ + struct { + struct rhash_head l2_node; /* L2 flow entry */ + struct hlist_head l2_flows; /* PPE L2 subflows list */ + }; }; struct airoha_foe_entry data; + struct hlist_node l2_subflow_node; /* PPE L2 subflow entry */ u32 hash; enum airoha_flow_entry_type type; @@ -548,7 +552,8 @@ u32 airoha_rmw(void __iomem *base, u32 offset, u32 mask, u32 val); bool airoha_is_valid_gdm_port(struct airoha_eth *eth, struct airoha_gdm_port *port); -void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash); +void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, + u16 hash); int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv); int airoha_ppe_init(struct airoha_eth *eth); diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 7219125ade134..d4969c2a03524 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -204,6 +204,15 @@ static int airoha_get_dsa_port(struct net_device **dev) #endif } +static void airoha_ppe_foe_set_bridge_addrs(struct airoha_foe_bridge *br, + struct ethhdr *eh) +{ + br->dest_mac_hi = get_unaligned_be32(eh->h_dest); + br->dest_mac_lo = get_unaligned_be16(eh->h_dest + 4); + br->src_mac_hi = get_unaligned_be16(eh->h_source); + br->src_mac_lo = get_unaligned_be32(eh->h_source + 2); +} + static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, struct airoha_foe_entry *hwe, struct net_device *dev, int type, @@ -254,13 +263,7 @@ static int airoha_ppe_foe_entry_prepare(struct airoha_eth *eth, qdata = FIELD_PREP(AIROHA_FOE_SHAPER_ID, 0x7f); if (type == PPE_PKT_TYPE_BRIDGE) { - hwe->bridge.dest_mac_hi = get_unaligned_be32(data->eth.h_dest); - hwe->bridge.dest_mac_lo = - get_unaligned_be16(data->eth.h_dest + 4); - hwe->bridge.src_mac_hi = - get_unaligned_be16(data->eth.h_source); - hwe->bridge.src_mac_lo = - get_unaligned_be32(data->eth.h_source + 2); + airoha_ppe_foe_set_bridge_addrs(&hwe->bridge, &data->eth); hwe->bridge.data = qdata; hwe->bridge.ib2 = val; l2 = &hwe->bridge.l2.common; @@ -385,6 +388,19 @@ static u32 airoha_ppe_foe_get_entry_hash(struct airoha_foe_entry *hwe) 
hv3 = hwe->ipv6.src_ip[1] ^ hwe->ipv6.dest_ip[1]; hv3 ^= hwe->ipv6.src_ip[0]; break; + case PPE_PKT_TYPE_BRIDGE: { + struct airoha_foe_mac_info *l2 = &hwe->bridge.l2; + + hv1 = l2->common.src_mac_hi & 0xffff; + hv1 = hv1 << 16 | l2->src_mac_lo; + + hv2 = l2->common.dest_mac_lo; + hv2 = hv2 << 16; + hv2 = hv2 | ((l2->common.src_mac_hi & 0xffff0000) >> 16); + + hv3 = l2->common.dest_mac_hi; + break; + } case PPE_PKT_TYPE_IPV4_DSLITE: case PPE_PKT_TYPE_IPV6_6RD: default: @@ -496,15 +512,24 @@ static void airoha_ppe_foe_remove_flow(struct airoha_ppe *ppe, airoha_ppe_foe_commit_entry(ppe, &e->data, e->hash); e->hash = 0xffff; } + if (e->type == FLOW_TYPE_L2_SUBFLOW) { + hlist_del_init(&e->l2_subflow_node); + kfree(e); + } } static void airoha_ppe_foe_remove_l2_flow(struct airoha_ppe *ppe, struct airoha_flow_table_entry *e) { + struct hlist_head *head = &e->l2_flows; + struct hlist_node *n; + lockdep_assert_held(&ppe_lock); rhashtable_remove_fast(&ppe->l2_flows, &e->l2_node, airoha_l2_flow_table_params); + hlist_for_each_entry_safe(e, n, head, l2_subflow_node) + airoha_ppe_foe_remove_flow(ppe, e); } static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, @@ -520,10 +545,56 @@ static void airoha_ppe_foe_flow_remove_entry(struct airoha_ppe *ppe, spin_unlock_bh(&ppe_lock); } -static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) +static int +airoha_ppe_foe_commit_subflow_entry(struct airoha_ppe *ppe, + struct airoha_flow_table_entry *e, + u32 hash) +{ + u32 mask = AIROHA_FOE_IB1_BIND_PACKET_TYPE | AIROHA_FOE_IB1_BIND_UDP; + struct airoha_foe_entry *hwe_p, hwe; + struct airoha_flow_table_entry *f; + struct airoha_foe_mac_info *l2; + int type; + + hwe_p = airoha_ppe_foe_get_entry(ppe, hash); + if (!hwe_p) + return -EINVAL; + + f = kzalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -ENOMEM; + + hlist_add_head(&f->l2_subflow_node, &e->l2_flows); + f->type = FLOW_TYPE_L2_SUBFLOW; + f->hash = hash; + + memcpy(&hwe, hwe_p, sizeof(*hwe_p)); + hwe.ib1 = (hwe.ib1 & mask) | (e->data.ib1 & ~mask); + l2 = &hwe.bridge.l2; + memcpy(l2, &e->data.bridge.l2, sizeof(*l2)); + + type = FIELD_GET(AIROHA_FOE_IB1_BIND_PACKET_TYPE, hwe.ib1); + if (type == PPE_PKT_TYPE_IPV4_HNAPT) + memcpy(&hwe.ipv4.new_tuple, &hwe.ipv4.orig_tuple, + sizeof(hwe.ipv4.new_tuple)); + else if (type >= PPE_PKT_TYPE_IPV6_ROUTE_3T && + l2->common.etype == ETH_P_IP) + l2->common.etype = ETH_P_IPV6; + + hwe.bridge.ib2 = e->data.bridge.ib2; + airoha_ppe_foe_commit_entry(ppe, &hwe, hash); + + return 0; +} + +static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, + struct sk_buff *skb, + u32 hash) { struct airoha_flow_table_entry *e; + struct airoha_foe_bridge br = {}; struct airoha_foe_entry *hwe; + bool commit_done = false; struct hlist_node *n; u32 index, state; @@ -539,12 +610,33 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, u32 hash) index = airoha_ppe_foe_get_entry_hash(hwe); hlist_for_each_entry_safe(e, n, &ppe->foe_flow[index], list) { - if (airoha_ppe_foe_compare_entry(e, hwe)) { - airoha_ppe_foe_commit_entry(ppe, &e->data, hash); - e->hash = hash; - break; + if (e->type == FLOW_TYPE_L2_SUBFLOW) { + state = FIELD_GET(AIROHA_FOE_IB1_BIND_STATE, hwe->ib1); + if (state != AIROHA_FOE_STATE_BIND) { + e->hash = 0xffff; + airoha_ppe_foe_remove_flow(ppe, e); + } + continue; + } + + if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) { + e->hash = 0xffff; + continue; } + + airoha_ppe_foe_commit_entry(ppe, &e->data, hash); + commit_done = true; + e->hash = hash; } + + if (commit_done) 
+ goto unlock; + + airoha_ppe_foe_set_bridge_addrs(&br, eth_hdr(skb)); + e = rhashtable_lookup_fast(&ppe->l2_flows, &br, + airoha_l2_flow_table_params); + if (e) + airoha_ppe_foe_commit_subflow_entry(ppe, e, hash); unlock: spin_unlock_bh(&ppe_lock); } @@ -899,7 +991,8 @@ int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, return err; } -void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash) +void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, + u16 hash) { u16 now, diff; @@ -912,7 +1005,7 @@ void airoha_ppe_check_skb(struct airoha_ppe *ppe, u16 hash) return; ppe->foe_check_time[hash] = now; - airoha_ppe_foe_insert_entry(ppe, hash); + airoha_ppe_foe_insert_entry(ppe, skb, hash); } int airoha_ppe_init(struct airoha_eth *eth)
From e5566162af8b9690e096d2e6089e4ed955a0d13d Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 10 Apr 2025 12:04:03 +0200 Subject: [PATCH 094/770] net: phy: mediatek: permit to compile test GE SOC PHY driver
When commit 462a3daad679 ("net: phy: mediatek: fix compile-test dependencies") fixed the dependency, it should have also introduced an OR with COMPILE_TEST to permit this driver to be compile-tested even if NVMEM_MTK_EFUSE wasn't selected. The driver makes use of NVMEM APIs that are always compiled in (and return an error when NVMEM is disabled), so the driver can actually be compiled even without that config. Fix and simplify the dependency condition of this kernel config.
Fixes: 462a3daad679 ("net: phy: mediatek: fix compile-test dependencies") Acked-by: Daniel Golle Reviewed-by: Andrew Lunn Signed-off-by: Christian Marangi Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20250410100410.348-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mediatek/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/mediatek/Kconfig b/drivers/net/phy/mediatek/Kconfig index 2a8ac5aed0f89..6a4c2b328c418 100644 --- a/drivers/net/phy/mediatek/Kconfig +++ b/drivers/net/phy/mediatek/Kconfig @@ -15,8 +15,7 @@ config MEDIATEK_GE_PHY config MEDIATEK_GE_SOC_PHY tristate "MediaTek SoC Ethernet PHYs" - depends on (ARM64 && ARCH_MEDIATEK) || COMPILE_TEST - depends on NVMEM_MTK_EFUSE + depends on (ARM64 && ARCH_MEDIATEK && NVMEM_MTK_EFUSE) || COMPILE_TEST select MTK_NET_PHYLIB help Supports MediaTek SoC built-in Gigabit Ethernet PHYs.
From 6a325aed130bb68790e765f923e76ec5669d2da7 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 10 Apr 2025 12:04:04 +0200 Subject: [PATCH 095/770] net: phy: mediatek: add Airoha PHY ID to SoC driver
The Airoha AN7581 SoC ships with a switch based on the MT753x switch embedded in other SoCs like the MT7981 and the MT7988. Similar to these, it requires configuring some pins to enable the PHY LEDs. Add support for the PHY ID of the Airoha embedded switch and define a simple probe function to toggle these pins. Also fill in the LED functions and add a dedicated function to define the LED polarity.
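The polarity handling reduces to the following mapping (a sketch; register macros as used by the driver, 'active_low' standing for a set PHY_LED_ACTIVE_LOW mode bit):

	u16 val = active_low ? MTK_PHY_LED_ON_POLARITY : 0;

	/* LED0 and LED1 each have their own ON_CTRL register */
	phy_modify_mmd(phydev, MDIO_MMD_VEND2,
		       index ? MTK_PHY_LED1_ON_CTRL : MTK_PHY_LED0_ON_CTRL,
		       MTK_PHY_LED_ON_POLARITY, val);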
Reviewed-by: Andrew Lunn Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250410100410.348-2-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mediatek/Kconfig | 4 +- drivers/net/phy/mediatek/mtk-ge-soc.c | 62 +++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/mediatek/Kconfig b/drivers/net/phy/mediatek/Kconfig index 6a4c2b328c418..4308002bb82c4 100644 --- a/drivers/net/phy/mediatek/Kconfig +++ b/drivers/net/phy/mediatek/Kconfig @@ -15,7 +15,9 @@ config MEDIATEK_GE_PHY config MEDIATEK_GE_SOC_PHY tristate "MediaTek SoC Ethernet PHYs" - depends on (ARM64 && ARCH_MEDIATEK && NVMEM_MTK_EFUSE) || COMPILE_TEST + depends on ARM64 || COMPILE_TEST + depends on ARCH_AIROHA || (ARCH_MEDIATEK && NVMEM_MTK_EFUSE) || \ + COMPILE_TEST select MTK_NET_PHYLIB help Supports MediaTek SoC built-in Gigabit Ethernet PHYs. diff --git a/drivers/net/phy/mediatek/mtk-ge-soc.c b/drivers/net/phy/mediatek/mtk-ge-soc.c index 175cf5239bba8..fd0e447ffce79 100644 --- a/drivers/net/phy/mediatek/mtk-ge-soc.c +++ b/drivers/net/phy/mediatek/mtk-ge-soc.c @@ -11,8 +11,11 @@ #include "../phylib.h" #include "mtk.h" +#define MTK_PHY_MAX_LEDS 2 + #define MTK_GPHY_ID_MT7981 0x03a29461 #define MTK_GPHY_ID_MT7988 0x03a29481 +#define MTK_GPHY_ID_AN7581 0x03a294c1 #define MTK_EXT_PAGE_ACCESS 0x1f #define MTK_PHY_PAGE_STANDARD 0x0000 @@ -1406,6 +1409,53 @@ static int mt7981_phy_probe(struct phy_device *phydev) return mt798x_phy_calibration(phydev); } +static int an7581_phy_probe(struct phy_device *phydev) +{ + struct mtk_socphy_priv *priv; + struct pinctrl *pinctrl; + + /* Toggle pinctrl to enable PHY LED */ + pinctrl = devm_pinctrl_get_select(&phydev->mdio.dev, "gbe-led"); + if (IS_ERR(pinctrl)) + dev_err(&phydev->mdio.bus->dev, + "Failed to setup PHY LED pinctrl\n"); + + priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + phydev->priv = priv; + + return 0; +} + +static int an7581_phy_led_polarity_set(struct phy_device *phydev, int index, + unsigned long modes) +{ + u32 mode; + u16 val; + + if (index >= MTK_PHY_MAX_LEDS) + return -EINVAL; + + for_each_set_bit(mode, &modes, __PHY_LED_MODES_NUM) { + switch (mode) { + case PHY_LED_ACTIVE_LOW: + val = MTK_PHY_LED_ON_POLARITY; + break; + case PHY_LED_ACTIVE_HIGH: + val = 0; + break; + default: + return -EINVAL; + } + } + + return phy_modify_mmd(phydev, MDIO_MMD_VEND2, index ? 
+ MTK_PHY_LED1_ON_CTRL : MTK_PHY_LED0_ON_CTRL, + MTK_PHY_LED_ON_POLARITY, val); +} + static struct phy_driver mtk_socphy_driver[] = { { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7981), @@ -1441,6 +1491,17 @@ static struct phy_driver mtk_socphy_driver[] = { .led_hw_control_set = mt798x_phy_led_hw_control_set, .led_hw_control_get = mt798x_phy_led_hw_control_get, }, + { + PHY_ID_MATCH_EXACT(MTK_GPHY_ID_AN7581), + .name = "Airoha AN7581 PHY", + .probe = an7581_phy_probe, + .led_blink_set = mt798x_phy_led_blink_set, + .led_brightness_set = mt798x_phy_led_brightness_set, + .led_hw_is_supported = mt798x_phy_led_hw_is_supported, + .led_hw_control_set = mt798x_phy_led_hw_control_set, + .led_hw_control_get = mt798x_phy_led_hw_control_get, + .led_polarity_set = an7581_phy_led_polarity_set, + }, }; module_phy_driver(mtk_socphy_driver); @@ -1448,6 +1509,7 @@ module_phy_driver(mtk_socphy_driver); static const struct mdio_device_id __maybe_unused mtk_socphy_tbl[] = { { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7981) }, { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_MT7988) }, + { PHY_ID_MATCH_EXACT(MTK_GPHY_ID_AN7581) }, { } };
From b65999e7238e6f2a48dc77c8c2109c48318ff41b Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 9 Apr 2025 12:19:11 +0200 Subject: [PATCH 096/770] net: hsr: sync hw addr of slave2 according to slave1 hw addr on PRP
In order to work properly, PRP requires slave1 and slave2 to share the same MAC address. To ease the configuration process for userspace tools, sync the slave2 MAC address with slave1's. In addition, when deleting the port from the list, restore the original MAC address.
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 5 +++++ net/hsr/hsr_main.c | 9 +++++++++ net/hsr/hsr_main.h | 1 + net/hsr/hsr_slave.c | 2 ++ 4 files changed, 17 insertions(+) diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 1b1b700ec05ea..0d1e56965af0a 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -761,6 +761,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (protocol_version == PRP_V1) { + eth_hw_addr_set(slave[1], slave[0]->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]); + } + if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d7ae32473c41a..192893c3f2ec7 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -78,6 +78,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + + if (hsr->prot_version == PRP_V1) { + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) { + eth_hw_addr_set(port->dev, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, + port->dev); + } + } } /* Make sure we recognize frames from ourselves in hsr_rcv() */ diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 1bc47b17a2968..135ec5fce0196 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -155,6 +155,7 @@ struct hsr_port { struct hsr_priv *hsr; enum hsr_port_type type; struct rcu_head rcu; + unsigned char original_macaddress[ETH_ALEN]; }; struct hsr_frame_info; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 2a802a5de2acc..b87b6a6fe0700 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -196,6 +196,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->hsr = hsr; port->dev = dev; port->type = type; + ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); @@ -232,6 +233,7 @@ void hsr_del_port(struct hsr_port *port) if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); + eth_hw_addr_set(port->dev, port->original_macaddress); } kfree_rcu(port, rcu);
From c26c192c3d486a2a7d83d254bae294c2f8f50abf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 9 Apr 2025 22:00:56 +0200 Subject: [PATCH 097/770] udp: properly deal with xfrm encap and ADDRFORM
UDP GRO accounting assumes that the GRO receive callback is always set when the UDP tunnel is enabled, but syzkaller proved otherwise, leading to the following splat:
WARNING: CPU: 0 PID: 5837 at net/ipv4/udp_offload.c:123 udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Modules linked in: CPU: 0 UID: 0 PID: 5837 Comm: syz-executor850 Not tainted 6.14.0-syzkaller-13320-g420aabef3ab5 #0 PREEMPT(full) Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:udp_tunnel_update_gro_rcv+0x28d/0x4c0 net/ipv4/udp_offload.c:123 Code: 00 00 e8 c6 5a 2f f7 48 c1 e5 04 48 8d b5 20 53 c7 9a ba 10 00 00 00 4c 89 ff e8 ce 87 99 f7 e9 ce 00 00 00 e8 a4 5a 2f f7 90 <0f> 0b 90 e9 de fd ff ff bf 01 00 00 00 89 ee e8 cf 5e 2f f7 85 ed RSP: 0018:ffffc90003effa88 EFLAGS: 00010293 RAX: ffffffff8a93fc9c RBX: 0000000000000000 RCX: ffff8880306f9e00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffffffff8a93fabe R09: 1ffffffff20bfb2e R10: dffffc0000000000 R11: fffffbfff20bfb2f R12: ffff88814ef21738 R13: dffffc0000000000 R14: ffff88814ef21778 R15: 1ffff11029de42ef FS: 0000000000000000(0000) GS:ffff888124f96000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f04eec760d0 CR3: 000000000eb38000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: udp_tunnel_cleanup_gro include/net/udp_tunnel.h:205 [inline] udpv6_destroy_sock+0x212/0x270 net/ipv6/udp.c:1829 sk_common_release+0x71/0x2e0 net/core/sock.c:3896 inet_release+0x17d/0x200 net/ipv4/af_inet.c:435 __sock_release net/socket.c:647 [inline] sock_close+0xbc/0x240 net/socket.c:1391 __fput+0x3e9/0x9f0 fs/file_table.c:465 task_work_run+0x251/0x310 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0xa11/0x27f0 kernel/exit.c:953 do_group_exit+0x207/0x2c0 kernel/exit.c:1102 __do_sys_exit_group kernel/exit.c:1113 [inline] __se_sys_exit_group kernel/exit.c:1111 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1111 x64_sys_call+0x26c3/0x26d0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f04eebfac79 Code: Unable to access opcode bytes at 0x7f04eebfac4f. RSP: 002b:00007fffdcaa34a8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f04eebfac79 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007f04eec75270 R08: ffffffffffffffb8 R09: 00007fffdcaa36c8 R10: 0000200000000000 R11: 0000000000000246 R12: 00007f04eec75270 R13: 0000000000000000 R14: 00007f04eec75cc0 R15: 00007f04eebcca70
Address the issue by moving the accounting hook into setup_udp_tunnel_sock() and set_xfrm_gro_udp_encap_rcv(), where the GRO callback is actually set. set_xfrm_gro_udp_encap_rcv() is prone to races with IPV6_ADDRFORM, so run the relevant setsockopt under the socket lock to ensure consistent values of sk_family and up->encap_type are used. Refactor the GRO callback selection code to make it clear that the function pointer is always initialized.
Reported-by: syzbot+8c469a2260132cd095c1@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=8c469a2260132cd095c1 Fixes: 172bf009c18d ("xfrm: Support GRO for IPv4 ESP in UDP encapsulation") Fixes: 5d7f5b2f6b935 ("udp_tunnel: use static call for GRO hooks when possible") Signed-off-by: Paolo Abeni Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/92bcdb6899145a9a387c8fa9e3ca656642a43634.1744228733.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 1 - net/ipv4/udp.c | 31 ++++++++++++++++++++++++++----- net/ipv4/udp_tunnel_core.c | 2 ++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 288f06f23a804..2df3b8344eb52 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -215,7 +215,6 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) if (READ_ONCE(sk->sk_family) == PF_INET6) ipv6_stub->udpv6_encap_enable(); #endif - udp_tunnel_update_gro_rcv(sk, true); udp_encap_enable(); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8867fe687888c..f9f5b92cf4b61 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2904,15 +2904,33 @@ void udp_destroy_sock(struct sock *sk) } } +typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM + udp_gro_receive_t new_gro_receive; + if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { - if (family == AF_INET) - WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); - else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) - WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); + if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) + new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; + else + new_gro_receive = xfrm4_gro_udp_encap_rcv; + + if (udp_sk(sk)->gro_receive != new_gro_receive) { + /* + * With IPV6_ADDRFORM the gro callback could change + * after being set, unregister the old one, if valid.
+ */ + if (udp_sk(sk)->gro_receive) + udp_tunnel_update_gro_rcv(sk, false); + + WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); + udp_tunnel_update_gro_rcv(sk, true); + } } #endif } @@ -2962,6 +2980,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: + sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM @@ -2985,6 +3004,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } + sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: @@ -3002,13 +3022,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - + sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); + sockopt_release_sock(sk); break; /* diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 3d1214e6df0ea..2326548997d3f 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -90,6 +90,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_tunnel_encap_enable(sk); + udp_tunnel_update_gro_rcv(sk, true); + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && sk->sk_kern_sock) udp_tunnel_update_gro_lookup(net, sk, true);
From 097f171f98289cf737437599c40b0d1e81266e9e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Apr 2025 18:42:46 -0700 Subject: [PATCH 098/770] net: convert dev->rtnl_link_state to a bool
netdevice reg_state was split into two 16-bit enums back in 2010 in commit a2835763e130 ("rtnetlink: handle rtnl_link netlink notifications manually"). Since the split, the fields have been moved apart, and last year we converted reg_state to a normal u8 in commit 4d42b37def70 ("net: convert dev->reg_state to u8"). rtnl_link_state being a 16-bit bitfield makes no sense. Convert it to a single bool; it seems very unlikely after 15 years that we'll need more values in it. We could drop dev->rtnl_link_ops from the conditions, but it feels like having it there more clearly points at the reason for this hack.
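As a sanity check, the suppression predicate is logically unchanged; only the representation of the "initializing" phase differs (a sketch based on the register_netdevice() hunk below, where INITIALIZED is simply the negation of the new bool):

	/* before: two-state enum */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

	/* after: the same predicate via De Morgan */
	if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);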
Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250410014246.780885-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/net_device.rst | 2 +- include/linux/netdevice.h | 10 ++-------- net/core/dev.c | 6 ++---- net/core/rtnetlink.c | 15 ++++++++------- 4 files changed, 13 insertions(+), 20 deletions(-) diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 6327e689e8a84..ca8605eb82ffc 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -131,7 +131,7 @@ struct ref_tracker_dir refcnt_tracker struct list_head link_watch_list enum:8 reg_state bool dismantle -enum:16 rtnl_link_state +bool rtnl_link_initializing bool needs_free_netdev void*priv_destructor struct net_device struct netpoll_info* npinfo read_mostly napi_poll/napi_poll_lock diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d8544f6a680cd..e6036b82ef4cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1946,9 +1946,6 @@ enum netdev_reg_state { * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed - * @rtnl_link_state: This enum represents the phases of creating - * a new link - * * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one @@ -2363,11 +2360,8 @@ struct net_device { /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; - - enum { - RTNL_LINK_INITIALIZED, - RTNL_LINK_INITIALIZING, - } rtnl_link_state:16; + /** @rtnl_link_initializing: Device being created, suppress events */ + bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index d0563ddff6ca9..03d20a98f8b70 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11110,8 +11110,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications.
*/ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -12025,8 +12024,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 39a5b72e861f6..38526210b8fdd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3580,7 +3580,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3591,12 +3591,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3654,7 +3655,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); From 81e92f4fb825b08d63641656f1510f38ad07390b Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:38 +0200 Subject: [PATCH 099/770] net: pktgen: fix code style (ERROR: "foo * bar" should be "foo *bar") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix checkpatch code style errors: ERROR: "foo * bar" should be "foo *bar" #977: FILE: net/core/pktgen.c:977: + const char __user * user_buffer, size_t count, ERROR: "foo * bar" should be "foo *bar" #978: FILE: net/core/pktgen.c:978: + loff_t * offset) ERROR: "foo * bar" should be "foo *bar" #1912: FILE: net/core/pktgen.c:1912: + const char __user * user_buffer, ERROR: "foo * bar" should be "foo *bar" #1913: FILE: net/core/pktgen.c:1913: + size_t count, loff_t * offset) Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fe7fdefab994c..fe89384d56d0d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -974,8 +974,8 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; @@ -1909,8 +1909,8 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, 
- size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private;
From eb1fd49ef660ddba36c91667594caf5d91a6d62e Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:39 +0200 Subject: [PATCH 100/770] net: pktgen: fix code style (ERROR: space prohibited after that '&') MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Fix checkpatch code style errors/checks: CHECK: No space is necessary after a cast #2984: FILE: net/core/pktgen.c:2984: + *(__be16 *) & eth[12] = protocol; ERROR: space prohibited after that '&' (ctx:WxW) #2984: FILE: net/core/pktgen.c:2984: + *(__be16 *) & eth[12] = protocol;
Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fe89384d56d0d..a54683d9d44ae 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2981,7 +2981,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)&eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
From 1d8f07bf4aab018692b6288d6a93fe17383a2f47 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:42 +0200 Subject: [PATCH 101/770] net: pktgen: fix code style (WARNING: suspect code indent for conditional statements) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Fix checkpatch code style warnings: WARNING: suspect code indent for conditional statements (8, 17) #2901: FILE: net/core/pktgen.c:2901: + } else { + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a54683d9d44ae..15b4079553a90 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2900,7 +2900,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */
From 870b856cb478bc02fffe4d89897e62c692efb09a Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:43 +0200 Subject: [PATCH 102/770] net: pktgen: fix code style (WARNING: Block comments) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Fix checkpatch code style warnings: WARNING: Block comments use a trailing */ on a separate line + * removal by worker thread */ WARNING: Block comments use * on subsequent lines + __u8 tos; /* six MSB of (former) IPv4 TOS + are for dscp codepoint */ WARNING: Block comments use a trailing */ on a separate line + are for dscp codepoint */ WARNING: Block comments use * on subsequent lines + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + (see RFC 3260, sec. 4) */ WARNING: Block comments use a trailing */ on a separate line + (see RFC 3260, sec.
4) */ WARNING: Block comments use * on subsequent lines + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, WARNING: Block comments use * on subsequent lines + /* Field for thread to receive "posted" events terminate, + stop ifs etc. */ WARNING: Block comments use a trailing */ on a separate line + stop ifs etc. */ WARNING: Block comments should align the * on each line + * we go look for it ... +*/ WARNING: Block comments use a trailing */ on a separate line + * we resolve the dst issue */ WARNING: Block comments use a trailing */ on a separate line + * with proc_create_data() */ Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 15b4079553a90..727ea41cae6e1 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -283,7 +283,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -346,10 +347,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -389,12 +392,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -458,7 +461,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -2397,7 +2401,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... -*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { @@ -2694,7 +2698,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until - * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -3788,7 +3793,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. 
The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3983,7 +3989,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ From ca8ee665211ac64d38caa553cc04527b22cb25d2 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:44 +0200 Subject: [PATCH 103/770] net: pktgen: fix code style (WARNING: Missing a blank line after declarations) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix checkpatch code style warnings: WARNING: Missing a blank line after declarations #761: FILE: net/core/pktgen.c:761: + char c; + if (get_user(c, &user_buffer[i])) WARNING: Missing a blank line after declarations #780: FILE: net/core/pktgen.c:780: + char c; + if (get_user(c, &user_buffer[i])) WARNING: Missing a blank line after declarations #806: FILE: net/core/pktgen.c:806: + char c; + if (get_user(c, &user_buffer[i])) WARNING: Missing a blank line after declarations #823: FILE: net/core/pktgen.c:823: + char c; + if (get_user(c, &user_buffer[i])) WARNING: Missing a blank line after declarations #1968: FILE: net/core/pktgen.c:1968: + char f[32]; + memset(f, 0, 32); WARNING: Missing a blank line after declarations #2410: FILE: net/core/pktgen.c:2410: + struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { WARNING: Missing a blank line after declarations #2442: FILE: net/core/pktgen.c:2442: + __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { WARNING: Missing a blank line after declarations #2523: FILE: net/core/pktgen.c:2523: + unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) WARNING: Missing a blank line after declarations #2567: FILE: net/core/pktgen.c:2567: + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) WARNING: Missing a blank line after declarations #2587: FILE: net/core/pktgen.c:2587: + __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { WARNING: Missing a blank line after declarations #2634: FILE: net/core/pktgen.c:2634: + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { WARNING: Missing a blank line after declarations #2736: FILE: net/core/pktgen.c:2736: + int i; + for (i = 0; i < pkt_dev->cflows; i++) { WARNING: Missing a blank line after declarations #2738: FILE: net/core/pktgen.c:2738: + struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { WARNING: Missing a blank line after declarations #2752: FILE: net/core/pktgen.c:2752: + int nhead = 0; + if (x) { WARNING: Missing a blank line after declarations #2795: FILE: net/core/pktgen.c:2795: + unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) WARNING: Missing a blank line after declarations #3480: FILE: net/core/pktgen.c:3480: + ktime_t idle_start = ktime_get(); + schedule(); Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 727ea41cae6e1..b745fae0939e2 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -758,6 +758,7 @@ static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, for (; i < maxlen; i++) { int value; char c; + if 
(get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); @@ -777,6 +778,7 @@ static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -803,6 +805,7 @@ static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -820,6 +823,7 @@ static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -1966,6 +1970,7 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); max = min(sizeof(f) - 1, count - i); len = strn_len(&user_buffer[i], max); @@ -2408,6 +2413,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) #ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { @@ -2440,6 +2446,7 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2521,6 +2528,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2565,6 +2573,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2585,6 +2594,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2632,6 +2642,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2734,8 +2745,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2750,6 +2763,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2793,6 +2807,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -3478,6 +3493,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } From dceae3e82ff8222f91f49ec26d5f41027ce0562c Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:45 +0200 Subject: [PATCH 104/770] net: pktgen: fix code style (WARNING: macros should not 
use a trailing semicolon) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix checkpatch code style warnings: WARNING: macros should not use a trailing semicolon #180: FILE: net/core/pktgen.c:180: +#define func_enter() pr_debug("entering %s\n", __func__); WARNING: macros should not use a trailing semicolon #234: FILE: net/core/pktgen.c:234: +#define if_lock(t) mutex_lock(&(t->if_lock)); CHECK: Unnecessary parentheses around t->if_lock #235: FILE: net/core/pktgen.c:235: +#define if_unlock(t) mutex_unlock(&(t->if_lock)); Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b745fae0939e2..d26c0588dd8ab 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -231,8 +231,8 @@ static char *pkt_flag_names[] = { #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 From 08fcb1f242b9e7c73d3a3372890768fe28ef16a1 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 10 Apr 2025 09:17:47 +0200 Subject: [PATCH 105/770] net: pktgen: fix code style (WARNING: quoted string split across lines) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix checkpatch code style warnings: WARNING: quoted string split across lines #480: FILE: net/core/pktgen.c:480: + "Packet Generator for packet performance testing. " + "Version: " VERSION "\n"; WARNING: quoted string split across lines #632: FILE: net/core/pktgen.c:632: + " udp_src_min: %d udp_src_max: %d" + " udp_dst_min: %d udp_dst_max: %d\n", Signed-off-by: Peter Seiderer Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- net/core/pktgen.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d26c0588dd8ab..fa30ff0f24645 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -476,8 +476,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. 
Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -628,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); From 1450e4525f9a719a674f0b0c3bbe7c290cf933d4 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 11 Apr 2025 18:17:36 +0800 Subject: [PATCH 106/770] bna: bnad_dim_timeout: Rename del_timer_sync in comment Commit 8fa7292fee5c ("treewide: Switch/rename to timer_delete[_sync]()") switched del_timer_sync to timer_delete_sync, but did not modify the comment for bnad_dim_timeout(). Now fix it. Signed-off-by: WangYuli Reviewed-by: Simon Horman Link: https://patch.msgid.link/61DDCE7AB5B6CE82+20250411101736.160981-1-wangyuli@uniontech.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/brocade/bna/bnad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index a03eff3d44259..50eb54ecf1bad 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -1735,7 +1735,7 @@ bnad_iocpf_sem_timeout(struct timer_list *t) * Time CPU m CPU n * 0 1 = test_bit * 1 clear_bit - * 2 del_timer_sync + * 2 timer_delete_sync * 3 mod_timer */ From e846fb5e7c5243c65ff67247cb29a9d76bbcc4e8 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:16 -0400 Subject: [PATCH 107/770] net: bridge: mcast: Add offload failed mdb flag Add MDB_FLAGS_OFFLOAD_FAILED and MDB_PG_FLAGS_OFFLOAD_FAILED to indicate that an attempt to offload the MDB entry to switchdev has failed. 
Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-2-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 9 +++++---- net/bridge/br_mdb.c | 2 ++ net/bridge/br_private.h | 20 +++++++++++++++----- net/bridge/br_switchdev.c | 9 +++++---- 4 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a5b743a2f7750..f2a6de424f3f9 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -699,10 +699,11 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; -#define MDB_FLAGS_OFFLOAD (1 << 0) -#define MDB_FLAGS_FAST_LEAVE (1 << 1) -#define MDB_FLAGS_STAR_EXCL (1 << 2) -#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD (1 << 0) +#define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) +#define MDB_FLAGS_OFFLOAD_FAILED (1 << 4) __u8 flags; __u16 vid; struct { diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 722203b98ff74..aac96bf4ba44a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; + if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) + e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d5b3c5936a79e..d9b8ed316040d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc { u16 vlan_id; }; -#define MDB_PG_FLAGS_PERMANENT BIT(0) -#define MDB_PG_FLAGS_OFFLOAD BIT(1) -#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) -#define MDB_PG_FLAGS_STAR_EXCL BIT(3) -#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5) #define PG_SRC_ENT_LIMIT 32 @@ -1342,6 +1343,15 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } + +static inline void +br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, + bool offloaded) +{ + p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); + p->flags |= (offloaded ? 
MDB_PG_FLAGS_OFFLOAD : + MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7b41ee8740cbb..2d769ef3cb8ac 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -505,8 +505,8 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; - if (err) - goto err; + if (err == -EOPNOTSUPP) + goto out_free; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); @@ -516,11 +516,12 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri pp = &p->next) { if (p->key.port != port) continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; + + br_multicast_set_pg_offload_flags(p, !err); } out: spin_unlock_bh(&br->multicast_lock); -err: +out_free: kfree(priv); } From 9fbe1e3e61c21508861a72324087aeeea85f796f Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:17 -0400 Subject: [PATCH 108/770] net: bridge: Add offload_fail_notification bopt Add BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION bool option. Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-3-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br.c | 5 +++++ net/bridge/br_private.h | 1 + 3 files changed, 7 insertions(+) diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f2a6de424f3f9..73876c0e2bba6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -831,6 +831,7 @@ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, + BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index 183fcb362f9e6..25dda554ca5b3 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -284,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -302,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d9b8ed316040d..e17a3c5fe6892 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -484,6 +484,7 @@ enum net_bridge_opts { BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, + BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, }; struct net_bridge { From c428d43d4f56efd291d010c2a43f031e56978519 Mon Sep 17 00:00:00 2001 From: Joseph Huang Date: Fri, 11 Apr 2025 11:03:18 -0400 Subject: [PATCH 109/770] net: bridge: mcast: Notify on mdb offload failure Notify user space on mdb offload failure if mdb_offload_fail_notification is enabled. 
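Condensed from the hunks below (comments added here for illustration), the switchdev completion path becomes:

    old_flags = p->flags;
    br_multicast_set_pg_offload_flags(p, !err);    /* OFFLOAD on success,
                                                    * OFFLOAD_FAILED on error */
    if (br_mdb_should_notify(br, old_flags ^ p->flags))
            br_mdb_flag_change_notify(br->dev, mp, p);    /* RTM_NEWMDB, without
                                                           * re-driving switchdev */

The XOR isolates the bits that actually toggled, so user space is only notified when the OFFLOAD_FAILED state changes, not on every completion.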
Signed-off-by: Joseph Huang Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250411150323.1117797-4-Joseph.Huang@garmin.com Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 26 +++++++++++++++++++++----- net/bridge/br_private.h | 9 +++++++++ net/bridge/br_switchdev.c | 4 ++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index aac96bf4ba44a..400eb872b4032 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -519,16 +519,17 @@ static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) rtnl_mdb_nlmsg_pg_size(pg); } -void br_mdb_notify(struct net_device *dev, - struct net_bridge_mdb_entry *mp, - struct net_bridge_port_group *pg, - int type) +static void __br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - br_switchdev_mdb_notify(dev, mp, pg, type); + if (notify_switchdev) + br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) @@ -546,6 +547,21 @@ void br_mdb_notify(struct net_device *dev, rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +void br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ + __br_mdb_notify(dev, mp, pg, type, true); +} + +void br_mdb_flag_change_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg) +{ + __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); +} + static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e17a3c5fe6892..71f351a6ce1bb 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1004,6 +1004,8 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); +void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, @@ -1353,6 +1355,13 @@ br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, p->flags |= (offloaded ? 
MDB_PG_FLAGS_OFFLOAD : MDB_PG_FLAGS_OFFLOAD_FAILED); } + +static inline bool +br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags) +{ + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) && + (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 2d769ef3cb8ac..95d7355a04074 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -504,6 +504,7 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; + u8 old_flags; if (err == -EOPNOTSUPP) goto out_free; @@ -517,7 +518,10 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri if (p->key.port != port) continue; + old_flags = p->flags; br_multicast_set_pg_offload_flags(p, !err); + if (br_mdb_should_notify(br, old_flags ^ p->flags)) + br_mdb_flag_change_notify(br->dev, mp, p); } out: spin_unlock_bh(&br->multicast_lock); From f9c1120d9b5e65ded82dccb7ebf22bc9739419b0 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 11 Apr 2025 11:39:16 +0530 Subject: [PATCH 110/770] dt-bindings: net: ethernet-controller: add 5000M speed to fixed-link A link speed of 5000 Mbps is a valid speed for a fixed-link mode of operation. Hence, update the bindings to include the same. Signed-off-by: Siddharth Vadapalli Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250411060917.633769-2-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/ethernet-controller.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml index 45819b2358002..61e51ea58a985 100644 --- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml @@ -197,7 +197,7 @@ properties: description: Link speed. $ref: /schemas/types.yaml#/definitions/uint32 - enum: [10, 100, 1000, 2500, 10000] + enum: [10, 100, 1000, 2500, 5000, 10000] full-duplex: $ref: /schemas/types.yaml#/definitions/flag From 8b36a102c1a17d8dabc846e3ded41fe17e9ad4e2 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Fri, 11 Apr 2025 11:39:17 +0530 Subject: [PATCH 111/770] dt-bindings: net: ti: k3-am654-cpsw-nuss: evaluate fixed-link property Since the fixed-link (phyless) mode of operation is supported by the CPSW MAC, include "fixed-link" in the set of properties to be evaluated. 
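As a hypothetical example (node label and values illustrative, not taken from a real board file), the two binding changes in this series permit a PHY-less port node such as:

    &cpsw0_port1 {
            fixed-link {
                    speed = <5000>;    /* 5000 newly accepted by the schema */
                    full-duplex;
            };
    };

Whether a given MAC actually supports a particular fixed-link speed remains a property of the hardware; the schema merely stops rejecting the description.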
Signed-off-by: Siddharth Vadapalli Link: https://patch.msgid.link/20250411060917.633769-3-s-vadapalli@ti.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml index b11894fbaec47..7b3d948f187df 100644 --- a/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml +++ b/Documentation/devicetree/bindings/net/ti,k3-am654-cpsw-nuss.yaml @@ -143,6 +143,8 @@ properties: label: description: label associated with this port + fixed-link: true + ti,mac-only: $ref: /schemas/types.yaml#/definitions/flag description: From cd3c93167da0e760b5819246eae7a4ea30fd014b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:36 +0200 Subject: [PATCH 112/770] page_pool: Move pp_magic check into helper functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we are about to stash some more information into the pp_magic field, let's move the magic signature checks into a pair of helper functions so it can be changed in one place. Reviewed-by: Mina Almasry Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-1-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/xdp.c | 4 ++-- include/linux/mm.h | 20 +++++++++++++++++++ mm/page_alloc.c | 8 ++------ net/core/netmem_priv.h | 5 +++++ net/core/skbuff.c | 16 ++------------- net/core/xdp.c | 4 ++-- 6 files changed, 33 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index f803e1c935900..5ce1b463b7a8d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -707,8 +707,8 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as we know this is a page_pool page. + /* No need to check page_pool_page_is_pp() as we + * know this is a page_pool page. */ page_pool_recycle_direct(page->pp, page); } while (++n < num); diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954b..56c47f4a38ca4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,4 +4248,24 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page. + * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling + * the pfmemalloc page. 
+ */ +#define PP_MAGIC_MASK ~0x3UL + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(struct page *page) +{ + return false; +} +#endif + #endif /* _LINUX_MM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd6b865cb1abf..a18340b322183 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -897,9 +897,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif -#ifdef CONFIG_PAGE_POOL - ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | -#endif + page_pool_page_is_pp(page) | (page->flags & check_flags))) return false; @@ -926,10 +924,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif -#ifdef CONFIG_PAGE_POOL - if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + if (unlikely(page_pool_page_is_pp(page))) bad_reason = "page_pool leak"; -#endif return bad_reason; } diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e002..f33162fd281c2 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -18,6 +18,11 @@ static inline void netmem_clear_pp_magic(netmem_ref netmem) __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cbf77bc61fce..74a2d886a35b5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -893,11 +893,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -995,14 +990,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. 
- */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1042,7 +1030,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); diff --git a/net/core/xdp.c b/net/core/xdp.c index 3cd0db9c9d2d3..a014df88620cb 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -438,8 +438,8 @@ void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, netmem = netmem_compound_head(netmem); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) - * as mem->type knows this a page_pool page + /* No need to check netmem_is_pp() as mem->type knows this a + * page_pool page */ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi_direct); From ee62ce7a1d909ccba0399680a03c2dee83bcae95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 12:41:37 +0200 Subject: [PATCH 113/770] page_pool: Track DMA-mapped pages and unmap them when destroying the pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When enabling DMA mapping in page_pool, pages are kept DMA mapped until they are released from the pool, to avoid the overhead of re-mapping the pages every time they are used. This causes resource leaks and/or crashes when there are pages still outstanding while the device is torn down, because page_pool will attempt an unmap through a non-existent DMA device on the subsequent page return. To fix this, implement a simple tracking of outstanding DMA-mapped pages in page pool using an xarray. This was first suggested by Mina[0], and turns out to be fairly straightforward: We simply store pointers to pages directly in the xarray with xa_alloc() when they are first DMA mapped, and remove them from the array on unmap. Then, when a page pool is torn down, it can simply walk the xarray and unmap all pages still present there before returning, which also allows us to get rid of the get/put_device() calls in page_pool. Using xa_cmpxchg(), no additional synchronisation is needed, as a page will only ever be unmapped once. To avoid having to walk the entire xarray on unmap to find the page reference, we stash the ID assigned by xa_alloc() into the page structure itself, using the upper bits of the pp_magic field. This requires a couple of defines to avoid conflicting with the POISON_POINTER_DELTA define, but this is all evaluated at compile-time, so does not affect run-time performance. The bitmap calculations in this patch give the following number of bits for different architectures: - 23 bits on 32-bit architectures - 21 bits on PPC64 (because of the definition of ILLEGAL_POINTER_VALUE) - 32 bits on other 64-bit architectures Stashing a value into the unused bits of pp_magic does have the effect that it can make the value stored there lie outside the unmappable range (as governed by the mmap_min_addr sysctl), for architectures that don't define ILLEGAL_POINTER_VALUE.
This means that if one of the pointers that is aliased to the pp_magic field (such as page->lru.next) is dereferenced while the page is owned by page_pool, that could lead to a dereference into userspace, which is a security concern. The risk of this is mitigated by the fact that (a) we always clear pp_magic before releasing a page from page_pool, and (b) this would need a use-after-free bug for struct page, which can have many other risks since page->lru.next is used as a generic list pointer in multiple places in the kernel. As such, with this patch we take the position that this risk is negligible in practice. For more discussion, see[1]. Since all the tracking added in this patch is performed on DMA map/unmap, no additional code is needed in the fast path, meaning the performance overhead of this tracking is negligible there. A micro-benchmark shows that the total overhead of the tracking itself is about 400 ns (39 cycles(tsc) 395.218 ns; sum for both map and unmap[2]). Since this cost is only paid on DMA map and unmap, it seems like an acceptable cost to fix the late unmap issue. Further optimisation can narrow the cases where this cost is paid (for instance by eliding the tracking when DMA map/unmap is a no-op). The extra memory needed to track the pages is neatly encapsulated inside xarray, which uses the 'struct xa_node' structure to track items. This structure is 576 bytes long, with slots for 64 items, meaning that a full node occupies only 9 bytes of overhead per slot it tracks (in practice, it probably won't be this efficient, but in any case it should be an acceptable overhead). [0] https://lore.kernel.org/all/CAHS8izPg7B5DwKfSuzz-iOop_YRbk3Sd6Y4rX7KBG9DcVJcyWg@mail.gmail.com/ [1] https://lore.kernel.org/r/20250320023202.GA25514@openwall.com [2] https://lore.kernel.org/r/ae07144c-9295-4c9d-a400-153bb689fe9e@huawei.com Reported-by: Yonglong Liu Closes: https://lore.kernel.org/r/8743264a-9700-4227-a556-5f931c720211@huawei.com Fixes: ff7d6b27f894 ("page_pool: refurbish version of page_pool code") Suggested-by: Mina Almasry Reviewed-by: Mina Almasry Reviewed-by: Jesper Dangaard Brouer Tested-by: Jesper Dangaard Brouer Tested-by: Qiuling Ren Tested-by: Yuying Ma Tested-by: Yonglong Liu Acked-by: Jesper Dangaard Brouer Signed-off-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409-page-pool-track-dma-v9-2-6a9ef2e0cba8@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mm.h | 46 ++++++++++++++++++-- include/linux/poison.h | 4 ++ include/net/page_pool/types.h | 6 +++ net/core/netmem_priv.h | 28 +++++++++++- net/core/page_pool.c | 81 +++++++++++++++++++++++++++++------ 5 files changed, 147 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 56c47f4a38ca4..130d3c9d2ee45 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4248,13 +4248,51 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define VM_SEALED_SYSMAP VM_NONE #endif +/* + * DMA mapping IDs for page_pool + * + * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot + * mistake a valid kernel pointer with any of the values we write into this + * field.
+ * + * On architectures that set POISON_POINTER_DELTA, this is already ensured, + * since this value becomes part of PP_SIGNATURE; meaning we can just use the + * space between the PP_SIGNATURE value (without POISON_POINTER_DELTA), and the + * lowest bits of POISON_POINTER_DELTA. On arches where POISON_POINTER_DELTA is + * 0, we make sure that we leave the two topmost bits empty, as that guarantees + * we won't mistake a valid kernel pointer for a value we set, regardless of the + * VMSPLIT setting. + * + * Altogether, this means that the number of bits available is constrained by + * the size of an unsigned long (at the upper end, subtracting two bits per the + * above), and the definition of PP_SIGNATURE (with or without + * POISON_POINTER_DELTA). + */ +#define PP_DMA_INDEX_SHIFT (1 + __fls(PP_SIGNATURE - POISON_POINTER_DELTA)) +#if POISON_POINTER_DELTA > 0 +/* PP_SIGNATURE includes POISON_POINTER_DELTA, so limit the size of the DMA + * index to not overlap with that if set + */ +#define PP_DMA_INDEX_BITS MIN(32, __ffs(POISON_POINTER_DELTA) - PP_DMA_INDEX_SHIFT) +#else +/* Always leave out the topmost two; see above. */ +#define PP_DMA_INDEX_BITS MIN(32, BITS_PER_LONG - PP_DMA_INDEX_SHIFT - 2) +#endif + +#define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ + PP_DMA_INDEX_SHIFT) + /* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for - * the head page of compound page and bit 1 for pfmemalloc page. - * page_is_pfmemalloc() is checked in __page_pool_put_page() to avoid recycling - * the pfmemalloc page. + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. 
*/ -#define PP_MAGIC_MASK ~0x3UL +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL static inline bool page_pool_page_is_pp(struct page *page) diff --git a/include/linux/poison.h b/include/linux/poison.h index 331a9a996fa87..8ca2235f78d5d 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -70,6 +70,10 @@ #define KEY_DESTROY 0xbd /********** net/core/page_pool.c **********/ +/* + * page_pool uses additional free bits within this value to store data, see the + * definition of PP_DMA_INDEX_MASK in mm.h + */ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) /********** net/core/skbuff.c **********/ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 36eb57d73abc6..431b593de7093 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -221,6 +225,8 @@ struct page_pool { void *mp_priv; const struct memory_provider_ops *mp_ops; + struct xarray dma_mapped; + #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ struct page_pool_recycle_stats __percpu *recycle_stats; diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index f33162fd281c2..cd95394399b40 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,6 +15,8 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } @@ -33,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 7745ad924ae2d..2b76848659418 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -276,8 +276,7 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); @@ -320,9 +319,7 @@ static int page_pool_init(struct page_pool 
*pool, static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -463,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -483,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -508,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -554,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -656,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -664,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ 
-671,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1080,8 +1114,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. From ceaceaf79ea0fe337344fc5c1fb10a421a362410 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 11 Apr 2025 23:04:19 +0100 Subject: [PATCH 114/770] net: ethtool: fix get_ts_stats() documentation Commit 0e9c127729be ("ethtool: add interface to read Tx hardware timestamping statistics") added documentation for timestamping statistics, but added the detailed explanation for this method to the get_ts_info() rather than get_ts_stats(). Move it to the correct entry. Cc: Rahul Rameshbabu Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3MTz-000Crx-IW@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 8210ece94fa6b..013d258586420 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -926,10 +926,11 @@ struct kernel_ethtool_ts_info { * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to - * ethtool_op_get_ts_info(). Drivers must not zero statistics which they - * don't report. The stats structure is initialized to ETHTOOL_STAT_NOT_SET - * indicating driver does not report statistics. - * @get_ts_stats: Query the device hardware timestamping statistics. + * ethtool_op_get_ts_info(). + * @get_ts_stats: Query the device hardware timestamping statistics. Drivers + * must not zero statistics which they don't report. The stats structure + * is initialized to ETHTOOL_STAT_NOT_SET indicating driver does not + * report statistics. * @get_module_info: Get the size and type of the eeprom contained within * a plug-in module. * @get_module_eeprom: Get the eeprom information from the plug-in module From e333b1c3cf25fca348422c9ad5cc8db40b4243fa Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:30 -0700 Subject: [PATCH 115/770] net: Factorise setup_net() and cleanup_net(). When we roll back the changes made by struct pernet_operations.init(), we execute mostly identical sequences in three places. 
* setup_net() * cleanup_net() * free_exit_list() The only difference between the first two is which list and RCU helpers to use. In setup_net(), an ops could fail on the way, so we need to perform a reverse walk from its previous ops in pernet_list. OTOH, in cleanup_net(), we iterate the full list from tail to head. The former passes the failed ops to list_for_each_entry_continue_reverse(). It's tricky, but we can reuse it for the latter if we pass list_entry() of the head node. Also, synchronize_rcu() and synchronize_rcu_expedited() can be easily switched by an argument. Let's factorise the rollback part in setup_net() and cleanup_net(). In the next patch, ops_undo_list() will be reused for free_exit_list(), and then two arguments (ops_list and hold_rtnl) will differ. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 106 +++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b0dfdf791ece5..2612339efd714 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -188,6 +188,53 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu, bool hold_rtnl) +{ + const struct pernet_operations *saved_ops; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_pre_exit_list(ops, net_exit_list); + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. + */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) { + LIST_HEAD(dev_kill_list); + + ops = saved_ops; + rtnl_lock(); + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -351,9 +398,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; + const struct pernet_operations *ops; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); int error = 0; preempt_disable(); @@ -376,29 +422,7 @@ static __net_init int setup_net(struct net *net) * for the pernet modules whose init functions did not fail. 
*/ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false, true); rcu_barrier(); goto out; } @@ -594,11 +618,9 @@ struct task_struct *cleanup_net_task; static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); cleanup_net_task = current; @@ -629,33 +651,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. - */ - synchronize_rcu_expedited(); - - rtnl_lock(); - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true, true); up_read(&pernet_ops_rwsem); From fed176bf3143362ac9935e3964949ab6a5c3286b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:31 -0700 Subject: [PATCH 116/770] net: Add ops_undo_single for module load/unload. If ops_init() fails while loading a module or we unload the module, free_exit_list() rolls back the changes. The rollback sequence is the same as ops_undo_list(). The ops is already removed from pernet_list before calling free_exit_list(). If we link the ops to a temporary list, we can reuse ops_undo_list(). Let's add a wrapper of ops_undo_list() and use it instead of free_exit_list(). Now, we have the central place to roll back ops_init(). 
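For reference, the resulting wrapper (reproduced from the hunk below, with comments added here) is just the temporary-list trick described above:

    static void ops_undo_single(struct pernet_operations *ops,
                                struct list_head *net_exit_list)
    {
            bool hold_rtnl = !!ops->exit_batch_rtnl;
            LIST_HEAD(ops_list);                    /* stack-local scratch list */

            list_add(&ops->list, &ops_list);        /* ops is the only entry */
            ops_undo_list(&ops_list, NULL, net_exit_list, false, hold_rtnl);
            list_del(&ops->list);                   /* detach before returning */
    }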
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 54 ++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2612339efd714..37026776ae4e9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -235,6 +235,17 @@ static void ops_undo_list(const struct list_head *ops_list, ops_free_list(ops, net_exit_list); } +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + bool hold_rtnl = !!ops->exit_batch_rtnl; + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false, hold_rtnl); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -1235,31 +1246,13 @@ void __init net_ns_init(void) rtnl_register_many(net_ns_rtnl_msg_handlers); } -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - - if (ops->exit_batch_rtnl) { - LIST_HEAD(dev_kill_list); - - rtnl_lock(); - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - ops_exit_list(ops, net_exit_list); - - ops_free_list(ops, net_exit_list); -} - #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || ops->id) { @@ -1278,21 +1271,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1300,22 +1293,23 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - if (!init_net_initialized) { - list_add_tail(&ops->list, list); + list_add_tail(&ops->list, list); + + if (!init_net_initialized) return 0; - } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - if (!init_net_initialized) { - list_del(&ops->list); - } else { + list_del(&ops->list); + + if (init_net_initialized) { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } From 7a60d91c690bf73c2c78e763efa29f294e217c3a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:32 -0700 Subject: [PATCH 117/770] net: Add ->exit_rtnl() hook to struct pernet_operations. struct pernet_operations provides two batching hooks; ->exit_batch() and ->exit_batch_rtnl(). 
The batching variant is beneficial if ->exit() meets any of the following conditions: 1) ->exit() repeatedly acquires a global lock for each netns 2) ->exit() has a time-consuming operation that can be factored out (e.g. synchronize_rcu(), smp_mb(), etc) 3) ->exit() does not need to repeat the same iterations for each netns (e.g. inet_twsk_purge()) Currently, none of the ->exit_batch_rtnl() functions satisfy any of the above conditions because RTNL is factored out and held by the caller and all of these functions iterate over the dying netns list. Also, we want to hold per-netns RTNL there but avoid spreading __rtnl_net_lock() across multiple locations. Let's add ->exit_rtnl() hook and run it under __rtnl_net_lock(). The following patches will convert all ->exit_batch_rtnl() users to ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 ++ net/core/net_namespace.c | 53 +++++++++++++++++++++++++++---------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index bd57d8fb54f14..b071e6eed9d52 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -475,6 +475,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); void (*exit_batch_rtnl)(struct list_head *net_exit_list, struct list_head *dev_kill_list); unsigned int * const id; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 37026776ae4e9..afaa3d1bda8d5 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -163,16 +163,51 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -213,18 +248,8 @@ static void ops_undo_list(const struct list_head *ops_list, else synchronize_rcu(); - if (hold_rtnl) { - LIST_HEAD(dev_kill_list); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, ops_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); ops = saved_ops; list_for_each_entry_continue_reverse(ops, ops_list, 
list) @@ -238,7 +263,7 @@ static void ops_undo_list(const struct list_head *ops_list, static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { - bool hold_rtnl = !!ops->exit_batch_rtnl; + bool hold_rtnl = ops->exit_rtnl || ops->exit_batch_rtnl; LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); From cf701038d1c87e6c54d8a45c738b56ab27a9e6c3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:33 -0700 Subject: [PATCH 118/770] nexthop: Convert nexthop_net_exit_batch_rtnl() to ->exit_rtnl(). nexthop_net_exit_batch_rtnl() iterates the dying netns list and performs the same operation for each. Let's use ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 467151517023d..d9cf06b297d11 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -4040,14 +4040,11 @@ void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit nexthop_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - flush_all_nexthops(net); + ASSERT_RTNL_NET(net); + flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) @@ -4072,7 +4069,7 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, - .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, + .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { From 6f2667b98ef2ea81aad6761409604480d14c9095 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:34 -0700 Subject: [PATCH 119/770] vxlan: Convert vxlan_exit_batch_rtnl() to ->exit_rtnl(). vxlan_exit_batch_rtnl() iterates the dying netns list and performs the same operations for each. Let's use ->exit_rtnl(). 
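For context, the handlers converted in these patches are invoked from the core loop added earlier in the series; condensed from the ops_exit_rtnl_list() hunk above:

    rtnl_lock();
    list_for_each_entry(net, net_exit_list, exit_list) {
            __rtnl_net_lock(net);
            /* each ops->exit_rtnl(net, &dev_kill_list) runs here */
            __rtnl_net_unlock(net);
    }
    /* ... then the remaining ->exit_batch_rtnl() handlers ... */
    unregister_netdevice_many(&dev_kill_list);
    rtnl_unlock();

This is why each converted handler can tighten its assertion from ASSERT_RTNL() to ASSERT_RTNL_NET(net).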
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 9ccc3f09f71b8..56aee539c235f 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -4966,19 +4966,15 @@ static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn, vxlan_dellink(vxlan->dev, dev_to_kill); } -static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit vxlan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn = net_generic(net, vxlan_net_id); - __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); + ASSERT_RTNL_NET(net); - vxlan_destroy_tunnels(vn, dev_to_kill); - } + __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); + vxlan_destroy_tunnels(vn, dev_to_kill); } static void __net_exit vxlan_exit_net(struct net *net) @@ -4992,7 +4988,7 @@ static void __net_exit vxlan_exit_net(struct net *net) static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, - .exit_batch_rtnl = vxlan_exit_batch_rtnl, + .exit_rtnl = vxlan_exit_rtnl, .exit = vxlan_exit_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), From a967e01e2ad201f6ddc778ed65a5dae1c68ee8a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:35 -0700 Subject: [PATCH 120/770] ipv4: ip_tunnel: Convert ip_tunnel_delete_nets() callers to ->exit_rtnl(). ip_tunnel_delete_nets() iterates the dying netns list and performs the same operations for each. Let's export ip_tunnel_destroy() as ip_tunnel_delete_net() and call it from ->exit_rtnl(). 
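With the helper renamed and exported, each remaining per-flavour ->exit_rtnl() in this patch collapses to a single call; the GRE variant from the hunk below, for example:

    static void __net_exit ipgre_exit_rtnl(struct net *net,
                                           struct list_head *dev_to_kill)
    {
            ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill);
    }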
Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 7 +++---- net/ipv4/ip_gre.c | 27 ++++++++++++--------------- net/ipv4/ip_tunnel.c | 25 +++++++------------------ net/ipv4/ip_vti.c | 9 ++++----- net/ipv4/ipip.c | 9 ++++----- 5 files changed, 30 insertions(+), 47 deletions(-) diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index a36a335cef9fb..0c3d571a04a16 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -377,10 +377,9 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 26d15f907551e..f5b9004d6938b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1066,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch_rtnl = ipgre_exit_batch_rtnl, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1752,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, - dev_to_kill); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1772,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch_rtnl = erspan_exit_batch_rtnl, + .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff 
--git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 1024f961ec9ad..3913ec89ad207 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1174,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, - struct list_head *head, - struct rtnl_link_ops *ops) +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *head) { + struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; + ASSERT_RTNL_NET(net); + for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); @@ -1198,21 +1201,7 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, unregister_netdevice_queue(t->dev, head); } } - -void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill) -{ - struct ip_tunnel_net *itn; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, dev_to_kill, ops); - } -} -EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 159b4473290e0..686e4f3d83aa7 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit vti_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch_rtnl = vti_exit_batch_rtnl, + .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index bab0bf90c908d..3e03af073a1cc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -604,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipip_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch_rtnl = ipip_exit_batch_rtnl, + .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; From f76758f18fb8677464a71cdcdd1920ffe6d59d85 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:36 -0700 Subject: [PATCH 121/770] ipv6: Convert tunnel devices' ->exit_batch_rtnl() to ->exit_rtnl(). The following functions iterate the dying netns list and perform the same operations for each netns. * ip6gre_exit_batch_rtnl() * ip6_tnl_exit_batch_rtnl() * vti6_exit_batch_rtnl() * sit_exit_batch_rtnl() Let's use ->exit_rtnl(). 
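Besides dropping the per-netns loop, the hunks below also switch the tunnel hash walks from rtnl_dereference() to its per-netns variant, e.g.:

	t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);

To the best of my reading of the per-netns RTNL infrastructure (nothing introduced by this patch), rtnl_net_dereference() validates the access against the RTNL instance that ->exit_rtnl() is actually called under, accepting the netns-scoped mutex where plain rtnl_dereference() insists on the global lock.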
Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_gre.c | 22 ++++++---------------- net/ipv6/ip6_tunnel.c | 24 ++++++++---------------- net/ipv6/ip6_vti.c | 27 +++++++-------------------- net/ipv6/sit.c | 23 ++++++----------------- 4 files changed, 27 insertions(+), 69 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 957ca98fa70f9..2dc9dcffe2cab 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1570,7 +1570,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1587,16 +1587,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1640,19 +1640,9 @@ static int __net_init ip6gre_init_net(struct net *net) return err; } -static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch_rtnl = ip6gre_exit_batch_rtnl, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index a04dd1bb4b195..894d3158a6f05 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -2210,7 +2210,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2222,25 +2222,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2287,19 +2289,9 @@ static int __net_init ip6_tnl_init_net(struct net *net) return err; } -static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 09ec4b0ad7dc6..40464a88bca65 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -1112,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1170,22 +1170,9 @@ static int __net_init vti6_init_net(struct net *net) return err; } -static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct vti6_net *ip6n; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, dev_to_kill); - } -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch_rtnl = vti6_exit_batch_rtnl, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 9a0f32acb7500..a72dbca9e8fca 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1804,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1820,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. 
*/ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1881,19 +1880,9 @@ static int __net_init sit_init_net(struct net *net) return err; } -static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch_rtnl = sit_exit_batch_rtnl, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; From 9571ab5a98fefa2698f52adfb2823eaac5ca8ce2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:37 -0700 Subject: [PATCH 122/770] xfrm: Convert xfrmi_exit_batch_rtnl() to ->exit_rtnl(). xfrmi_exit_batch_rtnl() iterates the dying netns list and performs the same operations for each. Let's use ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/xfrm/xfrm_interface_core.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 622445f041d32..cb1e12740c87b 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -952,32 +952,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, - struct list_head *dev_to_kill) +static void __net_exit xfrmi_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; - ASSERT_RTNL(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, dev_to_kill); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_net_dereference(net, *xip)) != NULL; + xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } + + xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch_rtnl = xfrmi_exit_batch_rtnl, + .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; From b7924f50be1525543d261a209790c86240b0df97 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:38 -0700 Subject: [PATCH 123/770] bridge: Convert br_net_exit_batch_rtnl() to ->exit_rtnl(). br_net_exit_batch_rtnl() iterates the dying netns list and performs the same operation for each. Let's use ->exit_rtnl(). 
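One detail worth calling out in the conversion below: the lock assertion changes from ASSERT_RTNL() to ASSERT_RTNL_NET(net). As a rough sketch of the difference (hedged - these semantics come from the per-netns RTNL work, not from this patch):

	ASSERT_RTNL();		/* the global RTNL mutex must be held */
	ASSERT_RTNL_NET(net);	/* the global RTNL or, when per-netns RTNL
				 * is enabled, net's own RTNL mutex
				 */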
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/bridge/br.c b/net/bridge/br.c index 25dda554ca5b3..0adeafe11a365 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -368,21 +368,20 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit br_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct net_device *dev; - struct net *net; - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, dev_to_kill); + ASSERT_RTNL_NET(net); + + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch_rtnl = br_net_exit_batch_rtnl, + .exit_rtnl = br_net_exit_rtnl, }; static const struct stp_proto br_stp_proto = { From baf720334c02f07f434feb6b6977ecef20cd58a1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:39 -0700 Subject: [PATCH 124/770] bonding: Convert bond_net_exit_batch_rtnl() to ->exit_rtnl(). bond_net_exit_batch_rtnl() iterates the dying netns list and performs the same operation for each. Let's use ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Nikolay Aleksandrov Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 950d8e4d86f8b..bdd36409dd9b5 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -6558,7 +6558,7 @@ static int __net_init bond_net_init(struct net *net) /* According to commit 69b0216ac255 ("bonding: fix bonding_masters * race condition in bond unloading") we need to remove sysfs files - * before we remove our devices (done later in bond_net_exit_batch_rtnl()) + * before we remove our devices (done later in bond_net_exit_rtnl()) */ static void __net_exit bond_net_pre_exit(struct net *net) { @@ -6567,25 +6567,20 @@ static void __net_exit bond_net_pre_exit(struct net *net) bond_destroy_sysfs(bn); } -static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_kill_list) +static void __net_exit bond_net_exit_rtnl(struct net *net, + struct list_head *dev_kill_list) { - struct bond_net *bn; - struct net *net; + struct bond_net *bn = net_generic(net, bond_net_id); + struct bonding *bond, *tmp_bond; /* Kill off any bonds created after unregistering bond rtnl ops */ - list_for_each_entry(net, net_list, exit_list) { - struct bonding *bond, *tmp_bond; - - bn = net_generic(net, bond_net_id); - list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) - unregister_netdevice_queue(bond->dev, dev_kill_list); - } + list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) + unregister_netdevice_queue(bond->dev, dev_kill_list); } /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory * only after all bonds are 
gone") bond_destroy_proc_dir() is called - * after bond_net_exit_batch_rtnl() has completed. + * after bond_net_exit_rtnl() has completed. */ static void __net_exit bond_net_exit_batch(struct list_head *net_list) { @@ -6601,7 +6596,7 @@ static void __net_exit bond_net_exit_batch(struct list_head *net_list) static struct pernet_operations bond_net_ops = { .init = bond_net_init, .pre_exit = bond_net_pre_exit, - .exit_batch_rtnl = bond_net_exit_batch_rtnl, + .exit_rtnl = bond_net_exit_rtnl, .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), From bc7eaf7a40bba2172af3033fbece13af206c97fe Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:40 -0700 Subject: [PATCH 125/770] gtp: Convert gtp_net_exit_batch_rtnl() to ->exit_rtnl(). gtp_net_exit_batch_rtnl() iterates the dying netns list and performs the same operations for each. Let's use ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/gtp.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index ef793607890d9..d4dec741c7f44 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -2475,23 +2475,19 @@ static int __net_init gtp_net_init(struct net *net) return 0; } -static void __net_exit gtp_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit gtp_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - list_for_each_entry(net, net_list, exit_list) { - struct gtp_net *gn = net_generic(net, gtp_net_id); - struct gtp_dev *gtp, *gtp_next; + struct gtp_net *gn = net_generic(net, gtp_net_id); + struct gtp_dev *gtp, *gtp_next; - list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) - gtp_dellink(gtp->dev, dev_to_kill); - } + list_for_each_entry_safe(gtp, gtp_next, &gn->gtp_dev_list, list) + gtp_dellink(gtp->dev, dev_to_kill); } static struct pernet_operations gtp_net_ops = { .init = gtp_net_init, - .exit_batch_rtnl = gtp_net_exit_batch_rtnl, + .exit_rtnl = gtp_net_exit_rtnl, .id = >p_net_id, .size = sizeof(struct gtp_net), }; From fc3dc33f668c12285d82ad7a57b9dc3ca2922f8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:41 -0700 Subject: [PATCH 126/770] bareudp: Convert bareudp_exit_batch_rtnl() to ->exit_rtnl(). bareudp_exit_batch_rtnl() iterates the dying netns list and performs the same operation for each. Let's use ->exit_rtnl(). While at it, we replace unregister_netdevice_queue() with bareudp_dellink() for better cleanup. It unlinks the device from net_generic(net, bareudp_net_id)->bareudp_list, but there is no real issue as both the dev and the list are freed later. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-13-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index d1473c5f8eefc..a9dffdcac8057 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -777,27 +777,19 @@ static __net_init int bareudp_init_net(struct net *net) return 0; } -static void bareudp_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit bareudp_exit_rtnl_net(struct net *net, + struct list_head *dev_kill_list) { struct bareudp_net *bn = net_generic(net, bareudp_net_id); struct bareudp_dev *bareudp, *next; list_for_each_entry_safe(bareudp, next, &bn->bareudp_list, next) - unregister_netdevice_queue(bareudp->dev, head); -} - -static void __net_exit bareudp_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_kill_list) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) - bareudp_destroy_tunnels(net, dev_kill_list); + bareudp_dellink(bareudp->dev, dev_kill_list); } static struct pernet_operations bareudp_net_ops = { .init = bareudp_init_net, - .exit_batch_rtnl = bareudp_exit_batch_rtnl, + .exit_rtnl = bareudp_exit_rtnl_net, .id = &bareudp_net_id, .size = sizeof(struct bareudp_net), }; From 4e53b32d74f052458309a10867e642a8aeafd2f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:42 -0700 Subject: [PATCH 127/770] geneve: Convert geneve_exit_batch_rtnl() to ->exit_rtnl(). geneve_exit_batch_rtnl() iterates the dying netns list and performs the same operation for each. Let's use ->exit_rtnl(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-14-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 66e38ce9cd1d3..ffc15a4326899 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1946,22 +1946,14 @@ static __net_init int geneve_init_net(struct net *net) return 0; } -static void geneve_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit geneve_exit_rtnl_net(struct net *net, + struct list_head *dev_to_kill) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *geneve, *next; list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) - geneve_dellink(geneve->dev, head); -} - -static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) - geneve_destroy_tunnels(net, dev_to_kill); + geneve_dellink(geneve->dev, dev_to_kill); } static void __net_exit geneve_exit_net(struct net *net) @@ -1973,7 +1965,7 @@ static void __net_exit geneve_exit_net(struct net *net) static struct pernet_operations geneve_net_ops = { .init = geneve_init_net, - .exit_batch_rtnl = geneve_exit_batch_rtnl, + .exit_rtnl = geneve_exit_rtnl_net, .exit = geneve_exit_net, .id = &geneve_net_id, .size = sizeof(struct geneve_net), From c57a9c503543cd8829eeaaf88362199e0491c0d7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 11 Apr 2025 13:52:43 -0700 Subject: [PATCH 128/770] net: Remove ->exit_batch_rtnl(). There are no ->exit_batch_rtnl() users remaining. Let's remove the hook. 
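With the hook gone, the surviving flow in ops_exit_rtnl_list() is roughly the following - a simplified sketch reconstructed from the hunk below, with locking details and list bookkeeping abbreviated:

	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_exit_list, exit_list) {
		__rtnl_net_lock(net);
		list_for_each_entry(ops, ops_list, list)
			if (ops->exit_rtnl)
				ops->exit_rtnl(net, &dev_kill_list);
		__rtnl_net_unlock(net);
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();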
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20250411205258.63164-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/net_namespace.h | 2 -- net/core/net_namespace.c | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index b071e6eed9d52..025a7574b275f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -477,8 +477,6 @@ struct pernet_operations { /* Following method is called with RTNL held. */ void (*exit_rtnl)(struct net *net, struct list_head *dev_kill_list); - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index afaa3d1bda8d5..0a2b24af40280 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -185,12 +185,6 @@ static void ops_exit_rtnl_list(const struct list_head *ops_list, __rtnl_net_unlock(net); } - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, ops_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); @@ -263,7 +257,7 @@ static void ops_undo_list(const struct list_head *ops_list, static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { - bool hold_rtnl = ops->exit_rtnl || ops->exit_batch_rtnl; + bool hold_rtnl = !!ops->exit_rtnl; LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); From dadc3a6be46987d9417732e5b59476267bba76ed Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:29 +0100 Subject: [PATCH 129/770] net: stmmac: dwc-qos: remove tegra_eqos_init() tegra_eqos_init() initialises the 1US TIC counter for the EEE timers. However, the DWGMAC core is reset after this write, which clears this register to its default. dwmac4_core_init() then configures this register using the same clock, and that configuration happens after the reset - thus it is the write which ensures that the register is correctly configured. Therefore, tegra_eqos_init() is not required and is removed. This also means eqos->clk_slave can be removed. 
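For reference, the register encodes the number of clock cycles per microsecond, minus one; the removed tegra_eqos_init() computed it as shown in the hunk below. A worked example with an illustrative 125 MHz clock (not a Tegra-specific value):

	value = (rate / 1000000) - 1;
	/* rate = 125000000 => value = 124 */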
Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuf-000E7g-U4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-dwc-qos-eth.c | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 5db318327d332..583b5c071cd14 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -29,7 +29,6 @@ struct tegra_eqos { void __iomem *regs; struct reset_control *rst; - struct clk *clk_slave; struct gpio_desc *reset; }; @@ -199,20 +198,6 @@ static void tegra_eqos_fix_speed(void *priv, int speed, unsigned int mode) } } -static int tegra_eqos_init(struct platform_device *pdev, void *priv) -{ - struct tegra_eqos *eqos = priv; - unsigned long rate; - u32 value; - - rate = clk_get_rate(eqos->clk_slave); - - value = (rate / 1000000) - 1; - writel(value, eqos->regs + GMAC_1US_TIC_COUNTER); - - return 0; -} - static int tegra_eqos_probe(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res) @@ -227,7 +212,6 @@ static int tegra_eqos_probe(struct platform_device *pdev, eqos->dev = &pdev->dev; eqos->regs = res->addr; - eqos->clk_slave = plat_dat->stmmac_clk; if (!is_of_node(dev->fwnode)) goto bypass_clk_reset_gpio; @@ -267,17 +251,11 @@ static int tegra_eqos_probe(struct platform_device *pdev, bypass_clk_reset_gpio: plat_dat->fix_mac_speed = tegra_eqos_fix_speed; plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; - plat_dat->init = tegra_eqos_init; plat_dat->bsp_priv = eqos; plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE; - err = tegra_eqos_init(pdev, eqos); - if (err < 0) - goto reset; - return 0; -reset: - reset_control_assert(eqos->rst); + reset_phy: gpiod_set_value(eqos->reset, 1); From 17ec6dbaaed3e2447c8aac93c7161823402aa2d9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:35 +0100 Subject: [PATCH 130/770] net: stmmac: intel: remove eee_usecs_rate and hardware write Remove the write to GMAC_1US_TIC_COUNTER for two reasons: 1. during initialisation or reinitialisation of the DWMAC core, the core is reset, which sets this register back to its default value. Writing it prior to stmmac_dvr_probe() has no effect. 2. Since commit 8efbdbfa9938 ("net: stmmac: Initialize MAC_ONEUS_TIC_COUNTER register"), GMAC4/5 core code will set this register based on the rate of plat->stmmac_clk. This clock is created by the same code which initialises plat->eee_usecs_rate, which is also created to run at this same rate. Since Marek's commit, this will set this register appropriately using the rate of this clock. Therefore, dwmac-intel.c writing GMAC_1US_TIC_COUNTER serves no useful purpose and can be removed. 
Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vul-000E7m-1j@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index c8bb9265bbb48..54db5b778304d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -682,7 +682,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->axi->axi_blen[2] = 16; plat->ptp_max_adj = plat->clk_ptp_rate; - plat->eee_usecs_rate = plat->clk_ptp_rate; /* Set system clock */ sprintf(clk_name, "%s-%s", "stmmac", pci_name(pdev)); @@ -1313,13 +1312,6 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, memset(&res, 0, sizeof(res)); res.addr = pcim_iomap_table(pdev)[0]; - if (plat->eee_usecs_rate > 0) { - u32 tx_lpi_usec; - - tx_lpi_usec = (plat->eee_usecs_rate / 1000000) - 1; - writel(tx_lpi_usec, res.addr + GMAC_1US_TIC_COUNTER); - } - ret = stmmac_config_multi_msi(pdev, plat, &res); if (ret) { ret = stmmac_config_single_msi(pdev, plat, &res); From 35031c6256f19773b7e46df80ef4e6d6b782b088 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:40 +0100 Subject: [PATCH 131/770] net: stmmac: intel-plat: remove eee_usecs_rate and hardware write Remove the write to GMAC_1US_TIC_COUNTER for two reasons: 1. during initialisation or reinitialisation of the DWMAC core, the core is reset, which sets this register back to its default value. Writing it prior to stmmac_dvr_probe() has no effect. 2. Since commit 8efbdbfa9938 ("net: stmmac: Initialize MAC_ONEUS_TIC_COUNTER register"), GMAC4/5 core code will set this register based on the rate of plat->stmmac_clk. This clock is fetched by devm_stmmac_probe_config_dt(), and plat->clk_ptp_rate will be set to its rate provided a "ptp_ref" clock is not provided. In any case, Marek's commit will set the effectual value of this register. Therefore, dwmac-intel-plat.c writing GMAC_1US_TIC_COUNTER serves no useful purpose and can be removed. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuq-000E7s-5Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index 599def7b3a649..4ea7b0a803d73 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -113,16 +113,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev) plat_dat->clk_tx_i = dwmac->tx_clk; plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; - plat_dat->bsp_priv = dwmac; - plat_dat->eee_usecs_rate = plat_dat->clk_ptp_rate; - - if (plat_dat->eee_usecs_rate > 0) { - u32 tx_lpi_usec; - - tx_lpi_usec = (plat_dat->eee_usecs_rate / 1000000) - 1; - writel(tx_lpi_usec, stmmac_res.addr + GMAC_1US_TIC_COUNTER); - } ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); if (ret) From 651f88cb046c5e002f7c11de2cebf207787d2346 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:45 +0100 Subject: [PATCH 132/770] net: stmmac: remove eee_usecs_rate plat_dat->eee_usecs_rate is now unused, so remove this member. 
Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vuv-000E7y-9k@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index c4ec8bb8144e7..8aed09d65b4a1 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -276,7 +276,6 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; int has_xgmac; u8 vlan_fail_q; - unsigned long eee_usecs_rate; struct pci_dev *pdev; int int_snapshot_num; int msi_mac_vec; From 25af74ed68c4e3de35f5314fe4f8e2853a7e79ac Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 09:08:50 +0100 Subject: [PATCH 133/770] net: stmmac: remove GMAC_1US_TIC_COUNTER definition GMAC_1US_TIC_COUNTER is now no longer used, so remove the definition. This was duplicated by GMAC4_MAC_ONEUS_TIC_COUNTER further down in the same file. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u3Vv0-000E87-DQ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 42fe29a4e300b..5f387ec27c8cc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -31,7 +31,6 @@ #define GMAC_RXQ_CTRL3 0x000000ac #define GMAC_INT_STATUS 0x000000b0 #define GMAC_INT_EN 0x000000b4 -#define GMAC_1US_TIC_COUNTER 0x000000dc #define GMAC_PCS_BASE 0x000000e0 #define GMAC_PHYIF_CONTROL_STATUS 0x000000f8 #define GMAC_PMT 0x000000c0 From 4129a75a76a755c151be1ebc0e7d618a4d21e41f Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 10 Apr 2025 14:42:49 +0200 Subject: [PATCH 134/770] net: dsa: microchip: add ETS scheduler support for KSZ88x3 switches Implement Enhanced Transmission Selection scheduler (ETS) support for KSZ88x3 devices, which support two fixed egress scheduling modes: Strict Priority and Weighted Fair Queuing (WFQ). Since the switch does not allow remapping priorities to queues or adjusting weights, this implementation only supports enabling strict priority mode. If strict mode is not explicitly requested, the switch falls back to its default WFQ mode. This patch introduces KSZ88x3-specific handlers for ETS add and delete operations and uses TXQ Split Control registers to toggle the WFQ enable bit per queue. Corresponding macros are also added for register access. 
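The register math behind the new macro (added in the header hunk below) is easiest to see worked through a few cases; note that both indices are 0-based, so "port 0" is the chip's Port 1:

	/* 0xAF + (port * 4) + (3 - queue) */
	KSZ8873_TXQ_SPLIT_CTRL_REG(0, 3);	/* 0xAF + 0 + 0 = 0xAF, Port 1 Q3 */
	KSZ8873_TXQ_SPLIT_CTRL_REG(0, 0);	/* 0xAF + 0 + 3 = 0xB2, Port 1 Q0 */
	KSZ8873_TXQ_SPLIT_CTRL_REG(2, 3);	/* 0xAF + 8 + 0 = 0xB7, Port 3 Q3 */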
Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250410124249.2728568-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 97 +++++++++++++++++++++++++- drivers/net/dsa/microchip/ksz_common.h | 19 +++++ 2 files changed, 113 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 89f0796894af6..b45052497f8a2 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -3999,6 +3999,89 @@ static int ksz_ets_band_to_queue(struct tc_ets_qopt_offload_replace_params *p, return p->bands - 1 - band; } +/** + * ksz88x3_tc_ets_add - Configure ETS (Enhanced Transmission Selection) + * for a port on KSZ88x3 switch + * @dev: Pointer to the KSZ switch device structure + * @port: Port number to configure + * @p: Pointer to offload replace parameters describing ETS bands and mapping + * + * The KSZ88x3 supports two scheduling modes: Strict Priority and + * Weighted Fair Queuing (WFQ). Both modes have fixed behavior: + * - No configurable queue-to-priority mapping + * - No weight adjustment in WFQ mode + * + * This function configures the switch to use strict priority mode by + * clearing the WFQ enable bit for all queues associated with ETS bands. + * If strict priority is not explicitly requested, the switch will default + * to WFQ mode. + * + * Return: 0 on success, or a negative error code on failure + */ +static int ksz88x3_tc_ets_add(struct ksz_device *dev, int port, + struct tc_ets_qopt_offload_replace_params *p) +{ + int ret, band; + + /* Only strict priority mode is supported for now. + * WFQ is implicitly enabled when strict mode is disabled. + */ + for (band = 0; band < p->bands; band++) { + int queue = ksz_ets_band_to_queue(p, band); + u8 reg; + + /* Calculate TXQ Split Control register address for this + * port/queue + */ + reg = KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue); + + /* Clear WFQ enable bit to select strict priority scheduling */ + ret = ksz_rmw8(dev, reg, KSZ8873_TXQ_WFQ_ENABLE, 0); + if (ret) + return ret; + } + + return 0; +} + +/** + * ksz88x3_tc_ets_del - Reset ETS (Enhanced Transmission Selection) config + * for a port on KSZ88x3 switch + * @dev: Pointer to the KSZ switch device structure + * @port: Port number to reset + * + * The KSZ88x3 supports only fixed scheduling modes: Strict Priority or + * Weighted Fair Queuing (WFQ), with no reconfiguration of weights or + * queue mapping. This function resets the port’s scheduling mode to + * the default, which is WFQ, by enabling the WFQ bit for all queues. 
+ * + * Return: 0 on success, or a negative error code on failure + */ +static int ksz88x3_tc_ets_del(struct ksz_device *dev, int port) +{ + int ret, queue; + + /* Iterate over all transmit queues for this port */ + for (queue = 0; queue < dev->info->num_tx_queues; queue++) { + u8 reg; + + /* Calculate TXQ Split Control register address for this + * port/queue + */ + reg = KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue); + + /* Set WFQ enable bit to revert back to default scheduling + * mode + */ + ret = ksz_rmw8(dev, reg, KSZ8873_TXQ_WFQ_ENABLE, + KSZ8873_TXQ_WFQ_ENABLE); + if (ret) + return ret; + } + + return 0; +} + static int ksz_queue_set_strict(struct ksz_device *dev, int port, int queue) { int ret; @@ -4080,6 +4163,7 @@ static int ksz_tc_ets_del(struct ksz_device *dev, int port) for (queue = 0; queue < dev->info->num_tx_queues; queue++) { ret = ksz_queue_set_wrr(dev, port, queue, KSZ9477_DEFAULT_WRR_WEIGHT); + if (ret) return ret; } @@ -4132,7 +4216,7 @@ static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port, struct ksz_device *dev = ds->priv; int ret; - if (is_ksz8(dev)) + if (is_ksz8(dev) && !ksz_is_ksz88x3(dev)) return -EOPNOTSUPP; if (qopt->parent != TC_H_ROOT) { @@ -4146,9 +4230,16 @@ static int ksz_tc_setup_qdisc_ets(struct dsa_switch *ds, int port, if (ret) return ret; - return ksz_tc_ets_add(dev, port, &qopt->replace_params); + if (ksz_is_ksz88x3(dev)) + return ksz88x3_tc_ets_add(dev, port, + &qopt->replace_params); + else + return ksz_tc_ets_add(dev, port, &qopt->replace_params); case TC_ETS_DESTROY: - return ksz_tc_ets_del(dev, port); + if (ksz_is_ksz88x3(dev)) + return ksz88x3_tc_ets_del(dev, port); + else + return ksz_tc_ets_del(dev, port); case TC_ETS_STATS: case TC_ETS_GRAFT: return -EOPNOTSUPP; diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index af17a9c030d41..dd5429ff16eee 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -836,6 +836,25 @@ static inline bool is_lan937x_tx_phy(struct ksz_device *dev, int port) #define SW_HI_SPEED_DRIVE_STRENGTH_S 4 #define SW_LO_SPEED_DRIVE_STRENGTH_S 0 +/* TXQ Split Control Register for per-port, per-queue configuration. + * Register 0xAF is TXQ Split for Q3 on Port 1. + * Register offset formula: 0xAF + (port * 4) + (3 - queue) + * where: port = 0..2, queue = 0..3 + */ +#define KSZ8873_TXQ_SPLIT_CTRL_REG(port, queue) \ + (0xAF + ((port) * 4) + (3 - (queue))) + +/* Bit 7 selects between: + * 0 = Strict priority mode (highest-priority queue first) + * 1 = Weighted Fair Queuing (WFQ) mode: + * Queue weights: Q3:Q2:Q1:Q0 = 8:4:2:1 + * If any queues are empty, weight is redistributed. + * + * Note: This is referred to as "Weighted Fair Queuing" (WFQ) in KSZ8863/8873 + * documentation, and as "Weighted Round Robin" (WRR) in KSZ9477 family docs. + */ +#define KSZ8873_TXQ_WFQ_ENABLE BIT(7) + #define KSZ9477_REG_PORT_OUT_RATE_0 0x0420 #define KSZ9477_OUT_RATE_NO_LIMIT 0 From 36ef2575e78d1a3c699dc3f1c9dee9be742c9bdd Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:31 +0300 Subject: [PATCH 135/770] net/mlx5: HWS, Fix matcher action template attach The procedure of attaching an action template to an existing matcher had a few issues: 1. Attaching accidentally overran the `at` array in bwc_matcher, which would result in memory corruption. This bug wasn't triggered, but it is possible to trigger it by attaching action templates beyond the initial buffer size of 8. 
Fix this by converting to a dynamically sized buffer and reallocating if needed. 2. Similarly, the `at` array inside the native matcher was never reallocated. Fix this the same as above. 3. The bwc layer treated any error in action template attach as a signal that the matcher should be rehashed to account for a larger number of action STEs. In reality, there are other unrelated errors that can arise and they should be propagated upstack. Fix this by adding a `need_rehash` output parameter that's orthogonal to error codes. Fixes: 2111bb970c78 ("net/mlx5: HWS, added backward-compatible API handling") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc.c | 55 ++++++++++++++++--- .../mellanox/mlx5/core/steering/hws/bwc.h | 9 ++- .../mellanox/mlx5/core/steering/hws/matcher.c | 48 +++++++++++++--- .../mellanox/mlx5/core/steering/hws/matcher.h | 4 ++ .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 5 +- 5 files changed, 97 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 19dce1ba512d4..32de8bfc7644f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -90,13 +90,19 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, bwc_matcher->priority = priority; bwc_matcher->size_log = MLX5HWS_BWC_MATCHER_INIT_SIZE_LOG; + bwc_matcher->size_of_at_array = MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM; + bwc_matcher->at = kcalloc(bwc_matcher->size_of_at_array, + sizeof(*bwc_matcher->at), GFP_KERNEL); + if (!bwc_matcher->at) + goto free_bwc_matcher_rules; + /* create dummy action template */ bwc_matcher->at[0] = mlx5hws_action_template_create(action_types ? 
action_types : init_action_types); if (!bwc_matcher->at[0]) { mlx5hws_err(table->ctx, "BWC matcher: failed creating action template\n"); - goto free_bwc_matcher_rules; + goto free_bwc_matcher_at_array; } bwc_matcher->num_of_at = 1; @@ -126,6 +132,8 @@ int mlx5hws_bwc_matcher_create_simple(struct mlx5hws_bwc_matcher *bwc_matcher, mlx5hws_match_template_destroy(bwc_matcher->mt); free_at: mlx5hws_action_template_destroy(bwc_matcher->at[0]); +free_bwc_matcher_at_array: + kfree(bwc_matcher->at); free_bwc_matcher_rules: kfree(bwc_matcher->rules); err: @@ -192,6 +200,7 @@ int mlx5hws_bwc_matcher_destroy_simple(struct mlx5hws_bwc_matcher *bwc_matcher) for (i = 0; i < bwc_matcher->num_of_at; i++) mlx5hws_action_template_destroy(bwc_matcher->at[i]); + kfree(bwc_matcher->at); mlx5hws_match_template_destroy(bwc_matcher->mt); kfree(bwc_matcher->rules); @@ -520,6 +529,23 @@ hws_bwc_matcher_extend_at(struct mlx5hws_bwc_matcher *bwc_matcher, struct mlx5hws_rule_action rule_actions[]) { enum mlx5hws_action_type action_types[MLX5HWS_BWC_MAX_ACTS]; + void *p; + + if (unlikely(bwc_matcher->num_of_at >= bwc_matcher->size_of_at_array)) { + if (bwc_matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + bwc_matcher->size_of_at_array *= 2; + p = krealloc(bwc_matcher->at, + bwc_matcher->size_of_at_array * + sizeof(*bwc_matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + bwc_matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + bwc_matcher->at = p; + } hws_bwc_rule_actions_to_action_types(rule_actions, action_types); @@ -777,6 +803,7 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_rule_attr rule_attr; struct mutex *queue_lock; /* Protect the queue */ u32 num_of_rules; + bool need_rehash; int ret = 0; int at_idx; @@ -803,10 +830,14 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, at_idx = bwc_matcher->num_of_at - 1; ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx]); + bwc_matcher->at[at_idx], + &need_rehash); if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. + hws_bwc_unlock_all_queues(ctx); + return ret; + } + if (unlikely(need_rehash)) { + /* The new action template requires more action STEs. * Need to attempt creating new matcher with all * the action templates, including the new one. */ @@ -942,6 +973,7 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_rule_attr rule_attr; struct mutex *queue_lock; /* Protect the queue */ + bool need_rehash; int at_idx, ret; u16 idx; @@ -973,12 +1005,17 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, at_idx = bwc_matcher->num_of_at - 1; ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx]); + bwc_matcher->at[at_idx], + &need_rehash); if (unlikely(ret)) { - /* Action template attach failed, possibly due to - * requiring more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. + hws_bwc_unlock_all_queues(ctx); + return ret; + } + if (unlikely(need_rehash)) { + /* The new action template requires more action + * STEs. Need to attempt creating new matcher + * with all the action templates, including the + * new one. 
*/ ret = hws_bwc_matcher_rehash_at(bwc_matcher); if (unlikely(ret)) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h index 47f7ed1415535..bb0cf4b922ceb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.h @@ -10,9 +10,7 @@ #define MLX5HWS_BWC_MATCHER_REHASH_BURST_TH 32 /* Max number of AT attach operations for the same matcher. - * When the limit is reached, next attempt to attach new AT - * will result in creation of a new matcher and moving all - * the rules to this matcher. + * When the limit is reached, a larger buffer is allocated for the ATs. */ #define MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM 8 @@ -23,10 +21,11 @@ struct mlx5hws_bwc_matcher { struct mlx5hws_matcher *matcher; struct mlx5hws_match_template *mt; - struct mlx5hws_action_template *at[MLX5HWS_BWC_MATCHER_ATTACH_AT_NUM]; - u32 priority; + struct mlx5hws_action_template **at; u8 num_of_at; + u8 size_of_at_array; u8 size_log; + u32 priority; atomic_t num_of_rules; struct list_head *rules; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index b61864b320536..37a4497048a6f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -905,18 +905,48 @@ static int hws_matcher_uninit(struct mlx5hws_matcher *matcher) return 0; } +static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher) +{ + void *p; + + if (matcher->size_of_at_array >= MLX5HWS_MATCHER_MAX_AT) + return -ENOMEM; + + matcher->size_of_at_array *= 2; + p = krealloc(matcher->at, + matcher->size_of_at_array * sizeof(*matcher->at), + __GFP_ZERO | GFP_KERNEL); + if (!p) { + matcher->size_of_at_array /= 2; + return -ENOMEM; + } + + matcher->at = p; + + return 0; +} + int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, - struct mlx5hws_action_template *at) + struct mlx5hws_action_template *at, + bool *need_rehash) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); struct mlx5hws_context *ctx = matcher->tbl->ctx; u32 required_stes; int ret; - if (!matcher->attr.max_num_of_at_attach) { - mlx5hws_dbg(ctx, "Num of current at (%d) exceed allowed value\n", - matcher->num_of_at); - return -EOPNOTSUPP; + *need_rehash = false; + + if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) { + ret = hws_matcher_grow_at_array(matcher); + if (ret) + return ret; + + if (matcher->col_matcher) { + ret = hws_matcher_grow_at_array(matcher->col_matcher); + if (ret) + return ret; + } } ret = hws_matcher_check_and_process_at(matcher, at); @@ -927,12 +957,11 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, if (matcher->action_ste.max_stes < required_stes) { mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n", required_stes, matcher->action_ste.max_stes); - return -ENOMEM; + *need_rehash = true; } matcher->at[matcher->num_of_at] = *at; matcher->num_of_at += 1; - matcher->attr.max_num_of_at_attach -= 1; if (matcher->col_matcher) matcher->col_matcher->num_of_at = matcher->num_of_at; @@ -960,8 +989,9 @@ hws_matcher_set_templates(struct mlx5hws_matcher *matcher, if (!matcher->mt) return -ENOMEM; - matcher->at = kvcalloc(num_of_at + matcher->attr.max_num_of_at_attach, - sizeof(*matcher->at), + matcher->size_of_at_array = + num_of_at + matcher->attr.max_num_of_at_attach; + 
matcher->at = kvcalloc(matcher->size_of_at_array, sizeof(*matcher->at), GFP_KERNEL); if (!matcher->at) { mlx5hws_err(ctx, "Failed to allocate action template array\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 020de70270c50..20b32012c418b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -23,6 +23,9 @@ */ #define MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT 1 +/* Maximum number of action templates that can be attached to a matcher. */ +#define MLX5HWS_MATCHER_MAX_AT 128 + enum mlx5hws_matcher_offset { MLX5HWS_MATCHER_OFFSET_TAG_DW1 = 12, MLX5HWS_MATCHER_OFFSET_TAG_DW0 = 13, @@ -72,6 +75,7 @@ struct mlx5hws_matcher { struct mlx5hws_match_template *mt; struct mlx5hws_action_template *at; u8 num_of_at; + u8 size_of_at_array; u8 num_of_mt; /* enum mlx5hws_matcher_flags */ u8 flags; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 5121951f2778a..8ed8a715a2eb2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -399,11 +399,14 @@ int mlx5hws_matcher_destroy(struct mlx5hws_matcher *matcher); * * @matcher: Matcher to attach the action template to. * @at: Action template to be attached to the matcher. + * @need_rehash: Output parameter that tells callers if the matcher needs to be + * rehashed. * * Return: Zero on success, non-zero otherwise. */ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, - struct mlx5hws_action_template *at); + struct mlx5hws_action_template *at, + bool *need_rehash); /** * mlx5hws_matcher_resize_set_target - Link two matchers and enable moving rules. From b2ae16214ffeda3e1c25223eebe19f85b0876181 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:32 +0300 Subject: [PATCH 136/770] net/mlx5: HWS, Remove unused element array Remove the array of elements wrapped in a struct because in reality only the first element was ever used. 
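The effect on the data structure is easiest to see side by side (both forms appear in the hunks below):

	/* before: an array where only index 0 was ever used */
	elem = pool->db.element_manager->elements[0];
	/* after: a single element pointer */
	elem = pool->db.element;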
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/pool.c | 55 ++++++++----------- .../mellanox/mlx5/core/steering/hws/pool.h | 6 +- 2 files changed, 23 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 50a81d360bb2f..35ed9bee06a69 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -293,7 +293,7 @@ static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool, } static struct mlx5hws_pool_elements * -hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx) +hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order) { struct mlx5hws_pool_elements *elem; u32 alloc_size; @@ -311,21 +311,21 @@ hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order, int idx) elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order); if (!elem->bitmap) { mlx5hws_err(pool->ctx, - "Failed to create bitmap type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); + "Failed to create bitmap type: %d: size %d\n", + pool->type, alloc_size); goto free_elem; } elem->log_size = alloc_size - order; } - if (hws_pool_create_resource_on_index(pool, alloc_size, idx)) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); + if (hws_pool_create_resource_on_index(pool, alloc_size, 0)) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", + pool->type, alloc_size); goto free_db; } - pool->db.element_manager->elements[idx] = elem; + pool->db.element = elem; return elem; @@ -359,9 +359,9 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, { struct mlx5hws_pool_elements *elem; - elem = pool->db.element_manager->elements[0]; + elem = pool->db.element; if (!elem) - elem = hws_pool_element_create_new_elem(pool, order, 0); + elem = hws_pool_element_create_new_elem(pool, order); if (!elem) goto err_no_elem; @@ -451,16 +451,14 @@ static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool) return 0; } -static void hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, - struct mlx5hws_pool_elements *elem, - struct mlx5hws_pool_chunk *chunk) +static void +hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, + struct mlx5hws_pool_elements *elem) { - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - - hws_pool_resource_free(pool, chunk->resource_idx); + hws_pool_resource_free(pool, 0); + bitmap_free(elem->bitmap); kfree(elem); - pool->db.element_manager->elements[chunk->resource_idx] = NULL; + pool->db.element = NULL; } static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, @@ -471,7 +469,7 @@ static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, if (unlikely(chunk->resource_idx)) pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - elem = pool->db.element_manager->elements[chunk->resource_idx]; + elem = pool->db.element; if (!elem) { mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx); return; @@ -483,7 +481,7 @@ static void 
hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE && !elem->num_of_elements) - hws_onesize_element_db_destroy_element(pool, elem, chunk); + hws_onesize_element_db_destroy_element(pool, elem); } static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, @@ -504,18 +502,13 @@ static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool) { - struct mlx5hws_pool_elements *elem; - int i; + struct mlx5hws_pool_elements *elem = pool->db.element; - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - elem = pool->db.element_manager->elements[i]; - if (elem) { - bitmap_free(elem->bitmap); - kfree(elem); - pool->db.element_manager->elements[i] = NULL; - } + if (elem) { + bitmap_free(elem->bitmap); + kfree(elem); + pool->db.element = NULL; } - kfree(pool->db.element_manager); } /* This memory management works as the following: @@ -526,10 +519,6 @@ static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool) */ static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool) { - pool->db.element_manager = kzalloc(sizeof(*pool->db.element_manager), GFP_KERNEL); - if (!pool->db.element_manager) - return -ENOMEM; - pool->p_db_uninit = &hws_onesize_element_db_uninit; pool->p_get_chunk = &hws_onesize_element_db_get_chunk; pool->p_put_chunk = &hws_onesize_element_db_put_chunk; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index 621298b352b24..f4258f83fdbf7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -87,14 +87,10 @@ struct mlx5hws_pool_elements { bool is_full; }; -struct mlx5hws_element_manager { - struct mlx5hws_pool_elements *elements[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - struct mlx5hws_pool_db { enum mlx5hws_db_type type; union { - struct mlx5hws_element_manager *element_manager; + struct mlx5hws_pool_elements *element; struct mlx5hws_buddy_manager *buddy_manager; }; }; From 38956bea7349ce75c1519b57c27cd97580b4c822 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:33 +0300 Subject: [PATCH 137/770] net/mlx5: HWS, Make pool single resource The pool implementation claimed to support multiple resources, but this does not really make sense in context. Callers always allocate a single STC or STE chunk of exactly the size provided. The code that handled multiple resources was unused (and likely buggy) due to the combination of flags passed by callers. Simplify the pool by having it handle a single resource. As a result of this simplification, chunks no longer contain a resource offset (there is now only one resource per pool), and the get_base_id functions no longer take a chunk parameter. 
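Concretely, callers move from the chunk-based accessors to pool-level ones; both signatures are taken from the hunks below:

	/* before */
	base_id = mlx5hws_pool_chunk_get_base_id(pool, &chunk);
	/* after */
	base_id = mlx5hws_pool_get_base_id(pool);
	/* and likewise mlx5hws_pool_get_base_mirror_id(pool) for the mirror side */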
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 27 ++- .../mellanox/mlx5/core/steering/hws/debug.c | 22 +-- .../mellanox/mlx5/core/steering/hws/matcher.c | 10 +- .../mellanox/mlx5/core/steering/hws/pool.c | 182 ++++++------------ .../mellanox/mlx5/core/steering/hws/pool.h | 28 +-- 5 files changed, 91 insertions(+), 178 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index b5332c54d4fb0..781ba8c4f733e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -238,6 +238,7 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, enum mlx5hws_table_type table_type, bool is_mirror) { + struct mlx5hws_pool *pool; bool use_fixup = false; u32 fw_tbl_type; u32 base_id; @@ -253,13 +254,11 @@ hws_action_fixup_stc_attr(struct mlx5hws_context *ctx, use_fixup = true; break; } + pool = stc_attr->ste_table.ste_pool; if (!is_mirror) - base_id = mlx5hws_pool_chunk_get_base_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_id(pool); else - base_id = - mlx5hws_pool_chunk_get_base_mirror_id(stc_attr->ste_table.ste_pool, - &stc_attr->ste_table.ste); + base_id = mlx5hws_pool_get_base_mirror_id(pool); *fixup_stc_attr = *stc_attr; fixup_stc_attr->ste_table.ste_obj_id = base_id; @@ -337,7 +336,7 @@ __must_hold(&ctx->ctrl_lock) if (!mlx5hws_context_cap_dynamic_reparse(ctx)) stc_attr->reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - obj_0_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_0_id = mlx5hws_pool_get_base_id(stc_pool); /* According to table/action limitation change the stc_attr */ use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, table_type, false); @@ -353,7 +352,7 @@ __must_hold(&ctx->ctrl_lock) if (table_type == MLX5HWS_TABLE_TYPE_FDB) { u32 obj_1_id; - obj_1_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_1_id = mlx5hws_pool_get_base_mirror_id(stc_pool); use_fixup = hws_action_fixup_stc_attr(ctx, stc_attr, &fixup_stc_attr, @@ -393,11 +392,11 @@ __must_hold(&ctx->ctrl_lock) stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_DROP; stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.stc_offset = stc->offset; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); if (table_type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, stc); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); mlx5hws_cmd_stc_modify(ctx->mdev, obj_id, &stc_attr); } @@ -1581,7 +1580,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, u32 miss_ft_id) { struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; - struct mlx5hws_action_default_stc *default_stc; struct mlx5hws_matcher_action_ste *table_ste; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_pool *ste_pool, *stc_pool; @@ -1629,7 +1627,7 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.fw_gen_wqe = true; rtc_attr.is_scnd_range = true; - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_id(ste_pool); 
rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; @@ -1639,8 +1637,7 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, /* STC is a single resource (obj_id), use any STC for the ID */ stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); @@ -1650,11 +1647,11 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, } /* Create mirror RTC */ - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 696275fd0ce22..3491408c5d84f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -118,7 +118,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma { enum mlx5hws_table_type tbl_type = matcher->tbl->type; struct mlx5hws_cmd_ft_query_attr ft_attr = {0}; - struct mlx5hws_pool_chunk *ste; struct mlx5hws_pool *ste_pool; u64 icm_addr_0 = 0; u64 icm_addr_1 = 0; @@ -134,12 +133,11 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->end_ft_id, matcher->col_matcher ? 
HWS_PTR_TO_ID(matcher->col_matcher) : 0); - ste = &matcher->match_ste.ste; ste_pool = matcher->match_ste.pool; if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + ste_0_id = mlx5hws_pool_get_base_id(ste_pool); if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool); } seq_printf(f, ",%d,%d,%d,%d", @@ -148,12 +146,11 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->match_ste.rtc_1_id, (int)ste_1_id); - ste = &matcher->action_ste.ste; ste_pool = matcher->action_ste.pool; if (ste_pool) { - ste_0_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + ste_0_id = mlx5hws_pool_get_base_id(ste_pool); if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool); else ste_1_id = -1; } else { @@ -387,14 +384,17 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context if (!stc_pool) return 0; - if (stc_pool->resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->resource[0]); + if (stc_pool->resource) { + ret = hws_debug_dump_context_stc_resource(f, ctx, + stc_pool->resource); if (ret) return ret; } - if (stc_pool->mirror_resource[0]) { - ret = hws_debug_dump_context_stc_resource(f, ctx, stc_pool->mirror_resource[0]); + if (stc_pool->mirror_resource) { + struct mlx5hws_pool_resource *res = stc_pool->mirror_resource; + + ret = hws_debug_dump_context_stc_resource(f, ctx, res); if (ret) return ret; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 37a4497048a6f..59b14db427b4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -223,7 +223,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; struct mlx5hws_match_template *mt = matcher->mt; struct mlx5hws_context *ctx = matcher->tbl->ctx; - struct mlx5hws_action_default_stc *default_stc; struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; struct mlx5hws_pool *ste_pool, *stc_pool; @@ -305,7 +304,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, return -EINVAL; } - obj_id = mlx5hws_pool_chunk_get_base_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_id(ste_pool); rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; @@ -316,8 +315,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, /* STC is a single resource (obj_id), use any STC for the ID */ stc_pool = ctx->stc_pool; - default_stc = ctx->common_res.default_stc; - obj_id = mlx5hws_pool_chunk_get_base_id(stc_pool, &default_stc->default_hit); + obj_id = mlx5hws_pool_get_base_id(stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); @@ -328,11 +326,11 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, } if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(ste_pool, ste); + obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true); - obj_id = mlx5hws_pool_chunk_get_base_mirror_id(stc_pool, &default_stc->default_hit); + obj_id = 
mlx5hws_pool_get_base_mirror_id(stc_pool); rtc_attr.stc_base = obj_id; hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 35ed9bee06a69..0de03e17624c6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -20,15 +20,14 @@ static void hws_pool_free_one_resource(struct mlx5hws_pool_resource *resource) kfree(resource); } -static void hws_pool_resource_free(struct mlx5hws_pool *pool, - int resource_idx) +static void hws_pool_resource_free(struct mlx5hws_pool *pool) { - hws_pool_free_one_resource(pool->resource[resource_idx]); - pool->resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->resource); + pool->resource = NULL; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { - hws_pool_free_one_resource(pool->mirror_resource[resource_idx]); - pool->mirror_resource[resource_idx] = NULL; + hws_pool_free_one_resource(pool->mirror_resource); + pool->mirror_resource = NULL; } } @@ -78,7 +77,7 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, } static int -hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx) +hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range) { struct mlx5hws_pool_resource *resource; u32 fw_ft_type, opt_log_range; @@ -91,7 +90,7 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx) return -EINVAL; } - pool->resource[idx] = resource; + pool->resource = resource; if (pool->tbl_type == MLX5HWS_TABLE_TYPE_FDB) { struct mlx5hws_pool_resource *mirror_resource; @@ -102,10 +101,10 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range, int idx) if (!mirror_resource) { mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n"); hws_pool_free_one_resource(resource); - pool->resource[idx] = NULL; + pool->resource = NULL; return -EINVAL; } - pool->mirror_resource[idx] = mirror_resource; + pool->mirror_resource = mirror_resource; } return 0; @@ -129,9 +128,9 @@ static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, { struct mlx5hws_buddy_mem *buddy; - buddy = pool->db.buddy_manager->buddies[chunk->resource_idx]; + buddy = pool->db.buddy; if (!buddy) { - mlx5hws_err(pool->ctx, "No such buddy (%d)\n", chunk->resource_idx); + mlx5hws_err(pool->ctx, "Bad buddy state\n"); return; } @@ -139,86 +138,50 @@ static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, } static struct mlx5hws_buddy_mem * -hws_pool_buddy_get_next_buddy(struct mlx5hws_pool *pool, int idx, - u32 order, bool *is_new_buddy) +hws_pool_buddy_get_buddy(struct mlx5hws_pool *pool, u32 order) { static struct mlx5hws_buddy_mem *buddy; u32 new_buddy_size; - buddy = pool->db.buddy_manager->buddies[idx]; + buddy = pool->db.buddy; if (buddy) return buddy; new_buddy_size = max(pool->alloc_log_sz, order); - *is_new_buddy = true; buddy = mlx5hws_buddy_create(new_buddy_size); if (!buddy) { - mlx5hws_err(pool->ctx, "Failed to create buddy order: %d index: %d\n", - new_buddy_size, idx); + mlx5hws_err(pool->ctx, "Failed to create buddy order: %d\n", + new_buddy_size); return NULL; } - if (hws_pool_resource_alloc(pool, new_buddy_size, idx) != 0) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, new_buddy_size, idx); + if (hws_pool_resource_alloc(pool, new_buddy_size) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: 
%d: size %d\n", + pool->type, new_buddy_size); mlx5hws_buddy_cleanup(buddy); return NULL; } - pool->db.buddy_manager->buddies[idx] = buddy; + pool->db.buddy = buddy; return buddy; } static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, int order, - u32 *buddy_idx, int *seg) { struct mlx5hws_buddy_mem *buddy; - bool new_mem = false; - int ret = 0; - int i; - - *seg = -1; - - /* Find the next free place from the buddy array */ - while (*seg < 0) { - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = hws_pool_buddy_get_next_buddy(pool, i, - order, - &new_mem); - if (!buddy) { - ret = -ENOMEM; - goto out; - } - - *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg >= 0) - goto found; - - if (pool->flags & MLX5HWS_POOL_FLAGS_ONE_RESOURCE) { - mlx5hws_err(pool->ctx, - "Fail to allocate seg for one resource pool\n"); - ret = -ENOMEM; - goto out; - } - - if (new_mem) { - /* We have new memory pool, should be place for us */ - mlx5hws_err(pool->ctx, - "No memory for order: %d with buddy no: %d\n", - order, i); - ret = -ENOMEM; - goto out; - } - } - } -found: - *buddy_idx = i; -out: - return ret; + buddy = hws_pool_buddy_get_buddy(pool, order); + if (!buddy) + return -ENOMEM; + + *seg = mlx5hws_buddy_alloc_mem(buddy, order); + if (*seg >= 0) + return 0; + + return -ENOMEM; } static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, @@ -226,9 +189,7 @@ static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, { int ret = 0; - /* Go over the buddies and find next free slot */ ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, &chunk->offset); if (ret) mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", @@ -240,33 +201,21 @@ static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool) { struct mlx5hws_buddy_mem *buddy; - int i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - buddy = pool->db.buddy_manager->buddies[i]; - if (buddy) { - mlx5hws_buddy_cleanup(buddy); - kfree(buddy); - pool->db.buddy_manager->buddies[i] = NULL; - } - } - kfree(pool->db.buddy_manager); + buddy = pool->db.buddy; + if (buddy) { + mlx5hws_buddy_cleanup(buddy); + kfree(buddy); + pool->db.buddy = NULL; + } } static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) { - pool->db.buddy_manager = kzalloc(sizeof(*pool->db.buddy_manager), GFP_KERNEL); - if (!pool->db.buddy_manager) - return -ENOMEM; - if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) { - bool new_buddy; - - if (!hws_pool_buddy_get_next_buddy(pool, 0, log_range, &new_buddy)) { + if (!hws_pool_buddy_get_buddy(pool, log_range)) { mlx5hws_err(pool->ctx, "Failed allocating memory on create log_sz: %d\n", log_range); - kfree(pool->db.buddy_manager); return -ENOMEM; } } @@ -278,14 +227,13 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) return 0; } -static int hws_pool_create_resource_on_index(struct mlx5hws_pool *pool, - u32 alloc_size, int idx) +static int hws_pool_create_resource(struct mlx5hws_pool *pool, u32 alloc_size) { - int ret = hws_pool_resource_alloc(pool, alloc_size, idx); + int ret = hws_pool_resource_alloc(pool, alloc_size); if (ret) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d index: %d\n", - pool->type, alloc_size, idx); + mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", + pool->type, alloc_size); return ret; } @@ -319,7 +267,7 @@ hws_pool_element_create_new_elem(struct 
mlx5hws_pool *pool, u32 order) elem->log_size = alloc_size - order; } - if (hws_pool_create_resource_on_index(pool, alloc_size, 0)) { + if (hws_pool_create_resource(pool, alloc_size)) { mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", pool->type, alloc_size); goto free_db; @@ -355,7 +303,7 @@ static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *se static int hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) + int *seg) { struct mlx5hws_pool_elements *elem; @@ -370,7 +318,6 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, return -ENOMEM; } - *idx = 0; elem->num_of_elements++; return 0; @@ -379,21 +326,17 @@ hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, return -ENOMEM; } -static int -hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - u32 *idx, int *seg) +static int hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, + u32 order, int *seg) { - int ret, i; - - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) { - if (!pool->resource[i]) { - ret = hws_pool_create_resource_on_index(pool, order, i); - if (ret) - goto err_no_res; - *idx = i; - *seg = 0; /* One memory slot in that element */ - return 0; - } + int ret; + + if (!pool->resource) { + ret = hws_pool_create_resource(pool, order); + if (ret) + goto err_no_res; + *seg = 0; /* One memory slot in that element */ + return 0; } mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); @@ -409,9 +352,7 @@ static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool, { int ret; - /* Go over all memory elements and find/allocate free slot */ ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, &chunk->offset); if (ret) mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", @@ -423,11 +364,8 @@ static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool, static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk) { - if (unlikely(!pool->resource[chunk->resource_idx])) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE) - hws_pool_resource_free(pool, chunk->resource_idx); + hws_pool_resource_free(pool); } static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool) @@ -455,7 +393,7 @@ static void hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, struct mlx5hws_pool_elements *elem) { - hws_pool_resource_free(pool, 0); + hws_pool_resource_free(pool); bitmap_free(elem->bitmap); kfree(elem); pool->db.element = NULL; @@ -466,12 +404,9 @@ static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, { struct mlx5hws_pool_elements *elem; - if (unlikely(chunk->resource_idx)) - pr_warn("HWS: invalid resource with index %d\n", chunk->resource_idx); - elem = pool->db.element; if (!elem) { - mlx5hws_err(pool->ctx, "No such element (%d)\n", chunk->resource_idx); + mlx5hws_err(pool->ctx, "Pool element was not allocated\n"); return; } @@ -489,9 +424,7 @@ static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, { int ret = 0; - /* Go over all memory elements and find/allocate free slot */ ret = hws_pool_onesize_element_get_mem_chunk(pool, chunk->order, - &chunk->resource_idx, &chunk->offset); if (ret) mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", 
@@ -614,13 +547,10 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ int mlx5hws_pool_destroy(struct mlx5hws_pool *pool) { - int i; - mutex_destroy(&pool->lock); - for (i = 0; i < MLX5HWS_POOL_RESOURCE_ARR_SZ; i++) - if (pool->resource[i]) - hws_pool_resource_free(pool, i); + if (pool->resource) + hws_pool_resource_free(pool); hws_pool_db_unint(pool); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index f4258f83fdbf7..112a61cd29979 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -6,16 +6,12 @@ #define MLX5HWS_POOL_STC_LOG_SZ 15 -#define MLX5HWS_POOL_RESOURCE_ARR_SZ 100 - enum mlx5hws_pool_type { MLX5HWS_POOL_TYPE_STE, MLX5HWS_POOL_TYPE_STC, }; struct mlx5hws_pool_chunk { - u32 resource_idx; - /* Internal offset, relative to base index */ int offset; int order; }; @@ -72,14 +68,10 @@ enum mlx5hws_db_type { MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE, /* One resource only, all the elements are with same one size */ MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE, - /* Many resources, the memory allocated with buddy mechanism */ + /* Entries are managed using a buddy mechanism. */ MLX5HWS_POOL_DB_TYPE_BUDDY, }; -struct mlx5hws_buddy_manager { - struct mlx5hws_buddy_mem *buddies[MLX5HWS_POOL_RESOURCE_ARR_SZ]; -}; - struct mlx5hws_pool_elements { u32 num_of_elements; unsigned long *bitmap; @@ -91,7 +83,7 @@ struct mlx5hws_pool_db { enum mlx5hws_db_type type; union { struct mlx5hws_pool_elements *element; - struct mlx5hws_buddy_manager *buddy_manager; + struct mlx5hws_buddy_mem *buddy; }; }; @@ -109,8 +101,8 @@ struct mlx5hws_pool { size_t alloc_log_sz; enum mlx5hws_table_type tbl_type; enum mlx5hws_pool_optimize opt_type; - struct mlx5hws_pool_resource *resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; - struct mlx5hws_pool_resource *mirror_resource[MLX5HWS_POOL_RESOURCE_ARR_SZ]; + struct mlx5hws_pool_resource *resource; + struct mlx5hws_pool_resource *mirror_resource; /* DB */ struct mlx5hws_pool_db db; /* Functions */ @@ -131,17 +123,13 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); -static inline u32 -mlx5hws_pool_chunk_get_base_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_id(struct mlx5hws_pool *pool) { - return pool->resource[chunk->resource_idx]->base_id; + return pool->resource->base_id; } -static inline u32 -mlx5hws_pool_chunk_get_base_mirror_id(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool) { - return pool->mirror_resource[chunk->resource_idx]->base_id; + return pool->mirror_resource->base_id; } #endif /* MLX5HWS_POOL_H_ */ From d171ce3d988868bed9dc3c9eeb8428f87dd9ac85 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:34 +0300 Subject: [PATCH 138/770] net/mlx5: HWS, Refactor pool implementation Refactor the pool implementation to remove unused flags and clarify its usage. A pool represents a single range of STEs or STCs which are allocated at pool creation time. Pools are used under three patterns: 1. STCs are allocated one at a time from a global pool using a bitmap based implementation. 2. Action STEs are allocated in power-of-two blocks using a buddy algorithm. 3. 
Match STEs do not use allocation, since insertion into these tables is based on hashes or direct addressing. In such cases we use a pool only to create the STE range. Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-5-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 1 - .../mellanox/mlx5/core/steering/hws/context.c | 1 - .../mellanox/mlx5/core/steering/hws/matcher.c | 19 +- .../mellanox/mlx5/core/steering/hws/pool.c | 387 +++++------------- .../mellanox/mlx5/core/steering/hws/pool.h | 45 +- 5 files changed, 116 insertions(+), 337 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 781ba8c4f733e..39904b337b813 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -1602,7 +1602,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; pool_attr.alloc_log_sz = 1; table_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); if (!table_ste->pool) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index 9cda2774fd64a..b7cb736b74d70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -34,7 +34,6 @@ static int hws_context_pools_init(struct mlx5hws_context *ctx) /* Create an STC pool per FT type */ pool_attr.pool_type = MLX5HWS_POOL_TYPE_STC; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STC_POOL; max_log_sz = min(MLX5HWS_POOL_STC_LOG_SZ, ctx->caps->stc_alloc_log_max); pool_attr.alloc_log_sz = max(max_log_sz, ctx->caps->stc_alloc_log_gran); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 59b14db427b4c..95d31fd6c976d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -265,14 +265,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_attr.match_definer_0 = ctx->caps->linear_match_definer; } } - - /* Match pool requires implicit allocation */ - ret = mlx5hws_pool_chunk_alloc(ste_pool, ste); - if (ret) { - mlx5hws_err(ctx, "Failed to allocate STE for %s RTC", - hws_matcher_rtc_type_to_str(rtc_type)); - return ret; - } break; case HWS_MATCHER_RTC_TYPE_STE_ARRAY: @@ -357,23 +349,17 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, { struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_chunk *ste; - struct mlx5hws_pool *ste_pool; u32 rtc_0_id, rtc_1_id; switch (rtc_type) { case HWS_MATCHER_RTC_TYPE_MATCH: rtc_0_id = matcher->match_ste.rtc_0_id; rtc_1_id = matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; break; case HWS_MATCHER_RTC_TYPE_STE_ARRAY: action_ste = &matcher->action_ste; rtc_0_id = action_ste->rtc_0_id; rtc_1_id = action_ste->rtc_1_id; - ste_pool = action_ste->pool; - ste = &action_ste->ste; break; default: return; @@ 
-383,8 +369,6 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id); mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id); - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); } static int @@ -557,7 +541,7 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) /* Allocate action STE mempool */ pool_attr.table_type = tbl->type; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL; + pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY; /* Pool size is similar to action RTC size */ pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) + matcher->attr.table.sz_row_log + @@ -636,7 +620,6 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) /* Create an STE pool per matcher*/ pool_attr.table_type = matcher->tbl->type; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL; pool_attr.alloc_log_sz = matcher->attr.table.sz_col_log + matcher->attr.table.sz_row_log; hws_matcher_set_pool_attr(&pool_attr, matcher); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 0de03e17624c6..270b333faab3b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -60,10 +60,8 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, ret = -EINVAL; } - if (ret) { - mlx5hws_err(pool->ctx, "Failed to allocate resource objects\n"); + if (ret) goto free_resource; - } resource->pool = pool; resource->range = 1 << log_range; @@ -76,17 +74,17 @@ hws_pool_create_one_resource(struct mlx5hws_pool *pool, u32 log_range, return NULL; } -static int -hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range) +static int hws_pool_resource_alloc(struct mlx5hws_pool *pool) { struct mlx5hws_pool_resource *resource; u32 fw_ft_type, opt_log_range; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ? + 0 : pool->alloc_log_sz; resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!resource) { - mlx5hws_err(pool->ctx, "Failed allocating resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate resource\n"); return -EINVAL; } @@ -96,10 +94,11 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range) struct mlx5hws_pool_resource *mirror_resource; fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true); - opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_range; + opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 
+ 0 : pool->alloc_log_sz; mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type); if (!mirror_resource) { - mlx5hws_err(pool->ctx, "Failed allocating mirrored resource\n"); + mlx5hws_err(pool->ctx, "Failed to allocate mirrored resource\n"); hws_pool_free_one_resource(resource); pool->resource = NULL; return -EINVAL; @@ -110,92 +109,58 @@ hws_pool_resource_alloc(struct mlx5hws_pool *pool, u32 log_range) return 0; } -static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) -{ - unsigned long *cur_bmp; - - cur_bmp = bitmap_zalloc(1 << log_range, GFP_KERNEL); - if (!cur_bmp) - return NULL; - - bitmap_fill(cur_bmp, 1 << log_range); - - return cur_bmp; -} - -static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static int hws_pool_buddy_init(struct mlx5hws_pool *pool) { struct mlx5hws_buddy_mem *buddy; - buddy = pool->db.buddy; + buddy = mlx5hws_buddy_create(pool->alloc_log_sz); if (!buddy) { - mlx5hws_err(pool->ctx, "Bad buddy state\n"); - return; - } - - mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order); -} - -static struct mlx5hws_buddy_mem * -hws_pool_buddy_get_buddy(struct mlx5hws_pool *pool, u32 order) -{ - static struct mlx5hws_buddy_mem *buddy; - u32 new_buddy_size; - - buddy = pool->db.buddy; - if (buddy) - return buddy; - - new_buddy_size = max(pool->alloc_log_sz, order); - buddy = mlx5hws_buddy_create(new_buddy_size); - if (!buddy) { - mlx5hws_err(pool->ctx, "Failed to create buddy order: %d\n", - new_buddy_size); - return NULL; + mlx5hws_err(pool->ctx, "Failed to create buddy order: %zu\n", + pool->alloc_log_sz); + return -ENOMEM; } - if (hws_pool_resource_alloc(pool, new_buddy_size) != 0) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", - pool->type, new_buddy_size); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d size %zu\n", + pool->type, pool->alloc_log_sz); mlx5hws_buddy_cleanup(buddy); - return NULL; + return -ENOMEM; } pool->db.buddy = buddy; - return buddy; + return 0; } -static int hws_pool_buddy_get_mem_chunk(struct mlx5hws_pool *pool, - int order, - int *seg) +static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - struct mlx5hws_buddy_mem *buddy; + struct mlx5hws_buddy_mem *buddy = pool->db.buddy; - buddy = hws_pool_buddy_get_buddy(pool, order); - if (!buddy) - return -ENOMEM; + if (!buddy) { + mlx5hws_err(pool->ctx, "Bad buddy state\n"); + return -EINVAL; + } - *seg = mlx5hws_buddy_alloc_mem(buddy, order); - if (*seg >= 0) + chunk->offset = mlx5hws_buddy_alloc_mem(buddy, chunk->order); + if (chunk->offset >= 0) return 0; return -ENOMEM; } -static int hws_pool_buddy_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_buddy_db_put_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - int ret = 0; + struct mlx5hws_buddy_mem *buddy; - ret = hws_pool_buddy_get_mem_chunk(pool, chunk->order, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); + buddy = pool->db.buddy; + if (!buddy) { + mlx5hws_err(pool->ctx, "Bad buddy state\n"); + return; + } - return ret; + mlx5hws_buddy_free_mem(buddy, chunk->offset, chunk->order); } static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool) @@ -210,15 +175,13 @@ static void hws_pool_buddy_db_uninit(struct mlx5hws_pool *pool) } } -static int hws_pool_buddy_db_init(struct 
mlx5hws_pool *pool, u32 log_range) +static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool) { - if (pool->flags & MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE) { - if (!hws_pool_buddy_get_buddy(pool, log_range)) { - mlx5hws_err(pool->ctx, - "Failed allocating memory on create log_sz: %d\n", log_range); - return -ENOMEM; - } - } + int ret; + + ret = hws_pool_buddy_init(pool); + if (ret) + return ret; pool->p_db_uninit = &hws_pool_buddy_db_uninit; pool->p_get_chunk = &hws_pool_buddy_db_get_chunk; @@ -227,234 +190,105 @@ static int hws_pool_buddy_db_init(struct mlx5hws_pool *pool, u32 log_range) return 0; } -static int hws_pool_create_resource(struct mlx5hws_pool *pool, u32 alloc_size) -{ - int ret = hws_pool_resource_alloc(pool, alloc_size); - - if (ret) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", - pool->type, alloc_size); - return ret; - } - - return 0; -} - -static struct mlx5hws_pool_elements * -hws_pool_element_create_new_elem(struct mlx5hws_pool *pool, u32 order) +static unsigned long *hws_pool_create_and_init_bitmap(u32 log_range) { - struct mlx5hws_pool_elements *elem; - u32 alloc_size; - - alloc_size = pool->alloc_log_sz; + unsigned long *bitmap; - elem = kzalloc(sizeof(*elem), GFP_KERNEL); - if (!elem) + bitmap = bitmap_zalloc(1 << log_range, GFP_KERNEL); + if (!bitmap) return NULL; - /* Sharing the same resource, also means that all the elements are with size 1 */ - if ((pool->flags & MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS) && - !(pool->flags & MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) { - /* Currently all chunks in size 1 */ - elem->bitmap = hws_pool_create_and_init_bitmap(alloc_size - order); - if (!elem->bitmap) { - mlx5hws_err(pool->ctx, - "Failed to create bitmap type: %d: size %d\n", - pool->type, alloc_size); - goto free_elem; - } - - elem->log_size = alloc_size - order; - } - - if (hws_pool_create_resource(pool, alloc_size)) { - mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %d\n", - pool->type, alloc_size); - goto free_db; - } - - pool->db.element = elem; + bitmap_fill(bitmap, 1 << log_range); - return elem; - -free_db: - bitmap_free(elem->bitmap); -free_elem: - kfree(elem); - return NULL; + return bitmap; } -static int hws_pool_element_find_seg(struct mlx5hws_pool_elements *elem, int *seg) +static int hws_pool_bitmap_init(struct mlx5hws_pool *pool) { - unsigned int segment, size; + unsigned long *bitmap; - size = 1 << elem->log_size; - - segment = find_first_bit(elem->bitmap, size); - if (segment >= size) { - elem->is_full = true; + bitmap = hws_pool_create_and_init_bitmap(pool->alloc_log_sz); + if (!bitmap) { + mlx5hws_err(pool->ctx, "Failed to create bitmap order: %zu\n", + pool->alloc_log_sz); return -ENOMEM; } - bitmap_clear(elem->bitmap, segment, 1); - *seg = segment; - return 0; -} - -static int -hws_pool_onesize_element_get_mem_chunk(struct mlx5hws_pool *pool, u32 order, - int *seg) -{ - struct mlx5hws_pool_elements *elem; - - elem = pool->db.element; - if (!elem) - elem = hws_pool_element_create_new_elem(pool, order); - if (!elem) - goto err_no_elem; - - if (hws_pool_element_find_seg(elem, seg) != 0) { - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); + if (hws_pool_resource_alloc(pool) != 0) { + mlx5hws_err(pool->ctx, "Failed to create resource type: %d: size %zu\n", + pool->type, pool->alloc_log_sz); + bitmap_free(bitmap); return -ENOMEM; } - elem->num_of_elements++; - return 0; + pool->db.bitmap = bitmap; -err_no_elem: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: 
%d\n", order); - return -ENOMEM; + return 0; } -static int hws_pool_general_element_get_mem_chunk(struct mlx5hws_pool *pool, - u32 order, int *seg) +static int hws_pool_bitmap_db_get_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - int ret; + unsigned long *bitmap, size; - if (!pool->resource) { - ret = hws_pool_create_resource(pool, order); - if (ret) - goto err_no_res; - *seg = 0; /* One memory slot in that element */ - return 0; + if (chunk->order != 0) { + mlx5hws_err(pool->ctx, "Pool only supports order 0 allocs\n"); + return -EINVAL; } - mlx5hws_err(pool->ctx, "No more resources (last request order: %d)\n", order); - return -ENOMEM; - -err_no_res: - mlx5hws_err(pool->ctx, "Failed to allocate element for order: %d\n", order); - return -ENOMEM; -} - -static int hws_pool_general_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - int ret; - - ret = hws_pool_general_element_get_mem_chunk(pool, chunk->order, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); - - return ret; -} + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); + return -EINVAL; + } -static void hws_pool_general_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) -{ - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE) - hws_pool_resource_free(pool); -} + size = 1 << pool->alloc_log_sz; -static void hws_pool_general_element_db_uninit(struct mlx5hws_pool *pool) -{ - (void)pool; -} + chunk->offset = find_first_bit(bitmap, size); + if (chunk->offset >= size) + return -ENOMEM; -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * allocate resource and give it. - * - When free that chunk: - * the resource is freed. 
- */ -static int hws_pool_general_element_db_init(struct mlx5hws_pool *pool) -{ - pool->p_db_uninit = &hws_pool_general_element_db_uninit; - pool->p_get_chunk = &hws_pool_general_element_db_get_chunk; - pool->p_put_chunk = &hws_pool_general_element_db_put_chunk; + bitmap_clear(bitmap, chunk->offset, 1); return 0; } -static void -hws_onesize_element_db_destroy_element(struct mlx5hws_pool *pool, - struct mlx5hws_pool_elements *elem) -{ - hws_pool_resource_free(pool); - bitmap_free(elem->bitmap); - kfree(elem); - pool->db.element = NULL; -} - -static void hws_onesize_element_db_put_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_put_chunk(struct mlx5hws_pool *pool, + struct mlx5hws_pool_chunk *chunk) { - struct mlx5hws_pool_elements *elem; + unsigned long *bitmap; - elem = pool->db.element; - if (!elem) { - mlx5hws_err(pool->ctx, "Pool element was not allocated\n"); + bitmap = pool->db.bitmap; + if (!bitmap) { + mlx5hws_err(pool->ctx, "Bad bitmap state\n"); return; } - bitmap_set(elem->bitmap, chunk->offset, 1); - elem->is_full = false; - elem->num_of_elements--; - - if (pool->flags & MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE && - !elem->num_of_elements) - hws_onesize_element_db_destroy_element(pool, elem); + bitmap_set(bitmap, chunk->offset, 1); } -static int hws_onesize_element_db_get_chunk(struct mlx5hws_pool *pool, - struct mlx5hws_pool_chunk *chunk) +static void hws_pool_bitmap_db_uninit(struct mlx5hws_pool *pool) { - int ret = 0; - - ret = hws_pool_onesize_element_get_mem_chunk(pool, chunk->order, - &chunk->offset); - if (ret) - mlx5hws_err(pool->ctx, "Failed to get free slot for chunk with order: %d\n", - chunk->order); + unsigned long *bitmap; - return ret; + bitmap = pool->db.bitmap; + if (bitmap) { + bitmap_free(bitmap); + pool->db.bitmap = NULL; + } } -static void hws_onesize_element_db_uninit(struct mlx5hws_pool *pool) +static int hws_pool_bitmap_db_init(struct mlx5hws_pool *pool) { - struct mlx5hws_pool_elements *elem = pool->db.element; + int ret; - if (elem) { - bitmap_free(elem->bitmap); - kfree(elem); - pool->db.element = NULL; - } -} + ret = hws_pool_bitmap_init(pool); + if (ret) + return ret; -/* This memory management works as the following: - * - At start doesn't allocate no mem at all. - * - When new request for chunk arrived: - * aloocate the first and only slot of memory/resource - * when it ended return error. 
- */ -static int hws_pool_onesize_element_db_init(struct mlx5hws_pool *pool) -{ - pool->p_db_uninit = &hws_onesize_element_db_uninit; - pool->p_get_chunk = &hws_onesize_element_db_get_chunk; - pool->p_put_chunk = &hws_onesize_element_db_put_chunk; + pool->p_db_uninit = &hws_pool_bitmap_db_uninit; + pool->p_get_chunk = &hws_pool_bitmap_db_get_chunk; + pool->p_put_chunk = &hws_pool_bitmap_db_put_chunk; return 0; } @@ -464,15 +298,14 @@ static int hws_pool_db_init(struct mlx5hws_pool *pool, { int ret; - if (db_type == MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE) - ret = hws_pool_general_element_db_init(pool); - else if (db_type == MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE) - ret = hws_pool_onesize_element_db_init(pool); + if (db_type == MLX5HWS_POOL_DB_TYPE_BITMAP) + ret = hws_pool_bitmap_db_init(pool); else - ret = hws_pool_buddy_db_init(pool, pool->alloc_log_sz); + ret = hws_pool_buddy_db_init(pool); if (ret) { - mlx5hws_err(pool->ctx, "Failed to init general db : %d (ret: %d)\n", db_type, ret); + mlx5hws_err(pool->ctx, "Failed to init pool type: %d (ret: %d)\n", + db_type, ret); return ret; } @@ -521,15 +354,10 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ pool->tbl_type = pool_attr->table_type; pool->opt_type = pool_attr->opt_type; - /* Support general db */ - if (pool->flags == (MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK)) - res_db_type = MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE; - else if (pool->flags == (MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS)) - res_db_type = MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE; - else + if (pool->flags & MLX5HWS_POOL_FLAG_BUDDY) res_db_type = MLX5HWS_POOL_DB_TYPE_BUDDY; + else + res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP; pool->alloc_log_sz = pool_attr->alloc_log_sz; @@ -545,7 +373,7 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ return NULL; } -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool) +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool) { mutex_destroy(&pool->lock); @@ -555,5 +383,4 @@ int mlx5hws_pool_destroy(struct mlx5hws_pool *pool) hws_pool_db_unint(pool); kfree(pool); - return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index 112a61cd29979..9a781a87f097b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -23,29 +23,10 @@ struct mlx5hws_pool_resource { }; enum mlx5hws_pool_flags { - /* Only a one resource in that pool */ - MLX5HWS_POOL_FLAGS_ONE_RESOURCE = 1 << 0, - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE = 1 << 1, - /* No sharing resources between chunks */ - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK = 1 << 2, - /* All objects are in the same size */ - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS = 1 << 3, - /* Managed by buddy allocator */ - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED = 1 << 4, - /* Allocate pool_type memory on pool creation */ - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE = 1 << 5, - - /* These values should be used by the caller */ - MLX5HWS_POOL_FLAGS_FOR_STC_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_FIXED_SIZE_OBJECTS, - MLX5HWS_POOL_FLAGS_FOR_MATCHER_STE_POOL = - MLX5HWS_POOL_FLAGS_RELEASE_FREE_RESOURCE | - MLX5HWS_POOL_FLAGS_RESOURCE_PER_CHUNK, - MLX5HWS_POOL_FLAGS_FOR_STE_ACTION_POOL = - MLX5HWS_POOL_FLAGS_ONE_RESOURCE | - MLX5HWS_POOL_FLAGS_BUDDY_MANAGED | - MLX5HWS_POOL_FLAGS_ALLOC_MEM_ON_CREATE, + /* Managed by 
a buddy allocator. If this is not set only allocations of + * order 0 are supported. + */ + MLX5HWS_POOL_FLAG_BUDDY = BIT(0), }; enum mlx5hws_pool_optimize { @@ -64,25 +45,16 @@ struct mlx5hws_pool_attr { }; enum mlx5hws_db_type { - /* Uses for allocating chunk of big memory, each element has its own resource in the FW*/ - MLX5HWS_POOL_DB_TYPE_GENERAL_SIZE, - /* One resource only, all the elements are with same one size */ - MLX5HWS_POOL_DB_TYPE_ONE_SIZE_RESOURCE, + /* Uses a bitmap, supports only allocations of order 0. */ + MLX5HWS_POOL_DB_TYPE_BITMAP, /* Entries are managed using a buddy mechanism. */ MLX5HWS_POOL_DB_TYPE_BUDDY, }; -struct mlx5hws_pool_elements { - u32 num_of_elements; - unsigned long *bitmap; - u32 log_size; - bool is_full; -}; - struct mlx5hws_pool_db { enum mlx5hws_db_type type; union { - struct mlx5hws_pool_elements *element; + unsigned long *bitmap; struct mlx5hws_buddy_mem *buddy; }; }; @@ -103,7 +75,6 @@ struct mlx5hws_pool { enum mlx5hws_pool_optimize opt_type; struct mlx5hws_pool_resource *resource; struct mlx5hws_pool_resource *mirror_resource; - /* DB */ struct mlx5hws_pool_db db; /* Functions */ mlx5hws_pool_unint_db p_db_uninit; @@ -115,7 +86,7 @@ struct mlx5hws_pool * mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_attr); -int mlx5hws_pool_destroy(struct mlx5hws_pool *pool); +void mlx5hws_pool_destroy(struct mlx5hws_pool *pool); int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, struct mlx5hws_pool_chunk *chunk); From 43a2038c6d8a810e8e70f0e7fcb965f431c92bfb Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:35 +0300 Subject: [PATCH 139/770] net/mlx5: HWS, Cleanup after pool refactoring Remove members which are no longer used. In fact, many of the `struct mlx5hws_pool_chunk` instances were not even written to beyond being initialized, but they were used in various internals. Also clean up some local variables which made more sense when the API was thicker.
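To recap (editor's sketch, not a verbatim excerpt), after this series a chunk is reduced to the two fields the bitmap and buddy databases actually need:

	struct mlx5hws_pool_chunk {
		int offset;	/* offset inside the pool's single STE/STC range */
		int order;	/* log2 of the number of entries requested */
	};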
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-6-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 5 -- .../mellanox/mlx5/core/steering/hws/cmd.c | 1 - .../mellanox/mlx5/core/steering/hws/cmd.h | 1 - .../mellanox/mlx5/core/steering/hws/matcher.c | 47 ++++++------------- .../mellanox/mlx5/core/steering/hws/matcher.h | 2 - 5 files changed, 14 insertions(+), 42 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 39904b337b813..161ad720b3396 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -1583,7 +1583,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, struct mlx5hws_matcher_action_ste *table_ste; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; @@ -1613,8 +1612,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_0_id = &table_ste->rtc_0_id; rtc_1_id = &table_ste->rtc_1_id; ste_pool = table_ste->pool; - ste = &table_ste->ste; - ste->order = 1; rtc_attr.log_size = 0; rtc_attr.log_depth = 0; @@ -1630,7 +1627,6 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(MLX5HWS_TABLE_TYPE_FDB, false); @@ -1833,7 +1829,6 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = table_ste->ste; stc_attr.ste_table.ste_pool = table_ste->pool; stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index e8f98c109b999..9c83753e45924 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -406,7 +406,6 @@ int mlx5hws_cmd_rtc_create(struct mlx5_core_dev *mdev, MLX5_SET(rtc, attr, match_definer_1, rtc_attr->match_definer_1); MLX5_SET(rtc, attr, stc_id, rtc_attr->stc_base); MLX5_SET(rtc, attr, ste_table_base_id, rtc_attr->ste_base); - MLX5_SET(rtc, attr, ste_table_offset, rtc_attr->ste_offset); MLX5_SET(rtc, attr, miss_flow_table_id, rtc_attr->miss_ft_id); MLX5_SET(rtc, attr, reparse_mode, rtc_attr->reparse_mode); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index 51d9e0291ac16..fa6bff210266c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -70,7 +70,6 @@ struct mlx5hws_cmd_rtc_create_attr { u32 pd; u32 stc_base; u32 ste_base; - u32 ste_offset; u32 miss_ft_id; bool fw_gen_wqe; u8 update_index_mode; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c 
b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 95d31fd6c976d..3028e0387e3f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -197,22 +197,15 @@ static int hws_matcher_disconnect(struct mlx5hws_matcher *matcher) static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, struct mlx5hws_cmd_rtc_create_attr *rtc_attr, - enum mlx5hws_matcher_rtc_type rtc_type, bool is_mirror) { - struct mlx5hws_pool_chunk *ste = &matcher->action_ste.ste; enum mlx5hws_matcher_flow_src flow_src = matcher->attr.optimize_flow_src; - bool is_match_rtc = rtc_type == HWS_MATCHER_RTC_TYPE_MATCH; if ((flow_src == MLX5HWS_MATCHER_FLOW_SRC_VPORT && !is_mirror) || (flow_src == MLX5HWS_MATCHER_FLOW_SRC_WIRE && is_mirror)) { /* Optimize FDB RTC */ rtc_attr->log_size = 0; rtc_attr->log_depth = 0; - } else { - /* Keep original values */ - rtc_attr->log_size = is_match_rtc ? matcher->attr.table.sz_row_log : ste->order; - rtc_attr->log_depth = is_match_rtc ? matcher->attr.table.sz_col_log : 0; } } @@ -225,8 +218,7 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, struct mlx5hws_context *ctx = matcher->tbl->ctx; struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool *ste_pool, *stc_pool; - struct mlx5hws_pool_chunk *ste; + struct mlx5hws_pool *ste_pool; u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; @@ -236,8 +228,6 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_0_id = &matcher->match_ste.rtc_0_id; rtc_1_id = &matcher->match_ste.rtc_1_id; ste_pool = matcher->match_ste.pool; - ste = &matcher->match_ste.ste; - ste->order = attr->table.sz_col_log + attr->table.sz_row_log; rtc_attr.log_size = attr->table.sz_row_log; rtc_attr.log_depth = attr->table.sz_col_log; @@ -273,16 +263,15 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_0_id = &action_ste->rtc_0_id; rtc_1_id = &action_ste->rtc_1_id; ste_pool = action_ste->pool; - ste = &action_ste->ste; /* Action RTC size calculation: * log((max number of rules in matcher) * * (max number of action STEs per rule) * * (2 to support writing new STEs for update rule)) */ - ste->order = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - attr->table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - rtc_attr.log_size = ste->order; + rtc_attr.log_size = + ilog2(roundup_pow_of_two(action_ste->max_stes)) + + attr->table.sz_row_log + + MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; rtc_attr.log_depth = 0; rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; /* The action STEs use the default always hit definer */ @@ -300,21 +289,19 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; - rtc_attr.ste_offset = ste->offset; rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, false); - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, false); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, false); /* STC is a single resource (obj_id), use any STC for the ID */ - stc_pool = ctx->stc_pool; - obj_id = mlx5hws_pool_get_base_id(stc_pool); + obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); if (ret) { mlx5hws_err(ctx, "Failed to create matcher RTC of type %s", 
hws_matcher_rtc_type_to_str(rtc_type)); - goto free_ste; + return ret; } if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) { @@ -322,9 +309,9 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true); - obj_id = mlx5hws_pool_get_base_mirror_id(stc_pool); + obj_id = mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, rtc_type, true); + hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true); ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); if (ret) { @@ -338,16 +325,12 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, destroy_rtc_0: mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id); -free_ste: - if (rtc_type == HWS_MATCHER_RTC_TYPE_MATCH) - mlx5hws_pool_chunk_free(ste_pool, ste); return ret; } static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, enum mlx5hws_matcher_rtc_type rtc_type) { - struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; u32 rtc_0_id, rtc_1_id; @@ -357,18 +340,17 @@ static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, rtc_1_id = matcher->match_ste.rtc_1_id; break; case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = &matcher->action_ste; - rtc_0_id = action_ste->rtc_0_id; - rtc_1_id = action_ste->rtc_1_id; + rtc_0_id = matcher->action_ste.rtc_0_id; + rtc_1_id = matcher->action_ste.rtc_1_id; break; default: return; } if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_1_id); + mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_1_id); - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, rtc_0_id); + mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_0_id); } static int @@ -564,7 +546,6 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste = action_ste->ste; stc_attr.ste_table.ste_pool = action_ste->pool; stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 20b32012c418b..0450b6175ac9e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -45,14 +45,12 @@ struct mlx5hws_match_template { }; struct mlx5hws_matcher_match_ste { - struct mlx5hws_pool_chunk ste; u32 rtc_0_id; u32 rtc_1_id; struct mlx5hws_pool *pool; }; struct mlx5hws_matcher_action_ste { - struct mlx5hws_pool_chunk ste; struct mlx5hws_pool_chunk stc; u32 rtc_0_id; u32 rtc_1_id; From 04562694766514f00e7086d3d4884db5f3a22d4e Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:36 +0300 Subject: [PATCH 140/770] net/mlx5: HWS, Add fullness tracking to pool Future users will need to query whether a pool is empty. 
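A hypothetical usage sketch (the surrounding logic is illustrative; only the two inline helpers come from this patch):

	/* Every element available again means no chunks are outstanding,
	 * so the pool's HW resources may safely be released.
	 */
	if (mlx5hws_pool_full(pool))
		mlx5hws_pool_destroy(pool);

	/* Nothing left to hand out; the caller must look elsewhere or
	 * create another pool.
	 */
	if (mlx5hws_pool_empty(pool))
		return -ENOMEM;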
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-7-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/pool.c | 7 ++++++ .../mellanox/mlx5/core/steering/hws/pool.h | 25 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c index 270b333faab3b..26d85fe3c417d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c @@ -324,6 +324,8 @@ int mlx5hws_pool_chunk_alloc(struct mlx5hws_pool *pool, mutex_lock(&pool->lock); ret = pool->p_get_chunk(pool, chunk); + if (ret == 0) + pool->available_elems -= 1 << chunk->order; mutex_unlock(&pool->lock); return ret; @@ -334,6 +336,7 @@ void mlx5hws_pool_chunk_free(struct mlx5hws_pool *pool, { mutex_lock(&pool->lock); pool->p_put_chunk(pool, chunk); + pool->available_elems += 1 << chunk->order; mutex_unlock(&pool->lock); } @@ -360,6 +363,7 @@ mlx5hws_pool_create(struct mlx5hws_context *ctx, struct mlx5hws_pool_attr *pool_ res_db_type = MLX5HWS_POOL_DB_TYPE_BITMAP; pool->alloc_log_sz = pool_attr->alloc_log_sz; + pool->available_elems = 1 << pool_attr->alloc_log_sz; if (hws_pool_db_init(pool, res_db_type)) goto free_pool; @@ -377,6 +381,9 @@ void mlx5hws_pool_destroy(struct mlx5hws_pool *pool) { mutex_destroy(&pool->lock); + if (pool->available_elems != 1 << pool->alloc_log_sz) + mlx5hws_err(pool->ctx, "Attempting to destroy non-empty pool\n"); + if (pool->resource) hws_pool_resource_free(pool); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index 9a781a87f097b..c82760d53e1a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -71,6 +71,7 @@ struct mlx5hws_pool { enum mlx5hws_pool_flags flags; struct mutex lock; /* protect the pool */ size_t alloc_log_sz; + size_t available_elems; enum mlx5hws_table_type tbl_type; enum mlx5hws_pool_optimize opt_type; struct mlx5hws_pool_resource *resource; @@ -103,4 +104,28 @@ static inline u32 mlx5hws_pool_get_base_mirror_id(struct mlx5hws_pool *pool) { return pool->mirror_resource->base_id; } + +static inline bool +mlx5hws_pool_empty(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == 0; + mutex_unlock(&pool->lock); + + return ret; +} + +static inline bool +mlx5hws_pool_full(struct mlx5hws_pool *pool) +{ + bool ret; + + mutex_lock(&pool->lock); + ret = pool->available_elems == (1 << pool->alloc_log_sz); + mutex_unlock(&pool->lock); + + return ret; +} #endif /* MLX5HWS_POOL_H_ */ From a68334f9750f41fc36990840090ef9dbee1e2c7e Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:37 +0300 Subject: [PATCH 141/770] net/mlx5: HWS, Fix pool size optimization The optimization to create a size-one STE range for the unused direction was broken. The hardware prevents us from creating RTCs over unallocated STE space, so the only reason this has worked so far is that the optimization was never used.
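Spelled out (editor's note; the rx/tx reading of the enum follows hws_pool_opt_to_str() introduced in the next patch):

	/* MLX5HWS_POOL_OPTIMIZE_ORIG   - rx-only: only the mirror (tx)
	 *                                resource should be the size-one stub.
	 * MLX5HWS_POOL_OPTIMIZE_MIRROR - tx-only: only the original (rx)
	 *                                resource should be the size-one stub.
	 * The two conditions were swapped, so the stub would have been
	 * created for the direction actually in use.
	 */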
Signed-off-by: Vlad Dogaru
Reviewed-by: Yevgeny Kliteynik
Reviewed-by: Mark Bloch
Signed-off-by: Tariq Toukan
Reviewed-by: Michal Kubiak
Link: https://patch.msgid.link/1744312662-356571-8-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
index 26d85fe3c417d..7e37d6e9eb836 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.c
@@ -80,7 +80,7 @@ static int hws_pool_resource_alloc(struct mlx5hws_pool *pool)
 	u32 fw_ft_type, opt_log_range;
 
 	fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, false);
-	opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ?
+	opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ?
 				0 : pool->alloc_log_sz;
 	resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
 	if (!resource) {
@@ -94,7 +94,7 @@ static int hws_pool_resource_alloc(struct mlx5hws_pool *pool)
 		struct mlx5hws_pool_resource *mirror_resource;
 
 		fw_ft_type = mlx5hws_table_get_res_fw_ft_type(pool->tbl_type, true);
-		opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_MIRROR ?
+		opt_log_range = pool->opt_type == MLX5HWS_POOL_OPTIMIZE_ORIG ?
 					0 : pool->alloc_log_sz;
 		mirror_resource = hws_pool_create_one_resource(pool, opt_log_range, fw_ft_type);
 		if (!mirror_resource) {

From 983d01b2ce0ac688bb42489f33a29a02274366d5 Mon Sep 17 00:00:00 2001
From: Vlad Dogaru
Date: Thu, 10 Apr 2025 22:17:38 +0300
Subject: [PATCH 142/770] net/mlx5: HWS, Implement action STE pool

Implement a per-queue pool of action STEs that match STEs can link to,
regardless of matcher.

The code relies on hints to optimize whether a given rule is added to
rx-only, tx-only or both. Correspondingly, action STEs need to be added
to different RTCs for ingress or egress paths.

For rx-and-tx rules, the current rule implementation dictates that the
offsets for a given rule must be the same in both RTCs. To avoid wasting
STEs, each action STE pool element holds 3 pools: rx-only, tx-only, and
rx-and-tx, corresponding to the possible values of the pool optimization
enum. The implementation then chooses at rule creation / update which of
these elements to allocate from.

Each element holds multiple action STE tables, which wrap an RTC, an STE
range, the logic to buddy-allocate offsets from the range, and an STC
that allows match STEs to point to this table.

When allocating offsets from an element, we iterate through available
action STE tables and, if needed, create a new table.

Similar to the previous implementation, this iteration does not free any
resources. This is implemented in a subsequent patch.
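The allocation walk described above condenses to the sketch below. Locking and
the available/full list bookkeeping of the actual
mlx5hws_action_ste_chunk_alloc() are omitted, and chunk->ste.order is assumed
to have been filled in by the caller:

	static int chunk_alloc_sketch(struct mlx5hws_action_ste_pool *pool,
				      bool skip_rx, bool skip_tx,
				      struct mlx5hws_action_ste_chunk *chunk)
	{
		struct mlx5hws_action_ste_pool_element *elem;
		struct mlx5hws_action_ste_table *tbl;
		int err;

		/* 1. The RX/TX hints select one of the three elements. */
		if (skip_rx)
			elem = &pool->elems[MLX5HWS_POOL_OPTIMIZE_MIRROR]; /* tx-only */
		else if (skip_tx)
			elem = &pool->elems[MLX5HWS_POOL_OPTIMIZE_ORIG];   /* rx-only */
		else
			elem = &pool->elems[MLX5HWS_POOL_OPTIMIZE_NONE];   /* rx-and-tx */

		/* 2. First fit among tables that still have free STEs. */
		list_for_each_entry(tbl, &elem->available, list_node) {
			if (!mlx5hws_pool_chunk_alloc(tbl->pool, &chunk->ste))
				goto done;
		}

		/* 3. No room anywhere: grow the element with a new table. */
		tbl = hws_action_ste_table_alloc(elem);
		if (IS_ERR(tbl))
			return PTR_ERR(tbl);

		err = mlx5hws_pool_chunk_alloc(tbl->pool, &chunk->ste);
		if (err)
			return err;

	done:
		chunk->action_tbl = tbl;
		return 0;
	}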
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-9-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/Makefile | 3 +- .../mlx5/core/steering/hws/action_ste_pool.c | 387 ++++++++++++++++++ .../mlx5/core/steering/hws/action_ste_pool.h | 58 +++ .../mellanox/mlx5/core/steering/hws/context.c | 7 + .../mellanox/mlx5/core/steering/hws/context.h | 1 + .../mlx5/core/steering/hws/internal.h | 1 + .../mellanox/mlx5/core/steering/hws/pool.h | 1 + 7 files changed, 457 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 568bbe5f83f57..d292e6a9e22c3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -154,7 +154,8 @@ mlx5_core-$(CONFIG_MLX5_HW_STEERING) += steering/hws/cmd.o \ steering/hws/vport.o \ steering/hws/bwc_complex.o \ steering/hws/fs_hws_pools.o \ - steering/hws/fs_hws.o + steering/hws/fs_hws.o \ + steering/hws/action_ste_pool.o # # SF device diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c new file mode 100644 index 0000000000000..cb6ad84116315 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c @@ -0,0 +1,387 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#include "internal.h" + +static const char * +hws_pool_opt_to_str(enum mlx5hws_pool_optimize opt) +{ + switch (opt) { + case MLX5HWS_POOL_OPTIMIZE_NONE: + return "rx-and-tx"; + case MLX5HWS_POOL_OPTIMIZE_ORIG: + return "rx-only"; + case MLX5HWS_POOL_OPTIMIZE_MIRROR: + return "tx-only"; + default: + return "unknown"; + } +} + +static int +hws_action_ste_table_create_pool(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + struct mlx5hws_pool_attr pool_attr = { 0 }; + + pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; + pool_attr.table_type = MLX5HWS_TABLE_TYPE_FDB; + pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY; + pool_attr.opt_type = opt; + pool_attr.alloc_log_sz = log_sz; + + action_tbl->pool = mlx5hws_pool_create(ctx, &pool_attr); + if (!action_tbl->pool) { + mlx5hws_err(ctx, "Failed to allocate STE pool\n"); + return -EINVAL; + } + + return 0; +} + +static int hws_action_ste_table_create_single_rtc( + struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz, bool tx) +{ + struct mlx5hws_cmd_rtc_create_attr rtc_attr = { 0 }; + u32 *rtc_id; + + rtc_attr.log_depth = 0; + rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + /* Action STEs use the default always hit definer. 
*/ + rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; + rtc_attr.is_frst_jumbo = false; + rtc_attr.miss_ft_id = 0; + rtc_attr.pd = ctx->pd_num; + rtc_attr.reparse_mode = mlx5hws_context_get_reparse_mode(ctx); + + if (tx) { + rtc_attr.table_type = FS_FT_FDB_TX; + rtc_attr.ste_base = + mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + rtc_attr.stc_base = + mlx5hws_pool_get_base_mirror_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_ORIG ? 0 : log_sz; + rtc_id = &action_tbl->rtc_1_id; + } else { + rtc_attr.table_type = FS_FT_FDB_RX; + rtc_attr.ste_base = mlx5hws_pool_get_base_id(action_tbl->pool); + rtc_attr.stc_base = mlx5hws_pool_get_base_id(ctx->stc_pool); + rtc_attr.log_size = + opt == MLX5HWS_POOL_OPTIMIZE_MIRROR ? 0 : log_sz; + rtc_id = &action_tbl->rtc_0_id; + } + + return mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_id); +} + +static int +hws_action_ste_table_create_rtcs(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl, + enum mlx5hws_pool_optimize opt, size_t log_sz) +{ + int err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, false); + if (err) + return err; + + err = hws_action_ste_table_create_single_rtc(ctx, action_tbl, opt, + log_sz, true); + if (err) { + mlx5hws_cmd_rtc_destroy(ctx->mdev, action_tbl->rtc_0_id); + return err; + } + + return 0; +} + +static void +hws_action_ste_table_destroy_rtcs(struct mlx5hws_action_ste_table *action_tbl) +{ + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_1_id); + mlx5hws_cmd_rtc_destroy(action_tbl->pool->ctx->mdev, + action_tbl->rtc_0_id); +} + +static int +hws_action_ste_table_create_stc(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_cmd_stc_modify_attr stc_attr = { 0 }; + + stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; + stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; + stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; + stc_attr.ste_table.ste_pool = action_tbl->pool; + stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; + + return mlx5hws_action_alloc_single_stc(ctx, &stc_attr, + MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); +} + +static struct mlx5hws_action_ste_table * +hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem) +{ + enum mlx5hws_pool_optimize opt = parent_elem->opt; + struct mlx5hws_context *ctx = parent_elem->ctx; + struct mlx5hws_action_ste_table *action_tbl; + size_t log_sz; + int err; + + log_sz = min(parent_elem->log_sz ? 
+ parent_elem->log_sz + + MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ : + MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ, + MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ); + + action_tbl = kzalloc(sizeof(*action_tbl), GFP_KERNEL); + if (!action_tbl) + return ERR_PTR(-ENOMEM); + + err = hws_action_ste_table_create_pool(ctx, action_tbl, opt, log_sz); + if (err) + goto free_tbl; + + err = hws_action_ste_table_create_rtcs(ctx, action_tbl, opt, log_sz); + if (err) + goto destroy_pool; + + err = hws_action_ste_table_create_stc(ctx, action_tbl); + if (err) + goto destroy_rtcs; + + action_tbl->parent_elem = parent_elem; + INIT_LIST_HEAD(&action_tbl->list_node); + list_add(&action_tbl->list_node, &parent_elem->available); + parent_elem->log_sz = log_sz; + + mlx5hws_dbg(ctx, + "Allocated %s action STE table log_sz %zu; STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(opt), log_sz, + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + return action_tbl; + +destroy_rtcs: + hws_action_ste_table_destroy_rtcs(action_tbl); +destroy_pool: + mlx5hws_pool_destroy(action_tbl->pool); +free_tbl: + kfree(action_tbl); + + return ERR_PTR(err); +} + +static void +hws_action_ste_table_destroy(struct mlx5hws_action_ste_table *action_tbl) +{ + struct mlx5hws_context *ctx = action_tbl->parent_elem->ctx; + + mlx5hws_dbg(ctx, + "Destroying %s action STE table: STEs (%d, %d); RTCs (%d, %d); STC %d\n", + hws_pool_opt_to_str(action_tbl->parent_elem->opt), + mlx5hws_pool_get_base_id(action_tbl->pool), + mlx5hws_pool_get_base_mirror_id(action_tbl->pool), + action_tbl->rtc_0_id, action_tbl->rtc_1_id, + action_tbl->stc.offset); + + mlx5hws_action_free_single_stc(ctx, MLX5HWS_TABLE_TYPE_FDB, + &action_tbl->stc); + hws_action_ste_table_destroy_rtcs(action_tbl); + mlx5hws_pool_destroy(action_tbl->pool); + + list_del(&action_tbl->list_node); + kfree(action_tbl); +} + +static int +hws_action_ste_pool_element_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool_element *elem, + enum mlx5hws_pool_optimize opt) +{ + elem->ctx = ctx; + elem->opt = opt; + INIT_LIST_HEAD(&elem->available); + INIT_LIST_HEAD(&elem->full); + + return 0; +} + +static void hws_action_ste_pool_element_destroy( + struct mlx5hws_action_ste_pool_element *elem) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + /* This should be empty, but attempt to free its elements anyway. */ + list_for_each_entry_safe(action_tbl, p, &elem->full, list_node) + hws_action_ste_table_destroy(action_tbl); + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static int hws_action_ste_pool_init(struct mlx5hws_context *ctx, + struct mlx5hws_action_ste_pool *pool) +{ + enum mlx5hws_pool_optimize opt; + int err; + + /* Rules which are added for both RX and TX must use the same action STE + * indices for both. If we were to use a single table, then RX-only and + * TX-only rules would waste the unused entries. Thus, we use separate + * table sets for the three cases. 
+ */ + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + err = hws_action_ste_pool_element_init(ctx, &pool->elems[opt], + opt); + if (err) + goto destroy_elems; + } + + return 0; + +destroy_elems: + while (opt-- > MLX5HWS_POOL_OPTIMIZE_NONE) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); + + return err; +} + +static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool) +{ + int opt; + + for (opt = MLX5HWS_POOL_OPTIMIZE_MAX - 1; + opt >= MLX5HWS_POOL_OPTIMIZE_NONE; opt--) + hws_action_ste_pool_element_destroy(&pool->elems[opt]); +} + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx) +{ + struct mlx5hws_action_ste_pool *pool; + size_t queues = ctx->queues; + int i, err; + + pool = kcalloc(queues, sizeof(*pool), GFP_KERNEL); + if (!pool) + return -ENOMEM; + + for (i = 0; i < queues; i++) { + err = hws_action_ste_pool_init(ctx, &pool[i]); + if (err) + goto free_pool; + } + + ctx->action_ste_pool = pool; + + return 0; + +free_pool: + while (i--) + hws_action_ste_pool_destroy(&pool[i]); + kfree(pool); + + return err; +} + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx) +{ + size_t queues = ctx->queues; + int i; + + for (i = 0; i < queues; i++) + hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]); + + kfree(ctx->action_ste_pool); +} + +static struct mlx5hws_action_ste_pool_element * +hws_action_ste_choose_elem(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx) +{ + if (skip_rx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_MIRROR]; + + if (skip_tx) + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_ORIG]; + + return &pool->elems[MLX5HWS_POOL_OPTIMIZE_NONE]; +} + +static int +hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl, + struct mlx5hws_action_ste_chunk *chunk) +{ + int err; + + err = mlx5hws_pool_chunk_alloc(action_tbl->pool, &chunk->ste); + if (err) + return err; + + chunk->action_tbl = action_tbl; + + return 0; +} + +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk) +{ + struct mlx5hws_action_ste_pool_element *elem; + struct mlx5hws_action_ste_table *action_tbl; + bool found; + int err; + + if (skip_rx && skip_tx) + return -EINVAL; + + elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx); + + mlx5hws_dbg(elem->ctx, + "Allocating action STEs skip_rx %d skip_tx %d order %d\n", + skip_rx, skip_tx, chunk->ste.order); + + found = false; + list_for_each_entry(action_tbl, &elem->available, list_node) { + if (!hws_action_ste_table_chunk_alloc(action_tbl, chunk)) { + found = true; + break; + } + } + + if (!found) { + action_tbl = hws_action_ste_table_alloc(elem); + if (IS_ERR(action_tbl)) + return PTR_ERR(action_tbl); + + err = hws_action_ste_table_chunk_alloc(action_tbl, chunk); + if (err) + return err; + } + + if (mlx5hws_pool_empty(action_tbl->pool)) + list_move(&action_tbl->list_node, &elem->full); + + return 0; +} + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk) +{ + mlx5hws_dbg(chunk->action_tbl->pool->ctx, + "Freeing action STEs offset %d order %d\n", + chunk->ste.offset, chunk->ste.order); + mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste); + list_move(&chunk->action_tbl->list_node, + &chunk->action_tbl->parent_elem->available); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h new file mode 100644 index 
0000000000000..2de660a632239 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025 NVIDIA Corporation & Affiliates */ + +#ifndef ACTION_STE_POOL_H_ +#define ACTION_STE_POOL_H_ + +#define MLX5HWS_ACTION_STE_TABLE_INIT_LOG_SZ 10 +#define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1 +#define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20 + +struct mlx5hws_action_ste_pool_element; + +struct mlx5hws_action_ste_table { + struct mlx5hws_action_ste_pool_element *parent_elem; + /* Wraps the RTC and STE range for this given action. */ + struct mlx5hws_pool *pool; + /* Match STEs use this STC to jump to this pool's RTC. */ + struct mlx5hws_pool_chunk stc; + u32 rtc_0_id; + u32 rtc_1_id; + struct list_head list_node; +}; + +struct mlx5hws_action_ste_pool_element { + struct mlx5hws_context *ctx; + size_t log_sz; /* Size of the largest table so far. */ + enum mlx5hws_pool_optimize opt; + struct list_head available; + struct list_head full; +}; + +/* Central repository of action STEs. The context contains one of these pools + * per queue. + */ +struct mlx5hws_action_ste_pool { + struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX]; +}; + +/* A chunk of STEs and the table it was allocated from. Used by rules. */ +struct mlx5hws_action_ste_chunk { + struct mlx5hws_action_ste_table *action_tbl; + struct mlx5hws_pool_chunk ste; +}; + +int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx); + +void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx); + +/* Callers are expected to fill chunk->ste.order. On success, this function + * populates chunk->tbl and chunk->ste.offset. + */ +int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, + bool skip_rx, bool skip_tx, + struct mlx5hws_action_ste_chunk *chunk); + +void mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk); + +#endif /* ACTION_STE_POOL_H_ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c index b7cb736b74d70..428dae8697062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.c @@ -158,10 +158,16 @@ static int hws_context_init_hws(struct mlx5hws_context *ctx, if (ret) goto pools_uninit; + ret = mlx5hws_action_ste_pool_init(ctx); + if (ret) + goto close_queues; + INIT_LIST_HEAD(&ctx->tbl_list); return 0; +close_queues: + mlx5hws_send_queues_close(ctx); pools_uninit: hws_context_pools_uninit(ctx); uninit_pd: @@ -174,6 +180,7 @@ static void hws_context_uninit_hws(struct mlx5hws_context *ctx) if (!(ctx->flags & MLX5HWS_CONTEXT_FLAG_HWS_SUPPORT)) return; + mlx5hws_action_ste_pool_uninit(ctx); mlx5hws_send_queues_close(ctx); hws_context_pools_uninit(ctx); hws_context_uninit_pd(ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index 38c3647444adc..e987e93bbc6e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -39,6 +39,7 @@ struct mlx5hws_context { struct mlx5hws_cmd_query_caps *caps; u32 pd_num; struct mlx5hws_pool *stc_pool; + struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */ struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct 
mlx5hws_definer_cache *definer_cache; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h index 30ccd635b5054..21279d5031174 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/internal.h @@ -17,6 +17,7 @@ #include "context.h" #include "table.h" #include "send.h" +#include "action_ste_pool.h" #include "rule.h" #include "cmd.h" #include "action.h" diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h index c82760d53e1a0..33e33d5f1fb3d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/pool.h @@ -33,6 +33,7 @@ enum mlx5hws_pool_optimize { MLX5HWS_POOL_OPTIMIZE_NONE = 0x0, MLX5HWS_POOL_OPTIMIZE_ORIG = 0x1, MLX5HWS_POOL_OPTIMIZE_MIRROR = 0x2, + MLX5HWS_POOL_OPTIMIZE_MAX = 0x3, }; struct mlx5hws_pool_attr { From 593a9470a8565a59a07b577d6bcb3c199f232d4a Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:39 +0300 Subject: [PATCH 143/770] net/mlx5: HWS, Use the new action STE pool Use the central action STE pool when creating / updating rules. Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-10-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/rule.c | 69 ++++++++----------- .../mellanox/mlx5/core/steering/hws/rule.h | 12 +--- 2 files changed, 30 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index a27a2d5ffc7b1..5b758467ed03d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -195,44 +195,30 @@ hws_rule_load_delete_info(struct mlx5hws_rule *rule, } } -static int hws_rule_alloc_action_ste(struct mlx5hws_rule *rule) +static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule, + u16 queue_id, bool skip_rx, + bool skip_tx) { struct mlx5hws_matcher *matcher = rule->matcher; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_pool_chunk ste = {0}; - int ret; - - action_ste = &matcher->action_ste; - ste.order = ilog2(roundup_pow_of_two(action_ste->max_stes)); - ret = mlx5hws_pool_chunk_alloc(action_ste->pool, &ste); - if (unlikely(ret)) { - mlx5hws_err(matcher->tbl->ctx, - "Failed to allocate STE for rule actions"); - return ret; - } - - rule->action_ste.pool = matcher->action_ste.pool; - rule->action_ste.num_stes = matcher->action_ste.max_stes; - rule->action_ste.index = ste.offset; + struct mlx5hws_context *ctx = matcher->tbl->ctx; - return 0; + rule->action_ste.ste.order = + ilog2(roundup_pow_of_two(matcher->action_ste.max_stes)); + return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id], + skip_rx, skip_tx, + &rule->action_ste); } -void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste) +void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste) { - struct mlx5hws_pool_chunk ste = {0}; - - if (!action_ste->num_stes) + if (!action_ste->action_tbl) return; - ste.order = ilog2(roundup_pow_of_two(action_ste->num_stes)); - ste.offset = action_ste->index; - /* This release is safe 
only when the rule match STE was deleted * (when the rule is being deleted) or replaced with the new STE that * isn't pointing to old action STEs (when the rule is being updated). */ - mlx5hws_pool_chunk_free(action_ste->pool, &ste); + mlx5hws_action_ste_chunk_free(action_ste); } static void hws_rule_create_init(struct mlx5hws_rule *rule, @@ -250,22 +236,15 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, rule->rtc_0 = 0; rule->rtc_1 = 0; - rule->action_ste.pool = NULL; - rule->action_ste.num_stes = 0; - rule->action_ste.index = -1; - rule->status = MLX5HWS_RULE_STATUS_CREATING; } else { rule->status = MLX5HWS_RULE_STATUS_UPDATING; + /* Save the old action STE info so we can free it after writing + * new action STEs and a corresponding match STE. + */ + rule->old_action_ste = rule->action_ste; } - /* Initialize the old action STE info - shallow-copy action_ste. - * In create flow this will set old_action_ste fields to initial values. - * In update flow this will save the existing action STE info, - * so that we will later use it to free old STEs. - */ - rule->old_action_ste = rule->action_ste; - rule->pending_wqes = 0; /* Init default send STE attributes */ @@ -277,7 +256,6 @@ static void hws_rule_create_init(struct mlx5hws_rule *rule, /* Init default action apply */ apply->tbl_type = tbl->type; apply->common_res = &ctx->common_res; - apply->jump_to_action_stc = matcher->action_ste.stc.offset; apply->require_dep = 0; } @@ -353,17 +331,24 @@ static int hws_rule_create_hws(struct mlx5hws_rule *rule, if (action_stes) { /* Allocate action STEs for rules that need more than match STE */ - ret = hws_rule_alloc_action_ste(rule); + ret = mlx5hws_rule_alloc_action_ste(rule, attr->queue_id, + !!ste_attr.rtc_0, + !!ste_attr.rtc_1); if (ret) { mlx5hws_err(ctx, "Failed to allocate action memory %d", ret); mlx5hws_send_abort_new_dep_wqe(queue); return ret; } + apply.jump_to_action_stc = + rule->action_ste.action_tbl->stc.offset; /* Skip RX/TX based on the dep_wqe init */ - ste_attr.rtc_0 = dep_wqe->rtc_0 ? matcher->action_ste.rtc_0_id : 0; - ste_attr.rtc_1 = dep_wqe->rtc_1 ? matcher->action_ste.rtc_1_id : 0; + ste_attr.rtc_0 = dep_wqe->rtc_0 ? + rule->action_ste.action_tbl->rtc_0_id : 0; + ste_attr.rtc_1 = dep_wqe->rtc_1 ? 
+			rule->action_ste.action_tbl->rtc_1_id : 0;
 		/* Action STEs are written to a specific index last to first */
-		ste_attr.direct_index = rule->action_ste.index + action_stes;
+		ste_attr.direct_index =
+			rule->action_ste.ste.offset + action_stes;
 		apply.next_direct_idx = ste_attr.direct_index;
 	} else {
 		apply.next_direct_idx = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
index b5ee94ac449b1..1c47a9c11572a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.h
@@ -43,12 +43,6 @@ struct mlx5hws_rule_match_tag {
 	};
 };
 
-struct mlx5hws_rule_action_ste_info {
-	struct mlx5hws_pool *pool;
-	int index; /* STE array index */
-	u8 num_stes;
-};
-
 struct mlx5hws_rule_resize_info {
 	u32 rtc_0;
 	u32 rtc_1;
@@ -64,8 +58,8 @@ struct mlx5hws_rule {
 		struct mlx5hws_rule_match_tag tag;
 		struct mlx5hws_rule_resize_info *resize_info;
 	};
-	struct mlx5hws_rule_action_ste_info action_ste;
-	struct mlx5hws_rule_action_ste_info old_action_ste;
+	struct mlx5hws_action_ste_chunk action_ste;
+	struct mlx5hws_action_ste_chunk old_action_ste;
 	u32 rtc_0; /* The RTC into which the STE was inserted */
 	u32 rtc_1; /* The RTC into which the STE was inserted */
 	u8 status; /* enum mlx5hws_rule_status */
@@ -75,7 +69,7 @@ struct mlx5hws_rule {
 	 */
 };
 
-void mlx5hws_rule_free_action_ste(struct mlx5hws_rule_action_ste_info *action_ste);
+void mlx5hws_rule_free_action_ste(struct mlx5hws_action_ste_chunk *action_ste);
 
 int mlx5hws_rule_move_hws_remove(struct mlx5hws_rule *rule,
 				 void *queue, void *user_data);

From 22174f16f1218fc98e374b3653decae54aa481f8 Mon Sep 17 00:00:00 2001
From: Vlad Dogaru
Date: Thu, 10 Apr 2025 22:17:40 +0300
Subject: [PATCH 144/770] net/mlx5: HWS, Cleanup matcher action STE table

Remove the matcher action STE implementation now that the code uses
per-queue action STE pools. This also allows simplifying matcher code
because it is now only handling a single type of RTC/STE.

The matcher resize data is also going away. Matchers were saving old
action STE data because the rules still used it, but now that data lives
in the action STE pool and is no longer coupled to a matcher.

Furthermore, matchers no longer need to rehash due to action template
addition. If a new action template needs more action STEs, we simply
update the matcher's num_of_action_stes and future rules will allocate
the correct number. Existing rules are unaffected by such an operation
and can continue to use their existing action STEs.

The range action was using the matcher action STE implementation, but
there was no reason to do this other than the container fitting the
purpose. Extract that information to a separate structure.

Finally, stop dumping per-matcher information about action RTCs, because
they no longer exist. A later patch in this series will add support for
dumping action STE pools.
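A sketch of the resulting bookkeeping, condensed from
mlx5hws_matcher_attach_at() and mlx5hws_rule_alloc_action_ste() as they look
after this patch (names and surrounding checks simplified):

	static void attach_at_sketch(struct mlx5hws_matcher *matcher,
				     u8 required_stes)
	{
		/* Attaching a hungrier action template only bumps a counter;
		 * no rehash, existing rules keep their current action STEs.
		 */
		if (matcher->num_of_action_stes < required_stes)
			matcher->num_of_action_stes = required_stes;
	}

	static int rule_chunk_order_sketch(struct mlx5hws_matcher *matcher)
	{
		/* Future rules size their per-rule chunk from the counter,
		 * e.g. num_of_action_stes = 5 -> rounded up to 8 -> order 3.
		 */
		return ilog2(roundup_pow_of_two(matcher->num_of_action_stes));
	}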
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-11-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/action.c | 23 +- .../mellanox/mlx5/core/steering/hws/action.h | 8 +- .../mellanox/mlx5/core/steering/hws/bwc.c | 77 +--- .../mellanox/mlx5/core/steering/hws/debug.c | 17 +- .../mellanox/mlx5/core/steering/hws/matcher.c | 336 ++++-------------- .../mellanox/mlx5/core/steering/hws/matcher.h | 20 +- .../mellanox/mlx5/core/steering/hws/mlx5hws.h | 5 +- .../mellanox/mlx5/core/steering/hws/rule.c | 2 +- 8 files changed, 87 insertions(+), 401 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c index 161ad720b3396..bef4d25c1a2a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.c @@ -1574,13 +1574,13 @@ hws_action_create_dest_match_range_definer(struct mlx5hws_context *ctx) return definer; } -static struct mlx5hws_matcher_action_ste * +static struct mlx5hws_range_action_table * hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, struct mlx5hws_definer *definer, u32 miss_ft_id) { struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_pool_attr pool_attr = {0}; struct mlx5hws_pool *ste_pool, *stc_pool; u32 *rtc_0_id, *rtc_1_id; @@ -1669,9 +1669,9 @@ hws_action_create_dest_match_range_table(struct mlx5hws_context *ctx, return NULL; } -static void -hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste) +static void hws_action_destroy_dest_match_range_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste) { mutex_lock(&ctx->ctrl_lock); @@ -1683,12 +1683,11 @@ hws_action_destroy_dest_match_range_table(struct mlx5hws_context *ctx, mutex_unlock(&ctx->ctrl_lock); } -static int -hws_action_create_dest_match_range_fill_table(struct mlx5hws_context *ctx, - struct mlx5hws_matcher_action_ste *table_ste, - struct mlx5hws_action *hit_ft_action, - struct mlx5hws_definer *range_definer, - u32 min, u32 max) +static int hws_action_create_dest_match_range_fill_table( + struct mlx5hws_context *ctx, + struct mlx5hws_range_action_table *table_ste, + struct mlx5hws_action *hit_ft_action, + struct mlx5hws_definer *range_definer, u32 min, u32 max) { struct mlx5hws_wqe_gta_data_seg_ste match_wqe_data = {0}; struct mlx5hws_wqe_gta_data_seg_ste range_wqe_data = {0}; @@ -1784,7 +1783,7 @@ mlx5hws_action_create_dest_match_range(struct mlx5hws_context *ctx, u32 min, u32 max, u32 flags) { struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; struct mlx5hws_action *action; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h index 64b76075f7f8f..25fa0d4c92216 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action.h @@ -118,6 +118,12 @@ struct mlx5hws_action_template { u8 only_term; }; +struct 
mlx5hws_range_action_table { + struct mlx5hws_pool *pool; + u32 rtc_0_id; + u32 rtc_1_id; +}; + struct mlx5hws_action { u8 type; u8 flags; @@ -186,7 +192,7 @@ struct mlx5hws_action { size_t size; } remove_header; struct { - struct mlx5hws_matcher_action_ste *table_ste; + struct mlx5hws_range_action_table *table_ste; struct mlx5hws_action *hit_ft_action; struct mlx5hws_definer *definer; } range; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 32de8bfc7644f..510bfbbe59918 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -478,21 +478,9 @@ hws_bwc_matcher_size_maxed_out(struct mlx5hws_bwc_matcher *bwc_matcher) struct mlx5hws_cmd_query_caps *caps = bwc_matcher->matcher->tbl->ctx->caps; /* check the match RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > - (caps->ste_alloc_log_max - 1)) - return true; - - /* check the action RTC size */ - if ((bwc_matcher->size_log + - MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP + - ilog2(roundup_pow_of_two(bwc_matcher->matcher->action_ste.max_stes)) + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT) > - (caps->ste_alloc_log_max - 1)) - return true; - - return false; + return (bwc_matcher->size_log + MLX5HWS_MATCHER_ASSURED_MAIN_TBL_DEPTH + + MLX5HWS_BWC_MATCHER_SIZE_LOG_STEP) > + (caps->ste_alloc_log_max - 1); } static bool @@ -779,19 +767,6 @@ hws_bwc_matcher_rehash_size(struct mlx5hws_bwc_matcher *bwc_matcher) return hws_bwc_matcher_move(bwc_matcher); } -static int -hws_bwc_matcher_rehash_at(struct mlx5hws_bwc_matcher *bwc_matcher) -{ - /* Rehash by action template doesn't require any additional checking. - * The bwc_matcher already contains the new action template. - * Just do the usual rehash: - * - create new matcher - * - move all the rules to the new matcher - * - destroy the old matcher - */ - return hws_bwc_matcher_move(bwc_matcher); -} - int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, u32 *match_param, struct mlx5hws_rule_action rule_actions[], @@ -803,7 +778,6 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_rule_attr rule_attr; struct mutex *queue_lock; /* Protect the queue */ u32 num_of_rules; - bool need_rehash; int ret = 0; int at_idx; @@ -830,30 +804,11 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, at_idx = bwc_matcher->num_of_at - 1; ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx], - &need_rehash); + bwc_matcher->at[at_idx]); if (unlikely(ret)) { hws_bwc_unlock_all_queues(ctx); return ret; } - if (unlikely(need_rehash)) { - /* The new action template requires more action STEs. - * Need to attempt creating new matcher with all - * the action templates, including the new one. 
- */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule insertion: rehash AT failed (%d)\n", ret); - return ret; - } - } hws_bwc_unlock_all_queues(ctx); mutex_lock(queue_lock); @@ -973,7 +928,6 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_rule_attr rule_attr; struct mutex *queue_lock; /* Protect the queue */ - bool need_rehash; int at_idx, ret; u16 idx; @@ -1005,32 +959,11 @@ hws_bwc_rule_action_update(struct mlx5hws_bwc_rule *bwc_rule, at_idx = bwc_matcher->num_of_at - 1; ret = mlx5hws_matcher_attach_at(bwc_matcher->matcher, - bwc_matcher->at[at_idx], - &need_rehash); + bwc_matcher->at[at_idx]); if (unlikely(ret)) { hws_bwc_unlock_all_queues(ctx); return ret; } - if (unlikely(need_rehash)) { - /* The new action template requires more action - * STEs. Need to attempt creating new matcher - * with all the action templates, including the - * new one. - */ - ret = hws_bwc_matcher_rehash_at(bwc_matcher); - if (unlikely(ret)) { - mlx5hws_action_template_destroy(bwc_matcher->at[at_idx]); - bwc_matcher->at[at_idx] = NULL; - bwc_matcher->num_of_at--; - - hws_bwc_unlock_all_queues(ctx); - - mlx5hws_err(ctx, - "BWC rule update: rehash AT failed (%d)\n", - ret); - return ret; - } - } } hws_bwc_unlock_all_queues(ctx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 3491408c5d84f..38f75dec9cfc4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -146,18 +146,6 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma matcher->match_ste.rtc_1_id, (int)ste_1_id); - ste_pool = matcher->action_ste.pool; - if (ste_pool) { - ste_0_id = mlx5hws_pool_get_base_id(ste_pool); - if (tbl_type == MLX5HWS_TABLE_TYPE_FDB) - ste_1_id = mlx5hws_pool_get_base_mirror_id(ste_pool); - else - ste_1_id = -1; - } else { - ste_0_id = -1; - ste_1_id = -1; - } - ft_attr.type = matcher->tbl->fw_ft_type; ret = mlx5hws_cmd_flow_table_query(matcher->tbl->ctx->mdev, matcher->end_ft_id, @@ -167,10 +155,7 @@ static int hws_debug_dump_matcher(struct seq_file *f, struct mlx5hws_matcher *ma if (ret) return ret; - seq_printf(f, ",%d,%d,%d,%d,%d,0x%llx,0x%llx\n", - matcher->action_ste.rtc_0_id, (int)ste_0_id, - matcher->action_ste.rtc_1_id, (int)ste_1_id, - 0, + seq_printf(f, ",-1,-1,-1,-1,0,0x%llx,0x%llx\n", mlx5hws_debug_icm_to_idx(icm_addr_0), mlx5hws_debug_icm_to_idx(icm_addr_1)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 3028e0387e3f0..716502732d3da 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -3,25 +3,6 @@ #include "internal.h" -enum mlx5hws_matcher_rtc_type { - HWS_MATCHER_RTC_TYPE_MATCH, - HWS_MATCHER_RTC_TYPE_STE_ARRAY, - HWS_MATCHER_RTC_TYPE_MAX, -}; - -static const char * const mlx5hws_matcher_rtc_type_str[] = { - [HWS_MATCHER_RTC_TYPE_MATCH] = "MATCH", - [HWS_MATCHER_RTC_TYPE_STE_ARRAY] = "STE_ARRAY", - [HWS_MATCHER_RTC_TYPE_MAX] = "UNKNOWN", -}; - -static const char *hws_matcher_rtc_type_to_str(enum 
mlx5hws_matcher_rtc_type rtc_type) -{ - if (rtc_type > HWS_MATCHER_RTC_TYPE_MAX) - rtc_type = HWS_MATCHER_RTC_TYPE_MAX; - return mlx5hws_matcher_rtc_type_str[rtc_type]; -} - static bool hws_matcher_requires_col_tbl(u8 log_num_of_rules) { /* Collision table concatenation is done only for large rule tables */ @@ -209,83 +190,52 @@ static void hws_matcher_set_rtc_attr_sz(struct mlx5hws_matcher *matcher, } } -static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher) { struct mlx5hws_matcher_attr *attr = &matcher->attr; struct mlx5hws_cmd_rtc_create_attr rtc_attr = {0}; struct mlx5hws_match_template *mt = matcher->mt; struct mlx5hws_context *ctx = matcher->tbl->ctx; - struct mlx5hws_matcher_action_ste *action_ste; struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool *ste_pool; - u32 *rtc_0_id, *rtc_1_id; u32 obj_id; int ret; - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = &matcher->match_ste.rtc_0_id; - rtc_1_id = &matcher->match_ste.rtc_1_id; - ste_pool = matcher->match_ste.pool; - - rtc_attr.log_size = attr->table.sz_row_log; - rtc_attr.log_depth = attr->table.sz_col_log; - rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); - rtc_attr.is_scnd_range = 0; - rtc_attr.miss_ft_id = matcher->end_ft_id; - - if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { - /* The usual Hash Table */ - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; - - /* The first mt is used since all share the same definer */ - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - rtc_attr.num_hash_definer = 1; - - if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { - /* Hash Split Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; - rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); - } else if (attr->distribute_mode == MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { - /* Linear Lookup Table */ - rtc_attr.access_index_mode = MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; - rtc_attr.match_definer_0 = ctx->caps->linear_match_definer; - } + rtc_attr.log_size = attr->table.sz_row_log; + rtc_attr.log_depth = attr->table.sz_col_log; + rtc_attr.is_frst_jumbo = mlx5hws_matcher_mt_is_jumbo(mt); + rtc_attr.is_scnd_range = 0; + rtc_attr.miss_ft_id = matcher->end_ft_id; + + if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_HASH) { + /* The usual Hash Table */ + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_HASH; + + /* The first mt is used since all share the same definer */ + rtc_attr.match_definer_0 = mlx5hws_definer_get_id(mt->definer); + } else if (attr->insert_mode == MLX5HWS_MATCHER_INSERT_BY_INDEX) { + rtc_attr.update_index_mode = + MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; + rtc_attr.num_hash_definer = 1; + + if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_HASH) { + /* Hash Split Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_BY_HASH; + rtc_attr.match_definer_0 = + mlx5hws_definer_get_id(mt->definer); + } else if (attr->distribute_mode == + MLX5HWS_MATCHER_DISTRIBUTE_BY_LINEAR) { + /* Linear Lookup Table */ + rtc_attr.access_index_mode = + MLX5_IFC_RTC_STE_ACCESS_MODE_LINEAR; + rtc_attr.match_definer_0 = + ctx->caps->linear_match_definer; } - break; - - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - action_ste = 
&matcher->action_ste; - - rtc_0_id = &action_ste->rtc_0_id; - rtc_1_id = &action_ste->rtc_1_id; - ste_pool = action_ste->pool; - /* Action RTC size calculation: - * log((max number of rules in matcher) * - * (max number of action STEs per rule) * - * (2 to support writing new STEs for update rule)) - */ - rtc_attr.log_size = - ilog2(roundup_pow_of_two(action_ste->max_stes)) + - attr->table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - rtc_attr.log_depth = 0; - rtc_attr.update_index_mode = MLX5_IFC_RTC_STE_UPDATE_MODE_BY_OFFSET; - /* The action STEs use the default always hit definer */ - rtc_attr.match_definer_0 = ctx->caps->trivial_match_definer; - rtc_attr.is_frst_jumbo = false; - rtc_attr.miss_ft_id = 0; - break; - - default: - mlx5hws_err(ctx, "HWS Invalid RTC type\n"); - return -EINVAL; } - obj_id = mlx5hws_pool_get_base_id(ste_pool); + obj_id = mlx5hws_pool_get_base_id(matcher->match_ste.pool); rtc_attr.pd = ctx->pd_num; rtc_attr.ste_base = obj_id; @@ -297,15 +247,16 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, obj_id = mlx5hws_pool_get_base_id(ctx->stc_pool); rtc_attr.stc_base = obj_id; - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_0_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_0_id); if (ret) { - mlx5hws_err(ctx, "Failed to create matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); + mlx5hws_err(ctx, "Failed to create matcher RTC\n"); return ret; } if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) { - obj_id = mlx5hws_pool_get_base_mirror_id(ste_pool); + obj_id = mlx5hws_pool_get_base_mirror_id( + matcher->match_ste.pool); rtc_attr.ste_base = obj_id; rtc_attr.table_type = mlx5hws_table_get_res_fw_ft_type(tbl->type, true); @@ -313,10 +264,10 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, rtc_attr.stc_base = obj_id; hws_matcher_set_rtc_attr_sz(matcher, &rtc_attr, true); - ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, rtc_1_id); + ret = mlx5hws_cmd_rtc_create(ctx->mdev, &rtc_attr, + &matcher->match_ste.rtc_1_id); if (ret) { - mlx5hws_err(ctx, "Failed to create peer matcher RTC of type %s", - hws_matcher_rtc_type_to_str(rtc_type)); + mlx5hws_err(ctx, "Failed to create mirror matcher RTC\n"); goto destroy_rtc_0; } } @@ -324,33 +275,18 @@ static int hws_matcher_create_rtc(struct mlx5hws_matcher *matcher, return 0; destroy_rtc_0: - mlx5hws_cmd_rtc_destroy(ctx->mdev, *rtc_0_id); + mlx5hws_cmd_rtc_destroy(ctx->mdev, matcher->match_ste.rtc_0_id); return ret; } -static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher, - enum mlx5hws_matcher_rtc_type rtc_type) +static void hws_matcher_destroy_rtc(struct mlx5hws_matcher *matcher) { - struct mlx5hws_table *tbl = matcher->tbl; - u32 rtc_0_id, rtc_1_id; - - switch (rtc_type) { - case HWS_MATCHER_RTC_TYPE_MATCH: - rtc_0_id = matcher->match_ste.rtc_0_id; - rtc_1_id = matcher->match_ste.rtc_1_id; - break; - case HWS_MATCHER_RTC_TYPE_STE_ARRAY: - rtc_0_id = matcher->action_ste.rtc_0_id; - rtc_1_id = matcher->action_ste.rtc_1_id; - break; - default: - return; - } + struct mlx5_core_dev *mdev = matcher->tbl->ctx->mdev; - if (tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_1_id); + if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_1_id); - mlx5hws_cmd_rtc_destroy(tbl->ctx->mdev, rtc_0_id); + mlx5hws_cmd_rtc_destroy(mdev, matcher->match_ste.rtc_0_id); } static int @@ -418,85 +354,17 @@ static int hws_matcher_check_and_process_at(struct 
mlx5hws_matcher *matcher, return 0; } -static int hws_matcher_resize_init(struct mlx5hws_matcher *src_matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - resize_data = kzalloc(sizeof(*resize_data), GFP_KERNEL); - if (!resize_data) - return -ENOMEM; - - resize_data->max_stes = src_matcher->action_ste.max_stes; - - resize_data->stc = src_matcher->action_ste.stc; - resize_data->rtc_0_id = src_matcher->action_ste.rtc_0_id; - resize_data->rtc_1_id = src_matcher->action_ste.rtc_1_id; - resize_data->pool = src_matcher->action_ste.max_stes ? - src_matcher->action_ste.pool : NULL; - - /* Place the new resized matcher on the dst matcher's list */ - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - - /* Move all the previous resized matchers to the dst matcher's list */ - while (!list_empty(&src_matcher->resize_data)) { - resize_data = list_first_entry(&src_matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - list_add(&resize_data->list_node, &src_matcher->resize_dst->resize_data); - } - - return 0; -} - -static void hws_matcher_resize_uninit(struct mlx5hws_matcher *matcher) -{ - struct mlx5hws_matcher_resize_data *resize_data; - - if (!mlx5hws_matcher_is_resizable(matcher)) - return; - - while (!list_empty(&matcher->resize_data)) { - resize_data = list_first_entry(&matcher->resize_data, - struct mlx5hws_matcher_resize_data, - list_node); - list_del_init(&resize_data->list_node); - - if (resize_data->max_stes) { - mlx5hws_action_free_single_stc(matcher->tbl->ctx, - matcher->tbl->type, - &resize_data->stc); - - if (matcher->tbl->type == MLX5HWS_TABLE_TYPE_FDB) - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_1_id); - - mlx5hws_cmd_rtc_destroy(matcher->tbl->ctx->mdev, - resize_data->rtc_0_id); - - if (resize_data->pool) - mlx5hws_pool_destroy(resize_data->pool); - } - - kfree(resize_data); - } -} - static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct mlx5hws_cmd_stc_modify_attr stc_attr = {0}; - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - struct mlx5hws_pool_attr pool_attr = {0}; - struct mlx5hws_context *ctx = tbl->ctx; - u32 required_stes; - u8 max_stes = 0; + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 required_stes, max_stes; int i, ret; if (matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION) return 0; + max_stes = 0; for (i = 0; i < matcher->num_of_at; i++) { struct mlx5hws_action_template *at = &matcher->at[i]; @@ -512,74 +380,9 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) /* Future: Optimize reparse */ } - /* There are no additional STEs required for matcher */ - if (!max_stes) - return 0; - - matcher->action_ste.max_stes = max_stes; - - action_ste = &matcher->action_ste; - - /* Allocate action STE mempool */ - pool_attr.table_type = tbl->type; - pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; - pool_attr.flags = MLX5HWS_POOL_FLAG_BUDDY; - /* Pool size is similar to action RTC size */ - pool_attr.alloc_log_sz = ilog2(roundup_pow_of_two(action_ste->max_stes)) + - matcher->attr.table.sz_row_log + - MLX5HWS_MATCHER_ACTION_RTC_UPDATE_MULT; - hws_matcher_set_pool_attr(&pool_attr, matcher); - action_ste->pool = mlx5hws_pool_create(ctx, &pool_attr); - if (!action_ste->pool) { - mlx5hws_err(ctx, "Failed to create action ste pool\n"); - return -EINVAL; - } - - /* Allocate action RTC */ - ret = hws_matcher_create_rtc(matcher, 
HWS_MATCHER_RTC_TYPE_STE_ARRAY); - if (ret) { - mlx5hws_err(ctx, "Failed to create action RTC\n"); - goto free_ste_pool; - } - - /* Allocate STC for jumps to STE */ - stc_attr.action_offset = MLX5HWS_ACTION_OFFSET_HIT; - stc_attr.action_type = MLX5_IFC_STC_ACTION_TYPE_JUMP_TO_STE_TABLE; - stc_attr.reparse_mode = MLX5_IFC_STC_REPARSE_IGNORE; - stc_attr.ste_table.ste_pool = action_ste->pool; - stc_attr.ste_table.match_definer_id = ctx->caps->trivial_match_definer; - - ret = mlx5hws_action_alloc_single_stc(ctx, &stc_attr, tbl->type, - &action_ste->stc); - if (ret) { - mlx5hws_err(ctx, "Failed to create action jump to table STC\n"); - goto free_rtc; - } + matcher->num_of_action_stes = max_stes; return 0; - -free_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); -free_ste_pool: - mlx5hws_pool_destroy(action_ste->pool); - return ret; -} - -static void hws_matcher_unbind_at(struct mlx5hws_matcher *matcher) -{ - struct mlx5hws_matcher_action_ste *action_ste; - struct mlx5hws_table *tbl = matcher->tbl; - - action_ste = &matcher->action_ste; - - if (!action_ste->max_stes || - matcher->flags & MLX5HWS_MATCHER_FLAGS_COLLISION || - mlx5hws_matcher_is_in_resize(matcher)) - return; - - mlx5hws_action_free_single_stc(tbl->ctx, tbl->type, &action_ste->stc); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_STE_ARRAY); - mlx5hws_pool_destroy(action_ste->pool); } static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) @@ -723,10 +526,10 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) /* Create matcher end flow table anchor */ ret = hws_matcher_create_end_ft(matcher); if (ret) - goto unbind_at; + goto unbind_mt; /* Allocate the RTC for the new matcher */ - ret = hws_matcher_create_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + ret = hws_matcher_create_rtc(matcher); if (ret) goto destroy_end_ft; @@ -738,11 +541,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) return 0; destroy_rtc: - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); destroy_end_ft: hws_matcher_destroy_end_ft(matcher); -unbind_at: - hws_matcher_unbind_at(matcher); unbind_mt: hws_matcher_unbind_mt(matcher); return ret; @@ -750,11 +551,9 @@ static int hws_matcher_create_and_connect(struct mlx5hws_matcher *matcher) static void hws_matcher_destroy_and_disconnect(struct mlx5hws_matcher *matcher) { - hws_matcher_resize_uninit(matcher); hws_matcher_disconnect(matcher); - hws_matcher_destroy_rtc(matcher, HWS_MATCHER_RTC_TYPE_MATCH); + hws_matcher_destroy_rtc(matcher); hws_matcher_destroy_end_ft(matcher); - hws_matcher_unbind_at(matcher); hws_matcher_unbind_mt(matcher); } @@ -776,8 +575,6 @@ hws_matcher_create_col_matcher(struct mlx5hws_matcher *matcher) if (!col_matcher) return -ENOMEM; - INIT_LIST_HEAD(&col_matcher->resize_data); - col_matcher->tbl = matcher->tbl; col_matcher->mt = matcher->mt; col_matcher->at = matcher->at; @@ -831,8 +628,6 @@ static int hws_matcher_init(struct mlx5hws_matcher *matcher) struct mlx5hws_context *ctx = matcher->tbl->ctx; int ret; - INIT_LIST_HEAD(&matcher->resize_data); - mutex_lock(&ctx->ctrl_lock); /* Allocate matcher resource and connect to the packet pipe */ @@ -889,16 +684,12 @@ static int hws_matcher_grow_at_array(struct mlx5hws_matcher *matcher) } int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, - struct mlx5hws_action_template *at, - bool *need_rehash) + struct mlx5hws_action_template *at) { bool is_jumbo = mlx5hws_matcher_mt_is_jumbo(matcher->mt); - struct 
mlx5hws_context *ctx = matcher->tbl->ctx; u32 required_stes; int ret; - *need_rehash = false; - if (unlikely(matcher->num_of_at >= matcher->size_of_at_array)) { ret = hws_matcher_grow_at_array(matcher); if (ret) @@ -916,11 +707,8 @@ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, return ret; required_stes = at->num_of_action_stes - (!is_jumbo || at->only_term); - if (matcher->action_ste.max_stes < required_stes) { - mlx5hws_dbg(ctx, "Required STEs [%d] exceeds initial action template STE [%d]\n", - required_stes, matcher->action_ste.max_stes); - *need_rehash = true; - } + if (matcher->num_of_action_stes < required_stes) + matcher->num_of_action_stes = required_stes; matcher->at[matcher->num_of_at] = *at; matcher->num_of_at += 1; @@ -1102,7 +890,7 @@ static int hws_matcher_resize_precheck(struct mlx5hws_matcher *src_matcher, return -EINVAL; } - if (src_matcher->action_ste.max_stes > dst_matcher->action_ste.max_stes) { + if (src_matcher->num_of_action_stes > dst_matcher->num_of_action_stes) { mlx5hws_err(ctx, "Src/dst matcher max STEs mismatch\n"); return -EINVAL; } @@ -1131,10 +919,6 @@ int mlx5hws_matcher_resize_set_target(struct mlx5hws_matcher *src_matcher, src_matcher->resize_dst = dst_matcher; - ret = hws_matcher_resize_init(src_matcher); - if (ret) - src_matcher->resize_dst = NULL; - out: mutex_unlock(&src_matcher->tbl->ctx->ctrl_lock); return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index 0450b6175ac9e..bad1fa8f77fd2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -50,23 +50,6 @@ struct mlx5hws_matcher_match_ste { struct mlx5hws_pool *pool; }; -struct mlx5hws_matcher_action_ste { - struct mlx5hws_pool_chunk stc; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; - u8 max_stes; -}; - -struct mlx5hws_matcher_resize_data { - struct mlx5hws_pool_chunk stc; - u32 rtc_0_id; - u32 rtc_1_id; - struct mlx5hws_pool *pool; - u8 max_stes; - struct list_head list_node; -}; - struct mlx5hws_matcher { struct mlx5hws_table *tbl; struct mlx5hws_matcher_attr attr; @@ -75,15 +58,14 @@ struct mlx5hws_matcher { u8 num_of_at; u8 size_of_at_array; u8 num_of_mt; + u8 num_of_action_stes; /* enum mlx5hws_matcher_flags */ u8 flags; u32 end_ft_id; struct mlx5hws_matcher *col_matcher; struct mlx5hws_matcher *resize_dst; struct mlx5hws_matcher_match_ste match_ste; - struct mlx5hws_matcher_action_ste action_ste; struct list_head list_node; - struct list_head resize_data; }; static inline bool diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 8ed8a715a2eb2..5121951f2778a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -399,14 +399,11 @@ int mlx5hws_matcher_destroy(struct mlx5hws_matcher *matcher); * * @matcher: Matcher to attach the action template to. * @at: Action template to be attached to the matcher. - * @need_rehash: Output parameter that tells callers if the matcher needs to be - * rehashed. * * Return: Zero on success, non-zero otherwise. */ int mlx5hws_matcher_attach_at(struct mlx5hws_matcher *matcher, - struct mlx5hws_action_template *at, - bool *need_rehash); + struct mlx5hws_action_template *at); /** * mlx5hws_matcher_resize_set_target - Link two matchers and enable moving rules. 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index 5b758467ed03d..9e6f35d684456 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -203,7 +203,7 @@ static int mlx5hws_rule_alloc_action_ste(struct mlx5hws_rule *rule, struct mlx5hws_context *ctx = matcher->tbl->ctx; rule->action_ste.ste.order = - ilog2(roundup_pow_of_two(matcher->action_ste.max_stes)); + ilog2(roundup_pow_of_two(matcher->num_of_action_stes)); return mlx5hws_action_ste_chunk_alloc(&ctx->action_ste_pool[queue_id], skip_rx, skip_tx, &rule->action_ste); From 864531ca2072c55ff00ba9dfd8c15cf0f576051b Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:41 +0300 Subject: [PATCH 145/770] net/mlx5: HWS, Free unused action STE tables Periodically check for unused action STE tables and free their associated resources. In order to do this safely, add a per-queue lock to synchronize the garbage collect work with regular operations on steering rules. Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-12-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mlx5/core/steering/hws/action_ste_pool.c | 88 ++++++++++++++++++- .../mlx5/core/steering/hws/action_ste_pool.h | 11 +++ .../mellanox/mlx5/core/steering/hws/context.h | 1 + 3 files changed, 96 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c index cb6ad84116315..5766a9c82f966 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.c @@ -159,6 +159,7 @@ hws_action_ste_table_alloc(struct mlx5hws_action_ste_pool_element *parent_elem) action_tbl->parent_elem = parent_elem; INIT_LIST_HEAD(&action_tbl->list_node); + action_tbl->last_used = jiffies; list_add(&action_tbl->list_node, &parent_elem->available); parent_elem->log_sz = log_sz; @@ -236,6 +237,8 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx, enum mlx5hws_pool_optimize opt; int err; + mutex_init(&pool->lock); + /* Rules which are added for both RX and TX must use the same action STE * indices for both. If we were to use a single table, then RX-only and * TX-only rules would waste the unused entries. 
Thus, we use separate @@ -247,6 +250,7 @@ static int hws_action_ste_pool_init(struct mlx5hws_context *ctx, opt); if (err) goto destroy_elems; + pool->elems[opt].parent_pool = pool; } return 0; @@ -267,6 +271,58 @@ static void hws_action_ste_pool_destroy(struct mlx5hws_action_ste_pool *pool) hws_action_ste_pool_element_destroy(&pool->elems[opt]); } +static void hws_action_ste_pool_element_collect_stale( + struct mlx5hws_action_ste_pool_element *elem, struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + unsigned long expire_time, now; + + expire_time = secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS); + now = jiffies; + + list_for_each_entry_safe(action_tbl, p, &elem->available, list_node) { + if (mlx5hws_pool_full(action_tbl->pool) && + time_before(action_tbl->last_used + expire_time, now)) + list_move(&action_tbl->list_node, cleanup); + } +} + +static void hws_action_ste_table_cleanup_list(struct list_head *cleanup) +{ + struct mlx5hws_action_ste_table *action_tbl, *p; + + list_for_each_entry_safe(action_tbl, p, cleanup, list_node) + hws_action_ste_table_destroy(action_tbl); +} + +static void hws_action_ste_pool_cleanup(struct work_struct *work) +{ + enum mlx5hws_pool_optimize opt; + struct mlx5hws_context *ctx; + LIST_HEAD(cleanup); + int i; + + ctx = container_of(work, struct mlx5hws_context, + action_ste_cleanup.work); + + for (i = 0; i < ctx->queues; i++) { + struct mlx5hws_action_ste_pool *p = &ctx->action_ste_pool[i]; + + mutex_lock(&p->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; + opt < MLX5HWS_POOL_OPTIMIZE_MAX; opt++) + hws_action_ste_pool_element_collect_stale( + &p->elems[opt], &cleanup); + mutex_unlock(&p->lock); + } + + hws_action_ste_table_cleanup_list(&cleanup); + + schedule_delayed_work(&ctx->action_ste_cleanup, + secs_to_jiffies( + MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); +} + int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx) { struct mlx5hws_action_ste_pool *pool; @@ -285,6 +341,12 @@ int mlx5hws_action_ste_pool_init(struct mlx5hws_context *ctx) ctx->action_ste_pool = pool; + INIT_DELAYED_WORK(&ctx->action_ste_cleanup, + hws_action_ste_pool_cleanup); + schedule_delayed_work( + &ctx->action_ste_cleanup, + secs_to_jiffies(MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS)); + return 0; free_pool: @@ -300,6 +362,8 @@ void mlx5hws_action_ste_pool_uninit(struct mlx5hws_context *ctx) size_t queues = ctx->queues; int i; + cancel_delayed_work_sync(&ctx->action_ste_cleanup); + for (i = 0; i < queues; i++) hws_action_ste_pool_destroy(&ctx->action_ste_pool[i]); @@ -330,6 +394,7 @@ hws_action_ste_table_chunk_alloc(struct mlx5hws_action_ste_table *action_tbl, return err; chunk->action_tbl = action_tbl; + action_tbl->last_used = jiffies; return 0; } @@ -346,6 +411,8 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, if (skip_rx && skip_tx) return -EINVAL; + mutex_lock(&pool->lock); + elem = hws_action_ste_choose_elem(pool, skip_rx, skip_tx); mlx5hws_dbg(elem->ctx, @@ -362,26 +429,39 @@ int mlx5hws_action_ste_chunk_alloc(struct mlx5hws_action_ste_pool *pool, if (!found) { action_tbl = hws_action_ste_table_alloc(elem); - if (IS_ERR(action_tbl)) - return PTR_ERR(action_tbl); + if (IS_ERR(action_tbl)) { + err = PTR_ERR(action_tbl); + goto out; + } err = hws_action_ste_table_chunk_alloc(action_tbl, chunk); if (err) - return err; + goto out; } if (mlx5hws_pool_empty(action_tbl->pool)) list_move(&action_tbl->list_node, &elem->full); - return 0; + err = 0; + +out: + mutex_unlock(&pool->lock); + + return err; } void 
mlx5hws_action_ste_chunk_free(struct mlx5hws_action_ste_chunk *chunk) { + struct mutex *lock = &chunk->action_tbl->parent_elem->parent_pool->lock; + mlx5hws_dbg(chunk->action_tbl->pool->ctx, "Freeing action STEs offset %d order %d\n", chunk->ste.offset, chunk->ste.order); + + mutex_lock(lock); mlx5hws_pool_chunk_free(chunk->action_tbl->pool, &chunk->ste); + chunk->action_tbl->last_used = jiffies; list_move(&chunk->action_tbl->list_node, &chunk->action_tbl->parent_elem->available); + mutex_unlock(lock); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h index 2de660a632239..a8ba97359e314 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/action_ste_pool.h @@ -8,6 +8,9 @@ #define MLX5HWS_ACTION_STE_TABLE_STEP_LOG_SZ 1 #define MLX5HWS_ACTION_STE_TABLE_MAX_LOG_SZ 20 +#define MLX5HWS_ACTION_STE_POOL_CLEANUP_SECONDS 300 +#define MLX5HWS_ACTION_STE_POOL_EXPIRE_SECONDS 300 + struct mlx5hws_action_ste_pool_element; struct mlx5hws_action_ste_table { @@ -19,10 +22,12 @@ struct mlx5hws_action_ste_table { u32 rtc_0_id; u32 rtc_1_id; struct list_head list_node; + unsigned long last_used; }; struct mlx5hws_action_ste_pool_element { struct mlx5hws_context *ctx; + struct mlx5hws_action_ste_pool *parent_pool; size_t log_sz; /* Size of the largest table so far. */ enum mlx5hws_pool_optimize opt; struct list_head available; @@ -33,6 +38,12 @@ struct mlx5hws_action_ste_pool_element { * per queue. */ struct mlx5hws_action_ste_pool { + /* Protects the entire pool. We have one pool per queue and only one + * operation can be active per rule at a given time. Thus this lock + * protects solely against concurrent garbage collection and we expect + * very little contention. + */ + struct mutex lock; struct mlx5hws_action_ste_pool_element elems[MLX5HWS_POOL_OPTIMIZE_MAX]; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h index e987e93bbc6e7..3f8938c73dc01 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/context.h @@ -40,6 +40,7 @@ struct mlx5hws_context { u32 pd_num; struct mlx5hws_pool *stc_pool; struct mlx5hws_action_ste_pool *action_ste_pool; /* One per queue */ + struct delayed_work action_ste_cleanup; struct mlx5hws_context_common_res common_res; struct mlx5hws_pattern_cache *pattern_cache; struct mlx5hws_definer_cache *definer_cache; From 3db55f8cc8d329a97e06fb44347b64a0ca44e780 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Thu, 10 Apr 2025 22:17:42 +0300 Subject: [PATCH 146/770] net/mlx5: HWS, Export action STE tables to debugfs Introduce a new type of dump object and dump all action STE tables, along with information on their RTCs and STEs. 
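For reference, each such table is dumped as one CSV record whose first field is the new resource type (4300, MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE), followed by the table's object ID and then its RTC/STE ID pairs. A dumped line might look like this (the numeric values here are made-up examples, not real output):

	4300,0xffff888103a4be00,12,4096,13,4352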
Signed-off-by: Vlad Dogaru Reviewed-by: Hamdan Agbariya Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/1744312662-356571-13-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/debug.c | 36 ++++++++++++++++++- .../mellanox/mlx5/core/steering/hws/debug.h | 2 ++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c index 38f75dec9cfc4..91568d6c1dac9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.c @@ -387,10 +387,41 @@ static int hws_debug_dump_context_stc(struct seq_file *f, struct mlx5hws_context return 0; } +static void +hws_debug_dump_action_ste_table(struct seq_file *f, + struct mlx5hws_action_ste_table *action_tbl) +{ + int ste_0_id = mlx5hws_pool_get_base_id(action_tbl->pool); + int ste_1_id = mlx5hws_pool_get_base_mirror_id(action_tbl->pool); + + seq_printf(f, "%d,0x%llx,%d,%d,%d,%d\n", + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE, + HWS_PTR_TO_ID(action_tbl), + action_tbl->rtc_0_id, ste_0_id, + action_tbl->rtc_1_id, ste_1_id); +} + +static void hws_debug_dump_action_ste_pool(struct seq_file *f, + struct mlx5hws_action_ste_pool *pool) +{ + struct mlx5hws_action_ste_table *action_tbl; + enum mlx5hws_pool_optimize opt; + + mutex_lock(&pool->lock); + for (opt = MLX5HWS_POOL_OPTIMIZE_NONE; opt < MLX5HWS_POOL_OPTIMIZE_MAX; + opt++) { + list_for_each_entry(action_tbl, &pool->elems[opt].available, + list_node) { + hws_debug_dump_action_ste_table(f, action_tbl); + } + } + mutex_unlock(&pool->lock); +} + static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ctx) { struct mlx5hws_table *tbl; - int ret; + int ret, i; ret = hws_debug_dump_context_info(f, ctx); if (ret) @@ -410,6 +441,9 @@ static int hws_debug_dump_context(struct seq_file *f, struct mlx5hws_context *ct return ret; } + for (i = 0; i < ctx->queues; i++) + hws_debug_dump_action_ste_pool(f, &ctx->action_ste_pool[i]); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h index e44e7ae28f93e..89c396f9f2660 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/debug.h @@ -26,6 +26,8 @@ enum mlx5hws_debug_res_type { MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_HASH_DEFINER = 4205, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_RANGE_DEFINER = 4206, MLX5HWS_DEBUG_RES_TYPE_MATCHER_TEMPLATE_COMPARE_MATCH_DEFINER = 4207, + + MLX5HWS_DEBUG_RES_TYPE_ACTION_STE_TABLE = 4300, }; static inline u64 From 28a79fc9b03e4ed89b65e5300091e7ed55e13192 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:46 +0100 Subject: [PATCH 147/770] rxrpc: kdoc: Update function descriptions and add link from rxrpc.rst Update the kerneldoc function descriptions to add "Return:" sections for AF_RXRPC exported functions that have return values to stop the kdoc builder from throwing warnings. Also add links from the rxrpc.rst API doc to add a function API reference at the end. (Note that the API doc really needs updating, but that's beyond this patchset). 
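For example, after this patch the kdoc for one of the exported peer functions takes the following shape (excerpted from the peer_object.c hunk below):

	/**
	 * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
	 * @peer: The peer to query
	 *
	 * Get the call's peer smoothed RTT.
	 *
	 * Return: The RTT in uS or %UINT_MAX if we have no samples.
	 */
	unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer);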
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-2-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 11 +++++++++++ net/rxrpc/af_rxrpc.c | 21 +++++++++++++++++---- net/rxrpc/key.c | 2 ++ net/rxrpc/peer_object.c | 22 ++++++++++++++++++---- net/rxrpc/recvmsg.c | 12 ++++++------ net/rxrpc/sendmsg.c | 7 +++++-- net/rxrpc/server_key.c | 2 ++ 7 files changed, 61 insertions(+), 16 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index e807e18ba32ac..2680abd1cb269 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1172,3 +1172,14 @@ adjusted through sysctls in /proc/net/rxrpc/: header plus exactly 1412 bytes of data. The terminal packet must contain a four byte header plus any amount of data. In any event, a jumbo packet may not exceed rxrpc_rx_mtu in size. + + +API Function Reference +====================== + +.. kernel-doc:: net/rxrpc/af_rxrpc.c +.. kernel-doc:: net/rxrpc/key.c +.. kernel-doc:: net/rxrpc/peer_object.c +.. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/sendmsg.c +.. kernel-doc:: net/rxrpc/server_key.c diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 86873399f7d50..2841ce72d7b89 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified - * address and return it with a ref held. + * address. + * + * Return: The peer record found with a reference, %NULL if no record is found + * or a negative error code if the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer - * @peer: The peer to get a reference on. + * @peer: The peer to get a reference on (may be NULL). + * + * Get a reference for a remote peer record (if not NULL). * - * Get a record for the remote peer in a call. + * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { @@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on + * + * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { @@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer); * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and - * call IDs as appropriate. The call to be used is returned. + * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. + * + * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, @@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call); * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. + * + * Return: %true if the call is still ongoing and %false if it has completed. 
*/ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) @@ -456,6 +467,8 @@ EXPORT_SYMBOL(rxrpc_kernel_check_life); * * Allow a kernel service to retrieve the epoch value from a service call to * see if the client at the other end rebooted. + * + * Return: The epoch of the call's connection. */ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) { diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 33e8302a79e33..8c99cf19b19d7 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -531,6 +531,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. + * + * Return: The new key or a negative error code. */ struct key *rxrpc_get_null_key(const char *keyname) { diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 71b6e07bf1616..e2f35e6c04d6c 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -475,6 +475,8 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) * @call: The call to query * * Get a record for the remote peer in a call. + * + * Return: The call's peer record. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { @@ -486,7 +488,9 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + * Get the call's peer smoothed RTT. + * + * Return: The RTT in uS or %UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { @@ -499,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt); * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible - * for making sure that the address is not deallocated. + * for making sure that the address is not deallocated. A fake address will be + * substituted if %peer is NULL. + * + * Return: The rxrpc address record or a fake record. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { @@ -512,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx); * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is - * responsible for making sure that the address is not deallocated. + * responsible for making sure that the address is not deallocated. A fake + * address will be substituted if %peer is NULL. + * + * Return: The transport address record or a fake record. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { @@ -527,7 +537,9 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_addr); * @app_data: The data to set * * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain - * anything the data might refer to. The previous app_data is returned. + * anything the data might refer to. + * + * Return: The previous app_data. */ unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) { @@ -540,6 +552,8 @@ EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); * @peer: The peer to query * * Retrieve the app-specific data from a peer. + * + * Return: The peer's app data.
*/ unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) { diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 32cd5f1d541db..4c622752a5693 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -477,14 +477,14 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the - * state of a call. Returns 0 if got what was asked for and there's more - * available, 1 if we got what was asked for and we're at the end of the data - * and -EAGAIN if we need more data. + * state of a call. Note that *@_abort should also be initialised to %0. * - * Note that we may return -EAGAIN to drain empty packets at the end of the - * data, even if we've already copied over the requested data. + * Note that we may return %-EAGAIN to drain empty packets at the end + * of the data, even if we've already copied over the requested data. * - * *_abort should also be initialised to 0. + * Return: %0 if got what was asked for and there's more available, %1 + * if we got what was asked for and we're at the end of the data and + * %-EAGAIN if we need more data. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 84dc6c94f23b1..3f62c34ce7de1 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -794,6 +794,8 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, @@ -829,8 +831,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: Indication as to why. * - * Allow a kernel service to abort a call, if it's still in an abortable state - * and return true if the call was aborted, false if it was already complete. + * Allow a kernel service to abort a call if it's still in an abortable state. + * + * Return: %true if the call was aborted, %false if it was already complete. */ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index e51940589ee54..eb7525e929510 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { From 23738cc8048322cf324f330cd697380fb3455da5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:47 +0100 Subject: [PATCH 148/770] rxrpc: Pull out certain app callback funcs into an ops table A number of functions separately furnish an AF_RXRPC socket with callback function pointers into a kernel app (such as the AFS filesystem) that is using it. Replace most of these with an ops table for the entire socket. 
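As a rough sketch of the resulting usage (the my_* callback names are hypothetical placeholders, not identifiers from this patch), a kernel app now fills in one table and registers it once:

	static const struct rxrpc_kernel_ops my_rxrpc_ops = {
		.notify_new_call	= my_notify_new_call,
		.discard_new_call	= my_discard_new_call,
		.user_attach_call	= my_user_attach_call,
	};

	rxrpc_kernel_set_notifications(socket, &my_rxrpc_ops);

instead of handing each function pointer to a separate registration call.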
This makes it easier to add more callback functions. Note that the call incoming data processing callback is retained as that gets set to different things, depending on the type of op. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-3-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/rxrpc.c | 11 ++++++++--- include/net/af_rxrpc.h | 25 +++++++++++++++---------- net/rxrpc/af_rxrpc.c | 20 ++++++++------------ net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_accept.c | 34 ++++++++++++++++------------------ net/rxrpc/rxperf.c | 10 +++++++--- 6 files changed, 55 insertions(+), 48 deletions(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5e480a33859b..a4734304e5421 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,15 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -84,8 +91,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -738,7 +744,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index cf793d18e5df5..ebb6092c488b5 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -29,18 +29,23 @@ enum rxrpc_interruptibility { */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS).
+ */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -72,9 +77,9 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2841ce72d7b89..2e4f6c9ef3de8 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -477,24 +477,20 @@ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) EXPORT_SYMBOL(rxrpc_kernel_get_epoch); /** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. 
*/ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /** * rxrpc_kernel_set_max_life - Set maximum lifespan on a call diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3cc3af15086ff..55810c6b4be89 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -146,8 +146,7 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index e685034ce4f7c..a4b363b47ccad 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -219,9 +219,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -387,8 +388,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -440,8 +442,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +450,18 @@ int 
rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call - @user_attach_call: Func to attach call to user_call_ID * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may become connected before this function returns. */ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +471,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index e848a4777b8c7..c76fbccfbb91c 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -136,6 +136,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. */ @@ -161,7 +167,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +214,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) From 019c8433eb2931a422376d2b0079d751aa948d4c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:48 +0100 Subject: [PATCH 149/770] rxrpc: Remove some socket lock acquire/release annotations Remove some socket lock acquire/release annotations as lock_sock() and release_sock() don't have them and so the checker gets confused. Removing all of them, however, causes warnings about "context imbalance" and "wrong count at exit" to occur instead. Probably lock_sock() and release_sock() should have annotations indicating their taking of sk_lock - there is a dep_map in socket_lock_t, but I don't know if that matters to the static checker.
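As a very rough sketch of that idea (this is not part of the patch, and whether sparse would accept this exact form is unverified), the declarations might carry something like:

	void lock_sock(struct sock *sk) __acquires(&sk->sk_lock);
	void release_sock(struct sock *sk) __releases(&sk->sk_lock);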
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-4-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/ar-internal.h | 4 +++- net/rxrpc/call_object.c | 2 +- net/rxrpc/sendmsg.c | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 55810c6b4be89..c267b8ac1bb53 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1000,7 +1000,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, - unsigned int); + unsigned int) + __releases(&rx->sk.sk_lock) + __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fce58be65e7cf..f428ea77f803c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -322,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3f62c34ce7de1..2342b5f1547cb 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -607,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; @@ -657,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) - __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; From 5800b1cf3fd8ccab752a101865be1e76dac33142 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:49 +0100 Subject: [PATCH 150/770] rxrpc: Allow CHALLENGEs to be passed to the app for a RESPONSE Allow the app to request that CHALLENGEs be passed to it through an out-of-band queue that allows recvmsg() to pick them up so that the app can add data to them with sendmsg(). This will allow the application (AFS or userspace) to interact with the process if it wants to and put values into user-defined fields. This will be used by AFS when talking to a fileserver to supply that fileserver with a crypto key by which callback RPCs can be encrypted (ie. notifications from the fileserver to the client).
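For a kernel app, the consumption pattern this enables (mirroring the AFS code added below) is a simple drain loop; the sock pointer here stands for whatever socket the app opened:

	struct sk_buff *oob;
	enum rxrpc_oob_type type;

	while ((oob = rxrpc_kernel_dequeue_oob(sock, &type))) {
		switch (type) {
		case RXRPC_OOB_CHALLENGE:
			/* Respond, e.g. via rxkad_kernel_respond_to_challenge(oob). */
			break;
		}
		rxrpc_kernel_free_oob(oob);
	}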
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-5-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 2 + fs/afs/Makefile | 1 + fs/afs/cm_security.c | 73 ++++++ fs/afs/internal.h | 6 + fs/afs/main.c | 1 + fs/afs/rxrpc.c | 18 ++ include/net/af_rxrpc.h | 24 ++ include/trace/events/rxrpc.h | 18 +- include/uapi/linux/rxrpc.h | 46 +++- net/rxrpc/Makefile | 1 + net/rxrpc/af_rxrpc.c | 52 +++- net/rxrpc/ar-internal.h | 51 +++- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 134 +++++++++- net/rxrpc/conn_object.c | 1 + net/rxrpc/insecure.c | 13 +- net/rxrpc/io_thread.c | 12 +- net/rxrpc/oob.c | 379 +++++++++++++++++++++++++++++ net/rxrpc/output.c | 56 +++++ net/rxrpc/recvmsg.c | 120 +++++++-- net/rxrpc/rxkad.c | 288 ++++++++++++++-------- net/rxrpc/sendmsg.c | 15 +- net/rxrpc/server_key.c | 40 +++ 23 files changed, 1188 insertions(+), 165 deletions(-) create mode 100644 fs/afs/cm_security.c create mode 100644 net/rxrpc/oob.c diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index 2680abd1cb269..eb941b2577ddf 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1179,7 +1179,9 @@ API Function Reference .. kernel-doc:: net/rxrpc/af_rxrpc.c .. kernel-doc:: net/rxrpc/key.c +.. kernel-doc:: net/rxrpc/oob.c .. kernel-doc:: net/rxrpc/peer_object.c .. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/rxkad.c .. kernel-doc:: net/rxrpc/sendmsg.c .. kernel-doc:: net/rxrpc/server_key.c diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 5efd7e13b3043..b49b8fe682f39 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-y := \ addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 0000000000000..efe6a23c84774 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include "internal.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include + +/* + * Respond to an RxGK challenge, adding appdata. + */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. 
+ */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 440b0e731093b..b3612b700c6a1 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -281,6 +281,7 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; @@ -1058,6 +1059,11 @@ extern void __net_exit afs_cell_purge(struct afs_net *); */ extern bool afs_cm_incoming_call(struct afs_call *); +/* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); + /* * dir.c */ diff --git a/fs/afs/main.c b/fs/afs/main.c index c845c5daaeba0..02475d415d885 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index a4734304e5421..212af2aa85bf4 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -25,12 +25,14 @@ static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { .notify_new_call = afs_rx_new_call, .discard_new_call = afs_rx_discard_new_call, .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, }; /* asynchronous incoming call initial processing */ @@ -56,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -71,6 +74,10 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -131,6 +138,7 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); _debug("dework"); @@ -957,3 +965,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. 
+ */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index ebb6092c488b5..0b209f703ffc1 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,6 +25,10 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ @@ -37,6 +42,7 @@ struct rxrpc_kernel_ops { unsigned long user_call_ID); void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); }; typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, @@ -88,5 +94,23 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cad50d91077ef..08ecebd905953 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -25,6 +25,7 @@ EM(afs_abort_probeuuid_negative, "afs-probeuuid-neg") \ EM(afs_abort_send_data_error, "afs-send-data") \ EM(afs_abort_unmarshal_error, "afs-unmarshal") \ + EM(afs_abort_unsupported_sec_class, "afs-unsup-sec-class") \ /* rxperf errors */ \ EM(rxperf_abort_general_error, "rxperf-error") \ EM(rxperf_abort_oom, "rxperf-oom") \ @@ -77,6 +78,7 @@ EM(rxrpc_abort_call_timeout, "call-timeout") \ EM(rxrpc_abort_no_service_key, "no-serv-key") \ EM(rxrpc_abort_nomem, "nomem") \ + EM(rxrpc_abort_response_sendmsg, "resp-sendmsg") \ EM(rxrpc_abort_service_not_offered, "serv-not-offered") \ EM(rxrpc_abort_shut_down, "shut-down") \ EM(rxrpc_abort_unsupported_security, "unsup-sec") \ @@ -133,24 +135,33 @@ EM(rxrpc_skb_get_conn_secured, "GET conn-secd") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_post_oob, "GET post-oob ") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ 
EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_response_rxgk, "NEW resp-rxgk") \ + EM(rxrpc_skb_new_response_rxkad, "NEW resp-rxkd") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ EM(rxrpc_skb_put_call_rx, "PUT call-rx ") \ + EM(rxrpc_skb_put_challenge, "PUT challenge") \ EM(rxrpc_skb_put_conn_secured, "PUT conn-secd") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_oob, "PUT oob ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_purge_oob, "PUT purge-oob") \ + EM(rxrpc_skb_put_response, "PUT response ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_oob_challenge, "SEE oob-chall") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_recvmsg_oob, "SEE recvm-oob") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ E_(rxrpc_skb_see_version, "SEE version ") @@ -216,9 +227,11 @@ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_challenge_input, "GET inp-chal") \ EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ EM(rxrpc_conn_get_idle, "GET idle ") \ EM(rxrpc_conn_get_poke_abort, "GET pk-abort") \ + EM(rxrpc_conn_get_poke_response, "GET response") \ EM(rxrpc_conn_get_poke_secured, "GET secured ") \ EM(rxrpc_conn_get_poke_timer, "GET poke ") \ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ @@ -226,10 +239,12 @@ EM(rxrpc_conn_new_service, "NEW service ") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_challenge_input, "PUT inp-chal") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_oob, "PUT oob ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ @@ -331,6 +346,7 @@ EM(rxrpc_recvmsg_full, "FULL") \ EM(rxrpc_recvmsg_hole, "HOLE") \ EM(rxrpc_recvmsg_next, "NEXT") \ + EM(rxrpc_recvmsg_oobq, "OOBQ") \ EM(rxrpc_recvmsg_requeue, "REQU") \ EM(rxrpc_recvmsg_return, "RETN") \ EM(rxrpc_recvmsg_terminal, "TERM") \ @@ -456,7 +472,7 @@ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ EM(rxrpc_tx_point_rxkad_challenge, "RxkadChall") \ - EM(rxrpc_tx_point_rxkad_response, "RxkadResp") \ + EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 8f8dc7a937a43..c4e9833b0a12b 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -36,26 +36,33 @@ struct sockaddr_rxrpc { #define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ #define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ #define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ +#define RXRPC_MANAGE_RESPONSE 7 /* [clnt] Want to manage RESPONSE packets */ /* * RxRPC control messages * - If neither abort or accept are specified, the message is a data message. 
* - terminal messages mean that a user call ID tag can be recycled + * - C/S/- indicate whether these are applicable to client, server or both * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() */ enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC_SET_CALL_TIMEOUT = 13, /* s-: Set one or more call timeouts */ - RXRPC_CHARGE_ACCEPT = 14, /* s-: Charge the accept pool with a user call ID */ + RXRPC_USER_CALL_ID = 1, /* -sr: User call ID specifier */ + RXRPC_ABORT = 2, /* -sr: Abort request / notification [terminal] */ + RXRPC_ACK = 3, /* S-r: RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* --r: Network error received [terminal] */ + RXRPC_BUSY = 6, /* C-r: Server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* --r: Local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* S-r: New incoming call notification */ + RXRPC_EXCLUSIVE_CALL = 10, /* Cs-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* Cs-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* -s-: Total length of Tx data */ + RXRPC_SET_CALL_TIMEOUT = 13, /* -s-: Set one or more call timeouts */ + RXRPC_CHARGE_ACCEPT = 14, /* Ss-: Charge the accept pool with a user call ID */ + RXRPC_OOB_ID = 15, /* -sr: OOB message ID */ + RXRPC_CHALLENGED = 16, /* C-r: Info on a received CHALLENGE */ + RXRPC_RESPOND = 17, /* Cs-: Respond to a challenge */ + RXRPC_RESPONDED = 18, /* S-r: Data received in RESPONSE */ + RXRPC_RESP_RXGK_APPDATA = 19, /* Cs-: RESPONSE: RxGK app data to include */ RXRPC__SUPPORTED }; @@ -118,4 +125,19 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * Challenge information in the RXRPC_CHALLENGED control message. + */ +struct rxrpc_challenge { + __u16 service_id; /* The service ID of the connection (may be upgraded) */ + __u8 security_index; /* The security index of the connection */ + __u8 pad; /* Round out to a multiple of 4 bytes. */ + /* ... The security class gets to append extra information ... 
*/ +}; + +struct rxgk_challenge { + struct rxrpc_challenge base; + __u32 enctype; /* Krb5 encoding type */ +}; + #endif /* _UAPI_LINUX_RXRPC_H */ diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 210b75e3179e9..0981941dea73d 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -24,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2e4f6c9ef3de8..3a558c1a541e1 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -633,7 +633,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -668,7 +671,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -749,6 +752,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -855,6 +878,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -888,8 +913,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -900,6 +927,23 @@ static int rxrpc_shutdown(struct socket *sock, int flags) return ret; } +/* + * Purge the out-of-band queue. 
+ */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + /* * RxRPC socket destructor */ @@ -907,6 +951,7 @@ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -945,7 +990,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -957,6 +1004,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c267b8ac1bb53..31d05784c530a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -39,6 +39,7 @@ struct rxrpc_txqueue; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ @@ -149,6 +150,9 @@ struct rxrpc_sock { const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -159,6 +163,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -202,7 +207,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -216,6 +221,19 @@ struct rxrpc_skb_priv { u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header 
from this packet */ }; @@ -269,9 +287,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -526,6 +559,7 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; }; + struct sk_buff *tx_response; /* Response packet to be transmitted */ unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -1199,8 +1233,11 @@ void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { + if (!local->io_thread) + return; wake_up_process(READ_ONCE(local->io_thread)); } @@ -1289,6 +1326,13 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) return net_generic(net, rxrpc_net_id); } +/* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + /* * output.c */ @@ -1300,6 +1344,7 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f428ea77f803c..fc88ffe1b0503 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -145,8 +145,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 4d9c5e21ba785..232b6986da83e 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,7 +19,7 @@ /* * Set the completion state on an aborted connection. 
*/ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -67,7 +75,7 @@ static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { trace_rxrpc_rx_conn_abort(conn, skb); - rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, RXRPC_CALL_REMOTELY_ABORTED); } @@ -248,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -270,7 +281,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ - sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -391,6 +403,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); } +/* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. 
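+ *
+ * Returns true if the challenge was passed on for a response to be generated
+ * (either handed to a managing userspace socket or queued for the security
+ * class to answer automatically) and false if no call could field it.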
+ */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + /* * Input a connection-level packet. */ @@ -411,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -436,6 +513,19 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); + if (conn->tx_response) { + struct sk_buff *skb; + + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + if (skb) { switch (skb->mark) { case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: @@ -452,3 +542,31 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(skb); + + /* Always go with the response to the most recent challenge. 
*/ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = old; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 8ac22dde8b39b..d14a802d2001f 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -329,6 +329,7 @@ static void rxrpc_clean_up_connection(struct work_struct *work) } rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index e068f9b79d023..1f7c136d6d0e6 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -42,13 +42,19 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static int none_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool none_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, rxrpc_eproto_rxnull_challenge); } +static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb) { @@ -82,7 +88,8 @@ const struct rxrpc_security rxrpc_no_security = { .alloc_txbuf = none_alloc_txbuf, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 64f8d77b87312..27b650d30f4db 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -489,8 +489,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -501,9 +501,11 @@ int rxrpc_io_thread(void *data) } /* Deal with connections that want immediate attention. */ - spin_lock_irq(&local->lock); - list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); - spin_unlock_irq(&local->lock); + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); + } while ((conn = list_first_entry_or_null(&conn_attend_q, struct rxrpc_connection, diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 0000000000000..3601022b91900 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. + * The tree is also given a ref to the message. + */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. 
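+ *
+ * RXRPC_OOB_ID nominates the pending message being responded to,
+ * RXRPC_RESPOND asks for a RESPONSE packet to be transmitted and RXRPC_ABORT
+ * supplies an abort code with which to reject the challenge instead.
+ * RXRPC_RESP_RXGK_APPDATA indicates that rxgk application data is being
+ * supplied and is only valid in combination with RXRPC_RESPOND.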
+ */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. + * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. 
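+ *
+ * For an unrecognised message type, *_peer is set to NULL and the type is
+ * still returned.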
+ */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = 0; /* TODO: retrieve appdata */ + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. + */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = 0; /* TODO: retrieve appdata */ + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. 
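+ *
+ * Note that this does not consume @challenge; the caller remains
+ * responsible for freeing it with rxrpc_kernel_free_oob().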
+ */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 95905b85a8d71..04c900990a401 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -916,3 +916,59 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } + +/* + * Send a RESPONSE message. + */ +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec bvec[16]; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; + + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; + + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); +} diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 4c622752a5693..86a27fb55a1cc 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -154,6 +154,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) return call->security->verify_packet(call, skb); } +/* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. + */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. 
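+ *
+ * The message ID is handed to userspace in an RXRPC_OOB_ID control message.
+ * A challenge is then parked in the pending-response tree to await the
+ * user's reply; other message types are freed once delivered.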
+ */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA @@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +345,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +385,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +407,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,6 +418,15 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. 
* We also want to weed out calls that got requeued whilst we were @@ -342,7 +437,8 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); @@ -377,21 +473,9 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; - } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 6cb37b0eb77f4..8ddccee69009d 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -697,62 +697,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) return 0; } -/* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - /* * calculate the response checksum */ @@ -772,12 +716,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = 
skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -786,88 +739,205 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. */ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; +} + +/* + * Insert the header into the response. 
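+ *
+ * The wire header and the rxkad response record are built contiguously on
+ * the stack, checksummed and then copied into the response skb in one go;
+ * the serial number is left zero for rxrpc_send_response() to fill in.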
+ */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} + +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; + + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; + + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - 
resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } +/* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + /* * decrypt the kerberos IV ticket in the response */ @@ -1276,6 +1346,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2342b5f1547cb..ebbb78b842de8 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -758,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index eb7525e929510..36b05fd842a7b 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -171,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the keyring on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. 
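+ * Such messages are delivered ahead of ordinary call data and carry an
+ * RXRPC_OOB_ID control message identifying the challenge to be answered.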
+ * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. + */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); From 01af64269751f261421a9e80a527c8e987aeda8d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:50 +0100 Subject: [PATCH 151/770] rxrpc: Add the security index for yfs-rxgk Add the security index and abort codes for the YFS variant of rxgk. Signed-off-by: David Howells Link: https://patch.msgid.link/20250411095303.2316168-6-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/misc.c | 27 +++++++++++++++++++++++++++ include/crypto/krb5.h | 5 +++++ include/uapi/linux/rxrpc.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f7..8f2b3a1776908 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/include/crypto/krb5.h b/include/crypto/krb5.h index 62d998e62f47d..71dd38f59be1d 100644 --- a/include/crypto/krb5.h +++ b/include/crypto/krb5.h @@ -63,6 +63,11 @@ struct scatterlist; #define KEY_USAGE_SEED_ENCRYPTION (0xAA) #define KEY_USAGE_SEED_INTEGRITY (0x55) +/* + * Standard Kerberos error codes. + */ +#define KRB5_PROG_KEYTYPE_NOSUPP -1765328233 + /* * Mode of operation. 
*/ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index c4e9833b0a12b..d9735abd4c791 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -80,6 +80,7 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +#define RXRPC_SECURITY_YFS_RXGK 6 /* YFS gssapi-based */ /* * RxRPC-level abort codes @@ -125,6 +126,36 @@ enum rxrpc_cmsg_type { #define RXKADDATALEN 19270411 /* user data too long */ #define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ +/* + * RxGK GSSAPI security abort codes. + */ +#if 0 /* Original standard abort codes (used by OpenAFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Invalid security challenge */ +#define RXGK_BADETYPE 1233242883 /* Invalid or impermissible encryption type */ +#define RXGK_BADLEVEL 1233242884 /* Invalid or impermissible security level */ +#define RXGK_BADKEYNO 1233242885 /* Key version number not found */ +#define RXGK_EXPIRED 1233242886 /* Token has expired */ +#define RXGK_NOTAUTH 1233242887 /* Caller not authorized */ +#define RXGK_BAD_TOKEN 1233242888 /* Security object was passed a bad token */ +#define RXGK_SEALED_INCON 1233242889 /* Sealed data inconsistent */ +#define RXGK_DATA_LEN 1233242890 /* User data too long */ +#define RXGK_BAD_QOP 1233242891 /* Inadequate quality of protection available */ +#else /* Revised standard abort codes (used by YFS) */ +#define RXGK_INCONSISTENCY 1233242880 /* Security module structure inconsistent */ +#define RXGK_PACKETSHORT 1233242881 /* Packet too short for security challenge */ +#define RXGK_BADCHALLENGE 1233242882 /* Security challenge/response failed */ +#define RXGK_SEALEDINCON 1233242883 /* Sealed data is inconsistent */ +#define RXGK_NOTAUTH 1233242884 /* Caller not authorised */ +#define RXGK_EXPIRED 1233242885 /* Authentication expired */ +#define RXGK_BADLEVEL 1233242886 /* Unsupported or not permitted security level */ +#define RXGK_BADKEYNO 1233242887 /* Bad transport key number */ +#define RXGK_NOTRXGK 1233242888 /* Security layer is not rxgk */ +#define RXGK_UNSUPPORTED 1233242889 /* Endpoint does not support rxgk */ +#define RXGK_GSSERROR 1233242890 /* GSSAPI mechanism error */ +#endif + /* * Challenge information in the RXRPC_CHALLENGED control message. */ From 0ca100ff4df64f5d0f6c1dd5080c3e096786bea6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:51 +0100 Subject: [PATCH 152/770] rxrpc: Add YFS RxGK (GSSAPI) security class Add support for the YFS-variant RxGK security class to support GSSAPI-derived authentication. This also allows the use of better crypto over the rxkad security class. 
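As a usage sketch (not part of this patch): a token in this format would
typically be installed from userspace with add_key(2) as an "rxrpc"-type
key, the payload being the XDR-encoded ktc_setTokenData blob laid out
below. The helper name and key description here are placeholders, assuming
only the keyutils library:

	#include <keyutils.h>

	/* Install an XDR-encoded ktc_setTokenData blob as an rxrpc-type
	 * key on the process keyring.  "afs@EXAMPLE.COM" is a placeholder
	 * description naming the target cell.
	 */
	static key_serial_t install_afs_token(const void *xdr_blob, size_t len)
	{
		return add_key("rxrpc", "afs@EXAMPLE.COM", xdr_blob, len,
			       KEY_SPEC_PROCESS_KEYRING);
	}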
The key payload is XDR encoded of the form: typedef int64_t opr_time; const AFSTOKEN_RK_TIX_MAX = 12000; /* Matches entry in rxkad.h */ struct token_rxkad { afs_int32 viceid; afs_int32 kvno; afs_int64 key; afs_int32 begintime; afs_int32 endtime; afs_int32 primary_flag; opaque ticket; }; struct token_rxgk { opr_time begintime; opr_time endtime; afs_int64 level; afs_int64 lifetime; afs_int64 bytelife; afs_int64 enctype; opaque key<>; opaque ticket<>; }; const AFSTOKEN_UNION_NOAUTH = 0; const AFSTOKEN_UNION_KAD = 2; const AFSTOKEN_UNION_YFSGK = 6; union ktc_tokenUnion switch (afs_int32 type) { case AFSTOKEN_UNION_KAD: token_rxkad kad; case AFSTOKEN_UNION_YFSGK: token_rxgk gk; }; const AFSTOKEN_LENGTH_MAX = 16384; typedef opaque token_opaque; const AFSTOKEN_MAX = 8; const AFSTOKEN_CELL_MAX = 64; struct ktc_setTokenData { afs_int32 flags; string cell; token_opaque tokens; }; The parser for the basic token struct is already present, as is the rxkad token type. This adds a parser for the rxgk token type. Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-7-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/keys/rxrpc-type.h | 17 ++++ net/rxrpc/key.c | 185 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) diff --git a/include/keys/rxrpc-type.h b/include/keys/rxrpc-type.h index 333c0f49a9cd2..0ddbe197a261b 100644 --- a/include/keys/rxrpc-type.h +++ b/include/keys/rxrpc-type.h @@ -9,6 +9,7 @@ #define _KEYS_RXRPC_TYPE_H #include +#include /* * key type for AF_RXRPC keys @@ -31,6 +32,21 @@ struct rxkad_key { u8 ticket[]; /* the encrypted ticket */ }; +/* + * RxRPC key for YFS-RxGK (type-6 security) + */ +struct rxgk_key { + s64 begintime; /* Time at which the ticket starts */ + s64 endtime; /* Time at which the ticket ends */ + u64 lifetime; /* Maximum lifespan of a connection (seconds) */ + u64 bytelife; /* Maximum number of bytes on a connection */ + unsigned int enctype; /* Encoding type */ + s8 level; /* Negotiated security RXRPC_SECURITY_PLAIN/AUTH/ENCRYPT */ + struct krb5_buffer key; /* Master key, K0 */ + struct krb5_buffer ticket; /* Ticket to be passed to server */ + u8 _key[]; /* Key storage */ +}; + /* * list of tokens attached to an rxrpc key */ @@ -40,6 +56,7 @@ struct rxrpc_key_token { struct rxrpc_key_token *next; /* the next token in the list */ union { struct rxkad_key *kad; + struct rxgk_key *rxgk; }; }; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 8c99cf19b19d7..9fdc1f031c9da 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? 
-tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + 
_leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -597,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -676,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); From c86f9b963dc606ce884edfcb6724778bd1471ace Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:52 +0100 Subject: [PATCH 153/770] rxrpc: rxgk: Provide infrastructure and key derivation Provide some infrastructure for implementing the RxGK transport security class: (1) A definition of an encoding type, including: - Relevant crypto-layer names - Lengths of the crypto keys and checksums involved - Crypto functions specific to the encoding type - Crypto scheme used for that type (2) A definition of a crypto scheme, including: - Underlying crypto handlers - The pseudo-random function, PRF, used in base key derivation - Functions for deriving usage keys Kc, Ke and Ki - Functions for en/decrypting parts of an sk_buff (3) A key context, with the usage keys required for a derivative of a transport key for a specific key number. This includes keys for securing packets for transmission, extracting received packets and dealing with response packets. (3) A function to look up an encoding type by number. (4) A function to set up a key context and derive the keys. 
(5) A function to set up the keys required to extract the ticket obtained from the GSS negotiation in the server. (6) Miscellaneous functions for context handling. The keys and key derivation functions are described in: tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-8-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/Kconfig | 23 ++++ net/rxrpc/Makefile | 3 +- net/rxrpc/ar-internal.h | 3 + net/rxrpc/rxgk_common.h | 48 +++++++ net/rxrpc/rxgk_kdf.c | 288 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 net/rxrpc/rxgk_common.h create mode 100644 net/rxrpc/rxgk_kdf.c diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index a20986806fea9..f60b81c660789 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -67,6 +67,29 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXGK + bool "RxRPC GSSAPI security" + select CRYPTO_KRB5 + select CRYPTO_MANAGER + select CRYPTO_KRB5ENC + select CRYPTO_AUTHENC + select CRYPTO_SKCIPHER + select CRYPTO_HASH_INFO + select CRYPTO_HMAC + select CRYPTO_CMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_AES + select CRYPTO_CAMELLIA + help + Provide the GSSAPI-based RxGK security class for AFS. Keys are added + with add_key(). + + See Documentation/networking/rxrpc.rst. + config RXPERF tristate "RxRPC test service" help diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 0981941dea73d..3eda77a0266bf 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -40,6 +40,7 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o - +rxrpc-$(CONFIG_RXGK) += \ + rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 31d05784c530a..a776f08d10b31 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -558,6 +558,9 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; + struct { + u64 start_time; /* The start time for TK derivation */ + } rxgk; }; struct sk_buff *tx_response; /* Response packet to be transmitted */ unsigned long flags; diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h new file mode 100644 index 0000000000000..da1464e65766c --- /dev/null +++ b/net/rxrpc/rxgk_common.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Common bits for GSSAPI-based RxRPC security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include +#include +#include + +/* + * Per-key number context. This is replaced when the connection is rekeyed. + */ +struct rxgk_context { + refcount_t usage; + unsigned int key_number; /* Rekeying number (goes in the rx header) */ + unsigned long flags; +#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */ + unsigned long expiry; /* Expiration time of this key */ + long long bytes_remaining; /* Remaining Tx lifetime of this key */ + const struct krb5_enctype *krb5; /* RxGK encryption type */ + const struct rxgk_key *key; + + /* We need up to 7 keys derived from the transport key, but we don't + * actually need the transport key. Each key is derived by + * DK(TK,constant). 
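+ * The per-usage derivation constants are defined in rxgk_kdf.c
+ * (RXGK_CLIENT_ENC_PACKET and friends); which of the handles below
+ * actually get populated depends on the negotiated security level.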
+ */ + struct crypto_aead *tx_enc; /* Transmission key */ + struct crypto_aead *rx_enc; /* Reception key */ + struct crypto_shash *tx_Kc; /* Transmission checksum key */ + struct crypto_shash *rx_Kc; /* Reception checksum key */ + struct crypto_aead *resp_enc; /* Response packet enc key */ +}; + +/* + * rxgk_kdf.c + */ +void rxgk_put(struct rxgk_context *gk); +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp); +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_key, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp); diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c new file mode 100644 index 0000000000000..b4db5aa30e5b9 --- /dev/null +++ b/net/rxrpc/rxgk_kdf.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxGK transport key derivation. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +#define round16(x) (((x) + 15) & ~15) + +/* + * Constants used to derive the keys and hmacs actually used for doing stuff. + */ +#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402 +#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403 +#define RXGK_SERVER_ENC_PACKET 1028U // 0x404 +#define RXGK_SERVER_MIC_PACKET 1029U // 0x405 +#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406 +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c + +static void rxgk_free(struct rxgk_context *gk) +{ + if (gk->tx_Kc) + crypto_free_shash(gk->tx_Kc); + if (gk->rx_Kc) + crypto_free_shash(gk->rx_Kc); + if (gk->tx_enc) + crypto_free_aead(gk->tx_enc); + if (gk->rx_enc) + crypto_free_aead(gk->rx_enc); + if (gk->resp_enc) + crypto_free_aead(gk->resp_enc); + kfree(gk); +} + +void rxgk_put(struct rxgk_context *gk) +{ + if (gk && refcount_dec_and_test(&gk->usage)) + rxgk_free(gk); +} + +/* + * Transport key derivation function. + * + * TK = random-to-key(PRF+(K0, L, + * epoch || cid || start_time || key_number)) + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3] + */ +static int rxgk_derive_transport_key(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + struct krb5_buffer *TK, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct krb5_buffer conn_info; + unsigned int L = krb5->key_bytes; + __be32 *info; + u8 *buffer; + int ret; + + _enter(""); + + conn_info.len = sizeof(__be32) * 5; + + buffer = kzalloc(round16(conn_info.len), gfp); + if (!buffer) + return -ENOMEM; + + conn_info.data = buffer; + + info = (__be32 *)conn_info.data; + info[0] = htonl(conn->proto.epoch); + info[1] = htonl(conn->proto.cid); + info[2] = htonl(conn->rxgk.start_time >> 32); + info[3] = htonl(conn->rxgk.start_time >> 0); + info[4] = htonl(gk->key_number); + + ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp); + kfree_sensitive(buffer); + _leave(" = %d", ret); + return ret; +} + +/* + * Set up the ciphers for the usage keys. 
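+ *
+ * The response cipher is always needed; the Kc checksum keys are only
+ * set up for the AUTH level and the packet encryption keys only for
+ * the ENCRYPT level.  The Tx/Rx assignments are mirrored between the
+ * client and service ends.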
+ */ +static int rxgk_set_up_ciphers(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct crypto_shash *shash; + struct crypto_aead *aead; + struct krb5_buffer TK; + bool service = rxrpc_conn_is_service(conn); + int ret; + u8 *buffer; + + buffer = kzalloc(krb5->key_bytes, gfp); + if (!buffer) + return -ENOMEM; + + TK.len = krb5->key_bytes; + TK.data = buffer; + + ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp); + if (ret < 0) + goto out; + + aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->resp_enc = aead; + + if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len || + crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) { + pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n", + crypto_aead_blocksize(gk->resp_enc), krb5->block_len, + crypto_aead_authsize(gk->resp_enc), krb5->cksum_len); + ret = -EINVAL; + goto out; + } + + if (service) { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } else { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } + + ret = 0; +out: + kfree_sensitive(buffer); + return ret; +aead_error: + ret = PTR_ERR(aead); + goto out; +hash_error: + ret = PTR_ERR(shash); + goto out; +} + +/* + * Derive a transport key for a connection and then derive a bunch of usage + * keys from it and set up ciphers using them. + */ +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp) +{ + struct rxgk_context *gk; + unsigned long lifetime; + int ret = -ENOPKG; + + _enter(""); + + gk = kzalloc(sizeof(*gk), GFP_KERNEL); + if (!gk) + return ERR_PTR(-ENOMEM); + refcount_set(&gk->usage, 1); + gk->key = key; + gk->key_number = key_number; + + gk->krb5 = crypto_krb5_find_enctype(key->enctype); + if (!gk->krb5) + goto err_tk; + + ret = rxgk_set_up_ciphers(conn, gk, key, gfp); + if (ret) + goto err_tk; + + /* Set the remaining number of bytes encrypted with this key that may + * be transmitted before rekeying. Note that the spec has been + * interpreted differently on this point... 
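+ * Here a bytelife of 0 or 63 is taken to mean no limit, 1-62 is taken
+ * as log2 of the byte count and any other value as a literal byte
+ * count.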
+ */ + switch (key->bytelife) { + case 0: + case 63: + gk->bytes_remaining = LLONG_MAX; + break; + case 1 ... 62: + gk->bytes_remaining = 1LL << key->bytelife; + break; + default: + gk->bytes_remaining = key->bytelife; + break; + } + + /* Set the time after which rekeying must occur */ + if (key->lifetime) { + lifetime = min_t(u64, key->lifetime, INT_MAX / HZ); + lifetime *= HZ; + } else { + lifetime = MAX_JIFFY_OFFSET; + } + gk->expiry = jiffies + lifetime; + return gk; + +err_tk: + rxgk_put(gk); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Use the server secret key to set up the ciphers that will be used to extract + * the token from a response packet. + */ +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_aead, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp) +{ + const struct krb5_enctype *krb5; + struct crypto_aead *aead; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + *_krb5 = krb5; + *token_aead = aead; + return 0; +} From 9d1d2b59341f58126a69b51f9f5f8ccb9f12e54a Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:53 +0100 Subject: [PATCH 154/770] rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI) Implement the basic parts of the yfs-rxgk security class (security index 6) to support GSSAPI-negotiated security. Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-9-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 1 + fs/afs/cm_security.c | 12 + include/trace/events/rxrpc.h | 45 +- net/rxrpc/Makefile | 2 + net/rxrpc/ar-internal.h | 17 + net/rxrpc/output.c | 2 +- net/rxrpc/protocol.h | 20 + net/rxrpc/rxgk.c | 1215 ++++++++++++++++++++++++++++ net/rxrpc/rxgk_app.c | 285 +++++++ net/rxrpc/rxgk_common.h | 91 +++ net/rxrpc/rxkad.c | 6 +- net/rxrpc/security.c | 3 + 12 files changed, 1694 insertions(+), 5 deletions(-) create mode 100644 net/rxrpc/rxgk.c create mode 100644 net/rxrpc/rxgk_app.c diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index eb941b2577ddf..a01f0c81ca4b2 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1182,6 +1182,7 @@ API Function Reference .. kernel-doc:: net/rxrpc/oob.c .. kernel-doc:: net/rxrpc/peer_object.c .. kernel-doc:: net/rxrpc/recvmsg.c +.. kernel-doc:: net/rxrpc/rxgk.c .. kernel-doc:: net/rxrpc/rxkad.c .. kernel-doc:: net/rxrpc/sendmsg.c .. 
kernel-doc:: net/rxrpc/server_key.c diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c index efe6a23c84774..e8eb63e4d124a 100644 --- a/fs/afs/cm_security.c +++ b/fs/afs/cm_security.c @@ -6,6 +6,7 @@ */ #include +#include #include "internal.h" #include "afs_fs.h" #include "protocol_yfs.h" @@ -17,6 +18,9 @@ */ static int afs_respond_to_challenge(struct sk_buff *challenge) { +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; +#endif struct rxrpc_peer *peer; unsigned long peer_data; u16 service_id; @@ -44,8 +48,16 @@ static int afs_respond_to_challenge(struct sk_buff *challenge) } switch (security_index) { +#ifdef CONFIG_RXKAD case RXRPC_SECURITY_RXKAD: return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + case RXRPC_SECURITY_YFS_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif default: return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 08ecebd905953..aab81e8196aee 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -69,6 +69,38 @@ EM(rxkad_abort_resp_tkt_sname, "rxkad-resp-tk-sname") \ EM(rxkad_abort_resp_unknown_tkt, "rxkad-resp-unknown-tkt") \ EM(rxkad_abort_resp_version, "rxkad-resp-version") \ + /* RxGK security errors */ \ + EM(rxgk_abort_1_verify_mic_eproto, "rxgk1-vfy-mic-eproto") \ + EM(rxgk_abort_2_decrypt_eproto, "rxgk2-dec-eproto") \ + EM(rxgk_abort_2_short_data, "rxgk2-short-data") \ + EM(rxgk_abort_2_short_encdata, "rxgk2-short-encdata") \ + EM(rxgk_abort_2_short_header, "rxgk2-short-hdr") \ + EM(rxgk_abort_bad_key_number, "rxgk-bad-key-num") \ + EM(rxgk_abort_chall_key_expired, "rxgk-chall-key-exp") \ + EM(rxgk_abort_chall_no_key, "rxgk-chall-nokey") \ + EM(rxgk_abort_chall_short, "rxgk-chall-short") \ + EM(rxgk_abort_resp_auth_dec, "rxgk-resp-auth-dec") \ + EM(rxgk_abort_resp_bad_callid, "rxgk-resp-bad-callid") \ + EM(rxgk_abort_resp_bad_nonce, "rxgk-resp-bad-nonce") \ + EM(rxgk_abort_resp_bad_param, "rxgk-resp-bad-param") \ + EM(rxgk_abort_resp_call_ctr, "rxgk-resp-call-ctr") \ + EM(rxgk_abort_resp_call_state, "rxgk-resp-call-state") \ + EM(rxgk_abort_resp_internal_error, "rxgk-resp-int-error") \ + EM(rxgk_abort_resp_nopkg, "rxgk-resp-nopkg") \ + EM(rxgk_abort_resp_short_applen, "rxgk-resp-short-applen") \ + EM(rxgk_abort_resp_short_auth, "rxgk-resp-short-auth") \ + EM(rxgk_abort_resp_short_call_list, "rxgk-resp-short-callls") \ + EM(rxgk_abort_resp_short_packet, "rxgk-resp-short-packet") \ + EM(rxgk_abort_resp_short_yfs_klen, "rxgk-resp-short-yfs-klen") \ + EM(rxgk_abort_resp_short_yfs_key, "rxgk-resp-short-yfs-key") \ + EM(rxgk_abort_resp_short_yfs_tkt, "rxgk-resp-short-yfs-tkt") \ + EM(rxgk_abort_resp_tok_dec, "rxgk-resp-tok-dec") \ + EM(rxgk_abort_resp_tok_internal_error, "rxgk-resp-tok-int-err") \ + EM(rxgk_abort_resp_tok_keyerr, "rxgk-resp-tok-keyerr") \ + EM(rxgk_abort_resp_tok_nokey, "rxgk-resp-tok-nokey") \ + EM(rxgk_abort_resp_tok_nopkg, "rxgk-resp-tok-nopkg") \ + EM(rxgk_abort_resp_tok_short, "rxgk-resp-tok-short") \ + EM(rxgk_abort_resp_xdr_align, "rxgk-resp-xdr-align") \ /* rxrpc errors */ \ EM(rxrpc_abort_call_improper_term, "call-improper-term") \ EM(rxrpc_abort_call_reset, "call-reset") \ @@ -471,6 +503,7 @@ EM(rxrpc_tx_point_call_final_resend, "CallFinalResend") \ EM(rxrpc_tx_point_conn_abort, "ConnAbort") \ EM(rxrpc_tx_point_reject, "Reject") \ + EM(rxrpc_tx_point_rxgk_challenge, "RxGKChall") \ EM(rxrpc_tx_point_rxkad_challenge, 
"RxkadChall") \ EM(rxrpc_tx_point_response, "Response") \ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ @@ -489,6 +522,7 @@ #define rxrpc_txbuf_traces \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_alloc_response, "ALLOC RESP ") \ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ @@ -496,6 +530,7 @@ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_response_tx, "PUT RESP TX") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_lost, "SEE LOST ") \ @@ -1178,6 +1213,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u8, security_ix) ), TP_fast_assign( @@ -1186,11 +1222,13 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) @@ -1208,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_response, __field(u32, version) __field(u32, kvno) __field(u32, ticket_len) + __field(u8, security_ix) ), TP_fast_assign( @@ -1216,11 +1255,13 @@ TRACE_EVENT(rxrpc_rx_response, __entry->version = version; __entry->kvno = kvno; __entry->ticket_len = ticket_len; + __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + TP_printk("C=%08x RESPONSE r=%08x sx=%u v=%x kvno=%x tl=%x", __entry->conn, __entry->serial, + __entry->security_ix, __entry->version, __entry->kvno, __entry->ticket_len) diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 3eda77a0266bf..c0542bae719e3 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -41,6 +41,8 @@ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a776f08d10b31..57ea6d61b7a2a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -31,6 +31,7 @@ struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. 
skb->priority is used @@ -312,6 +313,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -559,7 +565,10 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; struct { + struct rxgk_context *keys[1]; u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ } rxgk; }; struct sk_buff *tx_response; /* Response packet to be transmitted */ @@ -903,6 +912,8 @@ struct rxrpc_txbuf { unsigned short len; /* Amount of data in buffer */ unsigned short space; /* Remaining data space */ unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ unsigned short pkt_len; /* Size of packet content */ unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; @@ -1339,6 +1350,7 @@ int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); /* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); @@ -1411,6 +1423,11 @@ void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); void rxrpc_call_init_rtt(struct rxrpc_call *call); +/* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; + /* * rxkad.c */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 04c900990a401..8138f35d79459 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 42f70e4636f82..f8bfec12bc7e0 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -181,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header. + */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 0000000000000..02752eeb23954 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + u16 *specific_key_number) +{ + refcount_inc(&conn->rxgk.keys[0]->usage); + return conn->rxgk.keys[0]; +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, 0, GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[0] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. 
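+ *
+ * For the AUTH and ENCRYPT levels, ask the krb5 layer how much payload
+ * fits in a jumbo subpacket once the crypto overhead (and, for
+ * encryption, the rxgk_header) is accounted for, then place the fill
+ * point after both headers.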
+ */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. */ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. 
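+ * It carries a copy of the epoch/cid/call/seq values so that the
+ * receiver can cross-check the encrypted payload against the
+ * cleartext wire header.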
*/ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) + return ret; + + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + return -ENOMEM; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). 
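+ *
+ * The packet is decrypted in place and the rxgk_header is then
+ * extracted and checked against the cleartext rxrpc header before the
+ * data length it carries is believed.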
+ */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). + */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. 
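+ *
+ * A challenge is just the wire header followed by a 20-byte random
+ * nonce that the peer must echo back inside the encrypted RESPONSE
+ * authenticator.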
+ */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. + */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. 
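+ * XDR rounds opaque data up to a multiple of four bytes and the pad
+ * bytes must be zero on the wire.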
+ */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return 
crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + return ret; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. 
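+ * The appdata is XDR-encoded into the authenticator and is thus
+ * covered by the authenticator's encryption.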
+ * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. 
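+ *
+ * The authenticator is copied out of the skb into a flat buffer first
+ * so that the XDR fields can be walked directly.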
+ */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. + * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. 
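+ * This uses the RESPONSE cipher derived from the transport key that
+ * was just generated from the token's K0.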
*/ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 0000000000000..6206a84395b8b --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. 
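+ * The ticket layout is given below; it gets repackaged into the XDR
+ * form that the rxrpc key type expects.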
+ * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data; + * opaque display; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. 
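+ * The assembled XDR blob is instantiated as an rxrpc-type key so that
+ * the standard token parser consumes it.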
*/ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. 
+ */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index da1464e65766c..7370a56559853 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -33,6 +33,19 @@ struct rxgk_context { struct crypto_aead *resp_enc; /* Response packet enc key */ }; +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + /* * rxgk_kdf.c */ @@ -46,3 +59,81 @@ int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, unsigned int enctype, const struct krb5_enctype **_krb5, gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. 
+ */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8ddccee69009d..0b5e007c7de95 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -177,8 +177,10 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem if (!txb) return NULL; - txb->offset += shdr; - txb->space = part; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 9784adc8f2759..078d91a6b77fb 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) From 7a7513a3081c6a2729d8570c77bbed1978277dc9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:54 +0100 Subject: [PATCH 155/770] rxrpc: rxgk: Implement connection rekeying Implement rekeying of connections with the RxGK security class. This involves regenerating the keys with a different key number as part of the input data after a certain amount of time or a certain amount of bytes encrypted. Rekeying may be triggered by either end. The LSW of the key number is inserted into the security-specific field in the RX header, and we try and expand it to 32-bits to make it last longer. 
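To make the rekey windowing concrete, here is a minimal user-space sketch of the
key-number expansion described above. It is an illustration only, with a
hypothetical helper name: the receiver sees just the low 16 bits of the 32-bit
key number in the RX header's security-specific field, and accepts only the
current key and its immediate neighbours, expanding the wire value back to 32
bits. The real kernel logic, with locking and key generation, is in
rxgk_get_key()/rxgk_rekey() below.

#include <stdint.h>
#include <stdio.h>

/* Expand a 16-bit on-wire key number against the current 32-bit key
 * number.  Returns the full 32-bit key number, or -1 if the wire value
 * falls outside the cur-1..cur+1 window (the out-of-date/ESTALE case).
 */
static int64_t expand_key_number(uint32_t cur, uint16_t wire)
{
	if (wire == (uint16_t)cur)
		return cur;			/* current key */
	if (wire == (uint16_t)(cur - 1))
		return cur - 1;			/* previous key still in flight */
	if (wire == (uint16_t)(cur + 1))
		return (int64_t)cur + 1;	/* far side started a rekey */
	return -1;				/* out of window */
}

int main(void)
{
	/* Around a 16-bit wrap: current key 0x1ffff. */
	printf("%lld\n", (long long)expand_key_number(0x1ffff, 0x0000)); /* 0x20000 */
	printf("%lld\n", (long long)expand_key_number(0x1ffff, 0xfffe)); /* 0x1fffe */
	printf("%lld\n", (long long)expand_key_number(0x1ffff, 0x1234)); /* -1 */
	return 0;
}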
Signed-off-by: David Howells cc: Marc Dionne cc: Herbert Xu cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-10-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 24 ++++++ net/rxrpc/ar-internal.h | 5 +- net/rxrpc/conn_object.c | 1 + net/rxrpc/rxgk.c | 158 +++++++++++++++++++++++++++++++++-- 4 files changed, 181 insertions(+), 7 deletions(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index aab81e8196aee..920439df1f6fe 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -2725,6 +2725,30 @@ TRACE_EVENT(rxrpc_rack_timer, ktime_to_us(__entry->delay)) ); +TRACE_EVENT(rxrpc_rxgk_rekey, + TP_PROTO(struct rxrpc_connection *conn, + unsigned int current_key, unsigned int requested_key), + + TP_ARGS(conn, current_key, requested_key), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(unsigned int, current_key) + __field(unsigned int, requested_key) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->current_key = current_key; + __entry->requested_key = requested_key; + ), + + TP_printk("C=%08x cur=%x req=%x", + __entry->conn, + __entry->current_key, + __entry->requested_key) + ); + #undef EM #undef E_ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 57ea6d61b7a2a..fd235bfa226d7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -565,13 +565,16 @@ struct rxrpc_connection { u32 nonce; /* response re-use preventer */ } rxkad; struct { - struct rxgk_context *keys[1]; + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ u64 start_time; /* The start time for TK derivation */ u8 nonce[20]; /* Response re-use preventer */ u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index d14a802d2001f..37340becb224d 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -73,6 +73,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 02752eeb23954..8b1ccdf8bc58e 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -76,11 +76,155 @@ static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) seq_printf(m, ": %s", krb5->name); } +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? 
*specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? *specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, - u16 *specific_key_number) + const u16 *specific_key_number) { - refcount_inc(&conn->rxgk.keys[0]->usage); - return conn->rxgk.keys[0]; + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. 
+ */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); } /* @@ -92,7 +236,8 @@ static int rxgk_init_connection_security(struct rxrpc_connection *conn, struct rxgk_context *gk; int ret; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); conn->security_ix = token->security_index; conn->security_level = token->rxgk->level; @@ -102,11 +247,12 @@ static int rxgk_init_connection_security(struct rxrpc_connection *conn, do_div(conn->rxgk.start_time, 100); } - gk = rxgk_generate_transport_key(conn, token->rxgk, 0, GFP_NOFS); + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); if (IS_ERR(gk)) return PTR_ERR(gk); conn->rxgk.enctype = gk->krb5->etype; - conn->rxgk.keys[0] = gk; + conn->rxgk.keys[gk->key_number & 3] = gk; switch (conn->security_level) { case RXRPC_SECURITY_PLAIN: From b794dc17cdd0517edb377edf863c2dc1f2dec781 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:55 +0100 Subject: [PATCH 156/770] rxrpc: Allow the app to store private data on peer structs Provide a way for the application (e.g. the afs filesystem) to store private data on the rxrpc_peer structs for later retrieval via the call object. This will allow afs to store a pointer to the afs_server object on the rxrpc_peer struct, thereby obviating the need for afs to keep lookup tables by which it can associate an incoming call with server that transmitted it. 
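As a rough illustration of what this enables, the sketch below models the
pattern in plain user-space C; the names are hypothetical stand-ins, not the
rxrpc API itself. The application stashes an opaque cookie (a pointer to its
own server record) in an unsigned long slot on the peer, then recovers it
directly when a call arrives from that peer, with no lookup table in between.

#include <assert.h>
#include <stdio.h>

struct peer {
	unsigned long app_data;		/* opaque, app-owned cookie */
};

struct app_server {
	const char *name;
};

static void peer_set_app_data(struct peer *peer, struct app_server *srv)
{
	peer->app_data = (unsigned long)srv;
}

static struct app_server *peer_get_app_data(const struct peer *peer)
{
	return (struct app_server *)peer->app_data;
}

int main(void)
{
	struct app_server fs = { .name = "fileserver-1" };
	struct peer p = { 0 };

	peer_set_app_data(&p, &fs);

	/* On an incoming call, no lookup table is needed any more: */
	struct app_server *srv = peer_get_app_data(&p);
	assert(srv == &fs);
	printf("call came from %s\n", srv->name);
	return 0;
}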
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-11-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/oob.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c index 3601022b91900..05ca9c1faa577 100644 --- a/net/rxrpc/oob.c +++ b/net/rxrpc/oob.c @@ -272,7 +272,7 @@ enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, switch (type) { case RXRPC_OOB_CHALLENGE: *_peer = sp->chall.conn->peer; - *_peer_appdata = 0; /* TODO: retrieve appdata */ + *_peer_appdata = sp->chall.conn->peer->app_data; break; default: WARN_ON_ONCE(1); @@ -347,7 +347,7 @@ void rxrpc_kernel_query_challenge(struct sk_buff *challenge, struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); *_peer = sp->chall.conn->peer; - *_peer_appdata = 0; /* TODO: retrieve appdata */ + *_peer_appdata = sp->chall.conn->peer->app_data; *_service_id = sp->hdr.serviceId; *_security_index = sp->hdr.securityIndex; } From d03539d5c2dec9b028297c15e57bd3c01d0d9c0d Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:56 +0100 Subject: [PATCH 157/770] rxrpc: Display security params in the afs_cb_call tracepoint Make the afs_cb_call tracepoint display some security parameters to make debugging easier. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-12-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 1 + fs/afs/internal.h | 2 ++ fs/afs/rxrpc.c | 4 ++++ include/net/af_rxrpc.h | 2 ++ include/trace/events/afs.h | 11 +++++++++-- net/rxrpc/ar-internal.h | 1 + net/rxrpc/call_object.c | 20 ++++++++++++++++++++ net/rxrpc/rxgk.c | 2 ++ 8 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index a01f0c81ca4b2..fe2ea73be4417 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1178,6 +1178,7 @@ API Function Reference ====================== .. kernel-doc:: net/rxrpc/af_rxrpc.c +.. kernel-doc:: net/rxrpc/call_object.c .. kernel-doc:: net/rxrpc/key.c .. kernel-doc:: net/rxrpc/oob.c .. 
kernel-doc:: net/rxrpc/peer_object.c diff --git a/fs/afs/internal.h b/fs/afs/internal.h index b3612b700c6a1..178804817efbf 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -176,8 +176,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 212af2aa85bf4..00b3bc087f61f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -813,6 +813,10 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0b209f703ffc1..f15341594cc8f 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -112,5 +112,7 @@ int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index 8857f5ea77d48..7f83d242c8e9f 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -663,19 +663,26 @@ TRACE_EVENT(afs_cb_call, __field(unsigned int, call) __field(u32, op) __field(u16, service_id) + __field(u8, security_ix) + __field(u32, enctype) ), TP_fast_assign( __entry->call = call->debug_id; __entry->op = call->operation_ID; __entry->service_id = call->service_id; + __entry->security_ix = call->security_ix; + __entry->enctype = call->enctype; ), - TP_printk("c=%08x %s", + TP_printk("c=%08x %s sv=%u sx=%u en=%u", __entry->call, __entry->service_id == 2501 ? 
__print_symbolic(__entry->op, yfs_cm_operations) : - __print_symbolic(__entry->op, afs_cm_operations)) + __print_symbolic(__entry->op, afs_cm_operations), + __entry->service_id, + __entry->security_ix, + __entry->enctype) ); TRACE_EVENT(afs_call, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index fd235bfa226d7..ca62a1db32866 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -740,6 +740,7 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index fc88ffe1b0503..e9e8f0ef3fd58 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -760,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. + */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 8b1ccdf8bc58e..6175fc54ba90a 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -443,6 +443,7 @@ static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) if (ret < 0) return ret; + call->security_enctype = gk->krb5->etype; txb->cksum = htons(gk->key_number); switch (call->conn->security_level) { @@ -590,6 +591,7 @@ static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) } } + call->security_enctype = gk->krb5->etype; switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: return 0; From d98c317fd9aa78dfa45e47deb6536cd35783afd1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:57 +0100 Subject: [PATCH 158/770] afs: Use rxgk RESPONSE to pass token for callback channel Implement in kafs the hook for adding appdata into a RESPONSE packet generated in response to an RxGK CHALLENGE packet, and include the key for securing the callback channel so that notifications from the fileserver get encrypted. This will be necessary when more complex notifications are used that convey changed data around. 
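The appdata blob is built by hand in XDR, so the patch leans on a couple of
sizing rules. The sketch below is a self-contained user-space model of just
those rules (the 16-byte key length is an assumed example value): every
opaque<> costs a 4-byte length word plus its data rounded up to a 4-byte
boundary, mirroring the xdr_round_up()/xdr_len_object() macros and the
keysize/authsize computations in the diff below.

#include <stdio.h>

#define XDR_ROUND_UP(x)   (((x) + 3UL) & ~3UL)
#define XDR_LEN_OBJECT(x) (4 + XDR_ROUND_UP(x))

int main(void)
{
	unsigned long key_len = 16;	/* e.g. an aes128-cts session key */
	unsigned long uuid_len = 16;	/* sizeof(server->uuid) */

	/* struct rxgk_key { afs_uint32 enctype; opaque key<>; }; */
	unsigned long keysize = 4 + XDR_LEN_OBJECT(key_len);

	/* struct RXGK_AuthName { afs_int32 kind; opaque data<>; opaque display<>; }; */
	unsigned long authsize = 4 + XDR_LEN_OBJECT(uuid_len) + XDR_LEN_OBJECT(0);

	printf("keysize=%lu authsize=%lu\n", keysize, authsize); /* 24 and 28 */
	return 0;
}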
Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-13-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- fs/afs/Kconfig | 1 + fs/afs/cm_security.c | 255 +++++++++++++++++++++++++++++++++++++++++++ fs/afs/internal.h | 12 ++ fs/afs/rxrpc.c | 7 +- fs/afs/server.c | 2 + 5 files changed, 276 insertions(+), 1 deletion(-) diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index fc8ba9142f2f0..682bd8ec2c105 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -5,6 +5,7 @@ config AFS_FS select AF_RXRPC select DNS_RESOLVER select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c index e8eb63e4d124a..edcbd249d2024 100644 --- a/fs/afs/cm_security.c +++ b/fs/afs/cm_security.c @@ -8,11 +8,21 @@ #include #include #include "internal.h" +#include "afs_cm.h" #include "afs_fs.h" #include "protocol_yfs.h" #define RXRPC_TRACE_ONLY_DEFINE_ENUMS #include +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + /* * Respond to an RxGK challenge, adding appdata. */ @@ -20,6 +30,7 @@ static int afs_respond_to_challenge(struct sk_buff *challenge) { #ifdef CONFIG_RXGK struct krb5_buffer appdata = {}; + struct afs_server *server; #endif struct rxrpc_peer *peer; unsigned long peer_data; @@ -55,7 +66,23 @@ static int afs_respond_to_challenge(struct sk_buff *challenge) #ifdef CONFIG_RXGK case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } return rxgk_kernel_respond_to_challenge(challenge, &appdata); #endif @@ -83,3 +110,231 @@ void afs_process_oob_queue(struct work_struct *work) rxrpc_kernel_free_oob(oob); } } + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. 
+ */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. + */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. 
+ */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data; + * opaque display; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if (IS_ERR(aead)) { + ret = 
PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 178804817efbf..1124ea4000cb1 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -308,6 +309,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -543,6 +545,8 @@ struct afs_server { struct list_head volumes; /* RCU list of afs_server_entry objects */ struct work_struct destroyer; /* Work item to try and destroy a server */ struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ @@ -1065,6 +1069,14 @@ extern bool afs_cm_incoming_call(struct afs_call *); * cm_security.c */ void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif /* * dir.c diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 00b3bc087f61f..c1cadf8fb346a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -78,6 +78,10 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -140,6 +144,7 @@ void afs_close_socket(struct afs_net *net) flush_workqueue(afs_async_calls); net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -820,7 +825,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call) trace_afs_cb_call(call); call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } diff --git a/fs/afs/server.c b/fs/afs/server.c index 8755f2703815e..a97562f831eb5 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t * timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); @@ -396,6 +397,7 @@ static void afs_server_rcu(struct rcu_head *rcu) afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); 
afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } From fba6995798c6085a0c2fc67e0cacd489a6971044 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:58 +0100 Subject: [PATCH 159/770] rxrpc: Add more CHALLENGE/RESPONSE packet tracing Add more tracing for CHALLENGE and RESPONSE packets. Currently, rxrpc only has client-relevant tracepoints (rx_challenge and tx_response), but add the server-side ones too. Further, record the service ID in the rx_challenge tracepoint as well. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-14-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 78 +++++++++++++++++++++++++++++++++++- net/rxrpc/output.c | 2 + net/rxrpc/rxgk.c | 4 ++ net/rxrpc/rxkad.c | 2 + 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 920439df1f6fe..378d2dfc73923 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1201,6 +1201,39 @@ TRACE_EVENT(rxrpc_rx_conn_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_tx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce), + + TP_ARGS(conn, serial, version, nonce), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(u32, version) + __field(u32, nonce) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->service_id = conn->service_id; + __entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x", + __entry->conn, + __entry->serial, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->nonce) + ); + TRACE_EVENT(rxrpc_rx_challenge, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 nonce, u32 min_level), @@ -1213,6 +1246,7 @@ TRACE_EVENT(rxrpc_rx_challenge, __field(u32, version) __field(u32, nonce) __field(u32, min_level) + __field(u16, service_id) __field(u8, security_ix) ), @@ -1222,18 +1256,60 @@ TRACE_EVENT(rxrpc_rx_challenge, __entry->version = version; __entry->nonce = nonce; __entry->min_level = min_level; + __entry->service_id = conn->service_id; __entry->security_ix = conn->security_ix; ), - TP_printk("C=%08x CHALLENGE r=%08x sx=%u v=%x n=%x ml=%x", + TP_printk("C=%08x CHALLENGE r=%08x sv=%u+%u v=%x n=%x ml=%x", __entry->conn, __entry->serial, + __entry->service_id, __entry->security_ix, __entry->version, __entry->nonce, __entry->min_level) ); +TRACE_EVENT(rxrpc_tx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + struct rxrpc_skb_priv *rsp), + + TP_ARGS(conn, serial, rsp), + + TP_STRUCT__entry( + __field(unsigned int, conn) + __field(rxrpc_serial_t, serial) + __field(rxrpc_serial_t, challenge) + __field(u32, version) + __field(u32, kvno) + __field(u16, ticket_len) + __field(u16, appdata_len) + __field(u16, service_id) + __field(u8, security_ix) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->challenge = rsp->resp.challenge_serial; + __entry->version = rsp->resp.version; + __entry->kvno = rsp->resp.kvno; + __entry->ticket_len = rsp->resp.ticket_len; + __entry->service_id = conn->service_id; + 
__entry->security_ix = conn->security_ix; + ), + + TP_printk("C=%08x RESPONSE r=%08x cr=%08x sv=%u+%u v=%x kv=%x tl=%u", + __entry->conn, + __entry->serial, + __entry->challenge, + __entry->service_id, + __entry->security_ix, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_response, TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, u32 version, u32 kvno, u32 ticket_len), diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8138f35d79459..0af19bcdc80a4 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -953,6 +953,8 @@ void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response serial = rxrpc_get_next_serials(conn, 1); wserial = htonl(serial); + trace_rxrpc_tx_response(conn, serial, sp); + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), &wserial, sizeof(wserial)); if (ret < 0) diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 6175fc54ba90a..ba8bc201b8d38 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -668,6 +668,8 @@ static int rxgk_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serials(conn, 1); whdr->serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); if (ret > 0) conn->peer->last_tx_at = ktime_get_seconds(); @@ -1203,6 +1205,8 @@ static int rxgk_verify_response(struct rxrpc_connection *conn, if (xdr_round_up(token_len) + sizeof(__be32) > len) goto short_packet; + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + offset += xdr_round_up(token_len); len -= xdr_round_up(token_len); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 0b5e007c7de95..3657c0661cdc7 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -685,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, From aa2199088a39962f30d967c667b59ba5075e77d6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Apr 2025 10:52:59 +0100 Subject: [PATCH 160/770] rxrpc: rxperf: Add test RxGK server keys Add RxGK server keys of bytes containing { 0, 1, 2, 3, 4, ... } to the server keyring for the rxperf test server. This allows the rxperf test client to connect to it. Signed-off-by: David Howells cc: Marc Dionne cc: Simon Horman cc: linux-afs@lists.infradead.org Link: https://patch.msgid.link/20250411095303.2316168-15-dhowells@redhat.com Signed-off-by: Jakub Kicinski --- net/rxrpc/rxperf.c | 68 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index c76fbccfbb91c..0377301156b09 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "rxperf: " fmt #include #include +#include #include #include #define RXRPC_TRACE_ONLY_DEFINE_ENUMS @@ -550,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call) } /* - * Add a key to the security keyring. + * Add an rxkad key to the security keyring. 
*/ -static int rxperf_add_key(struct key *keyring) +static int rxperf_add_rxkad_key(struct key *keyring) { key_ref_t kref; int ret; @@ -578,6 +579,47 @@ static int rxperf_add_key(struct key *keyring) return ret; } +#ifdef CONFIG_RXGK +/* + * Add a yfs-rxgk key to the security keyring. + */ +static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype) +{ + const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype); + key_ref_t kref; + char name[64]; + int ret; + u8 key[32]; + + if (!krb5 || krb5->key_len > sizeof(key)) + return 0; + + /* The key is just { 0, 1, 2, 3, 4, ... } */ + for (int i = 0; i < krb5->key_len; i++) + key[i] = i; + + sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype); + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", name, + key, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} +#endif + /* * Initialise the rxperf server. */ @@ -607,9 +649,29 @@ static int __init rxperf_init(void) goto error_keyring; } rxperf_sec_keyring = keyring; - ret = rxperf_add_key(keyring); + ret = rxperf_add_rxkad_key(keyring); + if (ret < 0) + goto error_key; +#ifdef CONFIG_RXGK + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC); if (ret < 0) goto error_key; +#endif ret = rxperf_open_socket(); if (ret < 0) From b4589810082a6e138f61f359757cb6a6790bc09c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 15:09:49 +0100 Subject: [PATCH 161/770] net: stmmac: qcom-ethqos: set serdes speed using serdes_speed ethqos->serdes_speed represents the current speed the serdes was configured for, which should be the same as ethqos->speed. Since we wish to remove ethqos->speed to simplify the code, switch to using the serdes_speed instead. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u3bYL-000EcE-5c@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 0e4da216f942c..5d8cd4336a8cc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -709,7 +709,7 @@ static int qcom_ethqos_serdes_powerup(struct net_device *ndev, void *priv) if (ret) return ret; - return phy_set_speed(ethqos->serdes_phy, ethqos->speed); + return phy_set_speed(ethqos->serdes_phy, ethqos->serdes_speed); } static void qcom_ethqos_serdes_powerdown(struct net_device *ndev, void *priv) From a3d54648ada2088c34a429ad16a39fb6c4a983f3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 15:09:54 +0100 Subject: [PATCH 162/770] net: stmmac: qcom-ethqos: remove ethqos->speed Rather than ethqos_fix_mac_speed() storing the speed in struct qcom_ethqos and then functions that are only called from here reading that speed, pass the speed to the called functions instead. This removes all readers of this struct member, which then allows the removal of the two places that set its value and the struct member. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u3bYQ-000EcK-9K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-qcom-ethqos.c | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 5d8cd4336a8cc..cd25aea3e48f3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -106,12 +106,11 @@ struct qcom_ethqos { struct platform_device *pdev; void __iomem *rgmii_base; void __iomem *mac_base; - int (*configure_func)(struct qcom_ethqos *ethqos); + int (*configure_func)(struct qcom_ethqos *ethqos, int speed); unsigned int link_clk_rate; struct clk *link_clk; struct phy *serdes_phy; - int speed; int serdes_speed; phy_interface_t phy_mode; @@ -385,7 +384,7 @@ static int ethqos_dll_configure(struct qcom_ethqos *ethqos) return 0; } -static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) +static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos, int speed) { struct device *dev = ðqos->pdev->dev; int phase_shift; @@ -412,7 +411,7 @@ static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) rgmii_updatel(ethqos, RGMII_CONFIG_INTF_SEL, 0, RGMII_IO_MACRO_CONFIG); - switch (ethqos->speed) { + switch (speed) { case SPEED_1000: rgmii_updatel(ethqos, RGMII_CONFIG_DDR_MODE, RGMII_CONFIG_DDR_MODE, RGMII_IO_MACRO_CONFIG); @@ -532,14 +531,14 @@ static int ethqos_rgmii_macro_init(struct qcom_ethqos *ethqos) loopback, RGMII_IO_MACRO_CONFIG); break; default: - dev_err(dev, "Invalid speed %d\n", ethqos->speed); + dev_err(dev, "Invalid speed %d\n", speed); return -EINVAL; } return 0; } -static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) +static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos, int speed) { struct device *dev = ðqos->pdev->dev; volatile unsigned int dll_lock; @@ -562,7 +561,7 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) SDCC_DLL_CONFIG_PDN, 
SDCC_HC_REG_DLL_CONFIG); if (ethqos->has_emac_ge_3) { - if (ethqos->speed == SPEED_1000) { + if (speed == SPEED_1000) { rgmii_writel(ethqos, 0x1800000, SDCC_TEST_CTL); rgmii_writel(ethqos, 0x2C010800, SDCC_USR_CTL); rgmii_writel(ethqos, 0xA001, SDCC_HC_REG_DLL_CONFIG2); @@ -580,7 +579,7 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) rgmii_updatel(ethqos, SDCC_DLL_CONFIG_PDN, 0, SDCC_HC_REG_DLL_CONFIG); - if (ethqos->speed != SPEED_100 && ethqos->speed != SPEED_10) { + if (speed != SPEED_100 && speed != SPEED_10) { /* Set DLL_EN */ rgmii_updatel(ethqos, SDCC_DLL_CONFIG_DLL_EN, SDCC_DLL_CONFIG_DLL_EN, SDCC_HC_REG_DLL_CONFIG); @@ -607,10 +606,10 @@ static int ethqos_configure_rgmii(struct qcom_ethqos *ethqos) dev_err(dev, "Timeout while waiting for DLL lock\n"); } - if (ethqos->speed == SPEED_1000) + if (speed == SPEED_1000) ethqos_dll_configure(ethqos); - ethqos_rgmii_macro_init(ethqos); + ethqos_rgmii_macro_init(ethqos, speed); return 0; } @@ -626,7 +625,7 @@ static void ethqos_set_serdes_speed(struct qcom_ethqos *ethqos, int speed) /* On interface toggle MAC registers gets reset. * Configure MAC block for SGMII on ethernet phy link up */ -static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos) +static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos, int speed) { struct net_device *dev = platform_get_drvdata(ethqos->pdev); struct stmmac_priv *priv = netdev_priv(dev); @@ -634,7 +633,7 @@ static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos) val = readl(ethqos->mac_base + MAC_CTRL_REG); - switch (ethqos->speed) { + switch (speed) { case SPEED_2500: val &= ~ETHQOS_MAC_CTRL_PORT_SEL; rgmii_updatel(ethqos, RGMII_CONFIG2_RGMII_CLK_SEL_CFG, @@ -681,9 +680,9 @@ static void qcom_ethqos_speed_mode_2500(struct net_device *ndev, void *data) priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; } -static int ethqos_configure(struct qcom_ethqos *ethqos) +static int ethqos_configure(struct qcom_ethqos *ethqos, int speed) { - return ethqos->configure_func(ethqos); + return ethqos->configure_func(ethqos, speed); } static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) @@ -691,9 +690,8 @@ static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) struct qcom_ethqos *ethqos = priv; qcom_ethqos_set_sgmii_loopback(ethqos, false); - ethqos->speed = speed; ethqos_update_link_clk(ethqos, speed); - ethqos_configure(ethqos); + ethqos_configure(ethqos, speed); } static int qcom_ethqos_serdes_powerup(struct net_device *ndev, void *priv) @@ -847,7 +845,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(ethqos->serdes_phy), "Failed to get serdes phy\n"); - ethqos->speed = SPEED_1000; ethqos->serdes_speed = SPEED_1000; ethqos_update_link_clk(ethqos, SPEED_1000); ethqos_set_func_clk_en(ethqos); From 4c30093f784e6b098da34165408b00c1810ef9ab Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 15:09:59 +0100 Subject: [PATCH 163/770] net: stmmac: qcom-ethqos: remove unnecessary setting max_speed Phylink will already limit the MAC speed according to the interface, so if 2500BASE-X is selected, the maximum speed will be 2.5G. It is, therefore, not necessary to set a speed limit. Remove setting plat_dat->max_speed from this glue driver. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u3bYV-000EcQ-Cv@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index cd25aea3e48f3..e8d4925be21c9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -676,7 +676,6 @@ static void qcom_ethqos_speed_mode_2500(struct net_device *ndev, void *data) { struct stmmac_priv *priv = netdev_priv(ndev); - priv->plat->max_speed = 2500; priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; } From 0d1c18a10dd16d256100005f22e875cd741a35fb Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 12 Apr 2025 15:10:04 +0100 Subject: [PATCH 164/770] net: stmmac: qcom-ethqos: remove speed_mode_2500() method qcom-ethqos doesn't need to implement the speed_mode_2500() method as it is only setting priv->plat->phy_interface to 2500BASE-X, which is already a pre-condition for assigning speed_mode_2500 in qcom_ethqos_probe(). So, qcom_ethqos_speed_mode_2500() has no effect. Remove it. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u3bYa-000EcW-H1@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index e8d4925be21c9..e30bdf72331ac 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -672,13 +672,6 @@ static int ethqos_configure_sgmii(struct qcom_ethqos *ethqos, int speed) return val; } -static void qcom_ethqos_speed_mode_2500(struct net_device *ndev, void *data) -{ - struct stmmac_priv *priv = netdev_priv(ndev); - - priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; -} - static int ethqos_configure(struct qcom_ethqos *ethqos, int speed) { return ethqos->configure_func(ethqos, speed); @@ -800,8 +793,6 @@ static int qcom_ethqos_probe(struct platform_device *pdev) ethqos->configure_func = ethqos_configure_rgmii; break; case PHY_INTERFACE_MODE_2500BASEX: - plat_dat->speed_mode_2500 = qcom_ethqos_speed_mode_2500; - fallthrough; case PHY_INTERFACE_MODE_SGMII: ethqos->configure_func = ethqos_configure_sgmii; break; From 29bdc1f1c1df80868fb35bc69d1f073183adc6de Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Mon, 10 Mar 2025 07:44:09 -0500 Subject: [PATCH 165/770] book3s64/radix: Fix compile errors when CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=n Fix compile errors when CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=n Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Donet Tom Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/8231763344223c193e3452eab0ae8ea966aff466.1741609795.git.donettom@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 311e2112d782e..bd6916419472c 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -976,7 +976,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } - +#ifdef 
CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { if (radix_enabled()) @@ -984,6 +984,7 @@ bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) return false; } +#endif int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) From 9cf7e13fecbab0894f6986fc6986ab2eba8de52e Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Mon, 10 Mar 2025 07:44:10 -0500 Subject: [PATCH 166/770] book3s64/radix : Align section vmemmap start address to PAGE_SIZE A vmemmap altmap is a device-provided region used to provide backing storage for struct pages. For each namespace, the altmap should belong to that same namespace. If the namespaces are created unaligned, there is a chance that the section vmemmap start address could also be unaligned. If the section vmemmap start address is unaligned, the altmap page allocated from the current namespace might be used by the previous namespace also. During the free operation, since the altmap is shared between two namespaces, the previous namespace may detect that the page does not belong to its altmap and incorrectly assume that the page is a normal page. It then attempts to free the normal page, which leads to a kernel crash. Kernel attempted to read user page (18) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000018 Faulting instruction address: 0xc000000000530c7c Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries CPU: 32 PID: 2104 Comm: ndctl Kdump: loaded Tainted: G W NIP: c000000000530c7c LR: c000000000530e00 CTR: 0000000000007ffe REGS: c000000015e57040 TRAP: 0300 Tainted: G W MSR: 800000000280b033 CR: 84482404 CFAR: c000000000530dfc DAR: 0000000000000018 DSISR: 40000000 IRQMASK: 0 GPR00: c000000000530e00 c000000015e572e0 c000000002c5cb00 c00c000101008040 GPR04: 0000000000000000 0000000000000007 0000000000000001 000000000000001f GPR08: 0000000000000005 0000000000000000 0000000000000018 0000000000002000 GPR12: c0000000001d2fb0 c0000060de6b0080 0000000000000000 c0000060dbf90020 GPR16: c00c000101008000 0000000000000001 0000000000000000 c000000125b20f00 GPR20: 0000000000000001 0000000000000000 ffffffffffffffff c00c000101007fff GPR24: 0000000000000001 0000000000000000 0000000000000000 0000000000000000 GPR28: 0000000004040201 0000000000000001 0000000000000000 c00c000101008040 NIP [c000000000530c7c] get_pfnblock_flags_mask+0x7c/0xd0 LR [c000000000530e00] free_unref_page_prepare+0x130/0x4f0 Call Trace: free_unref_page+0x50/0x1e0 free_reserved_page+0x40/0x68 free_vmemmap_pages+0x98/0xe0 remove_pte_table+0x164/0x1e8 remove_pmd_table+0x204/0x2c8 remove_pud_table+0x1c4/0x288 remove_pagetable+0x1c8/0x310 vmemmap_free+0x24/0x50 section_deactivate+0x28c/0x2a0 __remove_pages+0x84/0x110 arch_remove_memory+0x38/0x60 memunmap_pages+0x18c/0x3d0 devm_action_release+0x30/0x50 release_nodes+0x68/0x140 devres_release_group+0x100/0x190 dax_pmem_compat_release+0x44/0x80 [dax_pmem_compat] device_for_each_child+0x8c/0x100 [dax_pmem_compat_remove+0x2c/0x50 [dax_pmem_compat] nvdimm_bus_remove+0x78/0x140 [libnvdimm] device_remove+0x70/0xd0 Another issue is that if there is no altmap, a PMD-sized vmemmap page will be allocated from RAM, regardless of the alignment of the section start address. If the section start address is not aligned to the PMD size, a VM_BUG_ON will be triggered when setting the PMD-sized page to page table. 
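The fix comes down to two alignment decisions, which the following minimal
user-space sketch models (the 64K page and 2M PMD sizes are illustrative
assumptions, and ALIGN_DOWN()/IS_ALIGNED() here merely mirror the kernel
macros): round the vmemmap start down to PAGE_SIZE first, then use a PMD-sized
mapping only when the address is PMD_SIZE-aligned, falling back to PAGE-sized
mappings otherwise.

#include <stdio.h>

#define PAGE_SIZE	0x10000UL	/* 64K, illustrative */
#define PMD_SIZE	0x200000UL	/* 2M, illustrative */
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))
#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

int main(void)
{
	unsigned long start = 0x1234567UL;	/* an unaligned section vmemmap start */

	start = ALIGN_DOWN(start, PAGE_SIZE);	/* -> 0x1230000 */
	printf("start %#lx -> %s-sized vmemmap backing\n", start,
	       IS_ALIGNED(start, PMD_SIZE) ? "PMD" : "PAGE");
	return 0;
}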
In this patch, we are aligning the section vmemmap start address to PAGE_SIZE. After alignment, the start address will not be part of the current namespace, and a normal page will be allocated for the vmemmap mapping of the current section. For the remaining sections, altmaps will be allocated. During the free operation, the normal page will be correctly freed. In the same way, a PMD_SIZE vmemmap page will be allocated only if the section start address is PMD_SIZE-aligned; otherwise, it will fall back to a PAGE-sized vmemmap allocation.

Without this patch
==================

NS1 start                              NS2 start
 _________________________________________________________
|                 NS1                 |        NS2        |
 ---------------------------------------------------------
| Altmap| Altmap | ..... |Altmap| Altmap | ...........
|  NS1  |  NS1   |       | NS2  |  NS2   |

In the above scenario, NS1 and NS2 are two namespaces. The vmemmap for NS1 comes from Altmap NS1, which belongs to NS1, and the vmemmap for NS2 comes from Altmap NS2, which belongs to NS2. The vmemmap start for NS2 is not aligned, so Altmap NS2 is shared by both NS1 and NS2. During the free operation in NS1, Altmap NS2 is not part of NS1's altmap, causing it to attempt to free an invalid page.

With this patch
===============

NS1 start                              NS2 start
 _________________________________________________________
|                 NS1                 |        NS2        |
 ---------------------------------------------------------
| Altmap| Altmap | ..... | Normal | Altmap | Altmap |.......
|  NS1  |  NS1   |       |  Page  |  NS2   |  NS2   |

If the vmemmap start for NS2 is not aligned, then we allocate a normal page. NS1 and NS2 vmemmap will be freed correctly. Fixes: 368a0590d954 ("powerpc/book3s64/vmemmap: switch radix to use a different vmemmap handling function") Co-developed-by: Ritesh Harjani (IBM) Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Donet Tom Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/8f98ec2b442977c618f7256cec88eb17dde3f2b9.1741609795.git.donettom@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_pgtable.c | 17 +++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index bd6916419472c..9f764bc42b8cc 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1121,6 +1121,19 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in pmd_t *pmd; pte_t *pte; + /* + * Make sure we align the start vmemmap addr so that we calculate + * the correct start_pfn in altmap boundary check to decide whether + * we should use altmap or RAM based backing memory allocation. Also + * the address needs to be aligned for set_pte operation. + * + * If the start addr is already PMD_SIZE aligned we will try to use + * a pmd mapping. We don't want to be too aggressive here because + * that will cause more allocations in RAM. So only if the namespace + * vmemmap start addr is PMD_SIZE aligned we will use PMD mapping. + */ + + start = ALIGN_DOWN(start, PAGE_SIZE); for (addr = start; addr < end; addr = next) { next = pmd_addr_end(addr, end); @@ -1146,8 +1159,8 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in * in altmap block allocation failures, in which case * we fallback to RAM for vmemmap allocation.
*/ - if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || - altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + if (!IS_ALIGNED(addr, PMD_SIZE) || (altmap && + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { /* * make sure we don't create altmap mappings * covering things outside the device. From 534f5a8ba27863141e29766467a3e1f61bcb47ac Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Wed, 5 Feb 2025 00:18:21 +0100 Subject: [PATCH 167/770] powerpc64/ftrace: fix module loading without patchable function entries get_stubs_size assumes that there must always be at least one patchable function entry, which is not always the case (modules that export data but no code); otherwise it returns -ENOEXEC, and the section header sh_size is set to that value. During module_memory_alloc() the size is passed to execmem_alloc() after being page-aligned, and is thus set to zero, which causes the allocation (and thus module loading) to fail, as __vmalloc_node_range() checks for zero-sized allocs and returns null:

[ 115.466896] module_64: cast_common: doesn't contain __patchable_function_entries.
[ 115.469189] ------------[ cut here ]------------
[ 115.469496] WARNING: CPU: 0 PID: 274 at mm/vmalloc.c:3778 __vmalloc_node_range_noprof+0x8b4/0x8f0
...
[ 115.478574] ---[ end trace 0000000000000000 ]---
[ 115.479545] execmem: unable to allocate memory

Fix this by removing the check completely, since it is anyway not helpful to propagate this as an error upwards. Fixes: eec37961a56a ("powerpc64/ftrace: Move ftrace sequence out of line") Signed-off-by: Anthony Iliopoulos Acked-by: Naveen N Rao (AMD) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250204231821.39140-1-ailiop@suse.com --- arch/powerpc/kernel/module_64.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 34a5aec4908fb..126bf3b06ab7e 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -258,10 +258,6 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, break; } } - if (i == hdr->e_shnum) { - pr_err("%s: doesn't contain __patchable_function_entries.\n", me->name); - return -ENOEXEC; - } #endif pr_debug("Looks like a total of %lu stubs, max\n", relocs); From 3700976f2ae8dfec4c17433f8a16c9e6c334cf89 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Mon, 7 Apr 2025 14:10:29 +0530 Subject: [PATCH 168/770] powerpc: Add check to select PPC_RADIX_BROADCAST_TLBIE Commit 3d45a3d0d2e6 ("powerpc: Define config option for processors with broadcast TLBIE") added a config option PPC_RADIX_BROADCAST_TLBIE to support processors with broadcast TLBIE. Since this option is relevant only for RADIX_MMU, add a check as a dependency to enable PPC_RADIX_BROADCAST_TLBIE in both powernv and pseries configs.
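For context, a sketch of the Kconfig semantics involved (not a literal copy of either Kconfig file): `select FOO` forces FOO on even when FOO's own `depends on` is not met, which is exactly what produces the warning quoted below; guarding the select makes it conditional on the dependency actually being satisfiable:

# illustrative fragment only
config PPC_RADIX_BROADCAST_TLBIE
	bool
	depends on PPC_RADIX_MMU

config PPC_PSERIES
	# select only when the dependency can actually be met
	select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU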
This fixes the unmet config dependency warning reported:

WARNING: unmet direct dependencies detected for PPC_RADIX_BROADCAST_TLBIE
Depends on [n]: PPC_RADIX_MMU [=n]
Selected by [y]:
- PPC_PSERIES [=y] && PPC64 [=y] && PPC_BOOK3S [=y]

Reported-by: kernel test robot Tested-by: Venkat Rao Bagalkote Reviewed-by: Ritesh Harjani (IBM) Closes: https://lore.kernel.org/oe-kbuild-all/202504051857.jRqxM60c-lkp@intel.com/ Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250407084029.357710-1-maddy@linux.ibm.com --- arch/powerpc/platforms/powernv/Kconfig | 2 +- arch/powerpc/platforms/pseries/Kconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 3fbe0295ce141..95d7ba73d43d0 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -17,7 +17,7 @@ config PPC_POWERNV select MMU_NOTIFIER select FORCE_SMP select ARCH_SUPPORTS_PER_VMA_LOCK - select PPC_RADIX_BROADCAST_TLBIE + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config OPAL_PRD diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index a934c2a262f66..fa3c2fff082a8 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -23,7 +23,7 @@ config PPC_PSERIES select FORCE_SMP select SWIOTLB select ARCH_SUPPORTS_PER_VMA_LOCK - select PPC_RADIX_BROADCAST_TLBIE + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config PARAVIRT From 24e31e4747698dc50d408f0082d7eb9b9520d2f6 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Thu, 10 Apr 2025 11:15:54 +1200 Subject: [PATCH 169/770] net: mdio: Add RTL9300 MDIO driver Add a driver for the MDIO controller on the RTL9300 family of Ethernet switches with integrated SoC. There are 4 physical SMI interfaces on the RTL9300; however, access is done using the switch ports. The driver takes the MDIO bus hierarchy from the DTS and uses this to configure the switch ports so they are associated with the correct PHY. This mapping is also used when dealing with software requests from phylib. Signed-off-by: Chris Packham Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250409231554.3943115-1-chris.packham@alliedtelesis.co.nz Signed-off-by: Paolo Abeni --- drivers/net/mdio/Kconfig | 7 + drivers/net/mdio/Makefile | 1 + drivers/net/mdio/mdio-realtek-rtl9300.c | 522 ++++++++++++++++++++++++ 3 files changed, 530 insertions(+) create mode 100644 drivers/net/mdio/mdio-realtek-rtl9300.c diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index 4a7a303be2f74..058fcdaf6c185 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -185,6 +185,13 @@ config MDIO_IPQ8064 This driver supports the MDIO interface found in the network interface units of the IPQ8064 SoC +config MDIO_REALTEK_RTL9300 + tristate "Realtek RTL9300 MDIO interface support" + depends on MACH_REALTEK_RTL || COMPILE_TEST + help + This driver supports the MDIO interface found in the Realtek + RTL9300 family of Ethernet switches with integrated SoC.
+ config MDIO_REGMAP tristate help diff --git a/drivers/net/mdio/Makefile b/drivers/net/mdio/Makefile index 1015f0db4531d..c23778e73890c 100644 --- a/drivers/net/mdio/Makefile +++ b/drivers/net/mdio/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_MDIO_MOXART) += mdio-moxart.o obj-$(CONFIG_MDIO_MSCC_MIIM) += mdio-mscc-miim.o obj-$(CONFIG_MDIO_MVUSB) += mdio-mvusb.o obj-$(CONFIG_MDIO_OCTEON) += mdio-octeon.o +obj-$(CONFIG_MDIO_REALTEK_RTL9300) += mdio-realtek-rtl9300.o obj-$(CONFIG_MDIO_REGMAP) += mdio-regmap.o obj-$(CONFIG_MDIO_SUN4I) += mdio-sun4i.o obj-$(CONFIG_MDIO_THUNDER) += mdio-thunder.o diff --git a/drivers/net/mdio/mdio-realtek-rtl9300.c b/drivers/net/mdio/mdio-realtek-rtl9300.c new file mode 100644 index 0000000000000..33694c3ff9a71 --- /dev/null +++ b/drivers/net/mdio/mdio-realtek-rtl9300.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MDIO controller for RTL9300 switches with integrated SoC. + * + * The MDIO communication is abstracted by the switch. At the software level + * communication uses the switch port to address the PHY. We work out the + * mapping based on the MDIO bus described in device tree and phandles on the + * ethernet-ports property. + */ + +#include <linux/bitfield.h> +#include <linux/cleanup.h> +#include <linux/mdio.h> +#include <linux/mfd/syscon.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/of.h> +#include <linux/of_mdio.h> +#include <linux/phy.h> +#include <linux/platform_device.h> +#include <linux/property.h> +#include <linux/regmap.h> + +#define SMI_GLB_CTRL 0xca00 +#define GLB_CTRL_INTF_SEL(intf) BIT(16 + (intf)) +#define SMI_PORT0_15_POLLING_SEL 0xca08 +#define SMI_ACCESS_PHY_CTRL_0 0xcb70 +#define SMI_ACCESS_PHY_CTRL_1 0xcb74 +#define PHY_CTRL_REG_ADDR GENMASK(24, 20) +#define PHY_CTRL_PARK_PAGE GENMASK(19, 15) +#define PHY_CTRL_MAIN_PAGE GENMASK(14, 3) +#define PHY_CTRL_WRITE BIT(2) +#define PHY_CTRL_READ 0 +#define PHY_CTRL_TYPE_C45 BIT(1) +#define PHY_CTRL_TYPE_C22 0 +#define PHY_CTRL_CMD BIT(0) +#define PHY_CTRL_FAIL BIT(25) +#define SMI_ACCESS_PHY_CTRL_2 0xcb78 +#define PHY_CTRL_INDATA GENMASK(31, 16) +#define PHY_CTRL_DATA GENMASK(15, 0) +#define SMI_ACCESS_PHY_CTRL_3 0xcb7c +#define PHY_CTRL_MMD_DEVAD GENMASK(20, 16) +#define PHY_CTRL_MMD_REG GENMASK(15, 0) +#define SMI_PORT0_5_ADDR_CTRL 0xcb80 + +#define MAX_PORTS 28 +#define MAX_SMI_BUSSES 4 +#define MAX_SMI_ADDR 0x1f + +struct rtl9300_mdio_priv { + struct regmap *regmap; + struct mutex lock; /* protect HW access */ + DECLARE_BITMAP(valid_ports, MAX_PORTS); + u8 smi_bus[MAX_PORTS]; + u8 smi_addr[MAX_PORTS]; + bool smi_bus_is_c45[MAX_SMI_BUSSES]; + struct mii_bus *bus[MAX_SMI_BUSSES]; +}; + +struct rtl9300_mdio_chan { + struct rtl9300_mdio_priv *priv; + u8 mdio_bus; +}; + +static int rtl9300_mdio_phy_to_port(struct mii_bus *bus, int phy_id) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + int i; + + priv = chan->priv; + + for_each_set_bit(i, priv->valid_ports, MAX_PORTS) + if (priv->smi_bus[i] == chan->mdio_bus && + priv->smi_addr[i] == phy_id) + return i; + + return -ENOENT; +} + +static int rtl9300_mdio_wait_ready(struct rtl9300_mdio_priv *priv) +{ + struct regmap *regmap = priv->regmap; + u32 val; + + lockdep_assert_held(&priv->lock); + + return regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 1000); +} + +static int rtl9300_mdio_read_c22(struct mii_bus *bus, int phy_id, int regnum) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + +
mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, FIELD_PREP(PHY_CTRL_INDATA, port)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_REG_ADDR, regnum) | + FIELD_PREP(PHY_CTRL_PARK_PAGE, 0x1f) | + FIELD_PREP(PHY_CTRL_MAIN_PAGE, 0xfff) | + PHY_CTRL_READ | PHY_CTRL_TYPE_C22 | PHY_CTRL_CMD; + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, val); + if (err) + goto out_err; + + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_read(regmap, SMI_ACCESS_PHY_CTRL_2, &val); + if (err) + goto out_err; + + mutex_unlock(&priv->lock); + return FIELD_GET(PHY_CTRL_DATA, val); + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_write_c22(struct mii_bus *bus, int phy_id, int regnum, u16 value) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_0, BIT(port)); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, FIELD_PREP(PHY_CTRL_INDATA, value)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_REG_ADDR, regnum) | + FIELD_PREP(PHY_CTRL_PARK_PAGE, 0x1f) | + FIELD_PREP(PHY_CTRL_MAIN_PAGE, 0xfff) | + PHY_CTRL_WRITE | PHY_CTRL_TYPE_C22 | PHY_CTRL_CMD; + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, val); + if (err) + goto out_err; + + err = regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 100); + if (err) + goto out_err; + + if (val & PHY_CTRL_FAIL) { + err = -ENXIO; + goto out_err; + } + + mutex_unlock(&priv->lock); + return 0; + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_read_c45(struct mii_bus *bus, int phy_id, int dev_addr, int regnum) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_INDATA, port); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, val); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_MMD_DEVAD, dev_addr) | + FIELD_PREP(PHY_CTRL_MMD_REG, regnum); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_3, val); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, + PHY_CTRL_READ | PHY_CTRL_TYPE_C45 | PHY_CTRL_CMD); + if (err) + goto out_err; + + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_read(regmap, SMI_ACCESS_PHY_CTRL_2, &val); + if (err) + goto out_err; + + mutex_unlock(&priv->lock); + return FIELD_GET(PHY_CTRL_DATA, val); + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdio_write_c45(struct mii_bus *bus, int phy_id, int dev_addr, + int regnum, u16 value) +{ + struct rtl9300_mdio_chan *chan = bus->priv; + struct rtl9300_mdio_priv *priv; + struct regmap *regmap; + int port; + u32 val; + int err; + + priv = chan->priv; + regmap = priv->regmap; + + port = rtl9300_mdio_phy_to_port(bus, phy_id); + if (port < 
0) + return port; + + mutex_lock(&priv->lock); + err = rtl9300_mdio_wait_ready(priv); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_0, BIT(port)); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_INDATA, value); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_2, val); + if (err) + goto out_err; + + val = FIELD_PREP(PHY_CTRL_MMD_DEVAD, dev_addr) | + FIELD_PREP(PHY_CTRL_MMD_REG, regnum); + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_3, val); + if (err) + goto out_err; + + err = regmap_write(regmap, SMI_ACCESS_PHY_CTRL_1, + PHY_CTRL_TYPE_C45 | PHY_CTRL_WRITE | PHY_CTRL_CMD); + if (err) + goto out_err; + + err = regmap_read_poll_timeout(regmap, SMI_ACCESS_PHY_CTRL_1, + val, !(val & PHY_CTRL_CMD), 10, 100); + if (err) + goto out_err; + + if (val & PHY_CTRL_FAIL) { + err = -ENXIO; + goto out_err; + } + + mutex_unlock(&priv->lock); + return 0; + +out_err: + mutex_unlock(&priv->lock); + return err; +} + +static int rtl9300_mdiobus_init(struct rtl9300_mdio_priv *priv) +{ + u32 glb_ctrl_mask = 0, glb_ctrl_val = 0; + struct regmap *regmap = priv->regmap; + u32 port_addr[5] = { 0 }; + u32 poll_sel[2] = { 0 }; + int i, err; + + /* Associate the port with the SMI interface and PHY */ + for_each_set_bit(i, priv->valid_ports, MAX_PORTS) { + int pos; + + pos = (i % 6) * 5; + port_addr[i / 6] |= (priv->smi_addr[i] & 0x1f) << pos; + + pos = (i % 16) * 2; + poll_sel[i / 16] |= (priv->smi_bus[i] & 0x3) << pos; + } + + /* Put the interfaces into C45 mode if required */ + glb_ctrl_mask = GENMASK(19, 16); + for (i = 0; i < MAX_SMI_BUSSES; i++) + if (priv->smi_bus_is_c45[i]) + glb_ctrl_val |= GLB_CTRL_INTF_SEL(i); + + err = regmap_bulk_write(regmap, SMI_PORT0_5_ADDR_CTRL, + port_addr, 5); + if (err) + return err; + + err = regmap_bulk_write(regmap, SMI_PORT0_15_POLLING_SEL, + poll_sel, 2); + if (err) + return err; + + err = regmap_update_bits(regmap, SMI_GLB_CTRL, + glb_ctrl_mask, glb_ctrl_val); + if (err) + return err; + + return 0; +} + +static int rtl9300_mdiobus_probe_one(struct device *dev, struct rtl9300_mdio_priv *priv, + struct fwnode_handle *node) +{ + struct rtl9300_mdio_chan *chan; + struct fwnode_handle *child; + struct mii_bus *bus; + u32 mdio_bus; + int err; + + err = fwnode_property_read_u32(node, "reg", &mdio_bus); + if (err) + return err; + + /* The MDIO accesses from the kernel work with the PHY polling unit in + * the switch. We need to tell the PPU to operate either in GPHY (i.e. + * clause 22) or 10GPHY mode (i.e. clause 45). + * + * We select 10GPHY mode if there is at least one PHY that declares + * compatible = "ethernet-phy-ieee802.3-c45". This does mean we can't + * support both c45 and c22 on the same MDIO bus. 
+ */ + fwnode_for_each_child_node(node, child) + if (fwnode_device_is_compatible(child, "ethernet-phy-ieee802.3-c45")) + priv->smi_bus_is_c45[mdio_bus] = true; + + bus = devm_mdiobus_alloc_size(dev, sizeof(*chan)); + if (!bus) + return -ENOMEM; + + bus->name = "Realtek Switch MDIO Bus"; + if (priv->smi_bus_is_c45[mdio_bus]) { + bus->read_c45 = rtl9300_mdio_read_c45; + bus->write_c45 = rtl9300_mdio_write_c45; + } else { + bus->read = rtl9300_mdio_read_c22; + bus->write = rtl9300_mdio_write_c22; + } + bus->parent = dev; + chan = bus->priv; + chan->mdio_bus = mdio_bus; + chan->priv = priv; + + snprintf(bus->id, MII_BUS_ID_SIZE, "%s-%d", dev_name(dev), mdio_bus); + + err = devm_of_mdiobus_register(dev, bus, to_of_node(node)); + if (err) + return dev_err_probe(dev, err, "cannot register MDIO bus\n"); + + return 0; +} + +/* The mdio-controller is part of a switch block so we parse the sibling + * ethernet-ports node and build a mapping of the switch port to MDIO bus/addr + * based on the phy-handle. + */ +static int rtl9300_mdiobus_map_ports(struct device *dev) +{ + struct rtl9300_mdio_priv *priv = dev_get_drvdata(dev); + struct device *parent = dev->parent; + struct fwnode_handle *port; + int err; + + struct fwnode_handle *ports __free(fwnode_handle) = + device_get_named_child_node(parent, "ethernet-ports"); + if (!ports) + return dev_err_probe(dev, -EINVAL, "%pfwP missing ethernet-ports\n", + dev_fwnode(parent)); + + fwnode_for_each_child_node(ports, port) { + struct device_node *mdio_dn; + u32 addr; + u32 bus; + u32 pn; + + struct device_node *phy_dn __free(device_node) = + of_parse_phandle(to_of_node(port), "phy-handle", 0); + /* skip ports without phys */ + if (!phy_dn) + continue; + + mdio_dn = phy_dn->parent; + /* only map ports that are connected to this mdio-controller */ + if (mdio_dn->parent != dev->of_node) + continue; + + err = fwnode_property_read_u32(port, "reg", &pn); + if (err) + return err; + + if (pn >= MAX_PORTS) + return dev_err_probe(dev, -EINVAL, "illegal port number %d\n", pn); + + if (test_bit(pn, priv->valid_ports)) + return dev_err_probe(dev, -EINVAL, "duplicated port number %d\n", pn); + + err = of_property_read_u32(mdio_dn, "reg", &bus); + if (err) + return err; + + if (bus >= MAX_SMI_BUSSES) + return dev_err_probe(dev, -EINVAL, "illegal smi bus number %d\n", bus); + + err = of_property_read_u32(phy_dn, "reg", &addr); + if (err) + return err; + + __set_bit(pn, priv->valid_ports); + priv->smi_bus[pn] = bus; + priv->smi_addr[pn] = addr; + } + + return 0; +} + +static int rtl9300_mdiobus_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct rtl9300_mdio_priv *priv; + struct fwnode_handle *child; + int err; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + err = devm_mutex_init(dev, &priv->lock); + if (err) + return err; + + priv->regmap = syscon_node_to_regmap(dev->parent->of_node); + if (IS_ERR(priv->regmap)) + return PTR_ERR(priv->regmap); + + platform_set_drvdata(pdev, priv); + + err = rtl9300_mdiobus_map_ports(dev); + if (err) + return err; + + device_for_each_child_node(dev, child) { + err = rtl9300_mdiobus_probe_one(dev, priv, child); + if (err) + return err; + } + + err = rtl9300_mdiobus_init(priv); + if (err) + return dev_err_probe(dev, err, "failed to initialise MDIO bus controller\n"); + + return 0; +} + +static const struct of_device_id rtl9300_mdio_ids[] = { + { .compatible = "realtek,rtl9301-mdio" }, + {} +}; +MODULE_DEVICE_TABLE(of, rtl9300_mdio_ids); + +static struct platform_driver 
rtl9300_mdio_driver = { + .probe = rtl9300_mdiobus_probe, + .driver = { + .name = "mdio-rtl9300", + .of_match_table = rtl9300_mdio_ids, + }, +}; + +module_platform_driver(rtl9300_mdio_driver); + +MODULE_DESCRIPTION("RTL9300 MDIO driver"); +MODULE_LICENSE("GPL"); From 9f61eb2d185b06d49ef8c25af994d6fc571433cc Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Thu, 10 Apr 2025 00:08:55 -0700 Subject: [PATCH 170/770] eth: fbnic: add locking support for hw stats This patch adds lock protection for the hardware statistics for fbnic. The hardware statistics access via ndo_get_stats64 is not protected by the rtnl_lock(). Since these stats can be accessed from different places in the code such as service task, ethtool, Q-API, and net_device_ops, a lock-less approach can lead to races. Note that this patch is not a fix rather, just a prep for the subsequent changes in this series. Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250410070859.4160768-2-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic.h | 3 +++ drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c | 16 +++++++++++++--- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 1 + 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 4ca7b99ef1315..80d54edaac557 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -81,6 +81,9 @@ struct fbnic_dev { /* Local copy of hardware statistics */ struct fbnic_hw_stats hw_stats; + + /* Lock protecting access to hw_stats */ + spinlock_t hw_stats_lock; }; /* Reserve entry 0 in the MSI-X "others" array until we have filled all diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index 89ac6bc8c7fc3..957138cb841ed 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -203,18 +203,28 @@ static void fbnic_get_pcie_stats_asic64(struct fbnic_dev *fbd, void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { + spin_lock(&fbd->hw_stats_lock); fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc); fbnic_reset_pcie_stats_asic(fbd, &fbd->hw_stats.pcie); + spin_unlock(&fbd->hw_stats_lock); } -void fbnic_get_hw_stats32(struct fbnic_dev *fbd) +static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd) { fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc); } -void fbnic_get_hw_stats(struct fbnic_dev *fbd) +void fbnic_get_hw_stats32(struct fbnic_dev *fbd) { - fbnic_get_hw_stats32(fbd); + spin_lock(&fbd->hw_stats_lock); + __fbnic_get_hw_stats32(fbd); + spin_unlock(&fbd->hw_stats_lock); +} +void fbnic_get_hw_stats(struct fbnic_dev *fbd) +{ + spin_lock(&fbd->hw_stats_lock); + __fbnic_get_hw_stats32(fbd); fbnic_get_pcie_stats_asic64(fbd, &fbd->hw_stats.pcie); + spin_unlock(&fbd->hw_stats_lock); } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 6cbbc2ee3e1f9..1f76ebdd6ad18 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -292,6 +292,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) fbnic_devlink_register(fbd); fbnic_dbg_fbd_init(fbd); + spin_lock_init(&fbd->hw_stats_lock); /* Capture snapshot of hardware stats so netdev can calculate delta */ fbnic_reset_hw_stats(fbd); From 8f20a2bfa4b7a2ddfd4890ca220012cbe263c97f Mon Sep 17 
00:00:00 2001 From: Mohsin Bashir Date: Thu, 10 Apr 2025 00:08:56 -0700 Subject: [PATCH 171/770] eth: fbnic: add coverage for hw queue stats This patch provides support for hardware queue stats and covers packet errors for the RX-DMA engine, RCQ drops, and BDQ drops. The packet errors are also aggregated with the `rx_errors` stats in the `rtnl_link_stats` as well as with the `hw_drops` in the queue API. The RCQ and BDQ drops are aggregated with `rx_over_errors` in the `rtnl_link_stats` as well as with the `hw_drop_overruns` in the queue API.

ethtool -S eth0 | grep -E 'rde'
rde_0_pkt_err: 0
rde_0_pkt_cq_drop: 0
rde_0_pkt_bdq_drop: 0
---
---
rde_127_pkt_err: 0
rde_127_pkt_cq_drop: 0
rde_127_pkt_bdq_drop: 0

Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250410070859.4160768-3-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 9 +++ drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 12 ++++ .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 55 ++++++++++++++++--- .../net/ethernet/meta/fbnic/fbnic_hw_stats.c | 50 +++++++++++++++++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.h | 9 +++ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 29 +++++++++- 6 files changed, 156 insertions(+), 8 deletions(-) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 04e0595bb0a77..bc7f2fef28753 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -44,6 +44,15 @@ RPC (Rx parser) - ``rpc_out_of_hdr_err``: frames where header was larger than parsable region - ``ovr_size_err``: oversized frames +Hardware Queues +~~~~~~~~~~~~~~~ + +1. RX DMA Engine: + + - ``rde_[i]_pkt_err``: packets with MAC EOP, RPC parser, RXB truncation, or RDE frame truncation errors. These errors are flagged in the packet metadata because of cut-through support but the actual drop happens once PCIE/RDE is reached.
+ - ``rde_[i]_pkt_cq_drop``: packets dropped because RCQ is full + - ``rde_[i]_pkt_bdq_drop``: packets dropped because HPQ or PPQ ran out of host buffer + PCIe ~~~~ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 3b12a0ab59065..ff5f68c7e73d5 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -864,6 +864,12 @@ enum { #define FBNIC_QUEUE_TWQ1_BAL 0x022 /* 0x088 */ #define FBNIC_QUEUE_TWQ1_BAH 0x023 /* 0x08c */ +/* Tx Work Queue Statistics Registers */ +#define FBNIC_QUEUE_TWQ0_PKT_CNT 0x062 /* 0x188 */ +#define FBNIC_QUEUE_TWQ0_ERR_CNT 0x063 /* 0x18c */ +#define FBNIC_QUEUE_TWQ1_PKT_CNT 0x072 /* 0x1c8 */ +#define FBNIC_QUEUE_TWQ1_ERR_CNT 0x073 /* 0x1cc */ + /* Tx Completion Queue Registers */ #define FBNIC_QUEUE_TCQ_CTL 0x080 /* 0x200 */ #define FBNIC_QUEUE_TCQ_CTL_RESET CSR_BIT(0) @@ -953,6 +959,12 @@ enum { FBNIC_QUEUE_RDE_CTL1_PAYLD_PACK_RSS = 2, }; +/* Rx Per CQ Statistics Counters */ +#define FBNIC_QUEUE_RDE_PKT_CNT 0x2a2 /* 0xa88 */ +#define FBNIC_QUEUE_RDE_PKT_ERR_CNT 0x2a3 /* 0xa8c */ +#define FBNIC_QUEUE_RDE_CQ_DROP_CNT 0x2a4 /* 0xa90 */ +#define FBNIC_QUEUE_RDE_BDQ_DROP_CNT 0x2a5 /* 0xa94 */ + /* Rx Interrupt Manager Registers */ #define FBNIC_QUEUE_RIM_CTL 0x2c0 /* 0xb00 */ #define FBNIC_QUEUE_RIM_CTL_MSIX_MASK CSR_GENMASK(7, 0) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 0a751a2aaf733..038e969f5ba38 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -39,7 +39,20 @@ static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { }; #define FBNIC_HW_FIXED_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_stats) -#define FBNIC_HW_STATS_LEN FBNIC_HW_FIXED_STATS_LEN + +#define FBNIC_HW_Q_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_hw_q_stats, name, stat.value) + +static const struct fbnic_stat fbnic_gstrings_hw_q_stats[] = { + FBNIC_HW_Q_STAT("rde_%u_pkt_err", rde_pkt_err), + FBNIC_HW_Q_STAT("rde_%u_pkt_cq_drop", rde_pkt_cq_drop), + FBNIC_HW_Q_STAT("rde_%u_pkt_bdq_drop", rde_pkt_bdq_drop), +}; + +#define FBNIC_HW_Q_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_q_stats) +#define FBNIC_HW_STATS_LEN \ + (FBNIC_HW_FIXED_STATS_LEN + \ + FBNIC_HW_Q_STATS_LEN * FBNIC_MAX_QUEUES) static void fbnic_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) @@ -300,29 +313,57 @@ fbnic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) { - int i; + const struct fbnic_stat *stat; + int i, idx; switch (sset) { case ETH_SS_STATS: - for (i = 0; i < FBNIC_HW_STATS_LEN; i++) + for (i = 0; i < FBNIC_HW_FIXED_STATS_LEN; i++) ethtool_puts(&data, fbnic_gstrings_hw_stats[i].string); + + for (idx = 0; idx < FBNIC_MAX_QUEUES; idx++) { + stat = fbnic_gstrings_hw_q_stats; + + for (i = 0; i < FBNIC_HW_Q_STATS_LEN; i++, stat++) + ethtool_sprintf(&data, stat->string, idx); + } break; } } +static void fbnic_report_hw_stats(const struct fbnic_stat *stat, + const void *base, int len, u64 **data) +{ + while (len--) { + u8 *curr = (u8 *)base + stat->offset; + + **data = *(u64 *)curr; + + stat++; + (*data)++; + } +} + static void fbnic_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct fbnic_net *fbn = netdev_priv(dev); - const struct fbnic_stat *stat; + struct fbnic_dev *fbd = fbn->fbd; int i; fbnic_get_hw_stats(fbn->fbd); - for (i = 0; 
i < FBNIC_HW_STATS_LEN; i++) { - stat = &fbnic_gstrings_hw_stats[i]; - data[i] = *(u64 *)((u8 *)&fbn->fbd->hw_stats + stat->offset); + spin_lock(&fbd->hw_stats_lock); + fbnic_report_hw_stats(fbnic_gstrings_hw_stats, &fbd->hw_stats, + FBNIC_HW_FIXED_STATS_LEN, &data); + + for (i = 0; i < FBNIC_MAX_QUEUES; i++) { + const struct fbnic_hw_q_stats *hw_q = &fbd->hw_stats.hw_q[i]; + + fbnic_report_hw_stats(fbnic_gstrings_hw_q_stats, hw_q, + FBNIC_HW_Q_STATS_LEN, &data); } + spin_unlock(&fbd->hw_stats_lock); } static int fbnic_get_sset_count(struct net_device *dev, int sset) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index 957138cb841ed..c8faedc2ec443 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -117,6 +117,54 @@ static void fbnic_get_rpc_stats32(struct fbnic_dev *fbd, &rpc->ovr_size_err); } +static void fbnic_reset_hw_rxq_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + int i; + + for (i = 0; i < fbd->max_num_queues; i++, hw_q++) { + u32 base = FBNIC_QUEUE(i); + + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_PKT_ERR_CNT, + &hw_q->rde_pkt_err); + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_CQ_DROP_CNT, + &hw_q->rde_pkt_cq_drop); + fbnic_hw_stat_rst32(fbd, + base + FBNIC_QUEUE_RDE_BDQ_DROP_CNT, + &hw_q->rde_pkt_bdq_drop); + } +} + +static void fbnic_get_hw_rxq_stats32(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + int i; + + for (i = 0; i < fbd->max_num_queues; i++, hw_q++) { + u32 base = FBNIC_QUEUE(i); + + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_PKT_ERR_CNT, + &hw_q->rde_pkt_err); + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_CQ_DROP_CNT, + &hw_q->rde_pkt_cq_drop); + fbnic_hw_stat_rd32(fbd, + base + FBNIC_QUEUE_RDE_BDQ_DROP_CNT, + &hw_q->rde_pkt_bdq_drop); + } +} + +void fbnic_get_hw_q_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q) +{ + spin_lock(&fbd->hw_stats_lock); + fbnic_get_hw_rxq_stats32(fbd, hw_q); + spin_unlock(&fbd->hw_stats_lock); +} + static void fbnic_reset_pcie_stats_asic(struct fbnic_dev *fbd, struct fbnic_pcie_stats *pcie) { @@ -205,6 +253,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { spin_lock(&fbd->hw_stats_lock); fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc); + fbnic_reset_hw_rxq_stats(fbd, fbd->hw_stats.hw_q); fbnic_reset_pcie_stats_asic(fbd, &fbd->hw_stats.pcie); spin_unlock(&fbd->hw_stats_lock); } @@ -212,6 +261,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd) { fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc); + fbnic_get_hw_rxq_stats32(fbd, fbd->hw_stats.hw_q); } void fbnic_get_hw_stats32(struct fbnic_dev *fbd) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h index 78df56b877456..81efa8dc83812 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h @@ -43,6 +43,12 @@ struct fbnic_rpc_stats { struct fbnic_stat_counter tcp_opt_err, out_of_hdr_err, ovr_size_err; }; +struct fbnic_hw_q_stats { + struct fbnic_stat_counter rde_pkt_err; + struct fbnic_stat_counter rde_pkt_cq_drop; + struct fbnic_stat_counter rde_pkt_bdq_drop; +}; + struct fbnic_pcie_stats { struct fbnic_stat_counter ob_rd_tlp, ob_rd_dword; struct fbnic_stat_counter ob_wr_tlp, ob_wr_dword; @@ -56,12 +62,15 @@ struct fbnic_pcie_stats { struct fbnic_hw_stats { struct fbnic_mac_stats mac; struct fbnic_rpc_stats 
rpc; + struct fbnic_rxb_stats rxb; struct fbnic_hw_q_stats hw_q[FBNIC_MAX_QUEUES]; struct fbnic_pcie_stats pcie; }; u64 fbnic_stat_rd64(struct fbnic_dev *fbd, u32 reg, u32 offset); void fbnic_reset_hw_stats(struct fbnic_dev *fbd); +void fbnic_get_hw_q_stats(struct fbnic_dev *fbd, + struct fbnic_hw_q_stats *hw_q); void fbnic_get_hw_stats32(struct fbnic_dev *fbd); void fbnic_get_hw_stats(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index 79a01fdd1dd16..e19284d4b91d6 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -403,11 +403,15 @@ static int fbnic_hwtstamp_set(struct net_device *netdev, static void fbnic_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) { + u64 rx_bytes, rx_packets, rx_dropped = 0, rx_errors = 0; u64 tx_bytes, tx_packets, tx_dropped = 0; - u64 rx_bytes, rx_packets, rx_dropped = 0; struct fbnic_net *fbn = netdev_priv(dev); + struct fbnic_dev *fbd = fbn->fbd; struct fbnic_queue_stats *stats; unsigned int start, i; + u64 rx_over = 0; + + fbnic_get_hw_stats(fbd); stats = &fbn->tx_stats; @@ -444,9 +448,22 @@ static void fbnic_get_stats64(struct net_device *dev, rx_packets = stats->packets; rx_dropped = stats->dropped; + spin_lock(&fbd->hw_stats_lock); + for (i = 0; i < fbd->max_num_queues; i++) { + /* Report packets dropped due to CQ/BDQ being full/empty */ + rx_over += fbd->hw_stats.hw_q[i].rde_pkt_cq_drop.value; + rx_over += fbd->hw_stats.hw_q[i].rde_pkt_bdq_drop.value; + + /* Report packets with errors */ + rx_errors += fbd->hw_stats.hw_q[i].rde_pkt_err.value; + } + spin_unlock(&fbd->hw_stats_lock); + stats64->rx_bytes = rx_bytes; stats64->rx_packets = rx_packets; stats64->rx_dropped = rx_dropped; + stats64->rx_over_errors = rx_over; + stats64->rx_errors = rx_errors; for (i = 0; i < fbn->num_rx_queues; i++) { struct fbnic_ring *rxr = fbn->rx[i]; @@ -486,6 +503,7 @@ static void fbnic_get_queue_stats_rx(struct net_device *dev, int idx, { struct fbnic_net *fbn = netdev_priv(dev); struct fbnic_ring *rxr = fbn->rx[idx]; + struct fbnic_dev *fbd = fbn->fbd; struct fbnic_queue_stats *stats; u64 bytes, packets, alloc_fail; u64 csum_complete, csum_none; @@ -509,6 +527,15 @@ static void fbnic_get_queue_stats_rx(struct net_device *dev, int idx, rx->alloc_fail = alloc_fail; rx->csum_complete = csum_complete; rx->csum_none = csum_none; + + fbnic_get_hw_q_stats(fbd, fbd->hw_stats.hw_q); + + spin_lock(&fbd->hw_stats_lock); + rx->hw_drop_overruns = fbd->hw_stats.hw_q[idx].rde_pkt_cq_drop.value + + fbd->hw_stats.hw_q[idx].rde_pkt_bdq_drop.value; + rx->hw_drops = fbd->hw_stats.hw_q[idx].rde_pkt_err.value + + rx->hw_drop_overruns; + spin_unlock(&fbd->hw_stats_lock); } static void fbnic_get_queue_stats_tx(struct net_device *dev, int idx, From 986c63a0295e498b2afb38c6e969a3e0b41455d6 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Thu, 10 Apr 2025 00:08:57 -0700 Subject: [PATCH 172/770] eth: fbnic: add coverage for RXB stats This patch provides coverage for the RXB (RX Buffer) stats. RXB stats are divided into 3 sections: RXB enqueue, RXB FIFO, and RXB dequeue stats. The RXB enqueue/dequeue stats are indexed from 0-3 and cater for the input/output counters, whereas the RXB fifo stats are indexed from 0-7. The RXB also supports pause frame stats counters, which we are leaving for a later patch.
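The indexed names in the sample output below are produced by expanding printf-style templates once per instance; a minimal sketch of the mechanism (simplified from the patch; ethtool_sprintf() is the real kernel helper, and the bound 4 stands in for FBNIC_RXB_ENQUEUE_INDICES):

/* Sketch: emit rxb_drbo0_frames ... rxb_drbo3_frames, one name per RXB
 * input, the way the patch's per-index string helpers do.
 */
static void sketch_rxb_enqueue_strings(u8 **data)
{
	unsigned int i;

	for (i = 0; i < 4; i++)
		ethtool_sprintf(data, "rxb_drbo%u_frames", i);
}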
ethtool -S eth0 | grep rxb rxb_integrity_err0: 0 rxb_mac_err0: 0 rxb_parser_err0: 0 rxb_frm_err0: 0 rxb_drbo0_frames: 1433543 rxb_drbo0_bytes: 775949081 --- --- rxb_intf3_frames: 1195711 rxb_intf3_bytes: 739650210 rxb_pbuf3_frames: 1195711 rxb_pbuf3_bytes: 765948092 Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250410070859.4160768-4-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 26 +++ drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 8 + .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 110 +++++++++++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.c | 171 ++++++++++++++++++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.h | 28 +++ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 14 +- 6 files changed, 356 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index bc7f2fef28753..8ba94ae95db9c 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -31,6 +31,32 @@ separate entry. Statistics ---------- +RXB (RX Buffer) Enqueue +~~~~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_integrity_err[i]``: frames enqueued with integrity errors (e.g., multi-bit ECC errors) on RXB input i + - ``rxb_mac_err[i]``: frames enqueued with MAC end-of-frame errors (e.g., bad FCS) on RXB input i + - ``rxb_parser_err[i]``: frames experienced RPC parser errors + - ``rxb_frm_err[i]``: frames experienced signaling errors (e.g., missing end-of-packet/start-of-packet) on RXB input i + - ``rxb_drbo[i]_frames``: frames received at RXB input i + - ``rxb_drbo[i]_bytes``: bytes received at RXB input i + +RXB (RX Buffer) FIFO +~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_fifo[i]_drop``: transitions into the drop state on RXB pool i + - ``rxb_fifo[i]_dropped_frames``: frames dropped on RXB pool i + - ``rxb_fifo[i]_ecn``: transitions into the ECN mark state on RXB pool i + - ``rxb_fifo[i]_level``: current occupancy of RXB pool i + +RXB (RX Buffer) Dequeue +~~~~~~~~~~~~~~~~~~~~~~~ + + - ``rxb_intf[i]_frames``: frames sent to the output i + - ``rxb_intf[i]_bytes``: bytes sent to the output i + - ``rxb_pbuf[i]_frames``: frames sent to output i from the perspective of internal packet buffer + - ``rxb_pbuf[i]_bytes``: bytes sent to output i from the perspective of internal packet buffer + RPC (Rx parser) ~~~~~~~~~~~~~~~ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index ff5f68c7e73d5..a554e0b2cfffd 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -485,6 +485,14 @@ enum { FBNIC_RXB_FIFO_INDICES = 8 }; +enum { + FBNIC_RXB_INTF_NET = 0, + FBNIC_RXB_INTF_RBT = 1, + /* Unused */ + /* Unused */ + FBNIC_RXB_INTF_INDICES = 4 +}; + #define FBNIC_RXB_CT_SIZE(n) (0x08000 + (n)) /* 0x20000 + 4*n */ #define FBNIC_RXB_CT_SIZE_CNT 8 #define FBNIC_RXB_CT_SIZE_HEADER CSR_GENMASK(5, 0) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 038e969f5ba38..816af96a5d5fc 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -40,6 +40,47 @@ static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { #define FBNIC_HW_FIXED_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_stats) +#define FBNIC_RXB_ENQUEUE_STAT(name, stat) \ + 
FBNIC_STAT_FIELDS(fbnic_rxb_enqueue_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_enqueue_stats[] = { + FBNIC_RXB_ENQUEUE_STAT("rxb_integrity_err%u", integrity_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_mac_err%u", mac_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_parser_err%u", parser_err), + FBNIC_RXB_ENQUEUE_STAT("rxb_frm_err%u", frm_err), + + FBNIC_RXB_ENQUEUE_STAT("rxb_drbo%u_frames", drbo.frames), + FBNIC_RXB_ENQUEUE_STAT("rxb_drbo%u_bytes", drbo.bytes), +}; + +#define FBNIC_HW_RXB_ENQUEUE_STATS_LEN \ + ARRAY_SIZE(fbnic_gstrings_rxb_enqueue_stats) + +#define FBNIC_RXB_FIFO_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_rxb_fifo_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_fifo_stats[] = { + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_drop", trans_drop), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_dropped_frames", drop.frames), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_ecn", trans_ecn), + FBNIC_RXB_FIFO_STAT("rxb_fifo%u_level", level), +}; + +#define FBNIC_HW_RXB_FIFO_STATS_LEN ARRAY_SIZE(fbnic_gstrings_rxb_fifo_stats) + +#define FBNIC_RXB_DEQUEUE_STAT(name, stat) \ + FBNIC_STAT_FIELDS(fbnic_rxb_dequeue_stats, name, stat) + +static const struct fbnic_stat fbnic_gstrings_rxb_dequeue_stats[] = { + FBNIC_RXB_DEQUEUE_STAT("rxb_intf%u_frames", intf.frames), + FBNIC_RXB_DEQUEUE_STAT("rxb_intf%u_bytes", intf.bytes), + FBNIC_RXB_DEQUEUE_STAT("rxb_pbuf%u_frames", pbuf.frames), + FBNIC_RXB_DEQUEUE_STAT("rxb_pbuf%u_bytes", pbuf.bytes), +}; + +#define FBNIC_HW_RXB_DEQUEUE_STATS_LEN \ + ARRAY_SIZE(fbnic_gstrings_rxb_dequeue_stats) + #define FBNIC_HW_Q_STAT(name, stat) \ FBNIC_STAT_FIELDS(fbnic_hw_q_stats, name, stat.value) @@ -52,6 +93,9 @@ static const struct fbnic_stat fbnic_gstrings_hw_q_stats[] = { #define FBNIC_HW_Q_STATS_LEN ARRAY_SIZE(fbnic_gstrings_hw_q_stats) #define FBNIC_HW_STATS_LEN \ (FBNIC_HW_FIXED_STATS_LEN + \ + FBNIC_HW_RXB_ENQUEUE_STATS_LEN * FBNIC_RXB_ENQUEUE_INDICES + \ + FBNIC_HW_RXB_FIFO_STATS_LEN * FBNIC_RXB_FIFO_INDICES + \ + FBNIC_HW_RXB_DEQUEUE_STATS_LEN * FBNIC_RXB_DEQUEUE_INDICES + \ FBNIC_HW_Q_STATS_LEN * FBNIC_MAX_QUEUES) static void @@ -311,6 +355,36 @@ fbnic_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, return err; } +static void fbnic_get_rxb_enqueue_strings(u8 **data, unsigned int idx) +{ + const struct fbnic_stat *stat; + int i; + + stat = fbnic_gstrings_rxb_enqueue_stats; + for (i = 0; i < FBNIC_HW_RXB_ENQUEUE_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + +static void fbnic_get_rxb_fifo_strings(u8 **data, unsigned int idx) +{ + const struct fbnic_stat *stat; + int i; + + stat = fbnic_gstrings_rxb_fifo_stats; + for (i = 0; i < FBNIC_HW_RXB_FIFO_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + +static void fbnic_get_rxb_dequeue_strings(u8 **data, unsigned int idx) +{ + const struct fbnic_stat *stat; + int i; + + stat = fbnic_gstrings_rxb_dequeue_stats; + for (i = 0; i < FBNIC_HW_RXB_DEQUEUE_STATS_LEN; i++, stat++) + ethtool_sprintf(data, stat->string, idx); +} + static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) { const struct fbnic_stat *stat; @@ -321,6 +395,15 @@ static void fbnic_get_strings(struct net_device *dev, u32 sset, u8 *data) for (i = 0; i < FBNIC_HW_FIXED_STATS_LEN; i++) ethtool_puts(&data, fbnic_gstrings_hw_stats[i].string); + for (i = 0; i < FBNIC_RXB_ENQUEUE_INDICES; i++) + fbnic_get_rxb_enqueue_strings(&data, i); + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_strings(&data, i); + + for (i = 0; i < 
FBNIC_RXB_DEQUEUE_INDICES; i++) + fbnic_get_rxb_dequeue_strings(&data, i); + for (idx = 0; idx < FBNIC_MAX_QUEUES; idx++) { stat = fbnic_gstrings_hw_q_stats; @@ -357,6 +440,33 @@ static void fbnic_get_ethtool_stats(struct net_device *dev, fbnic_report_hw_stats(fbnic_gstrings_hw_stats, &fbd->hw_stats, FBNIC_HW_FIXED_STATS_LEN, &data); + for (i = 0; i < FBNIC_RXB_ENQUEUE_INDICES; i++) { + const struct fbnic_rxb_enqueue_stats *enq; + + enq = &fbd->hw_stats.rxb.enq[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_enqueue_stats, + enq, FBNIC_HW_RXB_ENQUEUE_STATS_LEN, + &data); + } + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) { + const struct fbnic_rxb_fifo_stats *fifo; + + fifo = &fbd->hw_stats.rxb.fifo[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_fifo_stats, + fifo, FBNIC_HW_RXB_FIFO_STATS_LEN, + &data); + } + + for (i = 0; i < FBNIC_RXB_DEQUEUE_INDICES; i++) { + const struct fbnic_rxb_dequeue_stats *deq; + + deq = &fbd->hw_stats.rxb.deq[i]; + fbnic_report_hw_stats(fbnic_gstrings_rxb_dequeue_stats, + deq, FBNIC_HW_RXB_DEQUEUE_STATS_LEN, + &data); + } + for (i = 0; i < FBNIC_MAX_QUEUES; i++) { const struct fbnic_hw_q_stats *hw_q = &fbd->hw_stats.hw_q[i]; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index c8faedc2ec443..1c5ccaf397277 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -117,6 +117,173 @@ static void fbnic_get_rpc_stats32(struct fbnic_dev *fbd, &rpc->ovr_size_err); } +static void fbnic_reset_rxb_fifo_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_DROP_FRMS_STS(i), + &fifo->drop.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_DROP_BYTES_STS_L(i), 1, + &fifo->drop.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRUN_FRMS_STS(i), + &fifo->trunc.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_TRUN_BYTES_STS_L(i), 1, + &fifo->trunc.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRANS_DROP_STS(i), + &fifo->trans_drop); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_TRANS_ECN_STS(i), + &fifo->trans_ecn); + + fifo->level.u.old_reg_value_32 = 0; +} + +static void fbnic_reset_rxb_enq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_DRBO_FRM_CNT_SRC(i), + &enq->drbo.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_DRBO_BYTE_CNT_SRC_L(i), 4, + &enq->drbo.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_INTEGRITY_ERR(i), + &enq->integrity_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_MAC_ERR(i), + &enq->mac_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_PARSER_ERR(i), + &enq->parser_err); + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_FRM_ERR(i), + &enq->frm_err); +} + +static void fbnic_reset_rxb_deq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_INTF_FRM_CNT_DST(i), + &deq->intf.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_INTF_BYTE_CNT_DST_L(i), 4, + &deq->intf.bytes); + + fbnic_hw_stat_rst32(fbd, FBNIC_RXB_PBUF_FRM_CNT_DST(i), + &deq->pbuf.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_RXB_PBUF_BYTE_CNT_DST_L(i), 4, + &deq->pbuf.bytes); +} + +static void fbnic_reset_rxb_stats(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_reset_rxb_fifo_stats(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_reset_rxb_enq_stats(fbd, i, &rxb->enq[i]); + fbnic_reset_rxb_deq_stats(fbd, i, &rxb->deq[i]); + } 
+} + +static void fbnic_get_rxb_fifo_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_DROP_FRMS_STS(i), + &fifo->drop.frames); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRUN_FRMS_STS(i), + &fifo->trunc.frames); + + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRANS_DROP_STS(i), + &fifo->trans_drop); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_TRANS_ECN_STS(i), + &fifo->trans_ecn); + + fifo->level.value = rd32(fbd, FBNIC_RXB_PBUF_FIFO_LEVEL(i)); +} + +static void fbnic_get_rxb_fifo_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_fifo_stats *fifo) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_DROP_BYTES_STS_L(i), 1, + &fifo->drop.bytes); + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_TRUN_BYTES_STS_L(i), 1, + &fifo->trunc.bytes); + + fbnic_get_rxb_fifo_stats32(fbd, i, fifo); +} + +static void fbnic_get_rxb_enq_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_DRBO_FRM_CNT_SRC(i), + &enq->drbo.frames); + + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_INTEGRITY_ERR(i), + &enq->integrity_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_MAC_ERR(i), + &enq->mac_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_PARSER_ERR(i), + &enq->parser_err); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_FRM_ERR(i), + &enq->frm_err); +} + +static void fbnic_get_rxb_enq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_enqueue_stats *enq) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_DRBO_BYTE_CNT_SRC_L(i), 4, + &enq->drbo.bytes); + + fbnic_get_rxb_enq_stats32(fbd, i, enq); +} + +static void fbnic_get_rxb_deq_stats32(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_INTF_FRM_CNT_DST(i), + &deq->intf.frames); + fbnic_hw_stat_rd32(fbd, FBNIC_RXB_PBUF_FRM_CNT_DST(i), + &deq->pbuf.frames); +} + +static void fbnic_get_rxb_deq_stats(struct fbnic_dev *fbd, int i, + struct fbnic_rxb_dequeue_stats *deq) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_INTF_BYTE_CNT_DST_L(i), 4, + &deq->intf.bytes); + fbnic_hw_stat_rd64(fbd, FBNIC_RXB_PBUF_BYTE_CNT_DST_L(i), 4, + &deq->pbuf.bytes); + + fbnic_get_rxb_deq_stats32(fbd, i, deq); +} + +static void fbnic_get_rxb_stats32(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_stats32(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_get_rxb_enq_stats32(fbd, i, &rxb->enq[i]); + fbnic_get_rxb_deq_stats32(fbd, i, &rxb->deq[i]); + } +} + +static void fbnic_get_rxb_stats(struct fbnic_dev *fbd, + struct fbnic_rxb_stats *rxb) +{ + int i; + + for (i = 0; i < FBNIC_RXB_FIFO_INDICES; i++) + fbnic_get_rxb_fifo_stats(fbd, i, &rxb->fifo[i]); + + for (i = 0; i < FBNIC_RXB_INTF_INDICES; i++) { + fbnic_get_rxb_enq_stats(fbd, i, &rxb->enq[i]); + fbnic_get_rxb_deq_stats(fbd, i, &rxb->deq[i]); + } +} + static void fbnic_reset_hw_rxq_stats(struct fbnic_dev *fbd, struct fbnic_hw_q_stats *hw_q) { @@ -253,6 +420,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { spin_lock(&fbd->hw_stats_lock); fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc); + fbnic_reset_rxb_stats(fbd, &fbd->hw_stats.rxb); fbnic_reset_hw_rxq_stats(fbd, fbd->hw_stats.hw_q); fbnic_reset_pcie_stats_asic(fbd, &fbd->hw_stats.pcie); spin_unlock(&fbd->hw_stats_lock); @@ -261,6 +429,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd) { fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc); + fbnic_get_rxb_stats32(fbd, &fbd->hw_stats.rxb); fbnic_get_hw_rxq_stats32(fbd, 
fbd->hw_stats.hw_q); } @@ -275,6 +444,8 @@ void fbnic_get_hw_stats(struct fbnic_dev *fbd) { spin_lock(&fbd->hw_stats_lock); __fbnic_get_hw_stats32(fbd); + + fbnic_get_rxb_stats(fbd, &fbd->hw_stats.rxb); fbnic_get_pcie_stats_asic64(fbd, &fbd->hw_stats.pcie); spin_unlock(&fbd->hw_stats_lock); } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h index 81efa8dc83812..ec03e6253ba5a 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h @@ -17,6 +17,11 @@ struct fbnic_stat_counter { bool reported; }; +struct fbnic_hw_stat { + struct fbnic_stat_counter frames; + struct fbnic_stat_counter bytes; +}; + struct fbnic_eth_mac_stats { struct fbnic_stat_counter FramesTransmittedOK; struct fbnic_stat_counter FramesReceivedOK; @@ -43,6 +48,28 @@ struct fbnic_rpc_stats { struct fbnic_stat_counter tcp_opt_err, out_of_hdr_err, ovr_size_err; }; +struct fbnic_rxb_enqueue_stats { + struct fbnic_hw_stat drbo; + struct fbnic_stat_counter integrity_err, mac_err; + struct fbnic_stat_counter parser_err, frm_err; +}; + +struct fbnic_rxb_fifo_stats { + struct fbnic_hw_stat drop, trunc; + struct fbnic_stat_counter trans_drop, trans_ecn; + struct fbnic_stat_counter level; +}; + +struct fbnic_rxb_dequeue_stats { + struct fbnic_hw_stat intf, pbuf; +}; + +struct fbnic_rxb_stats { + struct fbnic_rxb_enqueue_stats enq[FBNIC_RXB_ENQUEUE_INDICES]; + struct fbnic_rxb_fifo_stats fifo[FBNIC_RXB_FIFO_INDICES]; + struct fbnic_rxb_dequeue_stats deq[FBNIC_RXB_DEQUEUE_INDICES]; +}; + struct fbnic_hw_q_stats { struct fbnic_stat_counter rde_pkt_err; struct fbnic_stat_counter rde_pkt_cq_drop; @@ -62,6 +89,7 @@ struct fbnic_pcie_stats { struct fbnic_hw_stats { struct fbnic_mac_stats mac; struct fbnic_rpc_stats rpc; + struct fbnic_rxb_stats rxb; struct fbnic_hw_q_stats hw_q[FBNIC_MAX_QUEUES]; struct fbnic_pcie_stats pcie; }; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index e19284d4b91d6..dff4855113014 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -408,8 +408,8 @@ static void fbnic_get_stats64(struct net_device *dev, struct fbnic_net *fbn = netdev_priv(dev); struct fbnic_dev *fbd = fbn->fbd; struct fbnic_queue_stats *stats; + u64 rx_over = 0, rx_missed = 0; unsigned int start, i; - u64 rx_over = 0; fbnic_get_hw_stats(fbd); @@ -449,6 +449,17 @@ static void fbnic_get_stats64(struct net_device *dev, rx_dropped = stats->dropped; spin_lock(&fbd->hw_stats_lock); + /* Record drops for the host FIFOs. + * 4: network to Host, 6: BMC to Host + * Exclude the BMC and MC FIFOs as those stats may contain drops + * due to unrelated items such as TCAM misses. They are still + * accessible through the ethtool stats. 
+ */ + i = FBNIC_RXB_FIFO_HOST; + rx_missed += fbd->hw_stats.rxb.fifo[i].drop.frames.value; + i = FBNIC_RXB_FIFO_BMC_TO_HOST; + rx_missed += fbd->hw_stats.rxb.fifo[i].drop.frames.value; + for (i = 0; i < fbd->max_num_queues; i++) { /* Report packets dropped due to CQ/BDQ being full/empty */ rx_over += fbd->hw_stats.hw_q[i].rde_pkt_cq_drop.value; @@ -464,6 +475,7 @@ static void fbnic_get_stats64(struct net_device *dev, stats64->rx_dropped = rx_dropped; stats64->rx_over_errors = rx_over; stats64->rx_errors = rx_errors; + stats64->rx_missed_errors = rx_missed; for (i = 0; i < fbn->num_rx_queues; i++) { struct fbnic_ring *rxr = fbn->rx[i]; From 5f8bd2ce8269b055accc1653609186c9c3beb102 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Thu, 10 Apr 2025 00:08:58 -0700 Subject: [PATCH 173/770] eth: fbnic: add support for TMI stats This patch adds coverage for TMI stats, including PTP stats and drop stats. PTP stats include illegal requests, bad timestamps, and good timestamps. The bad timestamp and illegal request counters are reported as `error` via `ethtool -T`. Both counters are also reported individually via `ethtool -S`. The good timestamp stats are reported as `pkts` via `ethtool -T`.

ethtool -S eth0 | grep "ptp"
ptp_illegal_req: 0
ptp_good_ts: 0
ptp_bad_ts: 0

Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250410070859.4160768-5-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 7 ++++ drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 5 +++ .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 5 +++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.c | 34 +++++++++++++++++++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.h | 6 ++++ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 3 ++ 6 files changed, 60 insertions(+) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 8ba94ae95db9c..02339818cb8d6 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -31,6 +31,13 @@
Statistics ---------- +TX MAC Interface +~~~~~~~~~~~~~~~~ + + - ``ptp_illegal_req``: packets sent to the NIC with PTP request bit set but routed to BMC/FW + - ``ptp_good_ts``: packets successfully routed to MAC with PTP request bit set + - ``ptp_bad_ts``: packets destined for MAC with PTP request bit set but aborted because of some error (e.g., DMA read error) + RXB (RX Buffer) Enqueue ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index a554e0b2cfffd..9426f7f2e611e 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -432,6 +432,11 @@ enum { #define FBNIC_TMI_SOP_PROT_CTRL 0x04400 /* 0x11000 */ #define FBNIC_TMI_DROP_CTRL 0x04401 /* 0x11004 */ #define FBNIC_TMI_DROP_CTRL_EN CSR_BIT(0) +#define FBNIC_TMI_DROP_PKTS 0x04402 /* 0x11008 */ +#define FBNIC_TMI_DROP_BYTE_L 0x04403 /* 0x1100c */ +#define FBNIC_TMI_ILLEGAL_PTP_REQS 0x04409 /* 0x11024 */ +#define FBNIC_TMI_GOOD_PTP_TS 0x0440a /* 0x11028 */ +#define FBNIC_TMI_BAD_PTP_TS 0x0440b /* 0x1102c */ #define FBNIC_CSR_END_TMI 0x0443f /* CSR section delimiter */ /* Precision Time Protocol Registers */ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 816af96a5d5fc..7d421791033e4 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -27,6 +27,11 @@ struct fbnic_stat { FBNIC_STAT_FIELDS(fbnic_hw_stats, name, stat) static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { + /* TMI */ + FBNIC_HW_STAT("ptp_illegal_req", tmi.ptp_illegal_req), + FBNIC_HW_STAT("ptp_good_ts", tmi.ptp_good_ts), + FBNIC_HW_STAT("ptp_bad_ts", tmi.ptp_bad_ts), + /* RPC */ FBNIC_HW_STAT("rpc_unkn_etype", rpc.unkn_etype), FBNIC_HW_STAT("rpc_unkn_ext_hdr", rpc.unkn_ext_hdr), diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index 1c5ccaf397277..80157f3899756 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -70,6 +70,37 @@ static void fbnic_hw_stat_rd64(struct fbnic_dev *fbd, u32 reg, s32 offset, stat->u.old_reg_value_64 = new_reg_value; } +static void fbnic_reset_tmi_stats(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_DROP_PKTS, &tmi->drop.frames); + fbnic_hw_stat_rst64(fbd, FBNIC_TMI_DROP_BYTE_L, 1, &tmi->drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TMI_ILLEGAL_PTP_REQS, + &tmi->ptp_illegal_req); + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_GOOD_PTP_TS, &tmi->ptp_good_ts); + fbnic_hw_stat_rst32(fbd, FBNIC_TMI_BAD_PTP_TS, &tmi->ptp_bad_ts); +} + +static void fbnic_get_tmi_stats32(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_DROP_PKTS, &tmi->drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TMI_ILLEGAL_PTP_REQS, + &tmi->ptp_illegal_req); + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_GOOD_PTP_TS, &tmi->ptp_good_ts); + fbnic_hw_stat_rd32(fbd, FBNIC_TMI_BAD_PTP_TS, &tmi->ptp_bad_ts); +} + +static void fbnic_get_tmi_stats(struct fbnic_dev *fbd, + struct fbnic_tmi_stats *tmi) +{ + fbnic_hw_stat_rd64(fbd, FBNIC_TMI_DROP_BYTE_L, 1, &tmi->drop.bytes); +} + static void fbnic_reset_rpc_stats(struct fbnic_dev *fbd, struct fbnic_rpc_stats *rpc) { @@ -419,6 +450,7 @@ static void fbnic_get_pcie_stats_asic64(struct fbnic_dev *fbd, void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { spin_lock(&fbd->hw_stats_lock); 
+	fbnic_reset_tmi_stats(fbd, &fbd->hw_stats.tmi);
 	fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc);
 	fbnic_reset_rxb_stats(fbd, &fbd->hw_stats.rxb);
 	fbnic_reset_hw_rxq_stats(fbd, fbd->hw_stats.hw_q);
@@ -428,6 +460,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd)
 
 static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd)
 {
+	fbnic_get_tmi_stats32(fbd, &fbd->hw_stats.tmi);
 	fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc);
 	fbnic_get_rxb_stats32(fbd, &fbd->hw_stats.rxb);
 	fbnic_get_hw_rxq_stats32(fbd, fbd->hw_stats.hw_q);
@@ -445,6 +478,7 @@ void fbnic_get_hw_stats(struct fbnic_dev *fbd)
 	spin_lock(&fbd->hw_stats_lock);
 	__fbnic_get_hw_stats32(fbd);
 
+	fbnic_get_tmi_stats(fbd, &fbd->hw_stats.tmi);
 	fbnic_get_rxb_stats(fbd, &fbd->hw_stats.rxb);
 	fbnic_get_pcie_stats_asic64(fbd, &fbd->hw_stats.pcie);
 	spin_unlock(&fbd->hw_stats_lock);
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h
index ec03e6253ba5a..abb0957a5ac0c 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h
@@ -42,6 +42,11 @@ struct fbnic_mac_stats {
 	struct fbnic_eth_mac_stats eth_mac;
 };
 
+struct fbnic_tmi_stats {
+	struct fbnic_hw_stat drop;
+	struct fbnic_stat_counter ptp_illegal_req, ptp_good_ts, ptp_bad_ts;
+};
+
 struct fbnic_rpc_stats {
 	struct fbnic_stat_counter unkn_etype, unkn_ext_hdr;
 	struct fbnic_stat_counter ipv4_frag, ipv6_frag, ipv4_esp, ipv6_esp;
@@ -88,6 +93,7 @@ struct fbnic_pcie_stats {
 
 struct fbnic_hw_stats {
 	struct fbnic_mac_stats mac;
+	struct fbnic_tmi_stats tmi;
 	struct fbnic_rpc_stats rpc;
 	struct fbnic_rxb_stats rxb;
 	struct fbnic_hw_q_stats hw_q[FBNIC_MAX_QUEUES];
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index dff4855113014..a0f93bd271134 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -423,6 +423,9 @@ static void fbnic_get_stats64(struct net_device *dev,
 	stats64->tx_packets = tx_packets;
 	stats64->tx_dropped = tx_dropped;
 
+	/* Record drops from Tx HW Datapath */
+	tx_dropped += fbd->hw_stats.tmi.drop.frames.value;
+
 	for (i = 0; i < fbn->num_tx_queues; i++) {
 		struct fbnic_ring *txr = fbn->tx[i];

From f2957147ae7a1780217bb2f7b29f4d9ae14ef4b8 Mon Sep 17 00:00:00 2001
From: Mohsin Bashir
Date: Thu, 10 Apr 2025 00:08:59 -0700
Subject: [PATCH 174/770] eth: fbnic: add support for TTI HW stats

Add coverage for the TX Extension (TEI) Interface (TTI) stats. We track
packet and control-message drops caused by credit exhaustion on the TX
interface.
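The rd32/rd64 helpers these fbnic patches call in fbnic_hw_stats.c share
one pattern: read the raw register, fold the delta since the last read
into a 64-bit software counter, and for split 64-bit counters guard
against a carry between the low- and high-word reads. A minimal sketch
of that pattern follows; the helper and field names are illustrative
assumptions, not the fbnic implementation, and it assumes a {lo, hi}
register pair plus word-indexed CSRs (byte address = reg << 2) as seen
in fbnic_csr.h:

#include <linux/io.h>
#include <linux/types.h>

struct stat_counter {
	u64 value;		/* accumulated value reported to users */
	u64 old_reg_value;	/* last raw register snapshot */
};

/* hypothetical MMIO accessor standing in for the driver's CSR read */
static u32 hw_rd32(void __iomem *base, u32 reg)
{
	return readl(base + (reg << 2));
}

static void hw_stat_rd64(void __iomem *base, u32 reg_lo, s32 offset,
			 struct stat_counter *stat)
{
	u32 hi_old, hi_new;
	u64 new_val;

	/* hi/lo/hi read order detects a carry between the two halves */
	hi_old = hw_rd32(base, reg_lo + offset);
	do {
		hi_new = hi_old;
		new_val = (u64)hi_new << 32 | hw_rd32(base, reg_lo);
		hi_old = hw_rd32(base, reg_lo + offset);
	} while (hi_new != hi_old);

	/* accumulate the delta so rebasing the baseline (what the
	 * matching rst helpers presumably do) doesn't zero the value
	 */
	stat->value += new_val - stat->old_reg_value;
	stat->old_reg_value = new_val;
}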
Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250410070859.4160768-6-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 7 ++ drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 9 +++ .../net/ethernet/meta/fbnic/fbnic_ethtool.c | 8 +++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.c | 66 +++++++++++++++++++ .../net/ethernet/meta/fbnic/fbnic_hw_stats.h | 5 ++ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 5 +- 6 files changed, 99 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 02339818cb8d6..3483e498c08eb 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -38,6 +38,13 @@ TX MAC Interface - ``ptp_good_ts``: packets successfully routed to MAC with PTP request bit set - ``ptp_bad_ts``: packets destined for MAC with PTP request bit set but aborted because of some error (e.g., DMA read error) +TX Extension (TEI) Interface (TTI) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + - ``tti_cm_drop``: control messages dropped at the TX Extension (TEI) Interface because of credit starvation + - ``tti_frame_drop``: packets dropped at the TX Extension (TEI) Interface because of credit starvation + - ``tti_tbi_drop``: packets dropped at the TX BMC Interface (TBI) because of credit starvation + RXB (RX Buffer) Enqueue ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 9426f7f2e611e..0c217c195c6a3 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -397,6 +397,15 @@ enum { #define FBNIC_TCE_DROP_CTRL_TTI_FRM_DROP_EN CSR_BIT(1) #define FBNIC_TCE_DROP_CTRL_TTI_TBI_DROP_EN CSR_BIT(2) +#define FBNIC_TCE_TTI_CM_DROP_PKTS 0x0403e /* 0x100f8 */ +#define FBNIC_TCE_TTI_CM_DROP_BYTE_L 0x0403f /* 0x100fc */ +#define FBNIC_TCE_TTI_CM_DROP_BYTE_H 0x04040 /* 0x10100 */ +#define FBNIC_TCE_TTI_FRAME_DROP_PKTS 0x04041 /* 0x10104 */ +#define FBNIC_TCE_TTI_FRAME_DROP_BYTE_L 0x04042 /* 0x10108 */ +#define FBNIC_TCE_TTI_FRAME_DROP_BYTE_H 0x04043 /* 0x1010c */ +#define FBNIC_TCE_TBI_DROP_PKTS 0x04044 /* 0x10110 */ +#define FBNIC_TCE_TBI_DROP_BYTE_L 0x04045 /* 0x10114 */ + #define FBNIC_TCE_TCAM_IDX2DEST_MAP 0x0404A /* 0x10128 */ #define FBNIC_TCE_TCAM_IDX2DEST_MAP_DEST_ID_0 CSR_GENMASK(3, 0) enum { diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index 7d421791033e4..5c7556c8c4c5f 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -27,6 +27,14 @@ struct fbnic_stat { FBNIC_STAT_FIELDS(fbnic_hw_stats, name, stat) static const struct fbnic_stat fbnic_gstrings_hw_stats[] = { + /* TTI */ + FBNIC_HW_STAT("tti_cm_drop_frames", tti.cm_drop.frames), + FBNIC_HW_STAT("tti_cm_drop_bytes", tti.cm_drop.bytes), + FBNIC_HW_STAT("tti_frame_drop_frames", tti.frame_drop.frames), + FBNIC_HW_STAT("tti_frame_drop_bytes", tti.frame_drop.bytes), + FBNIC_HW_STAT("tti_tbi_drop_frames", tti.tbi_drop.frames), + FBNIC_HW_STAT("tti_tbi_drop_bytes", tti.tbi_drop.bytes), + /* TMI */ FBNIC_HW_STAT("ptp_illegal_req", tmi.ptp_illegal_req), FBNIC_HW_STAT("ptp_good_ts", tmi.ptp_good_ts), diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c 
b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c index 80157f3899756..4223d8100e64e 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.c @@ -101,6 +101,69 @@ static void fbnic_get_tmi_stats(struct fbnic_dev *fbd, fbnic_hw_stat_rd64(fbd, FBNIC_TMI_DROP_BYTE_L, 1, &tmi->drop.bytes); } +static void fbnic_reset_tti_stats(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TTI_CM_DROP_PKTS, + &tti->cm_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TTI_CM_DROP_BYTE_L, + 1, + &tti->cm_drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TTI_FRAME_DROP_PKTS, + &tti->frame_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TTI_FRAME_DROP_BYTE_L, + 1, + &tti->frame_drop.bytes); + + fbnic_hw_stat_rst32(fbd, + FBNIC_TCE_TBI_DROP_PKTS, + &tti->tbi_drop.frames); + fbnic_hw_stat_rst64(fbd, + FBNIC_TCE_TBI_DROP_BYTE_L, + 1, + &tti->tbi_drop.bytes); +} + +static void fbnic_get_tti_stats32(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TTI_CM_DROP_PKTS, + &tti->cm_drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TTI_FRAME_DROP_PKTS, + &tti->frame_drop.frames); + + fbnic_hw_stat_rd32(fbd, + FBNIC_TCE_TBI_DROP_PKTS, + &tti->tbi_drop.frames); +} + +static void fbnic_get_tti_stats(struct fbnic_dev *fbd, + struct fbnic_tti_stats *tti) +{ + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TTI_CM_DROP_BYTE_L, + 1, + &tti->cm_drop.bytes); + + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TTI_FRAME_DROP_BYTE_L, + 1, + &tti->frame_drop.bytes); + + fbnic_hw_stat_rd64(fbd, + FBNIC_TCE_TBI_DROP_BYTE_L, + 1, + &tti->tbi_drop.bytes); +} + static void fbnic_reset_rpc_stats(struct fbnic_dev *fbd, struct fbnic_rpc_stats *rpc) { @@ -451,6 +514,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) { spin_lock(&fbd->hw_stats_lock); fbnic_reset_tmi_stats(fbd, &fbd->hw_stats.tmi); + fbnic_reset_tti_stats(fbd, &fbd->hw_stats.tti); fbnic_reset_rpc_stats(fbd, &fbd->hw_stats.rpc); fbnic_reset_rxb_stats(fbd, &fbd->hw_stats.rxb); fbnic_reset_hw_rxq_stats(fbd, fbd->hw_stats.hw_q); @@ -461,6 +525,7 @@ void fbnic_reset_hw_stats(struct fbnic_dev *fbd) static void __fbnic_get_hw_stats32(struct fbnic_dev *fbd) { fbnic_get_tmi_stats32(fbd, &fbd->hw_stats.tmi); + fbnic_get_tti_stats32(fbd, &fbd->hw_stats.tti); fbnic_get_rpc_stats32(fbd, &fbd->hw_stats.rpc); fbnic_get_rxb_stats32(fbd, &fbd->hw_stats.rxb); fbnic_get_hw_rxq_stats32(fbd, fbd->hw_stats.hw_q); @@ -479,6 +544,7 @@ void fbnic_get_hw_stats(struct fbnic_dev *fbd) __fbnic_get_hw_stats32(fbd); fbnic_get_tmi_stats(fbd, &fbd->hw_stats.tmi); + fbnic_get_tti_stats(fbd, &fbd->hw_stats.tti); fbnic_get_rxb_stats(fbd, &fbd->hw_stats.rxb); fbnic_get_pcie_stats_asic64(fbd, &fbd->hw_stats.pcie); spin_unlock(&fbd->hw_stats_lock); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h index abb0957a5ac0c..07e54bb75bf3b 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_hw_stats.h @@ -47,6 +47,10 @@ struct fbnic_tmi_stats { struct fbnic_stat_counter ptp_illegal_req, ptp_good_ts, ptp_bad_ts; }; +struct fbnic_tti_stats { + struct fbnic_hw_stat cm_drop, frame_drop, tbi_drop; +}; + struct fbnic_rpc_stats { struct fbnic_stat_counter unkn_etype, unkn_ext_hdr; struct fbnic_stat_counter ipv4_frag, ipv6_frag, ipv4_esp, ipv6_esp; @@ -94,6 +98,7 @@ struct fbnic_pcie_stats { struct fbnic_hw_stats { struct fbnic_mac_stats mac; struct fbnic_tmi_stats tmi; + 
struct fbnic_tti_stats tti;
 	struct fbnic_rpc_stats rpc;
 	struct fbnic_rxb_stats rxb;
 	struct fbnic_hw_q_stats hw_q[FBNIC_MAX_QUEUES];
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index a0f93bd271134..d699f58dda211 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -424,7 +424,10 @@ static void fbnic_get_stats64(struct net_device *dev,
 	stats64->tx_dropped = tx_dropped;
 
 	/* Record drops from Tx HW Datapath */
-	tx_dropped += fbd->hw_stats.tmi.drop.frames.value;
+	tx_dropped += fbd->hw_stats.tmi.drop.frames.value +
+		      fbd->hw_stats.tti.frame_drop.frames.value +
+		      fbd->hw_stats.tti.tbi_drop.frames.value +
+		      fbd->hw_stats.tti.cm_drop.frames.value;
 
 	for (i = 0; i < fbn->num_tx_queues; i++) {
 		struct fbnic_ring *txr = fbn->tx[i];

From ee6a2db281a39a29087bc3aed81945f75c84fd26 Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Thu, 10 Apr 2025 18:30:09 +0200
Subject: [PATCH 175/770] net: dsa: mt7530: generalize read port stats logic

In preparation for migrating to the standard MIB API, generalize the
read-port-stats logic into a dedicated function. This permits manually
providing the offset and size of a MIB counter, to access a specific
counter directly.

Signed-off-by: Christian Marangi
Link: https://patch.msgid.link/20250410163022.3695-2-ansuelsmth@gmail.com
Signed-off-by: Paolo Abeni
---
 drivers/net/dsa/mt7530.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index d70399bce5b9e..85a040853194e 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -789,24 +789,34 @@ mt7530_get_strings(struct dsa_switch *ds, int port, u32 stringset,
 		ethtool_puts(&data, mt7530_mib[i].name);
 }
 
+static void
+mt7530_read_port_stats(struct mt7530_priv *priv, int port,
+		       u32 offset, u8 size, uint64_t *data)
+{
+	u32 val, reg = MT7530_PORT_MIB_COUNTER(port) + offset;
+
+	val = mt7530_read(priv, reg);
+	*data = val;
+
+	if (size == 2) {
+		val = mt7530_read(priv, reg + 4);
+		*data |= (u64)val << 32;
+	}
+}
+
 static void
 mt7530_get_ethtool_stats(struct dsa_switch *ds, int port,
 			 uint64_t *data)
 {
 	struct mt7530_priv *priv = ds->priv;
 	const struct mt7530_mib_desc *mib;
-	u32 reg, i;
-	u64 hi;
+	int i;
 
 	for (i = 0; i < ARRAY_SIZE(mt7530_mib); i++) {
 		mib = &mt7530_mib[i];
-		reg = MT7530_PORT_MIB_COUNTER(port) + mib->offset;
 
-		data[i] = mt7530_read(priv, reg);
-		if (mib->size == 2) {
-			hi = mt7530_read(priv, reg + 4);
-			data[i] |= hi << 32;
-		}
+		mt7530_read_port_stats(priv, port, mib->offset, mib->size,
+				       data + i);
 	}
 }

From 33bc7af2b28122a3fc28c0cd0624d59599df97d2 Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Thu, 10 Apr 2025 18:30:10 +0200
Subject: [PATCH 176/770] net: dsa: mt7530: move pkt size and rx err MIB
 counter to rmon stats API

Drop custom handling of the packet size and RX error MIB counters and
handle them in the standard .get_rmon_stats API.

The MIB entries are dropped from the custom MIB table and converted to
defines providing only the MIB offset.
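As a worked example of the register math the generalized helper
encapsulates (derived from the defines this patch adds below; not
additional driver code): per-port counters start at 0x4000 with a 0x100
stride per port, and each define is a byte offset into that window.

#include <linux/types.h>

/* Worked example of the MIB addressing behind mt7530_read_port_stats().
 * For port 3, the TxPktSz64 counter (offset 0x30) lives at:
 *   0x4000 + 3 * 0x100 + 0x30 = 0x4330
 */
#define MT7530_PORT_MIB_COUNTER(x)	(0x4000 + (x) * 0x100)
#define MT7530_PORT_MIB_TX_PKT_SZ_64	0x30

static const u32 reg = MT7530_PORT_MIB_COUNTER(3) +
		       MT7530_PORT_MIB_TX_PKT_SZ_64;
/* reg == 0x4330; a size-2 counter additionally reads reg + 4 for
 * bits 63:32
 */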
Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250410163022.3695-3-ansuelsmth@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 71 +++++++++++++++++++++++++++++++--------- drivers/net/dsa/mt7530.h | 17 ++++++++++ 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 85a040853194e..54a6ddc380e97 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -44,12 +44,6 @@ static const struct mt7530_mib_desc mt7530_mib[] = { MIB_DESC(1, 0x24, "TxLateCollision"), MIB_DESC(1, 0x28, "TxExcessiveCollistion"), MIB_DESC(1, 0x2c, "TxPause"), - MIB_DESC(1, 0x30, "TxPktSz64"), - MIB_DESC(1, 0x34, "TxPktSz65To127"), - MIB_DESC(1, 0x38, "TxPktSz128To255"), - MIB_DESC(1, 0x3c, "TxPktSz256To511"), - MIB_DESC(1, 0x40, "TxPktSz512To1023"), - MIB_DESC(1, 0x44, "Tx1024ToMax"), MIB_DESC(2, 0x48, "TxBytes"), MIB_DESC(1, 0x60, "RxDrop"), MIB_DESC(1, 0x64, "RxFiltering"), @@ -58,17 +52,7 @@ static const struct mt7530_mib_desc mt7530_mib[] = { MIB_DESC(1, 0x70, "RxBroadcast"), MIB_DESC(1, 0x74, "RxAlignErr"), MIB_DESC(1, 0x78, "RxCrcErr"), - MIB_DESC(1, 0x7c, "RxUnderSizeErr"), - MIB_DESC(1, 0x80, "RxFragErr"), - MIB_DESC(1, 0x84, "RxOverSzErr"), - MIB_DESC(1, 0x88, "RxJabberErr"), MIB_DESC(1, 0x8c, "RxPause"), - MIB_DESC(1, 0x90, "RxPktSz64"), - MIB_DESC(1, 0x94, "RxPktSz65To127"), - MIB_DESC(1, 0x98, "RxPktSz128To255"), - MIB_DESC(1, 0x9c, "RxPktSz256To511"), - MIB_DESC(1, 0xa0, "RxPktSz512To1023"), - MIB_DESC(1, 0xa4, "RxPktSz1024ToMax"), MIB_DESC(2, 0xa8, "RxBytes"), MIB_DESC(1, 0xb0, "RxCtrlDrop"), MIB_DESC(1, 0xb4, "RxIngressDrop"), @@ -829,6 +813,60 @@ mt7530_get_sset_count(struct dsa_switch *ds, int port, int sset) return ARRAY_SIZE(mt7530_mib); } +static const struct ethtool_rmon_hist_range mt7530_rmon_ranges[] = { + { 0, 64 }, + { 65, 127 }, + { 128, 255 }, + { 256, 511 }, + { 512, 1023 }, + { 1024, MT7530_MAX_MTU }, + {} +}; + +static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges) +{ + struct mt7530_priv *priv = ds->priv; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNDER_SIZE_ERR, 1, + &rmon_stats->undersize_pkts); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_OVER_SZ_ERR, 1, + &rmon_stats->oversize_pkts); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_FRAG_ERR, 1, + &rmon_stats->fragments); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_JABBER_ERR, 1, + &rmon_stats->jabbers); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_64, 1, + &rmon_stats->hist[0]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_65_TO_127, 1, + &rmon_stats->hist[1]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_128_TO_255, 1, + &rmon_stats->hist[2]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_256_TO_511, 1, + &rmon_stats->hist[3]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023, 1, + &rmon_stats->hist[4]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX, 1, + &rmon_stats->hist[5]); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_64, 1, + &rmon_stats->hist_tx[0]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127, 1, + &rmon_stats->hist_tx[1]); + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_128_TO_255, 1, + &rmon_stats->hist_tx[2]); + mt7530_read_port_stats(priv, port, 
MT7530_PORT_MIB_TX_PKT_SZ_256_TO_511, 1,
+			       &rmon_stats->hist_tx[3]);
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023, 1,
+			       &rmon_stats->hist_tx[4]);
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX, 1,
+			       &rmon_stats->hist_tx[5]);
+
+	*ranges = mt7530_rmon_ranges;
+}
+
 static int
 mt7530_set_ageing_time(struct dsa_switch *ds, unsigned int msecs)
 {
@@ -3115,6 +3153,7 @@ const struct dsa_switch_ops mt7530_switch_ops = {
 	.get_strings		= mt7530_get_strings,
 	.get_ethtool_stats	= mt7530_get_ethtool_stats,
 	.get_sset_count		= mt7530_get_sset_count,
+	.get_rmon_stats		= mt7530_get_rmon_stats,
 	.set_ageing_time	= mt7530_set_ageing_time,
 	.port_enable		= mt7530_port_enable,
 	.port_disable		= mt7530_port_disable,
diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h
index c3ea403d7acfd..9bc90d1678f7e 100644
--- a/drivers/net/dsa/mt7530.h
+++ b/drivers/net/dsa/mt7530.h
@@ -423,6 +423,23 @@ enum mt7530_vlan_port_acc_frm {
 /* Register for MIB */
 #define MT7530_PORT_MIB_COUNTER(x)	(0x4000 + (x) * 0x100)
+/* Each define is an offset of MT7530_PORT_MIB_COUNTER */
+#define MT7530_PORT_MIB_TX_PKT_SZ_64		0x30
+#define MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127	0x34
+#define MT7530_PORT_MIB_TX_PKT_SZ_128_TO_255	0x38
+#define MT7530_PORT_MIB_TX_PKT_SZ_256_TO_511	0x3c
+#define MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023	0x40
+#define MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX	0x44
+#define MT7530_PORT_MIB_RX_UNDER_SIZE_ERR	0x7c
+#define MT7530_PORT_MIB_RX_FRAG_ERR		0x80
+#define MT7530_PORT_MIB_RX_OVER_SZ_ERR		0x84
+#define MT7530_PORT_MIB_RX_JABBER_ERR		0x88
+#define MT7530_PORT_MIB_RX_PKT_SZ_64		0x90
+#define MT7530_PORT_MIB_RX_PKT_SZ_65_TO_127	0x94
+#define MT7530_PORT_MIB_RX_PKT_SZ_128_TO_255	0x98
+#define MT7530_PORT_MIB_RX_PKT_SZ_256_TO_511	0x9c
+#define MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023	0xa0
+#define MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX	0xa4
 #define MT7530_MIB_CCR			0x4fe0
 #define CCR_MIB_ENABLE			BIT(31)
 #define CCR_RX_OCT_CNT_GOOD		BIT(7)

From e12989ab719cea827cd2b8442f7dada4fd259481 Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Thu, 10 Apr 2025 18:30:11 +0200
Subject: [PATCH 177/770] net: dsa: mt7530: move pause MIB counter to eth_ctrl
 stats API

Drop custom handling of the TX/RX pause frame MIB counters and handle
them in the standard .get_eth_ctrl_stats API.

The MIB entries are dropped from the custom MIB table and converted to
defines providing only the MIB offset.
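Once exposed through .get_eth_ctrl_stats, the pause counters leave the
driver's private `ethtool -S` list and surface through the standard
statistics groups instead. With a netlink-aware ethtool, the query is
expected to look something like the following (the port name, flag
spelling, and exact output format depend on the ethtool version, so
treat this as an assumption):

ethtool -S swp0 --groups eth-ctrl

which should report the MACControlFramesTransmitted /
MACControlFramesReceived values filled in by the new callback.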
Signed-off-by: Christian Marangi
Link: https://patch.msgid.link/20250410163022.3695-4-ansuelsmth@gmail.com
Signed-off-by: Paolo Abeni
---
 drivers/net/dsa/mt7530.c | 15 +++++++++++++--
 drivers/net/dsa/mt7530.h |  2 ++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 54a6ddc380e97..f183a604355e4 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -43,7 +43,6 @@ static const struct mt7530_mib_desc mt7530_mib[] = {
 	MIB_DESC(1, 0x20, "TxDeferred"),
 	MIB_DESC(1, 0x24, "TxLateCollision"),
 	MIB_DESC(1, 0x28, "TxExcessiveCollistion"),
-	MIB_DESC(1, 0x2c, "TxPause"),
 	MIB_DESC(2, 0x48, "TxBytes"),
 	MIB_DESC(1, 0x60, "RxDrop"),
 	MIB_DESC(1, 0x64, "RxFiltering"),
@@ -52,7 +51,6 @@ static const struct mt7530_mib_desc mt7530_mib[] = {
 	MIB_DESC(1, 0x70, "RxBroadcast"),
 	MIB_DESC(1, 0x74, "RxAlignErr"),
 	MIB_DESC(1, 0x78, "RxCrcErr"),
-	MIB_DESC(1, 0x8c, "RxPause"),
 	MIB_DESC(2, 0xa8, "RxBytes"),
 	MIB_DESC(1, 0xb0, "RxCtrlDrop"),
 	MIB_DESC(1, 0xb4, "RxIngressDrop"),
@@ -867,6 +865,18 @@ static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port,
 	*ranges = mt7530_rmon_ranges;
 }
 
+static void mt7530_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
+				      struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct mt7530_priv *priv = ds->priv;
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_PAUSE, 1,
+			       &ctrl_stats->MACControlFramesTransmitted);
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_PAUSE, 1,
+			       &ctrl_stats->MACControlFramesReceived);
+}
+
 static int
 mt7530_set_ageing_time(struct dsa_switch *ds, unsigned int msecs)
 {
@@ -3154,6 +3164,7 @@ const struct dsa_switch_ops mt7530_switch_ops = {
 	.get_ethtool_stats	= mt7530_get_ethtool_stats,
 	.get_sset_count		= mt7530_get_sset_count,
 	.get_rmon_stats		= mt7530_get_rmon_stats,
+	.get_eth_ctrl_stats	= mt7530_get_eth_ctrl_stats,
 	.set_ageing_time	= mt7530_set_ageing_time,
 	.port_enable		= mt7530_port_enable,
 	.port_disable		= mt7530_port_disable,
diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h
index 9bc90d1678f7e..a651ad29b750f 100644
--- a/drivers/net/dsa/mt7530.h
+++ b/drivers/net/dsa/mt7530.h
@@ -424,6 +424,7 @@ enum mt7530_vlan_port_acc_frm {
 /* Register for MIB */
 #define MT7530_PORT_MIB_COUNTER(x)	(0x4000 + (x) * 0x100)
 /* Each define is an offset of MT7530_PORT_MIB_COUNTER */
+#define MT7530_PORT_MIB_TX_PAUSE		0x2c
 #define MT7530_PORT_MIB_TX_PKT_SZ_64		0x30
 #define MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127	0x34
 #define MT7530_PORT_MIB_TX_PKT_SZ_128_TO_255	0x38
@@ -434,6 +435,7 @@ enum mt7530_vlan_port_acc_frm {
 #define MT7530_PORT_MIB_RX_FRAG_ERR		0x80
 #define MT7530_PORT_MIB_RX_OVER_SZ_ERR		0x84
 #define MT7530_PORT_MIB_RX_JABBER_ERR		0x88
+#define MT7530_PORT_MIB_RX_PAUSE		0x8c
 #define MT7530_PORT_MIB_RX_PKT_SZ_64		0x90
 #define MT7530_PORT_MIB_RX_PKT_SZ_65_TO_127	0x94
 #define MT7530_PORT_MIB_RX_PKT_SZ_128_TO_255	0x98

From dcf9eb6d33a2b030d96ab935a9e9f06715a951db Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Thu, 10 Apr 2025 18:30:12 +0200
Subject: [PATCH 178/770] net: dsa: mt7530: move pkt stats and err MIB counter
 to eth_mac stats API

Drop custom handling of the TX/RX packet stats and error MIB counters
and handle them in the standard .get_eth_mac_stats API.

The MIB entries are dropped from the custom MIB table and converted to
defines providing only the MIB offset.
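Since the hardware keeps no aggregate good-frames counter, the new
callback synthesizes one from the per-type counters. Schematically,
with made-up values:

#include <linux/types.h>

/* Sketch of the arithmetic in the .get_eth_mac_stats callback below:
 * the IEEE FramesTransmittedOK aggregate is the sum of the per-type
 * MIB counters (values here are hypothetical).
 */
static void demo_frames_tx_ok(void)
{
	u64 tx_uc = 1000;	/* TxUnicast */
	u64 tx_mc = 20;		/* TxMulticast */
	u64 tx_bc = 5;		/* TxBroadcast */
	u64 frames_tx_ok = tx_uc + tx_mc + tx_bc;	/* == 1025 */
}

Because the three registers are read back to back rather than
atomically, the synthesized total can be transiently off by a few
frames under load, which is generally acceptable for MIB-style
statistics.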
Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250410163022.3695-5-ansuelsmth@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 70 ++++++++++++++++++++++++++++++++-------- drivers/net/dsa/mt7530.h | 14 ++++++++ 2 files changed, 70 insertions(+), 14 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index f183a604355e4..2202c657930e4 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -34,24 +34,10 @@ static struct mt753x_pcs *pcs_to_mt753x_pcs(struct phylink_pcs *pcs) static const struct mt7530_mib_desc mt7530_mib[] = { MIB_DESC(1, 0x00, "TxDrop"), MIB_DESC(1, 0x04, "TxCrcErr"), - MIB_DESC(1, 0x08, "TxUnicast"), - MIB_DESC(1, 0x0c, "TxMulticast"), - MIB_DESC(1, 0x10, "TxBroadcast"), MIB_DESC(1, 0x14, "TxCollision"), - MIB_DESC(1, 0x18, "TxSingleCollision"), - MIB_DESC(1, 0x1c, "TxMultipleCollision"), - MIB_DESC(1, 0x20, "TxDeferred"), - MIB_DESC(1, 0x24, "TxLateCollision"), - MIB_DESC(1, 0x28, "TxExcessiveCollistion"), - MIB_DESC(2, 0x48, "TxBytes"), MIB_DESC(1, 0x60, "RxDrop"), MIB_DESC(1, 0x64, "RxFiltering"), - MIB_DESC(1, 0x68, "RxUnicast"), - MIB_DESC(1, 0x6c, "RxMulticast"), - MIB_DESC(1, 0x70, "RxBroadcast"), - MIB_DESC(1, 0x74, "RxAlignErr"), MIB_DESC(1, 0x78, "RxCrcErr"), - MIB_DESC(2, 0xa8, "RxBytes"), MIB_DESC(1, 0xb0, "RxCtrlDrop"), MIB_DESC(1, 0xb4, "RxIngressDrop"), MIB_DESC(1, 0xb8, "RxArlDrop"), @@ -811,6 +797,61 @@ mt7530_get_sset_count(struct dsa_switch *ds, int port, int sset) return ARRAY_SIZE(mt7530_mib); } +static void mt7530_get_eth_mac_stats(struct dsa_switch *ds, int port, + struct ethtool_eth_mac_stats *mac_stats) +{ + struct mt7530_priv *priv = ds->priv; + + /* MIB counter doesn't provide a FramesTransmittedOK but instead + * provide stats for Unicast, Broadcast and Multicast frames separately. 
+ * To simulate a global frame counter, read Unicast and addition Multicast + * and Broadcast later + */ + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_UNICAST, 1, + &mac_stats->FramesTransmittedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_SINGLE_COLLISION, 1, + &mac_stats->SingleCollisionFrames); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTIPLE_COLLISION, 1, + &mac_stats->MultipleCollisionFrames); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNICAST, 1, + &mac_stats->FramesReceivedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BYTES, 2, + &mac_stats->OctetsTransmittedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_ALIGN_ERR, 1, + &mac_stats->AlignmentErrors); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_DEFERRED, 1, + &mac_stats->FramesWithDeferredXmissions); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_LATE_COLLISION, 1, + &mac_stats->LateCollisions); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_EXCESSIVE_COLLISION, 1, + &mac_stats->FramesAbortedDueToXSColls); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BYTES, 2, + &mac_stats->OctetsReceivedOK); + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTICAST, 1, + &mac_stats->MulticastFramesXmittedOK); + mac_stats->FramesTransmittedOK += mac_stats->MulticastFramesXmittedOK; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BROADCAST, 1, + &mac_stats->BroadcastFramesXmittedOK); + mac_stats->FramesTransmittedOK += mac_stats->BroadcastFramesXmittedOK; + + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_MULTICAST, 1, + &mac_stats->MulticastFramesReceivedOK); + mac_stats->FramesReceivedOK += mac_stats->MulticastFramesReceivedOK; + mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BROADCAST, 1, + &mac_stats->BroadcastFramesReceivedOK); + mac_stats->FramesReceivedOK += mac_stats->BroadcastFramesReceivedOK; +} + static const struct ethtool_rmon_hist_range mt7530_rmon_ranges[] = { { 0, 64 }, { 65, 127 }, @@ -3163,6 +3204,7 @@ const struct dsa_switch_ops mt7530_switch_ops = { .get_strings = mt7530_get_strings, .get_ethtool_stats = mt7530_get_ethtool_stats, .get_sset_count = mt7530_get_sset_count, + .get_eth_mac_stats = mt7530_get_eth_mac_stats, .get_rmon_stats = mt7530_get_rmon_stats, .get_eth_ctrl_stats = mt7530_get_eth_ctrl_stats, .set_ageing_time = mt7530_set_ageing_time, diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index a651ad29b750f..0cc999fa13808 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -424,6 +424,14 @@ enum mt7530_vlan_port_acc_frm { /* Register for MIB */ #define MT7530_PORT_MIB_COUNTER(x) (0x4000 + (x) * 0x100) /* Each define is an offset of MT7530_PORT_MIB_COUNTER */ +#define MT7530_PORT_MIB_TX_UNICAST 0x08 +#define MT7530_PORT_MIB_TX_MULTICAST 0x0c +#define MT7530_PORT_MIB_TX_BROADCAST 0x10 +#define MT7530_PORT_MIB_TX_SINGLE_COLLISION 0x18 +#define MT7530_PORT_MIB_TX_MULTIPLE_COLLISION 0x1c +#define MT7530_PORT_MIB_TX_DEFERRED 0x20 +#define MT7530_PORT_MIB_TX_LATE_COLLISION 0x24 +#define MT7530_PORT_MIB_TX_EXCESSIVE_COLLISION 0x28 #define MT7530_PORT_MIB_TX_PAUSE 0x2c #define MT7530_PORT_MIB_TX_PKT_SZ_64 0x30 #define MT7530_PORT_MIB_TX_PKT_SZ_65_TO_127 0x34 @@ -431,6 +439,11 @@ enum mt7530_vlan_port_acc_frm { #define MT7530_PORT_MIB_TX_PKT_SZ_256_TO_511 0x3c #define MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023 0x40 #define MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX 0x44 +#define MT7530_PORT_MIB_TX_BYTES 0x48 /* 64 bytes */ +#define 
MT7530_PORT_MIB_RX_UNICAST 0x68 +#define MT7530_PORT_MIB_RX_MULTICAST 0x6c +#define MT7530_PORT_MIB_RX_BROADCAST 0x70 +#define MT7530_PORT_MIB_RX_ALIGN_ERR 0x74 #define MT7530_PORT_MIB_RX_UNDER_SIZE_ERR 0x7c #define MT7530_PORT_MIB_RX_FRAG_ERR 0x80 #define MT7530_PORT_MIB_RX_OVER_SZ_ERR 0x84 @@ -442,6 +455,7 @@ enum mt7530_vlan_port_acc_frm { #define MT7530_PORT_MIB_RX_PKT_SZ_256_TO_511 0x9c #define MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023 0xa0 #define MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX 0xa4 +#define MT7530_PORT_MIB_RX_BYTES 0xa8 /* 64 bytes */ #define MT7530_MIB_CCR 0x4fe0 #define CCR_MIB_ENABLE BIT(31) #define CCR_RX_OCT_CNT_GOOD BIT(7) From c3b904c6dd814507c70c2c5c24b3ba359ff41d0a Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Thu, 10 Apr 2025 18:30:13 +0200 Subject: [PATCH 179/770] net: dsa: mt7530: move remaining MIB counter to define For consistency with the other MIB counter, move also the remaining MIB counter to define and update the custom MIB table. Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250410163022.3695-6-ansuelsmth@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/mt7530.c | 18 +++++++++--------- drivers/net/dsa/mt7530.h | 9 +++++++++ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 2202c657930e4..fdceefb2083cd 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -32,15 +32,15 @@ static struct mt753x_pcs *pcs_to_mt753x_pcs(struct phylink_pcs *pcs) /* String, offset, and register size in bytes if different from 4 bytes */ static const struct mt7530_mib_desc mt7530_mib[] = { - MIB_DESC(1, 0x00, "TxDrop"), - MIB_DESC(1, 0x04, "TxCrcErr"), - MIB_DESC(1, 0x14, "TxCollision"), - MIB_DESC(1, 0x60, "RxDrop"), - MIB_DESC(1, 0x64, "RxFiltering"), - MIB_DESC(1, 0x78, "RxCrcErr"), - MIB_DESC(1, 0xb0, "RxCtrlDrop"), - MIB_DESC(1, 0xb4, "RxIngressDrop"), - MIB_DESC(1, 0xb8, "RxArlDrop"), + MIB_DESC(1, MT7530_PORT_MIB_TX_DROP, "TxDrop"), + MIB_DESC(1, MT7530_PORT_MIB_TX_CRC_ERR, "TxCrcErr"), + MIB_DESC(1, MT7530_PORT_MIB_TX_COLLISION, "TxCollision"), + MIB_DESC(1, MT7530_PORT_MIB_RX_DROP, "RxDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_FILTERING, "RxFiltering"), + MIB_DESC(1, MT7530_PORT_MIB_RX_CRC_ERR, "RxCrcErr"), + MIB_DESC(1, MT7530_PORT_MIB_RX_CTRL_DROP, "RxCtrlDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_INGRESS_DROP, "RxIngressDrop"), + MIB_DESC(1, MT7530_PORT_MIB_RX_ARL_DROP, "RxArlDrop"), }; static void diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index 0cc999fa13808..d4b838a055ad9 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -424,9 +424,12 @@ enum mt7530_vlan_port_acc_frm { /* Register for MIB */ #define MT7530_PORT_MIB_COUNTER(x) (0x4000 + (x) * 0x100) /* Each define is an offset of MT7530_PORT_MIB_COUNTER */ +#define MT7530_PORT_MIB_TX_DROP 0x00 +#define MT7530_PORT_MIB_TX_CRC_ERR 0x04 #define MT7530_PORT_MIB_TX_UNICAST 0x08 #define MT7530_PORT_MIB_TX_MULTICAST 0x0c #define MT7530_PORT_MIB_TX_BROADCAST 0x10 +#define MT7530_PORT_MIB_TX_COLLISION 0x14 #define MT7530_PORT_MIB_TX_SINGLE_COLLISION 0x18 #define MT7530_PORT_MIB_TX_MULTIPLE_COLLISION 0x1c #define MT7530_PORT_MIB_TX_DEFERRED 0x20 @@ -440,10 +443,13 @@ enum mt7530_vlan_port_acc_frm { #define MT7530_PORT_MIB_TX_PKT_SZ_512_TO_1023 0x40 #define MT7530_PORT_MIB_TX_PKT_SZ_1024_TO_MAX 0x44 #define MT7530_PORT_MIB_TX_BYTES 0x48 /* 64 bytes */ +#define MT7530_PORT_MIB_RX_DROP 0x60 +#define MT7530_PORT_MIB_RX_FILTERING 0x64 #define MT7530_PORT_MIB_RX_UNICAST 
0x68
 #define MT7530_PORT_MIB_RX_MULTICAST		0x6c
 #define MT7530_PORT_MIB_RX_BROADCAST		0x70
 #define MT7530_PORT_MIB_RX_ALIGN_ERR		0x74
+#define MT7530_PORT_MIB_RX_CRC_ERR		0x78
 #define MT7530_PORT_MIB_RX_UNDER_SIZE_ERR	0x7c
 #define MT7530_PORT_MIB_RX_FRAG_ERR		0x80
 #define MT7530_PORT_MIB_RX_OVER_SZ_ERR		0x84
@@ -456,6 +462,9 @@ enum mt7530_vlan_port_acc_frm {
 #define MT7530_PORT_MIB_RX_PKT_SZ_512_TO_1023	0xa0
 #define MT7530_PORT_MIB_RX_PKT_SZ_1024_TO_MAX	0xa4
 #define MT7530_PORT_MIB_RX_BYTES		0xa8 /* 64 bytes */
+#define MT7530_PORT_MIB_RX_CTRL_DROP		0xb0
+#define MT7530_PORT_MIB_RX_INGRESS_DROP		0xb4
+#define MT7530_PORT_MIB_RX_ARL_DROP		0xb8
 #define MT7530_MIB_CCR			0x4fe0
 #define CCR_MIB_ENABLE			BIT(31)
 #define CCR_RX_OCT_CNT_GOOD		BIT(7)

From 88c810f35ed543dc3ddc3e4d888d89f9e8ecd7cb Mon Sep 17 00:00:00 2001
From: Christian Marangi
Date: Thu, 10 Apr 2025 18:30:14 +0200
Subject: [PATCH 180/770] net: dsa: mt7530: implement .get_stats64

It was reported that the internally calculated counters might differ
from the real ones in the Switch MIB. This can happen if the switch
forwards packets directly between ports or offloads small packets such
as ARP requests. In such cases the kernel counters desync from what the
switch actually transmitted and received.

To provide correct information to the kernel, implement .get_stats64,
which reads the current MIB counters directly from the switch
registers.

Signed-off-by: Christian Marangi
Link: https://patch.msgid.link/20250410163022.3695-7-ansuelsmth@gmail.com
Signed-off-by: Paolo Abeni
---
 drivers/net/dsa/mt7530.c | 46 ++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index fdceefb2083cd..0a33ca1dd7ca1 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -906,6 +906,51 @@ static void mt7530_get_rmon_stats(struct dsa_switch *ds, int port,
 	*ranges = mt7530_rmon_ranges;
 }
 
+static void mt7530_get_stats64(struct dsa_switch *ds, int port,
+			       struct rtnl_link_stats64 *storage)
+{
+	struct mt7530_priv *priv = ds->priv;
+	uint64_t data;
+
+	/* MIB counter doesn't provide a FramesTransmittedOK but instead
+	 * provide stats for Unicast, Broadcast and Multicast frames separately.
+	 * To simulate a global frame counter, read Unicast and addition Multicast
+	 * and Broadcast later
+	 */
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_UNICAST, 1,
+			       &storage->rx_packets);
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_MULTICAST, 1,
+			       &storage->multicast);
+	storage->rx_packets += storage->multicast;
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BROADCAST, 1,
+			       &data);
+	storage->rx_packets += data;
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_UNICAST, 1,
+			       &storage->tx_packets);
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_MULTICAST, 1,
+			       &data);
+	storage->tx_packets += data;
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BROADCAST, 1,
+			       &data);
+	storage->tx_packets += data;
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_BYTES, 2,
+			       &storage->rx_bytes);
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_BYTES, 2,
+			       &storage->tx_bytes);
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_DROP, 1,
+			       &storage->rx_dropped);
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_TX_DROP, 1,
+			       &storage->tx_dropped);
+
+	mt7530_read_port_stats(priv, port, MT7530_PORT_MIB_RX_CRC_ERR, 1,
+			       &storage->rx_crc_errors);
+}
+
 static void mt7530_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
 				      struct ethtool_eth_ctrl_stats *ctrl_stats)
 {
@@ -3207,6 +3252,7 @@ const struct dsa_switch_ops mt7530_switch_ops = {
 	.get_eth_mac_stats	= mt7530_get_eth_mac_stats,
 	.get_rmon_stats		= mt7530_get_rmon_stats,
 	.get_eth_ctrl_stats	= mt7530_get_eth_ctrl_stats,
+	.get_stats64		= mt7530_get_stats64,
 	.set_ageing_time	= mt7530_set_ageing_time,
 	.port_enable		= mt7530_port_enable,
 	.port_disable		= mt7530_port_disable,

From 8c9b406ff470a8346232eec979dc84f518aabfd9 Mon Sep 17 00:00:00 2001
From: Kevin Paul Reddy Janagari
Date: Fri, 11 Apr 2025 14:20:10 +0530
Subject: [PATCH 181/770] tipc: Removing deprecated strncpy()

This patch replaces strncpy() with strscpy(), as recommended by
Documentation/process/deprecated. strncpy() fails to guarantee NUL
termination, and it zero-pads the destination, which is wasteful for
short strings and may cause performance issues. strscpy() is the
preferred replacement because it overcomes these limitations.
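The semantic difference is easiest to see side by side; a minimal
sketch on an 8-byte buffer (kernel semantics, with strscpy() from
<linux/string.h>):

#include <linux/string.h>
#include <linux/types.h>

static void demo_strscpy(void)
{
	char buf[8];
	ssize_t n;

	/* strncpy(): leaves dest without a NUL terminator when src does
	 * not fit, and zero-pads the remainder of dest when src is short
	 */
	strncpy(buf, "0123456789", sizeof(buf));  /* "01234567", no NUL */

	/* strscpy(): always NUL-terminates and reports truncation */
	n = strscpy(buf, "0123456789", sizeof(buf));
	/* buf == "0123456", n == -E2BIG */

	n = strscpy(buf, "ok", sizeof(buf));
	/* buf == "ok", n == 2, no zero padding of the tail */
}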
Compile tested.

Signed-off-by: Kevin Paul Reddy Janagari
Reviewed-by: Tung Nguyen
Tested-by: Tung Nguyen
Link: https://patch.msgid.link/20250411085010.6249-1-kevinpaul468@gmail.com
Signed-off-by: Paolo Abeni
---
 net/tipc/link.c | 2 +-
 net/tipc/node.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 18be6ff4c3db0..3ee44d7317004 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2228,7 +2228,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
 			break;
 		if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
 			break;
-		strncpy(if_name, data, TIPC_MAX_IF_NAME);
+		strscpy(if_name, data, TIPC_MAX_IF_NAME);
 
 		/* Update own tolerance if peer indicates a non-zero value */
 		if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
diff --git a/net/tipc/node.c b/net/tipc/node.c
index ccf5e427f43eb..cb43f2016a708 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1581,7 +1581,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
 	tipc_node_read_lock(node);
 	link = node->links[bearer_id].link;
 	if (link) {
-		strncpy(linkname, tipc_link_name(link), len);
+		strscpy(linkname, tipc_link_name(link), len);
 		err = 0;
 	}
 	tipc_node_read_unlock(node);

From e8a1bd8344054ce27bebf59f48e3f6bc10bc419b Mon Sep 17 00:00:00 2001
From: Hari Kalavakunta
Date: Wed, 9 Apr 2025 18:23:08 -0700
Subject: [PATCH 182/770] net: ncsi: Fix GCPS 64-bit member variables

Correct the Get Controller Packet Statistics (GCPS) 64-bit wide member
variables, as per DSP0222 v1.0.0 and forward specs. The driver
currently collects these stats, but they are yet to be exposed to the
user; therefore, there is no user impact.

Statistics fixes:
Total Bytes Received (byte range 28..35)
Total Bytes Transmitted (byte range 36..43)
Total Unicast Packets Received (byte range 44..51)
Total Multicast Packets Received (byte range 52..59)
Total Broadcast Packets Received (byte range 60..67)
Total Unicast Packets Transmitted (byte range 68..75)
Total Multicast Packets Transmitted (byte range 76..83)
Total Broadcast Packets Transmitted (byte range 84..91)
Valid Bytes Received (byte range 204..211)

Signed-off-by: Hari Kalavakunta
Reviewed-by: Paul Fertser
Link: https://patch.msgid.link/20250410012309.1343-1-kalavakunta.hari.prasad@gmail.com
Signed-off-by: Paolo Abeni
---
 net/ncsi/internal.h | 21 ++++++++++-----------
 net/ncsi/ncsi-pkt.h | 23 +++++++++++------------
 net/ncsi/ncsi-rsp.c | 21 ++++++++++-----------
 3 files changed, 31 insertions(+), 34 deletions(-)

diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index 4e0842df5234e..2c260f33b55cc 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter {
 };
 
 struct ncsi_channel_stats {
-	u32 hnc_cnt_hi;		/* Counter cleared */
-	u32 hnc_cnt_lo;		/* Counter cleared */
-	u32 hnc_rx_bytes;	/* Rx bytes */
-	u32 hnc_tx_bytes;	/* Tx bytes */
-	u32 hnc_rx_uc_pkts;	/* Rx UC packets */
-	u32 hnc_rx_mc_pkts;	/* Rx MC packets */
-	u32 hnc_rx_bc_pkts;	/* Rx BC packets */
-	u32 hnc_tx_uc_pkts;	/* Tx UC packets */
-	u32 hnc_tx_mc_pkts;	/* Tx MC packets */
-	u32 hnc_tx_bc_pkts;	/* Tx BC packets */
+	u64 hnc_cnt;		/* Counter cleared */
+	u64 hnc_rx_bytes;	/* Rx bytes */
+	u64 hnc_tx_bytes;	/* Tx bytes */
+	u64 hnc_rx_uc_pkts;	/* Rx UC packets */
+	u64 hnc_rx_mc_pkts;	/* Rx MC packets */
+	u64 hnc_rx_bc_pkts;	/* Rx BC packets */
+	u64 hnc_tx_uc_pkts;	/* Tx UC packets */
+	u64 hnc_tx_mc_pkts;	/* Tx MC packets */
+	u64 hnc_tx_bc_pkts;	/* Tx BC packets */
 	u32 hnc_fcs_err;	/* FCS errors */
 	u32
hnc_align_err; /* Alignment errors */ u32 hnc_false_carrier; /* False carrier detection */ @@ -181,7 +180,7 @@ struct ncsi_channel_stats { u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ - u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u64 hnc_rx_valid_bytes; /* Rx valid bytes */ u32 hnc_rx_runt_pkts; /* Rx error runt packets */ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ u32 ncsi_rx_cmds; /* Rx NCSI commands */ diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index f2f3b5c1b9412..24edb27379724 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt { /* Get Controller Packet Statistics */ struct ncsi_rsp_gcps_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 cnt_hi; /* Counter cleared */ - __be32 cnt_lo; /* Counter cleared */ - __be32 rx_bytes; /* Rx bytes */ - __be32 tx_bytes; /* Tx bytes */ - __be32 rx_uc_pkts; /* Rx UC packets */ - __be32 rx_mc_pkts; /* Rx MC packets */ - __be32 rx_bc_pkts; /* Rx BC packets */ - __be32 tx_uc_pkts; /* Tx UC packets */ - __be32 tx_mc_pkts; /* Tx MC packets */ - __be32 tx_bc_pkts; /* Tx BC packets */ + __be64 cnt; /* Counter cleared */ + __be64 rx_bytes; /* Rx bytes */ + __be64 tx_bytes; /* Tx bytes */ + __be64 rx_uc_pkts; /* Rx UC packets */ + __be64 rx_mc_pkts; /* Rx MC packets */ + __be64 rx_bc_pkts; /* Rx BC packets */ + __be64 tx_uc_pkts; /* Tx UC packets */ + __be64 tx_mc_pkts; /* Tx MC packets */ + __be64 tx_bc_pkts; /* Tx BC packets */ __be32 fcs_err; /* FCS errors */ __be32 align_err; /* Alignment errors */ __be32 false_carrier; /* False carrier detection */ @@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt { __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ - __be32 rx_valid_bytes; /* Rx valid bytes */ + __be64 rx_valid_bytes; /* Rx valid bytes */ __be32 rx_runt_pkts; /* Rx error runt packets */ __be32 rx_jabber_pkts; /* Rx error jabber packets */ __be32 checksum; /* Checksum */ -}; +} __packed __aligned(4); /* Get NCSI Statistics */ struct ncsi_rsp_gns_pkt { diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 4a8ce2949faea..8668888c5a2f9 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -926,16 +926,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) /* Update HNC's statistics */ ncs = &nc->stats; - ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); - ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); - ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); - ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); - ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); - ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); - ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); - ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); - ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); - ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_cnt = be64_to_cpu(rsp->cnt); + ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes); + ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts); ncs->hnc_fcs_err = ntohl(rsp->fcs_err); ncs->hnc_align_err = ntohl(rsp->align_err); ncs->hnc_false_carrier = 
ntohl(rsp->false_carrier);
@@ -964,7 +963,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr)
 	ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames);
 	ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames);
 	ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames);
-	ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes);
+	ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes);
 	ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts);
 	ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts);

From 8982fc03fd630a38c0d0ebddef768b4d4707b659 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Thu, 10 Apr 2025 14:59:54 +0200
Subject: [PATCH 183/770] devlink: add value check to devlink_info_version_put()

Prevent the function from proceeding if there's nothing to print.

Suggested-by: Przemek Kitszel
Reviewed-by: Jiri Pirko
Reviewed-by: Kalesh AP
Tested-by: Bharath R
Signed-off-by: Jedrzej Jagielski
Signed-off-by: Tony Nguyen
---
 net/devlink/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index d6e3db300acbe..02602704bdeaa 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -775,7 +775,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr,
 		req->version_cb(version_name, version_type,
 				req->version_cb_priv);
 
-	if (!req->msg)
+	if (!req->msg || !*version_value)
 		return 0;
 
 	nest = nla_nest_start_noflag(req->msg, attr);

From fd5ef5203ce645f72a37b02a89f2eb20827f7d52 Mon Sep 17 00:00:00 2001
From: Przemek Kitszel
Date: Thu, 10 Apr 2025 14:59:55 +0200
Subject: [PATCH 184/770] ixgbe: wrap netdev_priv() usage

Wrap use of netdev_priv() in order to change the allocator of the
device private structure from alloc_etherdev_mq() to devlink in the
next commit. All but one of the netdev_priv() calls in the whole driver
are replaced; the remaining one is called on a MACVLAN (so not an
ixgbe) device.
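The point of the wrapper is that the follow-up commit can change the
private-data lookup in exactly one place. A sketch of the before/after
shape (the "after" body is a hypothetical illustration of the
direction, not the actual follow-up commit):

/* Before the conversion: the adapter is the netdev's private area. */
static inline struct ixgbe_adapter *ixgbe_from_netdev(struct net_device *netdev)
{
	return netdev_priv(netdev);
}

/* After moving the allocation to devlink, only this helper would have
 * to change, e.g. if the netdev private area were reduced to a pointer
 * to the devlink-allocated adapter (hypothetical, replacing the
 * version above):
 */
static inline struct ixgbe_adapter *ixgbe_from_netdev(struct net_device *netdev)
{
	struct ixgbe_adapter **priv = netdev_priv(netdev);

	return *priv;
}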
Signed-off-by: Przemek Kitszel Tested-by: Bharath R Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 5 ++ .../net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c | 56 +++++++------- .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 74 +++++++++---------- drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c | 12 +-- .../net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 10 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 74 +++++++++---------- .../net/ethernet/intel/ixgbe/ixgbe_sriov.c | 16 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- 8 files changed, 127 insertions(+), 122 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index e6a380d4929bd..0efd6da874a50 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -830,6 +830,11 @@ struct ixgbe_adapter { spinlock_t vfs_lock; }; +static inline struct ixgbe_adapter *ixgbe_from_netdev(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + static inline int ixgbe_determine_xdp_q_idx(int cpu) { if (static_key_enabled(&ixgbe_xdp_locking_key)) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c index 19d6b6fa8fb38..3dd5a16a14df1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_dcb_nl.c @@ -118,14 +118,14 @@ static int ixgbe_copy_dcb_cfg(struct ixgbe_adapter *adapter, int tc_max) static u8 ixgbe_dcbnl_get_state(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); return !!(adapter->flags & IXGBE_FLAG_DCB_ENABLED); } static u8 ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); /* Fail command if not in CEE mode */ if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_CEE)) @@ -142,7 +142,7 @@ static u8 ixgbe_dcbnl_set_state(struct net_device *netdev, u8 state) static void ixgbe_dcbnl_get_perm_hw_addr(struct net_device *netdev, u8 *perm_addr) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); int i, j; memset(perm_addr, 0xff, MAX_ADDR_LEN); @@ -167,7 +167,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 prio, u8 bwg_id, u8 bw_pct, u8 up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (prio != DCB_ATTR_VALUE_UNDEFINED) adapter->temp_dcb_cfg.tc_config[tc].path[0].prio_type = prio; @@ -184,7 +184,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_tx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_set_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id, u8 bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.bw_percentage[0][bwg_id] = bw_pct; } @@ -193,7 +193,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, int tc, u8 prio, u8 bwg_id, u8 bw_pct, u8 up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); if (prio != DCB_ATTR_VALUE_UNDEFINED) adapter->temp_dcb_cfg.tc_config[tc].path[1].prio_type = prio; @@ -210,7 +210,7 @@ static void ixgbe_dcbnl_set_pg_tc_cfg_rx(struct net_device *netdev, int tc, static void 
ixgbe_dcbnl_set_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, u8 bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.bw_percentage[1][bwg_id] = bw_pct; } @@ -219,7 +219,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, u8 *prio, u8 *bwg_id, u8 *bw_pct, u8 *up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *prio = adapter->dcb_cfg.tc_config[tc].path[0].prio_type; *bwg_id = adapter->dcb_cfg.tc_config[tc].path[0].bwg_id; @@ -230,7 +230,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_tx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_get_pg_bwg_cfg_tx(struct net_device *netdev, int bwg_id, u8 *bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *bw_pct = adapter->dcb_cfg.bw_percentage[0][bwg_id]; } @@ -239,7 +239,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int tc, u8 *prio, u8 *bwg_id, u8 *bw_pct, u8 *up_map) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *prio = adapter->dcb_cfg.tc_config[tc].path[1].prio_type; *bwg_id = adapter->dcb_cfg.tc_config[tc].path[1].bwg_id; @@ -250,7 +250,7 @@ static void ixgbe_dcbnl_get_pg_tc_cfg_rx(struct net_device *netdev, int tc, static void ixgbe_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, u8 *bw_pct) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *bw_pct = adapter->dcb_cfg.bw_percentage[1][bwg_id]; } @@ -258,7 +258,7 @@ static void ixgbe_dcbnl_get_pg_bwg_cfg_rx(struct net_device *netdev, int bwg_id, static void ixgbe_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority, u8 setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc = setting; if (adapter->temp_dcb_cfg.tc_config[priority].dcb_pfc != @@ -269,14 +269,14 @@ static void ixgbe_dcbnl_set_pfc_cfg(struct net_device *netdev, int priority, static void ixgbe_dcbnl_get_pfc_cfg(struct net_device *netdev, int priority, u8 *setting) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); *setting = adapter->dcb_cfg.tc_config[priority].dcb_pfc; } static void ixgbe_dcbnl_devreset(struct net_device *dev) { - struct ixgbe_adapter *adapter = netdev_priv(dev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev); while (test_and_set_bit(__IXGBE_RESETTING, &adapter->state)) usleep_range(1000, 2000); @@ -295,7 +295,7 @@ static void ixgbe_dcbnl_devreset(struct net_device *dev) static u8 ixgbe_dcbnl_set_all(struct net_device *netdev) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); struct ixgbe_dcb_config *dcb_cfg = &adapter->dcb_cfg; struct ixgbe_hw *hw = &adapter->hw; int ret = DCB_NO_HW_CHG; @@ -383,7 +383,7 @@ static u8 ixgbe_dcbnl_set_all(struct net_device *netdev) static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap) { - struct ixgbe_adapter *adapter = netdev_priv(netdev); + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); switch (capid) { case DCB_CAP_ATTR_PG: @@ -420,7 +420,7 @@ static u8 ixgbe_dcbnl_getcap(struct net_device *netdev, int capid, u8 *cap) 
 static int ixgbe_dcbnl_getnumtcs(struct net_device *netdev, int tcid, u8 *num)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (adapter->flags & IXGBE_FLAG_DCB_ENABLED) {
 		switch (tcid) {
@@ -447,14 +447,14 @@ static int ixgbe_dcbnl_setnumtcs(struct net_device *netdev, int tcid, u8 num)
 
 static u8 ixgbe_dcbnl_getpfcstate(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	return adapter->dcb_cfg.pfc_mode_enable;
 }
 
 static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	adapter->temp_dcb_cfg.pfc_mode_enable = state;
 }
@@ -471,7 +471,7 @@ static void ixgbe_dcbnl_setpfcstate(struct net_device *netdev, u8 state)
  */
 static int ixgbe_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct dcb_app app = {
 		.selector = idtype,
 		.protocol = id,
@@ -486,7 +486,7 @@ static int ixgbe_dcbnl_getapp(struct net_device *netdev, u8 idtype, u16 id)
 static int ixgbe_dcbnl_ieee_getets(struct net_device *dev,
 				   struct ieee_ets *ets)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ieee_ets *my_ets = adapter->ixgbe_ieee_ets;
 
 	ets->ets_cap = adapter->dcb_cfg.num_tcs.pg_tcs;
@@ -506,7 +506,7 @@ static int ixgbe_dcbnl_ieee_getets(struct net_device *dev,
 static int ixgbe_dcbnl_ieee_setets(struct net_device *dev,
 				   struct ieee_ets *ets)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	int max_frame = dev->mtu + ETH_HLEN + ETH_FCS_LEN;
 	int i, err;
 	__u8 max_tc = 0;
@@ -559,7 +559,7 @@ static int ixgbe_dcbnl_ieee_setets(struct net_device *dev,
 static int ixgbe_dcbnl_ieee_getpfc(struct net_device *dev,
 				   struct ieee_pfc *pfc)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ieee_pfc *my_pfc = adapter->ixgbe_ieee_pfc;
 	int i;
 
@@ -584,7 +584,7 @@ static int ixgbe_dcbnl_ieee_getpfc(struct net_device *dev,
 static int ixgbe_dcbnl_ieee_setpfc(struct net_device *dev,
 				   struct ieee_pfc *pfc)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u8 *prio_tc;
 	int err;
@@ -616,7 +616,7 @@ static int ixgbe_dcbnl_ieee_setpfc(struct net_device *dev,
 static int ixgbe_dcbnl_ieee_setapp(struct net_device *dev,
 				   struct dcb_app *app)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	int err;
 
 	if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
@@ -661,7 +661,7 @@ static int ixgbe_dcbnl_ieee_setapp(struct net_device *dev,
 static int ixgbe_dcbnl_ieee_delapp(struct net_device *dev,
 				   struct dcb_app *app)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	int err;
 
 	if (!(adapter->dcbx_cap & DCB_CAP_DCBX_VER_IEEE))
@@ -705,13 +705,13 @@ static int ixgbe_dcbnl_ieee_delapp(struct net_device *dev,
 
 static u8 ixgbe_dcbnl_getdcbx(struct net_device *dev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	return adapter->dcbx_cap;
 }
 
 static u8 ixgbe_dcbnl_setdcbx(struct net_device *dev, u8 mode)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ieee_ets ets = {0};
 	struct ieee_pfc pfc = {0};
 	int err = 0;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index f03925c1f521a..96aa493471eb0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -213,7 +213,7 @@ static void ixgbe_set_advertising_10gtypes(struct ixgbe_hw *hw,
 static int ixgbe_get_link_ksettings(struct net_device *netdev,
 				    struct ethtool_link_ksettings *cmd)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	ixgbe_link_speed supported_link;
 	bool autoneg = false;
@@ -458,7 +458,7 @@ static int ixgbe_get_link_ksettings(struct net_device *netdev,
 static int ixgbe_set_link_ksettings(struct net_device *netdev,
 				    const struct ethtool_link_ksettings *cmd)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 advertised, old;
 	int err = 0;
@@ -535,7 +535,7 @@ static int ixgbe_set_link_ksettings(struct net_device *netdev,
 static void ixgbe_get_pause_stats(struct net_device *netdev,
 				  struct ethtool_pause_stats *stats)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw_stats *hwstats = &adapter->stats;
 
 	stats->tx_pause_frames = hwstats->lxontxc + hwstats->lxofftxc;
@@ -545,7 +545,7 @@ static void ixgbe_get_pause_stats(struct net_device *netdev,
 static void ixgbe_get_pauseparam(struct net_device *netdev,
 				 struct ethtool_pauseparam *pause)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (ixgbe_device_supports_autoneg_fc(hw) &&
@@ -567,7 +567,7 @@ static void ixgbe_get_pauseparam(struct net_device *netdev,
 static int ixgbe_set_pauseparam(struct net_device *netdev,
 				struct ethtool_pauseparam *pause)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct ixgbe_fc_info fc = hw->fc;
 
@@ -606,13 +606,13 @@ static int ixgbe_set_pauseparam(struct net_device *netdev,
 
 static u32 ixgbe_get_msglevel(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	return adapter->msg_enable;
 }
 
 static void ixgbe_set_msglevel(struct net_device *netdev, u32 data)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	adapter->msg_enable = data;
 }
@@ -627,7 +627,7 @@ static int ixgbe_get_regs_len(struct net_device *netdev)
 static void ixgbe_get_regs(struct net_device *netdev,
 			   struct ethtool_regs *regs, void *p)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 *regs_buff = p;
 	u8 i;
@@ -994,14 +994,14 @@ static void ixgbe_get_regs(struct net_device *netdev,
 
 static int ixgbe_get_eeprom_len(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	return adapter->hw.eeprom.word_size * 2;
 }
 
 static int ixgbe_get_eeprom(struct net_device *netdev,
 			    struct ethtool_eeprom *eeprom, u8 *bytes)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u16 *eeprom_buff;
 	int first_word, last_word, eeprom_len;
@@ -1037,7 +1037,7 @@ static int ixgbe_get_eeprom(struct net_device *netdev,
 static int ixgbe_set_eeprom(struct net_device *netdev,
 			    struct ethtool_eeprom *eeprom, u8 *bytes)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u16 *eeprom_buff;
 	void *ptr;
@@ -1107,7 +1107,7 @@ static int ixgbe_set_eeprom(struct net_device *netdev,
 static void ixgbe_get_drvinfo(struct net_device *netdev,
 			      struct ethtool_drvinfo *drvinfo)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	strscpy(drvinfo->driver, ixgbe_driver_name,
 		sizeof(drvinfo->driver));
@@ -1161,7 +1161,7 @@ static void ixgbe_get_ringparam(struct net_device *netdev,
 				struct kernel_ethtool_ringparam *kernel_ring,
 				struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_ring *tx_ring = adapter->tx_ring[0];
 	struct ixgbe_ring *rx_ring = adapter->rx_ring[0];
 
@@ -1176,7 +1176,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev,
 			       struct kernel_ethtool_ringparam *kernel_ring,
 			       struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_ring *temp_ring;
 	int i, j, err = 0;
 	u32 new_rx_count, new_tx_count;
@@ -1336,7 +1336,7 @@ static int ixgbe_get_sset_count(struct net_device *netdev, int sset)
 static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 				    struct ethtool_stats *stats, u64 *data)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct rtnl_link_stats64 temp;
 	const struct rtnl_link_stats64 *net_stats;
 	unsigned int start;
@@ -1710,7 +1710,7 @@ static int ixgbe_eeprom_test(struct ixgbe_adapter *adapter, u64 *data)
 static irqreturn_t ixgbe_test_intr(int irq, void *data)
 {
 	struct net_device *netdev = (struct net_device *) data;
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	adapter->test_icr |= IXGBE_READ_REG(&adapter->hw, IXGBE_EICR);
 
@@ -2183,7 +2183,7 @@ static int ixgbe_loopback_test(struct ixgbe_adapter *adapter, u64 *data)
 static void ixgbe_diag_test(struct net_device *netdev,
 			    struct ethtool_test *eth_test, u64 *data)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	bool if_running = netif_running(netdev);
 
 	if (ixgbe_removed(adapter->hw.hw_addr)) {
@@ -2306,7 +2306,7 @@ static int ixgbe_wol_exclusion(struct ixgbe_adapter *adapter,
 static void ixgbe_get_wol(struct net_device *netdev,
 			  struct ethtool_wolinfo *wol)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	wol->supported = WAKE_UCAST | WAKE_MCAST |
 			 WAKE_BCAST | WAKE_MAGIC;
@@ -2328,7 +2328,7 @@ static void ixgbe_get_wol(struct net_device *netdev,
 static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (wol->wolopts & (WAKE_PHY | WAKE_ARP | WAKE_MAGICSECURE |
 			    WAKE_FILTER))
@@ -2355,7 +2355,7 @@ static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 
 static int ixgbe_nway_reset(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (netif_running(netdev))
 		ixgbe_reinit_locked(adapter);
@@ -2366,7 +2366,7 @@ static int ixgbe_nway_reset(struct net_device *netdev)
 static int ixgbe_set_phys_id(struct net_device *netdev,
 			     enum ethtool_phys_id_state state)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (!hw->mac.ops.led_on || !hw->mac.ops.led_off)
@@ -2399,7 +2399,7 @@ static int ixgbe_get_coalesce(struct net_device *netdev,
 			      struct kernel_ethtool_coalesce *kernel_coal,
 			      struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	/* only valid if in constant ITR mode */
 	if (adapter->rx_itr_setting <= 1)
@@ -2455,7 +2455,7 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
 			      struct kernel_ethtool_coalesce *kernel_coal,
 			      struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_q_vector *q_vector;
 	int i;
 	u16 tx_itr_param, rx_itr_param, tx_itr_prev;
@@ -2681,7 +2681,7 @@ static int ixgbe_rss_indir_tbl_max(struct ixgbe_adapter *adapter)
 static int ixgbe_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd,
 			   u32 *rule_locs)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	int ret = -EOPNOTSUPP;
 
 	switch (cmd->cmd) {
@@ -3069,7 +3069,7 @@ static int ixgbe_set_rss_hash_opt(struct ixgbe_adapter *adapter,
 static int ixgbe_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	int ret = -EOPNOTSUPP;
 
 	switch (cmd->cmd) {
@@ -3096,7 +3096,7 @@ static u32 ixgbe_get_rxfh_key_size(struct net_device *netdev)
 
 static u32 ixgbe_rss_indir_size(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	return ixgbe_rss_indir_tbl_entries(adapter);
 }
@@ -3116,7 +3116,7 @@ static void ixgbe_get_reta(struct ixgbe_adapter *adapter, u32 *indir)
 static int ixgbe_get_rxfh(struct net_device *netdev,
 			  struct ethtool_rxfh_param *rxfh)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	rxfh->hfunc = ETH_RSS_HASH_TOP;
 
@@ -3134,7 +3134,7 @@ static int ixgbe_set_rxfh(struct net_device *netdev,
 			  struct ethtool_rxfh_param *rxfh,
 			  struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int i;
 	u32 reta_entries = ixgbe_rss_indir_tbl_entries(adapter);
 
@@ -3176,7 +3176,7 @@ static int ixgbe_set_rxfh(struct net_device *netdev,
 static int ixgbe_get_ts_info(struct net_device *dev,
 			     struct kernel_ethtool_ts_info *info)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	/* we always support timestamping disabled */
 	info->rx_filters = BIT(HWTSTAMP_FILTER_NONE);
@@ -3252,7 +3252,7 @@ static unsigned int ixgbe_max_channels(struct ixgbe_adapter *adapter)
 static void ixgbe_get_channels(struct net_device *dev,
 			       struct ethtool_channels *ch)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	/* report maximum channels */
 	ch->max_combined = ixgbe_max_channels(adapter);
@@ -3289,7 +3289,7 @@ static void ixgbe_get_channels(struct net_device *dev,
 static int ixgbe_set_channels(struct net_device *dev,
 			      struct ethtool_channels *ch)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	unsigned int count = ch->combined_count;
 	u8 max_rss_indices = ixgbe_max_rss_indices(adapter);
 
@@ -3327,7 +3327,7 @@ static int ixgbe_set_channels(struct net_device *dev,
 static int ixgbe_get_module_info(struct net_device *dev,
 				 struct ethtool_modinfo *modinfo)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u8 sff8472_rev, addr_mode;
 	bool page_swap = false;
@@ -3373,7 +3373,7 @@ static int ixgbe_get_module_eeprom(struct net_device *dev,
 				   struct ethtool_eeprom *ee,
 				   u8 *data)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	int status = -EFAULT;
 	u8 databyte = 0xFF;
@@ -3469,7 +3469,7 @@ ixgbe_get_eee_fw(struct ixgbe_adapter *adapter, struct ethtool_keee *edata)
 
 static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (!(adapter->flags2 & IXGBE_FLAG2_EEE_CAPABLE))
@@ -3483,7 +3483,7 @@ static int ixgbe_get_eee(struct net_device *netdev, struct ethtool_keee *edata)
 static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct ethtool_keee eee_data;
 	int ret_val;
@@ -3538,7 +3538,7 @@ static int ixgbe_set_eee(struct net_device *netdev, struct ethtool_keee *edata)
 
 static u32 ixgbe_get_priv_flags(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	u32 priv_flags = 0;
 
 	if (adapter->flags2 & IXGBE_FLAG2_RX_LEGACY)
@@ -3555,7 +3555,7 @@ static u32 ixgbe_get_priv_flags(struct net_device *netdev)
 
 static int ixgbe_set_priv_flags(struct net_device *netdev, u32 priv_flags)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	unsigned int flags2 = adapter->flags2;
 	unsigned int i;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
index 955dced844a98..7dcf6ecd157b1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
@@ -56,7 +56,7 @@ int ixgbe_fcoe_ddp_put(struct net_device *netdev, u16 xid)
 	if (xid >= netdev->fcoe_ddp_xid)
 		return 0;
 
-	adapter = netdev_priv(netdev);
+	adapter = ixgbe_from_netdev(netdev);
 	fcoe = &adapter->fcoe;
 	ddp = &fcoe->ddp[xid];
 	if (!ddp->udl)
@@ -153,7 +153,7 @@ static int ixgbe_fcoe_ddp_setup(struct net_device *netdev, u16 xid,
 	if (!netdev || !sgl)
 		return 0;
 
-	adapter = netdev_priv(netdev);
+	adapter = ixgbe_from_netdev(netdev);
 	if (xid >= netdev->fcoe_ddp_xid) {
 		e_warn(drv, "xid=0x%x out-of-range\n", xid);
 		return 0;
@@ -834,7 +834,7 @@ static void ixgbe_fcoe_ddp_disable(struct ixgbe_adapter *adapter)
  */
 int ixgbe_fcoe_enable(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_fcoe *fcoe = &adapter->fcoe;
 
 	atomic_inc(&fcoe->refcnt);
@@ -881,7 +881,7 @@ int ixgbe_fcoe_enable(struct net_device *netdev)
  */
 int ixgbe_fcoe_disable(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (!atomic_dec_and_test(&adapter->fcoe.refcnt))
 		return -EINVAL;
@@ -927,7 +927,7 @@ int ixgbe_fcoe_disable(struct net_device *netdev)
 int ixgbe_fcoe_get_wwn(struct net_device *netdev, u64 *wwn, int type)
 {
 	u16 prefix = 0xffff;
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_mac_info *mac = &adapter->hw.mac;
 
 	switch (type) {
@@ -967,7 +967,7 @@ int ixgbe_fcoe_get_wwn(struct net_device *netdev, u64 *wwn, int type)
 int ixgbe_fcoe_get_hbainfo(struct net_device *netdev,
 			   struct netdev_fcoe_hbainfo *info)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u64 dsn;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
index 07ea1954a276e..648a7c618cd18 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c
@@ -478,7 +478,7 @@ static int ixgbe_ipsec_parse_proto_keys(struct xfrm_state *xs,
 static int ixgbe_ipsec_check_mgmt_ip(struct xfrm_state *xs)
 {
 	struct net_device *dev = xs->xso.real_dev;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 mfval, manc, reg;
 	int num_filters = 4;
@@ -563,7 +563,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs,
 			      struct netlink_ext_ack *extack)
 {
 	struct net_device *dev = xs->xso.real_dev;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_ipsec *ipsec = adapter->ipsec;
 	struct ixgbe_hw *hw = &adapter->hw;
 	int checked, match, first;
@@ -757,7 +757,7 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs,
 static void ixgbe_ipsec_del_sa(struct xfrm_state *xs)
 {
 	struct net_device *dev = xs->xso.real_dev;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_ipsec *ipsec = adapter->ipsec;
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 zerobuf[4] = {0, 0, 0, 0};
@@ -1052,7 +1052,7 @@ int ixgbe_ipsec_tx(struct ixgbe_ring *tx_ring,
 		   struct ixgbe_tx_buffer *first,
 		   struct ixgbe_ipsec_tx_data *itd)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(tx_ring->netdev);
 	struct ixgbe_ipsec *ipsec = adapter->ipsec;
 	struct xfrm_state *xs;
 	struct sec_path *sp;
@@ -1142,7 +1142,7 @@ void ixgbe_ipsec_rx(struct ixgbe_ring *rx_ring,
 		    union ixgbe_adv_rx_desc *rx_desc,
 		    struct sk_buff *skb)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(rx_ring->netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(rx_ring->netdev);
 	__le16 pkt_info = rx_desc->wb.lower.lo_dword.hs_rss.pkt_info;
 	__le16 ipsec_pkt_types = cpu_to_le16(IXGBE_RXDADV_PKTTYPE_IPSEC_AH |
 					     IXGBE_RXDADV_PKTTYPE_IPSEC_ESP);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a2718218963ed..cfa917fdca4ff 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1095,7 +1095,7 @@ static void ixgbe_tx_timeout_reset(struct ixgbe_adapter *adapter)
 static int ixgbe_tx_maxrate(struct net_device *netdev,
 			    int queue_index, u32 maxrate)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 bcnrc_val = ixgbe_link_mbps(adapter);
 
@@ -4678,7 +4678,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
 static int ixgbe_vlan_rx_add_vid(struct net_device *netdev,
 				 __be16 proto, u16 vid)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* add VID to filter table */
@@ -4737,7 +4737,7 @@ void ixgbe_update_pf_promisc_vlvf(struct ixgbe_adapter *adapter, u32 vid)
 static int ixgbe_vlan_rx_kill_vid(struct net_device *netdev,
 				  __be16 proto, u16 vid)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* remove VID from filter table */
@@ -4962,7 +4962,7 @@ static void ixgbe_restore_vlan(struct ixgbe_adapter *adapter)
 **/
 static int ixgbe_write_mc_addr_list(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (!netif_running(netdev))
@@ -5138,7 +5138,7 @@ int ixgbe_del_mac_filter(struct ixgbe_adapter *adapter,
 static int ixgbe_uc_sync(struct net_device *netdev, const unsigned char *addr)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int ret;
 
 	ret = ixgbe_add_mac_filter(adapter, addr, VMDQ_P(0));
@@ -5148,7 +5148,7 @@ static int ixgbe_uc_sync(struct net_device *netdev, const unsigned char *addr)
 static int ixgbe_uc_unsync(struct net_device *netdev, const unsigned char *addr)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	ixgbe_del_mac_filter(adapter, addr, VMDQ_P(0));
 
@@ -5166,7 +5166,7 @@ static int ixgbe_uc_unsync(struct net_device *netdev, const unsigned char *addr)
 **/
 void ixgbe_set_rx_mode(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 fctrl, vmolr = IXGBE_VMOLR_BAM | IXGBE_VMOLR_AUPE;
 	netdev_features_t features = netdev->features;
@@ -5268,7 +5268,7 @@ static void ixgbe_napi_disable_all(struct ixgbe_adapter *adapter)
 
 static int ixgbe_udp_tunnel_sync(struct net_device *dev, unsigned int table)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct udp_tunnel_info ti;
 
@@ -6600,7 +6600,7 @@ static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter)
 **/
 static void ixgbe_tx_timeout(struct net_device *netdev, unsigned int __always_unused txqueue)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	/* Do the reset outside of interrupt context */
 	ixgbe_tx_timeout_reset(adapter);
@@ -7165,7 +7165,7 @@ static int ixgbe_max_xdp_frame_size(struct ixgbe_adapter *adapter)
 **/
 static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (ixgbe_enabled_xdp_adapter(adapter)) {
 		int new_frame_size = new_mtu + IXGBE_PKT_HDR_PAD;
@@ -7212,7 +7212,7 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu)
 **/
 int ixgbe_open(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	int err, queues;
 
@@ -7316,7 +7316,7 @@ static void ixgbe_close_suspend(struct ixgbe_adapter *adapter)
 **/
 int ixgbe_close(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	ixgbe_ptp_stop(adapter);
 
@@ -9001,7 +9001,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 	switch (vlan_get_protocol(skb)) {
 	case htons(ETH_P_FCOE):
 	case htons(ETH_P_FIP):
-		adapter = netdev_priv(dev);
+		adapter = ixgbe_from_netdev(dev);
 
 		if (!sb_dev && (adapter->flags & IXGBE_FLAG_FCOE_ENABLED))
 			break;
@@ -9260,7 +9260,7 @@ static netdev_tx_t __ixgbe_xmit_frame(struct sk_buff *skb,
 				      struct net_device *netdev,
 				      struct ixgbe_ring *ring)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_ring *tx_ring;
 
 	/*
@@ -9292,7 +9292,7 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 **/
 static int ixgbe_set_mac(struct net_device *netdev, void *p)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	struct sockaddr *addr = p;
 
@@ -9310,7 +9310,7 @@ static int ixgbe_set_mac(struct net_device *netdev, void *p)
 static int
 ixgbe_mdio_read(struct net_device *netdev, int prtad, int devad, u16 addr)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 	u16 value;
 	int rc;
@@ -9336,7 +9336,7 @@ ixgbe_mdio_read(struct net_device *netdev, int prtad, int devad, u16 addr)
 static int ixgbe_mdio_write(struct net_device *netdev, int prtad, int devad,
 			    u16 addr, u16 value)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (adapter->mii_bus) {
@@ -9356,7 +9356,7 @@ static int ixgbe_mdio_write(struct net_device *netdev, int prtad, int devad,
 
 static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	switch (cmd) {
 	case SIOCSHWTSTAMP:
@@ -9382,7 +9382,7 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd)
 static int ixgbe_add_sanmac_netdev(struct net_device *dev)
 {
 	int err = 0;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (is_valid_ether_addr(hw->mac.san_addr)) {
@@ -9406,7 +9406,7 @@ static int ixgbe_add_sanmac_netdev(struct net_device *dev)
 static int ixgbe_del_sanmac_netdev(struct net_device *dev)
 {
 	int err = 0;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_mac_info *mac = &adapter->hw.mac;
 
 	if (is_valid_ether_addr(mac->san_addr)) {
@@ -9437,7 +9437,7 @@ static void ixgbe_get_ring_stats64(struct rtnl_link_stats64 *stats,
 static void ixgbe_get_stats64(struct net_device *netdev,
 			      struct rtnl_link_stats64 *stats)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int i;
 
 	rcu_read_lock();
@@ -9480,7 +9480,7 @@ static void ixgbe_get_stats64(struct net_device *netdev,
 static int ixgbe_ndo_get_vf_stats(struct net_device *netdev, int vf,
 				  struct ifla_vf_stats *vf_stats)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (vf < 0 || vf >= adapter->num_vfs)
 		return -EINVAL;
@@ -9597,7 +9597,7 @@ static int ixgbe_reassign_macvlan_pool(struct net_device *vdev,
 
 static void ixgbe_defrag_macvlan_pools(struct net_device *dev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct netdev_nested_priv priv = {
 		.data = (void *)adapter,
 	};
@@ -9618,7 +9618,7 @@ static void ixgbe_defrag_macvlan_pools(struct net_device *dev)
  */
 int ixgbe_setup_tc(struct net_device *dev, u8 tc)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	/* Hardware supports up to 8 traffic classes */
@@ -10176,7 +10176,7 @@ static LIST_HEAD(ixgbe_block_cb_list);
 static int __ixgbe_setup_tc(struct net_device *dev, enum tc_setup_type type,
 			    void *type_data)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	switch (type) {
 	case TC_SETUP_BLOCK:
@@ -10204,7 +10204,7 @@ void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter)
 #endif
 void ixgbe_do_reset(struct net_device *netdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (netif_running(netdev))
 		ixgbe_reinit_locked(adapter);
@@ -10215,7 +10215,7 @@ void ixgbe_do_reset(struct net_device *netdev)
 static netdev_features_t ixgbe_fix_features(struct net_device *netdev,
 					    netdev_features_t features)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	/* If Rx checksum is disabled, then RSC/LRO should also be disabled */
 	if (!(features & NETIF_F_RXCSUM))
@@ -10252,7 +10252,7 @@ static void ixgbe_reset_l2fw_offload(struct ixgbe_adapter *adapter)
 static int ixgbe_set_features(struct net_device *netdev,
 			      netdev_features_t features)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	netdev_features_t changed = netdev->features ^ features;
 	bool need_reset = false;
 
@@ -10328,7 +10328,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
 {
 	/* guarantee we can provide a unique filter for the unicast address */
 	if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) {
-		struct ixgbe_adapter *adapter = netdev_priv(dev);
+		struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 		u16 pool = VMDQ_P(0);
 
 		if (netdev_uc_count(dev) >= ixgbe_available_rars(adapter, pool))
@@ -10416,7 +10416,7 @@ static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
 				    struct nlmsghdr *nlh, u16 flags,
 				    struct netlink_ext_ack *extack)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct nlattr *attr, *br_spec;
 	int rem;
 
@@ -10444,7 +10444,7 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 				    struct net_device *dev,
 				    u32 filter_mask, int nlflags)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
 		return 0;
@@ -10456,7 +10456,7 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
 
 static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(pdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(pdev);
 	struct ixgbe_fwd_adapter *accel;
 	int tcs = adapter->hw_tcs ? : 1;
 	int pool, err;
@@ -10553,7 +10553,7 @@ static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev)
 static void ixgbe_fwd_del(struct net_device *pdev, void *priv)
 {
 	struct ixgbe_fwd_adapter *accel = priv;
-	struct ixgbe_adapter *adapter = netdev_priv(pdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(pdev);
 	unsigned int rxbase = accel->rx_base_queue;
 	unsigned int i;
 
@@ -10631,7 +10631,7 @@ ixgbe_features_check(struct sk_buff *skb, struct net_device *dev,
 static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
 {
 	int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct bpf_prog *old_prog;
 	bool need_reset;
 	int num_queues;
@@ -10703,7 +10703,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog)
 
 static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 
 	switch (xdp->command) {
 	case XDP_SETUP_PROG:
@@ -10738,7 +10738,7 @@ void ixgbe_xdp_ring_update_tail_locked(struct ixgbe_ring *ring)
 static int ixgbe_xdp_xmit(struct net_device *dev, int n,
 			  struct xdp_frame **frames, u32 flags)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_ring *ring;
 	int nxmit = 0;
 	int i;
@@ -11271,7 +11271,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	SET_NETDEV_DEV(netdev, &pdev->dev);
 
-	adapter = netdev_priv(netdev);
+	adapter = ixgbe_from_netdev(netdev);
 
 	adapter->netdev = netdev;
 	adapter->pdev = pdev;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index ccdce80edd142..0dbbd2befd4df 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1418,7 +1418,7 @@ void ixgbe_set_all_vfs(struct ixgbe_adapter *adapter)
 
 int ixgbe_ndo_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int retval;
 
 	if (vf >= adapter->num_vfs)
@@ -1526,7 +1526,7 @@ int ixgbe_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan,
 			  u8 qos, __be16 vlan_proto)
 {
 	int err = 0;
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if ((vf >= adapter->num_vfs) || (vlan > 4095) || (qos > 7))
 		return -EINVAL;
@@ -1644,7 +1644,7 @@ void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter)
 int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate,
 			int max_tx_rate)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int link_speed;
 
 	/* verify VF is active */
@@ -1679,7 +1679,7 @@ int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, int min_tx_rate,
 int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	struct ixgbe_hw *hw = &adapter->hw;
 
 	if (vf >= adapter->num_vfs)
@@ -1757,7 +1757,7 @@ void ixgbe_set_vf_link_state(struct ixgbe_adapter *adapter, int vf, int state)
 **/
 int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 	int ret = 0;
 
 	if (vf < 0 || vf >= adapter->num_vfs) {
@@ -1794,7 +1794,7 @@ int ixgbe_ndo_set_vf_link_state(struct net_device *netdev, int vf, int state)
 int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf,
 				  bool setting)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	/* This operation is currently supported only for 82599 and x540
 	 * devices.
@@ -1813,7 +1813,7 @@ int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf,
 
 int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (vf >= adapter->num_vfs)
 		return -EINVAL;
@@ -1836,7 +1836,7 @@ int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting)
 int ixgbe_ndo_get_vf_config(struct net_device *netdev,
 			    int vf, struct ifla_vf_info *ivi)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
 
 	if (vf >= adapter->num_vfs)
 		return -EINVAL;
 	ivi->vf = vf;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index 3e3b471e53f06..ac58964b2f087 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -508,7 +508,7 @@ bool ixgbe_clean_xdp_tx_irq(struct ixgbe_q_vector *q_vector,
 
 int ixgbe_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 {
-	struct ixgbe_adapter *adapter = netdev_priv(dev);
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(dev);
 	struct ixgbe_ring *ring;
 
 	if (test_bit(__IXGBE_DOWN, &adapter->state))

From a0285236ab93fdfdd1008afaa04561d142d6c276 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Thu, 10 Apr 2025 14:59:56 +0200
Subject: [PATCH 185/770] ixgbe: add initial devlink support

Add initial support for the devlink interface to the ixgbe driver.
As in the i40e driver, the implementation doesn't enable devlink to
manage device-wide configuration. A devlink instance is created for
each physical function of the PCIe device.

Create a separate directory for the devlink-related ixgbe files and
use a naming scheme similar to the one used in the ice driver.

Add a stub for Documentation, to be extended by further patches.

Change the struct ixgbe_adapter allocation to be done by devlink
(Przemek), as suggested by Jiri.
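As a side note, the ownership layout this patch introduces can be sketched
in plain user-space C. The devlink_alloc_sketch()/devlink_priv_sketch()
helpers below are simplified stand-ins for the kernel's devlink_alloc()
and devlink_priv(), written only to show the pointer relationships, not
the real APIs:

    /* Standalone sketch (not kernel code): the adapter is embedded in a
     * devlink-owned allocation, and the netdev private area holds only a
     * small wrapper with a back-pointer to it.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct ixgbe_adapter {
            struct devlink *devlink;        /* back-pointer to owner */
            char name[16];
    };

    struct devlink {                        /* stand-in header */
            size_t priv_size;
            /* driver private area follows the header */
    };

    static struct devlink *devlink_alloc_sketch(size_t priv_size)
    {
            struct devlink *dl = calloc(1, sizeof(*dl) + priv_size);

            if (dl)
                    dl->priv_size = priv_size;
            return dl;
    }

    static void *devlink_priv_sketch(struct devlink *dl)
    {
            return dl + 1;  /* priv area starts right after the header */
    }

    /* netdev priv no longer embeds the adapter, only a wrapper */
    struct ixgbe_netdevice_priv {
            struct ixgbe_adapter *adapter;
    };

    int main(void)
    {
            struct devlink *dl =
                    devlink_alloc_sketch(sizeof(struct ixgbe_adapter));
            struct ixgbe_adapter *adapter = devlink_priv_sketch(dl);
            struct ixgbe_netdevice_priv wrapper = { .adapter = adapter };

            adapter->devlink = dl;
            strcpy(adapter->name, "ixgbe-pf0");

            /* ixgbe_from_netdev() reduces to one extra dereference */
            printf("adapter via wrapper: %s\n", wrapper.adapter->name);

            free(dl);
            return 0;
    }

Allocating the adapter inside the devlink priv area is what lets devlink
own the adapter's lifetime; that is why netdev_priv() shrinks to a
one-pointer wrapper resolved by ixgbe_from_netdev().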
Reviewed-by: Mateusz Polchlopek
Co-developed-by: Przemek Kitszel
Signed-off-by: Przemek Kitszel
Tested-by: Bharath R
Signed-off-by: Jedrzej Jagielski
Signed-off-by: Tony Nguyen
---
 Documentation/networking/devlink/index.rst    |  1 +
 Documentation/networking/devlink/ixgbe.rst    |  8 ++
 drivers/net/ethernet/intel/Kconfig            |  1 +
 drivers/net/ethernet/intel/ixgbe/Makefile     |  3 +-
 .../ethernet/intel/ixgbe/devlink/devlink.c    | 77 +++++++++++++++++++
 .../ethernet/intel/ixgbe/devlink/devlink.h    | 10 +++
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      | 12 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 30 +++++++-
 8 files changed, 138 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/networking/devlink/ixgbe.rst
 create mode 100644 drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
 create mode 100644 drivers/net/ethernet/intel/ixgbe/devlink/devlink.h

diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst
index 948c8c44e233f..8319f43b5933d 100644
--- a/Documentation/networking/devlink/index.rst
+++ b/Documentation/networking/devlink/index.rst
@@ -84,6 +84,7 @@ parameters, info versions, and other features it supports.
    i40e
    ionic
    ice
+   ixgbe
    mlx4
    mlx5
    mlxsw
diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst
new file mode 100644
index 0000000000000..c04ac51c6d859
--- /dev/null
+++ b/Documentation/networking/devlink/ixgbe.rst
@@ -0,0 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+ixgbe devlink support
+=====================
+
+This document describes the devlink features implemented by the ``ixgbe``
+device driver.
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 1640d2f278330..56ee58c9df21f 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -147,6 +147,7 @@ config IXGBE
 	depends on PCI
 	depends on PTP_1588_CLOCK_OPTIONAL
 	select MDIO
+	select NET_DEVLINK
 	select PHYLIB
 	help
 	  This driver supports Intel(R) 10GbE PCI Express family of
diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile
index b456d102655ac..11f37140c0a30 100644
--- a/drivers/net/ethernet/intel/ixgbe/Makefile
+++ b/drivers/net/ethernet/intel/ixgbe/Makefile
@@ -4,12 +4,13 @@
 # Makefile for the Intel(R) 10GbE PCI Express ethernet driver
 #
 
+subdir-ccflags-y += -I$(src)
 obj-$(CONFIG_IXGBE) += ixgbe.o
 
 ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
 	   ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \
 	   ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \
-	   ixgbe_xsk.o ixgbe_e610.o
+	   ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o
 
 ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \
 			     ixgbe_dcb_82599.o ixgbe_dcb_nl.o
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
new file mode 100644
index 0000000000000..6c3452cf5d7d0
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025, Intel Corporation. */
+
+#include "ixgbe.h"
+#include "devlink.h"
+
+static const struct devlink_ops ixgbe_devlink_ops = {
+};
+
+/**
+ * ixgbe_allocate_devlink - Allocate devlink instance
+ * @dev: device to allocate devlink for
+ *
+ * Allocate a devlink instance for this physical function.
+ *
+ * Return: pointer to the device adapter structure on success,
+ * ERR_PTR(-ENOMEM) when allocation failed.
+ */
+struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev)
+{
+	struct ixgbe_adapter *adapter;
+	struct devlink *devlink;
+
+	devlink = devlink_alloc(&ixgbe_devlink_ops, sizeof(*adapter), dev);
+	if (!devlink)
+		return ERR_PTR(-ENOMEM);
+
+	adapter = devlink_priv(devlink);
+	adapter->devlink = devlink;
+
+	return adapter;
+}
+
+/**
+ * ixgbe_devlink_set_switch_id - Set unique switch ID based on PCI DSN
+ * @adapter: pointer to the device adapter structure
+ * @ppid: struct with switch id information
+ */
+static void ixgbe_devlink_set_switch_id(struct ixgbe_adapter *adapter,
+					struct netdev_phys_item_id *ppid)
+{
+	u64 id = pci_get_dsn(adapter->pdev);
+
+	ppid->id_len = sizeof(id);
+	put_unaligned_be64(id, &ppid->id);
+}
+
+/**
+ * ixgbe_devlink_register_port - Register devlink port
+ * @adapter: pointer to the device adapter structure
+ *
+ * Create and register a devlink_port for this physical function.
+ *
+ * Return: 0 on success, error code on failure.
+ */
+int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter)
+{
+	struct devlink_port *devlink_port = &adapter->devlink_port;
+	struct devlink *devlink = adapter->devlink;
+	struct device *dev = &adapter->pdev->dev;
+	struct devlink_port_attrs attrs = {};
+	int err;
+
+	attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+	attrs.phys.port_number = adapter->hw.bus.func;
+	ixgbe_devlink_set_switch_id(adapter, &attrs.switch_id);
+
+	devlink_port_attrs_set(devlink_port, &attrs);
+
+	err = devl_port_register(devlink, devlink_port, 0);
+	if (err) {
+		dev_err(dev,
+			"devlink port registration failed, err %d\n", err);
+	}
+
+	return err;
+}
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h
new file mode 100644
index 0000000000000..0b27653a34072
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025, Intel Corporation. */
+
+#ifndef _IXGBE_DEVLINK_H_
+#define _IXGBE_DEVLINK_H_
+
+struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev);
+int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter);
+
+#endif /* _IXGBE_DEVLINK_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 0efd6da874a50..6cb8772b1ebfa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -17,6 +17,8 @@
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
 
+#include <net/devlink.h>
+
 #include "ixgbe_type.h"
 #include "ixgbe_common.h"
 #include "ixgbe_dcb.h"
@@ -612,6 +614,8 @@ struct ixgbe_adapter {
 	struct bpf_prog *xdp_prog;
 	struct pci_dev *pdev;
 	struct mii_bus *mii_bus;
+	struct devlink *devlink;
+	struct devlink_port devlink_port;
 
 	unsigned long state;
 
@@ -830,9 +834,15 @@ struct ixgbe_adapter {
 	spinlock_t vfs_lock;
 };
 
+struct ixgbe_netdevice_priv {
+	struct ixgbe_adapter *adapter;
+};
+
 static inline struct ixgbe_adapter *ixgbe_from_netdev(struct net_device *netdev)
 {
-	return netdev_priv(netdev);
+	struct ixgbe_netdevice_priv *priv = netdev_priv(netdev);
+
+	return priv->adapter;
 }
 
 static inline int ixgbe_determine_xdp_q_idx(int cpu)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cfa917fdca4ff..0c7be10657a94 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -49,6 +49,7 @@
 #include "ixgbe_sriov.h"
 #include "ixgbe_model.h"
 #include "ixgbe_txrx_common.h"
+#include "devlink/devlink.h"
 
 char ixgbe_driver_name[] = "ixgbe";
 static const char ixgbe_driver_string[] =
@@ -11210,6 +11211,7 @@ static void ixgbe_set_fw_version(struct ixgbe_adapter *adapter)
 static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	struct net_device *netdev;
+	struct ixgbe_netdevice_priv *netdev_priv_wrapper;
 	struct ixgbe_adapter *adapter = NULL;
 	struct ixgbe_hw *hw;
 	const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data];
@@ -11263,7 +11265,13 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		indices = IXGBE_MAX_RSS_INDICES_X550;
 	}
 
-	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);
+	adapter = ixgbe_allocate_devlink(&pdev->dev);
+	if (IS_ERR(adapter)) {
+		err = PTR_ERR(adapter);
+		goto err_devlink;
+	}
+
+	netdev = alloc_etherdev_mq(sizeof(*netdev_priv_wrapper), indices);
 	if (!netdev) {
 		err = -ENOMEM;
 		goto err_alloc_etherdev;
@@ -11271,7 +11279,8 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	SET_NETDEV_DEV(netdev, &pdev->dev);
 
-	adapter = ixgbe_from_netdev(netdev);
+	netdev_priv_wrapper = netdev_priv(netdev);
+	netdev_priv_wrapper->adapter = adapter;
 
 	adapter->netdev = netdev;
 	adapter->pdev = pdev;
@@ -11617,6 +11626,11 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 
 	strcpy(netdev->name, "eth%d");
 	pci_set_drvdata(pdev, adapter);
+
+	devl_lock(adapter->devlink);
+	ixgbe_devlink_register_port(adapter);
+	SET_NETDEV_DEVLINK_PORT(adapter->netdev, &adapter->devlink_port);
+
 	err = register_netdev(netdev);
 	if (err)
 		goto err_register;
@@ -11671,11 +11685,15 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_netdev;
 
+	devl_register(adapter->devlink);
+	devl_unlock(adapter->devlink);
 	return 0;
 
 err_netdev:
 	unregister_netdev(netdev);
 err_register:
+	devl_port_unregister(&adapter->devlink_port);
+	devl_unlock(adapter->devlink);
 	ixgbe_release_hw_control(adapter);
 	ixgbe_clear_interrupt_scheme(adapter);
 	if (hw->mac.type == ixgbe_mac_e610)
@@ -11692,7 +11710,9 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
 	free_netdev(netdev);
 err_alloc_etherdev:
+	devlink_free(adapter->devlink);
 	pci_release_mem_regions(pdev);
+err_devlink:
 err_pci_reg:
 err_dma:
 	if (!adapter || disable_dev)
@@ -11721,6 +11741,8 @@ static void ixgbe_remove(struct pci_dev *pdev)
 		return;
 
 	netdev = adapter->netdev;
+	devl_lock(adapter->devlink);
+	devl_unregister(adapter->devlink);
 	ixgbe_dbg_adapter_exit(adapter);
 
 	set_bit(__IXGBE_REMOVING, &adapter->state);
@@ -11756,6 +11778,10 @@ static void ixgbe_remove(struct pci_dev *pdev)
 	if (netdev->reg_state == NETREG_REGISTERED)
 		unregister_netdev(netdev);
 
+	devl_port_unregister(&adapter->devlink_port);
+	devl_unlock(adapter->devlink);
+	devlink_free(adapter->devlink);
+
 	ixgbe_stop_ipsec_offload(adapter);
 	ixgbe_clear_interrupt_scheme(adapter);
 

From f6b588af3d575acba469f15ef519ae4befb0ad63 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Thu, 10 Apr 2025 14:59:57 +0200
Subject: [PATCH 186/770] ixgbe: add handler for devlink .info_get()

Provide a devlink .info_get() callback implementation to allow the
driver to report detailed version information. The following info
is reported:

 "serial_number" -> The PCI DSN of the adapter
 "fw.bundle_id"  -> Unique identifier for the combined flash image
 "fw.undi"       -> Version of the Option ROM containing the UEFI driver
 "board.id"      -> The PBA ID string

Reviewed-by: Mateusz Polchlopek
Tested-by: Bharath R
Signed-off-by: Jedrzej Jagielski
Signed-off-by: Tony Nguyen
---
 Documentation/networking/devlink/ixgbe.rst         |  32 ++++++
 .../net/ethernet/intel/ixgbe/devlink/devlink.c     | 101 ++++++++++++++++++
 2 files changed, 133 insertions(+)

diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst
index c04ac51c6d859..b63645de37e81 100644
--- a/Documentation/networking/devlink/ixgbe.rst
+++ b/Documentation/networking/devlink/ixgbe.rst
@@ -6,3 +6,35 @@ ixgbe devlink support
 
 This document describes the devlink features implemented by the ``ixgbe``
 device driver.
+
+Info versions
+=============
+
+The ``ixgbe`` driver reports the following versions
+
+.. list-table:: devlink info versions implemented
+    :widths: 5 5 5 90
+
+    * - Name
+      - Type
+      - Example
+      - Description
+    * - ``board.id``
+      - fixed
+      - H49289-000
+      - The Product Board Assembly (PBA) identifier of the board.
+    * - ``fw.undi``
+      - running
+      - 1.1937.0
+      - Version of the Option ROM containing the UEFI driver. The version is
+        reported in ``major.minor.patch`` format. The major version is
+        incremented whenever a major breaking change occurs, or when the
+        minor version would overflow. The minor version is incremented for
+        non-breaking changes and reset to 1 when the major version is
+        incremented. The patch version is normally 0 but is incremented when
+        a fix is delivered as a patch against an older base Option ROM.
+    * - ``fw.bundle_id``
+      - running
+      - 0x80000d0d
+      - Unique identifier of the firmware image file that was loaded onto
+        the device. Also referred to as the EETRACK identifier of the NVM.
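As a side note, the "serial_number" string above is the PCI DSN rendered by
the kernel's %phD format specifier (hex bytes separated by dashes). A
stand-alone sketch of the same formatting, using a made-up DSN value:

    /* Standalone sketch (not kernel code): render a 64-bit PCI DSN the
     * way devlink shows it. The DSN value here is hypothetical.
     */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t dsn = 0x0123456789abcdefULL;   /* made-up DSN */
            uint8_t b[8];
            char buf[32];
            int i;

            for (i = 0; i < 8; i++)         /* big-endian byte order */
                    b[i] = dsn >> (56 - 8 * i);

            snprintf(buf, sizeof(buf),
                     "%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x",
                     b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]);
            printf("serial_number: %s\n", buf);     /* 01-23-45-... */
            return 0;
    }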
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
index 6c3452cf5d7d0..7bbd1733b8dad 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
@@ -4,7 +4,108 @@
 #include "ixgbe.h"
 #include "devlink.h"
 
+struct ixgbe_info_ctx {
+	char buf[128];
+};
+
+static void ixgbe_info_get_dsn(struct ixgbe_adapter *adapter,
+			       struct ixgbe_info_ctx *ctx)
+{
+	u8 dsn[8];
+
+	/* Copy the DSN into an array in Big Endian format */
+	put_unaligned_be64(pci_get_dsn(adapter->pdev), dsn);
+
+	snprintf(ctx->buf, sizeof(ctx->buf), "%8phD", dsn);
+}
+
+static void ixgbe_info_nvm_ver(struct ixgbe_adapter *adapter,
+			       struct ixgbe_info_ctx *ctx)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	struct ixgbe_nvm_version nvm_ver;
+
+	ctx->buf[0] = '\0';
+
+	ixgbe_get_oem_prod_version(hw, &nvm_ver);
+	if (nvm_ver.oem_valid) {
+		snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x",
+			 nvm_ver.oem_major, nvm_ver.oem_minor,
+			 nvm_ver.oem_release);
+
+		return;
+	}
+
+	ixgbe_get_orom_version(hw, &nvm_ver);
+	if (nvm_ver.or_valid)
+		snprintf(ctx->buf, sizeof(ctx->buf), "%d.%d.%d",
+			 nvm_ver.or_major, nvm_ver.or_build, nvm_ver.or_patch);
+}
+
+static void ixgbe_info_eetrack(struct ixgbe_adapter *adapter,
+			       struct ixgbe_info_ctx *ctx)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	struct ixgbe_nvm_version nvm_ver;
+
+	ixgbe_get_oem_prod_version(hw, &nvm_ver);
+
+	/* No ETRACK version for OEM */
+	if (nvm_ver.oem_valid) {
+		ctx->buf[0] = '\0';
+		return;
+	}
+
+	ixgbe_get_etk_id(hw, &nvm_ver);
+	snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm_ver.etk_id);
+}
+
+static int ixgbe_devlink_info_get(struct devlink *devlink,
+				  struct devlink_info_req *req,
+				  struct netlink_ext_ack *extack)
+{
+	struct ixgbe_adapter *adapter = devlink_priv(devlink);
+	struct ixgbe_hw *hw = &adapter->hw;
+	struct ixgbe_info_ctx *ctx;
+	int err;
+
+	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ixgbe_info_get_dsn(adapter, ctx);
+	err = devlink_info_serial_number_put(req, ctx->buf);
+	if (err)
+		goto free_ctx;
+
+	err = ixgbe_read_pba_string_generic(hw, ctx->buf, sizeof(ctx->buf));
+	if (err)
+		goto free_ctx;
+
+	err = devlink_info_version_fixed_put(req,
+					     DEVLINK_INFO_VERSION_GENERIC_BOARD_ID,
+					     ctx->buf);
+	if (err)
+		goto free_ctx;
+
+	ixgbe_info_nvm_ver(adapter, ctx);
+	err = devlink_info_version_running_put(req,
+					       DEVLINK_INFO_VERSION_GENERIC_FW_UNDI,
+					       ctx->buf);
+	if (err)
+		goto free_ctx;
+
+	ixgbe_info_eetrack(adapter, ctx);
+	err = devlink_info_version_running_put(req,
+					       DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID,
+					       ctx->buf);
+free_ctx:
+	kfree(ctx);
+	return err;
+}
+
 static const struct devlink_ops ixgbe_devlink_ops = {
+	.info_get = ixgbe_devlink_info_get,
 };
 
 /**

From 5f214150c76dd4cffe51249d4c6a795efca90159 Mon Sep 17 00:00:00 2001
From: Slawomir Mrozowicz
Date: Thu, 10 Apr 2025 14:59:58 +0200
Subject: [PATCH 187/770] ixgbe: add E610 functions for acquiring flash data

Read NVM-related info from the flash. Add several helper functions used
to access the flash data, find memory banks, calculate offsets, and
calculate the flash size.
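As a side note, the bisection this patch uses to find the accessible flash
size can be modeled in a stand-alone sketch; flash_read() below is a toy
stand-in for the ACI flat-NVM read, and real_size is a made-up value:

    /* Standalone sketch (not kernel code) of the bisection in
     * ixgbe_discover_flash_size(): probe 1-byte reads and narrow
     * [min_size, max_size) until the accessible size is found.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define MAX_OFFSET (16u * 1024 * 1024)  /* 16MB upper bound */

    static const uint32_t real_size = 5 * 1024 * 1024;  /* hypothetical */

    static int flash_read(uint32_t offset)
    {
            return offset < real_size ? 0 : -1;  /* fails past the end */
    }

    int main(void)
    {
            uint32_t min_size = 0, max_size = MAX_OFFSET;

            while (max_size - min_size > 1) {
                    uint32_t offset = (max_size + min_size) / 2;

                    if (flash_read(offset))
                            max_size = offset;  /* failed: size <= offset */
                    else
                            min_size = offset;  /* ok: size > offset */
            }
            printf("discovered flash size: %u bytes\n", max_size);
            return 0;
    }

The invariant is that min_size is always a readable offset and max_size is
not, so the loop converges on the first inaccessible offset, which equals
the accessible flash size.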
Reviewed-by: Przemek Kitszel
Reviewed-by: Mateusz Polchlopek
Tested-by: Bharath R
Signed-off-by: Slawomir Mrozowicz
Co-developed-by: Piotr Kwapulinski
Signed-off-by: Piotr Kwapulinski
Co-developed-by: Jedrzej Jagielski
Signed-off-by: Jedrzej Jagielski
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 507 +++++++++++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   4 +
 .../ethernet/intel/ixgbe/ixgbe_type_e610.h    |  40 +-
 4 files changed, 550 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
index 00935747c8c55..5d15fbd2e1a63 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
@@ -2266,6 +2266,511 @@ int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw)
 	return err;
 }
 
+/**
+ * ixgbe_discover_flash_size - Discover the available flash size
+ * @hw: pointer to the HW struct
+ *
+ * The device flash could be up to 16MB in size. However, it is possible that
+ * the actual size is smaller. Use bisection to determine the accessible size
+ * of flash memory.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_discover_flash_size(struct ixgbe_hw *hw)
+{
+	u32 min_size = 0, max_size = IXGBE_ACI_NVM_MAX_OFFSET + 1;
+	int err;
+
+	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	if (err)
+		return err;
+
+	while ((max_size - min_size) > 1) {
+		u32 offset = (max_size + min_size) / 2;
+		u32 len = 1;
+		u8 data;
+
+		err = ixgbe_read_flat_nvm(hw, offset, &len, &data, false);
+		if (err == -EIO &&
+		    hw->aci.last_status == IXGBE_ACI_RC_EINVAL) {
+			err = 0;
+			max_size = offset;
+		} else if (!err) {
+			min_size = offset;
+		} else {
+			/* an unexpected error occurred */
+			goto err_read_flat_nvm;
+		}
+	}
+
+	hw->flash.flash_size = max_size;
+
+err_read_flat_nvm:
+	ixgbe_release_nvm(hw);
+
+	return err;
+}
+
+/**
+ * ixgbe_read_sr_base_address - Read the value of a Shadow RAM pointer word
+ * @hw: pointer to the HW structure
+ * @offset: the word offset of the Shadow RAM word to read
+ * @pointer: pointer value read from Shadow RAM
+ *
+ * Read the given Shadow RAM word, and convert it to a pointer value specified
+ * in bytes. This function assumes the specified offset is a valid pointer
+ * word.
+ *
+ * Each pointer word specifies whether it is stored in word size or 4KB
+ * sector size by using the highest bit. The reported pointer value will be in
+ * bytes, intended for flat NVM reads.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_read_sr_base_address(struct ixgbe_hw *hw, u16 offset,
+				      u32 *pointer)
+{
+	u16 value;
+	int err;
+
+	err = ixgbe_read_ee_aci_e610(hw, offset, &value);
+	if (err)
+		return err;
+
+	/* Determine if the pointer is in 4KB or word units */
+	if (value & IXGBE_SR_NVM_PTR_4KB_UNITS)
+		*pointer = (value & ~IXGBE_SR_NVM_PTR_4KB_UNITS) * SZ_4K;
+	else
+		*pointer = value * sizeof(u16);
+
+	return 0;
+}
+
+/**
+ * ixgbe_read_sr_area_size - Read an area size from a Shadow RAM word
+ * @hw: pointer to the HW structure
+ * @offset: the word offset of the Shadow RAM to read
+ * @size: size value read from the Shadow RAM
+ *
+ * Read the given Shadow RAM word, and convert it to an area size value
+ * specified in bytes. This function assumes the specified offset is a valid
+ * area size word.
+ *
+ * Each area size word is specified in 4KB sector units. This function reports
+ * the size in bytes, intended for flat NVM reads.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_read_sr_area_size(struct ixgbe_hw *hw, u16 offset, u32 *size)
+{
+	u16 value;
+	int err;
+
+	err = ixgbe_read_ee_aci_e610(hw, offset, &value);
+	if (err)
+		return err;
+
+	/* Area sizes are always specified in 4KB units */
+	*size = value * SZ_4K;
+
+	return 0;
+}
+
+/**
+ * ixgbe_determine_active_flash_banks - Discover active bank for each module
+ * @hw: pointer to the HW struct
+ *
+ * Read the Shadow RAM control word and determine which banks are active for
+ * the NVM, OROM, and Netlist modules. Also read and calculate the associated
+ * pointer and size. These values are then cached into the ixgbe_flash_info
+ * structure for later use in order to calculate the correct offset to read
+ * from the active module.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_determine_active_flash_banks(struct ixgbe_hw *hw)
+{
+	struct ixgbe_bank_info *banks = &hw->flash.banks;
+	u16 ctrl_word;
+	int err;
+
+	err = ixgbe_read_ee_aci_e610(hw, IXGBE_E610_SR_NVM_CTRL_WORD,
+				     &ctrl_word);
+	if (err)
+		return err;
+
+	if (FIELD_GET(IXGBE_SR_CTRL_WORD_1_M, ctrl_word) !=
+	    IXGBE_SR_CTRL_WORD_VALID)
+		return -ENODATA;
+
+	if (!(ctrl_word & IXGBE_SR_CTRL_WORD_NVM_BANK))
+		banks->nvm_bank = IXGBE_1ST_FLASH_BANK;
+	else
+		banks->nvm_bank = IXGBE_2ND_FLASH_BANK;
+
+	if (!(ctrl_word & IXGBE_SR_CTRL_WORD_OROM_BANK))
+		banks->orom_bank = IXGBE_1ST_FLASH_BANK;
+	else
+		banks->orom_bank = IXGBE_2ND_FLASH_BANK;
+
+	if (!(ctrl_word & IXGBE_SR_CTRL_WORD_NETLIST_BANK))
+		banks->netlist_bank = IXGBE_1ST_FLASH_BANK;
+	else
+		banks->netlist_bank = IXGBE_2ND_FLASH_BANK;
+
+	err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_1ST_NVM_BANK_PTR,
+					 &banks->nvm_ptr);
+	if (err)
+		return err;
+
+	err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_NVM_BANK_SIZE,
+				      &banks->nvm_size);
+	if (err)
+		return err;
+
+	err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_1ST_OROM_BANK_PTR,
+					 &banks->orom_ptr);
+	if (err)
+		return err;
+
+	err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_OROM_BANK_SIZE,
+				      &banks->orom_size);
+	if (err)
+		return err;
+
+	err = ixgbe_read_sr_base_address(hw, IXGBE_E610_SR_NETLIST_BANK_PTR,
+					 &banks->netlist_ptr);
+	if (err)
+		return err;
+
+	err = ixgbe_read_sr_area_size(hw, IXGBE_E610_SR_NETLIST_BANK_SIZE,
+				      &banks->netlist_size);
+
+	return err;
+}
+
+/**
+ * ixgbe_get_flash_bank_offset - Get offset into requested flash bank
+ * @hw: pointer to the HW structure
+ * @bank: whether to read from the active or inactive flash bank
+ * @module: the module to read from
+ *
+ * Based on the module, lookup the module offset from the beginning of the
+ * flash.
+ *
+ * Return: the flash offset. Note that a value of zero is invalid and must be
+ * treated as an error.
+ */
+static int ixgbe_get_flash_bank_offset(struct ixgbe_hw *hw,
+				       enum ixgbe_bank_select bank,
+				       u16 module)
+{
+	struct ixgbe_bank_info *banks = &hw->flash.banks;
+	enum ixgbe_flash_bank active_bank;
+	bool second_bank_active;
+	u32 offset, size;
+
+	switch (module) {
+	case IXGBE_E610_SR_1ST_NVM_BANK_PTR:
+		offset = banks->nvm_ptr;
+		size = banks->nvm_size;
+		active_bank = banks->nvm_bank;
+		break;
+	case IXGBE_E610_SR_1ST_OROM_BANK_PTR:
+		offset = banks->orom_ptr;
+		size = banks->orom_size;
+		active_bank = banks->orom_bank;
+		break;
+	case IXGBE_E610_SR_NETLIST_BANK_PTR:
+		offset = banks->netlist_ptr;
+		size = banks->netlist_size;
+		active_bank = banks->netlist_bank;
+		break;
+	default:
+		return 0;
+	}
+
+	switch (active_bank) {
+	case IXGBE_1ST_FLASH_BANK:
+		second_bank_active = false;
+		break;
+	case IXGBE_2ND_FLASH_BANK:
+		second_bank_active = true;
+		break;
+	default:
+		return 0;
+	}
+
+	/* The second flash bank is stored immediately following the first
+	 * bank. Based on whether the 1st or 2nd bank is active, and whether
+	 * we want the active or inactive bank, calculate the desired offset.
+	 */
+	switch (bank) {
+	case IXGBE_ACTIVE_FLASH_BANK:
+		return offset + (second_bank_active ? size : 0);
+	case IXGBE_INACTIVE_FLASH_BANK:
+		return offset + (second_bank_active ? 0 : size);
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_read_flash_module - Read a word from one of the main NVM modules
+ * @hw: pointer to the HW structure
+ * @bank: which bank of the module to read
+ * @module: the module to read
+ * @offset: the offset into the module in bytes
+ * @data: storage for the word read from the flash
+ * @length: bytes of data to read
+ *
+ * Read data from the specified flash module. The bank parameter indicates
+ * whether or not to read from the active bank or the inactive bank of that
+ * module.
+ *
+ * The word will be read using flat NVM access, and relies on the
+ * hw->flash.banks data being setup by ixgbe_determine_active_flash_banks()
+ * during initialization.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_read_flash_module(struct ixgbe_hw *hw,
+				   enum ixgbe_bank_select bank,
+				   u16 module, u32 offset, u8 *data, u32 length)
+{
+	u32 start;
+	int err;
+
+	start = ixgbe_get_flash_bank_offset(hw, bank, module);
+	if (!start)
+		return -EINVAL;
+
+	err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+	if (err)
+		return err;
+
+	err = ixgbe_read_flat_nvm(hw, start + offset, &length, data, false);
+
+	ixgbe_release_nvm(hw);
+
+	return err;
+}
+
+/**
+ * ixgbe_read_nvm_module - Read from the active main NVM module
+ * @hw: pointer to the HW structure
+ * @bank: whether to read from active or inactive NVM module
+ * @offset: offset into the NVM module to read, in words
+ * @data: storage for returned word value
+ *
+ * Read the specified word from the active NVM module. This includes the CSS
+ * header at the start of the NVM module.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_read_nvm_module(struct ixgbe_hw *hw,
+				 enum ixgbe_bank_select bank,
+				 u32 offset, u16 *data)
+{
+	__le16 data_local;
+	int err;
+
+	err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_1ST_NVM_BANK_PTR,
+				      offset * sizeof(data_local),
+				      (u8 *)&data_local,
+				      sizeof(data_local));
+	if (!err)
+		*data = le16_to_cpu(data_local);
+
+	return err;
+}
+
+/**
+ * ixgbe_get_nvm_css_hdr_len - Read the CSS header length
+ * @hw: pointer to the HW struct
+ * @bank: whether to read from the active or inactive flash bank
+ * @hdr_len: storage for header length in words
+ *
+ * Read the CSS header length from the NVM CSS header and add the
+ * Authentication header size, and then convert to words.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_get_nvm_css_hdr_len(struct ixgbe_hw *hw,
+				     enum ixgbe_bank_select bank,
+				     u32 *hdr_len)
+{
+	u16 hdr_len_l, hdr_len_h;
+	u32 hdr_len_dword;
+	int err;
+
+	err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_HDR_LEN_L,
+				    &hdr_len_l);
+	if (err)
+		return err;
+
+	err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_HDR_LEN_H,
+				    &hdr_len_h);
+	if (err)
+		return err;
+
+	/* CSS header length is in DWORD, so convert to words and add
+	 * authentication header size.
+	 */
+	hdr_len_dword = (hdr_len_h << 16) | hdr_len_l;
+	*hdr_len = hdr_len_dword * 2 + IXGBE_NVM_AUTH_HEADER_LEN;
+
+	return 0;
+}
+
+/**
+ * ixgbe_read_nvm_sr_copy - Read a word from the Shadow RAM copy
+ * @hw: pointer to the HW structure
+ * @bank: whether to read from the active or inactive NVM module
+ * @offset: offset into the Shadow RAM copy to read, in words
+ * @data: storage for returned word value
+ *
+ * Read the specified word from the copy of the Shadow RAM found in the
+ * specified NVM module.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_read_nvm_sr_copy(struct ixgbe_hw *hw,
+				  enum ixgbe_bank_select bank,
+				  u32 offset, u16 *data)
+{
+	u32 hdr_len;
+	int err;
+
+	err = ixgbe_get_nvm_css_hdr_len(hw, bank, &hdr_len);
+	if (err)
+		return err;
+
+	hdr_len = round_up(hdr_len, IXGBE_HDR_LEN_ROUNDUP);
+
+	return ixgbe_read_nvm_module(hw, bank, hdr_len + offset, data);
+}
+
+/**
+ * ixgbe_get_nvm_srev - Read the security revision from the NVM CSS header
+ * @hw: pointer to the HW struct
+ * @bank: whether to read from the active or inactive flash bank
+ * @srev: storage for security revision
+ *
+ * Read the security revision out of the CSS header of the active NVM module
+ * bank.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_get_nvm_srev(struct ixgbe_hw *hw,
+			      enum ixgbe_bank_select bank, u32 *srev)
+{
+	u16 srev_l, srev_h;
+	int err;
+
+	err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_SREV_L, &srev_l);
+	if (err)
+		return err;
+
+	err = ixgbe_read_nvm_module(hw, bank, IXGBE_NVM_CSS_SREV_H, &srev_h);
+	if (err)
+		return err;
+
+	*srev = (srev_h << 16) | srev_l;
+
+	return 0;
+}
+
+/**
+ * ixgbe_get_nvm_ver_info - Read NVM version information
+ * @hw: pointer to the HW struct
+ * @bank: whether to read from the active or inactive flash bank
+ * @nvm: pointer to NVM info structure
+ *
+ * Read the NVM EETRACK ID and map version of the main NVM image bank, filling
+ * in the nvm info structure.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_get_nvm_ver_info(struct ixgbe_hw *hw,
+				  enum ixgbe_bank_select bank,
+				  struct ixgbe_nvm_info *nvm)
+{
+	u16 eetrack_lo, eetrack_hi, ver;
+	int err;
+
+	err = ixgbe_read_nvm_sr_copy(hw, bank,
+				     IXGBE_E610_SR_NVM_DEV_STARTER_VER, &ver);
+	if (err)
+		return err;
+
+	nvm->major = FIELD_GET(IXGBE_E610_NVM_VER_HI_MASK, ver);
+	nvm->minor = FIELD_GET(IXGBE_E610_NVM_VER_LO_MASK, ver);
+
+	err = ixgbe_read_nvm_sr_copy(hw, bank, IXGBE_E610_SR_NVM_EETRACK_LO,
+				     &eetrack_lo);
+	if (err)
+		return err;
+
+	err = ixgbe_read_nvm_sr_copy(hw, bank, IXGBE_E610_SR_NVM_EETRACK_HI,
+				     &eetrack_hi);
+	if (err)
+		return err;
+
+	nvm->eetrack = (eetrack_hi << 16) | eetrack_lo;
+
+	ixgbe_get_nvm_srev(hw, bank, &nvm->srev);
+
+	return 0;
+}
+
+/**
+ * ixgbe_get_flash_data - get flash data
+ * @hw: pointer to the HW struct
+ *
+ * Read and populate flash data such as Shadow RAM size,
+ * max_timeout and blank_nvm_mode
+ *
+ * Return: the exit code of the operation.
+ */
+int ixgbe_get_flash_data(struct ixgbe_hw *hw)
+{
+	struct ixgbe_flash_info *flash = &hw->flash;
+	u32 fla, gens_stat;
+	u8 sr_size;
+	int err;
+
+	/* The SR size is stored regardless of the NVM programming mode
+	 * as the blank mode may be used in the factory line.
+	 */
+	gens_stat = IXGBE_READ_REG(hw, GLNVM_GENS);
+	sr_size = FIELD_GET(GLNVM_GENS_SR_SIZE_M, gens_stat);
+
+	/* Switching to words (sr_size contains power of 2) */
+	flash->sr_words = BIT(sr_size) * (SZ_1K / sizeof(u16));
+
+	/* Check if we are in the normal or blank NVM programming mode */
+	fla = IXGBE_READ_REG(hw, IXGBE_GLNVM_FLA);
+	if (fla & IXGBE_GLNVM_FLA_LOCKED_M) {
+		flash->blank_nvm_mode = false;
+	} else {
+		flash->blank_nvm_mode = true;
+		return -EIO;
+	}
+
+	err = ixgbe_discover_flash_size(hw);
+	if (err)
+		return err;
+
+	err = ixgbe_determine_active_flash_banks(hw);
+	if (err)
+		return err;
+
+	err = ixgbe_get_nvm_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK,
+				     &flash->nvm);
+	return err;
+}
+
 /**
  * ixgbe_read_sr_word_aci - Reads Shadow RAM via ACI
  * @hw: pointer to the HW structure
@@ -2485,7 +2990,7 @@ int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val)
 	if (err)
 		return err;
 
-	err = ixgbe_read_sr_word_aci(hw, E610_SR_SW_CHECKSUM_WORD,
+	err = ixgbe_read_sr_word_aci(hw, IXGBE_E610_SR_SW_CHECKSUM_WORD,
 				     &tmp_checksum);
 	ixgbe_release_nvm(hw);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
index ba8c06b738105..2c971a34200b8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h
@@ -77,5 +77,6 @@ int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset,
 				  u16 words, u16 *data);
 int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val);
 int ixgbe_reset_hw_e610(struct ixgbe_hw *hw);
+int ixgbe_get_flash_data(struct ixgbe_hw *hw);
 
 #endif /* _IXGBE_E610_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0c7be10657a94..1f2afbf6b02ee 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -11339,6 +11339,10 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		err = ixgbe_get_caps(&adapter->hw);
 		if (err)
 			dev_err(&pdev->dev, "ixgbe_get_caps failed %d\n", err);
+
+		err = ixgbe_get_flash_data(&adapter->hw);
+		if (err)
+			goto err_sw_init;
 	}
 
 	if (adapter->hw.mac.type == ixgbe_mac_82599EB)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 617e07878e4f7..cde24cbb6a82e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -10,7 +10,32 @@ #define IXGBE_MAX_VSI 768 /* Checksum and Shadow RAM pointers */ -#define E610_SR_SW_CHECKSUM_WORD 0x3F +#define IXGBE_E610_SR_NVM_CTRL_WORD 0x00 +#define IXGBE_E610_SR_PBA_BLOCK_PTR 0x16 +#define IXGBE_E610_SR_NVM_DEV_STARTER_VER 0x18 +#define IXGBE_E610_SR_NVM_EETRACK_LO 0x2D +#define IXGBE_E610_SR_NVM_EETRACK_HI 0x2E +#define IXGBE_E610_NVM_VER_LO_MASK GENMASK(7, 0) +#define IXGBE_E610_NVM_VER_HI_MASK GENMASK(15, 12) +#define IXGBE_E610_SR_SW_CHECKSUM_WORD 0x3F +#define IXGBE_E610_SR_PFA_PTR 0x40 +#define IXGBE_E610_SR_1ST_NVM_BANK_PTR 0x42 +#define IXGBE_E610_SR_NVM_BANK_SIZE 0x43 +#define IXGBE_E610_SR_1ST_OROM_BANK_PTR 0x44 +#define IXGBE_E610_SR_OROM_BANK_SIZE 0x45 +#define IXGBE_E610_SR_NETLIST_BANK_PTR 0x46 +#define IXGBE_E610_SR_NETLIST_BANK_SIZE 0x47 + +/* CSS Header words */ +#define IXGBE_NVM_CSS_HDR_LEN_L 0x02 +#define IXGBE_NVM_CSS_HDR_LEN_H 0x03 +#define IXGBE_NVM_CSS_SREV_L 0x14 +#define IXGBE_NVM_CSS_SREV_H 0x15 + +#define IXGBE_HDR_LEN_ROUNDUP 32 + +/* Length of Authentication header section in words */ +#define IXGBE_NVM_AUTH_HEADER_LEN 0x08 /* Shadow RAM related */ #define IXGBE_SR_WORDS_IN_1KB 512 @@ -29,6 +54,14 @@ #define IXGBE_GLNVM_FLA_LOCKED_S 6 #define IXGBE_GLNVM_FLA_LOCKED_M BIT(6) +/* Auxiliary field, mask and shift definition for Shadow RAM and NVM Flash */ +#define IXGBE_SR_CTRL_WORD_1_M GENMASK(7, 6) +#define IXGBE_SR_CTRL_WORD_VALID BIT(0) +#define IXGBE_SR_CTRL_WORD_OROM_BANK BIT(3) +#define IXGBE_SR_CTRL_WORD_NETLIST_BANK BIT(4) +#define IXGBE_SR_CTRL_WORD_NVM_BANK BIT(5) +#define IXGBE_SR_NVM_PTR_4KB_UNITS BIT(15) + /* Admin Command Interface (ACI) registers */ #define IXGBE_PF_HIDA(_i) (0x00085000 + ((_i) * 4)) #define IXGBE_PF_HIDA_2(_i) (0x00085020 + ((_i) * 4)) @@ -1015,6 +1048,11 @@ struct ixgbe_aci_info { enum ixgbe_aci_err last_status; /* last status of sent admin command */ }; +enum ixgbe_bank_select { + IXGBE_ACTIVE_FLASH_BANK, + IXGBE_INACTIVE_FLASH_BANK, +}; + /* Option ROM version information */ struct ixgbe_orom_info { u8 major; /* Major version of OROM */ From 70db0788a2628698c0d2746b4864f032c2d59ff9 Mon Sep 17 00:00:00 2001 From: Slawomir Mrozowicz Date: Thu, 10 Apr 2025 14:59:59 +0200 Subject: [PATCH 188/770] ixgbe: read the OROM version information Add functions that read the OROM version info and use them as part of the procedure that sets the NVM info.
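The CIVD data these functions locate is validated by a signature check plus a simple modulo-256 checksum over the whole record. As a stand-alone illustration of that check (civd_record is a simplified, hypothetical stand-in for the driver's ixgbe_orom_civd_info; this is not driver code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified CIVD record: signature, checksum byte, combo version. */
struct civd_record {
	uint8_t signature[4];	/* must hold the ASCII characters "$CIV" */
	uint8_t checksum;	/* chosen so all record bytes sum to 0 mod 256 */
	uint32_t combo_ver;	/* combo image version number */
} __attribute__((packed));

static bool civd_record_valid(const struct civd_record *rec)
{
	const uint8_t *bytes = (const uint8_t *)rec;
	uint8_t sum = 0;

	/* The record must start with the "$CIV" signature. */
	if (memcmp(rec->signature, "$CIV", sizeof(rec->signature)))
		return false;

	/* A simple modulo-256 sum of every byte must equal zero. */
	for (size_t i = 0; i < sizeof(*rec); i++)
		sum += bytes[i];

	return sum == 0;
}

The driver scans the Option ROM at 512-byte-aligned offsets until a record passes both tests.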
Reviewed-by: Mateusz Polchlopek Reviewed-by: Przemek Kitszel Tested-by: Bharath R Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 169 ++++++++++++++++++ .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 15 ++ 2 files changed, 184 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 5d15fbd2e1a63..acd4eb8e391fb 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -2585,6 +2585,35 @@ static int ixgbe_read_nvm_module(struct ixgbe_hw *hw, return err; } +/** + * ixgbe_read_orom_module - Read from the active Option ROM module + * @hw: pointer to the HW structure + * @bank: whether to read from active or inactive OROM module + * @offset: offset into the OROM module to read, in words + * @data: storage for returned word value + * + * Read the specified word from the active Option ROM module of the flash. + * Note that unlike the NVM module, the CSS data is stored at the end of the + * module instead of at the beginning. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_orom_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + __le16 data_local; + int err; + + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_1ST_OROM_BANK_PTR, + offset * sizeof(data_local), + (u8 *)&data_local, sizeof(data_local)); + if (!err) + *data = le16_to_cpu(data_local); + + return err; +} + /** * ixgbe_get_nvm_css_hdr_len - Read the CSS header length * @hw: pointer to the HW struct @@ -2681,6 +2710,141 @@ static int ixgbe_get_nvm_srev(struct ixgbe_hw *hw, return 0; } +/** + * ixgbe_get_orom_civd_data - Get the combo version information from Option ROM + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @civd: storage for the Option ROM CIVD data. + * + * Searches through the Option ROM flash contents to locate the CIVD data for + * the image. + * + * Return: the exit code of the operation. + */ +static int +ixgbe_get_orom_civd_data(struct ixgbe_hw *hw, enum ixgbe_bank_select bank, + struct ixgbe_orom_civd_info *civd) +{ + struct ixgbe_orom_civd_info tmp; + u32 offset; + int err; + + /* The CIVD section is located in the Option ROM aligned to 512 bytes. + * The first 4 bytes must contain the ASCII characters "$CIV". + * A simple modulo 256 sum of all of the bytes of the structure must + * equal 0. + */ + for (offset = 0; (offset + SZ_512) <= hw->flash.banks.orom_size; + offset += SZ_512) { + u8 sum = 0; + u32 i; + + err = ixgbe_read_flash_module(hw, bank, + IXGBE_E610_SR_1ST_OROM_BANK_PTR, + offset, + (u8 *)&tmp, sizeof(tmp)); + if (err) + return err; + + /* Skip forward until we find a matching signature */ + if (memcmp(IXGBE_OROM_CIV_SIGNATURE, tmp.signature, + sizeof(tmp.signature))) + continue; + + /* Verify that the simple checksum is zero */ + for (i = 0; i < sizeof(tmp); i++) + sum += ((u8 *)&tmp)[i]; + + if (sum) + return -EDOM; + + *civd = tmp; + return 0; + } + + return -ENODATA; +} + +/** + * ixgbe_get_orom_srev - Read the security revision from the OROM CSS header + * @hw: pointer to the HW struct + * @bank: whether to read from active or inactive flash module + * @srev: storage for security revision + * + * Read the security revision out of the CSS header of the active OROM module + * bank. 
+ * + * Return: the exit code of the operation. + */ +static int ixgbe_get_orom_srev(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 *srev) +{ + u32 orom_size_word = hw->flash.banks.orom_size / 2; + u32 css_start, hdr_len; + u16 srev_l, srev_h; + int err; + + err = ixgbe_get_nvm_css_hdr_len(hw, bank, &hdr_len); + if (err) + return err; + + if (orom_size_word < hdr_len) + return -EINVAL; + + /* Calculate how far into the Option ROM the CSS header starts. Note + * that ixgbe_read_orom_module takes a word offset. + */ + css_start = orom_size_word - hdr_len; + err = ixgbe_read_orom_module(hw, bank, + css_start + IXGBE_NVM_CSS_SREV_L, + &srev_l); + if (err) + return err; + + err = ixgbe_read_orom_module(hw, bank, + css_start + IXGBE_NVM_CSS_SREV_H, + &srev_h); + if (err) + return err; + + *srev = srev_h << 16 | srev_l; + + return 0; +} + +/** + * ixgbe_get_orom_ver_info - Read Option ROM version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash module + * @orom: pointer to Option ROM info structure + * + * Read Option ROM version and security revision from the Option ROM flash + * section. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_orom_ver_info(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + struct ixgbe_orom_info *orom) +{ + struct ixgbe_orom_civd_info civd; + u32 combo_ver; + int err; + + err = ixgbe_get_orom_civd_data(hw, bank, &civd); + if (err) + return err; + + combo_ver = le32_to_cpu(civd.combo_ver); + + orom->major = (u8)FIELD_GET(IXGBE_OROM_VER_MASK, combo_ver); + orom->patch = (u8)FIELD_GET(IXGBE_OROM_VER_PATCH_MASK, combo_ver); + orom->build = (u16)FIELD_GET(IXGBE_OROM_VER_BUILD_MASK, combo_ver); + + return ixgbe_get_orom_srev(hw, bank, &orom->srev); +} + /** * ixgbe_get_nvm_ver_info - Read NVM version information * @hw: pointer to the HW struct @@ -2768,6 +2932,11 @@ int ixgbe_get_flash_data(struct ixgbe_hw *hw) err = ixgbe_get_nvm_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, &flash->nvm); + if (err) + return err; + + err = ixgbe_get_orom_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, + &flash->orom); return err; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index cde24cbb6a82e..905e6ed179ec0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -26,6 +26,11 @@ #define IXGBE_E610_SR_NETLIST_BANK_PTR 0x46 #define IXGBE_E610_SR_NETLIST_BANK_SIZE 0x47 +/* The OROM version topology */ +#define IXGBE_OROM_VER_PATCH_MASK GENMASK_ULL(7, 0) +#define IXGBE_OROM_VER_BUILD_MASK GENMASK_ULL(23, 8) +#define IXGBE_OROM_VER_MASK GENMASK_ULL(31, 24) + /* CSS Header words */ #define IXGBE_NVM_CSS_HDR_LEN_L 0x02 #define IXGBE_NVM_CSS_HDR_LEN_H 0x03 @@ -1017,6 +1022,16 @@ struct ixgbe_hw_caps { #define IXGBE_EXT_TOPO_DEV_IMG_PROG_EN BIT(1) } __packed; +#define IXGBE_OROM_CIV_SIGNATURE "$CIV" + +struct ixgbe_orom_civd_info { + u8 signature[4]; /* Must match ASCII '$CIV' characters */ + u8 checksum; /* Simple modulo 256 sum of all structure bytes must equal 0 */ + __le32 combo_ver; /* Combo Image Version number */ + u8 combo_name_len; /* Length of the unicode combo image version string, max of 32 */ + __le16 combo_name[32]; /* Unicode string representing the Combo Image version */ +}; + /* Function specific capabilities */ struct ixgbe_hw_func_caps { u32 num_allocd_vfs; /* Number of allocated VFs */ From 904c2b4c0b48d738bc598f837afeb6a027e650d1 Mon Sep 17 00:00:00 2001 From: 
Slawomir Mrozowicz Date: Thu, 10 Apr 2025 15:00:00 +0200 Subject: [PATCH 189/770] ixgbe: read the netlist version information Add functions that read the netlist version info and use them as part of the procedure that sets the NVM info. Reviewed-by: Mateusz Polchlopek Tested-by: Bharath R Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 112 ++++++++++++++++++ .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 33 ++++++ 2 files changed, 145 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index acd4eb8e391fb..af88a6afb4117 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -2585,6 +2585,33 @@ static int ixgbe_read_nvm_module(struct ixgbe_hw *hw, return err; } +/** + * ixgbe_read_netlist_module - Read data from the netlist module area + * @hw: pointer to the HW structure + * @bank: whether to read from the active or inactive module + * @offset: offset into the netlist to read from + * @data: storage for returned word value + * + * Read a word from the specified netlist bank. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_netlist_module(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + u32 offset, u16 *data) +{ + __le16 data_local; + int err; + + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_NETLIST_BANK_PTR, + offset * sizeof(data_local), + (u8 *)&data_local, sizeof(data_local)); + if (!err) + *data = le16_to_cpu(data_local); + + return err; +} + /** * ixgbe_read_orom_module - Read from the active Option ROM module * @hw: pointer to the HW structure @@ -2888,6 +2915,86 @@ static int ixgbe_get_nvm_ver_info(struct ixgbe_hw *hw, return 0; } +/** + * ixgbe_get_netlist_info - Read the netlist version information + * @hw: pointer to the HW struct + * @bank: whether to read from the active or inactive flash bank + * @netlist: pointer to netlist version info structure + * + * Get the netlist version information from the requested bank. Reads the Link + * Topology section to find the Netlist ID block and extract the relevant + * information into the netlist version structure. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_netlist_info(struct ixgbe_hw *hw, + enum ixgbe_bank_select bank, + struct ixgbe_netlist_info *netlist) +{ + u16 module_id, length, node_count, i; + u16 *id_blk; + int err; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_NETLIST_TYPE_OFFSET, + &module_id); + if (err) + return err; + + if (module_id != IXGBE_NETLIST_LINK_TOPO_MOD_ID) + return -EIO; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_LINK_TOPO_MODULE_LEN, + &length); + if (err) + return err; + + /* Sanity check that we have at least enough words to store the + * netlist ID block. + */ + if (length < IXGBE_NETLIST_ID_BLK_SIZE) + return -EIO; + + err = ixgbe_read_netlist_module(hw, bank, IXGBE_LINK_TOPO_NODE_COUNT, + &node_count); + if (err) + return err; + + node_count &= IXGBE_LINK_TOPO_NODE_COUNT_M; + + id_blk = kcalloc(IXGBE_NETLIST_ID_BLK_SIZE, sizeof(*id_blk), GFP_KERNEL); + if (!id_blk) + return -ENOMEM; + + /* Read out the entire Netlist ID Block at once.
*/ + err = ixgbe_read_flash_module(hw, bank, IXGBE_E610_SR_NETLIST_BANK_PTR, + IXGBE_NETLIST_ID_BLK_OFFSET(node_count) * + sizeof(*id_blk), (u8 *)id_blk, + IXGBE_NETLIST_ID_BLK_SIZE * + sizeof(*id_blk)); + if (err) + goto free_id_blk; + + for (i = 0; i < IXGBE_NETLIST_ID_BLK_SIZE; i++) + id_blk[i] = le16_to_cpu(((__le16 *)id_blk)[i]); + + netlist->major = id_blk[IXGBE_NETLIST_ID_BLK_MAJOR_VER_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_MAJOR_VER_LOW]; + netlist->minor = id_blk[IXGBE_NETLIST_ID_BLK_MINOR_VER_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_MINOR_VER_LOW]; + netlist->type = id_blk[IXGBE_NETLIST_ID_BLK_TYPE_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_TYPE_LOW]; + netlist->rev = id_blk[IXGBE_NETLIST_ID_BLK_REV_HIGH] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_REV_LOW]; + netlist->cust_ver = id_blk[IXGBE_NETLIST_ID_BLK_CUST_VER]; + /* Read the left most 4 bytes of SHA */ + netlist->hash = id_blk[IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(15)] << 16 | + id_blk[IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(14)]; + +free_id_blk: + kfree(id_blk); + return err; +} + /** * ixgbe_get_flash_data - get flash data * @hw: pointer to the HW struct @@ -2937,6 +3044,11 @@ int ixgbe_get_flash_data(struct ixgbe_hw *hw) err = ixgbe_get_orom_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, &flash->orom); + if (err) + return err; + + err = ixgbe_get_netlist_info(hw, IXGBE_ACTIVE_FLASH_BANK, + &flash->netlist); return err; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 905e6ed179ec0..91cc79aab388d 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -45,6 +45,39 @@ /* Shadow RAM related */ #define IXGBE_SR_WORDS_IN_1KB 512 +/* The Netlist ID Block is located after all of the Link Topology nodes. */ +#define IXGBE_NETLIST_ID_BLK_SIZE 0x30 +#define IXGBE_NETLIST_ID_BLK_OFFSET(n) IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0004 + 2 * (n)) + +/* netlist ID block field offsets (word offsets) */ +#define IXGBE_NETLIST_ID_BLK_MAJOR_VER_LOW 0x02 +#define IXGBE_NETLIST_ID_BLK_MAJOR_VER_HIGH 0x03 +#define IXGBE_NETLIST_ID_BLK_MINOR_VER_LOW 0x04 +#define IXGBE_NETLIST_ID_BLK_MINOR_VER_HIGH 0x05 +#define IXGBE_NETLIST_ID_BLK_TYPE_LOW 0x06 +#define IXGBE_NETLIST_ID_BLK_TYPE_HIGH 0x07 +#define IXGBE_NETLIST_ID_BLK_REV_LOW 0x08 +#define IXGBE_NETLIST_ID_BLK_REV_HIGH 0x09 +#define IXGBE_NETLIST_ID_BLK_SHA_HASH_WORD(n) (0x0A + (n)) +#define IXGBE_NETLIST_ID_BLK_CUST_VER 0x2F + +/* The Link Topology Netlist section is stored as a series of words. It is + * stored in the NVM as a TLV, with the first two words containing the type + * and length. + */ +#define IXGBE_NETLIST_LINK_TOPO_MOD_ID 0x011B +#define IXGBE_NETLIST_TYPE_OFFSET 0x0000 +#define IXGBE_NETLIST_LEN_OFFSET 0x0001 + +/* The Link Topology section follows the TLV header. When reading the netlist + * using ixgbe_read_netlist_module, we need to account for the 2-word TLV + * header. 
+ */ +#define IXGBE_NETLIST_LINK_TOPO_OFFSET(n) ((n) + 2) +#define IXGBE_LINK_TOPO_MODULE_LEN IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0000) +#define IXGBE_LINK_TOPO_NODE_COUNT IXGBE_NETLIST_LINK_TOPO_OFFSET(0x0001) +#define IXGBE_LINK_TOPO_NODE_COUNT_M GENMASK_ULL(9, 0) + /* Firmware Status Register (GL_FWSTS) */ #define GL_FWSTS 0x00083048 /* Reset Source: POR */ #define GL_FWSTS_EP_PF0 BIT(24) From 8210ff738077ed3581e022e5cc8721aa041d42cb Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:01 +0200 Subject: [PATCH 190/770] ixgbe: add .info_get extension specific for E610 devices E610 devices make it possible to show more detailed info than the previous boards. Extend the reported NVM info with the following pieces: fw.mgmt.api -> version number of the API fw.mgmt.build -> identifier of the source for the FW fw.mgmt.srev -> number defining FW's security revision fw.psid.api -> version defining the format of the flash contents fw.undi.srev -> number defining OROM's security revision fw.netlist -> version of the netlist module fw.netlist.build -> first 4 bytes of the netlist hash Co-developed-by: Slawomir Mrozowicz Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- .../networking/devlink/devlink-info.rst | 4 + Documentation/networking/devlink/ixgbe.rst | 38 ++++++ .../ethernet/intel/ixgbe/devlink/devlink.c | 128 +++++++++++++++++- 3 files changed, 167 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 23073bc219d8e..dd6adc4d05596 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -86,6 +86,10 @@ In case software/firmware components are loaded from the disk (e.g. ``/lib/firmware``) only the running version should be reported via the kernel API. +Please note that any security versions reported via devlink are purely +informational. Devlink does not use a secure channel to communicate with +the device. + Generic Versions ================ diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst index b63645de37e81..97692df50cfe1 100644 --- a/Documentation/networking/devlink/ixgbe.rst +++ b/Documentation/networking/devlink/ixgbe.rst @@ -10,6 +10,10 @@ device driver. Info versions ============= +Any of the versions dealing with security presented by ``devlink-info`` +are purely informational. Devlink does not use a secure channel to communicate +with the device. + The ``ixgbe`` driver reports the following versions .. list-table:: devlink info versions implemented @@ -33,8 +37,42 @@ The ``ixgbe`` driver reports the following versions non-breaking changes and reset to 1 when the major version is incremented. The patch version is normally 0 but is incremented when a fix is delivered as a patch against an older base Option ROM. + * - ``fw.undi.srev`` + - running + - 4 + - Number indicating the security revision of the Option ROM. * - ``fw.bundle_id`` - running - 0x80000d0d - Unique identifier of the firmware image file that was loaded onto the device. Also referred to as the EETRACK identifier of the NVM. + * - ``fw.mgmt.api`` + - running + - 1.5.1 + - 3-digit version number (major.minor.patch) of the API exported over + the AdminQ by the management firmware. Used by the driver to + identify what commands are supported.
Historical versions of the + kernel only displayed a 2-digit version number (major.minor). + * - ``fw.mgmt.build`` + - running + - 0x305d955f + - Unique identifier of the source for the management firmware. + * - ``fw.mgmt.srev`` + - running + - 3 + - Number indicating the security revision of the firmware. + * - ``fw.psid.api`` + - running + - 0.80 + - Version defining the format of the flash contents. + * - ``fw.netlist`` + - running + - 1.1.2000-6.7.0 + - The version of the netlist module. This module defines the device's + Ethernet capabilities and default settings, and is used by the + management firmware as part of managing link and device + connectivity. + * - ``fw.netlist.build`` + - running + - 0xee16ced7 + - The first 4 bytes of the hash of the netlist module contents. diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c index 7bbd1733b8dad..8c24564845e0f 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -19,14 +19,22 @@ static void ixgbe_info_get_dsn(struct ixgbe_adapter *adapter, snprintf(ctx->buf, sizeof(ctx->buf), "%8phD", dsn); } -static void ixgbe_info_nvm_ver(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) +static void ixgbe_info_orom_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) { struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_nvm_version nvm_ver; ctx->buf[0] = '\0'; + if (hw->mac.type == ixgbe_mac_e610) { + struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + orom->major, orom->build, orom->patch); + return; + } + ixgbe_get_oem_prod_version(hw, &nvm_ver); if (nvm_ver.oem_valid) { snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x", @@ -48,6 +56,12 @@ static void ixgbe_info_eetrack(struct ixgbe_adapter *adapter, struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_nvm_version nvm_ver; + if (hw->mac.type == ixgbe_mac_e610) { + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", + hw->flash.nvm.eetrack); + return; + } + ixgbe_get_oem_prod_version(hw, &nvm_ver); /* No ETRACK version for OEM */ @@ -60,6 +74,110 @@ static void ixgbe_info_eetrack(struct ixgbe_adapter *adapter, snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", nvm_ver.etk_id); } +static void ixgbe_info_fw_api(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_hw *hw = &adapter->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", + hw->api_maj_ver, hw->api_min_ver, hw->api_patch); +} + +static void ixgbe_info_fw_build(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_hw *hw = &adapter->hw; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", hw->fw_build); +} + +static void ixgbe_info_fw_srev(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); +} + +static void ixgbe_info_orom_srev(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); +} + +static void ixgbe_info_nvm_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); +} + +static void ixgbe_info_netlist_ver(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_netlist_info 
*netlist = &adapter->hw.flash.netlist; + + /* The netlist version fields are BCD formatted */ + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", + netlist->major, netlist->minor, + netlist->type >> 16, netlist->type & 0xFFFF, + netlist->rev, netlist->cust_ver); +} + +static void ixgbe_info_netlist_build(struct ixgbe_adapter *adapter, + struct ixgbe_info_ctx *ctx) +{ + struct ixgbe_netlist_info *netlist = &adapter->hw.flash.netlist; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); +} + +static int ixgbe_devlink_info_get_e610(struct ixgbe_adapter *adapter, + struct devlink_info_req *req, + struct ixgbe_info_ctx *ctx) +{ + int err; + + ixgbe_info_fw_api(adapter, ctx); + err = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_MGMT_API, + ctx->buf); + if (err) + return err; + + ixgbe_info_fw_build(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.mgmt.build", ctx->buf); + if (err) + return err; + + ixgbe_info_fw_srev(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.mgmt.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_orom_srev(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.undi.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_nvm_ver(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.psid.api", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_ver(adapter, ctx); + err = devlink_info_version_running_put(req, "fw.netlist", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_build(adapter, ctx); + return devlink_info_version_running_put(req, "fw.netlist.build", + ctx->buf); +} + static int ixgbe_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) @@ -88,7 +206,7 @@ static int ixgbe_devlink_info_get(struct devlink *devlink, if (err) goto free_ctx; - ixgbe_info_nvm_ver(adapter, ctx); + ixgbe_info_orom_ver(adapter, ctx); err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ctx->buf); @@ -99,6 +217,10 @@ static int ixgbe_devlink_info_get(struct devlink *devlink, err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ctx->buf); + if (err || hw->mac.type != ixgbe_mac_e610) + goto free_ctx; + + err = ixgbe_devlink_info_get_e610(adapter, req, ctx); free_ctx: kfree(ctx); return err; From 4654ec6194b22af73c797508c72710e2e2bba4ad Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:02 +0200 Subject: [PATCH 191/770] ixgbe: add E610 functions getting PBA and FW ver info Introduce two E610-specific callback implementations: -ixgbe_start_hw_e610() which extends the regular .start_hw callback with FW version retrieval -ixgbe_read_pba_string_e610() which gets the Product Board Assembly string Extend EEPROM ops with a new .read_pba_string callback in order to distinguish the generic implementation from the E610 one.
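The PBA lookup added here walks the Preserved Field Area (PFA) as a list of TLVs: each entry is a type word, a length word, and then the data words, so the next entry begins at the current pointer plus length plus 2. A stand-alone sketch of that walk, with a hypothetical read_word() standing in for the word-granular ACI Shadow RAM read:

#include <errno.h>
#include <stdint.h>

/* Hypothetical stand-in for the driver's word-granular EEPROM read. */
int read_word(uint16_t offset, uint16_t *value);

static int find_pfa_tlv(uint16_t pfa_ptr, uint16_t pfa_len, uint16_t wanted,
			uint16_t *tlv_start, uint16_t *tlv_len)
{
	uint16_t next = pfa_ptr + 1;	/* first TLV follows the PFA length word */
	uint16_t end = pfa_ptr + pfa_len;

	while (next < end) {
		uint16_t type, len;

		if (read_word(next, &type) || read_word(next + 1, &len))
			return -EIO;

		if (type == wanted) {
			*tlv_start = next;
			*tlv_len = len;
			return 0;
		}

		/* Skip the type and length words plus the data words. */
		next += len + 2;
	}

	return -ENODATA;	/* requested module does not exist */
}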
Reviewed-by: Mateusz Polchlopek Tested-by: Bharath R Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- .../ethernet/intel/ixgbe/devlink/devlink.c | 2 +- .../net/ethernet/intel/ixgbe/ixgbe_82598.c | 1 + .../net/ethernet/intel/ixgbe/ixgbe_82599.c | 1 + .../net/ethernet/intel/ixgbe/ixgbe_common.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 184 +++++++++++++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 +- drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 2 + .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c | 1 + 10 files changed, 194 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c index 8c24564845e0f..a7c39e951c7b7 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -196,7 +196,7 @@ static int ixgbe_devlink_info_get(struct devlink *devlink, if (err) goto free_ctx; - err = ixgbe_read_pba_string_generic(hw, ctx->buf, sizeof(ctx->buf)); + err = hw->eeprom.ops.read_pba_string(hw, ctx->buf, sizeof(ctx->buf)); if (err) goto free_ctx; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c index 4aaaea3b5f8ff..444da982593f6 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c @@ -1169,6 +1169,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_82598 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_generic, .validate_checksum = &ixgbe_validate_eeprom_checksum_generic, .update_checksum = &ixgbe_update_eeprom_checksum_generic, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_82598 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c index 964988b4d58b8..d5b1b974b4a33 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82599.c @@ -2230,6 +2230,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_82599 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_generic, .validate_checksum = &ixgbe_validate_eeprom_checksum_generic, .update_checksum = &ixgbe_update_eeprom_checksum_generic, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_82599 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 7beaf6ea57f9e..5784d5d1896e0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -332,6 +332,7 @@ int ixgbe_start_hw_generic(struct ixgbe_hw *hw) * Devices in the second generation: * 82599 * X540 + * E610 **/ int ixgbe_start_hw_gen2(struct ixgbe_hw *hw) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index af88a6afb4117..f856690106af4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -343,6 +343,40 @@ void ixgbe_fill_dflt_direct_cmd_desc(struct ixgbe_aci_desc *desc, u16 opcode) desc->flags = cpu_to_le16(IXGBE_ACI_FLAG_SI); } +/** + * ixgbe_aci_get_fw_ver - Get the firmware version + * @hw: pointer to the HW struct + * + * Get the firmware version using ACI command 
(0x0001). + * + * Return: the exit code of the operation. + */ +static int ixgbe_aci_get_fw_ver(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_cmd_get_ver *resp; + struct ixgbe_aci_desc desc; + int err; + + resp = &desc.params.get_ver; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_get_ver); + + err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0); + if (!err) { + hw->fw_branch = resp->fw_branch; + hw->fw_maj_ver = resp->fw_major; + hw->fw_min_ver = resp->fw_minor; + hw->fw_patch = resp->fw_patch; + hw->fw_build = le32_to_cpu(resp->fw_build); + hw->api_branch = resp->api_branch; + hw->api_maj_ver = resp->api_major; + hw->api_min_ver = resp->api_minor; + hw->api_patch = resp->api_patch; + } + + return err; +} + /** * ixgbe_aci_req_res - request a common resource * @hw: pointer to the HW struct @@ -1410,6 +1444,32 @@ int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask) return ixgbe_aci_get_link_info(hw, activate, NULL); } +/** + * ixgbe_start_hw_e610 - Prepare hardware for Tx/Rx + * @hw: pointer to hardware structure + * + * Get firmware version and start the hardware using the generic + * start_hw() and ixgbe_start_hw_gen2() functions. + * + * Return: the exit code of the operation. + */ +static int ixgbe_start_hw_e610(struct ixgbe_hw *hw) +{ + int err; + + err = ixgbe_aci_get_fw_ver(hw); + if (err) + return err; + + err = ixgbe_start_hw_generic(hw); + if (err) + return err; + + ixgbe_start_hw_gen2(hw); + + return 0; +} + /** * ixgbe_get_media_type_e610 - Gets media type * @hw: pointer to the HW struct @@ -3366,9 +3426,129 @@ int ixgbe_reset_hw_e610(struct ixgbe_hw *hw) return err; } +/** + * ixgbe_get_pfa_module_tlv - Read sub module TLV from NVM PFA + * @hw: pointer to hardware structure + * @module_tlv: pointer to module TLV to return + * @module_tlv_len: pointer to module TLV length to return + * @module_type: module type requested + * + * Find the requested sub module TLV type from the Preserved Field + * Area (PFA) and return the TLV pointer and length. The caller can + * use these to read the variable length TLV value. + * + * Return: the exit code of the operation. + */ +static int ixgbe_get_pfa_module_tlv(struct ixgbe_hw *hw, u16 *module_tlv, + u16 *module_tlv_len, u16 module_type) +{ + u16 pfa_len, pfa_ptr, pfa_end_ptr; + u16 next_tlv; + int err; + + err = ixgbe_read_ee_aci_e610(hw, IXGBE_E610_SR_PFA_PTR, &pfa_ptr); + if (err) + return err; + + err = ixgbe_read_ee_aci_e610(hw, pfa_ptr, &pfa_len); + if (err) + return err; + + /* Starting with the first TLV after the PFA length, iterate through the list + * of TLVs to find the requested one. + */ + next_tlv = pfa_ptr + 1; + pfa_end_ptr = pfa_ptr + pfa_len; + while (next_tlv < pfa_end_ptr) { + u16 tlv_sub_module_type, tlv_len; + + /* Read TLV type */ + err = ixgbe_read_ee_aci_e610(hw, next_tlv, + &tlv_sub_module_type); + if (err) + break; + + /* Read TLV length */ + err = ixgbe_read_ee_aci_e610(hw, next_tlv + 1, &tlv_len); + if (err) + break; + + if (tlv_sub_module_type == module_type) { + if (tlv_len) { + *module_tlv = next_tlv; + *module_tlv_len = tlv_len; + return 0; + } + return -EIO; + } + /* Check next TLV, i.e. current TLV pointer + length + 2 words + * (for current TLV's type and length).
+ */ + next_tlv = next_tlv + tlv_len + 2; + } + /* Module does not exist */ + return -ENODATA; +} + +/** + * ixgbe_read_pba_string_e610 - Read PBA string from NVM + * @hw: pointer to hardware structure + * @pba_num: stores the part number string from the NVM + * @pba_num_size: part number string buffer length + * + * Read the part number string from the NVM. + * + * Return: the exit code of the operation. + */ +static int ixgbe_read_pba_string_e610(struct ixgbe_hw *hw, u8 *pba_num, + u32 pba_num_size) +{ + u16 pba_tlv, pba_tlv_len; + u16 pba_word, pba_size; + int err; + + *pba_num = '\0'; + + err = ixgbe_get_pfa_module_tlv(hw, &pba_tlv, &pba_tlv_len, + IXGBE_E610_SR_PBA_BLOCK_PTR); + if (err) + return err; + + /* pba_size is the next word */ + err = ixgbe_read_ee_aci_e610(hw, (pba_tlv + 2), &pba_size); + if (err) + return err; + + if (pba_tlv_len < pba_size) + return -EINVAL; + + /* Subtract one to get PBA word count (PBA Size word is included in + * total size). + */ + pba_size--; + + if (pba_num_size < (((u32)pba_size * 2) + 1)) + return -EINVAL; + + for (u16 i = 0; i < pba_size; i++) { + err = ixgbe_read_ee_aci_e610(hw, (pba_tlv + 2 + 1) + i, + &pba_word); + if (err) + return err; + + pba_num[(i * 2)] = FIELD_GET(IXGBE_E610_SR_PBA_BLOCK_MASK, + pba_word); + pba_num[(i * 2) + 1] = pba_word & 0xFF; + } + + pba_num[(pba_size * 2)] = '\0'; + + return err; +} + static const struct ixgbe_mac_operations mac_ops_e610 = { .init_hw = ixgbe_init_hw_generic, - .start_hw = ixgbe_start_hw_X540, + .start_hw = ixgbe_start_hw_e610, .clear_hw_cntrs = ixgbe_clear_hw_cntrs_generic, .enable_rx_dma = ixgbe_enable_rx_dma_generic, .get_mac_addr = ixgbe_get_mac_addr_generic, @@ -3433,6 +3613,8 @@ static const struct ixgbe_eeprom_operations eeprom_ops_e610 = { .read = ixgbe_read_ee_aci_e610, .read_buffer = ixgbe_read_ee_aci_buffer_e610, .validate_checksum = ixgbe_validate_eeprom_checksum_e610, + .read_pba_string = ixgbe_read_pba_string_e610, + .init_params = ixgbe_init_eeprom_params_e610, }; const struct ixgbe_info ixgbe_e610_info = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 1f2afbf6b02ee..22e61baf4295e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -6850,7 +6850,7 @@ static int ixgbe_sw_init(struct ixgbe_adapter *adapter, adapter->tx_work_limit = IXGBE_DEFAULT_TX_WORK; /* initialize eeprom parameters */ - if (ixgbe_init_eeprom_params_generic(hw)) { + if (hw->eeprom.ops.init_params(hw)) { e_dev_err("EEPROM initialization failed\n"); return -EIO; } @@ -11604,7 +11604,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (expected_gts > 0) ixgbe_check_minimum_link(adapter, expected_gts); - err = ixgbe_read_pba_string_generic(hw, part_str, sizeof(part_str)); + err = hw->eeprom.ops.read_pba_string(hw, part_str, sizeof(part_str)); if (err) strscpy(part_str, "Unknown", sizeof(part_str)); if (ixgbe_is_sfp(hw) && hw->phy.sfp_type != ixgbe_sfp_type_not_present) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h index 5fdf32d79d822..5f814f023573f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h @@ -3446,6 +3446,8 @@ struct ixgbe_eeprom_operations { int (*validate_checksum)(struct ixgbe_hw *, u16 *); int (*update_checksum)(struct ixgbe_hw *); int (*calc_checksum)(struct ixgbe_hw *); + int (*read_pba_string)(struct ixgbe_hw *hw, u8 
*pba_num, + u32 pba_num_size); }; struct ixgbe_mac_operations { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 91cc79aab388d..8b145d0a05f13 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -12,6 +12,7 @@ /* Checksum and Shadow RAM pointers */ #define IXGBE_E610_SR_NVM_CTRL_WORD 0x00 #define IXGBE_E610_SR_PBA_BLOCK_PTR 0x16 +#define IXGBE_E610_SR_PBA_BLOCK_MASK GENMASK(15, 8) #define IXGBE_E610_SR_NVM_DEV_STARTER_VER 0x18 #define IXGBE_E610_SR_NVM_EETRACK_LO 0x2D #define IXGBE_E610_SR_NVM_EETRACK_HI 0x2E diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c index 1fc821fb351a5..f1ab95aa8c83b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c @@ -894,6 +894,7 @@ static const struct ixgbe_eeprom_operations eeprom_ops_X540 = { .calc_checksum = &ixgbe_calc_eeprom_checksum_X540, .validate_checksum = &ixgbe_validate_eeprom_checksum_X540, .update_checksum = &ixgbe_update_eeprom_checksum_X540, + .read_pba_string = &ixgbe_read_pba_string_generic, }; static const struct ixgbe_phy_operations phy_ops_X540 = { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index 277ceaf8a7939..1d2acdb64f459 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -3959,6 +3959,7 @@ static const struct ixgbe_mac_operations mac_ops_x550em_a_fw = { .validate_checksum = &ixgbe_validate_eeprom_checksum_X550, \ .update_checksum = &ixgbe_update_eeprom_checksum_X550, \ .calc_checksum = &ixgbe_calc_eeprom_checksum_X550, \ + .read_pba_string = &ixgbe_read_pba_string_generic, \ static const struct ixgbe_eeprom_operations eeprom_ops_X550 = { X550_COMMON_EEP From 6eae2aeb60b6f1cfa97e6e8f296027f15173a67a Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:03 +0200 Subject: [PATCH 192/770] ixgbe: extend .info_get() with stored versions Add functions that read version information from the inactive flash bank. Print stored versions for the content present in the inactive bank. If there is a pending update, the versions reflect the ones that are going to be loaded after reload. If there is no pending update, running and stored versions are the same, which means there won't be any NVM change on reload.
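Every version helper follows the same selection pattern: report the active-bank (running) data unless a stored report is requested and the corresponding pending-update capability bit is set, in which case the inactive-bank data is used. A minimal sketch of that selection using the types this patch introduces (the helper itself is illustrative, not part of the diff):

static const struct ixgbe_nvm_info *
ixgbe_pick_nvm_info(const struct ixgbe_info_ctx *ctx,
		    const struct ixgbe_nvm_info *running,
		    enum ixgbe_devlink_version_type type)
{
	/* The stored report switches to the inactive bank only when an
	 * update is actually pending; otherwise running == stored.
	 */
	if (type == IXGBE_DL_VERSION_STORED &&
	    ctx->dev_caps.common_cap.nvm_update_pending_nvm)
		return &ctx->pending_nvm;

	return running;
}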
Co-developed-by: Slawomir Mrozowicz Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- .../ethernet/intel/ixgbe/devlink/devlink.c | 170 ++++++++++++++++-- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 59 ++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 5 + 3 files changed, 218 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c index a7c39e951c7b7..f7bafc1dad2bb 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -6,6 +6,15 @@ struct ixgbe_info_ctx { char buf[128]; + struct ixgbe_orom_info pending_orom; + struct ixgbe_nvm_info pending_nvm; + struct ixgbe_netlist_info pending_netlist; + struct ixgbe_hw_dev_caps dev_caps; +}; + +enum ixgbe_devlink_version_type { + IXGBE_DL_VERSION_RUNNING, + IXGBE_DL_VERSION_STORED }; static void ixgbe_info_get_dsn(struct ixgbe_adapter *adapter, @@ -20,7 +29,8 @@ static void ixgbe_info_get_dsn(struct ixgbe_adapter *adapter, } static void ixgbe_info_orom_ver(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_nvm_version nvm_ver; @@ -30,6 +40,10 @@ static void ixgbe_info_orom_ver(struct ixgbe_adapter *adapter, if (hw->mac.type == ixgbe_mac_e610) { struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_orom) + orom = &ctx->pending_orom; + snprintf(ctx->buf, sizeof(ctx->buf), "%u.%u.%u", orom->major, orom->build, orom->patch); return; @@ -51,14 +65,20 @@ static void ixgbe_info_orom_ver(struct ixgbe_adapter *adapter, } static void ixgbe_info_eetrack(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_hw *hw = &adapter->hw; struct ixgbe_nvm_version nvm_ver; if (hw->mac.type == ixgbe_mac_e610) { - snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", - hw->flash.nvm.eetrack); + u32 eetrack = hw->flash.nvm.eetrack; + + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + eetrack = ctx->pending_nvm.eetrack; + + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", eetrack); return; } @@ -92,34 +112,54 @@ static void ixgbe_info_fw_build(struct ixgbe_adapter *adapter, } static void ixgbe_info_fw_srev(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + nvm = &ctx->pending_nvm; + snprintf(ctx->buf, sizeof(ctx->buf), "%u", nvm->srev); } static void ixgbe_info_orom_srev(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_orom) + orom = &ctx->pending_orom; + snprintf(ctx->buf, sizeof(ctx->buf), "%u", orom->srev); } static void ixgbe_info_nvm_ver(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_nvm_info *nvm = 
&adapter->hw.flash.nvm; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_nvm) + nvm = &ctx->pending_nvm; + snprintf(ctx->buf, sizeof(ctx->buf), "%x.%02x", nvm->major, nvm->minor); } static void ixgbe_info_netlist_ver(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_netlist_info *netlist = &adapter->hw.flash.netlist; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_netlist) + netlist = &ctx->pending_netlist; + /* The netlist version fields are BCD formatted */ snprintf(ctx->buf, sizeof(ctx->buf), "%x.%x.%x-%x.%x.%x", netlist->major, netlist->minor, @@ -128,13 +168,57 @@ static void ixgbe_info_netlist_ver(struct ixgbe_adapter *adapter, } static void ixgbe_info_netlist_build(struct ixgbe_adapter *adapter, - struct ixgbe_info_ctx *ctx) + struct ixgbe_info_ctx *ctx, + enum ixgbe_devlink_version_type type) { struct ixgbe_netlist_info *netlist = &adapter->hw.flash.netlist; + if (type == IXGBE_DL_VERSION_STORED && + ctx->dev_caps.common_cap.nvm_update_pending_netlist) + netlist = &ctx->pending_netlist; + snprintf(ctx->buf, sizeof(ctx->buf), "0x%08x", netlist->hash); } +static int ixgbe_set_ctx_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_info_ctx *ctx, + struct netlink_ext_ack *extack) +{ + bool *pending_orom, *pending_nvm, *pending_netlist; + int err; + + err = ixgbe_discover_dev_caps(hw, &ctx->dev_caps); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to discover device capabilities"); + return err; + } + + pending_orom = &ctx->dev_caps.common_cap.nvm_update_pending_orom; + pending_nvm = &ctx->dev_caps.common_cap.nvm_update_pending_nvm; + pending_netlist = &ctx->dev_caps.common_cap.nvm_update_pending_netlist; + + if (*pending_orom) { + err = ixgbe_get_inactive_orom_ver(hw, &ctx->pending_orom); + if (err) + *pending_orom = false; + } + + if (*pending_nvm) { + err = ixgbe_get_inactive_nvm_ver(hw, &ctx->pending_nvm); + if (err) + *pending_nvm = false; + } + + if (*pending_netlist) { + err = ixgbe_get_inactive_netlist_ver(hw, &ctx->pending_netlist); + if (err) + *pending_netlist = false; + } + + return 0; +} + static int ixgbe_devlink_info_get_e610(struct ixgbe_adapter *adapter, struct devlink_info_req *req, struct ixgbe_info_ctx *ctx) @@ -153,31 +237,77 @@ static int ixgbe_devlink_info_get_e610(struct ixgbe_adapter *adapter, if (err) return err; - ixgbe_info_fw_srev(adapter, ctx); + ixgbe_info_fw_srev(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, "fw.mgmt.srev", ctx->buf); if (err) return err; - ixgbe_info_orom_srev(adapter, ctx); + ixgbe_info_orom_srev(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, "fw.undi.srev", ctx->buf); if (err) return err; - ixgbe_info_nvm_ver(adapter, ctx); + ixgbe_info_nvm_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, "fw.psid.api", ctx->buf); if (err) return err; - ixgbe_info_netlist_ver(adapter, ctx); + ixgbe_info_netlist_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, "fw.netlist", ctx->buf); if (err) return err; - ixgbe_info_netlist_build(adapter, ctx); + ixgbe_info_netlist_build(adapter, ctx, IXGBE_DL_VERSION_RUNNING); return devlink_info_version_running_put(req, "fw.netlist.build", ctx->buf); } +static int +ixgbe_devlink_pending_info_get_e610(struct ixgbe_adapter *adapter, + struct devlink_info_req *req, + struct ixgbe_info_ctx *ctx) 
+{ + int err; + + ixgbe_info_orom_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, + ctx->buf); + if (err) + return err; + + ixgbe_info_eetrack(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, + ctx->buf); + if (err) + return err; + + ixgbe_info_fw_srev(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.mgmt.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_orom_srev(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.undi.srev", ctx->buf); + if (err) + return err; + + ixgbe_info_nvm_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.psid.api", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_ver(adapter, ctx, IXGBE_DL_VERSION_STORED); + err = devlink_info_version_stored_put(req, "fw.netlist", ctx->buf); + if (err) + return err; + + ixgbe_info_netlist_build(adapter, ctx, IXGBE_DL_VERSION_STORED); + return devlink_info_version_stored_put(req, "fw.netlist.build", + ctx->buf); +} + static int ixgbe_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) @@ -206,21 +336,29 @@ static int ixgbe_devlink_info_get(struct devlink *devlink, if (err) goto free_ctx; - ixgbe_info_orom_ver(adapter, ctx); + ixgbe_info_orom_ver(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_UNDI, ctx->buf); if (err) goto free_ctx; - ixgbe_info_eetrack(adapter, ctx); + ixgbe_info_eetrack(adapter, ctx, IXGBE_DL_VERSION_RUNNING); err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID, ctx->buf); if (err || hw->mac.type != ixgbe_mac_e610) goto free_ctx; + err = ixgbe_set_ctx_dev_caps(hw, ctx, extack); + if (err) + goto free_ctx; + err = ixgbe_devlink_info_get_e610(adapter, req, ctx); + if (err) + goto free_ctx; + + err = ixgbe_devlink_pending_info_get_e610(adapter, req, ctx); free_ctx: kfree(ctx); return err; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index f856690106af4..24443db831eb1 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -588,6 +588,15 @@ static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw, break; case IXGBE_ACI_CAPS_NVM_VER: break; + case IXGBE_ACI_CAPS_PENDING_NVM_VER: + caps->nvm_update_pending_nvm = true; + break; + case IXGBE_ACI_CAPS_PENDING_OROM_VER: + caps->nvm_update_pending_orom = true; + break; + case IXGBE_ACI_CAPS_PENDING_NET_VER: + caps->nvm_update_pending_netlist = true; + break; case IXGBE_ACI_CAPS_MAX_MTU: caps->max_mtu = number; break; @@ -2932,6 +2941,23 @@ static int ixgbe_get_orom_ver_info(struct ixgbe_hw *hw, return ixgbe_get_orom_srev(hw, bank, &orom->srev); } +/** + * ixgbe_get_inactive_orom_ver - Read Option ROM version from the inactive bank + * @hw: pointer to the HW structure + * @orom: storage for Option ROM version information + * + * Read the Option ROM version and security revision data for the inactive + * section of flash. Used to access version data for a pending update that has + * not yet been activated. + * + * Return: the exit code of the operation. 
+ */ +int ixgbe_get_inactive_orom_ver(struct ixgbe_hw *hw, + struct ixgbe_orom_info *orom) +{ + return ixgbe_get_orom_ver_info(hw, IXGBE_INACTIVE_FLASH_BANK, orom); +} + /** * ixgbe_get_nvm_ver_info - Read NVM version information * @hw: pointer to the HW struct @@ -2975,6 +3001,22 @@ static int ixgbe_get_nvm_ver_info(struct ixgbe_hw *hw, return 0; } +/** + * ixgbe_get_inactive_nvm_ver - Read NVM version from the inactive bank + * @hw: pointer to the HW structure + * @nvm: storage for NVM version information + * + * Read the NVM EETRACK ID, Map version, and security revision of the + * inactive NVM bank. Used to access version data for a pending update that + * has not yet been activated. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm) +{ + return ixgbe_get_nvm_ver_info(hw, IXGBE_INACTIVE_FLASH_BANK, nvm); +} + /** * ixgbe_get_netlist_info - Read the netlist version information * @hw: pointer to the HW struct @@ -3055,6 +3097,23 @@ static int ixgbe_get_netlist_info(struct ixgbe_hw *hw, return err; } +/** + * ixgbe_get_inactive_netlist_ver - Read netlist version from the inactive bank + * @hw: pointer to the HW struct + * @netlist: pointer to netlist version info structure + * + * Read the netlist version data from the inactive netlist bank. Used to + * extract version data of a pending flash update in order to display the + * version data. + * + * Return: the exit code of the operation. + */ +int ixgbe_get_inactive_netlist_ver(struct ixgbe_hw *hw, + struct ixgbe_netlist_info *netlist) +{ + return ixgbe_get_netlist_info(hw, IXGBE_INACTIVE_FLASH_BANK, netlist); +} + /** * ixgbe_get_flash_data - get flash data * @hw: pointer to the HW struct diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 2c971a34200b8..7565a40d792fc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -67,6 +67,11 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, u16 length, void *data, bool last_command, bool read_shadow_ram); int ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw); +int ixgbe_get_inactive_orom_ver(struct ixgbe_hw *hw, + struct ixgbe_orom_info *orom); +int ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm); +int ixgbe_get_inactive_netlist_ver(struct ixgbe_hw *hw, + struct ixgbe_netlist_info *netlist); int ixgbe_read_sr_word_aci(struct ixgbe_hw *hw, u16 offset, u16 *data); int ixgbe_read_flat_nvm(struct ixgbe_hw *hw, u32 offset, u32 *length, u8 *data, bool read_shadow_ram); From a0f45672d5e14af053d2dc5f552381351f6eeac0 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:04 +0200 Subject: [PATCH 193/770] ixgbe: add device flash update via devlink Use the pldmfw library to implement device flash update for the Intel ixgbe networking device driver, specifically for E610 devices. This support uses the devlink flash update interface. Using the pldmfw library, the provided firmware file will be scanned for the three major components, "fw.undi" for the Option ROM, "fw.mgmt" for the main NVM module containing the primary device firmware, and "fw.netlist" containing the netlist module. The flash is separated into two banks, the active bank containing the running firmware, and the inactive bank, which is used for the update. Each module is updated in a staged process.
First, the inactive bank is erased, preparing the device for update. Second, the contents of the component are copied to the inactive portion of the flash. After all components are updated, the driver signals the device to switch the active bank during the next EMP reset. With this implementation, basic flash update for the E610 hardware is supported. Reviewed-by: Jacob Keller Tested-by: Bharath R Co-developed-by: Slawomir Mrozowicz Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- Documentation/networking/devlink/ixgbe.rst | 27 + drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/ixgbe/Makefile | 2 +- .../ethernet/intel/ixgbe/devlink/devlink.c | 4 + drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 209 ++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 11 + .../ethernet/intel/ixgbe/ixgbe_fw_update.c | 668 ++++++++++++++++++ .../ethernet/intel/ixgbe/ixgbe_fw_update.h | 12 + .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 64 ++ 9 files changed, 997 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c create mode 100644 drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst index 97692df50cfe1..3fd2584b2bf08 100644 --- a/Documentation/networking/devlink/ixgbe.rst +++ b/Documentation/networking/devlink/ixgbe.rst @@ -76,3 +76,30 @@ The ``ixgbe`` driver reports the following versions - running - 0xee16ced7 - The first 4 bytes of the hash of the netlist module contents. + +Flash Update +============ + +The ``ixgbe`` driver implements support for flash update using the +``devlink-flash`` interface. It supports updating the device flash using a +combined flash image that contains the ``fw.mgmt``, ``fw.undi``, and +``fw.netlist`` components. + +.. list-table:: List of supported overwrite modes + :widths: 5 95 + + * - Bits + - Behavior + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` + - Do not preserve settings stored in the flash components being + updated. This includes overwriting the port configuration that + determines the number of physical functions the device will + initialize with. + * - ``DEVLINK_FLASH_OVERWRITE_SETTINGS`` and ``DEVLINK_FLASH_OVERWRITE_IDENTIFIERS`` + - Do not preserve either settings or identifiers. Overwrite everything + in the flash with the contents from the provided image, without + performing any preservation. This includes overwriting device + identifying fields such as the MAC address, Vital product Data (VPD) area, + and device serial number. It is expected that this combination be used with an + image customized for the specific device. 
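As a reader's aid, the staged flow the commit message describes reduces, per
component, to roughly the sketch below. The helpers named here
(ixgbe_erase_nvm_module(), ixgbe_write_nvm_module(), and
ixgbe_switch_flash_banks()) are the static functions this patch adds in
ixgbe_fw_update.c; the wrapper function itself is hypothetical, and error
reporting plus devlink progress notifications are omitted:

    /* Illustrative sketch only, not part of the patch: one component is
     * staged by erasing the inactive bank and then copying the image in.
     * Activation happens once for all components at finalize time, via
     * ixgbe_switch_flash_banks(), and takes effect on the next EMP reset.
     */
    static int ixgbe_stage_one_component(struct ixgbe_adapter *adapter,
                                         u16 module, const char *name,
                                         const u8 *data, u32 len,
                                         struct netlink_ext_ack *extack)
    {
            int err;

            /* Stage 1: erase the inactive bank for this module */
            err = ixgbe_erase_nvm_module(adapter, module, name, extack);
            if (err)
                    return err;

            /* Stage 2: copy the component into the inactive bank */
            return ixgbe_write_nvm_module(adapter, module, name, data, len,
                                          extack);
    }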
+
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 56ee58c9df21f..b2adb40a59210 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -148,6 +148,7 @@ config IXGBE
 	depends on PTP_1588_CLOCK_OPTIONAL
 	select MDIO
 	select NET_DEVLINK
+	select PLDMFW
 	select PHYLIB
 	help
 	  This driver supports Intel(R) 10GbE PCI Express family of
diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile
index 11f37140c0a30..ce447540d146e 100644
--- a/drivers/net/ethernet/intel/ixgbe/Makefile
+++ b/drivers/net/ethernet/intel/ixgbe/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_IXGBE) += ixgbe.o
 ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
 	   ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \
 	   ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \
-	   ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o
+	   ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o
 
 ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \
 			     ixgbe_dcb_82599.o ixgbe_dcb_nl.o
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
index f7bafc1dad2bb..26cbeb077fa64 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
@@ -3,6 +3,7 @@
 
 #include "ixgbe.h"
 #include "devlink.h"
+#include "ixgbe_fw_update.h"
 
 struct ixgbe_info_ctx {
 	char buf[128];
@@ -366,6 +367,9 @@ static int ixgbe_devlink_info_get(struct devlink *devlink,
 
 static const struct devlink_ops ixgbe_devlink_ops = {
 	.info_get = ixgbe_devlink_info_get,
+	.supported_flash_update_params =
+		DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK,
+	.flash_update = ixgbe_flash_pldm_image,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
index 24443db831eb1..5abba05554c8f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
@@ -597,6 +597,11 @@ static bool ixgbe_parse_e610_caps(struct ixgbe_hw *hw,
 	case IXGBE_ACI_CAPS_PENDING_NET_VER:
 		caps->nvm_update_pending_netlist = true;
 		break;
+	case IXGBE_ACI_CAPS_NVM_MGMT:
+		caps->nvm_unified_update =
+			(number & IXGBE_NVM_MGMT_UNIFIED_UPD_SUPPORT) ?
+			true : false;
+		break;
 	case IXGBE_ACI_CAPS_MAX_MTU:
 		caps->max_mtu = number;
 		break;
@@ -2294,6 +2299,131 @@ int ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset,
 	return ixgbe_aci_send_cmd(hw, &desc, data, length);
 }
 
+/**
+ * ixgbe_aci_erase_nvm - erase NVM sector
+ * @hw: pointer to the HW struct
+ * @module_typeid: module pointer location in words from the NVM beginning
+ *
+ * Erase the NVM sector using the ACI command (0x0702).
+ *
+ * Return: the exit code of the operation.
+ */
+int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid)
+{
+	struct ixgbe_aci_cmd_nvm *cmd;
+	struct ixgbe_aci_desc desc;
+	__le16 len;
+	int err;
+
+	/* Read the length of the module to be erased from the Shadow RAM:
+	 * module_typeid 0 selects the SR, 2 * module_typeid + 2 is the byte
+	 * offset of the word holding the module size, and both last_command
+	 * and read_shadow_ram are passed as true.
+ */ + err = ixgbe_aci_read_nvm(hw, 0, 2 * module_typeid + 2, 2, &len, true, + true); + if (err) + return err; + + cmd = &desc.params.nvm; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_erase); + + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->length = len; + cmd->offset_low = 0; + cmd->offset_high = 0; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + +/** + * ixgbe_aci_update_nvm - update NVM + * @hw: pointer to the HW struct + * @module_typeid: module pointer location in words from the NVM beginning + * @offset: byte offset from the module beginning + * @length: length of the section to be written (in bytes from the offset) + * @data: command buffer (size [bytes] = length) + * @last_command: tells if this is the last command in a series + * @command_flags: command parameters + * + * Update the NVM using the ACI command (0x0703). + * + * Return: the exit code of the operation. + */ +int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid, + u32 offset, u16 length, void *data, + bool last_command, u8 command_flags) +{ + struct ixgbe_aci_cmd_nvm *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.nvm; + + /* In offset the highest byte must be zeroed. */ + if (offset & 0xFF000000) + return -EINVAL; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_write); + + cmd->cmd_flags |= command_flags; + + /* If this is the last command in a series, set the proper flag. */ + if (last_command) + cmd->cmd_flags |= IXGBE_ACI_NVM_LAST_CMD; + cmd->module_typeid = cpu_to_le16(module_typeid); + cmd->offset_low = cpu_to_le16(offset & 0xFFFF); + cmd->offset_high = FIELD_GET(IXGBE_ACI_NVM_OFFSET_HI_U_MASK, offset); + cmd->length = cpu_to_le16(length); + + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + return ixgbe_aci_send_cmd(hw, &desc, data, length); +} + +/** + * ixgbe_nvm_write_activate - NVM activate write + * @hw: pointer to the HW struct + * @cmd_flags: flags for write activate command + * @response_flags: response indicators from firmware + * + * Update the control word with the required banks' validity bits + * and dumps the Shadow RAM to flash using ACI command (0x0707). + * + * cmd_flags controls which banks to activate, the preservation level to use + * when activating the NVM bank, and whether an EMP reset is required for + * activation. + * + * Note that the 16bit cmd_flags value is split between two separate 1 byte + * flag values in the descriptor. + * + * On successful return of the firmware command, the response_flags variable + * is updated with the flags reported by firmware indicating certain status, + * such as whether EMP reset is enabled. + * + * Return: the exit code of the operation. 
+ */
+int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags,
+			     u8 *response_flags)
+{
+	struct ixgbe_aci_cmd_nvm *cmd;
+	struct ixgbe_aci_desc desc;
+	s32 err;
+
+	cmd = &desc.params.nvm;
+	ixgbe_fill_dflt_direct_cmd_desc(&desc,
+					ixgbe_aci_opc_nvm_write_activate);
+
+	cmd->cmd_flags = (u8)(cmd_flags & 0xFF);
+	cmd->offset_high = (u8)FIELD_GET(IXGBE_ACI_NVM_OFFSET_HI_A_MASK,
+					 cmd_flags);
+
+	err = ixgbe_aci_send_cmd(hw, &desc, NULL, 0);
+	if (!err && response_flags)
+		*response_flags = cmd->cmd_flags;
+
+	return err;
+}
+
 /**
  * ixgbe_nvm_validate_checksum - validate checksum
  * @hw: pointer to the HW struct
@@ -3171,6 +3301,85 @@ int ixgbe_get_flash_data(struct ixgbe_hw *hw)
 	return err;
 }
 
+/**
+ * ixgbe_nvm_set_pkg_data - NVM set package data
+ * @hw: pointer to the HW struct
+ * @del_pkg_data_flag: if set, the package data currently stored by the FW
+ *		       is deleted; in that case the buffer should have size 0
+ * @data: pointer to buffer
+ * @length: length of the buffer
+ *
+ * Set package data using ACI command (0x070A).
+ * This command is equivalent to the reception of
+ * a PLDM FW Update GetPackageData cmd. This command should be sent
+ * as part of the NVM update as the first cmd in the flow.
+ *
+ * Return: the exit code of the operation.
+ */
+int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag,
+			   u8 *data, u16 length)
+{
+	struct ixgbe_aci_cmd_nvm_pkg_data *cmd;
+	struct ixgbe_aci_desc desc;
+
+	if (length != 0 && !data)
+		return -EINVAL;
+
+	cmd = &desc.params.pkg_data;
+
+	ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_pkg_data);
+	desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD);
+
+	if (del_pkg_data_flag)
+		cmd->cmd_flags |= IXGBE_ACI_NVM_PKG_DELETE;
+
+	return ixgbe_aci_send_cmd(hw, &desc, data, length);
+}
+
+/**
+ * ixgbe_nvm_pass_component_tbl - NVM pass component table
+ * @hw: pointer to the HW struct
+ * @data: pointer to buffer
+ * @length: length of the buffer
+ * @transfer_flag: parameter for determining stage of the update
+ * @comp_response: a pointer to the response from the 0x070B ACI.
+ * @comp_response_code: a pointer to the response code from the 0x070B ACI.
+ *
+ * Pass component table using ACI command (0x070B). This command is equivalent
+ * to the reception of a PLDM FW Update PassComponentTable cmd.
+ * This command should be sent once per component. It can only be sent after
+ * the Set Package Data cmd and before the actual update. FW will assume these
+ * commands are going to be sent until the TransferFlag is set to End or
+ * StartAndEnd.
+ *
+ * Return: the exit code of the operation.
+ */ +int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code) +{ + struct ixgbe_aci_cmd_nvm_pass_comp_tbl *cmd; + struct ixgbe_aci_desc desc; + int err; + + if (!data || !comp_response || !comp_response_code) + return -EINVAL; + + cmd = &desc.params.pass_comp_tbl; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, + ixgbe_aci_opc_nvm_pass_component_tbl); + desc.flags |= cpu_to_le16(IXGBE_ACI_FLAG_RD); + + cmd->transfer_flag = transfer_flag; + err = ixgbe_aci_send_cmd(hw, &desc, data, length); + if (!err) { + *comp_response = cmd->component_response; + *comp_response_code = cmd->component_response_code; + } + + return err; +} + /** * ixgbe_read_sr_word_aci - Reads Shadow RAM via ACI * @hw: pointer to the HW structure diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 7565a40d792fc..0bc71d1a39b4b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -83,5 +83,16 @@ int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); int ixgbe_reset_hw_e610(struct ixgbe_hw *hw); int ixgbe_get_flash_data(struct ixgbe_hw *hw); +int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag, + u8 *data, u16 length); +int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length, + u8 transfer_flag, u8 *comp_response, + u8 *comp_response_code); +int ixgbe_aci_erase_nvm(struct ixgbe_hw *hw, u16 module_typeid); +int ixgbe_aci_update_nvm(struct ixgbe_hw *hw, u16 module_typeid, + u32 offset, u16 length, void *data, + bool last_command, u8 command_flags); +int ixgbe_nvm_write_activate(struct ixgbe_hw *hw, u16 cmd_flags, + u8 *response_flags); #endif /* _IXGBE_E610_H_ */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c new file mode 100644 index 0000000000000..1ff55dc8a6b7d --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c @@ -0,0 +1,668 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2025 Intel Corporation. */ + +#include +#include +#include + +#include "ixgbe.h" +#include "ixgbe_fw_update.h" + +struct ixgbe_fwu_priv { + struct pldmfw context; + + struct ixgbe_adapter *adapter; + struct netlink_ext_ack *extack; + + /* Track which NVM banks to activate at the end of the update */ + u8 activate_flags; +}; + +/** + * ixgbe_send_package_data - Send record package data to firmware + * @context: PLDM fw update structure + * @data: pointer to the package data + * @length: length of the package data + * + * Send a copy of the package data associated with the PLDM record matching + * this device to the firmware. + * + * Note that this function sends an AdminQ command that will fail unless the + * NVM resource has been acquired. + * + * Return: zero on success, or a negative error code on failure. 
+ */ +static int ixgbe_send_package_data(struct pldmfw *context, + const u8 *data, u16 length) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct ixgbe_hw *hw = &adapter->hw; + u8 *package_data; + int err; + + package_data = kmemdup(data, length, GFP_KERNEL); + if (!package_data) + return -ENOMEM; + + err = ixgbe_nvm_set_pkg_data(hw, false, package_data, length); + + kfree(package_data); + + return err; +} + +/** + * ixgbe_check_component_response - Report firmware response to a component + * @adapter: device private data structure + * @response: indicates whether this component can be updated + * @code: code indicating reason for response + * @extack: netlink extended ACK structure + * + * Check whether firmware indicates if this component can be updated. Report + * a suitable error message over the netlink extended ACK if the component + * cannot be updated. + * + * Return: 0 if the component can be updated, or -ECANCELED if the + * firmware indicates the component cannot be updated. + */ +static int ixgbe_check_component_response(struct ixgbe_adapter *adapter, + u8 response, u8 code, + struct netlink_ext_ack *extack) +{ + switch (response) { + case IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED: + /* Firmware indicated this update is good to proceed. */ + return 0; + case IXGBE_ACI_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE: + NL_SET_ERR_MSG_MOD(extack, + "Firmware recommends not updating, as it may result in a downgrade. Continuing anyways"); + return 0; + case IXGBE_ACI_NVM_PASS_COMP_CAN_NOT_BE_UPDATED: + NL_SET_ERR_MSG_MOD(extack, "Firmware has rejected updating."); + break; + } + + switch (code) { + case IXGBE_ACI_NVM_PASS_COMP_STAMP_IDENTICAL_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is identical to running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_STAMP_LOWER: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is lower than running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_INVALID_STAMP_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component comparison stamp is invalid"); + break; + case IXGBE_ACI_NVM_PASS_COMP_CONFLICT_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component table conflict occurred"); + break; + case IXGBE_ACI_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component pre-requisites not met"); + break; + case IXGBE_ACI_NVM_PASS_COMP_NOT_SUPPORTED_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component not supported"); + break; + case IXGBE_ACI_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE: + NL_SET_ERR_MSG_MOD(extack, "Component cannot be downgraded"); + break; + case IXGBE_ACI_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE: + NL_SET_ERR_MSG_MOD(extack, "Incomplete component image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component version is identical to running image"); + break; + case IXGBE_ACI_NVM_PASS_COMP_VER_STR_LOWER_CODE: + NL_SET_ERR_MSG_MOD(extack, + "Component version is lower than the running image"); + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Received unexpected response code from firmware"); + break; + } + + return -ECANCELED; +} + +/** + * ixgbe_send_component_table - Send PLDM component table to firmware + * @context: PLDM fw update structure + * @component: the component to process + * @transfer_flag: relative transfer order of this component + * + * Read relevant data from the component and forward it to the device + * firmware. 
Check the response to determine if the firmware indicates that + * the update can proceed. + * + * This function sends ACI commands related to the NVM, and assumes that + * the NVM resource has been acquired. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_send_component_table(struct pldmfw *context, + struct pldmfw_component *component, + u8 transfer_flag) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct netlink_ext_ack *extack = priv->extack; + struct ixgbe_aci_cmd_nvm_comp_tbl *comp_tbl; + u8 comp_response, comp_response_code; + struct ixgbe_hw *hw = &adapter->hw; + size_t length; + int err; + + switch (component->identifier) { + case NVM_COMP_ID_OROM: + case NVM_COMP_ID_NVM: + case NVM_COMP_ID_NETLIST: + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Unable to update due to unknown firmware component"); + return -EOPNOTSUPP; + } + + length = struct_size(comp_tbl, cvs, component->version_len); + comp_tbl = kzalloc(length, GFP_KERNEL); + if (!comp_tbl) + return -ENOMEM; + + comp_tbl->comp_class = cpu_to_le16(component->classification); + comp_tbl->comp_id = cpu_to_le16(component->identifier); + comp_tbl->comp_class_idx = FWU_COMP_CLASS_IDX_NOT_USE; + comp_tbl->comp_cmp_stamp = cpu_to_le32(component->comparison_stamp); + comp_tbl->cvs_type = component->version_type; + comp_tbl->cvs_len = component->version_len; + + memcpy(comp_tbl->cvs, component->version_string, + component->version_len); + + err = ixgbe_nvm_pass_component_tbl(hw, (u8 *)comp_tbl, length, + transfer_flag, &comp_response, + &comp_response_code); + + kfree(comp_tbl); + + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to transfer component table to firmware"); + return -EIO; + } + + return ixgbe_check_component_response(adapter, + comp_response, + comp_response_code, extack); +} + +/** + * ixgbe_write_one_nvm_block - Write an NVM block and await completion response + * @adapter: the PF data structure + * @module: the module to write to + * @offset: offset in bytes + * @block_size: size of the block to write, up to 4k + * @block: pointer to block of data to write + * @last_cmd: whether this is the last command + * @extack: netlink extended ACK structure + * + * Write a block of data to a flash module, and await for the completion + * response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * On successful return, reset level indicates the device reset required to + * complete the update. + * + * 0 - IXGBE_ACI_NVM_POR_FLAG - A full power on is required + * 1 - IXGBE_ACI_NVM_PERST_FLAG - A cold PCIe reset is required + * 2 - IXGBE_ACI_NVM_EMPR_FLAG - An EMP reset is required + * + * Return: 0 on success, or a negative error code on failure. 
+ */ +static int ixgbe_write_one_nvm_block(struct ixgbe_adapter *adapter, + u16 module, u32 offset, + u16 block_size, u8 *block, bool last_cmd, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw *hw = &adapter->hw; + + return ixgbe_aci_update_nvm(hw, module, offset, block_size, block, + last_cmd, 0); +} + +/** + * ixgbe_write_nvm_module - Write data to an NVM module + * @adapter: the PF driver structure + * @module: the module id to program + * @component: the name of the component being updated + * @image: buffer of image data to write to the NVM + * @length: length of the buffer + * @extack: netlink extended ACK structure + * + * Loop over the data for a given NVM module and program it in 4 Kb + * blocks. Notify devlink core of progress after each block is programmed. + * Loops over a block of data and programs the NVM in 4k block chunks. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_write_nvm_module(struct ixgbe_adapter *adapter, u16 module, + const char *component, const u8 *image, + u32 length, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = adapter->devlink; + u32 offset = 0; + bool last_cmd; + u8 *block; + int err; + + devlink_flash_update_status_notify(devlink, "Flashing", + component, 0, length); + + block = kzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL); + if (!block) + return -ENOMEM; + + do { + u32 block_size; + + block_size = min_t(u32, IXGBE_ACI_MAX_BUFFER_SIZE, + length - offset); + last_cmd = !(offset + block_size < length); + + memcpy(block, image + offset, block_size); + + err = ixgbe_write_one_nvm_block(adapter, module, offset, + block_size, block, last_cmd, + extack); + if (err) + break; + + offset += block_size; + + devlink_flash_update_status_notify(devlink, "Flashing", + component, offset, length); + } while (!last_cmd); + + if (err) + devlink_flash_update_status_notify(devlink, "Flashing failed", + component, length, length); + else + devlink_flash_update_status_notify(devlink, "Flashing done", + component, length, length); + + kfree(block); + + return err; +} + +/* Length in seconds to wait before timing out when erasing a flash module. + * Yes, erasing really can take minutes to complete. + */ +#define IXGBE_FW_ERASE_TIMEOUT 300 + +/** + * ixgbe_erase_nvm_module - Erase an NVM module and await firmware completion + * @adapter: the PF data structure + * @module: the module to erase + * @component: name of the component being updated + * @extack: netlink extended ACK structure + * + * Erase the inactive NVM bank associated with this module, and await for + * a completion response message from firmware. + * + * Note this function assumes the caller has acquired the NVM resource. + * + * Return: 0 on success, or a negative error code on failure. 
+ */
+static int ixgbe_erase_nvm_module(struct ixgbe_adapter *adapter, u16 module,
+				  const char *component,
+				  struct netlink_ext_ack *extack)
+{
+	struct devlink *devlink = adapter->devlink;
+	struct ixgbe_hw *hw = &adapter->hw;
+	int err;
+
+	devlink_flash_update_timeout_notify(devlink, "Erasing", component,
+					    IXGBE_FW_ERASE_TIMEOUT);
+
+	err = ixgbe_aci_erase_nvm(hw, module);
+	if (err)
+		devlink_flash_update_status_notify(devlink, "Erasing failed",
+						   component, 0, 0);
+	else
+		devlink_flash_update_status_notify(devlink, "Erasing done",
						   component, 0, 0);
+
+	return err;
+}
+
+/**
+ * ixgbe_switch_flash_banks - Tell firmware to switch NVM banks
+ * @adapter: Pointer to the PF data structure
+ * @activate_flags: flags used for the activation command
+ * @extack: netlink extended ACK structure
+ *
+ * Notify firmware to activate the newly written flash banks, and wait for the
+ * firmware response.
+ *
+ * Return: 0 on success or an error code on failure.
+ */
+static int ixgbe_switch_flash_banks(struct ixgbe_adapter *adapter,
+				    u8 activate_flags,
+				    struct netlink_ext_ack *extack)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	u8 response_flags;
+	int err;
+
+	err = ixgbe_nvm_write_activate(hw, activate_flags, &response_flags);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Failed to switch active flash banks");
+
+	return err;
+}
+
+/**
+ * ixgbe_flash_component - Flash a component of the NVM
+ * @context: PLDM fw update structure
+ * @component: the component table to program
+ *
+ * Program the flash contents for a given component. First, determine the
+ * module id. Then, erase the secondary bank for this module. Finally, write
+ * the contents of the component to the NVM.
+ *
+ * Note this function assumes the caller has acquired the NVM resource.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+static int ixgbe_flash_component(struct pldmfw *context,
+				 struct pldmfw_component *component)
+{
+	struct ixgbe_fwu_priv *priv = container_of(context,
+						   struct ixgbe_fwu_priv,
+						   context);
+	struct netlink_ext_ack *extack = priv->extack;
+	struct ixgbe_adapter *adapter = priv->adapter;
+	const char *name;
+	u16 module;
+	int err;
+	u8 flag;
+
+	switch (component->identifier) {
+	case NVM_COMP_ID_OROM:
+		module = IXGBE_E610_SR_1ST_OROM_BANK_PTR;
+		flag = IXGBE_ACI_NVM_ACTIV_SEL_OROM;
+		name = "fw.undi";
+		break;
+	case NVM_COMP_ID_NVM:
+		module = IXGBE_E610_SR_1ST_NVM_BANK_PTR;
+		flag = IXGBE_ACI_NVM_ACTIV_SEL_NVM;
+		name = "fw.mgmt";
+		break;
+	case NVM_COMP_ID_NETLIST:
+		module = IXGBE_E610_SR_NETLIST_BANK_PTR;
+		flag = IXGBE_ACI_NVM_ACTIV_SEL_NETLIST;
+		name = "fw.netlist";
+		break;
+
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	/* Mark this component for activating at the end. */
+	priv->activate_flags |= flag;
+
+	err = ixgbe_erase_nvm_module(adapter, module, name, extack);
+	if (err)
+		return err;
+
+	return ixgbe_write_nvm_module(adapter, module, name,
+				      component->component_data,
+				      component->component_size, extack);
+}
+
+/**
+ * ixgbe_finalize_update - Perform last steps to complete device update
+ * @context: PLDM fw update structure
+ *
+ * Called as the last step of the update process. Complete the update by
+ * telling the firmware to switch active banks, and perform a reset if
+ * configured.
+ *
+ * Return: 0 on success, or an error code on failure.
+ */ +static int ixgbe_finalize_update(struct pldmfw *context) +{ + struct ixgbe_fwu_priv *priv = container_of(context, + struct ixgbe_fwu_priv, + context); + struct ixgbe_adapter *adapter = priv->adapter; + struct netlink_ext_ack *extack = priv->extack; + + return ixgbe_switch_flash_banks(adapter, priv->activate_flags, + extack); +} + +static const struct pldmfw_ops ixgbe_fwu_ops_e610 = { + .match_record = &pldmfw_op_pci_match_record, + .send_package_data = &ixgbe_send_package_data, + .send_component_table = &ixgbe_send_component_table, + .flash_component = &ixgbe_flash_component, + .finalize_update = &ixgbe_finalize_update, +}; + +/** + * ixgbe_get_pending_updates - Check if the component has a pending update + * @adapter: the PF driver structure + * @pending: on return, bitmap of updates pending + * @extack: Netlink extended ACK + * + * Check if the device has any pending updates on any flash components. + * + * Return: 0 on success, or a negative error code on failure. Update + * pending with the bitmap of pending updates. + */ +int ixgbe_get_pending_updates(struct ixgbe_adapter *adapter, u8 *pending, + struct netlink_ext_ack *extack) +{ + struct ixgbe_hw_dev_caps *dev_caps; + struct ixgbe_hw *hw = &adapter->hw; + int err; + + dev_caps = kzalloc(sizeof(*dev_caps), GFP_KERNEL); + if (!dev_caps) + return -ENOMEM; + + err = ixgbe_discover_dev_caps(hw, dev_caps); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Unable to read device capabilities"); + kfree(dev_caps); + return -EIO; + } + + *pending = 0; + + if (dev_caps->common_cap.nvm_update_pending_nvm) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_NVM; + + if (dev_caps->common_cap.nvm_update_pending_orom) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_OROM; + + if (dev_caps->common_cap.nvm_update_pending_netlist) + *pending |= IXGBE_ACI_NVM_ACTIV_SEL_NETLIST; + + kfree(dev_caps); + + return 0; +} + +/** + * ixgbe_cancel_pending_update - Cancel any pending update for a component + * @adapter: the PF driver structure + * @component: if not NULL, the name of the component being updated + * @extack: Netlink extended ACK structure + * + * Cancel any pending update for the specified component. If component is + * NULL, all device updates will be canceled. + * + * Return: 0 on success, or a negative error code on failure. + */ +static int ixgbe_cancel_pending_update(struct ixgbe_adapter *adapter, + const char *component, + struct netlink_ext_ack *extack) +{ + struct devlink *devlink = adapter->devlink; + struct ixgbe_hw *hw = &adapter->hw; + u8 pending; + int err; + + err = ixgbe_get_pending_updates(adapter, &pending, extack); + if (err) + return err; + + /* If the flash_update request is for a specific component, ignore all + * of the other components. + */ + if (component) { + if (strcmp(component, "fw.mgmt") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_NVM; + else if (strcmp(component, "fw.undi") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_OROM; + else if (strcmp(component, "fw.netlist") == 0) + pending &= IXGBE_ACI_NVM_ACTIV_SEL_NETLIST; + else + return -EINVAL; + } + + /* There is no previous pending update, so this request may continue */ + if (!pending) + return 0; + + /* In order to allow overwriting a previous pending update, notify + * firmware to cancel that update by issuing the appropriate command. 
+ */ + devlink_flash_update_status_notify(devlink, + "Canceling previous pending update", + component, 0, 0); + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to acquire device flash lock"); + return -EIO; + } + + pending |= IXGBE_ACI_NVM_REVERT_LAST_ACTIV; + err = ixgbe_switch_flash_banks(adapter, pending, extack); + + ixgbe_release_nvm(hw); + + return err; +} + +/** + * ixgbe_flash_pldm_image - Write a PLDM-formatted firmware image to the device + * @devlink: pointer to devlink associated with the device to update + * @params: devlink flash update parameters + * @extack: netlink extended ACK structure + * + * Parse the data for a given firmware file, verifying that it is a valid PLDM + * formatted image that matches this device. + * + * Extract the device record Package Data and Component Tables and send them + * to the firmware. Extract and write the flash data for each of the three + * main flash components, "fw.mgmt", "fw.undi", and "fw.netlist". Notify + * firmware once the data is written to the inactive banks. + * + * Return: 0 on success or a negative error code on failure. + */ +int ixgbe_flash_pldm_image(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct device *dev = &adapter->pdev->dev; + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_fwu_priv priv; + u8 preservation; + int err; + + if (hw->mac.type != ixgbe_mac_e610) + return -EOPNOTSUPP; + + switch (params->overwrite_mask) { + case 0: + /* preserve all settings and identifiers */ + preservation = IXGBE_ACI_NVM_PRESERVE_ALL; + break; + case DEVLINK_FLASH_OVERWRITE_SETTINGS: + /* Overwrite settings, but preserve vital information such as + * device identifiers. + */ + preservation = IXGBE_ACI_NVM_PRESERVE_SELECTED; + break; + case (DEVLINK_FLASH_OVERWRITE_SETTINGS | + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS): + /* overwrite both settings and identifiers, preserve nothing */ + preservation = IXGBE_ACI_NVM_NO_PRESERVATION; + break; + default: + NL_SET_ERR_MSG_MOD(extack, + "Requested overwrite mask is not supported"); + return -EOPNOTSUPP; + } + + if (!hw->dev_caps.common_cap.nvm_unified_update) { + NL_SET_ERR_MSG_MOD(extack, + "Current firmware does not support unified update"); + return -EOPNOTSUPP; + } + + memset(&priv, 0, sizeof(priv)); + + priv.context.ops = &ixgbe_fwu_ops_e610; + priv.context.dev = dev; + priv.extack = extack; + priv.adapter = adapter; + priv.activate_flags = preservation; + + devlink_flash_update_status_notify(devlink, + "Preparing to flash", NULL, 0, 0); + + err = ixgbe_cancel_pending_update(adapter, NULL, extack); + if (err) + return err; + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_WRITE); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to acquire device flash lock"); + return -EIO; + } + + err = pldmfw_flash_image(&priv.context, params->fw); + if (err == -ENOENT) { + NL_SET_ERR_MSG_MOD(extack, + "Firmware image has no record matching this device"); + } else if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to flash PLDM image"); + } + + ixgbe_release_nvm(hw); + + return err; +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h new file mode 100644 index 0000000000000..abdd708c93dff --- /dev/null +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2025 Intel Corporation. 
*/ + +#ifndef _IXGBE_FW_UPDATE_H_ +#define _IXGBE_FW_UPDATE_H_ + +int ixgbe_flash_pldm_image(struct devlink *devlink, + struct devlink_flash_update_params *params, + struct netlink_ext_ack *extack); +int ixgbe_get_pending_updates(struct ixgbe_adapter *adapter, u8 *pending, + struct netlink_ext_ack *extack); +#endif diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index 8b145d0a05f13..c36eaff3a0834 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -833,6 +833,8 @@ struct ixgbe_aci_cmd_nvm { #define IXGBE_ACI_NVM_MAX_OFFSET 0xFFFFFF __le16 offset_low; u8 offset_high; /* For Write Activate offset_high is used as flags2 */ +#define IXGBE_ACI_NVM_OFFSET_HI_A_MASK GENMASK(15, 8) +#define IXGBE_ACI_NVM_OFFSET_HI_U_MASK GENMASK(23, 16) u8 cmd_flags; #define IXGBE_ACI_NVM_LAST_CMD BIT(0) #define IXGBE_ACI_NVM_PCIR_REQ BIT(0) /* Used by NVM Write reply */ @@ -848,6 +850,9 @@ struct ixgbe_aci_cmd_nvm { #define IXGBE_ACI_NVM_PERST_FLAG 1 #define IXGBE_ACI_NVM_EMPR_FLAG 2 #define IXGBE_ACI_NVM_EMPR_ENA BIT(0) /* Write Activate reply only */ +#define IXGBE_ACI_NVM_NO_PRESERVATION 0x0 +#define IXGBE_ACI_NVM_PRESERVE_SELECTED 0x6 + /* For Write Activate, several flags are sent as part of a separate * flags2 field using a separate byte. For simplicity of the software * interface, we pass the flags as a 16 bit value so these flags are @@ -877,6 +882,63 @@ struct ixgbe_aci_cmd_nvm_checksum { u8 rsvd2[12]; }; +/* Used for NVM Set Package Data command - 0x070A */ +struct ixgbe_aci_cmd_nvm_pkg_data { + u8 reserved[3]; + u8 cmd_flags; +#define IXGBE_ACI_NVM_PKG_DELETE BIT(0) /* used for command call */ + + u32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + +/* Used for Pass Component Table command - 0x070B */ +struct ixgbe_aci_cmd_nvm_pass_comp_tbl { + u8 component_response; /* Response only */ +#define IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED 0x0 +#define IXGBE_ACI_NVM_PASS_COMP_CAN_MAY_BE_UPDATEABLE 0x1 +#define IXGBE_ACI_NVM_PASS_COMP_CAN_NOT_BE_UPDATED 0x2 +#define IXGBE_ACI_NVM_PASS_COMP_PARTIAL_CHECK 0x3 + u8 component_response_code; /* Response only */ +#define IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED_CODE 0x0 +#define IXGBE_ACI_NVM_PASS_COMP_STAMP_IDENTICAL_CODE 0x1 +#define IXGBE_ACI_NVM_PASS_COMP_STAMP_LOWER 0x2 +#define IXGBE_ACI_NVM_PASS_COMP_INVALID_STAMP_CODE 0x3 +#define IXGBE_ACI_NVM_PASS_COMP_CONFLICT_CODE 0x4 +#define IXGBE_ACI_NVM_PASS_COMP_PRE_REQ_NOT_MET_CODE 0x5 +#define IXGBE_ACI_NVM_PASS_COMP_NOT_SUPPORTED_CODE 0x6 +#define IXGBE_ACI_NVM_PASS_COMP_CANNOT_DOWNGRADE_CODE 0x7 +#define IXGBE_ACI_NVM_PASS_COMP_INCOMPLETE_IMAGE_CODE 0x8 +#define IXGBE_ACI_NVM_PASS_COMP_VER_STR_IDENTICAL_CODE 0xA +#define IXGBE_ACI_NVM_PASS_COMP_VER_STR_LOWER_CODE 0xB + u8 reserved; + u8 transfer_flag; + __le32 reserved1; + __le32 addr_high; + __le32 addr_low; +}; + +struct ixgbe_aci_cmd_nvm_comp_tbl { + __le16 comp_class; +#define NVM_COMP_CLASS_ALL_FW 0x000A + + __le16 comp_id; +#define NVM_COMP_ID_OROM 0x5 +#define NVM_COMP_ID_NVM 0x6 +#define NVM_COMP_ID_NETLIST 0x8 + + u8 comp_class_idx; +#define FWU_COMP_CLASS_IDX_NOT_USE 0x0 + + __le32 comp_cmp_stamp; + u8 cvs_type; +#define NVM_CVS_TYPE_ASCII 0x1 + + u8 cvs_len; + u8 cvs[]; /* Component Version String */ +} __packed; + /** * struct ixgbe_aci_desc - Admin Command (AC) descriptor * @flags: IXGBE_ACI_FLAG_* flags @@ -920,6 +982,8 @@ struct ixgbe_aci_desc { struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param; 
struct ixgbe_aci_cmd_nvm nvm; struct ixgbe_aci_cmd_nvm_checksum nvm_checksum; + struct ixgbe_aci_cmd_nvm_pkg_data pkg_data; + struct ixgbe_aci_cmd_nvm_pass_comp_tbl pass_comp_tbl; } params; }; From c9e563cae19e529abcc2cb90b4b793952f209260 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:05 +0200 Subject: [PATCH 194/770] ixgbe: add support for devlink reload The E610 adapters contain an embedded chip with firmware which can be updated using devlink flash. The firmware which runs on this chip is referred to as the Embedded Management Processor firmware (EMP firmware). Activating the new firmware image currently requires that the system be rebooted. This is not ideal as rebooting the system can cause unwanted downtime. The EMP firmware itself can be reloaded by issuing a special update to the device called an Embedded Management Processor reset (EMP reset). This reset causes the device to reset and reload the EMP firmware. Implement support for devlink reload with the "fw_activate" flag. This allows user space to request the firmware be activated immediately. Reviewed-by: Mateusz Polchlopek Tested-by: Bharath R Co-developed-by: Slawomir Mrozowicz Signed-off-by: Slawomir Mrozowicz Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- Documentation/networking/devlink/ixgbe.rst | 17 +++ .../ethernet/intel/ixgbe/devlink/devlink.c | 112 ++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe.h | 4 + drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 18 +++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 1 + .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 12 ++ .../ethernet/intel/ixgbe/ixgbe_fw_update.c | 37 +++++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 5 +- 8 files changed, 199 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst index 3fd2584b2bf08..3fce291348fac 100644 --- a/Documentation/networking/devlink/ixgbe.rst +++ b/Documentation/networking/devlink/ixgbe.rst @@ -103,3 +103,20 @@ combined flash image that contains the ``fw.mgmt``, ``fw.undi``, and and device serial number. It is expected that this combination be used with an image customized for the specific device. +Reload +====== + +The ``ixgbe`` driver supports activating new firmware after a flash update +using ``DEVLINK_CMD_RELOAD`` with the ``DEVLINK_RELOAD_ACTION_FW_ACTIVATE`` +action. + +.. code:: shell + + $ devlink dev reload pci/0000:01:00.0 reload action fw_activate + +The new firmware is activated by issuing a device specific Embedded +Management Processor reset which requests the device to reset and reload the +EMP firmware image. + +The driver does not currently support reloading the driver via +``DEVLINK_RELOAD_ACTION_DRIVER_REINIT``. 
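Before the implementation below, it may help to see what the activation wait
amounts to. This is a minimal sketch, not the patch code: the helper name is
hypothetical, while the register, bit, and timeout constants are the ones the
patch uses (500 ms per tick, IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC ticks, i.e. up
to 10 seconds):

    /* Poll FWSM until firmware reports valid again after an EMP reset. */
    static int ixgbe_wait_fw_valid(struct ixgbe_hw *hw)
    {
            int i;

            for (i = 0; i < IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC; i++) {
                    u32 fwsm;

                    /* FWSM may not be cleared immediately after the reset
                     * is triggered, so delay before each read.
                     */
                    mdelay(500);
                    fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw));
                    if (fwsm & IXGBE_FWSM_FW_VAL_BIT)
                            return 0;
            }

            return -ETIME;
    }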
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
index 26cbeb077fa64..d0197991763c1 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c
@@ -322,6 +322,9 @@ static int ixgbe_devlink_info_get(struct devlink *devlink,
 	if (!ctx)
 		return -ENOMEM;
 
+	if (hw->mac.type == ixgbe_mac_e610)
+		ixgbe_refresh_fw_version(adapter);
+
 	ixgbe_info_get_dsn(adapter, ctx);
 	err = devlink_info_serial_number_put(req, ctx->buf);
 	if (err)
@@ -365,11 +368,120 @@ static int ixgbe_devlink_info_get(struct devlink *devlink,
 	return err;
 }
 
+/**
+ * ixgbe_devlink_reload_empr_start - Start EMP reset to activate new firmware
+ * @devlink: pointer to the devlink instance to reload
+ * @netns_change: if true, the network namespace is changing
+ * @action: the action to perform. Must be DEVLINK_RELOAD_ACTION_FW_ACTIVATE
+ * @limit: limits on what reload should do, such as not resetting
+ * @extack: netlink extended ACK structure
+ *
+ * Allow user to activate new Embedded Management Processor firmware by
+ * issuing device specific EMP reset. Called in response to
+ * a DEVLINK_CMD_RELOAD with the DEVLINK_RELOAD_ACTION_FW_ACTIVATE.
+ *
+ * Note that teardown and rebuild of the driver state happens automatically as
+ * part of an interrupt and watchdog task. This is because all physical
+ * functions on the device must be able to reset when an EMP reset occurs from
+ * any source.
+ *
+ * Return: the exit code of the operation.
+ */
+static int ixgbe_devlink_reload_empr_start(struct devlink *devlink,
+					   bool netns_change,
+					   enum devlink_reload_action action,
+					   enum devlink_reload_limit limit,
+					   struct netlink_ext_ack *extack)
+{
+	struct ixgbe_adapter *adapter = devlink_priv(devlink);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u8 pending;
+	int err;
+
+	if (hw->mac.type != ixgbe_mac_e610)
+		return -EOPNOTSUPP;
+
+	err = ixgbe_get_pending_updates(adapter, &pending, extack);
+	if (err)
+		return err;
+
+	/* Pending is a bitmask of which flash banks have a pending update,
+	 * including the main NVM bank, the Option ROM bank, and the netlist
+	 * bank. If any of these bits are set, then there is a pending update
+	 * waiting to be activated.
+	 */
+	if (!pending) {
+		NL_SET_ERR_MSG_MOD(extack, "No pending firmware update");
+		return -ECANCELED;
+	}
+
+	if (adapter->fw_emp_reset_disabled) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "EMP reset is not available. To activate firmware, a reboot or power cycle is needed");
+		return -ECANCELED;
+	}
+
+	err = ixgbe_aci_nvm_update_empr(hw);
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Failed to trigger EMP device reset to reload firmware");
+
+	return err;
+}
+
+/* Wait for 10 s with 0.5 s ticks; an EMPR takes no less than half a second */
+#define IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC 20
+
+/**
+ * ixgbe_devlink_reload_empr_finish - finishes EMP reset
+ * @devlink: pointer to the devlink instance
+ * @action: the action to perform.
+ * @limit: limits on what reload should do
+ * @actions_performed: actions performed
+ * @extack: netlink extended ACK structure
+ *
+ * Wait for new NVM to be loaded during EMP reset.
+ *
+ * Return: -ETIME when timer is exceeded, 0 on success.
+ */
+static int ixgbe_devlink_reload_empr_finish(struct devlink *devlink,
+					    enum devlink_reload_action action,
+					    enum devlink_reload_limit limit,
+					    u32 *actions_performed,
+					    struct netlink_ext_ack *extack)
+{
+	struct ixgbe_adapter *adapter = devlink_priv(devlink);
+	struct ixgbe_hw *hw = &adapter->hw;
+	int i = 0;
+	u32 fwsm;
+
+	do {
+		/* The FWSM register may not be cleared yet right after the
+		 * EMP reset is triggered, so delay before each read to avoid
+		 * acting on a stale value.
+		 */
+		mdelay(500);
+
+		fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw));
+
+		if (i++ >= IXGBE_DEVLINK_RELOAD_TIMEOUT_SEC)
+			return -ETIME;
+
+	} while (!(fwsm & IXGBE_FWSM_FW_VAL_BIT));
+
+	*actions_performed = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE);
+
+	return 0;
+}
+
 static const struct devlink_ops ixgbe_devlink_ops = {
 	.info_get = ixgbe_devlink_info_get,
 	.supported_flash_update_params =
 		DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK,
 	.flash_update = ixgbe_flash_pldm_image,
+	.reload_actions = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE),
+	.reload_down = ixgbe_devlink_reload_empr_start,
+	.reload_up = ixgbe_devlink_reload_empr_finish,
 };
 
 /**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 6cb8772b1ebfa..83d4d7368cdac 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -759,6 +759,8 @@ struct ixgbe_adapter {
 	u32 atr_sample_rate;
 	spinlock_t fdir_perfect_lock;
 
+	bool fw_emp_reset_disabled;
+
 #ifdef IXGBE_FCOE
 	struct ixgbe_fcoe fcoe;
 #endif /* IXGBE_FCOE */
@@ -960,6 +962,8 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter);
 int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter);
 bool ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id,
 			 u16 subdevice_id);
+void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter);
+void ixgbe_refresh_fw_version(struct ixgbe_adapter *adapter);
 #ifdef CONFIG_PCI_IOV
 void ixgbe_full_sync_mac_table(struct ixgbe_adapter *adapter);
 #endif
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
index 5abba05554c8f..9dd788aea0c21 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c
@@ -3301,6 +3301,24 @@ int ixgbe_get_flash_data(struct ixgbe_hw *hw)
 	return err;
 }
 
+/**
+ * ixgbe_aci_nvm_update_empr - update NVM using EMPR
+ * @hw: pointer to the HW struct
+ *
+ * Force EMP reset using ACI command (0x0709). This command allows SW to
+ * request an EMPR to activate new FW.
+ *
+ * Return: the exit code of the operation.
+ */ +int ixgbe_aci_nvm_update_empr(struct ixgbe_hw *hw) +{ + struct ixgbe_aci_desc desc; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_nvm_update_empr); + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + /* ixgbe_nvm_set_pkg_data - NVM set package data * @hw: pointer to the HW struct * @del_pkg_data_flag: If is set then the current pkg_data store by FW diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 0bc71d1a39b4b..30bc1f1b2549b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -83,6 +83,7 @@ int ixgbe_read_ee_aci_buffer_e610(struct ixgbe_hw *hw, u16 offset, int ixgbe_validate_eeprom_checksum_e610(struct ixgbe_hw *hw, u16 *checksum_val); int ixgbe_reset_hw_e610(struct ixgbe_hw *hw); int ixgbe_get_flash_data(struct ixgbe_hw *hw); +int ixgbe_aci_nvm_update_empr(struct ixgbe_hw *hw); int ixgbe_nvm_set_pkg_data(struct ixgbe_hw *hw, bool del_pkg_data_flag, u8 *data, u16 length); int ixgbe_nvm_pass_component_tbl(struct ixgbe_hw *hw, u8 *data, u16 length, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 96aa493471eb0..c86103eccc8aa 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -1104,11 +1104,23 @@ static int ixgbe_set_eeprom(struct net_device *netdev, return ret_val; } +void ixgbe_refresh_fw_version(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + ixgbe_get_flash_data(hw); + ixgbe_set_fw_version_e610(adapter); +} + static void ixgbe_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + /* need to refresh info for e610 in case fw reloads in runtime */ + if (adapter->hw.mac.type == ixgbe_mac_e610) + ixgbe_refresh_fw_version(adapter); + strscpy(drvinfo->driver, ixgbe_driver_name, sizeof(drvinfo->driver)); strscpy(drvinfo->fw_version, adapter->eeprom_id, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c index 1ff55dc8a6b7d..69e3ec3087166 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c @@ -16,6 +16,7 @@ struct ixgbe_fwu_priv { /* Track which NVM banks to activate at the end of the update */ u8 activate_flags; + bool emp_reset_available; }; /** @@ -352,6 +353,7 @@ static int ixgbe_erase_nvm_module(struct ixgbe_adapter *adapter, u16 module, * ixgbe_switch_flash_banks - Tell firmware to switch NVM banks * @adapter: Pointer to the PF data structure * @activate_flags: flags used for the activation command + * @emp_reset_available: on return, indicates if EMP reset is available * @extack: netlink extended ACK structure * * Notify firmware to activate the newly written flash banks, and wait for the @@ -361,6 +363,7 @@ static int ixgbe_erase_nvm_module(struct ixgbe_adapter *adapter, u16 module, */ static int ixgbe_switch_flash_banks(struct ixgbe_adapter *adapter, u8 activate_flags, + bool *emp_reset_available, struct netlink_ext_ack *extack) { struct ixgbe_hw *hw = &adapter->hw; @@ -368,11 +371,21 @@ static int ixgbe_switch_flash_banks(struct ixgbe_adapter *adapter, int err; err = ixgbe_nvm_write_activate(hw, activate_flags, &response_flags); - if (err) + if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to switch active flash banks"); + return err; + } - return err; + if 
(emp_reset_available) { + if (hw->dev_caps.common_cap.reset_restrict_support) + *emp_reset_available = + response_flags & IXGBE_ACI_NVM_EMPR_ENA; + else + *emp_reset_available = true; + } + + return 0; } /** @@ -451,9 +464,23 @@ static int ixgbe_finalize_update(struct pldmfw *context) context); struct ixgbe_adapter *adapter = priv->adapter; struct netlink_ext_ack *extack = priv->extack; + struct devlink *devlink = adapter->devlink; + int err; + + /* Finally, notify firmware to activate the written NVM banks */ + err = ixgbe_switch_flash_banks(adapter, priv->activate_flags, + &priv->emp_reset_available, extack); + if (err) + return err; + + adapter->fw_emp_reset_disabled = !priv->emp_reset_available; - return ixgbe_switch_flash_banks(adapter, priv->activate_flags, - extack); + if (!adapter->fw_emp_reset_disabled) + devlink_flash_update_status_notify(devlink, + "Suggested is to activate new firmware by devlink reload, if it doesn't work then a power cycle is required", + NULL, 0, 0); + + return 0; } static const struct pldmfw_ops ixgbe_fwu_ops_e610 = { @@ -567,7 +594,7 @@ static int ixgbe_cancel_pending_update(struct ixgbe_adapter *adapter, } pending |= IXGBE_ACI_NVM_REVERT_LAST_ACTIV; - err = ixgbe_switch_flash_banks(adapter, pending, extack); + err = ixgbe_switch_flash_banks(adapter, pending, NULL, extack); ixgbe_release_nvm(hw); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 22e61baf4295e..2cc0af0dfe7ce 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8379,8 +8379,9 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter) /* read fwsm.ext_err_ind register and log errors */ fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw)); + /* skip if E610's FW is reloading, warning in that case may be misleading */ if (fwsm & IXGBE_FWSM_EXT_ERR_IND_MASK || - !(fwsm & IXGBE_FWSM_FW_VAL_BIT)) + (!(fwsm & IXGBE_FWSM_FW_VAL_BIT) && !(hw->mac.type == ixgbe_mac_e610))) e_dev_warn("Warning firmware error detected FWSM: 0x%08X\n", fwsm); @@ -11147,7 +11148,7 @@ bool ixgbe_wol_supported(struct ixgbe_adapter *adapter, u16 device_id, * format to display. The FW version is taken from the EEPROM/NVM. * */ -static void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter) +void ixgbe_set_fw_version_e610(struct ixgbe_adapter *adapter) { struct ixgbe_orom_info *orom = &adapter->hw.flash.orom; struct ixgbe_nvm_info *nvm = &adapter->hw.flash.nvm; From b5aae90b6b369df01f424ee09205e287a11e085b Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Thu, 10 Apr 2025 15:00:06 +0200 Subject: [PATCH 195/770] ixgbe: add FW API version check Add E610 specific function checking whether the FW API version is compatible with the driver expectations. The major API version should be less than or equal to the expected API version. If not the driver won't be fully operational. Check the minor version, and if it is more than two versions lesser or greater than the expected version, print a message indicating that the NVM or driver should be updated respectively. 
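The policy above can be summarized by the following sketch. It is not the
patch code: the helper name is hypothetical, but the constants
(IXGBE_FW_API_VER_MAJOR/MINOR and IXGBE_FW_API_VER_DIFF_ALLOWED) are the ones
this patch defines:

    /* 0: fully compatible; 1: warn (minor drift); -EOPNOTSUPP: major too new */
    static int ixgbe_fw_api_compat(u8 maj, u8 min)
    {
            if (maj > IXGBE_FW_API_VER_MAJOR)
                    return -EOPNOTSUPP;     /* NVM newer than the driver supports */
            if (maj == IXGBE_FW_API_VER_MAJOR &&
                min > IXGBE_FW_API_VER_MINOR + IXGBE_FW_API_VER_DIFF_ALLOWED)
                    return 1;               /* update the driver */
            if (maj < IXGBE_FW_API_VER_MAJOR ||
                min < IXGBE_FW_API_VER_MINOR - IXGBE_FW_API_VER_DIFF_ALLOWED)
                    return 1;               /* update the NVM image */
            return 0;
    }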
Reviewed-by: Mateusz Polchlopek Co-developed-by: Piotr Kwapulinski Signed-off-by: Piotr Kwapulinski Signed-off-by: Jedrzej Jagielski Tested-by: Bharath R Signed-off-by: Tony Nguyen --- .../ethernet/intel/ixgbe/devlink/devlink.c | 2 ++ drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 1 + .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 4 +++ 6 files changed, 43 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c index d0197991763c1..3e79e446a9449 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -471,6 +471,8 @@ static int ixgbe_devlink_reload_empr_finish(struct devlink *devlink, *actions_performed = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE); + adapter->flags2 &= ~IXGBE_FLAG2_API_MISMATCH; + return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 83d4d7368cdac..2246997bc9fb4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -671,6 +671,7 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_PHY_FW_LOAD_FAILED BIT(20) #define IXGBE_FLAG2_NO_MEDIA BIT(21) #define IXGBE_FLAG2_MOD_POWER_UNSUPPORTED BIT(22) +#define IXGBE_FLAG2_API_MISMATCH BIT(23) /* Tx fast path data */ int num_tx_queues; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 9dd788aea0c21..5d42da11547e3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -3873,6 +3873,7 @@ static const struct ixgbe_mac_operations mac_ops_e610 = { .led_off = ixgbe_led_off_generic, .init_led_link_act = ixgbe_init_led_link_act_generic, .reset_hw = ixgbe_reset_hw_e610, + .get_fw_ver = ixgbe_aci_get_fw_ver, .get_media_type = ixgbe_get_media_type_e610, .setup_link = ixgbe_setup_link_e610, .get_link_capabilities = ixgbe_get_link_capabilities_e610, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2cc0af0dfe7ce..84ee481a0ce1a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8365,6 +8365,34 @@ static void ixgbe_reset_subtask(struct ixgbe_adapter *adapter) rtnl_unlock(); } +static int ixgbe_check_fw_api_mismatch(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + + if (hw->mac.type != ixgbe_mac_e610) + return 0; + + if (hw->mac.ops.get_fw_ver && hw->mac.ops.get_fw_ver(hw)) + return 0; + + if (hw->api_maj_ver > IXGBE_FW_API_VER_MAJOR) { + e_dev_err("The driver for the device stopped because the NVM image is newer than expected. You must install the most recent version of the network driver.\n"); + + adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH; + return -EOPNOTSUPP; + } else if (hw->api_maj_ver == IXGBE_FW_API_VER_MAJOR && + hw->api_min_ver > IXGBE_FW_API_VER_MINOR + IXGBE_FW_API_VER_DIFF_ALLOWED) { + e_dev_info("The driver for the device detected a newer version of the NVM image than expected. 
Please install the most recent version of the network driver.\n");
+		adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH;
+	} else if (hw->api_maj_ver < IXGBE_FW_API_VER_MAJOR ||
+		   hw->api_min_ver < IXGBE_FW_API_VER_MINOR - IXGBE_FW_API_VER_DIFF_ALLOWED) {
+		e_dev_info("The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n");
+		adapter->flags2 |= IXGBE_FLAG2_API_MISMATCH;
+	}
+
+	return 0;
+}
+
 /**
  * ixgbe_check_fw_error - Check firmware for errors
  * @adapter: the adapter private structure
@@ -8375,6 +8403,7 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 fwsm;
+	int err;
 
 	/* read fwsm.ext_err_ind register and log errors */
 	fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw));
@@ -8389,6 +8418,11 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter)
 		e_dev_err("Firmware recovery mode detected. Limiting functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n");
 		return true;
 	}
+	if (!(adapter->flags2 & IXGBE_FLAG2_API_MISMATCH)) {
+		err = ixgbe_check_fw_api_mismatch(adapter);
+		if (err)
+			return true;
+	}
 
 	return false;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 5f814f023573f..6bf6ba7dcdccd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -3456,6 +3456,7 @@ struct ixgbe_mac_operations {
 	int (*start_hw)(struct ixgbe_hw *);
 	int (*clear_hw_cntrs)(struct ixgbe_hw *);
 	enum ixgbe_media_type (*get_media_type)(struct ixgbe_hw *);
+	int (*get_fw_ver)(struct ixgbe_hw *hw);
 	int (*get_mac_addr)(struct ixgbe_hw *, u8 *);
 	int (*get_san_mac_addr)(struct ixgbe_hw *, u8 *);
 	int (*get_device_caps)(struct ixgbe_hw *, u16 *);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
index c36eaff3a0834..2e105419eef84 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
@@ -112,6 +112,10 @@
 #define IXGBE_PF_HICR_SV	BIT(2)
 #define IXGBE_PF_HICR_EV	BIT(3)
 
+#define IXGBE_FW_API_VER_MAJOR		0x01
+#define IXGBE_FW_API_VER_MINOR		0x07
+#define IXGBE_FW_API_VER_DIFF_ALLOWED	0x02
+
 #define IXGBE_ACI_DESC_SIZE		32
 #define IXGBE_ACI_DESC_SIZE_IN_DWORDS	(IXGBE_ACI_DESC_SIZE / BYTES_PER_DWORD)

From 29cb3b8d95c76a1d509029701435ed6b9e96a732 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Thu, 10 Apr 2025 15:00:07 +0200
Subject: [PATCH 196/770] ixgbe: add E610 implementation of FW recovery mode

Add the E610 implementation of the fw_recovery_mode MAC operation.

In the case of E610, information about recovery mode is obtained from the
FW_MODES field in the IXGBE_GL_MNG_FWSM register (0x000B6134).

Introduce a recovery-specific probing flow and init only vital features.
The user should be able to perform an NVM update using devlink once a FW
error is detected, in order to load a healthy image.
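In probe terms, the new flow is roughly the following sketch. The wrapper is
hypothetical; ixgbe_recovery_probe() and the fw_recovery_mode MAC op are the
pieces this patch adds:

    /* Sketch only: when firmware is in recovery mode, fall back to the
     * limited probe that registers just enough (devlink, ACI, service
     * task) to allow flashing a healthy NVM image.
     */
    static int ixgbe_probe_select_mode(struct ixgbe_adapter *adapter)
    {
            struct ixgbe_hw *hw = &adapter->hw;

            if (hw->mac.ops.fw_recovery_mode && hw->mac.ops.fw_recovery_mode(hw))
                    return ixgbe_recovery_probe(adapter);

            return 0;       /* continue with the normal probe path */
    }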
Reviewed-by: Mateusz Polchlopek Co-developed-by: Stefan Wegrzyn Signed-off-by: Stefan Wegrzyn Signed-off-by: Jedrzej Jagielski Tested-by: Bharath R Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 17 ++++ .../ethernet/intel/ixgbe/ixgbe_fw_update.c | 14 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 94 +++++++++++++++++-- .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 3 + 4 files changed, 117 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 5d42da11547e3..58deb89beb75e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1816,6 +1816,22 @@ void ixgbe_disable_rx_e610(struct ixgbe_hw *hw) } } +/** + * ixgbe_fw_recovery_mode_e610 - Check FW NVM recovery mode + * @hw: pointer to hardware structure + * + * Check FW NVM recovery mode by reading the value of + * the dedicated register. + * + * Return: true if FW is in recovery mode, otherwise false. + */ +static bool ixgbe_fw_recovery_mode_e610(struct ixgbe_hw *hw) +{ + u32 fwsm = IXGBE_READ_REG(hw, IXGBE_GL_MNG_FWSM); + + return !!(fwsm & IXGBE_GL_MNG_FWSM_RECOVERY_M); +} + /** * ixgbe_init_phy_ops_e610 - PHY specific init * @hw: pointer to hardware structure @@ -3876,6 +3892,7 @@ static const struct ixgbe_mac_operations mac_ops_e610 = { .get_fw_ver = ixgbe_aci_get_fw_ver, .get_media_type = ixgbe_get_media_type_e610, .setup_link = ixgbe_setup_link_e610, + .fw_recovery_mode = ixgbe_fw_recovery_mode_e610, .get_link_capabilities = ixgbe_get_link_capabilities_e610, .get_bus_info = ixgbe_get_bus_info_generic, .acquire_swfw_sync = ixgbe_acquire_swfw_sync_X540, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c index 69e3ec3087166..49d3b66add7ee 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fw_update.c @@ -73,6 +73,8 @@ static int ixgbe_check_component_response(struct ixgbe_adapter *adapter, u8 response, u8 code, struct netlink_ext_ack *extack) { + struct ixgbe_hw *hw = &adapter->hw; + switch (response) { case IXGBE_ACI_NVM_PASS_COMP_CAN_BE_UPDATED: /* Firmware indicated this update is good to proceed. 
*/ @@ -84,6 +86,11 @@ static int ixgbe_check_component_response(struct ixgbe_adapter *adapter, case IXGBE_ACI_NVM_PASS_COMP_CAN_NOT_BE_UPDATED: NL_SET_ERR_MSG_MOD(extack, "Firmware has rejected updating."); break; + case IXGBE_ACI_NVM_PASS_COMP_PARTIAL_CHECK: + if (hw->mac.ops.fw_recovery_mode && + hw->mac.ops.fw_recovery_mode(hw)) + return 0; + break; } switch (code) { @@ -653,7 +660,12 @@ int ixgbe_flash_pldm_image(struct devlink *devlink, return -EOPNOTSUPP; } - if (!hw->dev_caps.common_cap.nvm_unified_update) { + /* Cannot get caps in recovery mode, so lack of nvm_unified_update bit + * cannot lead to error + */ + if (!hw->dev_caps.common_cap.nvm_unified_update && + (hw->mac.ops.fw_recovery_mode && + !hw->mac.ops.fw_recovery_mode(hw))) { NL_SET_ERR_MSG_MOD(extack, "Current firmware does not support unified update"); return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 84ee481a0ce1a..1c69f58466fdc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8427,6 +8427,18 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter) return false; } +static void ixgbe_recovery_service_task(struct work_struct *work) +{ + struct ixgbe_adapter *adapter = container_of(work, + struct ixgbe_adapter, + service_task); + + ixgbe_handle_fw_event(adapter); + ixgbe_service_event_complete(adapter); + + mod_timer(&adapter->service_timer, jiffies + msecs_to_jiffies(100)); +} + /** * ixgbe_service_task - manages and runs subtasks * @work: pointer to work_struct containing our data @@ -8446,8 +8458,13 @@ static void ixgbe_service_task(struct work_struct *work) return; } if (ixgbe_check_fw_error(adapter)) { - if (!test_bit(__IXGBE_DOWN, &adapter->state)) + if (!test_bit(__IXGBE_DOWN, &adapter->state)) { + if (adapter->mii_bus) { + mdiobus_unregister(adapter->mii_bus); + adapter->mii_bus = NULL; + } unregister_netdev(adapter->netdev); + } ixgbe_service_event_complete(adapter); return; } @@ -11232,6 +11249,65 @@ static void ixgbe_set_fw_version(struct ixgbe_adapter *adapter) "0x%08x", nvm_ver.etk_id); } +/** + * ixgbe_recovery_probe - Handle FW recovery mode during probe + * @adapter: the adapter private structure + * + * Perform limited driver initialization when FW error is detected. + * + * Return: 0 on successful probe for E610, -EIO if recovery mode is detected + * for non-E610 adapter, error status code on any other case. + */ +static int ixgbe_recovery_probe(struct ixgbe_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct pci_dev *pdev = adapter->pdev; + struct ixgbe_hw *hw = &adapter->hw; + bool disable_dev; + int err = -EIO; + + if (hw->mac.type != ixgbe_mac_e610) + goto clean_up_probe; + + ixgbe_get_hw_control(adapter); + mutex_init(&hw->aci.lock); + err = ixgbe_get_flash_data(&adapter->hw); + if (err) + goto shutdown_aci; + + timer_setup(&adapter->service_timer, ixgbe_service_timer, 0); + INIT_WORK(&adapter->service_task, ixgbe_recovery_service_task); + set_bit(__IXGBE_SERVICE_INITED, &adapter->state); + clear_bit(__IXGBE_SERVICE_SCHED, &adapter->state); + + if (hw->mac.ops.get_bus_info) + hw->mac.ops.get_bus_info(hw); + + pci_set_drvdata(pdev, adapter); + /* We are creating devlink interface so NIC can be managed, + * e.g. 
new NVM image loaded
+	 */
+	devl_lock(adapter->devlink);
+	ixgbe_devlink_register_port(adapter);
+	SET_NETDEV_DEVLINK_PORT(adapter->netdev,
+				&adapter->devlink_port);
+	devl_register(adapter->devlink);
+	devl_unlock(adapter->devlink);
+
+	return 0;
+shutdown_aci:
+	mutex_destroy(&adapter->hw.aci.lock);
+	ixgbe_release_hw_control(adapter);
+	devlink_free(adapter->devlink);
+clean_up_probe:
+	disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
+	free_netdev(netdev);
+	pci_release_mem_regions(pdev);
+	if (disable_dev)
+		pci_disable_device(pdev);
+	return err;
+}
+
 /**
  * ixgbe_probe - Device Initialization Routine
  * @pdev: PCI device information struct
@@ -11370,6 +11446,13 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto err_sw_init;
 
+	/* Make sure the SWFW semaphore is in a valid state */
+	if (hw->mac.ops.init_swfw_sync)
+		hw->mac.ops.init_swfw_sync(hw);
+
+	if (ixgbe_check_fw_error(adapter))
+		return ixgbe_recovery_probe(adapter);
+
 	if (adapter->hw.mac.type == ixgbe_mac_e610) {
 		err = ixgbe_get_caps(&adapter->hw);
 		if (err)
@@ -11396,10 +11479,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		break;
 	}
 
-	/* Make sure the SWFW semaphore is in a valid state */
-	if (hw->mac.ops.init_swfw_sync)
-		hw->mac.ops.init_swfw_sync(hw);
-
 	/* Make it possible the adapter to be woken up via WOL */
 	switch (adapter->hw.mac.type) {
 	case ixgbe_mac_82599EB:
@@ -11552,11 +11631,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
 		netdev->features |= NETIF_F_LRO;
 
-	if (ixgbe_check_fw_error(adapter)) {
-		err = -EIO;
-		goto err_sw_init;
-	}
-
 	/* make sure the EEPROM is good */
 	if (hw->eeprom.ops.validate_checksum(hw, NULL) < 0) {
 		e_dev_err("The EEPROM Checksum Is Not Valid\n");
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
index 2e105419eef84..c035ac787b6c9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
@@ -88,6 +88,9 @@
 #define GLNVM_GENS			0x000B6100 /* Reset Source: POR */
 #define GLNVM_GENS_SR_SIZE_M		GENMASK(7, 5)
 
+#define IXGBE_GL_MNG_FWSM		0x000B6134 /* Reset Source: POR */
+#define IXGBE_GL_MNG_FWSM_RECOVERY_M	BIT(1)
+
 /* Flash Access Register */
 #define IXGBE_GLNVM_FLA			0x000B6108 /* Reset Source: POR */
 #define IXGBE_GLNVM_FLA_LOCKED_S	6

From 4811b0c220f212ff0b35da0d9fc993bce87868c0 Mon Sep 17 00:00:00 2001
From: Andrii Staikov
Date: Thu, 10 Apr 2025 15:00:08 +0200
Subject: [PATCH 197/770] ixgbe: add support for FW rollback mode

The driver should detect whether the device entered FW rollback mode
and then notify the user with a dedicated message including the FW and
NVM versions. Even if the driver detects rollback mode, this should not
result in a probe error and the normal flow proceeds.

FW tries to roll back to the "old" operational FW located in the
inactive NVM bank in cases when newly loaded FW exhibits faulty
behavior. If something goes wrong during boot, the FW may switch into
rollback mode in an attempt to avoid recovery mode and stay
operational. After rollback is successful, the banks are swapped, and
the "rollback" bank becomes the active bank for the next reset.
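The warn-once behaviour relies only on a latched flag bit, which is
easy to see in isolation. A compilable sketch of the latch (the bit
value matches IXGBE_FLAG2_FW_ROLLBACK from this patch; the detection
input is faked, everything else is illustrative):

  #include <stdbool.h>
  #include <stdio.h>

  #define IXGBE_FLAG2_FW_ROLLBACK	(1u << 24)	/* BIT(24) */

  static unsigned int flags2;

  /* Called from the periodic service task: warn on the first detection,
   * then latch the flag so subsequent runs return quietly.
   */
  static void check_rollback(bool in_rollback)
  {
  	if (flags2 & IXGBE_FLAG2_FW_ROLLBACK)
  		return;
  	if (in_rollback) {
  		printf("Firmware rollback mode detected.\n");
  		flags2 |= IXGBE_FLAG2_FW_ROLLBACK;
  	}
  }

  int main(void)
  {
  	for (int i = 0; i < 3; i++)
  		check_rollback(true);	/* message is printed once */
  	return 0;
  }

Clearing the bit again is left to the devlink reload path above, so a
successful EMPR reload re-arms the warning.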
Reviewed-by: Mateusz Polchlopek Signed-off-by: Andrii Staikov Signed-off-by: Jedrzej Jagielski Signed-off-by: Tony Nguyen --- .../ethernet/intel/ixgbe/devlink/devlink.c | 3 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 1 + drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 34 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 26 ++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_type.h | 2 ++ .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 1 + 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c index 3e79e446a9449..54f1b83dfe42e 100644 --- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c +++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.c @@ -471,7 +471,8 @@ static int ixgbe_devlink_reload_empr_finish(struct devlink *devlink, *actions_performed = BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE); - adapter->flags2 &= ~IXGBE_FLAG2_API_MISMATCH; + adapter->flags2 &= ~(IXGBE_FLAG2_API_MISMATCH | + IXGBE_FLAG2_FW_ROLLBACK); return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 2246997bc9fb4..23c2e2c2649ce 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -672,6 +672,7 @@ struct ixgbe_adapter { #define IXGBE_FLAG2_NO_MEDIA BIT(21) #define IXGBE_FLAG2_MOD_POWER_UNSUPPORTED BIT(22) #define IXGBE_FLAG2_API_MISMATCH BIT(23) +#define IXGBE_FLAG2_FW_ROLLBACK BIT(24) /* Tx fast path data */ int num_tx_queues; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 58deb89beb75e..9ada35f7d8f76 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1832,6 +1832,22 @@ static bool ixgbe_fw_recovery_mode_e610(struct ixgbe_hw *hw) return !!(fwsm & IXGBE_GL_MNG_FWSM_RECOVERY_M); } +/** + * ixgbe_fw_rollback_mode_e610 - Check FW NVM rollback mode + * @hw: pointer to hardware structure + * + * Check FW NVM rollback mode by reading the value of + * the dedicated register. + * + * Return: true if FW is in rollback mode, otherwise false. + */ +static bool ixgbe_fw_rollback_mode_e610(struct ixgbe_hw *hw) +{ + u32 fwsm = IXGBE_READ_REG(hw, IXGBE_GL_MNG_FWSM); + + return !!(fwsm & IXGBE_GL_MNG_FWSM_ROLLBACK_M); +} + /** * ixgbe_init_phy_ops_e610 - PHY specific init * @hw: pointer to hardware structure @@ -3163,6 +3179,22 @@ int ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm) return ixgbe_get_nvm_ver_info(hw, IXGBE_INACTIVE_FLASH_BANK, nvm); } +/** + * ixgbe_get_active_nvm_ver - Read Option ROM version from the active bank + * @hw: pointer to the HW structure + * @nvm: storage for Option ROM version information + * + * Reads the NVM EETRACK ID, Map version, and security revision of the + * active NVM bank. + * + * Return: the exit code of the operation. 
+ */
+static int ixgbe_get_active_nvm_ver(struct ixgbe_hw *hw,
+				    struct ixgbe_nvm_info *nvm)
+{
+	return ixgbe_get_nvm_ver_info(hw, IXGBE_ACTIVE_FLASH_BANK, nvm);
+}
+
 /**
  * ixgbe_get_netlist_info - Read the netlist version information
  * @hw: pointer to the HW struct
@@ -3893,6 +3925,8 @@ static const struct ixgbe_mac_operations mac_ops_e610 = {
 	.get_media_type		= ixgbe_get_media_type_e610,
 	.setup_link		= ixgbe_setup_link_e610,
 	.fw_recovery_mode	= ixgbe_fw_recovery_mode_e610,
+	.fw_rollback_mode	= ixgbe_fw_rollback_mode_e610,
+	.get_nvm_ver		= ixgbe_get_active_nvm_ver,
 	.get_link_capabilities	= ixgbe_get_link_capabilities_e610,
 	.get_bus_info		= ixgbe_get_bus_info_generic,
 	.acquire_swfw_sync	= ixgbe_acquire_swfw_sync_X540,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 1c69f58466fdc..cdfafc477ee0d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8424,6 +8424,32 @@ static bool ixgbe_check_fw_error(struct ixgbe_adapter *adapter)
 			return true;
 	}
 
+	/* return here if FW rollback mode has been already detected */
+	if (adapter->flags2 & IXGBE_FLAG2_FW_ROLLBACK)
+		return false;
+
+	if (hw->mac.ops.fw_rollback_mode && hw->mac.ops.fw_rollback_mode(hw)) {
+		struct ixgbe_nvm_info *nvm_info = &adapter->hw.flash.nvm;
+		char ver_buff[64] = "";
+
+		if (hw->mac.ops.get_fw_ver && hw->mac.ops.get_fw_ver(hw))
+			goto no_version;
+
+		if (hw->mac.ops.get_nvm_ver &&
+		    hw->mac.ops.get_nvm_ver(hw, nvm_info))
+			goto no_version;
+
+		snprintf(ver_buff, sizeof(ver_buff),
+			 "Current version is NVM:%x.%x.%x, FW:%d.%d. ",
+			 nvm_info->major, nvm_info->minor, nvm_info->eetrack,
+			 hw->fw_maj_ver, hw->fw_min_ver);
+no_version:
+		e_dev_warn("Firmware rollback mode detected. %sDevice may exhibit limited functionality. Refer to the Intel(R) Ethernet Adapters and Devices User Guide for details on firmware rollback mode.",
+			   ver_buff);
+
+		adapter->flags2 |= IXGBE_FLAG2_FW_ROLLBACK;
+	}
+
 	return false;
 }
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
index 6bf6ba7dcdccd..892fa6c1f8797 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type.h
@@ -3525,6 +3525,8 @@ struct ixgbe_mac_operations {
 	int (*get_thermal_sensor_data)(struct ixgbe_hw *);
 	int (*init_thermal_sensor_thresh)(struct ixgbe_hw *hw);
 	bool (*fw_recovery_mode)(struct ixgbe_hw *hw);
+	bool (*fw_rollback_mode)(struct ixgbe_hw *hw);
+	int (*get_nvm_ver)(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm);
 	void (*disable_rx)(struct ixgbe_hw *hw);
 	void (*enable_rx)(struct ixgbe_hw *hw);
 	void (*set_source_address_pruning)(struct ixgbe_hw *, bool,
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
index c035ac787b6c9..bea94e5ccb730 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h
@@ -90,6 +90,7 @@
 #define IXGBE_GL_MNG_FWSM		0x000B6134 /* Reset Source: POR */
 #define IXGBE_GL_MNG_FWSM_RECOVERY_M	BIT(1)
+#define IXGBE_GL_MNG_FWSM_ROLLBACK_M	BIT(2)
 
 /* Flash Access Register */
 #define IXGBE_GLNVM_FLA			0x000B6108 /* Reset Source: POR */
 #define IXGBE_GLNVM_FLA_LOCKED_S	6

From f15e410687954b8d1577d9cca30eeebc148cb1b3 Mon Sep 17 00:00:00 2001
From: Jiawen Wu
Date: Mon, 14 Apr 2025 10:24:21 +0800
Subject: [PATCH 198/770] net: txgbe: Update module description

Because of the addition of support for 25G/40G devices, update the
module description.
Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250414022421.375101-1-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/Kconfig | 4 ++-- drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/wangxun/Kconfig b/drivers/net/ethernet/wangxun/Kconfig index 47e3e8434b9ef..e5fc942c28ccc 100644 --- a/drivers/net/ethernet/wangxun/Kconfig +++ b/drivers/net/ethernet/wangxun/Kconfig @@ -40,7 +40,7 @@ config NGBE will be called ngbe. config TXGBE - tristate "Wangxun(R) 10GbE PCI Express adapters support" + tristate "Wangxun(R) 10/25/40GbE PCI Express adapters support" depends on PCI depends on COMMON_CLK depends on I2C_DESIGNWARE_PLATFORM @@ -55,7 +55,7 @@ config TXGBE select PCS_XPCS select LIBWX help - This driver supports Wangxun(R) 10GbE PCI Express family of + This driver supports Wangxun(R) 10/25/40GbE PCI Express family of adapters. More specific information on configuring the driver is in diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index 6d9134a3ce4df..db5166a6db2c0 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -849,5 +849,5 @@ module_pci_driver(txgbe_driver); MODULE_DEVICE_TABLE(pci, txgbe_pci_tbl); MODULE_AUTHOR("Beijing WangXun Technology Co., Ltd, "); -MODULE_DESCRIPTION("WangXun(R) 10 Gigabit PCI Express Network Driver"); +MODULE_DESCRIPTION("WangXun(R) 10/25/40 Gigabit PCI Express Network Driver"); MODULE_LICENSE("GPL"); From 6e83166dd8003e8611f253426b85e0c3d933e1c2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:32 +0200 Subject: [PATCH 199/770] mptcp: sched: remove mptcp_sched_data This is a follow-up of commit b68b106b0f15 ("mptcp: sched: reduce size for unused data"), now removing the mptcp_sched_data structure. Now is a good time to do that, because the previously mentioned WIP work has been updated, no longer depending on this structure. 
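For illustration, this is what a scheduler skeleton looks like against
the reduced interface (a hypothetical "first" scheduler, not an in-tree
one; mptcp_for_each_subflow() and mptcp_subflow_set_scheduled() are the
existing helpers the default scheduler above already uses):

  /* Hypothetical scheduler: always pick the first subflow. */
  static int mptcp_sched_first_get_send(struct mptcp_sock *msk)
  {
  	struct mptcp_subflow_context *subflow;

  	mptcp_for_each_subflow(msk, subflow) {
  		mptcp_subflow_set_scheduled(subflow, true);
  		return 0;
  	}
  	return -EINVAL;
  }

  static struct mptcp_sched_ops mptcp_sched_first = {
  	.get_send	= mptcp_sched_first_get_send,
  	.name		= "first",
  	.owner		= THIS_MODULE,
  };

With .get_retrans left unset, retransmissions fall back to get_send(),
as in the updated mptcp_sched_get_retrans() below.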
Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-1-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 13 ++----------- net/mptcp/sched.c | 18 +++++++----------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index bfbad695951cf..f7263fe2a2e40 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -101,18 +101,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_send)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); - int (*get_retrans)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index c16c6fbd4ba2f..f09f7eb1d63f8 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,8 +16,7 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_send(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; @@ -29,8 +28,7 @@ static int mptcp_sched_default_get_send(struct mptcp_sock *msk, return 0; } -static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) { struct sock *ssk; @@ -157,7 +155,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -178,14 +175,13 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_send(msk, data); - return msk->sched->get_send(msk, data); + return mptcp_sched_default_get_send(msk); + return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -199,8 +195,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_retrans(msk, data); + return mptcp_sched_default_get_retrans(msk); if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk, data); - return msk->sched->get_send(msk, data); + return msk->sched->get_retrans(msk); + return msk->sched->get_send(msk); } From 760ff076695cfaff3ab8fcb0bae33c8b4158818d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Sun, 13 Apr 2025 11:34:33 +0200 Subject: [PATCH 200/770] mptcp: sched: split validation part A new interface .validate has been added in struct bpf_struct_ops recently. This patch prepares a future struct_ops support by implementing it as a new helper mptcp_validate_scheduler() for struct mptcp_sched_ops. In this helper, check whether the required ops "get_subflow" of struct mptcp_sched_ops has been implemented. 
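In practice this moves the failure from scheduling time to registration
time. A sketch of what gets rejected now (hypothetical ops, shown only
for illustration):

  static struct mptcp_sched_ops bogus_sched = {
  	.name	= "bogus",
  	/* .get_send intentionally left unset */
  };

  /* mptcp_register_scheduler(&bogus_sched) now goes through
   * mptcp_validate_scheduler(), logs "bogus does not implement
   * required ops" and returns -EINVAL, so the scheduler never
   * becomes selectable.
   */

Keeping the check in a dedicated helper lets the future struct_ops
.validate callback reuse it without duplicating the registration path.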
Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-2-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.h | 1 + net/mptcp/sched.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d409586b5977f..7aa38d74fef6b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -744,6 +744,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index f09f7eb1d63f8..1e59072d478c9 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -82,10 +82,23 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) rcu_read_unlock(); } -int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_send) + if (!sched->get_send) { + pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; + } + + return 0; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + int ret; + + ret = mptcp_validate_scheduler(sched); + if (ret) + return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { From def9d0958befc7dfbda5fc1f22e17fbab912bc62 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 13 Apr 2025 11:34:34 +0200 Subject: [PATCH 201/770] mptcp: pm: Return local variable instead of freed pointer Commit e4c28e3d5c090 ("mptcp: pm: move generic PM helpers to pm.c") removed an unnecessary if-check, which resulted in returning a freed pointer. This still works due to the implicit boolean conversion when returning the freed pointer from mptcp_remove_anno_list_by_saddr(), but it can be confusing and potentially error-prone. To improve clarity, add a local variable to explicitly return a boolean value instead. Signed-off-by: Thorsten Blum Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-3-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 31747f974941f..1306d4dc287b8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -151,10 +151,13 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; + bool ret; entry = mptcp_pm_del_add_timer(msk, addr, false); + ret = entry; kfree(entry); - return entry; + + return ret; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) From 60cbf3158513ae35ab38a2593b07ce679825d425 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:35 +0200 Subject: [PATCH 202/770] mptcp: pass right struct to subflow_hmac_valid subflow_hmac_valid() needs to access the MPTCP socket and the subflow request, but not the request sock that is passed in argument. Instead, the subflow request can be directly passed to avoid getting it via an additional cast. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-4-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 24c2de1891bdf..e7951786a97c9 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -745,15 +745,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ -static bool subflow_hmac_valid(const struct request_sock *req, +static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { - const struct mptcp_subflow_request_sock *subflow_req; + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; - struct mptcp_sock *msk; - - subflow_req = mptcp_subflow_rsk(req); - msk = subflow_req->msk; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), @@ -899,7 +895,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, goto dispose_child; } - if (!subflow_hmac_valid(req, &mp_opt)) { + if (!subflow_hmac_valid(subflow_req, &mp_opt)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; From 4ce7fb8de556c0a16c17b5d11d54fa21479f2552 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:36 +0200 Subject: [PATCH 203/770] mptcp: add MPJoinRejected MIB counter This counter is useful to understand why some paths are rejected, and not created as expected. It is incremented when receiving a connection request, if the PM didn't allow the creation of new subflows. 
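Like the other MPTCP MIB entries, the new counter is exported on the
MPTcpExt lines of /proc/net/netstat (the selftests read it the same way
through nstat). A small standalone reader, assuming that header/value
line layout, purely for illustration:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static long mptcp_counter(const char *name)
  {
  	char hdr[2048], val[2048];
  	FILE *f = fopen("/proc/net/netstat", "r");
  	long ret = -1;

  	if (!f)
  		return -1;
  	/* counters come as a header line and a value line in lockstep */
  	while (fgets(hdr, sizeof(hdr), f) && fgets(val, sizeof(val), f)) {
  		char *hs, *vs;
  		char *h = strtok_r(hdr, " \n", &hs);
  		char *v = strtok_r(val, " \n", &vs);

  		if (!h || !v || strcmp(h, "MPTcpExt:"))
  			continue;
  		while ((h = strtok_r(NULL, " \n", &hs)) &&
  		       (v = strtok_r(NULL, " \n", &vs))) {
  			if (!strcmp(h, name)) {
  				ret = atol(v);
  				break;
  			}
  		}
  		break;
  	}
  	fclose(f);
  	return ret;
  }

  int main(void)
  {
  	printf("MPJoinRejected: %ld\n", mptcp_counter("MPJoinRejected"));
  	return 0;
  }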
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-5-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/mib.c | 1 + net/mptcp/mib.h | 1 + net/mptcp/protocol.c | 4 +++- net/mptcp/subflow.c | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 19eb9292bd609..0c24545f0e8df 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -28,6 +28,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 128282982843a..250c6b77977e8 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -23,6 +23,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 44f7ab463d755..26ffa06c21e8d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3527,8 +3527,10 @@ bool mptcp_finish_join(struct sock *ssk) return true; } - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; + } /* If we can't acquire msk socket lock here, let the release callback * handle it diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index e7951786a97c9..15613d691bfef 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -247,6 +247,7 @@ static int subflow_check_req(struct request_sock *req, if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } @@ -902,6 +903,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } if (!mptcp_can_accept_new_subflow(owner)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); goto dispose_child; } From 98dea4fd6315c17f4e072deada6047d641995777 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sun, 13 Apr 2025 11:34:37 +0200 Subject: [PATCH 204/770] selftests: mptcp: validate MPJoinRejected counter The parent commit adds this new counter, incremented when receiving a connection request, if the PM didn't allow the creation of new subflows. Most of the time, it is then kept at 0, except when the PM limits cause the receiver side to reject new MPJoin connections. 
This is the case in the following tests:
 - single subflow, limited by server
 - multiple subflows, limited by server
 - subflows limited by server w cookies
 - userspace pm type rejects join
 - userspace pm type prevents mp_prio

Simply set join_syn_rej=1 when checking the MPJoin counters for these
tests.

Reviewed-by: Geliang Tang
Signed-off-by: Matthieu Baerts (NGI0)
Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-6-0f83a4350150@kernel.org
Signed-off-by: Jakub Kicinski
---
 .../testing/selftests/net/mptcp/mptcp_join.sh | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index befa66f5a366b..b8af65373b3ad 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -62,6 +62,7 @@ unset sflags
 unset fastclose
 unset fullmesh
 unset speed
+unset join_syn_rej
 unset join_csum_ns1
 unset join_csum_ns2
 unset join_fail_nr
@@ -1403,6 +1404,7 @@ chk_join_nr()
 	local syn_nr=$1
 	local syn_ack_nr=$2
 	local ack_nr=$3
+	local syn_rej=${join_syn_rej:-0}
 	local csum_ns1=${join_csum_ns1:-0}
 	local csum_ns2=${join_csum_ns2:-0}
 	local fail_nr=${join_fail_nr:-0}
@@ -1468,6 +1470,15 @@ chk_join_nr()
 		fail_test "got $count JOIN[s] ack HMAC failure expected 0"
 	fi
 
+	count=$(mptcp_lib_get_counter ${ns1} "MPTcpExtMPJoinRejected")
+	if [ -z "$count" ]; then
+		rc=${KSFT_SKIP}
+	elif [ "$count" != "$syn_rej" ]; then
+		rc=${KSFT_FAIL}
+		print_check "syn rejected"
+		fail_test "got $count JOIN[s] syn rejected expected $syn_rej"
+	fi
+
 	print_results "join Rx" ${rc}
 
 	join_syn_tx="${join_syn_tx:-${syn_nr}}" \
@@ -1963,7 +1974,8 @@ subflows_tests()
 		pm_nl_set_limits $ns2 0 1
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
 	fi
 
 	# subflow
@@ -1992,7 +2004,8 @@ subflows_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 2 2 1
+		join_syn_rej=1 \
+			chk_join_nr 2 2 1
 	fi
 
 	# single subflow, dev
@@ -3061,7 +3074,8 @@ syncookies_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		pm_nl_add_endpoint $ns2 10.0.2.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 2 1 1
+		join_syn_rej=1 \
+			chk_join_nr 2 1 1
 	fi
 
 	# test signal address with cookies
@@ -3545,7 +3559,8 @@ userspace_tests()
 		pm_nl_set_limits $ns2 1 1
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
 	fi
 
 	# userspace pm type does not send join
@@ -3568,7 +3583,8 @@ userspace_tests()
 		pm_nl_add_endpoint $ns2 10.0.3.2 flags subflow
 		sflags=backup speed=slow \
 			run_tests $ns1 $ns2 10.0.1.1
-		chk_join_nr 1 1 0
+		join_syn_rej=1 \
+			chk_join_nr 1 1 0
 		chk_prio_nr 0 0 0 0
 	fi

From f9c7504d305546d5cefd24062f377a4d2e615025 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Sun, 13 Apr 2025 11:34:38 +0200
Subject: [PATCH 205/770] selftests: mptcp: diag: drop nlh parameter of recv_nlmsg

It's strange that the 'nlh' variable is set to NULL in get_mptcpinfo()
and then this NULL pointer is passed to recv_nlmsg(). In fact, this
variable should be defined in recv_nlmsg(), not get_mptcpinfo(). So
this patch drops the useless 'nlh' parameter of recv_nlmsg() and
defines the 'nlh' variable in recv_nlmsg().
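For reference, the resulting shape of the receive path, condensed into
a standalone sketch (plain recv() instead of the test's recvmsg(), and
the parse step stubbed out):

  #include <linux/netlink.h>
  #include <sys/socket.h>

  /* The header pointer is now purely local to the receive loop. */
  static void recv_nlmsg_sketch(int fd)
  {
  	char rcv_buff[8192];
  	struct nlmsghdr *nlh = (struct nlmsghdr *)rcv_buff;
  	int len = recv(fd, rcv_buff, sizeof(rcv_buff), 0);

  	if (len < 0)
  		return;
  	while (NLMSG_OK(nlh, len)) {
  		if (nlh->nlmsg_type == NLMSG_DONE ||
  		    nlh->nlmsg_type == NLMSG_ERROR)
  			break;
  		/* parse_nlmsg(nlh) would go here */
  		nlh = NLMSG_NEXT(nlh, len);
  	}
  }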
Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-7-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/mptcp_diag.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_diag.c b/tools/testing/selftests/net/mptcp/mptcp_diag.c index 284286c524cfe..37d5015ad08c4 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_diag.c +++ b/tools/testing/selftests/net/mptcp/mptcp_diag.c @@ -185,9 +185,10 @@ static void parse_nlmsg(struct nlmsghdr *nlh) } } -static void recv_nlmsg(int fd, struct nlmsghdr *nlh) +static void recv_nlmsg(int fd) { char rcv_buff[8192]; + struct nlmsghdr *nlh = (struct nlmsghdr *)rcv_buff; struct sockaddr_nl rcv_nladdr = { .nl_family = AF_NETLINK }; @@ -204,7 +205,6 @@ static void recv_nlmsg(int fd, struct nlmsghdr *nlh) int len; len = recvmsg(fd, &rcv_msg, 0); - nlh = (struct nlmsghdr *)rcv_buff; while (NLMSG_OK(nlh, len)) { if (nlh->nlmsg_type == NLMSG_DONE) { @@ -225,7 +225,6 @@ static void recv_nlmsg(int fd, struct nlmsghdr *nlh) static void get_mptcpinfo(__u32 token) { - struct nlmsghdr *nlh = NULL; int fd; fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); @@ -233,7 +232,7 @@ static void get_mptcpinfo(__u32 token) die_perror("Netlink socket"); send_query(fd, token); - recv_nlmsg(fd, nlh); + recv_nlmsg(fd); close(fd); } From a862771d1aa4c5cf4d92adfbc4b0879918e0e725 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Sun, 13 Apr 2025 11:34:39 +0200 Subject: [PATCH 206/770] selftests: mptcp: use IPPROTO_MPTCP for getaddrinfo mptcp_connect.c is a startup tutorial of MPTCP programming, however there is a lack of ai_protocol(IPPROTO_MPTCP) usage. Add comment for getaddrinfo MPTCP support. This patch first uses IPPROTO_MPTCP to get addrinfo, and if glibc version is too old, it falls back to using IPPROTO_TCP. Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Signed-off-by: zhenwei pi Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250413-net-next-mptcp-sched-mib-sft-misc-v2-8-0f83a4350150@kernel.org Signed-off-by: Jakub Kicinski --- .../selftests/net/mptcp/mptcp_connect.c | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index c83a8b47bbdfa..ac1349c4b9e54 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -180,13 +180,26 @@ static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen, } static void xgetaddrinfo(const char *node, const char *service, - const struct addrinfo *hints, + struct addrinfo *hints, struct addrinfo **res) { +again: int err = getaddrinfo(node, service, hints, res); if (err) { - const char *errstr = getxinfo_strerr(err); + const char *errstr; + + /* glibc starts to support MPTCP since v2.42. + * For older versions, use IPPROTO_TCP to resolve, + * and use TCP/MPTCP to create socket. + * Link: https://sourceware.org/git/?p=glibc.git;a=commit;h=a8e9022e0f82 + */ + if (err == EAI_SOCKTYPE) { + hints->ai_protocol = IPPROTO_TCP; + goto again; + } + + errstr = getxinfo_strerr(err); fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n", node ? node : "", service ? 
service : "", errstr); @@ -292,7 +305,7 @@ static int sock_listen_mptcp(const char * const listenaddr, { int sock = -1; struct addrinfo hints = { - .ai_protocol = IPPROTO_TCP, + .ai_protocol = IPPROTO_MPTCP, .ai_socktype = SOCK_STREAM, .ai_flags = AI_PASSIVE | AI_NUMERICHOST }; @@ -356,7 +369,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, int infd, struct wstate *winfo) { struct addrinfo hints = { - .ai_protocol = IPPROTO_TCP, + .ai_protocol = IPPROTO_MPTCP, .ai_socktype = SOCK_STREAM, }; struct addrinfo *a, *addr; From f99564688f38458d86b64f099ebf03f19517cf77 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 13 Apr 2025 16:09:40 +0200 Subject: [PATCH 207/770] net: phy: remove device_phy_find_device AFAICS this function has never had a user. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/ab7b8094-2eea-4e82-a047-fd60117f220b@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 12 ------------ include/linux/phy.h | 6 ------ 2 files changed, 18 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc1bfd22fb812..cc6c209fe7022 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3235,18 +3235,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) } EXPORT_SYMBOL(fwnode_phy_find_device); -/** - * device_phy_find_device - For the given device, get the phy_device - * @dev: Pointer to the given device - * - * Refer return conditions of fwnode_phy_find_device(). - */ -struct phy_device *device_phy_find_device(struct device *dev) -{ - return fwnode_phy_find_device(dev_fwnode(dev)); -} -EXPORT_SYMBOL_GPL(device_phy_find_device); - /** * fwnode_get_phy_node - Get the phy_node using the named reference. * @fwnode: Pointer to fwnode from which phy_node has to be obtained. diff --git a/include/linux/phy.h b/include/linux/phy.h index a2bfae80c4497..fb755358d965b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1757,7 +1757,6 @@ struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); -struct phy_device *device_phy_find_device(struct device *dev); struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); @@ -1779,11 +1778,6 @@ struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) return NULL; } -static inline struct phy_device *device_phy_find_device(struct device *dev) -{ - return NULL; -} - static inline struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) { From e056d3d703886d69599fb757e65495ac606605ac Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 14 Apr 2025 01:52:43 +0100 Subject: [PATCH 208/770] qed: Remove unused qed_memset_*ctx functions qed_memset_session_ctx() and qed_memset_task_ctx() were added in 2017 as part of commit da09091732ae ("qed*: Utilize FW 8.33.1.0") but have not been used. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Link: https://patch.msgid.link/20250414005247.341243-2-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 24 ------------ .../ethernet/qlogic/qed/qed_init_fw_funcs.c | 38 ------------------- 2 files changed, 62 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index ed1a84542ad2d..3181fed1274e9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -2694,30 +2694,6 @@ void qed_calc_session_ctx_validation(void *p_ctx_mem, void qed_calc_task_ctx_validation(void *p_ctx_mem, u16 ctx_size, u8 ctx_type, u32 tid); -/** - * qed_memset_session_ctx(): Memset session context to 0 while - * preserving validation bytes. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Size to initialzie. - * @ctx_type: Context type. - * - * Return: Void. - */ -void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); - -/** - * qed_memset_task_ctx(): Memset task context to 0 while preserving - * validation bytes. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: size to initialzie. - * @ctx_type: context type. - * - * Return: Void. - */ -void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type); - #define NUM_STORMS 6 /** diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c index 407029a36fa14..4b9128c08ad33 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -1666,44 +1666,6 @@ void qed_calc_task_ctx_validation(void *p_ctx_mem, *region1_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 1, tid); } -/* Memset session context to 0 while preserving validation bytes */ -void qed_memset_session_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) -{ - u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; - u8 x_val, t_val, u_val; - - p_ctx = (u8 * const)p_ctx_mem; - x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; - t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; - u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; - - x_val = *x_val_ptr; - t_val = *t_val_ptr; - u_val = *u_val_ptr; - - memset(p_ctx, 0, ctx_size); - - *x_val_ptr = x_val; - *t_val_ptr = t_val; - *u_val_ptr = u_val; -} - -/* Memset task context to 0 while preserving validation bytes */ -void qed_memset_task_ctx(void *p_ctx_mem, u32 ctx_size, u8 ctx_type) -{ - u8 *p_ctx, *region1_val_ptr; - u8 region1_val; - - p_ctx = (u8 * const)p_ctx_mem; - region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; - - region1_val = *region1_val_ptr; - - memset(p_ctx, 0, ctx_size); - - *region1_val_ptr = region1_val; -} - /* Enable and configure context validation */ void qed_enable_context_validation(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) From fa381e21a907cf3cd009bd08c50231345b475772 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 14 Apr 2025 01:52:44 +0100 Subject: [PATCH 209/770] qed: Remove unused qed_calc_*_ctx_validation functions qed_calc_session_ctx_validation() and qed_calc_task_ctx_validation() were added as part of 2017's commit da09091732ae ("qed*: Utilize FW 8.33.1.0") but have remained unused. Remove them. This leaves; con_region_offsets[], task_region_offsets[], cdu_crc8_table and qed_calc_cdu_validation_byte() unused. Remove them. Signed-off-by: Dr. 
David Alan Gilbert Link: https://patch.msgid.link/20250414005247.341243-3-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_hsi.h | 28 ----- .../ethernet/qlogic/qed/qed_init_fw_funcs.c | 100 ------------------ 2 files changed, 128 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 3181fed1274e9..10e355397ceeb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -2666,34 +2666,6 @@ void qed_gft_config(struct qed_hwfn *p_hwfn, void qed_enable_context_validation(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt); -/** - * qed_calc_session_ctx_validation(): Calcualte validation byte for - * session context. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Context size. - * @ctx_type: Context type. - * @cid: Context cid. - * - * Return: Void. - */ -void qed_calc_session_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 cid); - -/** - * qed_calc_task_ctx_validation(): Calcualte validation byte for task - * context. - * - * @p_ctx_mem: Pointer to context memory. - * @ctx_size: Context size. - * @ctx_type: Context type. - * @tid: Context tid. - * - * Return: Void. - */ -void qed_calc_task_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 tid); - #define NUM_STORMS 6 /** diff --git a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c index 4b9128c08ad33..aa20bb8caa9a9 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c +++ b/drivers/net/ethernet/qlogic/qed/qed_init_fw_funcs.c @@ -18,16 +18,6 @@ #define CDU_VALIDATION_DEFAULT_CFG CDU_CONTEXT_VALIDATION_DEFAULT_CFG -static u16 con_region_offsets[3][NUM_OF_CONNECTION_TYPES] = { - {400, 336, 352, 368, 304, 384, 416, 352}, /* region 3 offsets */ - {528, 496, 416, 512, 448, 512, 544, 480}, /* region 4 offsets */ - {608, 544, 496, 576, 576, 592, 624, 560} /* region 5 offsets */ -}; - -static u16 task_region_offsets[1][NUM_OF_CONNECTION_TYPES] = { - {240, 240, 112, 0, 0, 0, 0, 96} /* region 1 offsets */ -}; - /* General constants */ #define QM_PQ_MEM_4KB(pq_size) (pq_size ? 
DIV_ROUND_UP((pq_size + 1) * \ QM_PQ_ELEMENT_SIZE, \ @@ -1576,96 +1566,6 @@ void qed_gft_config(struct qed_hwfn *p_hwfn, qed_wr(p_hwfn, p_ptt, PRS_REG_SEARCH_GFT, 1); } -DECLARE_CRC8_TABLE(cdu_crc8_table); - -/* Calculate and return CDU validation byte per connection type/region/cid */ -static u8 qed_calc_cdu_validation_byte(u8 conn_type, u8 region, u32 cid) -{ - const u8 validation_cfg = CDU_VALIDATION_DEFAULT_CFG; - u8 crc, validation_byte = 0; - static u8 crc8_table_valid; /* automatically initialized to 0 */ - u32 validation_string = 0; - __be32 data_to_crc; - - if (!crc8_table_valid) { - crc8_populate_msb(cdu_crc8_table, 0x07); - crc8_table_valid = 1; - } - - /* The CRC is calculated on the String-to-compress: - * [31:8] = {CID[31:20],CID[11:0]} - * [7:4] = Region - * [3:0] = Type - */ - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_CID) & 1) - validation_string |= (cid & 0xFFF00000) | ((cid & 0xFFF) << 8); - - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_REGION) & 1) - validation_string |= ((region & 0xF) << 4); - - if ((validation_cfg >> CDU_CONTEXT_VALIDATION_CFG_USE_TYPE) & 1) - validation_string |= (conn_type & 0xF); - - /* Convert to big-endian and calculate CRC8 */ - data_to_crc = cpu_to_be32(validation_string); - crc = crc8(cdu_crc8_table, (u8 *)&data_to_crc, sizeof(data_to_crc), - CRC8_INIT_VALUE); - - /* The validation byte [7:0] is composed: - * for type A validation - * [7] = active configuration bit - * [6:0] = crc[6:0] - * - * for type B validation - * [7] = active configuration bit - * [6:3] = connection_type[3:0] - * [2:0] = crc[2:0] - */ - validation_byte |= - ((validation_cfg >> - CDU_CONTEXT_VALIDATION_CFG_USE_ACTIVE) & 1) << 7; - - if ((validation_cfg >> - CDU_CONTEXT_VALIDATION_CFG_VALIDATION_TYPE_SHIFT) & 1) - validation_byte |= ((conn_type & 0xF) << 3) | (crc & 0x7); - else - validation_byte |= crc & 0x7F; - - return validation_byte; -} - -/* Calcualte and set validation bytes for session context */ -void qed_calc_session_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 cid) -{ - u8 *x_val_ptr, *t_val_ptr, *u_val_ptr, *p_ctx; - - p_ctx = (u8 * const)p_ctx_mem; - x_val_ptr = &p_ctx[con_region_offsets[0][ctx_type]]; - t_val_ptr = &p_ctx[con_region_offsets[1][ctx_type]]; - u_val_ptr = &p_ctx[con_region_offsets[2][ctx_type]]; - - memset(p_ctx, 0, ctx_size); - - *x_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 3, cid); - *t_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 4, cid); - *u_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 5, cid); -} - -/* Calcualte and set validation bytes for task context */ -void qed_calc_task_ctx_validation(void *p_ctx_mem, - u16 ctx_size, u8 ctx_type, u32 tid) -{ - u8 *p_ctx, *region1_val_ptr; - - p_ctx = (u8 * const)p_ctx_mem; - region1_val_ptr = &p_ctx[task_region_offsets[0][ctx_type]]; - - memset(p_ctx, 0, ctx_size); - - *region1_val_ptr = qed_calc_cdu_validation_byte(ctx_type, 1, tid); -} - /* Enable and configure context validation */ void qed_enable_context_validation(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) From 3c18acefaf9f59c32bd9c53bded4c523464901e6 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 14 Apr 2025 01:52:45 +0100 Subject: [PATCH 210/770] qed: Remove unused qed_ptt_invalidate qed_ptt_invalidate() was added in 2015 as part of commit fe56b9e6a8d9 ("qed: Add module with basic common support") but has remained unused. Remove it. Signed-off-by: Dr. 
David Alan Gilbert Link: https://patch.msgid.link/20250414005247.341243-4-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_hw.c | 11 ----------- drivers/net/ethernet/qlogic/qed/qed_hw.h | 9 --------- 2 files changed, 20 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.c b/drivers/net/ethernet/qlogic/qed/qed_hw.c index 9e5f0dbc8a076..9907973399dc4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.c +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.c @@ -69,17 +69,6 @@ int qed_ptt_pool_alloc(struct qed_hwfn *p_hwfn) return 0; } -void qed_ptt_invalidate(struct qed_hwfn *p_hwfn) -{ - struct qed_ptt *p_ptt; - int i; - - for (i = 0; i < PXP_EXTERNAL_BAR_PF_WINDOW_NUM; i++) { - p_ptt = &p_hwfn->p_ptt_pool->ptts[i]; - p_ptt->pxp.offset = QED_BAR_INVALID_OFFSET; - } -} - void qed_ptt_pool_free(struct qed_hwfn *p_hwfn) { kfree(p_hwfn->p_ptt_pool); diff --git a/drivers/net/ethernet/qlogic/qed/qed_hw.h b/drivers/net/ethernet/qlogic/qed/qed_hw.h index e535983ce21bb..3c98f58a184fa 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hw.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hw.h @@ -61,15 +61,6 @@ enum _dmae_cmd_crc_mask { */ void qed_gtt_init(struct qed_hwfn *p_hwfn); -/** - * qed_ptt_invalidate(): Forces all ptt entries to be re-configured - * - * @p_hwfn: HW device data. - * - * Return: Void. - */ -void qed_ptt_invalidate(struct qed_hwfn *p_hwfn); - /** * qed_ptt_pool_alloc(): Allocate and initialize PTT pool. * From 058fa8736570155830a50f836fbd31c74ff99091 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 14 Apr 2025 01:52:46 +0100 Subject: [PATCH 211/770] qed: Remove unused qed_print_mcp_trace_* While most of the trace code is reachable by other routes (I think mostly via the qed_features_lookup[] array), there are a couple of unused wrappers. qed_print_mcp_trace_line() and qed_print_mcp_trace_results_cont() were added in 2018 as part of commit a3f723079df8 ("qed*: Utilize FW 8.37.7.0") but have remained unused. Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250414005247.341243-5-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h | 31 ------------------- drivers/net/ethernet/qlogic/qed/qed_debug.c | 25 --------------- 2 files changed, 56 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h index f6cd1b3efdfdf..27e91d0d39f8c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dbg_hsi.h @@ -1304,37 +1304,6 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, u32 num_dumped_dwords, char *results_buf); -/** - * qed_print_mcp_trace_results_cont(): Prints MCP Trace results, and - * keeps the MCP trace meta data allocated, to support continuous MCP Trace - * parsing. After the continuous parsing ends, mcp_trace_free_meta_data should - * be called to free the meta data. - * - * @p_hwfn: HW device data. - * @dump_buf: MVP trace dump buffer, starting from the header. - * @results_buf: Buffer for printing the mcp trace results. - * - * Return: Error if the parsing fails, ok otherwise. - */ -enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn, - u32 *dump_buf, - char *results_buf); - -/** - * qed_print_mcp_trace_line(): Prints MCP Trace results for a single line - * - * @p_hwfn: HW device data. - * @dump_buf: MCP trace dump buffer, starting from the header. 
- * @num_dumped_bytes: Number of bytes that were dumped. - * @results_buf: Buffer for printing the mcp trace results. - * - * Return: Error if the parsing fails, ok otherwise. - */ -enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn, - u8 *dump_buf, - u32 num_dumped_bytes, - char *results_buf); - /** * qed_mcp_trace_free_meta_data(): Frees the MCP Trace meta data. * Should be called after continuous MCP Trace parsing. diff --git a/drivers/net/ethernet/qlogic/qed/qed_debug.c b/drivers/net/ethernet/qlogic/qed/qed_debug.c index 464a72afb7589..9c3d3dd2f8475 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_debug.c +++ b/drivers/net/ethernet/qlogic/qed/qed_debug.c @@ -7614,31 +7614,6 @@ enum dbg_status qed_print_mcp_trace_results(struct qed_hwfn *p_hwfn, results_buf, &parsed_buf_size, true); } -enum dbg_status qed_print_mcp_trace_results_cont(struct qed_hwfn *p_hwfn, - u32 *dump_buf, - char *results_buf) -{ - u32 parsed_buf_size; - - return qed_parse_mcp_trace_dump(p_hwfn, dump_buf, results_buf, - &parsed_buf_size, false); -} - -enum dbg_status qed_print_mcp_trace_line(struct qed_hwfn *p_hwfn, - u8 *dump_buf, - u32 num_dumped_bytes, - char *results_buf) -{ - u32 parsed_results_bytes; - - return qed_parse_mcp_trace_buf(p_hwfn, - dump_buf, - num_dumped_bytes, - 0, - num_dumped_bytes, - results_buf, &parsed_results_bytes); -} - /* Frees the specified MCP Trace meta data */ void qed_mcp_trace_free_meta_data(struct qed_hwfn *p_hwfn) { From 915359abc68c74efea4a735207dd1367b28da960 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 14 Apr 2025 01:52:47 +0100 Subject: [PATCH 212/770] qed: Remove unused qed_db_recovery_dp qed_db_recovery_dp() was added in 2018 as part of commit 36907cd5cd72 ("qed: Add doorbell overflow recovery mechanism") but has remained unused. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250414005247.341243-6-linux@treblig.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qed/qed.h | 1 - drivers/net/ethernet/qlogic/qed/qed_dev.c | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed.h b/drivers/net/ethernet/qlogic/qed/qed.h index b7def3b549376..016b575861b9e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed.h +++ b/drivers/net/ethernet/qlogic/qed/qed.h @@ -939,7 +939,6 @@ u16 qed_get_cm_pq_idx_ofld_mtc(struct qed_hwfn *p_hwfn, u8 tc); u16 qed_get_cm_pq_idx_llt_mtc(struct qed_hwfn *p_hwfn, u8 tc); /* doorbell recovery mechanism */ -void qed_db_recovery_dp(struct qed_hwfn *p_hwfn); void qed_db_recovery_execute(struct qed_hwfn *p_hwfn); bool qed_edpm_enabled(struct qed_hwfn *p_hwfn); diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 86a93cac26470..9659ce5b07125 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -255,25 +255,6 @@ static void qed_db_recovery_teardown(struct qed_hwfn *p_hwfn) p_hwfn->db_recovery_info.db_recovery_counter = 0; } -/* Print the content of the doorbell recovery mechanism */ -void qed_db_recovery_dp(struct qed_hwfn *p_hwfn) -{ - struct qed_db_recovery_entry *db_entry = NULL; - - DP_NOTICE(p_hwfn, - "Displaying doorbell recovery database. 
Counter was %d\n", - p_hwfn->db_recovery_info.db_recovery_counter); - - /* Protect the list */ - spin_lock_bh(&p_hwfn->db_recovery_info.lock); - list_for_each_entry(db_entry, - &p_hwfn->db_recovery_info.list, list_entry) { - qed_db_recovery_dp_entry(p_hwfn, db_entry, "Printing"); - } - - spin_unlock_bh(&p_hwfn->db_recovery_info.lock); -} - /* Ring the doorbell of a single doorbell recovery entry */ static void qed_db_recovery_ring(struct qed_hwfn *p_hwfn, struct qed_db_recovery_entry *db_entry) From 95d06e92a401928fe46fda7616e460f39cb7211b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:07 -0700 Subject: [PATCH 213/770] netlink: Introduce nlmsg_payload helper Create a new helper function, nlmsg_payload(), to simplify checking and retrieving Netlink message payloads. This reduces boilerplate code for users who need to verify the message length before accessing its data. Suggested-by: Jakub Kicinski Signed-off-by: Breno Leitao Reviewed-by: Jakub Kicinski Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-1-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/net/netlink.h b/include/net/netlink.h index 29e0db9403820..82e07e272290a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -611,6 +611,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) return nlh->nlmsg_len - NLMSG_HDRLEN; } +/** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header From 7527efe8a41628f84a43efd47f065ae641dde769 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:08 -0700 Subject: [PATCH 214/770] neighbour: Use nlmsg_payload in neightbl_valid_dump_info Update neightbl_valid_dump_info function to utilize the new nlmsg_payload() helper function. This change improves code clarity and safety by ensuring that the Netlink message payload is properly validated before accessing its data. 
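The conversion pattern applied here and in the following patches is
mechanical; sketched below on a hypothetical validation function (not
one of the converted call sites):

  static int example_valid_req(const struct nlmsghdr *nlh,
  			     struct netlink_ext_ack *extack)
  {
  	struct ndmsg *ndm;

  	/* before: explicit nlmsg_len check, then nlmsg_data() */
  	ndm = nlmsg_payload(nlh, sizeof(*ndm));
  	if (!ndm) {
  		NL_SET_ERR_MSG(extack, "Invalid header for request");
  		return -EINVAL;
  	}

  	/* ndm is now known to lie fully inside the message */
  	return ndm->ndm_flags ? -EINVAL : 0;
  }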
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-2-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a07249b59ae1e..b6bc4836c6e45 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2430,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; From 2d1f827f0642261d5c955d3d30b5f997e06ce7a9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:09 -0700 Subject: [PATCH 215/770] neighbour: Use nlmsg_payload in neigh_valid_get_req Update neigh_valid_get_req function to utilize the new nlmsg_payload() helper function. This change improves code clarity and safety by ensuring that the Netlink message payload is properly validated before accessing its data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-3-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b6bc4836c6e45..65cf582b5dacd 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2855,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); From 77d02290366f058dce2555468a48ed8d62d95939 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:10 -0700 Subject: [PATCH 216/770] rtnetlink: Use nlmsg_payload in valid_fdb_dump_strict Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-4-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 38526210b8fdd..f5c018090efc3 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4887,12 +4887,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); From 72be72bea9dc8641c9f39c0335b71b9fbea0fae5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:11 -0700 Subject: [PATCH 217/770] mpls: Use nlmsg_payload in mpls_valid_fib_dump_req Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-5-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/mpls/af_mpls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f63b32d76d67..bf7cd290c2369 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2095,12 +2095,12 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { From e87187dfbb9f7f599da33603984a18adb07f0174 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:12 -0700 Subject: [PATCH 218/770] ipv6: Use nlmsg_payload in inet6_valid_dump_ifaddr_req Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-6-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9ba83f0c99283..a9aeb949d9c8e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5346,12 +5346,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; From 8cf1e30907de2ebd850042e4da9fbc72e094d7de Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:13 -0700 Subject: [PATCH 219/770] ipv6: Use nlmsg_payload in inet6_rtm_valid_getaddr_req Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-7-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a9aeb949d9c8e..4af2761795428 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5484,7 +5484,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5493,7 +5494,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; From 69a1ecfe47f0c61f00ed281500c2b6da8b364561 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:14 -0700 Subject: [PATCH 220/770] mpls: Use nlmsg_payload in mpls_valid_getroute_req Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-8-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/mpls/af_mpls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index bf7cd290c2369..d536c97144e9d 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2288,7 +2288,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -2298,7 +2299,6 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { From 4c113c803fdc5cc311383f914ca3fb301dba9810 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:15 -0700 Subject: [PATCH 221/770] net: fib_rules: Use nlmsg_payload in fib_valid_dumprule_req Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-9-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/core/fib_rules.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 4bc64d912a1c0..6a7a28bf631c2 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1238,12 +1238,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, From 8ff953036110bf1804b054b730300c8377bdde5e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 14 Apr 2025 06:24:16 -0700 Subject: [PATCH 222/770] net: fib_rules: Use nlmsg_payload in fib_{new,del}rule() Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
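The detail worth noting in this pair of conversions is that frh used to
be initialized straight from nlmsg_data(nlh) in its declaration, before
any length validation had run. A hedged sketch of the resulting shape,
with foo_* placeholders standing in for the real fib names:

	static int foo_newrule(struct nlmsghdr *nlh,
			       struct netlink_ext_ack *extack)
	{
		struct foo_rule_hdr *frh;
		int err = -EINVAL;

		/* The header pointer is only obtained once the implicit
		 * length check inside nlmsg_payload() has passed.
		 */
		frh = nlmsg_payload(nlh, sizeof(*frh));
		if (!frh) {
			NL_SET_ERR_MSG(extack, "Invalid msg length");
			goto errout;
		}

		/* ... attribute parsing and rule installation ... */
		return 0;

	errout:
		return err;
	}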
Suggested-by: Kuniyuki Iwashima Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250414-nlmsg-v2-10-3d90cb42c6af@debian.org Signed-off-by: Jakub Kicinski --- net/core/fib_rules.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 6a7a28bf631c2..06052b6c946b9 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -852,13 +852,14 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *r, *last = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -980,13 +981,14 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, bool rtnl_held) { struct fib_rule *rule = NULL, *nlrule = NULL; - struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; struct nlattr *tb[FRA_MAX+1]; bool user_priority = false; + struct fib_rule_hdr *frh; int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } From c30a45a7e072cd1cb9887daba7446ae653b12801 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 10:06:00 +0100 Subject: [PATCH 223/770] net: stmmac: anarion: clean up anarion_config_dt() error handling When enabled, print a user friendly description of the error when failing to ioremap() the control resource, and use ERR_CAST() when propagating the error. This allows us to get rid of the "err" local variable in anarion_config_dt(). Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4FlQ-000XjA-2V@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 37fe7c2888784..232aae7526905 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -65,13 +65,12 @@ anarion_config_dt(struct platform_device *pdev, { struct anarion_gmac *gmac; void __iomem *ctl_block; - int err; ctl_block = devm_platform_ioremap_resource(pdev, 1); if (IS_ERR(ctl_block)) { - err = PTR_ERR(ctl_block); - dev_err(&pdev->dev, "Cannot get reset region (%d)!\n", err); - return ERR_PTR(err); + dev_err(&pdev->dev, "Cannot get reset region (%pe)!\n", + ctl_block); + return ERR_CAST(ctl_block); } gmac = devm_kzalloc(&pdev->dev, sizeof(*gmac), GFP_KERNEL); From a55ec9c811aa25504951113b45e1df60ab6d751c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 10:06:05 +0100 Subject: [PATCH 224/770] net: stmmac: anarion: clean up interface parsing anarion_config_dt() used a switch statement to check for the RGMII modes, complete with an unnecessary "fallthrough", and also printed the numerical value of the PHY interface mode on error. 
Clean this up using the phy_interface_mode_is_rgmii() helper, and print the English version of the PHY interface mode on error. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4FlV-000XjG-83@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-anarion.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 232aae7526905..941ea724c6432 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -79,17 +79,11 @@ anarion_config_dt(struct platform_device *pdev, gmac->ctl_block = ctl_block; - switch (plat_dat->phy_interface) { - case PHY_INTERFACE_MODE_RGMII: - fallthrough; - case PHY_INTERFACE_MODE_RGMII_ID: - case PHY_INTERFACE_MODE_RGMII_RXID: - case PHY_INTERFACE_MODE_RGMII_TXID: + if (phy_interface_mode_is_rgmii(plat_dat->phy_interface)) { gmac->phy_intf_sel = GMAC_CONFIG_INTF_RGMII; - break; - default: - dev_err(&pdev->dev, "Unsupported phy-mode (%d)\n", - plat_dat->phy_interface); + } else { + dev_err(&pdev->dev, "Unsupported phy-mode (%s)\n", + phy_modes(plat_dat->phy_interface)); return ERR_PTR(-ENOTSUPP); } From 5956527e26ffcdfe2f9698c9872b91dd8814d997 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 10:06:10 +0100 Subject: [PATCH 225/770] net: stmmac: anarion: use stmmac_pltfr_probe() Rather than open-coding the call to anarion_gmac_init() and then stmmac_dvr_probe(), omitting the cleanup of calling anarion_gmac_exit(), use stmmac_pltfr_probe() which will handle this for us. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4Fla-000XjM-Bw@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 941ea724c6432..99f6c977ec41c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -111,10 +111,9 @@ static int anarion_dwmac_probe(struct platform_device *pdev) plat_dat->init = anarion_gmac_init; plat_dat->exit = anarion_gmac_exit; - anarion_gmac_init(pdev, gmac); plat_dat->bsp_priv = gmac; - return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + return stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static const struct of_device_id anarion_dwmac_match[] = { From a1afabef915cced083ece76cc364d22e0a66192c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 10:06:15 +0100 Subject: [PATCH 226/770] net: stmmac: anarion: use devm_stmmac_pltfr_probe() Convert anarion to use devm_stmmac_pltfr_probe() which allows the removal of an explicit call to stmmac_pltfr_remove(). 
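As a rough sketch, the glue-driver probe shape these conversions aim
for looks like the following; foo_dwmac_probe() is a placeholder rather
than the literal anarion code, but the stmmac platform helpers named
here are real:

	static int foo_dwmac_probe(struct platform_device *pdev)
	{
		struct plat_stmmacenet_data *plat_dat;
		struct stmmac_resources stmmac_res;
		int ret;

		ret = stmmac_get_platform_resources(pdev, &stmmac_res);
		if (ret)
			return ret;

		plat_dat = devm_stmmac_probe_config_dt(pdev, stmmac_res.mac);
		if (IS_ERR(plat_dat))
			return PTR_ERR(plat_dat);

		/* set plat_dat->bsp_priv, ->init and ->exit here */

		/* devm variant: no .remove callback, no manual cleanup */
		return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
	}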
Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4Flf-000XjS-Fi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c index 99f6c977ec41c..84072c8ed741c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-anarion.c @@ -113,7 +113,7 @@ static int anarion_dwmac_probe(struct platform_device *pdev) plat_dat->exit = anarion_gmac_exit; plat_dat->bsp_priv = gmac; - return stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static const struct of_device_id anarion_dwmac_match[] = { @@ -124,7 +124,6 @@ MODULE_DEVICE_TABLE(of, anarion_dwmac_match); static struct platform_driver anarion_dwmac_driver = { .probe = anarion_dwmac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "anarion-dwmac", .pm = &stmmac_pltfr_pm_ops, From b2ee4451c1d42a736a94ee582c67c0ccdc05f715 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 10:06:25 +0100 Subject: [PATCH 227/770] net: stmmac: imx: use stmmac_pltfr_probe() Using stmmac_pltfr_probe() simplifies the probe function. This will not only call plat_dat->init (imx_dwmac_init), but also plat_dat->exit (imx_dwmac_exit) appropriately if stmmac_dvr_probe() fails. This results in an overall simplification of the glue driver. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4Flp-000XlM-Tb@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 5d279fa54b3e8..889e2bb6f7f5d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -379,10 +379,6 @@ static int imx_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ret = imx_dwmac_init(pdev, dwmac); - if (ret) - goto err_dwmac_init; - if (dwmac->ops->fix_mac_speed) { plat_dat->fix_mac_speed = dwmac->ops->fix_mac_speed; } else if (!dwmac->ops->mac_rgmii_txclk_auto_adj) { @@ -392,16 +388,10 @@ static int imx_dwmac_probe(struct platform_device *pdev) dwmac->plat_dat->fix_soc_reset = dwmac->ops->fix_soc_reset; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + ret = stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); if (ret) - goto err_drv_probe; - - return 0; + imx_dwmac_clks_config(dwmac, false); -err_drv_probe: - imx_dwmac_exit(pdev, plat_dat->bsp_priv); -err_dwmac_init: - imx_dwmac_clks_config(dwmac, false); return ret; } From debfcb3f58488e1ee8177a03bf9338ede936158f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 14 Apr 2025 16:51:01 +0100 Subject: [PATCH 228/770] net: stmmac: ingenic: convert to stmmac_pltfr_pm_ops Convert the Ingenic glue driver to use the generic stmmac platform power management operations. In order to do this, we need to make ingenic_mac_init() arguments compatible with plat_dat->init() by adding a plat_dat member to struct ingenic_mac. This allows the custom suspend/resume operations to be removed, and the PM ops pointer replaced with stmmac_pltfr_pm_ops. 
This will add runtime PM and noirq suspend/resume ops to this driver.

Signed-off-by: Russell King (Oracle)
Reviewed-by: Andrew Lunn
Link: https://patch.msgid.link/E1u4M5N-000YGD-5i@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski
---
 .../ethernet/stmicro/stmmac/dwmac-ingenic.c   | 41 ++++---------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c
index 066783d664228..607e467324a4a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c
@@ -56,6 +56,7 @@ enum ingenic_mac_version {
 
 struct ingenic_mac {
 	const struct ingenic_soc_info *soc_info;
+	struct plat_stmmacenet_data *plat_dat;
 	struct device *dev;
 	struct regmap *regmap;
 
@@ -70,13 +71,13 @@ struct ingenic_soc_info {
 	int (*set_mode)(struct plat_stmmacenet_data *plat_dat);
 };
 
-static int ingenic_mac_init(struct plat_stmmacenet_data *plat_dat)
+static int ingenic_mac_init(struct platform_device *pdev, void *bsp_priv)
 {
-	struct ingenic_mac *mac = plat_dat->bsp_priv;
+	struct ingenic_mac *mac = bsp_priv;
 	int ret;
 
 	if (mac->soc_info->set_mode) {
-		ret = mac->soc_info->set_mode(plat_dat);
+		ret = mac->soc_info->set_mode(mac->plat_dat);
 		if (ret)
 			return ret;
 	}
@@ -284,44 +285,18 @@ static int ingenic_mac_probe(struct platform_device *pdev)
 
 	mac->soc_info = data;
 	mac->dev = &pdev->dev;
+	mac->plat_dat = plat_dat;
 
 	plat_dat->bsp_priv = mac;
+	plat_dat->init = ingenic_mac_init;
 
-	ret = ingenic_mac_init(plat_dat);
+	ret = ingenic_mac_init(pdev, mac);
 	if (ret)
 		return ret;
 
 	return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int ingenic_mac_suspend(struct device *dev)
-{
-	int ret;
-
-	ret = stmmac_suspend(dev);
-
-	return ret;
-}
-
-static int ingenic_mac_resume(struct device *dev)
-{
-	struct net_device *ndev = dev_get_drvdata(dev);
-	struct stmmac_priv *priv = netdev_priv(ndev);
-	int ret;
-
-	ret = ingenic_mac_init(priv->plat);
-	if (ret)
-		return ret;
-
-	ret = stmmac_resume(dev);
-
-	return ret;
-}
-#endif /* CONFIG_PM_SLEEP */
-
-static SIMPLE_DEV_PM_OPS(ingenic_mac_pm_ops, ingenic_mac_suspend, ingenic_mac_resume);
-
 static struct ingenic_soc_info jz4775_soc_info = {
 	.version = ID_JZ4775,
 	.mask = MACPHYC_TXCLK_SEL_MASK | MACPHYC_SOFT_RST_MASK | MACPHYC_PHY_INFT_MASK,
@@ -373,7 +348,7 @@ static struct platform_driver ingenic_mac_driver = {
 	.remove = stmmac_pltfr_remove,
 	.driver = {
 		.name = "ingenic-mac",
-		.pm = pm_ptr(&ingenic_mac_pm_ops),
+		.pm = &stmmac_pltfr_pm_ops,
 		.of_match_table = ingenic_mac_of_matches,
 	},
 };

From 96f8bf85d11acdd3ebc9a91f41cb18eac6f8e26d Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Mon, 14 Apr 2025 16:51:06 +0100
Subject: [PATCH 229/770] net: stmmac: ingenic: convert to
 devm_stmmac_pltfr_probe()

As Ingenic now uses the stmmac platform PM ops, convert it to use
devm_stmmac_pltfr_probe(), which will call the plat_dat->init() method
before stmmac_dvr_probe() and appropriately clean up via the ->exit()
method, thus simplifying the code. Using the devm_*() variant also
allows removal of the explicit call to stmmac_pltfr_remove().
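For reference, the callback signatures that plat_dat->init and
plat_dat->exit expect, and which ingenic_mac_init() is adjusted to
match above, are sketched below with invented foo_* names and
illustrative bodies:

	static int foo_init(struct platform_device *pdev, void *bsp_priv)
	{
		/* bsp_priv is the pointer stored in plat_dat->bsp_priv;
		 * (re)program SoC-specific glue registers here. Runs at
		 * probe time and again on resume via stmmac_pltfr_pm_ops.
		 */
		return 0;
	}

	static void foo_exit(struct platform_device *pdev, void *bsp_priv)
	{
		/* Undo foo_init(); runs on remove and on suspend. */
	}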
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4M5S-000YGJ-9K@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c index 607e467324a4a..15abe214131f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ingenic.c @@ -290,11 +290,7 @@ static int ingenic_mac_probe(struct platform_device *pdev) plat_dat->bsp_priv = mac; plat_dat->init = ingenic_mac_init; - ret = ingenic_mac_init(pdev, mac); - if (ret) - return ret; - - return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static struct ingenic_soc_info jz4775_soc_info = { @@ -345,7 +341,6 @@ MODULE_DEVICE_TABLE(of, ingenic_mac_of_matches); static struct platform_driver ingenic_mac_driver = { .probe = ingenic_mac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "ingenic-mac", .pm = &stmmac_pltfr_pm_ops, From bbfc077d457272bcea4f14b3a28247ade99b196d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 14 Apr 2025 17:44:48 +0100 Subject: [PATCH 230/770] octeon_ep_vf: Remove octep_vf_wq commit cb7dd712189f ("octeon_ep_vf: Add driver framework and device initialization") added octep_vf_wq but it has never been used. Remove it. Reported-by: Dr. David Alan Gilbert Closes: https://lore.kernel.org/netdev/Z70bEoTKyeBau52q@gallifrey/ Signed-off-by: Simon Horman Reviewed-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250414-octeon-wq-v1-1-23700e4bd208@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 2 -- drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 18c922dd5fc64..5841e30dff2a9 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -18,8 +18,6 @@ #include "octep_vf_config.h" #include "octep_vf_main.h" -struct workqueue_struct *octep_vf_wq; - /* Supported Devices */ static const struct pci_device_id octep_vf_pci_id_tbl[] = { {PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, OCTEP_PCI_DEVICE_ID_CN93_VF)}, diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h index 1a352f41f823c..b9f13506f4620 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.h @@ -320,8 +320,6 @@ static inline u16 OCTEP_VF_MINOR_REV(struct octep_vf_device *oct) #define octep_vf_read_csr64(octep_vf_dev, reg_off) \ readq((octep_vf_dev)->mmio.hw_addr + (reg_off)) -extern struct workqueue_struct *octep_vf_wq; - int octep_vf_device_setup(struct octep_vf_device *oct); int octep_vf_setup_iqs(struct octep_vf_device *oct); void octep_vf_free_iqs(struct octep_vf_device *oct); From 5f5f92912b439ef4db4fcfaa9826b01cc6e4fc85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Apr 2025 16:55:23 +0200 Subject: [PATCH 231/770] tc: Return an error if filters try to attach too many actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit

While developing the fix for the buffer sizing issue in [0], I noticed
that the kernel will happily accept a long list of actions for a
filter, and then just silently truncate that list down to a maximum of
32 actions.

That seems less than ideal, so this patch changes the action parsing to
return an error message and refuse to create the filter in this case.
This results in an error like:

# ip link add type veth
# tc qdisc replace dev veth0 root handle 1: fq_codel
# tc -echo filter add dev veth0 parent 1: u32 match u32 0 0 $(for i in $(seq 33); do echo action pedit munge ip dport set 22; done)
Error: Only 32 actions supported per filter.
We have an error talking to the kernel

Instead of just creating a filter with 32 actions and dropping the last
one.

This is obviously a change in UAPI. But seeing as attaching more than
32 actions to a single filter has never actually *worked*, it seems
that returning an explicit error is better, and any use cases that get
broken by this were already broken just in more subtle ways.

[0] https://lore.kernel.org/r/20250407105542.16601-1-toke@redhat.com

Signed-off-by: Toke Høiland-Jørgensen
Link: https://patch.msgid.link/20250409145523.164506-1-toke@redhat.com
Signed-off-by: Jakub Kicinski
---
 net/sched/act_api.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 839790043256b..057e20cef3754 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1461,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct netlink_ext_ack *extack)
 {
 	struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {};
-	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
+	struct nlattr *tb[TCA_ACT_MAX_PRIO + 2];
 	struct tc_action *act;
 	size_t sz = 0;
 	int err;
 	int i;
 
-	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL,
+	err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL,
 					  extack);
 	if (err < 0)
 		return err;
 
+	/* The nested attributes are parsed as types, but they are really an
+	 * array of actions. So we parse one more than we can handle, and return
+	 * an error if the last one is set (as that indicates that the request
+	 * contained more than the maximum number of actions).
+	 */
+	if (tb[TCA_ACT_MAX_PRIO + 1]) {
+		NL_SET_ERR_MSG_FMT(extack,
+				   "Only %d actions supported per filter",
+				   TCA_ACT_MAX_PRIO);
+		return -EINVAL;
+	}
+
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
 		struct tc_action_ops *a_o;

From 1310f44dd4d635f278fb01330fddf416106332ff Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Sun, 13 Apr 2025 23:23:25 +0200
Subject: [PATCH 232/770] net: phy: remove redundant dependency on NETDEVICES
 for PHYLINK and PHYLIB

drivers/net/phy/Kconfig is included from drivers/net/Kconfig in an
"if NETDEVICES" section. Therefore we don't have to duplicate the
dependency here. And if e.g. PHYLINK is selected somewhere, then the
dependency is ignored anyway (see note in Kconfig help).
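The nesting in question looks roughly like this (an abridged sketch,
not the literal drivers/net/Kconfig text):

	# drivers/net/Kconfig (abridged)
	menuconfig NETDEVICES
		bool "Network device support"

	if NETDEVICES

	# ... other network device entries ...

	source "drivers/net/phy/Kconfig"

	endif # NETDEVICES

Everything sourced inside the if/endif block already carries an
implicit NETDEVICES dependency, so a "depends on NETDEVICES" line in
drivers/net/phy/Kconfig adds nothing.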
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/085892cd-aa11-4c22-bf8a-574a5c6dcd7c@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/Kconfig | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index d29f9f7fd2e11..0b8cc325e2bf8 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -5,7 +5,6 @@ config PHYLINK tristate - depends on NETDEVICES select PHYLIB select SWPHY help @@ -15,7 +14,6 @@ config PHYLINK menuconfig PHYLIB tristate "PHY Device support and infrastructure" - depends on NETDEVICES select MDIO_DEVICE select MDIO_DEVRES help From a496d2f0fd612ab9e10700afe00dc9267bad788b Mon Sep 17 00:00:00 2001 From: Shengyu Qu Date: Mon, 14 Apr 2025 18:56:01 +0800 Subject: [PATCH 233/770] net: bridge: locally receive all multicast packets if IFF_ALLMULTI is set If multicast snooping is enabled, multicast packets may not always end up on the local bridge interface, if the host is not a member of the multicast group. Similar to how IFF_PROMISC allows all packets to be received locally, let IFF_ALLMULTI allow all multicast packets to be received. OpenWrt uses a user space daemon for DHCPv6/RA/NDP handling, and in relay mode it sets the ALLMULTI flag in order to receive all relevant queries on the network. This works for normal network interfaces and non-snooping bridges, but not snooping bridges (unless multicast routing is enabled). Reported-by: Felix Fietkau Closes: https://github.com/openwrt/openwrt/issues/15857#issuecomment-2662851243 Signed-off-by: Shengyu Qu Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/OSZPR01MB8434308370ACAFA90A22980798B32@OSZPR01MB8434.jpnprd01.prod.outlook.com Signed-off-by: Jakub Kicinski --- net/bridge/br_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 232133a0fd21c..5f6ac9bf15275 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -189,7 +189,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(brmctx, skb)) { + br_multicast_is_router(brmctx, skb) || + br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } From 7c571ac57d9d97190dcba18212fabf99888b0c48 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:30 -0700 Subject: [PATCH 234/770] net: ptp: introduce .supported_extts_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_EXTTS_REQUEST(2) ioctl has a flags field which specifies how the external timestamp request should behave. This includes which edge of the signal to timestamp, as well as a specialized "offset" mode. It is expected that more flags will be added in the future. Driver authors routinely do not check the flags, often accepting requests with flags which they do not support. Even drivers which do check flags may not be future-proofed to reject flags not yet defined. Thus, any future flag additions often require manually updating drivers to reject these flags. This approach of hoping we catch flag checks during review, or playing whack-a-mole after the fact is the wrong approach. 
Introduce the "supported_extts_flags" field to the ptp_clock_info
structure. This field defines the set of flags the device actually
supports. Update the core character device logic to check this field
and reject unsupported requests.

Getting this right is somewhat tricky. First, to avoid unnecessary
repetition and make basic functionality work when
.supported_extts_flags is 0, the core always accepts the
PTP_ENABLE_FEATURE flag. This flag is used to set the 'on' parameter to
the .enable function and is thus always 'supported' by all drivers.

For backwards compatibility, the PTP_RISING_EDGE and PTP_FALLING_EDGE
flags are merely "hints" when using the old PTP_EXTTS_REQUEST ioctl,
and are not expected to be enforced. If the user issues
PTP_EXTTS_REQUEST2, the PTP_STRICT_FLAGS flag is added, which is
supposed to inform the driver to strictly validate the flags and reject
unsupported requests.

To handle this, first check if the driver reports PTP_STRICT_FLAGS
support. If it does not, then always allow the PTP_RISING_EDGE and
PTP_FALLING_EDGE flags. This keeps backwards compatibility with the
original PTP_EXTTS_REQUEST ioctl, where these flags are not guaranteed
to be honored.

This way, drivers which do not set the supported_extts_flags field will
continue to accept requests for the original PTP_EXTTS_REQUEST ioctl.
The core will automatically reject requests with new flags, and
correctly reject requests with PTP_STRICT_FLAGS, where the driver is
supposed to strictly validate the flags.

Update the various drivers, refactoring their validation logic into the
.supported_extts_flags field. For consistency and readability,
PTP_ENABLE_FEATURE is not set in the supported flags list, and
PTP_EXTTS_EDGES is expanded to PTP_RISING_EDGE | PTP_FALLING_EDGE in
all cases.

Note the following driver files set n_ext_ts to a non-zero value but
did not check flags at all:

• drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c
• drivers/net/ethernet/freescale/enetc/enetc_ptp.c
• drivers/net/ethernet/intel/i40e/i40e_ptp.c
• drivers/net/ethernet/marvell/octeontx2/nic/otx2_ptp.c
• drivers/net/ethernet/renesas/ravb_ptp.c
• drivers/net/ethernet/renesas/rtsn.c
• drivers/net/ethernet/renesas/rtsn.h
• drivers/net/ethernet/ti/am65-cpts.c
• drivers/net/ethernet/ti/cpts.h
• drivers/net/ethernet/ti/icssg/icss_iep.c
• drivers/net/ethernet/xscale/ptp_ixp46x.c
• drivers/net/phy/bcm-phy-ptp.c
• drivers/ptp/ptp_ocp.c
• drivers/ptp/ptp_pch.c
• drivers/ptp/ptp_qoriq.c

These drivers' behavior does change slightly: they will now reject the
PTP_EXTTS_REQUEST2 ioctl, because they do not strictly validate their
flags. This also makes them no longer incorrectly accept
PTP_EXT_OFFSET.

Also note that the renesas ravb driver does not support
PTP_STRICT_FLAGS. We could leave the .supported_extts_flags as 0, but I
added PTP_RISING_EDGE | PTP_FALLING_EDGE since the driver previously
manually validated these flags. This is equivalent to 0 because the
core will allow these flags regardless unless PTP_STRICT_FLAGS is also
set.
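From a driver's point of view, the opt-in looks roughly like this
sketch; the foo_* names are invented, while the fields and flag macros
are the real ones:

	static int foo_ptp_enable(struct ptp_clock_info *ptp,
				  struct ptp_clock_request *rq, int on)
	{
		/* No flag screening is needed here any more: the core
		 * has already rejected anything outside
		 * supported_extts_flags (PTP_ENABLE_FEATURE is always
		 * permitted). Acting on rq is left out of this sketch.
		 */
		return -EOPNOTSUPP;
	}

	static const struct ptp_clock_info foo_ptp_caps = {
		.owner = THIS_MODULE,
		.name = "foo",
		.n_ext_ts = 1,
		.supported_extts_flags = PTP_RISING_EDGE |
					 PTP_FALLING_EDGE |
					 PTP_STRICT_FLAGS,
		.enable = foo_ptp_enable,
	};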
Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-1-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/ptp.c | 11 ++++------ drivers/net/dsa/sja1105/sja1105_ptp.c | 10 +++------- drivers/net/ethernet/intel/ice/ice_ptp.c | 12 ++++------- drivers/net/ethernet/intel/igb/igb_ptp.c | 20 ++++++------------- drivers/net/ethernet/intel/igc/igc_ptp.c | 10 +++------- .../ethernet/mellanox/mlx5/core/lib/clock.c | 11 ++++------ drivers/net/ethernet/microchip/lan743x_ptp.c | 9 +++------ .../ethernet/microchip/lan966x/lan966x_ptp.c | 8 ++------ drivers/net/ethernet/renesas/ravb_ptp.c | 7 +------ drivers/net/phy/dp83640.c | 10 +++------- drivers/net/phy/micrel.c | 8 +++----- drivers/net/phy/nxp-c45-tja11xx.c | 9 +++------ drivers/ptp/ptp_chardev.c | 14 ++++++++++++- drivers/ptp/ptp_clockmatrix.c | 14 ++----------- drivers/ptp/ptp_fc3.c | 1 + drivers/ptp/ptp_idt82p33.c | 15 +++----------- include/linux/ptp_clock_kernel.h | 12 +++++++++++ 17 files changed, 70 insertions(+), 111 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index aed4a4b07f34b..1d3b2c94c53ee 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -332,13 +332,6 @@ static int mv88e6352_ptp_enable_extts(struct mv88e6xxx_chip *chip, int pin; int err; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -566,6 +559,10 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip) chip->ptp_clock_info.verify = ptp_ops->ptp_verify; chip->ptp_clock_info.do_aux_work = mv88e6xxx_hwtstamp_work; + chip->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + if (ptp_ops->set_ptp_cpu_port) { struct dsa_port *dp; int upstream = 0; diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 198e787e8560c..3b979d88ca13b 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -820,13 +820,6 @@ static int sja1105_extts_enable(struct sja1105_private *priv, if (extts->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* We can only enable time stamping on both edges, sadly. 
*/ if ((extts->flags & PTP_STRICT_FLAGS) && (extts->flags & PTP_ENABLE_FEATURE) && @@ -912,6 +905,9 @@ int sja1105_ptp_clock_register(struct dsa_switch *ds) .n_pins = 1, .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, }; /* Only used on SJA1105 */ diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 0ea5e52303f22..660b32126553a 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1627,14 +1627,6 @@ static int ice_ptp_cfg_extts(struct ice_pf *pf, struct ptp_extts_request *rq, int pin_desc_idx; u8 tmr_idx; - /* Reject requests with unsupported flags */ - - if (rq->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - tmr_idx = hw->func_caps.ts_func_info.tmr_index_owned; chan = rq->index; @@ -2740,6 +2732,10 @@ static void ice_ptp_set_caps(struct ice_pf *pf) info->enable = ice_ptp_gpio_enable; info->verify = ice_verify_pin; + info->supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + switch (pf->hw.mac_type) { case ICE_MAC_E810: ice_ptp_set_funcs_e810(pf); diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index f323e1c1989f1..793c960162880 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -502,13 +502,6 @@ static int igb_ptp_feature_enable_82580(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Both the rising and falling edge are timestamped */ if (rq->extts.flags & PTP_STRICT_FLAGS && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -658,13 +651,6 @@ static int igb_ptp_feature_enable_i210(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. 
*/ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1356,6 +1342,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; adapter->ptp_caps.pps = 0; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; adapter->ptp_caps.adjtime = igb_ptp_adjtime_82576; @@ -1378,6 +1367,9 @@ void igb_ptp_init(struct igb_adapter *adapter) adapter->ptp_caps.n_ext_ts = IGB_N_EXTTS; adapter->ptp_caps.n_per_out = IGB_N_PEROUT; adapter->ptp_caps.n_pins = IGB_N_SDP; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.pps = 1; adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.adjfine = igb_ptp_adjfine_82580; diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 946edbad43022..0c6c0cc78faca 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -257,13 +257,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests failing to enable both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1146,6 +1139,9 @@ void igc_ptp_init(struct igc_adapter *adapter) adapter->ptp_caps.pin_config = adapter->sdp_config; adapter->ptp_caps.n_ext_ts = IGC_N_EXTTS; adapter->ptp_caps.n_per_out = IGC_N_PEROUT; + adapter->ptp_caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; adapter->ptp_caps.n_pins = IGC_N_SDP; adapter->ptp_caps.verify = igc_ptp_verify_pin; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 65a94e46edcf1..3eee84430ac98 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -643,13 +643,6 @@ static int mlx5_extts_configure(struct ptp_clock_info *ptp, int pin = -1; int err = 0; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. 
*/ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1034,6 +1027,10 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) clock->ptp_info.verify = mlx5_ptp_verify; clock->ptp_info.pps = 1; + clock->ptp_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; + for (i = 0; i < clock->ptp_info.n_pins; i++) { snprintf(clock->ptp_info.pin_config[i].name, sizeof(clock->ptp_info.pin_config[i].name), diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index 0be44dcb33938..b171c893175b3 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -942,12 +942,6 @@ static int lan743x_ptp_io_extts(struct lan743x_adapter *adapter, int on, extts = &ptp->extts[index]; - if (extts_request->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - if (on) { extts_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_EXTTS, index); if (extts_pin < 0) @@ -1543,6 +1537,9 @@ int lan743x_ptp_open(struct lan743x_adapter *adapter) ptp->ptp_clock_info.n_per_out = LAN743X_PTP_N_EVENT_CHAN; ptp->ptp_clock_info.n_pins = n_pins; ptp->ptp_clock_info.pps = LAN743X_PTP_N_PPS; + ptp->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; ptp->ptp_clock_info.pin_config = ptp->pin_config; ptp->ptp_clock_info.adjfine = lan743x_ptpci_adjfine; ptp->ptp_clock_info.adjtime = lan743x_ptpci_adjtime; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 63905bb5a63a8..8cf41b0977b2c 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -917,12 +917,6 @@ static int lan966x_ptp_extts(struct ptp_clock_info *ptp, if (lan966x->ptp_ext_irq <= 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -978,6 +972,8 @@ static struct ptp_clock_info lan966x_ptp_clock_info = { .n_per_out = LAN966X_PHC_PINS_NUM, .n_ext_ts = LAN966X_PHC_PINS_NUM, .n_pins = LAN966X_PHC_PINS_NUM, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS, }; static int lan966x_ptp_phc_init(struct lan966x *lan966x, diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index b4365906669f3..ab1ffdc7ee4f6 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -176,12 +176,6 @@ static int ravb_ptp_extts(struct ptp_clock_info *ptp, struct net_device *ndev = priv->ndev; unsigned long flags; - /* Reject requests with unsupported flags */ - if (req->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; @@ -287,6 +281,7 @@ static const struct ptp_clock_info ravb_ptp_info = { .max_adj = 50000000, .n_ext_ts = N_EXT_TS, .n_per_out = N_PER_OUT, + .supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE, .adjfine = ravb_ptp_adjfine, .adjtime = ravb_ptp_adjtime, .gettime64 = ravb_ptp_gettime64, diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 85e231451093f..c89c255185a6a 100644 --- a/drivers/net/phy/dp83640.c +++ 
b/drivers/net/phy/dp83640.c @@ -478,13 +478,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_EXTTS: - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Reject requests to enable time stamping on both edges. */ if ((rq->extts.flags & PTP_STRICT_FLAGS) && (rq->extts.flags & PTP_ENABLE_FEATURE) && @@ -1002,6 +995,9 @@ static void dp83640_clock_init(struct dp83640_clock *clock, struct mii_bus *bus) clock->caps.n_per_out = N_PER_OUT; clock->caps.n_pins = DP83640_N_PINS; clock->caps.pps = 0; + clock->caps.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; clock->caps.adjfine = ptp_dp83640_adjfine; clock->caps.adjtime = ptp_dp83640_adjtime; clock->caps.gettime64 = ptp_dp83640_gettime; diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 24882d30f6858..61123ec4c8780 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3406,11 +3406,6 @@ static int lan8814_ptp_extts(struct ptp_clock_info *ptpci, struct phy_device *phydev = shared->phydev; int pin; - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_EXTTS_EDGES | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - pin = ptp_find_pin(shared->ptp_clock, PTP_PF_EXTTS, rq->extts.index); if (pin == -1 || pin != LAN8814_PTP_EXTTS_NUM) @@ -3917,6 +3912,9 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) shared->ptp_clock_info.n_ext_ts = LAN8814_PTP_EXTTS_NUM; shared->ptp_clock_info.n_pins = LAN8814_PTP_GPIO_NUM; shared->ptp_clock_info.pps = 0; + shared->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS; shared->ptp_clock_info.pin_config = shared->pin_config; shared->ptp_clock_info.n_per_out = LAN8814_PTP_PEROUT_NUM; shared->ptp_clock_info.adjfine = lan8814_ptpci_adjfine; diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c index 250a018d55468..8634b4cb1e708 100644 --- a/drivers/net/phy/nxp-c45-tja11xx.c +++ b/drivers/net/phy/nxp-c45-tja11xx.c @@ -861,12 +861,6 @@ static int nxp_c45_extts_enable(struct nxp_c45_phy *priv, const struct nxp_c45_phy_data *data = nxp_c45_get_data(priv->phydev); int pin; - if (extts->flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - /* Sampling on both edges is not supported */ if ((extts->flags & PTP_RISING_EDGE) && (extts->flags & PTP_FALLING_EDGE) && @@ -962,6 +956,9 @@ static int nxp_c45_init_ptp_clock(struct nxp_c45_phy *priv) .n_pins = ARRAY_SIZE(nxp_c45_ptp_pins), .n_ext_ts = 1, .n_per_out = 1, + .supported_extts_flags = PTP_RISING_EDGE | + PTP_FALLING_EDGE | + PTP_STRICT_FLAGS, }; priv->ptp_clock = ptp_clock_register(&priv->caps, diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4380e6ddb8495..c24228c139549 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -162,6 +162,7 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, { struct ptp_clock *ptp = container_of(pccontext->clk, struct ptp_clock, clock); + unsigned int i, pin_index, supported_extts_flags; struct ptp_sys_offset_extended *extoff = NULL; struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; @@ -172,7 +173,6 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, struct ptp_clock_request req; struct ptp_clock_caps caps; struct ptp_clock_time 
*pct; - unsigned int i, pin_index; struct ptp_pin_desc pd; struct timespec64 ts; int enable, err = 0; @@ -240,6 +240,18 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, err = -EINVAL; break; } + supported_extts_flags = ptp->info->supported_extts_flags; + /* The PTP_ENABLE_FEATURE flag is always supported. */ + supported_extts_flags |= PTP_ENABLE_FEATURE; + /* If the driver does not support strictly checking flags, the + * PTP_RISING_EDGE and PTP_FALLING_EDGE flags are merely + * hints which are not enforced. + */ + if (!(supported_extts_flags & PTP_STRICT_FLAGS)) + supported_extts_flags |= PTP_EXTTS_EDGES; + /* Reject unsupported flags */ + if (req.extts.flags & ~supported_extts_flags) + return -EOPNOTSUPP; req.type = PTP_CLK_REQ_EXTTS; enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0; if (mutex_lock_interruptible(&ptp->pincfg_mux)) diff --git a/drivers/ptp/ptp_clockmatrix.c b/drivers/ptp/ptp_clockmatrix.c index fbb3fa8fc60b2..b8d4df8c6da2c 100644 --- a/drivers/ptp/ptp_clockmatrix.c +++ b/drivers/ptp/ptp_clockmatrix.c @@ -283,18 +283,6 @@ static int idtcm_extts_enable(struct idtcm_channel *channel, idtcm = channel->idtcm; old_mask = idtcm->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_TOD) return -EINVAL; @@ -2043,6 +2031,7 @@ static const struct ptp_clock_info idtcm_caps = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, @@ -2060,6 +2049,7 @@ static const struct ptp_clock_info idtcm_caps_deprecated = { .n_per_out = 12, .n_ext_ts = MAX_TOD, .n_pins = MAX_REF_CLK, + .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, .adjphase = &idtcm_adjphase, .getmaxphase = &idtcm_getmaxphase, .adjfine = &idtcm_adjfine, diff --git a/drivers/ptp/ptp_fc3.c b/drivers/ptp/ptp_fc3.c index cfced36c70bcb..70002500170ea 100644 --- a/drivers/ptp/ptp_fc3.c +++ b/drivers/ptp/ptp_fc3.c @@ -592,6 +592,7 @@ static const struct ptp_clock_info idtfc3_caps = { .max_adj = MAX_FFO_PPB, .n_per_out = 1, .n_ext_ts = 1, + .supported_extts_flags = PTP_STRICT_FLAGS | PTP_EXT_OFFSET, .adjphase = &idtfc3_adjphase, .adjfine = &idtfc3_adjfine, .adjtime = &idtfc3_adjtime, diff --git a/drivers/ptp/ptp_idt82p33.c b/drivers/ptp/ptp_idt82p33.c index b2fd94d4f8631..f01c50dfa44e8 100644 --- a/drivers/ptp/ptp_idt82p33.c +++ b/drivers/ptp/ptp_idt82p33.c @@ -246,18 +246,6 @@ static int idt82p33_extts_enable(struct idt82p33_channel *channel, idt82p33 = channel->idt82p33; old_mask = idt82p33->extts_mask; - /* Reject requests with unsupported flags */ - if (rq->extts.flags & ~(PTP_ENABLE_FEATURE | - PTP_RISING_EDGE | - PTP_FALLING_EDGE | - PTP_STRICT_FLAGS)) - return -EOPNOTSUPP; - - /* Reject requests to enable time stamping on falling edge */ - if ((rq->extts.flags & PTP_ENABLE_FEATURE) && - (rq->extts.flags & PTP_FALLING_EDGE)) - return -EOPNOTSUPP; - if (index >= MAX_PHC_PLL) return -EINVAL; @@ -1187,6 +1175,9 @@ static void idt82p33_caps_init(u32 index, struct ptp_clock_info *caps, caps->pin_config = pin_cfg; + caps->supported_extts_flags = PTP_RISING_EDGE | + PTP_STRICT_FLAGS; + for (i = 0; i < 
max_pins; ++i) { ppd = &pin_cfg[i]; diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 0d68d09bedd14..25cba2e5ee69c 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -68,6 +68,17 @@ struct ptp_system_timestamp { * @n_per_out: The number of programmable periodic signals. * @n_pins: The number of programmable pins. * @pps: Indicates whether the clock supports a PPS callback. + * + * @supported_extts_flags: The set of flags the driver supports for the + * PTP_EXTTS_REQUEST ioctl. The PTP core will use + * this list to reject unsupported requests. + * PTP_ENABLE_FEATURE is assumed and does not need to + * be included. If PTP_STRICT_FLAGS is *not* set, + * then both PTP_RISING_EDGE and PTP_FALLING_EDGE + * will be assumed. Note that PTP_STRICT_FLAGS must + * be set if the drivers wants to honor + * PTP_EXTTS_REQUEST2 and any future flags. + * * @pin_config: Array of length 'n_pins'. If the number of * programmable pins is nonzero, then drivers must * allocate and initialize this array. @@ -174,6 +185,7 @@ struct ptp_clock_info { int n_per_out; int n_pins; int pps; + unsigned int supported_extts_flags; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); From d9f3e9ecc4562ae07aaf614cf0a6690ef7ca0e10 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 14 Apr 2025 14:26:31 -0700 Subject: [PATCH 235/770] net: ptp: introduce .supported_perout_flags to ptp_clock_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PTP_PEROUT_REQUEST2 ioctl has gained support for flags specifying specific output behavior including PTP_PEROUT_ONE_SHOT, PTP_PEROUT_DUTY_CYCLE, PTP_PEROUT_PHASE. Driver authors are notorious for not checking the flags of the request. This results in misinterpreting the request, generating an output signal that does not match the requested value. It is anticipated that even more flags will be added in the future, resulting in even more broken requests. Expecting these issues to be caught during review or playing whack-a-mole after the fact is not a great solution. Instead, introduce the supported_perout_flags field in the ptp_clock_info structure. Update the core character device logic to explicitly reject any request which has a flag not on this list. This ensures that drivers must 'opt in' to the flags they support. Drivers which don't set the .supported_perout_flags field will not need to check that unsupported flags aren't passed, as the core takes care of this. Update the drivers which do support flags to set this new field. 
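The perout side mirrors the extts opt-in. A hypothetical driver that
honors phase and duty-cycle requests, and nothing else, might declare
something like the following (.enable and the remaining fields omitted
for brevity; foo_perout_caps is an invented name):

	static const struct ptp_clock_info foo_perout_caps = {
		.owner = THIS_MODULE,
		.name = "foo",
		.n_per_out = 1,
		/* Requests carrying any other flag, e.g.
		 * PTP_PEROUT_ONE_SHOT, are now rejected by the core
		 * before ->enable() is ever called.
		 */
		.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE |
					  PTP_PEROUT_PHASE,
	};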
Note the following driver files set n_per_out to a non-zero value but did not check the flags at all: • drivers/ptp/ptp_clockmatrix.c • drivers/ptp/ptp_idt82p33.c • drivers/ptp/ptp_fc3.c • drivers/net/ethernet/ti/am65-cpts.c • drivers/net/ethernet/aquantia/atlantic/aq_ptp.c • drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c • drivers/net/dsa/sja1105/sja1105_ptp.c • drivers/net/ethernet/freescale/dpaa2/dpaa2-ptp.c • drivers/net/ethernet/mscc/ocelot_vsc7514.c • drivers/net/ethernet/intel/i40e/i40e_ptp.c Reviewed-by: Vadim Fedorenko Signed-off-by: Jacob Keller Reviewed-by: Kory Maincent Link: https://patch.msgid.link/20250414-jk-supported-perout-flags-v2-2-f6b17d15475c@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/sja1105/sja1105_ptp.c | 4 ---- drivers/net/ethernet/intel/ice/ice_ptp.c | 4 +--- drivers/net/ethernet/intel/igc/igc_ptp.c | 4 ---- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 15 +++------------ drivers/net/ethernet/microchip/lan743x_ptp.c | 5 +---- .../net/ethernet/microchip/lan966x/lan966x_ptp.c | 6 ++---- drivers/net/ethernet/mscc/ocelot_ptp.c | 5 ----- drivers/net/ethernet/mscc/ocelot_vsc7514.c | 2 ++ drivers/net/ethernet/renesas/ravb_ptp.c | 4 ---- drivers/net/phy/dp83640.c | 3 --- drivers/net/phy/micrel.c | 9 ++------- drivers/net/phy/microchip_rds_ptp.c | 5 +---- drivers/net/phy/nxp-c45-tja11xx.c | 4 +--- drivers/ptp/ptp_chardev.c | 2 ++ include/linux/ptp_clock_kernel.h | 6 ++++++ 15 files changed, 21 insertions(+), 57 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_ptp.c b/drivers/net/dsa/sja1105/sja1105_ptp.c index 3b979d88ca13b..3abc64aec411a 100644 --- a/drivers/net/dsa/sja1105/sja1105_ptp.c +++ b/drivers/net/dsa/sja1105/sja1105_ptp.c @@ -737,10 +737,6 @@ static int sja1105_per_out_enable(struct sja1105_private *priv, if (perout->index != 0) return -EOPNOTSUPP; - /* Reject requests with unsupported flags */ - if (perout->flags) - return -EOPNOTSUPP; - mutex_lock(&ptp_data->lock); rc = sja1105_change_ptp_clk_pin_func(priv, PTP_PF_PEROUT); diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 660b32126553a..b79a148ed0f28 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -1797,9 +1797,6 @@ static int ice_ptp_cfg_perout(struct ice_pf *pf, struct ptp_perout_request *rq, struct ice_hw *hw = &pf->hw; int pin_desc_idx; - if (rq->flags & ~PTP_PEROUT_PHASE) - return -EOPNOTSUPP; - pin_desc_idx = ice_ptp_find_pin_idx(pf, PTP_PF_PEROUT, rq->index); if (pin_desc_idx < 0) return -EIO; @@ -2735,6 +2732,7 @@ static void ice_ptp_set_caps(struct ice_pf *pf) info->supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + info->supported_perout_flags = PTP_PEROUT_PHASE; switch (pf->hw.mac_type) { case ICE_MAC_E810: diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 0c6c0cc78faca..b6c60b3d0e3a3 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -293,10 +293,6 @@ static int igc_ptp_feature_enable_i225(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; - if (on) { pin = ptp_find_pin(igc->ptp_clock, PTP_PF_PEROUT, rq->perout.index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index 3eee84430ac98..cec18efadc733 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -813,12 +813,6 @@ static int perout_conf_npps_real_time(struct mlx5_core_dev *mdev, struct ptp_clo return 0; } -static bool mlx5_perout_verify_flags(struct mlx5_core_dev *mdev, unsigned int flags) -{ - return ((!mlx5_npps_real_time_supported(mdev) && flags) || - (mlx5_npps_real_time_supported(mdev) && flags & ~PTP_PEROUT_DUTY_CYCLE)); -} - static int mlx5_perout_configure(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on) @@ -854,12 +848,6 @@ static int mlx5_perout_configure(struct ptp_clock_info *ptp, goto unlock; } - /* Reject requests with unsupported flags */ - if (mlx5_perout_verify_flags(mdev, rq->perout.flags)) { - err = -EOPNOTSUPP; - goto unlock; - } - if (on) { pin_mode = MLX5_PIN_MODE_OUT; pattern = MLX5_OUT_PATTERN_PERIODIC; @@ -1031,6 +1019,9 @@ static void mlx5_init_pin_config(struct mlx5_core_dev *mdev) PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + if (mlx5_npps_real_time_supported(mdev)) + clock->ptp_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; + for (i = 0; i < clock->ptp_info.n_pins; i++) { snprintf(clock->ptp_info.pin_config[i].name, sizeof(clock->ptp_info.pin_config[i].name), diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c index b171c893175b3..b07f5b099a2b4 100644 --- a/drivers/net/ethernet/microchip/lan743x_ptp.c +++ b/drivers/net/ethernet/microchip/lan743x_ptp.c @@ -463,10 +463,6 @@ static int lan743x_ptp_perout(struct lan743x_adapter *adapter, int on, struct lan743x_ptp_perout *perout = &ptp->perout[index]; int ret = 0; - /* Reject requests with unsupported flags */ - if (perout_request->flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - if (on) { perout_pin = ptp_find_pin(ptp->ptp_clock, PTP_PF_PEROUT, perout_request->index); @@ -1540,6 +1536,7 @@ int lan743x_ptp_open(struct lan743x_adapter *adapter) ptp->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + ptp->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; ptp->ptp_clock_info.pin_config = ptp->pin_config; ptp->ptp_clock_info.adjfine = lan743x_ptpci_adjfine; ptp->ptp_clock_info.adjtime = lan743x_ptpci_adjtime; diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c index 8cf41b0977b2c..098406e2e5bb2 100644 --- a/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c +++ b/drivers/net/ethernet/microchip/lan966x/lan966x_ptp.c @@ -815,10 +815,6 @@ static int lan966x_ptp_perout(struct ptp_clock_info *ptp, bool pps = false; int pin; - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(phc->clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN966X_PHC_PINS_NUM) return -EINVAL; @@ -974,6 +970,8 @@ static struct ptp_clock_info lan966x_ptp_clock_info = { .n_pins = LAN966X_PHC_PINS_NUM, .supported_extts_flags = PTP_RISING_EDGE | PTP_STRICT_FLAGS, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, }; static int lan966x_ptp_phc_init(struct lan966x *lan966x, diff --git a/drivers/net/ethernet/mscc/ocelot_ptp.c b/drivers/net/ethernet/mscc/ocelot_ptp.c index cc1088988da09..d2a0a32f75ea9 100644 --- a/drivers/net/ethernet/mscc/ocelot_ptp.c +++ b/drivers/net/ethernet/mscc/ocelot_ptp.c @@ -211,11 +211,6 @@ int ocelot_ptp_enable(struct ptp_clock_info *ptp, switch (rq->type) { case PTP_CLK_REQ_PEROUT: - /* Reject requests with 
unsupported flags */ - if (rq->perout.flags & ~(PTP_PEROUT_DUTY_CYCLE | - PTP_PEROUT_PHASE)) - return -EOPNOTSUPP; - pin = ptp_find_pin(ocelot->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == 0) diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 055b55651a49f..498eec8ae61d8 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -108,6 +108,8 @@ static struct ptp_clock_info ocelot_ptp_clock_info = { .n_ext_ts = 0, .n_per_out = OCELOT_PTP_PINS_NUM, .n_pins = OCELOT_PTP_PINS_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE | + PTP_PEROUT_PHASE, .pps = 0, .gettime64 = ocelot_ptp_gettime64, .settime64 = ocelot_ptp_settime64, diff --git a/drivers/net/ethernet/renesas/ravb_ptp.c b/drivers/net/ethernet/renesas/ravb_ptp.c index ab1ffdc7ee4f6..226c6c0ab945b 100644 --- a/drivers/net/ethernet/renesas/ravb_ptp.c +++ b/drivers/net/ethernet/renesas/ravb_ptp.c @@ -206,10 +206,6 @@ static int ravb_ptp_perout(struct ptp_clock_info *ptp, unsigned long flags; int error = 0; - /* Reject requests with unsupported flags */ - if (req->flags) - return -EOPNOTSUPP; - if (req->index) return -EINVAL; diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index c89c255185a6a..daab555721df8 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -506,9 +506,6 @@ static int ptp_dp83640_enable(struct ptp_clock_info *ptp, return 0; case PTP_CLK_REQ_PEROUT: - /* Reject requests with unsupported flags */ - if (rq->perout.flags) - return -EOPNOTSUPP; if (rq->perout.index >= N_PER_OUT) return -EINVAL; return periodic_output(clock, rq, on, rq->perout.index); diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 61123ec4c8780..71fb4410c31b1 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -3236,10 +3236,6 @@ static int lan8814_ptp_perout(struct ptp_clock_info *ptpci, int pulse_width; int pin, event; - /* Reject requests with unsupported flags */ - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - mutex_lock(&shared->shared_lock); event = rq->perout.index; pin = ptp_find_pin(shared->ptp_clock, PTP_PF_PEROUT, event); @@ -3915,6 +3911,7 @@ static int lan8814_ptp_probe_once(struct phy_device *phydev) shared->ptp_clock_info.supported_extts_flags = PTP_RISING_EDGE | PTP_FALLING_EDGE | PTP_STRICT_FLAGS; + shared->ptp_clock_info.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE; shared->ptp_clock_info.pin_config = shared->pin_config; shared->ptp_clock_info.n_per_out = LAN8814_PTP_PEROUT_NUM; shared->ptp_clock_info.adjfine = lan8814_ptpci_adjfine; @@ -5066,9 +5063,6 @@ static int lan8841_ptp_perout(struct ptp_clock_info *ptp, int pin; int ret; - if (rq->perout.flags & ~PTP_PEROUT_DUTY_CYCLE) - return -EOPNOTSUPP; - pin = ptp_find_pin(ptp_priv->ptp_clock, PTP_PF_PEROUT, rq->perout.index); if (pin == -1 || pin >= LAN8841_PTP_GPIO_NUM) return -EINVAL; @@ -5312,6 +5306,7 @@ static struct ptp_clock_info lan8841_ptp_clock_info = { .n_per_out = LAN8841_PTP_GPIO_NUM, .n_ext_ts = LAN8841_PTP_GPIO_NUM, .n_pins = LAN8841_PTP_GPIO_NUM, + .supported_perout_flags = PTP_PEROUT_DUTY_CYCLE, }; #define LAN8841_OPERATION_MODE_STRAP_LOW_REGISTER 3 diff --git a/drivers/net/phy/microchip_rds_ptp.c b/drivers/net/phy/microchip_rds_ptp.c index 3e6bf10cdeed9..e6514ce04c29f 100644 --- a/drivers/net/phy/microchip_rds_ptp.c +++ b/drivers/net/phy/microchip_rds_ptp.c @@ -224,10 +224,6 @@ static int mchp_rds_ptp_perout(struct ptp_clock_info *ptpci, struct 
phy_device *phydev = clock->phydev;
 	int ret, event_pin, pulsewidth;
 
-	/* Reject requests with unsupported flags */
-	if (perout->flags & ~PTP_PEROUT_DUTY_CYCLE)
-		return -EOPNOTSUPP;
-
 	event_pin = ptp_find_pin(clock->ptp_clock, PTP_PF_PEROUT,
 				 perout->index);
 	if (event_pin != clock->event_pin)
@@ -1259,6 +1255,7 @@ struct mchp_rds_ptp_clock *mchp_rds_ptp_probe(struct phy_device *phydev, u8 mmd,
 	clock->caps.pps = 0;
 	clock->caps.n_pins = MCHP_RDS_PTP_N_PIN;
 	clock->caps.n_per_out = MCHP_RDS_PTP_N_PEROUT;
+	clock->caps.supported_perout_flags = PTP_PEROUT_DUTY_CYCLE;
 	clock->caps.pin_config = clock->pin_config;
 	clock->caps.adjfine = mchp_rds_ptp_ltc_adjfine;
 	clock->caps.adjtime = mchp_rds_ptp_ltc_adjtime;
diff --git a/drivers/net/phy/nxp-c45-tja11xx.c b/drivers/net/phy/nxp-c45-tja11xx.c
index 8634b4cb1e708..f11dd32494c30 100644
--- a/drivers/net/phy/nxp-c45-tja11xx.c
+++ b/drivers/net/phy/nxp-c45-tja11xx.c
@@ -763,9 +763,6 @@ static int nxp_c45_perout_enable(struct nxp_c45_phy *priv,
 	struct phy_device *phydev = priv->phydev;
 	int pin;
 
-	if (perout->flags & ~PTP_PEROUT_PHASE)
-		return -EOPNOTSUPP;
-
 	pin = ptp_find_pin(priv->ptp_clock, PTP_PF_PEROUT, perout->index);
 	if (pin < 0)
 		return pin;
@@ -959,6 +956,7 @@ static int nxp_c45_init_ptp_clock(struct nxp_c45_phy *priv)
 		.supported_extts_flags = PTP_RISING_EDGE |
					 PTP_FALLING_EDGE |
					 PTP_STRICT_FLAGS,
+		.supported_perout_flags = PTP_PEROUT_PHASE,
 	};
 
 	priv->ptp_clock = ptp_clock_register(&priv->caps,
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index c24228c139549..4bf421765d033 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -324,6 +324,8 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd,
 			err = -EINVAL;
 			break;
 		}
+		if (req.perout.flags & ~ptp->info->supported_perout_flags)
+			return -EOPNOTSUPP;
 		req.type = PTP_CLK_REQ_PEROUT;
 		enable = req.perout.period.sec || req.perout.period.nsec;
 		if (mutex_lock_interruptible(&ptp->pincfg_mux))
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 25cba2e5ee69c..eced7e9bf69a8 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -69,6 +69,11 @@ struct ptp_system_timestamp {
 * @n_pins: The number of programmable pins.
 * @pps: Indicates whether the clock supports a PPS callback.
 *
+ * @supported_perout_flags: The set of flags the driver supports for the
+ * PTP_PEROUT_REQUEST ioctl. The PTP core will
+ * reject a request with any flag not specified
+ * here.
+ *
 * @supported_extts_flags: The set of flags the driver supports for the
 * PTP_EXTTS_REQUEST ioctl. The PTP core will use
 * this list to reject unsupported requests.
@@ -185,6 +190,7 @@ struct ptp_clock_info {
 	int n_per_out;
 	int n_pins;
 	int pps;
+	unsigned int supported_perout_flags;
 	unsigned int supported_extts_flags;
 	struct ptp_pin_desc *pin_config;
 	int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm);

From b2accfe7ca5bc9f9af28e603b79bdd5ad8df5c0b Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan
Date: Tue, 1 Apr 2025 06:12:18 +0530
Subject: [PATCH 236/770] powerpc/boot: Check for ld-option support

Commit 579aee9fc594 ("powerpc: suppress some linker warnings in recent
linker versions") added support for enabling the linker option
"--no-warn-rwx-segments" if the linker version is greater than 2.39.
Similar build warnings were recently reported with linker version 2.35.2.
ld: warning: arch/powerpc/boot/zImage.epapr has a LOAD segment with RWX permissions
ld: warning: arch/powerpc/boot/zImage.pseries has a LOAD segment with RWX permissions

Fix the warnings by checking whether the linker supports
"--no-warn-rwx-segments" before enabling it, instead of checking for a
version range.

Fixes: 579aee9fc594 ("powerpc: suppress some linker warnings in recent linker versions")
Reported-by: Venkat Rao Bagalkote
Suggested-by: Christophe Leroy
Tested-by: Venkat Rao Bagalkote
Closes: https://lore.kernel.org/linuxppc-dev/61cf556c-4947-4bd6-af63-892fc0966dad@linux.ibm.com/
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250401004218.24869-1-maddy@linux.ibm.com
---
 arch/powerpc/boot/wrapper | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 1db60fe13802d..267ca6d4d9b38 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -234,10 +234,8 @@ fi
 # suppress some warnings in recent ld versions
 nowarn="-z noexecstack"
-if ! ld_is_lld; then
-	if [ "$LD_VERSION" -ge "$(echo 2.39 | ld_version)" ]; then
-		nowarn="$nowarn --no-warn-rwx-segments"
-	fi
+if [ $(${CROSS}ld -v --no-warn-rwx-segments &>/dev/null; echo $?) -eq 0 ]; then
+	nowarn="$nowarn --no-warn-rwx-segments"
 fi
 
 platformo=$object/"$platform".o

From bbc9462f0cb0c8917a4908e856731708f0cee910 Mon Sep 17 00:00:00 2001
From: Shyam Saini
Date: Thu, 27 Feb 2025 10:49:27 -0800
Subject: [PATCH 237/770] kernel: param: rename locate_module_kobject

The locate_module_kobject() function looks up an existing module_kobject
for a given module name. If it cannot find the corresponding
module_kobject, it creates one for the given name.

Rename locate_module_kobject() to lookup_or_create_module_kobject() to
better describe its operations. This doesn't change any functionality.
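The renamed helper's contract, as a condensed sketch (the create path is
folded into a hypothetical create_module_kobject() here for brevity):

	/* Return the module_kobject for 'name', creating and registering
	 * one under /sys/module if it does not exist yet; NULL on failure.
	 */
	struct module_kobject *lookup_or_create_module_kobject(const char *name)
	{
		struct kobject *kobj = kset_find_obj(module_kset, name);

		if (kobj)				/* lookup path */
			return to_module_kobject(kobj);

		return create_module_kobject(name);	/* create path */
	}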
Suggested-by: Rasmus Villemoes Signed-off-by: Shyam Saini Link: https://lore.kernel.org/r/20250227184930.34163-2-shyamsaini@linux.microsoft.com Signed-off-by: Petr Pavlu --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index 2509f216c9f3c..a2441ce059aed 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -760,7 +760,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static struct module_kobject * __init locate_module_kobject(const char *name) +static struct module_kobject * __init lookup_or_create_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -802,7 +802,7 @@ static void __init kernel_add_sysfs_param(const char *name, struct module_kobject *mk; int err; - mk = locate_module_kobject(name); + mk = lookup_or_create_module_kobject(name); if (!mk) return; @@ -873,7 +873,7 @@ static void __init version_sysfs_builtin(void) int err; for (vattr = __start___modver; vattr < __stop___modver; vattr++) { - mk = locate_module_kobject(vattr->module_name); + mk = lookup_or_create_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); WARN_ON_ONCE(err); From 1c7777feb0e2f5925908c489513656ebb443a699 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Thu, 27 Feb 2025 10:49:28 -0800 Subject: [PATCH 238/770] kernel: refactor lookup_or_create_module_kobject() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the unlikely event of the allocation failing, it is better to let the machine boot with a not fully populated sysfs than to kill it with this BUG_ON(). All callers are already prepared for lookup_or_create_module_kobject() returning NULL. This is also preparation for calling this function from non __init code, where using BUG_ON for allocation failure handling is not acceptable. Since we are here, also start using IS_ENABLED instead of #ifdef construct. Suggested-by: Thomas Weißschuh Suggested-by: Rasmus Villemoes Signed-off-by: Shyam Saini Link: https://lore.kernel.org/r/20250227184930.34163-3-shyamsaini@linux.microsoft.com Signed-off-by: Petr Pavlu --- kernel/params.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index a2441ce059aed..787662663e348 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -767,31 +767,28 @@ static struct module_kobject * __init lookup_or_create_module_kobject(const char int err; kobj = kset_find_obj(module_kset, name); - if (kobj) { - mk = to_module_kobject(kobj); - } else { - mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); - BUG_ON(!mk); - - mk->mod = THIS_MODULE; - mk->kobj.kset = module_kset; - err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, - "%s", name); -#ifdef CONFIG_MODULES - if (!err) - err = sysfs_create_file(&mk->kobj, &module_uevent.attr); -#endif - if (err) { - kobject_put(&mk->kobj); - pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n", - name, err); - return NULL; - } + if (kobj) + return to_module_kobject(kobj); - /* So that we hold reference in both cases. 
*/
-		kobject_get(&mk->kobj);
+	mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+	if (!mk)
+		return NULL;
+
+	mk->mod = THIS_MODULE;
+	mk->kobj.kset = module_kset;
+	err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
+	if (IS_ENABLED(CONFIG_MODULES) && !err)
+		err = sysfs_create_file(&mk->kobj, &module_uevent.attr);
+	if (err) {
+		kobject_put(&mk->kobj);
+		pr_crit("Adding module '%s' to sysfs failed (%d), the system may be unstable.\n",
+			name, err);
+		return NULL;
 	}
+
+	/* So that we hold reference in both cases. */
+	kobject_get(&mk->kobj);
+
 	return mk;
 }

From 7c76c813cfc42a7376378a0c4b7250db2eebab81 Mon Sep 17 00:00:00 2001
From: Shyam Saini
Date: Thu, 27 Feb 2025 10:49:29 -0800
Subject: [PATCH 239/770] kernel: globalize lookup_or_create_module_kobject()

lookup_or_create_module_kobject() is marked static and __init; to make it
global, drop the static keyword.

Since this function can be called from non-init code, use __modinit
instead of __init. The __modinit marker makes it __init if CONFIG_MODULES
is not defined.

Suggested-by: Rasmus Villemoes
Signed-off-by: Shyam Saini
Link: https://lore.kernel.org/r/20250227184930.34163-4-shyamsaini@linux.microsoft.com
Signed-off-by: Petr Pavlu
---
 include/linux/module.h | 2 ++
 kernel/params.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index d94b196d5a34e..b3329110d6686 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -162,6 +162,8 @@ extern void cleanup_module(void);
 #define __INITRODATA_OR_MODULE __INITRODATA
 #endif /*CONFIG_MODULES*/
 
+struct module_kobject *lookup_or_create_module_kobject(const char *name);
+
 /* Generic info of form tag = "info" */
 #define MODULE_INFO(tag, info) __MODULE_INFO(tag, tag, info)

diff --git a/kernel/params.c b/kernel/params.c
index 787662663e348..e668fc90b83ec 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -760,7 +760,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
 			params[i].ops->free(params[i].arg);
 }
 
-static struct module_kobject * __init lookup_or_create_module_kobject(const char *name)
+struct module_kobject __modinit * lookup_or_create_module_kobject(const char *name)
 {
 	struct module_kobject *mk;
 	struct kobject *kobj;

From f95bbfe18512c5c018720468959edac056a17196 Mon Sep 17 00:00:00 2001
From: Shyam Saini
Date: Thu, 27 Feb 2025 10:49:30 -0800
Subject: [PATCH 240/770] drivers: base: handle module_kobject creation

module_add_driver() relies on the module_kset list for creation of the
/sys/module/<module>/drivers directory.

Since commit 96a1a2412acba ("kernel/params.c: defer most of
param_sysfs_init() to late_initcall time"), drivers which are initialized
from subsys_initcall() or any other higher-precedence initcall couldn't
find the related kobject entry in the module_kset list, because
module_kset is not fully populated by the time module_add_driver() refers
to it. As a consequence, module_add_driver() returns early without calling
make_driver_name(). Therefore, /sys/module/<module>/drivers is never
created.

Fix this issue by letting module_add_driver() handle module_kobject
creation itself.
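The resulting flow in module_add_driver(), condensed (see the diff below
for the exact change):

	/* built-in driver: mod is NULL but drv->mod_name is set */
	mk = lookup_or_create_module_kobject(drv->mod_name);
	if (mk) {
		drv->p->mkobj = mk;	/* remember our module structure */
		kobject_put(&mk->kobj);	/* drop the lookup reference */
	}

Because the entry is now created on demand, a built-in driver registering
from subsys_initcall() no longer depends on param_sysfs_init() having
populated module_kset first.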
Fixes: 96a1a2412acb ("kernel/params.c: defer most of param_sysfs_init() to late_initcall time") Cc: stable@vger.kernel.org # requires all other patches from the series Suggested-by: Rasmus Villemoes Signed-off-by: Shyam Saini Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20250227184930.34163-5-shyamsaini@linux.microsoft.com Signed-off-by: Petr Pavlu --- drivers/base/module.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/base/module.c b/drivers/base/module.c index 5bc71bea883a0..218aaa0964552 100644 --- a/drivers/base/module.c +++ b/drivers/base/module.c @@ -42,16 +42,13 @@ int module_add_driver(struct module *mod, const struct device_driver *drv) if (mod) mk = &mod->mkobj; else if (drv->mod_name) { - struct kobject *mkobj; - - /* Lookup built-in module entry in /sys/modules */ - mkobj = kset_find_obj(module_kset, drv->mod_name); - if (mkobj) { - mk = container_of(mkobj, struct module_kobject, kobj); + /* Lookup or create built-in module entry in /sys/modules */ + mk = lookup_or_create_module_kobject(drv->mod_name); + if (mk) { /* remember our module structure */ drv->p->mkobj = mk; - /* kset_find_obj took a reference */ - kobject_put(mkobj); + /* lookup_or_create_module_kobject took a reference */ + kobject_put(&mk->kobj); } } From 1a377f142e6e2faff51f965ed40633f70579ffe6 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 14 Apr 2025 10:43:35 +0200 Subject: [PATCH 241/770] net: ethernet: ti: am65-cpsw: set fwnode for ports fwnode needs to be set for a device for fw_devlink to be able to track/enforce its dependencies correctly. Without this, you'll see error messages like this when the supplier has probed and tries to make sure all its fwnode consumers are linked to it using device links: am65-cpsw-nuss 8000000.ethernet: Failed to create device link (0x180) with supplier .. Reviewed-by: Saravana Kannan Signed-off-by: Michael Walle Link: https://patch.msgid.link/20250414084336.4017237-2-mwalle@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index e78de79a5d78c..0a38776339769 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2749,7 +2749,7 @@ am65_cpsw_nuss_init_port_ndev(struct am65_cpsw_common *common, u32 port_idx) mutex_init(&ndev_priv->mm_lock); port->qos.link_speed = SPEED_UNKNOWN; SET_NETDEV_DEV(port->ndev, dev); - port->ndev->dev.of_node = port->slave.port_np; + device_set_node(&port->ndev->dev, of_fwnode_handle(port->slave.port_np)); eth_hw_addr_set(port->ndev, port->slave.mac_addr); From 09737cb80b8686ffca4ed1805fee745d5c85604d Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 14 Apr 2025 10:43:36 +0200 Subject: [PATCH 242/770] net: ethernet: ti: am65-cpsw: handle -EPROBE_DEFER of_get_mac_address() might fetch the MAC address from NVMEM and that driver might not have been loaded. In that case, -EPROBE_DEFER is returned. Right now, this will trigger an immediate fallback to am65_cpsw_am654_get_efuse_macid() possibly resulting in a random MAC address although the MAC address is stored in the referenced NVMEM. Fix it by handling the -EPROBE_DEFER return code correctly. This also means that the creation of the MDIO device has to be moved to a later stage as -EPROBE_DEFER must not be returned after child devices are created. 
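The resulting pattern, roughly (condensed from the diff that follows):

	ret = of_get_mac_address(port_np, port->slave.mac_addr);
	if (ret == -EPROBE_DEFER)
		/* NVMEM provider not bound yet: bail out, retry the probe */
		goto of_node_put;
	else if (ret)
		/* no MAC address available: fall back to the eFuse value */
		am65_cpsw_am654_get_efuse_macid(port_np, port->port_id,
						port->slave.mac_addr);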
Signed-off-by: Michael Walle
Reviewed-by: Andrew Lunn
Link: https://patch.msgid.link/20250414084336.4017237-3-mwalle@kernel.org
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/ti/am65-cpsw-nuss.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index 0a38776339769..aedef818800c8 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -2679,7 +2679,9 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common)
 			goto of_node_put;
 
 		ret = of_get_mac_address(port_np, port->slave.mac_addr);
-		if (ret) {
+		if (ret == -EPROBE_DEFER) {
+			goto of_node_put;
+		} else if (ret) {
 			am65_cpsw_am654_get_efuse_macid(port_np,
 							port->port_id,
 							port->slave.mac_addr);
@@ -3550,6 +3552,16 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	am65_cpsw_nuss_get_ver(common);
+
+	ret = am65_cpsw_nuss_init_host_p(common);
+	if (ret)
+		goto err_pm_clear;
+
+	ret = am65_cpsw_nuss_init_slave_ports(common);
+	if (ret)
+		goto err_pm_clear;
+
 	node = of_get_child_by_name(dev->of_node, "mdio");
 	if (!node) {
 		dev_warn(dev, "MDIO node not found\n");
@@ -3566,16 +3578,6 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev)
 	}
 	of_node_put(node);
 
-	am65_cpsw_nuss_get_ver(common);
-
-	ret = am65_cpsw_nuss_init_host_p(common);
-	if (ret)
-		goto err_of_clear;
-
-	ret = am65_cpsw_nuss_init_slave_ports(common);
-	if (ret)
-		goto err_of_clear;
-
 	/* init common data */
 	ale_params.dev = dev;
 	ale_params.ale_ageout = AM65_CPSW_ALE_AGEOUT_DEFAULT;

From 8a8f3f4991761a70834fe6719d09e9fd338a766e Mon Sep 17 00:00:00 2001
From: Thangaraj Samynathan
Date: Tue, 15 Apr 2025 10:15:09 +0530
Subject: [PATCH 243/770] net: lan743x: Allocate rings outside ZONE_DMA

The driver allocates ring elements using the GFP_DMA flag. The LAN743x
hardware has no requirement that these allocations come from ZONE_DMA.
Hence, modify the flags to use only GFP_ATOMIC. This is consistent with
other callers of lan743x_rx_init_ring_element().

Reported-by: Zhang, Liyin(CN)
Signed-off-by: Thangaraj Samynathan
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/20250415044509.6695-1-thangaraj.s@microchip.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/microchip/lan743x_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c
index 23760b613d3ec..8b6b9b6efe18e 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -2495,8 +2495,7 @@ static int lan743x_rx_process_buffer(struct lan743x_rx *rx)
 
 	/* save existing skb, allocate new skb and map to dma */
 	skb = buffer_info->skb;
-	if (lan743x_rx_init_ring_element(rx, rx->last_head,
-					 GFP_ATOMIC | GFP_DMA)) {
+	if (lan743x_rx_init_ring_element(rx, rx->last_head, GFP_ATOMIC)) {
 		/* failed to allocate next skb.
 		 * Memory is very low.
 		 * Drop this packet and reuse buffer.

From cd1fafe7da1f6f2aa25723e317f6e8e9d0c050a1 Mon Sep 17 00:00:00 2001
From: Taehee Yoo
Date: Tue, 15 Apr 2025 05:24:58 +0000
Subject: [PATCH 244/770] eth: bnxt: add support rx side device memory TCP

Currently, the bnxt_en driver satisfies the requirement of Device memory
TCP, which is header-data split (HDS). So, implement rx-side Device memory
TCP for the bnxt_en driver. It requires only converting the page API to
the netmem API.
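The shape of that conversion on the allocation side, as a sketch (error
handling elided; 'pool' stands for the ring's page_pool):

	/* before: page-based aggregation buffer */
	struct page *page = page_pool_alloc_pages(pool, gfp);
	dma_addr_t mapping = page_pool_get_dma_addr(page);

	/* after: netmem-based; a netmem_ref may point at memory the CPU
	 * cannot read, e.g. device memory provided via devmem TCP */
	netmem_ref netmem = page_pool_alloc_netmems(pool, gfp);
	dma_addr_t mapping2 = page_pool_get_dma_addr_netmem(netmem);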
`struct page` of the agg rings is changed to `netmem_ref netmem`, and the
corresponding functions are changed to netmem API variants. It also passes
the PP_FLAG_ALLOW_UNREADABLE_NETMEM flag in the page_pool parameters.

The netmem will be activated only when a user requests devmem TCP. When
netmem is activated, received data is unreadable; when netmem is disabled,
received data is readable. But drivers don't need to handle both cases,
because the netmem core API will handle it properly. So, using the proper
netmem API is enough for drivers.

Device memory TCP can be tested with
tools/testing/selftests/drivers/net/hw/ncdevmem. This was tested with a
BCM57504-N425G and firmware version 232.0.155.8/pkg 232.1.132.8.

Reviewed-by: Mina Almasry
Tested-by: David Wei
Signed-off-by: Taehee Yoo
Link: https://patch.msgid.link/20250415052458.1260575-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 201 +++++++++++++---------
 drivers/net/ethernet/broadcom/bnxt/bnxt.h | 3 +-
 include/net/page_pool/helpers.h | 11 ++
 3 files changed, 133 insertions(+), 82 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 8725e1e139082..83608b7447f4e 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -893,9 +893,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int budget)
 	bnapi->events &= ~BNXT_TX_CMP_EVENT;
 }
 
-static bool bnxt_separate_head_pool(void)
+static bool bnxt_separate_head_pool(struct bnxt_rx_ring_info *rxr)
 {
-	return PAGE_SIZE > BNXT_RX_PAGE_SIZE;
+	return rxr->need_head_pool || PAGE_SIZE > BNXT_RX_PAGE_SIZE;
 }
 
 static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
@@ -919,6 +919,20 @@ static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping,
 	return page;
 }
 
+static netmem_ref __bnxt_alloc_rx_netmem(struct bnxt *bp, dma_addr_t *mapping,
+					 struct bnxt_rx_ring_info *rxr,
+					 gfp_t gfp)
+{
+	netmem_ref netmem;
+
+	netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
+	if (!netmem)
+		return 0;
+
+	*mapping = page_pool_get_dma_addr_netmem(netmem);
+	return netmem;
+}
+
 static inline u8 *__bnxt_alloc_rx_frag(struct bnxt *bp, dma_addr_t *mapping,
 				       struct bnxt_rx_ring_info *rxr,
 				       gfp_t gfp)
@@ -999,21 +1013,19 @@ static inline u16 bnxt_find_next_agg_idx(struct bnxt_rx_ring_info *rxr, u16 idx)
 	return next;
 }
 
-static inline int bnxt_alloc_rx_page(struct bnxt *bp,
-				     struct bnxt_rx_ring_info *rxr,
-				     u16 prod, gfp_t gfp)
+static int bnxt_alloc_rx_netmem(struct bnxt *bp, struct bnxt_rx_ring_info *rxr,
+				u16 prod, gfp_t gfp)
 {
 	struct rx_bd *rxbd =
 		&rxr->rx_agg_desc_ring[RX_AGG_RING(bp, prod)][RX_IDX(prod)];
 	struct bnxt_sw_rx_agg_bd *rx_agg_buf;
-	struct page *page;
-	dma_addr_t mapping;
 	u16 sw_prod = rxr->rx_sw_agg_prod;
 	unsigned int offset = 0;
+	dma_addr_t mapping;
+	netmem_ref netmem;
 
-	page = __bnxt_alloc_rx_page(bp, &mapping, rxr, &offset, gfp);
-
-	if (!page)
+	netmem = __bnxt_alloc_rx_netmem(bp, &mapping, rxr, gfp);
+	if (!netmem)
 		return -ENOMEM;
 
 	if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
@@ -1023,7 +1035,7 @@ static inline int bnxt_alloc_rx_page(struct bnxt *bp,
 	rx_agg_buf = &rxr->rx_agg_ring[sw_prod];
 	rxr->rx_sw_agg_prod = RING_RX_AGG(bp, NEXT_RX_AGG(sw_prod));
 
-	rx_agg_buf->page = page;
+	rx_agg_buf->netmem = netmem;
 	rx_agg_buf->offset = offset;
 	rx_agg_buf->mapping = mapping;
 	rxbd->rx_bd_haddr = cpu_to_le64(mapping);
@@ -1067,11 +1079,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info
*cpr, u16 idx, p5_tpa = true; for (i = 0; i < agg_bufs; i++) { - u16 cons; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf, *prod_rx_buf; + struct rx_agg_cmp *agg; struct rx_bd *prod_bd; - struct page *page; + netmem_ref netmem; + u16 cons; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, start + i); @@ -1088,11 +1100,11 @@ static void bnxt_reuse_rx_agg_bufs(struct bnxt_cp_ring_info *cpr, u16 idx, cons_rx_buf = &rxr->rx_agg_ring[cons]; /* It is possible for sw_prod to be equal to cons, so - * set cons_rx_buf->page to NULL first. + * set cons_rx_buf->netmem to 0 first. */ - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; - prod_rx_buf->page = page; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; + prod_rx_buf->netmem = netmem; prod_rx_buf->offset = cons_rx_buf->offset; prod_rx_buf->mapping = cons_rx_buf->mapping; @@ -1218,29 +1230,35 @@ static struct sk_buff *bnxt_rx_skb(struct bnxt *bp, return skb; } -static u32 __bnxt_rx_agg_pages(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct skb_shared_info *shinfo, - u16 idx, u32 agg_bufs, bool tpa, - struct xdp_buff *xdp) +static u32 __bnxt_rx_agg_netmems(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + u16 idx, u32 agg_bufs, bool tpa, + struct sk_buff *skb, + struct xdp_buff *xdp) { struct bnxt_napi *bnapi = cpr->bnapi; - struct pci_dev *pdev = bp->pdev; - struct bnxt_rx_ring_info *rxr = bnapi->rx_ring; - u16 prod = rxr->rx_agg_prod; + struct skb_shared_info *shinfo; + struct bnxt_rx_ring_info *rxr; u32 i, total_frag_len = 0; bool p5_tpa = false; + u16 prod; + + rxr = bnapi->rx_ring; + prod = rxr->rx_agg_prod; if ((bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && tpa) p5_tpa = true; + if (skb) + shinfo = skb_shinfo(skb); + else + shinfo = xdp_get_shared_info_from_buff(xdp); + for (i = 0; i < agg_bufs; i++) { - skb_frag_t *frag = &shinfo->frags[i]; - u16 cons, frag_len; - struct rx_agg_cmp *agg; struct bnxt_sw_rx_agg_bd *cons_rx_buf; - struct page *page; - dma_addr_t mapping; + struct rx_agg_cmp *agg; + u16 cons, frag_len; + netmem_ref netmem; if (p5_tpa) agg = bnxt_get_tpa_agg_p5(bp, rxr, idx, i); @@ -1251,27 +1269,41 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, RX_AGG_CMP_LEN) >> RX_AGG_CMP_LEN_SHIFT; cons_rx_buf = &rxr->rx_agg_ring[cons]; - skb_frag_fill_page_desc(frag, cons_rx_buf->page, - cons_rx_buf->offset, frag_len); - shinfo->nr_frags = i + 1; + if (skb) { + skb_add_rx_frag_netmem(skb, i, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len, BNXT_RX_PAGE_SIZE); + } else { + skb_frag_t *frag = &shinfo->frags[i]; + + skb_frag_fill_netmem_desc(frag, cons_rx_buf->netmem, + cons_rx_buf->offset, + frag_len); + shinfo->nr_frags = i + 1; + } __clear_bit(cons, rxr->rx_agg_bmap); - /* It is possible for bnxt_alloc_rx_page() to allocate + /* It is possible for bnxt_alloc_rx_netmem() to allocate * a sw_prod index that equals the cons index, so we * need to clear the cons entry now. 
*/ - mapping = cons_rx_buf->mapping; - page = cons_rx_buf->page; - cons_rx_buf->page = NULL; + netmem = cons_rx_buf->netmem; + cons_rx_buf->netmem = 0; - if (xdp && page_is_pfmemalloc(page)) + if (xdp && netmem_is_pfmemalloc(netmem)) xdp_buff_set_frag_pfmemalloc(xdp); - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_ATOMIC) != 0) { + if (skb) { + skb->len -= frag_len; + skb->data_len -= frag_len; + skb->truesize -= BNXT_RX_PAGE_SIZE; + } + --shinfo->nr_frags; - cons_rx_buf->page = page; + cons_rx_buf->netmem = netmem; - /* Update prod since possibly some pages have been + /* Update prod since possibly some netmems have been * allocated already. */ rxr->rx_agg_prod = prod; @@ -1279,8 +1311,8 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return 0; } - dma_sync_single_for_cpu(&pdev->dev, mapping, BNXT_RX_PAGE_SIZE, - bp->rx_dir); + page_pool_dma_sync_netmem_for_cpu(rxr->page_pool, netmem, 0, + BNXT_RX_PAGE_SIZE); total_frag_len += frag_len; prod = NEXT_RX_AGG(prod); @@ -1289,32 +1321,28 @@ static u32 __bnxt_rx_agg_pages(struct bnxt *bp, return total_frag_len; } -static struct sk_buff *bnxt_rx_agg_pages_skb(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct sk_buff *skb, u16 idx, - u32 agg_bufs, bool tpa) +static struct sk_buff *bnxt_rx_agg_netmems_skb(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct sk_buff *skb, u16 idx, + u32 agg_bufs, bool tpa) { - struct skb_shared_info *shinfo = skb_shinfo(skb); u32 total_frag_len = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, idx, - agg_bufs, tpa, NULL); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + skb, NULL); if (!total_frag_len) { skb_mark_for_recycle(skb); dev_kfree_skb(skb); return NULL; } - skb->data_len += total_frag_len; - skb->len += total_frag_len; - skb->truesize += BNXT_RX_PAGE_SIZE * agg_bufs; return skb; } -static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, - struct bnxt_cp_ring_info *cpr, - struct xdp_buff *xdp, u16 idx, - u32 agg_bufs, bool tpa) +static u32 bnxt_rx_agg_netmems_xdp(struct bnxt *bp, + struct bnxt_cp_ring_info *cpr, + struct xdp_buff *xdp, u16 idx, + u32 agg_bufs, bool tpa) { struct skb_shared_info *shinfo = xdp_get_shared_info_from_buff(xdp); u32 total_frag_len = 0; @@ -1322,8 +1350,8 @@ static u32 bnxt_rx_agg_pages_xdp(struct bnxt *bp, if (!xdp_buff_has_frags(xdp)) shinfo->nr_frags = 0; - total_frag_len = __bnxt_rx_agg_pages(bp, cpr, shinfo, - idx, agg_bufs, tpa, xdp); + total_frag_len = __bnxt_rx_agg_netmems(bp, cpr, idx, agg_bufs, tpa, + NULL, xdp); if (total_frag_len) { xdp_buff_set_frags_flag(xdp); shinfo->nr_frags = agg_bufs; @@ -1895,7 +1923,8 @@ static inline struct sk_buff *bnxt_tpa_end(struct bnxt *bp, } if (agg_bufs) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, idx, agg_bufs, true); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, idx, agg_bufs, + true); if (!skb) { /* Page reuse already handled by bnxt_rx_pages(). 
*/ cpr->sw_stats->rx.rx_oom_discards += 1; @@ -2175,9 +2204,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (bnxt_xdp_attached(bp, rxr)) { bnxt_xdp_buff_init(bp, rxr, cons, data_ptr, len, &xdp); if (agg_bufs) { - u32 frag_len = bnxt_rx_agg_pages_xdp(bp, cpr, &xdp, - cp_cons, agg_bufs, - false); + u32 frag_len = bnxt_rx_agg_netmems_xdp(bp, cpr, &xdp, + cp_cons, + agg_bufs, + false); if (!frag_len) goto oom_next_rx; @@ -2229,7 +2259,8 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (agg_bufs) { if (!xdp_active) { - skb = bnxt_rx_agg_pages_skb(bp, cpr, skb, cp_cons, agg_bufs, false); + skb = bnxt_rx_agg_netmems_skb(bp, cpr, skb, cp_cons, + agg_bufs, false); if (!skb) goto oom_next_rx; } else { @@ -3445,15 +3476,15 @@ static void bnxt_free_one_rx_agg_ring(struct bnxt *bp, struct bnxt_rx_ring_info for (i = 0; i < max_idx; i++) { struct bnxt_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_ring[i]; - struct page *page = rx_agg_buf->page; + netmem_ref netmem = rx_agg_buf->netmem; - if (!page) + if (!netmem) continue; - rx_agg_buf->page = NULL; + rx_agg_buf->netmem = 0; __clear_bit(i, rxr->rx_agg_bmap); - page_pool_recycle_direct(rxr->page_pool, page); + page_pool_recycle_direct_netmem(rxr->page_pool, netmem); } } @@ -3746,7 +3777,7 @@ static void bnxt_free_rx_rings(struct bnxt *bp) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = rxr->head_pool = NULL; @@ -3777,15 +3808,19 @@ static int bnxt_alloc_rx_page_pool(struct bnxt *bp, pp.dev = &bp->pdev->dev; pp.dma_dir = bp->rx_dir; pp.max_len = PAGE_SIZE; - pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | + PP_FLAG_ALLOW_UNREADABLE_NETMEM; + pp.queue_idx = rxr->bnapi->index; pool = page_pool_create(&pp); if (IS_ERR(pool)) return PTR_ERR(pool); rxr->page_pool = pool; - if (bnxt_separate_head_pool()) { + rxr->need_head_pool = page_pool_is_unreadable(pool); + if (bnxt_separate_head_pool(rxr)) { pp.pool_size = max(bp->rx_ring_size, 1024); + pp.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pool = page_pool_create(&pp); if (IS_ERR(pool)) goto err_destroy_pp; @@ -4197,6 +4232,8 @@ static void bnxt_reset_rx_ring_struct(struct bnxt *bp, rxr->page_pool->p.napi = NULL; rxr->page_pool = NULL; + rxr->head_pool->p.napi = NULL; + rxr->head_pool = NULL; memset(&rxr->xdp_rxq, 0, sizeof(struct xdp_rxq_info)); ring = &rxr->rx_ring_struct; @@ -4321,16 +4358,16 @@ static void bnxt_alloc_one_rx_ring_skb(struct bnxt *bp, rxr->rx_prod = prod; } -static void bnxt_alloc_one_rx_ring_page(struct bnxt *bp, - struct bnxt_rx_ring_info *rxr, - int ring_nr) +static void bnxt_alloc_one_rx_ring_netmem(struct bnxt *bp, + struct bnxt_rx_ring_info *rxr, + int ring_nr) { u32 prod; int i; prod = rxr->rx_agg_prod; for (i = 0; i < bp->rx_agg_ring_size; i++) { - if (bnxt_alloc_rx_page(bp, rxr, prod, GFP_KERNEL)) { + if (bnxt_alloc_rx_netmem(bp, rxr, prod, GFP_KERNEL)) { netdev_warn(bp->dev, "init'ed rx ring %d with %d/%d pages only\n", ring_nr, i, bp->rx_ring_size); break; @@ -4371,7 +4408,7 @@ static int bnxt_alloc_one_rx_ring(struct bnxt *bp, int ring_nr) if (!(bp->flags & BNXT_FLAG_AGG_RINGS)) return 0; - bnxt_alloc_one_rx_ring_page(bp, rxr, ring_nr); + bnxt_alloc_one_rx_ring_netmem(bp, rxr, ring_nr); if (rxr->rx_tpa) { rc = bnxt_alloc_one_tpa_info_data(bp, rxr); @@ -15708,6 +15745,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, 
void *qmem, int idx) clone->rx_agg_prod = 0; clone->rx_sw_agg_prod = 0; clone->rx_next_cons = 0; + clone->need_head_pool = false; rc = bnxt_alloc_rx_page_pool(bp, clone, rxr->page_pool->p.nid); if (rc) @@ -15750,7 +15788,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) bnxt_alloc_one_rx_ring_skb(bp, clone, idx); if (bp->flags & BNXT_FLAG_AGG_RINGS) - bnxt_alloc_one_rx_ring_page(bp, clone, idx); + bnxt_alloc_one_rx_ring_netmem(bp, clone, idx); if (bp->flags & BNXT_FLAG_TPA) bnxt_alloc_one_tpa_info_data(bp, clone); @@ -15766,7 +15804,7 @@ static int bnxt_queue_mem_alloc(struct net_device *dev, void *qmem, int idx) xdp_rxq_info_unreg(&clone->xdp_rxq); err_page_pool_destroy: page_pool_destroy(clone->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(clone)) page_pool_destroy(clone->head_pool); clone->page_pool = NULL; clone->head_pool = NULL; @@ -15785,7 +15823,7 @@ static void bnxt_queue_mem_free(struct net_device *dev, void *qmem) xdp_rxq_info_unreg(&rxr->xdp_rxq); page_pool_destroy(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_destroy(rxr->head_pool); rxr->page_pool = NULL; rxr->head_pool = NULL; @@ -15876,6 +15914,7 @@ static int bnxt_queue_start(struct net_device *dev, void *qmem, int idx) rxr->page_pool = clone->page_pool; rxr->head_pool = clone->head_pool; rxr->xdp_rxq = clone->xdp_rxq; + rxr->need_head_pool = clone->need_head_pool; bnxt_copy_rx_ring(bp, rxr, clone); @@ -15961,7 +16000,7 @@ static int bnxt_queue_stop(struct net_device *dev, void *qmem, int idx) bnxt_hwrm_rx_ring_free(bp, rxr, false); bnxt_hwrm_rx_agg_ring_free(bp, rxr, false); page_pool_disable_direct_recycling(rxr->page_pool); - if (bnxt_separate_head_pool()) + if (bnxt_separate_head_pool(rxr)) page_pool_disable_direct_recycling(rxr->head_pool); if (bp->flags & BNXT_FLAG_SHARED_RINGS) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 21726cf565866..868a2e5a5b023 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -903,7 +903,7 @@ struct bnxt_sw_rx_bd { }; struct bnxt_sw_rx_agg_bd { - struct page *page; + netmem_ref netmem; unsigned int offset; dma_addr_t mapping; }; @@ -1106,6 +1106,7 @@ struct bnxt_rx_ring_info { unsigned long *rx_agg_bmap; u16 rx_agg_bmap_size; + bool need_head_pool; dma_addr_t rx_desc_mapping[MAX_RX_PAGES]; dma_addr_t rx_agg_desc_mapping[MAX_RX_AGG_PAGES]; diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe23..93f2c31baf9bf 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -395,6 +395,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -492,4 +498,9 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ From 978d13b26ab31297e320dba1feb87f95c107961b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 11:13:55 +0100 Subject: [PATCH 245/770] net: stmmac: intel: remove 
unnecessary setting max_speed Phylink will already limit the MAC speed according to the interface, so if 2500BASE-X is selected, the maximum speed will be 2.5G. Similarly, if SGMII is selected, the maximum speed will be 1G. It is, therefore, not necessary to set a speed limit. Remove setting plat_dat->max_speed from this glue driver. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4dIh-000dT5-Kt@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 54db5b778304d..96c893dd262f8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -300,11 +300,8 @@ static void intel_speed_mode_2500(struct net_device *ndev, void *intel_data) if (((data & SERDES_LINK_MODE_MASK) >> SERDES_LINK_MODE_SHIFT) == SERDES_LINK_MODE_2G5) { dev_info(priv->device, "Link Speed Mode: 2.5Gbps\n"); - priv->plat->max_speed = 2500; priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; priv->plat->mdio_bus_data->default_an_inband = false; - } else { - priv->plat->max_speed = 1000; } } From 4cc8b57753ef30650a5a326df0074d25f342aa84 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 11:15:53 +0100 Subject: [PATCH 246/770] net: stmmac: sun8i: use stmmac_pltfr_probe() Using stmmac_pltfr_probe() simplifies the probe function. This will not only call plat_dat->init (sun8i_dwmac_init), but also plat_dat->exit (sun8i_dwmac_exit) appropriately if stmmac_dvr_probe() fails. This results in an overall simplification of the glue driver. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Reviewed-by: Andrew Lunn Tested-by: Corentin Labbe Link: https://patch.msgid.link/E1u4dKb-000dV7-3B@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 85723a78793ab..fd6518e252e38 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1239,14 +1239,10 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ret = sun8i_dwmac_init(pdev, plat_dat->bsp_priv); + ret = stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); if (ret) goto dwmac_syscon; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - goto dwmac_exit; - ndev = dev_get_drvdata(&pdev->dev); priv = netdev_priv(ndev); @@ -1283,9 +1279,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) clk_put(gmac->ephy_clk); dwmac_remove: pm_runtime_put_noidle(&pdev->dev); - stmmac_dvr_remove(&pdev->dev); -dwmac_exit: - sun8i_dwmac_exit(pdev, gmac); + stmmac_pltfr_remove(pdev); dwmac_syscon: sun8i_dwmac_unset_syscon(gmac); From 00868d0348187fe1123324f77e069a916f932037 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Tue, 15 Apr 2025 12:53:05 +0200 Subject: [PATCH 247/770] net: phy: mediatek: init val in .phy_led_polarity_set for AN7581 Fix smatch warning for uninitialised val in .phy_led_polarity_set for AN7581 driver. Correctly init to 0 to set polarity high by default. 
Reported-by: Simon Horman Fixes: 6a325aed130b ("net: phy: mediatek: add Airoha PHY ID to SoC driver") Signed-off-by: Christian Marangi Link: https://patch.msgid.link/20250415105313.3409-1-ansuelsmth@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mediatek/mtk-ge-soc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/phy/mediatek/mtk-ge-soc.c b/drivers/net/phy/mediatek/mtk-ge-soc.c index fd0e447ffce79..cd09684780a49 100644 --- a/drivers/net/phy/mediatek/mtk-ge-soc.c +++ b/drivers/net/phy/mediatek/mtk-ge-soc.c @@ -1432,8 +1432,8 @@ static int an7581_phy_probe(struct phy_device *phydev) static int an7581_phy_led_polarity_set(struct phy_device *phydev, int index, unsigned long modes) { + u16 val = 0; u32 mode; - u16 val; if (index >= MTK_PHY_MAX_LEDS) return -EINVAL; @@ -1444,7 +1444,6 @@ static int an7581_phy_led_polarity_set(struct phy_device *phydev, int index, val = MTK_PHY_LED_ON_POLARITY; break; case PHY_LED_ACTIVE_HIGH: - val = 0; break; default: return -EINVAL; From 49593c298cf7f5dd360f029356bcbbcbff406d6f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 10:26:53 -0700 Subject: [PATCH 248/770] docs: networking: clarify intended audience of netdevices.rst The netdevices doc is dangerously broad. At least make it clear that it's intended for developers, not for users. Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250415172653.811147-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index f87bb55b4afea..77fe1f7b93be0 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -8,7 +8,7 @@ Network Devices, the Kernel, and You! Introduction ============ The following is a random collection of documentation regarding -network devices. +network devices. It is intended for driver developers. struct net_device lifetime rules ================================ From 72d02a9f9410a292c3be8a4b17060115610f638d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 17:42:23 +0100 Subject: [PATCH 249/770] net: stmmac: sti: use phy_interface_mode_is_rgmii() Replace the custom IS_PHY_IF_MODE_RGMII() macro with our generic phy_interface_mode_is_rgmii() inline function. 
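For reference, a usage sketch equivalent to the macro being removed;
phy_interface_mode_is_rgmii() covers RGMII, RGMII_ID, RGMII_RXID and
RGMII_TXID in a single test:

	#include <linux/phy.h>

	static bool is_gbit_mode(phy_interface_t iface)
	{
		/* any RGMII variant, or plain GMII */
		return phy_interface_mode_is_rgmii(iface) ||
		       iface == PHY_INTERFACE_MODE_GMII;
	}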
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4jMd-000rCG-VU@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index be57c6c12c1c2..c580647ff9dc8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -23,12 +23,7 @@ #define DWMAC_50MHZ 50000000 -#define IS_PHY_IF_MODE_RGMII(iface) (iface == PHY_INTERFACE_MODE_RGMII || \ - iface == PHY_INTERFACE_MODE_RGMII_ID || \ - iface == PHY_INTERFACE_MODE_RGMII_RXID || \ - iface == PHY_INTERFACE_MODE_RGMII_TXID) - -#define IS_PHY_IF_MODE_GBIT(iface) (IS_PHY_IF_MODE_RGMII(iface) || \ +#define IS_PHY_IF_MODE_GBIT(iface) (phy_interface_mode_is_rgmii(iface) || \ iface == PHY_INTERFACE_MODE_GMII) /* STiH4xx register definitions (STiH407/STiH410 families) @@ -148,7 +143,7 @@ static void stih4xx_fix_retime_src(void *priv, int spd, unsigned int mode) src = TX_RETIME_SRC_CLKGEN; freq = DWMAC_50MHZ; } - } else if (IS_PHY_IF_MODE_RGMII(dwmac->interface)) { + } else if (phy_interface_mode_is_rgmii(dwmac->interface)) { /* On GiGa clk source can be either ext or from clkgen */ freq = rgmii_clock(spd); From 403068c6c9c25ef772d99bc91ce85ae39ed41c76 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 17:42:29 +0100 Subject: [PATCH 250/770] net: stmmac: sti: convert to devm_stmmac_pltfr_probe() Convert sti to use the generic devm_stmmac_pltfr_probe() which will call plat_dat->init()/plat_dat->exit() as appropriate, thus simplifying the code. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4jMj-000rCM-31@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-sti.c | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index c580647ff9dc8..b6e09bd338943 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -233,6 +233,29 @@ static int sti_dwmac_parse_data(struct sti_dwmac *dwmac, return 0; } +static int sti_dwmac_init(struct platform_device *pdev, void *bsp_priv) +{ + struct sti_dwmac *dwmac = bsp_priv; + int ret; + + ret = clk_prepare_enable(dwmac->clk); + if (ret) + return ret; + + ret = sti_dwmac_set_mode(dwmac); + if (ret) + clk_disable_unprepare(dwmac->clk); + + return ret; +} + +static void sti_dwmac_exit(struct platform_device *pdev, void *bsp_priv) +{ + struct sti_dwmac *dwmac = bsp_priv; + + clk_disable_unprepare(dwmac->clk); +} + static int sti_dwmac_probe(struct platform_device *pdev) { struct plat_stmmacenet_data *plat_dat; @@ -269,34 +292,10 @@ static int sti_dwmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = data->fix_retime_src; + plat_dat->init = sti_dwmac_init; + plat_dat->exit = sti_dwmac_exit; - ret = clk_prepare_enable(dwmac->clk); - if (ret) - return ret; - - ret = sti_dwmac_set_mode(dwmac); - if (ret) - goto disable_clk; - - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - goto disable_clk; - - return 0; - -disable_clk: - clk_disable_unprepare(dwmac->clk); - - return ret; -} - -static void sti_dwmac_remove(struct platform_device *pdev) -{ - 
struct sti_dwmac *dwmac = get_stmmac_bsp_priv(&pdev->dev);
-
-	stmmac_dvr_remove(&pdev->dev);
-
-	clk_disable_unprepare(dwmac->clk);
+	return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
 }
 
 static int sti_dwmac_suspend(struct device *dev)
@@ -334,7 +333,6 @@ MODULE_DEVICE_TABLE(of, sti_dwmac_match);
 
 static struct platform_driver sti_dwmac_driver = {
 	.probe = sti_dwmac_probe,
-	.remove = sti_dwmac_remove,
 	.driver = {
 		.name = "sti-dwmac",
 		.pm = pm_sleep_ptr(&sti_dwmac_pm_ops),

From b3334f9f708cb27b9ae68f818e1424458f50497c Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Tue, 15 Apr 2025 17:42:34 +0100
Subject: [PATCH 251/770] net: stmmac: sti: convert to stmmac_pltfr_pm_ops

Now that plat_dat->init()/plat_dat->exit() are populated and provide the
required suspend/resume functionality, we can use stmmac_pltfr_pm_ops,
whose methods call these two functions. Switch over to use it.

Doing so also fills in the runtime PM ops and the _noirq variants.

Signed-off-by: Russell King (Oracle)
Reviewed-by: Andrew Lunn
Link: https://patch.msgid.link/E1u4jMo-000rCS-6f@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski
---
 .../net/ethernet/stmicro/stmmac/dwmac-sti.c | 25 +------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
index b6e09bd338943..53d5ce1f6dc69 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c
@@ -298,29 +298,6 @@ static int sti_dwmac_probe(struct platform_device *pdev)
 	return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
 }
 
-static int sti_dwmac_suspend(struct device *dev)
-{
-	struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev);
-	int ret = stmmac_suspend(dev);
-
-	clk_disable_unprepare(dwmac->clk);
-
-	return ret;
-}
-
-static int sti_dwmac_resume(struct device *dev)
-{
-	struct sti_dwmac *dwmac = get_stmmac_bsp_priv(dev);
-
-	clk_prepare_enable(dwmac->clk);
-	sti_dwmac_set_mode(dwmac);
-
-	return stmmac_resume(dev);
-}
-
-static DEFINE_SIMPLE_DEV_PM_OPS(sti_dwmac_pm_ops, sti_dwmac_suspend,
-				sti_dwmac_resume);
-
 static const struct sti_dwmac_of_data stih4xx_dwmac_data = {
 	.fix_retime_src = stih4xx_fix_retime_src,
 };
@@ -335,7 +312,7 @@ static struct platform_driver sti_dwmac_driver = {
 	.probe = sti_dwmac_probe,
 	.driver = {
 		.name = "sti-dwmac",
-		.pm = pm_sleep_ptr(&sti_dwmac_pm_ops),
+		.pm = &stmmac_pltfr_pm_ops,
 		.of_match_table = sti_dwmac_match,
 	},
 };

From 5ef4097ed155b3eb3aa7ec2c6035c6800b0e849d Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Tue, 15 Apr 2025 12:28:52 -0700
Subject: [PATCH 252/770] ipv6: Use nlmsg_payload in addrlabel file

Leverage the new nlmsg_payload() helper to avoid checking for the message
size and then reading the nlmsg data. This changes the functions
ip6addrlbl_valid_get_req() and ip6addrlbl_valid_dump_req().
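The helper folds the length check and nlmsg_data() into one call; the
before/after shape, condensed from the diff that follows:

	/* before */
	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal)))
		return -EINVAL;
	ifal = nlmsg_data(nlh);

	/* after: NULL means the message is shorter than sizeof(*ifal) */
	ifal = nlmsg_payload(nlh, sizeof(*ifal));
	if (!ifal)
		return -EINVAL;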
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-1-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv6/addrlabel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ab054f329e12d..fb63ffbcfc647 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -473,12 +473,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -543,7 +543,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -552,7 +553,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; From 6c454270a8511280ba5981e57e3bcb3c1408ebfa Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:53 -0700 Subject: [PATCH 253/770] ipv6: Use nlmsg_payload in addrconf file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-2-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4af2761795428..ec6c4ca6d6d99 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6112,7 +6112,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -6122,7 +6123,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); From bc05add844fc47ab61499d64a1128abf2ff77ea2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:54 -0700 Subject: [PATCH 254/770] ipv6: Use nlmsg_payload in route file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-3-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 210b84cecc242..e2c6c0b0684b2 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6029,7 +6029,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -6039,7 +6040,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || From 7d82cc229c0901b0fa677f2568b436bcf8dcada7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:55 -0700 Subject: [PATCH 255/770] ipv4: Use nlmsg_payload in devinet file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-4-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/devinet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 77e5705ac799e..c47d3828d4f65 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1792,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; From b411638fb92509a518506cbf0bbce8a8a3bd07fb Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:56 -0700 Subject: [PATCH 256/770] ipv4: Use nlmsg_payload in fib_frontend file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-5-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3f4e629998fab..57f088e5540e2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -948,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (filter->rtnl_held) ASSERT_RTNL(); - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); From d5ce0ed528c44d4b06d7e543db72c3adb71f4d5f Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:57 -0700 Subject: [PATCH 257/770] ipv4: Use nlmsg_payload in route file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-6-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 22dfc971aab4b..49cffbe838029 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3205,7 +3205,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; @@ -3215,7 +3216,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || From 04e00a849e7c25b331f103bf71a56fd078f73f8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:58 -0700 Subject: [PATCH 258/770] ipv4: Use nlmsg_payload in ipmr file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
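All of the nlmsg_payload() conversions in this series follow the same shape; a minimal sketch of the transformation, with illustrative names:

	/* before: length check and payload fetch are separate steps */
	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr)))
		return -EINVAL;
	hdr = nlmsg_data(nlh);

	/* after: a single call that returns NULL on a short message */
	hdr = nlmsg_payload(nlh, sizeof(*hdr));
	if (!hdr)
		return -EINVAL;

This also makes the payload pointer available at the top of the function, ahead of any attribute parsing.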
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-7-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a8b04d4abcaae..3b5677d8467ef 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2511,7 +2511,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } @@ -2520,7 +2521,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || @@ -2836,7 +2836,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } @@ -2846,7 +2847,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); From 9b1097a4108f5afb0911dbbbc79a8de130042665 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 15 Apr 2025 12:28:59 -0700 Subject: [PATCH 259/770] vxlan: Use nlmsg_payload in vxlan_vnifilter_dump Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415-nlmsg_v2-v1-8-a1c75d493fd7@debian.org Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 6e6e9f05509ab..d0753776d3394 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -411,13 +411,12 @@ static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb struct tunnel_msg *tmsg; struct net_device *dev; - if (cb->nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct tunnel_msg))) { + tmsg = nlmsg_payload(cb->nlh, sizeof(*tmsg)); + if (!tmsg) { NL_SET_ERR_MSG(cb->extack, "Invalid msg length"); return -EINVAL; } - tmsg = nlmsg_data(cb->nlh); - if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) { NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header"); return -EINVAL; From df8398fb7bb7a0e509200af56b79343aa133b7d6 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 15 Apr 2025 09:14:34 +0200 Subject: [PATCH 260/770] net: airoha: Add matchall filter offload support Introduce tc matchall filter offload support in airoha_eth driver. Matchall hw filter is used to implement hw rate policing via tc action police: $tc qdisc add dev eth0 handle ffff: ingress $tc filter add dev eth0 parent ffff: matchall action police \ rate 100mbit burst 1000k drop The current implementation supports just drop/accept as exceed/notexceed actions. 
Moreover, rate and burst are the only supported configuration parameters. Reviewed-by: Davide Caratti Reviewed-by: Simon Horman Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250415-airoha-hw-rx-ratelimit-v4-1-03458784fbc3@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/airoha/airoha_eth.c | 273 +++++++++++++++++++++- drivers/net/ethernet/airoha/airoha_eth.h | 8 +- drivers/net/ethernet/airoha/airoha_ppe.c | 9 +- drivers/net/ethernet/airoha/airoha_regs.h | 7 + 4 files changed, 286 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index 723eba7cfa0cf..c773b5ea9c051 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -527,6 +527,25 @@ static int airoha_fe_init(struct airoha_eth *eth) /* disable IFC by default */ airoha_fe_clear(eth, REG_FE_CSR_IFC_CFG, FE_IFC_EN_MASK); + airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(0), + FIELD_PREP(DFT_CPORT_MASK(7), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(6), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(5), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(4), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(3), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(2), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(1), FE_PSE_PORT_CDM1) | + FIELD_PREP(DFT_CPORT_MASK(0), FE_PSE_PORT_CDM1)); + airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(1), + FIELD_PREP(DFT_CPORT_MASK(7), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(6), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(5), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(4), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(3), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(2), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(1), FE_PSE_PORT_CDM2) | + FIELD_PREP(DFT_CPORT_MASK(0), FE_PSE_PORT_CDM2)); + /* enable 1:N vlan action, init vlan table */ airoha_fe_set(eth, REG_MC_VLAN_EN, MC_VLAN_EN_MASK); @@ -1631,7 +1650,6 @@ static void airhoha_set_gdm2_loopback(struct airoha_gdm_port *port) if (port->id == 3) { /* FIXME: handle XSI_PCE1_PORT */ - airoha_fe_wr(eth, REG_PPE_DFT_CPORT0(0), 0x5500); airoha_fe_rmw(eth, REG_FE_WAN_PORT, WAN1_EN_MASK | WAN1_MASK | WAN0_MASK, FIELD_PREP(WAN0_MASK, HSGMII_LAN_PCIE0_SRCPORT)); @@ -2109,6 +2127,125 @@ static int airoha_tc_setup_qdisc_ets(struct airoha_gdm_port *port, } } +static int airoha_qdma_get_rl_param(struct airoha_qdma *qdma, int queue_id, + u32 addr, enum trtcm_param_type param, + u32 *val_low, u32 *val_high) +{ + u32 idx = QDMA_METER_IDX(queue_id), group = QDMA_METER_GROUP(queue_id); + u32 val, config = FIELD_PREP(RATE_LIMIT_PARAM_TYPE_MASK, param) | + FIELD_PREP(RATE_LIMIT_METER_GROUP_MASK, group) | + FIELD_PREP(RATE_LIMIT_PARAM_INDEX_MASK, idx); + + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + if (read_poll_timeout(airoha_qdma_rr, val, + val & RATE_LIMIT_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, qdma, + REG_TRTCM_CFG_PARAM(addr))) + return -ETIMEDOUT; + + *val_low = airoha_qdma_rr(qdma, REG_TRTCM_DATA_LOW(addr)); + if (val_high) + *val_high = airoha_qdma_rr(qdma, REG_TRTCM_DATA_HIGH(addr)); + + return 0; +} + +static int airoha_qdma_set_rl_param(struct airoha_qdma *qdma, int queue_id, + u32 addr, enum trtcm_param_type param, + u32 val) +{ + u32 idx = QDMA_METER_IDX(queue_id), group = QDMA_METER_GROUP(queue_id); + u32 config = RATE_LIMIT_PARAM_RW_MASK | + FIELD_PREP(RATE_LIMIT_PARAM_TYPE_MASK, param) | + FIELD_PREP(RATE_LIMIT_METER_GROUP_MASK, group) | + 
FIELD_PREP(RATE_LIMIT_PARAM_INDEX_MASK, idx); + + airoha_qdma_wr(qdma, REG_TRTCM_DATA_LOW(addr), val); + airoha_qdma_wr(qdma, REG_TRTCM_CFG_PARAM(addr), config); + + return read_poll_timeout(airoha_qdma_rr, val, + val & RATE_LIMIT_PARAM_RW_DONE_MASK, + USEC_PER_MSEC, 10 * USEC_PER_MSEC, true, + qdma, REG_TRTCM_CFG_PARAM(addr)); +} + +static int airoha_qdma_set_rl_config(struct airoha_qdma *qdma, int queue_id, + u32 addr, bool enable, u32 enable_mask) +{ + u32 val; + int err; + + err = airoha_qdma_get_rl_param(qdma, queue_id, addr, TRTCM_MISC_MODE, + &val, NULL); + if (err) + return err; + + val = enable ? val | enable_mask : val & ~enable_mask; + + return airoha_qdma_set_rl_param(qdma, queue_id, addr, TRTCM_MISC_MODE, + val); +} + +static int airoha_qdma_set_rl_token_bucket(struct airoha_qdma *qdma, + int queue_id, u32 rate_val, + u32 bucket_size) +{ + u32 val, config, tick, unit, rate, rate_frac; + int err; + + err = airoha_qdma_get_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_MISC_MODE, &config, NULL); + if (err) + return err; + + val = airoha_qdma_rr(qdma, REG_INGRESS_TRTCM_CFG); + tick = FIELD_GET(INGRESS_FAST_TICK_MASK, val); + if (config & TRTCM_TICK_SEL) + tick *= FIELD_GET(INGRESS_SLOW_TICK_RATIO_MASK, val); + if (!tick) + return -EINVAL; + + unit = (config & TRTCM_PKT_MODE) ? 1000000 / tick : 8000 / tick; + if (!unit) + return -EINVAL; + + rate = rate_val / unit; + rate_frac = rate_val % unit; + rate_frac = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate_frac) / unit; + rate = FIELD_PREP(TRTCM_TOKEN_RATE_MASK, rate) | + FIELD_PREP(TRTCM_TOKEN_RATE_FRACTION_MASK, rate_frac); + + err = airoha_qdma_set_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_TOKEN_RATE_MODE, rate); + if (err) + return err; + + val = bucket_size; + if (!(config & TRTCM_PKT_MODE)) + val = max_t(u32, val, MIN_TOKEN_SIZE); + val = min_t(u32, __fls(val), MAX_TOKEN_SIZE_OFFSET); + + return airoha_qdma_set_rl_param(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + TRTCM_BUCKETSIZE_SHIFT_MODE, val); +} + +static int airoha_qdma_init_rl_config(struct airoha_qdma *qdma, int queue_id, + bool enable, enum trtcm_unit_type unit) +{ + bool tick_sel = queue_id == 0 || queue_id == 2 || queue_id == 8; + enum trtcm_param mode = TRTCM_METER_MODE; + int err; + + mode |= unit == TRTCM_PACKET_UNIT ? 
TRTCM_PKT_MODE : 0; + err = airoha_qdma_set_rl_config(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + enable, mode); + if (err) + return err; + + return airoha_qdma_set_rl_config(qdma, queue_id, REG_INGRESS_TRTCM_CFG, + tick_sel, TRTCM_TICK_SEL); +} + static int airoha_qdma_get_trtcm_param(struct airoha_qdma *qdma, int channel, u32 addr, enum trtcm_param_type param, enum trtcm_mode_type mode, @@ -2273,10 +2410,142 @@ static int airoha_tc_htb_alloc_leaf_queue(struct airoha_gdm_port *port, return 0; } +static int airoha_qdma_set_rx_meter(struct airoha_gdm_port *port, + u32 rate, u32 bucket_size, + enum trtcm_unit_type unit_type) +{ + struct airoha_qdma *qdma = port->qdma; + int i; + + for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) { + int err; + + if (!qdma->q_rx[i].ndesc) + continue; + + err = airoha_qdma_init_rl_config(qdma, i, !!rate, unit_type); + if (err) + return err; + + err = airoha_qdma_set_rl_token_bucket(qdma, i, rate, + bucket_size); + if (err) + return err; + } + + return 0; +} + +static int airoha_tc_matchall_act_validate(struct tc_cls_matchall_offload *f) +{ + const struct flow_action *actions = &f->rule->action; + const struct flow_action_entry *act; + + if (!flow_action_has_entries(actions)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "filter run with no actions"); + return -EINVAL; + } + + if (!flow_offload_has_one_action(actions)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "only once action per filter is supported"); + return -EOPNOTSUPP; + } + + act = &actions->entries[0]; + if (act->id != FLOW_ACTION_POLICE) { + NL_SET_ERR_MSG_MOD(f->common.extack, "unsupported action"); + return -EOPNOTSUPP; + } + + if (act->police.exceed.act_id != FLOW_ACTION_DROP) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "invalid exceed action id"); + return -EOPNOTSUPP; + } + + if (act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "invalid notexceed action id"); + return -EOPNOTSUPP; + } + + if (act->police.notexceed.act_id == FLOW_ACTION_ACCEPT && + !flow_action_is_last_entry(actions, act)) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "action accept must be last"); + return -EOPNOTSUPP; + } + + if (act->police.peakrate_bytes_ps || act->police.avrate || + act->police.overhead || act->police.mtu) { + NL_SET_ERR_MSG_MOD(f->common.extack, + "peakrate/avrate/overhead/mtu unsupported"); + return -EOPNOTSUPP; + } + + return 0; +} + +static int airoha_dev_tc_matchall(struct net_device *dev, + struct tc_cls_matchall_offload *f) +{ + enum trtcm_unit_type unit_type = TRTCM_BYTE_UNIT; + struct airoha_gdm_port *port = netdev_priv(dev); + u32 rate = 0, bucket_size = 0; + + switch (f->command) { + case TC_CLSMATCHALL_REPLACE: { + const struct flow_action_entry *act; + int err; + + err = airoha_tc_matchall_act_validate(f); + if (err) + return err; + + act = &f->rule->action.entries[0]; + if (act->police.rate_pkt_ps) { + rate = act->police.rate_pkt_ps; + bucket_size = act->police.burst_pkt; + unit_type = TRTCM_PACKET_UNIT; + } else { + rate = div_u64(act->police.rate_bytes_ps, 1000); + rate = rate << 3; /* Kbps */ + bucket_size = act->police.burst; + } + fallthrough; + } + case TC_CLSMATCHALL_DESTROY: + return airoha_qdma_set_rx_meter(port, rate, bucket_size, + unit_type); + default: + return -EOPNOTSUPP; + } +} + +static int airoha_dev_setup_tc_block_cb(enum tc_setup_type type, + void *type_data, void *cb_priv) +{ + struct net_device *dev = cb_priv; + + if (!tc_can_offload(dev)) + return -EOPNOTSUPP; + + switch (type) { + case TC_SETUP_CLSFLOWER: + return 
airoha_ppe_setup_tc_block_cb(dev, type_data); + case TC_SETUP_CLSMATCHALL: + return airoha_dev_tc_matchall(dev, type_data); + default: + return -EOPNOTSUPP; + } +} + static int airoha_dev_setup_tc_block(struct airoha_gdm_port *port, struct flow_block_offload *f) { - flow_setup_cb_t *cb = airoha_ppe_setup_tc_block_cb; + flow_setup_cb_t *cb = airoha_dev_setup_tc_block_cb; static LIST_HEAD(block_cb_list); struct flow_block_cb *block_cb; diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index e82abfc1a67bd..da5371bcd1473 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -127,6 +127,11 @@ enum tx_sched_mode { TC_SCH_WRR2, }; +enum trtcm_unit_type { + TRTCM_BYTE_UNIT, + TRTCM_PACKET_UNIT, +}; + enum trtcm_param_type { TRTCM_MISC_MODE, /* meter_en, pps_mode, tick_sel */ TRTCM_TOKEN_RATE_MODE, @@ -554,8 +559,7 @@ bool airoha_is_valid_gdm_port(struct airoha_eth *eth, void airoha_ppe_check_skb(struct airoha_ppe *ppe, struct sk_buff *skb, u16 hash); -int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv); +int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data); int airoha_ppe_init(struct airoha_eth *eth); void airoha_ppe_deinit(struct airoha_eth *eth); struct airoha_foe_entry *airoha_ppe_foe_get_entry(struct airoha_ppe *ppe, diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index d4969c2a03524..6e9787c2843bc 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -967,18 +967,13 @@ static int airoha_ppe_offload_setup(struct airoha_eth *eth) return err; } -int airoha_ppe_setup_tc_block_cb(enum tc_setup_type type, void *type_data, - void *cb_priv) +int airoha_ppe_setup_tc_block_cb(struct net_device *dev, void *type_data) { - struct flow_cls_offload *cls = type_data; - struct net_device *dev = cb_priv; struct airoha_gdm_port *port = netdev_priv(dev); + struct flow_cls_offload *cls = type_data; struct airoha_eth *eth = port->qdma->eth; int err = 0; - if (!tc_can_offload(dev) || type != TC_SETUP_CLSFLOWER) - return -EOPNOTSUPP; - mutex_lock(&flow_offload_mutex); if (!eth->npu) diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h index 8146cde4e8ba3..29c8f046b9910 100644 --- a/drivers/net/ethernet/airoha/airoha_regs.h +++ b/drivers/net/ethernet/airoha/airoha_regs.h @@ -283,6 +283,7 @@ #define PPE_HASH_SEED 0x12345678 #define REG_PPE_DFT_CPORT0(_n) (((_n) ? PPE2_BASE : PPE1_BASE) + 0x248) +#define DFT_CPORT_MASK(_n) GENMASK(3 + ((_n) << 2), ((_n) << 2)) #define REG_PPE_DFT_CPORT1(_n) (((_n) ? PPE2_BASE : PPE1_BASE) + 0x24c) @@ -691,6 +692,12 @@ #define REG_TRTCM_DATA_LOW(_n) ((_n) + 0x8) #define REG_TRTCM_DATA_HIGH(_n) ((_n) + 0xc) +#define RATE_LIMIT_PARAM_RW_MASK BIT(31) +#define RATE_LIMIT_PARAM_RW_DONE_MASK BIT(30) +#define RATE_LIMIT_PARAM_TYPE_MASK GENMASK(29, 28) +#define RATE_LIMIT_METER_GROUP_MASK GENMASK(27, 26) +#define RATE_LIMIT_PARAM_INDEX_MASK GENMASK(23, 16) + #define REG_TXWRR_MODE_CFG 0x1020 #define TWRR_WEIGHT_SCALE_MASK BIT(31) #define TWRR_WEIGHT_BASE_MASK BIT(3) From 9f23d943eb6b55990acc45ea1e130b20a44c76ce Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:18 +0200 Subject: [PATCH 261/770] net: introduce OpenVPN Data Channel Offload (ovpn) OpenVPN is a userspace software existing since around 2005 that allows users to create secure tunnels. 
So far OpenVPN has implemented all operations in userspace, which implies several round trips between kernel and userspace in order to process packets (encapsulate/decapsulate, encrypt/decrypt, rerouting...).

With `ovpn` we intend to move the fast path (data channel) entirely into kernel space and thus improve user-measured throughput over the tunnel.

`ovpn` is implemented as a simple virtual network device driver that can be manipulated by means of the standard RTNL APIs. A device of kind `ovpn` allows only IPv4/6 traffic and can be of type:
* P2P (peer-to-peer): any packet sent over the interface will be encapsulated and transmitted to the other side (typical OpenVPN client or peer-to-peer behaviour);
* P2MP (point-to-multipoint): packets sent over the interface are transmitted to peers based on existing routes (typical OpenVPN server behaviour).

After the interface has been created, OpenVPN in userspace can configure it using a new Netlink API. Specifically, it is possible to manage peers and their keys.

The OpenVPN control channel is multiplexed over the same transport socket by means of OP codes. Anything that is not DATA_V2 (the OpenVPN OP code for data traffic) is sent to userspace and handled there. This way the `ovpn` codebase is kept as compact as possible while focusing on handling data traffic only (fast path). Any OpenVPN control feature (like cipher negotiation, TLS handshake, rekeying, etc.) is still fully handled by the userspace process.

When userspace establishes a new connection with a peer, it first performs the handshake and then passes the socket to the `ovpn` kernel module, which takes ownership. From this moment on `ovpn` will handle data traffic for the new peer. When control packets are received on the link, they are forwarded to userspace through the same transport socket they were received on, as userspace is still listening on it. Some events (like peer deletion) are sent to a Netlink multicast group.

Although it wasn't easy to convince the community, `ovpn` implements only a limited number of the data-channel features supported by the userspace program. Each feature that made it into `ovpn` was carefully vetted to avoid carrying too much legacy along (and to give a clean cut to old and probably-not-so-useful features). Notably, only encryption using AEAD ciphers (specifically ChaCha20-Poly1305 and AES-GCM) was implemented. Supporting any other cipher out there was not deemed useful.

Both UDP and TCP sockets are supported.

As explained above, in P2MP mode OpenVPN uses the main system routing table to decide which packet goes to which peer. This implies that no routing table was re-implemented in the `ovpn` kernel module.

This kernel module can be enabled by selecting the CONFIG_OVPN entry in the networking drivers section.

NOTE: this first patch introduces the very basic framework only. Features are then added patch by patch; however, although each patch compiles and should not break at runtime, the ovpn module is expected to be fully functional only once the whole set has been applied.
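As a usage sketch (assuming the iproute2 counterpart of this series; the exact keyword spelling follows the IFLA_OVPN_MODE enum and may differ), an ovpn device is created and inspected with the standard RTNL tooling:

	# create an ovpn device in point-to-point mode (the default);
	# point-to-multipoint mode is selected analogously
	$ ip link add ovpn0 type ovpn mode p2p
	$ ip -d link show ovpn0

Peer and key management then happens over the new 'ovpn' generic netlink family rather than through RTNL.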
Cc: steffen.klassert@secunet.com Cc: antony.antony@secunet.com Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-1-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- MAINTAINERS | 8 ++++++ drivers/net/Kconfig | 8 ++++++ drivers/net/Makefile | 1 + drivers/net/ovpn/Makefile | 10 ++++++++ drivers/net/ovpn/main.c | 52 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 79 insertions(+) create mode 100644 drivers/net/ovpn/Makefile create mode 100644 drivers/net/ovpn/main.c diff --git a/MAINTAINERS b/MAINTAINERS index 1248443035f43..d0d77f43c5af3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18125,6 +18125,14 @@ F: arch/openrisc/ F: drivers/irqchip/irq-ompic.c F: drivers/irqchip/irq-or1k-* +OPENVPN DATA CHANNEL OFFLOAD +M: Antonio Quartulli +L: openvpn-devel@lists.sourceforge.net (subscribers-only) +L: netdev@vger.kernel.org +S: Supported +T: git https://github.com/OpenVPN/linux-kernel-ovpn.git +F: drivers/net/ovpn/ + OPENVSWITCH M: Aaron Conole M: Eelco Chaudron diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 271520510b5fc..5fbe25ae1e11e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -115,6 +115,14 @@ config WIREGUARD_DEBUG Say N here unless you know what you're doing. +config OVPN + tristate "OpenVPN data channel offload" + depends on NET && INET + depends on IPV6 || !IPV6 + help + This module enhances the performance of the OpenVPN userspace software + by offloading the data channel processing to kernelspace. + config EQUALIZER tristate "EQL (serial line load balancing) support" help diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 75333251a01a8..73bc63ecd65ff 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_IPVLAN) += ipvlan/ obj-$(CONFIG_IPVTAP) += ipvlan/ obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_WIREGUARD) += wireguard/ +obj-$(CONFIG_OVPN) += ovpn/ obj-$(CONFIG_EQUALIZER) += eql.o obj-$(CONFIG_IFB) += ifb.o obj-$(CONFIG_MACSEC) += macsec.o diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile new file mode 100644 index 0000000000000..876800ebaa21a --- /dev/null +++ b/drivers/net/ovpn/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# ovpn -- OpenVPN data channel offload in kernel space +# +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +obj-$(CONFIG_OVPN) := ovpn.o +ovpn-y += main.o diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c new file mode 100644 index 0000000000000..d5d4ac8da6d78 --- /dev/null +++ b/drivers/net/ovpn/main.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: Antonio Quartulli + * James Yonan + */ + +#include +#include +#include + +static int ovpn_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static struct rtnl_link_ops ovpn_link_ops = { + .kind = "ovpn", + .netns_refund = false, + .newlink = ovpn_newlink, + .dellink = unregister_netdevice_queue, +}; + +static int __init ovpn_init(void) +{ + int err = rtnl_link_register(&ovpn_link_ops); + + if (err) { + pr_err("ovpn: can't register rtnl link ops: %d\n", err); + return err; + } + + return 0; +} + +static __exit void ovpn_cleanup(void) +{ + rtnl_link_unregister(&ovpn_link_ops); + + rcu_barrier(); +} + +module_init(ovpn_init); +module_exit(ovpn_cleanup); + +MODULE_DESCRIPTION("OpenVPN data channel offload (ovpn)"); +MODULE_AUTHOR("Antonio Quartulli "); +MODULE_LICENSE("GPL"); From b7a63391aa982295bbb3125e7d4470f51f31ff0f Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:19 +0200 Subject: [PATCH 262/770] ovpn: add basic netlink support This commit introduces basic netlink support with family registration/unregistration functionalities and stub pre/post-doit. More importantly it introduces the YAML uAPI description along with its auto-generated files: - include/uapi/linux/ovpn.h - drivers/net/ovpn/netlink-gen.c - drivers/net/ovpn/netlink-gen.h Reviewed-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-2-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ovpn.yaml | 367 ++++++++++++++++++++++++++ MAINTAINERS | 2 + drivers/net/ovpn/Makefile | 2 + drivers/net/ovpn/main.c | 31 +++ drivers/net/ovpn/main.h | 14 + drivers/net/ovpn/netlink-gen.c | 213 +++++++++++++++ drivers/net/ovpn/netlink-gen.h | 41 +++ drivers/net/ovpn/netlink.c | 160 +++++++++++ drivers/net/ovpn/netlink.h | 15 ++ drivers/net/ovpn/ovpnpriv.h | 21 ++ include/uapi/linux/ovpn.h | 109 ++++++++ 11 files changed, 975 insertions(+) create mode 100644 Documentation/netlink/specs/ovpn.yaml create mode 100644 drivers/net/ovpn/main.h create mode 100644 drivers/net/ovpn/netlink-gen.c create mode 100644 drivers/net/ovpn/netlink-gen.h create mode 100644 drivers/net/ovpn/netlink.c create mode 100644 drivers/net/ovpn/netlink.h create mode 100644 drivers/net/ovpn/ovpnpriv.h create mode 100644 include/uapi/linux/ovpn.h diff --git a/Documentation/netlink/specs/ovpn.yaml b/Documentation/netlink/specs/ovpn.yaml new file mode 100644 index 0000000000000..096c51f0c69a8 --- /dev/null +++ b/Documentation/netlink/specs/ovpn.yaml @@ -0,0 +1,367 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Author: Antonio Quartulli +# +# Copyright (c) 2024-2025, OpenVPN Inc. +# + +name: ovpn + +protocol: genetlink + +doc: Netlink protocol to control OpenVPN network devices + +definitions: + - + type: const + name: nonce-tail-size + value: 8 + - + type: enum + name: cipher-alg + entries: [ none, aes-gcm, chacha20-poly1305 ] + - + type: enum + name: del-peer-reason + entries: + - teardown + - userspace + - expired + - transport-error + - transport-disconnect + - + type: enum + name: key-slot + entries: [ primary, secondary ] + +attribute-sets: + - + name: peer + attributes: + - + name: id + type: u32 + doc: >- + The unique ID of the peer in the device context. 
To be used to identify + peers during operations for a specific device + checks: + max: 0xFFFFFF + - + name: remote-ipv4 + type: u32 + doc: The remote IPv4 address of the peer + byte-order: big-endian + display-hint: ipv4 + - + name: remote-ipv6 + type: binary + doc: The remote IPv6 address of the peer + display-hint: ipv6 + checks: + exact-len: 16 + - + name: remote-ipv6-scope-id + type: u32 + doc: The scope id of the remote IPv6 address of the peer (RFC2553) + - + name: remote-port + type: u16 + doc: The remote port of the peer + byte-order: big-endian + checks: + min: 1 + - + name: socket + type: u32 + doc: The socket to be used to communicate with the peer + - + name: socket-netnsid + type: s32 + doc: The ID of the netns the socket assigned to this peer lives in + - + name: vpn-ipv4 + type: u32 + doc: The IPv4 address assigned to the peer by the server + byte-order: big-endian + display-hint: ipv4 + - + name: vpn-ipv6 + type: binary + doc: The IPv6 address assigned to the peer by the server + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-ipv4 + type: u32 + doc: The local IPv4 to be used to send packets to the peer (UDP only) + byte-order: big-endian + display-hint: ipv4 + - + name: local-ipv6 + type: binary + doc: The local IPv6 to be used to send packets to the peer (UDP only) + display-hint: ipv6 + checks: + exact-len: 16 + - + name: local-port + type: u16 + doc: The local port to be used to send packets to the peer (UDP only) + byte-order: big-endian + checks: + min: 1 + - + name: keepalive-interval + type: u32 + doc: >- + The number of seconds after which a keep alive message is sent to the + peer + - + name: keepalive-timeout + type: u32 + doc: >- + The number of seconds from the last activity after which the peer is + assumed dead + - + name: del-reason + type: u32 + doc: The reason why a peer was deleted + enum: del-peer-reason + - + name: vpn-rx-bytes + type: uint + doc: Number of bytes received over the tunnel + - + name: vpn-tx-bytes + type: uint + doc: Number of bytes transmitted over the tunnel + - + name: vpn-rx-packets + type: uint + doc: Number of packets received over the tunnel + - + name: vpn-tx-packets + type: uint + doc: Number of packets transmitted over the tunnel + - + name: link-rx-bytes + type: uint + doc: Number of bytes received at the transport level + - + name: link-tx-bytes + type: uint + doc: Number of bytes transmitted at the transport level + - + name: link-rx-packets + type: uint + doc: Number of packets received at the transport level + - + name: link-tx-packets + type: uint + doc: Number of packets transmitted at the transport level + - + name: keyconf + attributes: + - + name: peer-id + type: u32 + doc: >- + The unique ID of the peer in the device context. To be used to + identify peers during key operations + checks: + max: 0xFFFFFF + - + name: slot + type: u32 + doc: The slot where the key should be stored + enum: key-slot + - + name: key-id + doc: >- + The unique ID of the key in the peer context. 
Used to fetch the + correct key upon decryption + type: u32 + checks: + max: 7 + - + name: cipher-alg + type: u32 + doc: The cipher to be used when communicating with the peer + enum: cipher-alg + - + name: encrypt-dir + type: nest + doc: Key material for encrypt direction + nested-attributes: keydir + - + name: decrypt-dir + type: nest + doc: Key material for decrypt direction + nested-attributes: keydir + - + name: keydir + attributes: + - + name: cipher-key + type: binary + doc: The actual key to be used by the cipher + checks: + max-len: 256 + - + name: nonce-tail + type: binary + doc: >- + Random nonce to be concatenated to the packet ID, in order to + obtain the actual cipher IV + checks: + exact-len: nonce-tail-size + - + name: ovpn + attributes: + - + name: ifindex + type: u32 + doc: Index of the ovpn interface to operate on + - + name: peer + type: nest + doc: >- + The peer object containing the attributed of interest for the specific + operation + nested-attributes: peer + - + name: keyconf + type: nest + doc: Peer specific cipher configuration + nested-attributes: keyconf + +operations: + list: + - + name: peer-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-set + attribute-set: ovpn + flags: [ admin-perm ] + doc: modify a remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve data about existing remote peers (or a specific one) + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + reply: + attributes: + - peer + dump: + request: + attributes: + - ifindex + reply: + attributes: + - peer + - + name: peer-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete existing remote peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - peer + - + name: peer-del-ntf + doc: Notification about a peer being deleted + notify: peer-get + mcgrp: peers + + - + name: key-new + attribute-set: ovpn + flags: [ admin-perm ] + doc: Add a cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-get + attribute-set: ovpn + flags: [ admin-perm ] + doc: Retrieve non-sensitive data about peer key and cipher + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + reply: + attributes: + - keyconf + - + name: key-swap + attribute-set: ovpn + flags: [ admin-perm ] + doc: Swap primary and secondary session keys for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + - + name: key-swap-ntf + notify: key-get + doc: >- + Notification about key having exhausted its IV space and requiring + renegotiation + mcgrp: peers + - + name: key-del + attribute-set: ovpn + flags: [ admin-perm ] + doc: Delete cipher key for a specific peer + do: + pre: ovpn-nl-pre-doit + post: ovpn-nl-post-doit + request: + attributes: + - ifindex + - keyconf + +mcast-groups: + list: + - + name: peers diff --git a/MAINTAINERS b/MAINTAINERS index d0d77f43c5af3..c50e87ef72884 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18131,7 +18131,9 @@ L: openvpn-devel@lists.sourceforge.net (subscribers-only) L: netdev@vger.kernel.org S: Supported T: git 
https://github.com/OpenVPN/linux-kernel-ovpn.git +F: Documentation/netlink/specs/ovpn.yaml F: drivers/net/ovpn/ +F: include/uapi/linux/ovpn.h OPENVSWITCH M: Aaron Conole diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 876800ebaa21a..75ac62bba0293 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -8,3 +8,5 @@ obj-$(CONFIG_OVPN) := ovpn.o ovpn-y += main.o +ovpn-y += netlink.o +ovpn-y += netlink-gen.o diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index d5d4ac8da6d78..719cac3db6ef4 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -7,9 +7,29 @@ * James Yonan */ +#include #include #include #include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" + +static const struct net_device_ops ovpn_netdev_ops = { +}; + +/** + * ovpn_dev_is_valid - check if the netdevice is of type 'ovpn' + * @dev: the interface to check + * + * Return: whether the netdevice is of type 'ovpn' + */ +bool ovpn_dev_is_valid(const struct net_device *dev) +{ + return dev->netdev_ops == &ovpn_netdev_ops; +} static int ovpn_newlink(struct net_device *dev, struct rtnl_newlink_params *params, @@ -34,11 +54,22 @@ static int __init ovpn_init(void) return err; } + err = ovpn_nl_register(); + if (err) { + pr_err("ovpn: can't register netlink family: %d\n", err); + goto unreg_rtnl; + } + return 0; + +unreg_rtnl: + rtnl_link_unregister(&ovpn_link_ops); + return err; } static __exit void ovpn_cleanup(void) { + ovpn_nl_unregister(); rtnl_link_unregister(&ovpn_link_ops); rcu_barrier(); diff --git a/drivers/net/ovpn/main.h b/drivers/net/ovpn/main.h new file mode 100644 index 0000000000000..017cd01007659 --- /dev/null +++ b/drivers/net/ovpn/main.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_MAIN_H_ +#define _NET_OVPN_MAIN_H_ + +bool ovpn_dev_is_valid(const struct net_device *dev); + +#endif /* _NET_OVPN_MAIN_H_ */ diff --git a/drivers/net/ovpn/netlink-gen.c b/drivers/net/ovpn/netlink-gen.c new file mode 100644 index 0000000000000..58e1a4342378e --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "netlink-gen.h" + +#include + +/* Integer value ranges */ +static const struct netlink_range_validation ovpn_a_peer_id_range = { + .max = 16777215ULL, +}; + +static const struct netlink_range_validation ovpn_a_keyconf_peer_id_range = { + .max = 16777215ULL, +}; + +/* Common nested types */ +const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1] = { + [OVPN_A_KEYCONF_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_keyconf_peer_id_range), + [OVPN_A_KEYCONF_SLOT] = NLA_POLICY_MAX(NLA_U32, 1), + [OVPN_A_KEYCONF_KEY_ID] = NLA_POLICY_MAX(NLA_U32, 7), + [OVPN_A_KEYCONF_CIPHER_ALG] = NLA_POLICY_MAX(NLA_U32, 2), + [OVPN_A_KEYCONF_ENCRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), + [OVPN_A_KEYCONF_DECRYPT_DIR] = NLA_POLICY_NESTED(ovpn_keydir_nl_policy), +}; + +const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1] = { + [OVPN_A_KEYDIR_CIPHER_KEY] = NLA_POLICY_MAX_LEN(256), + [OVPN_A_KEYDIR_NONCE_TAIL] = NLA_POLICY_EXACT_LEN(OVPN_NONCE_TAIL_SIZE), +}; + +const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1] = { + [OVPN_A_PEER_ID] = NLA_POLICY_FULL_RANGE(NLA_U32, &ovpn_a_peer_id_range), + [OVPN_A_PEER_REMOTE_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_REMOTE_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID] = { .type = NLA_U32, }, + [OVPN_A_PEER_REMOTE_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_SOCKET] = { .type = NLA_U32, }, + [OVPN_A_PEER_SOCKET_NETNSID] = { .type = NLA_S32, }, + [OVPN_A_PEER_VPN_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_VPN_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_IPV4] = { .type = NLA_BE32, }, + [OVPN_A_PEER_LOCAL_IPV6] = NLA_POLICY_EXACT_LEN(16), + [OVPN_A_PEER_LOCAL_PORT] = NLA_POLICY_MIN(NLA_BE16, 1), + [OVPN_A_PEER_KEEPALIVE_INTERVAL] = { .type = NLA_U32, }, + [OVPN_A_PEER_KEEPALIVE_TIMEOUT] = { .type = NLA_U32, }, + [OVPN_A_PEER_DEL_REASON] = NLA_POLICY_MAX(NLA_U32, 4), + [OVPN_A_PEER_VPN_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_VPN_TX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_BYTES] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_RX_PACKETS] = { .type = NLA_UINT, }, + [OVPN_A_PEER_LINK_TX_PACKETS] = { .type = NLA_UINT, }, +}; + +/* OVPN_CMD_PEER_NEW - do */ +static const struct nla_policy ovpn_peer_new_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_SET - do */ +static const struct nla_policy ovpn_peer_set_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - do */ +static const struct nla_policy ovpn_peer_get_do_nl_policy[OVPN_A_PEER + 1] = 
{ + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_PEER_GET - dump */ +static const struct nla_policy ovpn_peer_get_dump_nl_policy[OVPN_A_IFINDEX + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, +}; + +/* OVPN_CMD_PEER_DEL - do */ +static const struct nla_policy ovpn_peer_del_nl_policy[OVPN_A_PEER + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_PEER] = NLA_POLICY_NESTED(ovpn_peer_nl_policy), +}; + +/* OVPN_CMD_KEY_NEW - do */ +static const struct nla_policy ovpn_key_new_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_GET - do */ +static const struct nla_policy ovpn_key_get_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_SWAP - do */ +static const struct nla_policy ovpn_key_swap_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* OVPN_CMD_KEY_DEL - do */ +static const struct nla_policy ovpn_key_del_nl_policy[OVPN_A_KEYCONF + 1] = { + [OVPN_A_IFINDEX] = { .type = NLA_U32, }, + [OVPN_A_KEYCONF] = NLA_POLICY_NESTED(ovpn_keyconf_nl_policy), +}; + +/* Ops table for ovpn */ +static const struct genl_split_ops ovpn_nl_ops[] = { + { + .cmd = OVPN_CMD_PEER_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_new_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_SET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_set_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_set_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_get_do_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_PEER_GET, + .dumpit = ovpn_nl_peer_get_dumpit, + .policy = ovpn_peer_get_dump_nl_policy, + .maxattr = OVPN_A_IFINDEX, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP, + }, + { + .cmd = OVPN_CMD_PEER_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_peer_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_peer_del_nl_policy, + .maxattr = OVPN_A_PEER, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_NEW, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_new_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_new_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_GET, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_get_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_get_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_SWAP, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_swap_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_swap_nl_policy, + .maxattr = OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = OVPN_CMD_KEY_DEL, + .pre_doit = ovpn_nl_pre_doit, + .doit = ovpn_nl_key_del_doit, + .post_doit = ovpn_nl_post_doit, + .policy = ovpn_key_del_nl_policy, + .maxattr = 
OVPN_A_KEYCONF, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group ovpn_nl_mcgrps[] = { + [OVPN_NLGRP_PEERS] = { "peers", }, +}; + +struct genl_family ovpn_nl_family __ro_after_init = { + .name = OVPN_FAMILY_NAME, + .version = OVPN_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = ovpn_nl_ops, + .n_split_ops = ARRAY_SIZE(ovpn_nl_ops), + .mcgrps = ovpn_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ovpn_nl_mcgrps), +}; diff --git a/drivers/net/ovpn/netlink-gen.h b/drivers/net/ovpn/netlink-gen.h new file mode 100644 index 0000000000000..66a4e4a0a055b --- /dev/null +++ b/drivers/net/ovpn/netlink-gen.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_OVPN_GEN_H +#define _LINUX_OVPN_GEN_H + +#include +#include + +#include + +/* Common nested types */ +extern const struct nla_policy ovpn_keyconf_nl_policy[OVPN_A_KEYCONF_DECRYPT_DIR + 1]; +extern const struct nla_policy ovpn_keydir_nl_policy[OVPN_A_KEYDIR_NONCE_TAIL + 1]; +extern const struct nla_policy ovpn_peer_nl_policy[OVPN_A_PEER_LINK_TX_PACKETS + 1]; + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info); +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + OVPN_NLGRP_PEERS, +}; + +extern struct genl_family ovpn_nl_family; + +#endif /* _LINUX_OVPN_GEN_H */ diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c new file mode 100644 index 0000000000000..8d267d4c82283 --- /dev/null +++ b/drivers/net/ovpn/netlink.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: Antonio Quartulli + */ + +#include +#include + +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "netlink.h" +#include "netlink-gen.h" + +MODULE_ALIAS_GENL_FAMILY(OVPN_FAMILY_NAME); + +/** + * ovpn_get_dev_from_attrs - retrieve the ovpn private data from the netdevice + * a netlink message is targeting + * @net: network namespace where to look for the interface + * @info: generic netlink info from the user request + * @tracker: tracker object to be used for the netdev reference acquisition + * + * Return: the ovpn private data, if found, or an error otherwise + */ +static struct ovpn_priv * +ovpn_get_dev_from_attrs(struct net *net, const struct genl_info *info, + netdevice_tracker *tracker) +{ + struct ovpn_priv *ovpn; + struct net_device *dev; + int ifindex; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_IFINDEX)) + return ERR_PTR(-EINVAL); + + ifindex = nla_get_u32(info->attrs[OVPN_A_IFINDEX]); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "ifindex does not match any interface"); + return ERR_PTR(-ENODEV); + } + + if (!ovpn_dev_is_valid(dev)) { + rcu_read_unlock(); + NL_SET_ERR_MSG_MOD(info->extack, + "specified interface is not ovpn"); + NL_SET_BAD_ATTR(info->extack, info->attrs[OVPN_A_IFINDEX]); + return ERR_PTR(-EINVAL); + } + + ovpn = netdev_priv(dev); + netdev_hold(dev, tracker, GFP_ATOMIC); + rcu_read_unlock(); + + return ovpn; +} + +int ovpn_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = ovpn_get_dev_from_attrs(genl_info_net(info), + info, tracker); + + if (IS_ERR(ovpn)) + return PTR_ERR(ovpn); + + info->user_ptr[0] = ovpn; + + return 0; +} + +void ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + netdevice_tracker *tracker = (netdevice_tracker *)&info->user_ptr[1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + + if (ovpn) + netdev_put(ovpn->dev, tracker); +} + +int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) +{ + return -EOPNOTSUPP; +} + +/** + * ovpn_nl_register - perform any needed registration in the NL subsustem + * + * Return: 0 on success, a negative error code otherwise + */ +int __init ovpn_nl_register(void) +{ + int ret = genl_register_family(&ovpn_nl_family); + + if (ret) { + pr_err("ovpn: genl_register_family failed: %d\n", ret); + return ret; + } + + return 0; +} + +/** + * ovpn_nl_unregister - undo any module wide netlink registration + */ +void ovpn_nl_unregister(void) +{ + genl_unregister_family(&ovpn_nl_family); +} diff 
--git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h new file mode 100644 index 0000000000000..0d6c34e17082c --- /dev/null +++ b/drivers/net/ovpn/netlink.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#ifndef _NET_OVPN_NETLINK_H_ +#define _NET_OVPN_NETLINK_H_ + +int ovpn_nl_register(void); +void ovpn_nl_unregister(void); + +#endif /* _NET_OVPN_NETLINK_H_ */ diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h new file mode 100644 index 0000000000000..f9322536b06d6 --- /dev/null +++ b/drivers/net/ovpn/ovpnpriv.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNSTRUCT_H_ +#define _NET_OVPN_OVPNSTRUCT_H_ + +/** + * struct ovpn_priv - per ovpn interface state + * @dev: the actual netdev representing the tunnel + */ +struct ovpn_priv { + struct net_device *dev; +}; + +#endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h new file mode 100644 index 0000000000000..680d1522dc87c --- /dev/null +++ b/include/uapi/linux/ovpn.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ovpn.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_OVPN_H +#define _UAPI_LINUX_OVPN_H + +#define OVPN_FAMILY_NAME "ovpn" +#define OVPN_FAMILY_VERSION 1 + +#define OVPN_NONCE_TAIL_SIZE 8 + +enum ovpn_cipher_alg { + OVPN_CIPHER_ALG_NONE, + OVPN_CIPHER_ALG_AES_GCM, + OVPN_CIPHER_ALG_CHACHA20_POLY1305, +}; + +enum ovpn_del_peer_reason { + OVPN_DEL_PEER_REASON_TEARDOWN, + OVPN_DEL_PEER_REASON_USERSPACE, + OVPN_DEL_PEER_REASON_EXPIRED, + OVPN_DEL_PEER_REASON_TRANSPORT_ERROR, + OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT, +}; + +enum ovpn_key_slot { + OVPN_KEY_SLOT_PRIMARY, + OVPN_KEY_SLOT_SECONDARY, +}; + +enum { + OVPN_A_PEER_ID = 1, + OVPN_A_PEER_REMOTE_IPV4, + OVPN_A_PEER_REMOTE_IPV6, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + OVPN_A_PEER_REMOTE_PORT, + OVPN_A_PEER_SOCKET, + OVPN_A_PEER_SOCKET_NETNSID, + OVPN_A_PEER_VPN_IPV4, + OVPN_A_PEER_VPN_IPV6, + OVPN_A_PEER_LOCAL_IPV4, + OVPN_A_PEER_LOCAL_IPV6, + OVPN_A_PEER_LOCAL_PORT, + OVPN_A_PEER_KEEPALIVE_INTERVAL, + OVPN_A_PEER_KEEPALIVE_TIMEOUT, + OVPN_A_PEER_DEL_REASON, + OVPN_A_PEER_VPN_RX_BYTES, + OVPN_A_PEER_VPN_TX_BYTES, + OVPN_A_PEER_VPN_RX_PACKETS, + OVPN_A_PEER_VPN_TX_PACKETS, + OVPN_A_PEER_LINK_RX_BYTES, + OVPN_A_PEER_LINK_TX_BYTES, + OVPN_A_PEER_LINK_RX_PACKETS, + OVPN_A_PEER_LINK_TX_PACKETS, + + __OVPN_A_PEER_MAX, + OVPN_A_PEER_MAX = (__OVPN_A_PEER_MAX - 1) +}; + +enum { + OVPN_A_KEYCONF_PEER_ID = 1, + OVPN_A_KEYCONF_SLOT, + OVPN_A_KEYCONF_KEY_ID, + OVPN_A_KEYCONF_CIPHER_ALG, + OVPN_A_KEYCONF_ENCRYPT_DIR, + OVPN_A_KEYCONF_DECRYPT_DIR, + + __OVPN_A_KEYCONF_MAX, + OVPN_A_KEYCONF_MAX = (__OVPN_A_KEYCONF_MAX - 1) +}; + +enum { + OVPN_A_KEYDIR_CIPHER_KEY = 1, + OVPN_A_KEYDIR_NONCE_TAIL, + + __OVPN_A_KEYDIR_MAX, + OVPN_A_KEYDIR_MAX = (__OVPN_A_KEYDIR_MAX - 1) +}; + +enum { + OVPN_A_IFINDEX = 1, + OVPN_A_PEER, + OVPN_A_KEYCONF, + + __OVPN_A_MAX, + OVPN_A_MAX = (__OVPN_A_MAX - 1) +}; + +enum { + OVPN_CMD_PEER_NEW = 1, + OVPN_CMD_PEER_SET, + OVPN_CMD_PEER_GET, + OVPN_CMD_PEER_DEL, + OVPN_CMD_PEER_DEL_NTF, + OVPN_CMD_KEY_NEW, + OVPN_CMD_KEY_GET, + OVPN_CMD_KEY_SWAP, 
+ OVPN_CMD_KEY_SWAP_NTF, + OVPN_CMD_KEY_DEL, + + __OVPN_CMD_MAX, + OVPN_CMD_MAX = (__OVPN_CMD_MAX - 1) +}; + +#define OVPN_MCGRP_PEERS "peers" + +#endif /* _UAPI_LINUX_OVPN_H */ From c2d950c4672a012ea9765c15a389cdcdf919f652 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:20 +0200 Subject: [PATCH 263/770] ovpn: add basic interface creation/destruction/management routines Add basic infrastructure for handling ovpn interfaces. Tested-by: Donald Hunter Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-3-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/rt-link.yaml | 16 +++++ drivers/net/ovpn/Makefile | 1 + drivers/net/ovpn/io.c | 22 +++++++ drivers/net/ovpn/io.h | 24 +++++++ drivers/net/ovpn/main.c | 82 +++++++++++++++++++++++- drivers/net/ovpn/ovpnpriv.h | 5 ++ drivers/net/ovpn/proto.h | 38 +++++++++++ include/uapi/linux/if_link.h | 15 +++++ 8 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ovpn/io.c create mode 100644 drivers/net/ovpn/io.h create mode 100644 drivers/net/ovpn/proto.h diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 31238455f8e9d..a50d9d7d882e7 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -938,6 +938,12 @@ definitions: entries: - name: none - name: default + - + name: ovpn-mode + type: enum + entries: + - p2p + - mp attribute-sets: - @@ -2272,6 +2278,13 @@ attribute-sets: - name: tailroom type: u16 + - + name: linkinfo-ovpn-attrs + attributes: + - + name: mode + type: u8 + enum: ovpn-mode sub-messages: - @@ -2322,6 +2335,9 @@ sub-messages: - value: netkit attribute-set: linkinfo-netkit-attrs + - + value: ovpn + attribute-set: linkinfo-ovpn-attrs - name: linkinfo-member-data-msg formats: diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 75ac62bba0293..0e5f686672fb5 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -8,5 +8,6 @@ obj-$(CONFIG_OVPN) := ovpn.o ovpn-y += main.o +ovpn-y += io.o ovpn-y += netlink.o ovpn-y += netlink-gen.o diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c new file mode 100644 index 0000000000000..4b71c38165d7a --- /dev/null +++ b/drivers/net/ovpn/io.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include + +#include "io.h" + +/* Send user data to the network + */ +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + skb_tx_error(skb); + kfree_skb(skb); + return NET_XMIT_DROP; +} diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h new file mode 100644 index 0000000000000..afea5f81f5628 --- /dev/null +++ b/drivers/net/ovpn/io.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. 
+ * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPN_H_ +#define _NET_OVPN_OVPN_H_ + +/* DATA_V2 header size with AEAD encryption */ +#define OVPN_HEAD_ROOM (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE + \ + 16 /* AEAD TAG length */ + \ + max(sizeof(struct udphdr), sizeof(struct tcphdr)) +\ + max(sizeof(struct ipv6hdr), sizeof(struct iphdr))) + +/* max padding required by encryption */ +#define OVPN_MAX_PADDING 16 + +netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); + +#endif /* _NET_OVPN_OVPN_H_ */ diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 719cac3db6ef4..ea7dad374c008 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -10,14 +10,28 @@ #include #include #include +#include +#include #include -#include +#include #include "ovpnpriv.h" #include "main.h" #include "netlink.h" +#include "io.h" +#include "proto.h" static const struct net_device_ops ovpn_netdev_ops = { + .ndo_start_xmit = ovpn_net_xmit, +}; + +static const struct device_type ovpn_type = { + .name = OVPN_FAMILY_NAME, +}; + +static const struct nla_policy ovpn_policy[IFLA_OVPN_MAX + 1] = { + [IFLA_OVPN_MODE] = NLA_POLICY_RANGE(NLA_U8, OVPN_MODE_P2P, + OVPN_MODE_MP), }; /** @@ -31,18 +45,82 @@ bool ovpn_dev_is_valid(const struct net_device *dev) return dev->netdev_ops == &ovpn_netdev_ops; } +static void ovpn_setup(struct net_device *dev) +{ + netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO | + NETIF_F_GSO_SOFTWARE | NETIF_F_HIGHDMA; + + dev->needs_free_netdev = true; + + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; + + dev->netdev_ops = &ovpn_netdev_ops; + + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->mtu = ETH_DATA_LEN - OVPN_HEAD_ROOM; + dev->min_mtu = IPV4_MIN_MTU; + dev->max_mtu = IP_MAX_MTU - OVPN_HEAD_ROOM; + + dev->type = ARPHRD_NONE; + dev->flags = IFF_POINTOPOINT | IFF_NOARP; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->lltx = true; + dev->features |= feat; + dev->hw_features |= feat; + dev->hw_enc_features |= feat; + + dev->needed_headroom = ALIGN(OVPN_HEAD_ROOM, 4); + dev->needed_tailroom = OVPN_MAX_PADDING; + + SET_NETDEV_DEVTYPE(dev, &ovpn_type); +} + static int ovpn_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - return -EOPNOTSUPP; + struct ovpn_priv *ovpn = netdev_priv(dev); + struct nlattr **data = params->data; + enum ovpn_mode mode = OVPN_MODE_P2P; + + if (data && data[IFLA_OVPN_MODE]) { + mode = nla_get_u8(data[IFLA_OVPN_MODE]); + netdev_dbg(dev, "setting device mode: %u\n", mode); + } + + ovpn->dev = dev; + ovpn->mode = mode; + + /* turn carrier explicitly off after registration, this way state is + * clearly defined + */ + netif_carrier_off(dev); + + return register_netdevice(dev); +} + +static int ovpn_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + if (nla_put_u8(skb, IFLA_OVPN_MODE, ovpn->mode)) + return -EMSGSIZE; + + return 0; } static struct rtnl_link_ops ovpn_link_ops = { .kind = "ovpn", .netns_refund = false, + .priv_size = sizeof(struct ovpn_priv), + .setup = ovpn_setup, + .policy = ovpn_policy, + .maxtype = IFLA_OVPN_MAX, .newlink = ovpn_newlink, .dellink = unregister_netdevice_queue, + .fill_info = ovpn_fill_info, }; static int __init ovpn_init(void) diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index f9322536b06d6..b6b644668d466 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -10,12 +10,17 @@ #ifndef _NET_OVPN_OVPNSTRUCT_H_ 
 #define _NET_OVPN_OVPNSTRUCT_H_
 
+#include
+#include
+
 /**
  * struct ovpn_priv - per ovpn interface state
  * @dev: the actual netdev representing the tunnel
+ * @mode: device operation mode (i.e. p2p, mp, ..)
  */
 struct ovpn_priv {
 	struct net_device *dev;
+	enum ovpn_mode mode;
 };
 
 #endif /* _NET_OVPN_OVPNSTRUCT_H_ */
diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h
new file mode 100644
index 0000000000000..5f95a78bebd37
--- /dev/null
+++ b/drivers/net/ovpn/proto.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2020-2025 OpenVPN, Inc.
+ *
+ * Author:	Antonio Quartulli
+ *		James Yonan
+ */
+
+#ifndef _NET_OVPN_PROTO_H_
+#define _NET_OVPN_PROTO_H_
+
+/* When the OpenVPN protocol is run in AEAD mode, use
+ * the OpenVPN packet ID as the AEAD nonce:
+ *
+ *    00000005 521c3b01 4308c041
+ *    [seq # ] [  nonce_tail   ]
+ *    [     12-byte full IV    ] -> OVPN_NONCE_SIZE
+ *    [4-bytes                   -> OVPN_NONCE_WIRE_SIZE
+ *    on wire]
+ */
+
+/* nonce size (96bits) as required by AEAD ciphers */
+#define OVPN_NONCE_SIZE		12
+/* last 8 bytes of AEAD nonce: provided by userspace and usually derived
+ * from key material generated during TLS handshake
+ */
+#define OVPN_NONCE_TAIL_SIZE	8
+
+/* OpenVPN nonce size reduced by 8-byte nonce tail -- this is the
+ * size of the AEAD Associated Data (AD) sent over the wire
+ * and is normally the head of the IV
+ */
+#define OVPN_NONCE_WIRE_SIZE	(OVPN_NONCE_SIZE - OVPN_NONCE_TAIL_SIZE)
+
+#define OVPN_OPCODE_SIZE	4 /* DATA_V2 opcode size */
+
+#endif /* _NET_OVPN_PROTO_H_ */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 318386cc5b0d1..3ad2d5d980347 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1986,4 +1986,19 @@ enum {
 
 #define IFLA_DSA_MAX	(__IFLA_DSA_MAX - 1)
 
+/* OVPN section */
+
+enum ovpn_mode {
+	OVPN_MODE_P2P,
+	OVPN_MODE_MP,
+};
+
+enum {
+	IFLA_OVPN_UNSPEC,
+	IFLA_OVPN_MODE,
+	__IFLA_OVPN_MAX,
+};
+
+#define IFLA_OVPN_MAX	(__IFLA_OVPN_MAX - 1)
+
 #endif /* _UAPI_LINUX_IF_LINK_H */

From 8327a3baa9b0561a410d00b209aae621b5a61b1c Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:21 +0200
Subject: [PATCH 264/770] ovpn: keep carrier always on for MP interfaces

An ovpn interface configured in MP mode will keep carrier always
on and let the user decide when to bring it administratively up and
down.

This way an MP node (i.e. a server) will keep its interface always
up and running, even when no peer is connected.

Signed-off-by: Antonio Quartulli
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-4-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca
Tested-by: Oleksandr Natalenko
Signed-off-by: Paolo Abeni
---
 drivers/net/ovpn/main.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c
index ea7dad374c008..fa1dd84be2406 100644
--- a/drivers/net/ovpn/main.c
+++ b/drivers/net/ovpn/main.c
@@ -93,10 +93,18 @@ static int ovpn_newlink(struct net_device *dev,
 	ovpn->dev = dev;
 	ovpn->mode = mode;
 
-	/* turn carrier explicitly off after registration, this way state is
-	 * clearly defined
+	/* Set carrier explicitly after registration, this way state is
+	 * clearly defined.
+	 *
+	 * In case of MP interfaces we keep the carrier always on.
+	 *
+	 * Carrier for P2P interfaces is initially off and it is then
+	 * switched on and off when the remote peer is added or deleted.
 	 */
-	netif_carrier_off(dev);
+	if (ovpn->mode == OVPN_MODE_MP)
+		netif_carrier_on(dev);
+	else
+		netif_carrier_off(dev);
 
 	return register_netdevice(dev);
 }

From 80747caef33d77f5c1b3d24644e6d7dae69066b5 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:22 +0200
Subject: [PATCH 265/770] ovpn: introduce the ovpn_peer object

An ovpn_peer object holds the whole status of a remote peer
(regardless of whether it is a server or a client). This includes
status for crypto, tx/rx buffers, napi, etc.

Only support for one peer is introduced (P2P mode).
Multi-peer support is introduced with a later patch.

Along with the ovpn_peer, the ovpn_bind object is also introduced,
as the two are strictly related. An ovpn_bind object wraps a
sockaddr representing the local coordinates being used to talk to
a specific peer.

Signed-off-by: Antonio Quartulli
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-5-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca
Tested-by: Oleksandr Natalenko
Signed-off-by: Paolo Abeni
---
 drivers/net/Kconfig         |   1 +
 drivers/net/ovpn/Makefile   |   2 +
 drivers/net/ovpn/bind.c     |  58 +++++
 drivers/net/ovpn/bind.h     | 101 +++++++++
 drivers/net/ovpn/main.c     |  14 +-
 drivers/net/ovpn/ovpnpriv.h |   4 +
 drivers/net/ovpn/peer.c     | 411 ++++++++++++++++++++++++++++++++++++
 drivers/net/ovpn/peer.h     |  80 +++++++
 8 files changed, 670 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ovpn/bind.c
 create mode 100644 drivers/net/ovpn/bind.h
 create mode 100644 drivers/net/ovpn/peer.c
 create mode 100644 drivers/net/ovpn/peer.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5fbe25ae1e11e..2806fcc22a2db 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -119,6 +119,7 @@ config OVPN
 	tristate "OpenVPN data channel offload"
 	depends on NET && INET
 	depends on IPV6 || !IPV6
+	select DST_CACHE
 	help
 	  This module enhances the performance of the OpenVPN userspace
 	  software by offloading the data channel processing to kernelspace.
diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile
index 0e5f686672fb5..618328ae33886 100644
--- a/drivers/net/ovpn/Makefile
+++ b/drivers/net/ovpn/Makefile
@@ -7,7 +7,9 @@
 # Author:	Antonio Quartulli
 
 obj-$(CONFIG_OVPN) := ovpn.o
+ovpn-y += bind.o
 ovpn-y += main.o
 ovpn-y += io.o
 ovpn-y += netlink.o
 ovpn-y += netlink-gen.o
+ovpn-y += peer.o
diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c
new file mode 100644
index 0000000000000..d4a1aeed12c99
--- /dev/null
+++ b/drivers/net/ovpn/bind.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2012-2025 OpenVPN, Inc.
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ */
+
+#include
+#include
+
+#include "ovpnpriv.h"
+#include "bind.h"
+#include "peer.h"
+
+/**
+ * ovpn_bind_from_sockaddr - create a binding for the given sockaddr
+ * @ss: the sockaddr to wrap
+ *
+ * Return: a new bind wrapping the passed sockaddr on success, or an ERR_PTR
+ * otherwise
+ */
+struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *ss)
+{
+	struct ovpn_bind *bind;
+	size_t sa_len;
+
+	if (ss->ss_family == AF_INET)
+		sa_len = sizeof(struct sockaddr_in);
+	else if (ss->ss_family == AF_INET6)
+		sa_len = sizeof(struct sockaddr_in6);
+	else
+		return ERR_PTR(-EAFNOSUPPORT);
+
+	bind = kzalloc(sizeof(*bind), GFP_ATOMIC);
+	if (unlikely(!bind))
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&bind->remote, ss, sa_len);
+
+	return bind;
+}
+
+/**
+ * ovpn_bind_reset - assign new binding to peer
+ * @peer: the peer whose binding has to be replaced
+ * @new: the new bind to assign
+ */
+void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new)
+{
+	struct ovpn_bind *old;
+
+	spin_lock_bh(&peer->lock);
+	old = rcu_replace_pointer(peer->bind, new, true);
+	spin_unlock_bh(&peer->lock);
+
+	kfree_rcu(old, rcu);
+}
diff --git a/drivers/net/ovpn/bind.h b/drivers/net/ovpn/bind.h
new file mode 100644
index 0000000000000..4e0b8398bfd9e
--- /dev/null
+++ b/drivers/net/ovpn/bind.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2012-2025 OpenVPN, Inc.
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ */
+
+#ifndef _NET_OVPN_OVPNBIND_H_
+#define _NET_OVPN_OVPNBIND_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct ovpn_peer;
+
+/**
+ * union ovpn_sockaddr - basic transport layer address
+ * @in4: IPv4 address
+ * @in6: IPv6 address
+ */
+union ovpn_sockaddr {
+	struct sockaddr_in in4;
+	struct sockaddr_in6 in6;
+};
+
+/**
+ * struct ovpn_bind - remote peer binding
+ * @remote: the remote peer sockaddr
+ * @local: local endpoint used to talk to the peer
+ * @local.ipv4: local IPv4 used to talk to the peer
+ * @local.ipv6: local IPv6 used to talk to the peer
+ * @rcu: used to schedule RCU cleanup job
+ */
+struct ovpn_bind {
+	union ovpn_sockaddr remote; /* remote sockaddr */
+
+	union {
+		struct in_addr ipv4;
+		struct in6_addr ipv6;
+	} local;
+
+	struct rcu_head rcu;
+};
+
+/**
+ * ovpn_bind_skb_src_match - match packet source with binding
+ * @bind: the binding to match
+ * @skb: the packet to match
+ *
+ * Return: true if the packet source matches the remote peer sockaddr
+ * in the binding
+ */
+static inline bool ovpn_bind_skb_src_match(const struct ovpn_bind *bind,
+					   const struct sk_buff *skb)
+{
+	const union ovpn_sockaddr *remote;
+
+	if (unlikely(!bind))
+		return false;
+
+	remote = &bind->remote;
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (unlikely(remote->in4.sin_family != AF_INET))
+			return false;
+
+		if (unlikely(remote->in4.sin_addr.s_addr != ip_hdr(skb)->saddr))
+			return false;
+
+		if (unlikely(remote->in4.sin_port != udp_hdr(skb)->source))
+			return false;
+		break;
+	case htons(ETH_P_IPV6):
+		if (unlikely(remote->in6.sin6_family != AF_INET6))
+			return false;
+
+		if (unlikely(!ipv6_addr_equal(&remote->in6.sin6_addr,
+					      &ipv6_hdr(skb)->saddr)))
+			return false;
+
+		if (unlikely(remote->in6.sin6_port != udp_hdr(skb)->source))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *sa);
+void ovpn_bind_reset(struct ovpn_peer *peer,
struct ovpn_bind *bind); + +#endif /* _NET_OVPN_OVPNBIND_H_ */ diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index fa1dd84be2406..3c2b6a7df2726 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -19,6 +19,7 @@ #include "main.h" #include "netlink.h" #include "io.h" +#include "peer.h" #include "proto.h" static const struct net_device_ops ovpn_netdev_ops = { @@ -92,6 +93,7 @@ static int ovpn_newlink(struct net_device *dev, ovpn->dev = dev; ovpn->mode = mode; + spin_lock_init(&ovpn->lock); /* Set carrier explicitly after registration, this way state is * clearly defined. @@ -109,6 +111,16 @@ static int ovpn_newlink(struct net_device *dev, return register_netdevice(dev); } +static void ovpn_dellink(struct net_device *dev, struct list_head *head) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + if (ovpn->mode == OVPN_MODE_P2P) + ovpn_peer_release_p2p(ovpn, OVPN_DEL_PEER_REASON_TEARDOWN); + + unregister_netdevice_queue(dev, head); +} + static int ovpn_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ovpn_priv *ovpn = netdev_priv(dev); @@ -127,7 +139,7 @@ static struct rtnl_link_ops ovpn_link_ops = { .policy = ovpn_policy, .maxtype = IFLA_OVPN_MAX, .newlink = ovpn_newlink, - .dellink = unregister_netdevice_queue, + .dellink = ovpn_dellink, .fill_info = ovpn_fill_info, }; diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index b6b644668d466..a398f9da2e095 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -17,10 +17,14 @@ * struct ovpn_priv - per ovpn interface state * @dev: the actual netdev representing the tunnel * @mode: device operation mode (i.e. p2p, mp, ..) + * @lock: protect this object + * @peer: in P2P mode, this is the only remote peer */ struct ovpn_priv { struct net_device *dev; enum ovpn_mode mode; + spinlock_t lock; /* protect writing to the ovpn_priv object */ + struct ovpn_peer __rcu *peer; }; #endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c new file mode 100644 index 0000000000000..338069c99248f --- /dev/null +++ b/drivers/net/ovpn/peer.c @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include + +#include "ovpnpriv.h" +#include "bind.h" +#include "io.h" +#include "main.h" +#include "netlink.h" +#include "peer.h" + +static void unlock_ovpn(struct ovpn_priv *ovpn, + struct llist_head *release_list) + __releases(&ovpn->lock) +{ + struct ovpn_peer *peer; + + spin_unlock_bh(&ovpn->lock); + + llist_for_each_entry(peer, release_list->first, release_entry) + ovpn_peer_put(peer); +} + +/** + * ovpn_peer_new - allocate and initialize a new peer object + * @ovpn: the openvpn instance inside which the peer should be created + * @id: the ID assigned to this peer + * + * Return: a pointer to the new peer on success or an error code otherwise + */ +struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) +{ + struct ovpn_peer *peer; + int ret; + + /* alloc and init peer object */ + peer = kzalloc(sizeof(*peer), GFP_KERNEL); + if (!peer) + return ERR_PTR(-ENOMEM); + + peer->id = id; + peer->ovpn = ovpn; + + peer->vpn_addrs.ipv4.s_addr = htonl(INADDR_ANY); + peer->vpn_addrs.ipv6 = in6addr_any; + + RCU_INIT_POINTER(peer->bind, NULL); + spin_lock_init(&peer->lock); + kref_init(&peer->refcount); + + ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); + if (ret < 0) { + netdev_err(ovpn->dev, + "cannot initialize dst cache for peer %u\n", + peer->id); + kfree(peer); + return ERR_PTR(ret); + } + + netdev_hold(ovpn->dev, &peer->dev_tracker, GFP_KERNEL); + + return peer; +} + +/** + * ovpn_peer_release_rcu - RCU callback performing last peer release steps + * @head: RCU member of the ovpn_peer + */ +static void ovpn_peer_release_rcu(struct rcu_head *head) +{ + struct ovpn_peer *peer = container_of(head, struct ovpn_peer, rcu); + + /* this call will immediately free the dst_cache, therefore we + * perform it in the RCU callback, when all contexts are done + */ + dst_cache_destroy(&peer->dst_cache); + kfree(peer); +} + +/** + * ovpn_peer_release - release peer private members + * @peer: the peer to release + */ +static void ovpn_peer_release(struct ovpn_peer *peer) +{ + ovpn_bind_reset(peer, NULL); + call_rcu(&peer->rcu, ovpn_peer_release_rcu); + netdev_put(peer->ovpn->dev, &peer->dev_tracker); +} + +/** + * ovpn_peer_release_kref - callback for kref_put + * @kref: the kref object belonging to the peer + */ +void ovpn_peer_release_kref(struct kref *kref) +{ + struct ovpn_peer *peer = container_of(kref, struct ovpn_peer, refcount); + + ovpn_peer_release(peer); +} + +/** + * ovpn_peer_skb_to_sockaddr - fill sockaddr with skb source address + * @skb: the packet to extract data from + * @ss: the sockaddr to fill + * + * Return: sockaddr length on success or -1 otherwise + */ +static int ovpn_peer_skb_to_sockaddr(struct sk_buff *skb, + struct sockaddr_storage *ss) +{ + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + + switch (skb->protocol) { + case htons(ETH_P_IP): + sa4 = (struct sockaddr_in *)ss; + sa4->sin_family = AF_INET; + sa4->sin_addr.s_addr = ip_hdr(skb)->saddr; + sa4->sin_port = udp_hdr(skb)->source; + return sizeof(*sa4); + case htons(ETH_P_IPV6): + sa6 = (struct sockaddr_in6 *)ss; + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = ipv6_hdr(skb)->saddr; + sa6->sin6_port = udp_hdr(skb)->source; + return sizeof(*sa6); + } + + return -1; +} + +/** + * ovpn_peer_transp_match - check if sockaddr and peer binding match + * @peer: the peer to get the binding from + * @ss: the sockaddr to match + * + * Return: true if sockaddr and binding match or false otherwise + */ +static bool ovpn_peer_transp_match(const struct 
ovpn_peer *peer, + const struct sockaddr_storage *ss) +{ + struct ovpn_bind *bind = rcu_dereference(peer->bind); + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + + if (unlikely(!bind)) + return false; + + if (ss->ss_family != bind->remote.in4.sin_family) + return false; + + switch (ss->ss_family) { + case AF_INET: + sa4 = (struct sockaddr_in *)ss; + if (sa4->sin_addr.s_addr != bind->remote.in4.sin_addr.s_addr) + return false; + if (sa4->sin_port != bind->remote.in4.sin_port) + return false; + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)ss; + if (!ipv6_addr_equal(&sa6->sin6_addr, + &bind->remote.in6.sin6_addr)) + return false; + if (sa6->sin6_port != bind->remote.in6.sin6_port) + return false; + break; + default: + return false; + } + + return true; +} + +/** + * ovpn_peer_get_by_transp_addr_p2p - get peer by transport address in a P2P + * instance + * @ovpn: the openvpn instance to search + * @ss: the transport socket address + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer * +ovpn_peer_get_by_transp_addr_p2p(struct ovpn_priv *ovpn, + struct sockaddr_storage *ss) +{ + struct ovpn_peer *tmp, *peer = NULL; + + rcu_read_lock(); + tmp = rcu_dereference(ovpn->peer); + if (likely(tmp && ovpn_peer_transp_match(tmp, ss) && + ovpn_peer_hold(tmp))) + peer = tmp; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_peer_get_by_transp_addr - retrieve peer by transport address + * @ovpn: the openvpn instance to search + * @skb: the skb to retrieve the source transport address from + * + * Return: a pointer to the peer if found or NULL otherwise + */ +struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, + struct sk_buff *skb) +{ + struct ovpn_peer *peer = NULL; + struct sockaddr_storage ss = { 0 }; + + if (unlikely(!ovpn_peer_skb_to_sockaddr(skb, &ss))) + return NULL; + + if (ovpn->mode == OVPN_MODE_P2P) + peer = ovpn_peer_get_by_transp_addr_p2p(ovpn, &ss); + + return peer; +} + +/** + * ovpn_peer_get_by_id_p2p - get peer by ID in a P2P instance + * @ovpn: the openvpn instance to search + * @peer_id: the ID of the peer to find + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_id_p2p(struct ovpn_priv *ovpn, + u32 peer_id) +{ + struct ovpn_peer *tmp, *peer = NULL; + + rcu_read_lock(); + tmp = rcu_dereference(ovpn->peer); + if (likely(tmp && tmp->id == peer_id && ovpn_peer_hold(tmp))) + peer = tmp; + rcu_read_unlock(); + + return peer; +} + +/** + * ovpn_peer_get_by_id - retrieve peer by ID + * @ovpn: the openvpn instance to search + * @peer_id: the unique peer identifier to match + * + * Return: a pointer to the peer if found or NULL otherwise + */ +struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id) +{ + struct ovpn_peer *peer = NULL; + + if (ovpn->mode == OVPN_MODE_P2P) + peer = ovpn_peer_get_by_id_p2p(ovpn, peer_id); + + return peer; +} + +static void ovpn_peer_remove(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + switch (peer->ovpn->mode) { + case OVPN_MODE_P2P: + /* prevent double remove */ + if (peer != rcu_access_pointer(peer->ovpn->peer)) + return; + + RCU_INIT_POINTER(peer->ovpn->peer, NULL); + /* in P2P mode the carrier is switched off when the peer is + * deleted so that third party protocols can react accordingly + */ + netif_carrier_off(peer->ovpn->dev); + break; + default: + return; + } + + peer->delete_reason = reason; + + /* append to provided list for later socket release and ref drop */ + 
llist_add(&peer->release_entry, release_list); +} + +/** + * ovpn_peer_add_p2p - add peer to related tables in a P2P instance + * @ovpn: the instance to add the peer to + * @peer: the peer to add + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_add_p2p(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + LLIST_HEAD(release_list); + struct ovpn_peer *tmp; + + spin_lock_bh(&ovpn->lock); + /* in p2p mode it is possible to have a single peer only, therefore the + * old one is released and substituted by the new one + */ + tmp = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (tmp) + ovpn_peer_remove(tmp, OVPN_DEL_PEER_REASON_TEARDOWN, + &release_list); + + rcu_assign_pointer(ovpn->peer, peer); + /* in P2P mode the carrier is switched on when the peer is added */ + netif_carrier_on(ovpn->dev); + unlock_ovpn(ovpn, &release_list); + + return 0; +} + +/** + * ovpn_peer_add - add peer to the related tables + * @ovpn: the openvpn instance the peer belongs to + * @peer: the peer object to add + * + * Assume refcounter was increased by caller + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + switch (ovpn->mode) { + case OVPN_MODE_P2P: + return ovpn_peer_add_p2p(ovpn, peer); + default: + return -EOPNOTSUPP; + } +} + +/** + * ovpn_peer_del_p2p - delete peer from related tables in a P2P instance + * @peer: the peer to delete + * @reason: reason why the peer was deleted (sent to userspace) + * @release_list: list where delete peer should be appended + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_del_p2p(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + struct ovpn_peer *tmp; + + lockdep_assert_held(&peer->ovpn->lock); + + tmp = rcu_dereference_protected(peer->ovpn->peer, + lockdep_is_held(&peer->ovpn->lock)); + if (tmp != peer) + return -ENOENT; + + ovpn_peer_remove(peer, reason, release_list); + + return 0; +} + +/** + * ovpn_peer_del - delete peer from related tables + * @peer: the peer object to delete + * @reason: reason for deleting peer (will be sent to userspace) + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) +{ + LLIST_HEAD(release_list); + int ret = -EOPNOTSUPP; + + spin_lock_bh(&peer->ovpn->lock); + switch (peer->ovpn->mode) { + case OVPN_MODE_P2P: + ret = ovpn_peer_del_p2p(peer, reason, &release_list); + break; + default: + break; + } + unlock_ovpn(peer->ovpn, &release_list); + + return ret; +} + +/** + * ovpn_peer_release_p2p - release peer upon P2P device teardown + * @ovpn: the instance being torn down + * @reason: the reason for releasing the peer + */ +void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, + enum ovpn_del_peer_reason reason) +{ + LLIST_HEAD(release_list); + struct ovpn_peer *peer; + + spin_lock_bh(&ovpn->lock); + peer = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (peer) + ovpn_peer_remove(peer, reason, &release_list); + unlock_ovpn(ovpn, &release_list); +} diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h new file mode 100644 index 0000000000000..fd2e7625990a7 --- /dev/null +++ b/drivers/net/ovpn/peer.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ */
+
+#ifndef _NET_OVPN_OVPNPEER_H_
+#define _NET_OVPN_OVPNPEER_H_
+
+#include
+
+/**
+ * struct ovpn_peer - the main remote peer object
+ * @ovpn: main openvpn instance this peer belongs to
+ * @dev_tracker: reference tracker for associated dev
+ * @id: unique identifier
+ * @vpn_addrs: IP addresses assigned over the tunnel
+ * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
+ * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
+ * @dst_cache: cache for dst_entry used to send to peer
+ * @bind: remote peer binding
+ * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..)
+ * @lock: protects binding to peer (bind)
+ * @refcount: reference counter
+ * @rcu: used to free peer in an RCU safe way
+ * @release_entry: entry for the socket release list
+ */
+struct ovpn_peer {
+	struct ovpn_priv *ovpn;
+	netdevice_tracker dev_tracker;
+	u32 id;
+	struct {
+		struct in_addr ipv4;
+		struct in6_addr ipv6;
+	} vpn_addrs;
+	struct dst_cache dst_cache;
+	struct ovpn_bind __rcu *bind;
+	enum ovpn_del_peer_reason delete_reason;
+	spinlock_t lock; /* protects bind */
+	struct kref refcount;
+	struct rcu_head rcu;
+	struct llist_node release_entry;
+};
+
+/**
+ * ovpn_peer_hold - increase reference counter
+ * @peer: the peer whose counter should be increased
+ *
+ * Return: true if the counter was increased or false if it was zero already
+ */
+static inline bool ovpn_peer_hold(struct ovpn_peer *peer)
+{
+	return kref_get_unless_zero(&peer->refcount);
+}
+
+void ovpn_peer_release_kref(struct kref *kref);
+
+/**
+ * ovpn_peer_put - decrease reference counter
+ * @peer: the peer whose counter should be decreased
+ */
+static inline void ovpn_peer_put(struct ovpn_peer *peer)
+{
+	kref_put(&peer->refcount, ovpn_peer_release_kref);
+}
+
+struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id);
+int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer);
+int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason);
+void ovpn_peer_release_p2p(struct ovpn_priv *ovpn,
+			   enum ovpn_del_peer_reason reason);
+
+struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn,
+					       struct sk_buff *skb);
+struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id);
+
+#endif /* _NET_OVPN_OVPNPEER_H_ */

From f6226ae7a0cd47aaa9175aca6a1e19600f884cbf Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:23 +0200
Subject: [PATCH 266/770] ovpn: introduce the ovpn_socket object

This specific structure is used in the ovpn kernel module
to wrap and carry around a standard kernel socket.

ovpn takes ownership of passed sockets and therefore an ovpn-specific
object is attached to them for status tracking purposes.

Initially only UDP support is introduced. TCP will come in a later
patch.
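
A minimal sketch of the resulting ownership model (illustrative only:
it borrows the names introduced by this patch and omits locking and
error handling) is the following: the wrapper is attached to the
socket via sk_user_data and shared among peers through a kref:

	/* illustrative sketch, not part of the applied diff */
	rcu_read_lock();
	ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
	if (ovpn_sock && kref_get_unless_zero(&ovpn_sock->refcount)) {
		/* socket already attached to this instance: share it */
	}
	rcu_read_unlock();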
Cc: willemdebruijn.kernel@gmail.com Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-6-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/Makefile | 2 + drivers/net/ovpn/main.c | 3 +- drivers/net/ovpn/peer.c | 28 +++++- drivers/net/ovpn/peer.h | 6 +- drivers/net/ovpn/socket.c | 197 ++++++++++++++++++++++++++++++++++++++ drivers/net/ovpn/socket.h | 38 ++++++++ drivers/net/ovpn/udp.c | 75 +++++++++++++++ drivers/net/ovpn/udp.h | 19 ++++ include/uapi/linux/udp.h | 1 + 9 files changed, 362 insertions(+), 7 deletions(-) create mode 100644 drivers/net/ovpn/socket.c create mode 100644 drivers/net/ovpn/socket.h create mode 100644 drivers/net/ovpn/udp.c create mode 100644 drivers/net/ovpn/udp.h diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 618328ae33886..164f2058ea8e6 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -13,3 +13,5 @@ ovpn-y += io.o ovpn-y += netlink.o ovpn-y += netlink-gen.o ovpn-y += peer.o +ovpn-y += socket.o +ovpn-y += udp.o diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 3c2b6a7df2726..e9a6dc100d467 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -116,7 +116,8 @@ static void ovpn_dellink(struct net_device *dev, struct list_head *head) struct ovpn_priv *ovpn = netdev_priv(dev); if (ovpn->mode == OVPN_MODE_P2P) - ovpn_peer_release_p2p(ovpn, OVPN_DEL_PEER_REASON_TEARDOWN); + ovpn_peer_release_p2p(ovpn, NULL, + OVPN_DEL_PEER_REASON_TEARDOWN); unregister_netdevice_queue(dev, head); } diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 338069c99248f..0bb6c15171848 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -16,17 +16,20 @@ #include "main.h" #include "netlink.h" #include "peer.h" +#include "socket.h" static void unlock_ovpn(struct ovpn_priv *ovpn, - struct llist_head *release_list) + struct llist_head *release_list) __releases(&ovpn->lock) { struct ovpn_peer *peer; spin_unlock_bh(&ovpn->lock); - llist_for_each_entry(peer, release_list->first, release_entry) + llist_for_each_entry(peer, release_list->first, release_entry) { + ovpn_socket_release(peer); ovpn_peer_put(peer); + } } /** @@ -394,18 +397,33 @@ int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) /** * ovpn_peer_release_p2p - release peer upon P2P device teardown * @ovpn: the instance being torn down + * @sk: if not NULL, release peer only if it's using this specific socket * @reason: the reason for releasing the peer */ -void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, +void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, enum ovpn_del_peer_reason reason) { + struct ovpn_socket *ovpn_sock; LLIST_HEAD(release_list); struct ovpn_peer *peer; spin_lock_bh(&ovpn->lock); peer = rcu_dereference_protected(ovpn->peer, lockdep_is_held(&ovpn->lock)); - if (peer) - ovpn_peer_remove(peer, reason, &release_list); + if (!peer) { + spin_unlock_bh(&ovpn->lock); + return; + } + + if (sk) { + ovpn_sock = rcu_access_pointer(peer->sock); + if (!ovpn_sock || ovpn_sock->sock->sk != sk) { + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + return; + } + } + + ovpn_peer_remove(peer, reason, &release_list); unlock_ovpn(ovpn, &release_list); } diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index fd2e7625990a7..29c9065cedccb 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -12,6 +12,8 @@ #include +#include "socket.h" + /** 
 * struct ovpn_peer - the main remote peer object
 * @ovpn: main openvpn instance this peer belongs to
@@ -20,6 +22,7 @@
  * @vpn_addrs: IP addresses assigned over the tunnel
  * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
  * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
+ * @sock: the socket being used to talk to this peer
  * @dst_cache: cache for dst_entry used to send to peer
  * @bind: remote peer binding
  * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..)
@@ -36,6 +39,7 @@ struct ovpn_peer {
 		struct in_addr ipv4;
 		struct in6_addr ipv6;
 	} vpn_addrs;
+	struct ovpn_socket __rcu *sock;
 	struct dst_cache dst_cache;
 	struct ovpn_bind __rcu *bind;
 	enum ovpn_del_peer_reason delete_reason;
@@ -70,7 +74,7 @@ static inline void ovpn_peer_put(struct ovpn_peer *peer)
 struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id);
 int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer);
 int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason);
-void ovpn_peer_release_p2p(struct ovpn_priv *ovpn,
+void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk,
 			   enum ovpn_del_peer_reason reason);
 
 struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn,
diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c
new file mode 100644
index 0000000000000..beec7aee35e15
--- /dev/null
+++ b/drivers/net/ovpn/socket.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2020-2025 OpenVPN, Inc.
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ */
+
+#include
+#include
+#include
+
+#include "ovpnpriv.h"
+#include "main.h"
+#include "io.h"
+#include "peer.h"
+#include "socket.h"
+#include "udp.h"
+
+static void ovpn_socket_release_kref(struct kref *kref)
+{
+	struct ovpn_socket *sock = container_of(kref, struct ovpn_socket,
+						refcount);
+
+	if (sock->sock->sk->sk_protocol == IPPROTO_UDP)
+		ovpn_udp_socket_detach(sock);
+
+	kfree_rcu(sock, rcu);
+}
+
+/**
+ * ovpn_socket_put - decrease reference counter
+ * @peer: peer whose socket reference counter should be decreased
+ * @sock: the RCU protected peer socket
+ *
+ * This function is only used internally. Users wishing to release
+ * references to the ovpn_socket should use ovpn_socket_release()
+ */
+static void ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock)
+{
+	kref_put(&sock->refcount, ovpn_socket_release_kref);
+}
+
+/**
+ * ovpn_socket_release - release resources owned by socket user
+ * @peer: peer whose socket should be released
+ *
+ * This function should be invoked when the user is shutting
+ * down and wants to drop its link to the socket.
+ *
+ * In case of UDP, the detach routine will drop a reference to the
+ * ovpn netdev, pointed to by the ovpn_socket.
+ *
+ * In case of TCP, releasing the socket will cause dropping
+ * the refcounter for the peer it is linked to, thus allowing the peer
+ * to disappear as well.
+ *
+ * This function is expected to be invoked exactly once per peer.
+ *
+ * NOTE: this function may sleep
+ */
+void ovpn_socket_release(struct ovpn_peer *peer)
+{
+	struct ovpn_socket *sock;
+
+	might_sleep();
+
+	sock = rcu_replace_pointer(peer->sock, NULL, true);
+	/* release may be invoked after socket was detached */
+	if (!sock)
+		return;
+
+	/* sanity check: we should not end up here if the socket
+	 * was already closed
+	 */
+	if (!sock->sock->sk) {
+		DEBUG_NET_WARN_ON_ONCE(1);
+		return;
+	}
+
+	/* Drop the reference while holding the sock lock to avoid a
+	 * concurrent ovpn_socket_new() call messing with a partially
+	 * detached socket.
+	 *
+	 * Holding the lock ensures that a socket with refcnt 0 is fully
+	 * detached before it can be picked by a concurrent reader.
+	 */
+	lock_sock(sock->sock->sk);
+	ovpn_socket_put(peer, sock);
+	release_sock(sock->sock->sk);
+
+	/* align all readers with sk_user_data being NULL */
+	synchronize_rcu();
+}
+
+static bool ovpn_socket_hold(struct ovpn_socket *sock)
+{
+	return kref_get_unless_zero(&sock->refcount);
+}
+
+static int ovpn_socket_attach(struct ovpn_socket *sock, struct ovpn_peer *peer)
+{
+	if (sock->sock->sk->sk_protocol == IPPROTO_UDP)
+		return ovpn_udp_socket_attach(sock, peer->ovpn);
+
+	return -EOPNOTSUPP;
+}
+
+/**
+ * ovpn_socket_new - create a new socket and initialize it
+ * @sock: the kernel socket to embed
+ * @peer: the peer reachable via this socket
+ *
+ * Return: an openvpn socket on success or a negative error code otherwise
+ */
+struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer)
+{
+	struct ovpn_socket *ovpn_sock;
+	int ret;
+
+	lock_sock(sock->sk);
+
+	/* a TCP socket can only be owned by a single peer, therefore there
+	 * can't be any other user
+	 */
+	if (sock->sk->sk_protocol == IPPROTO_TCP && sock->sk->sk_user_data) {
+		ovpn_sock = ERR_PTR(-EBUSY);
+		goto sock_release;
+	}
+
+	/* a UDP socket can be shared across multiple peers, but we must make
+	 * sure it is not owned by something else
	 */
+	if (sock->sk->sk_protocol == IPPROTO_UDP) {
+		u8 type = READ_ONCE(udp_sk(sock->sk)->encap_type);
+
+		/* socket owned by other encapsulation module */
+		if (type && type != UDP_ENCAP_OVPNINUDP) {
+			ovpn_sock = ERR_PTR(-EBUSY);
+			goto sock_release;
+		}
+
+		rcu_read_lock();
+		ovpn_sock = rcu_dereference_sk_user_data(sock->sk);
+		if (ovpn_sock) {
+			/* socket owned by another ovpn instance, we can't use it */
+			if (ovpn_sock->ovpn != peer->ovpn) {
+				ovpn_sock = ERR_PTR(-EBUSY);
+				rcu_read_unlock();
+				goto sock_release;
+			}
+
+			/* this socket is already owned by this instance,
+			 * therefore we can increase the refcounter and
+			 * use it as expected
+			 */
+			if (WARN_ON(!ovpn_socket_hold(ovpn_sock))) {
+				/* this should never happen because setting
+				 * the refcnt to 0 and detaching the socket
+				 * is expected to be atomic
+				 */
+				ovpn_sock = ERR_PTR(-EAGAIN);
+				rcu_read_unlock();
+				goto sock_release;
+			}
+
+			rcu_read_unlock();
+			goto sock_release;
+		}
+		rcu_read_unlock();
+	}
+
+	/* socket is not owned: attach to this ovpn instance */
+
+	ovpn_sock = kzalloc(sizeof(*ovpn_sock), GFP_KERNEL);
+	if (!ovpn_sock) {
+		ovpn_sock = ERR_PTR(-ENOMEM);
+		goto sock_release;
+	}
+
+	ovpn_sock->ovpn = peer->ovpn;
+	ovpn_sock->sock = sock;
+	kref_init(&ovpn_sock->refcount);
+
+	ret = ovpn_socket_attach(ovpn_sock, peer);
+	if (ret < 0) {
+		kfree(ovpn_sock);
+		ovpn_sock = ERR_PTR(ret);
+		goto sock_release;
+	}
+
+	rcu_assign_sk_user_data(sock->sk, ovpn_sock);
+sock_release:
+	release_sock(sock->sk);
+	return ovpn_sock;
+}
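
For context, a minimal sketch of how a caller is expected to hand a
userspace socket to ovpn_socket_new(). The netlink handler performing
this hand-off only lands in a later patch, so everything around the
call here is an assumption for illustration ('fd' and 'peer' are
hypothetical inputs):

	/* hypothetical caller: 'fd' received from userspace */
	struct ovpn_socket *ovpn_sock;
	struct socket *sock;
	int ret;

	sock = sockfd_lookup(fd, &ret);	/* resolve fd to socket */
	if (!sock)
		return ret;

	ovpn_sock = ovpn_socket_new(sock, peer);
	if (IS_ERR(ovpn_sock)) {
		sockfd_put(sock);	/* drop the file reference */
		return PTR_ERR(ovpn_sock);
	}
	rcu_assign_pointer(peer->sock, ovpn_sock);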
diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h
new file mode 100644
index 0000000000000..ade8c94619d7b
--- /dev/null
+++ b/drivers/net/ovpn/socket.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2020-2025 OpenVPN, Inc.
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ */
+
+#ifndef _NET_OVPN_SOCK_H_
+#define _NET_OVPN_SOCK_H_
+
+#include
+#include
+#include
+
+struct ovpn_priv;
+struct ovpn_peer;
+
+/**
+ * struct ovpn_socket - a kernel socket referenced in the ovpn code
+ * @ovpn: ovpn instance owning this socket (UDP only)
+ * @sock: the low level sock object
+ * @refcount: amount of contexts currently referencing this object
+ * @rcu: member used to schedule RCU destructor callback
+ */
+struct ovpn_socket {
+	struct ovpn_priv *ovpn;
+	struct socket *sock;
+	struct kref refcount;
+	struct rcu_head rcu;
+};
+
+struct ovpn_socket *ovpn_socket_new(struct socket *sock,
+				    struct ovpn_peer *peer);
+void ovpn_socket_release(struct ovpn_peer *peer);
+
+#endif /* _NET_OVPN_SOCK_H_ */
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
new file mode 100644
index 0000000000000..91970e66a4340
--- /dev/null
+++ b/drivers/net/ovpn/udp.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2019-2025 OpenVPN, Inc.
+ *
+ * Author:	Antonio Quartulli
+ */
+
+#include
+#include
+#include
+#include
+
+#include "ovpnpriv.h"
+#include "main.h"
+#include "socket.h"
+#include "udp.h"
+
+/**
+ * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn
+ * @ovpn_sock: socket to configure
+ * @ovpn: the openvpn instance to link
+ *
+ * After invoking this function, the sock will be controlled by ovpn so that
+ * any incoming packet may be processed by ovpn first.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock,
+			   struct ovpn_priv *ovpn)
+{
+	struct socket *sock = ovpn_sock->sock;
+	struct ovpn_socket *old_data;
+	int ret = 0;
+
+	/* make sure no pre-existing encapsulation handler exists */
+	rcu_read_lock();
+	old_data = rcu_dereference_sk_user_data(sock->sk);
+	if (!old_data) {
+		/* socket is currently unused - we can take it */
+		rcu_read_unlock();
+		return 0;
+	}
+
+	/* socket is in use. We need to understand if it's owned by this ovpn
+	 * instance or by something else.
+	 * In the former case, we can increase the refcounter and happily
+	 * use it, because the same UDP socket is expected to be shared among
+	 * different peers.
+	 *
+	 * Unlike TCP, a single UDP socket can be used to talk to many remote
+	 * hosts and therefore openvpn instantiates only one for all its peers
+	 */
+	if ((READ_ONCE(udp_sk(sock->sk)->encap_type) == UDP_ENCAP_OVPNINUDP) &&
+	    old_data->ovpn == ovpn) {
+		netdev_dbg(ovpn->dev,
+			   "provided socket already owned by this interface\n");
+		ret = -EALREADY;
+	} else {
+		netdev_dbg(ovpn->dev,
+			   "provided socket already taken by other user\n");
+		ret = -EBUSY;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/**
+ * ovpn_udp_socket_detach - clean udp-tunnel status for this socket
+ * @ovpn_sock: the socket to clean
+ */
+void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock)
+{
+}
diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h
new file mode 100644
index 0000000000000..1c8fb6fe402dc
--- /dev/null
+++ b/drivers/net/ovpn/udp.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* OpenVPN data channel offload
+ *
+ * Copyright (C) 2019-2025 OpenVPN, Inc.
+ *
+ * Author:	Antonio Quartulli
+ */
+
+#ifndef _NET_OVPN_UDP_H_
+#define _NET_OVPN_UDP_H_
+
+struct ovpn_priv;
+struct socket;
+
+int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock,
+			   struct ovpn_priv *ovpn);
+void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock);
+
+#endif /* _NET_OVPN_UDP_H_ */
diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index d85d671deed3c..edca3e430305a 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -43,5 +43,6 @@ struct udphdr {
 #define UDP_ENCAP_GTP1U		5 /* 3GPP TS 29.060 */
 #define UDP_ENCAP_RXRPC		6
 #define TCP_ENCAP_ESPINTCP	7 /* Yikes, this is really xfrm encap types. */
+#define UDP_ENCAP_OVPNINUDP	8 /* OpenVPN traffic */
 
 #endif /* _UAPI_LINUX_UDP_H */

From 08857b5ec5d91d83e69e40a36554a8c7557b7301 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:24 +0200
Subject: [PATCH 267/770] ovpn: implement basic TX path (UDP)

Packets sent over the ovpn interface are processed and transmitted to
the connected peer, if any.

Implementation is UDP only. TCP will be added by a later patch.

Note: no crypto/encapsulation exists yet. Packets are just captured
and sent.

Signed-off-by: Antonio Quartulli
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-7-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca
Tested-by: Oleksandr Natalenko
Signed-off-by: Paolo Abeni
---
 drivers/net/Kconfig     |   1 +
 drivers/net/ovpn/io.c   | 137 ++++++++++++++++++++++-
 drivers/net/ovpn/peer.c |  32 ++++++
 drivers/net/ovpn/peer.h |   2 +
 drivers/net/ovpn/skb.h  |  55 ++++++++++
 drivers/net/ovpn/udp.c  | 233 +++++++++++++++++++++++++++++++++++++++-
 drivers/net/ovpn/udp.h  |   6 ++
 7 files changed, 464 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ovpn/skb.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 2806fcc22a2db..305f04dd97234 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -120,6 +120,7 @@ config OVPN
 	depends on NET && INET
 	depends on IPV6 || !IPV6
 	select DST_CACHE
+	select NET_UDP_TUNNEL
 	help
 	  This module enhances the performance of the OpenVPN userspace
 	  software by offloading the data channel processing to kernelspace.
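
The new NET_UDP_TUNNEL dependency covers the udp_tunnel_xmit_skb() and
udp_tunnel6_xmit_skb() helpers used by the UDP output code below.
Condensed, the per-packet TX flow added by this patch looks as follows
(an illustrative sketch, not part of the applied diff; names as in the
hunks below):

	/* ovpn_net_xmit() -> ovpn_send() -> ovpn_encrypt_one() */
	peer = ovpn_peer_get_by_dst(ovpn, skb);	/* pick the peer */
	ovpn_skb_cb(skb)->peer = peer;		/* stash for async path */
	ovpn_encrypt_post(skb, 0);		/* no crypto yet */
	/* ...which hands the packet to the transport: */
	ovpn_udp_send_skb(peer, sock->sock, skb); /* UDP encap + xmit */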
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 4b71c38165d7a..93b128827b6d6 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -9,14 +9,149 @@ #include #include +#include #include "io.h" +#include "ovpnpriv.h" +#include "peer.h" +#include "udp.h" +#include "skb.h" +#include "socket.h" + +static void ovpn_encrypt_post(struct sk_buff *skb, int ret) +{ + struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer; + struct ovpn_socket *sock; + + if (unlikely(ret < 0)) + goto err; + + skb_mark_not_on_list(skb); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (unlikely(!sock)) + goto err_unlock; + + switch (sock->sock->sk->sk_protocol) { + case IPPROTO_UDP: + ovpn_udp_send_skb(peer, sock->sock, skb); + break; + default: + /* no transport configured yet */ + goto err_unlock; + } + /* skb passed down the stack - don't free it */ + skb = NULL; +err_unlock: + rcu_read_unlock(); +err: + if (unlikely(skb)) + dev_dstats_tx_dropped(peer->ovpn->dev); + ovpn_peer_put(peer); + kfree_skb(skb); +} + +static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb) +{ + ovpn_skb_cb(skb)->peer = peer; + + /* take a reference to the peer because the crypto code may run async. + * ovpn_encrypt_post() will release it upon completion + */ + if (unlikely(!ovpn_peer_hold(peer))) { + DEBUG_NET_WARN_ON_ONCE(1); + return false; + } + + ovpn_encrypt_post(skb, 0); + return true; +} + +/* send skb to connected peer, if any */ +static void ovpn_send(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer) +{ + struct sk_buff *curr, *next; + + /* this might be a GSO-segmented skb list: process each skb + * independently + */ + skb_list_walk_safe(skb, curr, next) { + if (unlikely(!ovpn_encrypt_one(peer, curr))) { + dev_dstats_tx_dropped(ovpn->dev); + kfree_skb(curr); + } + } + + ovpn_peer_put(peer); +} /* Send user data to the network */ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) { + struct ovpn_priv *ovpn = netdev_priv(dev); + struct sk_buff *segments, *curr, *next; + struct sk_buff_head skb_list; + struct ovpn_peer *peer; + __be16 proto; + int ret; + + /* reset netfilter state */ + nf_reset_ct(skb); + + /* verify IP header size in network packet */ + proto = ovpn_ip_check_protocol(skb); + if (unlikely(!proto || skb->protocol != proto)) + goto drop; + + if (skb_is_gso(skb)) { + segments = skb_gso_segment(skb, 0); + if (IS_ERR(segments)) { + ret = PTR_ERR(segments); + net_err_ratelimited("%s: cannot segment payload packet: %d\n", + netdev_name(dev), ret); + goto drop; + } + + consume_skb(skb); + skb = segments; + } + + /* from this moment on, "skb" might be a list */ + + __skb_queue_head_init(&skb_list); + skb_list_walk_safe(skb, curr, next) { + skb_mark_not_on_list(curr); + + curr = skb_share_check(curr, GFP_ATOMIC); + if (unlikely(!curr)) { + net_err_ratelimited("%s: skb_share_check failed for payload packet\n", + netdev_name(dev)); + dev_dstats_tx_dropped(ovpn->dev); + continue; + } + + __skb_queue_tail(&skb_list, curr); + } + skb_list.prev->next = NULL; + + /* retrieve peer serving the destination IP of this packet */ + peer = ovpn_peer_get_by_dst(ovpn, skb); + if (unlikely(!peer)) { + net_dbg_ratelimited("%s: no peer to send data to\n", + netdev_name(ovpn->dev)); + goto drop; + } + + ovpn_send(ovpn, skb_list.next, peer); + + return NETDEV_TX_OK; + +drop: + dev_dstats_tx_dropped(ovpn->dev); skb_tx_error(skb); - kfree_skb(skb); + kfree_skb_list(skb); return NET_XMIT_DROP; } diff --git a/drivers/net/ovpn/peer.c 
b/drivers/net/ovpn/peer.c index 0bb6c15171848..10eabd62ae723 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -294,6 +294,38 @@ static void ovpn_peer_remove(struct ovpn_peer *peer, llist_add(&peer->release_entry, release_list); } +/** + * ovpn_peer_get_by_dst - Lookup peer to send skb to + * @ovpn: the private data representing the current VPN session + * @skb: the skb to extract the destination address from + * + * This function takes a tunnel packet and looks up the peer to send it to + * after encapsulation. The skb is expected to be the in-tunnel packet, without + * any OpenVPN related header. + * + * Assume that the IP header is accessible in the skb data. + * + * Return: the peer if found or NULL otherwise. + */ +struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, + struct sk_buff *skb) +{ + struct ovpn_peer *peer = NULL; + + /* in P2P mode, no matter the destination, packets are always sent to + * the single peer listening on the other side + */ + if (ovpn->mode == OVPN_MODE_P2P) { + rcu_read_lock(); + peer = rcu_dereference(ovpn->peer); + if (unlikely(peer && !ovpn_peer_hold(peer))) + peer = NULL; + rcu_read_unlock(); + } + + return peer; +} + /** * ovpn_peer_add_p2p - add peer to related tables in a P2P instance * @ovpn: the instance to add the peer to diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index 29c9065cedccb..fef04311c1593 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -80,5 +80,7 @@ void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, struct sk_buff *skb); struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id); +struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, + struct sk_buff *skb); #endif /* _NET_OVPN_OVPNPEER_H_ */ diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h new file mode 100644 index 0000000000000..9db7a9adebdb4 --- /dev/null +++ b/drivers/net/ovpn/skb.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_SKB_H_ +#define _NET_OVPN_SKB_H_ + +#include +#include +#include +#include +#include +#include +#include + +struct ovpn_cb { + struct ovpn_peer *peer; +}; + +static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ovpn_cb) > sizeof(skb->cb)); + return (struct ovpn_cb *)skb->cb; +} + +/* Return IP protocol version from skb header. + * Return 0 if protocol is not IPv4/IPv6 or cannot be read. 
+ */ +static inline __be16 ovpn_ip_check_protocol(struct sk_buff *skb) +{ + __be16 proto = 0; + + /* skb could be non-linear, + * make sure IP header is in non-fragmented part + */ + if (!pskb_network_may_pull(skb, sizeof(struct iphdr))) + return 0; + + if (ip_hdr(skb)->version == 4) { + proto = htons(ETH_P_IP); + } else if (ip_hdr(skb)->version == 6) { + if (!pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) + return 0; + proto = htons(ETH_P_IPV6); + } + + return proto; +} + +#endif /* _NET_OVPN_SKB_H_ */ diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index 91970e66a4340..28307ef7b2672 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -7,15 +7,246 @@ */ #include +#include +#include #include #include +#include +#include +#include +#include #include +#include #include "ovpnpriv.h" #include "main.h" +#include "bind.h" +#include "io.h" +#include "peer.h" #include "socket.h" #include "udp.h" +/** + * ovpn_udp4_output - send IPv4 packet over udp socket + * @peer: the destination peer + * @bind: the binding related to the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, + struct dst_cache *cache, struct sock *sk, + struct sk_buff *skb) +{ + struct rtable *rt; + struct flowi4 fl = { + .saddr = bind->local.ipv4.s_addr, + .daddr = bind->remote.in4.sin_addr.s_addr, + .fl4_sport = inet_sk(sk)->inet_sport, + .fl4_dport = bind->remote.in4.sin_port, + .flowi4_proto = sk->sk_protocol, + .flowi4_mark = sk->sk_mark, + }; + int ret; + + local_bh_disable(); + rt = dst_cache_get_ip4(cache, &fl.saddr); + if (rt) + goto transmit; + + if (unlikely(!inet_confirm_addr(sock_net(sk), NULL, 0, fl.saddr, + RT_SCOPE_HOST))) { + /* we may end up here when the cached address is not usable + * anymore. 
In this case we reset address/cache and perform a + * new look up + */ + fl.saddr = 0; + spin_lock_bh(&peer->lock); + bind->local.ipv4.s_addr = 0; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + } + + rt = ip_route_output_flow(sock_net(sk), &fl, sk); + if (IS_ERR(rt) && PTR_ERR(rt) == -EINVAL) { + fl.saddr = 0; + spin_lock_bh(&peer->lock); + bind->local.ipv4.s_addr = 0; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + + rt = ip_route_output_flow(sock_net(sk), &fl, sk); + } + + if (IS_ERR(rt)) { + ret = PTR_ERR(rt); + net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", + netdev_name(peer->ovpn->dev), + &bind->remote.in4, + ret); + goto err; + } + dst_cache_set_ip4(cache, &rt->dst, fl.saddr); + +transmit: + udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, + ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, + fl.fl4_dport, false, sk->sk_no_check_tx); + ret = 0; +err: + local_bh_enable(); + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * ovpn_udp6_output - send IPv6 packet over udp socket + * @peer: the destination peer + * @bind: the binding related to the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp6_output(struct ovpn_peer *peer, struct ovpn_bind *bind, + struct dst_cache *cache, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst; + int ret; + + struct flowi6 fl = { + .saddr = bind->local.ipv6, + .daddr = bind->remote.in6.sin6_addr, + .fl6_sport = inet_sk(sk)->inet_sport, + .fl6_dport = bind->remote.in6.sin6_port, + .flowi6_proto = sk->sk_protocol, + .flowi6_mark = sk->sk_mark, + .flowi6_oif = bind->remote.in6.sin6_scope_id, + }; + + local_bh_disable(); + dst = dst_cache_get_ip6(cache, &fl.saddr); + if (dst) + goto transmit; + + if (unlikely(!ipv6_chk_addr(sock_net(sk), &fl.saddr, NULL, 0))) { + /* we may end up here when the cached address is not usable + * anymore. In this case we reset address/cache and perform a + * new look up + */ + fl.saddr = in6addr_any; + spin_lock_bh(&peer->lock); + bind->local.ipv6 = in6addr_any; + spin_unlock_bh(&peer->lock); + dst_cache_reset(cache); + } + + dst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(sk), sk, &fl, NULL); + if (IS_ERR(dst)) { + ret = PTR_ERR(dst); + net_dbg_ratelimited("%s: no route to host %pISpc: %d\n", + netdev_name(peer->ovpn->dev), + &bind->remote.in6, ret); + goto err; + } + dst_cache_set_ip6(cache, dst, &fl.saddr); + +transmit: + udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, + ip6_dst_hoplimit(dst), 0, fl.fl6_sport, + fl.fl6_dport, udp_get_no_check6_tx(sk)); + ret = 0; +err: + local_bh_enable(); + return ret; +} +#endif + +/** + * ovpn_udp_output - transmit skb using udp-tunnel + * @peer: the destination peer + * @cache: dst cache + * @sk: the socket to send the packet over + * @skb: the packet to send + * + * rcu_read_lock should be held on entry. + * On return, the skb is consumed. 
+ * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_udp_output(struct ovpn_peer *peer, struct dst_cache *cache, + struct sock *sk, struct sk_buff *skb) +{ + struct ovpn_bind *bind; + int ret; + + /* set sk to null if skb is already orphaned */ + if (!skb->destructor) + skb->sk = NULL; + + rcu_read_lock(); + bind = rcu_dereference(peer->bind); + if (unlikely(!bind)) { + net_warn_ratelimited("%s: no bind for remote peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + ret = -ENODEV; + goto out; + } + + switch (bind->remote.in4.sin_family) { + case AF_INET: + ret = ovpn_udp4_output(peer, bind, cache, sk, skb); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ret = ovpn_udp6_output(peer, bind, cache, sk, skb); + break; +#endif + default: + ret = -EAFNOSUPPORT; + break; + } + +out: + rcu_read_unlock(); + return ret; +} + +/** + * ovpn_udp_send_skb - prepare skb and send it over via UDP + * @peer: the destination peer + * @sock: the RCU protected peer socket + * @skb: the packet to send + */ +void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb) +{ + int ret = -1; + + skb->dev = peer->ovpn->dev; + /* no checksum performed at this layer */ + skb->ip_summed = CHECKSUM_NONE; + + /* get socket info */ + if (unlikely(!sock)) { + net_warn_ratelimited("%s: no sock for remote peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto out; + } + + /* crypto layer -> transport (UDP) */ + ret = ovpn_udp_output(peer, &peer->dst_cache, sock->sk, skb); +out: + if (unlikely(ret < 0)) { + kfree_skb(skb); + return; + } +} + /** * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn * @ovpn_sock: socket to configure @@ -31,7 +262,7 @@ int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, { struct socket *sock = ovpn_sock->sock; struct ovpn_socket *old_data; - int ret = 0; + int ret; /* make sure no pre-existing encapsulation handler exists */ rcu_read_lock(); diff --git a/drivers/net/ovpn/udp.h b/drivers/net/ovpn/udp.h index 1c8fb6fe402dc..9994eb6e04283 100644 --- a/drivers/net/ovpn/udp.h +++ b/drivers/net/ovpn/udp.h @@ -9,6 +9,9 @@ #ifndef _NET_OVPN_UDP_H_ #define _NET_OVPN_UDP_H_ +#include + +struct ovpn_peer; struct ovpn_priv; struct socket; @@ -16,4 +19,7 @@ int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, struct ovpn_priv *ovpn); void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock); +void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb); + #endif /* _NET_OVPN_UDP_H_ */ From ab66abbc769b894324864a68e9dcaf3eb2d4bfdb Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:25 +0200 Subject: [PATCH 268/770] ovpn: implement basic RX path (UDP) Packets received over the socket are forwarded to the user device. Implementation is UDP only. TCP will be added by a later patch. Note: no decryption/decapsulation exists yet, packets are forwarded as they arrive without much processing. 
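
As a rough sketch (simplified, with peer refcounting and error
handling elided; all identifiers are the ones introduced by this
patch), the receive flow boils down to:

	/* invoked by the UDP stack through the encap_rcv hook */
	static int ovpn_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
	{
		struct ovpn_socket *ovpn_sock = ovpn_socket_from_udp_sock(sk);
		u32 peer_id = ovpn_peer_id_from_skb(skb, sizeof(struct udphdr));
		struct ovpn_peer *peer = ovpn_peer_get_by_id(ovpn_sock->ovpn,
							     peer_id);

		__skb_pull(skb, sizeof(struct udphdr)); /* drop outer UDP header */
		ovpn_recv(peer, skb); /* ends up in gro_cells_receive() */
		return 0; /* skb consumed */
	}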
Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-8-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/io.c | 64 ++++++++++++++++- drivers/net/ovpn/io.h | 2 + drivers/net/ovpn/main.c | 18 +++++ drivers/net/ovpn/ovpnpriv.h | 3 + drivers/net/ovpn/proto.h | 50 ++++++++++++- drivers/net/ovpn/socket.c | 14 +++- drivers/net/ovpn/socket.h | 9 ++- drivers/net/ovpn/udp.c | 135 ++++++++++++++++++++++++++++++++++++ 8 files changed, 290 insertions(+), 5 deletions(-) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 93b128827b6d6..22fa92b9aeeb5 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -9,15 +9,77 @@ #include #include +#include #include -#include "io.h" #include "ovpnpriv.h" #include "peer.h" +#include "io.h" +#include "netlink.h" +#include "proto.h" #include "udp.h" #include "skb.h" #include "socket.h" +/* Called after decrypt to write the IP packet to the device. + * This method is expected to manage/free the skb. + */ +static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) +{ + unsigned int pkt_len; + int ret; + + /* we can't guarantee the packet wasn't corrupted before entering the + * VPN, therefore we give other layers a chance to check that + */ + skb->ip_summed = CHECKSUM_NONE; + + /* skb hash for transport packet no longer valid after decapsulation */ + skb_clear_hash(skb); + + /* post-decrypt scrub -- prepare to inject encapsulated packet onto the + * interface, based on __skb_tunnel_rx() in dst.h + */ + skb->dev = peer->ovpn->dev; + skb_set_queue_mapping(skb, 0); + skb_scrub_packet(skb, true); + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + skb_reset_inner_headers(skb); + + /* cause packet to be "received" by the interface */ + pkt_len = skb->len; + ret = gro_cells_receive(&peer->ovpn->gro_cells, skb); + if (likely(ret == NET_RX_SUCCESS)) + /* update RX stats with the size of decrypted packet */ + dev_dstats_rx_add(peer->ovpn->dev, pkt_len); +} + +static void ovpn_decrypt_post(struct sk_buff *skb, int ret) +{ + struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer; + + if (unlikely(ret < 0)) + goto drop; + + ovpn_netdev_write(peer, skb); + /* skb is passed to upper layer - don't free it */ + skb = NULL; +drop: + if (unlikely(skb)) + dev_dstats_rx_dropped(peer->ovpn->dev); + ovpn_peer_put(peer); + kfree_skb(skb); +} + +/* RX path entry point: decrypt packet and forward it to the device */ +void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb) +{ + ovpn_skb_cb(skb)->peer = peer; + ovpn_decrypt_post(skb, 0); +} + static void ovpn_encrypt_post(struct sk_buff *skb, int ret) { struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer; diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h index afea5f81f5628..1cfa66873a2d4 100644 --- a/drivers/net/ovpn/io.h +++ b/drivers/net/ovpn/io.h @@ -21,4 +21,6 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); +void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb); + #endif /* _NET_OVPN_OVPN_H_ */ diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index e9a6dc100d467..b356c2235d46f 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,8 +22,25 @@ #include "io.h" #include "peer.h" #include "proto.h" +#include "udp.h" + +static int ovpn_net_init(struct net_device *dev) +{ + struct ovpn_priv *ovpn = netdev_priv(dev); + + return 
gro_cells_init(&ovpn->gro_cells, dev);
+}
+
+static void ovpn_net_uninit(struct net_device *dev)
+{
+	struct ovpn_priv *ovpn = netdev_priv(dev);
+
+	gro_cells_destroy(&ovpn->gro_cells);
+}
+
 static const struct net_device_ops ovpn_netdev_ops = {
+	.ndo_init = ovpn_net_init,
+	.ndo_uninit = ovpn_net_uninit,
 	.ndo_start_xmit = ovpn_net_xmit,
 };
 
diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h
index a398f9da2e095..d31cceee508d7 100644
--- a/drivers/net/ovpn/ovpnpriv.h
+++ b/drivers/net/ovpn/ovpnpriv.h
@@ -10,6 +10,7 @@
 #ifndef _NET_OVPN_OVPNSTRUCT_H_
 #define _NET_OVPN_OVPNSTRUCT_H_
 
+#include 
 #include 
 #include 
 
@@ -19,12 +20,14 @@
  * @mode: device operation mode (i.e. p2p, mp, ..)
  * @lock: protect this object
  * @peer: in P2P mode, this is the only remote peer
+ * @gro_cells: the Generic Receive Offload cell
  */
 struct ovpn_priv {
 	struct net_device *dev;
 	enum ovpn_mode mode;
 	spinlock_t lock; /* protect writing to the ovpn_priv object */
 	struct ovpn_peer __rcu *peer;
+	struct gro_cells gro_cells;
 };
 
 #endif /* _NET_OVPN_OVPNSTRUCT_H_ */
diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h
index 5f95a78bebd37..591b97a9925fd 100644
--- a/drivers/net/ovpn/proto.h
+++ b/drivers/net/ovpn/proto.h
@@ -10,6 +10,11 @@
 #ifndef _NET_OVPN_PROTO_H_
 #define _NET_OVPN_PROTO_H_
 
+#include "main.h"
+
+#include 
+#include 
+
 /* When the OpenVPN protocol is run in AEAD mode, use
  * the OpenVPN packet ID as the AEAD nonce:
  *
@@ -34,5 +39,48 @@
 #define OVPN_NONCE_WIRE_SIZE (OVPN_NONCE_SIZE - OVPN_NONCE_TAIL_SIZE)
 
 #define OVPN_OPCODE_SIZE 4 /* DATA_V2 opcode size */
+#define OVPN_OPCODE_KEYID_MASK 0x07000000
+#define OVPN_OPCODE_PKTTYPE_MASK 0xF8000000
+#define OVPN_OPCODE_PEERID_MASK 0x00FFFFFF
+
+/* packet opcodes of interest to us */
+#define OVPN_DATA_V1 6 /* data channel v1 packet */
+#define OVPN_DATA_V2 9 /* data channel v2 packet */
+
+#define OVPN_PEER_ID_UNDEF 0x00FFFFFF
+
+/**
+ * ovpn_opcode_from_skb - extract OP code from skb at specified offset
+ * @skb: the packet to extract the OP code from
+ * @offset: the offset in the data buffer where the OP code is located
+ *
+ * Note: this function assumes that the skb head was pulled enough
+ * to access the first 4 bytes.
+ *
+ * Return: the OP code
+ */
+static inline u8 ovpn_opcode_from_skb(const struct sk_buff *skb, u16 offset)
+{
+	u32 opcode = be32_to_cpu(*(__be32 *)(skb->data + offset));
+
+	return FIELD_GET(OVPN_OPCODE_PKTTYPE_MASK, opcode);
+}
+
+/**
+ * ovpn_peer_id_from_skb - extract peer ID from skb at specified offset
+ * @skb: the packet to extract the peer ID from
+ * @offset: the offset in the data buffer where the OP code is located
+ *
+ * Note: this function assumes that the skb head was pulled enough
+ * to access the first 4 bytes.
+ * + * Return: the peer ID + */ +static inline u32 ovpn_peer_id_from_skb(const struct sk_buff *skb, u16 offset) +{ + u32 opcode = be32_to_cpu(*(__be32 *)(skb->data + offset)); + + return FIELD_GET(OVPN_OPCODE_PEERID_MASK, opcode); +} -#endif /* _NET_OVPN_PROTO_H_ */ +#endif /* _NET_OVPN_OVPNPROTO_H_ */ diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c index beec7aee35e15..f1fd98d528451 100644 --- a/drivers/net/ovpn/socket.c +++ b/drivers/net/ovpn/socket.c @@ -23,8 +23,10 @@ static void ovpn_socket_release_kref(struct kref *kref) struct ovpn_socket *sock = container_of(kref, struct ovpn_socket, refcount); - if (sock->sock->sk->sk_protocol == IPPROTO_UDP) + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) { ovpn_udp_socket_detach(sock); + netdev_put(sock->ovpn->dev, &sock->dev_tracker); + } kfree_rcu(sock, rcu); } @@ -179,7 +181,6 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) goto sock_release; } - ovpn_sock->ovpn = peer->ovpn; ovpn_sock->sock = sock; kref_init(&ovpn_sock->refcount); @@ -190,6 +191,15 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) goto sock_release; } + if (sock->sk->sk_protocol == IPPROTO_UDP) { + /* in UDP we only link the ovpn instance since the socket is + * shared among multiple peers + */ + ovpn_sock->ovpn = peer->ovpn; + netdev_hold(peer->ovpn->dev, &ovpn_sock->dev_tracker, + GFP_KERNEL); + } + rcu_assign_sk_user_data(sock->sk, ovpn_sock); sock_release: release_sock(sock->sk); diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h index ade8c94619d7b..c1697f4616d47 100644 --- a/drivers/net/ovpn/socket.h +++ b/drivers/net/ovpn/socket.h @@ -20,12 +20,19 @@ struct ovpn_peer; /** * struct ovpn_socket - a kernel socket referenced in the ovpn code * @ovpn: ovpn instance owning this socket (UDP only) + * @dev_tracker: reference tracker for associated dev (UDP only) * @sock: the low level sock object * @refcount: amount of contexts currently referencing this object * @rcu: member used to schedule RCU destructor callback */ struct ovpn_socket { - struct ovpn_priv *ovpn; + union { + struct { + struct ovpn_priv *ovpn; + netdevice_tracker dev_tracker; + }; + }; + struct socket *sock; struct kref refcount; struct rcu_head rcu; diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index 28307ef7b2672..920d71da793cc 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -23,9 +24,114 @@ #include "bind.h" #include "io.h" #include "peer.h" +#include "proto.h" #include "socket.h" #include "udp.h" +/* Retrieve the corresponding ovpn object from a UDP socket + * rcu_read_lock must be held on entry + */ +static struct ovpn_socket *ovpn_socket_from_udp_sock(struct sock *sk) +{ + struct ovpn_socket *ovpn_sock; + + if (unlikely(READ_ONCE(udp_sk(sk)->encap_type) != UDP_ENCAP_OVPNINUDP)) + return NULL; + + ovpn_sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!ovpn_sock)) + return NULL; + + /* make sure that sk matches our stored transport socket */ + if (unlikely(!ovpn_sock->sock || sk != ovpn_sock->sock->sk)) + return NULL; + + return ovpn_sock; +} + +/** + * ovpn_udp_encap_recv - Start processing a received UDP packet. 
+ * @sk: socket over which the packet was received + * @skb: the received packet + * + * If the first byte of the payload is: + * - DATA_V2 the packet is accepted for further processing, + * - DATA_V1 the packet is dropped as not supported, + * - anything else the packet is forwarded to the UDP stack for + * delivery to user space. + * + * Return: + * 0 if skb was consumed or dropped + * >0 if skb should be passed up to userspace as UDP (packet not consumed) + * <0 if skb should be resubmitted as proto -N (packet not consumed) + */ +static int ovpn_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct ovpn_socket *ovpn_sock; + struct ovpn_priv *ovpn; + struct ovpn_peer *peer; + u32 peer_id; + u8 opcode; + + ovpn_sock = ovpn_socket_from_udp_sock(sk); + if (unlikely(!ovpn_sock)) { + net_err_ratelimited("ovpn: %s invoked on non ovpn socket\n", + __func__); + goto drop_noovpn; + } + + ovpn = ovpn_sock->ovpn; + if (unlikely(!ovpn)) { + net_err_ratelimited("ovpn: cannot obtain ovpn object from UDP socket\n"); + goto drop_noovpn; + } + + /* Make sure the first 4 bytes of the skb data buffer after the UDP + * header are accessible. + * They are required to fetch the OP code, the key ID and the peer ID. + */ + if (unlikely(!pskb_may_pull(skb, sizeof(struct udphdr) + + OVPN_OPCODE_SIZE))) { + net_dbg_ratelimited("%s: packet too small from UDP socket\n", + netdev_name(ovpn->dev)); + goto drop; + } + + opcode = ovpn_opcode_from_skb(skb, sizeof(struct udphdr)); + if (unlikely(opcode != OVPN_DATA_V2)) { + /* DATA_V1 is not supported */ + if (opcode == OVPN_DATA_V1) + goto drop; + + /* unknown or control packet: let it bubble up to userspace */ + return 1; + } + + peer_id = ovpn_peer_id_from_skb(skb, sizeof(struct udphdr)); + /* some OpenVPN server implementations send data packets with the + * peer-id set to UNDEF. 
In this case we skip the peer lookup by peer-id + * and we try with the transport address + */ + if (peer_id == OVPN_PEER_ID_UNDEF) + peer = ovpn_peer_get_by_transp_addr(ovpn, skb); + else + peer = ovpn_peer_get_by_id(ovpn, peer_id); + + if (unlikely(!peer)) + goto drop; + + /* pop off outer UDP header */ + __skb_pull(skb, sizeof(struct udphdr)); + ovpn_recv(peer, skb); + return 0; + +drop: + dev_dstats_rx_dropped(ovpn->dev); +drop_noovpn: + kfree_skb(skb); + return 0; +} + /** * ovpn_udp4_output - send IPv4 packet over udp socket * @peer: the destination peer @@ -247,6 +353,25 @@ void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, } } +static void ovpn_udp_encap_destroy(struct sock *sk) +{ + struct ovpn_socket *sock; + struct ovpn_priv *ovpn; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock || !sock->ovpn) { + rcu_read_unlock(); + return; + } + ovpn = sock->ovpn; + rcu_read_unlock(); + + if (ovpn->mode == OVPN_MODE_P2P) + ovpn_peer_release_p2p(ovpn, sk, + OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT); +} + /** * ovpn_udp_socket_attach - set udp-tunnel CBs on socket and link it to ovpn * @ovpn_sock: socket to configure @@ -260,6 +385,11 @@ void ovpn_udp_send_skb(struct ovpn_peer *peer, struct socket *sock, int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, struct ovpn_priv *ovpn) { + struct udp_tunnel_sock_cfg cfg = { + .encap_type = UDP_ENCAP_OVPNINUDP, + .encap_rcv = ovpn_udp_encap_recv, + .encap_destroy = ovpn_udp_encap_destroy, + }; struct socket *sock = ovpn_sock->sock; struct ovpn_socket *old_data; int ret; @@ -270,6 +400,7 @@ int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, if (!old_data) { /* socket is currently unused - we can take it */ rcu_read_unlock(); + setup_udp_tunnel_sock(sock_net(sock->sk), sock, &cfg); return 0; } @@ -303,4 +434,8 @@ int ovpn_udp_socket_attach(struct ovpn_socket *ovpn_sock, */ void ovpn_udp_socket_detach(struct ovpn_socket *ovpn_sock) { + struct udp_tunnel_sock_cfg cfg = { }; + + setup_udp_tunnel_sock(sock_net(ovpn_sock->sock->sk), ovpn_sock->sock, + &cfg); } From 8534731dbf2d52a539b94defd06d2a8d3514aacb Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:26 +0200 Subject: [PATCH 269/770] ovpn: implement packet processing This change implements encryption/decryption and encapsulation/decapsulation of OpenVPN packets. Support for generic crypto state is added along with a wrapper for the AEAD crypto kernel API. 
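
For reference, the DATA_V2 encapsulation produced by
ovpn_aead_encrypt() looks as follows (a sketch derived from the code
below; sizes in bytes):

	[4: opcode/key-id/peer-id][4: packet ID][ ciphertext ][16: auth tag]
	 \__________ AAD (8) _________________/

The 12-byte AEAD nonce is never transmitted in full: only the 4-byte
packet ID goes on the wire, while the remaining 8-byte nonce tail is
part of the negotiated key material:

	u8 iv[OVPN_NONCE_SIZE]; /* 12 bytes */

	/* iv = 4-byte packet ID || 8-byte per-key nonce tail */
	ovpn_pktid_aead_write(pktid, ks->nonce_tail_xmit, iv);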
Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-9-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/Kconfig | 4 + drivers/net/ovpn/Makefile | 3 + drivers/net/ovpn/bind.c | 9 +- drivers/net/ovpn/crypto.c | 148 +++++++++++++ drivers/net/ovpn/crypto.h | 139 +++++++++++++ drivers/net/ovpn/crypto_aead.c | 366 +++++++++++++++++++++++++++++++++ drivers/net/ovpn/crypto_aead.h | 27 +++ drivers/net/ovpn/io.c | 137 +++++++++++- drivers/net/ovpn/io.h | 3 + drivers/net/ovpn/peer.c | 29 +++ drivers/net/ovpn/peer.h | 5 + drivers/net/ovpn/pktid.c | 129 ++++++++++++ drivers/net/ovpn/pktid.h | 86 ++++++++ drivers/net/ovpn/proto.h | 32 +++ drivers/net/ovpn/skb.h | 5 + 15 files changed, 1105 insertions(+), 17 deletions(-) create mode 100644 drivers/net/ovpn/crypto.c create mode 100644 drivers/net/ovpn/crypto.h create mode 100644 drivers/net/ovpn/crypto_aead.c create mode 100644 drivers/net/ovpn/crypto_aead.h create mode 100644 drivers/net/ovpn/pktid.c create mode 100644 drivers/net/ovpn/pktid.h diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 305f04dd97234..a5cee847911b1 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -121,6 +121,10 @@ config OVPN depends on IPV6 || !IPV6 select DST_CACHE select NET_UDP_TUNNEL + select CRYPTO + select CRYPTO_AES + select CRYPTO_GCM + select CRYPTO_CHACHA20POLY1305 help This module enhances the performance of the OpenVPN userspace software by offloading the data channel processing to kernelspace. diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 164f2058ea8e6..38c9fdca0e2e8 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -8,10 +8,13 @@ obj-$(CONFIG_OVPN) := ovpn.o ovpn-y += bind.o +ovpn-y += crypto.o +ovpn-y += crypto_aead.o ovpn-y += main.o ovpn-y += io.o ovpn-y += netlink.o ovpn-y += netlink-gen.o ovpn-y += peer.o +ovpn-y += pktid.o ovpn-y += socket.o ovpn-y += udp.o diff --git a/drivers/net/ovpn/bind.c b/drivers/net/ovpn/bind.c index d4a1aeed12c99..24d2788a277e6 100644 --- a/drivers/net/ovpn/bind.c +++ b/drivers/net/ovpn/bind.c @@ -48,11 +48,8 @@ struct ovpn_bind *ovpn_bind_from_sockaddr(const struct sockaddr_storage *ss) */ void ovpn_bind_reset(struct ovpn_peer *peer, struct ovpn_bind *new) { - struct ovpn_bind *old; + lockdep_assert_held(&peer->lock); - spin_lock_bh(&peer->lock); - old = rcu_replace_pointer(peer->bind, new, true); - spin_unlock_bh(&peer->lock); - - kfree_rcu(old, rcu); + kfree_rcu(rcu_replace_pointer(peer->bind, new, + lockdep_is_held(&peer->lock)), rcu); } diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c new file mode 100644 index 0000000000000..9544255c4588a --- /dev/null +++ b/drivers/net/ovpn/crypto.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "pktid.h" +#include "crypto_aead.h" +#include "crypto.h" + +static void ovpn_ks_destroy_rcu(struct rcu_head *head) +{ + struct ovpn_crypto_key_slot *ks; + + ks = container_of(head, struct ovpn_crypto_key_slot, rcu); + ovpn_aead_crypto_key_slot_destroy(ks); +} + +void ovpn_crypto_key_slot_release(struct kref *kref) +{ + struct ovpn_crypto_key_slot *ks; + + ks = container_of(kref, struct ovpn_crypto_key_slot, refcount); + call_rcu(&ks->rcu, ovpn_ks_destroy_rcu); +} + +/* can only be invoked when all peer references have been dropped (i.e. RCU + * release routine) + */ +void ovpn_crypto_state_release(struct ovpn_crypto_state *cs) +{ + struct ovpn_crypto_key_slot *ks; + + ks = rcu_access_pointer(cs->slots[0]); + if (ks) { + RCU_INIT_POINTER(cs->slots[0], NULL); + ovpn_crypto_key_slot_put(ks); + } + + ks = rcu_access_pointer(cs->slots[1]); + if (ks) { + RCU_INIT_POINTER(cs->slots[1], NULL); + ovpn_crypto_key_slot_put(ks); + } +} + +/* Reset the ovpn_crypto_state object in a way that is atomic + * to RCU readers. + */ +int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, + const struct ovpn_peer_key_reset *pkr) +{ + struct ovpn_crypto_key_slot *old = NULL, *new; + u8 idx; + + if (pkr->slot != OVPN_KEY_SLOT_PRIMARY && + pkr->slot != OVPN_KEY_SLOT_SECONDARY) + return -EINVAL; + + new = ovpn_aead_crypto_key_slot_new(&pkr->key); + if (IS_ERR(new)) + return PTR_ERR(new); + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + switch (pkr->slot) { + case OVPN_KEY_SLOT_PRIMARY: + old = rcu_replace_pointer(cs->slots[idx], new, + lockdep_is_held(&cs->lock)); + break; + case OVPN_KEY_SLOT_SECONDARY: + old = rcu_replace_pointer(cs->slots[!idx], new, + lockdep_is_held(&cs->lock)); + break; + } + spin_unlock_bh(&cs->lock); + + if (old) + ovpn_crypto_key_slot_put(old); + + return 0; +} + +void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot) +{ + struct ovpn_crypto_key_slot *ks = NULL; + u8 idx; + + if (slot != OVPN_KEY_SLOT_PRIMARY && + slot != OVPN_KEY_SLOT_SECONDARY) { + pr_warn("Invalid slot to release: %u\n", slot); + return; + } + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + switch (slot) { + case OVPN_KEY_SLOT_PRIMARY: + ks = rcu_replace_pointer(cs->slots[idx], NULL, + lockdep_is_held(&cs->lock)); + break; + case OVPN_KEY_SLOT_SECONDARY: + ks = rcu_replace_pointer(cs->slots[!idx], NULL, + lockdep_is_held(&cs->lock)); + break; + } + spin_unlock_bh(&cs->lock); + + if (!ks) { + pr_debug("Key slot already released: %u\n", slot); + return; + } + + pr_debug("deleting key slot %u, key_id=%u\n", slot, ks->key_id); + ovpn_crypto_key_slot_put(ks); +} + +void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs) +{ + const struct ovpn_crypto_key_slot *old_primary, *old_secondary; + u8 idx; + + spin_lock_bh(&cs->lock); + idx = cs->primary_idx; + old_primary = rcu_dereference_protected(cs->slots[idx], + lockdep_is_held(&cs->lock)); + old_secondary = rcu_dereference_protected(cs->slots[!idx], + lockdep_is_held(&cs->lock)); + /* perform real swap by switching the index of the primary key */ + WRITE_ONCE(cs->primary_idx, !cs->primary_idx); + + pr_debug("key swapped: (old primary) %d <-> (new primary) %d\n", + old_primary ? old_primary->key_id : -1, + old_secondary ? 
old_secondary->key_id : -1); + + spin_unlock_bh(&cs->lock); +} diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h new file mode 100644 index 0000000000000..5155791b87df7 --- /dev/null +++ b/drivers/net/ovpn/crypto.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNCRYPTO_H_ +#define _NET_OVPN_OVPNCRYPTO_H_ + +#include "pktid.h" +#include "proto.h" + +/* info needed for both encrypt and decrypt directions */ +struct ovpn_key_direction { + const u8 *cipher_key; + size_t cipher_key_size; + const u8 *nonce_tail; /* only needed for GCM modes */ + size_t nonce_tail_size; /* only needed for GCM modes */ +}; + +/* all info for a particular symmetric key (primary or secondary) */ +struct ovpn_key_config { + enum ovpn_cipher_alg cipher_alg; + u8 key_id; + struct ovpn_key_direction encrypt; + struct ovpn_key_direction decrypt; +}; + +/* used to pass settings from netlink to the crypto engine */ +struct ovpn_peer_key_reset { + enum ovpn_key_slot slot; + struct ovpn_key_config key; +}; + +struct ovpn_crypto_key_slot { + u8 key_id; + + struct crypto_aead *encrypt; + struct crypto_aead *decrypt; + u8 nonce_tail_xmit[OVPN_NONCE_TAIL_SIZE]; + u8 nonce_tail_recv[OVPN_NONCE_TAIL_SIZE]; + + struct ovpn_pktid_recv pid_recv ____cacheline_aligned_in_smp; + struct ovpn_pktid_xmit pid_xmit ____cacheline_aligned_in_smp; + struct kref refcount; + struct rcu_head rcu; +}; + +struct ovpn_crypto_state { + struct ovpn_crypto_key_slot __rcu *slots[2]; + u8 primary_idx; + + /* protects primary and secondary slots */ + spinlock_t lock; +}; + +static inline bool ovpn_crypto_key_slot_hold(struct ovpn_crypto_key_slot *ks) +{ + return kref_get_unless_zero(&ks->refcount); +} + +static inline void ovpn_crypto_state_init(struct ovpn_crypto_state *cs) +{ + RCU_INIT_POINTER(cs->slots[0], NULL); + RCU_INIT_POINTER(cs->slots[1], NULL); + cs->primary_idx = 0; + spin_lock_init(&cs->lock); +} + +static inline struct ovpn_crypto_key_slot * +ovpn_crypto_key_id_to_slot(const struct ovpn_crypto_state *cs, u8 key_id) +{ + struct ovpn_crypto_key_slot *ks; + u8 idx; + + if (unlikely(!cs)) + return NULL; + + rcu_read_lock(); + idx = READ_ONCE(cs->primary_idx); + ks = rcu_dereference(cs->slots[idx]); + if (ks && ks->key_id == key_id) { + if (unlikely(!ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + goto out; + } + + ks = rcu_dereference(cs->slots[!idx]); + if (ks && ks->key_id == key_id) { + if (unlikely(!ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + goto out; + } + + /* when both key slots are occupied but no matching key ID is found, ks + * has to be reset to NULL to avoid carrying a stale pointer + */ + ks = NULL; +out: + rcu_read_unlock(); + + return ks; +} + +static inline struct ovpn_crypto_key_slot * +ovpn_crypto_key_slot_primary(const struct ovpn_crypto_state *cs) +{ + struct ovpn_crypto_key_slot *ks; + + rcu_read_lock(); + ks = rcu_dereference(cs->slots[cs->primary_idx]); + if (unlikely(ks && !ovpn_crypto_key_slot_hold(ks))) + ks = NULL; + rcu_read_unlock(); + + return ks; +} + +void ovpn_crypto_key_slot_release(struct kref *kref); + +static inline void ovpn_crypto_key_slot_put(struct ovpn_crypto_key_slot *ks) +{ + kref_put(&ks->refcount, ovpn_crypto_key_slot_release); +} + +int ovpn_crypto_state_reset(struct ovpn_crypto_state *cs, + const struct ovpn_peer_key_reset *pkr); + +void ovpn_crypto_key_slot_delete(struct ovpn_crypto_state *cs, + enum 
ovpn_key_slot slot); + +void ovpn_crypto_state_release(struct ovpn_crypto_state *cs); + +void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs); + +#endif /* _NET_OVPN_OVPNCRYPTO_H_ */ diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c new file mode 100644 index 0000000000000..83ec18e4b9a4f --- /dev/null +++ b/drivers/net/ovpn/crypto_aead.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "pktid.h" +#include "crypto_aead.h" +#include "crypto.h" +#include "peer.h" +#include "proto.h" +#include "skb.h" + +#define OVPN_AUTH_TAG_SIZE 16 +#define OVPN_AAD_SIZE (OVPN_OPCODE_SIZE + OVPN_NONCE_WIRE_SIZE) + +#define ALG_NAME_AES "gcm(aes)" +#define ALG_NAME_CHACHAPOLY "rfc7539(chacha20,poly1305)" + +static int ovpn_aead_encap_overhead(const struct ovpn_crypto_key_slot *ks) +{ + return OVPN_OPCODE_SIZE + /* OP header size */ + sizeof(u32) + /* Packet ID */ + crypto_aead_authsize(ks->encrypt); /* Auth Tag */ +} + +int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb) +{ + const unsigned int tag_size = crypto_aead_authsize(ks->encrypt); + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + int nfrags, ret; + u32 pktid, op; + u8 *iv; + + ovpn_skb_cb(skb)->peer = peer; + ovpn_skb_cb(skb)->ks = ks; + + /* Sample AEAD header format: + * 48000001 00000005 7e7046bd 444a7e28 cc6387b1 64a4d6c1 380275a... + * [ OP32 ] [seq # ] [ auth tag ] [ payload ... ] + * [4-byte + * IV head] + */ + + /* check that there's enough headroom in the skb for packet + * encapsulation + */ + if (unlikely(skb_cow_head(skb, OVPN_HEAD_ROOM))) + return -ENOBUFS; + + /* get number of skb frags and ensure that packet data is writable */ + nfrags = skb_cow_data(skb, 0, &trailer); + if (unlikely(nfrags < 0)) + return nfrags; + + if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) + return -ENOSPC; + + /* sg may be required by async crypto */ + ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * + (nfrags + 2), GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->sg)) + return -ENOMEM; + + sg = ovpn_skb_cb(skb)->sg; + + /* sg table: + * 0: op, wire nonce (AD, len=OVPN_OP_SIZE_V2+OVPN_NONCE_WIRE_SIZE), + * 1, 2, 3, ..., n: payload, + * n+1: auth_tag (len=tag_size) + */ + sg_init_table(sg, nfrags + 2); + + /* build scatterlist to encrypt packet payload */ + ret = skb_to_sgvec_nomark(skb, sg + 1, 0, skb->len); + if (unlikely(nfrags != ret)) + return -EINVAL; + + /* append auth_tag onto scatterlist */ + __skb_push(skb, tag_size); + sg_set_buf(sg + nfrags + 1, skb->data, tag_size); + + /* obtain packet ID, which is used both as a first + * 4 bytes of nonce and last 4 bytes of associated data. 
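+	 * The full 12-byte nonce is the 4-byte packet ID followed by the
+	 * 8-byte per-key nonce tail (see ovpn_pktid_aead_write()).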
+ */ + ret = ovpn_pktid_xmit_next(&ks->pid_xmit, &pktid); + if (unlikely(ret < 0)) + return ret; + + /* iv may be required by async crypto */ + ovpn_skb_cb(skb)->iv = kmalloc(OVPN_NONCE_SIZE, GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->iv)) + return -ENOMEM; + + iv = ovpn_skb_cb(skb)->iv; + + /* concat 4 bytes packet id and 8 bytes nonce tail into 12 bytes + * nonce + */ + ovpn_pktid_aead_write(pktid, ks->nonce_tail_xmit, iv); + + /* make space for packet id and push it to the front */ + __skb_push(skb, OVPN_NONCE_WIRE_SIZE); + memcpy(skb->data, iv, OVPN_NONCE_WIRE_SIZE); + + /* add packet op as head of additional data */ + op = ovpn_opcode_compose(OVPN_DATA_V2, ks->key_id, peer->id); + __skb_push(skb, OVPN_OPCODE_SIZE); + BUILD_BUG_ON(sizeof(op) != OVPN_OPCODE_SIZE); + *((__force __be32 *)skb->data) = htonl(op); + + /* AEAD Additional data */ + sg_set_buf(sg, skb->data, OVPN_AAD_SIZE); + + req = aead_request_alloc(ks->encrypt, GFP_ATOMIC); + if (unlikely(!req)) + return -ENOMEM; + + ovpn_skb_cb(skb)->req = req; + + /* setup async crypto operation */ + aead_request_set_tfm(req, ks->encrypt); + aead_request_set_callback(req, 0, ovpn_encrypt_post, skb); + aead_request_set_crypt(req, sg, sg, + skb->len - ovpn_aead_encap_overhead(ks), iv); + aead_request_set_ad(req, OVPN_AAD_SIZE); + + /* encrypt it */ + return crypto_aead_encrypt(req); +} + +int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb) +{ + const unsigned int tag_size = crypto_aead_authsize(ks->decrypt); + int ret, payload_len, nfrags; + unsigned int payload_offset; + struct aead_request *req; + struct sk_buff *trailer; + struct scatterlist *sg; + u8 *iv; + + payload_offset = OVPN_AAD_SIZE + tag_size; + payload_len = skb->len - payload_offset; + + ovpn_skb_cb(skb)->payload_offset = payload_offset; + ovpn_skb_cb(skb)->peer = peer; + ovpn_skb_cb(skb)->ks = ks; + + /* sanity check on packet size, payload size must be >= 0 */ + if (unlikely(payload_len < 0)) + return -EINVAL; + + /* Prepare the skb data buffer to be accessed up until the auth tag. + * This is required because this area is directly mapped into the sg + * list. 
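+	 * Both the AAD and the auth tag are mapped from skb->data via
+	 * sg_set_buf(), so they must sit in the linear area.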
+ */ + if (unlikely(!pskb_may_pull(skb, payload_offset))) + return -ENODATA; + + /* get number of skb frags and ensure that packet data is writable */ + nfrags = skb_cow_data(skb, 0, &trailer); + if (unlikely(nfrags < 0)) + return nfrags; + + if (unlikely(nfrags + 2 > (MAX_SKB_FRAGS + 2))) + return -ENOSPC; + + /* sg may be required by async crypto */ + ovpn_skb_cb(skb)->sg = kmalloc(sizeof(*ovpn_skb_cb(skb)->sg) * + (nfrags + 2), GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->sg)) + return -ENOMEM; + + sg = ovpn_skb_cb(skb)->sg; + + /* sg table: + * 0: op, wire nonce (AD, len=OVPN_OPCODE_SIZE+OVPN_NONCE_WIRE_SIZE), + * 1, 2, 3, ..., n: payload, + * n+1: auth_tag (len=tag_size) + */ + sg_init_table(sg, nfrags + 2); + + /* packet op is head of additional data */ + sg_set_buf(sg, skb->data, OVPN_AAD_SIZE); + + /* build scatterlist to decrypt packet payload */ + ret = skb_to_sgvec_nomark(skb, sg + 1, payload_offset, payload_len); + if (unlikely(nfrags != ret)) + return -EINVAL; + + /* append auth_tag onto scatterlist */ + sg_set_buf(sg + nfrags + 1, skb->data + OVPN_AAD_SIZE, tag_size); + + /* iv may be required by async crypto */ + ovpn_skb_cb(skb)->iv = kmalloc(OVPN_NONCE_SIZE, GFP_ATOMIC); + if (unlikely(!ovpn_skb_cb(skb)->iv)) + return -ENOMEM; + + iv = ovpn_skb_cb(skb)->iv; + + /* copy nonce into IV buffer */ + memcpy(iv, skb->data + OVPN_OPCODE_SIZE, OVPN_NONCE_WIRE_SIZE); + memcpy(iv + OVPN_NONCE_WIRE_SIZE, ks->nonce_tail_recv, + OVPN_NONCE_TAIL_SIZE); + + req = aead_request_alloc(ks->decrypt, GFP_ATOMIC); + if (unlikely(!req)) + return -ENOMEM; + + ovpn_skb_cb(skb)->req = req; + + /* setup async crypto operation */ + aead_request_set_tfm(req, ks->decrypt); + aead_request_set_callback(req, 0, ovpn_decrypt_post, skb); + aead_request_set_crypt(req, sg, sg, payload_len + tag_size, iv); + + aead_request_set_ad(req, OVPN_AAD_SIZE); + + /* decrypt it */ + return crypto_aead_decrypt(req); +} + +/* Initialize a struct crypto_aead object */ +static struct crypto_aead *ovpn_aead_init(const char *title, + const char *alg_name, + const unsigned char *key, + unsigned int keylen) +{ + struct crypto_aead *aead; + int ret; + + aead = crypto_alloc_aead(alg_name, 0, 0); + if (IS_ERR(aead)) { + ret = PTR_ERR(aead); + pr_err("%s crypto_alloc_aead failed, err=%d\n", title, ret); + aead = NULL; + goto error; + } + + ret = crypto_aead_setkey(aead, key, keylen); + if (ret) { + pr_err("%s crypto_aead_setkey size=%u failed, err=%d\n", title, + keylen, ret); + goto error; + } + + ret = crypto_aead_setauthsize(aead, OVPN_AUTH_TAG_SIZE); + if (ret) { + pr_err("%s crypto_aead_setauthsize failed, err=%d\n", title, + ret); + goto error; + } + + /* basic AEAD assumption */ + if (crypto_aead_ivsize(aead) != OVPN_NONCE_SIZE) { + pr_err("%s IV size must be %d\n", title, OVPN_NONCE_SIZE); + ret = -EINVAL; + goto error; + } + + pr_debug("********* Cipher %s (%s)\n", alg_name, title); + pr_debug("*** IV size=%u\n", crypto_aead_ivsize(aead)); + pr_debug("*** req size=%u\n", crypto_aead_reqsize(aead)); + pr_debug("*** block size=%u\n", crypto_aead_blocksize(aead)); + pr_debug("*** auth size=%u\n", crypto_aead_authsize(aead)); + pr_debug("*** alignmask=0x%x\n", crypto_aead_alignmask(aead)); + + return aead; + +error: + crypto_free_aead(aead); + return ERR_PTR(ret); +} + +void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks) +{ + if (!ks) + return; + + crypto_free_aead(ks->encrypt); + crypto_free_aead(ks->decrypt); + kfree(ks); +} + +struct ovpn_crypto_key_slot * +ovpn_aead_crypto_key_slot_new(const struct 
ovpn_key_config *kc) +{ + struct ovpn_crypto_key_slot *ks = NULL; + const char *alg_name; + int ret; + + /* validate crypto alg */ + switch (kc->cipher_alg) { + case OVPN_CIPHER_ALG_AES_GCM: + alg_name = ALG_NAME_AES; + break; + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + alg_name = ALG_NAME_CHACHAPOLY; + break; + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (kc->encrypt.nonce_tail_size != OVPN_NONCE_TAIL_SIZE || + kc->decrypt.nonce_tail_size != OVPN_NONCE_TAIL_SIZE) + return ERR_PTR(-EINVAL); + + /* build the key slot */ + ks = kmalloc(sizeof(*ks), GFP_KERNEL); + if (!ks) + return ERR_PTR(-ENOMEM); + + ks->encrypt = NULL; + ks->decrypt = NULL; + kref_init(&ks->refcount); + ks->key_id = kc->key_id; + + ks->encrypt = ovpn_aead_init("encrypt", alg_name, + kc->encrypt.cipher_key, + kc->encrypt.cipher_key_size); + if (IS_ERR(ks->encrypt)) { + ret = PTR_ERR(ks->encrypt); + ks->encrypt = NULL; + goto destroy_ks; + } + + ks->decrypt = ovpn_aead_init("decrypt", alg_name, + kc->decrypt.cipher_key, + kc->decrypt.cipher_key_size); + if (IS_ERR(ks->decrypt)) { + ret = PTR_ERR(ks->decrypt); + ks->decrypt = NULL; + goto destroy_ks; + } + + memcpy(ks->nonce_tail_xmit, kc->encrypt.nonce_tail, + OVPN_NONCE_TAIL_SIZE); + memcpy(ks->nonce_tail_recv, kc->decrypt.nonce_tail, + OVPN_NONCE_TAIL_SIZE); + + /* init packet ID generation/validation */ + ovpn_pktid_xmit_init(&ks->pid_xmit); + ovpn_pktid_recv_init(&ks->pid_recv); + + return ks; + +destroy_ks: + ovpn_aead_crypto_key_slot_destroy(ks); + return ERR_PTR(ret); +} diff --git a/drivers/net/ovpn/crypto_aead.h b/drivers/net/ovpn/crypto_aead.h new file mode 100644 index 0000000000000..40c056558add3 --- /dev/null +++ b/drivers/net/ovpn/crypto_aead.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ * + * Author: James Yonan + * Antonio Quartulli + */ + +#ifndef _NET_OVPN_OVPNAEAD_H_ +#define _NET_OVPN_OVPNAEAD_H_ + +#include "crypto.h" + +#include +#include + +int ovpn_aead_encrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb); +int ovpn_aead_decrypt(struct ovpn_peer *peer, struct ovpn_crypto_key_slot *ks, + struct sk_buff *skb); + +struct ovpn_crypto_key_slot * +ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc); +void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks); + +#endif /* _NET_OVPN_OVPNAEAD_H_ */ diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 22fa92b9aeeb5..846b6d7d4ec69 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -7,6 +7,7 @@ * Antonio Quartulli */ +#include #include #include #include @@ -15,6 +16,9 @@ #include "ovpnpriv.h" #include "peer.h" #include "io.h" +#include "bind.h" +#include "crypto.h" +#include "crypto_aead.h" #include "netlink.h" #include "proto.h" #include "udp.h" @@ -44,7 +48,7 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) skb_set_queue_mapping(skb, 0); skb_scrub_packet(skb, true); - skb_reset_network_header(skb); + /* network header reset in ovpn_decrypt_post() */ skb_reset_transport_header(skb); skb_reset_inner_headers(skb); @@ -56,34 +60,135 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) dev_dstats_rx_add(peer->ovpn->dev, pkt_len); } -static void ovpn_decrypt_post(struct sk_buff *skb, int ret) +void ovpn_decrypt_post(void *data, int ret) { - struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer; + struct ovpn_crypto_key_slot *ks; + unsigned int payload_offset = 0; + struct sk_buff *skb = data; + struct ovpn_peer *peer; + __be16 proto; + __be32 *pid; + + /* crypto is happening asynchronously. 
this function will be called + * again later by the crypto callback with a proper return code + */ + if (unlikely(ret == -EINPROGRESS)) + return; + + payload_offset = ovpn_skb_cb(skb)->payload_offset; + ks = ovpn_skb_cb(skb)->ks; + peer = ovpn_skb_cb(skb)->peer; + + /* crypto is done, cleanup skb CB and its members */ + kfree(ovpn_skb_cb(skb)->iv); + kfree(ovpn_skb_cb(skb)->sg); + aead_request_free(ovpn_skb_cb(skb)->req); if (unlikely(ret < 0)) goto drop; + /* PID sits after the op */ + pid = (__force __be32 *)(skb->data + OVPN_OPCODE_SIZE); + ret = ovpn_pktid_recv(&ks->pid_recv, ntohl(*pid), 0); + if (unlikely(ret < 0)) { + net_err_ratelimited("%s: PKT ID RX error for peer %u: %d\n", + netdev_name(peer->ovpn->dev), peer->id, + ret); + goto drop; + } + + /* point to encapsulated IP packet */ + __skb_pull(skb, payload_offset); + + /* check if this is a valid datapacket that has to be delivered to the + * ovpn interface + */ + skb_reset_network_header(skb); + proto = ovpn_ip_check_protocol(skb); + if (unlikely(!proto)) { + /* check if null packet */ + if (unlikely(!pskb_may_pull(skb, 1))) { + net_info_ratelimited("%s: NULL packet received from peer %u\n", + netdev_name(peer->ovpn->dev), + peer->id); + goto drop; + } + + net_info_ratelimited("%s: unsupported protocol received from peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto drop; + } + skb->protocol = proto; + + /* perform Reverse Path Filtering (RPF) */ + if (unlikely(!ovpn_peer_check_by_src(peer->ovpn, skb, peer))) { + if (skb->protocol == htons(ETH_P_IPV6)) + net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI6c\n", + netdev_name(peer->ovpn->dev), + peer->id, &ipv6_hdr(skb)->saddr); + else + net_dbg_ratelimited("%s: RPF dropped packet from peer %u, src: %pI4\n", + netdev_name(peer->ovpn->dev), + peer->id, &ip_hdr(skb)->saddr); + goto drop; + } + ovpn_netdev_write(peer, skb); /* skb is passed to upper layer - don't free it */ skb = NULL; drop: if (unlikely(skb)) dev_dstats_rx_dropped(peer->ovpn->dev); - ovpn_peer_put(peer); + if (likely(peer)) + ovpn_peer_put(peer); + if (likely(ks)) + ovpn_crypto_key_slot_put(ks); kfree_skb(skb); } /* RX path entry point: decrypt packet and forward it to the device */ void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb) { - ovpn_skb_cb(skb)->peer = peer; - ovpn_decrypt_post(skb, 0); + struct ovpn_crypto_key_slot *ks; + u8 key_id; + + /* get the key slot matching the key ID in the received packet */ + key_id = ovpn_key_id_from_skb(skb); + ks = ovpn_crypto_key_id_to_slot(&peer->crypto, key_id); + if (unlikely(!ks)) { + net_info_ratelimited("%s: no available key for peer %u, key-id: %u\n", + netdev_name(peer->ovpn->dev), peer->id, + key_id); + dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); + ovpn_peer_put(peer); + return; + } + + memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); + ovpn_decrypt_post(skb, ovpn_aead_decrypt(peer, ks, skb)); } -static void ovpn_encrypt_post(struct sk_buff *skb, int ret) +void ovpn_encrypt_post(void *data, int ret) { - struct ovpn_peer *peer = ovpn_skb_cb(skb)->peer; + struct ovpn_crypto_key_slot *ks; + struct sk_buff *skb = data; struct ovpn_socket *sock; + struct ovpn_peer *peer; + + /* encryption is happening asynchronously. 
This function will be + * called later by the crypto callback with a proper return value + */ + if (unlikely(ret == -EINPROGRESS)) + return; + + ks = ovpn_skb_cb(skb)->ks; + peer = ovpn_skb_cb(skb)->peer; + + /* crypto is done, cleanup skb CB and its members */ + kfree(ovpn_skb_cb(skb)->iv); + kfree(ovpn_skb_cb(skb)->sg); + aead_request_free(ovpn_skb_cb(skb)->req); if (unlikely(ret < 0)) goto err; @@ -110,23 +215,33 @@ static void ovpn_encrypt_post(struct sk_buff *skb, int ret) err: if (unlikely(skb)) dev_dstats_tx_dropped(peer->ovpn->dev); - ovpn_peer_put(peer); + if (likely(peer)) + ovpn_peer_put(peer); + if (likely(ks)) + ovpn_crypto_key_slot_put(ks); kfree_skb(skb); } static bool ovpn_encrypt_one(struct ovpn_peer *peer, struct sk_buff *skb) { - ovpn_skb_cb(skb)->peer = peer; + struct ovpn_crypto_key_slot *ks; + + /* get primary key to be used for encrypting data */ + ks = ovpn_crypto_key_slot_primary(&peer->crypto); + if (unlikely(!ks)) + return false; /* take a reference to the peer because the crypto code may run async. * ovpn_encrypt_post() will release it upon completion */ if (unlikely(!ovpn_peer_hold(peer))) { DEBUG_NET_WARN_ON_ONCE(1); + ovpn_crypto_key_slot_put(ks); return false; } - ovpn_encrypt_post(skb, 0); + memset(ovpn_skb_cb(skb), 0, sizeof(struct ovpn_cb)); + ovpn_encrypt_post(skb, ovpn_aead_encrypt(peer, ks, skb)); return true; } diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h index 1cfa66873a2d4..5143104b2c4b8 100644 --- a/drivers/net/ovpn/io.h +++ b/drivers/net/ovpn/io.h @@ -23,4 +23,7 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb); +void ovpn_encrypt_post(void *data, int ret); +void ovpn_decrypt_post(void *data, int ret); + #endif /* _NET_OVPN_OVPN_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 10eabd62ae723..23eaab1b465b8 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -12,6 +12,8 @@ #include "ovpnpriv.h" #include "bind.h" +#include "pktid.h" +#include "crypto.h" #include "io.h" #include "main.h" #include "netlink.h" @@ -56,6 +58,7 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) peer->vpn_addrs.ipv6 = in6addr_any; RCU_INIT_POINTER(peer->bind, NULL); + ovpn_crypto_state_init(&peer->crypto); spin_lock_init(&peer->lock); kref_init(&peer->refcount); @@ -94,7 +97,10 @@ static void ovpn_peer_release_rcu(struct rcu_head *head) */ static void ovpn_peer_release(struct ovpn_peer *peer) { + ovpn_crypto_state_release(&peer->crypto); + spin_lock_bh(&peer->lock); ovpn_bind_reset(peer, NULL); + spin_unlock_bh(&peer->lock); call_rcu(&peer->rcu, ovpn_peer_release_rcu); netdev_put(peer->ovpn->dev, &peer->dev_tracker); } @@ -326,6 +332,29 @@ struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, return peer; } +/** + * ovpn_peer_check_by_src - check that skb source is routed via peer + * @ovpn: the openvpn instance to search + * @skb: the packet to extract source address from + * @peer: the peer to check against the source address + * + * Return: true if the peer is matching or false otherwise + */ +bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer) +{ + bool match = false; + + if (ovpn->mode == OVPN_MODE_P2P) { + /* in P2P mode, no matter the destination, packets are always + * sent to the single peer listening on the other side + */ + match = (peer == rcu_access_pointer(ovpn->peer)); + } + + return match; +} + /** * ovpn_peer_add_p2p - add peer to related tables 
in a P2P instance * @ovpn: the instance to add the peer to diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index fef04311c1593..a9113a969f94d 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -12,6 +12,7 @@ #include +#include "crypto.h" #include "socket.h" /** @@ -23,6 +24,7 @@ * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel * @sock: the socket being used to talk to this peer + * @crypto: the crypto configuration (ciphers, keys, etc..) * @dst_cache: cache for dst_entry used to send to peer * @bind: remote peer binding * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..) @@ -40,6 +42,7 @@ struct ovpn_peer { struct in6_addr ipv6; } vpn_addrs; struct ovpn_socket __rcu *sock; + struct ovpn_crypto_state crypto; struct dst_cache dst_cache; struct ovpn_bind __rcu *bind; enum ovpn_del_peer_reason delete_reason; @@ -82,5 +85,7 @@ struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id); struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, struct sk_buff *skb); +bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, + struct ovpn_peer *peer); #endif /* _NET_OVPN_OVPNPEER_H_ */ diff --git a/drivers/net/ovpn/pktid.c b/drivers/net/ovpn/pktid.c new file mode 100644 index 0000000000000..2f29049897e39 --- /dev/null +++ b/drivers/net/ovpn/pktid.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "pktid.h" + +void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid) +{ + atomic_set(&pid->seq_num, 1); +} + +void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr) +{ + memset(pr, 0, sizeof(*pr)); + spin_lock_init(&pr->lock); +} + +/* Packet replay detection. + * Allows ID backtrack of up to REPLAY_WINDOW_SIZE - 1. + */ +int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time) +{ + const unsigned long now = jiffies; + int ret; + + /* ID must not be zero */ + if (unlikely(pkt_id == 0)) + return -EINVAL; + + spin_lock_bh(&pr->lock); + + /* expire backtracks at or below pr->id after PKTID_RECV_EXPIRE time */ + if (unlikely(time_after_eq(now, pr->expire))) + pr->id_floor = pr->id; + + /* time changed? 
*/ + if (unlikely(pkt_time != pr->time)) { + if (pkt_time > pr->time) { + /* time moved forward, accept */ + pr->base = 0; + pr->extent = 0; + pr->id = 0; + pr->time = pkt_time; + pr->id_floor = 0; + } else { + /* time moved backward, reject */ + ret = -ETIME; + goto out; + } + } + + if (likely(pkt_id == pr->id + 1)) { + /* well-formed ID sequence (incremented by 1) */ + pr->base = REPLAY_INDEX(pr->base, -1); + pr->history[pr->base / 8] |= (1 << (pr->base % 8)); + if (pr->extent < REPLAY_WINDOW_SIZE) + ++pr->extent; + pr->id = pkt_id; + } else if (pkt_id > pr->id) { + /* ID jumped forward by more than one */ + const unsigned int delta = pkt_id - pr->id; + + if (delta < REPLAY_WINDOW_SIZE) { + unsigned int i; + + pr->base = REPLAY_INDEX(pr->base, -delta); + pr->history[pr->base / 8] |= (1 << (pr->base % 8)); + pr->extent += delta; + if (pr->extent > REPLAY_WINDOW_SIZE) + pr->extent = REPLAY_WINDOW_SIZE; + for (i = 1; i < delta; ++i) { + unsigned int newb = REPLAY_INDEX(pr->base, i); + + pr->history[newb / 8] &= ~BIT(newb % 8); + } + } else { + pr->base = 0; + pr->extent = REPLAY_WINDOW_SIZE; + memset(pr->history, 0, sizeof(pr->history)); + pr->history[0] = 1; + } + pr->id = pkt_id; + } else { + /* ID backtrack */ + const unsigned int delta = pr->id - pkt_id; + + if (delta > pr->max_backtrack) + pr->max_backtrack = delta; + if (delta < pr->extent) { + if (pkt_id > pr->id_floor) { + const unsigned int ri = REPLAY_INDEX(pr->base, + delta); + u8 *p = &pr->history[ri / 8]; + const u8 mask = (1 << (ri % 8)); + + if (*p & mask) { + ret = -EINVAL; + goto out; + } + *p |= mask; + } else { + ret = -EINVAL; + goto out; + } + } else { + ret = -EINVAL; + goto out; + } + } + + pr->expire = now + PKTID_RECV_EXPIRE; + ret = 0; +out: + spin_unlock_bh(&pr->lock); + return ret; +} diff --git a/drivers/net/ovpn/pktid.h b/drivers/net/ovpn/pktid.h new file mode 100644 index 0000000000000..0262d026d15e2 --- /dev/null +++ b/drivers/net/ovpn/pktid.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + * James Yonan + */ + +#ifndef _NET_OVPN_OVPNPKTID_H_ +#define _NET_OVPN_OVPNPKTID_H_ + +#include "proto.h" + +/* If no packets received for this length of time, set a backtrack floor + * at highest received packet ID thus far. + */ +#define PKTID_RECV_EXPIRE (30 * HZ) + +/* Packet-ID state for transmitter */ +struct ovpn_pktid_xmit { + atomic_t seq_num; +}; + +/* replay window sizing in bytes = 2^REPLAY_WINDOW_ORDER */ +#define REPLAY_WINDOW_ORDER 8 + +#define REPLAY_WINDOW_BYTES BIT(REPLAY_WINDOW_ORDER) +#define REPLAY_WINDOW_SIZE (REPLAY_WINDOW_BYTES * 8) +#define REPLAY_INDEX(base, i) (((base) + (i)) & (REPLAY_WINDOW_SIZE - 1)) + +/* Packet-ID state for receiver. + * Other than lock member, can be zeroed to initialize. 
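+ * (ovpn_pktid_recv_init() does exactly that: a memset() followed by
+ * spin_lock_init())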
+ */
+struct ovpn_pktid_recv {
+	/* "sliding window" bitmask of recent packet IDs received */
+	u8 history[REPLAY_WINDOW_BYTES];
+	/* bit position of deque base in history */
+	unsigned int base;
+	/* extent (in bits) of deque in history */
+	unsigned int extent;
+	/* expiration of history in jiffies */
+	unsigned long expire;
+	/* highest sequence number received */
+	u32 id;
+	/* highest time stamp received */
+	u32 time;
+	/* we will only accept backtrack IDs > id_floor */
+	u32 id_floor;
+	unsigned int max_backtrack;
+	/* protects the entire packet ID state */
+	spinlock_t lock;
+};
+
+/* Get the next packet ID for xmit */
+static inline int ovpn_pktid_xmit_next(struct ovpn_pktid_xmit *pid, u32 *pktid)
+{
+	const u32 seq_num = atomic_fetch_add_unless(&pid->seq_num, 1, 0);
+	/* when the 32bit space is over, we return an error because the packet
+	 * ID is used to create the cipher IV and we do not want to reuse the
+	 * same value more than once
+	 */
+	if (unlikely(!seq_num))
+		return -ERANGE;
+
+	*pktid = seq_num;
+
+	return 0;
+}
+
+/* Write 12-byte AEAD IV to dest */
+static inline void ovpn_pktid_aead_write(const u32 pktid,
+					 const u8 nt[],
+					 unsigned char *dest)
+{
+	*(__force __be32 *)(dest) = htonl(pktid);
+	BUILD_BUG_ON(4 + OVPN_NONCE_TAIL_SIZE != OVPN_NONCE_SIZE);
+	memcpy(dest + 4, nt, OVPN_NONCE_TAIL_SIZE);
+}
+
+void ovpn_pktid_xmit_init(struct ovpn_pktid_xmit *pid);
+void ovpn_pktid_recv_init(struct ovpn_pktid_recv *pr);
+
+int ovpn_pktid_recv(struct ovpn_pktid_recv *pr, u32 pkt_id, u32 pkt_time);
+
+#endif /* _NET_OVPN_OVPNPKTID_H_ */
diff --git a/drivers/net/ovpn/proto.h b/drivers/net/ovpn/proto.h
index 591b97a9925fd..b7d285b4d9c1d 100644
--- a/drivers/net/ovpn/proto.h
+++ b/drivers/net/ovpn/proto.h
@@ -83,4 +83,36 @@ static inline u32 ovpn_peer_id_from_skb(const struct sk_buff *skb, u16 offset)
 	return FIELD_GET(OVPN_OPCODE_PEERID_MASK, opcode);
 }
 
+/**
+ * ovpn_key_id_from_skb - extract key ID from the skb head
+ * @skb: the packet to extract the key ID from
+ *
+ * Note: this function assumes that the skb head was pulled enough
+ * to access the first 4 bytes.
+ *
+ * Return: the key ID
+ */
+static inline u8 ovpn_key_id_from_skb(const struct sk_buff *skb)
+{
+	u32 opcode = be32_to_cpu(*(__be32 *)skb->data);
+
+	return FIELD_GET(OVPN_OPCODE_KEYID_MASK, opcode);
+}
+
+/**
+ * ovpn_opcode_compose - combine OP code, key ID and peer ID to wire format
+ * @opcode: the OP code
+ * @key_id: the key ID
+ * @peer_id: the peer ID
+ *
+ * Return: a 4-byte integer obtained by combining all input values following
+ * the OpenVPN wire format. This integer can then be written to the packet
+ * header.
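+ * E.g. opcode OVPN_DATA_V2 (9), key ID 1 and peer ID 5 compose to
+ * 0x49000005.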
+ */ +static inline u32 ovpn_opcode_compose(u8 opcode, u8 key_id, u32 peer_id) +{ + return FIELD_PREP(OVPN_OPCODE_PKTTYPE_MASK, opcode) | + FIELD_PREP(OVPN_OPCODE_KEYID_MASK, key_id) | + FIELD_PREP(OVPN_OPCODE_PEERID_MASK, peer_id); +} + #endif /* _NET_OVPN_OVPNPROTO_H_ */ diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h index 9db7a9adebdb4..bd3cbcfc770d2 100644 --- a/drivers/net/ovpn/skb.h +++ b/drivers/net/ovpn/skb.h @@ -20,6 +20,11 @@ struct ovpn_cb { struct ovpn_peer *peer; + struct ovpn_crypto_key_slot *ks; + struct aead_request *req; + struct scatterlist *sg; + u8 *iv; + unsigned int payload_offset; }; static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb) From 04ca14955f9ae84a3185e23c4421966938e9d134 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:27 +0200 Subject: [PATCH 270/770] ovpn: store tunnel and transport statistics Byte/packet counters for in-tunnel and transport streams are now initialized and updated as needed. To be exported via netlink. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-10-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/Makefile | 1 + drivers/net/ovpn/io.c | 12 +++++++++- drivers/net/ovpn/peer.c | 2 ++ drivers/net/ovpn/peer.h | 5 +++++ drivers/net/ovpn/stats.c | 21 +++++++++++++++++ drivers/net/ovpn/stats.h | 47 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ovpn/stats.c create mode 100644 drivers/net/ovpn/stats.h diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 38c9fdca0e2e8..04c3345807c5d 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -17,4 +17,5 @@ ovpn-y += netlink-gen.o ovpn-y += peer.o ovpn-y += pktid.o ovpn-y += socket.o +ovpn-y += stats.o ovpn-y += udp.o diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 846b6d7d4ec69..bf4d471e6d149 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "ovpnpriv.h" #include "peer.h" @@ -55,9 +56,11 @@ static void ovpn_netdev_write(struct ovpn_peer *peer, struct sk_buff *skb) /* cause packet to be "received" by the interface */ pkt_len = skb->len; ret = gro_cells_receive(&peer->ovpn->gro_cells, skb); - if (likely(ret == NET_RX_SUCCESS)) + if (likely(ret == NET_RX_SUCCESS)) { /* update RX stats with the size of decrypted packet */ + ovpn_peer_stats_increment_rx(&peer->vpn_stats, pkt_len); dev_dstats_rx_add(peer->ovpn->dev, pkt_len); + } } void ovpn_decrypt_post(void *data, int ret) @@ -152,6 +155,8 @@ void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb) struct ovpn_crypto_key_slot *ks; u8 key_id; + ovpn_peer_stats_increment_rx(&peer->link_stats, skb->len); + /* get the key slot matching the key ID in the received packet */ key_id = ovpn_key_id_from_skb(skb); ks = ovpn_crypto_key_id_to_slot(&peer->crypto, key_id); @@ -175,6 +180,7 @@ void ovpn_encrypt_post(void *data, int ret) struct sk_buff *skb = data; struct ovpn_socket *sock; struct ovpn_peer *peer; + unsigned int orig_len; /* encryption is happening asynchronously. 
This function will be * called later by the crypto callback with a proper return value @@ -194,6 +200,7 @@ void ovpn_encrypt_post(void *data, int ret) goto err; skb_mark_not_on_list(skb); + orig_len = skb->len; rcu_read_lock(); sock = rcu_dereference(peer->sock); @@ -208,6 +215,8 @@ void ovpn_encrypt_post(void *data, int ret) /* no transport configured yet */ goto err_unlock; } + + ovpn_peer_stats_increment_tx(&peer->link_stats, orig_len); /* skb passed down the stack - don't free it */ skb = NULL; err_unlock: @@ -322,6 +331,7 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; } + ovpn_peer_stats_increment_tx(&peer->vpn_stats, skb->len); ovpn_send(ovpn, skb_list.next, peer); return NETDEV_TX_OK; diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 23eaab1b465b8..0fe5333c6b810 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -61,6 +61,8 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) ovpn_crypto_state_init(&peer->crypto); spin_lock_init(&peer->lock); kref_init(&peer->refcount); + ovpn_peer_stats_init(&peer->vpn_stats); + ovpn_peer_stats_init(&peer->link_stats); ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); if (ret < 0) { diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index a9113a969f94d..2453d39ce327c 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -14,6 +14,7 @@ #include "crypto.h" #include "socket.h" +#include "stats.h" /** * struct ovpn_peer - the main remote peer object @@ -27,6 +28,8 @@ * @crypto: the crypto configuration (ciphers, keys, etc..) * @dst_cache: cache for dst_entry used to send to peer * @bind: remote peer binding + * @vpn_stats: per-peer in-VPN TX/RX stats + * @link_stats: per-peer link/transport TX/RX stats * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..) * @lock: protects binding to peer (bind) * @refcount: reference counter @@ -45,6 +48,8 @@ struct ovpn_peer { struct ovpn_crypto_state crypto; struct dst_cache dst_cache; struct ovpn_bind __rcu *bind; + struct ovpn_peer_stats vpn_stats; + struct ovpn_peer_stats link_stats; enum ovpn_del_peer_reason delete_reason; spinlock_t lock; /* protects bind */ struct kref refcount; diff --git a/drivers/net/ovpn/stats.c b/drivers/net/ovpn/stats.c new file mode 100644 index 0000000000000..d637143473bb9 --- /dev/null +++ b/drivers/net/ovpn/stats.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: James Yonan + * Antonio Quartulli + */ + +#include + +#include "stats.h" + +void ovpn_peer_stats_init(struct ovpn_peer_stats *ps) +{ + atomic64_set(&ps->rx.bytes, 0); + atomic64_set(&ps->rx.packets, 0); + + atomic64_set(&ps->tx.bytes, 0); + atomic64_set(&ps->tx.packets, 0); +} diff --git a/drivers/net/ovpn/stats.h b/drivers/net/ovpn/stats.h new file mode 100644 index 0000000000000..53433d8b6c331 --- /dev/null +++ b/drivers/net/ovpn/stats.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2020-2025 OpenVPN, Inc. 
+ *
+ * Author:	James Yonan
+ *		Antonio Quartulli
+ *		Lev Stipakov
+ */
+
+#ifndef _NET_OVPN_OVPNSTATS_H_
+#define _NET_OVPN_OVPNSTATS_H_
+
+/* one stat */
+struct ovpn_peer_stat {
+	atomic64_t bytes;
+	atomic64_t packets;
+};
+
+/* rx and tx stats combined */
+struct ovpn_peer_stats {
+	struct ovpn_peer_stat rx;
+	struct ovpn_peer_stat tx;
+};
+
+void ovpn_peer_stats_init(struct ovpn_peer_stats *ps);
+
+static inline void ovpn_peer_stats_increment(struct ovpn_peer_stat *stat,
+					     const unsigned int n)
+{
+	atomic64_add(n, &stat->bytes);
+	atomic64_inc(&stat->packets);
+}
+
+static inline void ovpn_peer_stats_increment_rx(struct ovpn_peer_stats *stats,
+						const unsigned int n)
+{
+	ovpn_peer_stats_increment(&stats->rx, n);
+}
+
+static inline void ovpn_peer_stats_increment_tx(struct ovpn_peer_stats *stats,
+						const unsigned int n)
+{
+	ovpn_peer_stats_increment(&stats->tx, n);
+}
+
+#endif /* _NET_OVPN_OVPNSTATS_H_ */
From 11851cbd60ea1e5abbd97619d69845ead99303d6 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:28 +0200
Subject: [PATCH 271/770] ovpn: implement TCP transport

With this change ovpn can also communicate with peers over TCP.
Parsing of incoming messages is implemented through the strparser API.

Note that ovpn redefines sk_prot and sk_socket->ops for the TCP socket
used to communicate with the peer.
For this reason it needs to access inet6_stream_ops, which is declared
as extern in the IPv6 module but is not exported.
Therefore this patch also adds EXPORT_SYMBOL_GPL(inet6_stream_ops) to
net/ipv6/af_inet6.c.

Cc: David Ahern
Cc: Eric Dumazet
Cc: Jakub Kicinski
Cc: Paolo Abeni
Cc: Simon Horman
Signed-off-by: Antonio Quartulli
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-11-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca
Tested-by: Oleksandr Natalenko
Signed-off-by: Paolo Abeni
---
 drivers/net/Kconfig         |   1 +
 drivers/net/ovpn/Makefile   |   1 +
 drivers/net/ovpn/io.c       |   4 +
 drivers/net/ovpn/main.c     |   3 +
 drivers/net/ovpn/ovpnpriv.h |   1 +
 drivers/net/ovpn/peer.h     |  35 +++
 drivers/net/ovpn/socket.c   |  44 ++-
 drivers/net/ovpn/socket.h   |   8 +-
 drivers/net/ovpn/tcp.c      | 594 ++++++++++++++++++++++++++++++++++++
 drivers/net/ovpn/tcp.h      |  36 +++
 net/ipv6/af_inet6.c         |   1 +
 11 files changed, 717 insertions(+), 11 deletions(-)
 create mode 100644 drivers/net/ovpn/tcp.c
 create mode 100644 drivers/net/ovpn/tcp.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index a5cee847911b1..b29628d46be9b 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -125,6 +125,7 @@ config OVPN
 	select CRYPTO_AES
 	select CRYPTO_GCM
 	select CRYPTO_CHACHA20POLY1305
+	select STREAM_PARSER
 	help
 	  This module enhances the performance of the OpenVPN userspace
 	  software by offloading the data channel processing to
 	  kernelspace.
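+
+	  Both UDP and TCP transports are supported; for TCP, packet
+	  boundaries are recovered in kernel through the stream parser
+	  (hence the STREAM_PARSER dependency).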
diff --git a/drivers/net/ovpn/Makefile b/drivers/net/ovpn/Makefile index 04c3345807c5d..229be66167e1f 100644 --- a/drivers/net/ovpn/Makefile +++ b/drivers/net/ovpn/Makefile @@ -18,4 +18,5 @@ ovpn-y += peer.o ovpn-y += pktid.o ovpn-y += socket.o ovpn-y += stats.o +ovpn-y += tcp.o ovpn-y += udp.o diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index bf4d471e6d149..5479b40f84a4d 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -22,6 +22,7 @@ #include "crypto_aead.h" #include "netlink.h" #include "proto.h" +#include "tcp.h" #include "udp.h" #include "skb.h" #include "socket.h" @@ -211,6 +212,9 @@ void ovpn_encrypt_post(void *data, int ret) case IPPROTO_UDP: ovpn_udp_send_skb(peer, sock->sock, skb); break; + case IPPROTO_TCP: + ovpn_tcp_send_skb(peer, sock->sock, skb); + break; default: /* no transport configured yet */ goto err_unlock; diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index b356c2235d46f..89075502c5af9 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -22,6 +22,7 @@ #include "io.h" #include "peer.h" #include "proto.h" +#include "tcp.h" #include "udp.h" static int ovpn_net_init(struct net_device *dev) @@ -177,6 +178,8 @@ static int __init ovpn_init(void) goto unreg_rtnl; } + ovpn_tcp_init(); + return 0; unreg_rtnl: diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index d31cceee508d7..bb345954b8d9e 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -10,6 +10,7 @@ #ifndef _NET_OVPN_OVPNSTRUCT_H_ #define _NET_OVPN_OVPNSTRUCT_H_ +#include #include #include #include diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index 2453d39ce327c..5ef00ba6523d7 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -11,6 +11,7 @@ #define _NET_OVPN_OVPNPEER_H_ #include +#include #include "crypto.h" #include "socket.h" @@ -25,6 +26,18 @@ * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel * @sock: the socket being used to talk to this peer + * @tcp: keeps track of TCP specific state + * @tcp.strp: stream parser context (TCP only) + * @tcp.user_queue: received packets that have to go to userspace (TCP only) + * @tcp.out_queue: packets on hold while socket is taken by user (TCP only) + * @tcp.tx_in_progress: true if TX is already ongoing (TCP only) + * @tcp.out_msg.skb: packet scheduled for sending (TCP only) + * @tcp.out_msg.offset: offset where next send should start (TCP only) + * @tcp.out_msg.len: remaining data to send within packet (TCP only) + * @tcp.sk_cb.sk_data_ready: pointer to original cb (TCP only) + * @tcp.sk_cb.sk_write_space: pointer to original cb (TCP only) + * @tcp.sk_cb.prot: pointer to original prot object (TCP only) + * @tcp.sk_cb.ops: pointer to the original prot_ops object (TCP only) * @crypto: the crypto configuration (ciphers, keys, etc..) 
* @dst_cache: cache for dst_entry used to send to peer * @bind: remote peer binding @@ -45,6 +58,28 @@ struct ovpn_peer { struct in6_addr ipv6; } vpn_addrs; struct ovpn_socket __rcu *sock; + + struct { + struct strparser strp; + struct sk_buff_head user_queue; + struct sk_buff_head out_queue; + bool tx_in_progress; + + struct { + struct sk_buff *skb; + int offset; + int len; + } out_msg; + + struct { + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + struct proto *prot; + const struct proto_ops *ops; + } sk_cb; + + struct work_struct defer_del_work; + } tcp; struct ovpn_crypto_state crypto; struct dst_cache dst_cache; struct ovpn_bind __rcu *bind; diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c index f1fd98d528451..3a18fbdd37216 100644 --- a/drivers/net/ovpn/socket.c +++ b/drivers/net/ovpn/socket.c @@ -16,6 +16,7 @@ #include "io.h" #include "peer.h" #include "socket.h" +#include "tcp.h" #include "udp.h" static void ovpn_socket_release_kref(struct kref *kref) @@ -23,12 +24,10 @@ static void ovpn_socket_release_kref(struct kref *kref) struct ovpn_socket *sock = container_of(kref, struct ovpn_socket, refcount); - if (sock->sock->sk->sk_protocol == IPPROTO_UDP) { + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) ovpn_udp_socket_detach(sock); - netdev_put(sock->ovpn->dev, &sock->dev_tracker); - } - - kfree_rcu(sock, rcu); + else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) + ovpn_tcp_socket_detach(sock); } /** @@ -38,10 +37,12 @@ static void ovpn_socket_release_kref(struct kref *kref) * * This function is only used internally. Users willing to release * references to the ovpn_socket should use ovpn_socket_release() + * + * Return: true if the socket was released, false otherwise */ -static void ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) +static bool ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) { - kref_put(&sock->refcount, ovpn_socket_release_kref); + return kref_put(&sock->refcount, ovpn_socket_release_kref); } /** @@ -65,6 +66,7 @@ static void ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) void ovpn_socket_release(struct ovpn_peer *peer) { struct ovpn_socket *sock; + bool released; might_sleep(); @@ -89,11 +91,26 @@ void ovpn_socket_release(struct ovpn_peer *peer) * detached before it can be picked by a concurrent reader. 
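	 * (the TCP sendmsg path, for instance, dereferences sk_user_data
	 * under this same socket lock; see ovpn_tcp_sendmsg() in tcp.c)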
*/ lock_sock(sock->sock->sk); - ovpn_socket_put(peer, sock); + released = ovpn_socket_put(peer, sock); release_sock(sock->sock->sk); /* align all readers with sk_user_data being NULL */ synchronize_rcu(); + + /* following cleanup should happen with lock released */ + if (released) { + if (sock->sock->sk->sk_protocol == IPPROTO_UDP) { + netdev_put(sock->ovpn->dev, &sock->dev_tracker); + } else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) { + /* wait for TCP jobs to terminate */ + ovpn_tcp_socket_wait_finish(sock); + ovpn_peer_put(sock->peer); + } + /* we can call plain kfree() because we already waited one RCU + * period due to synchronize_rcu() + */ + kfree(sock); + } } static bool ovpn_socket_hold(struct ovpn_socket *sock) @@ -105,6 +122,8 @@ static int ovpn_socket_attach(struct ovpn_socket *sock, struct ovpn_peer *peer) { if (sock->sock->sk->sk_protocol == IPPROTO_UDP) return ovpn_udp_socket_attach(sock, peer->ovpn); + else if (sock->sock->sk->sk_protocol == IPPROTO_TCP) + return ovpn_tcp_socket_attach(sock, peer); return -EOPNOTSUPP; } @@ -191,7 +210,14 @@ struct ovpn_socket *ovpn_socket_new(struct socket *sock, struct ovpn_peer *peer) goto sock_release; } - if (sock->sk->sk_protocol == IPPROTO_UDP) { + /* TCP sockets are per-peer, therefore they are linked to their unique + * peer + */ + if (sock->sk->sk_protocol == IPPROTO_TCP) { + INIT_WORK(&ovpn_sock->tcp_tx_work, ovpn_tcp_tx_work); + ovpn_sock->peer = peer; + ovpn_peer_hold(peer); + } else if (sock->sk->sk_protocol == IPPROTO_UDP) { /* in UDP we only link the ovpn instance since the socket is * shared among multiple peers */ diff --git a/drivers/net/ovpn/socket.h b/drivers/net/ovpn/socket.h index c1697f4616d47..00d856b1a5d88 100644 --- a/drivers/net/ovpn/socket.h +++ b/drivers/net/ovpn/socket.h @@ -21,9 +21,11 @@ struct ovpn_peer; * struct ovpn_socket - a kernel socket referenced in the ovpn code * @ovpn: ovpn instance owning this socket (UDP only) * @dev_tracker: reference tracker for associated dev (UDP only) + * @peer: unique peer transmitting over this socket (TCP only) * @sock: the low level sock object * @refcount: amount of contexts currently referencing this object - * @rcu: member used to schedule RCU destructor callback + * @work: member used to schedule release routine (it may block) + * @tcp_tx_work: work for deferring outgoing packet processing (TCP only) */ struct ovpn_socket { union { @@ -31,11 +33,13 @@ struct ovpn_socket { struct ovpn_priv *ovpn; netdevice_tracker dev_tracker; }; + struct ovpn_peer *peer; }; struct socket *sock; struct kref refcount; - struct rcu_head rcu; + struct work_struct work; + struct work_struct tcp_tx_work; }; struct ovpn_socket *ovpn_socket_new(struct socket *sock, diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c new file mode 100644 index 0000000000000..588ff6b044010 --- /dev/null +++ b/drivers/net/ovpn/tcp.c @@ -0,0 +1,594 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. 
+ * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ovpnpriv.h" +#include "main.h" +#include "io.h" +#include "peer.h" +#include "proto.h" +#include "skb.h" +#include "tcp.h" + +#define OVPN_TCP_DEPTH_NESTING 2 +#if OVPN_TCP_DEPTH_NESTING == SINGLE_DEPTH_NESTING +#error "OVPN TCP requires its own lockdep subclass" +#endif + +static struct proto ovpn_tcp_prot __ro_after_init; +static struct proto_ops ovpn_tcp_ops __ro_after_init; +static struct proto ovpn_tcp6_prot __ro_after_init; +static struct proto_ops ovpn_tcp6_ops __ro_after_init; + +static int ovpn_tcp_parse(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + __be16 blen; + u16 len; + int err; + + /* when packets are written to the TCP stream, they are prepended with + * two bytes indicating the actual packet size. + * Parse accordingly and return the actual size (including the size + * header) + */ + + if (skb->len < rxm->offset + 2) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &blen, sizeof(blen)); + if (err < 0) + return err; + + len = be16_to_cpu(blen); + if (len < 2) + return -EINVAL; + + return len + 2; +} + +/* queue skb for sending to userspace via recvmsg on the socket */ +static void ovpn_tcp_to_userspace(struct ovpn_peer *peer, struct sock *sk, + struct sk_buff *skb) +{ + skb_set_owner_r(skb, sk); + memset(skb->cb, 0, sizeof(skb->cb)); + skb_queue_tail(&peer->tcp.user_queue, skb); + peer->tcp.sk_cb.sk_data_ready(sk); +} + +static void ovpn_tcp_rcv(struct strparser *strp, struct sk_buff *skb) +{ + struct ovpn_peer *peer = container_of(strp, struct ovpn_peer, tcp.strp); + struct strp_msg *msg = strp_msg(skb); + size_t pkt_len = msg->full_len - 2; + size_t off = msg->offset + 2; + u8 opcode; + + /* ensure skb->data points to the beginning of the openvpn packet */ + if (!pskb_pull(skb, off)) { + net_warn_ratelimited("%s: packet too small for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* strparser does not trim the skb for us, therefore we do it now */ + if (pskb_trim(skb, pkt_len) != 0) { + net_warn_ratelimited("%s: trimming skb failed for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* we need the first 4 bytes of data to be accessible + * to extract the opcode and the key ID later on + */ + if (!pskb_may_pull(skb, OVPN_OPCODE_SIZE)) { + net_warn_ratelimited("%s: packet too small to fetch opcode for peer %u\n", + netdev_name(peer->ovpn->dev), peer->id); + goto err; + } + + /* DATA_V2 packets are handled in kernel, the rest goes to user space */ + opcode = ovpn_opcode_from_skb(skb, 0); + if (unlikely(opcode != OVPN_DATA_V2)) { + if (opcode == OVPN_DATA_V1) { + net_warn_ratelimited("%s: DATA_V1 detected on the TCP stream\n", + netdev_name(peer->ovpn->dev)); + goto err; + } + + /* The packet size header must be there when sending the packet + * to userspace, therefore we put it back + */ + skb_push(skb, 2); + ovpn_tcp_to_userspace(peer, strp->sk, skb); + return; + } + + /* hold reference to peer as required by ovpn_recv(). 
+ * + * NOTE: in this context we should already be holding a reference to + * this peer, therefore ovpn_peer_hold() is not expected to fail + */ + if (WARN_ON(!ovpn_peer_hold(peer))) + goto err; + + ovpn_recv(peer, skb); + return; +err: + dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); + ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); +} + +static int ovpn_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int flags, int *addr_len) +{ + int err = 0, off, copied = 0, ret; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + struct sk_buff *skb; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) { + rcu_read_unlock(); + return -EBADF; + } + peer = sock->peer; + rcu_read_unlock(); + + skb = __skb_recv_datagram(sk, &peer->tcp.user_queue, flags, &off, &err); + if (!skb) { + if (err == -EAGAIN && sk->sk_shutdown & RCV_SHUTDOWN) { + ret = 0; + goto out; + } + ret = err; + goto out; + } + + copied = len; + if (copied > skb->len) + copied = skb->len; + else if (copied < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_msg(skb, 0, msg, copied); + if (unlikely(err)) { + kfree_skb(skb); + ret = err; + goto out; + } + + if (flags & MSG_TRUNC) + copied = skb->len; + kfree_skb(skb); + ret = copied; +out: + ovpn_peer_put(peer); + return ret; +} + +void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock) +{ + struct ovpn_peer *peer = ovpn_sock->peer; + struct socket *sock = ovpn_sock->sock; + + strp_stop(&peer->tcp.strp); + skb_queue_purge(&peer->tcp.user_queue); + + /* restore CBs that were saved in ovpn_sock_set_tcp_cb() */ + sock->sk->sk_data_ready = peer->tcp.sk_cb.sk_data_ready; + sock->sk->sk_write_space = peer->tcp.sk_cb.sk_write_space; + sock->sk->sk_prot = peer->tcp.sk_cb.prot; + sock->sk->sk_socket->ops = peer->tcp.sk_cb.ops; + + rcu_assign_sk_user_data(sock->sk, NULL); +} + +void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock) +{ + struct ovpn_peer *peer = sock->peer; + + /* NOTE: we don't wait for peer->tcp.defer_del_work to finish: + * either the worker is not running or this function + * was invoked by that worker. + */ + + cancel_work_sync(&sock->tcp_tx_work); + strp_done(&peer->tcp.strp); + + skb_queue_purge(&peer->tcp.out_queue); + kfree_skb(peer->tcp.out_msg.skb); + peer->tcp.out_msg.skb = NULL; +} + +static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk) +{ + struct sk_buff *skb = peer->tcp.out_msg.skb; + + if (!skb) + return; + + if (peer->tcp.tx_in_progress) + return; + + peer->tcp.tx_in_progress = true; + + do { + int ret = skb_send_sock_locked(sk, skb, + peer->tcp.out_msg.offset, + peer->tcp.out_msg.len); + if (unlikely(ret < 0)) { + if (ret == -EAGAIN) + goto out; + + net_warn_ratelimited("%s: TCP error to peer %u: %d\n", + netdev_name(peer->ovpn->dev), + peer->id, ret); + + /* in case of TCP error we can't recover the VPN + * stream therefore we abort the connection + */ + ovpn_peer_hold(peer); + schedule_work(&peer->tcp.defer_del_work); + + /* we bail out immediately and keep tx_in_progress set + * to true. 
This way we prevent more TX attempts + * which would lead to more invocations of + * schedule_work() + */ + return; + } + + peer->tcp.out_msg.len -= ret; + peer->tcp.out_msg.offset += ret; + } while (peer->tcp.out_msg.len > 0); + + if (!peer->tcp.out_msg.len) { + preempt_disable(); + dev_dstats_tx_add(peer->ovpn->dev, skb->len); + preempt_enable(); + } + + kfree_skb(peer->tcp.out_msg.skb); + peer->tcp.out_msg.skb = NULL; + peer->tcp.out_msg.len = 0; + peer->tcp.out_msg.offset = 0; + +out: + peer->tcp.tx_in_progress = false; +} + +void ovpn_tcp_tx_work(struct work_struct *work) +{ + struct ovpn_socket *sock; + + sock = container_of(work, struct ovpn_socket, tcp_tx_work); + + lock_sock(sock->sock->sk); + if (sock->peer) + ovpn_tcp_send_sock(sock->peer, sock->sock->sk); + release_sock(sock->sock->sk); +} + +static void ovpn_tcp_send_sock_skb(struct ovpn_peer *peer, struct sock *sk, + struct sk_buff *skb) +{ + if (peer->tcp.out_msg.skb) + ovpn_tcp_send_sock(peer, sk); + + if (peer->tcp.out_msg.skb) { + dev_dstats_tx_dropped(peer->ovpn->dev); + kfree_skb(skb); + return; + } + + peer->tcp.out_msg.skb = skb; + peer->tcp.out_msg.len = skb->len; + peer->tcp.out_msg.offset = 0; + ovpn_tcp_send_sock(peer, sk); +} + +void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct socket *sock, + struct sk_buff *skb) +{ + u16 len = skb->len; + + *(__be16 *)__skb_push(skb, sizeof(u16)) = htons(len); + + spin_lock_nested(&sock->sk->sk_lock.slock, OVPN_TCP_DEPTH_NESTING); + if (sock_owned_by_user(sock->sk)) { + if (skb_queue_len(&peer->tcp.out_queue) >= + READ_ONCE(net_hotdata.max_backlog)) { + dev_dstats_tx_dropped(peer->ovpn->dev); + kfree_skb(skb); + goto unlock; + } + __skb_queue_tail(&peer->tcp.out_queue, skb); + } else { + ovpn_tcp_send_sock_skb(peer, sock->sk, skb); + } +unlock: + spin_unlock(&sock->sk->sk_lock.slock); +} + +static void ovpn_tcp_release(struct sock *sk) +{ + struct sk_buff_head queue; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + struct sk_buff *skb; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock) { + rcu_read_unlock(); + return; + } + + peer = sock->peer; + + /* during initialization this function is called before + * assigning sock->peer + */ + if (unlikely(!peer || !ovpn_peer_hold(peer))) { + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + + __skb_queue_head_init(&queue); + skb_queue_splice_init(&peer->tcp.out_queue, &queue); + + while ((skb = __skb_dequeue(&queue))) + ovpn_tcp_send_sock_skb(peer, sk, skb); + + peer->tcp.sk_cb.prot->release_cb(sk); + ovpn_peer_put(peer); +} + +static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct ovpn_socket *sock; + int ret, linear = PAGE_SIZE; + struct ovpn_peer *peer; + struct sk_buff *skb; + + lock_sock(sk); + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (unlikely(!sock || !sock->peer || !ovpn_peer_hold(sock->peer))) { + rcu_read_unlock(); + release_sock(sk); + return -EIO; + } + rcu_read_unlock(); + peer = sock->peer; + + if (msg->msg_flags & ~MSG_DONTWAIT) { + ret = -EOPNOTSUPP; + goto peer_free; + } + + if (peer->tcp.out_msg.skb) { + ret = -EAGAIN; + goto peer_free; + } + + if (size < linear) + linear = size; + + skb = sock_alloc_send_pskb(sk, linear, size - linear, + msg->msg_flags & MSG_DONTWAIT, &ret, 0); + if (!skb) { + net_err_ratelimited("%s: skb alloc failed: %d\n", + netdev_name(peer->ovpn->dev), ret); + goto peer_free; + } + + skb_put(skb, linear); + skb->len = size; + skb->data_len = size - linear; + + ret = 
skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); + if (ret) { + kfree_skb(skb); + net_err_ratelimited("%s: skb copy from iter failed: %d\n", + netdev_name(peer->ovpn->dev), ret); + goto peer_free; + } + + ovpn_tcp_send_sock_skb(peer, sk, skb); + ret = size; +peer_free: + release_sock(sk); + ovpn_peer_put(peer); + return ret; +} + +static int ovpn_tcp_disconnect(struct sock *sk, int flags) +{ + return -EBUSY; +} + +static void ovpn_tcp_data_ready(struct sock *sk) +{ + struct ovpn_socket *sock; + + trace_sk_data_ready(sk); + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (likely(sock && sock->peer)) + strp_data_ready(&sock->peer->tcp.strp); + rcu_read_unlock(); +} + +static void ovpn_tcp_write_space(struct sock *sk) +{ + struct ovpn_socket *sock; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (likely(sock && sock->peer)) { + schedule_work(&sock->tcp_tx_work); + sock->peer->tcp.sk_cb.sk_write_space(sk); + } + rcu_read_unlock(); +} + +static void ovpn_tcp_build_protos(struct proto *new_prot, + struct proto_ops *new_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); + +static void ovpn_tcp_peer_del_work(struct work_struct *work) +{ + struct ovpn_peer *peer = container_of(work, struct ovpn_peer, + tcp.defer_del_work); + + ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_TRANSPORT_ERROR); + ovpn_peer_put(peer); +} + +/* Set TCP encapsulation callbacks */ +int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock, + struct ovpn_peer *peer) +{ + struct socket *sock = ovpn_sock->sock; + struct strp_callbacks cb = { + .rcv_msg = ovpn_tcp_rcv, + .parse_msg = ovpn_tcp_parse, + }; + int ret; + + /* make sure no pre-existing encapsulation handler exists */ + if (sock->sk->sk_user_data) + return -EBUSY; + + /* only a fully connected socket is expected. 
Connection should be + * handled in userspace + */ + if (sock->sk->sk_state != TCP_ESTABLISHED) { + net_err_ratelimited("%s: provided TCP socket is not in ESTABLISHED state: %d\n", + netdev_name(peer->ovpn->dev), + sock->sk->sk_state); + return -EINVAL; + } + + ret = strp_init(&peer->tcp.strp, sock->sk, &cb); + if (ret < 0) { + DEBUG_NET_WARN_ON_ONCE(1); + return ret; + } + + INIT_WORK(&peer->tcp.defer_del_work, ovpn_tcp_peer_del_work); + + __sk_dst_reset(sock->sk); + skb_queue_head_init(&peer->tcp.user_queue); + skb_queue_head_init(&peer->tcp.out_queue); + + /* save current CBs so that they can be restored upon socket release */ + peer->tcp.sk_cb.sk_data_ready = sock->sk->sk_data_ready; + peer->tcp.sk_cb.sk_write_space = sock->sk->sk_write_space; + peer->tcp.sk_cb.prot = sock->sk->sk_prot; + peer->tcp.sk_cb.ops = sock->sk->sk_socket->ops; + + /* assign our static CBs and prot/ops */ + sock->sk->sk_data_ready = ovpn_tcp_data_ready; + sock->sk->sk_write_space = ovpn_tcp_write_space; + + if (sock->sk->sk_family == AF_INET) { + sock->sk->sk_prot = &ovpn_tcp_prot; + sock->sk->sk_socket->ops = &ovpn_tcp_ops; + } else { + sock->sk->sk_prot = &ovpn_tcp6_prot; + sock->sk->sk_socket->ops = &ovpn_tcp6_ops; + } + + /* avoid using task_frag */ + sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; + + /* enqueue the RX worker */ + strp_check_rcv(&peer->tcp.strp); + + return 0; +} + +static void ovpn_tcp_close(struct sock *sk, long timeout) +{ + struct ovpn_socket *sock; + struct ovpn_peer *peer; + + rcu_read_lock(); + sock = rcu_dereference_sk_user_data(sk); + if (!sock || !sock->peer || !ovpn_peer_hold(sock->peer)) { + rcu_read_unlock(); + return; + } + peer = sock->peer; + rcu_read_unlock(); + + ovpn_peer_del(sock->peer, OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT); + peer->tcp.sk_cb.prot->close(sk, timeout); + ovpn_peer_put(peer); +} + +static __poll_t ovpn_tcp_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + __poll_t mask = datagram_poll(file, sock, wait); + struct ovpn_socket *ovpn_sock; + + rcu_read_lock(); + ovpn_sock = rcu_dereference_sk_user_data(sock->sk); + if (ovpn_sock && ovpn_sock->peer && + !skb_queue_empty(&ovpn_sock->peer->tcp.user_queue)) + mask |= EPOLLIN | EPOLLRDNORM; + rcu_read_unlock(); + + return mask; +} + +static void ovpn_tcp_build_protos(struct proto *new_prot, + struct proto_ops *new_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(new_prot, orig_prot, sizeof(*new_prot)); + memcpy(new_ops, orig_ops, sizeof(*new_ops)); + new_prot->recvmsg = ovpn_tcp_recvmsg; + new_prot->sendmsg = ovpn_tcp_sendmsg; + new_prot->disconnect = ovpn_tcp_disconnect; + new_prot->close = ovpn_tcp_close; + new_prot->release_cb = ovpn_tcp_release; + new_ops->poll = ovpn_tcp_poll; +} + +/* Initialize TCP static objects */ +void __init ovpn_tcp_init(void) +{ + ovpn_tcp_build_protos(&ovpn_tcp_prot, &ovpn_tcp_ops, &tcp_prot, + &inet_stream_ops); + +#if IS_ENABLED(CONFIG_IPV6) + ovpn_tcp_build_protos(&ovpn_tcp6_prot, &ovpn_tcp6_ops, &tcpv6_prot, + &inet6_stream_ops); +#endif +} diff --git a/drivers/net/ovpn/tcp.h b/drivers/net/ovpn/tcp.h new file mode 100644 index 0000000000000..10aefa834cf35 --- /dev/null +++ b/drivers/net/ovpn/tcp.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* OpenVPN data channel offload + * + * Copyright (C) 2019-2025 OpenVPN, Inc. 
+ *
+ * Author:	Antonio Quartulli
+ */
+
+#ifndef _NET_OVPN_TCP_H_
+#define _NET_OVPN_TCP_H_
+
+#include
+#include
+#include
+
+#include "peer.h"
+#include "skb.h"
+#include "socket.h"
+
+void __init ovpn_tcp_init(void);
+
+int ovpn_tcp_socket_attach(struct ovpn_socket *ovpn_sock,
+			   struct ovpn_peer *peer);
+void ovpn_tcp_socket_detach(struct ovpn_socket *ovpn_sock);
+void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock);
+
+/* Prepare skb and enqueue it for sending to peer.
+ *
+ * Preparation consists in prepending the skb payload with its size.
+ * Required by the OpenVPN protocol in order to extract packets from
+ * the TCP stream on the receiver side.
+ */
+void ovpn_tcp_send_skb(struct ovpn_peer *peer, struct socket *sock,
+		       struct sk_buff *skb);
+void ovpn_tcp_tx_work(struct work_struct *work);
+
+#endif /* _NET_OVPN_TCP_H_ */
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 85bf681d427b8..acaff12967835 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -715,6 +715,7 @@ const struct proto_ops inet6_stream_ops = {
 #endif
 	.set_rcvlowat	   = tcp_set_rcvlowat,
 };
+EXPORT_SYMBOL_GPL(inet6_stream_ops);
 
 const struct proto_ops inet6_dgram_ops = {
 	.family		   = PF_INET6,
From 17240749f26e07cafa676688d8a3326086498447 Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:29 +0200
Subject: [PATCH 272/770] skb: implement skb_send_sock_locked_with_flags()

When sending an skb over a socket using skb_send_sock_locked(), it is
currently not possible to specify any flag to be set in
msghdr->msg_flags. However, we may want to pass flags the user may have
specified, like MSG_NOSIGNAL.

Extend __skb_send_sock() with a new argument 'flags' and add a new
interface named skb_send_sock_locked_with_flags().

Cc: Eric Dumazet
Cc: Jakub Kicinski
Cc: Paolo Abeni
Cc: Simon Horman
Signed-off-by: Antonio Quartulli
Link: https://patch.msgid.link/20250415-b4-ovpn-v26-12-577f6097b964@openvpn.net
Reviewed-by: Sabrina Dubroca
Tested-by: Oleksandr Natalenko
Signed-off-by: Paolo Abeni
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 18 +++++++++++++-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f1381aff0f896..beb084ee4f4d2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4145,6 +4145,8 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    unsigned int flags);
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+				    int offset, int len, int flags);
 int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 74a2d886a35b5..d73ad79fe739d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3227,7 +3227,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
 typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
 
 static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
-			   int len, sendmsg_func sendmsg)
+			   int len, sendmsg_func sendmsg, int flags)
 {
 	unsigned int orig_len = len;
 	struct sk_buff *head = skb;
@@ -3245,7 +3245,7 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
 		kv.iov_base = skb->data + offset;
 		kv.iov_len = slen;
 		memset(&msg, 0, sizeof(msg));
-		msg.msg_flags = 
MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3282,7 +3282,8 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, @@ -3328,14 +3329,21 @@ static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** From 36bb1d713a15e361e74249b75a9fd26d64967a7b Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:30 +0200 Subject: [PATCH 273/770] ovpn: add support for MSG_NOSIGNAL in tcp_sendmsg Userspace may want to pass the MSG_NOSIGNAL flag to tcp_sendmsg() in order to avoid generating a SIGPIPE. To pass this flag down the TCP stack a new skb sending API accepting a flags argument is introduced. Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-13-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/skb.h | 1 + drivers/net/ovpn/tcp.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ovpn/skb.h b/drivers/net/ovpn/skb.h index bd3cbcfc770d2..64430880f1dae 100644 --- a/drivers/net/ovpn/skb.h +++ b/drivers/net/ovpn/skb.h @@ -25,6 +25,7 @@ struct ovpn_cb { struct scatterlist *sg; u8 *iv; unsigned int payload_offset; + bool nosignal; }; static inline struct ovpn_cb *ovpn_skb_cb(struct sk_buff *skb) diff --git a/drivers/net/ovpn/tcp.c b/drivers/net/ovpn/tcp.c index 588ff6b044010..7c42d84987ad3 100644 --- a/drivers/net/ovpn/tcp.c +++ b/drivers/net/ovpn/tcp.c @@ -220,6 +220,7 @@ void ovpn_tcp_socket_wait_finish(struct ovpn_socket *sock) static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk) { struct sk_buff *skb = peer->tcp.out_msg.skb; + int ret, flags; if (!skb) return; @@ -230,9 +231,11 @@ static void ovpn_tcp_send_sock(struct ovpn_peer *peer, struct sock *sk) peer->tcp.tx_in_progress = true; do { - int ret = skb_send_sock_locked(sk, skb, - peer->tcp.out_msg.offset, - peer->tcp.out_msg.len); + flags = ovpn_skb_cb(skb)->nosignal ? 
MSG_NOSIGNAL : 0; + ret = skb_send_sock_locked_with_flags(sk, skb, + peer->tcp.out_msg.offset, + peer->tcp.out_msg.len, + flags); if (unlikely(ret < 0)) { if (ret == -EAGAIN) goto out; @@ -380,7 +383,7 @@ static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) rcu_read_unlock(); peer = sock->peer; - if (msg->msg_flags & ~MSG_DONTWAIT) { + if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL)) { ret = -EOPNOTSUPP; goto peer_free; } @@ -413,6 +416,7 @@ static int ovpn_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) goto peer_free; } + ovpn_skb_cb(skb)->nosignal = msg->msg_flags & MSG_NOSIGNAL; ovpn_tcp_send_sock_skb(peer, sk, skb); ret = size; peer_free: From 05003b408c201a33637a30b93dcca209be3b7878 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:31 +0200 Subject: [PATCH 274/770] ovpn: implement multi-peer support With this change an ovpn instance will be able to stay connected to multiple remote endpoints. This functionality is strictly required when running ovpn on an OpenVPN server. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-14-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/main.c | 64 +++++++++++- drivers/net/ovpn/ovpnpriv.h | 19 ++++ drivers/net/ovpn/peer.c | 189 ++++++++++++++++++++++++++++++++++-- drivers/net/ovpn/peer.h | 12 ++- drivers/net/ovpn/udp.c | 4 +- 5 files changed, 272 insertions(+), 16 deletions(-) diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 89075502c5af9..889c4585f06bc 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -25,11 +25,66 @@ #include "tcp.h" #include "udp.h" +static void ovpn_priv_free(struct net_device *net) +{ + struct ovpn_priv *ovpn = netdev_priv(net); + + kfree(ovpn->peers); +} + +static int ovpn_mp_alloc(struct ovpn_priv *ovpn) +{ + struct in_device *dev_v4; + int i; + + if (ovpn->mode != OVPN_MODE_MP) + return 0; + + dev_v4 = __in_dev_get_rtnl(ovpn->dev); + if (dev_v4) { + /* disable redirects as Linux gets confused by ovpn + * handling same-LAN routing. + * This happens because a multipeer interface is used as + * relay point between hosts in the same subnet, while + * in a classic LAN this would not be needed because the + * two hosts would be able to talk directly. 
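+		 * Disabling send_redirects stops the kernel from telling
+		 * those hosts, via ICMP redirects, to bypass the relay.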
+ */ + IN_DEV_CONF_SET(dev_v4, SEND_REDIRECTS, false); + IPV4_DEVCONF_ALL(dev_net(ovpn->dev), SEND_REDIRECTS) = false; + } + + /* the peer container is fairly large, therefore we allocate it only in + * MP mode + */ + ovpn->peers = kzalloc(sizeof(*ovpn->peers), GFP_KERNEL); + if (!ovpn->peers) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(ovpn->peers->by_id); i++) { + INIT_HLIST_HEAD(&ovpn->peers->by_id[i]); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr4[i], i); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_vpn_addr6[i], i); + INIT_HLIST_NULLS_HEAD(&ovpn->peers->by_transp_addr[i], i); + } + + return 0; +} + static int ovpn_net_init(struct net_device *dev) { struct ovpn_priv *ovpn = netdev_priv(dev); + int err = gro_cells_init(&ovpn->gro_cells, dev); + + if (err < 0) + return err; + + err = ovpn_mp_alloc(ovpn); + if (err < 0) { + gro_cells_destroy(&ovpn->gro_cells); + return err; + } - return gro_cells_init(&ovpn->gro_cells, dev); + return 0; } static void ovpn_net_uninit(struct net_device *dev) @@ -76,6 +131,8 @@ static void ovpn_setup(struct net_device *dev) dev->netdev_ops = &ovpn_netdev_ops; + dev->priv_destructor = ovpn_priv_free; + dev->hard_header_len = 0; dev->addr_len = 0; dev->mtu = ETH_DATA_LEN - OVPN_HEAD_ROOM; @@ -134,10 +191,7 @@ static void ovpn_dellink(struct net_device *dev, struct list_head *head) { struct ovpn_priv *ovpn = netdev_priv(dev); - if (ovpn->mode == OVPN_MODE_P2P) - ovpn_peer_release_p2p(ovpn, NULL, - OVPN_DEL_PEER_REASON_TEARDOWN); - + ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN); unregister_netdevice_queue(dev, head); } diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index bb345954b8d9e..5ba3c3e446e4d 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -15,11 +15,29 @@ #include #include +/** + * struct ovpn_peer_collection - container of peers for MultiPeer mode + * @by_id: table of peers index by ID + * @by_vpn_addr4: table of peers indexed by VPN IPv4 address (items can be + * rehashed on the fly due to peer IP change) + * @by_vpn_addr6: table of peers indexed by VPN IPv6 address (items can be + * rehashed on the fly due to peer IP change) + * @by_transp_addr: table of peers indexed by transport address (items can be + * rehashed on the fly due to peer IP change) + */ +struct ovpn_peer_collection { + DECLARE_HASHTABLE(by_id, 12); + struct hlist_nulls_head by_vpn_addr4[1 << 12]; + struct hlist_nulls_head by_vpn_addr6[1 << 12]; + struct hlist_nulls_head by_transp_addr[1 << 12]; +}; + /** * struct ovpn_priv - per ovpn interface state * @dev: the actual netdev representing the tunnel * @mode: device operation mode (i.e. p2p, mp, ..) 
* @lock: protect this object + * @peers: data structures holding multi-peer references * @peer: in P2P mode, this is the only remote peer * @gro_cells: pointer to the Generic Receive Offload cell */ @@ -27,6 +45,7 @@ struct ovpn_priv { struct net_device *dev; enum ovpn_mode mode; spinlock_t lock; /* protect writing to the ovpn_priv object */ + struct ovpn_peer_collection *peers; struct ovpn_peer __rcu *peer; struct gro_cells gro_cells; }; diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 0fe5333c6b810..bed2e591c000c 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -9,6 +9,7 @@ #include #include +#include #include "ovpnpriv.h" #include "bind.h" @@ -280,7 +281,19 @@ static void ovpn_peer_remove(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason, struct llist_head *release_list) { + lockdep_assert_held(&peer->ovpn->lock); + switch (peer->ovpn->mode) { + case OVPN_MODE_MP: + /* prevent double remove */ + if (hlist_unhashed(&peer->hash_entry_id)) + return; + + hlist_del_init_rcu(&peer->hash_entry_id); + hlist_nulls_del_init_rcu(&peer->hash_entry_addr4); + hlist_nulls_del_init_rcu(&peer->hash_entry_addr6); + hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); + break; case OVPN_MODE_P2P: /* prevent double remove */ if (peer != rcu_access_pointer(peer->ovpn->peer)) @@ -292,8 +305,6 @@ static void ovpn_peer_remove(struct ovpn_peer *peer, */ netif_carrier_off(peer->ovpn->dev); break; - default: - return; } peer->delete_reason = reason; @@ -357,6 +368,89 @@ bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, return match; } +#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl = &(_tbl); \ + (&(*__tbl)[jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl)]); }) \ + +/** + * ovpn_peer_add_mp - add peer to related tables in a MP instance + * @ovpn: the instance to add the peer to + * @peer: the peer to add + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_add_mp(struct ovpn_priv *ovpn, struct ovpn_peer *peer) +{ + struct sockaddr_storage sa = { 0 }; + struct hlist_nulls_head *nhead; + struct sockaddr_in6 *sa6; + struct sockaddr_in *sa4; + struct ovpn_bind *bind; + struct ovpn_peer *tmp; + size_t salen; + int ret = 0; + + spin_lock_bh(&ovpn->lock); + /* do not add duplicates */ + tmp = ovpn_peer_get_by_id(ovpn, peer->id); + if (tmp) { + ovpn_peer_put(tmp); + ret = -EEXIST; + goto out; + } + + bind = rcu_dereference_protected(peer->bind, true); + /* peers connected via TCP have bind == NULL */ + if (bind) { + switch (bind->remote.in4.sin_family) { + case AF_INET: + sa4 = (struct sockaddr_in *)&sa; + + sa4->sin_family = AF_INET; + sa4->sin_addr.s_addr = bind->remote.in4.sin_addr.s_addr; + sa4->sin_port = bind->remote.in4.sin_port; + salen = sizeof(*sa4); + break; + case AF_INET6: + sa6 = (struct sockaddr_in6 *)&sa; + + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = bind->remote.in6.sin6_addr; + sa6->sin6_port = bind->remote.in6.sin6_port; + salen = sizeof(*sa6); + break; + default: + ret = -EPROTONOSUPPORT; + goto out; + } + + nhead = ovpn_get_hash_head(ovpn->peers->by_transp_addr, &sa, + salen); + hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); + } + + hlist_add_head_rcu(&peer->hash_entry_id, + ovpn_get_hash_head(ovpn->peers->by_id, &peer->id, + sizeof(peer->id))); + + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) { + nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr4, + &peer->vpn_addrs.ipv4, + sizeof(peer->vpn_addrs.ipv4)); + 
hlist_nulls_add_head_rcu(&peer->hash_entry_addr4, nhead); + } + + if (!ipv6_addr_any(&peer->vpn_addrs.ipv6)) { + nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr6, + &peer->vpn_addrs.ipv6, + sizeof(peer->vpn_addrs.ipv6)); + hlist_nulls_add_head_rcu(&peer->hash_entry_addr6, nhead); + } +out: + spin_unlock_bh(&ovpn->lock); + return ret; +} + /** * ovpn_peer_add_p2p - add peer to related tables in a P2P instance * @ovpn: the instance to add the peer to @@ -399,11 +493,42 @@ static int ovpn_peer_add_p2p(struct ovpn_priv *ovpn, struct ovpn_peer *peer) int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer) { switch (ovpn->mode) { + case OVPN_MODE_MP: + return ovpn_peer_add_mp(ovpn, peer); case OVPN_MODE_P2P: return ovpn_peer_add_p2p(ovpn, peer); - default: - return -EOPNOTSUPP; } + + return -EOPNOTSUPP; +} + +/** + * ovpn_peer_del_mp - delete peer from related tables in a MP instance + * @peer: the peer to delete + * @reason: reason why the peer was deleted (sent to userspace) + * @release_list: list where delete peer should be appended + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_del_mp(struct ovpn_peer *peer, + enum ovpn_del_peer_reason reason, + struct llist_head *release_list) +{ + struct ovpn_peer *tmp; + int ret = -ENOENT; + + lockdep_assert_held(&peer->ovpn->lock); + + tmp = ovpn_peer_get_by_id(peer->ovpn, peer->id); + if (tmp == peer) { + ovpn_peer_remove(peer, reason, release_list); + ret = 0; + } + + if (tmp) + ovpn_peer_put(tmp); + + return ret; } /** @@ -446,6 +571,9 @@ int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) spin_lock_bh(&peer->ovpn->lock); switch (peer->ovpn->mode) { + case OVPN_MODE_MP: + ret = ovpn_peer_del_mp(peer, reason, &release_list); + break; case OVPN_MODE_P2P: ret = ovpn_peer_del_p2p(peer, reason, &release_list); break; @@ -463,8 +591,8 @@ int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason) * @sk: if not NULL, release peer only if it's using this specific socket * @reason: the reason for releasing the peer */ -void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, - enum ovpn_del_peer_reason reason) +static void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, + enum ovpn_del_peer_reason reason) { struct ovpn_socket *ovpn_sock; LLIST_HEAD(release_list); @@ -490,3 +618,52 @@ void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk, ovpn_peer_remove(peer, reason, &release_list); unlock_ovpn(ovpn, &release_list); } + +static void ovpn_peers_release_mp(struct ovpn_priv *ovpn, struct sock *sk, + enum ovpn_del_peer_reason reason) +{ + struct ovpn_socket *ovpn_sock; + LLIST_HEAD(release_list); + struct ovpn_peer *peer; + struct hlist_node *tmp; + int bkt; + + spin_lock_bh(&ovpn->lock); + hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) { + bool remove = true; + + /* if a socket was passed as argument, skip all peers except + * those using it + */ + if (sk) { + rcu_read_lock(); + ovpn_sock = rcu_dereference(peer->sock); + remove = ovpn_sock && ovpn_sock->sock->sk == sk; + rcu_read_unlock(); + } + + if (remove) + ovpn_peer_remove(peer, reason, &release_list); + } + unlock_ovpn(ovpn, &release_list); +} + +/** + * ovpn_peers_free - free all peers in the instance + * @ovpn: the instance whose peers should be released + * @sk: if not NULL, only peers using this socket are removed and the socket + * is released immediately + * @reason: the reason for releasing all peers + */ +void ovpn_peers_free(struct 
ovpn_priv *ovpn, struct sock *sk,
+		     enum ovpn_del_peer_reason reason)
+{
+	switch (ovpn->mode) {
+	case OVPN_MODE_P2P:
+		ovpn_peer_release_p2p(ovpn, sk, reason);
+		break;
+	case OVPN_MODE_MP:
+		ovpn_peers_release_mp(ovpn, sk, reason);
+		break;
+	}
+}
diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h
index 5ef00ba6523d7..2a3b1031f58dd 100644
--- a/drivers/net/ovpn/peer.h
+++ b/drivers/net/ovpn/peer.h
@@ -25,6 +25,10 @@
  * @vpn_addrs: IP addresses assigned over the tunnel
  * @vpn_addrs.ipv4: IPv4 assigned to peer on the tunnel
  * @vpn_addrs.ipv6: IPv6 assigned to peer on the tunnel
+ * @hash_entry_id: entry in the peer ID hashtable
+ * @hash_entry_addr4: entry in the peer IPv4 hashtable
+ * @hash_entry_addr6: entry in the peer IPv6 hashtable
+ * @hash_entry_transp_addr: entry in the peer transport address hashtable
  * @sock: the socket being used to talk to this peer
  * @tcp: keeps track of TCP specific state
@@ -57,6 +61,10 @@ struct ovpn_peer {
 		struct in_addr ipv4;
 		struct in6_addr ipv6;
 	} vpn_addrs;
+	struct hlist_node hash_entry_id;
+	struct hlist_nulls_node hash_entry_addr4;
+	struct hlist_nulls_node hash_entry_addr6;
+	struct hlist_nulls_node hash_entry_transp_addr;
 	struct ovpn_socket __rcu *sock;
 
 	struct {
@@ -117,8 +125,8 @@ static inline void ovpn_peer_put(struct ovpn_peer *peer)
 struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id);
 int ovpn_peer_add(struct ovpn_priv *ovpn, struct ovpn_peer *peer);
 int ovpn_peer_del(struct ovpn_peer *peer, enum ovpn_del_peer_reason reason);
-void ovpn_peer_release_p2p(struct ovpn_priv *ovpn, struct sock *sk,
-			   enum ovpn_del_peer_reason reason);
+void ovpn_peers_free(struct ovpn_priv *ovpn, struct sock *sock,
+		     enum ovpn_del_peer_reason reason);
 
 struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn,
 					       struct sk_buff *skb);
diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c
index 920d71da793cc..c9e189056f336 100644
--- a/drivers/net/ovpn/udp.c
+++ b/drivers/net/ovpn/udp.c
@@ -367,9 +367,7 @@ static void ovpn_udp_encap_destroy(struct sock *sk)
 	ovpn = sock->ovpn;
 	rcu_read_unlock();
 
-	if (ovpn->mode == OVPN_MODE_P2P)
-		ovpn_peer_release_p2p(ovpn, sk,
-				      OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT);
+	ovpn_peers_free(ovpn, sk, OVPN_DEL_PEER_REASON_TRANSPORT_DISCONNECT);
 }
 
 /**
From a3aaef8cd1730381392dcc4868b437da7d9ad5ac Mon Sep 17 00:00:00 2001
From: Antonio Quartulli
Date: Tue, 15 Apr 2025 13:17:32 +0200
Subject: [PATCH 275/770] ovpn: implement peer lookup logic

In a multi-peer scenario there are a number of situations when a
specific peer needs to be looked up.

We may want to look up a peer by:
1. its ID
2. its VPN destination IP
3. its transport IP/port pair

For each of the above, there is a dedicated lookup table referencing all
peers for fast retrieval.

Case 2. is a bit special in the sense that an outgoing packet may not be
sent to the peer VPN IP directly, but rather to a network behind it. For
this reason we first perform a nexthop lookup in the system routing
table and then use the retrieved nexthop as the peer search key.
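
As a worked example for case 2. (with made-up addresses): if the network
10.0.1.0/24 is routed via the peer whose VPN IP is 10.8.0.2, a packet
directed to 10.0.1.5 first resolves to nexthop 10.8.0.2 through the
routing table, and that address is then used as the key for the VPN-IP
lookup.
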
Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-15-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/peer.c | 301 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 291 insertions(+), 10 deletions(-) diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index bed2e591c000c..2d6cecc28c5d1 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "ovpnpriv.h" #include "bind.h" @@ -150,6 +151,121 @@ static int ovpn_peer_skb_to_sockaddr(struct sk_buff *skb, return -1; } +/** + * ovpn_nexthop_from_skb4 - retrieve IPv4 nexthop for outgoing skb + * @skb: the outgoing packet + * + * Return: the IPv4 of the nexthop + */ +static __be32 ovpn_nexthop_from_skb4(struct sk_buff *skb) +{ + const struct rtable *rt = skb_rtable(skb); + + if (rt && rt->rt_uses_gateway) + return rt->rt_gw4; + + return ip_hdr(skb)->daddr; +} + +/** + * ovpn_nexthop_from_skb6 - retrieve IPv6 nexthop for outgoing skb + * @skb: the outgoing packet + * + * Return: the IPv6 of the nexthop + */ +static struct in6_addr ovpn_nexthop_from_skb6(struct sk_buff *skb) +{ + const struct rt6_info *rt = skb_rt6_info(skb); + + if (!rt || !(rt->rt6i_flags & RTF_GATEWAY)) + return ipv6_hdr(skb)->daddr; + + return rt->rt6i_gateway; +} + +/* variable name __tbl2 needs to be different from __tbl1 + * in the macro below to avoid confusing clang + */ +#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl2 = &(_tbl); \ + jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2); \ +}) + +#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl1 = &(_tbl); \ + &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\ +}) + +/** + * ovpn_peer_get_by_vpn_addr4 - retrieve peer by its VPN IPv4 address + * @ovpn: the openvpn instance to search + * @addr: VPN IPv4 to use as search key + * + * Refcounter is not increased for the returned peer. + * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_vpn_addr4(struct ovpn_priv *ovpn, + __be32 addr) +{ + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + struct ovpn_peer *tmp; + unsigned int slot; + +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr4, &addr, + sizeof(addr)); + nhead = &ovpn->peers->by_vpn_addr4[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr4) + if (addr == tmp->vpn_addrs.ipv4.s_addr) + return tmp; + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (get_nulls_value(ntmp) != slot) + goto begin; + + return NULL; +} + +/** + * ovpn_peer_get_by_vpn_addr6 - retrieve peer by its VPN IPv6 address + * @ovpn: the openvpn instance to search + * @addr: VPN IPv6 to use as search key + * + * Refcounter is not increased for the returned peer. 
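+ * The caller is expected to hold the RCU read lock for as long as the
+ * returned pointer is in use.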
+ * + * Return: the peer if found or NULL otherwise + */ +static struct ovpn_peer *ovpn_peer_get_by_vpn_addr6(struct ovpn_priv *ovpn, + struct in6_addr *addr) +{ + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + struct ovpn_peer *tmp; + unsigned int slot; + +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_vpn_addr6, addr, + sizeof(*addr)); + nhead = &ovpn->peers->by_vpn_addr6[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, hash_entry_addr6) + if (ipv6_addr_equal(addr, &tmp->vpn_addrs.ipv6)) + return tmp; + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (get_nulls_value(ntmp) != slot) + goto begin; + + return NULL; +} + /** * ovpn_peer_transp_match - check if sockaddr and peer binding match * @peer: the peer to get the binding from @@ -227,14 +343,43 @@ ovpn_peer_get_by_transp_addr_p2p(struct ovpn_priv *ovpn, struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, struct sk_buff *skb) { - struct ovpn_peer *peer = NULL; + struct ovpn_peer *tmp, *peer = NULL; struct sockaddr_storage ss = { 0 }; + struct hlist_nulls_head *nhead; + struct hlist_nulls_node *ntmp; + unsigned int slot; + ssize_t sa_len; - if (unlikely(!ovpn_peer_skb_to_sockaddr(skb, &ss))) + sa_len = ovpn_peer_skb_to_sockaddr(skb, &ss); + if (unlikely(sa_len < 0)) return NULL; if (ovpn->mode == OVPN_MODE_P2P) - peer = ovpn_peer_get_by_transp_addr_p2p(ovpn, &ss); + return ovpn_peer_get_by_transp_addr_p2p(ovpn, &ss); + + rcu_read_lock(); +begin: + slot = ovpn_get_hash_slot(ovpn->peers->by_transp_addr, &ss, sa_len); + nhead = &ovpn->peers->by_transp_addr[slot]; + + hlist_nulls_for_each_entry_rcu(tmp, ntmp, nhead, + hash_entry_transp_addr) { + if (!ovpn_peer_transp_match(tmp, &ss)) + continue; + + if (!ovpn_peer_hold(tmp)) + continue; + + peer = tmp; + break; + } + + /* item may have moved during lookup - check nulls and restart + * if that's the case + */ + if (!peer && get_nulls_value(ntmp) != slot) + goto begin; + rcu_read_unlock(); return peer; } @@ -269,10 +414,27 @@ static struct ovpn_peer *ovpn_peer_get_by_id_p2p(struct ovpn_priv *ovpn, */ struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id) { - struct ovpn_peer *peer = NULL; + struct ovpn_peer *tmp, *peer = NULL; + struct hlist_head *head; if (ovpn->mode == OVPN_MODE_P2P) - peer = ovpn_peer_get_by_id_p2p(ovpn, peer_id); + return ovpn_peer_get_by_id_p2p(ovpn, peer_id); + + head = ovpn_get_hash_head(ovpn->peers->by_id, &peer_id, + sizeof(peer_id)); + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp, head, hash_entry_id) { + if (tmp->id != peer_id) + continue; + + if (!ovpn_peer_hold(tmp)) + continue; + + peer = tmp; + break; + } + rcu_read_unlock(); return peer; } @@ -330,6 +492,8 @@ struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, struct sk_buff *skb) { struct ovpn_peer *peer = NULL; + struct in6_addr addr6; + __be32 addr4; /* in P2P mode, no matter the destination, packets are always sent to * the single peer listening on the other side @@ -340,11 +504,109 @@ struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, if (unlikely(peer && !ovpn_peer_hold(peer))) peer = NULL; rcu_read_unlock(); + return peer; } + rcu_read_lock(); + switch (skb->protocol) { + case htons(ETH_P_IP): + addr4 = ovpn_nexthop_from_skb4(skb); + peer = ovpn_peer_get_by_vpn_addr4(ovpn, addr4); + break; + case htons(ETH_P_IPV6): + addr6 = ovpn_nexthop_from_skb6(skb); + peer = ovpn_peer_get_by_vpn_addr6(ovpn, &addr6); + break; + } + + if (unlikely(peer && 
!ovpn_peer_hold(peer))) + peer = NULL; + rcu_read_unlock(); + return peer; } +/** + * ovpn_nexthop_from_rt4 - look up the IPv4 nexthop for the given destination + * @ovpn: the private data representing the current VPN session + * @dest: the destination to be looked up + * + * Looks up in the IPv4 system routing table the IP of the nexthop to be used + * to reach the destination passed as argument. If no nexthop can be found, the + * destination itself is returned as it probably has to be used as nexthop. + * + * Return: the IP of the next hop if found or dest itself otherwise + */ +static __be32 ovpn_nexthop_from_rt4(struct ovpn_priv *ovpn, __be32 dest) +{ + struct rtable *rt; + struct flowi4 fl = { + .daddr = dest + }; + + rt = ip_route_output_flow(dev_net(ovpn->dev), &fl, NULL); + if (IS_ERR(rt)) { + net_dbg_ratelimited("%s: no route to host %pI4\n", + netdev_name(ovpn->dev), &dest); + /* if we end up here this packet is probably going to be + * thrown away later + */ + return dest; + } + + if (!rt->rt_uses_gateway) + goto out; + + dest = rt->rt_gw4; +out: + ip_rt_put(rt); + return dest; +} + +/** + * ovpn_nexthop_from_rt6 - look up the IPv6 nexthop for the given destination + * @ovpn: the private data representing the current VPN session + * @dest: the destination to be looked up + * + * Looks up in the IPv6 system routing table the IP of the nexthop to be used + * to reach the destination passed as argument. If no nexthop can be found, the + * destination itself is returned as it probably has to be used as nexthop. + * + * Return: the IP of the next hop if found or dest itself otherwise + */ +static struct in6_addr ovpn_nexthop_from_rt6(struct ovpn_priv *ovpn, + struct in6_addr dest) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct dst_entry *entry; + struct rt6_info *rt; + struct flowi6 fl = { + .daddr = dest, + }; + + entry = ipv6_stub->ipv6_dst_lookup_flow(dev_net(ovpn->dev), NULL, &fl, + NULL); + if (IS_ERR(entry)) { + net_dbg_ratelimited("%s: no route to host %pI6c\n", + netdev_name(ovpn->dev), &dest); + /* if we end up here this packet is probably going to be + * thrown away later + */ + return dest; + } + + rt = dst_rt6_info(entry); + + if (!(rt->rt6i_flags & RTF_GATEWAY)) + goto out; + + dest = rt->rt6i_gateway; +out: + dst_release((struct dst_entry *)rt); +#endif + return dest; +} + /** * ovpn_peer_check_by_src - check that skb source is routed via peer * @ovpn: the openvpn instance to search @@ -357,21 +619,40 @@ bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, struct ovpn_peer *peer) { bool match = false; + struct in6_addr addr6; + __be32 addr4; if (ovpn->mode == OVPN_MODE_P2P) { /* in P2P mode, no matter the destination, packets are always * sent to the single peer listening on the other side */ - match = (peer == rcu_access_pointer(ovpn->peer)); + return peer == rcu_access_pointer(ovpn->peer); + } + + /* This function performs a reverse path check, therefore we now + * lookup the nexthop we would use if we wanted to route a packet + * to the source IP. 
If the nexthop matches the sender we know the + * latter is valid and we allow the packet to come in + */ + + switch (skb->protocol) { + case htons(ETH_P_IP): + addr4 = ovpn_nexthop_from_rt4(ovpn, ip_hdr(skb)->saddr); + rcu_read_lock(); + match = (peer == ovpn_peer_get_by_vpn_addr4(ovpn, addr4)); + rcu_read_unlock(); + break; + case htons(ETH_P_IPV6): + addr6 = ovpn_nexthop_from_rt6(ovpn, ipv6_hdr(skb)->saddr); + rcu_read_lock(); + match = (peer == ovpn_peer_get_by_vpn_addr6(ovpn, &addr6)); + rcu_read_unlock(); + break; + } return match; } -#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ - typeof(_tbl) *__tbl = &(_tbl); \ - (&(*__tbl)[jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl)]); }) \ - /** * ovpn_peer_add_mp - add peer to related tables in a MP instance * @ovpn: the instance to add the peer to
From 3ecfd9349f4093880d5884dbcd0b4be1b5471f47 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:33 +0200 Subject: [PATCH 276/770] ovpn: implement keepalive mechanism
OpenVPN supports configuring a periodic keepalive message to allow the remote endpoint to detect link failures. This change implements the keepalive sending and timer expiry logic.
Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-16-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/io.c | 78 +++++++++++++- drivers/net/ovpn/io.h | 5 + drivers/net/ovpn/main.c | 2 + drivers/net/ovpn/ovpnpriv.h | 2 + drivers/net/ovpn/peer.c | 206 ++++++++++++++++++++++++++++++++++++ drivers/net/ovpn/peer.h | 21 +++- 6 files changed, 311 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index 5479b40f84a4d..a1816f72634bb 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -27,6 +27,33 @@ #include "skb.h" #include "socket.h" +const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE] = { + 0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb, + 0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48 +}; + +/** + * ovpn_is_keepalive - check if skb contains a keepalive message + * @skb: packet to check + * + * Assumes that the first byte of skb->data is defined. + * + * Return: true if skb contains a keepalive or false otherwise + */ +static bool ovpn_is_keepalive(struct sk_buff *skb) +{ + if (*skb->data != ovpn_keepalive_message[0]) + return false; + + if (skb->len != OVPN_KEEPALIVE_SIZE) + return false; + + if (!pskb_may_pull(skb, OVPN_KEEPALIVE_SIZE)) + return false; + + return !memcmp(skb->data, ovpn_keepalive_message, OVPN_KEEPALIVE_SIZE); +} + /* Called after decrypt to write the IP packet to the device. * This method is expected to manage/free the skb.
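For reference, the keepalive check boils down to matching a fixed, well-known 16-byte pattern. A userspace sketch of the same test order used by ovpn_is_keepalive() above, cheapest check first (the sample buffer in main() is made up for the demo):

#include <stdio.h>
#include <string.h>

#define OVPN_KEEPALIVE_SIZE 16

/* same well-known pattern carried in the patch above */
static const unsigned char keepalive_msg[OVPN_KEEPALIVE_SIZE] = {
	0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb,
	0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48
};

/* mirrors the first-byte, then length, then full memcmp ordering */
static int is_keepalive(const unsigned char *buf, size_t len)
{
	if (len == 0 || buf[0] != keepalive_msg[0])
		return 0;
	if (len != OVPN_KEEPALIVE_SIZE)
		return 0;
	return !memcmp(buf, keepalive_msg, OVPN_KEEPALIVE_SIZE);
}

int main(void)
{
	unsigned char pkt[OVPN_KEEPALIVE_SIZE];

	memcpy(pkt, keepalive_msg, sizeof(pkt));
	printf("keepalive? %d\n", is_keepalive(pkt, sizeof(pkt)));	/* 1 */
	pkt[5] ^= 0xff;	/* corrupt one byte */
	printf("keepalive? %d\n", is_keepalive(pkt, sizeof(pkt)));	/* 0 */
	return 0;
}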
*/ @@ -101,6 +128,9 @@ void ovpn_decrypt_post(void *data, int ret) goto drop; } + /* keep track of last received authenticated packet for keepalive */ + WRITE_ONCE(peer->last_recv, ktime_get_real_seconds()); + /* point to encapsulated IP packet */ __skb_pull(skb, payload_offset); @@ -118,6 +148,15 @@ void ovpn_decrypt_post(void *data, int ret) goto drop; } + if (ovpn_is_keepalive(skb)) { + net_dbg_ratelimited("%s: ping received from peer %u\n", + netdev_name(peer->ovpn->dev), + peer->id); + /* we drop the packet, but this is not a failure */ + consume_skb(skb); + goto drop_nocount; + } + net_info_ratelimited("%s: unsupported protocol received from peer %u\n", netdev_name(peer->ovpn->dev), peer->id); goto drop; @@ -143,11 +182,12 @@ void ovpn_decrypt_post(void *data, int ret) drop: if (unlikely(skb)) dev_dstats_rx_dropped(peer->ovpn->dev); + kfree_skb(skb); +drop_nocount: if (likely(peer)) ovpn_peer_put(peer); if (likely(ks)) ovpn_crypto_key_slot_put(ks); - kfree_skb(skb); } /* RX path entry point: decrypt packet and forward it to the device */ @@ -221,6 +261,8 @@ void ovpn_encrypt_post(void *data, int ret) } ovpn_peer_stats_increment_tx(&peer->link_stats, orig_len); + /* keep track of last sent packet for keepalive */ + WRITE_ONCE(peer->last_sent, ktime_get_real_seconds()); /* skb passed down the stack - don't free it */ skb = NULL; err_unlock: @@ -346,3 +388,37 @@ netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb_list(skb); return NET_XMIT_DROP; } + +/** + * ovpn_xmit_special - encrypt and transmit an out-of-band message to peer + * @peer: peer to send the message to + * @data: message content + * @len: message length + * + * Assumes that caller holds a reference to peer, which will be + * passed to ovpn_send() + */ +void ovpn_xmit_special(struct ovpn_peer *peer, const void *data, + const unsigned int len) +{ + struct ovpn_priv *ovpn; + struct sk_buff *skb; + + ovpn = peer->ovpn; + if (unlikely(!ovpn)) { + ovpn_peer_put(peer); + return; + } + + skb = alloc_skb(256 + len, GFP_ATOMIC); + if (unlikely(!skb)) { + ovpn_peer_put(peer); + return; + } + + skb_reserve(skb, 128); + skb->priority = TC_PRIO_BESTEFFORT; + __skb_put_data(skb, data, len); + + ovpn_send(ovpn, skb, peer); +} diff --git a/drivers/net/ovpn/io.h b/drivers/net/ovpn/io.h index 5143104b2c4b8..db9e10f9077c4 100644 --- a/drivers/net/ovpn/io.h +++ b/drivers/net/ovpn/io.h @@ -19,9 +19,14 @@ /* max padding required by encryption */ #define OVPN_MAX_PADDING 16 +#define OVPN_KEEPALIVE_SIZE 16 +extern const unsigned char ovpn_keepalive_message[OVPN_KEEPALIVE_SIZE]; + netdev_tx_t ovpn_net_xmit(struct sk_buff *skb, struct net_device *dev); void ovpn_recv(struct ovpn_peer *peer, struct sk_buff *skb); +void ovpn_xmit_special(struct ovpn_peer *peer, const void *data, + const unsigned int len); void ovpn_encrypt_post(void *data, int ret); void ovpn_decrypt_post(void *data, int ret); diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 889c4585f06bc..232eeb08e9292 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -170,6 +170,7 @@ static int ovpn_newlink(struct net_device *dev, ovpn->dev = dev; ovpn->mode = mode; spin_lock_init(&ovpn->lock); + INIT_DELAYED_WORK(&ovpn->keepalive_work, ovpn_peer_keepalive_work); /* Set carrier explicitly after registration, this way state is * clearly defined. 
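A note on the workqueue pattern introduced here: the delayed_work is initialized once at newlink time and must be cancelled synchronously on teardown, as the dellink hunk below does. A minimal out-of-tree module sketch of that lifecycle (module and symbol names are illustrative, not from the patch):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct delayed_work demo_work;

static void demo_fn(struct work_struct *work)
{
	pr_info("demo: delayed work ran\n");
}

static int __init demo_init(void)
{
	/* one-time setup, like INIT_DELAYED_WORK() in ovpn_newlink() */
	INIT_DELAYED_WORK(&demo_work, demo_fn);
	/* arm it to run after ~2 seconds */
	schedule_delayed_work(&demo_work, 2 * HZ);
	return 0;
}

static void __exit demo_exit(void)
{
	/* like ovpn_dellink(): guarantee the handler cannot run anymore */
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

mod_delayed_work() with a zero delay, as ovpn_peer_keepalive_set() below uses, (re)arms the same work item to run as soon as possible so the next deadline can be recomputed.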
@@ -191,6 +192,7 @@ static void ovpn_dellink(struct net_device *dev, struct list_head *head) { struct ovpn_priv *ovpn = netdev_priv(dev); + cancel_delayed_work_sync(&ovpn->keepalive_work); ovpn_peers_free(ovpn, NULL, OVPN_DEL_PEER_REASON_TEARDOWN); unregister_netdevice_queue(dev, head); } diff --git a/drivers/net/ovpn/ovpnpriv.h b/drivers/net/ovpn/ovpnpriv.h index 5ba3c3e446e4d..5898f6adada7f 100644 --- a/drivers/net/ovpn/ovpnpriv.h +++ b/drivers/net/ovpn/ovpnpriv.h @@ -40,6 +40,7 @@ struct ovpn_peer_collection { * @peers: data structures holding multi-peer references * @peer: in P2P mode, this is the only remote peer * @gro_cells: pointer to the Generic Receive Offload cell + * @keepalive_work: struct used to schedule keepalive periodic job */ struct ovpn_priv { struct net_device *dev; @@ -48,6 +49,7 @@ struct ovpn_priv { struct ovpn_peer_collection *peers; struct ovpn_peer __rcu *peer; struct gro_cells gro_cells; + struct delayed_work keepalive_work; }; #endif /* _NET_OVPN_OVPNSTRUCT_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 2d6cecc28c5d1..50514736a4327 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -36,6 +36,52 @@ static void unlock_ovpn(struct ovpn_priv *ovpn, } } +/** + * ovpn_peer_keepalive_set - configure keepalive values for peer + * @peer: the peer to configure + * @interval: outgoing keepalive interval + * @timeout: incoming keepalive timeout + */ +void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout) +{ + time64_t now = ktime_get_real_seconds(); + + netdev_dbg(peer->ovpn->dev, + "scheduling keepalive for peer %u: interval=%u timeout=%u\n", + peer->id, interval, timeout); + + peer->keepalive_interval = interval; + WRITE_ONCE(peer->last_sent, now); + peer->keepalive_xmit_exp = now + interval; + + peer->keepalive_timeout = timeout; + WRITE_ONCE(peer->last_recv, now); + peer->keepalive_recv_exp = now + timeout; + + /* now that interval and timeout have been changed, kick + * off the worker so that the next delay can be recomputed + */ + mod_delayed_work(system_wq, &peer->ovpn->keepalive_work, 0); +} + +/** + * ovpn_peer_keepalive_send - periodic worker sending keepalive packets + * @work: pointer to the work member of the related peer object + * + * NOTE: the reference to peer is not dropped because it gets inherited + * by ovpn_xmit_special() + */ +static void ovpn_peer_keepalive_send(struct work_struct *work) +{ + struct ovpn_peer *peer = container_of(work, struct ovpn_peer, + keepalive_work); + + local_bh_disable(); + ovpn_xmit_special(peer, ovpn_keepalive_message, + sizeof(ovpn_keepalive_message)); + local_bh_enable(); +} + /** * ovpn_peer_new - allocate and initialize a new peer object * @ovpn: the openvpn instance inside which the peer should be created @@ -65,6 +111,7 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) kref_init(&peer->refcount); ovpn_peer_stats_init(&peer->vpn_stats); ovpn_peer_stats_init(&peer->link_stats); + INIT_WORK(&peer->keepalive_work, ovpn_peer_keepalive_send); ret = dst_cache_init(&peer->dst_cache, GFP_KERNEL); if (ret < 0) { @@ -948,3 +995,162 @@ void ovpn_peers_free(struct ovpn_priv *ovpn, struct sock *sk, break; } } + +static time64_t ovpn_peer_keepalive_work_single(struct ovpn_peer *peer, + time64_t now, + struct llist_head *release_list) +{ + time64_t last_recv, last_sent, next_run1, next_run2; + unsigned long timeout, interval; + bool expired; + + spin_lock_bh(&peer->lock); + /* we expect both timers to be configured at the same time, + * therefore 
bail out if either is not set + */ + if (!peer->keepalive_timeout || !peer->keepalive_interval) { + spin_unlock_bh(&peer->lock); + return 0; + } + + /* check for peer timeout */ + expired = false; + timeout = peer->keepalive_timeout; + last_recv = READ_ONCE(peer->last_recv); + if (now < last_recv + timeout) { + peer->keepalive_recv_exp = last_recv + timeout; + next_run1 = peer->keepalive_recv_exp; + } else if (peer->keepalive_recv_exp > now) { + next_run1 = peer->keepalive_recv_exp; + } else { + expired = true; + } + + if (expired) { + /* peer is dead -> kill it and move on */ + spin_unlock_bh(&peer->lock); + netdev_dbg(peer->ovpn->dev, "peer %u expired\n", + peer->id); + ovpn_peer_remove(peer, OVPN_DEL_PEER_REASON_EXPIRED, + release_list); + return 0; + } + + /* check for peer keepalive */ + expired = false; + interval = peer->keepalive_interval; + last_sent = READ_ONCE(peer->last_sent); + if (now < last_sent + interval) { + peer->keepalive_xmit_exp = last_sent + interval; + next_run2 = peer->keepalive_xmit_exp; + } else if (peer->keepalive_xmit_exp > now) { + next_run2 = peer->keepalive_xmit_exp; + } else { + expired = true; + next_run2 = now + interval; + } + spin_unlock_bh(&peer->lock); + + if (expired) { + /* a keepalive packet is required */ + netdev_dbg(peer->ovpn->dev, + "sending keepalive to peer %u\n", + peer->id); + if (schedule_work(&peer->keepalive_work)) + ovpn_peer_hold(peer); + } + + if (next_run1 < next_run2) + return next_run1; + + return next_run2; +} + +static time64_t ovpn_peer_keepalive_work_mp(struct ovpn_priv *ovpn, + time64_t now, + struct llist_head *release_list) +{ + time64_t tmp_next_run, next_run = 0; + struct hlist_node *tmp; + struct ovpn_peer *peer; + int bkt; + + lockdep_assert_held(&ovpn->lock); + + hash_for_each_safe(ovpn->peers->by_id, bkt, tmp, peer, hash_entry_id) { + tmp_next_run = ovpn_peer_keepalive_work_single(peer, now, + release_list); + if (!tmp_next_run) + continue; + + /* the next worker run will be scheduled based on the shortest + * required interval across all peers + */ + if (!next_run || tmp_next_run < next_run) + next_run = tmp_next_run; + } + + return next_run; +} + +static time64_t ovpn_peer_keepalive_work_p2p(struct ovpn_priv *ovpn, + time64_t now, + struct llist_head *release_list) +{ + struct ovpn_peer *peer; + time64_t next_run = 0; + + lockdep_assert_held(&ovpn->lock); + + peer = rcu_dereference_protected(ovpn->peer, + lockdep_is_held(&ovpn->lock)); + if (peer) + next_run = ovpn_peer_keepalive_work_single(peer, now, + release_list); + + return next_run; +} + +/** + * ovpn_peer_keepalive_work - run keepalive logic on each known peer + * @work: pointer to the work member of the related ovpn object + * + * Each peer has two timers (if configured): + * 1. peer timeout: when no data is received for a certain interval, + * the peer is considered dead and it gets killed. + * 2. peer keepalive: when no data is sent to a certain peer for a + * certain interval, a special 'keepalive' packet is explicitly sent. + * + * This function iterates across the whole peer collection while + * checking the timers described above. 
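The deadline arithmetic in ovpn_peer_keepalive_work_single() follows the same pattern for both timers; a userspace sketch of that decision (the timestamps in main() are arbitrary):

#include <stdio.h>
#include <time.h>

/* given the last activity timestamp and the configured window, either
 * report the next deadline to check at, or flag that it already passed
 */
static time_t next_deadline(time_t now, time_t last, time_t window,
			    time_t exp, int *expired)
{
	*expired = 0;
	if (now < last + window)
		return last + window;	/* fresh activity pushed deadline out */
	if (exp > now)
		return exp;	/* previously computed deadline still ahead */
	*expired = 1;	/* deadline passed: kill peer or send keepalive */
	return 0;
}

int main(void)
{
	time_t now = 1000;
	int expired;

	/* receive side: last packet at t=990, 60s timeout -> recheck at 1050 */
	printf("recv next=%ld expired=%d\n",
	       (long)next_deadline(now, 990, 60, 0, &expired), expired);
	/* transmit side: last sent at t=900, 20s interval -> keepalive due */
	printf("xmit next=%ld expired=%d\n",
	       (long)next_deadline(now, 900, 20, 910, &expired), expired);
	return 0;
}

The worker then re-arms itself using the smallest deadline reported across all peers, as computed by ovpn_peer_keepalive_work_mp().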
+ */ +void ovpn_peer_keepalive_work(struct work_struct *work) +{ + struct ovpn_priv *ovpn = container_of(work, struct ovpn_priv, + keepalive_work.work); + time64_t next_run = 0, now = ktime_get_real_seconds(); + LLIST_HEAD(release_list); + + spin_lock_bh(&ovpn->lock); + switch (ovpn->mode) { + case OVPN_MODE_MP: + next_run = ovpn_peer_keepalive_work_mp(ovpn, now, + &release_list); + break; + case OVPN_MODE_P2P: + next_run = ovpn_peer_keepalive_work_p2p(ovpn, now, + &release_list); + break; + } + + /* prevent rearming if the interface is being destroyed */ + if (next_run > 0 && + READ_ONCE(ovpn->dev->reg_state) == NETREG_REGISTERED) { + netdev_dbg(ovpn->dev, + "scheduling keepalive work: now=%llu next_run=%llu delta=%llu\n", + next_run, now, next_run - now); + schedule_delayed_work(&ovpn->keepalive_work, + (next_run - now) * HZ); + } + unlock_ovpn(ovpn, &release_list); +} diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index 2a3b1031f58dd..e747c4b210642 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -45,13 +45,20 @@ * @crypto: the crypto configuration (ciphers, keys, etc..) * @dst_cache: cache for dst_entry used to send to peer * @bind: remote peer binding + * @keepalive_interval: seconds after which a new keepalive should be sent + * @keepalive_xmit_exp: future timestamp when next keepalive should be sent + * @last_sent: timestamp of the last successfully sent packet + * @keepalive_timeout: seconds after which an inactive peer is considered dead + * @keepalive_recv_exp: future timestamp when the peer should expire + * @last_recv: timestamp of the last authenticated received packet * @vpn_stats: per-peer in-VPN TX/RX stats * @link_stats: per-peer link/transport TX/RX stats * @delete_reason: why peer was deleted (i.e. timeout, transport error, ..) - * @lock: protects binding to peer (bind) + * @lock: protects binding to peer (bind) and keepalive* fields * @refcount: reference counter * @rcu: used to free peer in an RCU safe way * @release_entry: entry for the socket release list + * @keepalive_work: used to schedule keepalive sending */ struct ovpn_peer { struct ovpn_priv *ovpn; @@ -91,13 +98,20 @@ struct ovpn_peer { struct ovpn_crypto_state crypto; struct dst_cache dst_cache; struct ovpn_bind __rcu *bind; + unsigned long keepalive_interval; + unsigned long keepalive_xmit_exp; + time64_t last_sent; + unsigned long keepalive_timeout; + unsigned long keepalive_recv_exp; + time64_t last_recv; struct ovpn_peer_stats vpn_stats; struct ovpn_peer_stats link_stats; enum ovpn_del_peer_reason delete_reason; - spinlock_t lock; /* protects bind */ + spinlock_t lock; /* protects bind and keepalive* */ struct kref refcount; struct rcu_head rcu; struct llist_node release_entry; + struct work_struct keepalive_work; }; /** @@ -136,4 +150,7 @@ struct ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, struct ovpn_peer *peer); +void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout); +void ovpn_peer_keepalive_work(struct work_struct *work); + #endif /* _NET_OVPN_OVPNPEER_H_ */ From f0281c1d3732d1ec084fa1e06d1767d33e1a570e Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:34 +0200 Subject: [PATCH 277/770] ovpn: add support for updating local or remote UDP endpoint In case of UDP links, the local or remote endpoint used to communicate with a given peer may change without a connection restart. 
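As an illustration of such a float, a plain UDP receiver can observe it by comparing source addresses across datagrams; the sketch below (port and buffer size are arbitrary choices) merely prints what the driver code in this patch learns and rebinds automatically:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
	struct sockaddr_in bound = { .sin_family = AF_INET,
				     .sin_port = htons(1194),
				     .sin_addr = { htonl(INADDR_ANY) } };
	struct sockaddr_in last = { 0 }, cur;
	socklen_t len;
	char buf[2048];
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0 || bind(fd, (struct sockaddr *)&bound, sizeof(bound)) < 0)
		return 1;

	for (;;) {
		len = sizeof(cur);
		if (recvfrom(fd, buf, sizeof(buf), 0,
			     (struct sockaddr *)&cur, &len) < 0)
			break;
		/* the peer "floated" if the same session now arrives from a
		 * new address/port pair - ovpn rebinds and rehashes then
		 */
		if (last.sin_family &&
		    (cur.sin_addr.s_addr != last.sin_addr.s_addr ||
		     cur.sin_port != last.sin_port))
			printf("peer floated to %s:%u\n",
			       inet_ntoa(cur.sin_addr), ntohs(cur.sin_port));
		last = cur;
	}
	close(fd);
	return 0;
}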
Add support for learning the new address in case of change. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-17-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/io.c | 8 ++ drivers/net/ovpn/peer.c | 213 +++++++++++++++++++++++++++++++++++++--- drivers/net/ovpn/peer.h | 2 + 3 files changed, 210 insertions(+), 13 deletions(-) diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index a1816f72634bb..fd8820c7d1337 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -96,6 +96,7 @@ void ovpn_decrypt_post(void *data, int ret) struct ovpn_crypto_key_slot *ks; unsigned int payload_offset = 0; struct sk_buff *skb = data; + struct ovpn_socket *sock; struct ovpn_peer *peer; __be16 proto; __be32 *pid; @@ -131,6 +132,13 @@ void ovpn_decrypt_post(void *data, int ret) /* keep track of last received authenticated packet for keepalive */ WRITE_ONCE(peer->last_recv, ktime_get_real_seconds()); + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (sock && sock->sock->sk->sk_protocol == IPPROTO_UDP) + /* check if this peer changed local or remote endpoint */ + ovpn_peer_endpoints_update(peer, skb); + rcu_read_unlock(); + /* point to encapsulated IP packet */ __skb_pull(skb, payload_offset); diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 50514736a4327..10a6fd3e6ecd0 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -127,6 +127,206 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) return peer; } +/** + * ovpn_peer_reset_sockaddr - recreate binding for peer + * @peer: peer to recreate the binding for + * @ss: sockaddr to use as remote endpoint for the binding + * @local_ip: local IP for the binding + * + * Return: 0 on success or a negative error code otherwise + */ +static int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, + const struct sockaddr_storage *ss, + const void *local_ip) +{ + struct ovpn_bind *bind; + size_t ip_len; + + lockdep_assert_held(&peer->lock); + + /* create new ovpn_bind object */ + bind = ovpn_bind_from_sockaddr(ss); + if (IS_ERR(bind)) + return PTR_ERR(bind); + + if (ss->ss_family == AF_INET) { + ip_len = sizeof(struct in_addr); + } else if (ss->ss_family == AF_INET6) { + ip_len = sizeof(struct in6_addr); + } else { + net_dbg_ratelimited("%s: invalid family %u for remote endpoint for peer %u\n", + netdev_name(peer->ovpn->dev), + ss->ss_family, peer->id); + kfree(bind); + return -EINVAL; + } + + memcpy(&bind->local, local_ip, ip_len); + + /* set binding */ + ovpn_bind_reset(peer, bind); + + return 0; +} + +/* variable name __tbl2 needs to be different from __tbl1 + * in the macro below to avoid confusing clang + */ +#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl2 = &(_tbl); \ + jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2); \ +}) + +#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ + typeof(_tbl) *__tbl1 = &(_tbl); \ + &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\ +}) + +/** + * ovpn_peer_endpoints_update - update remote or local endpoint for peer + * @peer: peer to update the remote endpoint for + * @skb: incoming packet to retrieve the source/destination address from + */ +void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb) +{ + struct hlist_nulls_head *nhead; + struct sockaddr_storage ss; + struct sockaddr_in6 *sa6; + bool reset_cache = false; + struct sockaddr_in *sa; + struct ovpn_bind *bind; + const void 
*local_ip; + size_t salen = 0; + + spin_lock_bh(&peer->lock); + bind = rcu_dereference_protected(peer->bind, + lockdep_is_held(&peer->lock)); + if (unlikely(!bind)) + goto unlock; + + switch (skb->protocol) { + case htons(ETH_P_IP): + /* float check */ + if (unlikely(!ovpn_bind_skb_src_match(bind, skb))) { + /* unconditionally save local endpoint in case + * of float, as it may have changed as well + */ + local_ip = &ip_hdr(skb)->daddr; + sa = (struct sockaddr_in *)&ss; + sa->sin_family = AF_INET; + sa->sin_addr.s_addr = ip_hdr(skb)->saddr; + sa->sin_port = udp_hdr(skb)->source; + salen = sizeof(*sa); + reset_cache = true; + break; + } + + /* if no float happened, let's double check if the local endpoint + * has changed + */ + if (unlikely(bind->local.ipv4.s_addr != ip_hdr(skb)->daddr)) { + net_dbg_ratelimited("%s: learning local IPv4 for peer %d (%pI4 -> %pI4)\n", + netdev_name(peer->ovpn->dev), + peer->id, &bind->local.ipv4.s_addr, + &ip_hdr(skb)->daddr); + bind->local.ipv4.s_addr = ip_hdr(skb)->daddr; + reset_cache = true; + } + break; + case htons(ETH_P_IPV6): + /* float check */ + if (unlikely(!ovpn_bind_skb_src_match(bind, skb))) { + /* unconditionally save local endpoint in case + * of float, as it may have changed as well + */ + local_ip = &ipv6_hdr(skb)->daddr; + sa6 = (struct sockaddr_in6 *)&ss; + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = ipv6_hdr(skb)->saddr; + sa6->sin6_port = udp_hdr(skb)->source; + sa6->sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->saddr, + skb->skb_iif); + salen = sizeof(*sa6); + reset_cache = true; + break; + } + + /* if no float happened, let's double check if the local endpoint + * has changed + */ + if (unlikely(!ipv6_addr_equal(&bind->local.ipv6, + &ipv6_hdr(skb)->daddr))) { + net_dbg_ratelimited("%s: learning local IPv6 for peer %d (%pI6c -> %pI6c\n", + netdev_name(peer->ovpn->dev), + peer->id, &bind->local.ipv6, + &ipv6_hdr(skb)->daddr); + bind->local.ipv6 = ipv6_hdr(skb)->daddr; + reset_cache = true; + } + break; + default: + goto unlock; + } + + if (unlikely(reset_cache)) + dst_cache_reset(&peer->dst_cache); + + /* if the peer did not float, we can bail out now */ + if (likely(!salen)) + goto unlock; + + if (unlikely(ovpn_peer_reset_sockaddr(peer, + (struct sockaddr_storage *)&ss, + local_ip) < 0)) + goto unlock; + + net_dbg_ratelimited("%s: peer %d floated to %pIScp", + netdev_name(peer->ovpn->dev), peer->id, &ss); + + spin_unlock_bh(&peer->lock); + + /* rehashing is required only in MP mode as P2P has one peer + * only and thus there is no hashtable + */ + if (peer->ovpn->mode == OVPN_MODE_MP) { + spin_lock_bh(&peer->ovpn->lock); + spin_lock_bh(&peer->lock); + bind = rcu_dereference_protected(peer->bind, + lockdep_is_held(&peer->lock)); + if (unlikely(!bind)) { + spin_unlock_bh(&peer->lock); + spin_unlock_bh(&peer->ovpn->lock); + return; + } + + /* This function may be invoked concurrently, therefore another + * float may have happened in parallel: perform rehashing + * using the peer->bind->remote directly as key + */ + + switch (bind->remote.in4.sin_family) { + case AF_INET: + salen = sizeof(*sa); + break; + case AF_INET6: + salen = sizeof(*sa6); + break; + } + + /* remove old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_transp_addr); + /* re-add with new transport address */ + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_transp_addr, + &bind->remote, salen); + hlist_nulls_add_head_rcu(&peer->hash_entry_transp_addr, nhead); + spin_unlock_bh(&peer->lock); + spin_unlock_bh(&peer->ovpn->lock); + } + return; +unlock: + 
spin_unlock_bh(&peer->lock); +} + /** * ovpn_peer_release_rcu - RCU callback performing last peer release steps * @head: RCU member of the ovpn_peer @@ -230,19 +430,6 @@ static struct in6_addr ovpn_nexthop_from_skb6(struct sk_buff *skb) return rt->rt6i_gateway; } -/* variable name __tbl2 needs to be different from __tbl1 - * in the macro below to avoid confusing clang - */ -#define ovpn_get_hash_slot(_tbl, _key, _key_len) ({ \ - typeof(_tbl) *__tbl2 = &(_tbl); \ - jhash(_key, _key_len, 0) % HASH_SIZE(*__tbl2); \ -}) - -#define ovpn_get_hash_head(_tbl, _key, _key_len) ({ \ - typeof(_tbl) *__tbl1 = &(_tbl); \ - &(*__tbl1)[ovpn_get_hash_slot(*__tbl1, _key, _key_len)];\ -}) - /** * ovpn_peer_get_by_vpn_addr4 - retrieve peer by its VPN IPv4 address * @ovpn: the openvpn instance to search diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index e747c4b210642..f1288734ff100 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -153,4 +153,6 @@ bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout); void ovpn_peer_keepalive_work(struct work_struct *work); +void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb); + #endif /* _NET_OVPN_OVPNPEER_H_ */ From 1d36a36f6d5347360ef9681a05f6166683bafd1d Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:35 +0200 Subject: [PATCH 278/770] ovpn: implement peer add/get/dump/delete via netlink This change introduces the netlink command needed to add, delete and retrieve/dump known peers. Userspace is expected to use these commands to handle known peer lifecycles. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-18-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/netlink.c | 686 ++++++++++++++++++++++++++++++++++++- drivers/net/ovpn/peer.c | 79 +++-- drivers/net/ovpn/peer.h | 5 + drivers/net/ovpn/socket.c | 4 +- 4 files changed, 738 insertions(+), 36 deletions(-) diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index 8d267d4c82283..5653a588a47c6 100644 --- a/drivers/net/ovpn/netlink.c +++ b/drivers/net/ovpn/netlink.c @@ -7,6 +7,7 @@ */ #include +#include #include #include @@ -15,6 +16,9 @@ #include "main.h" #include "netlink.h" #include "netlink-gen.h" +#include "bind.h" +#include "peer.h" +#include "socket.h" MODULE_ALIAS_GENL_FAMILY(OVPN_FAMILY_NAME); @@ -89,29 +93,701 @@ void ovpn_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, netdev_put(ovpn->dev, tracker); } +static bool ovpn_nl_attr_sockaddr_remote(struct nlattr **attrs, + struct sockaddr_storage *ss) +{ + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + struct in6_addr *in6; + __be16 port = 0; + __be32 *in; + + ss->ss_family = AF_UNSPEC; + + if (attrs[OVPN_A_PEER_REMOTE_PORT]) + port = nla_get_be16(attrs[OVPN_A_PEER_REMOTE_PORT]); + + if (attrs[OVPN_A_PEER_REMOTE_IPV4]) { + ss->ss_family = AF_INET; + in = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV4]); + } else if (attrs[OVPN_A_PEER_REMOTE_IPV6]) { + ss->ss_family = AF_INET6; + in6 = nla_data(attrs[OVPN_A_PEER_REMOTE_IPV6]); + } else { + return false; + } + + switch (ss->ss_family) { + case AF_INET6: + /* If this is a regular IPv6 just break and move on, + * otherwise switch to AF_INET and extract the IPv4 accordingly + */ + if (!ipv6_addr_v4mapped(in6)) { + sin6 = (struct sockaddr_in6 *)ss; + sin6->sin6_port = port; + 
memcpy(&sin6->sin6_addr, in6, sizeof(*in6)); + break; + } + + /* v4-mapped-v6 address */ + ss->ss_family = AF_INET; + in = &in6->s6_addr32[3]; + fallthrough; + case AF_INET: + sin = (struct sockaddr_in *)ss; + sin->sin_port = port; + sin->sin_addr.s_addr = *in; + break; + } + + return true; +} + +static u8 *ovpn_nl_attr_local_ip(struct nlattr **attrs) +{ + u8 *addr6; + + if (!attrs[OVPN_A_PEER_LOCAL_IPV4] && !attrs[OVPN_A_PEER_LOCAL_IPV6]) + return NULL; + + if (attrs[OVPN_A_PEER_LOCAL_IPV4]) + return nla_data(attrs[OVPN_A_PEER_LOCAL_IPV4]); + + addr6 = nla_data(attrs[OVPN_A_PEER_LOCAL_IPV6]); + /* this is an IPv4-mapped IPv6 address, therefore extract the actual + * v4 address from the last 4 bytes + */ + if (ipv6_addr_v4mapped((struct in6_addr *)addr6)) + return addr6 + 12; + + return addr6; +} + +static sa_family_t ovpn_nl_family_get(struct nlattr *addr4, + struct nlattr *addr6) +{ + if (addr4) + return AF_INET; + + if (addr6) { + if (ipv6_addr_v4mapped((struct in6_addr *)nla_data(addr6))) + return AF_INET; + return AF_INET6; + } + + return AF_UNSPEC; +} + +static int ovpn_nl_peer_precheck(struct ovpn_priv *ovpn, + struct genl_info *info, + struct nlattr **attrs) +{ + sa_family_t local_fam, remote_fam; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + if (attrs[OVPN_A_PEER_REMOTE_IPV4] && attrs[OVPN_A_PEER_REMOTE_IPV6]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify both remote IPv4 or IPv6 address"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && + !attrs[OVPN_A_PEER_REMOTE_IPV6] && attrs[OVPN_A_PEER_REMOTE_PORT]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify remote port without IP address"); + return -EINVAL; + } + + if ((attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6]) && + !attrs[OVPN_A_PEER_REMOTE_PORT]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify remote IP address without port"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV4] && + attrs[OVPN_A_PEER_LOCAL_IPV4]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify local IPv4 address without remote"); + return -EINVAL; + } + + if (!attrs[OVPN_A_PEER_REMOTE_IPV6] && + attrs[OVPN_A_PEER_LOCAL_IPV6]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify local IPV6 address without remote"); + return -EINVAL; + } + + /* check that local and remote address families are the same even + * after parsing v4mapped IPv6 addresses. 
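The v4-mapped handling above relies on the standard ::ffff:a.b.c.d encoding, in which the IPv4 address occupies the last four bytes of the in6_addr. A userspace sketch of the same extraction performed by ovpn_nl_attr_local_ip() (the address is an example from the documentation range):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int main(void)
{
	struct in6_addr in6;
	struct in_addr in4;
	char out[INET_ADDRSTRLEN];

	/* ::ffff:192.0.2.1 - an IPv4 address carried in IPv6 form */
	if (inet_pton(AF_INET6, "::ffff:192.0.2.1", &in6) != 1)
		return 1;

	if (IN6_IS_ADDR_V4MAPPED(&in6)) {
		/* same trick as the driver: the v4 address lives in the
		 * last 4 bytes of the 16-byte IPv6 address
		 */
		memcpy(&in4, (const unsigned char *)&in6 + 12, sizeof(in4));
		printf("mapped IPv4: %s\n",
		       inet_ntop(AF_INET, &in4, out, sizeof(out)));
	}
	return 0;
}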
+ * (if addresses are not provided, family will be AF_UNSPEC and + * the check is skipped) + */ + local_fam = ovpn_nl_family_get(attrs[OVPN_A_PEER_LOCAL_IPV4], + attrs[OVPN_A_PEER_LOCAL_IPV6]); + remote_fam = ovpn_nl_family_get(attrs[OVPN_A_PEER_REMOTE_IPV4], + attrs[OVPN_A_PEER_REMOTE_IPV6]); + if (local_fam != AF_UNSPEC && remote_fam != AF_UNSPEC && + local_fam != remote_fam) { + NL_SET_ERR_MSG_MOD(info->extack, + "mismatching local and remote address families"); + return -EINVAL; + } + + if (remote_fam != AF_INET6 && attrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { + NL_SET_ERR_MSG_MOD(info->extack, + "cannot specify scope id without remote IPv6 address"); + return -EINVAL; + } + + /* VPN IPs are needed only in MP mode for selecting the right peer */ + if (ovpn->mode == OVPN_MODE_P2P && (attrs[OVPN_A_PEER_VPN_IPV4] || + attrs[OVPN_A_PEER_VPN_IPV6])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected VPN IP in P2P mode"); + return -EINVAL; + } + + if ((attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + !attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) || + (!attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "keepalive interval and timeout are required together"); + return -EINVAL; + } + + return 0; +} + +/** + * ovpn_nl_peer_modify - modify the peer attributes according to the incoming msg + * @peer: the peer to modify + * @info: generic netlink info from the user request + * @attrs: the attributes from the user request + * + * Return: a negative error code in case of failure, 0 on success or 1 on + * success and the VPN IPs have been modified (requires rehashing in MP + * mode) + */ +static int ovpn_nl_peer_modify(struct ovpn_peer *peer, struct genl_info *info, + struct nlattr **attrs) +{ + struct sockaddr_storage ss = {}; + void *local_ip = NULL; + u32 interv, timeout; + bool rehash = false; + int ret; + + spin_lock_bh(&peer->lock); + + if (ovpn_nl_attr_sockaddr_remote(attrs, &ss)) { + /* we carry the local IP in a generic container. + * ovpn_peer_reset_sockaddr() will properly interpret it + * based on ss.ss_family + */ + local_ip = ovpn_nl_attr_local_ip(attrs); + + /* set peer sockaddr */ + ret = ovpn_peer_reset_sockaddr(peer, &ss, local_ip); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot set peer sockaddr: %d", + ret); + goto err_unlock; + } + dst_cache_reset(&peer->dst_cache); + } + + if (attrs[OVPN_A_PEER_VPN_IPV4]) { + rehash = true; + peer->vpn_addrs.ipv4.s_addr = + nla_get_in_addr(attrs[OVPN_A_PEER_VPN_IPV4]); + } + + if (attrs[OVPN_A_PEER_VPN_IPV6]) { + rehash = true; + peer->vpn_addrs.ipv6 = + nla_get_in6_addr(attrs[OVPN_A_PEER_VPN_IPV6]); + } + + /* when setting the keepalive, both parameters have to be configured */ + if (attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL] && + attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) { + interv = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]); + timeout = nla_get_u32(attrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]); + ovpn_peer_keepalive_set(peer, interv, timeout); + } + + netdev_dbg(peer->ovpn->dev, + "modify peer id=%u endpoint=%pIScp VPN-IPv4=%pI4 VPN-IPv6=%pI6c\n", + peer->id, &ss, + &peer->vpn_addrs.ipv4.s_addr, &peer->vpn_addrs.ipv6); + + spin_unlock_bh(&peer->lock); + + return rehash ? 
1 : 0; +err_unlock: + spin_unlock_bh(&peer->lock); + return ret; +} + int ovpn_nl_peer_new_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_socket *ovpn_sock; + struct socket *sock = NULL; + struct ovpn_peer *peer; + u32 sockfd, peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + ret = ovpn_nl_peer_precheck(ovpn, info, attrs); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_SOCKET)) + return -EINVAL; + + /* in MP mode VPN IPs are required for selecting the right peer */ + if (ovpn->mode == OVPN_MODE_MP && !attrs[OVPN_A_PEER_VPN_IPV4] && + !attrs[OVPN_A_PEER_VPN_IPV6]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "VPN IP must be provided in MP mode"); + return -EINVAL; + } + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_new(ovpn, peer_id); + if (IS_ERR(peer)) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot create new peer object for peer %u: %ld", + peer_id, PTR_ERR(peer)); + return PTR_ERR(peer); + } + + /* lookup the fd in the kernel table and extract the socket object */ + sockfd = nla_get_u32(attrs[OVPN_A_PEER_SOCKET]); + /* sockfd_lookup() increases sock's refcounter */ + sock = sockfd_lookup(sockfd, &ret); + if (!sock) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot lookup peer socket (fd=%u): %d", + sockfd, ret); + ret = -ENOTSOCK; + goto peer_release; + } + + /* Only when using UDP as transport protocol the remote endpoint + * can be configured so that ovpn knows where to send packets to. + */ + if (sock->sk->sk_protocol == IPPROTO_UDP && + !attrs[OVPN_A_PEER_REMOTE_IPV4] && + !attrs[OVPN_A_PEER_REMOTE_IPV6]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "missing remote IP address for UDP socket"); + sockfd_put(sock); + ret = -EINVAL; + goto peer_release; + } + + /* In case of TCP, the socket is connected to the peer and ovpn + * will just send bytes over it, without the need to specify a + * destination. 
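The UDP check above and the TCP check that follows encode one rule: a UDP transport needs an explicit remote endpoint, while a connected TCP socket must not be given one. A hedged userspace restatement of that rule (the helper name is illustrative, not kernel API):

#include <stdio.h>
#include <netinet/in.h>

/* illustrative restatement of the precheck in ovpn_nl_peer_new_doit() */
static int remote_addr_valid(int sk_protocol, int has_remote_addr)
{
	if (sk_protocol == IPPROTO_UDP && !has_remote_addr)
		return 0;	/* UDP: ovpn would not know where to send */
	if (sk_protocol == IPPROTO_TCP && has_remote_addr)
		return 0;	/* TCP: the socket is already connected */
	return 1;
}

int main(void)
{
	printf("udp+addr: %d\n", remote_addr_valid(IPPROTO_UDP, 1)); /* 1 */
	printf("udp-addr: %d\n", remote_addr_valid(IPPROTO_UDP, 0)); /* 0 */
	printf("tcp+addr: %d\n", remote_addr_valid(IPPROTO_TCP, 1)); /* 0 */
	printf("tcp-addr: %d\n", remote_addr_valid(IPPROTO_TCP, 0)); /* 1 */
	return 0;
}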
+ */ + if (sock->sk->sk_protocol == IPPROTO_TCP && + (attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6])) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected remote IP address with TCP socket"); + sockfd_put(sock); + ret = -EINVAL; + goto peer_release; + } + + ovpn_sock = ovpn_socket_new(sock, peer); + /* at this point we unconditionally drop the reference to the socket: + * - in case of error, the socket has to be dropped + * - if case of success, the socket is configured and let + * userspace own the reference, so that the latter can + * trigger the final close() + */ + sockfd_put(sock); + if (IS_ERR(ovpn_sock)) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot encapsulate socket: %ld", + PTR_ERR(ovpn_sock)); + ret = -ENOTSOCK; + goto peer_release; + } + + rcu_assign_pointer(peer->sock, ovpn_sock); + + ret = ovpn_nl_peer_modify(peer, info, attrs); + if (ret < 0) + goto sock_release; + + ret = ovpn_peer_add(ovpn, peer); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot add new peer (id=%u) to hashtable: %d", + peer->id, ret); + goto sock_release; + } + + return 0; + +sock_release: + ovpn_socket_release(peer); +peer_release: + /* release right away because peer was not yet hashed, thus it is not + * used in any context + */ + ovpn_peer_release(peer); + + return ret; } int ovpn_nl_peer_set_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_socket *sock; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + ret = ovpn_nl_peer_precheck(ovpn, info, attrs); + if (ret < 0) + return ret; + + if (attrs[OVPN_A_PEER_SOCKET]) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "socket cannot be modified"); + return -EINVAL; + } + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + /* when using a TCP socket the remote IP is not expected */ + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (sock && sock->sock->sk->sk_protocol == IPPROTO_TCP && + (attrs[OVPN_A_PEER_REMOTE_IPV4] || + attrs[OVPN_A_PEER_REMOTE_IPV6])) { + rcu_read_unlock(); + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "unexpected remote IP address with TCP socket"); + ovpn_peer_put(peer); + return -EINVAL; + } + rcu_read_unlock(); + + spin_lock_bh(&ovpn->lock); + ret = ovpn_nl_peer_modify(peer, info, attrs); + if (ret < 0) { + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + return ret; + } + + /* ret == 1 means that VPN IPv4/6 has been modified and rehashing + * is required + */ + if (ret > 0) + ovpn_peer_hash_vpn_ip(peer); + spin_unlock_bh(&ovpn->lock); + ovpn_peer_put(peer); + + return 0; +} + +static int ovpn_nl_send_peer(struct sk_buff *skb, const struct genl_info *info, + const struct ovpn_peer *peer, u32 portid, u32 seq, + int flags) +{ + const struct ovpn_bind *bind; + struct ovpn_socket *sock; + int ret = -EMSGSIZE; + struct nlattr *attr; + __be16 local_port; + void *hdr; + int id; + + hdr = genlmsg_put(skb, portid, seq, &ovpn_nl_family, flags, + OVPN_CMD_PEER_GET); + if (!hdr) + return -ENOBUFS; + + attr = nla_nest_start(skb, OVPN_A_PEER); + if (!attr) + goto err; + + rcu_read_lock(); + sock = 
rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + + if (!net_eq(genl_info_net(info), sock_net(sock->sock->sk))) { + id = peernet2id_alloc(genl_info_net(info), + sock_net(sock->sock->sk), + GFP_ATOMIC); + if (nla_put_s32(skb, OVPN_A_PEER_SOCKET_NETNSID, id)) + goto err_unlock; + } + local_port = inet_sk(sock->sock->sk)->inet_sport; + rcu_read_unlock(); + + if (nla_put_u32(skb, OVPN_A_PEER_ID, peer->id)) + goto err; + + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) + if (nla_put_in_addr(skb, OVPN_A_PEER_VPN_IPV4, + peer->vpn_addrs.ipv4.s_addr)) + goto err; + + if (!ipv6_addr_equal(&peer->vpn_addrs.ipv6, &in6addr_any)) + if (nla_put_in6_addr(skb, OVPN_A_PEER_VPN_IPV6, + &peer->vpn_addrs.ipv6)) + goto err; + + if (nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_INTERVAL, + peer->keepalive_interval) || + nla_put_u32(skb, OVPN_A_PEER_KEEPALIVE_TIMEOUT, + peer->keepalive_timeout)) + goto err; + + rcu_read_lock(); + bind = rcu_dereference(peer->bind); + if (bind) { + if (bind->remote.in4.sin_family == AF_INET) { + if (nla_put_in_addr(skb, OVPN_A_PEER_REMOTE_IPV4, + bind->remote.in4.sin_addr.s_addr) || + nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, + bind->remote.in4.sin_port) || + nla_put_in_addr(skb, OVPN_A_PEER_LOCAL_IPV4, + bind->local.ipv4.s_addr)) + goto err_unlock; + } else if (bind->remote.in4.sin_family == AF_INET6) { + if (nla_put_in6_addr(skb, OVPN_A_PEER_REMOTE_IPV6, + &bind->remote.in6.sin6_addr) || + nla_put_u32(skb, OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + bind->remote.in6.sin6_scope_id) || + nla_put_net16(skb, OVPN_A_PEER_REMOTE_PORT, + bind->remote.in6.sin6_port) || + nla_put_in6_addr(skb, OVPN_A_PEER_LOCAL_IPV6, + &bind->local.ipv6)) + goto err_unlock; + } + } + rcu_read_unlock(); + + if (nla_put_net16(skb, OVPN_A_PEER_LOCAL_PORT, local_port) || + /* VPN RX stats */ + nla_put_uint(skb, OVPN_A_PEER_VPN_RX_BYTES, + atomic64_read(&peer->vpn_stats.rx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_VPN_RX_PACKETS, + atomic64_read(&peer->vpn_stats.rx.packets)) || + /* VPN TX stats */ + nla_put_uint(skb, OVPN_A_PEER_VPN_TX_BYTES, + atomic64_read(&peer->vpn_stats.tx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_VPN_TX_PACKETS, + atomic64_read(&peer->vpn_stats.tx.packets)) || + /* link RX stats */ + nla_put_uint(skb, OVPN_A_PEER_LINK_RX_BYTES, + atomic64_read(&peer->link_stats.rx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_LINK_RX_PACKETS, + atomic64_read(&peer->link_stats.rx.packets)) || + /* link TX stats */ + nla_put_uint(skb, OVPN_A_PEER_LINK_TX_BYTES, + atomic64_read(&peer->link_stats.tx.bytes)) || + nla_put_uint(skb, OVPN_A_PEER_LINK_TX_PACKETS, + atomic64_read(&peer->link_stats.tx.packets))) + goto err; + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + return 0; +err_unlock: + rcu_read_unlock(); +err: + genlmsg_cancel(skb, hdr); + return ret; } int ovpn_nl_peer_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer *peer; + struct sk_buff *msg; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + 
NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + ret = ovpn_nl_send_peer(msg, info, peer, info->snd_portid, + info->snd_seq, 0); + if (ret < 0) { + nlmsg_free(msg); + goto err; + } + + ret = genlmsg_reply(msg, info); +err: + ovpn_peer_put(peer); + return ret; } int ovpn_nl_peer_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - return -EOPNOTSUPP; + const struct genl_info *info = genl_info_dump(cb); + int bkt, last_idx = cb->args[1], dumped = 0; + netdevice_tracker tracker; + struct ovpn_priv *ovpn; + struct ovpn_peer *peer; + + ovpn = ovpn_get_dev_from_attrs(sock_net(cb->skb->sk), info, &tracker); + if (IS_ERR(ovpn)) + return PTR_ERR(ovpn); + + if (ovpn->mode == OVPN_MODE_P2P) { + /* if we already dumped a peer it means we are done */ + if (last_idx) + goto out; + + rcu_read_lock(); + peer = rcu_dereference(ovpn->peer); + if (peer) { + if (ovpn_nl_send_peer(skb, info, peer, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) == 0) + dumped++; + } + rcu_read_unlock(); + } else { + rcu_read_lock(); + hash_for_each_rcu(ovpn->peers->by_id, bkt, peer, + hash_entry_id) { + /* skip already dumped peers that were dumped by + * previous invocations + */ + if (last_idx > 0) { + last_idx--; + continue; + } + + if (ovpn_nl_send_peer(skb, info, peer, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI) < 0) + break; + + /* count peers being dumped during this invocation */ + dumped++; + } + rcu_read_unlock(); + } + +out: + netdev_put(ovpn->dev, &tracker); + + /* sum up peers dumped in this message, so that at the next invocation + * we can continue from where we left + */ + cb->args[1] += dumped; + return skb->len; } int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_PEER_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_PEER)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_PEER_MAX, info->attrs[OVPN_A_PEER], + ovpn_peer_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_PEER], attrs, + OVPN_A_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + netdev_dbg(ovpn->dev, "del peer %u\n", peer->id); + ret = ovpn_peer_del(peer, OVPN_DEL_PEER_REASON_USERSPACE); + ovpn_peer_put(peer); + + return ret; } int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 10a6fd3e6ecd0..0469afd6da238 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -135,9 +135,9 @@ struct ovpn_peer *ovpn_peer_new(struct ovpn_priv *ovpn, u32 id) * * Return: 0 on success or a negative error code otherwise */ -static int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, - const struct sockaddr_storage *ss, - const void *local_ip) +int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, + const struct sockaddr_storage *ss, + const void *local_ip) { struct ovpn_bind *bind; size_t ip_len; @@ -149,19 +149,21 @@ static int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, if (IS_ERR(bind)) return PTR_ERR(bind); - if 
(ss->ss_family == AF_INET) { - ip_len = sizeof(struct in_addr); - } else if (ss->ss_family == AF_INET6) { - ip_len = sizeof(struct in6_addr); - } else { - net_dbg_ratelimited("%s: invalid family %u for remote endpoint for peer %u\n", - netdev_name(peer->ovpn->dev), - ss->ss_family, peer->id); - kfree(bind); - return -EINVAL; - } + if (local_ip) { + if (ss->ss_family == AF_INET) { + ip_len = sizeof(struct in_addr); + } else if (ss->ss_family == AF_INET6) { + ip_len = sizeof(struct in6_addr); + } else { + net_dbg_ratelimited("%s: invalid family %u for remote endpoint for peer %u\n", + netdev_name(peer->ovpn->dev), + ss->ss_family, peer->id); + kfree(bind); + return -EINVAL; + } - memcpy(&bind->local, local_ip, ip_len); + memcpy(&bind->local, local_ip, ip_len); + } /* set binding */ ovpn_bind_reset(peer, bind); @@ -346,7 +348,7 @@ static void ovpn_peer_release_rcu(struct rcu_head *head) * ovpn_peer_release - release peer private members * @peer: the peer to release */ -static void ovpn_peer_release(struct ovpn_peer *peer) +void ovpn_peer_release(struct ovpn_peer *peer) { ovpn_crypto_state_release(&peer->crypto); spin_lock_bh(&peer->lock); @@ -887,6 +889,37 @@ bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, return match; } +void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer) +{ + struct hlist_nulls_head *nhead; + + lockdep_assert_held(&peer->ovpn->lock); + + /* rehashing makes sense only in multipeer mode */ + if (peer->ovpn->mode != OVPN_MODE_MP) + return; + + if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) { + /* remove potential old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_addr4); + + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr4, + &peer->vpn_addrs.ipv4, + sizeof(peer->vpn_addrs.ipv4)); + hlist_nulls_add_head_rcu(&peer->hash_entry_addr4, nhead); + } + + if (!ipv6_addr_any(&peer->vpn_addrs.ipv6)) { + /* remove potential old hashing */ + hlist_nulls_del_init_rcu(&peer->hash_entry_addr6); + + nhead = ovpn_get_hash_head(peer->ovpn->peers->by_vpn_addr6, + &peer->vpn_addrs.ipv6, + sizeof(peer->vpn_addrs.ipv6)); + hlist_nulls_add_head_rcu(&peer->hash_entry_addr6, nhead); + } +} + /** * ovpn_peer_add_mp - add peer to related tables in a MP instance * @ovpn: the instance to add the peer to @@ -948,19 +981,7 @@ static int ovpn_peer_add_mp(struct ovpn_priv *ovpn, struct ovpn_peer *peer) ovpn_get_hash_head(ovpn->peers->by_id, &peer->id, sizeof(peer->id))); - if (peer->vpn_addrs.ipv4.s_addr != htonl(INADDR_ANY)) { - nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr4, - &peer->vpn_addrs.ipv4, - sizeof(peer->vpn_addrs.ipv4)); - hlist_nulls_add_head_rcu(&peer->hash_entry_addr4, nhead); - } - - if (!ipv6_addr_any(&peer->vpn_addrs.ipv6)) { - nhead = ovpn_get_hash_head(ovpn->peers->by_vpn_addr6, - &peer->vpn_addrs.ipv6, - sizeof(peer->vpn_addrs.ipv6)); - hlist_nulls_add_head_rcu(&peer->hash_entry_addr6, nhead); - } + ovpn_peer_hash_vpn_ip(peer); out: spin_unlock_bh(&ovpn->lock); return ret; diff --git a/drivers/net/ovpn/peer.h b/drivers/net/ovpn/peer.h index f1288734ff100..a1423f2b09e06 100644 --- a/drivers/net/ovpn/peer.h +++ b/drivers/net/ovpn/peer.h @@ -125,6 +125,7 @@ static inline bool ovpn_peer_hold(struct ovpn_peer *peer) return kref_get_unless_zero(&peer->refcount); } +void ovpn_peer_release(struct ovpn_peer *peer); void ovpn_peer_release_kref(struct kref *kref); /** @@ -147,6 +148,7 @@ struct ovpn_peer *ovpn_peer_get_by_transp_addr(struct ovpn_priv *ovpn, struct ovpn_peer *ovpn_peer_get_by_id(struct ovpn_priv *ovpn, u32 peer_id); struct 
ovpn_peer *ovpn_peer_get_by_dst(struct ovpn_priv *ovpn, struct sk_buff *skb); +void ovpn_peer_hash_vpn_ip(struct ovpn_peer *peer); bool ovpn_peer_check_by_src(struct ovpn_priv *ovpn, struct sk_buff *skb, struct ovpn_peer *peer); @@ -154,5 +156,8 @@ void ovpn_peer_keepalive_set(struct ovpn_peer *peer, u32 interval, u32 timeout); void ovpn_peer_keepalive_work(struct work_struct *work); void ovpn_peer_endpoints_update(struct ovpn_peer *peer, struct sk_buff *skb); +int ovpn_peer_reset_sockaddr(struct ovpn_peer *peer, + const struct sockaddr_storage *ss, + const void *local_ip); #endif /* _NET_OVPN_OVPNPEER_H_ */ diff --git a/drivers/net/ovpn/socket.c b/drivers/net/ovpn/socket.c index 3a18fbdd37216..a83cbab725910 100644 --- a/drivers/net/ovpn/socket.c +++ b/drivers/net/ovpn/socket.c @@ -49,8 +49,8 @@ static bool ovpn_socket_put(struct ovpn_peer *peer, struct ovpn_socket *sock) * ovpn_socket_release - release resources owned by socket user * @peer: peer whose socket should be released * - * This function should be invoked when the user is shutting - * down and wants to drop its link to the socket. + * This function should be invoked when the peer is being removed + * and wants to drop its link to the socket. * * In case of UDP, the detach routine will drop a reference to the * ovpn netdev, pointed by the ovpn_socket. From 203e2bf55990c96a7efa928b4407368a548dde97 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:36 +0200 Subject: [PATCH 279/770] ovpn: implement key add/get/del/swap via netlink This change introduces the netlink commands needed to add, get, delete and swap keys for a specific peer. Userspace is expected to use these commands to create, inspect (non sensitive data only), destroy and rotate session keys for a specific peer. 
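The rotation model described here is a two-slot scheme, detailed further in the kernel-doc of ovpn_nl_key_new_doit() below: install the new key into the 'secondary' slot, then swap. A userspace sketch of that state machine (the structures and names are illustrative, not the driver's):

#include <stdio.h>

struct slot { int key_id; int valid; };

struct crypto_state {
	struct slot slots[2];
	int primary_idx;	/* encryption always uses this slot */
};

/* step 1 of the renegotiation: new key goes into the secondary slot */
static void install_secondary(struct crypto_state *cs, int key_id)
{
	struct slot *s = &cs->slots[!cs->primary_idx];

	s->key_id = key_id;
	s->valid = 1;
}

/* step 2: swap roles; the old key stays around for late packets */
static void swap_slots(struct crypto_state *cs)
{
	cs->primary_idx = !cs->primary_idx;
}

/* decryption picks whichever slot matches the key ID in the packet */
static const struct slot *find_by_key_id(const struct crypto_state *cs, int id)
{
	for (int i = 0; i < 2; i++)
		if (cs->slots[i].valid && cs->slots[i].key_id == id)
			return &cs->slots[i];
	return NULL;
}

int main(void)
{
	struct crypto_state cs = { { { 1, 1 }, { 0, 0 } }, 0 };

	install_secondary(&cs, 2);
	swap_slots(&cs);
	printf("encrypt with key_id=%d\n", cs.slots[cs.primary_idx].key_id);
	printf("late packet, key_id=1: %s\n",
	       find_by_key_id(&cs, 1) ? "decryptable" : "dropped");
	return 0;
}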
Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-19-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/crypto.c | 40 +++++ drivers/net/ovpn/crypto.h | 4 + drivers/net/ovpn/crypto_aead.c | 17 ++ drivers/net/ovpn/crypto_aead.h | 2 + drivers/net/ovpn/netlink.c | 301 ++++++++++++++++++++++++++++++++- 5 files changed, 360 insertions(+), 4 deletions(-) diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c index 9544255c4588a..deeefffc09816 100644 --- a/drivers/net/ovpn/crypto.c +++ b/drivers/net/ovpn/crypto.c @@ -146,3 +146,43 @@ void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs) spin_unlock_bh(&cs->lock); } + +/** + * ovpn_crypto_config_get - populate keyconf object with non-sensible key data + * @cs: the crypto state to extract the key data from + * @slot: the specific slot to inspect + * @keyconf: the output object to populate + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot, + struct ovpn_key_config *keyconf) +{ + struct ovpn_crypto_key_slot *ks; + int idx; + + switch (slot) { + case OVPN_KEY_SLOT_PRIMARY: + idx = cs->primary_idx; + break; + case OVPN_KEY_SLOT_SECONDARY: + idx = !cs->primary_idx; + break; + default: + return -EINVAL; + } + + rcu_read_lock(); + ks = rcu_dereference(cs->slots[idx]); + if (!ks) { + rcu_read_unlock(); + return -ENOENT; + } + + keyconf->cipher_alg = ovpn_aead_crypto_alg(ks); + keyconf->key_id = ks->key_id; + rcu_read_unlock(); + + return 0; +} diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h index 5155791b87df7..487d24a7d2663 100644 --- a/drivers/net/ovpn/crypto.h +++ b/drivers/net/ovpn/crypto.h @@ -136,4 +136,8 @@ void ovpn_crypto_state_release(struct ovpn_crypto_state *cs); void ovpn_crypto_key_slots_swap(struct ovpn_crypto_state *cs); +int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, + enum ovpn_key_slot slot, + struct ovpn_key_config *keyconf); + #endif /* _NET_OVPN_OVPNCRYPTO_H_ */ diff --git a/drivers/net/ovpn/crypto_aead.c b/drivers/net/ovpn/crypto_aead.c index 83ec18e4b9a4f..74ee639ac8688 100644 --- a/drivers/net/ovpn/crypto_aead.c +++ b/drivers/net/ovpn/crypto_aead.c @@ -364,3 +364,20 @@ ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc) ovpn_aead_crypto_key_slot_destroy(ks); return ERR_PTR(ret); } + +enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks) +{ + const char *alg_name; + + if (!ks->encrypt) + return OVPN_CIPHER_ALG_NONE; + + alg_name = crypto_tfm_alg_name(crypto_aead_tfm(ks->encrypt)); + + if (!strcmp(alg_name, ALG_NAME_AES)) + return OVPN_CIPHER_ALG_AES_GCM; + else if (!strcmp(alg_name, ALG_NAME_CHACHAPOLY)) + return OVPN_CIPHER_ALG_CHACHA20_POLY1305; + else + return OVPN_CIPHER_ALG_NONE; +} diff --git a/drivers/net/ovpn/crypto_aead.h b/drivers/net/ovpn/crypto_aead.h index 40c056558add3..65a2ff3078986 100644 --- a/drivers/net/ovpn/crypto_aead.h +++ b/drivers/net/ovpn/crypto_aead.h @@ -24,4 +24,6 @@ struct ovpn_crypto_key_slot * ovpn_aead_crypto_key_slot_new(const struct ovpn_key_config *kc); void ovpn_aead_crypto_key_slot_destroy(struct ovpn_crypto_key_slot *ks); +enum ovpn_cipher_alg ovpn_aead_crypto_alg(struct ovpn_crypto_key_slot *ks); + #endif /* _NET_OVPN_OVPNAEAD_H_ */ diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index 5653a588a47c6..1f4220021df3a 100644 --- a/drivers/net/ovpn/netlink.c +++ 
b/drivers/net/ovpn/netlink.c @@ -17,6 +17,7 @@ #include "netlink.h" #include "netlink-gen.h" #include "bind.h" +#include "crypto.h" #include "peer.h" #include "socket.h" @@ -790,24 +791,316 @@ int ovpn_nl_peer_del_doit(struct sk_buff *skb, struct genl_info *info) return ret; } +static int ovpn_nl_get_key_dir(struct genl_info *info, struct nlattr *key, + enum ovpn_cipher_alg cipher, + struct ovpn_key_direction *dir) +{ + struct nlattr *attrs[OVPN_A_KEYDIR_MAX + 1]; + int ret; + + ret = nla_parse_nested(attrs, OVPN_A_KEYDIR_MAX, key, + ovpn_keydir_nl_policy, info->extack); + if (ret) + return ret; + + switch (cipher) { + case OVPN_CIPHER_ALG_AES_GCM: + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + if (NL_REQ_ATTR_CHECK(info->extack, key, attrs, + OVPN_A_KEYDIR_CIPHER_KEY) || + NL_REQ_ATTR_CHECK(info->extack, key, attrs, + OVPN_A_KEYDIR_NONCE_TAIL)) + return -EINVAL; + + dir->cipher_key = nla_data(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); + dir->cipher_key_size = nla_len(attrs[OVPN_A_KEYDIR_CIPHER_KEY]); + + /* These algorithms require a 96-bit nonce. It is constructed + * by combining the 4-byte packet ID with the 8-byte nonce + * tail provided by userspace. + */ + dir->nonce_tail = nla_data(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); + dir->nonce_tail_size = nla_len(attrs[OVPN_A_KEYDIR_NONCE_TAIL]); + break; + default: + NL_SET_ERR_MSG_MOD(info->extack, "unsupported cipher"); + return -EINVAL; + } + + return 0; +} + +/** + * ovpn_nl_key_new_doit - configure a new key for the specified peer + * @skb: incoming netlink message + * @info: genetlink metadata + * + * This function allows the user to install a new key in the peer crypto + * state. + * Each peer has two 'slots', namely 'primary' and 'secondary', where + * keys can be installed. The key in the 'primary' slot is used for + * encryption, while both keys can be used for decryption by matching the + * key ID carried in the incoming packet. + * + * The user is responsible for rotating keys when necessary. The user + * may fetch peer traffic statistics via netlink in order to better + * identify the right time to rotate keys. + * The renegotiation follows these steps: + * 1. a new key is computed by the user and is installed in the 'secondary' + * slot + * 2. at user discretion (usually after a predetermined time) 'primary' and + * 'secondary' contents are swapped and the new key starts being used for + * encryption, while the old key is kept around for decryption of late + * packets. + * + * Return: 0 on success or a negative error code otherwise.
+ */ int ovpn_nl_key_new_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_peer_key_reset pkr; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_KEY_ID) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_CIPHER_ALG) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_ENCRYPT_DIR) || + NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_DECRYPT_DIR)) + return -EINVAL; + + pkr.slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + pkr.key.key_id = nla_get_u32(attrs[OVPN_A_KEYCONF_KEY_ID]); + pkr.key.cipher_alg = nla_get_u32(attrs[OVPN_A_KEYCONF_CIPHER_ALG]); + + ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_ENCRYPT_DIR], + pkr.key.cipher_alg, &pkr.key.encrypt); + if (ret < 0) + return ret; + + ret = ovpn_nl_get_key_dir(info, attrs[OVPN_A_KEYCONF_DECRYPT_DIR], + pkr.key.cipher_alg, &pkr.key.decrypt); + if (ret < 0) + return ret; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to set key for", + peer_id); + return -ENOENT; + } + + ret = ovpn_crypto_state_reset(&peer->crypto, &pkr); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot install new key for peer %u", + peer_id); + goto out; + } + + netdev_dbg(ovpn->dev, "new key installed (id=%u) for peer %u\n", + pkr.key.key_id, peer_id); +out: + ovpn_peer_put(peer); + return ret; +} + +static int ovpn_nl_send_key(struct sk_buff *skb, const struct genl_info *info, + u32 peer_id, enum ovpn_key_slot slot, + const struct ovpn_key_config *keyconf) +{ + struct nlattr *attr; + void *hdr; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ovpn_nl_family, + 0, OVPN_CMD_KEY_GET); + if (!hdr) + return -ENOBUFS; + + attr = nla_nest_start(skb, OVPN_A_KEYCONF); + if (!attr) + goto err; + + if (nla_put_u32(skb, OVPN_A_KEYCONF_PEER_ID, peer_id)) + goto err; + + if (nla_put_u32(skb, OVPN_A_KEYCONF_SLOT, slot) || + nla_put_u32(skb, OVPN_A_KEYCONF_KEY_ID, keyconf->key_id) || + nla_put_u32(skb, OVPN_A_KEYCONF_CIPHER_ALG, keyconf->cipher_alg)) + goto err; + + nla_nest_end(skb, attr); + genlmsg_end(skb, hdr); + + return 0; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; } int ovpn_nl_key_get_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct ovpn_key_config keyconf = { 0 }; + enum ovpn_key_slot slot; + struct ovpn_peer *peer; + struct sk_buff *msg; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if 
(NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot find peer with id %u", peer_id); + return -ENOENT; + } + + slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + + ret = ovpn_crypto_config_get(&peer->crypto, slot, &keyconf); + if (ret < 0) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "cannot extract key from slot %u for peer %u", + slot, peer_id); + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + ret = ovpn_nl_send_key(msg, info, peer->id, slot, &keyconf); + if (ret < 0) { + nlmsg_free(msg); + goto err; + } + + ret = genlmsg_reply(msg, info); +err: + ovpn_peer_put(peer); + return ret; } int ovpn_nl_key_swap_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct ovpn_priv *ovpn = info->user_ptr[0]; + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to swap keys for", + peer_id); + return -ENOENT; + } + + ovpn_crypto_key_slots_swap(&peer->crypto); + ovpn_peer_put(peer); + + return 0; } int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) { - return -EOPNOTSUPP; + struct nlattr *attrs[OVPN_A_KEYCONF_MAX + 1]; + struct ovpn_priv *ovpn = info->user_ptr[0]; + enum ovpn_key_slot slot; + struct ovpn_peer *peer; + u32 peer_id; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, OVPN_A_KEYCONF)) + return -EINVAL; + + ret = nla_parse_nested(attrs, OVPN_A_KEYCONF_MAX, + info->attrs[OVPN_A_KEYCONF], + ovpn_keyconf_nl_policy, info->extack); + if (ret) + return ret; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_PEER_ID)) + return -EINVAL; + + if (NL_REQ_ATTR_CHECK(info->extack, info->attrs[OVPN_A_KEYCONF], attrs, + OVPN_A_KEYCONF_SLOT)) + return -EINVAL; + + peer_id = nla_get_u32(attrs[OVPN_A_KEYCONF_PEER_ID]); + slot = nla_get_u32(attrs[OVPN_A_KEYCONF_SLOT]); + + peer = ovpn_peer_get_by_id(ovpn, peer_id); + if (!peer) { + NL_SET_ERR_MSG_FMT_MOD(info->extack, + "no peer with id %u to delete key for", + peer_id); + return -ENOENT; + } + + ovpn_crypto_key_slot_delete(&peer->crypto, slot); + ovpn_peer_put(peer); + + return 0; } /** From 89d3c0e4612afa1c6429ed68d298e35592fbe208 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:37 +0200 Subject: [PATCH 280/770] ovpn: kill key and notify userspace in case of IV exhaustion IV wrap-around is cryptographically dangerous for a number of ciphers, therefore kill the key and inform userspace (via netlink) should the IV space be exhausted.
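To put numbers on this: the data-channel nonce embeds a 32-bit per-key packet counter (the 4-byte packet ID half of the 96-bit nonce introduced in the previous patch), so the usable IV space is 2^32 packets per key. A back-of-envelope sketch, where the 1M packets/s rate is an assumption purely for illustration:

#include <stdio.h>

int main(void)
{
	double ids = 4294967296.0;	/* 2^32 packet IDs available per key */
	double rate = 1e6;		/* assumed traffic: 1M packets/s */

	/* 2^32 / 1e6 ~= 4295 s, i.e. roughly 71.6 minutes to wrap-around */
	printf("IV space exhausted after %.1f minutes at %.0f pkt/s\n",
	       ids / rate / 60.0, rate);
	return 0;
}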
Userspace has two ways of deciding when the key has to be renewed before exhausting the IV space: 1) time-based approach: after X seconds/minutes userspace generates a new key and sends it to the kernel. This is based on a guesstimate, and normally the default timer value works well. 2) packet-count-based approach: after X packets/bytes userspace generates a new key and sends it to the kernel. Userspace keeps track of the amount of traffic by periodically polling GET_PEER and fetching the VPN/LINK stats. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-20-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/crypto.c | 22 +++++++++++++ drivers/net/ovpn/crypto.h | 2 ++ drivers/net/ovpn/io.c | 14 +++++++++ drivers/net/ovpn/netlink.c | 64 ++++++++++++++++++++++++++++++++++++++ drivers/net/ovpn/netlink.h | 2 ++ 5 files changed, 104 insertions(+) diff --git a/drivers/net/ovpn/crypto.c b/drivers/net/ovpn/crypto.c index deeefffc09816..90580e32052fb 100644 --- a/drivers/net/ovpn/crypto.c +++ b/drivers/net/ovpn/crypto.c @@ -54,6 +54,28 @@ void ovpn_crypto_state_release(struct ovpn_crypto_state *cs) } } +/* removes the key matching the specified id from the crypto context */ +bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id) +{ + struct ovpn_crypto_key_slot *ks = NULL; + + spin_lock_bh(&cs->lock); + if (rcu_access_pointer(cs->slots[0])->key_id == key_id) { + ks = rcu_replace_pointer(cs->slots[0], NULL, + lockdep_is_held(&cs->lock)); + } else if (rcu_access_pointer(cs->slots[1])->key_id == key_id) { + ks = rcu_replace_pointer(cs->slots[1], NULL, + lockdep_is_held(&cs->lock)); + } + spin_unlock_bh(&cs->lock); + + if (ks) + ovpn_crypto_key_slot_put(ks); + + /* let the caller know if a key was actually killed */ + return ks; +} + /* Reset the ovpn_crypto_state object in a way that is atomic * to RCU readers.
*/ diff --git a/drivers/net/ovpn/crypto.h b/drivers/net/ovpn/crypto.h index 487d24a7d2663..0e284fec3a75a 100644 --- a/drivers/net/ovpn/crypto.h +++ b/drivers/net/ovpn/crypto.h @@ -140,4 +140,6 @@ int ovpn_crypto_config_get(struct ovpn_crypto_state *cs, enum ovpn_key_slot slot, struct ovpn_key_config *keyconf); +bool ovpn_crypto_kill_key(struct ovpn_crypto_state *cs, u8 key_id); + #endif /* _NET_OVPN_OVPNCRYPTO_H_ */ diff --git a/drivers/net/ovpn/io.c b/drivers/net/ovpn/io.c index fd8820c7d1337..dd8a8055d9676 100644 --- a/drivers/net/ovpn/io.c +++ b/drivers/net/ovpn/io.c @@ -245,6 +245,20 @@ void ovpn_encrypt_post(void *data, int ret) kfree(ovpn_skb_cb(skb)->sg); aead_request_free(ovpn_skb_cb(skb)->req); + if (unlikely(ret == -ERANGE)) { + /* we ran out of IVs and we must kill the key as it can't be + * used anymore + */ + netdev_warn(peer->ovpn->dev, + "killing key %u for peer %u\n", ks->key_id, + peer->id); + if (ovpn_crypto_kill_key(&peer->crypto, ks->key_id)) + /* let userspace know that a new key must be negotiated */ + ovpn_nl_key_swap_notify(peer, ks->key_id); + + goto err; + } + if (unlikely(ret < 0)) goto err; diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index 1f4220021df3a..f0b5716059364 100644 --- a/drivers/net/ovpn/netlink.c +++ b/drivers/net/ovpn/netlink.c @@ -1103,6 +1103,70 @@ int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) return 0; } +/** + * ovpn_nl_key_swap_notify - notify userspace that a peer's key must be renewed + * @peer: the peer whose key needs to be renewed + * @key_id: the ID of the key that needs to be renewed + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id) +{ + struct ovpn_socket *sock; + struct nlattr *k_attr; + struct sk_buff *msg; + int ret = -EMSGSIZE; + void *hdr; + + netdev_info(peer->ovpn->dev, "peer with id %u must rekey - primary key unusable.\n", + peer->id); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_KEY_SWAP_NTF); + if (!hdr) { + ret = -ENOBUFS; + goto err_free_msg; + } + + if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) + goto err_cancel_msg; + + k_attr = nla_nest_start(msg, OVPN_A_KEYCONF); + if (!k_attr) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_KEYCONF_PEER_ID, peer->id)) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_KEYCONF_KEY_ID, key_id)) + goto err_cancel_msg; + + nla_nest_end(msg, k_attr); + genlmsg_end(msg, hdr); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sock->sk), + msg, 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); + rcu_read_unlock(); + + return 0; +err_unlock: + rcu_read_unlock(); +err_cancel_msg: + genlmsg_cancel(msg, hdr); +err_free_msg: + nlmsg_free(msg); + return ret; +} + /** * ovpn_nl_register - perform any needed registration in the NL subsystem * diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h index 0d6c34e17082c..5dc84c8e5e803 100644 --- a/drivers/net/ovpn/netlink.h +++ b/drivers/net/ovpn/netlink.h @@ -12,4 +12,6 @@ int ovpn_nl_register(void); void ovpn_nl_unregister(void); +int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id); + #endif /* _NET_OVPN_NETLINK_H_ */ From a215d253c17ac8e78cea950dc5376eed29ba2d67 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:38 +0200 Subject: [PATCH 281/770]
ovpn: notify userspace when a peer is deleted Whenever a peer is deleted, send a notification to userspace so that it can react accordingly. This is most important when a peer is deleted due to ping timeout, because it all happens in kernelspace and thus userspace has no direct way to learn about it. Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-21-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/netlink.c | 65 ++++++++++++++++++++++++++++++++++++++ drivers/net/ovpn/netlink.h | 1 + drivers/net/ovpn/peer.c | 1 + 3 files changed, 67 insertions(+) diff --git a/drivers/net/ovpn/netlink.c b/drivers/net/ovpn/netlink.c index f0b5716059364..bea03913bfb1e 100644 --- a/drivers/net/ovpn/netlink.c +++ b/drivers/net/ovpn/netlink.c @@ -1103,6 +1103,71 @@ int ovpn_nl_key_del_doit(struct sk_buff *skb, struct genl_info *info) return 0; } +/** + * ovpn_nl_peer_del_notify - notify userspace about peer being deleted + * @peer: the peer being deleted + * + * Return: 0 on success or a negative error code otherwise + */ +int ovpn_nl_peer_del_notify(struct ovpn_peer *peer) +{ + struct ovpn_socket *sock; + struct sk_buff *msg; + struct nlattr *attr; + int ret = -EMSGSIZE; + void *hdr; + + netdev_info(peer->ovpn->dev, "deleting peer with id %u, reason %d\n", + peer->id, peer->delete_reason); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &ovpn_nl_family, 0, OVPN_CMD_PEER_DEL_NTF); + if (!hdr) { + ret = -ENOBUFS; + goto err_free_msg; + } + + if (nla_put_u32(msg, OVPN_A_IFINDEX, peer->ovpn->dev->ifindex)) + goto err_cancel_msg; + + attr = nla_nest_start(msg, OVPN_A_PEER); + if (!attr) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_PEER_DEL_REASON, peer->delete_reason)) + goto err_cancel_msg; + + if (nla_put_u32(msg, OVPN_A_PEER_ID, peer->id)) + goto err_cancel_msg; + + nla_nest_end(msg, attr); + + genlmsg_end(msg, hdr); + + rcu_read_lock(); + sock = rcu_dereference(peer->sock); + if (!sock) { + ret = -EINVAL; + goto err_unlock; + } + genlmsg_multicast_netns(&ovpn_nl_family, sock_net(sock->sock->sk), + msg, 0, OVPN_NLGRP_PEERS, GFP_ATOMIC); + rcu_read_unlock(); + + return 0; + +err_unlock: + rcu_read_unlock(); +err_cancel_msg: + genlmsg_cancel(msg, hdr); +err_free_msg: + nlmsg_free(msg); + return ret; +} + /** * ovpn_nl_key_swap_notify - notify userspace peer's key must be renewed * @peer: the peer whose key needs to be renewed diff --git a/drivers/net/ovpn/netlink.h b/drivers/net/ovpn/netlink.h index 5dc84c8e5e803..8615dfc3c4720 100644 --- a/drivers/net/ovpn/netlink.h +++ b/drivers/net/ovpn/netlink.h @@ -12,6 +12,7 @@ int ovpn_nl_register(void); void ovpn_nl_unregister(void); +int ovpn_nl_peer_del_notify(struct ovpn_peer *peer); int ovpn_nl_key_swap_notify(struct ovpn_peer *peer, u8 key_id); #endif /* _NET_OVPN_NETLINK_H_ */ diff --git a/drivers/net/ovpn/peer.c b/drivers/net/ovpn/peer.c index 0469afd6da238..a37f89fffb02e 100644 --- a/drivers/net/ovpn/peer.c +++ b/drivers/net/ovpn/peer.c @@ -706,6 +706,7 @@ static void ovpn_peer_remove(struct ovpn_peer *peer, } peer->delete_reason = reason; + ovpn_nl_peer_del_notify(peer); /* append to provided list for later socket release and ref drop */ llist_add(&peer->release_entry, release_list); From b756861e6e63b0ddadb06bb9f6c87d06ab29bc6f Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:39 +0200 Subject: [PATCH 282/770] ovpn: add basic ethtool support 
Implement support for basic ethtool functionality. Note that ovpn is a virtual device driver, therefore various ethtool APIs are just not meaningful and thus not implemented. Signed-off-by: Antonio Quartulli Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250415-b4-ovpn-v26-22-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- drivers/net/ovpn/main.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/net/ovpn/main.c b/drivers/net/ovpn/main.c index 232eeb08e9292..0acb0934c1bea 100644 --- a/drivers/net/ovpn/main.c +++ b/drivers/net/ovpn/main.c @@ -7,6 +7,7 @@ * James Yonan */ +#include #include #include #include @@ -120,6 +121,19 @@ bool ovpn_dev_is_valid(const struct net_device *dev) return dev->netdev_ops == &ovpn_netdev_ops; } +static void ovpn_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strscpy(info->driver, "ovpn", sizeof(info->driver)); + strscpy(info->bus_info, "ovpn", sizeof(info->bus_info)); +} + +static const struct ethtool_ops ovpn_ethtool_ops = { + .get_drvinfo = ovpn_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, +}; + static void ovpn_setup(struct net_device *dev) { netdev_features_t feat = NETIF_F_SG | NETIF_F_GSO | @@ -129,6 +143,7 @@ static void ovpn_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; + dev->ethtool_ops = &ovpn_ethtool_ops; dev->netdev_ops = &ovpn_netdev_ops; dev->priv_destructor = ovpn_priv_free; From 959bc330a4396c4c52e790e62e23141967b39ef9 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 15 Apr 2025 13:17:40 +0200 Subject: [PATCH 283/770] testing/selftests: add test tool and scripts for ovpn module The ovpn-cli tool can be compiled and used as a selftest for the ovpn kernel module. [NOTE: it depends on libmbedtls for decoding base64-encoded keys] ovpn-cli implements the netlink and RTNL APIs and can thus be integrated into any script for more automated testing. Along with the tool, a bunch of scripts are provided that perform basic functionality tests by means of network namespaces. These scripts take part in the kselftest automation. The output of the scripts, which will appear in the kselftest reports, is a list of steps performed by the scripts plus some output coming from the execution of `ping`, `iperf` and `ovpn-cli` itself. In general it is useful only in case of failure, in order to understand which step has failed and why. Please note: since peer sockets are tied to the userspace process that created them (i.e. exiting the process will result in closing the socket), every run of ovpn-cli that created one will go to the background and enter pause(), waiting for the signal which will allow it to terminate. Termination is accomplished at the end of each script by issuing a killall command.
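The process-lifetime rule described above reduces to the following shape (a hedged sketch of the pattern only, not the actual ovpn-cli code, which follows in full below):

#include <signal.h>
#include <unistd.h>

static void on_signal(int sig)
{
	(void)sig;
	_exit(0);	/* process exit closes the socket, detaching the peer */
}

int main(void)
{
	/* ... create the peer and hand its socket over to the kernel ... */

	/* the kernel-side peer lives only as long as this process, so
	 * park here until the test script's killall delivers a signal
	 */
	signal(SIGTERM, on_signal);
	pause();
	return 0;
}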
Cc: linux-kselftest@vger.kernel.org Cc: Shuah Khan Signed-off-by: Antonio Quartulli Link: https://patch.msgid.link/20250415-b4-ovpn-v26-23-577f6097b964@openvpn.net Reviewed-by: Sabrina Dubroca Tested-by: Oleksandr Natalenko Signed-off-by: Paolo Abeni --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/net/ovpn/.gitignore | 2 + tools/testing/selftests/net/ovpn/Makefile | 31 + tools/testing/selftests/net/ovpn/common.sh | 92 + tools/testing/selftests/net/ovpn/config | 10 + tools/testing/selftests/net/ovpn/data64.key | 5 + tools/testing/selftests/net/ovpn/ovpn-cli.c | 2376 +++++++++++++++++ .../testing/selftests/net/ovpn/tcp_peers.txt | 5 + .../selftests/net/ovpn/test-chachapoly.sh | 9 + .../net/ovpn/test-close-socket-tcp.sh | 9 + .../selftests/net/ovpn/test-close-socket.sh | 45 + .../testing/selftests/net/ovpn/test-float.sh | 9 + tools/testing/selftests/net/ovpn/test-tcp.sh | 9 + tools/testing/selftests/net/ovpn/test.sh | 113 + .../testing/selftests/net/ovpn/udp_peers.txt | 5 + 16 files changed, 2722 insertions(+) create mode 100644 tools/testing/selftests/net/ovpn/.gitignore create mode 100644 tools/testing/selftests/net/ovpn/Makefile create mode 100644 tools/testing/selftests/net/ovpn/common.sh create mode 100644 tools/testing/selftests/net/ovpn/config create mode 100644 tools/testing/selftests/net/ovpn/data64.key create mode 100644 tools/testing/selftests/net/ovpn/ovpn-cli.c create mode 100644 tools/testing/selftests/net/ovpn/tcp_peers.txt create mode 100755 tools/testing/selftests/net/ovpn/test-chachapoly.sh create mode 100755 tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh create mode 100755 tools/testing/selftests/net/ovpn/test-close-socket.sh create mode 100755 tools/testing/selftests/net/ovpn/test-float.sh create mode 100755 tools/testing/selftests/net/ovpn/test-tcp.sh create mode 100755 tools/testing/selftests/net/ovpn/test.sh create mode 100644 tools/testing/selftests/net/ovpn/udp_peers.txt diff --git a/MAINTAINERS b/MAINTAINERS index c50e87ef72884..350009769173c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18134,6 +18134,7 @@ T: git https://github.com/OpenVPN/linux-kernel-ovpn.git F: Documentation/netlink/specs/ovpn.yaml F: drivers/net/ovpn/ F: include/uapi/linux/ovpn.h +F: tools/testing/selftests/net/ovpn/ OPENVSWITCH M: Aaron Conole diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index c77c8c8e3d9bd..61bb8bf1b5074 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -71,6 +71,7 @@ TARGETS += net/hsr TARGETS += net/mptcp TARGETS += net/netfilter TARGETS += net/openvswitch +TARGETS += net/ovpn TARGETS += net/packetdrill TARGETS += net/rds TARGETS += net/tcp_ao diff --git a/tools/testing/selftests/net/ovpn/.gitignore b/tools/testing/selftests/net/ovpn/.gitignore new file mode 100644 index 0000000000000..ee44c081ca7c0 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0+ +ovpn-cli diff --git a/tools/testing/selftests/net/ovpn/Makefile b/tools/testing/selftests/net/ovpn/Makefile new file mode 100644 index 0000000000000..2d102878cb6dd --- /dev/null +++ b/tools/testing/selftests/net/ovpn/Makefile @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. 
+# +CFLAGS = -pedantic -Wextra -Wall -Wl,--no-as-needed -g -O0 -ggdb $(KHDR_INCLUDES) +VAR_CFLAGS = $(shell pkg-config --cflags libnl-3.0 libnl-genl-3.0 2>/dev/null) +ifeq ($(VAR_CFLAGS),) +VAR_CFLAGS = -I/usr/include/libnl3 +endif +CFLAGS += $(VAR_CFLAGS) + + +LDLIBS = -lmbedtls -lmbedcrypto +VAR_LDLIBS = $(shell pkg-config --libs libnl-3.0 libnl-genl-3.0 2>/dev/null) +ifeq ($(VAR_LDLIBS),) +VAR_LDLIBS = -lnl-genl-3 -lnl-3 +endif +LDLIBS += $(VAR_LDLIBS) + + +TEST_FILES = common.sh + +TEST_PROGS = test.sh \ + test-chachapoly.sh \ + test-tcp.sh \ + test-float.sh \ + test-close-socket.sh \ + test-close-socket-tcp.sh + +TEST_GEN_FILES := ovpn-cli + +include ../../lib.mk diff --git a/tools/testing/selftests/net/ovpn/common.sh b/tools/testing/selftests/net/ovpn/common.sh new file mode 100644 index 0000000000000..7502292a1ee03 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/common.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +UDP_PEERS_FILE=${UDP_PEERS_FILE:-udp_peers.txt} +TCP_PEERS_FILE=${TCP_PEERS_FILE:-tcp_peers.txt} +OVPN_CLI=${OVPN_CLI:-./ovpn-cli} +ALG=${ALG:-aes} +PROTO=${PROTO:-UDP} +FLOAT=${FLOAT:-0} + +create_ns() { + ip netns add peer${1} +} + +setup_ns() { + MODE="P2P" + + if [ ${1} -eq 0 ]; then + MODE="MP" + for p in $(seq 1 ${NUM_PEERS}); do + ip link add veth${p} netns peer0 type veth peer name veth${p} netns peer${p} + + ip -n peer0 addr add 10.10.${p}.1/24 dev veth${p} + ip -n peer0 link set veth${p} up + + ip -n peer${p} addr add 10.10.${p}.2/24 dev veth${p} + ip -n peer${p} link set veth${p} up + done + fi + + ip netns exec peer${1} ${OVPN_CLI} new_iface tun${1} $MODE + ip -n peer${1} addr add ${2} dev tun${1} + ip -n peer${1} link set tun${1} up +} + +add_peer() { + if [ "${PROTO}" == "UDP" ]; then + if [ ${1} -eq 0 ]; then + ip netns exec peer0 ${OVPN_CLI} new_multi_peer tun0 1 ${UDP_PEERS_FILE} + + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 ${ALG} 0 \ + data64.key + done + else + ip netns exec peer${1} ${OVPN_CLI} new_peer tun${1} ${1} 1 10.10.${1}.1 1 + ip netns exec peer${1} ${OVPN_CLI} new_key tun${1} ${1} 1 0 ${ALG} 1 \ + data64.key + fi + else + if [ ${1} -eq 0 ]; then + (ip netns exec peer0 ${OVPN_CLI} listen tun0 1 ${TCP_PEERS_FILE} && { + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 1 0 \ + ${ALG} 0 data64.key + done + }) & + sleep 5 + else + ip netns exec peer${1} ${OVPN_CLI} connect tun${1} ${1} 10.10.${1}.1 1 \ + data64.key + fi + fi +} + +cleanup() { + # some ovpn-cli processes sleep in background so they need manual poking + killall $(basename ${OVPN_CLI}) 2>/dev/null || true + + # netns peer0 is deleted without erasing ifaces first + for p in $(seq 1 10); do + ip -n peer${p} link set tun${p} down 2>/dev/null || true + ip netns exec peer${p} ${OVPN_CLI} del_iface tun${p} 2>/dev/null || true + done + for p in $(seq 1 10); do + ip -n peer0 link del veth${p} 2>/dev/null || true + done + for p in $(seq 0 10); do + ip netns del peer${p} 2>/dev/null || true + done +} + +if [ "${PROTO}" == "UDP" ]; then + NUM_PEERS=${NUM_PEERS:-$(wc -l ${UDP_PEERS_FILE} | awk '{print $1}')} +else + NUM_PEERS=${NUM_PEERS:-$(wc -l ${TCP_PEERS_FILE} | awk '{print $1}')} +fi + + diff --git a/tools/testing/selftests/net/ovpn/config b/tools/testing/selftests/net/ovpn/config new file mode 100644 index 0000000000000..71946ba9fa175 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/config @@ -0,0 
+1,10 @@ +CONFIG_NET=y +CONFIG_INET=y +CONFIG_STREAM_PARSER=y +CONFIG_NET_UDP_TUNNEL=y +CONFIG_DST_CACHE=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=y +CONFIG_OVPN=m diff --git a/tools/testing/selftests/net/ovpn/data64.key b/tools/testing/selftests/net/ovpn/data64.key new file mode 100644 index 0000000000000..a99e88c4e290f --- /dev/null +++ b/tools/testing/selftests/net/ovpn/data64.key @@ -0,0 +1,5 @@ +jRqMACN7d7/aFQNT8S7jkrBD8uwrgHbG5OQZP2eu4R1Y7tfpS2bf5RHv06Vi163CGoaIiTX99R3B +ia9ycAH8Wz1+9PWv51dnBLur9jbShlgZ2QHLtUc4a/gfT7zZwULXuuxdLnvR21DDeMBaTbkgbai9 +uvAa7ne1liIgGFzbv+Bas4HDVrygxIxuAnP5Qgc3648IJkZ0QEXPF+O9f0n5+QIvGCxkAUVx+5K6 +KIs+SoeWXnAopELmoGSjUpFtJbagXK82HfdqpuUxT2Tnuef0/14SzVE/vNleBNu2ZbyrSAaah8tE +BofkPJUBFY+YQcfZNM5Dgrw3i+Bpmpq/gpdg5w== diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c new file mode 100644 index 0000000000000..69e41fc07fbca --- /dev/null +++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c @@ -0,0 +1,2376 @@ +// SPDX-License-Identifier: GPL-2.0 +/* OpenVPN data channel accelerator + * + * Copyright (C) 2020-2025 OpenVPN, Inc. + * + * Author: Antonio Quartulli + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +/* defines to make checkpatch happy */ +#define strscpy strncpy +#define __always_unused __attribute__((__unused__)) + +/* libnl < 3.5.0 does not set the NLA_F_NESTED on its own, therefore we + * have to explicitly do it to prevent the kernel from failing upon + * parsing of the message + */ +#define nla_nest_start(_msg, _type) \ + nla_nest_start(_msg, (_type) | NLA_F_NESTED) + +/* libnl < 3.11.0 does not implement nla_get_uint() */ +uint64_t ovpn_nla_get_uint(struct nlattr *attr) +{ + if (nla_len(attr) == sizeof(uint32_t)) + return nla_get_u32(attr); + else + return nla_get_u64(attr); +} + +typedef int (*ovpn_nl_cb)(struct nl_msg *msg, void *arg); + +enum ovpn_key_direction { + KEY_DIR_IN = 0, + KEY_DIR_OUT, +}; + +#define KEY_LEN (256 / 8) +#define NONCE_LEN 8 + +#define PEER_ID_UNDEF 0x00FFFFFF +#define MAX_PEERS 10 + +struct nl_ctx { + struct nl_sock *nl_sock; + struct nl_msg *nl_msg; + struct nl_cb *nl_cb; + + int ovpn_dco_id; +}; + +enum ovpn_cmd { + CMD_INVALID, + CMD_NEW_IFACE, + CMD_DEL_IFACE, + CMD_LISTEN, + CMD_CONNECT, + CMD_NEW_PEER, + CMD_NEW_MULTI_PEER, + CMD_SET_PEER, + CMD_DEL_PEER, + CMD_GET_PEER, + CMD_NEW_KEY, + CMD_DEL_KEY, + CMD_GET_KEY, + CMD_SWAP_KEYS, + CMD_LISTEN_MCAST, +}; + +struct ovpn_ctx { + enum ovpn_cmd cmd; + + __u8 key_enc[KEY_LEN]; + __u8 key_dec[KEY_LEN]; + __u8 nonce[NONCE_LEN]; + + enum ovpn_cipher_alg cipher; + + sa_family_t sa_family; + + unsigned long peer_id; + unsigned long lport; + + union { + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } remote; + + union { + struct sockaddr_in in4; + struct sockaddr_in6 in6; + } peer_ip; + + bool peer_ip_set; + + unsigned int ifindex; + char ifname[IFNAMSIZ]; + enum ovpn_mode mode; + bool mode_set; + + int socket; + int cli_sockets[MAX_PEERS]; + + __u32 keepalive_interval; + __u32 keepalive_timeout; + + enum ovpn_key_direction key_dir; + enum ovpn_key_slot key_slot; + int key_id; + + const char *peers_file; +}; + +static int ovpn_nl_recvmsgs(struct nl_ctx *ctx) +{ + int ret; + + ret = nl_recvmsgs(ctx->nl_sock, ctx->nl_cb); + + switch (ret) { + case -NLE_INTR: + fprintf(stderr, + "netlink received 
interrupt due to signal - ignoring\n"); + break; + case -NLE_NOMEM: + fprintf(stderr, "netlink out of memory error\n"); + break; + case -NLE_AGAIN: + fprintf(stderr, + "netlink reports blocking read - aborting wait\n"); + break; + default: + if (ret) + fprintf(stderr, "netlink reports error (%d): %s\n", + ret, nl_geterror(-ret)); + break; + } + + return ret; +} + +static struct nl_ctx *nl_ctx_alloc_flags(struct ovpn_ctx *ovpn, int cmd, + int flags) +{ + struct nl_ctx *ctx; + int err, ret; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) + return NULL; + + ctx->nl_sock = nl_socket_alloc(); + if (!ctx->nl_sock) { + fprintf(stderr, "cannot allocate netlink socket\n"); + goto err_free; + } + + nl_socket_set_buffer_size(ctx->nl_sock, 8192, 8192); + + ret = genl_connect(ctx->nl_sock); + if (ret) { + fprintf(stderr, "cannot connect to generic netlink: %s\n", + nl_geterror(ret)); + goto err_sock; + } + + /* enable Extended ACK for detailed error reporting */ + err = 1; + setsockopt(nl_socket_get_fd(ctx->nl_sock), SOL_NETLINK, NETLINK_EXT_ACK, + &err, sizeof(err)); + + ctx->ovpn_dco_id = genl_ctrl_resolve(ctx->nl_sock, OVPN_FAMILY_NAME); + if (ctx->ovpn_dco_id < 0) { + fprintf(stderr, "cannot find ovpn_dco netlink component: %d\n", + ctx->ovpn_dco_id); + goto err_free; + } + + ctx->nl_msg = nlmsg_alloc(); + if (!ctx->nl_msg) { + fprintf(stderr, "cannot allocate netlink message\n"); + goto err_sock; + } + + ctx->nl_cb = nl_cb_alloc(NL_CB_DEFAULT); + if (!ctx->nl_cb) { + fprintf(stderr, "failed to allocate netlink callback\n"); + goto err_msg; + } + + nl_socket_set_cb(ctx->nl_sock, ctx->nl_cb); + + genlmsg_put(ctx->nl_msg, 0, 0, ctx->ovpn_dco_id, 0, flags, cmd, 0); + + if (ovpn->ifindex > 0) + NLA_PUT_U32(ctx->nl_msg, OVPN_A_IFINDEX, ovpn->ifindex); + + return ctx; +nla_put_failure: +err_msg: + nlmsg_free(ctx->nl_msg); +err_sock: + nl_socket_free(ctx->nl_sock); +err_free: + free(ctx); + return NULL; +} + +static struct nl_ctx *nl_ctx_alloc(struct ovpn_ctx *ovpn, int cmd) +{ + return nl_ctx_alloc_flags(ovpn, cmd, 0); +} + +static void nl_ctx_free(struct nl_ctx *ctx) +{ + if (!ctx) + return; + + nl_socket_free(ctx->nl_sock); + nlmsg_free(ctx->nl_msg); + nl_cb_put(ctx->nl_cb); + free(ctx); +} + +static int ovpn_nl_cb_error(struct sockaddr_nl (*nla)__always_unused, + struct nlmsgerr *err, void *arg) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)err - 1; + struct nlattr *tb_msg[NLMSGERR_ATTR_MAX + 1]; + int len = nlh->nlmsg_len; + struct nlattr *attrs; + int *ret = arg; + int ack_len = sizeof(*nlh) + sizeof(int) + sizeof(*nlh); + + *ret = err->error; + + if (!(nlh->nlmsg_flags & NLM_F_ACK_TLVS)) + return NL_STOP; + + if (!(nlh->nlmsg_flags & NLM_F_CAPPED)) + ack_len += err->msg.nlmsg_len - sizeof(*nlh); + + if (len <= ack_len) + return NL_STOP; + + attrs = (void *)((uint8_t *)nlh + ack_len); + len -= ack_len; + + nla_parse(tb_msg, NLMSGERR_ATTR_MAX, attrs, len, NULL); + if (tb_msg[NLMSGERR_ATTR_MSG]) { + len = strnlen((char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG]), + nla_len(tb_msg[NLMSGERR_ATTR_MSG])); + fprintf(stderr, "kernel error: %*s\n", len, + (char *)nla_data(tb_msg[NLMSGERR_ATTR_MSG])); + } + + if (tb_msg[NLMSGERR_ATTR_MISS_NEST]) { + fprintf(stderr, "missing required nesting type %u\n", + nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_NEST])); + } + + if (tb_msg[NLMSGERR_ATTR_MISS_TYPE]) { + fprintf(stderr, "missing required attribute type %u\n", + nla_get_u32(tb_msg[NLMSGERR_ATTR_MISS_TYPE])); + } + + return NL_STOP; +} + +static int ovpn_nl_cb_finish(struct nl_msg (*msg)__always_unused, + void *arg) +{ + 
int *status = arg; + + *status = 0; + return NL_SKIP; +} + +static int ovpn_nl_cb_ack(struct nl_msg (*msg)__always_unused, + void *arg) +{ + int *status = arg; + + *status = 0; + return NL_STOP; +} + +static int ovpn_nl_msg_send(struct nl_ctx *ctx, ovpn_nl_cb cb) +{ + int status = 1; + + nl_cb_err(ctx->nl_cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &status); + nl_cb_set(ctx->nl_cb, NL_CB_FINISH, NL_CB_CUSTOM, ovpn_nl_cb_finish, + &status); + nl_cb_set(ctx->nl_cb, NL_CB_ACK, NL_CB_CUSTOM, ovpn_nl_cb_ack, &status); + + if (cb) + nl_cb_set(ctx->nl_cb, NL_CB_VALID, NL_CB_CUSTOM, cb, ctx); + + nl_send_auto_complete(ctx->nl_sock, ctx->nl_msg); + + while (status == 1) + ovpn_nl_recvmsgs(ctx); + + if (status < 0) + fprintf(stderr, "failed to send netlink message: %s (%d)\n", + strerror(-status), status); + + return status; +} + +static int ovpn_parse_key(const char *file, struct ovpn_ctx *ctx) +{ + int idx_enc, idx_dec, ret = -1; + unsigned char *ckey = NULL; + __u8 *bkey = NULL; + size_t olen = 0; + long ckey_len; + FILE *fp; + + fp = fopen(file, "r"); + if (!fp) { + fprintf(stderr, "cannot open: %s\n", file); + return -1; + } + + /* get file size */ + fseek(fp, 0L, SEEK_END); + ckey_len = ftell(fp); + rewind(fp); + + /* if the file is longer, let's just read a portion */ + if (ckey_len > 256) + ckey_len = 256; + + ckey = malloc(ckey_len); + if (!ckey) + goto err; + + ret = fread(ckey, 1, ckey_len, fp); + if (ret != ckey_len) { + fprintf(stderr, + "couldn't read enough data from key file: %dbytes read\n", + ret); + goto err; + } + + olen = 0; + ret = mbedtls_base64_decode(NULL, 0, &olen, ckey, ckey_len); + if (ret != MBEDTLS_ERR_BASE64_BUFFER_TOO_SMALL) { + char buf[256]; + + mbedtls_strerror(ret, buf, sizeof(buf)); + fprintf(stderr, "unexpected base64 error1: %s (%d)\n", buf, + ret); + + goto err; + } + + bkey = malloc(olen); + if (!bkey) { + fprintf(stderr, "cannot allocate binary key buffer\n"); + goto err; + } + + ret = mbedtls_base64_decode(bkey, olen, &olen, ckey, ckey_len); + if (ret) { + char buf[256]; + + mbedtls_strerror(ret, buf, sizeof(buf)); + fprintf(stderr, "unexpected base64 error2: %s (%d)\n", buf, + ret); + + goto err; + } + + if (olen < 2 * KEY_LEN + NONCE_LEN) { + fprintf(stderr, + "not enough data in key file, found %zdB but needs %dB\n", + olen, 2 * KEY_LEN + NONCE_LEN); + goto err; + } + + switch (ctx->key_dir) { + case KEY_DIR_IN: + idx_enc = 0; + idx_dec = 1; + break; + case KEY_DIR_OUT: + idx_enc = 1; + idx_dec = 0; + break; + default: + goto err; + } + + memcpy(ctx->key_enc, bkey + KEY_LEN * idx_enc, KEY_LEN); + memcpy(ctx->key_dec, bkey + KEY_LEN * idx_dec, KEY_LEN); + memcpy(ctx->nonce, bkey + 2 * KEY_LEN, NONCE_LEN); + + ret = 0; + +err: + fclose(fp); + free(bkey); + free(ckey); + + return ret; +} + +static int ovpn_parse_cipher(const char *cipher, struct ovpn_ctx *ctx) +{ + if (strcmp(cipher, "aes") == 0) + ctx->cipher = OVPN_CIPHER_ALG_AES_GCM; + else if (strcmp(cipher, "chachapoly") == 0) + ctx->cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305; + else if (strcmp(cipher, "none") == 0) + ctx->cipher = OVPN_CIPHER_ALG_NONE; + else + return -ENOTSUP; + + return 0; +} + +static int ovpn_parse_key_direction(const char *dir, struct ovpn_ctx *ctx) +{ + int in_dir; + + in_dir = strtoll(dir, NULL, 10); + switch (in_dir) { + case KEY_DIR_IN: + case KEY_DIR_OUT: + ctx->key_dir = in_dir; + break; + default: + fprintf(stderr, + "invalid key direction provided. 
Can be 0 or 1 only\n"); + return -1; + } + + return 0; +} + +static int ovpn_socket(struct ovpn_ctx *ctx, sa_family_t family, int proto) +{ + struct sockaddr_storage local_sock = { 0 }; + struct sockaddr_in6 *in6; + struct sockaddr_in *in; + int ret, s, sock_type; + size_t sock_len; + + if (proto == IPPROTO_UDP) + sock_type = SOCK_DGRAM; + else if (proto == IPPROTO_TCP) + sock_type = SOCK_STREAM; + else + return -EINVAL; + + s = socket(family, sock_type, 0); + if (s < 0) { + perror("cannot create socket"); + return -1; + } + + switch (family) { + case AF_INET: + in = (struct sockaddr_in *)&local_sock; + in->sin_family = family; + in->sin_port = htons(ctx->lport); + in->sin_addr.s_addr = htonl(INADDR_ANY); + sock_len = sizeof(*in); + break; + case AF_INET6: + in6 = (struct sockaddr_in6 *)&local_sock; + in6->sin6_family = family; + in6->sin6_port = htons(ctx->lport); + in6->sin6_addr = in6addr_any; + sock_len = sizeof(*in6); + break; + default: + return -1; + } + + int opt = 1; + + ret = setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + if (ret < 0) { + perror("setsockopt for SO_REUSEADDR"); + return ret; + } + + ret = setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + if (ret < 0) { + perror("setsockopt for SO_REUSEPORT"); + return ret; + } + + if (family == AF_INET6) { + opt = 0; + if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &opt, + sizeof(opt))) { + perror("failed to set IPV6_V6ONLY"); + return -1; + } + } + + ret = bind(s, (struct sockaddr *)&local_sock, sock_len); + if (ret < 0) { + perror("cannot bind socket"); + goto err_socket; + } + + ctx->socket = s; + ctx->sa_family = family; + return 0; + +err_socket: + close(s); + return -1; +} + +static int ovpn_udp_socket(struct ovpn_ctx *ctx, sa_family_t family) +{ + return ovpn_socket(ctx, family, IPPROTO_UDP); +} + +static int ovpn_listen(struct ovpn_ctx *ctx, sa_family_t family) +{ + int ret; + + ret = ovpn_socket(ctx, family, IPPROTO_TCP); + if (ret < 0) + return ret; + + ret = listen(ctx->socket, 10); + if (ret < 0) { + perror("listen"); + close(ctx->socket); + return -1; + } + + return 0; +} + +static int ovpn_accept(struct ovpn_ctx *ctx) +{ + socklen_t socklen; + int ret; + + socklen = sizeof(ctx->remote); + ret = accept(ctx->socket, (struct sockaddr *)&ctx->remote, &socklen); + if (ret < 0) { + perror("accept"); + goto err; + } + + fprintf(stderr, "Connection received!\n"); + + switch (socklen) { + case sizeof(struct sockaddr_in): + case sizeof(struct sockaddr_in6): + break; + default: + fprintf(stderr, "error: expecting IPv4 or IPv6 connection\n"); + close(ret); + ret = -EINVAL; + goto err; + } + + return ret; +err: + close(ctx->socket); + return ret; +} + +static int ovpn_connect(struct ovpn_ctx *ovpn) +{ + socklen_t socklen; + int s, ret; + + s = socket(ovpn->remote.in4.sin_family, SOCK_STREAM, 0); + if (s < 0) { + perror("cannot create socket"); + return -1; + } + + switch (ovpn->remote.in4.sin_family) { + case AF_INET: + socklen = sizeof(struct sockaddr_in); + break; + case AF_INET6: + socklen = sizeof(struct sockaddr_in6); + break; + default: + return -EOPNOTSUPP; + } + + ret = connect(s, (struct sockaddr *)&ovpn->remote, socklen); + if (ret < 0) { + perror("connect"); + goto err; + } + + fprintf(stderr, "connected\n"); + + ovpn->socket = s; + + return 0; +err: + close(s); + return ret; +} + +static int ovpn_new_peer(struct ovpn_ctx *ovpn, bool is_tcp) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_NEW); + if (!ctx) + return -ENOMEM; + + attr = 
nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_SOCKET, ovpn->socket); + + if (!is_tcp) { + switch (ovpn->remote.in4.sin_family) { + case AF_INET: + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV4, + ovpn->remote.in4.sin_addr.s_addr); + NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, + ovpn->remote.in4.sin_port); + break; + case AF_INET6: + NLA_PUT(ctx->nl_msg, OVPN_A_PEER_REMOTE_IPV6, + sizeof(ovpn->remote.in6.sin6_addr), + &ovpn->remote.in6.sin6_addr); + NLA_PUT_U32(ctx->nl_msg, + OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID, + ovpn->remote.in6.sin6_scope_id); + NLA_PUT_U16(ctx->nl_msg, OVPN_A_PEER_REMOTE_PORT, + ovpn->remote.in6.sin6_port); + break; + default: + fprintf(stderr, + "Invalid family for remote socket address\n"); + goto nla_put_failure; + } + } + + if (ovpn->peer_ip_set) { + switch (ovpn->peer_ip.in4.sin_family) { + case AF_INET: + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_VPN_IPV4, + ovpn->peer_ip.in4.sin_addr.s_addr); + break; + case AF_INET6: + NLA_PUT(ctx->nl_msg, OVPN_A_PEER_VPN_IPV6, + sizeof(struct in6_addr), + &ovpn->peer_ip.in6.sin6_addr); + break; + default: + fprintf(stderr, "Invalid family for peer address\n"); + goto nla_put_failure; + } + } + + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_set_peer(struct ovpn_ctx *ovpn) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_SET); + if (!ctx) + return -ENOMEM; + + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_INTERVAL, + ovpn->keepalive_interval); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_KEEPALIVE_TIMEOUT, + ovpn->keepalive_timeout); + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_del_peer(struct ovpn_ctx *ovpn) +{ + struct nlattr *attr; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_PEER_DEL); + if (!ctx) + return -ENOMEM; + + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, attr); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_handle_peer(struct nl_msg *msg, void (*arg)__always_unused) +{ + struct nlattr *pattrs[OVPN_A_PEER_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + __u16 rport = 0, lport = 0; + + nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!attrs[OVPN_A_PEER]) { + fprintf(stderr, "no packet content in netlink message\n"); + return NL_SKIP; + } + + nla_parse(pattrs, OVPN_A_PEER_MAX, nla_data(attrs[OVPN_A_PEER]), + nla_len(attrs[OVPN_A_PEER]), NULL); + + if (pattrs[OVPN_A_PEER_ID]) + fprintf(stderr, "* Peer %u\n", + nla_get_u32(pattrs[OVPN_A_PEER_ID])); + + if (pattrs[OVPN_A_PEER_SOCKET_NETNSID]) + fprintf(stderr, "\tsocket NetNS ID: %d\n", + nla_get_s32(pattrs[OVPN_A_PEER_SOCKET_NETNSID])); + + if (pattrs[OVPN_A_PEER_VPN_IPV4]) { + char buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, nla_data(pattrs[OVPN_A_PEER_VPN_IPV4]), + buf, sizeof(buf)); + fprintf(stderr, "\tVPN IPv4: %s\n", buf); + } + + if (pattrs[OVPN_A_PEER_VPN_IPV6]) { + char buf[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, 
nla_data(pattrs[OVPN_A_PEER_VPN_IPV6]), + buf, sizeof(buf)); + fprintf(stderr, "\tVPN IPv6: %s\n", buf); + } + + if (pattrs[OVPN_A_PEER_LOCAL_PORT]) + lport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_LOCAL_PORT])); + + if (pattrs[OVPN_A_PEER_REMOTE_PORT]) + rport = ntohs(nla_get_u16(pattrs[OVPN_A_PEER_REMOTE_PORT])); + + if (pattrs[OVPN_A_PEER_REMOTE_IPV6]) { + void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV6]; + char buf[INET6_ADDRSTRLEN]; + int scope_id = -1; + + if (pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]) { + void *p = pattrs[OVPN_A_PEER_REMOTE_IPV6_SCOPE_ID]; + + scope_id = nla_get_u32(p); + } + + inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tRemote: %s:%hu (scope-id: %u)\n", buf, rport, + scope_id); + + if (pattrs[OVPN_A_PEER_LOCAL_IPV6]) { + void *ip = pattrs[OVPN_A_PEER_LOCAL_IPV6]; + + inet_ntop(AF_INET6, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); + } + } + + if (pattrs[OVPN_A_PEER_REMOTE_IPV4]) { + void *ip = pattrs[OVPN_A_PEER_REMOTE_IPV4]; + char buf[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, nla_data(ip), buf, sizeof(buf)); + fprintf(stderr, "\tRemote: %s:%hu\n", buf, rport); + + if (pattrs[OVPN_A_PEER_LOCAL_IPV4]) { + void *p = pattrs[OVPN_A_PEER_LOCAL_IPV4]; + + inet_ntop(AF_INET, nla_data(p), buf, sizeof(buf)); + fprintf(stderr, "\tLocal: %s:%hu\n", buf, lport); + } + } + + if (pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]) { + void *p = pattrs[OVPN_A_PEER_KEEPALIVE_INTERVAL]; + + fprintf(stderr, "\tKeepalive interval: %u sec\n", + nla_get_u32(p)); + } + + if (pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT]) + fprintf(stderr, "\tKeepalive timeout: %u sec\n", + nla_get_u32(pattrs[OVPN_A_PEER_KEEPALIVE_TIMEOUT])); + + if (pattrs[OVPN_A_PEER_VPN_RX_BYTES]) + fprintf(stderr, "\tVPN RX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_BYTES])); + + if (pattrs[OVPN_A_PEER_VPN_TX_BYTES]) + fprintf(stderr, "\tVPN TX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_BYTES])); + + if (pattrs[OVPN_A_PEER_VPN_RX_PACKETS]) + fprintf(stderr, "\tVPN RX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_RX_PACKETS])); + + if (pattrs[OVPN_A_PEER_VPN_TX_PACKETS]) + fprintf(stderr, "\tVPN TX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_VPN_TX_PACKETS])); + + if (pattrs[OVPN_A_PEER_LINK_RX_BYTES]) + fprintf(stderr, "\tLINK RX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_BYTES])); + + if (pattrs[OVPN_A_PEER_LINK_TX_BYTES]) + fprintf(stderr, "\tLINK TX bytes: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_BYTES])); + + if (pattrs[OVPN_A_PEER_LINK_RX_PACKETS]) + fprintf(stderr, "\tLINK RX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_RX_PACKETS])); + + if (pattrs[OVPN_A_PEER_LINK_TX_PACKETS]) + fprintf(stderr, "\tLINK TX packets: %" PRIu64 "\n", + ovpn_nla_get_uint(pattrs[OVPN_A_PEER_LINK_TX_PACKETS])); + + return NL_SKIP; +} + +static int ovpn_get_peer(struct ovpn_ctx *ovpn) +{ + int flags = 0, ret = -1; + struct nlattr *attr; + struct nl_ctx *ctx; + + if (ovpn->peer_id == PEER_ID_UNDEF) + flags = NLM_F_DUMP; + + ctx = nl_ctx_alloc_flags(ovpn, OVPN_CMD_PEER_GET, flags); + if (!ctx) + return -ENOMEM; + + if (ovpn->peer_id != PEER_ID_UNDEF) { + attr = nla_nest_start(ctx->nl_msg, OVPN_A_PEER); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, attr); + } + + ret = ovpn_nl_msg_send(ctx, ovpn_handle_peer); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int 
ovpn_new_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf, *key_dir; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_NEW); + if (!ctx) + return -ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_KEY_ID, ovpn->key_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_CIPHER_ALG, ovpn->cipher); + + key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_ENCRYPT_DIR); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_enc); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); + nla_nest_end(ctx->nl_msg, key_dir); + + key_dir = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF_DECRYPT_DIR); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_CIPHER_KEY, KEY_LEN, ovpn->key_dec); + NLA_PUT(ctx->nl_msg, OVPN_A_KEYDIR_NONCE_TAIL, NONCE_LEN, ovpn->nonce); + nla_nest_end(ctx->nl_msg, key_dir); + + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_del_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_DEL); + if (!ctx) + return -ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_handle_key(struct nl_msg *msg, void (*arg)__always_unused) +{ + struct nlattr *kattrs[OVPN_A_KEYCONF_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + + nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!attrs[OVPN_A_KEYCONF]) { + fprintf(stderr, "no packet content in netlink message\n"); + return NL_SKIP; + } + + nla_parse(kattrs, OVPN_A_KEYCONF_MAX, nla_data(attrs[OVPN_A_KEYCONF]), + nla_len(attrs[OVPN_A_KEYCONF]), NULL); + + if (kattrs[OVPN_A_KEYCONF_PEER_ID]) + fprintf(stderr, "* Peer %u\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_PEER_ID])); + if (kattrs[OVPN_A_KEYCONF_SLOT]) { + fprintf(stderr, "\t- Slot: "); + switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])) { + case OVPN_KEY_SLOT_PRIMARY: + fprintf(stderr, "primary\n"); + break; + case OVPN_KEY_SLOT_SECONDARY: + fprintf(stderr, "secondary\n"); + break; + default: + fprintf(stderr, "invalid (%u)\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_SLOT])); + break; + } + } + if (kattrs[OVPN_A_KEYCONF_KEY_ID]) + fprintf(stderr, "\t- Key ID: %u\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_KEY_ID])); + if (kattrs[OVPN_A_KEYCONF_CIPHER_ALG]) { + fprintf(stderr, "\t- Cipher: "); + switch (nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])) { + case OVPN_CIPHER_ALG_NONE: + fprintf(stderr, "none\n"); + break; + case OVPN_CIPHER_ALG_AES_GCM: + fprintf(stderr, "aes-gcm\n"); + break; + case OVPN_CIPHER_ALG_CHACHA20_POLY1305: + fprintf(stderr, "chacha20poly1305\n"); + break; + default: + fprintf(stderr, "invalid (%u)\n", + nla_get_u32(kattrs[OVPN_A_KEYCONF_CIPHER_ALG])); + break; + } + } + + return NL_SKIP; +} + +static int ovpn_get_key(struct ovpn_ctx *ovpn) +{ + struct nlattr *keyconf; + struct nl_ctx *ctx; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_GET); + if (!ctx) + return 
-ENOMEM; + + keyconf = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_SLOT, ovpn->key_slot); + nla_nest_end(ctx->nl_msg, keyconf); + + ret = ovpn_nl_msg_send(ctx, ovpn_handle_key); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +static int ovpn_swap_keys(struct ovpn_ctx *ovpn) +{ + struct nl_ctx *ctx; + struct nlattr *kc; + int ret = -1; + + ctx = nl_ctx_alloc(ovpn, OVPN_CMD_KEY_SWAP); + if (!ctx) + return -ENOMEM; + + kc = nla_nest_start(ctx->nl_msg, OVPN_A_KEYCONF); + NLA_PUT_U32(ctx->nl_msg, OVPN_A_KEYCONF_PEER_ID, ovpn->peer_id); + nla_nest_end(ctx->nl_msg, kc); + + ret = ovpn_nl_msg_send(ctx, NULL); +nla_put_failure: + nl_ctx_free(ctx); + return ret; +} + +/* Helper function used to easily add attributes to a rtnl message */ +static int ovpn_addattr(struct nlmsghdr *n, int maxlen, int type, + const void *data, int alen) +{ + int len = RTA_LENGTH(alen); + struct rtattr *rta; + + if ((int)(NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len)) > maxlen) { + fprintf(stderr, "%s: rtnl: message exceeded bound of %d\n", + __func__, maxlen); + return -EMSGSIZE; + } + + rta = nlmsg_tail(n); + rta->rta_type = type; + rta->rta_len = len; + + if (!data) + memset(RTA_DATA(rta), 0, alen); + else + memcpy(RTA_DATA(rta), data, alen); + + n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len); + + return 0; +} + +static struct rtattr *ovpn_nest_start(struct nlmsghdr *msg, size_t max_size, + int attr) +{ + struct rtattr *nest = nlmsg_tail(msg); + + if (ovpn_addattr(msg, max_size, attr, NULL, 0) < 0) + return NULL; + + return nest; +} + +static void ovpn_nest_end(struct nlmsghdr *msg, struct rtattr *nest) +{ + nest->rta_len = (uint8_t *)nlmsg_tail(msg) - (uint8_t *)nest; +} + +#define RT_SNDBUF_SIZE (1024 * 2) +#define RT_RCVBUF_SIZE (1024 * 4) + +/* Open RTNL socket */ +static int ovpn_rt_socket(void) +{ + int sndbuf = RT_SNDBUF_SIZE, rcvbuf = RT_RCVBUF_SIZE, fd; + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + fprintf(stderr, "%s: cannot open netlink socket\n", __func__); + return fd; + } + + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, + sizeof(sndbuf)) < 0) { + fprintf(stderr, "%s: SO_SNDBUF\n", __func__); + close(fd); + return -1; + } + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, + sizeof(rcvbuf)) < 0) { + fprintf(stderr, "%s: SO_RCVBUF\n", __func__); + close(fd); + return -1; + } + + return fd; +} + +/* Bind socket to Netlink subsystem */ +static int ovpn_rt_bind(int fd, uint32_t groups) +{ + struct sockaddr_nl local = { 0 }; + socklen_t addr_len; + + local.nl_family = AF_NETLINK; + local.nl_groups = groups; + + if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { + fprintf(stderr, "%s: cannot bind netlink socket: %d\n", + __func__, errno); + return -errno; + } + + addr_len = sizeof(local); + if (getsockname(fd, (struct sockaddr *)&local, &addr_len) < 0) { + fprintf(stderr, "%s: cannot getsockname: %d\n", __func__, + errno); + return -errno; + } + + if (addr_len != sizeof(local)) { + fprintf(stderr, "%s: wrong address length %d\n", __func__, + addr_len); + return -EINVAL; + } + + if (local.nl_family != AF_NETLINK) { + fprintf(stderr, "%s: wrong address family %d\n", __func__, + local.nl_family); + return -EINVAL; + } + + return 0; +} + +typedef int (*ovpn_parse_reply_cb)(struct nlmsghdr *msg, void *arg); + +/* Send Netlink message and run callback on reply (if specified) */ +static int ovpn_rt_send(struct nlmsghdr *payload, pid_t peer, + 
unsigned int groups, ovpn_parse_reply_cb cb, + void *arg_cb) +{ + int len, rem_len, fd, ret, rcv_len; + struct sockaddr_nl nladdr = { 0 }; + struct nlmsgerr *err; + struct nlmsghdr *h; + char buf[1024 * 16]; + struct iovec iov = { + .iov_base = payload, + .iov_len = payload->nlmsg_len, + }; + struct msghdr nlmsg = { + .msg_name = &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1, + }; + + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = peer; + nladdr.nl_groups = groups; + + payload->nlmsg_seq = time(NULL); + + /* no need to send reply */ + if (!cb) + payload->nlmsg_flags |= NLM_F_ACK; + + fd = ovpn_rt_socket(); + if (fd < 0) { + fprintf(stderr, "%s: can't open rtnl socket\n", __func__); + return -errno; + } + + ret = ovpn_rt_bind(fd, 0); + if (ret < 0) { + fprintf(stderr, "%s: can't bind rtnl socket\n", __func__); + ret = -errno; + goto out; + } + + ret = sendmsg(fd, &nlmsg, 0); + if (ret < 0) { + fprintf(stderr, "%s: rtnl: error on sendmsg()\n", __func__); + ret = -errno; + goto out; + } + + /* prepare buffer to store RTNL replies */ + memset(buf, 0, sizeof(buf)); + iov.iov_base = buf; + + while (1) { + /* + * iov_len is modified by recvmsg(), therefore has to be initialized before + * using it again + */ + iov.iov_len = sizeof(buf); + rcv_len = recvmsg(fd, &nlmsg, 0); + if (rcv_len < 0) { + if (errno == EINTR || errno == EAGAIN) { + fprintf(stderr, "%s: interrupted call\n", + __func__); + continue; + } + fprintf(stderr, "%s: rtnl: error on recvmsg()\n", + __func__); + ret = -errno; + goto out; + } + + if (rcv_len == 0) { + fprintf(stderr, + "%s: rtnl: socket reached unexpected EOF\n", + __func__); + ret = -EIO; + goto out; + } + + if (nlmsg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, + "%s: sender address length: %u (expected %zu)\n", + __func__, nlmsg.msg_namelen, sizeof(nladdr)); + ret = -EIO; + goto out; + } + + h = (struct nlmsghdr *)buf; + while (rcv_len >= (int)sizeof(*h)) { + len = h->nlmsg_len; + rem_len = len - sizeof(*h); + + if (rem_len < 0 || len > rcv_len) { + if (nlmsg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "%s: truncated message\n", + __func__); + ret = -EIO; + goto out; + } + fprintf(stderr, "%s: malformed message: len=%d\n", + __func__, len); + ret = -EIO; + goto out; + } + + if (h->nlmsg_type == NLMSG_DONE) { + ret = 0; + goto out; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + err = (struct nlmsgerr *)NLMSG_DATA(h); + if (rem_len < (int)sizeof(struct nlmsgerr)) { + fprintf(stderr, "%s: ERROR truncated\n", + __func__); + ret = -EIO; + goto out; + } + + if (err->error) { + fprintf(stderr, "%s: (%d) %s\n", + __func__, err->error, + strerror(-err->error)); + ret = err->error; + goto out; + } + + ret = 0; + if (cb) { + int r = cb(h, arg_cb); + + if (r <= 0) + ret = r; + } + goto out; + } + + if (cb) { + int r = cb(h, arg_cb); + + if (r <= 0) { + ret = r; + goto out; + } + } else { + fprintf(stderr, "%s: RTNL: unexpected reply\n", + __func__); + } + + rcv_len -= NLMSG_ALIGN(len); + h = (struct nlmsghdr *)((uint8_t *)h + + NLMSG_ALIGN(len)); + } + + if (nlmsg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "%s: message truncated\n", __func__); + continue; + } + + if (rcv_len) { + fprintf(stderr, "%s: rtnl: %d not parsed bytes\n", + __func__, rcv_len); + ret = -1; + goto out; + } + } +out: + close(fd); + + return ret; +} + +struct ovpn_link_req { + struct nlmsghdr n; + struct ifinfomsg i; + char buf[256]; +}; + +static int ovpn_new_iface(struct ovpn_ctx *ovpn) +{ + struct rtattr *linkinfo, *data; + struct ovpn_link_req req = { 0 }; + int ret 
= -1; + + fprintf(stdout, "Creating interface %s with mode %u\n", ovpn->ifname, + ovpn->mode); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; + req.n.nlmsg_type = RTM_NEWLINK; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_IFNAME, ovpn->ifname, + strlen(ovpn->ifname) + 1) < 0) + goto err; + + linkinfo = ovpn_nest_start(&req.n, sizeof(req), IFLA_LINKINFO); + if (!linkinfo) + goto err; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_INFO_KIND, OVPN_FAMILY_NAME, + strlen(OVPN_FAMILY_NAME) + 1) < 0) + goto err; + + if (ovpn->mode_set) { + data = ovpn_nest_start(&req.n, sizeof(req), IFLA_INFO_DATA); + if (!data) + goto err; + + if (ovpn_addattr(&req.n, sizeof(req), IFLA_OVPN_MODE, + &ovpn->mode, sizeof(uint8_t)) < 0) + goto err; + + ovpn_nest_end(&req.n, data); + } + + ovpn_nest_end(&req.n, linkinfo); + + req.i.ifi_family = AF_PACKET; + + ret = ovpn_rt_send(&req.n, 0, 0, NULL, NULL); +err: + return ret; +} + +static int ovpn_del_iface(struct ovpn_ctx *ovpn) +{ + struct ovpn_link_req req = { 0 }; + + fprintf(stdout, "Deleting interface %s ifindex %u\n", ovpn->ifname, + ovpn->ifindex); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.i)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_DELLINK; + + req.i.ifi_family = AF_PACKET; + req.i.ifi_index = ovpn->ifindex; + + return ovpn_rt_send(&req.n, 0, 0, NULL, NULL); +} + +static int nl_seq_check(struct nl_msg (*msg)__always_unused, + void (*arg)__always_unused) +{ + return NL_OK; +} + +struct mcast_handler_args { + const char *group; + int id; +}; + +static int mcast_family_handler(struct nl_msg *msg, void *arg) +{ + struct mcast_handler_args *grp = arg; + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *mcgrp; + int rem_mcgrp; + + nla_parse(tb, CTRL_ATTR_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + + if (!tb[CTRL_ATTR_MCAST_GROUPS]) + return NL_SKIP; + + nla_for_each_nested(mcgrp, tb[CTRL_ATTR_MCAST_GROUPS], rem_mcgrp) { + struct nlattr *tb_mcgrp[CTRL_ATTR_MCAST_GRP_MAX + 1]; + + nla_parse(tb_mcgrp, CTRL_ATTR_MCAST_GRP_MAX, + nla_data(mcgrp), nla_len(mcgrp), NULL); + + if (!tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME] || + !tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]) + continue; + if (strncmp(nla_data(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]), + grp->group, nla_len(tb_mcgrp[CTRL_ATTR_MCAST_GRP_NAME]))) + continue; + grp->id = nla_get_u32(tb_mcgrp[CTRL_ATTR_MCAST_GRP_ID]); + break; + } + + return NL_SKIP; +} + +static int mcast_error_handler(struct sockaddr_nl (*nla)__always_unused, + struct nlmsgerr *err, void *arg) +{ + int *ret = arg; + + *ret = err->error; + return NL_STOP; +} + +static int mcast_ack_handler(struct nl_msg (*msg)__always_unused, void *arg) +{ + int *ret = arg; + + *ret = 0; + return NL_STOP; +} + +static int ovpn_handle_msg(struct nl_msg *msg, void *arg) +{ + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *attrs[OVPN_A_MAX + 1]; + struct nlmsghdr *nlh = nlmsg_hdr(msg); + char ifname[IF_NAMESIZE]; + int *ret = arg; + __u32 ifindex; + + fprintf(stderr, "received message from ovpn-dco\n"); + + *ret = -1; + + if (!genlmsg_valid_hdr(nlh, 0)) { + fprintf(stderr, "invalid header\n"); + return NL_STOP; + } + + if (nla_parse(attrs, OVPN_A_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL)) { + fprintf(stderr, "received bogus data from ovpn-dco\n"); + return NL_STOP; + } + + if (!attrs[OVPN_A_IFINDEX]) { + fprintf(stderr, "no ifindex in this message\n"); + return 
NL_STOP; + } + + ifindex = nla_get_u32(attrs[OVPN_A_IFINDEX]); + if (!if_indextoname(ifindex, ifname)) { + fprintf(stderr, "cannot resolve ifname for ifindex: %u\n", + ifindex); + return NL_STOP; + } + + switch (gnlh->cmd) { + case OVPN_CMD_PEER_DEL_NTF: + fprintf(stdout, "received CMD_PEER_DEL_NTF\n"); + break; + case OVPN_CMD_KEY_SWAP_NTF: + fprintf(stdout, "received CMD_KEY_SWAP_NTF\n"); + break; + default: + fprintf(stderr, "received unknown command: %d\n", gnlh->cmd); + return NL_STOP; + } + + *ret = 0; + return NL_OK; +} + +static int ovpn_get_mcast_id(struct nl_sock *sock, const char *family, + const char *group) +{ + struct nl_msg *msg; + struct nl_cb *cb; + int ret, ctrlid; + struct mcast_handler_args grp = { + .group = group, + .id = -ENOENT, + }; + + msg = nlmsg_alloc(); + if (!msg) + return -ENOMEM; + + cb = nl_cb_alloc(NL_CB_DEFAULT); + if (!cb) { + ret = -ENOMEM; + goto out_fail_cb; + } + + ctrlid = genl_ctrl_resolve(sock, "nlctrl"); + + genlmsg_put(msg, 0, 0, ctrlid, 0, 0, CTRL_CMD_GETFAMILY, 0); + + ret = -ENOBUFS; + NLA_PUT_STRING(msg, CTRL_ATTR_FAMILY_NAME, family); + + ret = nl_send_auto_complete(sock, msg); + if (ret < 0) + goto nla_put_failure; + + ret = 1; + + nl_cb_err(cb, NL_CB_CUSTOM, mcast_error_handler, &ret); + nl_cb_set(cb, NL_CB_ACK, NL_CB_CUSTOM, mcast_ack_handler, &ret); + nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, mcast_family_handler, &grp); + + while (ret > 0) + nl_recvmsgs(sock, cb); + + if (ret == 0) + ret = grp.id; + nla_put_failure: + nl_cb_put(cb); + out_fail_cb: + nlmsg_free(msg); + return ret; +} + +static int ovpn_listen_mcast(void) +{ + struct nl_sock *sock; + struct nl_cb *cb; + int mcid, ret; + + sock = nl_socket_alloc(); + if (!sock) { + fprintf(stderr, "cannot allocate netlink socket\n"); + goto err_free; + } + + nl_socket_set_buffer_size(sock, 8192, 8192); + + ret = genl_connect(sock); + if (ret < 0) { + fprintf(stderr, "cannot connect to generic netlink: %s\n", + nl_geterror(ret)); + goto err_free; + } + + mcid = ovpn_get_mcast_id(sock, OVPN_FAMILY_NAME, OVPN_MCGRP_PEERS); + if (mcid < 0) { + fprintf(stderr, "cannot get mcast group: %s\n", + nl_geterror(mcid)); + goto err_free; + } + + ret = nl_socket_add_membership(sock, mcid); + if (ret) { + fprintf(stderr, "failed to join mcast group: %d\n", ret); + goto err_free; + } + + ret = 1; + cb = nl_cb_alloc(NL_CB_DEFAULT); + nl_cb_set(cb, NL_CB_SEQ_CHECK, NL_CB_CUSTOM, nl_seq_check, NULL); + nl_cb_set(cb, NL_CB_VALID, NL_CB_CUSTOM, ovpn_handle_msg, &ret); + nl_cb_err(cb, NL_CB_CUSTOM, ovpn_nl_cb_error, &ret); + + while (ret == 1) { + int err = nl_recvmsgs(sock, cb); + + if (err < 0) { + fprintf(stderr, + "cannot receive netlink message: (%d) %s\n", + err, nl_geterror(-err)); + ret = -1; + break; + } + } + + nl_cb_put(cb); +err_free: + nl_socket_free(sock); + return ret; +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage %s <command> <iface> [arguments..]\n", + cmd); + fprintf(stderr, "where <command> can be one of the following\n\n"); + + fprintf(stderr, "* new_iface <iface> [mode]: create new ovpn interface\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tmode:\n"); + fprintf(stderr, "\t\t- P2P for peer-to-peer mode (i.e. client)\n"); + fprintf(stderr, "\t\t- MP for multi-peer mode (i.e. 
server)\n"); + + fprintf(stderr, "* del_iface : delete ovpn interface\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + + fprintf(stderr, + "* listen [ipv6]: listen for incoming peer TCP connections\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: TCP port to listen to\n"); + fprintf(stderr, + "\tpeers_file: file containing one peer per line: Line format:\n"); + fprintf(stderr, "\t\t \n"); + fprintf(stderr, + "\tipv6: whether the socket should listen to the IPv6 wildcard address\n"); + + fprintf(stderr, + "* connect [key_file]: start connecting peer of TCP-based VPN session\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the connecting peer\n"); + fprintf(stderr, "\traddr: peer IP address to connect to\n"); + fprintf(stderr, "\trport: peer TCP port to connect to\n"); + fprintf(stderr, + "\tkey_file: file containing the symmetric key for encryption\n"); + + fprintf(stderr, + "* new_peer [vpnaddr]: add new peer\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: local UDP port to bind to\n"); + fprintf(stderr, + "\tpeer_id: peer ID to be used in data packets to/from this peer\n"); + fprintf(stderr, "\traddr: peer IP address\n"); + fprintf(stderr, "\trport: peer UDP port\n"); + fprintf(stderr, "\tvpnaddr: peer VPN IP\n"); + + fprintf(stderr, + "* new_multi_peer : add multiple peers as listed in the file\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tlport: local UDP port to bind to\n"); + fprintf(stderr, + "\tpeers_file: text file containing one peer per line. Line format:\n"); + fprintf(stderr, "\t\t \n"); + + fprintf(stderr, + "* set_peer : set peer attributes\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + fprintf(stderr, + "\tkeepalive_interval: interval for sending ping messages\n"); + fprintf(stderr, + "\tkeepalive_timeout: time after which a peer is timed out\n"); + + fprintf(stderr, "* del_peer : delete peer\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to delete\n"); + + fprintf(stderr, "* get_peer [peer_id]: retrieve peer(s) status\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, + "\tpeer_id: peer ID of the peer to query. All peers are returned if omitted\n"); + + fprintf(stderr, + "* new_key : set data channel key\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, + "\tpeer_id: peer ID of the peer to configure the key for\n"); + fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); + fprintf(stderr, "\tkey_id: an ID from 0 to 7\n"); + fprintf(stderr, + "\tcipher: cipher to use, supported: aes (AES-GCM), chachapoly (CHACHA20POLY1305)\n"); + fprintf(stderr, + "\tkey_dir: key direction, must 0 on one host and 1 on the other\n"); + fprintf(stderr, "\tkey_file: file containing the pre-shared key\n"); + + fprintf(stderr, + "* del_key [slot]: erase existing data channel key\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + fprintf(stderr, "\tslot: slot to erase. 
PRIMARY if omitted\n"); + + fprintf(stderr, + "* get_key <iface> <peer_id> <slot>: retrieve non-sensitive key data\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to query\n"); + fprintf(stderr, "\tslot: either 1 (primary) or 2 (secondary)\n"); + + fprintf(stderr, + "* swap_keys <iface> <peer_id>: swap content of primary and secondary key slots\n"); + fprintf(stderr, "\tiface: ovpn interface name\n"); + fprintf(stderr, "\tpeer_id: peer ID of the peer to modify\n"); + + fprintf(stderr, + "* listen_mcast: listen to ovpn netlink multicast messages\n"); +} + +static int ovpn_parse_remote(struct ovpn_ctx *ovpn, const char *host, + const char *service, const char *vpnip) +{ + int ret; + struct addrinfo *result; + struct addrinfo hints = { + .ai_family = ovpn->sa_family, + .ai_socktype = SOCK_DGRAM, + .ai_protocol = IPPROTO_UDP + }; + + if (host) { + ret = getaddrinfo(host, service, &hints, &result); + if (ret == EAI_NONAME || ret == EAI_FAIL) + return -1; + + if (!(result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) && + !(result->ai_family == AF_INET6 && + result->ai_addrlen == sizeof(struct sockaddr_in6))) { + ret = -EINVAL; + goto out; + } + + memcpy(&ovpn->remote, result->ai_addr, result->ai_addrlen); + } + + if (vpnip) { + ret = getaddrinfo(vpnip, NULL, &hints, &result); + if (ret == EAI_NONAME || ret == EAI_FAIL) + return -1; + + if (!(result->ai_family == AF_INET && + result->ai_addrlen == sizeof(struct sockaddr_in)) && + !(result->ai_family == AF_INET6 && + result->ai_addrlen == sizeof(struct sockaddr_in6))) { + ret = -EINVAL; + goto out; + } + + memcpy(&ovpn->peer_ip, result->ai_addr, result->ai_addrlen); + ovpn->sa_family = result->ai_family; + + ovpn->peer_ip_set = true; + } + + ret = 0; +out: + freeaddrinfo(result); + return ret; +} + +static int ovpn_parse_new_peer(struct ovpn_ctx *ovpn, const char *peer_id, + const char *raddr, const char *rport, + const char *vpnip) +{ + ovpn->peer_id = strtoul(peer_id, NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + return ovpn_parse_remote(ovpn, raddr, rport, vpnip); +} + +static int ovpn_parse_key_slot(const char *arg, struct ovpn_ctx *ovpn) +{ + int slot = strtoul(arg, NULL, 10); + + if (errno == ERANGE || slot < 1 || slot > 2) { + fprintf(stderr, "key slot out of range\n"); + return -1; + } + + switch (slot) { + case 1: + ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; + break; + case 2: + ovpn->key_slot = OVPN_KEY_SLOT_SECONDARY; + break; + } + + return 0; +} + +static int ovpn_send_tcp_data(int socket) +{ + uint16_t len = htons(1000); + uint8_t buf[1002]; + int ret; + + memcpy(buf, &len, sizeof(len)); + memset(buf + sizeof(len), 0x86, sizeof(buf) - sizeof(len)); + + ret = send(socket, buf, sizeof(buf), MSG_NOSIGNAL); + + fprintf(stdout, "Sent %u bytes over TCP socket\n", ret); + + return ret > 0 ? 
0 : ret; +} + +static int ovpn_recv_tcp_data(int socket) +{ + uint8_t buf[1002]; + uint16_t len; + int ret; + + ret = recv(socket, buf, sizeof(buf), MSG_NOSIGNAL); + + if (ret < 2) { + fprintf(stderr, ">>>> Error while reading TCP data: %d\n", ret); + return ret; + } + + memcpy(&len, buf, sizeof(len)); + len = ntohs(len); + + fprintf(stdout, ">>>> Received %u bytes over TCP socket, header: %u\n", + ret, len); + + return 0; +} + +static enum ovpn_cmd ovpn_parse_cmd(const char *cmd) +{ + if (!strcmp(cmd, "new_iface")) + return CMD_NEW_IFACE; + + if (!strcmp(cmd, "del_iface")) + return CMD_DEL_IFACE; + + if (!strcmp(cmd, "listen")) + return CMD_LISTEN; + + if (!strcmp(cmd, "connect")) + return CMD_CONNECT; + + if (!strcmp(cmd, "new_peer")) + return CMD_NEW_PEER; + + if (!strcmp(cmd, "new_multi_peer")) + return CMD_NEW_MULTI_PEER; + + if (!strcmp(cmd, "set_peer")) + return CMD_SET_PEER; + + if (!strcmp(cmd, "del_peer")) + return CMD_DEL_PEER; + + if (!strcmp(cmd, "get_peer")) + return CMD_GET_PEER; + + if (!strcmp(cmd, "new_key")) + return CMD_NEW_KEY; + + if (!strcmp(cmd, "del_key")) + return CMD_DEL_KEY; + + if (!strcmp(cmd, "get_key")) + return CMD_GET_KEY; + + if (!strcmp(cmd, "swap_keys")) + return CMD_SWAP_KEYS; + + if (!strcmp(cmd, "listen_mcast")) + return CMD_LISTEN_MCAST; + + return CMD_INVALID; +} + +/* Send the process to the background and wait for a signal. + * + * This helper is called at the end of commands + * creating sockets, so that the latter stay alive + * along with the process that created them. + * + * A signal is expected to be delivered in order to + * terminate the waiting processes. + */ +static void ovpn_waitbg(void) +{ + daemon(1, 1); + pause(); +} + +static int ovpn_run_cmd(struct ovpn_ctx *ovpn) +{ + char peer_id[10], vpnip[INET6_ADDRSTRLEN], raddr[128], rport[10]; + int n, ret; + FILE *fp; + + switch (ovpn->cmd) { + case CMD_NEW_IFACE: + ret = ovpn_new_iface(ovpn); + break; + case CMD_DEL_IFACE: + ret = ovpn_del_iface(ovpn); + break; + case CMD_LISTEN: + ret = ovpn_listen(ovpn, ovpn->sa_family); + if (ret < 0) { + fprintf(stderr, "cannot listen on TCP socket\n"); + return ret; + } + + fp = fopen(ovpn->peers_file, "r"); + if (!fp) { + fprintf(stderr, "cannot open file: %s\n", + ovpn->peers_file); + return -1; + } + + int num_peers = 0; + + while ((n = fscanf(fp, "%s %s\n", peer_id, vpnip)) == 2) { + struct ovpn_ctx peer_ctx = { 0 }; + + if (num_peers == MAX_PEERS) { + fprintf(stderr, "max peers reached!\n"); + return -E2BIG; + } + + peer_ctx.ifindex = ovpn->ifindex; + peer_ctx.sa_family = ovpn->sa_family; + + peer_ctx.socket = ovpn_accept(ovpn); + if (peer_ctx.socket < 0) { + fprintf(stderr, "cannot accept connection!\n"); + return -1; + } + + /* store peer sockets to test TCP I/O */ + ovpn->cli_sockets[num_peers] = peer_ctx.socket; + + ret = ovpn_parse_new_peer(&peer_ctx, peer_id, NULL, + NULL, vpnip); + if (ret < 0) { + fprintf(stderr, "error while parsing line\n"); + return -1; + } + + ret = ovpn_new_peer(&peer_ctx, true); + if (ret < 0) { + fprintf(stderr, + "cannot add peer to VPN: %s %s\n", + peer_id, vpnip); + return ret; + } + num_peers++; + } + + for (int i = 0; i < num_peers; i++) { + ret = ovpn_recv_tcp_data(ovpn->cli_sockets[i]); + if (ret < 0) + break; + } + ovpn_waitbg(); + break; + case CMD_CONNECT: + ret = ovpn_connect(ovpn); + if (ret < 0) { + fprintf(stderr, "cannot connect TCP socket\n"); + return ret; + } + + ret = ovpn_new_peer(ovpn, true); + if (ret < 0) { + fprintf(stderr, "cannot add peer to VPN\n"); + close(ovpn->socket); + return ret; + } + + if 
(ovpn->cipher != OVPN_CIPHER_ALG_NONE) { + ret = ovpn_new_key(ovpn); + if (ret < 0) { + fprintf(stderr, "cannot set key\n"); + return ret; + } + } + + ret = ovpn_send_tcp_data(ovpn->socket); + ovpn_waitbg(); + break; + case CMD_NEW_PEER: + ret = ovpn_udp_socket(ovpn, AF_INET6); + if (ret < 0) + return ret; + + ret = ovpn_new_peer(ovpn, false); + ovpn_waitbg(); + break; + case CMD_NEW_MULTI_PEER: + ret = ovpn_udp_socket(ovpn, AF_INET6); + if (ret < 0) + return ret; + + fp = fopen(ovpn->peers_file, "r"); + if (!fp) { + fprintf(stderr, "cannot open file: %s\n", + ovpn->peers_file); + return -1; + } + + while ((n = fscanf(fp, "%s %s %s %s\n", peer_id, raddr, rport, + vpnip)) == 4) { + struct ovpn_ctx peer_ctx = { 0 }; + + peer_ctx.ifindex = ovpn->ifindex; + peer_ctx.socket = ovpn->socket; + peer_ctx.sa_family = AF_UNSPEC; + + ret = ovpn_parse_new_peer(&peer_ctx, peer_id, raddr, + rport, vpnip); + if (ret < 0) { + fprintf(stderr, "error while parsing line\n"); + return -1; + } + + ret = ovpn_new_peer(&peer_ctx, false); + if (ret < 0) { + fprintf(stderr, + "cannot add peer to VPN: %s %s %s %s\n", + peer_id, raddr, rport, vpnip); + return ret; + } + } + ovpn_waitbg(); + break; + case CMD_SET_PEER: + ret = ovpn_set_peer(ovpn); + break; + case CMD_DEL_PEER: + ret = ovpn_del_peer(ovpn); + break; + case CMD_GET_PEER: + if (ovpn->peer_id == PEER_ID_UNDEF) + fprintf(stderr, "List of peers connected to: %s\n", + ovpn->ifname); + + ret = ovpn_get_peer(ovpn); + break; + case CMD_NEW_KEY: + ret = ovpn_new_key(ovpn); + break; + case CMD_DEL_KEY: + ret = ovpn_del_key(ovpn); + break; + case CMD_GET_KEY: + ret = ovpn_get_key(ovpn); + break; + case CMD_SWAP_KEYS: + ret = ovpn_swap_keys(ovpn); + break; + case CMD_LISTEN_MCAST: + ret = ovpn_listen_mcast(); + break; + case CMD_INVALID: + break; + } + + return ret; +} + +static int ovpn_parse_cmd_args(struct ovpn_ctx *ovpn, int argc, char *argv[]) +{ + int ret; + + /* no args required for LISTEN_MCAST */ + if (ovpn->cmd == CMD_LISTEN_MCAST) + return 0; + + /* all commands need an ifname */ + if (argc < 3) + return -EINVAL; + + strscpy(ovpn->ifname, argv[2], IFNAMSIZ - 1); + ovpn->ifname[IFNAMSIZ - 1] = '\0'; + + /* all commands, except CMD_NEW_IFACE, need an ifindex */ + if (ovpn->cmd != CMD_NEW_IFACE) { + ovpn->ifindex = if_nametoindex(ovpn->ifname); + if (!ovpn->ifindex) { + fprintf(stderr, "cannot find interface: %s\n", + strerror(errno)); + return -1; + } + } + + switch (ovpn->cmd) { + case CMD_NEW_IFACE: + if (argc < 4) + break; + + if (!strcmp(argv[3], "P2P")) { + ovpn->mode = OVPN_MODE_P2P; + } else if (!strcmp(argv[3], "MP")) { + ovpn->mode = OVPN_MODE_MP; + } else { + fprintf(stderr, "Cannot parse iface mode: %s\n", + argv[3]); + return -1; + } + ovpn->mode_set = true; + break; + case CMD_DEL_IFACE: + break; + case CMD_LISTEN: + if (argc < 5) + return -EINVAL; + + ovpn->lport = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + ovpn->peers_file = argv[4]; + + if (argc > 5 && !strcmp(argv[5], "ipv6")) + ovpn->sa_family = AF_INET6; + break; + case CMD_CONNECT: + if (argc < 6) + return -EINVAL; + + ovpn->sa_family = AF_INET; + + ret = ovpn_parse_new_peer(ovpn, argv[3], argv[4], argv[5], + NULL); + if (ret < 0) { + fprintf(stderr, "Cannot parse remote peer data\n"); + return -1; + } + + if (argc > 6) { + ovpn->key_slot = OVPN_KEY_SLOT_PRIMARY; + ovpn->key_id = 0; + ovpn->cipher = OVPN_CIPHER_ALG_AES_GCM; + ovpn->key_dir = KEY_DIR_OUT; + + ret = ovpn_parse_key(argv[6], 
ovpn); + if (ret) + return -1; + } + break; + case CMD_NEW_PEER: + if (argc < 7) + return -EINVAL; + + ovpn->lport = strtoul(argv[4], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + const char *vpnip = (argc > 7) ? argv[7] : NULL; + + ret = ovpn_parse_new_peer(ovpn, argv[3], argv[5], argv[6], + vpnip); + if (ret < 0) + return -1; + break; + case CMD_NEW_MULTI_PEER: + if (argc < 5) + return -EINVAL; + + ovpn->lport = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->lport > 65535) { + fprintf(stderr, "lport value out of range\n"); + return -1; + } + + ovpn->peers_file = argv[4]; + break; + case CMD_SET_PEER: + if (argc < 6) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ovpn->keepalive_interval = strtoul(argv[4], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, + "keepalive interval value out of range\n"); + return -1; + } + + ovpn->keepalive_timeout = strtoul(argv[5], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, + "keepalive timeout value out of range\n"); + return -1; + } + break; + case CMD_DEL_PEER: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + break; + case CMD_GET_PEER: + ovpn->peer_id = PEER_ID_UNDEF; + if (argc > 3) { + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE || ovpn->peer_id > PEER_ID_UNDEF) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + } + break; + case CMD_NEW_KEY: + if (argc < 9) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return -1; + + ovpn->key_id = strtoul(argv[5], NULL, 10); + if (errno == ERANGE || ovpn->key_id > 2) { + fprintf(stderr, "key ID out of range\n"); + return -1; + } + + ret = ovpn_parse_cipher(argv[6], ovpn); + if (ret < 0) + return -1; + + ret = ovpn_parse_key_direction(argv[7], ovpn); + if (ret < 0) + return -1; + + ret = ovpn_parse_key(argv[8], ovpn); + if (ret) + return -1; + break; + case CMD_DEL_KEY: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return ret; + break; + case CMD_GET_KEY: + if (argc < 5) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + + ret = ovpn_parse_key_slot(argv[4], ovpn); + if (ret) + return ret; + break; + case CMD_SWAP_KEYS: + if (argc < 4) + return -EINVAL; + + ovpn->peer_id = strtoul(argv[3], NULL, 10); + if (errno == ERANGE) { + fprintf(stderr, "peer ID value out of range\n"); + return -1; + } + break; + case CMD_LISTEN_MCAST: + break; + case CMD_INVALID: + break; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + struct ovpn_ctx ovpn; + int ret; + + if (argc < 2) { + usage(argv[0]); + return -1; + } + + memset(&ovpn, 0, sizeof(ovpn)); + ovpn.sa_family = AF_INET; + ovpn.cipher = OVPN_CIPHER_ALG_NONE; + + ovpn.cmd = ovpn_parse_cmd(argv[1]); + if (ovpn.cmd == CMD_INVALID) { + fprintf(stderr, 
"Error: unknown command.\n\n"); + usage(argv[0]); + return -1; + } + + ret = ovpn_parse_cmd_args(&ovpn, argc, argv); + if (ret < 0) { + fprintf(stderr, "Error: invalid arguments.\n\n"); + if (ret == -EINVAL) + usage(argv[0]); + return ret; + } + + ret = ovpn_run_cmd(&ovpn); + if (ret) + fprintf(stderr, "Cannot execute command: %s (%d)\n", + strerror(-ret), ret); + + return ret; +} diff --git a/tools/testing/selftests/net/ovpn/tcp_peers.txt b/tools/testing/selftests/net/ovpn/tcp_peers.txt new file mode 100644 index 0000000000000..d753eebe8716e --- /dev/null +++ b/tools/testing/selftests/net/ovpn/tcp_peers.txt @@ -0,0 +1,5 @@ +1 5.5.5.2 +2 5.5.5.3 +3 5.5.5.4 +4 5.5.5.5 +5 5.5.5.6 diff --git a/tools/testing/selftests/net/ovpn/test-chachapoly.sh b/tools/testing/selftests/net/ovpn/test-chachapoly.sh new file mode 100755 index 0000000000000..32504079a2b89 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-chachapoly.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +ALG="chachapoly" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh new file mode 100755 index 0000000000000..093d44772ffdf --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-close-socket-tcp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +PROTO="TCP" + +source test-close-socket.sh diff --git a/tools/testing/selftests/net/ovpn/test-close-socket.sh b/tools/testing/selftests/net/ovpn/test-close-socket.sh new file mode 100755 index 0000000000000..5e48a8b679287 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-close-socket.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +#set -x +set -e + +source ./common.sh + +cleanup + +modprobe -q ovpn || true + +for p in $(seq 0 ${NUM_PEERS}); do + create_ns ${p} +done + +for p in $(seq 0 ${NUM_PEERS}); do + setup_ns ${p} 5.5.5.$((${p} + 1))/24 +done + +for p in $(seq 0 ${NUM_PEERS}); do + add_peer ${p} +done + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120 + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120 +done + +sleep 1 + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1)) +done + +ip netns exec peer0 iperf3 -1 -s & +sleep 1 +ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1 + +cleanup + +modprobe -r ovpn || true diff --git a/tools/testing/selftests/net/ovpn/test-float.sh b/tools/testing/selftests/net/ovpn/test-float.sh new file mode 100755 index 0000000000000..ba5d725e18b07 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-float.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +FLOAT="1" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test-tcp.sh b/tools/testing/selftests/net/ovpn/test-tcp.sh new file mode 100755 index 0000000000000..ba3f1f315a349 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test-tcp.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 OpenVPN, Inc. 
+# +# Author: Antonio Quartulli + +PROTO="TCP" + +source test.sh diff --git a/tools/testing/selftests/net/ovpn/test.sh b/tools/testing/selftests/net/ovpn/test.sh new file mode 100755 index 0000000000000..7b62897b02408 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/test.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2020-2025 OpenVPN, Inc. +# +# Author: Antonio Quartulli + +#set -x +set -e + +source ./common.sh + +cleanup + +modprobe -q ovpn || true + +for p in $(seq 0 ${NUM_PEERS}); do + create_ns ${p} +done + +for p in $(seq 0 ${NUM_PEERS}); do + setup_ns ${p} 5.5.5.$((${p} + 1))/24 +done + +for p in $(seq 0 ${NUM_PEERS}); do + add_peer ${p} +done + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 60 120 + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 60 120 +done + +sleep 1 + +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ping -qfc 500 -w 3 5.5.5.$((${p} + 1)) +done + +if [ "$FLOAT" == "1" ]; then + # make clients float.. + for p in $(seq 1 ${NUM_PEERS}); do + ip -n peer${p} addr del 10.10.${p}.2/24 dev veth${p} + ip -n peer${p} addr add 10.10.${p}.3/24 dev veth${p} + done + for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer${p} ping -qfc 500 -w 3 5.5.5.1 + done +fi + +ip netns exec peer0 iperf3 -1 -s & +sleep 1 +ip netns exec peer1 iperf3 -Z -t 3 -c 5.5.5.1 + +echo "Adding secondary key and then swap:" +for p in $(seq 1 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} new_key tun0 ${p} 2 1 ${ALG} 0 data64.key + ip netns exec peer${p} ${OVPN_CLI} new_key tun${p} ${p} 2 1 ${ALG} 1 data64.key + ip netns exec peer${p} ${OVPN_CLI} swap_keys tun${p} ${p} +done + +sleep 1 + +echo "Querying all peers:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 +ip netns exec peer1 ${OVPN_CLI} get_peer tun1 + +echo "Querying peer 1:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 1 + +echo "Querying non-existent peer 10:" +ip netns exec peer0 ${OVPN_CLI} get_peer tun0 10 || true + +echo "Deleting peer 1:" +ip netns exec peer0 ${OVPN_CLI} del_peer tun0 1 +ip netns exec peer1 ${OVPN_CLI} del_peer tun1 1 + +echo "Querying keys:" +for p in $(seq 2 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 1 + ip netns exec peer${p} ${OVPN_CLI} get_key tun${p} ${p} 2 +done + +echo "Deleting peer while sending traffic:" +(ip netns exec peer2 ping -qf -w 4 5.5.5.1)& +sleep 2 +ip netns exec peer0 ${OVPN_CLI} del_peer tun0 2 +# following command fails in TCP mode +# (both ends get conn reset when one peer disconnects) +ip netns exec peer2 ${OVPN_CLI} del_peer tun2 2 || true + +echo "Deleting keys:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 1 + ip netns exec peer${p} ${OVPN_CLI} del_key tun${p} ${p} 2 +done + +echo "Setting timeout to 3s MP:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer0 ${OVPN_CLI} set_peer tun0 ${p} 3 3 || true + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 0 0 +done +# wait for peers to timeout +sleep 5 + +echo "Setting timeout to 3s P2P:" +for p in $(seq 3 ${NUM_PEERS}); do + ip netns exec peer${p} ${OVPN_CLI} set_peer tun${p} ${p} 3 3 +done +sleep 5 + +cleanup + +modprobe -r ovpn || true diff --git a/tools/testing/selftests/net/ovpn/udp_peers.txt b/tools/testing/selftests/net/ovpn/udp_peers.txt new file mode 100644 index 0000000000000..32f14bd9347a6 --- /dev/null +++ b/tools/testing/selftests/net/ovpn/udp_peers.txt @@ -0,0 +1,5 @@ +1 10.10.1.2 1 5.5.5.2 +2 10.10.2.2 1 5.5.5.3 +3 10.10.3.2 1 
5.5.5.4 +4 10.10.4.2 1 5.5.5.5 +5 10.10.5.2 1 5.5.5.6 From d55acb9732d981c7a8e07dd63089a77d2938e382 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 15 Apr 2025 13:25:53 +0200 Subject: [PATCH 284/770] net: ipv6: ioam6: use consistent dst names Be consistent and use the same terminology as other lwt users: orig_dst is the dst_entry before the transformation, while dst is either the dst_entry in the cache or the dst_entry after the transformation Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20250415112554.23823-2-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- net/ipv6/ioam6_iptunnel.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 09065187378e6..57200b9991a1d 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -336,7 +336,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -344,7 +345,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); @@ -352,7 +353,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto out; local_bh_disable(); - cache_dst = dst_cache_get(&ilwt->cache); + dst = dst_cache_get(&ilwt->cache); local_bh_enable(); switch (ilwt->mode) { @@ -362,7 +363,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo, cache_dst); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -372,7 +373,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst, cache_dst); + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -390,7 +391,7 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (unlikely(!cache_dst)) { + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -401,20 +402,20 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = hdr->nexthdr; - cache_dst = ip6_route_output(net, NULL, &fl6); - if (cache_dst->error) { - err = cache_dst->error; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; goto drop; } /* cache only if we don't create a dst reference loop */ - if (dst->lwtstate != cache_dst->lwtstate) { + if (orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) goto drop; } @@ -422,16 +423,16 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff 
*skb) /* avoid lwtunnel_output() reentry loop when destination is the same * after transformation (e.g., with the inline mode) */ - if (dst->lwtstate != cache_dst->lwtstate) { + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); - skb_dst_set(skb, cache_dst); + skb_dst_set(skb, dst); return dst_output(net, sk, skb); } out: - dst_release(cache_dst); - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: - dst_release(cache_dst); + dst_release(dst); kfree_skb(skb); return err; } From 47ce7c854563fe8450e9cb8dcd62c6470e28076b Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 15 Apr 2025 13:25:54 +0200 Subject: [PATCH 285/770] net: ipv6: ioam6: fix double reallocation If the dst_entry is the same post transformation (which is a valid use case for IOAM), we don't add it to the cache to avoid a reference loop. Instead, we use a "fake" dst_entry and add it to the cache as a signal. When we read the cache, we compare it with our "fake" dst_entry and therefore detect if we're in the special case. Signed-off-by: Justin Iurman Link: https://patch.msgid.link/20250415112554.23823-3-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- net/ipv6/ioam6_iptunnel.c | 41 ++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 57200b9991a1d..40df8bdfaacdd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,6 +38,7 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; @@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, if (err) goto free_lwt; + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. + */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); + atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; @@ -356,6 +365,17 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = dst_cache_get(&ilwt->cache); local_bh_enable(); + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } + switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: do_inline: @@ -408,12 +428,19 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - /* cache only if we don't create a dst reference loop */ - if (orig_dst->lwtstate != dst->lwtstate) { - local_bh_disable(); + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. 
+ */ + local_bh_disable(); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - local_bh_enable(); - } + local_bh_enable(); err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -439,6 +466,10 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } From 3bc1ca7e173c10dab03b17d5e8ce9df390721288 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Tue, 15 Apr 2025 13:29:14 +0200 Subject: [PATCH 286/770] net: pktgen: fix code style (ERROR: else should follow close brace '}') Fix checkpatch code style errors: ERROR: else should follow close brace '}' #1317: FILE: net/core/pktgen.c:1317: + } + else And checkpatch follow up code style check: CHECK: Unbalanced braces around else statement #1316: FILE: net/core/pktgen.c:1316: + } else Signed-off-by: Peter Seiderer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415112916.113455-2-ps.report@gmx.net Signed-off-by: Paolo Abeni --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index fa30ff0f24645..21206a567843d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1313,9 +1313,9 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { From 65f5b9cb543189906559d5486dfe2fdf97d433a5 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Tue, 15 Apr 2025 13:29:15 +0200 Subject: [PATCH 287/770] net: pktgen: fix code style (WARNING: please, no space before tabs) Fix checkpatch code style warnings: WARNING: please, no space before tabs #230: FILE: net/core/pktgen.c:230: +#define M_NETIF_RECEIVE ^I1^I/* Inject packets into stack */$ Signed-off-by: Peter Seiderer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415112916.113455-3-ps.report@gmx.net Signed-off-by: Paolo Abeni --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 21206a567843d..9d56f9765411f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -227,7 +227,7 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ From 422cf22aa332f80657eac39f6853f1340eabab10 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Tue, 15 Apr 2025 13:29:16 +0200 Subject: [PATCH 288/770] net: pktgen: fix code style (WARNING: Prefer strscpy over strcpy) Fix checkpatch code style warnings: WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #1423: FILE: net/core/pktgen.c:1423: + strcpy(pkt_dev->dst_min, buf); WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #1444: FILE: net/core/pktgen.c:1444: + strcpy(pkt_dev->dst_max, buf); WARNING: Prefer strscpy over strcpy - see: 
https://github.com/KSPP/linux/issues/88 #1554: FILE: net/core/pktgen.c:1554: + strcpy(pkt_dev->src_min, buf); WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #1575: FILE: net/core/pktgen.c:1575: + strcpy(pkt_dev->src_max, buf); WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #3231: FILE: net/core/pktgen.c:3231: + strcpy(pkt_dev->result, "Starting"); WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #3235: FILE: net/core/pktgen.c:3235: + strcpy(pkt_dev->result, "Error starting"); WARNING: Prefer strscpy over strcpy - see: https://github.com/KSPP/linux/issues/88 #3849: FILE: net/core/pktgen.c:3849: + strcpy(pkt_dev->odevname, ifname); While at it squash memset/strcpy pattern into single strscpy_pad call. Signed-off-by: Peter Seiderer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415112916.113455-4-ps.report@gmx.net Signed-off-by: Paolo Abeni --- net/core/pktgen.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9d56f9765411f..0ebe5461d4d96 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1419,8 +1419,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } @@ -1440,8 +1439,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } @@ -1550,8 +1548,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } @@ -1571,8 +1568,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } @@ -3228,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! 
*/ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3846,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); From 9f5595d5f03fd4dc640607a71e89a1daa68fd19d Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 14 Apr 2025 11:24:00 -0500 Subject: [PATCH 289/770] platform/x86/amd: pmc: Require at least 2.5 seconds between HW sleep cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an APU exits HW sleep with no active wake sources, the Linux kernel will rapidly assert that the APU can enter back into HW sleep. This happens within a few ms. By contrast, Windows can take tens of seconds to enter back into the resiliency phase for Modern Standby. In some situations this can be problematic because it can cause leakage from VDDCR_SOC to VDD_MISC and force VDD_MISC outside of the electrical design guide specifications. On some designs this will trip the over voltage protection feature (OVP) of the voltage regulator module, but it could cause APU damage as well. To prevent this risk, add an explicit sleep call so that future attempts to enter into HW sleep will have enough time to settle. This will occur while the screen is dark and only in cases where the APU should enter HW sleep again, so it shouldn't be noticeable to any user. Cc: stable@vger.kernel.org Signed-off-by: Mario Limonciello Acked-by: Shyam Sundar S K Link: https://lore.kernel.org/r/20250414162446.3853194-1-superm1@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/amd/pmc/pmc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/amd/pmc/pmc.c b/drivers/platform/x86/amd/pmc/pmc.c index d789d6cab7948..0329fafe14ebc 100644 --- a/drivers/platform/x86/amd/pmc/pmc.c +++ b/drivers/platform/x86/amd/pmc/pmc.c @@ -644,10 +644,9 @@ static void amd_pmc_s2idle_check(void) struct smu_metrics table; int rc; - /* CZN: Ensure that future s0i3 entry attempts at least 10ms passed */ - if (pdev->cpu_id == AMD_CPU_ID_CZN && !get_metrics_table(pdev, &table) && - table.s0i3_last_entry_status) - usleep_range(10000, 20000); + /* Avoid triggering OVP */ + if (!get_metrics_table(pdev, &table) && table.s0i3_last_entry_status) + msleep(2500); /* Dump the IdleMask before we add to the STB */ amd_pmc_idlemask_read(pdev, pdev->dev, NULL); From 8d6955ed76e8a47115f2ea1d9c263ee6f505d737 Mon Sep 17 00:00:00 2001 From: Shouye Liu Date: Thu, 17 Apr 2025 11:23:21 +0800 Subject: [PATCH 290/770] platform/x86/intel-uncore-freq: Fix missing uncore sysfs during CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In certain situations, the sysfs for uncore may not be present when all CPUs in a package are offlined and then brought back online after boot. This issue can occur if there is an error in adding the sysfs entry due to a memory allocation failure. Retrying to bring the CPUs online will not resolve the issue, as the uncore_cpu_mask is already set for the package before the failure condition occurs. This issue does not occur if the failure happens during module initialization, as the module will fail to load in the event of any error.
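
In outline, the broken ordering looks like this; a schematic sketch with hypothetical names, not the driver's actual code (which follows in the diff below). The hotplug callback commits its per-package claim before the step that can fail, so the claim outlives the failure:

	/* schematic of the bug: the control-CPU mask bit is set before the
	 * fallible sysfs registration; a later online attempt sees the bit,
	 * assumes the package is already handled, and returns early
	 */
	static int cpu_online_cb(unsigned int cpu)
	{
		if (package_already_claimed(cpu))
			return 0;
		cpumask_set_cpu(cpu, &pkg_mask);	/* committed too early */
		return add_sysfs_entry(cpu);		/* may fail with -ENOMEM */
	}
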
To address this, ensure that the uncore_cpu_mask is not set until the successful return of uncore_freq_add_entry(). Fixes: dbce412a7733 ("platform/x86/intel-uncore-freq: Split common and enumeration part") Signed-off-by: Shouye Liu Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250417032321.75580-1-shouyeliu@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../x86/intel/uncore-frequency/uncore-frequency.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c index 40bbf8e45fa4b..bdee5d00f30b8 100644 --- a/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c +++ b/drivers/platform/x86/intel/uncore-frequency/uncore-frequency.c @@ -146,15 +146,13 @@ static int uncore_event_cpu_online(unsigned int cpu) { struct uncore_data *data; int target; + int ret; /* Check if there is an online cpu in the package for uncore MSR */ target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu)); if (target < nr_cpu_ids) return 0; - /* Use this CPU on this die as a control CPU */ - cpumask_set_cpu(cpu, &uncore_cpu_mask); - data = uncore_get_instance(cpu); if (!data) return 0; @@ -163,7 +161,14 @@ static int uncore_event_cpu_online(unsigned int cpu) data->die_id = topology_die_id(cpu); data->domain_id = UNCORE_DOMAIN_ID_INVALID; - return uncore_freq_add_entry(data, cpu); + ret = uncore_freq_add_entry(data, cpu); + if (ret) + return ret; + + /* Use this CPU on this die as a control CPU */ + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + return 0; } static int uncore_event_cpu_offline(unsigned int cpu) From 4a8e04e2bdcb98d513e97b039899bda03b07bcf2 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Wed, 16 Apr 2025 13:50:23 -0300 Subject: [PATCH 291/770] platform/x86: alienware-wmi-wmax: Fix uninitialized variable due to bad error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit wmax_thermal_information() may also return -ENOMSG, which would leave `id` uninitialized in thermal_profile_probe. Reorder and modify logic to catch all errors. 
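
Schematically, the rule being applied is that only the status code acting as a list terminator may be swallowed; every other failure must propagate. This is a sketch with hypothetical names, the actual change is in the diff below:

	/* sketch: -EBADRQC marks the end of the list; any other error must
	 * propagate, otherwise out_data is consumed while still
	 * uninitialized (the -ENOMSG case that previously slipped through)
	 */
	for (u32 i = 0; i < count; i++) {
		ret = query_thermal_mode(i, &out_data);
		if (ret == -EBADRQC)
			break;		/* expected terminator */
		if (ret)
			return ret;	/* -EIO, -ENOMSG, anything else */
		handle(out_data);
	}
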
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/Z_-KVqNbD9ygvE2X@stanley.mountain Fixes: 27e9e6339896 ("platform/x86: alienware-wmi: Refactor thermal control methods") Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250416-smatch-fix-v1-1-35491b462d8f@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index 0c3be03385f89..3f9e1e986ecf0 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -655,12 +655,10 @@ static int thermal_profile_probe(void *drvdata, unsigned long *choices) for (u32 i = 0; i < sys_desc[3]; i++) { ret = wmax_thermal_information(priv->wdev, WMAX_OPERATION_LIST_IDS, i + first_mode, &out_data); - - if (ret == -EIO) - return ret; - if (ret == -EBADRQC) break; + if (ret) + return ret; if (!is_wmax_thermal_code(out_data)) continue; From a1b669ea16c4d7c1a1a8fc7e25aaf651ea0078c3 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:57 -0700 Subject: [PATCH 292/770] bpf: Prepare to reuse get_ctx_arg_idx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename get_ctx_arg_idx to bpf_ctx_arg_idx, and allow others to call it. No functional change. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-2-ameryhung@gmail.com --- include/linux/btf.h | 1 + kernel/bpf/btf.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/btf.h b/include/linux/btf.h index ebc0c0c9b9446..b2983706292f7 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -522,6 +522,7 @@ bool btf_param_match_suffix(const struct btf *btf, const char *suffix); int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, u32 arg_no); +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, int off); struct bpf_verifier_log; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 16ba36f34dfab..ac6c8ed08de3c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6391,8 +6391,8 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t) return btf_type_is_int(t); } -static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, - int off) +u32 btf_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, + int off) { const struct btf_param *args; const struct btf_type *t; @@ -6671,7 +6671,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, tname, off); return false; } - arg = get_ctx_arg_idx(btf, t, off); + arg = btf_ctx_arg_idx(btf, t, off); args = (const struct btf_param *)(t + 1); /* if (t == NULL) Fall back to default BPF prog with * MAX_BPF_FUNC_REG_ARGS u64 arguments. From c8240344956e3f0b4e8f1d40ec3435e47040cacb Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:58 -0700 Subject: [PATCH 293/770] bpf: net_sched: Support implementation of Qdisc_ops in bpf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent advancements in bpf, such as allocated objects, bpf list and bpf rbtree, have provided powerful and flexible building blocks for realizing sophisticated packet scheduling algorithms.
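
To make the intended usage concrete, here is a rough, untested sketch of the BPF side of such a qdisc, written against the ops and the bpf_qdisc_skb_drop() kfunc added later in this series; the program names, the trivial drop-everything policy, and the exact section conventions are illustrative assumptions, not part of this patch:

	// SPDX-License-Identifier: GPL-2.0
	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* kfunc introduced later in this series */
	void bpf_qdisc_skb_drop(struct sk_buff *skb,
				struct bpf_sk_buff_ptr *to_free) __ksym;

	SEC("struct_ops/drop_enqueue")
	int BPF_PROG(drop_enqueue, struct sk_buff *skb, struct Qdisc *sch,
		     struct bpf_sk_buff_ptr *to_free)
	{
		/* toy policy: hand every skb to the deferred free list */
		bpf_qdisc_skb_drop(skb, to_free);
		return 1; /* NET_XMIT_DROP */
	}

	SEC("struct_ops/drop_dequeue")
	struct sk_buff *BPF_PROG(drop_dequeue, struct Qdisc *sch)
	{
		return NULL; /* nothing is ever queued */
	}

	SEC(".struct_ops.link")
	struct Qdisc_ops drop_all = {
		.enqueue = (void *)drop_enqueue,
		.dequeue = (void *)drop_dequeue,
		.id = "bpf_drop_all",
	};

	char _license[] SEC("license") = "GPL";

Once loaded and its struct_ops link attached, such a qdisc would be instantiated like any other, e.g. "tc qdisc add dev lo handle 1: root bpf_drop_all", matching the id registered above.
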
As struct_ops now supports core operators in Qdisc_ops, start allowing qdisc to be implemented using bpf struct_ops with this patch. Users can implement Qdisc_ops.{enqueue, dequeue, init, reset, destroy} in bpf and register the qdisc dynamically into the kernel. Co-developed-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Cong Wang Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-3-ameryhung@gmail.com --- net/sched/Kconfig | 12 +++ net/sched/Makefile | 1 + net/sched/bpf_qdisc.c | 216 ++++++++++++++++++++++++++++++++++++++++ net/sched/sch_api.c | 7 +- net/sched/sch_generic.c | 3 +- 5 files changed, 235 insertions(+), 4 deletions(-) create mode 100644 net/sched/bpf_qdisc.c diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 8180d0c12fcea..ccd0255da5a54 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,18 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disciplines. With BPF struct_ops, + users can implement supported operators in Qdisc_ops using BPF programs. + The queue holding skbs can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486e..904d784902d14 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 0000000000000..7e5cb7294f406 --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct Qdisc, limit); + break; + case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct 
Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... + offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops *qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = 
bpf_qdisc_unreg, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +static int __init bpf_qdisc_kfunc_init(void) +{ + return register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f74a097f54ae7..db6330258dda9 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -359,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -1370,7 +1371,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1782,7 +1783,7 @@ static void request_qdisc_module(struct nlattr *kind) ops = qdisc_lookup_ops(kind); if (ops) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 14ab2f4c190a1..e6fda9f20272a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); - module_put(ops->owner); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); From 870c28588afa20d786e2c26e8fcc14e2b9a55616 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:45:59 -0700 Subject: [PATCH 294/770] bpf: net_sched: Add basic bpf qdisc kfuncs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic kfuncs for working on skb in qdisc. Both bpf_qdisc_skb_drop() and bpf_kfree_skb() can be used to release a reference to an skb. However, bpf_qdisc_skb_drop() can only be called in .enqueue where a to_free skb list is available from kernel to defer the release. bpf_kfree_skb() should be used elsewhere. It is also used in bpf_obj_free_fields() when cleaning up skb in maps and collections. bpf_skb_get_hash() returns the flow hash of an skb, which can be used to build flow-based queueing algorithms. Finally, allow users to create read-only dynptr via bpf_dynptr_from_skb(). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-4-ameryhung@gmail.com --- net/sched/bpf_qdisc.c | 111 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index 7e5cb7294f406..5ef13cc635602 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -8,6 +8,9 @@ #include #include +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + static struct bpf_struct_ops bpf_Qdisc_ops; struct bpf_sk_buff_ptr { @@ -123,6 +126,95 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, return 0; } +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. 
+ * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference to be released and dropped. + */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_qdisc_is_valid_access, @@ -209,8 +301,25 @@ static struct bpf_struct_ops bpf_Qdisc_ops = { .owner = THIS_MODULE, }; +BTF_ID_LIST(bpf_sk_buff_dtor_ids) +BTF_ID(func, bpf_kfree_skb) + static int __init bpf_qdisc_kfunc_init(void) { - return register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; } late_initcall(bpf_qdisc_kfunc_init); From 7a2dafda950b78611dc441c83d105dfdc7082681 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:46:00 -0700 Subject: [PATCH 295/770] bpf: net_sched: Add a qdisc watchdog timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Add a watchdog timer to bpf qdisc. The watchdog can be used to schedule the execution of qdisc through kfunc, bpf_qdisc_schedule(). It can be useful for building traffic shaping scheduling algorithm, where the time the next packet will be dequeued is known. The implementation relies on struct_ops gen_prologue/epilogue to patch bpf programs provided by users. Operator specific prologue/epilogue kfuncs are introduced instead of watchdog kfuncs so that it is easier to extend prologue/epilogue in the future (writing C vs BPF bytecode). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-5-ameryhung@gmail.com --- net/sched/bpf_qdisc.c | 106 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 105 insertions(+), 1 deletion(-) diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c index 5ef13cc635602..0790f1248bd82 100644 --- a/net/sched/bpf_qdisc.c +++ b/net/sched/bpf_qdisc.c @@ -13,6 +13,10 @@ static struct bpf_struct_ops bpf_Qdisc_ops; +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + struct bpf_sk_buff_ptr { struct sk_buff *skb; }; @@ -126,6 +130,56 @@ static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, return 0; } +BTF_ID_LIST(bpf_qdisc_init_prologue_ids) +BTF_ID(func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1); + * r1 = r6; // r1 will be "u64 *ctx". + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids) +BTF_ID(func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + __bpf_kfunc_start_defs(); /* bpf_skb_get_hash - Get the flow hash of an skb. @@ -154,6 +208,36 @@ __bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, __qdisc_drop(skb, (struct sk_buff **)to_free_list); } +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. 
+ */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc void bpf_qdisc_init_prologue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_init(&q->watchdog, sch); +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(qdisc_kfunc_ids) @@ -161,6 +245,9 @@ BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) BTF_KFUNCS_END(qdisc_kfunc_ids) BTF_SET_START(qdisc_common_kfunc_set) @@ -171,16 +258,22 @@ BTF_SET_END(qdisc_common_kfunc_set) BTF_SET_START(qdisc_enqueue_kfunc_set) BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) BTF_SET_END(qdisc_enqueue_kfunc_set) +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_dequeue_kfunc_set) + enum qdisc_ops_kf_flags { QDISC_OPS_KF_COMMON = 0, QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, }; static const u32 qdisc_ops_context_flags[] = { [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, - [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, @@ -203,6 +296,10 @@ static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) return 0; + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) return 0; @@ -219,6 +316,8 @@ static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { .get_func_proto = bpf_base_func_proto, .is_valid_access = bpf_qdisc_is_valid_access, .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, }; static int bpf_qdisc_init_member(const struct btf_type *t, @@ -234,6 +333,11 @@ static int bpf_qdisc_init_member(const struct btf_type *t, moff = __btf_member_bit_offset(t, member) / 8; switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; case offsetof(struct Qdisc_ops, peek): qdisc_ops->peek = qdisc_peek_dequeued; return 0; From 544e0a1f1e56de5a28251c188aa8f78fe50b31c9 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:46:01 -0700 Subject: [PATCH 296/770] bpf: net_sched: Support updating bstats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a kfunc to update Qdisc bstats when an skb is dequeued. The kfunc is only available in .dequeue programs. 
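As an illustration (a minimal sketch, not part of this patch; the list plumbing, i.e. skb_node, q_lock and q_head, is assumed to exist exactly as in the fifo selftest added later in this series), a .dequeue program calls the kfunc right before handing the skb back to the kernel:

	/* Sketch: pop an skb from a bpf list and account for it.
	 * bpf_qdisc_bstats_update() is the kfunc added here; everything
	 * else mirrors the fifo selftest later in this series.
	 */
	SEC("struct_ops/example_dequeue")
	struct sk_buff *BPF_PROG(example_dequeue, struct Qdisc *sch)
	{
		struct bpf_list_node *node;
		struct sk_buff *skb = NULL;
		struct skb_node *skbn;

		bpf_spin_lock(&q_lock);
		node = bpf_list_pop_front(&q_head);
		bpf_spin_unlock(&q_lock);
		if (!node)
			return NULL;

		skbn = container_of(node, struct skb_node, node);
		skb = bpf_kptr_xchg(&skbn->skb, skb);
		bpf_obj_drop(skbn);
		if (!skb)
			return NULL;

		sch->q.qlen--;
		bpf_qdisc_bstats_update(sch, skb); /* only allowed in .dequeue */
		return skb;
	}
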
Signed-off-by: Amery Hung
Signed-off-by: Martin KaFai Lau
Acked-by: Toke Høiland-Jørgensen
Link: https://patch.msgid.link/20250409214606.2000194-6-ameryhung@gmail.com
---
 net/sched/bpf_qdisc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index 0790f1248bd82..dcbd1c9276345 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -238,6 +238,15 @@ __bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch)
 	qdisc_watchdog_cancel(&q->watchdog);
 }
 
+/* bpf_qdisc_bstats_update - Update Qdisc basic statistics
+ * @sch: The qdisc from which an skb is dequeued.
+ * @skb: The skb to be dequeued.
+ */
+__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb)
+{
+	bstats_update(&sch->bstats, skb);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(qdisc_kfunc_ids)
@@ -248,6 +257,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(qdisc_kfunc_ids)
 
 BTF_SET_START(qdisc_common_kfunc_set)
@@ -263,6 +273,7 @@ BTF_SET_END(qdisc_enqueue_kfunc_set)
 
 BTF_SET_START(qdisc_dequeue_kfunc_set)
 BTF_ID(func, bpf_qdisc_watchdog_schedule)
+BTF_ID(func, bpf_qdisc_bstats_update)
 BTF_SET_END(qdisc_dequeue_kfunc_set)
 
 enum qdisc_ops_kf_flags {

From e582778f023b4c9e608b5cca135f17f62b0cd115 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 9 Apr 2025 14:46:02 -0700
Subject: [PATCH 297/770] bpf: net_sched: Disable attaching bpf qdisc to non root
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not allow users to attach bpf qdiscs to classful qdiscs. This is to prevent accidentally breaking existing classful qdiscs if they rely on some data in the child qdisc. This restriction can potentially be lifted in the future. Note that we still allow a bpf qdisc to be attached to mq.

Signed-off-by: Amery Hung
Signed-off-by: Martin KaFai Lau
Acked-by: Toke Høiland-Jørgensen
Link: https://patch.msgid.link/20250409214606.2000194-7-ameryhung@gmail.com
---
 net/sched/bpf_qdisc.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
index dcbd1c9276345..9f32b305636fc 100644
--- a/net/sched/bpf_qdisc.c
+++ b/net/sched/bpf_qdisc.c
@@ -142,13 +142,19 @@ static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
 		return 0;
 
 	/* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx".
+	 * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack"
 	 * r1 = r1[0]; // r1 will be "struct Qdisc *sch"
-	 * r0 = bpf_qdisc_init_prologue(r1);
+	 * r0 = bpf_qdisc_init_prologue(r1, r2);
+	 * if r0 == 0 goto pc+1;
+	 * BPF_EXIT;
 	 * r1 = r6; // r1 will be "u64 *ctx".
 	 */
 	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16);
 	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0);
 	*insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]);
+	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1);
+	*insn++ = BPF_EXIT_INSN();
 	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
 	*insn++ = prog->insnsi[0];
 
@@ -221,11 +227,26 @@ __bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64
 }
 
 /* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init.
*/ -__bpf_kfunc void bpf_qdisc_init_prologue(struct Qdisc *sch) +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) { struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + if (sch->parent != TC_H_ROOT) { + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (!p) + return -ENOENT; + + if (!(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } qdisc_watchdog_init(&q->watchdog, sch); + return 0; } /* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset From 4b15121da7e5217636ae2edccb12cf7fc49709b3 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:46:03 -0700 Subject: [PATCH 298/770] libbpf: Support creating and destroying qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend struct bpf_tc_hook with handle, qdisc name and a new attach type, BPF_TC_QDISC, to allow users to add or remove any qdisc specified in addition to clsact. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-8-ameryhung@gmail.com --- tools/lib/bpf/libbpf.h | 5 ++++- tools/lib/bpf/netlink.c | 20 +++++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index e0605403f9773..fdcee6a71e0fa 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -1283,6 +1283,7 @@ enum bpf_tc_attach_point { BPF_TC_INGRESS = 1 << 0, BPF_TC_EGRESS = 1 << 1, BPF_TC_CUSTOM = 1 << 2, + BPF_TC_QDISC = 1 << 3, }; #define BPF_TC_PARENT(a, b) \ @@ -1297,9 +1298,11 @@ struct bpf_tc_hook { int ifindex; enum bpf_tc_attach_point attach_point; __u32 parent; + __u32 handle; + const char *qdisc; size_t :0; }; -#define bpf_tc_hook__last_field parent +#define bpf_tc_hook__last_field qdisc struct bpf_tc_opts { size_t sz; diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 68a2def171751..c997e69d507fe 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -529,9 +529,9 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) } -typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); +typedef int (*qdisc_config_t)(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook); -static int clsact_config(struct libbpf_nla_req *req) +static int clsact_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook) { req->tc.tcm_parent = TC_H_CLSACT; req->tc.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0); @@ -539,6 +539,16 @@ static int clsact_config(struct libbpf_nla_req *req) return nlattr_add(req, TCA_KIND, "clsact", sizeof("clsact")); } +static int qdisc_config(struct libbpf_nla_req *req, const struct bpf_tc_hook *hook) +{ + const char *qdisc = OPTS_GET(hook, qdisc, NULL); + + req->tc.tcm_parent = OPTS_GET(hook, parent, TC_H_ROOT); + req->tc.tcm_handle = OPTS_GET(hook, handle, 0); + + return nlattr_add(req, TCA_KIND, qdisc, strlen(qdisc) + 1); +} + static int attach_point_to_config(struct bpf_tc_hook *hook, qdisc_config_t *config) { @@ -552,6 +562,9 @@ static int attach_point_to_config(struct bpf_tc_hook *hook, return 0; case BPF_TC_CUSTOM: return -EOPNOTSUPP; + case BPF_TC_QDISC: + *config = &qdisc_config; + return 0; default: return -EINVAL; } @@ -596,7 +609,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) req.tc.tcm_family = AF_UNSPEC; req.tc.tcm_ifindex = OPTS_GET(hook, 
ifindex, 0); - ret = config(&req); + ret = config(&req, hook); if (ret < 0) return ret; @@ -639,6 +652,7 @@ int bpf_tc_hook_destroy(struct bpf_tc_hook *hook) case BPF_TC_INGRESS: case BPF_TC_EGRESS: return libbpf_err(__bpf_tc_detach(hook, NULL, true)); + case BPF_TC_QDISC: case BPF_TC_INGRESS | BPF_TC_EGRESS: return libbpf_err(tc_qdisc_delete(hook)); case BPF_TC_CUSTOM: From 11c701639ba95aac909720678bf073eeaf6ef89c Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:46:04 -0700 Subject: [PATCH 299/770] selftests/bpf: Add a basic fifo qdisc test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This selftest includes a bare minimum fifo qdisc, which simply enqueues sk_buffs into the back of a bpf list and dequeues from the front of the list. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-9-ameryhung@gmail.com --- tools/testing/selftests/bpf/config | 1 + .../selftests/bpf/prog_tests/bpf_qdisc.c | 81 ++++++++++++ .../selftests/bpf/progs/bpf_qdisc_common.h | 31 +++++ .../selftests/bpf/progs/bpf_qdisc_fifo.c | 117 ++++++++++++++++++ 4 files changed, 230 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_common.h create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index c378d5d07e029..6b0cab55bd2dc 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -71,6 +71,7 @@ CONFIG_NET_IPGRE=y CONFIG_NET_IPGRE_DEMUX=y CONFIG_NET_IPIP=y CONFIG_NET_MPLS_GSO=y +CONFIG_NET_SCH_BPF=y CONFIG_NET_SCH_FQ=y CONFIG_NET_SCH_INGRESS=y CONFIG_NET_SCHED=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c new file mode 100644 index 0000000000000..1ec321eb089f4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#include "network_helpers.h" +#include "bpf_qdisc_fifo.skel.h" + +#define LO_IFINDEX 1 + +static const unsigned int total_bytes = 10 * 1024 * 1024; + +static void do_test(char *qdisc) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_ROOT, + .handle = 0x8000000, + .qdisc = qdisc); + int srv_fd = -1, cli_fd = -1; + int err; + + err = bpf_tc_hook_create(&hook); + if (!ASSERT_OK(err, "attach qdisc")) + return; + + srv_fd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); + if (!ASSERT_OK_FD(srv_fd, "start server")) + goto done; + + cli_fd = connect_to_fd(srv_fd, 0); + if (!ASSERT_OK_FD(cli_fd, "connect to client")) + goto done; + + err = send_recv_data(srv_fd, cli_fd, total_bytes); + ASSERT_OK(err, "send_recv_data"); + +done: + if (srv_fd != -1) + close(srv_fd); + if (cli_fd != -1) + close(cli_fd); + + bpf_tc_hook_destroy(&hook); +} + +static void test_fifo(void) +{ + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + do_test("bpf_fifo"); + + bpf_link__destroy(link); + 
bpf_qdisc_fifo__destroy(fifo_skel); +} + +void test_bpf_qdisc(void) +{ + struct netns_obj *netns; + + netns = netns_new("bpf_qdisc_ns", true); + if (!ASSERT_OK_PTR(netns, "netns_new")) + return; + + if (test__start_subtest("fifo")) + test_fifo(); + + netns_free(netns); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h new file mode 100644 index 0000000000000..65a2c561c0bb5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BPF_QDISC_COMMON_H +#define _BPF_QDISC_COMMON_H + +#define NET_XMIT_SUCCESS 0x00 +#define NET_XMIT_DROP 0x01 /* skb dropped */ +#define NET_XMIT_CN 0x02 /* congestion notification */ + +#define TC_PRIO_CONTROL 7 +#define TC_PRIO_MAX 15 + +#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) + +u32 bpf_skb_get_hash(struct sk_buff *p) __ksym; +void bpf_kfree_skb(struct sk_buff *p) __ksym; +void bpf_qdisc_skb_drop(struct sk_buff *p, struct bpf_sk_buff_ptr *to_free) __ksym; +void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) __ksym; +void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) __ksym; + +static struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) +{ + return (struct qdisc_skb_cb *)skb->cb; +} + +static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) +{ + return qdisc_skb_cb(skb)->pkt_len; +} + +#endif diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c new file mode 100644 index 0000000000000..0c7cfb82dae19 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +struct skb_node { + struct sk_buff __kptr * skb; + struct bpf_list_node node; +}; + +private(A) struct bpf_spin_lock q_fifo_lock; +private(A) struct bpf_list_head q_fifo __contains(skb_node, node); + +SEC("struct_ops/bpf_fifo_enqueue") +int BPF_PROG(bpf_fifo_enqueue, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct skb_node *skbn; + u32 pkt_len; + + if (sch->q.qlen == sch->limit) + goto drop; + + skbn = bpf_obj_new(typeof(*skbn)); + if (!skbn) + goto drop; + + pkt_len = qdisc_pkt_len(skb); + + sch->q.qlen++; + skb = bpf_kptr_xchg(&skbn->skb, skb); + if (skb) + bpf_qdisc_skb_drop(skb, to_free); + + bpf_spin_lock(&q_fifo_lock); + bpf_list_push_back(&q_fifo, &skbn->node); + bpf_spin_unlock(&q_fifo_lock); + + sch->qstats.backlog += pkt_len; + return NET_XMIT_SUCCESS; +drop: + bpf_qdisc_skb_drop(skb, to_free); + return NET_XMIT_DROP; +} + +SEC("struct_ops/bpf_fifo_dequeue") +struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch) +{ + struct bpf_list_node *node; + struct sk_buff *skb = NULL; + struct skb_node *skbn; + + bpf_spin_lock(&q_fifo_lock); + node = bpf_list_pop_front(&q_fifo); + bpf_spin_unlock(&q_fifo_lock); + if (!node) + return NULL; + + skbn = container_of(node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + if (!skb) + return NULL; + + sch->qstats.backlog -= qdisc_pkt_len(skb); + bpf_qdisc_bstats_update(sch, skb); + sch->q.qlen--; + + return skb; +} + +SEC("struct_ops/bpf_fifo_init") +int BPF_PROG(bpf_fifo_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + sch->limit = 1000; + 
return 0;
+}
+
+SEC("struct_ops/bpf_fifo_reset")
+void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch)
+{
+	struct bpf_list_node *node;
+	struct skb_node *skbn;
+	int i;
+
+	bpf_for(i, 0, sch->q.qlen) {
+		struct sk_buff *skb = NULL;
+
+		bpf_spin_lock(&q_fifo_lock);
+		node = bpf_list_pop_front(&q_fifo);
+		bpf_spin_unlock(&q_fifo_lock);
+
+		if (!node)
+			break;
+
+		skbn = container_of(node, struct skb_node, node);
+		skb = bpf_kptr_xchg(&skbn->skb, skb);
+		if (skb)
+			bpf_kfree_skb(skb);
+		bpf_obj_drop(skbn);
+	}
+	sch->q.qlen = 0;
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops fifo = {
+	.enqueue = (void *)bpf_fifo_enqueue,
+	.dequeue = (void *)bpf_fifo_dequeue,
+	.init = (void *)bpf_fifo_init,
+	.reset = (void *)bpf_fifo_reset,
+	.id = "bpf_fifo",
+};

From 2b59bd9e4efcfdbb2458bb68da67f7bb40951de9 Mon Sep 17 00:00:00 2001
From: Amery Hung
Date: Wed, 9 Apr 2025 14:46:05 -0700
Subject: [PATCH 300/770] selftests/bpf: Add a bpf fq qdisc to selftest
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This test implements a more sophisticated qdisc using bpf. The bpf fair-queueing (fq) qdisc gives each flow an equal chance to transmit data. It also respects the timestamp of skb for rate limiting.

Signed-off-by: Amery Hung
Signed-off-by: Martin KaFai Lau
Acked-by: Toke Høiland-Jørgensen
Link: https://patch.msgid.link/20250409214606.2000194-10-ameryhung@gmail.com
---
 .../selftests/bpf/prog_tests/bpf_qdisc.c      |  24 +
 .../selftests/bpf/progs/bpf_qdisc_fq.c        | 750 ++++++++++++++++++
 2 files changed, 774 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
index 1ec321eb089f4..230d8f935303b 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
@@ -6,6 +6,7 @@
 
 #include "network_helpers.h"
 #include "bpf_qdisc_fifo.skel.h"
+#include "bpf_qdisc_fq.skel.h"
 
 #define LO_IFINDEX 1
 
@@ -66,6 +67,27 @@ static void test_fifo(void)
 	bpf_qdisc_fifo__destroy(fifo_skel);
 }
 
+static void test_fq(void)
+{
+	struct bpf_qdisc_fq *fq_skel;
+	struct bpf_link *link;
+
+	fq_skel = bpf_qdisc_fq__open_and_load();
+	if (!ASSERT_OK_PTR(fq_skel, "bpf_qdisc_fq__open_and_load"))
+		return;
+
+	link = bpf_map__attach_struct_ops(fq_skel->maps.fq);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
+		bpf_qdisc_fq__destroy(fq_skel);
+		return;
+	}
+
+	do_test("bpf_fq");
+
+	bpf_link__destroy(link);
+	bpf_qdisc_fq__destroy(fq_skel);
+}
+
 void test_bpf_qdisc(void)
 {
 	struct netns_obj *netns;
@@ -76,6 +98,8 @@
 
 	if (test__start_subtest("fifo"))
 		test_fifo();
+	if (test__start_subtest("fq"))
+		test_fq();
 
 	netns_free(netns);
 }
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c
new file mode 100644
index 0000000000000..7c110a1562247
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fq.c
@@ -0,0 +1,750 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* bpf_fq is intended for testing the bpf qdisc infrastructure and not a direct
+ * copy of sch_fq. bpf_fq implements the scheduling algorithm of sch_fq before
+ * 29f834aa326e ("net_sched: sch_fq: add 3 bands and WRR scheduling") was
+ * introduced. It gives each flow a fair chance to transmit packets in a
+ * round-robin fashion. Note that for flow pacing, bpf_fq currently only
+ * respects skb->tstamp but not skb->sk->sk_pacing_rate.
In addition, if there + * are multiple bpf_fq instances, they will have a shared view of flows and + * configuration since some key data structure such as fq_prio_flows, + * fq_nonprio_flows, and fq_bpf_data are global. + * + * To use bpf_fq alone without running selftests, use the following commands. + * + * 1. Register bpf_fq to the kernel + * bpftool struct_ops register bpf_qdisc_fq.bpf.o /sys/fs/bpf + * 2. Add bpf_fq to an interface + * tc qdisc add dev root handle bpf_fq + * 3. Delete bpf_fq attached to the interface + * tc qdisc delete dev root + * 4. Unregister bpf_fq + * bpftool struct_ops unregister name fq + * + * The qdisc name, bpf_fq, used in tc commands is defined by Qdisc_ops.id. + * The struct_ops_map_name, fq, used in the bpftool command is the name of the + * Qdisc_ops. + * + * SEC(".struct_ops") + * struct Qdisc_ops fq = { + * ... + * .id = "bpf_fq", + * }; + */ + +#include +#include +#include +#include "bpf_experimental.h" +#include "bpf_qdisc_common.h" + +char _license[] SEC("license") = "GPL"; + +#define NSEC_PER_USEC 1000L +#define NSEC_PER_SEC 1000000000L + +#define NUM_QUEUE (1 << 20) + +struct fq_bpf_data { + u32 quantum; + u32 initial_quantum; + u32 flow_refill_delay; + u32 flow_plimit; + u64 horizon; + u32 orphan_mask; + u32 timer_slack; + u64 time_next_delayed_flow; + u64 unthrottle_latency_ns; + u8 horizon_drop; + u32 new_flow_cnt; + u32 old_flow_cnt; + u64 ktime_cache; +}; + +enum { + CLS_RET_PRIO = 0, + CLS_RET_NONPRIO = 1, + CLS_RET_ERR = 2, +}; + +struct skb_node { + u64 tstamp; + struct sk_buff __kptr * skb; + struct bpf_rb_node node; +}; + +struct fq_flow_node { + int credit; + u32 qlen; + u64 age; + u64 time_next_packet; + struct bpf_list_node list_node; + struct bpf_rb_node rb_node; + struct bpf_rb_root queue __contains(skb_node, node); + struct bpf_spin_lock lock; + struct bpf_refcount refcount; +}; + +struct dequeue_nonprio_ctx { + bool stop_iter; + u64 expire; + u64 now; +}; + +struct remove_flows_ctx { + bool gc_only; + u32 reset_cnt; + u32 reset_max; +}; + +struct unset_throttled_flows_ctx { + bool unset_all; + u64 now; +}; + +struct fq_stashed_flow { + struct fq_flow_node __kptr * flow; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, struct fq_stashed_flow); + __uint(max_entries, NUM_QUEUE); +} fq_nonprio_flows SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, struct fq_stashed_flow); + __uint(max_entries, 1); +} fq_prio_flows SEC(".maps"); + +private(A) struct bpf_spin_lock fq_delayed_lock; +private(A) struct bpf_rb_root fq_delayed __contains(fq_flow_node, rb_node); + +private(B) struct bpf_spin_lock fq_new_flows_lock; +private(B) struct bpf_list_head fq_new_flows __contains(fq_flow_node, list_node); + +private(C) struct bpf_spin_lock fq_old_flows_lock; +private(C) struct bpf_list_head fq_old_flows __contains(fq_flow_node, list_node); + +private(D) struct fq_bpf_data q; + +/* Wrapper for bpf_kptr_xchg that expects NULL dst */ +static void bpf_kptr_xchg_back(void *map_val, void *ptr) +{ + void *ret; + + ret = bpf_kptr_xchg(map_val, ptr); + if (ret) + bpf_obj_drop(ret); +} + +static bool skbn_tstamp_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + struct skb_node *skbn_a; + struct skb_node *skbn_b; + + skbn_a = container_of(a, struct skb_node, node); + skbn_b = container_of(b, struct skb_node, node); + + return skbn_a->tstamp < skbn_b->tstamp; +} + +static bool fn_time_next_packet_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) +{ + 
struct fq_flow_node *flow_a; + struct fq_flow_node *flow_b; + + flow_a = container_of(a, struct fq_flow_node, rb_node); + flow_b = container_of(b, struct fq_flow_node, rb_node); + + return flow_a->time_next_packet < flow_b->time_next_packet; +} + +static void +fq_flows_add_head(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct fq_flow_node *flow, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + bpf_list_push_front(head, &flow->list_node); + bpf_spin_unlock(lock); + *flow_cnt += 1; +} + +static void +fq_flows_add_tail(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct fq_flow_node *flow, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + bpf_list_push_back(head, &flow->list_node); + bpf_spin_unlock(lock); + *flow_cnt += 1; +} + +static void +fq_flows_remove_front(struct bpf_list_head *head, struct bpf_spin_lock *lock, + struct bpf_list_node **node, u32 *flow_cnt) +{ + bpf_spin_lock(lock); + *node = bpf_list_pop_front(head); + bpf_spin_unlock(lock); + *flow_cnt -= 1; +} + +static bool +fq_flows_is_empty(struct bpf_list_head *head, struct bpf_spin_lock *lock) +{ + struct bpf_list_node *node; + + bpf_spin_lock(lock); + node = bpf_list_pop_front(head); + if (node) { + bpf_list_push_front(head, node); + bpf_spin_unlock(lock); + return false; + } + bpf_spin_unlock(lock); + + return true; +} + +/* flow->age is used to denote the state of the flow (not-detached, detached, throttled) + * as well as the timestamp when the flow is detached. + * + * 0: not-detached + * 1 - (~0ULL-1): detached + * ~0ULL: throttled + */ +static void fq_flow_set_detached(struct fq_flow_node *flow) +{ + flow->age = bpf_jiffies64(); +} + +static bool fq_flow_is_detached(struct fq_flow_node *flow) +{ + return flow->age != 0 && flow->age != ~0ULL; +} + +static bool sk_listener(struct sock *sk) +{ + return (1 << sk->__sk_common.skc_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); +} + +static void fq_gc(void); + +static int fq_new_flow(void *flow_map, struct fq_stashed_flow **sflow, u64 hash) +{ + struct fq_stashed_flow tmp = {}; + struct fq_flow_node *flow; + int ret; + + flow = bpf_obj_new(typeof(*flow)); + if (!flow) + return -ENOMEM; + + flow->credit = q.initial_quantum, + flow->qlen = 0, + flow->age = 1, + flow->time_next_packet = 0, + + ret = bpf_map_update_elem(flow_map, &hash, &tmp, 0); + if (ret == -ENOMEM || ret == -E2BIG) { + fq_gc(); + bpf_map_update_elem(&fq_nonprio_flows, &hash, &tmp, 0); + } + + *sflow = bpf_map_lookup_elem(flow_map, &hash); + if (!*sflow) { + bpf_obj_drop(flow); + return -ENOMEM; + } + + bpf_kptr_xchg_back(&(*sflow)->flow, flow); + return 0; +} + +static int +fq_classify(struct sk_buff *skb, struct fq_stashed_flow **sflow) +{ + struct sock *sk = skb->sk; + int ret = CLS_RET_NONPRIO; + u64 hash = 0; + + if ((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL) { + *sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); + ret = CLS_RET_PRIO; + } else { + if (!sk || sk_listener(sk)) { + hash = bpf_skb_get_hash(skb) & q.orphan_mask; + /* Avoid collision with an existing flow hash, which + * only uses the lower 32 bits of hash, by setting the + * upper half of hash to 1. + */ + hash |= (1ULL << 32); + } else if (sk->__sk_common.skc_state == TCP_CLOSE) { + hash = bpf_skb_get_hash(skb) & q.orphan_mask; + hash |= (1ULL << 32); + } else { + hash = sk->__sk_common.skc_hash; + } + *sflow = bpf_map_lookup_elem(&fq_nonprio_flows, &hash); + } + + if (!*sflow) + ret = fq_new_flow(&fq_nonprio_flows, sflow, hash) < 0 ? 
+ CLS_RET_ERR : CLS_RET_NONPRIO; + + return ret; +} + +static bool fq_packet_beyond_horizon(struct sk_buff *skb) +{ + return (s64)skb->tstamp > (s64)(q.ktime_cache + q.horizon); +} + +SEC("struct_ops/bpf_fq_enqueue") +int BPF_PROG(bpf_fq_enqueue, struct sk_buff *skb, struct Qdisc *sch, + struct bpf_sk_buff_ptr *to_free) +{ + struct fq_flow_node *flow = NULL, *flow_copy; + struct fq_stashed_flow *sflow; + u64 time_to_send, jiffies; + struct skb_node *skbn; + int ret; + + if (sch->q.qlen >= sch->limit) + goto drop; + + if (!skb->tstamp) { + time_to_send = q.ktime_cache = bpf_ktime_get_ns(); + } else { + if (fq_packet_beyond_horizon(skb)) { + q.ktime_cache = bpf_ktime_get_ns(); + if (fq_packet_beyond_horizon(skb)) { + if (q.horizon_drop) + goto drop; + + skb->tstamp = q.ktime_cache + q.horizon; + } + } + time_to_send = skb->tstamp; + } + + ret = fq_classify(skb, &sflow); + if (ret == CLS_RET_ERR) + goto drop; + + flow = bpf_kptr_xchg(&sflow->flow, flow); + if (!flow) + goto drop; + + if (ret == CLS_RET_NONPRIO) { + if (flow->qlen >= q.flow_plimit) { + bpf_kptr_xchg_back(&sflow->flow, flow); + goto drop; + } + + if (fq_flow_is_detached(flow)) { + flow_copy = bpf_refcount_acquire(flow); + + jiffies = bpf_jiffies64(); + if ((s64)(jiffies - (flow_copy->age + q.flow_refill_delay)) > 0) { + if (flow_copy->credit < q.quantum) + flow_copy->credit = q.quantum; + } + flow_copy->age = 0; + fq_flows_add_tail(&fq_new_flows, &fq_new_flows_lock, flow_copy, + &q.new_flow_cnt); + } + } + + skbn = bpf_obj_new(typeof(*skbn)); + if (!skbn) { + bpf_kptr_xchg_back(&sflow->flow, flow); + goto drop; + } + + skbn->tstamp = skb->tstamp = time_to_send; + + sch->qstats.backlog += qdisc_pkt_len(skb); + + skb = bpf_kptr_xchg(&skbn->skb, skb); + if (skb) + bpf_qdisc_skb_drop(skb, to_free); + + bpf_spin_lock(&flow->lock); + bpf_rbtree_add(&flow->queue, &skbn->node, skbn_tstamp_less); + bpf_spin_unlock(&flow->lock); + + flow->qlen++; + bpf_kptr_xchg_back(&sflow->flow, flow); + + sch->q.qlen++; + return NET_XMIT_SUCCESS; + +drop: + bpf_qdisc_skb_drop(skb, to_free); + sch->qstats.drops++; + return NET_XMIT_DROP; +} + +static int fq_unset_throttled_flows(u32 index, struct unset_throttled_flows_ctx *ctx) +{ + struct bpf_rb_node *node = NULL; + struct fq_flow_node *flow; + + bpf_spin_lock(&fq_delayed_lock); + + node = bpf_rbtree_first(&fq_delayed); + if (!node) { + bpf_spin_unlock(&fq_delayed_lock); + return 1; + } + + flow = container_of(node, struct fq_flow_node, rb_node); + if (!ctx->unset_all && flow->time_next_packet > ctx->now) { + q.time_next_delayed_flow = flow->time_next_packet; + bpf_spin_unlock(&fq_delayed_lock); + return 1; + } + + node = bpf_rbtree_remove(&fq_delayed, &flow->rb_node); + + bpf_spin_unlock(&fq_delayed_lock); + + if (!node) + return 1; + + flow = container_of(node, struct fq_flow_node, rb_node); + flow->age = 0; + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + + return 0; +} + +static void fq_flow_set_throttled(struct fq_flow_node *flow) +{ + flow->age = ~0ULL; + + if (q.time_next_delayed_flow > flow->time_next_packet) + q.time_next_delayed_flow = flow->time_next_packet; + + bpf_spin_lock(&fq_delayed_lock); + bpf_rbtree_add(&fq_delayed, &flow->rb_node, fn_time_next_packet_less); + bpf_spin_unlock(&fq_delayed_lock); +} + +static void fq_check_throttled(u64 now) +{ + struct unset_throttled_flows_ctx ctx = { + .unset_all = false, + .now = now, + }; + unsigned long sample; + + if (q.time_next_delayed_flow > now) + return; + + sample = (unsigned long)(now - 
q.time_next_delayed_flow); + q.unthrottle_latency_ns -= q.unthrottle_latency_ns >> 3; + q.unthrottle_latency_ns += sample >> 3; + + q.time_next_delayed_flow = ~0ULL; + bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &ctx, 0); +} + +static struct sk_buff* +fq_dequeue_nonprio_flows(u32 index, struct dequeue_nonprio_ctx *ctx) +{ + u64 time_next_packet, time_to_send; + struct bpf_rb_node *rb_node; + struct sk_buff *skb = NULL; + struct bpf_list_head *head; + struct bpf_list_node *node; + struct bpf_spin_lock *lock; + struct fq_flow_node *flow; + struct skb_node *skbn; + bool is_empty; + u32 *cnt; + + if (q.new_flow_cnt) { + head = &fq_new_flows; + lock = &fq_new_flows_lock; + cnt = &q.new_flow_cnt; + } else if (q.old_flow_cnt) { + head = &fq_old_flows; + lock = &fq_old_flows_lock; + cnt = &q.old_flow_cnt; + } else { + if (q.time_next_delayed_flow != ~0ULL) + ctx->expire = q.time_next_delayed_flow; + goto break_loop; + } + + fq_flows_remove_front(head, lock, &node, cnt); + if (!node) + goto break_loop; + + flow = container_of(node, struct fq_flow_node, list_node); + if (flow->credit <= 0) { + flow->credit += q.quantum; + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + return NULL; + } + + bpf_spin_lock(&flow->lock); + rb_node = bpf_rbtree_first(&flow->queue); + if (!rb_node) { + bpf_spin_unlock(&flow->lock); + is_empty = fq_flows_is_empty(&fq_old_flows, &fq_old_flows_lock); + if (head == &fq_new_flows && !is_empty) { + fq_flows_add_tail(&fq_old_flows, &fq_old_flows_lock, flow, &q.old_flow_cnt); + } else { + fq_flow_set_detached(flow); + bpf_obj_drop(flow); + } + return NULL; + } + + skbn = container_of(rb_node, struct skb_node, node); + time_to_send = skbn->tstamp; + + time_next_packet = (time_to_send > flow->time_next_packet) ? 
+ time_to_send : flow->time_next_packet; + if (ctx->now < time_next_packet) { + bpf_spin_unlock(&flow->lock); + flow->time_next_packet = time_next_packet; + fq_flow_set_throttled(flow); + return NULL; + } + + rb_node = bpf_rbtree_remove(&flow->queue, rb_node); + bpf_spin_unlock(&flow->lock); + + if (!rb_node) + goto add_flow_and_break; + + skbn = container_of(rb_node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + + if (!skb) + goto add_flow_and_break; + + flow->credit -= qdisc_skb_cb(skb)->pkt_len; + flow->qlen--; + +add_flow_and_break: + fq_flows_add_head(head, lock, flow, cnt); + +break_loop: + ctx->stop_iter = true; + return skb; +} + +static struct sk_buff *fq_dequeue_prio(void) +{ + struct fq_flow_node *flow = NULL; + struct fq_stashed_flow *sflow; + struct bpf_rb_node *rb_node; + struct sk_buff *skb = NULL; + struct skb_node *skbn; + u64 hash = 0; + + sflow = bpf_map_lookup_elem(&fq_prio_flows, &hash); + if (!sflow) + return NULL; + + flow = bpf_kptr_xchg(&sflow->flow, flow); + if (!flow) + return NULL; + + bpf_spin_lock(&flow->lock); + rb_node = bpf_rbtree_first(&flow->queue); + if (!rb_node) { + bpf_spin_unlock(&flow->lock); + goto out; + } + + skbn = container_of(rb_node, struct skb_node, node); + rb_node = bpf_rbtree_remove(&flow->queue, &skbn->node); + bpf_spin_unlock(&flow->lock); + + if (!rb_node) + goto out; + + skbn = container_of(rb_node, struct skb_node, node); + skb = bpf_kptr_xchg(&skbn->skb, skb); + bpf_obj_drop(skbn); + +out: + bpf_kptr_xchg_back(&sflow->flow, flow); + + return skb; +} + +SEC("struct_ops/bpf_fq_dequeue") +struct sk_buff *BPF_PROG(bpf_fq_dequeue, struct Qdisc *sch) +{ + struct dequeue_nonprio_ctx cb_ctx = {}; + struct sk_buff *skb = NULL; + int i; + + if (!sch->q.qlen) + goto out; + + skb = fq_dequeue_prio(); + if (skb) + goto dequeue; + + q.ktime_cache = cb_ctx.now = bpf_ktime_get_ns(); + fq_check_throttled(q.ktime_cache); + bpf_for(i, 0, sch->limit) { + skb = fq_dequeue_nonprio_flows(i, &cb_ctx); + if (cb_ctx.stop_iter) + break; + }; + + if (skb) { +dequeue: + sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); + bpf_qdisc_bstats_update(sch, skb); + return skb; + } + + if (cb_ctx.expire) + bpf_qdisc_watchdog_schedule(sch, cb_ctx.expire, q.timer_slack); +out: + return NULL; +} + +static int fq_remove_flows_in_list(u32 index, void *ctx) +{ + struct bpf_list_node *node; + struct fq_flow_node *flow; + + bpf_spin_lock(&fq_new_flows_lock); + node = bpf_list_pop_front(&fq_new_flows); + bpf_spin_unlock(&fq_new_flows_lock); + if (!node) { + bpf_spin_lock(&fq_old_flows_lock); + node = bpf_list_pop_front(&fq_old_flows); + bpf_spin_unlock(&fq_old_flows_lock); + if (!node) + return 1; + } + + flow = container_of(node, struct fq_flow_node, list_node); + bpf_obj_drop(flow); + + return 0; +} + +extern unsigned CONFIG_HZ __kconfig; + +/* limit number of collected flows per round */ +#define FQ_GC_MAX 8 +#define FQ_GC_AGE (3*CONFIG_HZ) + +static bool fq_gc_candidate(struct fq_flow_node *flow) +{ + u64 jiffies = bpf_jiffies64(); + + return fq_flow_is_detached(flow) && + ((s64)(jiffies - (flow->age + FQ_GC_AGE)) > 0); +} + +static int +fq_remove_flows(struct bpf_map *flow_map, u64 *hash, + struct fq_stashed_flow *sflow, struct remove_flows_ctx *ctx) +{ + if (sflow->flow && + (!ctx->gc_only || fq_gc_candidate(sflow->flow))) { + bpf_map_delete_elem(flow_map, hash); + ctx->reset_cnt++; + } + + return ctx->reset_cnt < ctx->reset_max ? 
0 : 1; +} + +static void fq_gc(void) +{ + struct remove_flows_ctx cb_ctx = { + .gc_only = true, + .reset_cnt = 0, + .reset_max = FQ_GC_MAX, + }; + + bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &cb_ctx, 0); +} + +SEC("struct_ops/bpf_fq_reset") +void BPF_PROG(bpf_fq_reset, struct Qdisc *sch) +{ + struct unset_throttled_flows_ctx utf_ctx = { + .unset_all = true, + }; + struct remove_flows_ctx rf_ctx = { + .gc_only = false, + .reset_cnt = 0, + .reset_max = NUM_QUEUE, + }; + struct fq_stashed_flow *sflow; + u64 hash = 0; + + sch->q.qlen = 0; + sch->qstats.backlog = 0; + + bpf_for_each_map_elem(&fq_nonprio_flows, fq_remove_flows, &rf_ctx, 0); + + rf_ctx.reset_cnt = 0; + bpf_for_each_map_elem(&fq_prio_flows, fq_remove_flows, &rf_ctx, 0); + fq_new_flow(&fq_prio_flows, &sflow, hash); + + bpf_loop(NUM_QUEUE, fq_remove_flows_in_list, NULL, 0); + q.new_flow_cnt = 0; + q.old_flow_cnt = 0; + + bpf_loop(NUM_QUEUE, fq_unset_throttled_flows, &utf_ctx, 0); +} + +SEC("struct_ops/bpf_fq_init") +int BPF_PROG(bpf_fq_init, struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct net_device *dev = sch->dev_queue->dev; + u32 psched_mtu = dev->mtu + dev->hard_header_len; + struct fq_stashed_flow *sflow; + u64 hash = 0; + + if (fq_new_flow(&fq_prio_flows, &sflow, hash) < 0) + return -ENOMEM; + + sch->limit = 10000; + q.initial_quantum = 10 * psched_mtu; + q.quantum = 2 * psched_mtu; + q.flow_refill_delay = 40; + q.flow_plimit = 100; + q.horizon = 10ULL * NSEC_PER_SEC; + q.horizon_drop = 1; + q.orphan_mask = 1024 - 1; + q.timer_slack = 10 * NSEC_PER_USEC; + q.time_next_delayed_flow = ~0ULL; + q.unthrottle_latency_ns = 0ULL; + q.new_flow_cnt = 0; + q.old_flow_cnt = 0; + + return 0; +} + +SEC(".struct_ops") +struct Qdisc_ops fq = { + .enqueue = (void *)bpf_fq_enqueue, + .dequeue = (void *)bpf_fq_dequeue, + .reset = (void *)bpf_fq_reset, + .init = (void *)bpf_fq_init, + .id = "bpf_fq", +}; From 2b7b5b7f100e82ca314e76214626c82b608a1d8d Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 9 Apr 2025 14:46:06 -0700 Subject: [PATCH 301/770] selftests/bpf: Test attaching bpf qdisc to mq and non root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until we are certain that existing classful qdiscs work with bpf qdisc, make sure we don't allow attaching a bpf qdisc to non root. Meanwhile, attaching to mq is allowed. 
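For reference, a sketch of the userspace side these tests exercise follows (assuming a struct_ops map that registered a "bpf_fifo" Qdisc_ops has already been attached; the field values are illustrative and mirror the test below):

	/* Attach the bpf qdisc under an mq class; the same call with a
	 * classful parent such as htb is expected to fail with the
	 * restriction above.
	 */
	int err;

	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
			    .ifindex = if_nametoindex("veth0"),
			    .attach_point = BPF_TC_QDISC,
			    .parent = TC_H_MAKE(1 << 16, 1), /* class 1:1 of mq */
			    .handle = 0x11 << 16,
			    .qdisc = "bpf_fifo");

	err = bpf_tc_hook_create(&hook); /* succeeds under mq, fails under htb */
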
Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20250409214606.2000194-11-ameryhung@gmail.com --- tools/testing/selftests/bpf/config | 1 + .../selftests/bpf/prog_tests/bpf_qdisc.c | 75 +++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 6b0cab55bd2dc..3201a962b3dcd 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -74,6 +74,7 @@ CONFIG_NET_MPLS_GSO=y CONFIG_NET_SCH_BPF=y CONFIG_NET_SCH_FQ=y CONFIG_NET_SCH_INGRESS=y +CONFIG_NET_SCH_HTB=y CONFIG_NET_SCHED=y CONFIG_NETDEVSIM=y CONFIG_NETFILTER=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c index 230d8f935303b..c9a54177c84ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c @@ -88,6 +88,77 @@ static void test_fq(void) bpf_qdisc_fq__destroy(fq_skel); } +static void test_qdisc_attach_to_mq(void) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_MAKE(1 << 16, 1), + .handle = 0x11 << 16, + .qdisc = "bpf_fifo"); + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + int err; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + SYS(out, "ip link add veth0 type veth peer veth1"); + hook.ifindex = if_nametoindex("veth0"); + SYS(out, "tc qdisc add dev veth0 root handle 1: mq"); + + err = bpf_tc_hook_create(&hook); + ASSERT_OK(err, "attach qdisc"); + + bpf_tc_hook_destroy(&hook); + + SYS(out, "tc qdisc delete dev veth0 root mq"); +out: + bpf_link__destroy(link); + bpf_qdisc_fifo__destroy(fifo_skel); +} + +static void test_qdisc_attach_to_non_root(void) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX, + .attach_point = BPF_TC_QDISC, + .parent = TC_H_MAKE(1 << 16, 1), + .handle = 0x11 << 16, + .qdisc = "bpf_fifo"); + struct bpf_qdisc_fifo *fifo_skel; + struct bpf_link *link; + int err; + + fifo_skel = bpf_qdisc_fifo__open_and_load(); + if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load")) + return; + + link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo); + if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) { + bpf_qdisc_fifo__destroy(fifo_skel); + return; + } + + SYS(out, "tc qdisc add dev lo root handle 1: htb"); + SYS(out_del_htb, "tc class add dev lo parent 1: classid 1:1 htb rate 75Kbit"); + + err = bpf_tc_hook_create(&hook); + if (!ASSERT_ERR(err, "attach qdisc")) + bpf_tc_hook_destroy(&hook); + +out_del_htb: + SYS(out, "tc qdisc delete dev lo root htb"); +out: + bpf_link__destroy(link); + bpf_qdisc_fifo__destroy(fifo_skel); +} + void test_bpf_qdisc(void) { struct netns_obj *netns; @@ -100,6 +171,10 @@ void test_bpf_qdisc(void) test_fifo(); if (test__start_subtest("fq")) test_fq(); + if (test__start_subtest("attach to mq")) + test_qdisc_attach_to_mq(); + if (test__start_subtest("attach to non root")) + test_qdisc_attach_to_non_root(); netns_free(netns); } From a27d798fd83c8c7c4d8c549e04c83beb3c1dc214 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 13:58:00 +0100 Subject: [PATCH 302/770] net: stmmac: sunxi: convert to set_clk_tx_rate() Convert 
sunxi to use the set_clk_tx_rate() callback rather than the fix_mac_speed() callback. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u4frU-000nMf-6o@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 9f098ff0ff058..a245c223a18f7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -72,28 +72,28 @@ static void sun7i_gmac_exit(struct platform_device *pdev, void *priv) regulator_disable(gmac->regulator); } -static void sun7i_fix_speed(void *priv, int speed, unsigned int mode) +static int sun7i_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, + phy_interface_t interface, int speed) { - struct sunxi_priv_data *gmac = priv; - - /* only GMII mode requires us to reconfigure the clock lines */ - if (gmac->interface != PHY_INTERFACE_MODE_GMII) - return; - - if (gmac->clk_enabled) { - clk_disable(gmac->tx_clk); - gmac->clk_enabled = 0; - } - clk_unprepare(gmac->tx_clk); - - if (speed == 1000) { - clk_set_rate(gmac->tx_clk, SUN7I_GMAC_GMII_RGMII_RATE); - clk_prepare_enable(gmac->tx_clk); - gmac->clk_enabled = 1; - } else { - clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); - clk_prepare(gmac->tx_clk); + struct sunxi_priv_data *gmac = bsp_priv; + + if (interface == PHY_INTERFACE_MODE_GMII) { + if (gmac->clk_enabled) { + clk_disable(gmac->tx_clk); + gmac->clk_enabled = 0; + } + clk_unprepare(gmac->tx_clk); + + if (speed == 1000) { + clk_set_rate(gmac->tx_clk, SUN7I_GMAC_GMII_RGMII_RATE); + clk_prepare_enable(gmac->tx_clk); + gmac->clk_enabled = 1; + } else { + clk_set_rate(gmac->tx_clk, SUN7I_GMAC_MII_RATE); + clk_prepare(gmac->tx_clk); + } } + return 0; } static int sun7i_gmac_probe(struct platform_device *pdev) @@ -140,7 +140,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = gmac; plat_dat->init = sun7i_gmac_init; plat_dat->exit = sun7i_gmac_exit; - plat_dat->fix_mac_speed = sun7i_fix_speed; + plat_dat->set_clk_tx_rate = sun7i_set_clk_tx_rate; plat_dat->tx_fifo_size = 4096; plat_dat->rx_fifo_size = 16384; From dd2cdba4709f6a4e32e7e323d23922e99916a781 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 15 Apr 2025 13:58:05 +0100 Subject: [PATCH 303/770] net: stmmac: sunxi: use stmmac_pltfr_probe() Rather than open-coding the calls to sun7i_gmac_init() and sun7i_gmac_exit() in the probe function, use stmmac_pltfr_probe() which will automatically call the plat_dat->init() and plat_dat->exit() methods appropriately. This simplifies the code. 
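Conceptually, the helper folds the open-coded init/probe/exit sequence into one call. A rough sketch of its shape follows (not the exact stmmac source; stmmac_pltfr_init() and stmmac_pltfr_exit() are assumed here to be the platform-layer wrappers that invoke plat->init()/plat->exit() when set):

	int stmmac_pltfr_probe(struct platform_device *pdev,
			       struct plat_stmmacenet_data *plat,
			       struct stmmac_resources *res)
	{
		int ret;

		/* runs the glue driver's plat->init(), if any */
		ret = stmmac_pltfr_init(pdev, plat);
		if (ret)
			return ret;

		ret = stmmac_dvr_probe(&pdev->dev, plat, res);
		if (ret)
			/* unwinds via the glue driver's plat->exit() */
			stmmac_pltfr_exit(pdev, plat);

		return ret;
	}
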
Signed-off-by: Russell King (Oracle)
Link: https://patch.msgid.link/E1u4frZ-000nMl-BB@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index a245c223a18f7..665eab1d34091 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -144,20 +144,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev)
 	plat_dat->tx_fifo_size = 4096;
 	plat_dat->rx_fifo_size = 16384;
 
-	ret = sun7i_gmac_init(pdev, plat_dat->bsp_priv);
-	if (ret)
-		return ret;
-
-	ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
-	if (ret)
-		goto err_gmac_exit;
-
-	return 0;
-
-err_gmac_exit:
-	sun7i_gmac_exit(pdev, plat_dat->bsp_priv);
-
-	return ret;
+	return stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
 }
 
 static const struct of_device_id sun7i_dwmac_match[] = {

From 69b3e38e2fb58a3d557d9e02c107d60143c8b49c Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)"
Date: Tue, 15 Apr 2025 13:58:10 +0100
Subject: [PATCH 304/770] net: stmmac: sunxi: use devm_stmmac_pltfr_probe()

Using devm_stmmac_pltfr_probe() simplifies the probe function. This will not only call plat_dat->init (sun7i_gmac_init), but also plat_dat->exit (sun7i_gmac_exit) appropriately if stmmac_dvr_probe() fails. This results in an overall simplification of the glue driver.

Signed-off-by: Russell King (Oracle)
Link: https://patch.msgid.link/E1u4fre-000nMr-FT@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
index 665eab1d34091..1eadcf5d1ad63 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c
@@ -144,7 +144,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev)
 	plat_dat->tx_fifo_size = 4096;
 	plat_dat->rx_fifo_size = 16384;
 
-	return stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
+	return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res);
 }
 
 static const struct of_device_id sun7i_dwmac_match[] = {
@@ -155,7 +155,6 @@ MODULE_DEVICE_TABLE(of, sun7i_dwmac_match);
 
 static struct platform_driver sun7i_dwmac_driver = {
 	.probe = sun7i_gmac_probe,
-	.remove = stmmac_pltfr_remove,
 	.driver = {
 		.name = "sun7i-dwmac",
 		.pm = &stmmac_pltfr_pm_ops,

From 2b065c098c37a3ed28df7c3be59dca61b9da8402 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Tue, 15 Apr 2025 21:29:34 +0200
Subject: [PATCH 305/770] r8169: refactor chip version detection

Refactor chip version detection and merge both configuration tables. Apart from reducing the code by a third, this paves the way for merging chip version handling if the only difference is the firmware.
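The lookup this table enables reduces to a masked first-match scan over the XID. Roughly (a sketch; the function name is illustrative, only the struct fields come from the table below):

	/* First matching entry wins, so the more specific masks (0x7cf)
	 * are listed before the family catch-alls (0x7c8) in each group.
	 */
	static const struct rtl_chip_info *rtl_find_chip_info(u16 xid)
	{
		const struct rtl_chip_info *p;

		for (p = rtl_chip_infos;
		     p < rtl_chip_infos + ARRAY_SIZE(rtl_chip_infos); p++)
			if ((xid & p->mask) == p->val)
				return p;

		return NULL; /* unknown chip version */
	}
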
Signed-off-by: Heiner Kallweit Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/1fea533a-dd5a-4198-a9e2-895e11083947@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 325 +++++++++------------- 1 file changed, 128 insertions(+), 197 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 8e62b109518b2..b55a691c5cb8c 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -91,61 +91,114 @@ #define JUMBO_9K (9 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) #define JUMBO_16K (SZ_16K - VLAN_ETH_HLEN - ETH_FCS_LEN) -static const struct { +static const struct rtl_chip_info { + u16 mask; + u16 val; + enum mac_version mac_version; const char *name; const char *fw_name; } rtl_chip_infos[] = { - /* PCI devices. */ - [RTL_GIGA_MAC_VER_02] = {"RTL8169s" }, - [RTL_GIGA_MAC_VER_03] = {"RTL8110s" }, - [RTL_GIGA_MAC_VER_04] = {"RTL8169sb/8110sb" }, - [RTL_GIGA_MAC_VER_05] = {"RTL8169sc/8110sc" }, - [RTL_GIGA_MAC_VER_06] = {"RTL8169sc/8110sc" }, - /* PCI-E devices. */ - [RTL_GIGA_MAC_VER_07] = {"RTL8102e" }, - [RTL_GIGA_MAC_VER_08] = {"RTL8102e" }, - [RTL_GIGA_MAC_VER_09] = {"RTL8102e/RTL8103e" }, - [RTL_GIGA_MAC_VER_10] = {"RTL8101e/RTL8100e" }, - [RTL_GIGA_MAC_VER_14] = {"RTL8401" }, - [RTL_GIGA_MAC_VER_17] = {"RTL8168b/8111b" }, - [RTL_GIGA_MAC_VER_18] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_19] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_20] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_21] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_22] = {"RTL8168c/8111c" }, - [RTL_GIGA_MAC_VER_23] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_24] = {"RTL8168cp/8111cp" }, - [RTL_GIGA_MAC_VER_25] = {"RTL8168d/8111d", FIRMWARE_8168D_1}, - [RTL_GIGA_MAC_VER_26] = {"RTL8168d/8111d", FIRMWARE_8168D_2}, - [RTL_GIGA_MAC_VER_28] = {"RTL8168dp/8111dp" }, - [RTL_GIGA_MAC_VER_29] = {"RTL8105e", FIRMWARE_8105E_1}, - [RTL_GIGA_MAC_VER_30] = {"RTL8105e", FIRMWARE_8105E_1}, - [RTL_GIGA_MAC_VER_31] = {"RTL8168dp/8111dp" }, - [RTL_GIGA_MAC_VER_32] = {"RTL8168e/8111e", FIRMWARE_8168E_1}, - [RTL_GIGA_MAC_VER_33] = {"RTL8168e/8111e", FIRMWARE_8168E_2}, - [RTL_GIGA_MAC_VER_34] = {"RTL8168evl/8111evl", FIRMWARE_8168E_3}, - [RTL_GIGA_MAC_VER_35] = {"RTL8168f/8111f", FIRMWARE_8168F_1}, - [RTL_GIGA_MAC_VER_36] = {"RTL8168f/8111f", FIRMWARE_8168F_2}, - [RTL_GIGA_MAC_VER_37] = {"RTL8402", FIRMWARE_8402_1 }, - [RTL_GIGA_MAC_VER_38] = {"RTL8411", FIRMWARE_8411_1 }, - [RTL_GIGA_MAC_VER_39] = {"RTL8106e", FIRMWARE_8106E_1}, - [RTL_GIGA_MAC_VER_40] = {"RTL8168g/8111g", FIRMWARE_8168G_2}, - [RTL_GIGA_MAC_VER_42] = {"RTL8168gu/8111gu", FIRMWARE_8168G_3}, - [RTL_GIGA_MAC_VER_43] = {"RTL8106eus", FIRMWARE_8106E_2}, - [RTL_GIGA_MAC_VER_44] = {"RTL8411b", FIRMWARE_8411_2 }, - [RTL_GIGA_MAC_VER_46] = {"RTL8168h/8111h", FIRMWARE_8168H_2}, - [RTL_GIGA_MAC_VER_48] = {"RTL8107e", FIRMWARE_8107E_2}, - [RTL_GIGA_MAC_VER_51] = {"RTL8168ep/8111ep" }, - [RTL_GIGA_MAC_VER_52] = {"RTL8168fp/RTL8117", FIRMWARE_8168FP_3}, - [RTL_GIGA_MAC_VER_53] = {"RTL8168fp/RTL8117", }, - [RTL_GIGA_MAC_VER_61] = {"RTL8125A", FIRMWARE_8125A_3}, - /* reserve 62 for CFG_METHOD_4 in the vendor driver */ - [RTL_GIGA_MAC_VER_63] = {"RTL8125B", FIRMWARE_8125B_2}, - [RTL_GIGA_MAC_VER_64] = {"RTL8125D", FIRMWARE_8125D_1}, - [RTL_GIGA_MAC_VER_65] = {"RTL8125D", FIRMWARE_8125D_2}, - [RTL_GIGA_MAC_VER_66] = {"RTL8125BP", FIRMWARE_8125BP_2}, - [RTL_GIGA_MAC_VER_70] = {"RTL8126A", FIRMWARE_8126A_2}, - [RTL_GIGA_MAC_VER_71] = {"RTL8126A", 
FIRMWARE_8126A_3}, + /* 8126A family. */ + { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71, "RTL8126A", FIRMWARE_8126A_3 }, + { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70, "RTL8126A", FIRMWARE_8126A_2 }, + + /* 8125BP family. */ + { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66, "RTL8125BP", FIRMWARE_8125BP_2 }, + + /* 8125D family. */ + { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65, "RTL8125D", FIRMWARE_8125D_2 }, + { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64, "RTL8125D", FIRMWARE_8125D_1 }, + + /* 8125B family. */ + { 0x7cf, 0x641, RTL_GIGA_MAC_VER_63, "RTL8125B", FIRMWARE_8125B_2 }, + + /* 8125A family. */ + { 0x7cf, 0x609, RTL_GIGA_MAC_VER_61, "RTL8125A", FIRMWARE_8125A_3 }, + + /* RTL8117 */ + { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53, "RTL8168fp/RTL8117" }, + { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52, "RTL8168fp/RTL8117", + FIRMWARE_8168FP_3 }, + + /* 8168EP family. */ + { 0x7cf, 0x502, RTL_GIGA_MAC_VER_51, "RTL8168ep/8111ep" }, + + /* 8168H family. */ + { 0x7cf, 0x541, RTL_GIGA_MAC_VER_46, "RTL8168h/8111h", + FIRMWARE_8168H_2 }, + /* Realtek calls it RTL8168M, but it's handled like RTL8168H */ + { 0x7cf, 0x6c0, RTL_GIGA_MAC_VER_46, "RTL8168M", FIRMWARE_8168H_2 }, + + /* 8168G family. */ + { 0x7cf, 0x5c8, RTL_GIGA_MAC_VER_44, "RTL8411b", FIRMWARE_8411_2 }, + { 0x7cf, 0x509, RTL_GIGA_MAC_VER_42, "RTL8168gu/8111gu", + FIRMWARE_8168G_3 }, + { 0x7cf, 0x4c0, RTL_GIGA_MAC_VER_40, "RTL8168g/8111g", + FIRMWARE_8168G_2 }, + + /* 8168F family. */ + { 0x7c8, 0x488, RTL_GIGA_MAC_VER_38, "RTL8411", FIRMWARE_8411_1 }, + { 0x7cf, 0x481, RTL_GIGA_MAC_VER_36, "RTL8168f/8111f", + FIRMWARE_8168F_2 }, + { 0x7cf, 0x480, RTL_GIGA_MAC_VER_35, "RTL8168f/8111f", + FIRMWARE_8168F_1 }, + + /* 8168E family. */ + { 0x7c8, 0x2c8, RTL_GIGA_MAC_VER_34, "RTL8168evl/8111evl", + FIRMWARE_8168E_3 }, + { 0x7cf, 0x2c1, RTL_GIGA_MAC_VER_32, "RTL8168e/8111e", + FIRMWARE_8168E_1 }, + { 0x7c8, 0x2c0, RTL_GIGA_MAC_VER_33, "RTL8168e/8111e", + FIRMWARE_8168E_2 }, + + /* 8168D family. */ + { 0x7cf, 0x281, RTL_GIGA_MAC_VER_25, "RTL8168d/8111d", + FIRMWARE_8168D_1 }, + { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26, "RTL8168d/8111d", + FIRMWARE_8168D_2 }, + + /* 8168DP family. */ + { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28, "RTL8168dp/8111dp" }, + { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31, "RTL8168dp/8111dp" }, + + /* 8168C family. */ + { 0x7cf, 0x3c9, RTL_GIGA_MAC_VER_23, "RTL8168cp/8111cp" }, + { 0x7cf, 0x3c8, RTL_GIGA_MAC_VER_18, "RTL8168cp/8111cp" }, + { 0x7c8, 0x3c8, RTL_GIGA_MAC_VER_24, "RTL8168cp/8111cp" }, + { 0x7cf, 0x3c0, RTL_GIGA_MAC_VER_19, "RTL8168c/8111c" }, + { 0x7cf, 0x3c2, RTL_GIGA_MAC_VER_20, "RTL8168c/8111c" }, + { 0x7cf, 0x3c3, RTL_GIGA_MAC_VER_21, "RTL8168c/8111c" }, + { 0x7c8, 0x3c0, RTL_GIGA_MAC_VER_22, "RTL8168c/8111c" }, + + /* 8168B family. */ + { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17, "RTL8168b/8111b" }, + /* This one is very old and rare, support has been removed. + * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11, "RTL8168b/8111b" }, + */ + + /* 8101 family. 
*/ + { 0x7c8, 0x448, RTL_GIGA_MAC_VER_39, "RTL8106e", FIRMWARE_8106E_1 }, + { 0x7c8, 0x440, RTL_GIGA_MAC_VER_37, "RTL8402", FIRMWARE_8402_1 }, + { 0x7cf, 0x409, RTL_GIGA_MAC_VER_29, "RTL8105e", FIRMWARE_8105E_1 }, + { 0x7c8, 0x408, RTL_GIGA_MAC_VER_30, "RTL8105e", FIRMWARE_8105E_1 }, + { 0x7cf, 0x349, RTL_GIGA_MAC_VER_08, "RTL8102e" }, + { 0x7cf, 0x249, RTL_GIGA_MAC_VER_08, "RTL8102e" }, + { 0x7cf, 0x348, RTL_GIGA_MAC_VER_07, "RTL8102e" }, + { 0x7cf, 0x248, RTL_GIGA_MAC_VER_07, "RTL8102e" }, + { 0x7cf, 0x240, RTL_GIGA_MAC_VER_14, "RTL8401" }, + { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09, "RTL8102e/RTL8103e" }, + { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09, "RTL8102e/RTL8103e" }, + { 0x7c8, 0x340, RTL_GIGA_MAC_VER_10, "RTL8101e/RTL8100e" }, + + /* 8110 family. */ + { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06, "RTL8169sc/8110sc" }, + { 0xfc8, 0x180, RTL_GIGA_MAC_VER_05, "RTL8169sc/8110sc" }, + { 0xfc8, 0x100, RTL_GIGA_MAC_VER_04, "RTL8169sb/8110sb" }, + { 0xfc8, 0x040, RTL_GIGA_MAC_VER_03, "RTL8110s" }, + { 0xfc8, 0x008, RTL_GIGA_MAC_VER_02, "RTL8169s" }, + + /* Catch-all */ + { 0x000, 0x000, RTL_GIGA_MAC_NONE } }; static const struct pci_device_id rtl8169_pci_tbl[] = { @@ -2265,151 +2318,30 @@ static const struct ethtool_ops rtl8169_ethtool_ops = { .get_eth_ctrl_stats = rtl8169_get_eth_ctrl_stats, }; -static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) +static const struct rtl_chip_info *rtl8169_get_chip_version(u16 xid, bool gmii) { - /* - * The driver currently handles the 8168Bf and the 8168Be identically - * but they can be identified more specifically through the test below - * if needed: - * - * (RTL_R32(tp, TxConfig) & 0x700000) == 0x500000 ? 8168Bf : 8168Be - * - * Same thing for the 8101Eb and the 8101Ec: - * - * (RTL_R32(tp, TxConfig) & 0x700000) == 0x200000 ? 8101Eb : 8101Ec - */ - static const struct rtl_mac_info { - u16 mask; - u16 val; - enum mac_version ver; - } mac_info[] = { - /* 8126A family. */ - { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71 }, - { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70 }, - - /* 8125BP family. */ - { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66 }, - - /* 8125D family. */ - { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65 }, - { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64 }, - - /* 8125B family. */ - { 0x7cf, 0x641, RTL_GIGA_MAC_VER_63 }, - - /* 8125A family. */ - { 0x7cf, 0x609, RTL_GIGA_MAC_VER_61 }, - /* It seems only XID 609 made it to the mass market. - * { 0x7cf, 0x608, RTL_GIGA_MAC_VER_60 }, - * { 0x7c8, 0x608, RTL_GIGA_MAC_VER_61 }, - */ - - /* RTL8117 */ - { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53 }, - { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52 }, - - /* 8168EP family. */ - { 0x7cf, 0x502, RTL_GIGA_MAC_VER_51 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x501, RTL_GIGA_MAC_VER_50 }, - * { 0x7cf, 0x500, RTL_GIGA_MAC_VER_49 }, - */ - - /* 8168H family. */ - { 0x7cf, 0x541, RTL_GIGA_MAC_VER_46 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x540, RTL_GIGA_MAC_VER_45 }, - */ - /* Realtek calls it RTL8168M, but it's handled like RTL8168H */ - { 0x7cf, 0x6c0, RTL_GIGA_MAC_VER_46 }, - - /* 8168G family. */ - { 0x7cf, 0x5c8, RTL_GIGA_MAC_VER_44 }, - { 0x7cf, 0x509, RTL_GIGA_MAC_VER_42 }, - /* It seems this chip version never made it to - * the wild. Let's disable detection. - * { 0x7cf, 0x4c1, RTL_GIGA_MAC_VER_41 }, - */ - { 0x7cf, 0x4c0, RTL_GIGA_MAC_VER_40 }, - - /* 8168F family. 
*/ - { 0x7c8, 0x488, RTL_GIGA_MAC_VER_38 }, - { 0x7cf, 0x481, RTL_GIGA_MAC_VER_36 }, - { 0x7cf, 0x480, RTL_GIGA_MAC_VER_35 }, - - /* 8168E family. */ - { 0x7c8, 0x2c8, RTL_GIGA_MAC_VER_34 }, - { 0x7cf, 0x2c1, RTL_GIGA_MAC_VER_32 }, - { 0x7c8, 0x2c0, RTL_GIGA_MAC_VER_33 }, - - /* 8168D family. */ - { 0x7cf, 0x281, RTL_GIGA_MAC_VER_25 }, - { 0x7c8, 0x280, RTL_GIGA_MAC_VER_26 }, - - /* 8168DP family. */ - /* It seems this early RTL8168dp version never made it to - * the wild. Support has been removed. - * { 0x7cf, 0x288, RTL_GIGA_MAC_VER_27 }, - */ - { 0x7cf, 0x28a, RTL_GIGA_MAC_VER_28 }, - { 0x7cf, 0x28b, RTL_GIGA_MAC_VER_31 }, - - /* 8168C family. */ - { 0x7cf, 0x3c9, RTL_GIGA_MAC_VER_23 }, - { 0x7cf, 0x3c8, RTL_GIGA_MAC_VER_18 }, - { 0x7c8, 0x3c8, RTL_GIGA_MAC_VER_24 }, - { 0x7cf, 0x3c0, RTL_GIGA_MAC_VER_19 }, - { 0x7cf, 0x3c2, RTL_GIGA_MAC_VER_20 }, - { 0x7cf, 0x3c3, RTL_GIGA_MAC_VER_21 }, - { 0x7c8, 0x3c0, RTL_GIGA_MAC_VER_22 }, - - /* 8168B family. */ - { 0x7c8, 0x380, RTL_GIGA_MAC_VER_17 }, - /* This one is very old and rare, support has been removed. - * { 0x7c8, 0x300, RTL_GIGA_MAC_VER_11 }, - */ - - /* 8101 family. */ - { 0x7c8, 0x448, RTL_GIGA_MAC_VER_39 }, - { 0x7c8, 0x440, RTL_GIGA_MAC_VER_37 }, - { 0x7cf, 0x409, RTL_GIGA_MAC_VER_29 }, - { 0x7c8, 0x408, RTL_GIGA_MAC_VER_30 }, - { 0x7cf, 0x349, RTL_GIGA_MAC_VER_08 }, - { 0x7cf, 0x249, RTL_GIGA_MAC_VER_08 }, - { 0x7cf, 0x348, RTL_GIGA_MAC_VER_07 }, - { 0x7cf, 0x248, RTL_GIGA_MAC_VER_07 }, - { 0x7cf, 0x240, RTL_GIGA_MAC_VER_14 }, - { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09 }, - { 0x7c8, 0x248, RTL_GIGA_MAC_VER_09 }, - { 0x7c8, 0x340, RTL_GIGA_MAC_VER_10 }, - - /* 8110 family. */ - { 0xfc8, 0x980, RTL_GIGA_MAC_VER_06 }, - { 0xfc8, 0x180, RTL_GIGA_MAC_VER_05 }, - { 0xfc8, 0x100, RTL_GIGA_MAC_VER_04 }, - { 0xfc8, 0x040, RTL_GIGA_MAC_VER_03 }, - { 0xfc8, 0x008, RTL_GIGA_MAC_VER_02 }, - - /* Catch-all */ - { 0x000, 0x000, RTL_GIGA_MAC_NONE } + /* Chips combining a 1Gbps MAC with a 100Mbps PHY */ + static const struct rtl_chip_info rtl8106eus_info = { + .mac_version = RTL_GIGA_MAC_VER_43, + .name = "RTL8106eus", + .fw_name = FIRMWARE_8106E_2, + }; + static const struct rtl_chip_info rtl8107e_info = { + .mac_version = RTL_GIGA_MAC_VER_48, + .name = "RTL8107e", + .fw_name = FIRMWARE_8107E_2, }; - const struct rtl_mac_info *p = mac_info; - enum mac_version ver; + const struct rtl_chip_info *p = rtl_chip_infos; while ((xid & p->mask) != p->val) p++; - ver = p->ver; - if (ver != RTL_GIGA_MAC_NONE && !gmii) { - if (ver == RTL_GIGA_MAC_VER_42) - ver = RTL_GIGA_MAC_VER_43; - else if (ver == RTL_GIGA_MAC_VER_46) - ver = RTL_GIGA_MAC_VER_48; - } + if (p->mac_version == RTL_GIGA_MAC_VER_42 && !gmii) + return &rtl8106eus_info; + if (p->mac_version == RTL_GIGA_MAC_VER_46 && !gmii) + return &rtl8107e_info; - return ver; + return p; } static void rtl_release_firmware(struct rtl8169_private *tp) @@ -5440,9 +5372,9 @@ static bool rtl_aspm_is_safe(struct rtl8169_private *tp) static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { + const struct rtl_chip_info *chip; struct rtl8169_private *tp; int jumbo_max, region, rc; - enum mac_version chipset; struct net_device *dev; u32 txconfig; u16 xid; @@ -5492,12 +5424,13 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) xid = (txconfig >> 20) & 0xfcf; /* Identify chip attached to board */ - chipset = rtl8169_get_mac_version(xid, tp->supports_gmii); - if (chipset == RTL_GIGA_MAC_NONE) + chip = rtl8169_get_chip_version(xid, tp->supports_gmii); + if 
(chip->mac_version == RTL_GIGA_MAC_NONE) return dev_err_probe(&pdev->dev, -ENODEV, "unknown chip XID %03x, contact r8169 maintainers (see MAINTAINERS file)\n", xid); - tp->mac_version = chipset; + tp->mac_version = chip->mac_version; + tp->fw_name = chip->fw_name; /* Disable ASPM L1 as that cause random device stop working * problems as well as full system hangs for some PCIe devices users. @@ -5602,8 +5535,6 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) rtl_set_irq_mask(tp); - tp->fw_name = rtl_chip_infos[chipset].fw_name; - tp->counters = dmam_alloc_coherent (&pdev->dev, sizeof(*tp->counters), &tp->counters_phys_addr, GFP_KERNEL); @@ -5628,7 +5559,7 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) } netdev_info(dev, "%s, %pM, XID %03x, IRQ %d\n", - rtl_chip_infos[chipset].name, dev->dev_addr, xid, tp->irq); + chip->name, dev->dev_addr, xid, tp->irq); if (jumbo_max) netdev_info(dev, "jumbo features [frames: %d bytes, tx checksumming: %s]\n", From fe733618b27a8c033f0d246c2efff56fca322656 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 15 Apr 2025 21:39:23 +0200 Subject: [PATCH 306/770] r8169: add RTL_GIGA_MAC_VER_LAST to facilitate adding support for new chip versions Add a new mac_version enum value RTL_GIGA_MAC_VER_LAST. The benefit is that when adding support for a new chip version we have to touch less code, unless something changes fundamentally. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/06991f47-2aec-4aa2-8918-2c6e79332303@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 3 ++- drivers/net/ethernet/realtek/r8169_main.c | 28 +++++++++++------------ 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 7a194a8ab989f..9f784840ea0d2 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -73,7 +73,8 @@ enum mac_version { RTL_GIGA_MAC_VER_66, RTL_GIGA_MAC_VER_70, RTL_GIGA_MAC_VER_71, - RTL_GIGA_MAC_NONE + RTL_GIGA_MAC_NONE, + RTL_GIGA_MAC_VER_LAST = RTL_GIGA_MAC_NONE - 1 }; struct rtl8169_private; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index b55a691c5cb8c..635785d917902 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -1289,7 +1289,7 @@ static void rtl_writephy(struct rtl8169_private *tp, int location, int val) case RTL_GIGA_MAC_VER_31: r8168dp_2_mdio_write(tp, location, val); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168g_mdio_write(tp, location, val); break; default: @@ -1304,7 +1304,7 @@ static int rtl_readphy(struct rtl8169_private *tp, int location) case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: return r8168dp_2_mdio_read(tp, location); - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: return r8168g_mdio_read(tp, location); default: return r8169_mdio_read(tp, location); @@ -1656,7 +1656,7 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts) break; case RTL_GIGA_MAC_VER_34: case RTL_GIGA_MAC_VER_37: - case RTL_GIGA_MAC_VER_39 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_39 ... 
RTL_GIGA_MAC_VER_LAST: r8169_mod_reg8_cond(tp, Config2, PME_SIGNAL, wolopts); break; default: @@ -2129,7 +2129,7 @@ static void rtl_set_eee_txidle_timer(struct rtl8169_private *tp) tp->tx_lpi_timer = timer_val; r8168_mac_ocp_write(tp, 0xe048, timer_val); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: tp->tx_lpi_timer = timer_val; RTL_W16(tp, EEE_TXIDLE_TIMER_8125, timer_val); break; @@ -2491,7 +2491,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_LAST: RTL_W32(tp, RxConfig, RX_FETCH_DFLT_8125 | RX_DMA_BURST | RX_PAUSE_SLOT_ON); break; @@ -2623,7 +2623,7 @@ static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_61: rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); break; - case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_63 ... RTL_GIGA_MAC_VER_LAST: RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42); rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond_2, 100, 42); @@ -2898,7 +2898,7 @@ static void rtl_enable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_37 ... RTL_GIGA_MAC_VER_38: rtl_eri_set_bits(tp, 0xd4, 0x0c00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xc0ac, 0, 0x1f80); break; default: @@ -2912,7 +2912,7 @@ static void rtl_disable_exit_l1(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_34 ... RTL_GIGA_MAC_VER_38: rtl_eri_clear_bits(tp, 0xd4, 0x1f00); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xc0ac, 0x1f80, 0); break; default: @@ -2950,7 +2950,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: /* reset ephy tx/rx disable timer */ r8168_mac_ocp_modify(tp, 0xe094, 0xff00, 0); /* chip can trigger L1.2 */ @@ -2962,7 +2962,7 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) } else { switch (tp->mac_version) { case RTL_GIGA_MAC_VER_46 ... RTL_GIGA_MAC_VER_48: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: r8168_mac_ocp_modify(tp, 0xe092, 0x00ff, 0); break; default: @@ -4094,7 +4094,7 @@ static void rtl8169_cleanup(struct rtl8169_private *tp) RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 666); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_LAST: rtl_enable_rxdvgate(tp); fsleep(2000); break; @@ -4251,7 +4251,7 @@ static unsigned int rtl_quirk_packet_padto(struct rtl8169_private *tp, switch (tp->mac_version) { case RTL_GIGA_MAC_VER_34: - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: padto = max_t(unsigned int, padto, ETH_ZLEN); break; default: @@ -5302,7 +5302,7 @@ static void rtl_hw_initialize(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_40 ... 
RTL_GIGA_MAC_VER_48: rtl_hw_init_8168g(tp); break; - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: rtl_hw_init_8125(tp); break; default: @@ -5327,7 +5327,7 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_24: return JUMBO_6K; /* RTL8125/8126 */ - case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_71: + case RTL_GIGA_MAC_VER_61 ... RTL_GIGA_MAC_VER_LAST: return JUMBO_16K; default: return JUMBO_9K; From 151e13ece86d234213b7f224f0e26a957c0eeb3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 18:02:15 -0700 Subject: [PATCH 307/770] net: ethtool: Adjust exactly ETH_GSTRING_LEN-long stats to use memcpy Many drivers populate the stats buffer using C-String based APIs (e.g. ethtool_sprintf() and ethtool_puts()), usually when building up the list of stats individually (i.e. with a for() loop). This, however, requires that the source strings be populated in such a way as to have a terminating NUL byte in the source. Other drivers populate the stats buffer directly using one big memcpy() of an entire array of strings. No NUL termination is needed here, as the bytes are being directly passed through. Yet others will build up the stats buffer individually, but also use memcpy(). This, too, does not need NUL termination of the source strings. However, there are cases where the strings that populate the source stats strings are exactly ETH_GSTRING_LEN long, and GCC 15's -Wunterminated-string-initialization option complains that the trailing NUL byte has been truncated. This situation is fine only if the driver is using the memcpy() approach. If the C-String APIs are used, the destination string name will have its final byte truncated by the required trailing NUL byte applied by the C-string API. For drivers that are already using memcpy() but have initializers that truncate the NUL terminator, mark their source strings as __nonstring to silence the GCC warnings. For drivers that have initializers that truncate the NUL terminator and are using the C-String APIs, switch to memcpy() to avoid destination string truncation and mark their source strings as __nonstring to silence the GCC warnings. (Also introduce ethtool_cpy() as a helper to make this an easy replacement). 
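A minimal sketch of the failure mode (hypothetical array name; the initializer is one of the real 32-character offenders from the warnings listed next):

static const char stats_names[][ETH_GSTRING_LEN] __nonstring_array = {
	"TxFramesAbortedDueToXSCollisions",	/* 32 chars, no room for NUL */
};

/* Fine: copies the fixed-size buffer, no terminator needed. */
memcpy(data, stats_names[0], ETH_GSTRING_LEN);

/* Broken here: a strlen()-based helper such as ethtool_puts() would read
 * past the end of stats_names[0] looking for a NUL byte.
 */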
Specifically the following warnings were investigated and addressed: ../drivers/net/ethernet/chelsio/cxgb/cxgb2.c:364:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 364 | "TxFramesAbortedDueToXSCollisions", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:165:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 165 | { ENETC_PM_R1523X(0), "MAC rx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/freescale/enetc/enetc_ethtool.c:190:33: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 190 | { ENETC_PM_T1523X(0), "MAC tx 1523 to max-octet packets" }, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/google/gve/gve_ethtool.c:76:9: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 76 | "adminq_dcfg_device_resources_cnt", "adminq_set_driver_parameter_cnt", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:117:53: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 117 | STMMAC_STAT(ptp_rx_msg_type_pdelay_follow_up), | ^ ../drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c:46:12: note: in definition of macro 'STMMAC_STAT' 46 | { #m, sizeof_field(struct stmmac_extra_stats, m), \ | ^ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:328:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 328 | .str = "a_mac_control_frames_transmitted", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c:340:24: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 340 | .str = "a_pause_mac_ctrl_frames_received", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Kees Cook Reviewed-by: Petr Machata # for mlxsw Reviewed-by: Harshitha Ramamurthy Link: https://patch.msgid.link/20250416010210.work.904-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 2 +- drivers/net/ethernet/freescale/enetc/enetc_ethtool.c | 4 ++-- drivers/net/ethernet/google/gve/gve_ethtool.c | 4 ++-- .../net/ethernet/mellanox/mlxsw/spectrum_ethtool.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 +- include/linux/ethtool.h | 11 +++++++++++ 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 3b7068832f95e..4a0e2d2eb60a4 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -351,7 +351,7 @@ static void set_msglevel(struct net_device *dev, u32 val) adapter->msg_enable = val; } -static const char 
stats_strings[][ETH_GSTRING_LEN] = { +static const char stats_strings[][ETH_GSTRING_LEN] __nonstring_array = { "TxOctetsOK", "TxOctetsBad", "TxUnicastFramesOK", diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index ece3ae28ba827..f47c8b6cc5118 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -141,7 +141,7 @@ static const struct { static const struct { int reg; - char name[ETH_GSTRING_LEN]; + char name[ETH_GSTRING_LEN] __nonstring; } enetc_port_counters[] = { { ENETC_PM_REOCT(0), "MAC rx ethernet octets" }, { ENETC_PM_RALN(0), "MAC rx alignment errors" }, @@ -264,7 +264,7 @@ static void enetc_get_strings(struct net_device *ndev, u32 stringset, u8 *data) break; for (i = 0; i < ARRAY_SIZE(enetc_port_counters); i++) - ethtool_puts(&data, enetc_port_counters[i].name); + ethtool_cpy(&data, enetc_port_counters[i].name); break; } diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index eae1a7595a695..3c1da0cf3f61e 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -67,7 +67,7 @@ static const char gve_gstrings_tx_stats[][ETH_GSTRING_LEN] = { "tx_xsk_sent[%u]", "tx_xdp_xmit[%u]", "tx_xdp_xmit_errors[%u]" }; -static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] = { +static const char gve_gstrings_adminq_stats[][ETH_GSTRING_LEN] __nonstring_array = { "adminq_prod_cnt", "adminq_cmd_fail", "adminq_timeouts", "adminq_describe_device_cnt", "adminq_cfg_device_resources_cnt", "adminq_register_page_list_cnt", "adminq_unregister_page_list_cnt", @@ -113,7 +113,7 @@ static void gve_get_strings(struct net_device *netdev, u32 stringset, u8 *data) i); for (i = 0; i < ARRAY_SIZE(gve_gstrings_adminq_stats); i++) - ethtool_puts(&s, gve_gstrings_adminq_stats[i]); + ethtool_cpy(&s, gve_gstrings_adminq_stats[i]); break; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c index 3f64cdbabfa3c..0a8fb9c842d3b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ethtool.c @@ -262,7 +262,7 @@ static int mlxsw_sp_port_set_pauseparam(struct net_device *dev, } struct mlxsw_sp_port_hw_stats { - char str[ETH_GSTRING_LEN]; + char str[ETH_GSTRING_LEN] __nonstring; u64 (*getter)(const char *payload); bool cells_bytes; }; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 918a32f8fda80..844f7d516a40b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -37,7 +37,7 @@ #define ETHTOOL_DMA_OFFSET 55 struct stmmac_stats { - char stat_string[ETH_GSTRING_LEN]; + char stat_string[ETH_GSTRING_LEN] __nonstring; int sizeof_stat; int stat_offset; }; diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 013d258586420..7edb5f5e71345 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1330,6 +1330,17 @@ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); */ extern void ethtool_puts(u8 **data, const char *str); +/** + * ethtool_cpy - Write possibly-not-NUL-terminated string to ethtool string data + * @data: Pointer to a pointer to the start of string to write into + * @str: NUL-byte padded char array of size ETH_GSTRING_LEN to copy 
from + */ +#define ethtool_cpy(data, str) do { \ + BUILD_BUG_ON(sizeof(str) != ETH_GSTRING_LEN); \ + memcpy(*(data), str, ETH_GSTRING_LEN); \ + *(data) += ETH_GSTRING_LEN; \ +} while (0) + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; From cfba1d1b61ae3f32e4bc06e9860711a4488d98b7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 15 Apr 2025 19:01:14 -0700 Subject: [PATCH 308/770] net/mlx5e: ethtool: Fix formatting of ptp_rq0_csum_complete_tail_slow The new GCC 15 warning -Wunterminated-string-initialization reports: In file included from drivers/net/ethernet/mellanox/mlx5/core/en.h:55, from drivers/net/ethernet/mellanox/mlx5/core/en_stats.c:34: drivers/net/ethernet/mellanox/mlx5/core/en_stats.h:57:46: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (33 chars into 32 available) [-Wunterminated-string-initialization] 57 | #define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) | ^~~~~~~~~~~ drivers/net/ethernet/mellanox/mlx5/core/en_stats.c:2279:11: note: in expansion of macro 'MLX5E_DECLARE_PTP_RQ_STAT' 2279 | { MLX5E_DECLARE_PTP_RQ_STAT(struct mlx5e_rq_stats, csum_complete_tail_slow) }, | ^~~~~~~~~~~~~~~~~~~~~~~~~ This stat string is being used in ethtool_sprintf(), so it must be a valid NUL-terminated string. Currently the string lacks the final NUL byte (as GCC warns), but by absolute luck, the next byte in memory is a space (decimal 32) followed by a NUL. "format" is immediately followed by little-endian size_t: struct counter_desc { char format[32]; /* 0 32 */ size_t offset; /* 32 8 */ }; The "offset" member is populated by the stats member offset: #define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) which for this struct mlx5e_rq_stats member, csum_complete_tail_slow, is 32, or space, and then the rest of the "offset" bytes are NULs. struct mlx5e_rq_stats { ... u64 csum_complete_tail_slow; /* 32 8 */ The use of vsnprintf(), within ethtool_sprintf(), reads past the end of "format" and sees the format string as "ptp_rq%d_csum_complete_tail_slow ", with %d getting resolved by MLX5E_PTP_CHANNEL_IX (value 0): ethtool_sprintf(data, ptp_rq_stats_desc[i].format, MLX5E_PTP_CHANNEL_IX); With an output result of "ptp_rq0_csum_complete_tail_slow", which gets precisely truncated to 31 characters with a trailing NUL. So, instead of accidentally getting this correct due to the NUL bytes at the end of the size_t that happens to follow the format string, just make the string initializer 1 byte shorter by replacing "%d" with "0", since MLX5E_PTP_CHANNEL_IX is already hard-coded. This results in no initializer truncation and no need to call sprintf(). 
Signed-off-by: Kees Cook Reviewed-by: Dragos Tatulea Link: https://patch.msgid.link/20250416020109.work.297-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c index 1c121b435016d..19664fa7f2171 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c @@ -2424,8 +2424,7 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(ptp) } if (priv->rx_ptp_opened) { for (i = 0; i < NUM_PTP_RQ_STATS; i++) - ethtool_sprintf(data, ptp_rq_stats_desc[i].format, - MLX5E_PTP_CHANNEL_IX); + ethtool_puts(data, ptp_rq_stats_desc[i].format); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index 8de6fcbd3a033..def5dea1463db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -54,7 +54,7 @@ #define MLX5E_DECLARE_PTP_TX_STAT(type, fld) "ptp_tx%d_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CH_STAT(type, fld) "ptp_ch_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_PTP_CQ_STAT(type, fld) "ptp_cq%d_"#fld, offsetof(type, fld) -#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq%d_"#fld, offsetof(type, fld) +#define MLX5E_DECLARE_PTP_RQ_STAT(type, fld) "ptp_rq0_"#fld, offsetof(type, fld) #define MLX5E_DECLARE_QOS_TX_STAT(type, fld) "qos_tx%d_"#fld, offsetof(type, fld) From 22cbc1ee268b7ec0000848708944daa61c6e4909 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 20:04:47 -0700 Subject: [PATCH 309/770] netdev: fix the locking for netdev notifications Kuniyuki reports that the assert for netdev lock fires when there are netdev event listeners (otherwise we skip the netlink event generation). Correct the locking when coming from the notifier. The NETDEV_XDP_FEAT_CHANGE notifier is already fully locked, it's the documentation that's incorrect. Fixes: 99e44f39a8f7 ("netdev: depend on netdev->lock for xdp features") Reported-by: syzkaller Reported-by: Kuniyuki Iwashima Link: https://lore.kernel.org/20250410171019.62128-1-kuniyu@amazon.com Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250416030447.1077551-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdevices.rst | 4 +++- include/linux/netdevice.h | 2 +- include/net/netdev_lock.h | 16 ++++++++++++++++ net/core/lock_debug.c | 4 +++- net/core/netdev-genl.c | 4 ++++ 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index 77fe1f7b93be0..7ebb6c36482de 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -387,12 +387,14 @@ For device drivers that implement shaping or queue management APIs, some of the notifiers (``enum netdev_cmd``) are running under the netdev instance lock. 
+The following netdev notifiers are always run under the instance lock: +* ``NETDEV_XDP_FEAT_CHANGE`` + For devices with locked ops, currently only the following notifiers are running under the lock: * ``NETDEV_CHANGE`` * ``NETDEV_REGISTER`` * ``NETDEV_UP`` -* ``NETDEV_XDP_FEAT_CHANGE`` The following notifiers are running without the lock: * ``NETDEV_UNREGISTER`` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e6036b82ef4cf..0321fd952f708 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2520,7 +2520,7 @@ struct net_device { * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: - * @up, @moving_ns, @nd_net, @xdp_flags + * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h index 5706835a660c1..2a753813f8496 100644 --- a/include/net/netdev_lock.h +++ b/include/net/netdev_lock.h @@ -48,6 +48,22 @@ static inline void netdev_unlock_ops(struct net_device *dev) netdev_unlock(dev); } +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c index 6fade574bc2a9..9e9fb25314b9c 100644 --- a/net/core/lock_debug.c +++ b/net/core/lock_debug.c @@ -18,10 +18,12 @@ int netdev_debug_event(struct notifier_block *nb, unsigned long event, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; case NETDEV_CHANGE: case NETDEV_REGISTER: case NETDEV_UP: - case NETDEV_XDP_FEAT_CHANGE: netdev_ops_assert_locked(dev); fallthrough; case NETDEV_DOWN: diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b64c614a00c47..2c104947d224f 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -963,10 +963,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); From 7c6cd70ffd0ffe4ec95b6eca375a4e5dd5003819 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Apr 2025 11:09:33 +0100 Subject: [PATCH 310/770] net: stmmac: dwc-qos: use PHY clock-stop capability Use the PHY clock-stop capability when programming the MAC LPI mode, which allows the transmit clock to the PHY to be gated. Tested on the Jetson Xavier NX platform. 
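For reference, the opt-in is a single platform-data flag, OR-ed into whatever flags the glue driver already sets (this mirrors the one-line change in the diff below):

	/* dwc-qos glue: let the core gate the PHY TX clock during LPI */
	plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE |
			   STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP;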
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4zi1-000xHh-57@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 583b5c071cd14..fa900b4991d06 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -252,7 +252,8 @@ static int tegra_eqos_probe(struct platform_device *pdev, plat_dat->fix_mac_speed = tegra_eqos_fix_speed; plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; plat_dat->bsp_priv = eqos; - plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE; + plat_dat->flags |= STMMAC_FLAG_SPH_DISABLE | + STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP; return 0; From 01be295b485a7adcd662bec3b4613b4cef7956dc Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Apr 2025 11:26:47 +0100 Subject: [PATCH 311/770] net: stmmac: mediatek: stop initialising plat->mac_interface Mediatek doesn't make use of mac_interface, and none of the in-tree DT files use the mac-mode property. Therefore, mac_interface already follows phy_interface. Remove this unnecessary assignment. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/E1u4zyh-000xVE-PG@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index d178d5ddc7c7a..39421d6a34e41 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -581,7 +581,6 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, int i; priv_plat->phy_mode = plat->phy_interface; - plat->mac_interface = priv_plat->phy_mode; if (priv_plat->mac_wol) plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL; else From 2b905deb43ea0b67fa8448fc9c15dacb068f45b6 Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 16 Apr 2025 19:56:23 +0800 Subject: [PATCH 312/770] net: Delete the duplicated outer () in the SOCK_SKB_CB_OFFSET macro definition In the definition of the SOCK_SKB_CB_OFFSET macro, delete the duplicated outer parentheses. Signed-off-by: Zijun Hu Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-fix_net-v1-1-d544c9f3f169@quicinc.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index bb4d6189292fe..e223102337c77 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2604,8 +2604,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) From 1df4a945444f071a9c5e09580a485919c42d4de5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Wed, 16 Apr 2025 10:06:12 -0700 Subject: [PATCH 313/770] trace: tcp: Add const qualifier to skb parameter in tcp_probe event Change the tcp_probe tracepoint to accept a const struct sk_buff parameter instead of a non-const one. 
This improves type safety and better reflects that the skb is not modified within the tracepoint implementation. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250416-tcp_probe-v1-1-1edc3c5a1cb8@debian.org Signed-off-by: Jakub Kicinski --- include/trace/events/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 75d3d53a3832c..53e878fa14d14 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -293,7 +293,7 @@ DECLARE_TRACE(tcp_cwnd_reduction_tp, TRACE_EVENT(tcp_probe, - TP_PROTO(struct sock *sk, struct sk_buff *skb), + TP_PROTO(struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), From 8066e388be48f1ad62b0449dc1d31a25489fa12a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Apr 2025 13:08:40 -0700 Subject: [PATCH 314/770] net: add UAPI to the header guard in various network headers fib_rule, ip6_tunnel, and a whole lot of if_* headers lack the customary _UAPI in the header guard. Without it YNL build can't protect from in tree and system headers both getting included. YNL doesn't need most of these but it's annoying to have to fix them one by one. Note that header installation strips this _UAPI prefix so this should result in no change to the end user. Acked-by: Jamal Hadi Salim Reviewed-by: Jason Xing Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250416200840.1338195-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/fib_rules.h | 4 ++-- include/uapi/linux/if_addr.h | 4 ++-- include/uapi/linux/if_addrlabel.h | 4 ++-- include/uapi/linux/if_alg.h | 6 +++--- include/uapi/linux/if_arcnet.h | 6 +++--- include/uapi/linux/if_bonding.h | 6 +++--- include/uapi/linux/if_fc.h | 6 +++--- include/uapi/linux/if_hippi.h | 6 +++--- include/uapi/linux/if_packet.h | 4 ++-- include/uapi/linux/if_plip.h | 4 ++-- include/uapi/linux/if_slip.h | 4 ++-- include/uapi/linux/if_x25.h | 6 +++--- include/uapi/linux/if_xdp.h | 6 +++--- include/uapi/linux/ip6_tunnel.h | 4 ++-- include/uapi/linux/net_dropmon.h | 4 ++-- include/uapi/linux/net_tstamp.h | 6 +++--- include/uapi/linux/netlink_diag.h | 4 ++-- include/uapi/linux/pkt_cls.h | 4 ++-- include/uapi/linux/pkt_sched.h | 4 ++-- 19 files changed, 46 insertions(+), 46 deletions(-) diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2df6e4035d50b..418c4be697ade 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_FIB_RULES_H -#define __LINUX_FIB_RULES_H +#ifndef _UAPI__LINUX_FIB_RULES_H +#define _UAPI__LINUX_FIB_RULES_H #include #include diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index 1c392dd95a5ee..aa7958b4e41d0 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_ADDR_H -#define __LINUX_IF_ADDR_H +#ifndef _UAPI__LINUX_IF_ADDR_H +#define _UAPI__LINUX_IF_ADDR_H #include #include diff --git a/include/uapi/linux/if_addrlabel.h b/include/uapi/linux/if_addrlabel.h index d1f5974c76e10..e69db764fbba1 100644 --- a/include/uapi/linux/if_addrlabel.h +++ b/include/uapi/linux/if_addrlabel.h @@ -8,8 +8,8 @@ * YOSHIFUJI Hideaki @ USAGI/WIDE */ -#ifndef __LINUX_IF_ADDRLABEL_H -#define __LINUX_IF_ADDRLABEL_H +#ifndef _UAPI__LINUX_IF_ADDRLABEL_H +#define _UAPI__LINUX_IF_ADDRLABEL_H 
#include diff --git a/include/uapi/linux/if_alg.h b/include/uapi/linux/if_alg.h index 0824fbc026a1c..b35871cbeed7d 100644 --- a/include/uapi/linux/if_alg.h +++ b/include/uapi/linux/if_alg.h @@ -11,8 +11,8 @@ * */ -#ifndef _LINUX_IF_ALG_H -#define _LINUX_IF_ALG_H +#ifndef _UAPI_LINUX_IF_ALG_H +#define _UAPI_LINUX_IF_ALG_H #include @@ -58,4 +58,4 @@ struct af_alg_iv { #define ALG_OP_DECRYPT 0 #define ALG_OP_ENCRYPT 1 -#endif /* _LINUX_IF_ALG_H */ +#endif /* _UAPI_LINUX_IF_ALG_H */ diff --git a/include/uapi/linux/if_arcnet.h b/include/uapi/linux/if_arcnet.h index b122cfac71288..473569eaf692b 100644 --- a/include/uapi/linux/if_arcnet.h +++ b/include/uapi/linux/if_arcnet.h @@ -14,8 +14,8 @@ * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_ARCNET_H -#define _LINUX_IF_ARCNET_H +#ifndef _UAPI_LINUX_IF_ARCNET_H +#define _UAPI_LINUX_IF_ARCNET_H #include #include @@ -127,4 +127,4 @@ struct archdr { } soft; }; -#endif /* _LINUX_IF_ARCNET_H */ +#endif /* _UAPI_LINUX_IF_ARCNET_H */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837db..3bcc03f3aa4f0 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -41,8 +41,8 @@ * - added definitions for various XOR hashing policies */ -#ifndef _LINUX_IF_BONDING_H -#define _LINUX_IF_BONDING_H +#ifndef _UAPI_LINUX_IF_BONDING_H +#define _UAPI_LINUX_IF_BONDING_H #include #include @@ -152,4 +152,4 @@ enum { }; #define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) -#endif /* _LINUX_IF_BONDING_H */ +#endif /* _UAPI_LINUX_IF_BONDING_H */ diff --git a/include/uapi/linux/if_fc.h b/include/uapi/linux/if_fc.h index 3e3173282cc36..ff5ab92d16c2d 100644 --- a/include/uapi/linux/if_fc.h +++ b/include/uapi/linux/if_fc.h @@ -18,8 +18,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#ifndef _LINUX_IF_FC_H -#define _LINUX_IF_FC_H +#ifndef _UAPI_LINUX_IF_FC_H +#define _UAPI_LINUX_IF_FC_H #include @@ -49,4 +49,4 @@ struct fcllc { __be16 ethertype; /* ether type field */ }; -#endif /* _LINUX_IF_FC_H */ +#endif /* _UAPI_LINUX_IF_FC_H */ diff --git a/include/uapi/linux/if_hippi.h b/include/uapi/linux/if_hippi.h index 785a1452a66c8..42c4ffd11daea 100644 --- a/include/uapi/linux/if_hippi.h +++ b/include/uapi/linux/if_hippi.h @@ -20,8 +20,8 @@ * 2 of the License, or (at your option) any later version. 
*/ -#ifndef _LINUX_IF_HIPPI_H -#define _LINUX_IF_HIPPI_H +#ifndef _UAPI_LINUX_IF_HIPPI_H +#define _UAPI_LINUX_IF_HIPPI_H #include #include @@ -151,4 +151,4 @@ struct hippi_hdr { struct hippi_snap_hdr snap; } __attribute__((packed)); -#endif /* _LINUX_IF_HIPPI_H */ +#endif /* _UAPI_LINUX_IF_HIPPI_H */ diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 1d2718dd96472..6cd1d7a41dfb7 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_IF_PACKET_H -#define __LINUX_IF_PACKET_H +#ifndef _UAPI__LINUX_IF_PACKET_H +#define _UAPI__LINUX_IF_PACKET_H #include #include diff --git a/include/uapi/linux/if_plip.h b/include/uapi/linux/if_plip.h index 495a366112f29..054d86a9c6e6b 100644 --- a/include/uapi/linux/if_plip.h +++ b/include/uapi/linux/if_plip.h @@ -9,8 +9,8 @@ * */ -#ifndef _LINUX_IF_PLIP_H -#define _LINUX_IF_PLIP_H +#ifndef _UAPI_LINUX_IF_PLIP_H +#define _UAPI_LINUX_IF_PLIP_H #include diff --git a/include/uapi/linux/if_slip.h b/include/uapi/linux/if_slip.h index 65937be53103f..299bf7adc8624 100644 --- a/include/uapi/linux/if_slip.h +++ b/include/uapi/linux/if_slip.h @@ -6,8 +6,8 @@ * KISS TNC driver. */ -#ifndef __LINUX_SLIP_H -#define __LINUX_SLIP_H +#ifndef _UAPI__LINUX_SLIP_H +#define _UAPI__LINUX_SLIP_H #define SL_MODE_SLIP 0 #define SL_MODE_CSLIP 1 diff --git a/include/uapi/linux/if_x25.h b/include/uapi/linux/if_x25.h index 3a5938e38370d..861cfa983db44 100644 --- a/include/uapi/linux/if_x25.h +++ b/include/uapi/linux/if_x25.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. */ -#ifndef _IF_X25_H -#define _IF_X25_H +#ifndef _UAPI_IF_X25_H +#define _UAPI_IF_X25_H #include @@ -24,4 +24,4 @@ #define X25_IFACE_DISCONNECT 0x02 #define X25_IFACE_PARAMS 0x03 -#endif /* _IF_X25_H */ +#endif /* _UAPI_IF_X25_H */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 42869770776ec..44f2bb93e7e69 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -7,8 +7,8 @@ * Magnus Karlsson */ -#ifndef _LINUX_IF_XDP_H -#define _LINUX_IF_XDP_H +#ifndef _UAPI_LINUX_IF_XDP_H +#define _UAPI_LINUX_IF_XDP_H #include @@ -180,4 +180,4 @@ struct xdp_desc { /* TX packet carries valid metadata. */ #define XDP_TX_METADATA (1 << 1) -#endif /* _LINUX_IF_XDP_H */ +#endif /* _UAPI_LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/ip6_tunnel.h b/include/uapi/linux/ip6_tunnel.h index 0245269b037c8..85182a839d42a 100644 --- a/include/uapi/linux/ip6_tunnel.h +++ b/include/uapi/linux/ip6_tunnel.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IP6_TUNNEL_H -#define _IP6_TUNNEL_H +#ifndef _UAPI_IP6_TUNNEL_H +#define _UAPI_IP6_TUNNEL_H #include #include /* For IFNAMSIZ. 
*/ diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 84f622a66a7ac..9dd41c2f58a62 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NET_DROPMON_H -#define __NET_DROPMON_H +#ifndef _UAPI__NET_DROPMON_H +#define _UAPI__NET_DROPMON_H #include #include diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 383213de612a8..a93e6ea37fb3a 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -7,8 +7,8 @@ * */ -#ifndef _NET_TIMESTAMPING_H -#define _NET_TIMESTAMPING_H +#ifndef _UAPI_NET_TIMESTAMPING_H +#define _UAPI_NET_TIMESTAMPING_H #include #include /* for SO_TIMESTAMPING */ @@ -216,4 +216,4 @@ struct sock_txtime { __u32 flags; /* as defined by enum txtime_flags */ }; -#endif /* _NET_TIMESTAMPING_H */ +#endif /* _UAPI_NET_TIMESTAMPING_H */ diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index dfa61be43d2f0..ff28200204bbb 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __NETLINK_DIAG_H__ -#define __NETLINK_DIAG_H__ +#ifndef _UAPI__NETLINK_DIAG_H__ +#define _UAPI__NETLINK_DIAG_H__ #include diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 2c32080416b5a..4908213641656 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_CLS_H -#define __LINUX_PKT_CLS_H +#ifndef _UAPI__LINUX_PKT_CLS_H +#define _UAPI__LINUX_PKT_CLS_H #include #include diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 25a9a47001cdd..9ea8743957172 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_PKT_SCHED_H -#define __LINUX_PKT_SCHED_H +#ifndef _UAPI__LINUX_PKT_SCHED_H +#define _UAPI__LINUX_PKT_SCHED_H #include #include From b375984f0df0ed5a45e8e9dc5600d6666cdb92e7 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:29 -0400 Subject: [PATCH 315/770] net: stmmac: move frag_size handling out of spin_lock The upcoming patch will extract verification logic into a new module, MMSV (MAC Merge Software Verification). MMSV will handle most FPE fields, except frag_size. It introduces its own lock (mmsv->lock), replacing fpe_cfg->lock. Since frag_size handling remains in the driver, the existing rtnl_lock() is sufficient. Move frag_size handling out of spin_lock_irqsave() to keep the upcoming patch a pure refactoring without behavior changes. 
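The resulting shape of stmmac_set_mm() (sketch; see the diff below for the exact code):

	/* frag_size is rtnl-protected, so handle it before the spinlock */
	stmmac_fpe_set_add_frag_size(priv, frag_size);

	spin_lock_irqsave(&fpe_cfg->lock, flags);
	/* ... only verification/FPE state is touched under the lock ... */
	spin_unlock_irqrestore(&fpe_cfg->lock, flags);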
Signed-off-by: Faizal Rahim Reviewed-by: Vladimir Oltean Reviewed-by: Furong Xu <0x1207@gmail.com> Signed-off-by: Tony Nguyen --- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 844f7d516a40b..53f51eebb7467 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1216,6 +1216,10 @@ static int stmmac_get_mm(struct net_device *ndev, if (!stmmac_fpe_supported(priv)) return -EOPNOTSUPP; + state->rx_min_frag_size = ETH_ZLEN; + frag_size = stmmac_fpe_get_add_frag_size(priv); + state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); + spin_lock_irqsave(&priv->fpe_cfg.lock, flags); state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; @@ -1224,7 +1228,6 @@ static int stmmac_get_mm(struct net_device *ndev, state->verify_time = priv->fpe_cfg.verify_time; state->tx_enabled = priv->fpe_cfg.tx_enabled; state->verify_status = priv->fpe_cfg.status; - state->rx_min_frag_size = ETH_ZLEN; /* FPE active if common tx_enabled and * (verification success or disabled(forced)) @@ -1236,9 +1239,6 @@ static int stmmac_get_mm(struct net_device *ndev, else state->tx_active = false; - frag_size = stmmac_fpe_get_add_frag_size(priv); - state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); - spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); return 0; @@ -1258,6 +1258,8 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, if (err) return err; + stmmac_fpe_set_add_frag_size(priv, frag_size); + /* Wait for the verification that's currently in progress to finish */ timer_shutdown_sync(&fpe_cfg->verify_timer); @@ -1271,7 +1273,6 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, if (!cfg->verify_enabled) fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - stmmac_fpe_set_add_frag_size(priv, frag_size); stmmac_fpe_apply(priv); spin_unlock_irqrestore(&fpe_cfg->lock, flags); From 9ff2aa4206eff40a202e425f232036bc84ad4c0e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 17 Mar 2025 23:07:30 -0400 Subject: [PATCH 316/770] net: ethtool: mm: extract stmmac verification logic into common library It appears that stmmac is not the only hardware which requires a software-driven verification state machine for the MAC Merge layer. While on the one hand it's good to encourage hardware implementations, on the other hand it's quite difficult to tolerate multiple drivers implementing independently fairly non-trivial logic. Extract the hardware-independent logic from stmmac into library code and put it in ethtool. Name the state structure "mmsv" for MAC Merge Software Verification. Let this expose an operations structure for executing the hardware stuff: sync hardware with the tx_active boolean (result of verification process), enable/disable the pMAC, send mPackets, notify library of external events (reception of mPackets), as well as link state changes. Note that it is assumed that the external events are received in hardirq context. If they are not, it is probably a good idea to disable hardirqs when calling ethtool_mmsv_event_handle(), because the library does not do so. Also, the MM software verification process has no business with the tx_min_frag_size, that is all the driver's to handle. 
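From a driver author's point of view the contract looks roughly like this (hedged sketch; the three callbacks are the ones the stmmac conversion below implements, but the exact ethtool_mmsv_ops layout should be checked against include/linux/ethtool.h):

static const struct ethtool_mmsv_ops stmmac_mmsv_ops_sketch = {
	/* sync hardware with the verification result (tx_active) */
	.configure_tx	= stmmac_fpe_configure_tx,
	/* enable/disable the preemptible MAC */
	.configure_pmac	= stmmac_fpe_configure_pmac,
	/* transmit verify/response mPackets */
	.send_mpacket	= stmmac_fpe_send_mpacket,
};

External events (e.g. mPacket reception in the ISR) are then fed back to the library via ethtool_mmsv_event_handle(), as described above.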
Signed-off-by: Vladimir Oltean Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Tested-by: Choong Yong Liang Tested-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Signed-off-by: Tony Nguyen --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 1 + drivers/net/ethernet/stmicro/stmmac/stmmac.h | 16 +- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 42 +-- .../net/ethernet/stmicro/stmmac/stmmac_fpe.c | 174 +++-------- .../net/ethernet/stmicro/stmmac/stmmac_fpe.h | 5 - .../net/ethernet/stmicro/stmmac/stmmac_main.c | 8 +- include/linux/ethtool.h | 73 +++++ net/ethtool/mm.c | 275 +++++++++++++++++- 8 files changed, 392 insertions(+), 202 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 3c820ef567759..59c742a8b3adf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -3,6 +3,7 @@ config STMMAC_ETH tristate "STMicroelectronics Multi-Gigabit Ethernet driver" depends on HAS_IOMEM && HAS_DMA depends on PTP_1588_CLOCK_OPTIONAL + depends on ETHTOOL_NETLINK select MII select PCS_XPCS select PAGE_POOL diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index bddfa0f4aa217..1686e559f66e2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -149,21 +149,9 @@ struct stmmac_channel { }; struct stmmac_fpe_cfg { - /* Serialize access to MAC Merge state between ethtool requests - * and link state updates. - */ - spinlock_t lock; - + struct ethtool_mmsv mmsv; const struct stmmac_fpe_reg *reg; - u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ - - enum ethtool_mm_verify_status status; - struct timer_list verify_timer; - bool verify_enabled; - int verify_retries; - bool pmac_enabled; - u32 verify_time; - bool tx_enabled; + u32 fpe_csr; /* MAC_FPE_CTRL_STS reg cache */ }; struct stmmac_tc_entry { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 53f51eebb7467..f702f7b7bf9fa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -1210,7 +1210,6 @@ static int stmmac_get_mm(struct net_device *ndev, struct ethtool_mm_state *state) { struct stmmac_priv *priv = netdev_priv(ndev); - unsigned long flags; u32 frag_size; if (!stmmac_fpe_supported(priv)) @@ -1220,26 +1219,7 @@ static int stmmac_get_mm(struct net_device *ndev, frag_size = stmmac_fpe_get_add_frag_size(priv); state->tx_min_frag_size = ethtool_mm_frag_size_add_to_min(frag_size); - spin_lock_irqsave(&priv->fpe_cfg.lock, flags); - - state->max_verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; - state->verify_enabled = priv->fpe_cfg.verify_enabled; - state->pmac_enabled = priv->fpe_cfg.pmac_enabled; - state->verify_time = priv->fpe_cfg.verify_time; - state->tx_enabled = priv->fpe_cfg.tx_enabled; - state->verify_status = priv->fpe_cfg.status; - - /* FPE active if common tx_enabled and - * (verification success or disabled(forced)) - */ - if (state->tx_enabled && - (state->verify_status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || - state->verify_status == ETHTOOL_MM_VERIFY_STATUS_DISABLED)) - state->tx_active = true; - else - state->tx_active = false; - - spin_unlock_irqrestore(&priv->fpe_cfg.lock, flags); + ethtool_mmsv_get_mm(&priv->fpe_cfg.mmsv, state); return 0; } @@ -1248,8 +1228,6 @@ static int stmmac_set_mm(struct net_device 
*ndev, struct ethtool_mm_cfg *cfg, struct netlink_ext_ack *extack) { struct stmmac_priv *priv = netdev_priv(ndev); - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - unsigned long flags; u32 frag_size; int err; @@ -1259,23 +1237,7 @@ static int stmmac_set_mm(struct net_device *ndev, struct ethtool_mm_cfg *cfg, return err; stmmac_fpe_set_add_frag_size(priv, frag_size); - - /* Wait for the verification that's currently in progress to finish */ - timer_shutdown_sync(&fpe_cfg->verify_timer); - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - fpe_cfg->verify_enabled = cfg->verify_enabled; - fpe_cfg->pmac_enabled = cfg->pmac_enabled; - fpe_cfg->verify_time = cfg->verify_time; - fpe_cfg->tx_enabled = cfg->tx_enabled; - - if (!cfg->verify_enabled) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - - stmmac_fpe_apply(priv); - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); + ethtool_mmsv_set_mm(&priv->fpe_cfg.mmsv, cfg); return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c index 3a4bee029c7f6..75b470ee621a3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.c @@ -27,12 +27,6 @@ #define STMMAC_MAC_FPE_CTRL_STS_SVER BIT(1) #define STMMAC_MAC_FPE_CTRL_STS_EFPE BIT(0) -/* FPE link-partner hand-shaking mPacket type */ -enum stmmac_mpacket_type { - MPACKET_VERIFY = 0, - MPACKET_RESPONSE = 1, -}; - struct stmmac_fpe_reg { const u32 mac_fpe_reg; /* offset of MAC_FPE_CTRL_STS */ const u32 mtl_fpe_reg; /* offset of MTL_FPE_CTRL_STS */ @@ -48,10 +42,10 @@ bool stmmac_fpe_supported(struct stmmac_priv *priv) priv->hw->mac->fpe_map_preemption_class; } -static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, - bool pmac_enable) +static void stmmac_fpe_configure_tx(struct ethtool_mmsv *mmsv, bool tx_enable) { - struct stmmac_fpe_cfg *cfg = &priv->fpe_cfg; + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); const struct stmmac_fpe_reg *reg = cfg->reg; u32 num_rxq = priv->plat->rx_queues_to_use; void __iomem *ioaddr = priv->ioaddr; @@ -68,6 +62,15 @@ static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, cfg->fpe_csr = 0; } writel(cfg->fpe_csr, ioaddr + reg->mac_fpe_reg); +} + +static void stmmac_fpe_configure_pmac(struct ethtool_mmsv *mmsv, bool pmac_enable) +{ + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); + const struct stmmac_fpe_reg *reg = cfg->reg; + void __iomem *ioaddr = priv->ioaddr; + u32 value; value = readl(ioaddr + reg->int_en_reg); @@ -85,47 +88,45 @@ static void stmmac_fpe_configure(struct stmmac_priv *priv, bool tx_enable, writel(value, ioaddr + reg->int_en_reg); } -static void stmmac_fpe_send_mpacket(struct stmmac_priv *priv, - enum stmmac_mpacket_type type) +static void stmmac_fpe_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket type) { - const struct stmmac_fpe_reg *reg = priv->fpe_cfg.reg; + struct stmmac_fpe_cfg *cfg = container_of(mmsv, struct stmmac_fpe_cfg, mmsv); + struct stmmac_priv *priv = container_of(cfg, struct stmmac_priv, fpe_cfg); + const struct stmmac_fpe_reg *reg = cfg->reg; void __iomem *ioaddr = priv->ioaddr; - u32 value = priv->fpe_cfg.fpe_csr; + u32 value = cfg->fpe_csr; - if (type == MPACKET_VERIFY) + if (type == ETHTOOL_MPACKET_VERIFY) value |= 
STMMAC_MAC_FPE_CTRL_STS_SVER; - else if (type == MPACKET_RESPONSE) + else if (type == ETHTOOL_MPACKET_RESPONSE) value |= STMMAC_MAC_FPE_CTRL_STS_SRSP; writel(value, ioaddr + reg->mac_fpe_reg); } +static const struct ethtool_mmsv_ops stmmac_mmsv_ops = { + .configure_tx = stmmac_fpe_configure_tx, + .configure_pmac = stmmac_fpe_configure_pmac, + .send_mpacket = stmmac_fpe_send_mpacket, +}; + static void stmmac_fpe_event_status(struct stmmac_priv *priv, int status) { struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; + struct ethtool_mmsv *mmsv = &fpe_cfg->mmsv; - /* This is interrupt context, just spin_lock() */ - spin_lock(&fpe_cfg->lock); - - if (!fpe_cfg->pmac_enabled || status == FPE_EVENT_UNKNOWN) - goto unlock_out; + if (status == FPE_EVENT_UNKNOWN) + return; - /* LP has sent verify mPacket */ if ((status & FPE_EVENT_RVER) == FPE_EVENT_RVER) - stmmac_fpe_send_mpacket(priv, MPACKET_RESPONSE); + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET); - /* Local has sent verify mPacket */ - if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + if ((status & FPE_EVENT_TVER) == FPE_EVENT_TVER) + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET); - /* LP has sent response mPacket */ - if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP && - fpe_cfg->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; - -unlock_out: - spin_unlock(&fpe_cfg->lock); + if ((status & FPE_EVENT_RRSP) == FPE_EVENT_RRSP) + ethtool_mmsv_event_handle(mmsv, ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET); } void stmmac_fpe_irq_status(struct stmmac_priv *priv) @@ -164,119 +165,16 @@ void stmmac_fpe_irq_status(struct stmmac_priv *priv) stmmac_fpe_event_status(priv, status); } -/** - * stmmac_fpe_verify_timer - Timer for MAC Merge verification - * @t: timer_list struct containing private info - * - * Verify the MAC Merge capability in the local TX direction, by - * transmitting Verify mPackets up to 3 times. Wait until link - * partner responds with a Response mPacket, otherwise fail. 
- */ -static void stmmac_fpe_verify_timer(struct timer_list *t) -{ - struct stmmac_fpe_cfg *fpe_cfg = from_timer(fpe_cfg, t, verify_timer); - struct stmmac_priv *priv = container_of(fpe_cfg, struct stmmac_priv, - fpe_cfg); - unsigned long flags; - bool rearm = false; - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - switch (fpe_cfg->status) { - case ETHTOOL_MM_VERIFY_STATUS_INITIAL: - case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: - if (fpe_cfg->verify_retries != 0) { - stmmac_fpe_send_mpacket(priv, MPACKET_VERIFY); - rearm = true; - } else { - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; - } - - fpe_cfg->verify_retries--; - break; - - case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: - stmmac_fpe_configure(priv, true, true); - break; - - default: - break; - } - - if (rearm) { - mod_timer(&fpe_cfg->verify_timer, - jiffies + msecs_to_jiffies(fpe_cfg->verify_time)); - } - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); -} - -static void stmmac_fpe_verify_timer_arm(struct stmmac_fpe_cfg *fpe_cfg) -{ - if (fpe_cfg->pmac_enabled && fpe_cfg->tx_enabled && - fpe_cfg->verify_enabled && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && - fpe_cfg->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { - timer_setup(&fpe_cfg->verify_timer, stmmac_fpe_verify_timer, 0); - mod_timer(&fpe_cfg->verify_timer, jiffies); - } -} - void stmmac_fpe_init(struct stmmac_priv *priv) { - priv->fpe_cfg.verify_retries = STMMAC_FPE_MM_MAX_VERIFY_RETRIES; - priv->fpe_cfg.verify_time = STMMAC_FPE_MM_MAX_VERIFY_TIME_MS; - priv->fpe_cfg.status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; - timer_setup(&priv->fpe_cfg.verify_timer, stmmac_fpe_verify_timer, 0); - spin_lock_init(&priv->fpe_cfg.lock); + ethtool_mmsv_init(&priv->fpe_cfg.mmsv, priv->dev, + &stmmac_mmsv_ops); if ((!priv->fpe_cfg.reg || !priv->hw->mac->fpe_map_preemption_class) && priv->dma_cap.fpesel) dev_info(priv->device, "FPE is not supported by driver.\n"); } -void stmmac_fpe_apply(struct stmmac_priv *priv) -{ - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - - /* If verification is disabled, configure FPE right away. - * Otherwise let the timer code do it. 
- */ - if (!fpe_cfg->verify_enabled) { - stmmac_fpe_configure(priv, fpe_cfg->tx_enabled, - fpe_cfg->pmac_enabled); - } else { - fpe_cfg->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; - fpe_cfg->verify_retries = STMMAC_FPE_MM_MAX_VERIFY_RETRIES; - - if (netif_running(priv->dev)) - stmmac_fpe_verify_timer_arm(fpe_cfg); - } -} - -void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up) -{ - struct stmmac_fpe_cfg *fpe_cfg = &priv->fpe_cfg; - unsigned long flags; - - timer_shutdown_sync(&fpe_cfg->verify_timer); - - spin_lock_irqsave(&fpe_cfg->lock, flags); - - if (is_up && fpe_cfg->pmac_enabled) { - /* VERIFY process requires pmac enabled when NIC comes up */ - stmmac_fpe_configure(priv, false, true); - - /* New link => maybe new partner => new verification process */ - stmmac_fpe_apply(priv); - } else { - /* No link => turn off EFPE */ - stmmac_fpe_configure(priv, false, false); - } - - spin_unlock_irqrestore(&fpe_cfg->lock, flags); -} - int stmmac_fpe_get_add_frag_size(struct stmmac_priv *priv) { const struct stmmac_fpe_reg *reg = priv->fpe_cfg.reg; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h index b884eac7142d0..3fc46acf7001b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_fpe.h @@ -9,15 +9,10 @@ #include #include -#define STMMAC_FPE_MM_MAX_VERIFY_RETRIES 3 -#define STMMAC_FPE_MM_MAX_VERIFY_TIME_MS 128 - struct stmmac_priv; -void stmmac_fpe_link_state_handle(struct stmmac_priv *priv, bool is_up); bool stmmac_fpe_supported(struct stmmac_priv *priv); void stmmac_fpe_init(struct stmmac_priv *priv); -void stmmac_fpe_apply(struct stmmac_priv *priv); void stmmac_fpe_irq_status(struct stmmac_priv *priv); int stmmac_fpe_get_add_frag_size(struct stmmac_priv *priv); void stmmac_fpe_set_add_frag_size(struct stmmac_priv *priv, u32 add_frag_size); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 59d07d0d3369d..f59a2363f1506 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -946,7 +946,7 @@ static void stmmac_mac_link_down(struct phylink_config *config, stmmac_set_eee_pls(priv, priv->hw, false); if (stmmac_fpe_supported(priv)) - stmmac_fpe_link_state_handle(priv, false); + ethtool_mmsv_link_state_handle(&priv->fpe_cfg.mmsv, false); } static void stmmac_mac_link_up(struct phylink_config *config, @@ -1064,7 +1064,7 @@ static void stmmac_mac_link_up(struct phylink_config *config, stmmac_set_eee_pls(priv, priv->hw, true); if (stmmac_fpe_supported(priv)) - stmmac_fpe_link_state_handle(priv, true); + ethtool_mmsv_link_state_handle(&priv->fpe_cfg.mmsv, true); if (priv->plat->flags & STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY) stmmac_hwtstamp_correct_latency(priv, priv); @@ -4152,7 +4152,7 @@ static int stmmac_release(struct net_device *dev) stmmac_release_ptp(priv); if (stmmac_fpe_supported(priv)) - timer_shutdown_sync(&priv->fpe_cfg.verify_timer); + ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); pm_runtime_put(priv->device); @@ -7871,7 +7871,7 @@ int stmmac_suspend(struct device *dev) rtnl_unlock(); if (stmmac_fpe_supported(priv)) - timer_shutdown_sync(&priv->fpe_cfg.verify_timer); + ethtool_mmsv_stop(&priv->fpe_cfg.mmsv); return 0; } diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 7edb5f5e71345..117718c248143 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -17,9 +17,13 @@ #include #include #include 
+#include #include #include +#define ETHTOOL_MM_MAX_VERIFY_TIME_MS 128 +#define ETHTOOL_MM_MAX_VERIFY_RETRIES 3 + struct compat_ethtool_rx_flow_spec { u32 flow_type; union ethtool_flow_union h_u; @@ -718,6 +722,75 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +enum ethtool_mmsv_event { + ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET, + ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET, +}; + +/* MAC Merge verification mPacket type */ +enum ethtool_mpacket { + ETHTOOL_MPACKET_VERIFY, + ETHTOOL_MPACKET_RESPONSE, +}; + +struct ethtool_mmsv; + +/** + * struct ethtool_mmsv_ops - Operations for MAC Merge Software Verification + * @configure_tx: Driver callback for the event where the preemptible TX + * becomes active or inactive. Preemptible traffic + * classes must be committed to hardware only while + * preemptible TX is active. + * @configure_pmac: Driver callback for the event where the pMAC state + * changes as result of an administrative setting + * (ethtool) or a call to ethtool_mmsv_link_state_handle(). + * @send_mpacket: Driver-provided method for sending a Verify or a Response + * mPacket. + */ +struct ethtool_mmsv_ops { + void (*configure_tx)(struct ethtool_mmsv *mmsv, bool tx_active); + void (*configure_pmac)(struct ethtool_mmsv *mmsv, bool pmac_enabled); + void (*send_mpacket)(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket); +}; + +/** + * struct ethtool_mmsv - MAC Merge Software Verification + * @ops: operations for MAC Merge Software Verification + * @dev: pointer to net_device structure + * @lock: serialize access to MAC Merge state between + * ethtool requests and link state updates. + * @status: current verification FSM state + * @verify_timer: timer for verification in local TX direction + * @verify_enabled: indicates if verification is enabled + * @verify_retries: number of retries for verification + * @pmac_enabled: indicates if the preemptible MAC is enabled + * @verify_time: time for verification in milliseconds + * @tx_enabled: indicates if transmission is enabled + */ +struct ethtool_mmsv { + const struct ethtool_mmsv_ops *ops; + struct net_device *dev; + spinlock_t lock; + enum ethtool_mm_verify_status status; + struct timer_list verify_timer; + bool verify_enabled; + int verify_retries; + bool pmac_enabled; + u32 verify_time; + bool tx_enabled; +}; + +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv); +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up); +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event); +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state); +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg); +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops); + /** * struct ethtool_rxfh_param - RXFH (RSS) parameters * @hfunc: Defines the current RSS hash function used by HW (or to be set to). 
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 2816bb23c3ad9..bfd988464d7d0 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2022-2023 NXP + * Copyright 2022-2025 NXP + * Copyright 2024 Furong Xu <0x1207@gmail.com> */ #include "common.h" #include "netlink.h" @@ -282,3 +283,275 @@ bool ethtool_dev_mm_supported(struct net_device *dev) return supported; } EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported); + +static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, + bool tx_active) +{ + if (mmsv->ops->configure_tx) + mmsv->ops->configure_tx(mmsv, tx_active); +} + +static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, + bool pmac_enabled) +{ + if (mmsv->ops->configure_pmac) + mmsv->ops->configure_pmac(mmsv, pmac_enabled); +} + +static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket mpacket) +{ + if (mmsv->ops->send_mpacket) + mmsv->ops->send_mpacket(mmsv, mpacket); +} + +/** + * ethtool_mmsv_verify_timer - Timer for MAC Merge verification + * @t: timer_list struct containing private info + * + * Verify the MAC Merge capability in the local TX direction, by + * transmitting Verify mPackets up to 3 times. Wait until link + * partner responds with a Response mPacket, otherwise fail. + */ +static void ethtool_mmsv_verify_timer(struct timer_list *t) +{ + struct ethtool_mmsv *mmsv = from_timer(mmsv, t, verify_timer); + unsigned long flags; + bool rearm = false; + + spin_lock_irqsave(&mmsv->lock, flags); + + switch (mmsv->status) { + case ETHTOOL_MM_VERIFY_STATUS_INITIAL: + case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + if (mmsv->verify_retries != 0) { + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY); + rearm = true; + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; + } + + mmsv->verify_retries--; + break; + + case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + ethtool_mmsv_configure_tx(mmsv, true); + break; + + default: + break; + } + + if (rearm) { + mod_timer(&mmsv->verify_timer, + jiffies + msecs_to_jiffies(mmsv->verify_time)); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} + +static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv) +{ + if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + mod_timer(&mmsv->verify_timer, jiffies); + } +} + +static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv) +{ + /* If verification is disabled, configure FPE right away. + * Otherwise let the timer code do it. + */ + if (!mmsv->verify_enabled) { + ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled); + ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled); + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + + if (netif_running(mmsv->dev)) + ethtool_mmsv_verify_timer_arm(mmsv); + } +} + +/** + * ethtool_mmsv_stop() - Stop MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * + * Drivers should call this method in a state where the hardware is + * about to lose state, like ndo_stop() or suspend(), and turning off + * MAC Merge features would be superfluous. Otherwise, prefer + * ethtool_mmsv_link_state_handle() with up=false. 
+ */ +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv) +{ + timer_shutdown_sync(&mmsv->verify_timer); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_stop); + +/** + * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification + * of link state changes + * @mmsv: MAC Merge Software Verification state + * @up: True if device carrier is up and able to pass verification packets + * + * Calling context is expected to be from a task, interrupts enabled. + */ +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) +{ + unsigned long flags; + + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + if (up && mmsv->pmac_enabled) { + /* VERIFY process requires pMAC enabled when NIC comes up */ + ethtool_mmsv_configure_pmac(mmsv, true); + + /* New link => maybe new partner => new verification process */ + ethtool_mmsv_apply(mmsv); + } else { + /* No link or pMAC not enabled */ + ethtool_mmsv_configure_pmac(mmsv, false); + ethtool_mmsv_configure_tx(mmsv, false); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle); + +/** + * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification + * of interrupt-based events + * @mmsv: MAC Merge Software Verification state + * @event: Event which took place (packet transmission or reception) + * + * Calling context expects to have interrupts disabled. + */ +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event) +{ + /* This is interrupt context, just spin_lock() */ + spin_lock(&mmsv->lock); + + if (!mmsv->pmac_enabled) + goto unlock; + + switch (event) { + case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET: + /* Link partner has sent verify mPacket */ + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE); + break; + case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET: + /* Local device has sent verify mPacket */ + if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + break; + case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET: + /* Link partner has sent response mPacket */ + if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; + break; + } + +unlock: + spin_unlock(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle); + +static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv) +{ + /* TX is active if administratively enabled, and verification either + * succeeded, or was administratively disabled. + */ + return mmsv->tx_enabled && + (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || + mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED); +} + +/** + * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @state: see struct ethtool_mm_state + * + * Drivers are expected to call this from their ethtool_ops :: get_mm() + * method. 
+ */ +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&mmsv->lock, flags); + + state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + state->verify_enabled = mmsv->verify_enabled; + state->pmac_enabled = mmsv->pmac_enabled; + state->verify_time = mmsv->verify_time; + state->tx_enabled = mmsv->tx_enabled; + state->verify_status = mmsv->status; + state->tx_active = ethtool_mmsv_is_tx_active(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm); + +/** + * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @cfg: see struct ethtool_mm_cfg + * + * Drivers are expected to call this from their ethtool_ops :: set_mm() + * method. + */ +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg) +{ + unsigned long flags; + + /* Wait for the verification that's currently in progress to finish */ + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + mmsv->verify_enabled = cfg->verify_enabled; + mmsv->pmac_enabled = cfg->pmac_enabled; + mmsv->verify_time = cfg->verify_time; + mmsv->tx_enabled = cfg->tx_enabled; + + if (!cfg->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + + ethtool_mmsv_apply(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm); + +/** + * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state + * @mmsv: MAC Merge Software Verification state + * @dev: Pointer to network interface + * @ops: Methods for implementing the generic functionality + * + * The MAC Merge Software Verification is a timer- and event-based state + * machine intended for network interfaces which lack a hardware-based + * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer + * is managed by the core code, whereas events are supplied by the + * driver explicitly calling one of the other API functions. + */ +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops) +{ + mmsv->ops = ops; + mmsv->dev = dev; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + spin_lock_init(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_init); From dda666343cc8e90441fd55142a679bea32d239e2 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:31 -0400 Subject: [PATCH 317/770] net: ethtool: mm: reset verification status when link is down When the link partner goes down, "ethtool --show-mm" still displays "Verification status: SUCCEEDED," reflecting a previous state that is no longer valid. Reset the verification status to ensure it reflects the current state. 
Reviewed-by: Furong Xu <0x1207@gmail.com> Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Signed-off-by: Tony Nguyen --- net/ethtool/mm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index bfd988464d7d0..ad9b400340033 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -415,6 +415,10 @@ void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) /* New link => maybe new partner => new verification process */ ethtool_mmsv_apply(mmsv); } else { + /* Reset the reported verification state while the link is down */ + if (mmsv->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + /* No link or pMAC not enabled */ ethtool_mmsv_configure_pmac(mmsv, false); ethtool_mmsv_configure_tx(mmsv, false); From 19d629079c0ef5cd121a770e40d80b8dc585dc56 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:32 -0400 Subject: [PATCH 318/770] igc: rename xdp_get_tx_ring() for non-xdp usage Renamed xdp_get_tx_ring() function to a more generic name for use in upcoming frame preemption patches. Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index 2f265c0959c7a..a489e14d8dc47 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -736,7 +736,7 @@ struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter, u32 location); int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); - +struct igc_ring *igc_get_tx_ring(struct igc_adapter *adapter, int cpu); void igc_ptp_init(struct igc_adapter *adapter); void igc_ptp_reset(struct igc_adapter *adapter); void igc_ptp_suspend(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index ddfa654db1e03..27771c6ab6d7c 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2464,8 +2464,7 @@ static int igc_xdp_init_tx_descriptor(struct igc_ring *ring, return -ENOMEM; } -static struct igc_ring *igc_xdp_get_tx_ring(struct igc_adapter *adapter, - int cpu) +struct igc_ring *igc_get_tx_ring(struct igc_adapter *adapter, int cpu) { int index = cpu; @@ -2489,7 +2488,7 @@ static int igc_xdp_xmit_back(struct igc_adapter *adapter, struct xdp_buff *xdp) if (unlikely(!xdpf)) return -EFAULT; - ring = igc_xdp_get_tx_ring(adapter, cpu); + ring = igc_get_tx_ring(adapter, cpu); nq = txring_txq(ring); __netif_tx_lock(nq, cpu); @@ -2566,7 +2565,7 @@ static void igc_finalize_xdp(struct igc_adapter *adapter, int status) struct igc_ring *ring; if (status & IGC_XDP_TX) { - ring = igc_xdp_get_tx_ring(adapter, cpu); + ring = igc_get_tx_ring(adapter, cpu); nq = txring_txq(ring); __netif_tx_lock(nq, cpu); @@ -6779,7 +6778,7 @@ static int igc_xdp_xmit(struct net_device *dev, int num_frames, if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; - ring = igc_xdp_get_tx_ring(adapter, cpu); + ring = igc_get_tx_ring(adapter, cpu); nq = txring_txq(ring); __netif_tx_lock(nq, cpu); From 67287d67bebdd6de3ba7e60f20b08fb62a360a01 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:33 -0400 Subject: [PATCH 319/770] igc: rename I225_RXPBSIZE_DEFAULT and 
I225_TXPBSIZE_DEFAULT Rename RX and TX packet buffer size macros in preparation for an upcoming patch that will refactor buffer size handling using FIELD_PREP and GENMASK. Changes: - Rename I225_RXPBSIZE_DEFAULT to IGC_RXPBSIZE_EXP_BMC_DEFAULT. The EXP_BMC suffix explicitly indicates Express and BMC buffer default values, improving readability and reusability for the upcoming changes, while also better reflecting the current buffer allocations. - Rename I225_TXPBSIZE_DEFAULT to IGC_TXPBSIZE_DEFAULT. These registers apply to both i225 and i226, so using the IGC prefix aligns with existing macro naming conventions. Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 7 ++++--- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++-- drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index d19325b0e6e0b..34b3b1a7610e6 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -396,9 +396,10 @@ #define IGC_RCTL_PMCF 0x00800000 /* pass MAC control frames */ #define IGC_RCTL_SECRC 0x04000000 /* Strip Ethernet CRC */ -#define I225_RXPBSIZE_DEFAULT 0x000000A2 /* RXPBSIZE default */ -#define I225_TXPBSIZE_DEFAULT 0x04000014 /* TXPBSIZE default */ -#define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ +/* RXPBSIZE default value for Express and BMC buffer */ +#define IGC_RXPBSIZE_EXP_BMC_DEFAULT 0x000000A2 +#define IGC_TXPBSIZE_DEFAULT 0x04000014 /* TXPBSIZE default */ +#define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ #define IGC_TXPBSIZE_TSN 0x04145145 /* 5k bytes buffer for each queue */ diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27771c6ab6d7c..9d9661632ae79 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7159,8 +7159,8 @@ static int igc_probe(struct pci_dev *pdev, } /* configure RXPBSIZE and TXPBSIZE */ - wr32(IGC_RXPBS, I225_RXPBSIZE_DEFAULT); - wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); + wr32(IGC_RXPBS, IGC_RXPBSIZE_EXP_BMC_DEFAULT); + wr32(IGC_TXPBS, IGC_TXPBSIZE_DEFAULT); timer_setup(&adapter->watchdog_timer, igc_watchdog, 0); timer_setup(&adapter->phy_info_timer, igc_update_phy_info, 0); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 1e44374ca1ffb..498741d83ca64 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -136,7 +136,7 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) int i; wr32(IGC_GTXOFFSET, 0); - wr32(IGC_TXPBS, I225_TXPBSIZE_DEFAULT); + wr32(IGC_TXPBS, IGC_TXPBSIZE_DEFAULT); wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); if (igc_is_device_id_i226(hw)) From 425d8d9cb092fdd38e139dc6c104cade85896ba4 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:34 -0400 Subject: [PATCH 320/770] igc: use FIELD_PREP and GENMASK for existing TX packet buffer size In preparation for an upcoming patch that will modify the TX buffer size in TSN mode, replace IGC_TXPBSIZE_TSN and IGC_TXPBSIZE_DEFAULT implementation with new macros that utilizes FIELD_PREP and GENMASK for clarity. The newly introduced macros follow the naming from the i226 SW User Manual for easy reference. 
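The equivalence can be checked standalone with userspace stand-ins for the kernel macros (a throwaway test, not part of the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* userspace equivalents of the kernel's GENMASK()/FIELD_PREP() */
#define GENMASK(h, l)     ((~0u << (l)) & (~0u >> (31 - (h))))
#define FIELD_PREP(m, v)  (((uint32_t)(v) << __builtin_ctz(m)) & (m))

#define IGC_TXPB0SIZE_MASK    GENMASK(5, 0)
#define IGC_TXPB1SIZE_MASK    GENMASK(11, 6)
#define IGC_TXPB2SIZE_MASK    GENMASK(17, 12)
#define IGC_TXPB3SIZE_MASK    GENMASK(23, 18)
#define IGC_OS2BMCPBSIZE_MASK GENMASK(29, 24)

int main(void)
{
        uint32_t def = FIELD_PREP(IGC_TXPB0SIZE_MASK, 20) |
                       FIELD_PREP(IGC_OS2BMCPBSIZE_MASK, 4);
        uint32_t tsn = FIELD_PREP(IGC_TXPB0SIZE_MASK, 5) |
                       FIELD_PREP(IGC_TXPB1SIZE_MASK, 5) |
                       FIELD_PREP(IGC_TXPB2SIZE_MASK, 5) |
                       FIELD_PREP(IGC_TXPB3SIZE_MASK, 5) |
                       FIELD_PREP(IGC_OS2BMCPBSIZE_MASK, 4);

        assert(def == 0x04000014);      /* old I225_TXPBSIZE_DEFAULT */
        assert(tsn == 0x04145145);      /* old IGC_TXPBSIZE_TSN */
        printf("default=0x%08x tsn=0x%08x\n", (unsigned)def, (unsigned)tsn);
        return 0;
}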
I've tested IGC_TXPBSIZE_TSN and IGC_TXPBSIZE_DEFAULT before and after the refactoring, and their values remain unchanged. Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 23 ++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 34b3b1a7610e6..ec8f926362eb3 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -398,10 +398,29 @@ /* RXPBSIZE default value for Express and BMC buffer */ #define IGC_RXPBSIZE_EXP_BMC_DEFAULT 0x000000A2 -#define IGC_TXPBSIZE_DEFAULT 0x04000014 /* TXPBSIZE default */ #define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ -#define IGC_TXPBSIZE_TSN 0x04145145 /* 5k bytes buffer for each queue */ +/* Mask for TX packet buffer size */ +#define IGC_TXPB0SIZE_MASK GENMASK(5, 0) +#define IGC_TXPB1SIZE_MASK GENMASK(11, 6) +#define IGC_TXPB2SIZE_MASK GENMASK(17, 12) +#define IGC_TXPB3SIZE_MASK GENMASK(23, 18) +/* Mask for OS to BMC packet buffer size */ +#define IGC_OS2BMCPBSIZE_MASK GENMASK(29, 24) +/* TX Packet buffer size in KB */ +#define IGC_TXPB0SIZE(x) FIELD_PREP(IGC_TXPB0SIZE_MASK, (x)) +#define IGC_TXPB1SIZE(x) FIELD_PREP(IGC_TXPB1SIZE_MASK, (x)) +#define IGC_TXPB2SIZE(x) FIELD_PREP(IGC_TXPB2SIZE_MASK, (x)) +#define IGC_TXPB3SIZE(x) FIELD_PREP(IGC_TXPB3SIZE_MASK, (x)) +/* OS to BMC packet buffer size in KB */ +#define IGC_OS2BMCPBSIZE(x) FIELD_PREP(IGC_OS2BMCPBSIZE_MASK, (x)) +/* Default value following I225/I226 SW User Manual Section 8.3.2 */ +#define IGC_TXPBSIZE_DEFAULT ( \ + IGC_TXPB0SIZE(20) | IGC_TXPB1SIZE(0) | IGC_TXPB2SIZE(0) | \ + IGC_TXPB3SIZE(0) | IGC_OS2BMCPBSIZE(4)) +#define IGC_TXPBSIZE_TSN ( \ + IGC_TXPB0SIZE(5) | IGC_TXPB1SIZE(5) | IGC_TXPB2SIZE(5) | \ + IGC_TXPB3SIZE(5) | IGC_OS2BMCPBSIZE(4)) #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ From 0d58cdc902dace2298406b3972def04f55e3d775 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:35 -0400 Subject: [PATCH 321/770] igc: optimize TX packet buffer utilization for TSN mode In preparation for upcoming frame preemption patches, optimize the TX packet buffer size. The total packet buffer size (RX + TX) is 64KB, with a maximum of 34KB for either RX or TX. Split the buffer evenly, allocating 32KB to each. For TX, assign 7KB to each of the four TX packet buffers (total 28KB) and reserve 4KB for BMC. 
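The split can be sanity-checked against the stated constraints (sizes in KB; a quick standalone check, not part of the patch):

#include <assert.h>

int main(void)
{
        int tx = 4 * 7 + 4;     /* four 7KB TX buffers + 4KB for BMC */

        assert(tx == 64 / 2);   /* TX gets half of the 64KB total */
        assert(tx <= 34);       /* per-direction maximum */
        return 0;
}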
References: I225/I226 SW User Manual Section 4.7.9, Section 8.3.2 Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index ec8f926362eb3..5502edd6261f4 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -419,8 +419,8 @@ IGC_TXPB0SIZE(20) | IGC_TXPB1SIZE(0) | IGC_TXPB2SIZE(0) | \ IGC_TXPB3SIZE(0) | IGC_OS2BMCPBSIZE(4)) #define IGC_TXPBSIZE_TSN ( \ - IGC_TXPB0SIZE(5) | IGC_TXPB1SIZE(5) | IGC_TXPB2SIZE(5) | \ - IGC_TXPB3SIZE(5) | IGC_OS2BMCPBSIZE(4)) + IGC_TXPB0SIZE(7) | IGC_TXPB1SIZE(7) | IGC_TXPB2SIZE(7) | \ + IGC_TXPB3SIZE(7) | IGC_OS2BMCPBSIZE(4)) #define IGC_DTXMXPKTSZ_TSN 0x19 /* 1600 bytes of max TX DMA packet size */ #define IGC_DTXMXPKTSZ_DEFAULT 0x98 /* 9728-byte Jumbo frames */ From 9cd87aafc7a88f4107d9f86fc6262d717fd489e8 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:36 -0400 Subject: [PATCH 322/770] igc: use FIELD_PREP and GENMASK for existing RX packet buffer size Prepare for an upcoming patch that modifies the RX buffer size in TSN mode. Refactor IGC_RXPBSIZE_EXP_BMC_DEFAULT and IGC_RXPBS_CFG_TS_EN using FIELD_PREP and GENMASK to improve clarity and maintainability. Refactor both macros for consistency, even though the upcoming patch only use IGC_RXPBSIZE_EXP_BMC_DEFAULT. The newly introduced macros follow the naming from the i226 SW User Manual for easy reference. I've tested IGC_RXPBSIZE_EXP_BMC_DEFAULT and IGC_RXPBS_CFG_TS_EN before and after the refactoring, and their values remain unchanged. Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 5502edd6261f4..12a16a7acb034 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -396,9 +396,20 @@ #define IGC_RCTL_PMCF 0x00800000 /* pass MAC control frames */ #define IGC_RCTL_SECRC 0x04000000 /* Strip Ethernet CRC */ -/* RXPBSIZE default value for Express and BMC buffer */ -#define IGC_RXPBSIZE_EXP_BMC_DEFAULT 0x000000A2 -#define IGC_RXPBS_CFG_TS_EN 0x80000000 /* Timestamp in Rx buffer */ +/* Mask for RX packet buffer size */ +#define IGC_RXPBSIZE_EXP_MASK GENMASK(5, 0) +#define IGC_BMC2OSPBSIZE_MASK GENMASK(11, 6) +/* Mask for timestamp in RX buffer */ +#define IGC_RXPBS_CFG_TS_EN_MASK GENMASK(31, 31) +/* High-priority RX packet buffer size (KB). 
Used for Express traffic when preemption is enabled */ +#define IGC_RXPBSIZE_EXP(x) FIELD_PREP(IGC_RXPBSIZE_EXP_MASK, (x)) +/* BMC to OS packet buffer size in KB */ +#define IGC_BMC2OSPBSIZE(x) FIELD_PREP(IGC_BMC2OSPBSIZE_MASK, (x)) +/* Enable RX packet buffer for timestamp descriptor, saving 16 bytes per packet if set */ +#define IGC_RXPBS_CFG_TS_EN FIELD_PREP(IGC_RXPBS_CFG_TS_EN_MASK, 1) +/* Default value following I225/I226 SW User Manual Section 8.3.1 */ +#define IGC_RXPBSIZE_EXP_BMC_DEFAULT ( \ + IGC_RXPBSIZE_EXP(34) | IGC_BMC2OSPBSIZE(2)) /* Mask for TX packet buffer size */ #define IGC_TXPB0SIZE_MASK GENMASK(5, 0) From 7663370e32b39cf65cc1d56155a9c152598473ea Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:37 -0400 Subject: [PATCH 323/770] igc: set the RX packet buffer size for TSN mode In preparation for supporting frame preemption, when entering TSN mode, set the receive packet buffer to 15KB for the Express MAC, 15KB for the Preemptible MAC and 2KB for the BMC. References: I225/I226 SW User Manual, Section 4.7.9, Section 7.1.3.2, Section 8.3.1 The newly introduced macros follow the naming from the i226 SW User Manual for easy reference. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_defines.h | 5 ++++ drivers/net/ethernet/intel/igc/igc_tsn.c | 27 ++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 12a16a7acb034..86abd3bafb37c 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -399,17 +399,22 @@ /* Mask for RX packet buffer size */ #define IGC_RXPBSIZE_EXP_MASK GENMASK(5, 0) #define IGC_BMC2OSPBSIZE_MASK GENMASK(11, 6) +#define IGC_RXPBSIZE_BE_MASK GENMASK(17, 12) /* Mask for timestamp in RX buffer */ #define IGC_RXPBS_CFG_TS_EN_MASK GENMASK(31, 31) /* High-priority RX packet buffer size (KB). Used for Express traffic when preemption is enabled */ #define IGC_RXPBSIZE_EXP(x) FIELD_PREP(IGC_RXPBSIZE_EXP_MASK, (x)) /* BMC to OS packet buffer size in KB */ #define IGC_BMC2OSPBSIZE(x) FIELD_PREP(IGC_BMC2OSPBSIZE_MASK, (x)) +/* Low-priority RX packet buffer size (KB). 
Used for BE traffic when preemption is enabled */ +#define IGC_RXPBSIZE_BE(x) FIELD_PREP(IGC_RXPBSIZE_BE_MASK, (x)) /* Enable RX packet buffer for timestamp descriptor, saving 16 bytes per packet if set */ #define IGC_RXPBS_CFG_TS_EN FIELD_PREP(IGC_RXPBS_CFG_TS_EN_MASK, 1) /* Default value following I225/I226 SW User Manual Section 8.3.1 */ #define IGC_RXPBSIZE_EXP_BMC_DEFAULT ( \ IGC_RXPBSIZE_EXP(34) | IGC_BMC2OSPBSIZE(2)) +#define IGC_RXPBSIZE_EXP_BMC_BE_TSN ( \ + IGC_RXPBSIZE_EXP(15) | IGC_BMC2OSPBSIZE(2) | IGC_RXPBSIZE_BE(15)) /* Mask for TX packet buffer size */ #define IGC_TXPB0SIZE_MASK GENMASK(5, 0) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 498741d83ca64..2254f255cc3b9 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -125,6 +125,29 @@ static void igc_tsn_tx_arb(struct igc_adapter *adapter, u16 *queue_per_tc) wr32(IGC_TXARB, txarb); } +/** + * igc_tsn_set_rxpbsize - Set the receive packet buffer size + * @adapter: Pointer to the igc_adapter structure + * @rxpbs_exp_bmc_be: Value to set the receive packet buffer size, including + * express buffer, BMC buffer, and Best Effort buffer + * + * The IGC_RXPBS register value may include allocations for the Express buffer, + * BMC buffer, Best Effort buffer, and the timestamp descriptor buffer + * (IGC_RXPBS_CFG_TS_EN). + */ +static void igc_tsn_set_rxpbsize(struct igc_adapter *adapter, + u32 rxpbs_exp_bmc_be) +{ + struct igc_hw *hw = &adapter->hw; + u32 rxpbs = rd32(IGC_RXPBS); + + rxpbs &= ~(IGC_RXPBSIZE_EXP_MASK | IGC_BMC2OSPBSIZE_MASK | + IGC_RXPBSIZE_BE_MASK); + rxpbs |= rxpbs_exp_bmc_be; + + wr32(IGC_RXPBS, rxpbs); +} + /* Returns the TSN specific registers to their default values after * the adapter is reset. */ @@ -139,6 +162,8 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) wr32(IGC_TXPBS, IGC_TXPBSIZE_DEFAULT); wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_DEFAULT); + igc_tsn_set_rxpbsize(adapter, IGC_RXPBSIZE_EXP_BMC_DEFAULT); + if (igc_is_device_id_i226(hw)) igc_tsn_restore_retx_default(adapter); @@ -202,6 +227,8 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_DTXMXPKTSZ, IGC_DTXMXPKTSZ_TSN); wr32(IGC_TXPBS, IGC_TXPBSIZE_TSN); + igc_tsn_set_rxpbsize(adapter, IGC_RXPBSIZE_EXP_BMC_BE_TSN); + if (igc_is_device_id_i226(hw)) igc_tsn_set_retx_qbvfullthreshold(adapter); From 5422570c0010bb968738f9256eb2bf83e79b4d63 Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:38 -0400 Subject: [PATCH 324/770] igc: add support for frame preemption verification This patch implements the "ethtool --set-mm" callback to trigger the frame preemption verification handshake. Uses the MAC Merge Software Verification (mmsv) mechanism in ethtool to perform the verification handshake for igc. The structure fpe.mmsv is set by mmsv in ethtool and should remain read-only for the driver. Other mmsv callbacks: a) configure_tx() -> not used yet at this point - igc lacks registers to configure FPE in the transmit direction, so this API is not utilized for now. When igc supports preemptible queue, driver will use this API to manage its configuration. b) configure_pmac() -> not used - this callback dynamically controls pmac_enabled at runtime. For example, mmsv calls configure_pmac() and disables pmac_enabled when the link partner goes down, even if the user previously enabled it. 
The intention is to save power but it is not feasible in igc because it causes an endless adapter reset loop: 1) Board A and Board B complete the verification handshake. Tx mode register for both boards are in TSN mode. 2) Board B link goes down. On Board A: 3) mmsv calls configure_pmac() with pmac_enabled = false. 4) configure_pmac() in igc updates a new field based on pmac_enabled. Driver uses this field in igc_tsn_new_flags() to indicate that the user enabled/disabled FPE. 5) configure_pmac() in igc calls igc_tsn_offload_apply() to check whether an adapter reset is needed. Calls existing logic in igc_tsn_will_tx_mode_change() and igc_tsn_new_flags(). 6) Since pmac_enabled is now disabled and no other TSN feature is active, igc_tsn_will_tx_mode_change() evaluates to true because Tx mode will switch from TSN to Legacy. 7) Driver resets the adapter. 8) Registers are set, and Tx mode switches to Legacy. 9) When link partner is up, steps 3-8 repeat, but this time with pmac_enabled = true, reactivating TSN. igc_tsn_will_tx_mode_change() evaluates to true again, since Tx mode will switch from Legacy to TSN. 10) Driver resets the adapter. 11) Adapter reset completes, registers are set, and Tx mode switches to TSN. On Board B: 12) Adapter reset on Board A at step 10 causes it to detect its link partner as down. 13) Repeats steps 3-8. 14) Once reset adapter on Board A is completed at step 11, it detects its link partner as up. 15) Repeats steps 9-11. - this cycle repeats indefinitely. To avoid this issue, igc only uses mmsv.pmac_enabled to track whether FPE is enabled or disabled. Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Co-developed-by: Choong Yong Liang Signed-off-by: Choong Yong Liang Co-developed-by: Chwee-Lin Choong Signed-off-by: Chwee-Lin Choong Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/Kconfig | 1 + drivers/net/ethernet/intel/igc/igc.h | 12 +- drivers/net/ethernet/intel/igc/igc_base.h | 1 + drivers/net/ethernet/intel/igc/igc_defines.h | 8 +- drivers/net/ethernet/intel/igc/igc_ethtool.c | 21 +++ drivers/net/ethernet/intel/igc/igc_main.c | 52 ++++++- drivers/net/ethernet/intel/igc/igc_tsn.c | 146 ++++++++++++++++++- drivers/net/ethernet/intel/igc/igc_tsn.h | 50 +++++++ 8 files changed, 286 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index b2adb40a59210..5a331c1c76cb8 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -369,6 +369,7 @@ config IGC default n depends on PCI depends on PTP_1588_CLOCK_OPTIONAL + depends on ETHTOOL_NETLINK help This driver supports Intel(R) Ethernet Controller I225-LM/I225-V family of adapters. 
diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index a489e14d8dc47..cce537d7dbd34 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -40,6 +40,10 @@ void igc_ethtool_set_ops(struct net_device *); #define IGC_MAX_TX_TSTAMP_REGS 4 +struct igc_fpe_t { + struct ethtool_mmsv mmsv; +}; + enum igc_mac_filter_type { IGC_MAC_FILTER_TYPE_DST = 0, IGC_MAC_FILTER_TYPE_SRC @@ -333,6 +337,8 @@ struct igc_adapter { struct timespec64 period; } perout[IGC_N_PEROUT]; + struct igc_fpe_t fpe; + /* LEDs */ struct mutex led_mutex; struct igc_led_classdev *leds; @@ -388,10 +394,11 @@ extern char igc_driver_name[]; #define IGC_FLAG_TSN_QBV_ENABLED BIT(17) #define IGC_FLAG_TSN_QAV_ENABLED BIT(18) #define IGC_FLAG_TSN_LEGACY_ENABLED BIT(19) +#define IGC_FLAG_TSN_PREEMPT_ENABLED BIT(20) #define IGC_FLAG_TSN_ANY_ENABLED \ (IGC_FLAG_TSN_QBV_ENABLED | IGC_FLAG_TSN_QAV_ENABLED | \ - IGC_FLAG_TSN_LEGACY_ENABLED) + IGC_FLAG_TSN_LEGACY_ENABLED | IGC_FLAG_TSN_PREEMPT_ENABLED) #define IGC_FLAG_RSS_FIELD_IPV4_UDP BIT(6) #define IGC_FLAG_RSS_FIELD_IPV6_UDP BIT(7) @@ -736,7 +743,10 @@ struct igc_nfc_rule *igc_get_nfc_rule(struct igc_adapter *adapter, u32 location); int igc_add_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); void igc_del_nfc_rule(struct igc_adapter *adapter, struct igc_nfc_rule *rule); +void igc_disable_empty_addr_recv(struct igc_adapter *adapter); +int igc_enable_empty_addr_recv(struct igc_adapter *adapter); struct igc_ring *igc_get_tx_ring(struct igc_adapter *adapter, int cpu); +void igc_flush_tx_descriptors(struct igc_ring *ring); void igc_ptp_init(struct igc_adapter *adapter); void igc_ptp_reset(struct igc_adapter *adapter); void igc_ptp_suspend(struct igc_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_base.h b/drivers/net/ethernet/intel/igc/igc_base.h index bf8cdfbba9ffb..6320eabb72fe0 100644 --- a/drivers/net/ethernet/intel/igc/igc_base.h +++ b/drivers/net/ethernet/intel/igc/igc_base.h @@ -49,6 +49,7 @@ struct igc_adv_tx_context_desc { #define IGC_ADVTXD_DCMD_DEXT 0x20000000 /* Descriptor extension (1=Adv) */ #define IGC_ADVTXD_DCMD_VLE 0x40000000 /* VLAN pkt enable */ #define IGC_ADVTXD_DCMD_TSE 0x80000000 /* TCP Seg enable */ +#define IGC_ADVTXD_PAYLEN_MASK 0XFFFFC000 /* Adv desc PAYLEN mask */ #define IGC_ADVTXD_PAYLEN_SHIFT 14 /* Adv desc PAYLEN shift */ #define IGC_RAR_ENTRIES 16 diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 86abd3bafb37c..66061b0d6cbf4 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -308,6 +308,8 @@ #define IGC_TXD_DTYP_C 0x00000000 /* Context Descriptor */ #define IGC_TXD_POPTS_IXSM 0x01 /* Insert IP checksum */ #define IGC_TXD_POPTS_TXSM 0x02 /* Insert TCP/UDP checksum */ +#define IGC_TXD_POPTS_SMD_MASK 0x3000 /* Indicates whether it's SMD-V or SMD-R */ + #define IGC_TXD_CMD_EOP 0x01000000 /* End of Packet */ #define IGC_TXD_CMD_IC 0x04000000 /* Insert Checksum */ #define IGC_TXD_CMD_DEXT 0x20000000 /* Desc extension (0 = legacy) */ @@ -363,6 +365,8 @@ #define IGC_SRRCTL_TIMER0SEL(timer) (((timer) & 0x3) << 17) /* Receive Descriptor bit definitions */ +#define IGC_RXD_STAT_SMD_TYPE_V 0x01 /* SMD-V Packet */ +#define IGC_RXD_STAT_SMD_TYPE_R 0x02 /* SMD-R Packet */ #define IGC_RXD_STAT_EOP 0x02 /* End of Packet */ #define IGC_RXD_STAT_IXSM 0x04 /* Ignore checksum */ #define IGC_RXD_STAT_UDPCS 0x10 /* UDP xsum calculated */ @@ -372,7 
+376,8 @@ #define IGC_RXDEXT_STATERR_LB 0x00040000 /* Advanced Receive Descriptor bit definitions */ -#define IGC_RXDADV_STAT_TSIP 0x08000 /* timestamp in packet */ +#define IGC_RXDADV_STAT_SMD_TYPE_MASK 0x06000 +#define IGC_RXDADV_STAT_TSIP 0x08000 /* timestamp in packet */ #define IGC_RXDEXT_STATERR_L4E 0x20000000 #define IGC_RXDEXT_STATERR_IPE 0x40000000 @@ -575,6 +580,7 @@ /* Transmit Scheduling */ #define IGC_TQAVCTRL_TRANSMIT_MODE_TSN 0x00000001 +#define IGC_TQAVCTRL_PREEMPT_ENA 0x00000002 #define IGC_TQAVCTRL_ENHANCED_QAV 0x00000008 #define IGC_TQAVCTRL_FUTSCDDIS 0x00000080 diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 8178386778176..b64d5c6c1d20c 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -8,6 +8,7 @@ #include "igc.h" #include "igc_diag.h" +#include "igc_tsn.h" /* forward declaration */ struct igc_stats { @@ -1781,6 +1782,25 @@ static int igc_ethtool_set_eee(struct net_device *netdev, return 0; } +static int igc_ethtool_set_mm(struct net_device *netdev, + struct ethtool_mm_cfg *cmd, + struct netlink_ext_ack *extack) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_fpe_t *fpe = &adapter->fpe; + + if (fpe->mmsv.pmac_enabled != cmd->pmac_enabled) { + if (cmd->pmac_enabled) + static_branch_inc(&igc_fpe_enabled); + else + static_branch_dec(&igc_fpe_enabled); + } + + ethtool_mmsv_set_mm(&fpe->mmsv, cmd); + + return igc_tsn_offload_apply(adapter); +} + static int igc_ethtool_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { @@ -2076,6 +2096,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .get_link_ksettings = igc_ethtool_get_link_ksettings, .set_link_ksettings = igc_ethtool_set_link_ksettings, .self_test = igc_ethtool_diag_test, + .set_mm = igc_ethtool_set_mm, }; void igc_ethtool_set_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9d9661632ae79..d160da49b0d28 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2548,7 +2548,7 @@ static int igc_xdp_run_prog(struct igc_adapter *adapter, struct xdp_buff *xdp) } /* This function assumes __netif_tx_lock is held by the caller. */ -static void igc_flush_tx_descriptors(struct igc_ring *ring) +void igc_flush_tx_descriptors(struct igc_ring *ring) { /* Once tail pointer is updated, hardware can fetch the descriptors * any time so we issue a write membar here to ensure all memory @@ -2637,6 +2637,14 @@ static int igc_clean_rx_irq(struct igc_q_vector *q_vector, const int budget) size -= IGC_TS_HDR_LEN; } + if (igc_fpe_is_pmac_enabled(adapter) && + igc_fpe_handle_mpacket(adapter, rx_desc, size, pktbuf)) { + /* Advance the ring next-to-clean */ + igc_is_non_eop(rx_ring, rx_desc); + cleaned_count++; + continue; + } + if (!skb) { xdp_init_buff(&ctx.xdp, truesize, &rx_ring->xdp_rxq); xdp_prepare_buff(&ctx.xdp, pktbuf - igc_rx_offset(rx_ring), @@ -3144,6 +3152,11 @@ static bool igc_clean_tx_irq(struct igc_q_vector *q_vector, int napi_budget) if (!(eop_desc->wb.status & cpu_to_le32(IGC_TXD_STAT_DD))) break; + if (igc_fpe_is_pmac_enabled(adapter) && + igc_fpe_transmitted_smd_v(tx_desc)) + ethtool_mmsv_event_handle(&adapter->fpe.mmsv, + ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET); + /* Hold the completions while there's a pending tx hardware * timestamp request from XDP Tx metadata. 
*/ @@ -4035,6 +4048,30 @@ static int igc_uc_unsync(struct net_device *netdev, const unsigned char *addr) return 0; } +/** + * igc_enable_empty_addr_recv - Enable Rx of packets with all-zeroes MAC address + * @adapter: Pointer to the igc_adapter structure. + * + * Frame preemption verification requires that packets with the all-zeroes + * MAC address are allowed to be received by the driver. This function adds the + * all-zeroes destination address to the list of acceptable addresses. + * + * Return: 0 on success, negative value otherwise. + */ +int igc_enable_empty_addr_recv(struct igc_adapter *adapter) +{ + u8 empty[ETH_ALEN] = {}; + + return igc_add_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, empty, -1); +} + +void igc_disable_empty_addr_recv(struct igc_adapter *adapter) +{ + u8 empty[ETH_ALEN] = {}; + + igc_del_mac_filter(adapter, IGC_MAC_FILTER_TYPE_DST, empty); +} + /** * igc_set_rx_mode - Secondary Unicast, Multicast and Promiscuous mode set * @netdev: network interface device structure @@ -5310,6 +5347,9 @@ void igc_down(struct igc_adapter *adapter) igc_disable_all_tx_rings_hw(adapter); igc_clean_all_tx_rings(adapter); igc_clean_all_rx_rings(adapter); + + if (adapter->fpe.mmsv.pmac_enabled) + ethtool_mmsv_stop(&adapter->fpe.mmsv); } void igc_reinit_locked(struct igc_adapter *adapter) @@ -5834,6 +5874,10 @@ static void igc_watchdog_task(struct work_struct *work) */ igc_tsn_adjust_txtime_offset(adapter); + if (adapter->fpe.mmsv.pmac_enabled) + ethtool_mmsv_link_state_handle(&adapter->fpe.mmsv, + true); + if (adapter->link_speed != SPEED_1000) goto no_wait; @@ -5869,6 +5913,10 @@ static void igc_watchdog_task(struct work_struct *work) netdev_info(netdev, "NIC Link is Down\n"); netif_carrier_off(netdev); + if (adapter->fpe.mmsv.pmac_enabled) + ethtool_mmsv_link_state_handle(&adapter->fpe.mmsv, + false); + /* link state has changed, schedule phy info update */ if (!test_bit(__IGC_DOWN, &adapter->state)) mod_timer(&adapter->phy_info_timer, @@ -7192,6 +7240,8 @@ static int igc_probe(struct pci_dev *pdev, igc_tsn_clear_schedule(adapter); + igc_fpe_init(adapter); + /* reset the hardware with the new settings */ igc_reset(adapter); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index 2254f255cc3b9..a3f32cb94469d 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -2,9 +2,135 @@ /* Copyright (c) 2019 Intel Corporation */ #include "igc.h" +#include "igc_base.h" #include "igc_hw.h" #include "igc_tsn.h" +DEFINE_STATIC_KEY_FALSE(igc_fpe_enabled); + +static int igc_fpe_init_smd_frame(struct igc_ring *ring, + struct igc_tx_buffer *buffer, + struct sk_buff *skb) +{ + dma_addr_t dma = dma_map_single(ring->dev, skb->data, skb->len, + DMA_TO_DEVICE); + + if (dma_mapping_error(ring->dev, dma)) { + netdev_err_once(ring->netdev, "Failed to map DMA for TX\n"); + return -ENOMEM; + } + + buffer->skb = skb; + buffer->protocol = 0; + buffer->bytecount = skb->len; + buffer->gso_segs = 1; + buffer->time_stamp = jiffies; + dma_unmap_len_set(buffer, len, skb->len); + dma_unmap_addr_set(buffer, dma, dma); + + return 0; +} + +static int igc_fpe_init_tx_descriptor(struct igc_ring *ring, + struct sk_buff *skb, + enum igc_txd_popts_type type) +{ + u32 cmd_type, olinfo_status = 0; + struct igc_tx_buffer *buffer; + union igc_adv_tx_desc *desc; + int err; + + if (!igc_desc_unused(ring)) + return -EBUSY; + + buffer = &ring->tx_buffer_info[ring->next_to_use]; + err = igc_fpe_init_smd_frame(ring, buffer, skb); + if (err) + 
return err; + + cmd_type = IGC_ADVTXD_DTYP_DATA | IGC_ADVTXD_DCMD_DEXT | + IGC_ADVTXD_DCMD_IFCS | IGC_TXD_DCMD | + buffer->bytecount; + + olinfo_status |= FIELD_PREP(IGC_ADVTXD_PAYLEN_MASK, buffer->bytecount); + + switch (type) { + case SMD_V: + case SMD_R: + olinfo_status |= FIELD_PREP(IGC_TXD_POPTS_SMD_MASK, type); + break; + } + + desc = IGC_TX_DESC(ring, ring->next_to_use); + desc->read.cmd_type_len = cpu_to_le32(cmd_type); + desc->read.olinfo_status = cpu_to_le32(olinfo_status); + desc->read.buffer_addr = cpu_to_le64(dma_unmap_addr(buffer, dma)); + + netdev_tx_sent_queue(txring_txq(ring), skb->len); + + buffer->next_to_watch = desc; + ring->next_to_use = (ring->next_to_use + 1) % ring->count; + + return 0; +} + +static int igc_fpe_xmit_smd_frame(struct igc_adapter *adapter, + enum igc_txd_popts_type type) +{ + int cpu = smp_processor_id(); + struct netdev_queue *nq; + struct igc_ring *ring; + struct sk_buff *skb; + int err; + + ring = igc_get_tx_ring(adapter, cpu); + nq = txring_txq(ring); + + skb = alloc_skb(SMD_FRAME_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + skb_put_zero(skb, SMD_FRAME_SIZE); + + __netif_tx_lock(nq, cpu); + + err = igc_fpe_init_tx_descriptor(ring, skb, type); + igc_flush_tx_descriptors(ring); + + __netif_tx_unlock(nq); + + return err; +} + +static void igc_fpe_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket type) +{ + struct igc_fpe_t *fpe = container_of(mmsv, struct igc_fpe_t, mmsv); + struct igc_adapter *adapter; + int err; + + adapter = container_of(fpe, struct igc_adapter, fpe); + + if (type == ETHTOOL_MPACKET_VERIFY) { + err = igc_fpe_xmit_smd_frame(adapter, SMD_V); + if (err && net_ratelimit()) + netdev_err(adapter->netdev, "Error sending SMD-V\n"); + } else if (type == ETHTOOL_MPACKET_RESPONSE) { + err = igc_fpe_xmit_smd_frame(adapter, SMD_R); + if (err && net_ratelimit()) + netdev_err(adapter->netdev, "Error sending SMD-R frame\n"); + } +} + +static const struct ethtool_mmsv_ops igc_mmsv_ops = { + .send_mpacket = igc_fpe_send_mpacket, +}; + +void igc_fpe_init(struct igc_adapter *adapter) +{ + ethtool_mmsv_init(&adapter->fpe.mmsv, adapter->netdev, &igc_mmsv_ops); +} + static bool is_any_launchtime(struct igc_adapter *adapter) { int i; @@ -49,6 +175,9 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter) if (adapter->strict_priority_enable) new_flags |= IGC_FLAG_TSN_LEGACY_ENABLED; + if (adapter->fpe.mmsv.pmac_enabled) + new_flags |= IGC_FLAG_TSN_PREEMPT_ENABLED; + return new_flags; } @@ -169,7 +298,8 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) tqavctrl = rd32(IGC_TQAVCTRL); tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | - IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS); + IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS | + IGC_TQAVCTRL_PREEMPT_ENA); wr32(IGC_TQAVCTRL, tqavctrl); @@ -388,10 +518,14 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) wr32(IGC_TXQCTL(i), txqctl); } - tqavctrl = rd32(IGC_TQAVCTRL) & ~IGC_TQAVCTRL_FUTSCDDIS; + tqavctrl = rd32(IGC_TQAVCTRL) & ~(IGC_TQAVCTRL_FUTSCDDIS | + IGC_TQAVCTRL_PREEMPT_ENA); tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV; + if (adapter->fpe.mmsv.pmac_enabled) + tqavctrl |= IGC_TQAVCTRL_PREEMPT_ENA; + adapter->qbv_count++; cycle = adapter->cycle_time; @@ -452,6 +586,14 @@ int igc_tsn_reset(struct igc_adapter *adapter) unsigned int new_flags; int err = 0; + if (adapter->fpe.mmsv.pmac_enabled) { + err = igc_enable_empty_addr_recv(adapter); + if (err && net_ratelimit()) + netdev_err(adapter->netdev, 
"Error adding empty address to MAC filter\n"); + } else { + igc_disable_empty_addr_recv(adapter); + } + new_flags = igc_tsn_new_flags(adapter); if (!(new_flags & IGC_FLAG_TSN_ANY_ENABLED)) diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index 98ec845a86bf0..bb30d15cec76d 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -4,9 +4,59 @@ #ifndef _IGC_TSN_H_ #define _IGC_TSN_H_ +#define SMD_FRAME_SIZE 60 + +enum igc_txd_popts_type { + SMD_V = 0x01, + SMD_R = 0x02, +}; + +DECLARE_STATIC_KEY_FALSE(igc_fpe_enabled); + +void igc_fpe_init(struct igc_adapter *adapter); int igc_tsn_offload_apply(struct igc_adapter *adapter); int igc_tsn_reset(struct igc_adapter *adapter); void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter); bool igc_tsn_is_taprio_activated_by_user(struct igc_adapter *adapter); +static inline bool igc_fpe_is_pmac_enabled(struct igc_adapter *adapter) +{ + return static_branch_unlikely(&igc_fpe_enabled) && + adapter->fpe.mmsv.pmac_enabled; +} + +static inline bool igc_fpe_handle_mpacket(struct igc_adapter *adapter, + union igc_adv_rx_desc *rx_desc, + unsigned int size, void *pktbuf) +{ + u32 status_error = le32_to_cpu(rx_desc->wb.upper.status_error); + int smd; + + smd = FIELD_GET(IGC_RXDADV_STAT_SMD_TYPE_MASK, status_error); + if (smd != IGC_RXD_STAT_SMD_TYPE_V && smd != IGC_RXD_STAT_SMD_TYPE_R) + return false; + + if (size == SMD_FRAME_SIZE && mem_is_zero(pktbuf, SMD_FRAME_SIZE)) { + struct ethtool_mmsv *mmsv = &adapter->fpe.mmsv; + enum ethtool_mmsv_event event; + + if (smd == IGC_RXD_STAT_SMD_TYPE_V) + event = ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET; + else + event = ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET; + + ethtool_mmsv_event_handle(mmsv, event); + } + + return true; +} + +static inline bool igc_fpe_transmitted_smd_v(union igc_adv_tx_desc *tx_desc) +{ + u32 olinfo_status = le32_to_cpu(tx_desc->read.olinfo_status); + u8 smd = FIELD_GET(IGC_TXD_POPTS_SMD_MASK, olinfo_status); + + return smd == SMD_V; +} + #endif /* _IGC_BASE_H */ From 55ececab98856be35e8a418a25e1081df931f1eb Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:39 -0400 Subject: [PATCH 325/770] igc: add support to set tx-min-frag-size Add support for setting tx-min-frag-size via the set_mm callback in igc. If the requested value is unsupported, round it up to the smallest supported i226 size (64, 128, 192, 256) and send a netlink message to inform the user. 
Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc.h | 1 + drivers/net/ethernet/intel/igc/igc_defines.h | 1 + drivers/net/ethernet/intel/igc/igc_ethtool.c | 5 +++ drivers/net/ethernet/intel/igc/igc_tsn.c | 39 ++++++++++++++++++-- drivers/net/ethernet/intel/igc/igc_tsn.h | 1 + 5 files changed, 44 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h index cce537d7dbd34..9bd243f23b469 100644 --- a/drivers/net/ethernet/intel/igc/igc.h +++ b/drivers/net/ethernet/intel/igc/igc.h @@ -42,6 +42,7 @@ void igc_ethtool_set_ops(struct net_device *); struct igc_fpe_t { struct ethtool_mmsv mmsv; + u32 tx_min_frag_size; }; enum igc_mac_filter_type { diff --git a/drivers/net/ethernet/intel/igc/igc_defines.h b/drivers/net/ethernet/intel/igc/igc_defines.h index 66061b0d6cbf4..7189dfc389ad5 100644 --- a/drivers/net/ethernet/intel/igc/igc_defines.h +++ b/drivers/net/ethernet/intel/igc/igc_defines.h @@ -583,6 +583,7 @@ #define IGC_TQAVCTRL_PREEMPT_ENA 0x00000002 #define IGC_TQAVCTRL_ENHANCED_QAV 0x00000008 #define IGC_TQAVCTRL_FUTSCDDIS 0x00000080 +#define IGC_TQAVCTRL_MIN_FRAG_MASK 0x0000C000 #define IGC_TXQCTL_QUEUE_MODE_LAUNCHT 0x00000001 #define IGC_TXQCTL_STRICT_CYCLE 0x00000002 diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index b64d5c6c1d20c..529654ccd83f6 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1789,6 +1789,11 @@ static int igc_ethtool_set_mm(struct net_device *netdev, struct igc_adapter *adapter = netdev_priv(netdev); struct igc_fpe_t *fpe = &adapter->fpe; + fpe->tx_min_frag_size = igc_fpe_get_supported_frag_size(cmd->tx_min_frag_size); + if (fpe->tx_min_frag_size != cmd->tx_min_frag_size) + NL_SET_ERR_MSG_MOD(extack, + "tx-min-frag-size value set is unsupported. 
Rounded up to supported value (64, 128, 192, 256)"); + if (fpe->mmsv.pmac_enabled != cmd->pmac_enabled) { if (cmd->pmac_enabled) static_branch_inc(&igc_fpe_enabled); diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index a3f32cb94469d..b23bf0be007de 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -6,6 +6,13 @@ #include "igc_hw.h" #include "igc_tsn.h" +#define MIN_MULTPLIER_TX_MIN_FRAG 0 +#define MAX_MULTPLIER_TX_MIN_FRAG 3 +/* Frag size is based on the Section 8.12.2 of the SW User Manual */ +#define TX_MIN_FRAG_SIZE 64 +#define TX_MAX_FRAG_SIZE (TX_MIN_FRAG_SIZE * \ + (MAX_MULTPLIER_TX_MIN_FRAG + 1)) + DEFINE_STATIC_KEY_FALSE(igc_fpe_enabled); static int igc_fpe_init_smd_frame(struct igc_ring *ring, @@ -128,6 +135,7 @@ static const struct ethtool_mmsv_ops igc_mmsv_ops = { void igc_fpe_init(struct igc_adapter *adapter) { + adapter->fpe.tx_min_frag_size = TX_MIN_FRAG_SIZE; ethtool_mmsv_init(&adapter->fpe.mmsv, adapter->netdev, &igc_mmsv_ops); } @@ -299,7 +307,7 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) tqavctrl = rd32(IGC_TQAVCTRL); tqavctrl &= ~(IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV | IGC_TQAVCTRL_FUTSCDDIS | - IGC_TQAVCTRL_PREEMPT_ENA); + IGC_TQAVCTRL_PREEMPT_ENA | IGC_TQAVCTRL_MIN_FRAG_MASK); wr32(IGC_TQAVCTRL, tqavctrl); @@ -345,12 +353,35 @@ static void igc_tsn_set_retx_qbvfullthreshold(struct igc_adapter *adapter) wr32(IGC_RETX_CTL, retxctl); } +static u8 igc_fpe_get_frag_size_mult(const struct igc_fpe_t *fpe) +{ + u8 mult = (fpe->tx_min_frag_size / TX_MIN_FRAG_SIZE) - 1; + + return clamp_t(u8, mult, MIN_MULTPLIER_TX_MIN_FRAG, + MAX_MULTPLIER_TX_MIN_FRAG); +} + +u32 igc_fpe_get_supported_frag_size(u32 frag_size) +{ + const u32 supported_sizes[] = {64, 128, 192, 256}; + + /* Find the smallest supported size that is >= frag_size */ + for (int i = 0; i < ARRAY_SIZE(supported_sizes); i++) { + if (frag_size <= supported_sizes[i]) + return supported_sizes[i]; + } + + /* Should not happen */ + return TX_MAX_FRAG_SIZE; +} + static int igc_tsn_enable_offload(struct igc_adapter *adapter) { struct igc_hw *hw = &adapter->hw; u32 tqavctrl, baset_l, baset_h; u32 sec, nsec, cycle; ktime_t base_time, systim; + u32 frag_size_mult; int i; wr32(IGC_TSAUXC, 0); @@ -519,13 +550,15 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) } tqavctrl = rd32(IGC_TQAVCTRL) & ~(IGC_TQAVCTRL_FUTSCDDIS | - IGC_TQAVCTRL_PREEMPT_ENA); - + IGC_TQAVCTRL_PREEMPT_ENA | IGC_TQAVCTRL_MIN_FRAG_MASK); tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN | IGC_TQAVCTRL_ENHANCED_QAV; if (adapter->fpe.mmsv.pmac_enabled) tqavctrl |= IGC_TQAVCTRL_PREEMPT_ENA; + frag_size_mult = igc_fpe_get_frag_size_mult(&adapter->fpe); + tqavctrl |= FIELD_PREP(IGC_TQAVCTRL_MIN_FRAG_MASK, frag_size_mult); + adapter->qbv_count++; cycle = adapter->cycle_time; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index bb30d15cec76d..58fe5f0773d76 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -14,6 +14,7 @@ enum igc_txd_popts_type { DECLARE_STATIC_KEY_FALSE(igc_fpe_enabled); void igc_fpe_init(struct igc_adapter *adapter); +u32 igc_fpe_get_supported_frag_size(u32 frag_size); int igc_tsn_offload_apply(struct igc_adapter *adapter); int igc_tsn_reset(struct igc_adapter *adapter); void igc_tsn_adjust_txtime_offset(struct igc_adapter *adapter); From e9074d7f376876f819090b4fca99a370ab7b830b Mon Sep 17 
00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:40 -0400 Subject: [PATCH 326/770] igc: block setting preemptible traffic class in taprio Since the preemptible tc implementation is not ready yet, block it from being set in taprio. The existing code already blocks it in mqprio. Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index d160da49b0d28..5b06765a35e9a 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6486,6 +6486,10 @@ static int igc_save_qbv_schedule(struct igc_adapter *adapter, if (!validate_schedule(adapter, qopt)) return -EINVAL; + /* preemptible isn't supported yet */ + if (qopt->mqprio.preemptible_tcs) + return -EOPNOTSUPP; + igc_ptp_read(adapter, &now); if (igc_tsn_is_taprio_activated_by_user(adapter) && From 10e2ffe10e437b81ed3ec8162638b3dac5d6247b Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:41 -0400 Subject: [PATCH 327/770] igc: add support to get MAC Merge data via ethtool Implement "ethtool --show-mm" callback for IGC. Tested with command: $ ethtool --show-mm enp1s0. MAC Merge layer state for enp1s0: pMAC enabled: on TX enabled: on TX active: on TX minimum fragment size: 64 RX minimum fragment size: 60 Verify enabled: on Verify time: 128 Max verify time: 128 Verification status: SUCCEEDED Verified that the field values are retrieved correctly. Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 14 ++++++++++++++ drivers/net/ethernet/intel/igc/igc_tsn.h | 1 + 2 files changed, 15 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index 529654ccd83f6..fd4b4b3323093 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1782,6 +1782,19 @@ static int igc_ethtool_set_eee(struct net_device *netdev, return 0; } +static int igc_ethtool_get_mm(struct net_device *netdev, + struct ethtool_mm_state *cmd) +{ + struct igc_adapter *adapter = netdev_priv(netdev); + struct igc_fpe_t *fpe = &adapter->fpe; + + ethtool_mmsv_get_mm(&fpe->mmsv, cmd); + cmd->tx_min_frag_size = fpe->tx_min_frag_size; + cmd->rx_min_frag_size = IGC_RX_MIN_FRAG_SIZE; + + return 0; +} + static int igc_ethtool_set_mm(struct net_device *netdev, struct ethtool_mm_cfg *cmd, struct netlink_ext_ack *extack) @@ -2101,6 +2114,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .get_link_ksettings = igc_ethtool_get_link_ksettings, .set_link_ksettings = igc_ethtool_set_link_ksettings, .self_test = igc_ethtool_diag_test, + .get_mm = igc_ethtool_get_mm, .set_mm = igc_ethtool_set_mm, }; diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.h b/drivers/net/ethernet/intel/igc/igc_tsn.h index 58fe5f0773d76..c2a77229207b3 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.h +++ b/drivers/net/ethernet/intel/igc/igc_tsn.h @@ -4,6 +4,7 @@ #ifndef _IGC_TSN_H_ #define _IGC_TSN_H_ +#define IGC_RX_MIN_FRAG_SIZE 60 #define SMD_FRAME_SIZE 60 enum igc_txd_popts_type { From f05ce73cc3b2bbf8cd03d35337d3b380692840fb Mon Sep 17 00:00:00 2001 From: Faizal Rahim Date: Mon, 17 Mar 2025 23:07:42 -0400 Subject: [PATCH 328/770] igc: add support to get frame preemption statistics via
ethtool Implemented "ethtool --include-statistics --show-mm" callback for IGC. Tested preemption scenario to check preemption statistics: 1) Trigger verification handshake on both boards: $ sudo ethtool --set-mm enp1s0 pmac-enabled on $ sudo ethtool --set-mm enp1s0 tx-enabled on $ sudo ethtool --set-mm enp1s0 verify-enabled on 2) Set preemptible or express queue in taprio for tx board: $ sudo tc qdisc replace dev enp1s0 parent root handle 100 taprio \ num_tc 4 map 3 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 1@0 1@1 1@2 1@3 base-time 0 sched-entry S F 100000 \ fp E E P P 3) Send large size packets on preemptible queue 4) Send small size packets on express queue to preempt packets in preemptible queue 5) Show preemption statistics on the receiving board: $ ethtool --include-statistics --show-mm enp1s0 MAC Merge layer state for enp1s0: pMAC enabled: on TX enabled: on TX active: on TX minimum fragment size: 64 RX minimum fragment size: 60 Verify enabled: on Verify time: 128 Max verify time: 128 Verification status: SUCCEEDED Statistics: MACMergeFrameAssErrorCount: 0 MACMergeFrameSmdErrorCount: 0 MACMergeFrameAssOkCount: 511 MACMergeFragCountRx: 764 MACMergeFragCountTx: 0 MACMergeHoldCount: 0 Co-developed-by: Vinicius Costa Gomes Signed-off-by: Vinicius Costa Gomes Co-developed-by: Chwee-Lin Choong Signed-off-by: Chwee-Lin Choong Reviewed-by: Vladimir Oltean Signed-off-by: Faizal Rahim Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ethtool.c | 41 ++++++++++++++++++++ drivers/net/ethernet/intel/igc/igc_regs.h | 16 ++++++++ 2 files changed, 57 insertions(+) diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c index fd4b4b3323093..3fc1eded9605f 100644 --- a/drivers/net/ethernet/intel/igc/igc_ethtool.c +++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c @@ -1819,6 +1819,46 @@ static int igc_ethtool_set_mm(struct net_device *netdev, return igc_tsn_offload_apply(adapter); } +/** + * igc_ethtool_get_frame_ass_error - Get the frame assembly error count. + * @reg_value: Register value for IGC_PRMEXCPRCNT + * Return: The count of frame assembly errors. 
+ */ +static u64 igc_ethtool_get_frame_ass_error(u32 reg_value) +{ + /* Out of order statistics */ + u32 ooo_frame_cnt, ooo_frag_cnt; + u32 miss_frame_frag_cnt; + + ooo_frame_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAME_CNT, reg_value); + ooo_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_OOO_FRAG_CNT, reg_value); + miss_frame_frag_cnt = FIELD_GET(IGC_PRMEXCPRCNT_MISS_FRAME_FRAG_CNT, + reg_value); + + return ooo_frame_cnt + ooo_frag_cnt + miss_frame_frag_cnt; +} + +static u64 igc_ethtool_get_frame_smd_error(u32 reg_value) +{ + return FIELD_GET(IGC_PRMEXCPRCNT_OOO_SMDC, reg_value); +} + +static void igc_ethtool_get_mm_stats(struct net_device *dev, + struct ethtool_mm_stats *stats) +{ + struct igc_adapter *adapter = netdev_priv(dev); + struct igc_hw *hw = &adapter->hw; + u32 reg_value; + + reg_value = rd32(IGC_PRMEXCPRCNT); + + stats->MACMergeFrameAssErrorCount = igc_ethtool_get_frame_ass_error(reg_value); + stats->MACMergeFrameSmdErrorCount = igc_ethtool_get_frame_smd_error(reg_value); + stats->MACMergeFrameAssOkCount = rd32(IGC_PRMPTDRCNT); + stats->MACMergeFragCountRx = rd32(IGC_PRMEVNTRCNT); + stats->MACMergeFragCountTx = rd32(IGC_PRMEVNTTCNT); +} + static int igc_ethtool_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { @@ -2115,6 +2155,7 @@ static const struct ethtool_ops igc_ethtool_ops = { .set_link_ksettings = igc_ethtool_set_link_ksettings, .self_test = igc_ethtool_diag_test, .get_mm = igc_ethtool_get_mm, + .get_mm_stats = igc_ethtool_get_mm_stats, .set_mm = igc_ethtool_set_mm, }; diff --git a/drivers/net/ethernet/intel/igc/igc_regs.h b/drivers/net/ethernet/intel/igc/igc_regs.h index 12ddc5793651d..f343c6bfc6be9 100644 --- a/drivers/net/ethernet/intel/igc/igc_regs.h +++ b/drivers/net/ethernet/intel/igc/igc_regs.h @@ -222,6 +222,22 @@ #define IGC_FTQF(_n) (0x059E0 + (4 * (_n))) /* 5-tuple Queue Fltr */ +/* Time sync registers - preemption statistics */ +#define IGC_PRMPTDRCNT 0x04284 /* Good RX Preempted Packets */ +#define IGC_PRMEVNTTCNT 0x04298 /* TX Preemption event counter */ +#define IGC_PRMEVNTRCNT 0x0429C /* RX Preemption event counter */ + + /* Preemption Exception Counter */ + #define IGC_PRMEXCPRCNT 0x42A0 +/* Received out of order packets with SMD-C */ +#define IGC_PRMEXCPRCNT_OOO_SMDC 0x000000FF +/* Received out of order packets with SMD-C and wrong Frame CNT */ +#define IGC_PRMEXCPRCNT_OOO_FRAME_CNT 0x0000FF00 +/* Received out of order packets with SMD-C and wrong Frag CNT */ +#define IGC_PRMEXCPRCNT_OOO_FRAG_CNT 0x00FF0000 +/* Received packets with SMD-S and wrong Frag CNT and Frame CNT */ +#define IGC_PRMEXCPRCNT_MISS_FRAME_FRAG_CNT 0xFF000000 + /* Transmit Scheduling Registers */ #define IGC_TQAVCTRL 0x3570 #define IGC_TXQCTL(_n) (0x3344 + 0x4 * (_n)) From 12b196568a3a7a32d18fc82fdef82832e9b94d0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Apr 2025 16:49:42 -0700 Subject: [PATCH 329/770] tools: ynl: add missing header deps Various new families and my recent work on rtnetlink missed adding dependencies on C headers. If the system headers are up to date or don't include a given header at all this doesn't make a difference. But if the system headers are in place but stale - compilation will break. 
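The practical effect, sketched with an assumed example (IFA_PROTO stands in here for any recently added uapi symbol; the exact symbol the generated code trips over is an assumption): without the if_addr.h dependency, generated rt-addr code can end up compiled against the stale system copy:

#include <linux/if_addr.h>	/* stale system header, predates IFA_PROTO */

/* generated user code then fails along these lines: */
int attr_type = IFA_PROTO;	/* error: 'IFA_PROTO' undeclared */

As I read the get_hdr_inc macro, the added entry guards the system copy out and force-includes the in-tree header, so the fresh definitions always win.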
Reported-by: Kory Maincent Fixes: 29d34a4d785b ("tools: ynl: generate code for rt-addr and add a sample") Link: https://lore.kernel.org/20250418190431.69c10431@kmaincent-XPS-13-7390 Acked-by: Stanislav Fomichev Tested-by: Kory Maincent Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250418234942.2344036-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 385783489f849..8b7bf673b686f 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -20,15 +20,18 @@ CFLAGS_ethtool:=$(call get_hdr_inc,_LINUX_ETHTOOL_H,ethtool.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_H_,ethtool_netlink.h) \ $(call get_hdr_inc,_LINUX_ETHTOOL_NETLINK_GENERATED_H,ethtool_netlink_generated.h) CFLAGS_handshake:=$(call get_hdr_inc,_LINUX_HANDSHAKE_H,handshake.h) +CFLAGS_lockd_netlink:=$(call get_hdr_inc,_LINUX_LOCKD_NETLINK_H,lockd_netlink.h) CFLAGS_mptcp_pm:=$(call get_hdr_inc,_LINUX_MPTCP_PM_H,mptcp_pm.h) CFLAGS_net_shaper:=$(call get_hdr_inc,_LINUX_NET_SHAPER_H,net_shaper.h) CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) CFLAGS_nl80211:=$(call get_hdr_inc,__LINUX_NL802121_H,nl80211.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) +CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) -CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) +CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ + $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-route:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) CFLAGS_tcp_metrics:=$(call get_hdr_inc,_LINUX_TCP_METRICS_H,tcp_metrics.h) From 61fde5110ee9f1f6fbf6c75c3426ee5d144628b4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Apr 2025 12:24:47 +0100 Subject: [PATCH 330/770] net: axienet: Fix spelling mistake "archecture" -> "architecture" There is a spelling mistake in a dev_error message. Fix it. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250418112447.533746-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 054abf283ab33..1b7a653c1f4ed 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -2980,7 +2980,7 @@ static int axienet_probe(struct platform_device *pdev) } } if (!IS_ENABLED(CONFIG_64BIT) && lp->features & XAE_FEATURE_DMA_64BIT) { - dev_err(&pdev->dev, "64-bit addressable DMA is not compatible with 32-bit archecture\n"); + dev_err(&pdev->dev, "64-bit addressable DMA is not compatible with 32-bit architecture\n"); ret = -EINVAL; goto cleanup_clk; } From 1e36473215297708dbe144c65b9f242c6e604520 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Apr 2025 14:57:03 +0100 Subject: [PATCH 331/770] net/mlx5: Fix spelling mistakes in mlx5_core_dbg message and comments There is a spelling mistake in a mlx5_core_dbg and two spelling mistakes in comment blocks. Fix them. 
Signed-off-by: Colin Ian King Acked-by: Mark Bloch Link: https://patch.msgid.link/20250418135703.542722-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 2c5f850c31f68..40024cfa30998 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -148,7 +148,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int function_id, * Free the IRQ and other resources such as rmap from the system. * BUT doesn't free or remove reference from mlx5. * This function is very important for the shutdown flow, where we need to - * cleanup system resoruces but keep mlx5 objects alive, + * cleanup system resources but keep mlx5 objects alive, * see mlx5_irq_table_free_irqs(). */ static void mlx5_system_free_irq(struct mlx5_irq *irq) @@ -588,7 +588,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool) struct mlx5_irq *irq; unsigned long index; - /* There are cases in which we are destrying the irq_table before + /* There are cases in which we are destroying the irq_table before * freeing all the IRQs, fast teardown for example. Hence, free the irqs * which might not have been freed. */ @@ -617,7 +617,7 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec, if (!mlx5_sf_max_functions(dev)) return 0; if (sf_vec < MLX5_IRQ_VEC_COMP_BASE_SF) { - mlx5_core_dbg(dev, "Not enught IRQs for SFs. SF may run at lower performance\n"); + mlx5_core_dbg(dev, "Not enough IRQs for SFs. SF may run at lower performance\n"); return 0; } From 199561a48f026bf674424fd9019c0690a3570377 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 17 Apr 2025 11:28:23 +0100 Subject: [PATCH 332/770] s390: ism: Pass string literal as format argument of dev_set_name() GCC 14.2.0 reports that passing a non-string literal as the format argument of dev_set_name() is potentially insecure. drivers/s390/net/ism_drv.c: In function 'ism_probe': drivers/s390/net/ism_drv.c:615:2: warning: format not a string literal and no format arguments [-Wformat-security] 615 | dev_set_name(&ism->dev, dev_name(&pdev->dev)); | ^~~~~~~~~~~~ It seems to me that as pdev is a PCIE device then the dev_name call above should always return the device's BDF, e.g. 00:12.0. That this should not contain format escape sequences. And thus the current usage is safe. But, it seems better to be safe than sorry. And, as a bonus, compiler output becomes less verbose by addressing this issue. Compile tested only. No functional change intended. 
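The class of bug GCC is warning about, in a standalone sketch: any '%' in a string passed as the format argument is interpreted, so pass-through data should always travel behind an explicit "%s".

#include <stdio.h>

int main(void)
{
	const char *name = "00:12.0%n";	/* imagine a hostile device name */

	/* printf(name); -- would interpret %n: undefined behaviour */
	printf("%s\n", name);		/* safe: name is treated as data */
	return 0;
}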
Signed-off-by: Simon Horman Link: https://patch.msgid.link/20250417-ism-str-fmt-v1-1-9818b029874d@kernel.org Signed-off-by: Jakub Kicinski --- drivers/s390/net/ism_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index 60ed70a39d2cc..b7f15f303ea2d 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -611,7 +611,7 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ism->dev.parent = &pdev->dev; ism->dev.release = ism_dev_release; device_initialize(&ism->dev); - dev_set_name(&ism->dev, dev_name(&pdev->dev)); + dev_set_name(&ism->dev, "%s", dev_name(&pdev->dev)); ret = device_add(&ism->dev); if (ret) goto err_dev; From 9929ba194299eadc80849db4cb992f4bc310401b Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 17 Apr 2025 06:03:07 -0700 Subject: [PATCH 333/770] net: Use nlmsg_payload in neighbour file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250417-nlmsg_v3-v1-1-9b09d9d7e61d@debian.org Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 65cf582b5dacd..254067b719da3 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2747,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); From a45193018001b6fbbaaf4ae2c5c37db426d42d94 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 17 Apr 2025 06:03:08 -0700 Subject: [PATCH 334/770] net: Use nlmsg_payload in rtnetlink file Leverage the new nlmsg_payload() helper to avoid checking for message size and then reading the nlmsg data. 
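For reference, the helper folds the two steps into one; paraphrased from include/net/netlink.h (treat the exact body as a sketch rather than the authoritative definition):

/* Return the message payload if the message is large enough to hold a
 * header plus 'len' bytes of payload, otherwise NULL. */
static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len)
{
	if (nlh->nlmsg_len < nlmsg_msg_size(len))
		return NULL;

	return nlmsg_data(nlh);
}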
Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250417-nlmsg_v3-v1-2-9b09d9d7e61d@debian.org Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ea2e15de8d997..8a914b37ef6e8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2390,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -4084,7 +4084,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -4093,7 +4094,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -5052,12 +5052,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -5324,12 +5324,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -6221,7 +6221,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -6229,8 +6230,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. 
*/ @@ -6458,12 +6457,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, { struct br_port_msg *bpm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } - bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; From 9276bfc2df92c6c45a963dea34d8c83656987ea7 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:13:12 +0100 Subject: [PATCH 335/770] net: stmmac: socfpga: init dwmac->stmmac_rst before registration Initialisation/setup after registration is a bug. This is the first of two patches fixing this in socfpga. dwmac->stmmac_rst is initialised from the stmmac plat_dat's stmmac_rst member, which is itself initialised by devm_stmmac_probe_config_dt(). Therefore, this can be initialised before we call stmmac_dvr_probe(). Move it there. dwmac->stmmac_rst is used by the set_phy_mode() method. Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5SnY-001IJY-90@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 1168556585597..bcdb25ee2a333 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -442,8 +442,6 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; int ret; struct socfpga_dwmac *dwmac; - struct net_device *ndev; - struct stmmac_priv *stpriv; const struct socfpga_dwmac_ops *ops; ops = device_get_match_data(&pdev->dev); @@ -479,7 +477,13 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) return ret; } + /* The socfpga driver needs to control the stmmac reset to set the phy + * mode. Create a copy of the core reset handle so it can be used by + * the driver later. + */ + dwmac->stmmac_rst = plat_dat->stmmac_rst; dwmac->ops = ops; + plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed; plat_dat->pcs_init = socfpga_dwmac_pcs_init; @@ -493,15 +497,6 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ndev = platform_get_drvdata(pdev); - stpriv = netdev_priv(ndev); - - /* The socfpga driver needs to control the stmmac reset to set the phy - * mode. Create a copy of the core reset handle so it can be used by - * the driver later. - */ - dwmac->stmmac_rst = stpriv->plat->stmmac_rst; - ret = ops->set_phy_mode(dwmac); if (ret) goto err_dvr_remove; From 0dbd4a6f57c29a6df89dfa02d2bc5c54e2ae0eaf Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:13:17 +0100 Subject: [PATCH 336/770] net: stmmac: socfpga: provide init function Both the resume and probe path needs to configure the phy mode, so provide a common function to do this which can later be hooked into plat_dat->init. 
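The ordering rule in miniature, condensed from the diff that follows (not compilable on its own): once the netdev is registered it is published, so anything its callbacks consume must be in place first.

/* Before: the copy happened only after stmmac_dvr_probe() had
 * registered the netdev */
ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
dwmac->stmmac_rst = stpriv->plat->stmmac_rst;	/* device already live */

/* After: the reset handle is in place before anything can open the
 * device */
dwmac->stmmac_rst = plat_dat->stmmac_rst;
ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);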
Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5Snd-001IJe-Cx@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index bcdb25ee2a333..c333ec07d15fa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -435,6 +435,13 @@ static struct phylink_pcs *socfpga_dwmac_select_pcs(struct stmmac_priv *priv, return priv->hw->phylink_pcs; } +static int socfpga_dwmac_init(struct platform_device *pdev, void *bsp_priv) +{ + struct socfpga_dwmac *dwmac = bsp_priv; + + return dwmac->ops->set_phy_mode(dwmac); +} + static int socfpga_dwmac_probe(struct platform_device *pdev) { struct plat_stmmacenet_data *plat_dat; @@ -497,7 +504,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) if (ret) return ret; - ret = ops->set_phy_mode(dwmac); + ret = socfpga_dwmac_init(pdev, dwmac); if (ret) goto err_dvr_remove; @@ -512,11 +519,9 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) #ifdef CONFIG_PM_SLEEP static int socfpga_dwmac_resume(struct device *dev) { - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); struct socfpga_dwmac *dwmac_priv = get_stmmac_bsp_priv(dev); - dwmac_priv->ops->set_phy_mode(priv->plat->bsp_priv); + socfpga_dwmac_init(to_platform_device(dev), dwmac_priv); return stmmac_resume(dev); } From 6bf70d999aa90bc06656917baf042aead9cd4cda Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:13:22 +0100 Subject: [PATCH 337/770] net: stmmac: socfpga: convert to stmmac_pltfr_pm_ops Convert socfpga to use the generic stmmac_pltfr_pm_ops, which can be achieved by adding an appropriate plat_dat->init function to do the setup. 
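The conversion leans on the generic platform layer invoking the hook in both paths; stmmac_platform.c does roughly the following (paraphrased from memory, so a sketch rather than the exact source):

static int stmmac_pltfr_init(struct platform_device *pdev,
			     struct plat_stmmacenet_data *plat)
{
	int ret = 0;

	if (plat->init)
		ret = plat->init(pdev, plat->bsp_priv);

	return ret;
}

Both the platform probe helper and the stmmac_pltfr_pm_ops resume path funnel through this, which is why a single socfpga_dwmac_init() can serve probe and resume.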
Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5Sni-001IJk-Gi@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-socfpga.c | 37 +------------------ 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index c333ec07d15fa..69ffc52c02752 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -493,6 +493,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed; + plat_dat->init = socfpga_dwmac_init; plat_dat->pcs_init = socfpga_dwmac_pcs_init; plat_dat->pcs_exit = socfpga_dwmac_pcs_exit; plat_dat->select_pcs = socfpga_dwmac_select_pcs; @@ -516,40 +517,6 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) return ret; } -#ifdef CONFIG_PM_SLEEP -static int socfpga_dwmac_resume(struct device *dev) -{ - struct socfpga_dwmac *dwmac_priv = get_stmmac_bsp_priv(dev); - - socfpga_dwmac_init(to_platform_device(dev), dwmac_priv); - - return stmmac_resume(dev); -} -#endif /* CONFIG_PM_SLEEP */ - -static int __maybe_unused socfpga_dwmac_runtime_suspend(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - stmmac_bus_clks_config(priv, false); - - return 0; -} - -static int __maybe_unused socfpga_dwmac_runtime_resume(struct device *dev) -{ - struct net_device *ndev = dev_get_drvdata(dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - return stmmac_bus_clks_config(priv, true); -} - -static const struct dev_pm_ops socfpga_dwmac_pm_ops = { - SET_SYSTEM_SLEEP_PM_OPS(stmmac_suspend, socfpga_dwmac_resume) - SET_RUNTIME_PM_OPS(socfpga_dwmac_runtime_suspend, socfpga_dwmac_runtime_resume, NULL) -}; - static const struct socfpga_dwmac_ops socfpga_gen5_ops = { .set_phy_mode = socfpga_gen5_set_phy_mode, }; @@ -570,7 +537,7 @@ static struct platform_driver socfpga_dwmac_driver = { .remove = stmmac_pltfr_remove, .driver = { .name = "socfpga-dwmac", - .pm = &socfpga_dwmac_pm_ops, + .pm = &stmmac_pltfr_pm_ops, .of_match_table = socfpga_dwmac_match, }, }; From 91255347bba9637dc2717897127a6259cd4fbeac Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:13:27 +0100 Subject: [PATCH 338/770] net: stmmac: socfpga: call set_phy_mode() before registration Initialisation/setup after registration is a bug. This is the second of two patches fixing this in socfpga. The set_phy_mode() functions do various hardware setup that would interfere with a netdev that has been published, and thus available to be opened by the kernel/userspace. However, set_phy_mode() relies upon the netdev having been initialised to get at the plat_stmmacenet_data structure, which is probably why it was placed after stmmac_drv_probe(). We can remove that need by storing a pointer to struct plat_stmmacenet_data in struct socfpga_dwmac. Move the call to set_phy_mode() before calling stmmac_dvr_probe(). This also simplifies the probe function as there is no need to unregister the netdev if set_phy_mode() fails. 
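The enabler in miniature, condensed from the diff below: a back-pointer to the platform data lets the phy-mode helpers stop reaching through a registered netdev.

/* Before: had to reach through the (published) netdev */
struct net_device *ndev = dev_get_drvdata(dwmac->dev);
struct stmmac_priv *priv = netdev_priv(ndev);
return priv->plat->mac_interface;

/* After: bsp_priv carries a back-pointer to the platform data */
return dwmac->plat_dat->mac_interface;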
Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5Snn-001IJq-L0@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-socfpga.c | 20 +++++-------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 69ffc52c02752..c7c120e302974 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -50,6 +50,7 @@ struct socfpga_dwmac { u32 reg_offset; u32 reg_shift; struct device *dev; + struct plat_stmmacenet_data *plat_dat; struct regmap *sys_mgr_base_addr; struct reset_control *stmmac_rst; struct reset_control *stmmac_ocp_rst; @@ -233,10 +234,7 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac *dwmac, struct device * static int socfpga_get_plat_phymode(struct socfpga_dwmac *dwmac) { - struct net_device *ndev = dev_get_drvdata(dwmac->dev); - struct stmmac_priv *priv = netdev_priv(ndev); - - return priv->plat->mac_interface; + return dwmac->plat_dat->mac_interface; } static void socfpga_sgmii_config(struct socfpga_dwmac *dwmac, bool enable) @@ -490,6 +488,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) */ dwmac->stmmac_rst = plat_dat->stmmac_rst; dwmac->ops = ops; + dwmac->plat_dat = plat_dat; plat_dat->bsp_priv = dwmac; plat_dat->fix_mac_speed = socfpga_dwmac_fix_mac_speed; @@ -501,20 +500,11 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) plat_dat->riwt_off = 1; - ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) - return ret; - ret = socfpga_dwmac_init(pdev, dwmac); if (ret) - goto err_dvr_remove; - - return 0; - -err_dvr_remove: - stmmac_dvr_remove(&pdev->dev); + return ret; - return ret; + return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); } static const struct socfpga_dwmac_ops socfpga_gen5_ops = { From 1dbefd578d8bd3bb627f68cb162c8ab7884341fa Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:13:32 +0100 Subject: [PATCH 339/770] net: stmmac: socfpga: convert to devm_stmmac_pltfr_probe() Convert socfpga to use devm_stmmac_pltfr_probe() to further simplify the probe function, wrapping the call to the set_phy_mode() method into socfpga_dwmac_init() which can be called from the plat_dat->init() method. Also call this from socfpga_dwmac_resume() thereby simplifying that function. Using the devm variant also means we can remove the call to stmmac_pltfr_remove(). Unfortunately, we can't also convert to stmmac_pltfr_pm_ops as there is extra work done in socfpga_dwmac_resume(). 
Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5Sns-001IJw-OY@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index c7c120e302974..59f90b123c5b8 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -500,11 +500,7 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) plat_dat->riwt_off = 1; - ret = socfpga_dwmac_init(pdev, dwmac); - if (ret) - return ret; - - return stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); } static const struct socfpga_dwmac_ops socfpga_gen5_ops = { @@ -524,7 +520,6 @@ MODULE_DEVICE_TABLE(of, socfpga_dwmac_match); static struct platform_driver socfpga_dwmac_driver = { .probe = socfpga_dwmac_probe, - .remove = stmmac_pltfr_remove, .driver = { .name = "socfpga-dwmac", .pm = &stmmac_pltfr_pm_ops, From 0fcad44a86bdc2b5f202d91ba1eeeee6fceb7b25 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Thu, 17 Apr 2025 10:24:45 -0700 Subject: [PATCH 340/770] bnxt_en: Change FW message timeout warning The firmware advertises a "hwrm_cmd_max_timeout" value to the driver for NVRAM and coredump related functions that can take tens of seconds to complete. The driver polls for the operation to complete under mutex and may trigger hung task watchdog warning if the wait is too long. To warn the user about this, the driver currently prints a warning if this advertised value exceeds 40 seconds: Device requests max timeout of %d seconds, may trigger hung task watchdog Initially, we chose 40 seconds, well below the kernel's default CONFIG_DEFAULT_HUNG_TASK_TIMEOUT (120 seconds) to avoid triggering the hung task watchdog. But 60 seconds is the timeout on most production FW and cannot be reduced further. Change the driver's warning threshold to 60 seconds to avoid triggering this warning on all production devices. We also print the warning if the value exceeds CONFIG_DEFAULT_HUNG_TASK_TIMEOUT which may be set to architecture specific defaults as low as 10 seconds. 
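To make the thresholds concrete: production FW advertises 60, so hwrm_cmd_max_timeout becomes 60000 ms and max_tmo_secs is 60. With the default CONFIG_DEFAULT_HUNG_TASK_TIMEOUT of 120, neither check fires (60000 > 60000 and 60 > 120 are both false); on an architecture that defaults the watchdog to 10 seconds, the second check still warns, now quoting the kernel's own limit. Under the old 40-second threshold, every production device warned (60 > 40).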
Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250417172448.1206107-2-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 11 +++++++---- drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3d9205b9a8c3c..ad30445b47998 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10110,7 +10110,7 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) struct hwrm_ver_get_input *req; u16 fw_maj, fw_min, fw_bld, fw_rsv; u32 dev_caps_cfg, hwrm_ver; - int rc, len; + int rc, len, max_tmo_secs; rc = hwrm_req_init(bp, req, HWRM_VER_GET); if (rc) @@ -10183,9 +10183,12 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) bp->hwrm_cmd_max_timeout = le16_to_cpu(resp->max_req_timeout) * 1000; if (!bp->hwrm_cmd_max_timeout) bp->hwrm_cmd_max_timeout = HWRM_CMD_MAX_TIMEOUT; - else if (bp->hwrm_cmd_max_timeout > HWRM_CMD_MAX_TIMEOUT) - netdev_warn(bp->dev, "Device requests max timeout of %d seconds, may trigger hung task watchdog\n", - bp->hwrm_cmd_max_timeout / 1000); + max_tmo_secs = bp->hwrm_cmd_max_timeout / 1000; + if (bp->hwrm_cmd_max_timeout > HWRM_CMD_MAX_TIMEOUT || + max_tmo_secs > CONFIG_DEFAULT_HUNG_TASK_TIMEOUT) { + netdev_warn(bp->dev, "Device requests max timeout of %d seconds, may trigger hung task watchdog (kernel default %ds)\n", + max_tmo_secs, CONFIG_DEFAULT_HUNG_TASK_TIMEOUT); + } if (resp->hwrm_intf_maj_8b >= 1) { bp->hwrm_max_req_len = le16_to_cpu(resp->max_req_win_len); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h index 15ca51b5d204e..fb5f5b063c3d8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.h @@ -58,7 +58,7 @@ void hwrm_update_token(struct bnxt *bp, u16 seq, enum bnxt_hwrm_wait_state s); #define BNXT_HWRM_MAX_REQ_LEN (bp->hwrm_max_req_len) #define BNXT_HWRM_SHORT_REQ_LEN sizeof(struct hwrm_short_input) -#define HWRM_CMD_MAX_TIMEOUT 40000U +#define HWRM_CMD_MAX_TIMEOUT 60000U #define SHORT_HWRM_CMD_TIMEOUT 20 #define HWRM_CMD_TIMEOUT (bp->hwrm_cmd_timeout) #define HWRM_RESET_TIMEOUT ((HWRM_CMD_TIMEOUT) * 4) From c21c8e1e4348315aa89f31dfa96c6f3ba3b9a5cf Mon Sep 17 00:00:00 2001 From: Shruti Parab Date: Thu, 17 Apr 2025 10:24:46 -0700 Subject: [PATCH 341/770] bnxt_en: Report the ethtool coredump length after copying the coredump ethtool first calls .get_dump_flag() to get the dump length. For coredump, the driver calls the FW to get the coredump length (L1). The minimum of L1 and the user-specified length is then passed to .get_dump_data() (L2) to get the coredump. The actual coredump length retrieved by the FW (L3) during .get_dump_data() may be smaller than L1. This length discrepancy will trigger a WARN_ON() in ethtool_get_dump_data(). ethtool has already vzalloc'ed a buffer with size L1. Just report the coredump length as L2 even though the actual coredump length L3 may be smaller. The extra zero padding does not matter. This will prevent the warning that may alarm the user. For correctness, only do the final length update if there is no error.
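With hypothetical numbers: if the FW reports L1 = 10 MB up front, ethtool vzallocs 10 MB and the driver is asked for L2 = min(L1, user length) bytes; should the FW then retrieve only L3 = 9 MB during .get_dump_data(), reporting L3 would trip the WARN_ON() on the length mismatch, whereas reporting L2 simply leaves 1 MB of zeroes at the tail of the already-allocated buffer.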
Reviewed-by: Andy Gospodarek Reviewed-by: Damodharam Ammepalli Signed-off-by: Shruti Parab Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250417172448.1206107-3-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c index 5576e7cf84631..9b6489e417fc1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c @@ -496,9 +496,16 @@ static int __bnxt_get_coredump(struct bnxt *bp, u16 dump_type, void *buf, start_utc, coredump.total_segs + 1, rc); kfree(coredump.data); - *dump_len += sizeof(struct bnxt_coredump_record); - if (rc == -ENOBUFS) + if (!rc) { + *dump_len += sizeof(struct bnxt_coredump_record); + /* The actual coredump length can be smaller than the FW + * reported length earlier. Use the ethtool provided length. + */ + if (buf_len) + *dump_len = buf_len; + } else if (rc == -ENOBUFS) { netdev_err(bp->dev, "Firmware returned large coredump buffer\n"); + } return rc; } From 5bccacb4cc32cb835fe2fe100a210332c494e81d Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 17 Apr 2025 10:24:47 -0700 Subject: [PATCH 342/770] bnxt_en: Remove unused field "ref_count" in struct bnxt_ulp The "ref_count" field in struct bnxt_ulp is unused after commit a43c26fa2e6c ("RDMA/bnxt_re: Remove the sriov config callback"). So we can just remove it now. Reviewed-by: Somnath Kotur Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250417172448.1206107-4-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 5 ----- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 1 - 2 files changed, 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index a8e930d5dbb09..238db9a1aebf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -148,7 +148,6 @@ void bnxt_unregister_dev(struct bnxt_en_dev *edev) struct net_device *dev = edev->net; struct bnxt *bp = netdev_priv(dev); struct bnxt_ulp *ulp; - int i = 0; ulp = edev->ulp_tbl; netdev_lock(dev); @@ -164,10 +163,6 @@ void bnxt_unregister_dev(struct bnxt_en_dev *edev) synchronize_rcu(); ulp->max_async_event_id = 0; ulp->async_events_bmap = NULL; - while (atomic_read(&ulp->ref_count) != 0 && i < 10) { - msleep(100); - i++; - } mutex_unlock(&edev->en_dev_lock); netdev_unlock(dev); return; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index 7fa3b8d1ebd28..f6b5efb5e7753 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -50,7 +50,6 @@ struct bnxt_ulp { unsigned long *async_events_bmap; u16 max_async_event_id; u16 msix_requested; - atomic_t ref_count; }; struct bnxt_en_dev { From 76a69f360a712a30dfbc88d521454afb460012b1 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 17 Apr 2025 10:24:48 -0700 Subject: [PATCH 343/770] bnxt_en: Remove unused macros in bnxt_ulp.h BNXT_ROCE_ULP and BNXT_MAX_ULP are no longer used. Remove them to clean up the code. 
Reviewed-by: Shruti Parab Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Link: https://patch.msgid.link/20250417172448.1206107-5-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index f6b5efb5e7753..7b9dd8ebe4bcc 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -10,9 +10,6 @@ #ifndef BNXT_ULP_H #define BNXT_ULP_H -#define BNXT_ROCE_ULP 0 -#define BNXT_MAX_ULP 1 - #define BNXT_MIN_ROCE_CP_RINGS 2 #define BNXT_MIN_ROCE_STAT_CTXS 1 From 804b09be09f8af4eda5346a72361459ba21fcf1b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:29 +0300 Subject: [PATCH 344/770] vxlan: Add RCU read-side critical sections in the Tx path The Tx path does not run from an RCU read-side critical section which makes the current lockless accesses to FDB entries invalid. As far as I am aware, this has not been a problem in practice, but traces will be generated once we transition the FDB lookup to rhashtable_lookup(). Add rcu_read_{lock,unlock}() around the handling of FDB entries in the Tx path. Remove the RCU read-side critical section from vxlan_xmit_nh() as now the function is always called from an RCU read-side critical section. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-2-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 56aee539c235f..7872b85e890e0 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -1916,12 +1916,15 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } + rcu_read_lock(); f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); + rcu_read_unlock(); goto out; } + rcu_read_unlock(); reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); @@ -2648,14 +2651,10 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); - rcu_read_lock(); nh = rcu_dereference(f->nh); - if (!nh) { - rcu_read_unlock(); + if (!nh) goto drop; - } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); - rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); @@ -2782,6 +2781,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } eth = eth_hdr(skb); + rcu_read_lock(); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; @@ -2804,7 +2804,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); - return NETDEV_TX_OK; + goto out; } } @@ -2829,6 +2829,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } +out: + rcu_read_unlock(); return NETDEV_TX_OK; } From 884dd448f1ac8b5c5c0dcef9bcaab7c16ee48276 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:30 +0300 Subject: [PATCH 345/770] 
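The invariant being restored, as a minimal sketch condensed from the diff below (use() is a placeholder): an object returned by a lockless RCU lookup is only guaranteed to stay alive inside the read-side critical section.

rcu_read_lock();
f = vxlan_find_mac(vxlan, eth->h_dest, vni);	/* lockless lookup */
if (f)
	use(f);		/* valid here; may be freed after unlock */
rcu_read_unlock();
/* f must not be dereferenced past this point */

Without the section, a future rhashtable_lookup() would likely emit "suspicious rcu_dereference_check() usage" lockdep splats, which is the trace generation the changelog anticipates.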
vxlan: Simplify creation of default FDB entry There is asymmetry in how the default FDB entry (all-zeroes) is created and destroyed in the VXLAN driver. It is created as part of the driver's newlink() routine, but destroyed as part of its ndo_uninit() routine. This caused multiple problems in the past. First, commit 0241b836732f ("vxlan: fix default fdb entry netlink notify ordering during netdev create") split the notification about the entry from its creation so that it will not be notified to user space before the VXLAN device is registered. Then, commit 6db924687139 ("vxlan: Fix error path in __vxlan_dev_create()") made the error path in __vxlan_dev_create() asymmetric by destroying the FDB entry before unregistering the net device. Otherwise, the FDB entry would have been freed twice: By ndo_uninit() as part of unregister_netdevice() and by vxlan_fdb_destroy() in the error path. Finally, commit 7c31e54aeee5 ("vxlan: do not destroy fdb if register_netdevice() is failed") split the insertion of the FDB entry into the hash table from its creation, moving the insertion after the registration of the net device. Otherwise, like before, the FDB entry would have been freed twice: By ndo_uninit() as part of register_netdevice()'s error path and by vxlan_fdb_destroy() in the error path of __vxlan_dev_create(). The end result is that the code is unnecessarily complex. In addition, the fixed size hash table cannot be converted to rhashtable as vxlan_fdb_insert() cannot fail, which will no longer be true with rhashtable. Solve this by making the addition and deletion of the default FDB entry completely symmetric. Namely, as part of newlink() routine, create the entry, insert it into to the hash table and send a notification to user space after the net device was registered. Note that at this stage the net device is still administratively down and cannot transmit / receive packets. Move the deletion from ndo_uninit() to the dellink routine(): Flush the default entry together with all the other entries, before unregistering the net device. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-3-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 78 +++++++++++----------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 7872b85e890e0..3df86927b1ece 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2930,18 +2930,6 @@ static int vxlan_init(struct net_device *dev) return err; } -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) -{ - struct vxlan_fdb *f; - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); - f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); - if (f) - vxlan_fdb_destroy(vxlan, f, true, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); -} - static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -2952,8 +2940,6 @@ static void vxlan_uninit(struct net_device *dev) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); - - vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); } /* Start ageing timer and join group when device is brought up */ @@ -3187,7 +3173,7 @@ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ + /* Default entry is deleted at vxlan_dellink. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, @@ -3963,7 +3949,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_fdb *f = NULL; - bool unregister = false; struct vxlan_rdst *dst; int err; @@ -3974,72 +3959,62 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dev->ethtool_ops = &vxlan_ethtool_ops; - /* create an fdb entry for a valid default destination */ - if (!vxlan_addr_any(&dst->remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, - &dst->remote_ip, - NUD_REACHABLE | NUD_PERMANENT, - vxlan->cfg.dst_port, - dst->remote_vni, - dst->remote_vni, - dst->remote_ifindex, - NTF_SELF, 0, &f, extack); - if (err) - return err; - } - err = register_netdevice(dev); if (err) - goto errout; - unregister = true; + return err; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; - goto errout; + goto unregister; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) - goto errout; + goto unregister; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&dst->remote_ip)) { + err = vxlan_fdb_create(vxlan, all_zeros_mac, + &dst->remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + vxlan->cfg.dst_port, + dst->remote_vni, + dst->remote_vni, + dst->remote_ifindex, + NTF_SELF, 0, &f, extack); + if (err) + goto unlink; + } + if (f) { vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); - if (err) { - vxlan_fdb_destroy(vxlan, f, false, false); - if (remote_dev) - netdev_upper_dev_unlink(remote_dev, dev); - goto unregister; - } + if (err) + goto fdb_destroy; } list_add(&vxlan->next, &vn->vxlan_list); if 
(remote_dev) dst->remote_dev = remote_dev; return 0; + +fdb_destroy: + vxlan_fdb_destroy(vxlan, f, false, false); unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); -errout: - /* unregister_netdevice() destroys the default FDB entry with deletion - * notification. But the addition notification was not sent yet, so - * destroy the entry by hand here. - */ - if (f) - __vxlan_fdb_free(f); unregister: - if (unregister) - unregister_netdevice(dev); + unregister_netdevice(dev); return err; } @@ -4520,10 +4495,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_fdb_flush_desc desc = { - /* Default entry is deleted at vxlan_uninit. */ - .ignore_default_entry = true, - }; + struct vxlan_fdb_flush_desc desc = {}; vxlan_flush(vxlan, &desc); From 69281e0fe18ab60e7b67e328214ffbd8609b13b4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:31 +0300 Subject: [PATCH 346/770] vxlan: Insert FDB into hash table in vxlan_fdb_create() Commit 7c31e54aeee5 ("vxlan: do not destroy fdb if register_netdevice() is failed") split the insertion of FDB entries into the FDB hash table from the function where they are created. This was done in order to work around a problem that is no longer possible after the previous patch. Simplify the code and move the body of vxlan_fdb_insert() back into vxlan_fdb_create(). Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-4-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 3df86927b1ece..915ce73f0c870 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -816,14 +816,6 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, return f; } -static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, - __be32 src_vni, struct vxlan_fdb *f) -{ - ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); -} - static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { @@ -913,6 +905,10 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + ++vxlan->addrcnt; + hlist_add_head_rcu(&f->hlist, + vxlan_fdb_head(vxlan, mac, src_vni)); + *fdb = f; return 0; @@ -1101,7 +1097,6 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, if (rc < 0) return rc; - vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, swdev_notify, extack); if (rc) @@ -3994,8 +3989,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, } if (f) { - vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); - /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); From ccc203b9a846c1d5262b5c60565ff79930c13477 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:32 +0300 Subject: [PATCH 347/770] vxlan: Unsplit default FDB entry creation and notification Commit 0241b836732f ("vxlan: fix default fdb entry netlink notify ordering during netdev create") split the creation of the default FDB entry from its notification to avoid 
sending a RTM_NEWNEIGH notification before RTM_NEWLINK. Previous patches restructured the code so that the default FDB entry is created after registering the VXLAN device and the notification about the new entry immediately follows its creation. Therefore, simplify the code and revert back to vxlan_fdb_update() which takes care of both creating the FDB entry and notifying user space about it. Hold the FDB hash lock when calling vxlan_fdb_update() like it expects. A subsequent patch will add a lockdep assertion to make sure this is indeed the case. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-5-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 915ce73f0c870..d3dfc4af95560 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3943,7 +3943,6 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; - struct vxlan_fdb *f = NULL; struct vxlan_rdst *dst; int err; @@ -3976,33 +3975,29 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, + u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, + dst->remote_vni); + + spin_lock_bh(&vxlan->hash_lock[hash_index]); + err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, + NLM_F_EXCL | NLM_F_CREATE, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, 0, &f, extack); + NTF_SELF, 0, true, extack); + spin_unlock_bh(&vxlan->hash_lock[hash_index]); if (err) goto unlink; } - if (f) { - /* notify default fdb entry */ - err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), - RTM_NEWNEIGH, true, extack); - if (err) - goto fdb_destroy; - } - list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; -fdb_destroy: - vxlan_fdb_destroy(vxlan, f, false, false); unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); From 6ba480cca25f26bd0f0e328b2d2f8eab6bde7cc0 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:33 +0300 Subject: [PATCH 348/770] vxlan: Relocate assignment of default remote device The default FDB entry can be associated with a net device if a physical device (i.e., 'dev PHYS_DEV') was specified during the creation of the VXLAN device. The assignment of the net device pointer to 'dst->remote_dev' logically belongs in the if block that resolves the pointer from the specified ifindex, so move it there. 
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-6-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index d3dfc4af95560..d45b63180714c 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3967,6 +3967,8 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto unregister; + + dst->remote_dev = remote_dev; } err = rtnl_configure_link(dev, NULL, 0, NULL); @@ -3994,8 +3996,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, } list_add(&vxlan->next, &vn->vxlan_list); - if (remote_dev) - dst->remote_dev = remote_dev; + return 0; unlink: From 094adad91310d9f8f8485251129482f4f3e2c5b3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:34 +0300 Subject: [PATCH 349/770] vxlan: Use a single lock to protect the FDB table Currently, the VXLAN driver stores FDB entries in a hash table with a fixed number of buckets (256). Subsequent patches are going to convert this table to rhashtable with a linked list for entry traversal, as rhashtable is more scalable. In preparation for this conversion, move from a per-bucket spin lock to a single spin lock that protects the entire FDB table. The per-bucket spin locks were introduced by commit fe1e0713bbe8 ("vxlan: Use FDB_HASH_SIZE hash_locks to reduce contention") citing "huge contention when inserting/deleting vxlan_fdbs into the fdb_head". It is not clear from the commit message which code path was holding the spin lock for long periods of time, but the obvious suspect is the FDB cleanup routine (vxlan_cleanup()) that periodically traverses the entire table in order to delete aged-out entries. This will be solved by subsequent patches that will convert the FDB cleanup routine to traverse the linked list of FDB entries using RCU, only acquiring the spin lock when deleting an aged-out entry. The change reduces the size of the VXLAN device structure from 3600 bytes to 2576 bytes. 
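A quick sanity check on that number, assuming the common configuration where sizeof(spinlock_t) is 4 bytes (no lock debugging): the 256 per-bucket locks accounted for 256 * 4 = 1024 bytes, which matches the 3600 - 2576 = 1024 bytes saved; the single replacement lock evidently fits into padding that already existed in the structure.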
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-7-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 82 +++++++++++------------------ drivers/net/vxlan/vxlan_vnifilter.c | 8 ++- include/net/vxlan.h | 2 +- 3 files changed, 34 insertions(+), 58 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index d45b63180714c..b8edd8afda284 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -524,8 +524,8 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, return -EINVAL; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { @@ -537,12 +537,12 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, } } } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); return 0; unlock: - spin_unlock_bh(&vxlan->hash_lock[h]); + spin_unlock_bh(&vxlan->hash_lock); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); @@ -558,14 +558,14 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) return; vxlan = netdev_priv(dev); + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); @@ -1248,7 +1248,6 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; int err; if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { @@ -1268,13 +1267,12 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1325,7 +1323,6 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; - u32 hash_index; __be16 port; int err; @@ -1334,11 +1331,10 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; - hash_index = fdb_head_index(vxlan, addr, src_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (!err) *notified = true; @@ -1486,10 +1482,8 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, rdst->remote_ip = *src_ip; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { - u32 hash_index = fdb_head_index(vxlan, src_mac, vni); - /* learned new entry */ - spin_lock(&vxlan->hash_lock[hash_index]); + spin_lock(&vxlan->hash_lock); /* close off race between 
vxlan_flush and incoming packets */ if (netif_running(dev)) @@ -1500,7 +1494,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); - spin_unlock(&vxlan->hash_lock[hash_index]); + spin_unlock(&vxlan->hash_lock); } return SKB_NOT_DROPPED_YET; @@ -2839,10 +2833,10 @@ static void vxlan_cleanup(struct timer_list *t) if (!netif_running(vxlan->dev)) return; + spin_lock(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -2864,8 +2858,8 @@ static void vxlan_cleanup(struct timer_list *t) } else if (time_before(timeout, next_timer)) next_timer = timeout; } - spin_unlock(&vxlan->hash_lock[h]); } + spin_unlock(&vxlan->hash_lock); mod_timer(&vxlan->age_timer, next_timer); } @@ -3056,10 +3050,10 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; + spin_lock_bh(&vxlan->hash_lock); for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; - spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); @@ -3079,8 +3073,8 @@ static void vxlan_flush(struct vxlan_dev *vxlan, vxlan_fdb_destroy(vxlan, f, true, true); } - spin_unlock_bh(&vxlan->hash_lock[h]); } + spin_unlock_bh(&vxlan->hash_lock); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { @@ -3358,15 +3352,14 @@ static void vxlan_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); + spin_lock_init(&vxlan->hash_lock); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - spin_lock_init(&vxlan->hash_lock[h]); + for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); - } } static void vxlan_ether_setup(struct net_device *dev) @@ -3977,10 +3970,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, - dst->remote_vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, @@ -3990,7 +3980,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); if (err) goto unlink; } @@ -4420,9 +4410,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], /* handle default dst entry */ if (rem_ip_changed) { - u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); - - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, @@ -4433,7 +4421,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; @@ -4447,7 +4435,7 @@ static 
int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], dst->remote_vni, dst->remote_ifindex, true); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip @@ -4747,11 +4735,8 @@ vxlan_fdb_offloaded_set(struct net_device *dev, struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; - u32 hash_index; - - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4767,7 +4752,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, rdst->offloaded = fdb_info->offloaded; out: - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } static int @@ -4776,13 +4761,11 @@ vxlan_fdb_external_learn_add(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; - u32 hash_index; int err; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, @@ -4792,7 +4775,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4803,11 +4786,9 @@ vxlan_fdb_external_learn_del(struct net_device *dev, { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) @@ -4821,7 +4802,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, fdb_info->remote_ifindex, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } @@ -4870,18 +4851,15 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; - u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); - hash_index = fdb_head_index(vxlan, fdb->eth_addr, - vxlan->default_dst.remote_vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); } rcu_read_unlock(); } diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index d0753776d3394..336cd0e203094 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -482,11 +482,9 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, struct netlink_ext_ack *extack) { struct vxlan_rdst *dst = &vxlan->default_dst; - u32 hash_index; int err = 0; - hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); - spin_lock_bh(&vxlan->hash_lock[hash_index]); + spin_lock_bh(&vxlan->hash_lock); if (remote_ip && !vxlan_addr_any(remote_ip)) { err = 
vxlan_fdb_update(vxlan, all_zeros_mac, remote_ip, @@ -498,7 +496,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, NTF_SELF, 0, true, extack); if (err) { - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } } @@ -511,7 +509,7 @@ static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni, dst->remote_ifindex, true); } - spin_unlock_bh(&vxlan->hash_lock[hash_index]); + spin_unlock_bh(&vxlan->hash_lock); return err; } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacdd..272e11708a33f 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; From 8d45673d2d2e59d03e108c569a3e8c031aa534c8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:35 +0300 Subject: [PATCH 350/770] vxlan: Add a linked list of FDB entries Currently, FDB entries are stored in a hash table with a fixed number of buckets. The table is used for both lookups and entry traversal. Subsequent patches will convert the table to rhashtable which is not suitable for entry traversal. In preparation for this conversion, add FDB entries to a linked list. Subsequent patches will convert the driver to use this list when traversing entries during dump, flush, etc. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-8-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 3 +++ drivers/net/vxlan/vxlan_private.h | 1 + include/net/vxlan.h | 1 + 3 files changed, 5 insertions(+) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b8edd8afda284..511c24e29d457 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -908,6 +908,7 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); + hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; @@ -962,6 +963,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, swdev_notify, NULL); } + hlist_del_init_rcu(&f->fdb_node); hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); @@ -3360,6 +3362,7 @@ static void vxlan_setup(struct net_device *dev) for (h = 0; h < FDB_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vxlan->fdb_head[h]); + INIT_HLIST_HEAD(&vxlan->fdb_list); } static void vxlan_ether_setup(struct net_device *dev) diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 76a351a997d51..078702ec604df 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -36,6 +36,7 @@ struct vxlan_fdb { __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; + struct hlist_node fdb_node; struct nexthop __rcu *nh; struct vxlan_dev __rcu *vdev; }; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 272e11708a33f..96a6c6f45c2e9 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -307,6 +307,7 @@ struct vxlan_dev { struct hlist_head fdb_head[FDB_HASH_SIZE]; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; From 7aa0dc750d4bca9545c89803e3779bdff4c58b58 Mon 
Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:36 +0300 Subject: [PATCH 351/770] vxlan: Use linked list to traverse FDB entries In preparation for removing the fixed size hash table, convert FDB entry traversal to use the newly added FDB linked list. No functional changes intended. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-9-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 172 ++++++++++++++------------------- 1 file changed, 75 insertions(+), 97 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 511c24e29d457..f9840a4b6e44c 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -517,7 +517,6 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) @@ -525,16 +524,13 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { - if (f->vni == vni) { - list_for_each_entry(rdst, &f->remotes, list) { - rc = vxlan_fdb_notify_one(nb, vxlan, - f, rdst, - extack); - if (rc) - goto unlock; - } + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) { + rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, + extack); + if (rc) + goto unlock; } } } @@ -552,18 +548,17 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; - unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) - if (f->vni == vni) - list_for_each_entry(rdst, &f->remotes, list) - rdst->offloaded = false; + hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { + if (f->vni == vni) { + list_for_each_entry(rdst, &f->remotes, list) + rdst->offloaded = false; + } } spin_unlock_bh(&vxlan->hash_lock); @@ -1351,52 +1346,46 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; + struct vxlan_fdb *f; int err = 0; - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct vxlan_fdb *f; - - rcu_read_lock(); - hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { - struct vxlan_rdst *rd; - - if (rcu_access_pointer(f->nh)) { - if (*idx < ctx->fdb_idx) - goto skip_nh; - err = vxlan_fdb_info(skb, vxlan, f, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, NULL); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip_nh: - *idx += 1; - continue; + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { + struct vxlan_rdst *rd; + + if (rcu_access_pointer(f->nh)) { + if (*idx < ctx->fdb_idx) + goto skip_nh; + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, NULL); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip_nh: + *idx += 1; + continue; + } - list_for_each_entry_rcu(rd, &f->remotes, list) { - if (*idx < ctx->fdb_idx) - goto skip; - - err = vxlan_fdb_info(skb, vxlan, f, - 
NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, - NLM_F_MULTI, rd); - if (err < 0) { - rcu_read_unlock(); - goto out; - } -skip: - *idx += 1; + list_for_each_entry_rcu(rd, &f->remotes, list) { + if (*idx < ctx->fdb_idx) + goto skip; + + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, rd); + if (err < 0) { + rcu_read_unlock(); + goto out; } +skip: + *idx += 1; } - rcu_read_unlock(); } + rcu_read_unlock(); out: return err; } @@ -2830,35 +2819,30 @@ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - unsigned int h; + struct hlist_node *n; + struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; spin_lock(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - unsigned long timeout; + hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + unsigned long timeout; - if (f->state & (NUD_PERMANENT | NUD_NOARP)) - continue; + if (f->state & (NUD_PERMANENT | NUD_NOARP)) + continue; - if (f->flags & NTF_EXT_LEARNED) - continue; + if (f->flags & NTF_EXT_LEARNED) + continue; - timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; - if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, - "garbage collect %pM\n", - f->eth_addr); - f->state = NUD_STALE; - vxlan_fdb_destroy(vxlan, f, true, true); - } else if (time_before(timeout, next_timer)) - next_timer = timeout; + timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; + if (time_before_eq(timeout, jiffies)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; + vxlan_fdb_destroy(vxlan, f, true, true); + } else if (time_before(timeout, next_timer)) { + next_timer = timeout; } } spin_unlock(&vxlan->hash_lock); @@ -3050,31 +3034,25 @@ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); - unsigned int h; + struct hlist_node *n; + struct vxlan_fdb *f; spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - - if (!vxlan_fdb_flush_matches(f, vxlan, desc)) - continue; - - if (match_remotes) { - bool destroy_fdb = false; + hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + if (!vxlan_fdb_flush_matches(f, vxlan, desc)) + continue; - vxlan_fdb_flush_match_remotes(f, vxlan, desc, - &destroy_fdb); + if (match_remotes) { + bool destroy_fdb = false; - if (!destroy_fdb) - continue; - } + vxlan_fdb_flush_match_remotes(f, vxlan, desc, + &destroy_fdb); - vxlan_fdb_destroy(vxlan, f, true, true); + if (!destroy_fdb) + continue; } + + vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock_bh(&vxlan->hash_lock); } @@ -4860,7 +4838,7 @@ static void vxlan_fdb_nh_flush(struct nexthop *nh) vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); spin_lock_bh(&vxlan->hash_lock); - if (!hlist_unhashed(&fdb->hlist)) + if (!hlist_unhashed(&fdb->fdb_node)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock); } From a6d04f8937e3a721c3a3578fc18b28cb59b15f42 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:37 +0300 Subject: 
[PATCH 352/770] vxlan: Convert FDB garbage collection to RCU Instead of holding the FDB hash lock when traversing the FDB linked list during garbage collection, use RCU and only acquire the lock for entries that need to be removed (aged out). Avoid races by using hlist_unhashed() to check that the entry has not been removed from the list by another thread. Note that vxlan_fdb_destroy() uses hlist_del_init_rcu() to remove an entry from the list which should cause hlist_unhashed() to return true. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-10-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index f9840a4b6e44c..c3511a43ce998 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2819,14 +2819,13 @@ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - struct hlist_node *n; struct vxlan_fdb *f; if (!netif_running(vxlan->dev)) return; - spin_lock(&vxlan->hash_lock); - hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) @@ -2837,15 +2836,19 @@ static void vxlan_cleanup(struct timer_list *t) timeout = READ_ONCE(f->updated) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, "garbage collect %pM\n", - f->eth_addr); - f->state = NUD_STALE; - vxlan_fdb_destroy(vxlan, f, true, true); + spin_lock(&vxlan->hash_lock); + if (!hlist_unhashed(&f->fdb_node)) { + netdev_dbg(vxlan->dev, "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; + vxlan_fdb_destroy(vxlan, f, true, true); + } + spin_unlock(&vxlan->hash_lock); } else if (time_before(timeout, next_timer)) { next_timer = timeout; } } - spin_unlock(&vxlan->hash_lock); + rcu_read_unlock(); mod_timer(&vxlan->age_timer, next_timer); } From 54f45187b635f8c4c1e554a2ed347bba7d27dfbd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:38 +0300 Subject: [PATCH 353/770] vxlan: Convert FDB flushing to RCU Instead of holding the FDB hash lock when traversing the FDB linked list during flushing, use RCU and only acquire the lock for entries that need to be flushed.
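Both conversions share the same traversal shape, sketched below. The should_expire() predicate is hypothetical and stands in for the aging check (garbage collection) or the flush-descriptor matching (flushing); the flush path uses the _bh lock flavor since it does not run from the ageing timer:

	static void sketch_prune(struct vxlan_dev *vxlan)
	{
		struct vxlan_fdb *f;

		rcu_read_lock();
		hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) {
			if (!should_expire(f))
				continue;

			spin_lock(&vxlan->hash_lock);
			/* Another thread may have destroyed the entry since
			 * the lockless walk found it. vxlan_fdb_destroy()
			 * removes it with hlist_del_init_rcu(), so
			 * hlist_unhashed() is true for an already-gone entry
			 * and it is skipped here.
			 */
			if (!hlist_unhashed(&f->fdb_node))
				vxlan_fdb_destroy(vxlan, f, true, true);
			spin_unlock(&vxlan->hash_lock);
		}
		rcu_read_unlock();
	}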
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-11-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index c3511a43ce998..762dde70d9e9c 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3037,14 +3037,17 @@ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); - struct hlist_node *n; struct vxlan_fdb *f; - spin_lock_bh(&vxlan->hash_lock); - hlist_for_each_entry_safe(f, n, &vxlan->fdb_list, fdb_node) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, &vxlan->fdb_list, fdb_node) { if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; + spin_lock_bh(&vxlan->hash_lock); + if (hlist_unhashed(&f->fdb_node)) + goto unlock; + if (match_remotes) { bool destroy_fdb = false; @@ -3052,12 +3055,14 @@ static void vxlan_flush(struct vxlan_dev *vxlan, &destroy_fdb); if (!destroy_fdb) - continue; + goto unlock; } vxlan_fdb_destroy(vxlan, f, true, true); +unlock: + spin_unlock_bh(&vxlan->hash_lock); } - spin_unlock_bh(&vxlan->hash_lock); + rcu_read_unlock(); } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { From 5cde39ea38813ebb5bf07922a3ba60871edebf99 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:39 +0300 Subject: [PATCH 354/770] vxlan: Rename FDB Tx lookup function vxlan_find_mac() is only expected to be called from the Tx path as it updates the 'used' timestamp. Rename it to vxlan_find_mac_tx() to reflect that and to avoid incorrect updates of this timestamp like those addressed by commit 9722f834fe9a ("vxlan: Avoid unnecessary updates to FDB 'used' time"). No functional changes intended. 
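The point of the rename is that the one lookup with a side effect is now explicit at the call site. A condensed view of where the series ends up (the READ_ONCE()/WRITE_ONCE() body follows the cited commit 9722f834fe9a and is reconstructed here for illustration, as the diffs below elide it):

	static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan,
						   const u8 *mac, __be32 vni)
	{
		struct vxlan_fdb *f;

		f = vxlan_find_mac_rcu(vxlan, mac, vni);
		if (f) {
			unsigned long now = jiffies;

			/* Check before writing so that a stream of packets to
			 * the same entry does not dirty the cache line on
			 * every transmit.
			 */
			if (READ_ONCE(f->used) != now)
				WRITE_ONCE(f->used, now);
		}

		return f;
	}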
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-12-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 762dde70d9e9c..397b1691ab069 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -429,8 +429,8 @@ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, return NULL; } -static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { struct vxlan_fdb *f; @@ -1897,7 +1897,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) } rcu_read_lock(); - f = vxlan_find_mac(vxlan, n->ha, vni); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -2063,7 +2063,7 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) goto out; } - f = vxlan_find_mac(vxlan, n->ha, vni); + f = vxlan_find_mac_tx(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); @@ -2762,7 +2762,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) eth = eth_hdr(skb); rcu_read_lock(); - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && @@ -2770,11 +2770,11 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) - f = vxlan_find_mac(vxlan, eth->h_dest, vni); + f = vxlan_find_mac_tx(vxlan, eth->h_dest, vni); } if (f == NULL) { - f = vxlan_find_mac(vxlan, all_zeros_mac, vni); + f = vxlan_find_mac_tx(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) From ebe6420674551751e6a888180ebda76f1dddf911 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:40 +0300 Subject: [PATCH 355/770] vxlan: Create wrappers for FDB lookup __vxlan_find_mac() is called from both the data path (e.g., during learning) and the control path (e.g., when replacing an entry). The function is missing lockdep annotations to make sure that the FDB hash lock is held during FDB updates. Rename __vxlan_find_mac() to vxlan_find_mac_rcu() to reflect the fact that it should be called from an RCU read-side critical section and call it from vxlan_find_mac() which checks that the FDB hash lock is held. Change callers to invoke the appropriate function. 
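The resulting pair of wrappers separates the two calling contexts cleanly, as the sketch below shows (mirroring the diff that follows). Note that the checked variant still takes rcu_read_lock() even though the spin lock pins the entry: the RCU list accessors, and later rhashtable_lookup(), expect to run inside an RCU read-side critical section regardless of which updater lock is held:

	static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
						const u8 *mac, __be32 vni)
	{
		struct vxlan_fdb *f;

		/* Warns (once) under lockdep if a control-path caller forgot
		 * to take the FDB hash lock before updating entries.
		 */
		lockdep_assert_held_once(&vxlan->hash_lock);

		rcu_read_lock();
		f = vxlan_find_mac_rcu(vxlan, mac, vni);
		rcu_read_unlock();

		return f;
	}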
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-13-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 397b1691ab069..2846c8c5234e2 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -409,8 +409,8 @@ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, } /* Look up Ethernet address in forwarding table */ -static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) +static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) { struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; @@ -434,7 +434,7 @@ static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, vni); + f = vxlan_find_mac_rcu(vxlan, mac, vni); if (f) { unsigned long now = jiffies; @@ -445,6 +445,20 @@ static struct vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, return f; } +static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, + const u8 *mac, __be32 vni) +{ + struct vxlan_fdb *f; + + lockdep_assert_held_once(&vxlan->hash_lock); + + rcu_read_lock(); + f = vxlan_find_mac_rcu(vxlan, mac, vni); + rcu_read_unlock(); + + return f; +} + /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, @@ -480,7 +494,7 @@ int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, eth_addr, vni); + f = vxlan_find_mac_rcu(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; @@ -1117,7 +1131,7 @@ int vxlan_fdb_update(struct vxlan_dev *vxlan, { struct vxlan_fdb *f; - f = __vxlan_find_mac(vxlan, mac, src_vni); + f = vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, @@ -1286,7 +1300,7 @@ int __vxlan_fdb_delete(struct vxlan_dev *vxlan, struct vxlan_fdb *f; int err = -ENOENT; - f = __vxlan_find_mac(vxlan, addr, src_vni); + f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; @@ -1409,7 +1423,7 @@ static int vxlan_fdb_get(struct sk_buff *skb, rcu_read_lock(); - f = __vxlan_find_mac(vxlan, addr, vni); + f = vxlan_find_mac_rcu(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; @@ -1445,7 +1459,7 @@ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, ifindex = src_ifindex; #endif - f = __vxlan_find_mac(vxlan, src_mac, vni); + f = vxlan_find_mac_rcu(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); unsigned long now = jiffies; @@ -4727,7 +4741,7 @@ vxlan_fdb_offloaded_set(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; @@ -4779,7 +4793,7 @@ vxlan_fdb_external_learn_del(struct net_device *dev, spin_lock_bh(&vxlan->hash_lock); - f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); + f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags & NTF_EXT_LEARNED) From 20c76dadc783759fd3819d289c72be590660cc8b Mon Sep 17 00:00:00 2001 From: Ido 
Schimmel Date: Tue, 15 Apr 2025 15:11:41 +0300 Subject: [PATCH 356/770] vxlan: Do not treat dst cache initialization errors as fatal FDB entries are allocated in an atomic context as they can be added from the data path when learning is enabled. After converting the FDB hash table to rhashtable, the insertion rate will be much higher (*) which will entail a much higher rate of per-CPU allocations via dst_cache_init(). When adding a large number of entries (e.g., 256k) in a batch, a small percentage (< 0.02%) of these per-CPU allocations will fail [1]. This does not happen with the current code since the insertion rate is low enough to give the per-CPU allocator a chance to asynchronously create new chunks of per-CPU memory. Given that: a. Only a small percentage of these per-CPU allocations fail. b. The scenario where this happens might not be the most realistic one. c. The driver can work correctly without dst caches. The dst_cache_*() APIs first check that the dst cache was properly initialized. d. The dst caches are not always used (e.g., 'tos inherit'). It seems reasonable to not treat these allocation failures as fatal. Therefore, do not bail when dst_cache_init() fails and suppress warnings by specifying '__GFP_NOWARN'. [1] percpu: allocation failed, size=40 align=8 atomic=1, atomic alloc failed, no space left (*) 97% reduction in average latency of vxlan_fdb_update() when adding 256k entries in a batch. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-14-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 2846c8c5234e2..5c0752161529a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -619,10 +619,10 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, if (rd == NULL) return -ENOMEM; - if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { - kfree(rd); - return -ENOMEM; - } + /* The driver can work correctly without a dst cache, so do not treat + * dst cache initialization errors as fatal. + */ + dst_cache_init(&rd->dst_cache, GFP_ATOMIC | __GFP_NOWARN); rd->remote_ip = *ip; rd->remote_port = port; From f13f3b4157dd9a05a5b651dfd15a4efa2df06f67 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:42 +0300 Subject: [PATCH 357/770] vxlan: Introduce FDB key structure In preparation for converting the FDB table to rhashtable, introduce a key structure that includes the MAC address and source VNI. No functional changes intended. 
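One detail worth highlighting: because the key will be hashed and compared bytewise by rhashtable (see the parameters in the final patch of this series), the structure's padding matters. With a 6-byte MAC followed by a 4-byte-aligned VNI there are two padding bytes in between, so both an entry's key and any on-stack lookup key must be zeroed first. A sketch of the layout and its use:

	struct vxlan_fdb_key {
		u8 eth_addr[ETH_ALEN];	/* offsets 0-5 */
					/* 2 padding bytes at offsets 6-7 */
		__be32 vni;		/* offset 8; sizeof(key) == 12 */
	};

	/* Building a lookup key: memset() first so the padding bytes are
	 * deterministic and memcmp()-style comparison cannot misfire.
	 */
	struct vxlan_fdb_key key;

	memset(&key, 0, sizeof(key));
	memcpy(key.eth_addr, mac, sizeof(key.eth_addr));
	key.vni = vni;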
Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-15-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 44 ++++++++++++++++--------------- drivers/net/vxlan/vxlan_private.h | 8 ++++-- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 5c0752161529a..8e359cf8dbbd1 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -186,7 +186,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, } else if (nh) { ndm->ndm_family = nh_family; } - send_eth = !is_zero_ether_addr(fdb->eth_addr); + send_eth = !is_zero_ether_addr(fdb->key.eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; @@ -201,7 +201,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; - if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) @@ -223,9 +223,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, goto nla_put_failure; } - if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && + if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->key.vni && nla_put_u32(skb, NDA_SRC_VNI, - be32_to_cpu(fdb->vni))) + be32_to_cpu(fdb->key.vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); @@ -293,8 +293,8 @@ static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; - memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); - fdb_info->vni = fdb->vni; + memcpy(fdb_info->eth_addr, fdb->key.eth_addr, ETH_ALEN); + fdb_info->vni = fdb->key.vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } @@ -366,7 +366,7 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) }; struct vxlan_rdst remote = { }; - memcpy(f.eth_addr, eth_addr, ETH_ALEN); + memcpy(f.key.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } @@ -416,9 +416,9 @@ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->eth_addr)) { + if (ether_addr_equal(mac, f->key.eth_addr)) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->vni) + if (vni == f->key.vni) return f; } else { return f; @@ -539,7 +539,7 @@ int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { - if (f->vni == vni) { + if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); @@ -569,7 +569,7 @@ void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) spin_lock_bh(&vxlan->hash_lock); hlist_for_each_entry(f, &vxlan->fdb_list, fdb_node) { - if (f->vni == vni) { + if (f->key.vni == vni) { list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; } @@ -812,15 +812,16 @@ static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, f = 
kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; + memset(&f->key, 0, sizeof(f->key)); f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; - f->vni = src_vni; + f->key.vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); - memcpy(f->eth_addr, mac, ETH_ALEN); + memcpy(f->key.eth_addr, mac, ETH_ALEN); return f; } @@ -959,7 +960,7 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, { struct vxlan_rdst *rd; - netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); + netdev_dbg(vxlan->dev, "delete %pM\n", f->key.eth_addr); --vxlan->addrcnt; if (do_notify) { @@ -1031,8 +1032,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ - if (!(is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + if (!(is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) @@ -1048,8 +1049,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } } if ((flags & NLM_F_APPEND) && - (is_multicast_ether_addr(f->eth_addr) || - is_zero_ether_addr(f->eth_addr))) { + (is_multicast_ether_addr(f->key.eth_addr) || + is_zero_ether_addr(f->key.eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) @@ -2853,7 +2854,7 @@ static void vxlan_cleanup(struct timer_list *t) spin_lock(&vxlan->hash_lock); if (!hlist_unhashed(&f->fdb_node)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", - f->eth_addr); + f->key.eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } @@ -2972,7 +2973,8 @@ struct vxlan_fdb_flush_desc { static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { - return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; + return is_zero_ether_addr(f->key.eth_addr) && + f->key.vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) @@ -2995,7 +2997,7 @@ static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; - if (desc->src_vni && f->vni != desc->src_vni) + if (desc->src_vni && f->key.vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 078702ec604df..3ca19e7167c9c 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -24,6 +24,11 @@ struct vxlan_net { struct notifier_block nexthop_notifier_block; }; +struct vxlan_fdb_key { + u8 eth_addr[ETH_ALEN]; + __be32 vni; +}; + /* Forwarding table entry */ struct vxlan_fdb { struct hlist_node hlist; /* linked list of entries */ @@ -31,9 +36,8 @@ struct vxlan_fdb { unsigned long updated; /* jiffies */ unsigned long used; struct list_head remotes; - u8 eth_addr[ETH_ALEN]; + struct vxlan_fdb_key key; u16 state; /* see ndm_state */ - __be32 vni; u16 flags; /* see ndm_flags and below */ struct list_head nh_list; struct hlist_node fdb_node; From 1f763fa808e92a67feea8364ef80ca3065d74702 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Apr 2025 15:11:43 +0300 Subject: [PATCH 358/770] vxlan: Convert FDB table to rhashtable FDB entries are currently stored in a hash table with a fixed number of buckets (256), resulting in performance degradation as the number of entries grows. 
Solve this by converting the driver to use rhashtable which maintains more or less constant performance regardless of the number of entries. Measured transmitted packets per second using a single pktgen thread with varying number of entries when the transmitted packet always hits the default entry (worst case): Number of entries | Improvement ------------------|------------ 1k | +1.12% 4k | +9.22% 16k | +55% 64k | +585% 256k | +2460% In addition, the change reduces the size of the VXLAN device structure from 2584 bytes to 672 bytes. Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250415121143.345227-16-idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/vxlan/vxlan_core.c | 102 ++++++++++++------------------ drivers/net/vxlan/vxlan_private.h | 2 +- include/net/vxlan.h | 2 +- 3 files changed, 43 insertions(+), 63 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 8e359cf8dbbd1..a56d7239b1273 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -63,8 +64,12 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; +static const struct rhashtable_params vxlan_fdb_rht_params = { + .head_offset = offsetof(struct vxlan_fdb, rhnode), + .key_offset = offsetof(struct vxlan_fdb, key), + .key_len = sizeof(struct vxlan_fdb_key), + .automatic_shrinking = true, +}; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { @@ -371,62 +376,21 @@ static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } -/* Hash Ethernet address */ -static u32 eth_hash(const unsigned char *addr) -{ - u64 value = get_unaligned((u64 *)addr); - - /* only want 6 bytes */ -#ifdef __BIG_ENDIAN - value >>= 16; -#else - value <<= 16; -#endif - return hash_64(value, FDB_HASH_BITS); -} - -u32 eth_vni_hash(const unsigned char *addr, __be32 vni) -{ - /* use 1 byte of OUI and 3 bytes of NIC */ - u32 key = get_unaligned((u32 *)(addr + 2)); - - return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); -} - -u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) -{ - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) - return eth_vni_hash(mac, vni); - else - return eth_hash(mac); -} - -/* Hash chain to use given mac address */ -static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, - const u8 *mac, __be32 vni) -{ - return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; -} - /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *vxlan_find_mac_rcu(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { - struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); - struct vxlan_fdb *f; + struct vxlan_fdb_key key; - hlist_for_each_entry_rcu(f, head, hlist) { - if (ether_addr_equal(mac, f->key.eth_addr)) { - if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { - if (vni == f->key.vni) - return f; - } else { - return f; - } - } - } + memset(&key, 0, sizeof(key)); + memcpy(key.eth_addr, mac, sizeof(key.eth_addr)); + if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) + key.vni = vxlan->default_dst.remote_vni; + else + key.vni = vni; - return NULL; + return rhashtable_lookup(&vxlan->fdb_hash_tbl, &key, + vxlan_fdb_rht_params); } static struct 
vxlan_fdb *vxlan_find_mac_tx(struct vxlan_dev *vxlan, @@ -915,15 +879,27 @@ int vxlan_fdb_create(struct vxlan_dev *vxlan, if (rc < 0) goto errout; + rc = rhashtable_lookup_insert_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); + if (rc) + goto destroy_remote; + ++vxlan->addrcnt; - hlist_add_head_rcu(&f->hlist, - vxlan_fdb_head(vxlan, mac, src_vni)); hlist_add_head_rcu(&f->fdb_node, &vxlan->fdb_list); *fdb = f; return 0; +destroy_remote: + if (rcu_access_pointer(f->nh)) { + list_del_rcu(&f->nh_list); + nexthop_put(rtnl_dereference(f->nh)); + } else { + list_del(&rd->list); + dst_cache_destroy(&rd->dst_cache); + kfree(rd); + } errout: kfree(f); return rc; @@ -974,7 +950,8 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, } hlist_del_init_rcu(&f->fdb_node); - hlist_del_rcu(&f->hlist); + rhashtable_remove_fast(&vxlan->fdb_hash_tbl, &f->rhnode, + vxlan_fdb_rht_params); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } @@ -2898,10 +2875,14 @@ static int vxlan_init(struct net_device *dev) struct vxlan_dev *vxlan = netdev_priv(dev); int err; + err = rhashtable_init(&vxlan->fdb_hash_tbl, &vxlan_fdb_rht_params); + if (err) + return err; + if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) - return err; + goto err_rhashtable_destroy; } err = gro_cells_init(&vxlan->gro_cells, dev); @@ -2920,6 +2901,8 @@ static int vxlan_init(struct net_device *dev) err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); +err_rhashtable_destroy: + rhashtable_destroy(&vxlan->fdb_hash_tbl); return err; } @@ -2933,6 +2916,8 @@ static void vxlan_uninit(struct net_device *dev) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); + + rhashtable_destroy(&vxlan->fdb_hash_tbl); } /* Start ageing timer and join group when device is brought up */ @@ -3329,7 +3314,6 @@ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); @@ -3362,8 +3346,6 @@ static void vxlan_setup(struct net_device *dev) vxlan->dev = dev; - for (h = 0; h < FDB_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vxlan->fdb_head[h]); INIT_HLIST_HEAD(&vxlan->fdb_list); } @@ -4944,8 +4926,6 @@ static int __init vxlan_init_module(void) { int rc; - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); - rc = register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; diff --git a/drivers/net/vxlan/vxlan_private.h b/drivers/net/vxlan/vxlan_private.h index 3ca19e7167c9c..d328aed9feefd 100644 --- a/drivers/net/vxlan/vxlan_private.h +++ b/drivers/net/vxlan/vxlan_private.h @@ -31,7 +31,7 @@ struct vxlan_fdb_key { /* Forwarding table entry */ struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ + struct rhash_head rhnode; struct rcu_head rcu; unsigned long updated; /* jiffies */ unsigned long used; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 96a6c6f45c2e9..e2f7ca045d3e5 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -304,7 +304,7 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; struct hlist_head fdb_list; From c51ab838f5323a375b7f7334644456b23e7532ca Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 15 Apr 2025 16:13:14 -0700 Subject: [PATCH 359/770] ionic: extend the QSFP module sprom for more pages Some 
QSFP modules have more eeprom to be read by ethtool than the initial high and low page 0 that is currently available in the DSC's ionic sprom[] buffer. Since the current sprom[] is baked into the middle of an existing API struct, to make the high end of page 1 and page 2 available a block is carved from a reserved space of the existing port_info struct and the ionic_get_module_eeprom() service is taught how to get there. Newer firmware writes the additional QSFP page info here, yet this remains backward compatible because older firmware sets this space to all 0 and older ionic drivers do not use the reserved space. Reviewed-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415231317.40616-2-shannon.nelson@amd.com Signed-off-by: Paolo Abeni --- .../ethernet/pensando/ionic/ionic_ethtool.c | 66 ++++++++++++++----- .../net/ethernet/pensando/ionic/ionic_if.h | 7 +- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index a2d4336d27662..66f172e28f8b6 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -968,10 +968,13 @@ static int ionic_get_module_info(struct net_device *netdev, break; case SFF8024_ID_QSFP_8436_8636: case SFF8024_ID_QSFP28_8636: - case SFF8024_ID_QSFP_PLUS_CMIS: modinfo->type = ETH_MODULE_SFF_8436; modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; break; + case SFF8024_ID_QSFP_PLUS_CMIS: + modinfo->type = ETH_MODULE_SFF_8472; + modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; + break; default: netdev_info(netdev, "unknown xcvr type 0x%02x\n", xcvr->sprom[0]); @@ -983,29 +986,20 @@ static int ionic_get_module_info(struct net_device *netdev, return 0; } -static int ionic_get_module_eeprom(struct net_device *netdev, - struct ethtool_eeprom *ee, - u8 *data) +static int ionic_do_module_copy(u8 *dst, u8 *src, u32 len) { - struct ionic_lif *lif = netdev_priv(netdev); - struct ionic_dev *idev = &lif->ionic->idev; - struct ionic_xcvr_status *xcvr; - char tbuf[sizeof(xcvr->sprom)]; + char tbuf[sizeof_field(struct ionic_xcvr_status, sprom)]; int count = 10; - u32 len; /* The NIC keeps the module prom up-to-date in the DMA space * so we can simply copy the module bytes into the data buffer. 
*/ - xcvr = &idev->port_info->status.xcvr; - len = min_t(u32, sizeof(xcvr->sprom), ee->len); - do { - memcpy(data, &xcvr->sprom[ee->offset], len); - memcpy(tbuf, &xcvr->sprom[ee->offset], len); + memcpy(dst, src, len); + memcpy(tbuf, src, len); /* Let's make sure we got a consistent copy */ - if (!memcmp(data, tbuf, len)) + if (!memcmp(dst, tbuf, len)) break; } while (--count); @@ -1016,6 +1010,48 @@ static int ionic_get_module_eeprom(struct net_device *netdev, return 0; } +static int ionic_get_module_eeprom(struct net_device *netdev, + struct ethtool_eeprom *ee, + u8 *data) +{ + struct ionic_lif *lif = netdev_priv(netdev); + struct ionic_dev *idev = &lif->ionic->idev; + u32 start = ee->offset; + u32 err = -EINVAL; + u32 size = 0; + u8 *src; + + if (start < ETH_MODULE_SFF_8079_LEN) { + if (start + ee->len > ETH_MODULE_SFF_8079_LEN) + size = ETH_MODULE_SFF_8079_LEN - start; + else + size = ee->len; + + src = &idev->port_info->status.xcvr.sprom[start]; + err = ionic_do_module_copy(data, src, size); + if (err) + return err; + + data += size; + start += size; + } + + if (start >= ETH_MODULE_SFF_8079_LEN && + start < ETH_MODULE_SFF_8472_LEN) { + size = ee->len - size; + if (start + size > ETH_MODULE_SFF_8472_LEN) + size = ETH_MODULE_SFF_8472_LEN - start; + + start -= ETH_MODULE_SFF_8079_LEN; + src = &idev->port_info->sprom_epage[start]; + err = ionic_do_module_copy(data, src, size); + if (err) + return err; + } + + return err; +} + static int ionic_get_ts_info(struct net_device *netdev, struct kernel_ethtool_ts_info *info) { diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 830c8adbfbee0..4943ebb27ab3d 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -2839,6 +2839,7 @@ union ionic_port_identity { * @status: Port status data * @stats: Port statistics data * @mgmt_stats: Port management statistics data + * @sprom_epage: Extended Transceiver sprom, high page 1 and 2 * @rsvd: reserved byte(s) * @pb_stats: uplink pb drop stats */ @@ -2849,8 +2850,10 @@ struct ionic_port_info { struct ionic_port_stats stats; struct ionic_mgmt_port_stats mgmt_stats; }; - /* room for pb_stats to start at 2k offset */ - u8 rsvd[760]; + u8 sprom_epage[256]; + u8 rsvd[504]; + + /* pb_stats must start at 2k offset */ struct ionic_port_pb_stats pb_stats; }; From 9c2e17d30b65a940e7d72ce67f2102d0084a93bf Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 15 Apr 2025 16:13:15 -0700 Subject: [PATCH 360/770] ionic: support ethtool get_module_eeprom_by_page Add support for the newer get_module_eeprom_by_page interface. Only the upper half of the 256-byte page is available for reading, and the firmware puts the two sections into the extended sprom buffer, so a union is used over the extended sprom buffer to make clear which page is to be accessed. With get_module_eeprom_by_page implemented there is no need for the older get_module_info or get_module_eeprom interfaces, so remove them.
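The union described above is a standard C idiom for giving one buffer two views. A minimal stand-alone sketch of the idea, using illustrative names rather than the driver's real definitions:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the layout: a flat 256-byte extended sprom
 * buffer overlaid with two 128-byte per-page views.
 */
struct demo_port_info {
	union {
		uint8_t sprom_epage[256];         /* flat view */
		struct {
			uint8_t sprom_page1[128]; /* upper half of page 1 */
			uint8_t sprom_page2[128]; /* upper half of page 2 */
		};
	};
};

int main(void)
{
	struct demo_port_info pi = { 0 };

	pi.sprom_page2[0] = 0xab;
	/* Both views alias the same storage; page 2 starts at offset 128. */
	printf("sprom_epage[128] = 0x%02x\n", pi.sprom_epage[128]);
	return 0;
}

Since every union member starts at offset 0 and the anonymous struct lays the two arrays out back to back, the overall structure size and the firmware-visible layout are unchanged; only the names used to index the buffer differ.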
Reviewed-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415231317.40616-3-shannon.nelson@amd.com Signed-off-by: Paolo Abeni --- .../ethernet/pensando/ionic/ionic_ethtool.c | 96 ++++++------------- .../net/ethernet/pensando/ionic/ionic_if.h | 12 ++- 2 files changed, 37 insertions(+), 71 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index 66f172e28f8b6..0d2ef808237b4 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -948,44 +948,6 @@ static int ionic_get_tunable(struct net_device *netdev, return 0; } -static int ionic_get_module_info(struct net_device *netdev, - struct ethtool_modinfo *modinfo) - -{ - struct ionic_lif *lif = netdev_priv(netdev); - struct ionic_dev *idev = &lif->ionic->idev; - struct ionic_xcvr_status *xcvr; - struct sfp_eeprom_base *sfp; - - xcvr = &idev->port_info->status.xcvr; - sfp = (struct sfp_eeprom_base *) xcvr->sprom; - - /* report the module data type and length */ - switch (sfp->phys_id) { - case SFF8024_ID_SFP: - modinfo->type = ETH_MODULE_SFF_8079; - modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; - break; - case SFF8024_ID_QSFP_8436_8636: - case SFF8024_ID_QSFP28_8636: - modinfo->type = ETH_MODULE_SFF_8436; - modinfo->eeprom_len = ETH_MODULE_SFF_8436_LEN; - break; - case SFF8024_ID_QSFP_PLUS_CMIS: - modinfo->type = ETH_MODULE_SFF_8472; - modinfo->eeprom_len = ETH_MODULE_SFF_8472_LEN; - break; - default: - netdev_info(netdev, "unknown xcvr type 0x%02x\n", - xcvr->sprom[0]); - modinfo->type = 0; - modinfo->eeprom_len = ETH_MODULE_SFF_8079_LEN; - break; - } - - return 0; -} - static int ionic_do_module_copy(u8 *dst, u8 *src, u32 len) { char tbuf[sizeof_field(struct ionic_xcvr_status, sprom)]; @@ -1010,46 +972,43 @@ static int ionic_do_module_copy(u8 *dst, u8 *src, u32 len) return 0; } -static int ionic_get_module_eeprom(struct net_device *netdev, - struct ethtool_eeprom *ee, - u8 *data) +static int ionic_get_module_eeprom_by_page(struct net_device *netdev, + const struct ethtool_module_eeprom *page_data, + struct netlink_ext_ack *extack) { struct ionic_lif *lif = netdev_priv(netdev); struct ionic_dev *idev = &lif->ionic->idev; - u32 start = ee->offset; u32 err = -EINVAL; - u32 size = 0; u8 *src; - if (start < ETH_MODULE_SFF_8079_LEN) { - if (start + ee->len > ETH_MODULE_SFF_8079_LEN) - size = ETH_MODULE_SFF_8079_LEN - start; - else - size = ee->len; - - src = &idev->port_info->status.xcvr.sprom[start]; - err = ionic_do_module_copy(data, src, size); - if (err) - return err; + if (!page_data->length) + return -EINVAL; - data += size; - start += size; + if (page_data->bank != 0) { + NL_SET_ERR_MSG_MOD(extack, "Only bank 0 is supported"); + return -EINVAL; } - if (start >= ETH_MODULE_SFF_8079_LEN && - start < ETH_MODULE_SFF_8472_LEN) { - size = ee->len - size; - if (start + size > ETH_MODULE_SFF_8472_LEN) - size = ETH_MODULE_SFF_8472_LEN - start; - - start -= ETH_MODULE_SFF_8079_LEN; - src = &idev->port_info->sprom_epage[start]; - err = ionic_do_module_copy(data, src, size); - if (err) - return err; + switch (page_data->page) { + case 0: + src = &idev->port_info->status.xcvr.sprom[page_data->offset]; + break; + case 1: + src = &idev->port_info->sprom_page1[page_data->offset - 128]; + break; + case 2: + src = &idev->port_info->sprom_page2[page_data->offset - 128]; + break; + default: + return -EOPNOTSUPP; } - return err; + memset(page_data->data, 0, 
page_data->length); + err = ionic_do_module_copy(page_data->data, src, page_data->length); + if (err) + return err; + + return page_data->length; } static int ionic_get_ts_info(struct net_device *netdev, @@ -1197,8 +1156,7 @@ static const struct ethtool_ops ionic_ethtool_ops = { .set_rxfh = ionic_set_rxfh, .get_tunable = ionic_get_tunable, .set_tunable = ionic_set_tunable, - .get_module_info = ionic_get_module_info, - .get_module_eeprom = ionic_get_module_eeprom, + .get_module_eeprom_by_page = ionic_get_module_eeprom_by_page, .get_pauseparam = ionic_get_pauseparam, .set_pauseparam = ionic_set_pauseparam, .get_fecparam = ionic_get_fecparam, diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 4943ebb27ab3d..23218208b7118 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -2839,7 +2839,9 @@ union ionic_port_identity { * @status: Port status data * @stats: Port statistics data * @mgmt_stats: Port management statistics data - * @sprom_epage: Extended Transceiver sprom, high page 1 and 2 + * @sprom_epage: Extended Transceiver sprom + * @sprom_page1: Extended Transceiver sprom, page 1 + * @sprom_page2: Extended Transceiver sprom, page 2 * @rsvd: reserved byte(s) * @pb_stats: uplink pb drop stats */ @@ -2850,7 +2852,13 @@ struct ionic_port_info { struct ionic_port_stats stats; struct ionic_mgmt_port_stats mgmt_stats; }; - u8 sprom_epage[256]; + union { + u8 sprom_epage[256]; + struct { + u8 sprom_page1[128]; + u8 sprom_page2[128]; + }; + }; u8 rsvd[504]; /* pb_stats must start at 2k offset */ From 0651c83ea96c4cff739ba9a71a7b7271219700b4 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Tue, 15 Apr 2025 16:13:16 -0700 Subject: [PATCH 361/770] ionic: add module eeprom channel data to ionic_if and ethtool Make the CMIS module type's page 17 channel data available for ethtool to request. As done previously, carve space for this data from the port_info reserved space. In the future, if additional pages are needed, a new firmware AdminQ command will be added for accessing random pages. 
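The offset handling in the diff below follows from the caching scheme: only the upper half of each page (bytes 128..255) is kept, so a request for page N at offset off is served from the page buffer at index off - 128. A hedged sketch of that dispatch, with invented names and only the pages this series caches:

#include <errno.h>
#include <stdint.h>
#include <string.h>

#define PAGE_HALF 128	/* only bytes 128..255 of each page are cached */

/* Illustrative caches for the upper halves of pages 1, 2 and 17. */
static uint8_t page1[PAGE_HALF], page2[PAGE_HALF], page17[PAGE_HALF];

/* Copy 'len' bytes of module EEPROM 'page', starting at absolute
 * offset 'off', into 'dst'. Returns bytes copied or a negative errno.
 */
static int demo_eeprom_read(uint32_t page, uint32_t off, uint8_t *dst,
			    uint32_t len)
{
	const uint8_t *src;

	/* Requests outside the cached upper half cannot be served. */
	if (off < PAGE_HALF || off + len > 2 * PAGE_HALF)
		return -EINVAL;

	switch (page) {
	case 1:
		src = &page1[off - PAGE_HALF];
		break;
	case 2:
		src = &page2[off - PAGE_HALF];
		break;
	case 17:
		src = &page17[off - PAGE_HALF];
		break;
	default:
		return -EOPNOTSUPP;
	}

	memcpy(dst, src, len);
	return (int)len;
}

(Page 0 is the exception: as in the real driver, it is read from the base sprom[] with no offset adjustment.)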
Reviewed-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415231317.40616-4-shannon.nelson@amd.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/pensando/ionic/ionic_ethtool.c | 3 +++ drivers/net/ethernet/pensando/ionic/ionic_if.h | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index 0d2ef808237b4..92f30ff2d6316 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -999,6 +999,9 @@ static int ionic_get_module_eeprom_by_page(struct net_device *netdev, case 2: src = &idev->port_info->sprom_page2[page_data->offset - 128]; break; + case 17: + src = &idev->port_info->sprom_page17[page_data->offset - 128]; + break; default: return -EOPNOTSUPP; } diff --git a/drivers/net/ethernet/pensando/ionic/ionic_if.h b/drivers/net/ethernet/pensando/ionic/ionic_if.h index 23218208b7118..f1ddbe9994a3b 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_if.h +++ b/drivers/net/ethernet/pensando/ionic/ionic_if.h @@ -2842,6 +2842,7 @@ union ionic_port_identity { * @sprom_epage: Extended Transceiver sprom * @sprom_page1: Extended Transceiver sprom, page 1 * @sprom_page2: Extended Transceiver sprom, page 2 + * @sprom_page17: Extended Transceiver sprom, page 17 * @rsvd: reserved byte(s) * @pb_stats: uplink pb drop stats */ @@ -2853,13 +2854,14 @@ struct ionic_port_info { struct ionic_mgmt_port_stats mgmt_stats; }; union { - u8 sprom_epage[256]; + u8 sprom_epage[384]; struct { u8 sprom_page1[128]; u8 sprom_page2[128]; + u8 sprom_page17[128]; }; }; - u8 rsvd[504]; + u8 rsvd[376]; /* pb_stats must start at 2k offset */ struct ionic_port_pb_stats pb_stats; From b77ad30c428e6336833d9f28a9907076f8ad9432 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Apr 2025 14:09:51 +0300 Subject: [PATCH 362/770] rxrpc: rxgk: Set error code in rxgk_yfs_decode_ticket() Propagate the error code if key_alloc() fails. Don't return success. Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)") Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Link: https://patch.msgid.link/Z_-P_1iLDWksH1ik@stanley.mountain Signed-off-by: Paolo Abeni --- net/rxrpc/rxgk_app.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index 6206a84395b8b..b94b77a1c3178 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -141,6 +141,7 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + ret = PTR_ERR(key); goto error; } From 37f2f2fe26e8d42f4e55251b4646f06bf50d9626 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Wed, 16 Apr 2025 19:57:57 +0800 Subject: [PATCH 363/770] rtase: Add ndo_setup_tc support for CBS offload in traffic control setup Add support for ndo_setup_tc to enable CBS offload functionality as part of traffic control configuration for network devices, where CBS is applied from the CPU to the switch. More specifically, CBS is applied at the GMAC in the topmost architecture diagram. 
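The credit-register encoding performed by rtase_set_hw_cbs() in the diff that follows is a fixed-point conversion: the scaled idleslope's integer quotient goes into bits 31:25, and 25 fractional bits are produced by binary long division (double the remainder, emit one bit, reduce). The same arithmetic as a stand-alone sketch; the 64-bit intermediate is an addition here to sidestep overflow, not something the driver does:

#include <stdint.h>
#include <stdio.h>

#define CLOCK_MULT	64		/* RTASE_1T_CLOCK */
#define POWER		10000000	/* RTASE_1T_POWER */
#define INT_SHIFT	25		/* RTASE_IDLESLOPE_INT_SHIFT */

/* Encode idleslope into the credit word: integer part in bits 31:25
 * (the quotient is assumed to fit in 7 bits), then 25 fractional bits
 * from binary long division of the remainder.
 */
static uint32_t cbs_credit_word(uint32_t idleslope)
{
	uint64_t idle = (uint64_t)idleslope * CLOCK_MULT;
	uint32_t val = (uint32_t)(idle / POWER) << INT_SHIFT;
	int i;

	idle %= POWER;
	for (i = 1; i <= INT_SHIFT; i++) {
		idle *= 2;
		if (idle / POWER == 1)	/* remainder < POWER, so 0 or 1 */
			val |= 1u << (INT_SHIFT - i);
		idle %= POWER;
	}
	return val;
}

int main(void)
{
	printf("0x%08x\n", cbs_credit_word(20000));	/* example slope */
	return 0;
}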
Signed-off-by: Justin Lai Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250416115757.28156-1-justinlai0215@realtek.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/realtek/rtase/rtase.h | 15 +++++ .../net/ethernet/realtek/rtase/rtase_main.c | 60 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index 2bbfcad613abb..498cfe4d0cac3 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -170,6 +170,7 @@ enum rtase_registers { #define RTASE_TC_MODE_MASK GENMASK(11, 10) RTASE_TOKSEL = 0x2046, + RTASE_TXQCRDT_0 = 0x2500, RTASE_RFIFONFULL = 0x4406, RTASE_INT_MITI_TX = 0x0A00, RTASE_INT_MITI_RX = 0x0A80, @@ -259,6 +260,12 @@ union rtase_rx_desc { #define RTASE_VLAN_TAG_MASK GENMASK(15, 0) #define RTASE_RX_PKT_SIZE_MASK GENMASK(13, 0) +/* txqos hardware definitions */ +#define RTASE_1T_CLOCK 64 +#define RTASE_1T_POWER 10000000 +#define RTASE_IDLESLOPE_INT_SHIFT 25 +#define RTASE_IDLESLOPE_INT_MASK GENMASK(31, 25) + #define RTASE_IVEC_NAME_SIZE (IFNAMSIZ + 10) struct rtase_int_vector { @@ -294,6 +301,13 @@ struct rtase_ring { u64 alloc_fail; }; +struct rtase_txqos { + int hicredit; + int locredit; + int idleslope; + int sendslope; +}; + struct rtase_stats { u64 tx_dropped; u64 rx_dropped; @@ -313,6 +327,7 @@ struct rtase_private { struct page_pool *page_pool; struct rtase_ring tx_ring[RTASE_NUM_TX_QUEUE]; + struct rtase_txqos tx_qos[RTASE_NUM_TX_QUEUE]; struct rtase_ring rx_ring[RTASE_NUM_RX_QUEUE]; struct rtase_counters *tally_vaddr; dma_addr_t tally_paddr; diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 2aacc1996796d..6251548d50ffc 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1661,6 +1661,65 @@ static void rtase_get_stats64(struct net_device *dev, stats->rx_length_errors = tp->stats.rx_length_errors; } +static void rtase_set_hw_cbs(const struct rtase_private *tp, u32 queue) +{ + u32 idle = tp->tx_qos[queue].idleslope * RTASE_1T_CLOCK; + u32 val, i; + + val = u32_encode_bits(idle / RTASE_1T_POWER, RTASE_IDLESLOPE_INT_MASK); + idle %= RTASE_1T_POWER; + + for (i = 1; i <= RTASE_IDLESLOPE_INT_SHIFT; i++) { + idle *= 2; + if ((idle / RTASE_1T_POWER) == 1) + val |= BIT(RTASE_IDLESLOPE_INT_SHIFT - i); + + idle %= RTASE_1T_POWER; + } + + rtase_w32(tp, RTASE_TXQCRDT_0 + queue * 4, val); +} + +static int rtase_setup_tc_cbs(struct rtase_private *tp, + const struct tc_cbs_qopt_offload *qopt) +{ + int queue = qopt->queue; + + if (queue < 0 || queue >= tp->func_tx_queue_num) + return -EINVAL; + + if (!qopt->enable) { + tp->tx_qos[queue].hicredit = 0; + tp->tx_qos[queue].locredit = 0; + tp->tx_qos[queue].idleslope = 0; + tp->tx_qos[queue].sendslope = 0; + + rtase_w32(tp, RTASE_TXQCRDT_0 + queue * 4, 0); + } else { + tp->tx_qos[queue].hicredit = qopt->hicredit; + tp->tx_qos[queue].locredit = qopt->locredit; + tp->tx_qos[queue].idleslope = qopt->idleslope; + tp->tx_qos[queue].sendslope = qopt->sendslope; + + rtase_set_hw_cbs(tp, queue); + } + + return 0; +} + +static int rtase_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) +{ + struct rtase_private *tp = netdev_priv(dev); + + switch (type) { + case TC_SETUP_QDISC_CBS: + return rtase_setup_tc_cbs(tp, type_data); + default: + return -EOPNOTSUPP; + } +} + static netdev_features_t rtase_fix_features(struct net_device *dev, 
netdev_features_t features) { @@ -1696,6 +1755,7 @@ static const struct net_device_ops rtase_netdev_ops = { .ndo_change_mtu = rtase_change_mtu, .ndo_tx_timeout = rtase_tx_timeout, .ndo_get_stats64 = rtase_get_stats64, + .ndo_setup_tc = rtase_setup_tc, .ndo_fix_features = rtase_fix_features, .ndo_set_features = rtase_set_features, }; From 45bd443bfd8697a7da308c16c3e75e2bb353b3d1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 18 Apr 2025 02:15:19 +0100 Subject: [PATCH 364/770] net: 802: Remove unused p8022 code p8022.c defines two external functions, register_8022_client() and unregister_8022_client(), the last use of which was removed in 2018 by commit 7a2e838d28cf ("staging: ipx: delete it from the tree"). Remove the p8022.c file, its corresponding header, and glue surrounding it. There was one place the header was included in vlan.c but it didn't use the functions it declared. There was a comment in net/802/Makefile about checking against net/core/Makefile, but that's at least 20 years old and there's no sign of net/core/Makefile mentioning it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250418011519.145320-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/p8022.h | 16 ------------ net/802/Makefile | 5 ++-- net/802/p8022.c | 64 --------------------------------------------- net/8021q/vlan.c | 1 - 4 files changed, 2 insertions(+), 84 deletions(-) delete mode 100644 include/net/p8022.h delete mode 100644 net/802/p8022.c diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498a..0000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/net/802/Makefile b/net/802/Makefile index bfed80221b8bc..99abc29d537c0 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,12 +3,11 @@ # Makefile for the Linux 802.x protocol layers. # -# Check the p8022 selections against net/core/Makefile. -obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/p8022.c b/net/802/p8022.c deleted file mode 100644 index 78c25168d7c91..0000000000000 --- a/net/802/p8022.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: Support for 802.2 demultiplexing off Ethernet - * - * Demultiplex 802.2 encoded protocols. We match the entry by the - * SSAP/DSAP pair and then deliver to the registered datalink that - * matches. The control byte is ignored and handling of such items - * is up to the routine passed the frame. - * - * Unlike the 802.3 datalink we have a list of 802.2 entries as - * there are multiple protocols to demux. The list is currently - * short (3 or 4 entries at most). The current demux assumes this. 
- */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - const unsigned char *dest) -{ - llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); - return 0; -} - -struct datalink_proto *register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)) -{ - struct datalink_proto *proto; - - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - if (proto) { - proto->type[0] = type; - proto->header_length = 3; - proto->request = p8022_request; - proto->sap = llc_sap_open(type, func); - if (!proto->sap) { - kfree(proto); - proto = NULL; - } - } - return proto; -} - -void unregister_8022_client(struct datalink_proto *proto) -{ - llc_sap_put(proto->sap); - kfree(proto); -} - -EXPORT_SYMBOL(register_8022_client); -EXPORT_SYMBOL(unregister_8022_client); - -MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet"); -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 41be38264493d..06908e37c3d94 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include From 67b083f14fb75c4157eb8ab9a4ae543aebcb8e1b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sun, 20 Apr 2025 23:58:10 +0100 Subject: [PATCH 365/770] octeontx2-af: Remove unused rvu_npc_enable_bcast_entry The last use of rvu_npc_enable_bcast_entry() was removed in 2021 by commit 967db3529eca ("octeontx2-af: add support for multicast/promisc packet replication feature") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250420225810.171852-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/af/rvu.h | 2 -- .../ethernet/marvell/octeontx2/af/rvu_npc.c | 18 ------------------ 2 files changed, 20 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index 60f085b00a8cc..147d7f5c1fcc0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -969,8 +969,6 @@ void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf, bool enable); void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); -void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, int nixlf, - bool enable); void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan); void rvu_npc_enable_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c index 821fe242f821f..6296a3cdabbb1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c @@ -820,24 +820,6 @@ void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc, rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp); } -void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, int nixlf, - bool enable) -{ - struct npc_mcam *mcam = &rvu->hw->mcam; - int blkaddr, index; - - blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0); - if (blkaddr < 0) - return; - - /* Get 'pcifunc' of PF device */ - pcifunc = pcifunc & ~RVU_PFVF_FUNC_MASK; - - index = npc_get_nixlf_mcam_index(mcam, pcifunc, nixlf, - NIXLF_BCAST_ENTRY); - 
npc_enable_mcam_entry(rvu, mcam, blkaddr, index, enable); -} - void rvu_npc_install_allmulti_entry(struct rvu *rvu, u16 pcifunc, int nixlf, u64 chan) { From b7ed5d5a78fccee96cf8919ac2c7a064c2f4c45b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 21 Apr 2025 11:25:18 +0200 Subject: [PATCH 366/770] r8169: use pci_prepare_to_sleep in rtl_shutdown Use pci_prepare_to_sleep() like PCI core does in pci_pm_suspend_noirq. This aligns setting a low-power mode during shutdown with the handling of the transition to system suspend. Also the transition to runtime suspend uses pci_target_state() instead of setting D3hot unconditionally. Note: pci_prepare_to_sleep() uses device_may_wakeup() to check whether device may generate wakeup events. So we don't lose anything by not passing tp->saved_wolopts any longer. Signed-off-by: Heiner Kallweit Reviewed-by: Jacob Keller Link: https://patch.msgid.link/f573fdbd-ba6d-41c1-b68f-311d3c88db2c@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 635785d917902..3ef7126f8ec50 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5039,10 +5039,8 @@ static void rtl_shutdown(struct pci_dev *pdev) /* Restore original MAC address */ rtl_rar_set(tp, tp->dev->perm_addr); - if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) { - pci_wake_from_d3(pdev, tp->saved_wolopts); - pci_set_power_state(pdev, PCI_D3hot); - } + if (system_state == SYSTEM_POWER_OFF && !tp->dash_enabled) + pci_prepare_to_sleep(pdev); } static void rtl_remove_one(struct pci_dev *pdev) From 4cb6316d33d8ccf982f8975b1ea5f7f3f71c8da1 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:47 +0200 Subject: [PATCH 367/770] dt-bindings: net: ethernet-phy: add property mac-termination-ohms Add property mac-termination-ohms in the device tree bindings for selecting the resistance value of the builtin series termination resistors of the PHY. Changing the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. Signed-off-by: Dimitri Fedrau Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-1-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/ethernet-phy.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml index 824bbe4333b7e..71e2cd32580f2 100644 --- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml +++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml @@ -238,6 +238,16 @@ properties: peak-to-peak specified in ANSI X3.263. When omitted, the PHYs default will be left as is. + mac-termination-ohms: + maximum: 200 + description: + The xMII signals need series termination on the driver side to match both + the output driver impedance and the line characteristic impedance, to + prevent reflections and EMI problems. Select a resistance value which is + supported by the builtin resistors of the PHY, otherwise the resistors may + have to be placed on board. When omitted, the PHYs default will be left as + is. 
+ leds: type: object From 1de1390ee014f72ddff65ac73bee55005696fe96 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:48 +0200 Subject: [PATCH 368/770] dt-bindings: net: dp83822: add constraints for mac-termination-ohms Property mac-termination-ohms is defined in ethernet-phy.yaml. Add allowed values for the property. Signed-off-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-2-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/ti,dp83822.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/ti,dp83822.yaml b/Documentation/devicetree/bindings/net/ti,dp83822.yaml index 50c24248df266..28a0bddb9af94 100644 --- a/Documentation/devicetree/bindings/net/ti,dp83822.yaml +++ b/Documentation/devicetree/bindings/net/ti,dp83822.yaml @@ -122,6 +122,9 @@ properties: - free-running - recovered + mac-termination-ohms: + enum: [43, 44, 46, 48, 50, 53, 55, 58, 61, 65, 69, 73, 78, 84, 91, 99] + required: - reg @@ -137,6 +140,7 @@ examples: rx-internal-delay-ps = <1>; tx-internal-delay-ps = <1>; ti,gpio2-clk-out = "xi"; + mac-termination-ohms = <43>; }; }; From 145436ae01193c0a379fd3ea9c4fbdf32863db1f Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:49 +0200 Subject: [PATCH 369/770] net: phy: Add helper for getting MAC termination resistance Add helper which returns the MAC termination resistance value. Modifying the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. Reviewed-by: Russell King (Oracle) Signed-off-by: Dimitri Fedrau Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-3-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 15 +++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index cc6c209fe7022..f85c172c446c5 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -2975,6 +2975,21 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, } EXPORT_SYMBOL_GPL(phy_get_tx_amplitude_gain); +/** + * phy_get_mac_termination - stores MAC termination in @val + * @phydev: phy_device struct + * @dev: pointer to the devices device struct + * @val: MAC termination + * + * Returns: 0 on success, < 0 on failure + */ +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val) +{ + return phy_get_u32_property(dev, "mac-termination-ohms", val); +} +EXPORT_SYMBOL_GPL(phy_get_mac_termination); + static int phy_led_set_brightness(struct led_classdev *led_cdev, enum led_brightness value) { diff --git a/include/linux/phy.h b/include/linux/phy.h index fb755358d965b..066a28a4b64b2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ int phy_get_tx_amplitude_gain(struct phy_device *phydev, struct device *dev, enum ethtool_link_mode_bit_indices linkmode, u32 *val); +int phy_get_mac_termination(struct phy_device *phydev, struct device *dev, + u32 *val); + void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); From 6c3c3c230a13f10a8a86f20c6710cf44532dfcf3 Mon Sep 17 00:00:00 2001 From: Dimitri Fedrau Date: Wed, 16 Apr 2025 19:14:50 +0200 Subject: [PATCH 370/770] net: phy: dp83822: Add support for changing the MAC termination The dp83822 
provides the possibility to set the resistance value of the MAC termination. Modifying the resistance to an appropriate value can reduce signal reflections and therefore improve signal quality. Reviewed-by: Russell King (Oracle) Signed-off-by: Dimitri Fedrau Link: https://patch.msgid.link/20250416-dp83822-mac-impedance-v3-4-028ac426cddb@liebherr.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83822.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 14f3615496384..490c9f4e5d4e4 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -33,6 +33,7 @@ #define MII_DP83822_MLEDCR 0x25 #define MII_DP83822_LDCTRL 0x403 #define MII_DP83822_LEDCFG1 0x460 +#define MII_DP83822_IOCTRL 0x461 #define MII_DP83822_IOCTRL1 0x462 #define MII_DP83822_IOCTRL2 0x463 #define MII_DP83822_GENCFG 0x465 @@ -118,6 +119,9 @@ #define DP83822_LEDCFG1_LED1_CTRL GENMASK(11, 8) #define DP83822_LEDCFG1_LED3_CTRL GENMASK(7, 4) +/* IOCTRL bits */ +#define DP83822_IOCTRL_MAC_IMPEDANCE_CTRL GENMASK(4, 1) + /* IOCTRL1 bits */ #define DP83822_IOCTRL1_GPIO3_CTRL GENMASK(10, 8) #define DP83822_IOCTRL1_GPIO3_CTRL_LED3 BIT(0) @@ -202,6 +206,7 @@ struct dp83822_private { u32 gpio2_clk_out; bool led_pin_enable[DP83822_MAX_LED_PINS]; int tx_amplitude_100base_tx_index; + int mac_termination_index; }; static int dp83822_config_wol(struct phy_device *phydev, @@ -533,6 +538,12 @@ static int dp83822_config_init(struct phy_device *phydev) FIELD_PREP(DP83822_100BASE_TX_LINE_DRIVER_SWING, dp83822->tx_amplitude_100base_tx_index)); + if (dp83822->mac_termination_index >= 0) + phy_modify_mmd(phydev, MDIO_MMD_VEND2, MII_DP83822_IOCTRL, + DP83822_IOCTRL_MAC_IMPEDANCE_CTRL, + FIELD_PREP(DP83822_IOCTRL_MAC_IMPEDANCE_CTRL, + dp83822->mac_termination_index)); + err = dp83822_config_init_leds(phydev); if (err) return err; @@ -736,6 +747,10 @@ static const u32 tx_amplitude_100base_tx_gain[] = { 93, 95, 97, 98, 100, 102, 103, 105, }; +static const u32 mac_termination[] = { + 99, 91, 84, 78, 73, 69, 65, 61, 58, 55, 53, 50, 48, 46, 44, 43, +}; + static int dp83822_of_init_leds(struct phy_device *phydev) { struct device_node *node = phydev->mdio.dev.of_node; @@ -852,6 +867,23 @@ static int dp83822_of_init(struct phy_device *phydev) } } + ret = phy_get_mac_termination(phydev, dev, &val); + if (!ret) { + for (i = 0; i < ARRAY_SIZE(mac_termination); i++) { + if (mac_termination[i] == val) { + dp83822->mac_termination_index = i; + break; + } + } + + if (dp83822->mac_termination_index < 0) { + phydev_err(phydev, + "Invalid value for mac-termination-ohms property (%u)\n", + val); + return -EINVAL; + } + } + return dp83822_of_init_leds(phydev); } @@ -931,6 +963,7 @@ static int dp8382x_probe(struct phy_device *phydev) return -ENOMEM; dp83822->tx_amplitude_100base_tx_index = -1; + dp83822->mac_termination_index = -1; phydev->priv = dp83822; return 0; From f0f149d9747f0d597d3e04bb87be0f31e7e25c2e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2025 15:10:29 -0700 Subject: [PATCH 371/770] emulex/benet: Annotate flash_cookie as nonstring GCC 15's new -Wunterminated-string-initialization notices that the 32-character "flash_cookie" (which is not used as a C-String) needs to be marked as "nonstring": drivers/net/ethernet/emulex/benet/be_cmds.c:2618:51: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (17 chars into 16 available) [-Wunterminated-string-initialization] 2618 
| static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "}; | ^~~~~~~~~~~~~~~~~~ Add this annotation, avoid using a multidimensional array, but keep the string split (with a comment about why). Additionally mark it const and annotate the "cookie" member that is being memcmp()ed against as nonstring too. Signed-off-by: Kees Cook Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250416221028.work.967-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/emulex/benet/be_cmds.c | 6 +++++- drivers/net/ethernet/emulex/benet/be_cmds.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 51b8377edd1d0..adb441b36581a 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -2615,7 +2615,11 @@ static int be_cmd_get_flash_crc(struct be_adapter *adapter, u8 *flashed_crc, return status; } -static char flash_cookie[2][16] = {"*** SE FLAS", "H DIRECTORY *** "}; +/* + * Since the cookie is text, add a parsing-skipped space to keep it from + * ever being matched on storage holding this source file. + */ +static const char flash_cookie[32] __nonstring = "*** SE FLAS" "H DIRECTORY *** "; static bool phy_flashing_required(struct be_adapter *adapter) { diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h index d70818f06be73..5e2d3ddb5d438 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.h +++ b/drivers/net/ethernet/emulex/benet/be_cmds.h @@ -1415,7 +1415,7 @@ struct flash_section_entry { } __packed; struct flash_section_info { - u8 cookie[32]; + u8 cookie[32] __nonstring; struct flash_section_hdr fsec_hdr; struct flash_section_entry fsec_entry[32]; } __packed; From a7696fb251c61ce5419fff5cf71556b5ca9f815a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 17 Apr 2025 09:46:42 +0200 Subject: [PATCH 372/770] ptp: Do not enable by default during compile testing Enabling the compile test should not cause automatic enabling of all drivers, but only allow them to be selected for compilation. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250417074643.81448-1-krzysztof.kozlowski@linaro.org Signed-off-by: Jakub Kicinski --- drivers/ptp/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 07bf7f9aae019..204278eb215e5 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -44,7 +44,7 @@ config PTP_1588_CLOCK_DTE depends on PTP_1588_CLOCK depends on NET && HAS_IOMEM depends on ARCH_BCM_MOBILE || (ARCH_BCM_IPROC && !(ARCH_BCM_NSP || ARCH_BCM_5301X)) || COMPILE_TEST - default y + default y if ARCH_BCM_MOBILE || ARCH_BCM_IPROC help This driver adds support for using the Digital timing engine (DTE) in the Broadcom SoC's as a PTP clock. @@ -59,7 +59,7 @@ config PTP_1588_CLOCK_QORIQ tristate "Freescale QorIQ 1588 timer as PTP clock" depends on GIANFAR || FSL_DPAA_ETH || FSL_DPAA2_ETH || FSL_ENETC || FSL_ENETC_VF || COMPILE_TEST depends on PTP_1588_CLOCK - default y + default y if GIANFAR || FSL_DPAA_ETH || FSL_DPAA2_ETH || FSL_ENETC || FSL_ENETC_VF help This driver adds support for using the Freescale QorIQ 1588 timer as a PTP clock. 
This clock is only useful if your PTP From af5226abb40cae959f424f7ca614787a1c87ce48 Mon Sep 17 00:00:00 2001 From: Salah Triki Date: Wed, 16 Apr 2025 20:26:25 +0100 Subject: [PATCH 373/770] smb: server: smb2pdu: check return value of xa_store() xa_store() may fail, so check its return value and return the error code if an error occurred. Signed-off-by: Salah Triki Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 57839f9708bb6..372021b3f2632 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1445,7 +1445,7 @@ static int ntlm_authenticate(struct ksmbd_work *work, { struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; - struct channel *chann = NULL; + struct channel *chann = NULL, *old; struct ksmbd_user *user; u64 prev_id; int sz, rc; @@ -1557,7 +1557,12 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann, + KSMBD_DEFAULT_GFP); + if (xa_is_err(old)) { + kfree(chann); + return xa_err(old); + } } } From a1f46c99d9ea411f9bf30025b912d881d36fc709 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 17 Apr 2025 10:10:15 +0900 Subject: [PATCH 374/770] ksmbd: fix use-after-free in ksmbd_session_rpc_open A UAF issue can occur due to a race condition between ksmbd_session_rpc_open() and __session_rpc_close(). Add rpc_lock to the session to protect it. Cc: stable@vger.kernel.org Reported-by: Norbert Szetei Tested-by: Norbert Szetei Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/mgmt/user_session.c | 20 ++++++++++++++------ fs/smb/server/mgmt/user_session.h | 1 + 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 3f45f28f6f0f8..9dec4c2940bc0 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -59,10 +59,12 @@ static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) struct ksmbd_session_rpc *entry; long index; + down_write(&sess->rpc_lock); xa_for_each(&sess->rpc_handle_list, index, entry) { xa_erase(&sess->rpc_handle_list, index); __session_rpc_close(sess, entry); } + up_write(&sess->rpc_lock); xa_destroy(&sess->rpc_handle_list); } @@ -92,7 +94,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; - int method; + int method, id; method = __rpc_method(rpc_name); if (!method) @@ -102,26 +104,29 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; + down_read(&sess->rpc_lock); entry->method = method; - entry->id = ksmbd_ipc_id_alloc(); - if (entry->id < 0) + entry->id = id = ksmbd_ipc_id_alloc(); + if (id < 0) goto free_entry; - old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); if (xa_is_err(old)) goto free_id; - resp = ksmbd_rpc_open(sess, entry->id); + resp = ksmbd_rpc_open(sess, id); if (!resp) goto erase_xa; + up_read(&sess->rpc_lock); kvfree(resp); - return entry->id; + return id; erase_xa: xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); + up_read(&sess->rpc_lock); return 
-EINVAL; } @@ -129,9 +134,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + down_write(&sess->rpc_lock); entry = xa_erase(&sess->rpc_handle_list, id); if (entry) __session_rpc_close(sess, entry); + up_write(&sess->rpc_lock); } int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) @@ -439,6 +446,7 @@ static struct ksmbd_session *__session_create(int protocol) sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); atomic_set(&sess->refcnt, 2); + init_rwsem(&sess->rpc_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f21348381d598..c5749d6ec7151 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -63,6 +63,7 @@ struct ksmbd_session { rwlock_t tree_conns_lock; atomic_t refcnt; + struct rw_semaphore rpc_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) From 8c989368c04c17a6ed834ae528458844f553bead Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 17 Apr 2025 09:40:12 +0100 Subject: [PATCH 375/770] dt-bindings: net: dwmac: Increase 'maxItems' for 'interrupts' and 'interrupt-names' Increase the `maxItems` value for the `interrupts` and `interrupt-names` properties to 11 to support additional per-channel Tx/Rx completion interrupts on the Renesas RZ/V2H(P) SoC, which features the `snps,dwmac-5.20` IP. Refactor the `interrupt-names` property by replacing repeated `enum` entries with a `oneOf` list. Add support for per-channel receive and transmit completion interrupts using regex patterns. Signed-off-by: Lad Prabhakar Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250417084015.74154-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/snps,dwmac.yaml | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 78b3030dc56d2..4d4fcaeca8a8d 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -114,19 +114,25 @@ properties: interrupts: minItems: 1 - items: - - description: Combined signal for various interrupt events - - description: The interrupt to manage the remote wake-up packet detection - - description: The interrupt that occurs when Rx exits the LPI state - - description: The interrupt that occurs when HW safety error triggered + maxItems: 11 interrupt-names: minItems: 1 + maxItems: 11 items: - - const: macirq - - enum: [eth_wake_irq, eth_lpi, sfty] - - enum: [eth_wake_irq, eth_lpi, sfty] - - enum: [eth_wake_irq, eth_lpi, sfty] + oneOf: + - description: Combined signal for various interrupt events + const: macirq + - description: The interrupt to manage the remote wake-up packet detection + const: eth_wake_irq + - description: The interrupt that occurs when Rx exits the LPI state + const: eth_lpi + - description: The interrupt that occurs when HW safety error triggered + const: sfty + - description: Per channel receive completion interrupt + pattern: '^rx-queue-[0-3]$' + - description: Per channel transmit completion interrupt + pattern: '^tx-queue-[0-3]$' clocks: minItems: 1 From 8fff7ae84d1880b7d435f3bb71518eb24bf84846 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 17 Apr 2025 09:40:13 +0100 Subject: [PATCH 376/770] dt-bindings: net: Document support for Renesas 
RZ/V2H(P) GBETH GBETH IP on the Renesas RZ/V2H(P) SoC is integrated with Synopsys DesignWare MAC (version 5.20). Document the device tree bindings for the GBETH glue layer. Generic compatible string 'renesas,rzv2h-gbeth' is added since this module is identical on both the RZ/V2H(P) and RZ/G3E SoCs. The Rx/Tx clocks supplied for GBETH on the RZ/V2H(P) SoC is depicted below: Rx / Tx -------+------------- on / off ------- | | Rx-180 / Tx-180 +---- not ---- on / off ------- Signed-off-by: Lad Prabhakar Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20250417084015.74154-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Jakub Kicinski --- .../bindings/net/renesas,r9a09g057-gbeth.yaml | 201 ++++++++++++++++++ .../devicetree/bindings/net/snps,dwmac.yaml | 1 + 2 files changed, 202 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml diff --git a/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml b/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml new file mode 100644 index 0000000000000..02a6793c26f58 --- /dev/null +++ b/Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml @@ -0,0 +1,201 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/renesas,r9a09g057-gbeth.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: GBETH glue layer for Renesas RZ/V2H(P) (and similar SoCs) + +maintainers: + - Lad Prabhakar + +select: + properties: + compatible: + contains: + enum: + - renesas,r9a09g057-gbeth + - renesas,rzv2h-gbeth + required: + - compatible + +properties: + compatible: + items: + - enum: + - renesas,r9a09g057-gbeth # RZ/V2H(P) + - const: renesas,rzv2h-gbeth + - const: snps,dwmac-5.20 + + reg: + maxItems: 1 + + clocks: + items: + - description: CSR clock + - description: AXI system clock + - description: PTP clock + - description: TX clock + - description: RX clock + - description: TX clock phase-shifted by 180 degrees + - description: RX clock phase-shifted by 180 degrees + + clock-names: + items: + - const: stmmaceth + - const: pclk + - const: ptp_ref + - const: tx + - const: rx + - const: tx-180 + - const: rx-180 + + interrupts: + minItems: 11 + + interrupt-names: + items: + - const: macirq + - const: eth_wake_irq + - const: eth_lpi + - const: rx-queue-0 + - const: rx-queue-1 + - const: rx-queue-2 + - const: rx-queue-3 + - const: tx-queue-0 + - const: tx-queue-1 + - const: tx-queue-2 + - const: tx-queue-3 + + resets: + items: + - description: AXI power-on system reset + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + - interrupt-names + - resets + +allOf: + - $ref: snps,dwmac.yaml# + +unevaluatedProperties: false + +examples: + - | + #include + #include + + ethernet@15c30000 { + compatible = "renesas,r9a09g057-gbeth", "renesas,rzv2h-gbeth", "snps,dwmac-5.20"; + reg = <0x15c30000 0x10000>; + clocks = <&cpg CPG_MOD 0xbd>, <&cpg CPG_MOD 0xbc>, + <&ptp_clock>, <&cpg CPG_MOD 0xb8>, + <&cpg CPG_MOD 0xb9>, <&cpg CPG_MOD 0xba>, + <&cpg CPG_MOD 0xbb>; + clock-names = "stmmaceth", "pclk", "ptp_ref", + "tx", "rx", "tx-180", "rx-180"; + resets = <&cpg 0xb0>; + interrupts = , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "macirq", "eth_wake_irq", "eth_lpi", + "rx-queue-0", "rx-queue-1", "rx-queue-2", + "rx-queue-3", "tx-queue-0", "tx-queue-1", + "tx-queue-2", "tx-queue-3"; + phy-mode = "rgmii-id"; + snps,multicast-filter-bins = <256>; + snps,perfect-filter-entries = 
<128>; + rx-fifo-depth = <8192>; + tx-fifo-depth = <8192>; + snps,fixed-burst; + snps,force_thresh_dma_mode; + snps,axi-config = <&stmmac_axi_setup>; + snps,mtl-rx-config = <&mtl_rx_setup>; + snps,mtl-tx-config = <&mtl_tx_setup>; + snps,txpbl = <32>; + snps,rxpbl = <32>; + phy-handle = <&phy0>; + + stmmac_axi_setup: stmmac-axi-config { + snps,lpi_en; + snps,wr_osr_lmt = <0xf>; + snps,rd_osr_lmt = <0xf>; + snps,blen = <16 8 4 0 0 0 0>; + }; + + mtl_rx_setup: rx-queues-config { + snps,rx-queues-to-use = <4>; + snps,rx-sched-sp; + + queue0 { + snps,dcb-algorithm; + snps,priority = <0x1>; + snps,map-to-dma-channel = <0>; + }; + + queue1 { + snps,dcb-algorithm; + snps,priority = <0x2>; + snps,map-to-dma-channel = <1>; + }; + + queue2 { + snps,dcb-algorithm; + snps,priority = <0x4>; + snps,map-to-dma-channel = <2>; + }; + + queue3 { + snps,dcb-algorithm; + snps,priority = <0x8>; + snps,map-to-dma-channel = <3>; + }; + }; + + mtl_tx_setup: tx-queues-config { + snps,tx-queues-to-use = <4>; + + queue0 { + snps,dcb-algorithm; + snps,priority = <0x1>; + }; + + queue1 { + snps,dcb-algorithm; + snps,priority = <0x2>; + }; + + queue2 { + snps,dcb-algorithm; + snps,priority = <0x4>; + }; + + queue3 { + snps,dcb-algorithm; + snps,priority = <0x1>; + }; + }; + + mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + + phy0: ethernet-phy@0 { + reg = <0>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml index 4d4fcaeca8a8d..b525eca538506 100644 --- a/Documentation/devicetree/bindings/net/snps,dwmac.yaml +++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml @@ -75,6 +75,7 @@ properties: - qcom,sm8150-ethqos - renesas,r9a06g032-gmac - renesas,rzn1-gmac + - renesas,rzv2h-gbeth - rockchip,px30-gmac - rockchip,rk3128-gmac - rockchip,rk3228-gmac From 461f6529e5946d98ef91bf115d5ced61d8439a6a Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 17 Apr 2025 09:40:14 +0100 Subject: [PATCH 377/770] net: stmmac: Add DWMAC glue layer for Renesas GBETH Add the DWMAC glue layer for the GBETH IP found in the Renesas RZ/V2H(P) SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Russell King (Oracle) Reviewed-by: Philipp Zabel Link: https://patch.msgid.link/20250417084015.74154-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 11 ++ drivers/net/ethernet/stmicro/stmmac/Makefile | 1 + .../stmicro/stmmac/dwmac-renesas-gbeth.c | 146 ++++++++++++++++++ 3 files changed, 158 insertions(+) create mode 100644 drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index 3c820ef567759..2c99b23f0faae 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -131,6 +131,17 @@ config DWMAC_QCOM_ETHQOS This selects the Qualcomm ETHQOS glue layer support for the stmmac device driver. +config DWMAC_RENESAS_GBETH + tristate "Renesas RZ/V2H(P) GBETH support" + default ARCH_RENESAS + depends on OF && (ARCH_RENESAS || COMPILE_TEST) + help + Support for Gigabit Ethernet Interface (GBETH) on Renesas + RZ/V2H(P) SoCs. + + This selects the Renesas RZ/V2H(P) Soc specific glue layer support + for the stmmac device driver. 
+ config DWMAC_ROCKCHIP tristate "Rockchip dwmac support" default ARCH_ROCKCHIP diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile b/drivers/net/ethernet/stmicro/stmmac/Makefile index 594883fb41644..91050215511b2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Makefile +++ b/drivers/net/ethernet/stmicro/stmmac/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_DWMAC_LPC18XX) += dwmac-lpc18xx.o obj-$(CONFIG_DWMAC_MEDIATEK) += dwmac-mediatek.o obj-$(CONFIG_DWMAC_MESON) += dwmac-meson.o dwmac-meson8b.o obj-$(CONFIG_DWMAC_QCOM_ETHQOS) += dwmac-qcom-ethqos.o +obj-$(CONFIG_DWMAC_RENESAS_GBETH) += dwmac-renesas-gbeth.o obj-$(CONFIG_DWMAC_ROCKCHIP) += dwmac-rk.o obj-$(CONFIG_DWMAC_RZN1) += dwmac-rzn1.o obj-$(CONFIG_DWMAC_S32) += dwmac-s32.o diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c new file mode 100644 index 0000000000000..9a774046455b1 --- /dev/null +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * dwmac-renesas-gbeth.c - DWMAC Specific Glue layer for Renesas GBETH + * + * The Rx and Tx clocks are supplied as follows for the GBETH IP. + * + * Rx / Tx + * -------+------------- on / off ------- + * | + * | Rx-180 / Tx-180 + * +---- not ---- on / off ------- + * + * Copyright (C) 2025 Renesas Electronics Corporation + */ + +#include +#include +#include +#include +#include + +#include "stmmac_platform.h" + +struct renesas_gbeth { + struct plat_stmmacenet_data *plat_dat; + struct reset_control *rstc; + struct device *dev; +}; + +static const char *const renesas_gbeth_clks[] = { + "tx", "tx-180", "rx", "rx-180", +}; + +static int renesas_gbeth_init(struct platform_device *pdev, void *priv) +{ + struct plat_stmmacenet_data *plat_dat; + struct renesas_gbeth *gbeth = priv; + int ret; + + plat_dat = gbeth->plat_dat; + + ret = reset_control_deassert(gbeth->rstc); + if (ret) { + dev_err(gbeth->dev, "Reset deassert failed\n"); + return ret; + } + + ret = clk_bulk_prepare_enable(plat_dat->num_clks, + plat_dat->clks); + if (ret) + reset_control_assert(gbeth->rstc); + + return ret; +} + +static void renesas_gbeth_exit(struct platform_device *pdev, void *priv) +{ + struct plat_stmmacenet_data *plat_dat; + struct renesas_gbeth *gbeth = priv; + int ret; + + plat_dat = gbeth->plat_dat; + + clk_bulk_disable_unprepare(plat_dat->num_clks, plat_dat->clks); + + ret = reset_control_assert(gbeth->rstc); + if (ret) + dev_err(gbeth->dev, "Reset assert failed\n"); +} + +static int renesas_gbeth_probe(struct platform_device *pdev) +{ + struct plat_stmmacenet_data *plat_dat; + struct stmmac_resources stmmac_res; + struct device *dev = &pdev->dev; + struct renesas_gbeth *gbeth; + unsigned int i; + int err; + + err = stmmac_get_platform_resources(pdev, &stmmac_res); + if (err) + return dev_err_probe(dev, err, + "failed to get resources\n"); + + plat_dat = devm_stmmac_probe_config_dt(pdev, stmmac_res.mac); + if (IS_ERR(plat_dat)) + return dev_err_probe(dev, PTR_ERR(plat_dat), + "dt configuration failed\n"); + + gbeth = devm_kzalloc(dev, sizeof(*gbeth), GFP_KERNEL); + if (!gbeth) + return -ENOMEM; + + plat_dat->num_clks = ARRAY_SIZE(renesas_gbeth_clks); + plat_dat->clks = devm_kcalloc(dev, plat_dat->num_clks, + sizeof(*plat_dat->clks), GFP_KERNEL); + if (!plat_dat->clks) + return -ENOMEM; + + for (i = 0; i < plat_dat->num_clks; i++) + plat_dat->clks[i].id = renesas_gbeth_clks[i]; + + err = devm_clk_bulk_get(dev, plat_dat->num_clks, plat_dat->clks); + if (err < 0) + 
return err; + + plat_dat->clk_tx_i = stmmac_pltfr_find_clk(plat_dat, "tx"); + if (!plat_dat->clk_tx_i) + return dev_err_probe(dev, -EINVAL, + "error finding tx clock\n"); + + gbeth->rstc = devm_reset_control_get_exclusive(dev, NULL); + if (IS_ERR(gbeth->rstc)) + return PTR_ERR(gbeth->rstc); + + gbeth->dev = dev; + gbeth->plat_dat = plat_dat; + plat_dat->bsp_priv = gbeth; + plat_dat->set_clk_tx_rate = stmmac_set_clk_tx_rate; + plat_dat->init = renesas_gbeth_init; + plat_dat->exit = renesas_gbeth_exit; + plat_dat->flags |= STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY | + STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP | + STMMAC_FLAG_SPH_DISABLE; + + return devm_stmmac_pltfr_probe(pdev, plat_dat, &stmmac_res); +} + +static const struct of_device_id renesas_gbeth_match[] = { + { .compatible = "renesas,rzv2h-gbeth", }, + { /* Sentinel */ } +}; +MODULE_DEVICE_TABLE(of, renesas_gbeth_match); + +static struct platform_driver renesas_gbeth_driver = { + .probe = renesas_gbeth_probe, + .driver = { + .name = "renesas-gbeth", + .of_match_table = renesas_gbeth_match, + }, +}; +module_platform_driver(renesas_gbeth_driver); + +MODULE_AUTHOR("Lad Prabhakar "); +MODULE_DESCRIPTION("Renesas GBETH DWMAC Specific Glue layer"); +MODULE_LICENSE("GPL"); From 326976b05543f07dd084a43bf45070aa87f290bc Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Thu, 17 Apr 2025 09:40:15 +0100 Subject: [PATCH 378/770] MAINTAINERS: Add entry for Renesas RZ/V2H(P) DWMAC GBETH glue layer driver Add a new MAINTAINERS entry for the Renesas RZ/V2H(P) DWMAC GBETH glue layer driver. Signed-off-by: Lad Prabhakar Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250417084015.74154-5-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 657a67f9031ef..82e4b96030df5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20626,6 +20626,14 @@ S: Maintained F: Documentation/devicetree/bindings/usb/renesas,rzn1-usbf.yaml F: drivers/usb/gadget/udc/renesas_usbf.c +RENESAS RZ/V2H(P) DWMAC GBETH GLUE LAYER DRIVER +M: Lad Prabhakar +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/renesas,r9a09g057-gbeth.yaml +F: drivers/net/ethernet/stmicro/stmmac/dwmac-renesas-gbeth.c + RENESAS RZ/V2M I2C DRIVER M: Fabrizio Castro L: linux-i2c@vger.kernel.org From f7ca612018cfd4e2a33b0600d89c917fdec6e11c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Apr 2025 17:13:52 +0100 Subject: [PATCH 379/770] net: dsa: rzn1_a5psw: Make the read-only array offsets static const Don't populate the read-only array offsets on the stack at run time, instead make it static const. 
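
The win is the usual one for constant lookup tables. As a minimal
userspace sketch of the pattern (plain C, not the driver code): a
function-local initialized array has to be materialized in the stack
frame on every call, while a static const table is emitted once into
.rodata and merely referenced.

  #include <stdio.h>

  /* Rebuilt in the stack frame on every call. */
  static int sum_stack(void)
  {
          int offsets[] = { 1, 2, 4 };
          int i, sum = 0;

          for (i = 0; i < 3; i++)
                  sum += offsets[i];
          return sum;
  }

  /* Emitted once into .rodata; no per-call setup code. */
  static int sum_rodata(void)
  {
          static const int offsets[] = { 1, 2, 4 };
          int i, sum = 0;

          for (i = 0; i < 3; i++)
                  sum += offsets[i];
          return sum;
  }

  int main(void)
  {
          printf("%d %d\n", sum_stack(), sum_rodata());
          return 0;
  }

Both functions return the same value; only the generated code differs.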
Signed-off-by: Colin Ian King Reviewed-by: Andrew Lunn Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Link: https://patch.msgid.link/20250417161353.490219-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/rzn1_a5psw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/rzn1_a5psw.c b/drivers/net/dsa/rzn1_a5psw.c index 31ea8130a495f..df7466d4fe8f0 100644 --- a/drivers/net/dsa/rzn1_a5psw.c +++ b/drivers/net/dsa/rzn1_a5psw.c @@ -337,8 +337,9 @@ static void a5psw_port_rx_block_set(struct a5psw *a5psw, int port, bool block) static void a5psw_flooding_set_resolution(struct a5psw *a5psw, int port, bool set) { - u8 offsets[] = {A5PSW_UCAST_DEF_MASK, A5PSW_BCAST_DEF_MASK, - A5PSW_MCAST_DEF_MASK}; + static const u8 offsets[] = { + A5PSW_UCAST_DEF_MASK, A5PSW_BCAST_DEF_MASK, A5PSW_MCAST_DEF_MASK + }; int i; for (i = 0; i < ARRAY_SIZE(offsets); i++) From 21b01cb8e88ea200a834a2c114b5dc6aa378ac56 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Apr 2025 18:07:54 +0100 Subject: [PATCH 380/770] net: stmmac: visconti: convert to set_clk_tx_rate() method Convert visconti to use the set_clk_tx_rate() method. By doing so, the GMAC control register will already have been updated (unlike with the fix_mac_speed() method) so this code can be removed while porting to the set_clk_tx_rate() method. There is also no need for the spinlock, and has never been - neither fix_mac_speed() nor set_clk_tx_rate() can be called by more than one thread at a time, so the lock does nothing useful. Reviewed-by: Andrew Lunn Reviewed-by: Jacob Keller Acked-by: Nobuhiro Iwamatsu Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u5SiQ-001I0B-OQ@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-visconti.c | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c index 33cf99797df53..5e6ac82a89b9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c @@ -51,21 +51,14 @@ struct visconti_eth { u32 phy_intf_sel; struct clk *phy_ref_clk; struct device *dev; - spinlock_t lock; /* lock to protect register update */ }; -static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) +static int visconti_eth_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, + phy_interface_t interface, int speed) { - struct visconti_eth *dwmac = priv; + struct visconti_eth *dwmac = bsp_priv; struct net_device *netdev = dev_get_drvdata(dwmac->dev); unsigned int val, clk_sel_val = 0; - unsigned long flags; - - spin_lock_irqsave(&dwmac->lock, flags); - - /* adjust link */ - val = readl(dwmac->reg + MAC_CTRL_REG); - val &= ~(GMAC_CONFIG_PS | GMAC_CONFIG_FES); switch (speed) { case SPEED_1000: @@ -77,24 +70,19 @@ static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_25M; if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) clk_sel_val = ETHER_CLK_SEL_DIV_SEL_2; - val |= GMAC_CONFIG_PS | GMAC_CONFIG_FES; break; case SPEED_10: if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RGMII) clk_sel_val = ETHER_CLK_SEL_FREQ_SEL_2P5M; if (dwmac->phy_intf_sel == ETHER_CONFIG_INTF_RMII) clk_sel_val = ETHER_CLK_SEL_DIV_SEL_20; - val |= GMAC_CONFIG_PS; break; default: /* No bit control */ netdev_err(netdev, "Unsupported speed request (%d)", speed); - 
spin_unlock_irqrestore(&dwmac->lock, flags); - return; + return -EINVAL; } - writel(val, dwmac->reg + MAC_CTRL_REG); - /* Stop internal clock */ val = readl(dwmac->reg + REG_ETHER_CLOCK_SEL); val &= ~(ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN); @@ -136,7 +124,7 @@ static void visconti_eth_fix_mac_speed(void *priv, int speed, unsigned int mode) break; } - spin_unlock_irqrestore(&dwmac->lock, flags); + return 0; } static int visconti_eth_init_hw(struct platform_device *pdev, struct plat_stmmacenet_data *plat_dat) @@ -228,11 +216,10 @@ static int visconti_eth_dwmac_probe(struct platform_device *pdev) if (!dwmac) return -ENOMEM; - spin_lock_init(&dwmac->lock); dwmac->reg = stmmac_res.addr; dwmac->dev = &pdev->dev; plat_dat->bsp_priv = dwmac; - plat_dat->fix_mac_speed = visconti_eth_fix_mac_speed; + plat_dat->set_clk_tx_rate = visconti_eth_set_clk_tx_rate; ret = visconti_eth_clock_probe(pdev, plat_dat); if (ret) From 434efd3d0cdd935d46c7448061537a2adcf8aeab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:32:32 -0700 Subject: [PATCH 381/770] net: Drop hold_rtnl arg from ops_undo_list(). ops_undo_list() first iterates over ops_list for ->pre_exit(). Let's check if any of the ops has ->exit_rtnl() there and drop the hold_rtnl argument. Note that nexthop uses ->exit_rtnl() and is built-in, so hold_rtnl is always true for setup_net() and cleanup_net() for now. Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250414170148.21f3523c@kernel.org/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418003259.48017-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0a2b24af40280..48dd6dc603c9e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -220,17 +220,20 @@ static void ops_free_list(const struct pernet_operations *ops, static void ops_undo_list(const struct list_head *ops_list, const struct pernet_operations *ops, struct list_head *net_exit_list, - bool expedite_rcu, bool hold_rtnl) + bool expedite_rcu) { const struct pernet_operations *saved_ops; + bool hold_rtnl = false; if (!ops) ops = list_entry(ops_list, typeof(*ops), list); saved_ops = ops; - list_for_each_entry_continue_reverse(ops, ops_list, list) + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; ops_pre_exit_list(ops, net_exit_list); + } /* Another CPU might be rcu-iterating the list, wait for it. * This needs to be before calling the exit() notifiers, so the @@ -257,11 +260,10 @@ static void ops_undo_list(const struct list_head *ops_list, static void ops_undo_single(struct pernet_operations *ops, struct list_head *net_exit_list) { - bool hold_rtnl = !!ops->exit_rtnl; LIST_HEAD(ops_list); list_add(&ops->list, &ops_list); - ops_undo_list(&ops_list, NULL, net_exit_list, false, hold_rtnl); + ops_undo_list(&ops_list, NULL, net_exit_list, false); list_del(&ops->list); } @@ -452,7 +454,7 @@ static __net_init int setup_net(struct net *net) * for the pernet modules whose init functions did not fail. 
*/ list_add(&net->exit_list, &net_exit_list); - ops_undo_list(&pernet_list, ops, &net_exit_list, false, true); + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } @@ -681,7 +683,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - ops_undo_list(&pernet_list, NULL, &net_exit_list, true, true); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); From 81eccc131bc1d53c9f7fa0d8c241589c514adb4e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:32:33 -0700 Subject: [PATCH 382/770] pfcp: Convert pfcp_net_exit() to ->exit_rtnl(). pfcp_net_exit() holds RTNL and cleans up all devices in the netns and other devices tied to sockets in the netns. We can use ->exit_rtnl() to save RTNL dance for all dying netns. Note that we delegate the for_each_netdev() part to default_device_exit_batch() to avoid a list corruption splat like the one reported in commit 4ccacf86491d ("gtp: Suppress list corruption splat in gtp_net_exit_batch_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418003259.48017-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/pfcp.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c index f873a92d24459..28e6bc4a1f14c 100644 --- a/drivers/net/pfcp.c +++ b/drivers/net/pfcp.c @@ -245,30 +245,21 @@ static int __net_init pfcp_net_init(struct net *net) return 0; } -static void __net_exit pfcp_net_exit(struct net *net) +static void __net_exit pfcp_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct pfcp_net *pn = net_generic(net, pfcp_net_id); struct pfcp_dev *pfcp, *pfcp_next; - struct net_device *dev; - LIST_HEAD(list); - - rtnl_lock(); - for_each_netdev(net, dev) - if (dev->rtnl_link_ops == &pfcp_link_ops) - pfcp_dellink(dev, &list); list_for_each_entry_safe(pfcp, pfcp_next, &pn->pfcp_dev_list, list) - pfcp_dellink(pfcp->dev, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); + pfcp_dellink(pfcp->dev, dev_to_kill); } static struct pernet_operations pfcp_net_ops = { - .init = pfcp_net_init, - .exit = pfcp_net_exit, - .id = &pfcp_net_id, - .size = sizeof(struct pfcp_net), + .init = pfcp_net_init, + .exit_rtnl = pfcp_net_exit_rtnl, + .id = &pfcp_net_id, + .size = sizeof(struct pfcp_net), }; static int __init pfcp_init(void) From 7ee32072c732799bcb7221cb97adc0e9e52e3792 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:32:34 -0700 Subject: [PATCH 383/770] ppp: Split ppp_exit_net() to ->exit_rtnl(). ppp_exit_net() unregisters devices related to the netns under RTNL and destroys lists and IDR. Let's use ->exit_rtnl() for the device unregistration part to save RTNL dances for each netns. Note that we delegate the for_each_netdev_safe() part to default_device_exit_batch() and replace unregister_netdevice_queue() with ppp_nl_dellink() to align with bond, geneve, gtp, and pfcp. 
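
The shape of the change, as a rough userspace analogy (the names and
structures below are illustrative only, not the kernel API): the core
takes the big lock once and hands every exit hook a shared kill list
to append to, rather than each subsystem doing its own lock/unlock
round-trip.

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for RTNL */

  struct kill_list {
          const char *names[8];
          int n;
  };

  /* Each subsystem's exit hook only appends; it never takes the lock. */
  static void ppp_exit(struct kill_list *kl)  { kl->names[kl->n++] = "ppp0"; }
  static void pfcp_exit(struct kill_list *kl) { kl->names[kl->n++] = "pfcp0"; }

  static void core_cleanup(void)
  {
          struct kill_list kl = { .n = 0 };
          int i;

          pthread_mutex_lock(&big_lock);       /* one round-trip shared by all hooks */
          ppp_exit(&kl);
          pfcp_exit(&kl);
          for (i = 0; i < kl.n; i++)
                  printf("unregister %s\n", kl.names[i]);
          pthread_mutex_unlock(&big_lock);
  }

  int main(void)
  {
          core_cleanup();
          return 0;
  }

The real code gets the same effect by having the pernet core hold RTNL
across every ->exit_rtnl() callback and unregister the accumulated
dev_to_kill list in one batch.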
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418003259.48017-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_generic.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 53463767cc43e..def84e87e05b2 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1131,6 +1131,8 @@ static const struct file_operations ppp_device_fops = { .llseek = noop_llseek, }; +static void ppp_nl_dellink(struct net_device *dev, struct list_head *head); + static __net_init int ppp_init_net(struct net *net) { struct ppp_net *pn = net_generic(net, ppp_net_id); @@ -1146,28 +1148,20 @@ static __net_init int ppp_init_net(struct net *net) return 0; } -static __net_exit void ppp_exit_net(struct net *net) +static __net_exit void ppp_exit_rtnl_net(struct net *net, + struct list_head *dev_to_kill) { struct ppp_net *pn = net_generic(net, ppp_net_id); - struct net_device *dev; - struct net_device *aux; struct ppp *ppp; - LIST_HEAD(list); int id; - rtnl_lock(); - for_each_netdev_safe(net, dev, aux) { - if (dev->netdev_ops == &ppp_netdev_ops) - unregister_netdevice_queue(dev, &list); - } - idr_for_each_entry(&pn->units_idr, ppp, id) - /* Skip devices already unregistered by previous loop */ - if (!net_eq(dev_net(ppp->dev), net)) - unregister_netdevice_queue(ppp->dev, &list); + ppp_nl_dellink(ppp->dev, dev_to_kill); +} - unregister_netdevice_many(&list); - rtnl_unlock(); +static __net_exit void ppp_exit_net(struct net *net) +{ + struct ppp_net *pn = net_generic(net, ppp_net_id); mutex_destroy(&pn->all_ppp_mutex); idr_destroy(&pn->units_idr); @@ -1177,6 +1171,7 @@ static __net_exit void ppp_exit_net(struct net *net) static struct pernet_operations ppp_net_ops = { .init = ppp_init_net, + .exit_rtnl = ppp_exit_rtnl_net, .exit = ppp_exit_net, .id = &ppp_net_id, .size = sizeof(struct ppp_net), From f294516f1ff2fbf6fed6d0376f1ad096f8312879 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 21 Apr 2025 10:29:55 +0800 Subject: [PATCH 384/770] net: txgbe: Support to set UDP tunnel port Tunnel types VXLAN/VXLAN_GPE/GENEVE are supported for txgbe devices. The hardware supports to set only one port for each tunnel type. 
Signed-off-by: Jiawen Wu Link: https://patch.msgid.link/20250421022956.508018-2-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/wangxun/txgbe/txgbe_main.c | 36 +++++++++++++++++++ .../net/ethernet/wangxun/txgbe/txgbe_type.h | 3 ++ 2 files changed, 39 insertions(+) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index d40db6b422001..dcfccc6d963a5 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -537,6 +538,39 @@ void txgbe_do_reset(struct net_device *netdev) txgbe_reset(wx); } +static int txgbe_udp_tunnel_sync(struct net_device *dev, unsigned int table) +{ + struct wx *wx = netdev_priv(dev); + struct udp_tunnel_info ti; + + udp_tunnel_nic_get_port(dev, table, 0, &ti); + switch (ti.type) { + case UDP_TUNNEL_TYPE_VXLAN: + wr32(wx, TXGBE_CFG_VXLAN, ntohs(ti.port)); + break; + case UDP_TUNNEL_TYPE_VXLAN_GPE: + wr32(wx, TXGBE_CFG_VXLAN_GPE, ntohs(ti.port)); + break; + case UDP_TUNNEL_TYPE_GENEVE: + wr32(wx, TXGBE_CFG_GENEVE, ntohs(ti.port)); + break; + default: + break; + } + + return 0; +} + +static const struct udp_tunnel_nic_info txgbe_udp_tunnels = { + .sync_table = txgbe_udp_tunnel_sync, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .tables = { + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN_GPE, }, + { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, + }, +}; + static const struct net_device_ops txgbe_netdev_ops = { .ndo_open = txgbe_open, .ndo_stop = txgbe_close, @@ -632,6 +666,7 @@ static int txgbe_probe(struct pci_dev *pdev, wx->driver_name = txgbe_driver_name; txgbe_set_ethtool_ops(netdev); netdev->netdev_ops = &txgbe_netdev_ops; + netdev->udp_tunnel_nic_info = &txgbe_udp_tunnels; /* setup the private structure */ err = txgbe_sw_init(wx); @@ -677,6 +712,7 @@ static int txgbe_probe(struct pci_dev *pdev, netdev->features |= NETIF_F_HIGHDMA; netdev->hw_features |= NETIF_F_GRO; netdev->features |= NETIF_F_GRO; + netdev->features |= NETIF_F_RX_UDP_TUNNEL_PORT; netdev->priv_flags |= IFF_UNICAST_FLT; netdev->priv_flags |= IFF_SUPP_NOFCS; diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h index 5937cbc6bd057..cb553318641da 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_type.h @@ -88,6 +88,9 @@ /* Port cfg registers */ #define TXGBE_CFG_PORT_ST 0x14404 #define TXGBE_CFG_PORT_ST_LINK_UP BIT(0) +#define TXGBE_CFG_VXLAN 0x14410 +#define TXGBE_CFG_VXLAN_GPE 0x14414 +#define TXGBE_CFG_GENEVE 0x14418 /* I2C registers */ #define TXGBE_I2C_BASE 0x14900 From 3b05aa997c491487c8d4b0033cca76a15760a930 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Mon, 21 Apr 2025 10:29:56 +0800 Subject: [PATCH 385/770] net: wangxun: restrict feature flags for tunnel packets Implement ndo_features_check to restrict Tx checksum offload flags, since there are some inner layer length and protocols unsupported. 
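
A minimal sketch of the length test being added (userspace C; the
offsets are made-up example values): the driver measures the distance
between the outer transport header and the inner MAC header and, past
80 bytes, strips the checksum-offload bits so the stack falls back to
software checksumming.

  #include <stdbool.h>
  #include <stdio.h>

  #define WX_MAX_TUNNEL_HDR_LEN 80 /* limit from the patch */

  static bool tunnel_hdr_ok(unsigned int transport_off,
                            unsigned int inner_mac_off)
  {
          return inner_mac_off - transport_off <= WX_MAX_TUNNEL_HDR_LEN;
  }

  int main(void)
  {
          /* outer UDP at offset 34, inner Ethernet at 84: 50-byte tunnel header */
          printf("50-byte header:  %s\n", tunnel_hdr_ok(34, 84) ? "offload" : "sw csum");
          /* a 120-byte tunnel header exceeds the cap */
          printf("120-byte header: %s\n", tunnel_hdr_ok(34, 154) ? "offload" : "sw csum");
          return 0;
  }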
Signed-off-by: Jiawen Wu Reviewed-by: Michal Kubiak Link: https://patch.msgid.link/20250421022956.508018-3-jiawenwu@trustnetic.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 27 +++++++++++++++++++ drivers/net/ethernet/wangxun/libwx/wx_lib.h | 3 +++ drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 1 + .../net/ethernet/wangxun/txgbe/txgbe_main.c | 1 + 4 files changed, 32 insertions(+) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 18422b940dbed..2a808afeb4142 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -3000,6 +3000,33 @@ netdev_features_t wx_fix_features(struct net_device *netdev, } EXPORT_SYMBOL(wx_fix_features); +#define WX_MAX_TUNNEL_HDR_LEN 80 +netdev_features_t wx_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features) +{ + struct wx *wx = netdev_priv(netdev); + + if (!skb->encapsulation) + return features; + + if (wx->mac.type == wx_mac_em) + return features & ~NETIF_F_CSUM_MASK; + + if (unlikely(skb_inner_mac_header(skb) - skb_transport_header(skb) > + WX_MAX_TUNNEL_HDR_LEN)) + return features & ~NETIF_F_CSUM_MASK; + + if (skb->inner_protocol_type == ENCAP_TYPE_ETHER && + skb->inner_protocol != htons(ETH_P_IP) && + skb->inner_protocol != htons(ETH_P_IPV6) && + skb->inner_protocol != htons(ETH_P_TEB)) + return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); + + return features; +} +EXPORT_SYMBOL(wx_features_check); + void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring) { diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.h b/drivers/net/ethernet/wangxun/libwx/wx_lib.h index fdeb0c315b755..919f499993089 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.h +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.h @@ -33,6 +33,9 @@ void wx_get_stats64(struct net_device *netdev, int wx_set_features(struct net_device *netdev, netdev_features_t features); netdev_features_t wx_fix_features(struct net_device *netdev, netdev_features_t features); +netdev_features_t wx_features_check(struct sk_buff *skb, + struct net_device *netdev, + netdev_features_t features); void wx_set_ring(struct wx *wx, u32 new_tx_count, u32 new_rx_count, struct wx_ring *temp_ring); diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c index 9c18203c8ce1e..b5022c49dc5ee 100644 --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c @@ -587,6 +587,7 @@ static const struct net_device_ops ngbe_netdev_ops = { .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, .ndo_fix_features = wx_fix_features, + .ndo_features_check = wx_features_check, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index dcfccc6d963a5..f57d84628e071 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -579,6 +579,7 @@ static const struct net_device_ops txgbe_netdev_ops = { .ndo_set_rx_mode = wx_set_rx_mode, .ndo_set_features = wx_set_features, .ndo_fix_features = wx_fix_features, + .ndo_features_check = wx_features_check, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = wx_set_mac, .ndo_get_stats64 = wx_get_stats64, From 
0e0a7e3719bc8cbe6d6e30b3e81f21472ecba5bc Mon Sep 17 00:00:00 2001
From: Joshua Washington
Date: Mon, 21 Apr 2025 18:16:32 -0700
Subject: [PATCH 386/770] xdp: create locked/unlocked instances of xdp
 redirect target setters

Commit 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with
netdev->lock") introduces the netdev lock to xdp_set_features_flag().
The change includes a _locked version of the method, as it is possible
for a driver to have already acquired the netdev lock before calling
this helper. However, the same applies to
xdp_features_(set|clear)_redirect_target(), which end up calling the
unlocked version of xdp_set_features_flag(), leading to deadlocks in
GVE, which grabs the netdev lock as part of its suspend, reset, and
shutdown processes:

[ 833.265543] WARNING: possible recursive locking detected
[ 833.270949] 6.15.0-rc1 #6 Tainted: G E
[ 833.276271] --------------------------------------------
[ 833.281681] systemd-shutdow/1 is trying to acquire lock:
[ 833.287090] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: xdp_set_features_flag+0x29/0x90
[ 833.295470]
[ 833.295470] but task is already holding lock:
[ 833.301400] ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve]
[ 833.309508]
[ 833.309508] other info that might help us debug this:
[ 833.316130] Possible unsafe locking scenario:
[ 833.316130]
[ 833.322142] CPU0
[ 833.324681] ----
[ 833.327220] lock(&dev->lock);
[ 833.330455] lock(&dev->lock);
[ 833.333689]
[ 833.333689] *** DEADLOCK ***
[ 833.333689]
[ 833.339701] May be due to missing lock nesting notation
[ 833.339701]
[ 833.346582] 5 locks held by systemd-shutdow/1:
[ 833.351205] #0: ffffffffa9c89130 (system_transition_mutex){+.+.}-{4:4}, at: __se_sys_reboot+0xe6/0x210
[ 833.360695] #1: ffff93b399e5c1b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xb4/0x1f0
[ 833.369144] #2: ffff949d19a471b8 (&dev->mutex){....}-{4:4}, at: device_shutdown+0xc2/0x1f0
[ 833.377603] #3: ffffffffa9eca050 (rtnl_mutex){+.+.}-{4:4}, at: gve_shutdown+0x33/0x90 [gve]
[ 833.386138] #4: ffff949d2b148c68 (&dev->lock){+.+.}-{4:4}, at: gve_shutdown+0x44/0x90 [gve]

Introduce xdp_features_(set|clear)_redirect_target_locked() versions
which assume that the netdev lock has already been acquired before
setting the XDP feature flag and update GVE to use the locked version.
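
The fix follows the usual _locked/unlocked split. A minimal userspace
rendering of the pattern (a pthread mutex stands in for the netdev
lock; all names are illustrative):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
  static unsigned int xdp_features;

  /* Caller already holds dev_lock; taking it again would self-deadlock. */
  static void set_redirect_target_locked(unsigned int bits)
  {
          xdp_features |= bits;
  }

  /* Wrapper for callers that do not hold the lock. */
  static void set_redirect_target(unsigned int bits)
  {
          pthread_mutex_lock(&dev_lock);
          set_redirect_target_locked(bits);
          pthread_mutex_unlock(&dev_lock);
  }

  int main(void)
  {
          set_redirect_target(0x1);            /* ordinary caller */

          pthread_mutex_lock(&dev_lock);       /* e.g. a shutdown path */
          set_redirect_target_locked(0x2);     /* must use the _locked form */
          pthread_mutex_unlock(&dev_lock);

          printf("features=0x%x\n", xdp_features);
          return 0;
  }

gve_turndown() and gve_turnup() run with the netdev lock already held,
which is why they switch to the _locked variants below.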
Fixes: 03df156dd3a6 ("xdp: double protect netdev->xdp_flags with netdev->lock") Tested-by: Mina Almasry Reviewed-by: Willem de Bruijn Reviewed-by: Harshitha Ramamurthy Signed-off-by: Joshua Washington Acked-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/20250422011643.3509287-1-joshwash@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 4 ++-- include/net/xdp.h | 3 +++ net/core/xdp.c | 25 ++++++++++++++++++---- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8aaac91013777..446e4b6fd3f17 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -1830,7 +1830,7 @@ static void gve_turndown(struct gve_priv *priv) /* Stop tx queues */ netif_tx_disable(priv->dev); - xdp_features_clear_redirect_target(priv->dev); + xdp_features_clear_redirect_target_locked(priv->dev); gve_clear_napi_enabled(priv); gve_clear_report_stats(priv); @@ -1902,7 +1902,7 @@ static void gve_turnup(struct gve_priv *priv) } if (priv->tx_cfg.num_xdp_queues && gve_supports_xdp_xmit(priv)) - xdp_features_set_redirect_target(priv->dev, false); + xdp_features_set_redirect_target_locked(priv->dev, false); gve_set_napi_enabled(priv); } diff --git a/include/net/xdp.h b/include/net/xdp.h index 20e41b5ff319a..b40f1f96cb117 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -618,7 +618,10 @@ bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/net/core/xdp.c b/net/core/xdp.c index a014df88620cb..ea819764ae39a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -1014,21 +1014,38 @@ void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) } EXPORT_SYMBOL_GPL(xdp_set_features_flag); -void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg) { xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT); if (support_sg) val |= NETDEV_XDP_ACT_NDO_XMIT_SG; - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked); + +void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) +{ + netdev_lock(dev); + xdp_features_set_redirect_target_locked(dev, support_sg); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target); -void xdp_features_clear_redirect_target(struct net_device *dev) +void xdp_features_clear_redirect_target_locked(struct net_device *dev) { xdp_features_t val = dev->xdp_features; val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG); - xdp_set_features_flag(dev, val); + xdp_set_features_flag_locked(dev, val); +} +EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked); + +void xdp_features_clear_redirect_target(struct net_device *dev) +{ + 
netdev_lock(dev); + xdp_features_clear_redirect_target_locked(dev); + netdev_unlock(dev); } EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target); From be3f1938d3e6ea8186f0de3dd95245dda4f22c1e Mon Sep 17 00:00:00 2001 From: Dave Chen Date: Tue, 15 Apr 2025 14:33:42 +0800 Subject: [PATCH 387/770] btrfs: fix COW handling in run_delalloc_nocow() In run_delalloc_nocow(), when the found btrfs_key's offset > cur_offset, it indicates a gap between the current processing region and the next file extent. The original code would directly jump to the "must_cow" label, which increments the slot and forces a fallback to COW. This behavior might skip an extent item and result in an overestimated COW fallback range. This patch modifies the logic so that when a gap is detected: - If no COW range is already being recorded (cow_start is unset), cow_start is set to cur_offset. - cur_offset is then advanced to the beginning of the next extent. - Instead of jumping to "must_cow", control flows directly to "next_slot" so that the same extent item can be reexamined properly. The change ensures that we accurately account for the extent gap and avoid accidentally extending the range that needs to fallback to COW. CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Filipe Manana Signed-off-by: Dave Chen Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6eedfbfce1cba..312fa996a9871 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2129,12 +2129,13 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* From 48c1d1bb525b1c44b8bdc8e7ec5629cb6c2b9fc4 Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Mon, 21 Apr 2025 08:40:29 -0700 Subject: [PATCH 388/770] btrfs: fix the inode leak in btrfs_iget() [BUG] There is a bug report that a syzbot reproducer can lead to the following busy inode at unmount time: BTRFS info (device loop1): last unmount of filesystem 1680000e-3c1e-4c46-84b6-56bd3909af50 VFS: Busy inodes after unmount of loop1 (btrfs) ------------[ cut here ]------------ kernel BUG at fs/super.c:650! 
Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI CPU: 0 UID: 0 PID: 48168 Comm: syz-executor Not tainted 6.15.0-rc2-00471-g119009db2674 #2 PREEMPT(full) Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:generic_shutdown_super+0x2e9/0x390 fs/super.c:650 Call Trace: kill_anon_super+0x3a/0x60 fs/super.c:1237 btrfs_kill_super+0x3b/0x50 fs/btrfs/super.c:2099 deactivate_locked_super+0xbe/0x1a0 fs/super.c:473 deactivate_super fs/super.c:506 [inline] deactivate_super+0xe2/0x100 fs/super.c:502 cleanup_mnt+0x21f/0x440 fs/namespace.c:1435 task_work_run+0x14d/0x240 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop kernel/entry/common.c:114 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x269/0x290 kernel/entry/common.c:218 do_syscall_64+0xd4/0x250 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f [CAUSE] When btrfs_alloc_path() failed, btrfs_iget() directly returned without releasing the inode already allocated by btrfs_iget_locked(). This results the above busy inode and trigger the kernel BUG. [FIX] Fix it by calling iget_failed() if btrfs_alloc_path() failed. If we hit error inside btrfs_read_locked_inode(), it will properly call iget_failed(), so nothing to worry about. Although the iget_failed() cleanup inside btrfs_read_locked_inode() is a break of the normal error handling scheme, let's fix the obvious bug and backport first, then rework the error handling later. Reported-by: Penglei Jiang Link: https://lore.kernel.org/linux-btrfs/20250421102425.44431-1-superman.xpt@gmail.com/ Fixes: 7c855e16ab72 ("btrfs: remove conditional path allocation in btrfs_read_locked_inode()") CC: stable@vger.kernel.org # 6.13+ Reviewed-by: Qu Wenruo Signed-off-by: Penglei Jiang Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 312fa996a9871..d295a37fa0491 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5682,8 +5682,10 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); From e08e49d986f82c30f42ad0ed43ebbede1e1e3739 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Apr 2025 14:51:58 -0400 Subject: [PATCH 389/770] btrfs: adjust subpage bit start based on sectorsize When running machines with 64k page size and a 16k nodesize we started seeing tree log corruption in production. This turned out to be because we were not writing out dirty blocks sometimes, so this in fact affects all metadata writes. When writing out a subpage EB we scan the subpage bitmap for a dirty range. If the range isn't dirty we do bit_start++; to move onto the next bit. The problem is the bitmap is based on the number of sectors that an EB has. So in this case, we have a 64k pagesize, 16k nodesize, but a 4k sectorsize. This means our bitmap is 4 bits for every node. With a 64k page size we end up with 4 nodes per page. 
To make this easier this is how everything looks [0 16k 32k 48k ] logical address [0 4 8 12 ] radix tree offset [ 64k page ] folio [ 16k eb ][ 16k eb ][ 16k eb ][ 16k eb ] extent buffers [ | | | | | | | | | | | | | | | | ] bitmap Now we use all of our addressing based on fs_info->sectorsize_bits, so as you can see the above our 16k eb->start turns into radix entry 4. When we find a dirty range for our eb, we correctly do bit_start += sectors_per_node, because if we start at bit 0, the next bit for the next eb is 4, to correspond to eb->start 16k. However if our range is clean, we will do bit_start++, which will now put us offset from our radix tree entries. In our case, assume that the first time we check the bitmap the block is not dirty, we increment bit_start so now it == 1, and then we loop around and check again. This time it is dirty, and we go to find that start using the following equation start = folio_start + bit_start * fs_info->sectorsize; so in the case above, eb->start 0 is now dirty, and we calculate start as 0 + 1 * fs_info->sectorsize = 4096 4096 >> 12 = 1 Now we're looking up the radix tree for 1, and we won't find an eb. What's worse is now we're using bit_start == 1, so we do bit_start += sectors_per_node, which is now 5. If that eb is dirty we will run into the same thing, we will look at an offset that is not populated in the radix tree, and now we're skipping the writeout of dirty extent buffers. The best fix for this is to not use sectorsize_bits to address nodes, but that's a larger change. Since this is a fs corruption problem fix it simply by always using sectors_per_node to increment the start bit. Fixes: c4aec299fa8f ("btrfs: introduce submit_eb_subpage() to submit a subpage metadata page") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 197f5e51c4744..8515c31f563b8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2047,7 +2047,7 @@ static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&folio->mapping->i_private_lock); - bit_start++; + bit_start += sectors_per_node; continue; } From 12df9ec3e1955aed6a0c839f2375cd8e5d5150cf Mon Sep 17 00:00:00 2001 From: Saranya Gopal Date: Mon, 21 Apr 2025 09:43:32 +0530 Subject: [PATCH 390/770] platform/x86/intel: hid: Add Pantherlake support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Pantherlake ACPI device ID to the Intel HID driver. While there, clean up the device ID table to remove the ", 0" parts. 
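
The cleanup leans on C zero-initialization: omitted initializers are
zero, so { "INT33D5" } is identical to { "INT33D5", 0 }, and an empty
{ } entry doubles as the terminating sentinel. A standalone sketch
(struct acpi_id is a stand-in for the real struct acpi_device_id):

  #include <stdio.h>

  struct acpi_id {
          const char *hid;
          unsigned long driver_data;
  };

  static const struct acpi_id ids[] = {
          { "INT33D5" },   /* driver_data implicitly 0 */
          { "INTC10CB" },
          { "INTC10CC" },  /* the new Pantherlake ID */
          { }              /* sentinel: hid == NULL ends the walk */
  };

  int main(void)
  {
          const struct acpi_id *id;

          for (id = ids; id->hid; id++)
                  printf("%s (data=%lu)\n", id->hid, id->driver_data);
          return 0;
  }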
Suggested-by: Andy Shevchenko Signed-off-by: Saranya Gopal Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250421041332.830136-1-saranya.gopal@intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/hid.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/platform/x86/intel/hid.c b/drivers/platform/x86/intel/hid.c index 88a1a9ff2f344..0b5e43444ed60 100644 --- a/drivers/platform/x86/intel/hid.c +++ b/drivers/platform/x86/intel/hid.c @@ -44,16 +44,17 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alex Hung"); static const struct acpi_device_id intel_hid_ids[] = { - {"INT33D5", 0}, - {"INTC1051", 0}, - {"INTC1054", 0}, - {"INTC1070", 0}, - {"INTC1076", 0}, - {"INTC1077", 0}, - {"INTC1078", 0}, - {"INTC107B", 0}, - {"INTC10CB", 0}, - {"", 0}, + { "INT33D5" }, + { "INTC1051" }, + { "INTC1054" }, + { "INTC1070" }, + { "INTC1076" }, + { "INTC1077" }, + { "INTC1078" }, + { "INTC107B" }, + { "INTC10CB" }, + { "INTC10CC" }, + { } }; MODULE_DEVICE_TABLE(acpi, intel_hid_ids); From 246f9bb62016c423972ea7f2335a8e0ed3521cde Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Sat, 19 Apr 2025 12:45:29 -0300 Subject: [PATCH 391/770] platform/x86: alienware-wmi-wmax: Add support for Alienware m15 R7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend thermal control support to Alienware m15 R7. Cc: stable@vger.kernel.org Tested-by: Romain THERY Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250419-m15-r7-v1-1-18c6eaa27e25@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/dell/alienware-wmi-wmax.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/platform/x86/dell/alienware-wmi-wmax.c b/drivers/platform/x86/dell/alienware-wmi-wmax.c index 3f9e1e986ecf0..08b82c151e071 100644 --- a/drivers/platform/x86/dell/alienware-wmi-wmax.c +++ b/drivers/platform/x86/dell/alienware-wmi-wmax.c @@ -69,6 +69,14 @@ static const struct dmi_system_id awcc_dmi_table[] __initconst = { }, .driver_data = &generic_quirks, }, + { + .ident = "Alienware m15 R7", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Alienware"), + DMI_MATCH(DMI_PRODUCT_NAME, "Alienware m15 R7"), + }, + .driver_data = &generic_quirks, + }, { .ident = "Alienware m16 R1", .matches = { From 77bdac73754e4c0c564c1ca80fe3d9c93b0e715a Mon Sep 17 00:00:00 2001 From: Pavel Nikulin Date: Fri, 18 Apr 2025 20:06:08 +0600 Subject: [PATCH 392/770] platform/x86: asus-wmi: Disable OOBE state after resume from hibernation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASUS firmware resets OOBE state during S4 suspend, so the keyboard blinks during resume from hibernation. This patch disables OOBE state after resume from hibernation. 
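
In outline (a userspace toy, not the driver code): the probe path
caches whether the OOBE device exists, and the restore hook re-applies
the same disable that probe did, since S4 wiped it.

  #include <stdbool.h>
  #include <stdio.h>

  static bool oobe_state_available; /* cached at probe */
  static bool oobe_disabled;        /* firmware-backed state */

  static void disable_oobe(void)   { oobe_disabled = true; }

  static void probe(void)
  {
          oobe_state_available = true; /* device present on this model */
          if (oobe_state_available)
                  disable_oobe();
  }

  static void s4_hibernate(void)   { oobe_disabled = false; } /* firmware reset */

  static void restore(void)        /* resume-from-hibernation hook */
  {
          if (oobe_state_available)
                  disable_oobe();  /* same call the probe path makes */
  }

  int main(void)
  {
          probe();
          s4_hibernate();
          restore();
          printf("OOBE disabled after resume: %s\n", oobe_disabled ? "yes" : "no");
          return 0;
  }

The driver-side version caches the flag in asus->oobe_state_available
at probe time and re-issues asus_wmi_set_devstate() from
asus_hotk_restore(), as the diff below shows.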
Signed-off-by: Pavel Nikulin Link: https://lore.kernel.org/r/20250418140706.1691-1-pavel@noa-labs.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/asus-wmi.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 38ef778e8c19b..0c697b46f436a 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -304,6 +304,7 @@ struct asus_wmi { u32 kbd_rgb_dev; bool kbd_rgb_state_available; + bool oobe_state_available; u8 throttle_thermal_policy_mode; u32 throttle_thermal_policy_dev; @@ -1826,7 +1827,7 @@ static int asus_wmi_led_init(struct asus_wmi *asus) goto error; } - if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_OOBE)) { + if (asus->oobe_state_available) { /* * Disable OOBE state, so that e.g. the keyboard backlight * works. @@ -4723,6 +4724,7 @@ static int asus_wmi_add(struct platform_device *pdev) asus->egpu_enable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU); asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU); asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE); + asus->oobe_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_OOBE); asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE) && dmi_check_system(asus_ally_mcu_quirk); @@ -4970,6 +4972,13 @@ static int asus_hotk_restore(struct device *device) } if (!IS_ERR_OR_NULL(asus->kbd_led.dev)) kbd_led_update(asus); + if (asus->oobe_state_available) { + /* + * Disable OOBE state, so that e.g. the keyboard backlight + * works. + */ + asus_wmi_set_devstate(ASUS_WMI_DEVID_OOBE, 1, NULL); + } if (asus_wmi_has_fnlock_key(asus)) asus_wmi_fnlock_update(asus); From 02c6e43397c39edd0c172859bf8c851b46be09a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C5=A1per=20Nemgar?= Date: Fri, 18 Apr 2025 09:07:38 +0200 Subject: [PATCH 393/770] platform/x86: ideapad-laptop: add support for some new buttons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add entries to unsupported WMI codes in ideapad_keymap[] and one check for WMI code 0x13d to trigger platform_profile_cycle(). 
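
A compact sketch of the dispatch (userspace C; the 0x100 value of
IDEAPAD_WMI_KEY is an assumption for illustration, not taken from the
patch): the raw event 0x3d is intercepted to cycle the platform
profile before the sparse keymap would report it.

  #include <stdio.h>

  #define IDEAPAD_WMI_KEY 0x100 /* assumed value of the WMI scancode flag */

  struct key_entry {
          unsigned int code;
          const char *key;
  };

  static const struct key_entry keymap[] = {
          { 0x44 | IDEAPAD_WMI_KEY, "KEY_PROG1" }, /* Star key */
          { 0x3d | IDEAPAD_WMI_KEY, "KEY_PROG4" }, /* performance toggle */
          { 0 }
  };

  static void handle_wmi_event(unsigned long value)
  {
          const struct key_entry *e;

          if (value == 0x3d) { /* real driver also requires priv->dytc */
                  printf("platform_profile_cycle()\n");
                  return;
          }
          for (e = keymap; e->code; e++)
                  if (e->code == (value | IDEAPAD_WMI_KEY))
                          printf("report %s\n", e->key);
  }

  int main(void)
  {
          handle_wmi_event(0x3d); /* cycles the profile */
          handle_wmi_event(0x44); /* reports KEY_PROG1 */
          return 0;
  }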
Signed-off-by: Gašper Nemgar Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/20250418070738.7171-1-gasper.nemgar@gmail.com [ij: joined nested if ()s & major tweaks to changelog] Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/ideapad-laptop.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 17a09b7784edb..ede483573fe0d 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -1294,6 +1294,16 @@ static const struct key_entry ideapad_keymap[] = { /* Specific to some newer models */ { KE_KEY, 0x3e | IDEAPAD_WMI_KEY, { KEY_MICMUTE } }, { KE_KEY, 0x3f | IDEAPAD_WMI_KEY, { KEY_RFKILL } }, + /* Star- (User Assignable Key) */ + { KE_KEY, 0x44 | IDEAPAD_WMI_KEY, { KEY_PROG1 } }, + /* Eye */ + { KE_KEY, 0x45 | IDEAPAD_WMI_KEY, { KEY_PROG3 } }, + /* Performance toggle also Fn+Q, handled inside ideapad_wmi_notify() */ + { KE_KEY, 0x3d | IDEAPAD_WMI_KEY, { KEY_PROG4 } }, + /* shift + prtsc */ + { KE_KEY, 0x2d | IDEAPAD_WMI_KEY, { KEY_CUT } }, + { KE_KEY, 0x29 | IDEAPAD_WMI_KEY, { KEY_TOUCHPAD_TOGGLE } }, + { KE_KEY, 0x2a | IDEAPAD_WMI_KEY, { KEY_ROOT_MENU } }, { KE_END }, }; @@ -2080,6 +2090,12 @@ static void ideapad_wmi_notify(struct wmi_device *wdev, union acpi_object *data) dev_dbg(&wdev->dev, "WMI fn-key event: 0x%llx\n", data->integer.value); + /* performance button triggered by 0x3d */ + if (data->integer.value == 0x3d && priv->dytc) { + platform_profile_cycle(); + break; + } + /* 0x02 FnLock, 0x03 Esc */ if (data->integer.value == 0x02 || data->integer.value == 0x03) ideapad_fn_lock_led_notify(priv, data->integer.value == 0x02); From a3d8f0a7f5e8b193db509c7191fefeed3533fc44 Mon Sep 17 00:00:00 2001 From: LongPing Wei Date: Thu, 17 Apr 2025 11:07:38 +0800 Subject: [PATCH 394/770] dm-bufio: don't schedule in atomic context A BUG was reported as below when CONFIG_DEBUG_ATOMIC_SLEEP and try_verify_in_tasklet are enabled. [ 129.444685][ T934] BUG: sleeping function called from invalid context at drivers/md/dm-bufio.c:2421 [ 129.444723][ T934] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 934, name: kworker/1:4 [ 129.444740][ T934] preempt_count: 201, expected: 0 [ 129.444756][ T934] RCU nest depth: 0, expected: 0 [ 129.444781][ T934] Preemption disabled at: [ 129.444789][ T934] [] shrink_work+0x21c/0x248 [ 129.445167][ T934] kernel BUG at kernel/sched/walt/walt_debug.c:16! [ 129.445183][ T934] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP [ 129.445204][ T934] Skip md ftrace buffer dump for: 0x1609e0 [ 129.447348][ T934] CPU: 1 PID: 934 Comm: kworker/1:4 Tainted: G W OE 6.6.56-android15-8-o-g6f82312b30b9-debug #1 1400000003000000474e5500b3187743670464e8 [ 129.447362][ T934] Hardware name: Qualcomm Technologies, Inc. 
Parrot QRD, Alpha-M (DT)
[ 129.447373][ T934] Workqueue: dm_bufio_cache shrink_work
[ 129.447394][ T934] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 129.447406][ T934] pc : android_rvh_schedule_bug+0x0/0x8 [sched_walt_debug]
[ 129.447435][ T934] lr : __traceiter_android_rvh_schedule_bug+0x44/0x6c
[ 129.447451][ T934] sp : ffffffc0843dbc90
[ 129.447459][ T934] x29: ffffffc0843dbc90 x28: ffffffffffffffff x27: 0000000000000c8b
[ 129.447479][ T934] x26: 0000000000000040 x25: ffffff804b3d6260 x24: ffffffd816232b68
[ 129.447497][ T934] x23: ffffff805171c5b4 x22: 0000000000000000 x21: ffffffd816231900
[ 129.447517][ T934] x20: ffffff80306ba898 x19: 0000000000000000 x18: ffffffc084159030
[ 129.447535][ T934] x17: 00000000d2b5dd1f x16: 00000000d2b5dd1f x15: ffffffd816720358
[ 129.447554][ T934] x14: 0000000000000004 x13: ffffff89ef978000 x12: 0000000000000003
[ 129.447572][ T934] x11: ffffffd817a823c4 x10: 0000000000000202 x9 : 7e779c5735de9400
[ 129.447591][ T934] x8 : ffffffd81560d004 x7 : 205b5d3938373434 x6 : ffffffd8167397c8
[ 129.447610][ T934] x5 : 0000000000000000 x4 : 0000000000000001 x3 : ffffffc0843db9e0
[ 129.447629][ T934] x2 : 0000000000002f15 x1 : 0000000000000000 x0 : 0000000000000000
[ 129.447647][ T934] Call trace:
[ 129.447655][ T934] android_rvh_schedule_bug+0x0/0x8 [sched_walt_debug 1400000003000000474e550080cce8a8a78606b6]
[ 129.447681][ T934] __might_resched+0x190/0x1a8
[ 129.447694][ T934] shrink_work+0x180/0x248
[ 129.447706][ T934] process_one_work+0x260/0x624
[ 129.447718][ T934] worker_thread+0x28c/0x454
[ 129.447729][ T934] kthread+0x118/0x158
[ 129.447742][ T934] ret_from_fork+0x10/0x20
[ 129.447761][ T934] Code: ???????? ???????? ???????? d2b5dd1f (d4210000)
[ 129.447772][ T934] ---[ end trace 0000000000000000 ]---

dm_bufio_lock will call spin_lock_bh when try_verify_in_tasklet is
enabled, and __scan will be called in atomic context.

Fixes: 7cd326747f46 ("dm bufio: remove dm_bufio_cond_resched()")
Signed-off-by: LongPing Wei
Cc: stable@vger.kernel.org
Signed-off-by: Mikulas Patocka
---
 drivers/md/dm-bufio.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 9c8ed65cd87e6..f0b5a6931161a 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -68,6 +68,8 @@
 #define LIST_DIRTY 1
 #define LIST_SIZE 2

+#define SCAN_RESCHED_CYCLE 16
+
 /*--------------------------------------------------------------*/

 /*
@@ -2424,7 +2426,12 @@ static void __scan(struct dm_bufio_client *c)
 			atomic_long_dec(&c->need_shrink);
 			freed++;
-			cond_resched();
+
+			if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) {
+				dm_bufio_unlock(c);
+				cond_resched();
+				dm_bufio_lock(c);
+			}
 		}
 	}
 }

From 0a533c3e4246c29d502a7e0fba0e86d80a906b04 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Tue, 22 Apr 2025 21:18:33 +0200
Subject: [PATCH 395/770] dm-integrity: fix a warning on invalid table line

If we use the 'B' mode and we have an invalid table line,
cancel_delayed_work_sync() would trigger a warning. This commit avoids
the warning.
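
The shape of the fix, as a freestanding sketch (toy types, not the
kernel's): when the constructor bails out on a bad table line before
INIT_DELAYED_WORK() ever runs, the work's function pointer is still
NULL, and the destructor can key the cancel on that.

  #include <stdio.h>

  struct delayed_work_toy {
          void (*func)(void);
  };

  static void bitmap_flush(void) { }

  int main(void)
  {
          struct delayed_work_toy bitmap_flush_work = { 0 };
          char mode = 'B';

          /* ctr failed early: the work was never initialized */
          if (mode == 'B' && bitmap_flush_work.func)
                  printf("cancel_delayed_work_sync()\n");
          else
                  printf("skip cancel: work never initialized\n");

          /* normal teardown after a successful ctr */
          bitmap_flush_work.func = bitmap_flush;
          if (mode == 'B' && bitmap_flush_work.func)
                  printf("cancel_delayed_work_sync()\n");
          return 0;
  }

INIT_DELAYED_WORK() is what populates ->work.func, so a NULL pointer
is a reliable "never initialized" marker for the destructor to test.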
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2a283feb3319c..cc3d3897ef428 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -5164,7 +5164,7 @@ static void dm_integrity_dtr(struct dm_target *ti) BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); BUG_ON(!list_empty(&ic->wait_list)); - if (ic->mode == 'B') + if (ic->mode == 'B' && ic->bitmap_flush_work.work.func) cancel_delayed_work_sync(&ic->bitmap_flush_work); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); From 4b30ae9adb047dd0a7982975ec3933c529537026 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Thu, 17 Apr 2025 15:43:12 +0200 Subject: [PATCH 396/770] net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions When a bridge port STP state is changed from BLOCKING/DISABLED to FORWARDING, the port's igmp query timer will NOT re-arm itself if the bridge has been configured as per-VLAN multicast snooping. Solve this by choosing the correct multicast context(s) to enable/disable port multicast based on whether per-VLAN multicast snooping is enabled or not, i.e. using per-{port, VLAN} context in case of per-VLAN multicast snooping by re-implementing br_multicast_enable_port() and br_multicast_disable_port() functions. Before the patch, the IGMP query does not happen in the last step of the following test sequence, i.e. no growth for tx counter: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # bridge link set dev swp1 state 0 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge link set dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 After the patch, the IGMP query happens in the last step of the test: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1 # bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0 # ip link add name swp1 up master br1 type dummy # bridge link set dev swp1 state 0 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # sleep 1 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 1 # bridge link set dev swp1 state 3 # sleep 2 # ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]' 3 Signed-off-by: Yong Wang Reviewed-by: Andy Roulin Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- net/bridge/br_multicast.c | 77 +++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index dcbf058de1e3b..ce07fda6a8483 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2105,12 +2105,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) } } -void br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); - __br_multicast_enable_port_ctx(&port->multicast_ctx); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } @@ -2137,11 +2142,67 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) br_multicast_rport_del_notify(pmctx, del); } +static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) +{ + struct net_bridge *br = pmctx->port->br; + + spin_lock_bh(&br->multicast_lock); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + + __br_multicast_disable_port_ctx(pmctx); + spin_unlock_bh(&br->multicast_lock); +} + +static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + rcu_read_lock(); + vg = nbp_vlan_group_rcu(port); + if (!vg) { + rcu_read_unlock(); + return; + } + + /* iterate each vlan, toggle vlan multicast context */ + list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast_port *pmctx = + &vlan->port_mcast_ctx; + u8 state = br_vlan_get_state(vlan); + /* enable vlan multicast context when state is + * LEARNING or FORWARDING + */ + if (on && br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(pmctx); + else + br_multicast_disable_port_ctx(pmctx); + } + rcu_read_unlock(); + return; + } +#endif + /* toggle port multicast context when vlan snooping is disabled */ + if (on) + br_multicast_enable_port_ctx(&port->multicast_ctx); + else + br_multicast_disable_port_ctx(&port->multicast_ctx); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + br_multicast_toggle_port(port, true); +} + void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock_bh(&port->br->multicast_lock); - __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock_bh(&port->br->multicast_lock); + br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -4304,9 +4365,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) - br_multicast_disable_port(p); + br_multicast_disable_port_ctx(&p->multicast_ctx); else - br_multicast_enable_port(p); + br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) From 6c131043eaf1be2a6cc2d228f92ceb626fbcc0f3 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Thu, 17 Apr 2025 15:43:13 +0200 Subject: [PATCH 397/770] 
net: bridge: mcast: update multicast context when vlan state is changed

When the vlan STP state is changed, which could be manipulated by
"bridge vlan" commands, similar to port STP state, this also impacts
multicast behaviors such as igmp query. In the scenario of per-VLAN
snooping, there's a need to update the corresponding multicast context
to re-arm the port query timer when vlan state becomes "forwarding"
etc.

Update br_vlan_set_state() function to enable vlan multicast context
in such a scenario.

Before the patch, the IGMP query does not happen in the last step of
the following test sequence, i.e. no growth for tx counter:

# ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1
# bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0
# ip link add name swp1 up master br1 type dummy
# sleep 1
# bridge vlan set vid 1 dev swp1 state 4
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
1
# sleep 1
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
1
# bridge vlan set vid 1 dev swp1 state 3
# sleep 2
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
1

After the patch, the IGMP query happens in the last step of the test:

# ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 mcast_querier 1 mcast_stats_enabled 1
# bridge vlan global set vid 1 dev br1 mcast_snooping 1 mcast_querier 1 mcast_query_interval 100 mcast_startup_query_count 0
# ip link add name swp1 up master br1 type dummy
# sleep 1
# bridge vlan set vid 1 dev swp1 state 4
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
1
# sleep 1
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
1
# bridge vlan set vid 1 dev swp1 state 3
# sleep 2
# ip -j -p stats show dev swp1 group xstats_slave subgroup bridge suite mcast | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]'
3

Signed-off-by: Yong Wang
Reviewed-by: Andy Roulin
Reviewed-by: Ido Schimmel
Signed-off-by: Petr Machata
Acked-by: Nikolay Aleksandrov
Signed-off-by: David S.
Miller --- net/bridge/br_mst.c | 4 ++-- net/bridge/br_multicast.c | 26 ++++++++++++++++++++++++++ net/bridge/br_private.h | 11 ++++++++++- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1820f09ff59ce..3f24b4ee49c27 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, if (br_vlan_get_state(v) == state) return; - br_vlan_set_state(v, state); - if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ce07fda6a8483..7e0b2362b9ee5 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4272,6 +4272,32 @@ static void __br_multicast_stop(struct net_bridge_mcast *brmctx) #endif } +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + struct net_bridge *br; + + if (!br_vlan_should_use(v)) + return; + + if (br_vlan_is_master(v)) + return; + + br = v->port->br; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + if (br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(&v->port_mcast_ctx); + + /* Multicast is not disabled for the vlan when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. + */ +#endif +} + void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { struct net_bridge *br; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 71f351a6ce1bb..db1bddb330ffe 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1055,6 +1055,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); @@ -1521,6 +1522,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm { } +static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, + u8 state) +{ +} + static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { @@ -1881,7 +1887,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); -/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access, + * while br_vlan_set_state() may access data protected by multicast_lock. 
+ */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); @@ -1890,6 +1898,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); + br_multicast_update_vlan_mcast_ctx(v, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) From aea45363e29dd16050e6ce333ce0d3696ac3b5a9 Mon Sep 17 00:00:00 2001 From: Yong Wang Date: Thu, 17 Apr 2025 15:43:14 +0200 Subject: [PATCH 398/770] selftests: net/bridge : add tests for per vlan snooping with stp state changes Change ALL_TESTS definition to "test-per-line". Add the test case of per vlan snooping with port stp state change to forwarding and also vlan equivalent case in both bridge_igmp.sh and bridge_mld.sh. Signed-off-by: Yong Wang Reviewed-by: Andy Roulin Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- .../selftests/net/forwarding/bridge_igmp.sh | 80 +++++++++++++++++- .../selftests/net/forwarding/bridge_mld.sh | 81 ++++++++++++++++++- tools/testing/selftests/net/forwarding/config | 1 + 3 files changed, 154 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh index e6a3e04fd83f3..d4e7dd6593548 100755 --- a/tools/testing/selftests/net/forwarding/bridge_igmp.sh +++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh @@ -1,10 +1,24 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="v2reportleave_test v3include_test v3inc_allow_test v3inc_is_include_test \ - v3inc_is_exclude_test v3inc_to_exclude_test v3exc_allow_test v3exc_is_include_test \ - v3exc_is_exclude_test v3exc_to_exclude_test v3inc_block_test v3exc_block_test \ - v3exc_timeout_test v3star_ex_auto_add_test" +ALL_TESTS=" + v2reportleave_test + v3include_test + v3inc_allow_test + v3inc_is_include_test + v3inc_is_exclude_test + v3inc_to_exclude_test + v3exc_allow_test + v3exc_is_include_test + v3exc_is_exclude_test + v3exc_to_exclude_test + v3inc_block_test + v3exc_block_test + v3exc_timeout_test + v3star_ex_auto_add_test + v2per_vlan_snooping_port_stp_test + v2per_vlan_snooping_vlan_stp_test +" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="239.10.10.10" @@ -554,6 +568,64 @@ v3star_ex_auto_add_test() v3cleanup $swp2 $TEST_GROUP } +v2per_vlan_snooping_stp_test() +{ + local is_port=$1 + + local msg="port" + [[ $is_port -ne 1 ]] && msg="vlan" + + ip link set br0 up type bridge vlan_filtering 1 \ + mcast_igmp_version 2 \ + mcast_snooping 1 \ + mcast_vlan_snooping 1 \ + mcast_querier 1 \ + mcast_stats_enabled 1 + bridge vlan global set vid 1 dev br0 \ + mcast_snooping 1 \ + mcast_querier 1 \ + mcast_query_interval 100 \ + mcast_startup_query_count 0 + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4 + sleep 5 + local tx_s=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]') + + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3 + sleep 5 + local tx_e=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["igmp_queries"]["tx_v2"]') + + RET=0 + local tx=$(expr $tx_e - $tx_s) + test $tx -gt 0 + check_err $? 
"No IGMP queries after STP state becomes forwarding" + log_test "per vlan snooping with $msg stp state change" + + # restore settings + bridge vlan global set vid 1 dev br0 \ + mcast_querier 0 \ + mcast_query_interval 12500 \ + mcast_startup_query_count 2 + ip link set br0 up type bridge vlan_filtering 0 \ + mcast_vlan_snooping 0 \ + mcast_stats_enabled 0 +} + +v2per_vlan_snooping_port_stp_test() +{ + v2per_vlan_snooping_stp_test 1 +} + +v2per_vlan_snooping_vlan_stp_test() +{ + v2per_vlan_snooping_stp_test 0 +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/bridge_mld.sh b/tools/testing/selftests/net/forwarding/bridge_mld.sh index f84ab2e657547..4cacef5a813aa 100755 --- a/tools/testing/selftests/net/forwarding/bridge_mld.sh +++ b/tools/testing/selftests/net/forwarding/bridge_mld.sh @@ -1,10 +1,23 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="mldv2include_test mldv2inc_allow_test mldv2inc_is_include_test mldv2inc_is_exclude_test \ - mldv2inc_to_exclude_test mldv2exc_allow_test mldv2exc_is_include_test \ - mldv2exc_is_exclude_test mldv2exc_to_exclude_test mldv2inc_block_test \ - mldv2exc_block_test mldv2exc_timeout_test mldv2star_ex_auto_add_test" +ALL_TESTS=" + mldv2include_test + mldv2inc_allow_test + mldv2inc_is_include_test + mldv2inc_is_exclude_test + mldv2inc_to_exclude_test + mldv2exc_allow_test + mldv2exc_is_include_test + mldv2exc_is_exclude_test + mldv2exc_to_exclude_test + mldv2inc_block_test + mldv2exc_block_test + mldv2exc_timeout_test + mldv2star_ex_auto_add_test + mldv2per_vlan_snooping_port_stp_test + mldv2per_vlan_snooping_vlan_stp_test +" NUM_NETIFS=4 CHECK_TC="yes" TEST_GROUP="ff02::cc" @@ -554,6 +567,66 @@ mldv2star_ex_auto_add_test() mldv2cleanup $swp2 } +mldv2per_vlan_snooping_stp_test() +{ + local is_port=$1 + + local msg="port" + [[ $is_port -ne 1 ]] && msg="vlan" + + ip link set br0 up type bridge vlan_filtering 1 \ + mcast_mld_version 2 \ + mcast_snooping 1 \ + mcast_vlan_snooping 1 \ + mcast_querier 1 \ + mcast_stats_enabled 1 + bridge vlan global set vid 1 dev br0 \ + mcast_mld_version 2 \ + mcast_snooping 1 \ + mcast_querier 1 \ + mcast_query_interval 100 \ + mcast_startup_query_count 0 + + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 0 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 4 + sleep 5 + local tx_s=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["mld_queries"]["tx_v2"]') + [[ $is_port -eq 1 ]] && bridge link set dev $swp1 state 3 + [[ $is_port -ne 1 ]] && bridge vlan set vid 1 dev $swp1 state 3 + sleep 5 + local tx_e=$(ip -j -p stats show dev $swp1 \ + group xstats_slave subgroup bridge suite mcast \ + | jq '.[]["multicast"]["mld_queries"]["tx_v2"]') + + RET=0 + local tx=$(expr $tx_e - $tx_s) + test $tx -gt 0 + check_err $? 
"No MLD queries after STP state becomes forwarding" + log_test "per vlan snooping with $msg stp state change" + + # restore settings + bridge vlan global set vid 1 dev br0 \ + mcast_querier 0 \ + mcast_query_interval 12500 \ + mcast_startup_query_count 2 \ + mcast_mld_version 1 + ip link set br0 up type bridge vlan_filtering 0 \ + mcast_vlan_snooping 0 \ + mcast_stats_enabled 0 +} + +mldv2per_vlan_snooping_port_stp_test() +{ + mldv2per_vlan_snooping_stp_test 1 +} + +mldv2per_vlan_snooping_vlan_stp_test() +{ + mldv2per_vlan_snooping_stp_test 0 +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config index 8d7a1a004b7c3..18fd69d8d937d 100644 --- a/tools/testing/selftests/net/forwarding/config +++ b/tools/testing/selftests/net/forwarding/config @@ -1,6 +1,7 @@ CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_BRIDGE_IGMP_SNOOPING=y CONFIG_NET_L3_MASTER_DEV=y CONFIG_IPV6_MULTIPLE_TABLES=y CONFIG_NET_VRF=m From abf078c0a322159f5ebe2adaa0cd69dc45b1e710 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 22 Apr 2025 21:32:51 +0200 Subject: [PATCH 399/770] wifi: mac80211: restore monitor for outgoing frames This code was accidentally dropped during the cooked monitor removal, but really should've been simplified instead. Add the simple version back. Fixes: 286e69677065 ("wifi: mac80211: Drop cooked monitor support") Link: https://patch.msgid.link/20250422213251.b3d65fd0f323.Id2a6901583f7af86bbe94deb355968b238f350c6@changeid Signed-off-by: Johannes Berg --- net/mac80211/status.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/mac80211/status.c b/net/mac80211/status.c index b17b3cc7fb903..a362254b310cd 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -1085,7 +1085,13 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); - if (status->free_list) + /* + * This is a bit racy but we can avoid a lot of work + * with this test... + */ + if (local->tx_mntrs) + ieee80211_tx_monitor(local, skb, retry_count, status); + else if (status->free_list) list_add_tail(&skb->list, status->free_list); else dev_kfree_skb(skb); From 72bb272541808188c54d0df30b7edce14d979538 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 20 Apr 2025 12:01:49 +0300 Subject: [PATCH 400/770] Revert "wifi: iwlwifi: add support for BE213" This reverts commit 16a8d9a739430bec9c11eda69226c5a39f3478aa. This device needs commit 75a3313f52b7 ("wifi: iwlwifi: make no_160 more generic"), which has a bug and is being reverted until it is fixed. Since this device wasn't shipped yet it is ok to not support it. 
Reported-by: Todd Brandt Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220029 Fixes: 16a8d9a73943 ("wifi: iwlwifi: add support for BE213") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420115541.581160ae3e4b.Icecc46baee8a797c00ad04fab92d7d1114b84829@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/cfg/sc.c | 2 -- .../net/wireless/intel/iwlwifi/iwl-config.h | 1 - .../wireless/intel/iwlwifi/iwl-nvm-parse.c | 12 ++++------- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 21 +++---------------- 4 files changed, 7 insertions(+), 29 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/sc.c b/drivers/net/wireless/intel/iwlwifi/cfg/sc.c index 670031fd60dc9..59af36960f9c5 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/sc.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/sc.c @@ -142,8 +142,6 @@ const struct iwl_cfg_trans_params iwl_sc_trans_cfg = { .ltr_delay = IWL_CFG_TRANS_LTR_DELAY_2500US, }; -const char iwl_sp_name[] = "Intel(R) Wi-Fi 7 BE213 160MHz"; - const struct iwl_cfg iwl_cfg_sc = { .fw_name_mac = "sc", IWL_DEVICE_SC, diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index b9bd89bfdd748..7e4864c00780b 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -550,7 +550,6 @@ extern const char iwl_ax231_name[]; extern const char iwl_ax411_name[]; extern const char iwl_fm_name[]; extern const char iwl_wh_name[]; -extern const char iwl_sp_name[]; extern const char iwl_gl_name[]; extern const char iwl_mtp_name[]; extern const char iwl_dr_name[]; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index cd1b0048bb6da..08269168b2fab 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -944,8 +944,7 @@ iwl_nvm_fixup_sband_iftd(struct iwl_trans *trans, IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK); break; case NL80211_BAND_6GHZ: - if (!trans->reduced_cap_sku && - trans->bw_limit >= 320) { + if (!trans->reduced_cap_sku) { iftype_data->eht_cap.eht_cap_elem.phy_cap_info[0] |= IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ; iftype_data->eht_cap.eht_cap_elem.phy_cap_info[1] |= @@ -1099,18 +1098,15 @@ iwl_nvm_fixup_sband_iftd(struct iwl_trans *trans, iftype_data->he_cap.he_cap_elem.phy_cap_info[0] &= ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; - if (trans->bw_limit < 320 || trans->reduced_cap_sku) { + if (trans->reduced_cap_sku) { memset(&iftype_data->eht_cap.eht_mcs_nss_supp.bw._320, 0, sizeof(iftype_data->eht_cap.eht_mcs_nss_supp.bw._320)); - iftype_data->eht_cap.eht_cap_elem.phy_cap_info[2] &= - ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; - } - - if (trans->reduced_cap_sku) { iftype_data->eht_cap.eht_mcs_nss_supp.bw._80.rx_tx_mcs13_max_nss = 0; iftype_data->eht_cap.eht_mcs_nss_supp.bw._160.rx_tx_mcs13_max_nss = 0; iftype_data->eht_cap.eht_cap_elem.phy_cap_info[8] &= ~IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA; + iftype_data->eht_cap.eht_cap_elem.phy_cap_info[2] &= + ~IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK; } } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 93446c3740081..03f7eb46bbc78 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1187,13 +1187,8 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { 
_IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc, iwl_wh_name), - _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_MAC_TYPE_SC, IWL_CFG_ANY, - IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - 160, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_sc, iwl_sp_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, @@ -1207,13 +1202,8 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2, iwl_wh_name), - _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, - IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - 160, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_sc2, iwl_sp_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, @@ -1227,13 +1217,8 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2f, iwl_wh_name), - _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, - IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - 160, IWL_CFG_ANY, IWL_CFG_ANY, - iwl_cfg_sc2f, iwl_sp_name), /* Dr */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, From 64dc5d5e341d6145cce65e35bdfa0d6ab9fc0a75 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 20 Apr 2025 12:01:50 +0300 Subject: [PATCH 401/770] Revert "wifi: iwlwifi: make no_160 more generic" This reverts commit 75a3313f52b7e08e7e73746f69a68c2b7c28bb2b. The indication of the BW limitation in the sub-device ID is not applicable for Killer devices. For those devices, bw_limit will hold a random value, so a matching dev_info might not be found, which leads to a probe failure. Until it is properly fixed, revert this. 
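To make the failure mode concrete, here is a minimal sketch of the restored matching rule (the macro and field names are taken from the diff below; the helper function itself is illustrative, not actual driver code):

	/* bit 9 of the PCI subsystem ID; carries no meaning on Killer SKUs */
	#define IWL_SUBDEVICE_NO_160(subdevice) ((u16)((subdevice) & 0x0200) >> 9)

	/* restored rule: a boolean compared to a boolean, plus a wildcard */
	static bool no_160_matches(const struct iwl_dev_info *di, u16 subdevice)
	{
		u8 no_160 = IWL_SUBDEVICE_NO_160(subdevice);

		return di->no_160 == (u8)IWL_CFG_ANY || di->no_160 == no_160;
	}

The reverted rule compared that single subsystem-ID bit against a multi-valued bw_limit field (80, 160, or no limit); on a Killer device, where the bit has nothing to do with bandwidth, the comparison could fail for every table entry, leaving no matching dev_info.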
Reported-by: Todd Brandt Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220029 Fixes: 75a3313f52b7 ("wifi: iwlwifi: make no_160 more generic") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420115541.36dd3007151e.I66b6b78db09bfea12ae84dd85603cf1583271474@changeid Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/iwl-config.h | 15 +- .../wireless/intel/iwlwifi/iwl-nvm-parse.c | 4 +- .../net/wireless/intel/iwlwifi/iwl-trans.h | 7 +- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 206 +++++++++--------- .../wireless/intel/iwlwifi/tests/devinfo.c | 15 +- 5 files changed, 118 insertions(+), 129 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-config.h b/drivers/net/wireless/intel/iwlwifi/iwl-config.h index 7e4864c00780b..acafee538b8ab 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-config.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-config.h @@ -2,7 +2,7 @@ /* * Copyright (C) 2005-2014, 2018-2021 Intel Corporation * Copyright (C) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #ifndef __IWL_CONFIG_H__ #define __IWL_CONFIG_H__ @@ -451,8 +451,11 @@ struct iwl_cfg { #define IWL_CFG_RF_ID_HR 0x7 #define IWL_CFG_RF_ID_HR1 0x4 -#define IWL_CFG_BW_NO_LIM (U16_MAX - 1) -#define IWL_CFG_BW_ANY U16_MAX +#define IWL_CFG_NO_160 0x1 +#define IWL_CFG_160 0x0 + +#define IWL_CFG_NO_320 0x1 +#define IWL_CFG_320 0x0 #define IWL_CFG_CORES_BT 0x0 #define IWL_CFG_CORES_BT_GNSS 0x5 @@ -464,7 +467,7 @@ struct iwl_cfg { #define IWL_CFG_IS_JACKET 0x1 #define IWL_SUBDEVICE_RF_ID(subdevice) ((u16)((subdevice) & 0x00F0) >> 4) -#define IWL_SUBDEVICE_BW_LIM(subdevice) ((u16)((subdevice) & 0x0200) >> 9) +#define IWL_SUBDEVICE_NO_160(subdevice) ((u16)((subdevice) & 0x0200) >> 9) #define IWL_SUBDEVICE_CORES(subdevice) ((u16)((subdevice) & 0x1C00) >> 10) struct iwl_dev_info { @@ -472,10 +475,10 @@ struct iwl_dev_info { u16 subdevice; u16 mac_type; u16 rf_type; - u16 bw_limit; u8 mac_step; u8 rf_step; u8 rf_id; + u8 no_160; u8 cores; u8 cdb; u8 jacket; @@ -489,7 +492,7 @@ extern const unsigned int iwl_dev_info_table_size; const struct iwl_dev_info * iwl_pci_find_dev_info(u16 device, u16 subsystem_device, u16 mac_type, u8 mac_step, u16 rf_type, u8 cdb, - u8 jacket, u8 rf_id, u8 bw_limit, u8 cores, u8 rf_step); + u8 jacket, u8 rf_id, u8 no_160, u8 cores, u8 rf_step); extern const struct pci_device_id iwl_hw_card_ids[]; #endif diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c index 08269168b2fab..c381511e9ec65 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2005-2014, 2018-2023, 2025 Intel Corporation + * Copyright (C) 2005-2014, 2018-2023 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -1094,7 +1094,7 @@ iwl_nvm_fixup_sband_iftd(struct iwl_trans *trans, iftype_data->eht_cap.eht_mcs_nss_supp.bw._320.rx_tx_mcs13_max_nss = 0; } - if (trans->bw_limit < 160) + if (trans->no_160) iftype_data->he_cap.he_cap_elem.phy_cap_info[0] &= ~IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 25fb4c50e38b1..18698f0bd2e4d 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ 
b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ /* - * Copyright (C) 2005-2014, 2018-2023, 2025 Intel Corporation + * Copyright (C) 2005-2014, 2018-2023 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -876,7 +876,7 @@ struct iwl_txq { * only valid for discrete (not integrated) NICs * @invalid_tx_cmd: invalid TX command buffer * @reduced_cap_sku: reduced capability supported SKU - * @bw_limit: the max bandwidth + * @no_160: device not supporting 160 MHz * @step_urm: STEP is in URM, no support for MCS>9 in 320 MHz * @restart: restart worker data * @restart.wk: restart worker @@ -911,8 +911,7 @@ struct iwl_trans { char hw_id_str[52]; u32 sku_id[3]; bool reduced_cap_sku; - u16 bw_limit; - bool step_urm; + u8 no_160:1, step_urm:1; u8 dsbr_urm_fw_dependent:1, dsbr_urm_permanent:1; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 03f7eb46bbc78..6c22c95f28d5e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2005-2014, 2018-2025 Intel Corporation + * Copyright (C) 2005-2014, 2018-2024 Intel Corporation * Copyright (C) 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2016-2017 Intel Deutschland GmbH */ @@ -552,17 +552,16 @@ MODULE_DEVICE_TABLE(pci, iwl_hw_card_ids); EXPORT_SYMBOL_IF_IWLWIFI_KUNIT(iwl_hw_card_ids); #define _IWL_DEV_INFO(_device, _subdevice, _mac_type, _mac_step, _rf_type, \ - _rf_id, _rf_step, _bw_limit, _cores, _cdb, _cfg, _name) \ + _rf_id, _rf_step, _no_160, _cores, _cdb, _cfg, _name) \ { .device = (_device), .subdevice = (_subdevice), .cfg = &(_cfg), \ .name = _name, .mac_type = _mac_type, .rf_type = _rf_type, .rf_step = _rf_step, \ - .bw_limit = _bw_limit, .cores = _cores, .rf_id = _rf_id, \ + .no_160 = _no_160, .cores = _cores, .rf_id = _rf_id, \ .mac_step = _mac_step, .cdb = _cdb, .jacket = IWL_CFG_ANY } #define IWL_DEV_INFO(_device, _subdevice, _cfg, _name) \ _IWL_DEV_INFO(_device, _subdevice, IWL_CFG_ANY, IWL_CFG_ANY, \ - IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, \ - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_ANY, \ - _cfg, _name) + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, \ + IWL_CFG_ANY, _cfg, _name) VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { #if IS_ENABLED(CONFIG_IWLMVM) @@ -725,66 +724,66 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, 
IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9462_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_PU, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_2ac_cfg_soc, iwl9560_name), _IWL_DEV_INFO(0x2526, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT_GNSS, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT_GNSS, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9270_160_name), _IWL_DEV_INFO(0x2526, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT_GNSS, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT_GNSS, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9270_name), _IWL_DEV_INFO(0x271B, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH1, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9162_160_name), _IWL_DEV_INFO(0x271B, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH1, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9162_name), _IWL_DEV_INFO(0x2526, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9260_160_name), _IWL_DEV_INFO(0x2526, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_TH, IWL_CFG_ANY, IWL_CFG_RF_TYPE_TH, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9260_2ac_cfg, iwl9260_name), /* Qu with Jf */ @@ -792,132 +791,132 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9462_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, 
IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9560_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1551, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9560_killer_1550s_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1552, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_b0_jf_b0_cfg, iwl9560_killer_1550i_name), /* Qu C step */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9462_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9560_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1551, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9560_killer_1550s_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1552, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_qu_c0_jf_b0_cfg, iwl9560_killer_1550i_name), /* QuZ */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, 
IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9462_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9560_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1551, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9560_killer_1550s_name), _IWL_DEV_INFO(IWL_CFG_ANY, 0x1552, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwl9560_quz_a0_jf_b0_cfg, iwl9560_killer_1550i_name), /* Qu with Hr */ @@ -925,189 +924,189 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_HR1, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_qu_b0_hr1_b0, iwl_ax101_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_B_STEP, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_qu_b0_hr_b0, iwl_ax203_name), /* Qu C step */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_HR1, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_qu_c0_hr1_b0, iwl_ax101_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_qu_c0_hr_b0, iwl_ax203_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QU, SILICON_C_STEP, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_qu_c0_hr_b0, iwl_ax201_name), /* QuZ */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR1, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_quz_a0_hr1_b0, iwl_ax101_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, SILICON_B_STEP, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + 
IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_quz_a0_hr_b0, iwl_ax203_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_QUZ, SILICON_B_STEP, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_quz_a0_hr_b0, iwl_ax201_name), /* Ma */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_MA, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_ma, iwl_ax201_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_MA, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_ma, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_MA, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_ma, iwl_ax231_name), /* So with Hr */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax203_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR1, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax101_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax201_name), /* So-F with Hr */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax203_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR1, IWL_CFG_ANY, IWL_CFG_ANY, - 80, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax101_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_so_a0_hr_a0, iwl_ax201_name), /* So-F with Gf */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwlax211_2ax_cfg_so_gf_a0, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_CDB, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_name), /* SoF with JF2 */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, 
IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9560_name), /* SoF with JF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), /* So with GF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwlax211_2ax_cfg_so_gf_a0, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_CDB, + IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_CDB, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_name), /* So with JF2 */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9560_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9560_name), /* So with JF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9461_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9462_160_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9461_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, IWL_CFG_ANY, - 80, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), #endif /* CONFIG_IWLMVM */ @@ -1116,13 +1115,13 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, 
IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_ax201_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, @@ -1134,104 +1133,104 @@ VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info iwl_dev_info_table[] = { _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_wh_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, IWL_CFG_RF_TYPE_HR2, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_ax201_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BZ_W, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_bz, iwl_wh_name), /* Ga (Gl) */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_GL, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_NO_LIM, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_320, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_gl, iwl_gl_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_GL, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - 160, IWL_CFG_ANY, IWL_CFG_NO_CDB, + IWL_CFG_NO_320, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_gl, iwl_mtp_name), /* Sc */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc, iwl_wh_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, 
IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2, iwl_wh_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2f, iwl_ax211_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, IWL_CFG_RF_TYPE_FM, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2f, iwl_fm_name), _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SC2F, IWL_CFG_ANY, IWL_CFG_RF_TYPE_WH, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_sc2f, iwl_wh_name), /* Dr */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_DR, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_dr, iwl_dr_name), /* Br */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_BR, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, - IWL_CFG_BW_ANY, IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_ANY, iwl_cfg_br, iwl_br_name), #endif /* CONFIG_IWLMLD */ }; @@ -1383,7 +1382,7 @@ static int map_crf_id(struct iwl_trans *iwl_trans) VISIBLE_IF_IWLWIFI_KUNIT const struct iwl_dev_info * iwl_pci_find_dev_info(u16 device, u16 subsystem_device, u16 mac_type, u8 mac_step, u16 rf_type, u8 cdb, - u8 jacket, u8 rf_id, u8 bw_limit, u8 cores, u8 rf_step) + u8 jacket, u8 rf_id, u8 no_160, u8 cores, u8 rf_step) { int num_devices = ARRAY_SIZE(iwl_dev_info_table); int i; @@ -1426,15 +1425,8 @@ iwl_pci_find_dev_info(u16 device, u16 subsystem_device, dev_info->rf_id != rf_id) continue; - /* - * Check that bw_limit have the same "boolean" value since - * IWL_SUBDEVICE_BW_LIM can only return a boolean value and - * dev_info->bw_limit encodes a non-boolean value. - * dev_info->bw_limit == IWL_CFG_BW_NO_LIM must be equal to - * !bw_limit to have a match. 
- */ - if (dev_info->bw_limit != IWL_CFG_BW_ANY && - (dev_info->bw_limit == IWL_CFG_BW_NO_LIM) == !!bw_limit) + if (dev_info->no_160 != (u8)IWL_CFG_ANY && + dev_info->no_160 != no_160) continue; if (dev_info->cores != (u8)IWL_CFG_ANY && @@ -1572,13 +1564,13 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) CSR_HW_RFID_IS_CDB(iwl_trans->hw_rf_id), CSR_HW_RFID_IS_JACKET(iwl_trans->hw_rf_id), IWL_SUBDEVICE_RF_ID(pdev->subsystem_device), - IWL_SUBDEVICE_BW_LIM(pdev->subsystem_device), + IWL_SUBDEVICE_NO_160(pdev->subsystem_device), IWL_SUBDEVICE_CORES(pdev->subsystem_device), CSR_HW_RFID_STEP(iwl_trans->hw_rf_id)); if (dev_info) { iwl_trans->cfg = dev_info->cfg; iwl_trans->name = dev_info->name; - iwl_trans->bw_limit = dev_info->bw_limit; + iwl_trans->no_160 = dev_info->no_160 == IWL_CFG_NO_160; } #if IS_ENABLED(CONFIG_IWLMVM) diff --git a/drivers/net/wireless/intel/iwlwifi/tests/devinfo.c b/drivers/net/wireless/intel/iwlwifi/tests/devinfo.c index 7ef5e89c6af27..d0bda23c628aa 100644 --- a/drivers/net/wireless/intel/iwlwifi/tests/devinfo.c +++ b/drivers/net/wireless/intel/iwlwifi/tests/devinfo.c @@ -2,7 +2,7 @@ /* * KUnit tests for the iwlwifi device info table * - * Copyright (C) 2023-2025 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation */ #include #include @@ -13,9 +13,9 @@ MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); static void iwl_pci_print_dev_info(const char *pfx, const struct iwl_dev_info *di) { - printk(KERN_DEBUG "%sdev=%.4x,subdev=%.4x,mac_type=%.4x,mac_step=%.4x,rf_type=%.4x,cdb=%d,jacket=%d,rf_id=%.2x,bw_limit=%d,cores=%.2x\n", + printk(KERN_DEBUG "%sdev=%.4x,subdev=%.4x,mac_type=%.4x,mac_step=%.4x,rf_type=%.4x,cdb=%d,jacket=%d,rf_id=%.2x,no_160=%d,cores=%.2x\n", pfx, di->device, di->subdevice, di->mac_type, di->mac_step, - di->rf_type, di->cdb, di->jacket, di->rf_id, di->bw_limit, + di->rf_type, di->cdb, di->jacket, di->rf_id, di->no_160, di->cores); } @@ -31,13 +31,8 @@ static void devinfo_table_order(struct kunit *test) di->mac_type, di->mac_step, di->rf_type, di->cdb, di->jacket, di->rf_id, - di->bw_limit != IWL_CFG_BW_NO_LIM, - di->cores, di->rf_step); - if (!ret) { - iwl_pci_print_dev_info("No entry found for: ", di); - KUNIT_FAIL(test, - "No entry found for entry at index %d\n", idx); - } else if (ret != di) { + di->no_160, di->cores, di->rf_step); + if (ret != di) { iwl_pci_print_dev_info("searched: ", di); iwl_pci_print_dev_info("found: ", ret); KUNIT_FAIL(test, From 4f7a07791944e57ea5f12ce03939e3ad0fd50504 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 20 Apr 2025 09:59:55 +0300 Subject: [PATCH 402/770] wifi: iwlwifi: mld: properly handle async notifications in op mode start From the moment that we have ALIVE, we can receive notifications that are handled asynchronously. Some notifications (for example iwl_rfi_support_notif) require an operational FW. So we need to make sure that they were handled in iwl_op_mode_mld_start before we stop the FW. Flush the async_handlers_wk there to achieve that. Also, if loading the FW in op mode start fails, we need to cancel these notifications, as they are from a dead FW. More than that, not doing so can cause us to access freed memory if async_handlers_wk is executed after ieee80211_free_hw is called. Fix this by canceling all async notifications if a failure occurred in init (after ALIVE).
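In sketch form, the resulting ordering in iwl_op_mode_mld_start is the following (function names as in the diff below; all other setup and error handling is elided, so treat this as an illustration rather than the actual flow):

	ret = iwl_mld_load_fw(mld);
	if (ret)
		goto err;	/* iwl_mld_load_fw() now stops the FW itself */

	/* anything queued since ALIVE runs here, while the FW is usable */
	wiphy_work_flush(mld->wiphy, &mld->async_handlers_wk);

	/* stopping the FW now also cancels and purges the async handlers */
	iwl_mld_stop_fw(mld);

Either way, no handler can run against a dead FW or after ieee80211_free_hw().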
Fixes: d1e879ec600f ("wifi: iwlwifi: add iwlmld sub-driver") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.1a8579662437.Ifd77d9c1a29fdd278b0a7bfc2709dd5d5e5efdb1@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/fw.c | 13 ++++++++++--- drivers/net/wireless/intel/iwlwifi/mld/mld.c | 5 +++++ drivers/net/wireless/intel/iwlwifi/mld/mld.h | 5 ----- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/fw.c b/drivers/net/wireless/intel/iwlwifi/mld/fw.c index 62da137e10247..4b083d447ee2f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/fw.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/fw.c @@ -333,19 +333,22 @@ int iwl_mld_load_fw(struct iwl_mld *mld) ret = iwl_trans_start_hw(mld->trans); if (ret) - return ret; + goto err; ret = iwl_mld_run_fw_init_sequence(mld); if (ret) - return ret; + goto err; ret = iwl_mld_init_mcc(mld); if (ret) - return ret; + goto err; mld->fw_status.running = true; return 0; +err: + iwl_mld_stop_fw(mld); + return ret; } void iwl_mld_stop_fw(struct iwl_mld *mld) @@ -358,6 +361,10 @@ void iwl_mld_stop_fw(struct iwl_mld *mld) iwl_trans_stop_device(mld->trans); + wiphy_work_cancel(mld->wiphy, &mld->async_handlers_wk); + + iwl_mld_purge_async_handlers_list(mld); + mld->fw_status.running = false; } diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mld.c b/drivers/net/wireless/intel/iwlwifi/mld/mld.c index d4a99ae64074f..cfdf52b43c684 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mld.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mld.c @@ -417,6 +417,11 @@ iwl_op_mode_mld_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, goto free_hw; } + /* We are about to stop the FW. Notifications may require an + * operational FW, so handle them all here before we stop. + */ + wiphy_work_flush(mld->wiphy, &mld->async_handlers_wk); + iwl_mld_stop_fw(mld); wiphy_unlock(mld->wiphy); diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mld.h b/drivers/net/wireless/intel/iwlwifi/mld/mld.h index 5eceaaf7696db..a4a16da6ebf3f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mld.h +++ b/drivers/net/wireless/intel/iwlwifi/mld/mld.h @@ -298,11 +298,6 @@ iwl_cleanup_mld(struct iwl_mld *mld) #endif iwl_mld_low_latency_restart_cleanup(mld); - - /* Empty the list of async notification handlers so we won't process - * notifications from the dead fw after the reconfig flow. - */ - iwl_mld_purge_async_handlers_list(mld); } enum iwl_power_scheme { From c155f7c3ad1e70a1e203047b20e3bca235ada207 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sun, 20 Apr 2025 09:59:56 +0300 Subject: [PATCH 403/770] wifi: iwlwifi: mld: inform trans on init failure If starting the op mode failed, the opmode memory is being freed, so trans->op_mode needs to be NULLified. Otherwise, trans will access already freed memory. Call iwl_trans_op_mode_leave in that case. 
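The resulting unwind in iwl_op_mode_mld_start can be sketched as follows (labels as in the diff below; the earlier unwind steps are omitted, so this is illustrative only):

	free_nvm:
		kfree(mld->nvm_data);
	err:
		/* detach the op mode from the transport before freeing it,
		 * so trans->op_mode is cleared instead of left dangling
		 */
		iwl_trans_op_mode_leave(mld->trans);
		ieee80211_free_hw(mld->hw);
		return ERR_PTR(ret);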
Fixes: d1e879ec600f ("wifi: iwlwifi: add iwlmld sub-driver") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.3331d1686556.Ifaf15bdd8ef8c59e04effbd2e7aa0034b30eeacb@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/mld.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mld.c b/drivers/net/wireless/intel/iwlwifi/mld/mld.c index cfdf52b43c684..4a0842a46a8d4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mld.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mld.c @@ -414,7 +414,7 @@ iwl_op_mode_mld_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, wiphy_unlock(mld->wiphy); rtnl_unlock(); iwl_fw_flush_dumps(&mld->fwrt); - goto free_hw; + goto err; } /* We are about to stop the FW. Notifications may require an @@ -460,7 +460,8 @@ iwl_op_mode_mld_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, iwl_mld_leds_exit(mld); free_nvm: kfree(mld->nvm_data); -free_hw: +err: + iwl_trans_op_mode_leave(mld->trans); ieee80211_free_hw(mld->hw); return ERR_PTR(ret); } From d1ee2c1922566257cd6cc4ac7c21974d708fea62 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 20 Apr 2025 09:59:57 +0300 Subject: [PATCH 404/770] wifi: iwlwifi: mld: only create debugfs symlink if it does not exist When mac80211 switches between non-MLO and MLO it will recreate the debugfs directories. This results in the add_if_debugfs handler being called multiple times. As the convenience symlink is created in the mld debugfs directory and not the vif, it will not be removed by mac80211 when this happens and still exists. Add a check and only create the convenience symlink if we have not yet done so. Fixes: d1e879ec600f ("wifi: iwlwifi: add iwlmld sub-driver") Signed-off-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.2490696f032a.I74319c7cf18f7e16a3d331cb96e38504b9fbab66@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/debugfs.c | 5 +++-- drivers/net/wireless/intel/iwlwifi/mld/mac80211.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c index 89d95e9b4f301..93f9f78e4276b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/debugfs.c @@ -949,8 +949,9 @@ void iwl_mld_add_vif_debugfs(struct ieee80211_hw *hw, snprintf(name, sizeof(name), "%pd", vif->debugfs_dir); snprintf(target, sizeof(target), "../../../%pd3/iwlmld", vif->debugfs_dir); - mld_vif->dbgfs_slink = - debugfs_create_symlink(name, mld->debugfs_dir, target); + if (!mld_vif->dbgfs_slink) + mld_vif->dbgfs_slink = + debugfs_create_symlink(name, mld->debugfs_dir, target); if (iwlmld_mod_params.power_scheme != IWL_POWER_SCHEME_CAM && vif->type == NL80211_IFTYPE_STATION) { diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c index 99e13cfd1e5fe..68d97d3b8f026 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c @@ -651,6 +651,7 @@ void iwl_mld_mac80211_remove_interface(struct ieee80211_hw *hw, #ifdef CONFIG_IWLWIFI_DEBUGFS debugfs_remove(iwl_mld_vif_from_mac80211(vif)->dbgfs_slink); + iwl_mld_vif_from_mac80211(vif)->dbgfs_slink = NULL; #endif iwl_mld_rm_vif(mld, vif); From d49437a6afc707951e5767ef70c9726b6c05da08 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 20 Apr 
2025 09:59:58 +0300 Subject: [PATCH 405/770] wifi: iwlwifi: back off on continuous errors When errors occur repeatedly, the driver shouldn't go into a tight loop trying to reset the device. Implement the backoff I had already defined IWL_TRANS_RESET_DELAY for, but clearly forgotten the implementation of. Fixes: 9a2f13c40c63 ("wifi: iwlwifi: implement reset escalation") Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.8816e299efa2.I82cde34e2345a2b33b1f03dbb040f5ad3439a5aa@changeid Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/iwl-trans.c | 27 ++++++++++++++----- .../net/wireless/intel/iwlwifi/iwl-trans.h | 7 +++-- .../net/wireless/intel/iwlwifi/pcie/trans.c | 3 ++- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index c1607b6d0759e..6125fe70ce728 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -21,6 +21,7 @@ struct iwl_trans_dev_restart_data { struct list_head list; unsigned int restart_count; time64_t last_error; + bool backoff; char name[]; }; @@ -125,13 +126,20 @@ iwl_trans_determine_restart_mode(struct iwl_trans *trans) if (!data) return at_least; - if (ktime_get_boottime_seconds() - data->last_error >= + if (!data->backoff && + ktime_get_boottime_seconds() - data->last_error >= IWL_TRANS_RESET_OK_TIME) data->restart_count = 0; index = data->restart_count; - if (index >= ARRAY_SIZE(escalation_list)) + if (index >= ARRAY_SIZE(escalation_list)) { index = ARRAY_SIZE(escalation_list) - 1; + if (!data->backoff) { + data->backoff = true; + return IWL_RESET_MODE_BACKOFF; + } + data->backoff = false; + } return max(at_least, escalation_list[index]); } @@ -140,7 +148,8 @@ iwl_trans_determine_restart_mode(struct iwl_trans *trans) static void iwl_trans_restart_wk(struct work_struct *wk) { - struct iwl_trans *trans = container_of(wk, typeof(*trans), restart.wk); + struct iwl_trans *trans = container_of(wk, typeof(*trans), + restart.wk.work); struct iwl_trans_reprobe *reprobe; enum iwl_reset_mode mode; @@ -168,6 +177,12 @@ static void iwl_trans_restart_wk(struct work_struct *wk) return; mode = iwl_trans_determine_restart_mode(trans); + if (mode == IWL_RESET_MODE_BACKOFF) { + IWL_ERR(trans, "Too many device errors - delay next reset\n"); + queue_delayed_work(system_unbound_wq, &trans->restart.wk, + IWL_TRANS_RESET_DELAY); + return; + } iwl_trans_inc_restart_count(trans->dev); @@ -227,7 +242,7 @@ struct iwl_trans *iwl_trans_alloc(unsigned int priv_size, trans->dev = dev; trans->num_rx_queues = 1; - INIT_WORK(&trans->restart.wk, iwl_trans_restart_wk); + INIT_DELAYED_WORK(&trans->restart.wk, iwl_trans_restart_wk); return trans; } @@ -271,7 +286,7 @@ int iwl_trans_init(struct iwl_trans *trans) void iwl_trans_free(struct iwl_trans *trans) { - cancel_work_sync(&trans->restart.wk); + cancel_delayed_work_sync(&trans->restart.wk); kmem_cache_destroy(trans->dev_cmd_pool); } @@ -403,7 +418,7 @@ void iwl_trans_op_mode_leave(struct iwl_trans *trans) iwl_trans_pcie_op_mode_leave(trans); - cancel_work_sync(&trans->restart.wk); + cancel_delayed_work_sync(&trans->restart.wk); trans->op_mode = NULL; diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 18698f0bd2e4d..ce4954b0d5246 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -961,7 +961,7 
@@ struct iwl_trans { struct iwl_dma_ptr invalid_tx_cmd; struct { - struct work_struct wk; + struct delayed_work wk; struct iwl_fw_error_dump_mode mode; bool during_reset; } restart; @@ -1162,7 +1162,7 @@ static inline void iwl_trans_schedule_reset(struct iwl_trans *trans, */ trans->restart.during_reset = test_bit(STATUS_IN_SW_RESET, &trans->status); - queue_work(system_unbound_wq, &trans->restart.wk); + queue_delayed_work(system_unbound_wq, &trans->restart.wk, 0); } static inline void iwl_trans_fw_error(struct iwl_trans *trans, @@ -1261,6 +1261,9 @@ enum iwl_reset_mode { IWL_RESET_MODE_RESCAN, IWL_RESET_MODE_FUNC_RESET, IWL_RESET_MODE_PROD_RESET, + + /* keep last - special backoff value */ + IWL_RESET_MODE_BACKOFF, }; void iwl_trans_pcie_reset(struct iwl_trans *trans, enum iwl_reset_mode mode); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index c917ed4c19bcc..b1ccace7377fb 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -2351,7 +2351,8 @@ void iwl_trans_pcie_reset(struct iwl_trans *trans, enum iwl_reset_mode mode) struct iwl_trans_pcie_removal *removal; char _msg = 0, *msg = &_msg; - if (WARN_ON(mode < IWL_RESET_MODE_REMOVE_ONLY)) + if (WARN_ON(mode < IWL_RESET_MODE_REMOVE_ONLY || + mode == IWL_RESET_MODE_BACKOFF)) return; if (test_bit(STATUS_TRANS_DEAD, &trans->status)) From 60d418e8540402f4732cce4e8df428e747d79e47 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 20 Apr 2025 09:59:59 +0300 Subject: [PATCH 406/770] wifi: iwlwifi: mld: fix BAID validity check Perhaps IWL_FW_CHECK() is a bit misnamed, but it just returns the value of the inner condition. Therefore, the current code skips the actual function when it has the BAID data and makes it crash later when it doesn't. Fix the logic. Fixes: d1e879ec600f ("wifi: iwlwifi: add iwlmld sub-driver") Signed-off-by: Johannes Berg Reviewed-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.9c0b84c44c3b.Ied236258854b149960eb357ec61bf3a572503fbc@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/agg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/agg.c b/drivers/net/wireless/intel/iwlwifi/mld/agg.c index db9e0f04f4b77..687a9450ac984 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/agg.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/agg.c @@ -124,9 +124,9 @@ void iwl_mld_handle_bar_frame_release_notif(struct iwl_mld *mld, rcu_read_lock(); baid_data = rcu_dereference(mld->fw_id_to_ba[baid]); - if (!IWL_FW_CHECK(mld, !baid_data, - "Got valid BAID %d but not allocated, invalid BAR release!\n", - baid)) + if (IWL_FW_CHECK(mld, !baid_data, + "Got valid BAID %d but not allocated, invalid BAR release!\n", + baid)) goto out_unlock; if (IWL_FW_CHECK(mld, tid != baid_data->tid || From 15220a257319ffe3bf95796326dfe0aacdbeb1c4 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 20 Apr 2025 10:00:00 +0300 Subject: [PATCH 407/770] wifi: iwlwifi: don't warn if the NIC is gone in resume Some BIOSes decide to power gate the WLAN device during S3. Since iwlwifi doesn't expect this, it gets very noisy reporting that the device is no longer available. Wifi is still available because iwlwifi recovers, but it spews scary prints in the log. Fix that by failing gracefully. 
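As an illustration only, the shape of the fix in miniature; the demo_* names are stand-ins, not the iwlwifi API:

/* Probe device liveness on resume without warning on failure. */
struct demo_nic { int dummy; };

static bool demo_grab_nic_access(struct demo_nic *nic, bool silent)
{
	return false;	/* stand-in for a register handshake that can time out */
}

static void demo_release_nic_access(struct demo_nic *nic) { }

static bool demo_device_alive_after_s3(struct demo_nic *nic)
{
	/* silent == true: a timeout here is an expected outcome on
	 * platforms that power gate the NIC in S3, not a reason to WARN */
	if (!demo_grab_nic_access(nic, true))
		return false;	/* powered off, do a full reset */
	demo_release_nic_access(nic);
	return true;
}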
Fixes: e8bb19c1d590 ("wifi: iwlwifi: support fast resume") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219597 Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.d8d58146c829.I569ca15eaaa774d633038a749cc6ec7448419714@changeid Signed-off-by: Johannes Berg --- .../net/wireless/intel/iwlwifi/iwl-trans.c | 1 - drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 20 ++++++++++++++++--- .../wireless/intel/iwlwifi/pcie/internal.h | 9 +++++---- .../net/wireless/intel/iwlwifi/pcie/trans.c | 13 +++++++++--- drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 2 +- 5 files changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c index 6125fe70ce728..e7b2e08645efa 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.c @@ -555,7 +555,6 @@ void __releases(nic_access) iwl_trans_release_nic_access(struct iwl_trans *trans) { iwl_trans_pcie_release_nic_access(trans); - __release(nic_access); } IWL_EXPORT_SYMBOL(iwl_trans_release_nic_access); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 6c22c95f28d5e..3a605e7c070e4 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1737,10 +1737,24 @@ static int _iwl_pci_resume(struct device *device, bool restore) * need to reset it completely. * Note: MAC (bits 0:7) will be cleared upon suspend even with wowlan, * so assume that any bits there mean that the device is usable. + * For older devices, just try silently to grab the NIC. */ - if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_BZ && - !iwl_read32(trans, CSR_FUNC_SCRATCH)) - device_was_powered_off = true; + if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_BZ) { + if (!iwl_read32(trans, CSR_FUNC_SCRATCH)) + device_was_powered_off = true; + } else { + /* + * bh are re-enabled by iwl_trans_pcie_release_nic_access, + * so re-enable them if _iwl_trans_pcie_grab_nic_access fails. 
+ */ + local_bh_disable(); + if (_iwl_trans_pcie_grab_nic_access(trans, true)) { + iwl_trans_pcie_release_nic_access(trans); + } else { + device_was_powered_off = true; + local_bh_enable(); + } + } if (restore || device_was_powered_off) { trans->state = IWL_TRANS_NO_FW; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h index 45460f93d24ad..114a9195ad7f7 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h @@ -558,10 +558,10 @@ void iwl_trans_pcie_free(struct iwl_trans *trans); void iwl_trans_pcie_free_pnvm_dram_regions(struct iwl_dram_regions *dram_regions, struct device *dev); -bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans); -#define _iwl_trans_pcie_grab_nic_access(trans) \ +bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent); +#define _iwl_trans_pcie_grab_nic_access(trans, silent) \ __cond_lock(nic_access_nobh, \ - likely(__iwl_trans_pcie_grab_nic_access(trans))) + likely(__iwl_trans_pcie_grab_nic_access(trans, silent))) void iwl_trans_pcie_check_product_reset_status(struct pci_dev *pdev); void iwl_trans_pcie_check_product_reset_mode(struct pci_dev *pdev); @@ -1105,7 +1105,8 @@ void iwl_trans_pcie_set_bits_mask(struct iwl_trans *trans, u32 reg, int iwl_trans_pcie_read_config32(struct iwl_trans *trans, u32 ofs, u32 *val); bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans); -void iwl_trans_pcie_release_nic_access(struct iwl_trans *trans); +void __releases(nic_access_nobh) +iwl_trans_pcie_release_nic_access(struct iwl_trans *trans); /* transport gen 1 exported functions */ void iwl_trans_pcie_fw_alive(struct iwl_trans *trans, u32 scd_addr); diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c index b1ccace7377fb..102a6123bba0e 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c @@ -2406,7 +2406,7 @@ EXPORT_SYMBOL(iwl_trans_pcie_reset); * This version doesn't disable BHs but rather assumes they're * already disabled. */ -bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans) +bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans, bool silent) { int ret; struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); @@ -2458,6 +2458,11 @@ bool __iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans) if (unlikely(ret < 0)) { u32 cntrl = iwl_read32(trans, CSR_GP_CNTRL); + if (silent) { + spin_unlock(&trans_pcie->reg_lock); + return false; + } + WARN_ONCE(1, "Timeout waiting for hardware access (CSR_GP_CNTRL 0x%08x)\n", cntrl); @@ -2489,7 +2494,7 @@ bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans) bool ret; local_bh_disable(); - ret = __iwl_trans_pcie_grab_nic_access(trans); + ret = __iwl_trans_pcie_grab_nic_access(trans, false); if (ret) { /* keep BHs disabled until iwl_trans_pcie_release_nic_access */ return ret; @@ -2498,7 +2503,8 @@ bool iwl_trans_pcie_grab_nic_access(struct iwl_trans *trans) return false; } -void iwl_trans_pcie_release_nic_access(struct iwl_trans *trans) +void __releases(nic_access_nobh) +iwl_trans_pcie_release_nic_access(struct iwl_trans *trans) { struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); @@ -2525,6 +2531,7 @@ void iwl_trans_pcie_release_nic_access(struct iwl_trans *trans) * scheduled on different CPUs (after we drop reg_lock). 
*/ out: + __release(nic_access_nobh); spin_unlock_bh(&trans_pcie->reg_lock); } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index bb90bcfc67639..9fc4e98693ebf 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -1021,7 +1021,7 @@ static int iwl_pcie_set_cmd_in_flight(struct iwl_trans *trans, * returned. This needs to be done only on NICs that have * apmg_wake_up_wa set (see above.) */ - if (!_iwl_trans_pcie_grab_nic_access(trans)) + if (!_iwl_trans_pcie_grab_nic_access(trans, false)) return -EIO; /* From a17821321a9b42f26e77335cd525fee72dc1cd63 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Sun, 20 Apr 2025 10:00:01 +0300 Subject: [PATCH 408/770] wifi: iwlwifi: fix the check for the SCRATCH register upon resume We can't rely on the SCRATCH register being 0 on platforms that power gate the NIC in S3. Even on those platforms, the SCRATCH register still returns 0x1010000. Make sure that we understand that those platforms have powered off the device. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219597 Fixes: cb347bd29d0d ("wifi: iwlwifi: mvm: fix hibernation") Signed-off-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250420095642.a7e082ee785c.I9418d76f860f54261cfa89e1f7ac10300904ba40@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/iwl-csr.h | 1 + drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h index be9e464c9b7b0..3ff493e920d28 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-csr.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-csr.h @@ -148,6 +148,7 @@ * during a error FW error. */ #define CSR_FUNC_SCRATCH_INIT_VALUE (0x01010101) +#define CSR_FUNC_SCRATCH_POWER_OFF_MASK 0xFFFF /* Bits for CSR_HW_IF_CONFIG_REG */ #define CSR_HW_IF_CONFIG_REG_MSK_MAC_STEP_DASH (0x0000000F) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 3a605e7c070e4..debeea2b3ae57 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1736,11 +1736,13 @@ static int _iwl_pci_resume(struct device *device, bool restore) * Scratch value was altered, this means the device was powered off, we * need to reset it completely. * Note: MAC (bits 0:7) will be cleared upon suspend even with wowlan, - * so assume that any bits there mean that the device is usable. + * but not bits [15:8]. So if we have bits set in lower word, assume + * the device is alive. + * For older devices, just try silently to grab the NIC. */ if (trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_BZ) { - if (!iwl_read32(trans, CSR_FUNC_SCRATCH)) + if (!(iwl_read32(trans, CSR_FUNC_SCRATCH) & + CSR_FUNC_SCRATCH_POWER_OFF_MASK)) device_was_powered_off = true; } else { /* From 0fb15ae3b0a9221be01715dac0335647c79f3362 Mon Sep 17 00:00:00 2001 From: Murad Masimov Date: Fri, 21 Mar 2025 21:52:25 +0300 Subject: [PATCH 409/770] wifi: plfxlc: Remove erroneous assert in plfxlc_mac_release plfxlc_mac_release() asserts that mac->lock is held. This assertion is incorrect because, even if that were possible, it would not be valid behaviour. The function is used when probe fails or after the device is disconnected.
In both cases mac->lock cannot be held, as the driver is not working with the device at that moment. All functions that take mac->lock release it shortly after acquiring it. There is also no need to hold mac->lock for plfxlc_mac_release() itself, as mac data is not affected, except for mac->flags, which is modified atomically. This bug leads to the following warning: ================================================================ WARNING: CPU: 0 PID: 127 at drivers/net/wireless/purelifi/plfxlc/mac.c:106 plfxlc_mac_release+0x7d/0xa0 Modules linked in: CPU: 0 PID: 127 Comm: kworker/0:2 Not tainted 6.1.124-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: usb_hub_wq hub_event RIP: 0010:plfxlc_mac_release+0x7d/0xa0 drivers/net/wireless/purelifi/plfxlc/mac.c:106 Call Trace: probe+0x941/0xbd0 drivers/net/wireless/purelifi/plfxlc/usb.c:694 usb_probe_interface+0x5c0/0xaf0 drivers/usb/core/driver.c:396 really_probe+0x2ab/0xcb0 drivers/base/dd.c:639 __driver_probe_device+0x1a2/0x3d0 drivers/base/dd.c:785 driver_probe_device+0x50/0x420 drivers/base/dd.c:815 __device_attach_driver+0x2cf/0x510 drivers/base/dd.c:943 bus_for_each_drv+0x183/0x200 drivers/base/bus.c:429 __device_attach+0x359/0x570 drivers/base/dd.c:1015 bus_probe_device+0xba/0x1e0 drivers/base/bus.c:489 device_add+0xb48/0xfd0 drivers/base/core.c:3696 usb_set_configuration+0x19dd/0x2020 drivers/usb/core/message.c:2165 usb_generic_driver_probe+0x84/0x140 drivers/usb/core/generic.c:238 usb_probe_device+0x130/0x260 drivers/usb/core/driver.c:293 really_probe+0x2ab/0xcb0 drivers/base/dd.c:639 __driver_probe_device+0x1a2/0x3d0 drivers/base/dd.c:785 driver_probe_device+0x50/0x420 drivers/base/dd.c:815 __device_attach_driver+0x2cf/0x510 drivers/base/dd.c:943 bus_for_each_drv+0x183/0x200 drivers/base/bus.c:429 __device_attach+0x359/0x570 drivers/base/dd.c:1015 bus_probe_device+0xba/0x1e0 drivers/base/bus.c:489 device_add+0xb48/0xfd0 drivers/base/core.c:3696 usb_new_device+0xbdd/0x18f0 drivers/usb/core/hub.c:2620 hub_port_connect drivers/usb/core/hub.c:5477 [inline] hub_port_connect_change drivers/usb/core/hub.c:5617 [inline] port_event drivers/usb/core/hub.c:5773 [inline] hub_event+0x2efe/0x5730 drivers/usb/core/hub.c:5855 process_one_work+0x8a9/0x11d0 kernel/workqueue.c:2292 worker_thread+0xa47/0x1200 kernel/workqueue.c:2439 kthread+0x28d/0x320 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 ================================================================ Found by Linux Verification Center (linuxtesting.org) with Syzkaller.
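A compilable toy (not the plfxlc code) showing why the assert was guaranteed to fire: lockdep_assert_held() WARNs whenever the lock is not held, and this teardown path always runs unlocked.

#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

static void demo_release(void)
{
	/* Wrong: callers (probe failure, disconnect) never hold
	 * demo_lock here, so this WARNs on every invocation. */
	lockdep_assert_held(&demo_lock);
}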
Fixes: 68d57a07bfe5 ("wireless: add plfxlc driver for pureLiFi X, XL, XC devices") Reported-by: syzbot+7d4f142f6c288de8abfe@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7d4f142f6c288de8abfe Signed-off-by: Murad Masimov Link: https://patch.msgid.link/20250321185226.71-2-m.masimov@mt-integration.ru Signed-off-by: Johannes Berg --- drivers/net/wireless/purelifi/plfxlc/mac.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c index eae93efa61504..82d1bf7edba20 100644 --- a/drivers/net/wireless/purelifi/plfxlc/mac.c +++ b/drivers/net/wireless/purelifi/plfxlc/mac.c @@ -102,7 +102,6 @@ int plfxlc_mac_init_hw(struct ieee80211_hw *hw) void plfxlc_mac_release(struct plfxlc_mac *mac) { plfxlc_chip_release(&mac->chip); - lockdep_assert_held(&mac->lock); } int plfxlc_op_start(struct ieee80211_hw *hw) From 8e089e7b585d95122c8122d732d1d5ef8f879396 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Tue, 22 Apr 2025 12:22:02 +0800 Subject: [PATCH 410/770] wifi: brcm80211: fmac: Add error handling for brcmf_usb_dl_writeimage() The function brcmf_usb_dl_writeimage() calls the function brcmf_usb_dl_cmd() but does not check its return value. The 'state.state' and the 'state.bytes' are uninitialized if the function brcmf_usb_dl_cmd() fails. It is dangerous to use uninitialized variables in conditional checks. Add error handling for brcmf_usb_dl_cmd() that jumps to the error handling path when brcmf_usb_dl_cmd() fails, leaving 'state.state' and 'state.bytes' uninitialized. Improve the error message to report more detailed error information. Fixes: 71bb244ba2fd ("brcm80211: fmac: add USB support for bcm43235/6/8 chipsets") Cc: stable@vger.kernel.org # v3.4+ Signed-off-by: Wentao Liang Acked-by: Arend van Spriel Link: https://patch.msgid.link/20250422042203.2259-1-vulab@iscas.ac.cn Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c index 2821c27f317ee..d06c724f63d9c 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/usb.c @@ -896,14 +896,16 @@ brcmf_usb_dl_writeimage(struct brcmf_usbdev_info *devinfo, u8 *fw, int fwlen) } /* 1) Prepare USB boot loader for runtime image */ - brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state)); + err = brcmf_usb_dl_cmd(devinfo, DL_START, &state, sizeof(state)); + if (err) + goto fail; rdlstate = le32_to_cpu(state.state); rdlbytes = le32_to_cpu(state.bytes); /* 2) Check we are in the Waiting state */ if (rdlstate != DL_WAITING) { - brcmf_err("Failed to DL_START\n"); + brcmf_err("Invalid DL state: %u\n", rdlstate); err = -EINVAL; goto fail; } From 175e69e33c66904dfe910c5f43edfe5c95b32f0c Mon Sep 17 00:00:00 2001 From: Itamar Shalev Date: Wed, 23 Apr 2025 12:25:02 +0300 Subject: [PATCH 411/770] wifi: iwlwifi: restore missing initialization of async_handlers_list The initialization of async_handlers_list was accidentally removed in a previous change. Restore the missing initialization to ensure proper handler registration.
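A hedged illustration of why the one-liner matters; demo_mld is a stand-in layout, not the real struct iwl_mld. On kzalloc'd memory a list_head has NULL next/prev, so the first list_add() through it dereferences NULL, while INIT_LIST_HEAD() makes the head point at itself:

#include <linux/list.h>
#include <linux/slab.h>

struct demo_mld {
	struct list_head async_handlers_list;
};

static struct demo_mld *demo_construct(void)
{
	struct demo_mld *mld = kzalloc(sizeof(*mld), GFP_KERNEL);

	if (!mld)
		return NULL;
	/* The restored step: without it, next/prev stay NULL and
	 * the first list_add() on this head crashes. */
	INIT_LIST_HEAD(&mld->async_handlers_list);
	return mld;
}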
Fixes: 6895d74c11d8 ("wifi: iwlwifi: mld: initialize regulatory early") Signed-off-by: Itamar Shalev Acked-by: Miri Korenblit Link: https://patch.msgid.link/20250423092503.35206-1-itamar.shalev@intel.com Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/mld.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mld.c b/drivers/net/wireless/intel/iwlwifi/mld/mld.c index 4a0842a46a8d4..73d2166a4c257 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mld.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mld.c @@ -75,6 +75,7 @@ void iwl_construct_mld(struct iwl_mld *mld, struct iwl_trans *trans, /* Setup async RX handling */ spin_lock_init(&mld->async_handlers_lock); + INIT_LIST_HEAD(&mld->async_handlers_list); wiphy_work_init(&mld->async_handlers_wk, iwl_mld_async_handlers_wk); From 8ff6175139967cd17b2a62bca4b2de2559942b7e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 23 Apr 2025 18:28:21 +0200 Subject: [PATCH 412/770] bnxt_en: hide CONFIG_DETECT_HUNG_TASK specific code The CONFIG_DEFAULT_HUNG_TASK_TIMEOUT setting is only available when the hung task detection is enabled, otherwise the code now produces a build failure: drivers/net/ethernet/broadcom/bnxt/bnxt.c:10188:21: error: use of undeclared identifier 'CONFIG_DEFAULT_HUNG_TASK_TIMEOUT' 10188 | max_tmo_secs > CONFIG_DEFAULT_HUNG_TASK_TIMEOUT) { Enclose this warning logic in an #ifdef to ensure this builds. Fixes: 0fcad44a86bd ("bnxt_en: Change FW message timeout warning") Signed-off-by: Arnd Bergmann Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250423162827.2189658-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index ad30445b47998..18359fabe0876 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10184,11 +10184,13 @@ static int bnxt_hwrm_ver_get(struct bnxt *bp) if (!bp->hwrm_cmd_max_timeout) bp->hwrm_cmd_max_timeout = HWRM_CMD_MAX_TIMEOUT; max_tmo_secs = bp->hwrm_cmd_max_timeout / 1000; +#ifdef CONFIG_DETECT_HUNG_TASK if (bp->hwrm_cmd_max_timeout > HWRM_CMD_MAX_TIMEOUT || max_tmo_secs > CONFIG_DEFAULT_HUNG_TASK_TIMEOUT) { netdev_warn(bp->dev, "Device requests max timeout of %d seconds, may trigger hung task watchdog (kernel default %ds)\n", max_tmo_secs, CONFIG_DEFAULT_HUNG_TASK_TIMEOUT); } +#endif if (resp->hwrm_intf_maj_8b >= 1) { bp->hwrm_max_req_len = le16_to_cpu(resp->max_req_win_len); From 87f43e6f06a2b027ec2a7e86b5b767c9d60f5fc8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 22 Apr 2025 15:24:55 +0100 Subject: [PATCH 413/770] net: stmmac: dwc-qos: calibrate tegra with mdio bus idle Thierry states that there are prerequists for Tegra's calibration that should be met before starting calibration - both the RGMII and MDIO interfaces should be idle. This commit adds the necessary MII bus locking to ensure that the MDIO interface is idle during calibration. 
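The locking shape, reduced to a sketch; struct mii_bus really does carry the mdio_lock mutex, and the Tegra register sequence is elided here. Holding the bus mutex guarantees no MDIO transaction can start while the pads are being calibrated:

#include <linux/phy.h>

static void demo_calibrate(struct mii_bus *bus)
{
	mutex_lock(&bus->mdio_lock);	/* MDIO bus now idle */
	/* ... trigger pad calibration and poll for completion ... */
	mutex_unlock(&bus->mdio_lock);
}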
Signed-off-by: Russell King (Oracle) Acked-by: Thierry Reding Link: https://patch.msgid.link/E1u7EYR-001ZAS-Cr@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index fa900b4991d06..09ae16e026eb4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -136,10 +136,11 @@ static int dwc_qos_probe(struct platform_device *pdev, #define AUTO_CAL_STATUS 0x880c #define AUTO_CAL_STATUS_ACTIVE BIT(31) -static void tegra_eqos_fix_speed(void *priv, int speed, unsigned int mode) +static void tegra_eqos_fix_speed(void *bsp_priv, int speed, unsigned int mode) { - struct tegra_eqos *eqos = priv; + struct tegra_eqos *eqos = bsp_priv; bool needs_calibration = false; + struct stmmac_priv *priv; u32 value; int err; @@ -158,6 +159,11 @@ static void tegra_eqos_fix_speed(void *priv, int speed, unsigned int mode) } if (needs_calibration) { + priv = netdev_priv(dev_get_drvdata(eqos->dev)); + + /* Calibration should be done with the MDIO bus idle */ + mutex_lock(&priv->mii->mdio_lock); + /* calibrate */ value = readl(eqos->regs + SDMEMCOMPPADCTRL); value |= SDMEMCOMPPADCTRL_PAD_E_INPUT_OR_E_PWRD; @@ -191,6 +197,8 @@ static void tegra_eqos_fix_speed(void *priv, int speed, unsigned int mode) value = readl(eqos->regs + SDMEMCOMPPADCTRL); value &= ~SDMEMCOMPPADCTRL_PAD_E_INPUT_OR_E_PWRD; writel(value, eqos->regs + SDMEMCOMPPADCTRL); + + mutex_unlock(&priv->mii->mdio_lock); } else { value = readl(eqos->regs + AUTO_CAL_CONFIG); value &= ~AUTO_CAL_CONFIG_ENABLE; From 7965facefaed629fe22f07595f16ee50e8aff1e3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:16:55 -0700 Subject: [PATCH 414/770] netlink: specs: allow header properties for attribute sets rt-link has a number of disjoint headers, plus it uses attributes of other families (e.g. DPLL). Allow declaring a attribute set as "foreign" by specifying which header its definition is coming from. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-c.yaml | 3 +++ Documentation/netlink/genetlink-legacy.yaml | 3 +++ Documentation/netlink/netlink-raw.yaml | 3 +++ tools/net/ynl/pyynl/ynl_gen_c.py | 2 +- 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/genetlink-c.yaml b/Documentation/netlink/genetlink-c.yaml index 96fa1f1522ed1..5a234e9b5fa2e 100644 --- a/Documentation/netlink/genetlink-c.yaml +++ b/Documentation/netlink/genetlink-c.yaml @@ -148,6 +148,9 @@ properties: attr-max-name: description: The explicit name for last member of attribute enum. type: string + header: + description: For C-compatible languages, header which already defines this attribute set. + type: string # End genetlink-c attributes: description: List of attributes in the space. diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index a8c5b521937d1..4cbfe666e6f5f 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -193,6 +193,9 @@ properties: attr-max-name: description: The explicit name for last member of attribute enum. 
type: string + header: + description: For C-compatible languages, header which already defines this attribute set. + type: string # End genetlink-c attributes: description: List of attributes in the space. diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index 1b0772c8e333a..e34bf23897fab 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -207,6 +207,9 @@ properties: attr-max-name: description: The explicit name for last member of attribute enum. type: string + header: + description: For C-compatible languages, header which already defines this attribute set. + type: string # End genetlink-c attributes: description: List of attributes in the space. diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 0d930c17f9638..9613a61350037 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -2909,7 +2909,7 @@ def main(): cw.p(f'#include "{hdr_file}"') cw.p('#include "ynl.h"') headers = [] - for definition in parsed['definitions']: + for definition in parsed['definitions'] + parsed['attribute-sets']: if 'header' in definition: headers.append(definition['header']) if args.mode == 'user': From 43b606d98482d9612d683a3467649d3999b608ab Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:16:56 -0700 Subject: [PATCH 415/770] netlink: specs: rt-link: remove the fixed members from attrs The purpose of the attribute list is to list the attributes which will be included in a given message to shrink the objects for families with huge attr spaces. Fixed headers are always present in their entirety (between netlink header and the attrs) so there's no point in listing their members. Current C codegen doesn't expect them and tries to look them up in the attribute space. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 726dfa083d14b..cb7bacbd3d958 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -2367,7 +2367,6 @@ operations: request: value: 16 attributes: &link-new-attrs - - ifi-index - ifname - net-ns-pid - net-ns-fd @@ -2399,7 +2398,6 @@ operations: request: value: 17 attributes: - - ifi-index - ifname - name: getlink @@ -2410,7 +2408,6 @@ operations: request: value: 18 attributes: - - ifi-index - ifname - alt-ifname - ext-mask @@ -2418,11 +2415,6 @@ operations: reply: value: 16 attributes: &link-all-attrs - - ifi-family - - ifi-type - - ifi-index - - ifi-flags - - ifi-change - address - broadcast - ifname @@ -2515,14 +2507,9 @@ operations: do: request: value: 94 - attributes: - - ifindex reply: value: 92 attributes: &link-stats-attrs - - family - - ifindex - - filter-mask - link-64 - link-xstats - link-xstats-slave From c703d258f626d12f1910bcf6a5f67eeec26a56a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:16:57 -0700 Subject: [PATCH 416/770] netlink: specs: rt-link: remove if-netnsid from attr list if-netnsid is an alias of target-netnsid: IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ We don't have a definition for this attr.
Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index cb7bacbd3d958..7f411e8cd755e 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -2460,7 +2460,6 @@ operations: - xdp - event - new-netnsid - - if-netnsid - target-netnsid - carrier-up-count - carrier-down-count From c703d258f626d12f1910bcf6a5f67eeec26a56a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:16:58 -0700 Subject: [PATCH 417/770] netlink: specs: rt-link: remove duplicated group in attr list group is listed twice for newlink. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 7f411e8cd755e..38f439911f94c 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -2382,7 +2382,6 @@ operations: - txqlen - operstate - linkmode - - group - gso-max-size - gso-max-segs - gro-max-size From b12b0f41819aaa101fe3d0953cb6e1eb25086c62 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:16:59 -0700 Subject: [PATCH 418/770] netlink: specs: rt-link: add C naming info Add properties needed for C codegen to match names with uAPI headers. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 30 +++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 38f439911f94c..a331eb5eecb2b 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -2,6 +2,7 @@ name: rt-link protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -11,6 +12,9 @@ definitions: - name: ifinfo-flags type: flags + header: linux/if.h + enum-name: net-device-flags + name-prefix: iff- entries: - name: up @@ -53,6 +57,7 @@ definitions: - name: vlan-protocols type: enum + enum-name: entries: - name: 8021q @@ -754,6 +759,7 @@ definitions: - name: vlan-flags type: flags + enum-name: entries: - reorder-hdr - gvrp @@ -840,6 +846,7 @@ definitions: - name: ifla-vf-link-state-enum type: enum + enum-name: entries: - auto - enable @@ -906,6 +913,7 @@ definitions: - name: rtext-filter type: flags + enum-name: entries: - vf - brvlan @@ -918,6 +926,7 @@ definitions: - name: netkit-policy type: enum + enum-name: entries: - name: forward @@ -928,6 +937,7 @@ definitions: - name: netkit-mode type: enum + enum-name: netkit-mode entries: - name: l2 - name: l3 @@ -935,6 +945,7 @@ definitions: - name: netkit-scrub type: enum + enum-name: entries: - name: none - name: default @@ -1195,6 +1206,7 @@ attribute-sets: nested-attributes: mctp-attrs - name: vfinfo-list-attrs + name-prefix: ifla-vf- attributes: - name: info @@ -1203,6 +1215,7 @@ attribute-sets: multi-attr: true - name: vfinfo-attrs + name-prefix: ifla-vf- attributes: - name: mac @@ -1257,6 +1270,7 @@ attribute-sets: type: binary - name: vf-stats-attrs + name-prefix: ifla-vf-stats- attributes: - 
name: rx-packets @@ -1288,6 +1302,8 @@ attribute-sets: type: u64 - name: vf-vlan-attrs + name-prefix: ifla-vf-vlan- + attr-max-name: ifla-vf-vlan-info-max attributes: - name: info @@ -1296,12 +1312,15 @@ attribute-sets: multi-attr: true - name: vf-ports-attrs + name-prefix: ifla- attributes: [] - name: port-self-attrs + name-prefix: ifla- attributes: [] - name: linkinfo-attrs + name-prefix: ifla-info- attributes: - name: kind @@ -1855,6 +1874,7 @@ attribute-sets: - name: linkinfo-vti-attrs name-prefix: ifla-vti- + header: linux/if_tunnel.h attributes: - name: link @@ -2107,7 +2127,7 @@ attribute-sets: byte-order: big-endian - name: ifla-vlan-qos - name-prefix: ifla-vlan-qos + name-prefix: ifla-vlan-qos- attributes: - name: mapping @@ -2123,6 +2143,7 @@ attribute-sets: type: u32 - name: xdp-attrs + name-prefix: ifla-xdp- attributes: - name: fd @@ -2150,6 +2171,7 @@ attribute-sets: type: s32 - name: ifla-attrs + name-prefix: ifla-inet- attributes: - name: conf @@ -2157,6 +2179,7 @@ attribute-sets: struct: ipv4-devconf - name: ifla6-attrs + name-prefix: ifla-inet6- attributes: - name: flags @@ -2222,6 +2245,7 @@ attribute-sets: type: binary - name: link-offload-xstats + name-prefix: ifla-offload-xstats- attributes: - name: cpu-hit @@ -2236,6 +2260,7 @@ attribute-sets: type: binary - name: hw-s-info-one + name-prefix: ifla-offload-xstats-hw-s-info- attributes: - name: request @@ -2245,6 +2270,8 @@ attribute-sets: type: u8 - name: link-dpll-pin-attrs + name-prefix: dpll-a- + header: linux/dpll.h attributes: - name: id @@ -2357,6 +2384,7 @@ sub-messages: operations: enum-model: directional + name-prefix: rtm- list: - name: newlink From e6e1f53f0283d2213dba95dbaa82c3978718245a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:00 -0700 Subject: [PATCH 419/770] netlink: specs: rt-link: adjust AF_ nest for C codegen The AF nest is indexed by AF ID, so it's a bit strange, but with minor adjustments C codegen deals with it just fine. Entirely unclear why the names have been in quotes here. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index a331eb5eecb2b..6ab9b876a464a 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1188,19 +1188,21 @@ attribute-sets: multi-attr: true - name: af-spec-attrs + name-prefix: af- + attr-max-name: af-max attributes: - - name: "inet" + name: inet type: nest value: 2 nested-attributes: ifla-attrs - - name: "inet6" + name: inet6 type: nest value: 10 nested-attributes: ifla6-attrs - - name: "mctp" + name: mctp type: nest value: 45 nested-attributes: mctp-attrs From 1c224f19ff06e59fb4079f3b26532dbd3b537b1e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:01 -0700 Subject: [PATCH 420/770] netlink: specs: rt-link: make bond's ipv6 address attribute fixed size ns-ip6-target is an indexed-array. Codegen for variable size binary array would be a bit tedious, tell C that we know the size of these attributes, since they are IPv6 addrs. 
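A hedged sketch of the C-side payoff (struct and field names invented): with the element length pinned to 16, generated code can parse each ns-ip6-target entry straight into a fixed-size in6_addr instead of carrying a per-element length:

#include <linux/in6.h>

struct demo_ns_ip6_targets {
	unsigned int n_addrs;
	struct in6_addr addrs[16];	/* each element exactly 16 bytes */
};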
Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6ab9b876a464a..4b51c5b60d955 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1447,6 +1447,8 @@ attribute-sets: type: indexed-array sub-type: binary display-hint: ipv6 + checks: + exact-len: 16 - name: coupled-control type: u8 From 622d7050cfd40c6c9602d4e16b25cc5350f9d65a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:02 -0700 Subject: [PATCH 421/770] netlink: specs: rt-link: add notification for newlink Add a notification entry for netlink so that we can test ntf handling in classic netlink and C. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 4b51c5b60d955..25f0c3c6a886a 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -2420,6 +2420,12 @@ operations: - gso-ipv4-max-size - gro-ipv4-max-size - af-spec + - + name: newlink-ntf + doc: Notify that a link has been created + value: 16 + notify: getlink + fixed-header: ifinfomsg - name: dellink doc: Delete an existing link. From cd879795c3eed39225608bd8cdefdec3b606389f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:03 -0700 Subject: [PATCH 422/770] netlink: specs: rt-neigh: add C naming info Add properties needed for C codegen to match names with uAPI headers. 
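An illustration of what the naming metadata buys; NDA_DST is the real uAPI constant from linux/neighbour.h, while demo_dst_attr is invented. With name-prefix "nda-", the spec attribute "dst" resolves to the existing enumerator instead of a freshly generated name:

#include <linux/neighbour.h>

/* attribute-set "neighbour-attrs" + name-prefix "nda-" +
 * attribute "dst" -> existing uAPI constant NDA_DST */
static const unsigned int demo_dst_attr = NDA_DST;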
Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-neigh.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml index a843caa72259e..9b87efaafd15e 100644 --- a/Documentation/netlink/specs/rt-neigh.yaml +++ b/Documentation/netlink/specs/rt-neigh.yaml @@ -2,6 +2,7 @@ name: rt-neigh protocol: netlink-raw +uapi-header: linux/rtnetlink.h protonum: 0 doc: @@ -48,6 +49,7 @@ definitions: - name: nud-state type: flags + enum-name: entries: - incomplete - reachable @@ -60,6 +62,7 @@ definitions: - name: ntf-flags type: flags + enum-name: entries: - use - self @@ -72,12 +75,14 @@ definitions: - name: ntf-ext-flags type: flags + enum-name: entries: - managed - locked - name: rtm-type type: enum + enum-name: entries: - unspec - unicast @@ -179,6 +184,7 @@ definitions: attribute-sets: - name: neighbour-attrs + name-prefix: nda- attributes: - name: unspec @@ -241,6 +247,7 @@ attribute-sets: type: u8 - name: ndt-attrs + name-prefix: ndta- attributes: - name: name @@ -274,6 +281,7 @@ attribute-sets: type: pad - name: ndtpa-attrs + name-prefix: ndtpa- attributes: - name: ifindex @@ -335,6 +343,7 @@ attribute-sets: operations: enum-model: directional + name-prefix: rtm- list: - name: newneigh From eee94a89c55afdb909b384e306d384f0dfe60c45 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:04 -0700 Subject: [PATCH 423/770] netlink: specs: rt-neigh: make sure getneigh is consistent The consistency check complains replies to do and dump don't match because dump has no value. It doesn't have to by the schema... but fixing this in code gen would be more code than adjusting the spec. This is rare. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-neigh.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml index 9b87efaafd15e..fe34ade6b300b 100644 --- a/Documentation/netlink/specs/rt-neigh.yaml +++ b/Documentation/netlink/specs/rt-neigh.yaml @@ -402,6 +402,7 @@ operations: - ifindex - master reply: + value: 28 attributes: *neighbour-all - name: newneigh-ntf From e3d199d30909237a31ef7065d25c1f270967986f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:05 -0700 Subject: [PATCH 424/770] netlink: specs: rtnetlink: correct notify properties The notify property should point at the object the notifications carry, usually the get object, not the cmd which triggers the notification: notify: description: Name of the command sharing the reply type with this notification. Not treating this as a fix, I think that only C codegen cares. 
Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-12-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-neigh.yaml | 2 +- Documentation/netlink/specs/rt-rule.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml index fe34ade6b300b..e9cba164e3d15 100644 --- a/Documentation/netlink/specs/rt-neigh.yaml +++ b/Documentation/netlink/specs/rt-neigh.yaml @@ -381,7 +381,7 @@ operations: name: delneigh-ntf doc: Notify a neighbour deletion value: 29 - notify: delneigh + notify: getneigh fixed-header: ndmsg - name: getneigh diff --git a/Documentation/netlink/specs/rt-rule.yaml b/Documentation/netlink/specs/rt-rule.yaml index de0938d365419..f585654a4d410 100644 --- a/Documentation/netlink/specs/rt-rule.yaml +++ b/Documentation/netlink/specs/rt-rule.yaml @@ -234,7 +234,7 @@ operations: name: newrule-ntf doc: Notify a rule creation value: 32 - notify: newrule + notify: getrule - name: delrule doc: Remove an existing FIB rule @@ -247,7 +247,7 @@ operations: name: delrule-ntf doc: Notify a rule deletion value: 33 - notify: delrule + notify: getrule - name: getrule doc: Dump all FIB rules From 620b38232f432b51e62bb83abdbfaec6317c54d1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 19:17:06 -0700 Subject: [PATCH 425/770] netlink: specs: rt-rule: add C naming info Add properties needed for C codegen to match names with uAPI headers. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250418021706.1967583-13-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-rule.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/netlink/specs/rt-rule.yaml b/Documentation/netlink/specs/rt-rule.yaml index f585654a4d410..003707ca4a3e0 100644 --- a/Documentation/netlink/specs/rt-rule.yaml +++ b/Documentation/netlink/specs/rt-rule.yaml @@ -2,6 +2,7 @@ name: rt-rule protocol: netlink-raw +uapi-header: linux/fib_rules.h protonum: 0 doc: @@ -56,6 +57,7 @@ definitions: - name: fr-act type: enum + enum-name: entries: - unspec - to-tbl @@ -90,6 +92,7 @@ definitions: attribute-sets: - name: fib-rule-attrs + name-prefix: fra- attributes: - name: dst @@ -198,6 +201,7 @@ attribute-sets: operations: enum-model: directional fixed-header: fib-rule-hdr + name-prefix: rtm- list: - name: newrule From f0cc3777b2db18307d9734edf2d16e046714760f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Apr 2025 14:50:20 -0700 Subject: [PATCH 426/770] net: Fix wild-memory-access in __register_pernet_operations() when CONFIG_NET_NS=n. kernel test robot reported the splat below. [0] Before commit fed176bf3143 ("net: Add ops_undo_single for module load/unload."), if CONFIG_NET_NS=n, ops was linked to pernet_list only when init_net had not been initialised, and ops was unlinked from pernet_list only under the same condition. Let's say an ops is loaded before the init_net setup but unloaded after that. Then, the ops remains in pernet_list, which seems odd. The cited commit added ops_undo_single(), which calls list_add() for ops to link it to a temporary list, so a minor change was added to __register_pernet_operations() and __unregister_pernet_operations() under CONFIG_NET_NS=n to avoid the pernet_list corruption. However, the corruption must have been left as is. When CONFIG_NET_NS=n, pernet_list was used to keep ops registered before the init_net setup, and after that, pernet_list was not used at all. 
This was because some ops annotated with __net_initdata are cleared out of memory at some point during boot. Then, such ops is initialised by POISON_FREE_INITMEM (0xcc), resulting in that ops->list.{next,prev} suddenly switches from a valid pointer to a weird value, 0xcccccccccccccccc. To avoid such wild memory access, let's allow the pernet_list corruption for CONFIG_NET_NS=n. [0]: Oops: general protection fault, probably for non-canonical address 0xf999959999999999: 0000 [#1] SMP KASAN NOPTI KASAN: maybe wild-memory-access in range [0xccccccccccccccc8-0xcccccccccccccccf] CPU: 2 UID: 0 PID: 346 Comm: modprobe Not tainted 6.15.0-rc1-00294-ga4cba7e98e35 #85 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__list_add_valid_or_report (lib/list_debug.c:32) Code: 48 c1 ea 03 80 3c 02 00 0f 85 5a 01 00 00 49 39 74 24 08 0f 85 83 00 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 f2 48 c1 ea 03 <80> 3c 02 00 0f 85 1f 01 00 00 4c 39 26 0f 85 ab 00 00 00 4c 39 ee RSP: 0018:ff11000135b87830 EFLAGS: 00010a07 RAX: dffffc0000000000 RBX: ffffffffc02223c0 RCX: ffffffff8406fcc2 RDX: 1999999999999999 RSI: cccccccccccccccc RDI: ffffffffc02223c0 RBP: ffffffff86064e40 R08: 0000000000000001 R09: fffffbfff0a9f5b5 R10: ffffffff854fadaf R11: 676552203a54454e R12: ffffffff86064e40 R13: ffffffffc02223c0 R14: ffffffff86064e48 R15: 0000000000000021 FS: 00007f6fb0d9e1c0(0000) GS:ff11000858ea0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f6fb0eda580 CR3: 0000000122fec005 CR4: 0000000000771ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: register_pernet_operations (./include/linux/list.h:150 (discriminator 5) ./include/linux/list.h:183 (discriminator 5) net/core/net_namespace.c:1315 (discriminator 5) net/core/net_namespace.c:1359 (discriminator 5)) register_pernet_subsys (net/core/net_namespace.c:1401) inet6_init (net/ipv6/af_inet6.c:535) ipv6 do_one_initcall (init/main.c:1257) do_init_module (kernel/module/main.c:2942) load_module (kernel/module/main.c:3409) init_module_from_file (kernel/module/main.c:3599) idempotent_init_module (kernel/module/main.c:3611) __x64_sys_finit_module (./include/linux/file.h:62 ./include/linux/file.h:83 kernel/module/main.c:3634 kernel/module/main.c:3621 kernel/module/main.c:3621) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f6fb0df7e5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 73 9f 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007fffdc6a8968 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000055b535721b70 RCX: 00007f6fb0df7e5d RDX: 0000000000000000 RSI: 000055b51e44aa2a RDI: 0000000000000004 RBP: 0000000000040000 R08: 0000000000000000 R09: 000055b535721b30 R10: 0000000000000004 R11: 0000000000000246 R12: 000055b51e44aa2a R13: 000055b535721bf0 R14: 000055b5357220b0 R15: 0000000000000000 Modules linked in: ipv6(+) crc_ccitt Fixes: fed176bf3143 ("net: Add ops_undo_single for module load/unload.") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202504181511.1c3f23e4-lkp@intel.com Signed-off-by: Kuniyuki Iwashima Link: 
https://patch.msgid.link/20250418215025.87871-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/net_namespace.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 48dd6dc603c9e..42ee7fce3d95b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1314,19 +1314,19 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { - list_add_tail(&ops->list, list); - - if (!init_net_initialized) + if (!init_net_initialized) { + list_add_tail(&ops->list, list); return 0; + } return ops_init(ops, &init_net); } static void __unregister_pernet_operations(struct pernet_operations *ops) { - list_del(&ops->list); - - if (init_net_initialized) { + if (!init_net_initialized) { + list_del(&ops->list); + } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); From 52358dd63e348c3b6c488acc105be1aeda8fb923 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:04:01 +0200 Subject: [PATCH 427/770] net: phy: remove function stubs All callers of these functions depend on PHYLIB or select it directly or indirectly by selecting PHYLINK. Stubs make sense for optional functionality, but that's not the case here. MDIO_XGENE usually is selected by NET_XGENE which also selects PHYLIB. Add a dependency to PHYLIB nevertheless, in order not to break randconfig builds. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/f7a69a1f-60e9-4ac0-8b7c-481e0cc850e7@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/Kconfig | 1 + include/linux/phy.h | 37 ------------------------------------- 2 files changed, 1 insertion(+), 37 deletions(-) diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index 058fcdaf6c185..38a4901da32f3 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -57,6 +57,7 @@ config MDIO_SUN4I config MDIO_XGENE tristate "APM X-Gene SoC MDIO bus controller" depends on ARCH_XGENE || COMPILE_TEST + depends on PHYLIB help This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. 
diff --git a/include/linux/phy.h b/include/linux/phy.h index 066a28a4b64b2..3beaf225ee881 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1753,7 +1753,6 @@ int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); -#if IS_ENABLED(CONFIG_PHYLIB) int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id); struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode); struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode); @@ -1761,42 +1760,6 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode); struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); -#else -static inline int fwnode_get_phy_id(struct fwnode_handle *fwnode, u32 *phy_id) -{ - return 0; -} -static inline -struct mdio_device *fwnode_mdio_find_device(struct fwnode_handle *fwnode) -{ - return 0; -} - -static inline -struct phy_device *fwnode_phy_find_device(struct fwnode_handle *phy_fwnode) -{ - return NULL; -} - -static inline -struct fwnode_handle *fwnode_get_phy_node(struct fwnode_handle *fwnode) -{ - return NULL; -} - -static inline -struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) -{ - return NULL; -} - -static inline int phy_device_register(struct phy_device *phy) -{ - return 0; -} - -static inline void phy_device_free(struct phy_device *phydev) { } -#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_get_c45_ids(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); From 4dec0702b8627c364a0e1f2c5b249e06709a1c24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:23:45 +0200 Subject: [PATCH 428/770] r8169: merge chip versions 70 and 71 (RTL8126A) Handling of both chip versions is the same, only difference is the firmware. So we can merge handling of both chip versions. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/97d7ae79-d021-4b6b-b424-89e5e305b029@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 1 - drivers/net/ethernet/realtek/r8169_main.c | 15 ++++----------- drivers/net/ethernet/realtek/r8169_phy_config.c | 1 - 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 9f784840ea0d2..3f7182dc8c9ec 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -72,7 +72,6 @@ enum mac_version { RTL_GIGA_MAC_VER_65, RTL_GIGA_MAC_VER_66, RTL_GIGA_MAC_VER_70, - RTL_GIGA_MAC_VER_71, RTL_GIGA_MAC_NONE, RTL_GIGA_MAC_VER_LAST = RTL_GIGA_MAC_NONE - 1 }; diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 3ef7126f8ec50..f2679bca9d0c1 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -99,7 +99,7 @@ static const struct rtl_chip_info { const char *fw_name; } rtl_chip_infos[] = { /* 8126A family. */ - { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_71, "RTL8126A", FIRMWARE_8126A_3 }, + { 0x7cf, 0x64a, RTL_GIGA_MAC_VER_70, "RTL8126A", FIRMWARE_8126A_3 }, { 0x7cf, 0x649, RTL_GIGA_MAC_VER_70, "RTL8126A", FIRMWARE_8126A_2 }, /* 8125BP family. 
*/ @@ -2939,7 +2939,6 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) rtl_mod_config5(tp, 0, ASPM_en); switch (tp->mac_version) { case RTL_GIGA_MAC_VER_70: - case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) | INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -2971,7 +2970,6 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_70: - case RTL_GIGA_MAC_VER_71: val8 = RTL_R8(tp, INT_CFG0_8125) & ~INT_CFG0_CLKREQEN; RTL_W8(tp, INT_CFG0_8125, val8); break; @@ -3691,12 +3689,10 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) /* disable new tx descriptor format */ r8168_mac_ocp_modify(tp, 0xeb58, 0x0001, 0x0000); - if (tp->mac_version == RTL_GIGA_MAC_VER_70 || - tp->mac_version == RTL_GIGA_MAC_VER_71) + if (tp->mac_version == RTL_GIGA_MAC_VER_70) RTL_W8(tp, 0xD8, RTL_R8(tp, 0xD8) & ~0x02); - if (tp->mac_version == RTL_GIGA_MAC_VER_70 || - tp->mac_version == RTL_GIGA_MAC_VER_71) + if (tp->mac_version == RTL_GIGA_MAC_VER_70) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0400); else if (tp->mac_version == RTL_GIGA_MAC_VER_63) r8168_mac_ocp_modify(tp, 0xe614, 0x0700, 0x0200); @@ -3714,8 +3710,7 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp) r8168_mac_ocp_modify(tp, 0xe056, 0x00f0, 0x0030); r8168_mac_ocp_modify(tp, 0xe040, 0x1000, 0x0000); r8168_mac_ocp_modify(tp, 0xea1c, 0x0003, 0x0001); - if (tp->mac_version == RTL_GIGA_MAC_VER_70 || - tp->mac_version == RTL_GIGA_MAC_VER_71) + if (tp->mac_version == RTL_GIGA_MAC_VER_70) r8168_mac_ocp_modify(tp, 0xea1c, 0x0300, 0x0000); else r8168_mac_ocp_modify(tp, 0xea1c, 0x0004, 0x0000); @@ -3838,7 +3833,6 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_66] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, - [RTL_GIGA_MAC_VER_71] = rtl_hw_start_8126a, }; if (hw_configs[tp->mac_version]) @@ -3862,7 +3856,6 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) break; case RTL_GIGA_MAC_VER_63: case RTL_GIGA_MAC_VER_70: - case RTL_GIGA_MAC_VER_71: for (i = 0xa00; i < 0xa80; i += 4) RTL_W32(tp, i, 0); RTL_W16(tp, INT_CFG1_8125, 0x0000); diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 748ca8b212ba1..7f513086ba96c 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -1183,7 +1183,6 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config, [RTL_GIGA_MAC_VER_66] = rtl8125bp_hw_phy_config, [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, - [RTL_GIGA_MAC_VER_71] = rtl8126a_hw_phy_config, }; if (phy_configs[ver]) From f372ef6ed5a6b0401c884561d4bba1843e54d46a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:24:30 +0200 Subject: [PATCH 429/770] r8169: merge chip versions 64 and 65 (RTL8125D) Handling of both chip versions is the same, only difference is the firmware. So we can merge handling of both chip versions. 
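The pattern behind these merges, as a hedged standalone sketch; the demo table mirrors the rtl_chip_infos[] idea and the firmware names are placeholders. Both steppings keep their own XID match and firmware file but share one mac_version, so every per-version switch loses a case:

struct demo_chip_info {
	unsigned int mask, val;		/* XID match */
	int ver;			/* shared mac_version */
	const char *fw_name;		/* per-stepping firmware */
};

static const struct demo_chip_info demo_chip_infos[] = {
	{ 0x7cf, 0x689, 64, "demo-8125d-2.fw" },
	{ 0x7cf, 0x688, 64, "demo-8125d-1.fw" },
};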
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/0baad123-c679-4154-923f-fdc12783e900@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 1 - drivers/net/ethernet/realtek/r8169_main.c | 4 +--- drivers/net/ethernet/realtek/r8169_phy_config.c | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 3f7182dc8c9ec..1878c44ece91a 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -69,7 +69,6 @@ enum mac_version { RTL_GIGA_MAC_VER_61, RTL_GIGA_MAC_VER_63, RTL_GIGA_MAC_VER_64, - RTL_GIGA_MAC_VER_65, RTL_GIGA_MAC_VER_66, RTL_GIGA_MAC_VER_70, RTL_GIGA_MAC_NONE, diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index f2679bca9d0c1..dfd356a76e950 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -106,7 +106,7 @@ static const struct rtl_chip_info { { 0x7cf, 0x681, RTL_GIGA_MAC_VER_66, "RTL8125BP", FIRMWARE_8125BP_2 }, /* 8125D family. */ - { 0x7cf, 0x689, RTL_GIGA_MAC_VER_65, "RTL8125D", FIRMWARE_8125D_2 }, + { 0x7cf, 0x689, RTL_GIGA_MAC_VER_64, "RTL8125D", FIRMWARE_8125D_2 }, { 0x7cf, 0x688, RTL_GIGA_MAC_VER_64, "RTL8125D", FIRMWARE_8125D_1 }, /* 8125B family. */ @@ -3830,7 +3830,6 @@ static void rtl_hw_config(struct rtl8169_private *tp) [RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2, [RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b, [RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d, - [RTL_GIGA_MAC_VER_65] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_66] = rtl_hw_start_8125d, [RTL_GIGA_MAC_VER_70] = rtl_hw_start_8126a, }; @@ -3849,7 +3848,6 @@ static void rtl_hw_start_8125(struct rtl8169_private *tp) switch (tp->mac_version) { case RTL_GIGA_MAC_VER_61: case RTL_GIGA_MAC_VER_64: - case RTL_GIGA_MAC_VER_65: case RTL_GIGA_MAC_VER_66: for (i = 0xa00; i < 0xb00; i += 4) RTL_W32(tp, i, 0); diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c index 7f513086ba96c..e3adfafa2b1b6 100644 --- a/drivers/net/ethernet/realtek/r8169_phy_config.c +++ b/drivers/net/ethernet/realtek/r8169_phy_config.c @@ -1180,7 +1180,6 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev, [RTL_GIGA_MAC_VER_61] = rtl8125a_2_hw_phy_config, [RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config, [RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config, - [RTL_GIGA_MAC_VER_65] = rtl8125d_hw_phy_config, [RTL_GIGA_MAC_VER_66] = rtl8125bp_hw_phy_config, [RTL_GIGA_MAC_VER_70] = rtl8126a_hw_phy_config, }; From 4f51e7d370a04122fa78470b031d6487c52298b1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Apr 2025 11:25:17 +0200 Subject: [PATCH 430/770] r8169: merge chip versions 52 and 53 (RTL8117) Handling of both chip versions is the same, only difference is the firmware. So we can merge handling of both chip versions. 
Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/ae866b71-c904-434e-befb-848c831e33ff@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169.h | 1 - drivers/net/ethernet/realtek/r8169_main.c | 17 +++++++---------- drivers/net/ethernet/realtek/r8169_phy_config.c | 1 - 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169.h b/drivers/net/ethernet/realtek/r8169.h index 1878c44ece91a..f05231030925e 100644 --- a/drivers/net/ethernet/realtek/r8169.h +++ b/drivers/net/ethernet/realtek/r8169.h @@ -64,7 +64,6 @@ enum mac_version { /* support for RTL_GIGA_MAC_VER_50 has been removed */ RTL_GIGA_MAC_VER_51, RTL_GIGA_MAC_VER_52, - RTL_GIGA_MAC_VER_53, /* support for RTL_GIGA_MAC_VER_60 has been removed */ RTL_GIGA_MAC_VER_61, RTL_GIGA_MAC_VER_63, diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index dfd356a76e950..7bf71a675362b 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -116,7 +116,7 @@ static const struct rtl_chip_info { { 0x7cf, 0x609, RTL_GIGA_MAC_VER_61, "RTL8125A", FIRMWARE_8125A_3 }, /* RTL8117 */ - { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_53, "RTL8168fp/RTL8117" }, + { 0x7cf, 0x54b, RTL_GIGA_MAC_VER_52, "RTL8168fp/RTL8117" }, { 0x7cf, 0x54a, RTL_GIGA_MAC_VER_52, "RTL8168fp/RTL8117", FIRMWARE_8168FP_3 }, @@ -830,7 +830,7 @@ static bool rtl_is_8168evl_up(struct rtl8169_private *tp) { return tp->mac_version >= RTL_GIGA_MAC_VER_34 && tp->mac_version != RTL_GIGA_MAC_VER_39 && - tp->mac_version <= RTL_GIGA_MAC_VER_53; + tp->mac_version <= RTL_GIGA_MAC_VER_52; } static bool rtl_supports_eee(struct rtl8169_private *tp) @@ -998,9 +998,7 @@ void r8169_get_led_name(struct rtl8169_private *tp, int idx, static void r8168fp_adjust_ocp_cmd(struct rtl8169_private *tp, u32 *cmd, int type) { /* based on RTL8168FP_OOBMAC_BASE in vendor driver */ - if (type == ERIAR_OOB && - (tp->mac_version == RTL_GIGA_MAC_VER_52 || - tp->mac_version == RTL_GIGA_MAC_VER_53)) + if (type == ERIAR_OOB && tp->mac_version == RTL_GIGA_MAC_VER_52) *cmd |= 0xf70 << 18; } @@ -1500,7 +1498,7 @@ static enum rtl_dash_type rtl_get_dash_type(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_28: case RTL_GIGA_MAC_VER_31: return RTL_DASH_DP; - case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53: + case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_52: return RTL_DASH_EP; case RTL_GIGA_MAC_VER_66: return RTL_DASH_25_BP; @@ -2485,7 +2483,7 @@ static void rtl_init_rxcfg(struct rtl8169_private *tp) case RTL_GIGA_MAC_VER_38: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST); break; - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: + case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_52: RTL_W32(tp, RxConfig, RX128_INT_EN | RX_MULTI_EN | RX_DMA_BURST | RX_EARLY_OFF); break; case RTL_GIGA_MAC_VER_61: @@ -2616,7 +2614,7 @@ DECLARE_RTL_COND(rtl_rxtx_empty_cond_2) static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp) { switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_53: + case RTL_GIGA_MAC_VER_40 ... 
RTL_GIGA_MAC_VER_52:
 		rtl_loop_wait_high(tp, &rtl_txcfg_empty_cond, 100, 42);
 		rtl_loop_wait_high(tp, &rtl_rxtx_empty_cond, 100, 42);
 		break;
@@ -3826,7 +3824,6 @@ static void rtl_hw_config(struct rtl8169_private *tp)
 		[RTL_GIGA_MAC_VER_48] = rtl_hw_start_8168h_1,
 		[RTL_GIGA_MAC_VER_51] = rtl_hw_start_8168ep_3,
 		[RTL_GIGA_MAC_VER_52] = rtl_hw_start_8117,
-		[RTL_GIGA_MAC_VER_53] = rtl_hw_start_8117,
 		[RTL_GIGA_MAC_VER_61] = rtl_hw_start_8125a_2,
 		[RTL_GIGA_MAC_VER_63] = rtl_hw_start_8125b,
 		[RTL_GIGA_MAC_VER_64] = rtl_hw_start_8125d,
@@ -5285,7 +5282,7 @@ static void rtl_hw_init_8125(struct rtl8169_private *tp)
 static void rtl_hw_initialize(struct rtl8169_private *tp)
 {
 	switch (tp->mac_version) {
-	case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_53:
+	case RTL_GIGA_MAC_VER_51 ... RTL_GIGA_MAC_VER_52:
 		rtl8168ep_stop_cmac(tp);
 		fallthrough;
 	case RTL_GIGA_MAC_VER_40 ... RTL_GIGA_MAC_VER_48:
diff --git a/drivers/net/ethernet/realtek/r8169_phy_config.c b/drivers/net/ethernet/realtek/r8169_phy_config.c
index e3adfafa2b1b6..5403f8202c797 100644
--- a/drivers/net/ethernet/realtek/r8169_phy_config.c
+++ b/drivers/net/ethernet/realtek/r8169_phy_config.c
@@ -1176,7 +1176,6 @@ void r8169_hw_phy_config(struct rtl8169_private *tp, struct phy_device *phydev,
 		[RTL_GIGA_MAC_VER_48] = rtl8168h_2_hw_phy_config,
 		[RTL_GIGA_MAC_VER_51] = rtl8168ep_2_hw_phy_config,
 		[RTL_GIGA_MAC_VER_52] = rtl8117_hw_phy_config,
-		[RTL_GIGA_MAC_VER_53] = rtl8117_hw_phy_config,
 		[RTL_GIGA_MAC_VER_61] = rtl8125a_2_hw_phy_config,
 		[RTL_GIGA_MAC_VER_63] = rtl8125b_hw_phy_config,
 		[RTL_GIGA_MAC_VER_64] = rtl8125d_hw_phy_config,

From 9439db26d3ee4a897e5cd108864172531f31ce07 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Fri, 18 Apr 2025 12:40:49 +0200
Subject: [PATCH 431/770] net: airoha: Introduce airoha_irq_bank struct

The EN7581 ethernet SoC supports 4 programmable IRQ lines, each one
composed of 4 IRQ configuration registers. Add an airoha_irq_bank
struct as a container for the per-line IRQ info (e.g. IRQ number,
enabled source interrupts, etc.).

This is a preliminary patch to support multiple IRQ lines in the
airoha_eth driver.
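A minimal userspace sketch of the container shape introduced here
(simplified types, locking omitted, not the driver code): each bank
carries a back-pointer to its owner, and the bank index is recovered
by pointer arithmetic, the same "irq_bank - &qdma->irq_banks[0]"
idiom the patch uses.

#include <stdio.h>

#define NUM_BANKS	4
#define NUM_INT_REGS	5

struct qdma;

/* Per-IRQ-line state: one shadow mask set (and, in the driver, one
 * spinlock) per bank instead of one per device.
 */
struct irq_bank {
	struct qdma *qdma;	/* back-pointer to the owner */
	unsigned int irqmask[NUM_INT_REGS];
	int irq;
};

struct qdma {
	struct irq_bank irq_banks[NUM_BANKS];
};

/* Derive the bank number from the element's position in the array. */
static int bank_index(struct irq_bank *bank)
{
	return (int)(bank - &bank->qdma->irq_banks[0]);
}

int main(void)
{
	struct qdma q;

	for (int i = 0; i < NUM_BANKS; i++)
		q.irq_banks[i].qdma = &q;

	printf("index of bank 2: %d\n", bank_index(&q.irq_banks[2]));
	return 0;
}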
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250418-airoha-eth-multi-irq-v1-1-1ab0083ca3c1@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 106 ++++++++++++++-------- drivers/net/ethernet/airoha/airoha_eth.h | 13 ++- drivers/net/ethernet/airoha/airoha_regs.h | 11 ++- 3 files changed, 86 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index c773b5ea9c051..e1c1cf965c2f1 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -34,37 +34,40 @@ u32 airoha_rmw(void __iomem *base, u32 offset, u32 mask, u32 val) return val; } -static void airoha_qdma_set_irqmask(struct airoha_qdma *qdma, int index, - u32 clear, u32 set) +static void airoha_qdma_set_irqmask(struct airoha_irq_bank *irq_bank, + int index, u32 clear, u32 set) { + struct airoha_qdma *qdma = irq_bank->qdma; + int bank = irq_bank - &qdma->irq_banks[0]; unsigned long flags; - if (WARN_ON_ONCE(index >= ARRAY_SIZE(qdma->irqmask))) + if (WARN_ON_ONCE(index >= ARRAY_SIZE(irq_bank->irqmask))) return; - spin_lock_irqsave(&qdma->irq_lock, flags); + spin_lock_irqsave(&irq_bank->irq_lock, flags); - qdma->irqmask[index] &= ~clear; - qdma->irqmask[index] |= set; - airoha_qdma_wr(qdma, REG_INT_ENABLE(index), qdma->irqmask[index]); + irq_bank->irqmask[index] &= ~clear; + irq_bank->irqmask[index] |= set; + airoha_qdma_wr(qdma, REG_INT_ENABLE(bank, index), + irq_bank->irqmask[index]); /* Read irq_enable register in order to guarantee the update above * completes in the spinlock critical section. */ - airoha_qdma_rr(qdma, REG_INT_ENABLE(index)); + airoha_qdma_rr(qdma, REG_INT_ENABLE(bank, index)); - spin_unlock_irqrestore(&qdma->irq_lock, flags); + spin_unlock_irqrestore(&irq_bank->irq_lock, flags); } -static void airoha_qdma_irq_enable(struct airoha_qdma *qdma, int index, - u32 mask) +static void airoha_qdma_irq_enable(struct airoha_irq_bank *irq_bank, + int index, u32 mask) { - airoha_qdma_set_irqmask(qdma, index, 0, mask); + airoha_qdma_set_irqmask(irq_bank, index, 0, mask); } -static void airoha_qdma_irq_disable(struct airoha_qdma *qdma, int index, - u32 mask) +static void airoha_qdma_irq_disable(struct airoha_irq_bank *irq_bank, + int index, u32 mask) { - airoha_qdma_set_irqmask(qdma, index, mask, 0); + airoha_qdma_set_irqmask(irq_bank, index, mask, 0); } static bool airhoa_is_lan_gdm_port(struct airoha_gdm_port *port) @@ -732,6 +735,7 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget) { struct airoha_queue *q = container_of(napi, struct airoha_queue, napi); + struct airoha_irq_bank *irq_bank = &q->qdma->irq_banks[0]; int cur, done = 0; do { @@ -740,7 +744,7 @@ static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget) } while (cur && done < budget); if (done < budget && napi_complete(napi)) - airoha_qdma_irq_enable(q->qdma, QDMA_INT_REG_IDX1, + airoha_qdma_irq_enable(irq_bank, QDMA_INT_REG_IDX1, RX_DONE_INT_MASK); return done; @@ -944,7 +948,7 @@ static int airoha_qdma_tx_napi_poll(struct napi_struct *napi, int budget) } if (done < budget && napi_complete(napi)) - airoha_qdma_irq_enable(qdma, QDMA_INT_REG_IDX0, + airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX0, TX_DONE_INT_MASK(id)); return done; @@ -1175,13 +1179,16 @@ static int airoha_qdma_hw_init(struct airoha_qdma *qdma) int i; /* clear pending irqs */ - for (i = 0; i < 
ARRAY_SIZE(qdma->irqmask); i++) + for (i = 0; i < ARRAY_SIZE(qdma->irq_banks[0].irqmask); i++) airoha_qdma_wr(qdma, REG_INT_STATUS(i), 0xffffffff); /* setup irqs */ - airoha_qdma_irq_enable(qdma, QDMA_INT_REG_IDX0, INT_IDX0_MASK); - airoha_qdma_irq_enable(qdma, QDMA_INT_REG_IDX1, INT_IDX1_MASK); - airoha_qdma_irq_enable(qdma, QDMA_INT_REG_IDX4, INT_IDX4_MASK); + airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX0, + INT_IDX0_MASK); + airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX1, + INT_IDX1_MASK); + airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX4, + INT_IDX4_MASK); /* setup irq binding */ for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) { @@ -1226,13 +1233,14 @@ static int airoha_qdma_hw_init(struct airoha_qdma *qdma) static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) { - struct airoha_qdma *qdma = dev_instance; - u32 intr[ARRAY_SIZE(qdma->irqmask)]; + struct airoha_irq_bank *irq_bank = dev_instance; + struct airoha_qdma *qdma = irq_bank->qdma; + u32 intr[ARRAY_SIZE(irq_bank->irqmask)]; int i; - for (i = 0; i < ARRAY_SIZE(qdma->irqmask); i++) { + for (i = 0; i < ARRAY_SIZE(intr); i++) { intr[i] = airoha_qdma_rr(qdma, REG_INT_STATUS(i)); - intr[i] &= qdma->irqmask[i]; + intr[i] &= irq_bank->irqmask[i]; airoha_qdma_wr(qdma, REG_INT_STATUS(i), intr[i]); } @@ -1240,7 +1248,7 @@ static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) return IRQ_NONE; if (intr[1] & RX_DONE_INT_MASK) { - airoha_qdma_irq_disable(qdma, QDMA_INT_REG_IDX1, + airoha_qdma_irq_disable(irq_bank, QDMA_INT_REG_IDX1, RX_DONE_INT_MASK); for (i = 0; i < ARRAY_SIZE(qdma->q_rx); i++) { @@ -1257,7 +1265,7 @@ static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) if (!(intr[0] & TX_DONE_INT_MASK(i))) continue; - airoha_qdma_irq_disable(qdma, QDMA_INT_REG_IDX0, + airoha_qdma_irq_disable(irq_bank, QDMA_INT_REG_IDX0, TX_DONE_INT_MASK(i)); napi_schedule(&qdma->q_tx_irq[i].napi); } @@ -1266,6 +1274,39 @@ static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) return IRQ_HANDLED; } +static int airoha_qdma_init_irq_banks(struct platform_device *pdev, + struct airoha_qdma *qdma) +{ + struct airoha_eth *eth = qdma->eth; + int i, id = qdma - ð->qdma[0]; + + for (i = 0; i < ARRAY_SIZE(qdma->irq_banks); i++) { + struct airoha_irq_bank *irq_bank = &qdma->irq_banks[i]; + int err, irq_index = 4 * id + i; + const char *name; + + spin_lock_init(&irq_bank->irq_lock); + irq_bank->qdma = qdma; + + irq_bank->irq = platform_get_irq(pdev, irq_index); + if (irq_bank->irq < 0) + return irq_bank->irq; + + name = devm_kasprintf(eth->dev, GFP_KERNEL, + KBUILD_MODNAME ".%d", irq_index); + if (!name) + return -ENOMEM; + + err = devm_request_irq(eth->dev, irq_bank->irq, + airoha_irq_handler, IRQF_SHARED, name, + irq_bank); + if (err) + return err; + } + + return 0; +} + static int airoha_qdma_init(struct platform_device *pdev, struct airoha_eth *eth, struct airoha_qdma *qdma) @@ -1273,9 +1314,7 @@ static int airoha_qdma_init(struct platform_device *pdev, int err, id = qdma - ð->qdma[0]; const char *res; - spin_lock_init(&qdma->irq_lock); qdma->eth = eth; - res = devm_kasprintf(eth->dev, GFP_KERNEL, "qdma%d", id); if (!res) return -ENOMEM; @@ -1285,12 +1324,7 @@ static int airoha_qdma_init(struct platform_device *pdev, return dev_err_probe(eth->dev, PTR_ERR(qdma->regs), "failed to iomap qdma%d regs\n", id); - qdma->irq = platform_get_irq(pdev, 4 * id); - if (qdma->irq < 0) - return qdma->irq; - - err = devm_request_irq(eth->dev, qdma->irq, airoha_irq_handler, - IRQF_SHARED, 
KBUILD_MODNAME, qdma);
+	err = airoha_qdma_init_irq_banks(pdev, qdma);
 	if (err)
 		return err;

@@ -2784,7 +2818,7 @@ static int airoha_alloc_gdm_port(struct airoha_eth *eth,
 	dev->features |= dev->hw_features;
 	dev->vlan_features = dev->hw_features;
 	dev->dev.of_node = np;
-	dev->irq = qdma->irq;
+	dev->irq = qdma->irq_banks[0].irq;
 	SET_NETDEV_DEV(dev, eth->dev);

 	/* reserve hw queues for HTB offloading */
diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h
index da5371bcd1473..af263203d488e 100644
--- a/drivers/net/ethernet/airoha/airoha_eth.h
+++ b/drivers/net/ethernet/airoha/airoha_eth.h
@@ -17,6 +17,7 @@
 #define AIROHA_MAX_NUM_GDM_PORTS	4
 #define AIROHA_MAX_NUM_QDMA		2
+#define AIROHA_MAX_NUM_IRQ_BANKS	1
 #define AIROHA_MAX_DSA_PORTS		7
 #define AIROHA_MAX_NUM_RSTS		3
 #define AIROHA_MAX_NUM_XSI_RSTS		5
@@ -452,17 +453,23 @@ struct airoha_flow_table_entry {
 	unsigned long cookie;
 };

-struct airoha_qdma {
-	struct airoha_eth *eth;
-	void __iomem *regs;
+struct airoha_irq_bank {
+	struct airoha_qdma *qdma;

 	/* protect concurrent irqmask accesses */
 	spinlock_t irq_lock;
 	u32 irqmask[QDMA_INT_REG_MAX];
 	int irq;
+};
+
+struct airoha_qdma {
+	struct airoha_eth *eth;
+	void __iomem *regs;

 	atomic_t users;

+	struct airoha_irq_bank irq_banks[AIROHA_MAX_NUM_IRQ_BANKS];
+
 	struct airoha_tx_irq_queue q_tx_irq[AIROHA_NUM_TX_IRQ];

 	struct airoha_queue q_tx[AIROHA_NUM_TX_RING];
diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h
index 29c8f046b9910..1d99fe9f81124 100644
--- a/drivers/net/ethernet/airoha/airoha_regs.h
+++ b/drivers/net/ethernet/airoha/airoha_regs.h
@@ -423,11 +423,12 @@
 	 ((_n) == 2) ? 0x0720 :	\
 	 ((_n) == 1) ? 0x0024 : 0x0020)

-#define REG_INT_ENABLE(_n)	\
-	(((_n) == 4) ? 0x0750 :	\
-	 ((_n) == 3) ? 0x0744 :	\
-	 ((_n) == 2) ? 0x0740 :	\
-	 ((_n) == 1) ? 0x002c : 0x0028)
+#define REG_INT_ENABLE(_b, _n)			\
+	(((_n) == 4) ? 0x0750 + ((_b) << 5) :	\
+	 ((_n) == 3) ? 0x0744 + ((_b) << 5) :	\
+	 ((_n) == 2) ? 0x0740 + ((_b) << 5) :	\
+	 ((_n) == 1) ? 0x002c + ((_b) << 3) :	\
+		       0x0028 + ((_b) << 3))

 /* QDMA_CSR_INT_ENABLE1 */
 #define RX15_COHERENT_INT_MASK		BIT(31)

From f252493e1835366fc25ce631c3056f900977dd11 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi
Date: Fri, 18 Apr 2025 12:40:50 +0200
Subject: [PATCH 432/770] net: airoha: Enable multiple IRQ lines support in
 airoha_eth driver.

The EN7581 ethernet SoC supports 4 programmable IRQ lines for Tx and Rx
interrupts. Enable support for multiple IRQ lines. Map Rx/Tx queues to
the available IRQ lines using the default scheme used in the vendor SDK:

- IRQ0: rx queues [0-4], [7-9], 15
- IRQ1: rx queues [21-30]
- IRQ2: rx queue 5
- IRQ3: rx queue 6

Tx queue interrupts are managed by IRQ0.
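The mapping can be sanity-checked with a small standalone program (a
sketch built from the pin masks in the message above; the "low"/"high"
register names are simplified stand-ins for the IDX1/IDX2 status
registers): queues 0-15 report in the low register, queues 16-31 in the
high one with the bit position wrapped modulo 16.

#include <stdio.h>

/* Per-bank rx-queue pin masks: BIT(q) set in bank n's mask means
 * queue q interrupts on IRQ line n.
 */
static const unsigned int rx_bank_pin_mask[4] = {
	0x839f,		/* IRQ0: queues 0-4, 7-9, 15 */
	0x7fe00000,	/* IRQ1: queues 21-30 */
	0x20,		/* IRQ2: queue 5 */
	0x40,		/* IRQ3: queue 6 */
};

static void map_queue(unsigned int qid)
{
	for (int bank = 0; bank < 4; bank++) {
		if (!(rx_bank_pin_mask[bank] & (1u << qid)))
			continue;
		/* Low register covers queues 0-15, high covers 16-31. */
		printf("rx queue %2u -> IRQ%d, %s reg, bit %u\n", qid, bank,
		       qid < 16 ? "low" : "high", qid % 16);
	}
}

int main(void)
{
	map_queue(5);	/* IRQ2, low reg, bit 5 */
	map_queue(15);	/* IRQ0, low reg, bit 15 */
	map_queue(21);	/* IRQ1, high reg, bit 5 */
	return 0;
}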
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250418-airoha-eth-multi-irq-v1-2-1ab0083ca3c1@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/airoha/airoha_eth.c | 67 +++++--- drivers/net/ethernet/airoha/airoha_eth.h | 13 +- drivers/net/ethernet/airoha/airoha_regs.h | 185 +++++++++++++++++----- 3 files changed, 206 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c index e1c1cf965c2f1..16c7896f931fd 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.c +++ b/drivers/net/ethernet/airoha/airoha_eth.c @@ -735,7 +735,6 @@ static int airoha_qdma_rx_process(struct airoha_queue *q, int budget) static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget) { struct airoha_queue *q = container_of(napi, struct airoha_queue, napi); - struct airoha_irq_bank *irq_bank = &q->qdma->irq_banks[0]; int cur, done = 0; do { @@ -743,9 +742,20 @@ static int airoha_qdma_rx_napi_poll(struct napi_struct *napi, int budget) done += cur; } while (cur && done < budget); - if (done < budget && napi_complete(napi)) - airoha_qdma_irq_enable(irq_bank, QDMA_INT_REG_IDX1, - RX_DONE_INT_MASK); + if (done < budget && napi_complete(napi)) { + struct airoha_qdma *qdma = q->qdma; + int i, qid = q - &qdma->q_rx[0]; + int intr_reg = qid < RX_DONE_HIGH_OFFSET ? QDMA_INT_REG_IDX1 + : QDMA_INT_REG_IDX2; + + for (i = 0; i < ARRAY_SIZE(qdma->irq_banks); i++) { + if (!(BIT(qid) & RX_IRQ_BANK_PIN_MASK(i))) + continue; + + airoha_qdma_irq_enable(&qdma->irq_banks[i], intr_reg, + BIT(qid % RX_DONE_HIGH_OFFSET)); + } + } return done; } @@ -1178,17 +1188,24 @@ static int airoha_qdma_hw_init(struct airoha_qdma *qdma) { int i; - /* clear pending irqs */ - for (i = 0; i < ARRAY_SIZE(qdma->irq_banks[0].irqmask); i++) + for (i = 0; i < ARRAY_SIZE(qdma->irq_banks); i++) { + /* clear pending irqs */ airoha_qdma_wr(qdma, REG_INT_STATUS(i), 0xffffffff); - - /* setup irqs */ + /* setup rx irqs */ + airoha_qdma_irq_enable(&qdma->irq_banks[i], QDMA_INT_REG_IDX0, + INT_RX0_MASK(RX_IRQ_BANK_PIN_MASK(i))); + airoha_qdma_irq_enable(&qdma->irq_banks[i], QDMA_INT_REG_IDX1, + INT_RX1_MASK(RX_IRQ_BANK_PIN_MASK(i))); + airoha_qdma_irq_enable(&qdma->irq_banks[i], QDMA_INT_REG_IDX2, + INT_RX2_MASK(RX_IRQ_BANK_PIN_MASK(i))); + airoha_qdma_irq_enable(&qdma->irq_banks[i], QDMA_INT_REG_IDX3, + INT_RX3_MASK(RX_IRQ_BANK_PIN_MASK(i))); + } + /* setup tx irqs */ airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX0, - INT_IDX0_MASK); - airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX1, - INT_IDX1_MASK); + TX_COHERENT_LOW_INT_MASK | INT_TX_MASK); airoha_qdma_irq_enable(&qdma->irq_banks[0], QDMA_INT_REG_IDX4, - INT_IDX4_MASK); + TX_COHERENT_HIGH_INT_MASK); /* setup irq binding */ for (i = 0; i < ARRAY_SIZE(qdma->q_tx); i++) { @@ -1235,6 +1252,7 @@ static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) { struct airoha_irq_bank *irq_bank = dev_instance; struct airoha_qdma *qdma = irq_bank->qdma; + u32 rx_intr_mask = 0, rx_intr1, rx_intr2; u32 intr[ARRAY_SIZE(irq_bank->irqmask)]; int i; @@ -1247,17 +1265,24 @@ static irqreturn_t airoha_irq_handler(int irq, void *dev_instance) if (!test_bit(DEV_STATE_INITIALIZED, &qdma->eth->state)) return IRQ_NONE; - if (intr[1] & RX_DONE_INT_MASK) { - airoha_qdma_irq_disable(irq_bank, QDMA_INT_REG_IDX1, - RX_DONE_INT_MASK); + rx_intr1 = intr[1] & RX_DONE_LOW_INT_MASK; + if (rx_intr1) { + airoha_qdma_irq_disable(irq_bank, QDMA_INT_REG_IDX1, rx_intr1); + rx_intr_mask |= rx_intr1; + } - for (i 
= 0; i < ARRAY_SIZE(qdma->q_rx); i++) { - if (!qdma->q_rx[i].ndesc) - continue; + rx_intr2 = intr[2] & RX_DONE_HIGH_INT_MASK; + if (rx_intr2) { + airoha_qdma_irq_disable(irq_bank, QDMA_INT_REG_IDX2, rx_intr2); + rx_intr_mask |= (rx_intr2 << 16); + } - if (intr[1] & BIT(i)) - napi_schedule(&qdma->q_rx[i].napi); - } + for (i = 0; rx_intr_mask && i < ARRAY_SIZE(qdma->q_rx); i++) { + if (!qdma->q_rx[i].ndesc) + continue; + + if (rx_intr_mask & BIT(i)) + napi_schedule(&qdma->q_rx[i].napi); } if (intr[0] & INT_TX_MASK) { diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h index af263203d488e..53f39083a8b05 100644 --- a/drivers/net/ethernet/airoha/airoha_eth.h +++ b/drivers/net/ethernet/airoha/airoha_eth.h @@ -17,7 +17,7 @@ #define AIROHA_MAX_NUM_GDM_PORTS 4 #define AIROHA_MAX_NUM_QDMA 2 -#define AIROHA_MAX_NUM_IRQ_BANKS 1 +#define AIROHA_MAX_NUM_IRQ_BANKS 4 #define AIROHA_MAX_DSA_PORTS 7 #define AIROHA_MAX_NUM_RSTS 3 #define AIROHA_MAX_NUM_XSI_RSTS 5 @@ -453,6 +453,17 @@ struct airoha_flow_table_entry { unsigned long cookie; }; +/* RX queue to IRQ mapping: BIT(q) in IRQ(n) */ +#define RX_IRQ0_BANK_PIN_MASK 0x839f +#define RX_IRQ1_BANK_PIN_MASK 0x7fe00000 +#define RX_IRQ2_BANK_PIN_MASK 0x20 +#define RX_IRQ3_BANK_PIN_MASK 0x40 +#define RX_IRQ_BANK_PIN_MASK(_n) \ + (((_n) == 3) ? RX_IRQ3_BANK_PIN_MASK : \ + ((_n) == 2) ? RX_IRQ2_BANK_PIN_MASK : \ + ((_n) == 1) ? RX_IRQ1_BANK_PIN_MASK : \ + RX_IRQ0_BANK_PIN_MASK) + struct airoha_irq_bank { struct airoha_qdma *qdma; diff --git a/drivers/net/ethernet/airoha/airoha_regs.h b/drivers/net/ethernet/airoha/airoha_regs.h index 1d99fe9f81124..d931530fc96fb 100644 --- a/drivers/net/ethernet/airoha/airoha_regs.h +++ b/drivers/net/ethernet/airoha/airoha_regs.h @@ -463,6 +463,26 @@ #define IRQ0_FULL_INT_MASK BIT(1) #define IRQ0_INT_MASK BIT(0) +#define RX_COHERENT_LOW_INT_MASK \ + (RX15_COHERENT_INT_MASK | RX14_COHERENT_INT_MASK | \ + RX13_COHERENT_INT_MASK | RX12_COHERENT_INT_MASK | \ + RX11_COHERENT_INT_MASK | RX10_COHERENT_INT_MASK | \ + RX9_COHERENT_INT_MASK | RX8_COHERENT_INT_MASK | \ + RX7_COHERENT_INT_MASK | RX6_COHERENT_INT_MASK | \ + RX5_COHERENT_INT_MASK | RX4_COHERENT_INT_MASK | \ + RX3_COHERENT_INT_MASK | RX2_COHERENT_INT_MASK | \ + RX1_COHERENT_INT_MASK | RX0_COHERENT_INT_MASK) + +#define RX_COHERENT_LOW_OFFSET __ffs(RX_COHERENT_LOW_INT_MASK) +#define INT_RX0_MASK(_n) \ + (((_n) << RX_COHERENT_LOW_OFFSET) & RX_COHERENT_LOW_INT_MASK) + +#define TX_COHERENT_LOW_INT_MASK \ + (TX7_COHERENT_INT_MASK | TX6_COHERENT_INT_MASK | \ + TX5_COHERENT_INT_MASK | TX4_COHERENT_INT_MASK | \ + TX3_COHERENT_INT_MASK | TX2_COHERENT_INT_MASK | \ + TX1_COHERENT_INT_MASK | TX0_COHERENT_INT_MASK) + #define TX_DONE_INT_MASK(_n) \ ((_n) ? 
IRQ1_INT_MASK | IRQ1_FULL_INT_MASK \ : IRQ0_INT_MASK | IRQ0_FULL_INT_MASK) @@ -471,17 +491,6 @@ (IRQ1_INT_MASK | IRQ1_FULL_INT_MASK | \ IRQ0_INT_MASK | IRQ0_FULL_INT_MASK) -#define INT_IDX0_MASK \ - (TX0_COHERENT_INT_MASK | TX1_COHERENT_INT_MASK | \ - TX2_COHERENT_INT_MASK | TX3_COHERENT_INT_MASK | \ - TX4_COHERENT_INT_MASK | TX5_COHERENT_INT_MASK | \ - TX6_COHERENT_INT_MASK | TX7_COHERENT_INT_MASK | \ - RX0_COHERENT_INT_MASK | RX1_COHERENT_INT_MASK | \ - RX2_COHERENT_INT_MASK | RX3_COHERENT_INT_MASK | \ - RX4_COHERENT_INT_MASK | RX7_COHERENT_INT_MASK | \ - RX8_COHERENT_INT_MASK | RX9_COHERENT_INT_MASK | \ - RX15_COHERENT_INT_MASK | INT_TX_MASK) - /* QDMA_CSR_INT_ENABLE2 */ #define RX15_NO_CPU_DSCP_INT_MASK BIT(31) #define RX14_NO_CPU_DSCP_INT_MASK BIT(30) @@ -516,19 +525,121 @@ #define RX1_DONE_INT_MASK BIT(1) #define RX0_DONE_INT_MASK BIT(0) -#define RX_DONE_INT_MASK \ - (RX0_DONE_INT_MASK | RX1_DONE_INT_MASK | \ - RX2_DONE_INT_MASK | RX3_DONE_INT_MASK | \ - RX4_DONE_INT_MASK | RX7_DONE_INT_MASK | \ - RX8_DONE_INT_MASK | RX9_DONE_INT_MASK | \ - RX15_DONE_INT_MASK) -#define INT_IDX1_MASK \ - (RX_DONE_INT_MASK | \ - RX0_NO_CPU_DSCP_INT_MASK | RX1_NO_CPU_DSCP_INT_MASK | \ - RX2_NO_CPU_DSCP_INT_MASK | RX3_NO_CPU_DSCP_INT_MASK | \ - RX4_NO_CPU_DSCP_INT_MASK | RX7_NO_CPU_DSCP_INT_MASK | \ - RX8_NO_CPU_DSCP_INT_MASK | RX9_NO_CPU_DSCP_INT_MASK | \ - RX15_NO_CPU_DSCP_INT_MASK) +#define RX_NO_CPU_DSCP_LOW_INT_MASK \ + (RX15_NO_CPU_DSCP_INT_MASK | RX14_NO_CPU_DSCP_INT_MASK | \ + RX13_NO_CPU_DSCP_INT_MASK | RX12_NO_CPU_DSCP_INT_MASK | \ + RX11_NO_CPU_DSCP_INT_MASK | RX10_NO_CPU_DSCP_INT_MASK | \ + RX9_NO_CPU_DSCP_INT_MASK | RX8_NO_CPU_DSCP_INT_MASK | \ + RX7_NO_CPU_DSCP_INT_MASK | RX6_NO_CPU_DSCP_INT_MASK | \ + RX5_NO_CPU_DSCP_INT_MASK | RX4_NO_CPU_DSCP_INT_MASK | \ + RX3_NO_CPU_DSCP_INT_MASK | RX2_NO_CPU_DSCP_INT_MASK | \ + RX1_NO_CPU_DSCP_INT_MASK | RX0_NO_CPU_DSCP_INT_MASK) + +#define RX_DONE_LOW_INT_MASK \ + (RX15_DONE_INT_MASK | RX14_DONE_INT_MASK | \ + RX13_DONE_INT_MASK | RX12_DONE_INT_MASK | \ + RX11_DONE_INT_MASK | RX10_DONE_INT_MASK | \ + RX9_DONE_INT_MASK | RX8_DONE_INT_MASK | \ + RX7_DONE_INT_MASK | RX6_DONE_INT_MASK | \ + RX5_DONE_INT_MASK | RX4_DONE_INT_MASK | \ + RX3_DONE_INT_MASK | RX2_DONE_INT_MASK | \ + RX1_DONE_INT_MASK | RX0_DONE_INT_MASK) + +#define RX_NO_CPU_DSCP_LOW_OFFSET __ffs(RX_NO_CPU_DSCP_LOW_INT_MASK) +#define INT_RX1_MASK(_n) \ + ((((_n) << RX_NO_CPU_DSCP_LOW_OFFSET) & RX_NO_CPU_DSCP_LOW_INT_MASK) | \ + (RX_DONE_LOW_INT_MASK & (_n))) + +/* QDMA_CSR_INT_ENABLE3 */ +#define RX31_NO_CPU_DSCP_INT_MASK BIT(31) +#define RX30_NO_CPU_DSCP_INT_MASK BIT(30) +#define RX29_NO_CPU_DSCP_INT_MASK BIT(29) +#define RX28_NO_CPU_DSCP_INT_MASK BIT(28) +#define RX27_NO_CPU_DSCP_INT_MASK BIT(27) +#define RX26_NO_CPU_DSCP_INT_MASK BIT(26) +#define RX25_NO_CPU_DSCP_INT_MASK BIT(25) +#define RX24_NO_CPU_DSCP_INT_MASK BIT(24) +#define RX23_NO_CPU_DSCP_INT_MASK BIT(23) +#define RX22_NO_CPU_DSCP_INT_MASK BIT(22) +#define RX21_NO_CPU_DSCP_INT_MASK BIT(21) +#define RX20_NO_CPU_DSCP_INT_MASK BIT(20) +#define RX19_NO_CPU_DSCP_INT_MASK BIT(19) +#define RX18_NO_CPU_DSCP_INT_MASK BIT(18) +#define RX17_NO_CPU_DSCP_INT_MASK BIT(17) +#define RX16_NO_CPU_DSCP_INT_MASK BIT(16) +#define RX31_DONE_INT_MASK BIT(15) +#define RX30_DONE_INT_MASK BIT(14) +#define RX29_DONE_INT_MASK BIT(13) +#define RX28_DONE_INT_MASK BIT(12) +#define RX27_DONE_INT_MASK BIT(11) +#define RX26_DONE_INT_MASK BIT(10) +#define RX25_DONE_INT_MASK BIT(9) +#define RX24_DONE_INT_MASK BIT(8) +#define RX23_DONE_INT_MASK BIT(7) +#define 
RX22_DONE_INT_MASK BIT(6) +#define RX21_DONE_INT_MASK BIT(5) +#define RX20_DONE_INT_MASK BIT(4) +#define RX19_DONE_INT_MASK BIT(3) +#define RX18_DONE_INT_MASK BIT(2) +#define RX17_DONE_INT_MASK BIT(1) +#define RX16_DONE_INT_MASK BIT(0) + +#define RX_NO_CPU_DSCP_HIGH_INT_MASK \ + (RX31_NO_CPU_DSCP_INT_MASK | RX30_NO_CPU_DSCP_INT_MASK | \ + RX29_NO_CPU_DSCP_INT_MASK | RX28_NO_CPU_DSCP_INT_MASK | \ + RX27_NO_CPU_DSCP_INT_MASK | RX26_NO_CPU_DSCP_INT_MASK | \ + RX25_NO_CPU_DSCP_INT_MASK | RX24_NO_CPU_DSCP_INT_MASK | \ + RX23_NO_CPU_DSCP_INT_MASK | RX22_NO_CPU_DSCP_INT_MASK | \ + RX21_NO_CPU_DSCP_INT_MASK | RX20_NO_CPU_DSCP_INT_MASK | \ + RX19_NO_CPU_DSCP_INT_MASK | RX18_NO_CPU_DSCP_INT_MASK | \ + RX17_NO_CPU_DSCP_INT_MASK | RX16_NO_CPU_DSCP_INT_MASK) + +#define RX_DONE_HIGH_INT_MASK \ + (RX31_DONE_INT_MASK | RX30_DONE_INT_MASK | \ + RX29_DONE_INT_MASK | RX28_DONE_INT_MASK | \ + RX27_DONE_INT_MASK | RX26_DONE_INT_MASK | \ + RX25_DONE_INT_MASK | RX24_DONE_INT_MASK | \ + RX23_DONE_INT_MASK | RX22_DONE_INT_MASK | \ + RX21_DONE_INT_MASK | RX20_DONE_INT_MASK | \ + RX19_DONE_INT_MASK | RX18_DONE_INT_MASK | \ + RX17_DONE_INT_MASK | RX16_DONE_INT_MASK) + +#define RX_DONE_INT_MASK (RX_DONE_HIGH_INT_MASK | RX_DONE_LOW_INT_MASK) +#define RX_DONE_HIGH_OFFSET fls(RX_DONE_HIGH_INT_MASK) + +#define INT_RX2_MASK(_n) \ + ((RX_NO_CPU_DSCP_HIGH_INT_MASK & (_n)) | \ + (((_n) >> RX_DONE_HIGH_OFFSET) & RX_DONE_HIGH_INT_MASK)) + +/* QDMA_CSR_INT_ENABLE4 */ +#define RX31_COHERENT_INT_MASK BIT(31) +#define RX30_COHERENT_INT_MASK BIT(30) +#define RX29_COHERENT_INT_MASK BIT(29) +#define RX28_COHERENT_INT_MASK BIT(28) +#define RX27_COHERENT_INT_MASK BIT(27) +#define RX26_COHERENT_INT_MASK BIT(26) +#define RX25_COHERENT_INT_MASK BIT(25) +#define RX24_COHERENT_INT_MASK BIT(24) +#define RX23_COHERENT_INT_MASK BIT(23) +#define RX22_COHERENT_INT_MASK BIT(22) +#define RX21_COHERENT_INT_MASK BIT(21) +#define RX20_COHERENT_INT_MASK BIT(20) +#define RX19_COHERENT_INT_MASK BIT(19) +#define RX18_COHERENT_INT_MASK BIT(18) +#define RX17_COHERENT_INT_MASK BIT(17) +#define RX16_COHERENT_INT_MASK BIT(16) + +#define RX_COHERENT_HIGH_INT_MASK \ + (RX31_COHERENT_INT_MASK | RX30_COHERENT_INT_MASK | \ + RX29_COHERENT_INT_MASK | RX28_COHERENT_INT_MASK | \ + RX27_COHERENT_INT_MASK | RX26_COHERENT_INT_MASK | \ + RX25_COHERENT_INT_MASK | RX24_COHERENT_INT_MASK | \ + RX23_COHERENT_INT_MASK | RX22_COHERENT_INT_MASK | \ + RX21_COHERENT_INT_MASK | RX20_COHERENT_INT_MASK | \ + RX19_COHERENT_INT_MASK | RX18_COHERENT_INT_MASK | \ + RX17_COHERENT_INT_MASK | RX16_COHERENT_INT_MASK) + +#define INT_RX3_MASK(_n) (RX_COHERENT_HIGH_INT_MASK & (_n)) /* QDMA_CSR_INT_ENABLE5 */ #define TX31_COHERENT_INT_MASK BIT(31) @@ -556,19 +667,19 @@ #define TX9_COHERENT_INT_MASK BIT(9) #define TX8_COHERENT_INT_MASK BIT(8) -#define INT_IDX4_MASK \ - (TX8_COHERENT_INT_MASK | TX9_COHERENT_INT_MASK | \ - TX10_COHERENT_INT_MASK | TX11_COHERENT_INT_MASK | \ - TX12_COHERENT_INT_MASK | TX13_COHERENT_INT_MASK | \ - TX14_COHERENT_INT_MASK | TX15_COHERENT_INT_MASK | \ - TX16_COHERENT_INT_MASK | TX17_COHERENT_INT_MASK | \ - TX18_COHERENT_INT_MASK | TX19_COHERENT_INT_MASK | \ - TX20_COHERENT_INT_MASK | TX21_COHERENT_INT_MASK | \ - TX22_COHERENT_INT_MASK | TX23_COHERENT_INT_MASK | \ - TX24_COHERENT_INT_MASK | TX25_COHERENT_INT_MASK | \ - TX26_COHERENT_INT_MASK | TX27_COHERENT_INT_MASK | \ - TX28_COHERENT_INT_MASK | TX29_COHERENT_INT_MASK | \ - TX30_COHERENT_INT_MASK | TX31_COHERENT_INT_MASK) +#define TX_COHERENT_HIGH_INT_MASK \ + (TX31_COHERENT_INT_MASK | TX30_COHERENT_INT_MASK | \ + 
TX29_COHERENT_INT_MASK | TX28_COHERENT_INT_MASK | \ + TX27_COHERENT_INT_MASK | TX26_COHERENT_INT_MASK | \ + TX25_COHERENT_INT_MASK | TX24_COHERENT_INT_MASK | \ + TX23_COHERENT_INT_MASK | TX22_COHERENT_INT_MASK | \ + TX21_COHERENT_INT_MASK | TX20_COHERENT_INT_MASK | \ + TX19_COHERENT_INT_MASK | TX18_COHERENT_INT_MASK | \ + TX17_COHERENT_INT_MASK | TX16_COHERENT_INT_MASK | \ + TX15_COHERENT_INT_MASK | TX14_COHERENT_INT_MASK | \ + TX13_COHERENT_INT_MASK | TX12_COHERENT_INT_MASK | \ + TX11_COHERENT_INT_MASK | TX10_COHERENT_INT_MASK | \ + TX9_COHERENT_INT_MASK | TX8_COHERENT_INT_MASK) #define REG_TX_IRQ_BASE(_n) ((_n) ? 0x0048 : 0x0050) From b5cdb9b3113e6633d84f5598b5010794e86a4d35 Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Sun, 20 Apr 2025 08:53:50 +0530 Subject: [PATCH 433/770] octeontx2-pf: AF_XDP: code clean up The current API, otx2_xdp_sq_append_pkt, verifies the number of available descriptors before sending packets to the hardware. However, for AF_XDP, this check is unnecessary because the batch value is already determined based on the free descriptors. This patch introduces a new API, "otx2_xsk_sq_append_pkt" to address this. Remove the logic for releasing the TX buffers, as it is implicitly handled by xsk_tx_peek_release_desc_batch Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250420032350.4047706-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- .../marvell/octeontx2/nic/otx2_common.h | 2 + .../marvell/octeontx2/nic/otx2_txrx.c | 5 +-- .../ethernet/marvell/octeontx2/nic/otx2_xsk.c | 42 ++++++++++++++----- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 1e88422825be7..7e3ddb0bee120 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -1107,6 +1107,8 @@ int otx2_enable_rxvlan(struct otx2_nic *pf, bool enable); int otx2_install_rxvlan_offload_flow(struct otx2_nic *pfvf); bool otx2_xdp_sq_append_pkt(struct otx2_nic *pfvf, struct xdp_frame *xdpf, u64 iova, int len, u16 qidx, u16 flags); +void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, struct xdp_frame *xdpf, + u64 dma_addr, int len, int *offset, u16 flags); u16 otx2_get_max_mtu(struct otx2_nic *pfvf); int otx2_handle_ntuple_tc_features(struct net_device *netdev, netdev_features_t features); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 0a6bb346ba45b..9593627b35a3e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1410,9 +1410,8 @@ void otx2_free_pending_sqe(struct otx2_nic *pfvf) } } -static void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, - struct xdp_frame *xdpf, - u64 dma_addr, int len, int *offset, u16 flags) +void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, struct xdp_frame *xdpf, + u64 dma_addr, int len, int *offset, u16 flags) { struct nix_sqe_sg_s *sg = NULL; u64 *iova = NULL; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c index ce10caea8511d..b328aae23d731 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_xsk.c @@ -11,6 +11,7 @@ #include #include "otx2_common.h" +#include "otx2_struct.h" #include "otx2_xsk.h" int otx2_xsk_pool_alloc_buf(struct otx2_nic *pfvf, 
struct otx2_pool *pool, @@ -196,11 +197,39 @@ void otx2_attach_xsk_buff(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, int sq->xsk_pool = xsk_get_pool_from_qid(pfvf->netdev, qidx); } +static void otx2_xsk_sq_append_pkt(struct otx2_nic *pfvf, u64 iova, int len, + u16 qidx) +{ + struct nix_sqe_hdr_s *sqe_hdr; + struct otx2_snd_queue *sq; + int offset; + + sq = &pfvf->qset.sq[qidx]; + memset(sq->sqe_base + 8, 0, sq->sqe_size - 8); + + sqe_hdr = (struct nix_sqe_hdr_s *)(sq->sqe_base); + + if (!sqe_hdr->total) { + sqe_hdr->aura = sq->aura_id; + sqe_hdr->df = 1; + sqe_hdr->sq = qidx; + sqe_hdr->pnc = 1; + } + sqe_hdr->total = len; + sqe_hdr->sqe_id = sq->head; + + offset = sizeof(*sqe_hdr); + + otx2_xdp_sqe_add_sg(sq, NULL, iova, len, &offset, OTX2_AF_XDP_FRAME); + sqe_hdr->sizem1 = (offset / 16) - 1; + pfvf->hw_ops->sqe_flush(pfvf, sq, offset, qidx); +} + void otx2_zc_napi_handler(struct otx2_nic *pfvf, struct xsk_buff_pool *pool, int queue, int budget) { struct xdp_desc *xdp_desc = pool->tx_descs; - int err, i, work_done = 0, batch; + int i, batch; budget = min(budget, otx2_read_free_sqe(pfvf, queue)); batch = xsk_tx_peek_release_desc_batch(pool, budget); @@ -211,15 +240,6 @@ void otx2_zc_napi_handler(struct otx2_nic *pfvf, struct xsk_buff_pool *pool, dma_addr_t dma_addr; dma_addr = xsk_buff_raw_get_dma(pool, xdp_desc[i].addr); - err = otx2_xdp_sq_append_pkt(pfvf, NULL, dma_addr, xdp_desc[i].len, - queue, OTX2_AF_XDP_FRAME); - if (!err) { - netdev_err(pfvf->netdev, "AF_XDP: Unable to transfer packet err%d\n", err); - break; - } - work_done++; + otx2_xsk_sq_append_pkt(pfvf, dma_addr, xdp_desc[i].len, queue); } - - if (work_done) - xsk_tx_release(pool); } From 5f2f8d8b6800e4fc760c2eccec9b2bd2cacf80cf Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Tue, 22 Apr 2025 12:25:38 +0300 Subject: [PATCH 434/770] net/mlx5: HWS, Fix IP version decision Unify the check for IP version when creating a definer. A given matcher is deemed to match on IPv6 if any of the higher order (>31) bits of source or destination address mask are set. A single packet cannot mix IP versions between source and destination addresses, so it makes no sense that they would be decided on independently. 
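A standalone sketch of the unified decision (simplified types, not the
mlx5 code): in the match layout an IPv4 address occupies only the low
32-bit word of the IPv6-sized field, so any set bit in the three high
words of either address mask selects IPv6 for both directions at once.

#include <stdbool.h>
#include <stdio.h>

/* Match masks use the IPv6 layout: four 32-bit words, with an IPv4
 * address living entirely in the last word.
 */
static bool is_ipv6_match(const unsigned int src[4], const unsigned int dst[4])
{
	/* Any high-order bit in either address means IPv6; src and dst
	 * are decided together since one packet has a single IP version.
	 */
	return src[0] || src[1] || src[2] || dst[0] || dst[1] || dst[2];
}

int main(void)
{
	unsigned int v4_src[4] = { 0, 0, 0, 0xffffffff };
	unsigned int v6_dst[4] = { 0xffffffff, 0xffffffff, 0, 0 };
	unsigned int zero[4] = { 0, 0, 0, 0 };

	printf("v4 src only: %d\n", is_ipv6_match(v4_src, zero));
	printf("v6 dst only: %d\n", is_ipv6_match(zero, v6_dst));
	printf("v4 src + v6 dst, decided as one: %d\n",
	       is_ipv6_match(v4_src, v6_dst));
	return 0;
}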
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250422092540.182091-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/definer.c | 38 ++++++++----------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index c8cc0c8115f53..5257e706dde2d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -509,9 +509,9 @@ static int hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { - bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; + bool is_ipv6, smac_set, dmac_set; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) || @@ -570,10 +570,10 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, outer_headers.dst_ipv4_dst_ipv6.ipv6_layout); /* Assume IPv6 is used if ipv6 bits are set */ - is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2]; - is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || + d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; - if (is_s_ipv6) { + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_O, outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -587,13 +587,6 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_O, outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_src_outer.ipv6_address_31_0); - } else { - /* Handle IPv4 source address */ - HWS_SET_HDR(fc, match_param, IPV4_SRC_O, - outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, - ipv4_src_dest_outer.source_address); - } - if (is_d_ipv6) { /* Handle IPv6 destination address */ HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_O, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -608,6 +601,10 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_dst_outer.ipv6_address_31_0); } else { + /* Handle IPv4 source address */ + HWS_SET_HDR(fc, match_param, IPV4_SRC_O, + outer_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, + ipv4_src_dest_outer.source_address); /* Handle IPv4 destination address */ HWS_SET_HDR(fc, match_param, IPV4_DST_O, outer_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, @@ -665,9 +662,9 @@ static int hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { - bool is_s_ipv6, is_d_ipv6, smac_set, dmac_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; + bool is_ipv6, smac_set, dmac_set; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) || @@ -728,10 +725,10 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, inner_headers.dst_ipv4_dst_ipv6.ipv6_layout); /* Assume IPv6 is used if ipv6 bits are set */ - is_s_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2]; - is_d_ipv6 = d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || + d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; - if (is_s_ipv6) { + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_I, inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_127_96, 
@@ -745,13 +742,6 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, HWS_SET_HDR(fc, match_param, IPV6_SRC_31_0_I, inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_src_inner.ipv6_address_31_0); - } else { - /* Handle IPv4 source address */ - HWS_SET_HDR(fc, match_param, IPV4_SRC_I, - inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, - ipv4_src_dest_inner.source_address); - } - if (is_d_ipv6) { /* Handle IPv6 destination address */ HWS_SET_HDR(fc, match_param, IPV6_DST_127_96_I, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_127_96, @@ -766,6 +756,10 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, ipv6_dst_inner.ipv6_address_31_0); } else { + /* Handle IPv4 source address */ + HWS_SET_HDR(fc, match_param, IPV4_SRC_I, + inner_headers.src_ipv4_src_ipv6.ipv6_simple_layout.ipv6_31_0, + ipv4_src_dest_inner.source_address); /* Handle IPv4 destination address */ HWS_SET_HDR(fc, match_param, IPV4_DST_I, inner_headers.dst_ipv4_dst_ipv6.ipv6_simple_layout.ipv6_31_0, From 6991a975e416154576b0f5f06256aec13e23b0a7 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Tue, 22 Apr 2025 12:25:39 +0300 Subject: [PATCH 435/770] net/mlx5: HWS, Harden IP version definer checks Replicate some sanity checks that firmware does, since hardware steering does not go through firmware. When creating a definer, disallow matching on IP addresses without also matching on IP version. The latter can be satisfied by matching either on the version field in the IP header, or on the ethertype field. Also refuse to match IPv4 IHL alongside IPv6. Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250422092540.182091-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/definer.c | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c index 5257e706dde2d..1061a46811ac0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c @@ -509,9 +509,9 @@ static int hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { + bool is_ipv6, smac_set, dmac_set, ip_addr_set, ip_ver_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; - bool is_ipv6, smac_set, dmac_set; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) || @@ -521,6 +521,20 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, return -EINVAL; } + ip_addr_set = HWS_IS_FLD_SET_SZ(match_param, + outer_headers.src_ipv4_src_ipv6, + 0x80) || + HWS_IS_FLD_SET_SZ(match_param, + outer_headers.dst_ipv4_dst_ipv6, 0x80); + ip_ver_set = HWS_IS_FLD_SET(match_param, outer_headers.ip_version) || + HWS_IS_FLD_SET(match_param, outer_headers.ethertype); + + if (ip_addr_set && !ip_ver_set) { + mlx5hws_err(cd->ctx, + "Unsupported match on IP address without version or ethertype\n"); + return -EINVAL; + } + /* L2 Check ethertype */ HWS_SET_HDR(fc, match_param, ETH_TYPE_O, outer_headers.ethertype, @@ -573,6 +587,12 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd, is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + /* IHL is an IPv4-specific field. 
*/ + if (is_ipv6 && HWS_IS_FLD_SET(match_param, outer_headers.ipv4_ihl)) { + mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n"); + return -EINVAL; + } + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_O, @@ -662,9 +682,9 @@ static int hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, u32 *match_param) { + bool is_ipv6, smac_set, dmac_set, ip_addr_set, ip_ver_set; struct mlx5hws_definer_fc *fc = cd->fc; struct mlx5hws_definer_fc *curr_fc; - bool is_ipv6, smac_set, dmac_set; u32 *s_ipv6, *d_ipv6; if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) || @@ -674,6 +694,20 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, return -EINVAL; } + ip_addr_set = HWS_IS_FLD_SET_SZ(match_param, + inner_headers.src_ipv4_src_ipv6, + 0x80) || + HWS_IS_FLD_SET_SZ(match_param, + inner_headers.dst_ipv4_dst_ipv6, 0x80); + ip_ver_set = HWS_IS_FLD_SET(match_param, inner_headers.ip_version) || + HWS_IS_FLD_SET(match_param, inner_headers.ethertype); + + if (ip_addr_set && !ip_ver_set) { + mlx5hws_err(cd->ctx, + "Unsupported match on IP address without version or ethertype\n"); + return -EINVAL; + } + /* L2 Check ethertype */ HWS_SET_HDR(fc, match_param, ETH_TYPE_I, inner_headers.ethertype, @@ -728,6 +762,12 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd, is_ipv6 = s_ipv6[0] || s_ipv6[1] || s_ipv6[2] || d_ipv6[0] || d_ipv6[1] || d_ipv6[2]; + /* IHL is an IPv4-specific field. */ + if (is_ipv6 && HWS_IS_FLD_SET(match_param, inner_headers.ipv4_ihl)) { + mlx5hws_err(cd->ctx, "Unsupported match on IPv6 address and IPv4 IHL\n"); + return -EINVAL; + } + if (is_ipv6) { /* Handle IPv6 source address */ HWS_SET_HDR(fc, match_param, IPV6_SRC_127_96_I, From f41f3edf0b15d7ce0b0f71c00a6125e8d7ca735f Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Tue, 22 Apr 2025 12:25:40 +0300 Subject: [PATCH 436/770] net/mlx5: HWS, Disallow matcher IP version mixing Signal clearly to the user, via an error, that mixing IPv4 and IPv6 rules in the same matcher is not supported. Previously such cases silently failed by adding a rule that did not work correctly. Rules can specify an IP version by one of two fields: IP version or ethertype. At matcher creation, store whether the template matches on any of these two fields. If yes, inspect each rule for its corresponding match value and store the IP version inside the matcher to guard against inconsistencies with subsequent rules. Furthermore, also check rules for internal consistency, i.e. verify that the ethertype and IP version match values do not contradict each other. The logic applies to inner and outer headers independently, to account for tunneling. Rules that do not match on IP addresses are not affected. 
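A minimal sketch of the latch described above (reduced to ethertype
only, with illustrative error values, not the mlx5 code): the first
rule that carries a definite IP version pins it into the matcher, and
every later rule must agree or is rejected.

#include <stdio.h>

enum { IPV_UNSET, IPV_4, IPV_6 };

struct matcher { int ip_version; };	/* starts as IPV_UNSET */

static int ethertype_to_ipv(unsigned int ethertype)
{
	switch (ethertype) {
	case 0x0800: return IPV_4;	/* ETH_P_IP */
	case 0x86DD: return IPV_6;	/* ETH_P_IPV6 */
	default:     return IPV_UNSET;
	}
}

/* The first rule with a definite version latches it into the matcher;
 * later rules must agree or are rejected, mirroring the patch.
 */
static int check_rule(struct matcher *m, unsigned int ethertype)
{
	int ipv = ethertype_to_ipv(ethertype);

	if (ipv == IPV_UNSET)
		return 0;		/* rule doesn't constrain version */
	if (m->ip_version != IPV_UNSET && m->ip_version != ipv)
		return -1;		/* matcher and rule disagree */
	m->ip_version = ipv;
	return 0;
}

int main(void)
{
	struct matcher m = { IPV_UNSET };

	printf("IPv4 rule: %d\n", check_rule(&m, 0x0800));	/* latches v4 */
	printf("IPv6 rule: %d\n", check_rule(&m, 0x86DD));	/* rejected */
	return 0;
}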
Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250422092540.182091-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/matcher.c | 26 ++++ .../mellanox/mlx5/core/steering/hws/matcher.h | 12 ++ .../mellanox/mlx5/core/steering/hws/rule.c | 122 ++++++++++++++++++ 3 files changed, 160 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index 716502732d3da..5b0c1623499bf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -385,6 +385,30 @@ static int hws_matcher_bind_at(struct mlx5hws_matcher *matcher) return 0; } +static void hws_matcher_set_ip_version_match(struct mlx5hws_matcher *matcher) +{ + int i; + + for (i = 0; i < matcher->mt->fc_sz; i++) { + switch (matcher->mt->fc[i].fname) { + case MLX5HWS_DEFINER_FNAME_ETH_TYPE_O: + matcher->matches_outer_ethertype = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_O: + matcher->matches_outer_ip_version = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_TYPE_I: + matcher->matches_inner_ethertype = 1; + break; + case MLX5HWS_DEFINER_FNAME_ETH_L3_TYPE_I: + matcher->matches_inner_ip_version = 1; + break; + default: + break; + } + } +} + static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) { struct mlx5hws_context *ctx = matcher->tbl->ctx; @@ -401,6 +425,8 @@ static int hws_matcher_bind_mt(struct mlx5hws_matcher *matcher) } } + hws_matcher_set_ip_version_match(matcher); + /* Create an STE pool per matcher*/ pool_attr.table_type = matcher->tbl->type; pool_attr.pool_type = MLX5HWS_POOL_TYPE_STE; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h index bad1fa8f77fd2..8e95158a66b54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.h @@ -50,6 +50,12 @@ struct mlx5hws_matcher_match_ste { struct mlx5hws_pool *pool; }; +enum { + MLX5HWS_MATCHER_IPV_UNSET = 0, + MLX5HWS_MATCHER_IPV_4 = 1, + MLX5HWS_MATCHER_IPV_6 = 2, +}; + struct mlx5hws_matcher { struct mlx5hws_table *tbl; struct mlx5hws_matcher_attr attr; @@ -61,6 +67,12 @@ struct mlx5hws_matcher { u8 num_of_action_stes; /* enum mlx5hws_matcher_flags */ u8 flags; + u8 matches_outer_ethertype:1; + u8 matches_outer_ip_version:1; + u8 matches_inner_ethertype:1; + u8 matches_inner_ip_version:1; + u8 outer_ip_version:2; + u8 inner_ip_version:2; u32 end_ft_id; struct mlx5hws_matcher *col_matcher; struct mlx5hws_matcher *resize_dst; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c index 9e6f35d684456..5342a4cc71942 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/rule.c @@ -655,6 +655,124 @@ int mlx5hws_rule_move_hws_add(struct mlx5hws_rule *rule, return 0; } +static u8 hws_rule_ethertype_to_matcher_ipv(u32 ethertype) +{ + switch (ethertype) { + case ETH_P_IP: + return MLX5HWS_MATCHER_IPV_4; + case ETH_P_IPV6: + return MLX5HWS_MATCHER_IPV_6; + default: + return MLX5HWS_MATCHER_IPV_UNSET; + } +} + +static u8 hws_rule_ip_version_to_matcher_ipv(u32 ip_version) +{ + switch (ip_version) { + case 4: + return MLX5HWS_MATCHER_IPV_4; + case 6: + return 
MLX5HWS_MATCHER_IPV_6; + default: + return MLX5HWS_MATCHER_IPV_UNSET; + } +} + +static int hws_rule_check_outer_ip_version(struct mlx5hws_matcher *matcher, + u32 *match_param) +{ + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 outer_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET; + u8 outer_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET; + u8 outer_ipv, ver; + + if (matcher->matches_outer_ethertype) { + ver = MLX5_GET(fte_match_param, match_param, + outer_headers.ethertype); + outer_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver); + } + if (matcher->matches_outer_ip_version) { + ver = MLX5_GET(fte_match_param, match_param, + outer_headers.ip_version); + outer_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver); + } + + if (outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv_ether != outer_ipv_ip) { + mlx5hws_err(ctx, "Rule matches on inconsistent outer ethertype and ip version\n"); + return -EINVAL; + } + + outer_ipv = outer_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ? + outer_ipv_ether : outer_ipv_ip; + if (outer_ipv != MLX5HWS_MATCHER_IPV_UNSET && + matcher->outer_ip_version != MLX5HWS_MATCHER_IPV_UNSET && + outer_ipv != matcher->outer_ip_version) { + mlx5hws_err(ctx, "Matcher and rule disagree on outer IP version\n"); + return -EINVAL; + } + matcher->outer_ip_version = outer_ipv; + + return 0; +} + +static int hws_rule_check_inner_ip_version(struct mlx5hws_matcher *matcher, + u32 *match_param) +{ + struct mlx5hws_context *ctx = matcher->tbl->ctx; + u8 inner_ipv_ether = MLX5HWS_MATCHER_IPV_UNSET; + u8 inner_ipv_ip = MLX5HWS_MATCHER_IPV_UNSET; + u8 inner_ipv, ver; + + if (matcher->matches_inner_ethertype) { + ver = MLX5_GET(fte_match_param, match_param, + inner_headers.ethertype); + inner_ipv_ether = hws_rule_ethertype_to_matcher_ipv(ver); + } + if (matcher->matches_inner_ip_version) { + ver = MLX5_GET(fte_match_param, match_param, + inner_headers.ip_version); + inner_ipv_ip = hws_rule_ip_version_to_matcher_ipv(ver); + } + + if (inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET && + inner_ipv_ip != MLX5HWS_MATCHER_IPV_UNSET && + inner_ipv_ether != inner_ipv_ip) { + mlx5hws_err(ctx, "Rule matches on inconsistent inner ethertype and ip version\n"); + return -EINVAL; + } + + inner_ipv = inner_ipv_ether != MLX5HWS_MATCHER_IPV_UNSET ? 
+		    inner_ipv_ether : inner_ipv_ip;
+	if (inner_ipv != MLX5HWS_MATCHER_IPV_UNSET &&
+	    matcher->inner_ip_version != MLX5HWS_MATCHER_IPV_UNSET &&
+	    inner_ipv != matcher->inner_ip_version) {
+		mlx5hws_err(ctx, "Matcher and rule disagree on inner IP version\n");
+		return -EINVAL;
+	}
+	matcher->inner_ip_version = inner_ipv;
+
+	return 0;
+}
+
+static int hws_rule_check_ip_version(struct mlx5hws_matcher *matcher,
+				     u32 *match_param)
+{
+	int ret;
+
+	ret = hws_rule_check_outer_ip_version(matcher, match_param);
+	if (unlikely(ret))
+		return ret;
+
+	ret = hws_rule_check_inner_ip_version(matcher, match_param);
+	if (unlikely(ret))
+		return ret;
+
+	return 0;
+}
+
 int mlx5hws_rule_create(struct mlx5hws_matcher *matcher,
 			u8 mt_idx,
 			u32 *match_param,
@@ -665,6 +783,10 @@ int mlx5hws_rule_create(struct mlx5hws_matcher *matcher,
 {
 	int ret;

+	ret = hws_rule_check_ip_version(matcher, match_param);
+	if (unlikely(ret))
+		return ret;
+
 	rule_handle->matcher = matcher;

 	ret = hws_rule_enqueue_precheck_create(rule_handle, attr);

From e3f506b78d921e48a00d005bea5c45ec36a99240 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan
Date: Wed, 23 Apr 2025 13:51:54 +0530
Subject: [PATCH 437/770] powerpc/boot: Fix dash warning

'commit b2accfe7ca5b ("powerpc/boot: Check for ld-option support")'
suppressed linker warnings, but the expression used did not go well
with POSIX shell (dash), resulting in this warning:

arch/powerpc/boot/wrapper: 237: [: 0: unexpected operator
ld: warning: arch/powerpc/boot/zImage.epapr has a LOAD segment with RWX permissions

Fix the check to handle the reported warning. The patch also fixes a
couple of shellcheck-reported errors for the same line.

In arch/powerpc/boot/wrapper line 237:
if [ $(${CROSS}ld -v --no-warn-rwx-segments &>/dev/null; echo $?) -eq 0 ]; then
^-- SC2046 (warning): Quote this to prevent word splitting.
^------^ SC2086 (info): Double quote to prevent globbing and word splitting.
^---------^ SC3020 (warning): In POSIX sh, &> is undefined.

Fixes: b2accfe7ca5b ("powerpc/boot: Check for ld-option support")
Reported-by: Stephen Rothwell
Suggested-by: Stephen Rothwell
Tested-by: Venkat Rao Bagalkote
Reviewed-by: Stephen Rothwell
Signed-off-by: Madhavan Srinivasan
Link: https://patch.msgid.link/20250423082154.30625-1-maddy@linux.ibm.com
---
 arch/powerpc/boot/wrapper | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper
index 267ca6d4d9b38..3d8dc822282ac 100755
--- a/arch/powerpc/boot/wrapper
+++ b/arch/powerpc/boot/wrapper
@@ -234,7 +234,7 @@ fi

 # suppress some warnings in recent ld versions
 nowarn="-z noexecstack"
-if [ $(${CROSS}ld -v --no-warn-rwx-segments &>/dev/null; echo $?) -eq 0 ]; then
+if "${CROSS}ld" -v --no-warn-rwx-segments >/dev/null 2>&1; then
 	nowarn="$nowarn --no-warn-rwx-segments"
 fi

From 4cb4861d8c3b3b4da478573dc6c03899b526bf08 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Thu, 17 Apr 2025 17:03:42 -0700
Subject: [PATCH 438/770] ipv6: Validate RTA_GATEWAY of RTA_MULTIPATH in
 rtm_to_fib6_config().

We will perform RTM_NEWROUTE and RTM_DELROUTE under RCU, and then we
want to perform some validation outside the RCU scope.

When creating / removing an IPv6 route with RTA_MULTIPATH,
inet6_rtm_newroute() / inet6_rtm_delroute() validates RTA_GATEWAY in
each multipath entry. Let's do that in rtm_to_fib6_config().

Note that now RTM_DELROUTE returns an error for RTA_MULTIPATH with 0
entries, which was previously accepted but should result in -EINVAL,
as with RTM_NEWROUTE.
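A simplified sketch of the parse-then-commit shape this moves toward
(hypothetical stand-in types, not the kernel's rtnexthop structures):
the whole multipath list, including the empty case, is validated before
any route state is touched.

#include <stdio.h>

/* Hypothetical, simplified stand-ins for multipath nexthop entries:
 * a gateway, if present, must be a full 16-byte IPv6 address.
 */
struct nh_entry {
	int has_gw;
	int gw_len;
};

static int validate_multipath(const struct nh_entry *nh, int n)
{
	if (n == 0)
		return -1;	/* "no valid nexthops" now fails for del too */

	/* All entries are checked up front, before anything is applied. */
	for (int i = 0; i < n; i++)
		if (nh[i].has_gw && nh[i].gw_len < 16)
			return -1;	/* bad RTA_GATEWAY: reject early */
	return 0;
}

int main(void)
{
	struct nh_entry good[2] = { { 1, 16 }, { 0, 0 } };
	struct nh_entry bad[2]  = { { 1, 16 }, { 1, 4 } };

	printf("good set : %d\n", validate_multipath(good, 2));
	printf("bad set  : %d\n", validate_multipath(bad, 2));
	printf("empty set: %d\n", validate_multipath(NULL, 0));
	return 0;
}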
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-2-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 82 +++++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 945857a8bfe38..369914a374613 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5051,6 +5051,44 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; +static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct rtnexthop *rtnh; + int remaining; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + + do { + int attrlen = rtnh_attrlen(rtnh); + + if (attrlen > 0) { + struct nlattr *nla, *attrs; + + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + if (nla_len(nla) < sizeof(cfg->fc_gateway)) { + NL_SET_ERR_MSG(extack, + "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + } + } + + rtnh = rtnh_next(rtnh, &remaining); + } while (rtnh_ok(rtnh, remaining)); + + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, + extack, true); +} + static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) @@ -5165,9 +5203,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, - extack, true); + err = rtm_to_fib6_multipath_config(cfg, extack); if (err < 0) goto errout; } @@ -5287,19 +5323,6 @@ static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) return should_notify; } -static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, - struct netlink_ext_ack *extack) -{ - if (nla_len(nla) < sizeof(*gw)) { - NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); - return -EINVAL; - } - - *gw = nla_get_in6_addr(nla); - - return 0; -} - static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { @@ -5340,18 +5363,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) - goto cleanup; - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - /* RTA_ENCAP_TYPE length checked in - * lwtunnel_valid_encap_type_attr - */ + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); @@ -5384,12 +5400,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } - if (list_empty(&rt6_nh_list)) { - NL_SET_ERR_MSG(extack, - "Invalid nexthop configuration - no valid nexthops"); - return -EINVAL; - } - /* for add and replace send one notification with all nexthops. 
* Skip the notification in fib6_add_rt2node and send one with * the full route when done @@ -5511,21 +5521,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) { - last_err = err; - goto next_rtnh; - } - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; -next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } From bd11ff421d36abdb585b9104fa70057bf01b3110 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:43 -0700 Subject: [PATCH 439/770] ipv6: Get rid of RTNL for SIOCDELRT and RTM_DELROUTE. Basically, removing an IPv6 route does not require RTNL because the IPv6 routing tables are protected by per table lock. inet6_rtm_delroute() calls nexthop_find_by_id() to check if the nexthop specified by RTA_NH_ID exists. nexthop uses rbtree and the top-down walk can be safely performed under RCU. ip6_route_del() already relies on RCU and the table lock, but we need to extend the RCU critical section a bit more to cover __ip6_del_rt(). For example, nexthop_for_each_fib6_nh() and inet6_rt_notify() needs RCU. Let's call nexthop_find_by_id() and __ip6_del_rt() under RCU and get rid of RTNL from inet6_rtm_delroute() and SIOCDELRT. Even if the nexthop is removed after rcu_read_unlock() in inet6_rtm_delroute(), __remove_nexthop_fib() cleans up the routes tied to the nexthop, and ip6_route_del() returns -ESRCH. So the request was at least valid as of nexthop_find_by_id(), and it's just a matter of timing. Note that we need to pass false to lwtunnel_valid_encap_type_attr(). The following patches also use the newroute bool. Note also that fib6_get_table() does not require RCU because once allocated fib6_table is not freed until netns dismantle. I will post a follow-up series to convert such callers to RCU-lockless version. 
[0] Link: https://lore.kernel.org/netdev/20250417174557.65721-1-kuniyu@amazon.com/ #[0] Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-3-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 369914a374613..1c304f259d9b4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4125,9 +4125,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + break; } if (cfg->fc_nh_id) continue; @@ -4142,13 +4142,13 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) - return __ip6_del_rt(rt, &cfg->fc_nlinfo); - - return __ip6_del_rt_siblings(rt, cfg); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + else + err = __ip6_del_rt_siblings(rt, cfg); + break; } } rcu_read_unlock(); @@ -4517,19 +4517,20 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); switch (cmd) { case SIOCADDRT: + rtnl_lock(); /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); + rtnl_unlock(); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } - rtnl_unlock(); + return err; } @@ -5052,7 +5053,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { }; static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, - struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, + bool newroute) { struct rtnexthop *rtnh; int remaining; @@ -5086,15 +5088,16 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, } while (rtnh_ok(rtnh, remaining)); return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, - extack, true); + extack, newroute); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rtmsg *rtm; + bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; + struct rtmsg *rtm; unsigned int pref; int err; @@ -5203,7 +5206,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = rtm_to_fib6_multipath_config(cfg, extack); + err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } @@ -5223,7 +5226,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, true); + extack, newroute); if (err < 0) goto errout; } @@ -5546,15 +5549,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (cfg.fc_nh_id && - !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - return -EINVAL; + if (cfg.fc_nh_id) { + rcu_read_lock(); + err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); + rcu_read_unlock(); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + 
} } - if (cfg.fc_mp) + if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); - else { + } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } @@ -6766,7 +6774,7 @@ static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_o {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, - .doit = inet6_rtm_delroute}, + .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; From fa76c1674f2ebafb5cc8ab6fc3a3b0c0d09e9321 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:44 -0700 Subject: [PATCH 440/770] ipv6: Move some validation from ip6_route_info_create() to rtm_to_fib6_config(). ip6_route_info_create() is called from 3 functions: * ip6_route_add() * ip6_route_multipath_add() * addrconf_f6i_alloc() addrconf_f6i_alloc() does not need validation for struct fib6_config in ip6_route_info_create(). ip6_route_multipath_add() calls ip6_route_info_create() for multiple routes with slightly different fib6_config instances, which is copied from the base config passed from userspace. So, we need not validate the same config repeatedly. Let's move such validation into rtm_to_fib6_config(). Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-4-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 79 +++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1c304f259d9b4..aa92e02a47f49 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3740,38 +3740,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, int err = -EINVAL; int addr_type; - /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto out; - } - - /* RTF_CACHE is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_CACHE) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto out; - } - - if (cfg->fc_type > RTN_MAX) { - NL_SET_ERR_MSG(extack, "Invalid route type"); - goto out; - } - - if (cfg->fc_dst_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto out; - } - if (cfg->fc_src_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto out; - } -#ifndef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len) { - NL_SET_ERR_MSG(extack, - "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto out; - } -#endif if (cfg->fc_nh_id) { nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { @@ -3836,11 +3804,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_src.plen = cfg->fc_src_len; #endif if (nh) { - if (rt->fib6_src.plen) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); - err = -EINVAL; - goto out_free; - } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; @@ -5240,6 +5203,48 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, } } + if (newroute) { + /* RTF_PCPU is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); + goto errout; + } + 
+ /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto errout; + } + + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto errout; + } + + if (cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto errout; + } + +#ifdef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); + goto errout; + } + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; + } +#else + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); + goto errout; + } +#endif + } + err = 0; errout: return err; From e6f497955fb6a072999db491a01dd3a203d5bcea Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:45 -0700 Subject: [PATCH 441/770] ipv6: Check GATEWAY in rtm_to_fib6_multipath_config(). In ip6_route_multipath_add(), we call rt6_qualify_for_ecmp() for each entry. If it returns false, the request fails. rt6_qualify_for_ecmp() returns false if either of the conditions below is true: 1. f6i->fib6_flags has RTF_ADDRCONF 2. f6i->nh is not NULL 3. f6i->fib6_nh->fib_nh_gw_family is AF_UNSPEC 1 is unnecessary because rtm_to_fib6_config() never sets RTF_ADDRCONF to cfg->fc_flags. 2. is equivalent with cfg->fc_nh_id. 3. can be replaced by checking RTF_GATEWAY in the base and each multipath entry because AF_INET6 is set to f6i->fib6_nh->fib_nh_gw_family only when cfg.fc_is_fdb is true or RTF_GATEWAY is set, but the former is always false. These checks do not require RCU and can be done earlier. Let's perform the equivalent checks in rtm_to_fib6_multipath_config(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-5-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa92e02a47f49..88d2f85ed69dc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5031,6 +5031,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, } do { + bool has_gateway = cfg->fc_flags & RTF_GATEWAY; int attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { @@ -5044,9 +5045,17 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, "Invalid IPv6 address in RTA_GATEWAY"); return -EINVAL; } + + has_gateway = true; } } + if (newroute && (cfg->fc_nh_id || !has_gateway)) { + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + return -EINVAL; + } + rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); @@ -5388,13 +5397,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } - if (!rt6_qualify_for_ecmp(rt)) { - err = -EINVAL; - NL_SET_ERR_MSG(extack, - "Device only routes can not be added for IPv6 using the multipath API."); - fib6_info_release(rt); - goto cleanup; - } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; From c9cabe05e450b4a23072b248db33e6d97c986933 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:46 -0700 Subject: [PATCH 442/770] ipv6: Move nexthop_find_by_id() after fib6_info_alloc(). We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. Then, we must perform two lookups for nexthop and dev under RCU to guarantee their lifetime. 
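The constraint, roughly sketched (dev_get_by_index_rcu() here stands in for however fib6_nh_init() ends up resolving the device in the final RTNL-free form):

	rcu_read_lock();
	nh = nexthop_find_by_id(net, cfg->fc_nh_id);		/* lookup 1 */
	dev = dev_get_by_index_rcu(net, cfg->fc_ifindex);	/* lookup 2 */
	...							/* must not sleep here */
	rcu_read_unlock();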
ip6_route_info_create() calls nexthop_find_by_id() first if RTA_NH_ID is specified, and then allocates struct fib6_info. nexthop_find_by_id() must be called under RCU, but we do not want to use GFP_ATOMIC for memory allocation here, which will be likely to fail in ip6_route_multipath_add(). Let's move nexthop_find_by_id() after the memory allocation so that we can later split ip6_route_info_create() into two parts: the sleepable part and the RCU part. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-6-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 88d2f85ed69dc..f66f90f8f1531 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3734,24 +3734,11 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_info *rt = NULL; - struct nexthop *nh = NULL; struct fib6_table *table; struct fib6_nh *fib6_nh; - int err = -EINVAL; + int err = -ENOBUFS; int addr_type; - if (cfg->fc_nh_id) { - nh = nexthop_find_by_id(net, cfg->fc_nh_id); - if (!nh) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - goto out; - } - err = fib6_check_nexthop(nh, cfg, extack); - if (err) - goto out; - } - - err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); @@ -3767,7 +3754,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags, !nh); + rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); if (!rt) goto out; @@ -3803,12 +3790,27 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - if (nh) { + + if (cfg->fc_nh_id) { + struct nexthop *nh; + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto out_free; + } + + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out_free; + if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } + rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); } else { From c4837b9853e5a7fa70122b7760f0f26147b08a57 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:47 -0700 Subject: [PATCH 443/770] ipv6: Split ip6_route_info_create(). We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT and rely on RCU to guarantee dev and nexthop lifetime. Then, we want to allocate as much as possible before entering the RCU section. The RCU section will start in the middle of ip6_route_info_create(), and this is problematic for ip6_route_multipath_add() that calls ip6_route_info_create() multiple times. Let's split ip6_route_info_create() into two parts; one for memory allocation and another for nexthop setup. 
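After the split, callers follow a two-step pattern; a condensed sketch of what ip6_route_add() becomes with this patch:

	rt = ip6_route_info_create(cfg, gfp_flags, extack);	/* may sleep */
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = ip6_route_info_create_nh(rt, cfg, extack);	/* nexthop setup */
	if (err)
		return err;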
Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-7-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 95 +++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f66f90f8f1531..bba35cb2dc259 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3729,15 +3729,13 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, - gfp_t gfp_flags, - struct netlink_ext_ack *extack) + gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct fib6_info *rt = NULL; struct fib6_table *table; - struct fib6_nh *fib6_nh; - int err = -ENOBUFS; - int addr_type; + struct fib6_info *rt; + int err; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { @@ -3749,22 +3747,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else { table = fib6_new_table(net, cfg->fc_table); } + if (!table) { + err = -ENOBUFS; + goto err; + } - if (!table) - goto out; - - err = -ENOMEM; rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); - if (!rt) - goto out; + if (!rt) { + err = -ENOMEM; + goto err; + } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); - /* Do not leave garbage there. */ - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out_free; + goto free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -3772,12 +3770,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->fib6_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; @@ -3790,6 +3788,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif + return rt; +free: + kfree(rt); +err: + return ERR_PTR(err); +} + +static int ip6_route_info_create_nh(struct fib6_info *rt, + struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_nh *fib6_nh; + int err; if (cfg->fc_nh_id) { struct nexthop *nh; @@ -3814,9 +3826,11 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); } else { - err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); + int addr_type; + + err = fib6_nh_init(net, rt->fib6_nh, cfg, GFP_ATOMIC, extack); if (err) - goto out; + goto out_release; fib6_nh = rt->fib6_nh; @@ -3835,21 +3849,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; - goto out; + goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; - } else - rt->fib6_prefsrc.plen = 0; + } - return rt; -out: + return 0; +out_release: fib6_info_release(rt); - return ERR_PTR(err); + return err; out_free: ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); - return ERR_PTR(err); + return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, @@ -3862,6 +3875,10 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); + err = ip6_route_info_create_nh(rt, cfg, extack); + if (err) + return err; + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); @@ -4585,6 +4602,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; + int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4595,14 +4613,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); - if (!IS_ERR(f6i)) { - f6i->dst_nocount = true; + if (IS_ERR(f6i)) + return f6i; - if (!anycast && - (READ_ONCE(net->ipv6.devconf_all->disable_policy) || - READ_ONCE(idev->cnf.disable_policy))) - f6i->dst_nopolicy = true; - } + err = ip6_route_info_create_nh(f6i, &cfg, extack); + if (err) + return ERR_PTR(err); + + f6i->dst_nocount = true; + + if (!anycast && + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) + f6i->dst_nopolicy = true; return f6i; } @@ -5400,6 +5423,12 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } + err = ip6_route_info_create_nh(rt, &r_cfg, extack); + if (err) { + rt = NULL; + goto cleanup; + } + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(info->nl_net, &rt6_nh_list, From 5720a328c3e9802af48dd216c1c7eb0e91b61b6c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:48 -0700 Subject: [PATCH 444/770] ipv6: Preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create(). ip6_route_info_create_nh() will be called under RCU. Then, fib6_nh_init() is also under RCU, but per-cpu memory allocation is very likely to fail with GFP_ATOMIC while bulk-adding IPv6 routes and we would see a bunch of this message in dmesg. 
percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left Let's preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create(). If something fails before the original memory allocation in fib6_nh_init(), ip6_route_info_create_nh() calls fib6_info_release(), which releases the preallocated per-cpu memory. Note that rt->fib6_nh->rt6i_pcpu is not preallocated when called via ipv6_stub, so we still need alloc_percpu_gfp() in fib6_nh_init(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-8-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index bba35cb2dc259..1155cb0173430 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3665,10 +3665,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; pcpu_alloc: - fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { - err = -ENOMEM; - goto out; + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) { + err = -ENOMEM; + goto out; + } } fib6_nh->fib_nh_dev = dev; @@ -3728,6 +3730,15 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } } +static int fib6_nh_prealloc_percpu(struct fib6_nh *fib6_nh, gfp_t gfp_flags) +{ + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); + if (!fib6_nh->rt6i_pcpu) + return -ENOMEM; + + return 0; +} + static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) @@ -3765,6 +3776,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto free; } + if (!cfg->fc_nh_id) { + err = fib6_nh_prealloc_percpu(&rt->fib6_nh[0], gfp_flags); + if (err) + goto free_metrics; + } + if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; @@ -3789,6 +3806,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_src.plen = cfg->fc_src_len; #endif return rt; +free_metrics: + ip_fib_metrics_put(rt->fib6_metrics); free: kfree(rt); err: From d27b9c40dbd66aa78b3e6657e600cf057a48ac1e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:49 -0700 Subject: [PATCH 445/770] ipv6: Preallocate nhc_pcpu_rth_output in ip6_route_info_create(). ip6_route_info_create_nh() will be called under RCU. It calls fib_nh_common_init() and allocates nhc->nhc_pcpu_rth_output. As with the reason for rt->fib6_nh->rt6i_pcpu, we want to avoid GFP_ATOMIC allocation for nhc->nhc_pcpu_rth_output under RCU. Let's preallocate it in ip6_route_info_create(). 
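The resulting contract, in short (sketch): the sleepable phase owns the allocation, and fib_nh_common_init() only falls back to allocating itself when nothing was preallocated, i.e. the ipv6_stub path:

	if (!nhc->nhc_pcpu_rth_output)	/* not preallocated: ipv6_stub path */
		nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
							    gfp_flags);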
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-9-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/fib_semantics.c | 10 ++++++---- net/ipv6/route.c | 9 +++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f68bb9e34c34c..5326f1501af02 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -617,10 +617,12 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, { int err; - nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, - gfp_flags); - if (!nhc->nhc_pcpu_rth_output) - return -ENOMEM; + if (!nhc->nhc_pcpu_rth_output) { + nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, + gfp_flags); + if (!nhc->nhc_pcpu_rth_output) + return -ENOMEM; + } if (encap) { struct lwtunnel_state *lwtstate; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1155cb0173430..3037d13043882 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3732,10 +3732,19 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) static int fib6_nh_prealloc_percpu(struct fib6_nh *fib6_nh, gfp_t gfp_flags) { + struct fib_nh_common *nhc = &fib6_nh->nh_common; + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) return -ENOMEM; + nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, + gfp_flags); + if (!nhc->nhc_pcpu_rth_output) { + free_percpu(fib6_nh->rt6i_pcpu); + return -ENOMEM; + } + return 0; } From 87d5d921eaf225fc820907f3c6827e5d7cc85bc3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:50 -0700 Subject: [PATCH 446/770] ipv6: Don't pass net to ip6_route_info_append(). net is not used in ip6_route_info_append() after commit 36f19d5b4f99 ("net/ipv6: Remove extra call to ip6_convert_metrics for multipath case"). Let's remove the argument. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-10-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3037d13043882..a687eec1eab00 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5318,8 +5318,7 @@ struct rt6_nh { struct list_head next; }; -static int ip6_route_info_append(struct net *net, - struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { @@ -5459,8 +5458,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(info->nl_net, &rt6_nh_list, - rt, &r_cfg); + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; From 5a1ccff5c65aa553429a8caf8f491d503d760568 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:51 -0700 Subject: [PATCH 447/770] ipv6: Rename rt6_nh.next to rt6_nh.list. ip6_route_multipath_add() allocates struct rt6_nh for each config of multipath routes to link them to a local list rt6_nh_list. struct rt6_nh.next is the list node of each config, so the name is quite misleading. Let's rename it to list. 
Suggested-by: Paolo Abeni Link: https://lore.kernel.org/netdev/c9bee472-c94e-4878-8cc2-1512b2c54db5@redhat.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-11-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a687eec1eab00..aa4287ce49446 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5315,7 +5315,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct list_head next; + struct list_head list; }; static int ip6_route_info_append(struct list_head *rt6_nh_list, @@ -5325,7 +5325,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, struct rt6_nh *nh; int err = -EEXIST; - list_for_each_entry(nh, rt6_nh_list, next) { + list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return err; @@ -5336,7 +5336,7 @@ static int ip6_route_info_append(struct list_head *rt6_nh_list, return -ENOMEM; nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->next, rt6_nh_list); + list_add_tail(&nh->list, rt6_nh_list); return 0; } @@ -5479,7 +5479,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, info->skip_notify_kernel = 1; err_nh = NULL; - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { @@ -5547,16 +5547,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); - list_del(&nh->next); + list_del(&nh->list); kfree(nh); } From 71c0efb6d12f4c734b2a894c17e84ebab49b4c60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:52 -0700 Subject: [PATCH 448/770] ipv6: Factorise ip6_route_multipath_add(). We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT and rely on RCU to guarantee dev and nexthop lifetime. Then, the RCU section will start before ip6_route_info_create_nh() in ip6_route_multipath_add(), but ip6_route_info_create() is called in the same loop and will sleep. Let's split the loop into ip6_route_mpath_info_create() and ip6_route_mpath_info_create_nh(). Note that ip6_route_info_append() is now integrated into ip6_route_mpath_info_create_nh() because we need to call different free functions for nexthops that passed ip6_route_info_create_nh(). In case of failure, the remaining nexthops that ip6_route_info_create_nh() has not been called for will be freed by ip6_route_mpath_info_cleanup(). OTOH, if a nexthop passes ip6_route_info_create_nh(), it will be linked to a local temporary list, which will be spliced back to rt6_nh_list. In case of failure, these nexthops will be released by fib6_info_release() in ip6_route_multipath_add(). 
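The loop is thus split into two passes over the same list; a condensed sketch of the resulting flow in ip6_route_multipath_add():

	err = ip6_route_mpath_info_create(&rt6_nh_list, cfg, extack);	/* parse + GFP_KERNEL allocations */
	if (err)
		return err;

	err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack);	/* nexthop setup, later under RCU */
	if (err)
		goto cleanup;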
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-12-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv6/route.c | 205 ++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 75 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa4287ce49446..85724aee12707 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5316,29 +5316,131 @@ struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head list; + int weight; }; -static int ip6_route_info_append(struct list_head *rt6_nh_list, - struct fib6_info *rt, - struct fib6_config *r_cfg) +static void ip6_route_mpath_info_cleanup(struct list_head *rt6_nh_list) { - struct rt6_nh *nh; - int err = -EEXIST; + struct rt6_nh *nh, *nh_next; - list_for_each_entry(nh, rt6_nh_list, list) { - /* check if fib6_info already exists */ - if (rt6_duplicate_nexthop(nh->fib6_info, rt)) - return err; + list_for_each_entry_safe(nh, nh_next, rt6_nh_list, list) { + struct fib6_info *rt = nh->fib6_info; + + if (rt) { + free_percpu(rt->fib6_nh->nh_common.nhc_pcpu_rth_output); + free_percpu(rt->fib6_nh->rt6i_pcpu); + ip_fib_metrics_put(rt->fib6_metrics); + kfree(rt); + } + + list_del(&nh->list); + kfree(nh); } +} - nh = kzalloc(sizeof(*nh), GFP_KERNEL); - if (!nh) - return -ENOMEM; - nh->fib6_info = rt; - memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->list, rt6_nh_list); +static int ip6_route_mpath_info_create(struct list_head *rt6_nh_list, + struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + struct rtnexthop *rtnh; + int remaining; + int err; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * fib6_info structs per nexthop + */ + while (rtnh_ok(rtnh, remaining)) { + struct fib6_config r_cfg; + struct fib6_info *rt; + struct rt6_nh *nh; + int attrlen; + + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) { + err = -ENOMEM; + goto err; + } + + list_add_tail(&nh->list, rt6_nh_list); + + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + r_cfg.fc_gateway = nla_get_in6_addr(nla); + r_cfg.fc_flags |= RTF_GATEWAY; + } + + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); + } + + r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); + + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err; + } + + nh->fib6_info = rt; + nh->weight = rtnh->rtnh_hops + 1; + memcpy(&nh->r_cfg, &r_cfg, sizeof(r_cfg)); + + rtnh = rtnh_next(rtnh, &remaining); + } return 0; +err: + ip6_route_mpath_info_cleanup(rt6_nh_list); + return err; +} + +static int ip6_route_mpath_info_create_nh(struct list_head *rt6_nh_list, + struct netlink_ext_ack *extack) +{ + struct rt6_nh *nh, *nh_next, *nh_tmp; + LIST_HEAD(tmp); + int err; + + list_for_each_entry_safe(nh, nh_next, rt6_nh_list, list) { + struct fib6_info *rt = nh->fib6_info; + + err = ip6_route_info_create_nh(rt, &nh->r_cfg, extack); + if (err) { + nh->fib6_info = NULL; + goto err; + } + + rt->fib6_nh->fib_nh_weight = nh->weight; + + list_move_tail(&nh->list, &tmp); + + list_for_each_entry(nh_tmp, rt6_nh_list, list) { + /* check if fib6_info already exists 
*/ + if (rt6_duplicate_nexthop(nh_tmp->fib6_info, rt)) { + err = -EEXIST; + goto err; + } + } + } +out: + list_splice(&tmp, rt6_nh_list); + return err; +err: + ip6_route_mpath_info_cleanup(rt6_nh_list); + goto out; } static void ip6_route_mpath_notify(struct fib6_info *rt, @@ -5397,75 +5499,28 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; - struct fib6_config r_cfg; - struct rtnexthop *rtnh; - struct fib6_info *rt; - struct rt6_nh *err_nh; struct rt6_nh *nh, *nh_safe; + LIST_HEAD(rt6_nh_list); + struct rt6_nh *err_nh; __u16 nlflags; - int remaining; - int attrlen; - int err = 1; int nhn = 0; - int replace = (cfg->fc_nlinfo.nlh && - (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); - LIST_HEAD(rt6_nh_list); + int replace; + int err; + + replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - remaining = cfg->fc_mp_len; - rtnh = (struct rtnexthop *)cfg->fc_mp; - - /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * fib6_info structs per nexthop - */ - while (rtnh_ok(rtnh, remaining)) { - memcpy(&r_cfg, cfg, sizeof(*cfg)); - if (rtnh->rtnh_ifindex) - r_cfg.fc_ifindex = rtnh->rtnh_ifindex; - - attrlen = rtnh_attrlen(rtnh); - if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); - - nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla) { - r_cfg.fc_gateway = nla_get_in6_addr(nla); - r_cfg.fc_flags |= RTF_GATEWAY; - } - - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); - if (nla) - r_cfg.fc_encap_type = nla_get_u16(nla); - } - - r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); - rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto cleanup; - } - - err = ip6_route_info_create_nh(rt, &r_cfg, extack); - if (err) { - rt = NULL; - goto cleanup; - } - - rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - - err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); - if (err) { - fib6_info_release(rt); - goto cleanup; - } + err = ip6_route_mpath_info_create(&rt6_nh_list, cfg, extack); + if (err) + return err; - rtnh = rtnh_next(rtnh, &remaining); - } + err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack); + if (err) + goto cleanup; /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with From 834d97843e3bca86f17cc517885f54f3433427b2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:53 -0700 Subject: [PATCH 449/770] ipv6: Protect fib6_link_table() with spinlock. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. If the request specifies a new table ID, fib6_new_table() is called to create a new routing table. Two concurrent requests could specify the same table ID, so we need a lock to protect net->ipv6.fib_table_hash[h]. Let's add a spinlock to protect the hash bucket linkage. 
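For example, two adds that both name a not-yet-existing table can race to create it (addresses and device are illustrative):

	ip -6 route add table 100 2001:db8:1::/64 dev eth0 &
	ip -6 route add table 100 2001:db8:2::/64 dev eth0 &

Both callers may reach fib6_new_table(net, 100); re-checking the hash under the new spinlock guarantees that only one table is linked and that the loser's allocation is simply kfree()d.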
Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-13-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/netns/ipv6.h | 1 + net/ipv6/ip6_fib.c | 26 +++++++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570ae..47dc70d8100a4 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -72,6 +72,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bf727149fdece..79b672f3fc532 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -249,19 +249,33 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); + + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } - return tb; + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); @@ -2423,6 +2437,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) From accb46b56bc3bc99ee69ba18b06ca60266ad6fca Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:54 -0700 Subject: [PATCH 450/770] ipv6: Defer fib6_purge_rt() in fib6_add_rt2node() to fib6_add(). The next patch adds a per-nexthop spinlock that protects nh->f6i_list. When rt->nh is not NULL, fib6_add_rt2node() will be called under the lock. fib6_add_rt2node() could call fib6_purge_rt() for another route, which could take another nexthop's lock. Then, a deadlock could happen between two nexthops. Let's defer fib6_purge_rt() until after fib6_add_rt2node().
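With the per-nexthop lock added in the next patch, the deadlock would look roughly like this (nhA and nhB are two nexthops; illustrative only):

	CPU0                                   CPU1
	----                                   ----
	lock(nhA->lock)                        lock(nhB->lock)
	fib6_add_rt2node()                     fib6_add_rt2node()
	  fib6_purge_rt(route on nhB)            fib6_purge_rt(route on nhA)
	    lock(nhB->lock)  /* blocks */          lock(nhA->lock)  /* blocks */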
Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://patch.msgid.link/20250418000443.43734-14-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/ip6_fib.h | 1 + net/ipv6/ip6_fib.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c87873ae211c..88b0dd4d8e094 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -198,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 79b672f3fc532..9e9db5470bbf5 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1083,8 +1083,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1308,10 +1308,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1324,10 +1323,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1397,6 +1395,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; @@ -1499,8 +1498,16 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack); + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); From 081efd18326e353c6fbfdeff903a83edde953f72 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:55 -0700 Subject: [PATCH 451/770] ipv6: Protect nh->f6i_list with spinlock and flag. We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT. Then, we may be going to add a route tied to a dying nexthop. The nexthop itself is not freed during the RCU grace period, but if we link a route after __remove_nexthop_fib() is called for the nexthop, the route will be leaked. To avoid the race between IPv6 route addition under RCU vs nexthop deletion under RTNL, let's add a dead flag and protect it and nh->f6i_list with a spinlock. 
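The leak would occur with this interleaving (illustrative):

	RTM_NEWROUTE (RCU, no RTNL)            nexthop removal (RTNL)
	---------------------------            ----------------------
	nh = nexthop_find_by_id()
	                                       __remove_nexthop_fib(nh)
	                                         /* flushes nh->f6i_list */
	list_add(&rt->nh_list, &nh->f6i_list)
	  /* route is never flushed: leaked */

With nh->dead checked under nh->lock, the late list_add() is instead refused with -EINVAL.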
__remove_nexthop_fib() acquires the nexthop's spinlock and sets nh->dead to true, then calls ip6_del_rt() for the linked routes one by one without the spinlock because fib6_purge_rt() acquires it later. While adding an IPv6 route, fib6_add() acquires the nexthop lock and checks the dead flag just before inserting the route. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-15-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- include/net/nexthop.h | 2 ++ net/ipv4/nexthop.c | 18 +++++++++++++++--- net/ipv6/ip6_fib.c | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d9fb44e8b3219..572e69cda4766 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -152,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index d9cf06b297d11..6ba6cb1340c12 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); + spin_lock_init(&nh->lock); } return nh; } @@ -2118,7 +2119,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { - struct fib6_info *f6i, *tmp; + struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; @@ -2129,13 +2130,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) if (do_flush) fib_flush(net); - /* ip6_del_rt removes the entry from this list hence the _safe */ - list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + spin_lock_bh(&nh->lock); + + nh->dead = true; + + while (!list_empty(&nh->f6i_list)) { + f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); + /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); + + spin_unlock_bh(&nh->lock); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); + + spin_lock_bh(&nh->lock); } + + spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 9e9db5470bbf5..1f860340690ca 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1048,8 +1048,14 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1341,6 +1347,28 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + }
+ + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1498,7 +1526,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { struct fib6_info *iter, *next; @@ -1508,8 +1539,6 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, fib6_info_release(iter); } - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) From 169fd62799e8acabbfb4760799be11138ced949c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Apr 2025 17:03:56 -0700 Subject: [PATCH 452/770] ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE. Now we are ready to remove RTNL from SIOCADDRT and RTM_NEWROUTE. The remaining things to do are: 1. pass false to lwtunnel_valid_encap_type_attr() 2. use rcu_dereference_rtnl() in fib6_check_nexthop() 3. place rcu_read_lock() before ip6_route_info_create_nh(). Let's complete the RTNL-free conversion. When each CPU-X adds 100000 routes on table-X in a batch concurrently on a c7a.metal-48xl EC2 instance with 192 CPUs, without this series: $ sudo ./route_test.sh ... added 19200000 routes (100000 routes * 192 tables). time elapsed: 191577 milliseconds. with this series: $ sudo ./route_test.sh ... added 19200000 routes (100000 routes * 192 tables). time elapsed: 62854 milliseconds. I changed the number of routes in each table (1000 ~ 100000) and consistently saw it finish 3x faster with this series. Note that now every caller of lwtunnel_valid_encap_type() passes false as the last argument, and this can be removed later.
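The route_test.sh script is not included in the patch, but the benchmark can be driven with something along these lines (illustrative):

	for c in $(seq 0 191); do
		taskset -c $c ip -6 -batch table-$c.batch &	# 100000 "route add table ..." lines each
	done
	wait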
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250418000443.43734-16-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/ipv4/nexthop.c | 4 ++-- net/ipv6/route.c | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 6ba6cb1340c12..823e4a783d2b1 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1556,12 +1556,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, if (nh->is_group) { struct nh_group *nhg; - nhg = rtnl_dereference(nh->nh_grp); + nhg = rcu_dereference_rtnl(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { - nhi = rtnl_dereference(nh->nh_info); + nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 85724aee12707..d0351e95d9161 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3903,12 +3903,16 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); + rcu_read_lock(); + err = ip6_route_info_create_nh(rt, cfg, extack); if (err) - return err; + goto unlock; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); +unlock: + rcu_read_unlock(); return err; } @@ -4529,12 +4533,10 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) switch (cmd) { case SIOCADDRT: - rtnl_lock(); /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); - rtnl_unlock(); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); @@ -5113,7 +5115,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, } while (rtnh_ok(rtnh, remaining)); return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, - extack, newroute); + extack, false); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5251,7 +5253,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, newroute); + extack, false); if (err < 0) goto errout; } @@ -5518,6 +5520,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, if (err) return err; + rcu_read_lock(); + err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack); if (err) goto cleanup; @@ -5609,6 +5613,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, } cleanup: + rcu_read_unlock(); + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); list_del(&nh->list); @@ -6891,7 +6897,7 @@ static void bpf_iter_unregister(void) static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, - .doit = inet6_rtm_newroute}, + .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, From c73c67026fe65d6677260dfd15dd968b709dc237 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 18 Apr 2025 21:39:02 +0200 Subject: [PATCH 453/770] fanotify: fix flush of mntns marks fanotify_mark(fd, FAN_MARK_FLUSH | FAN_MARK_MNTNS, ...) incorrectly ends up causing removal of inode marks.
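A minimal reproducer (sketch; fan_fd carries both an inode mark and a mntns mark, and mntns_fd refers to the mount namespace):

	/* should flush only the mntns mark, but also dropped the inode mark */
	fanotify_mark(fan_fd, FAN_MARK_FLUSH | FAN_MARK_MNTNS, 0, mntns_fd, NULL);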
Fixes: 0f46d81f2bce ("fanotify: notify on mount attach and detach") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250418193903.2607617-2-amir73il@gmail.com --- fs/notify/fanotify/fanotify_user.c | 7 +------ include/linux/fsnotify_backend.h | 15 --------------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f2d840ae4ded8..87f861e9004f2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1961,12 +1961,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; if (mark_cmd == FAN_MARK_FLUSH) { - if (mark_type == FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); - else if (mark_type == FAN_MARK_FILESYSTEM) - fsnotify_clear_sb_marks_by_group(group); - else - fsnotify_clear_inode_marks_by_group(group); + fsnotify_clear_marks_by_group(group, obj_type); return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6cd8d1d28b8be..fc27b53c58c2e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -907,21 +907,6 @@ extern void fsnotify_wait_marks_destroyed(void); /* Clear all of the marks of a group attached to a given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); -/* run all the marks in a group, and clear all of the vfsmount marks */ -static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); -} -/* run all the marks in a group, and clear all of the inode marks */ -static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); -} -/* run all the marks in a group, and clear all of the sb marks */ -static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB); -} extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); From cd188e9ef80fd005fd8c8de34ed649bd653d00e5 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 18 Apr 2025 21:39:03 +0200 Subject: [PATCH 454/770] selftests/fs/mount-notify: test also remove/flush of mntns marks Regression test for FAN_MARK_MNTNS | FAN_MARK_FLUSH bug.
Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara Link: https://patch.msgid.link/20250418193903.2607617-3-amir73il@gmail.com --- .../mount-notify/mount-notify_test.c | 57 +++++++++++++++---- 1 file changed, 46 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c index 4a2d5c454fd15..59a71f22fb118 100644 --- a/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c +++ b/tools/testing/selftests/filesystems/mount-notify/mount-notify_test.c @@ -48,8 +48,16 @@ static uint64_t get_mnt_id(struct __test_metadata *const _metadata, static const char root_mntpoint_templ[] = "/tmp/mount-notify_test_root.XXXXXX"; +static const int mark_cmds[] = { + FAN_MARK_ADD, + FAN_MARK_REMOVE, + FAN_MARK_FLUSH +}; + +#define NUM_FAN_FDS ARRAY_SIZE(mark_cmds) + FIXTURE(fanotify) { - int fan_fd; + int fan_fd[NUM_FAN_FDS]; char buf[256]; unsigned int rem; void *next; @@ -61,7 +69,7 @@ FIXTURE(fanotify) { FIXTURE_SETUP(fanotify) { - int ret; + int i, ret; ASSERT_EQ(unshare(CLONE_NEWNS), 0); @@ -89,20 +97,34 @@ FIXTURE_SETUP(fanotify) self->root_id = get_mnt_id(_metadata, "/"); ASSERT_NE(self->root_id, 0); - self->fan_fd = fanotify_init(FAN_REPORT_MNT, 0); - ASSERT_GE(self->fan_fd, 0); - - ret = fanotify_mark(self->fan_fd, FAN_MARK_ADD | FAN_MARK_MNTNS, - FAN_MNT_ATTACH | FAN_MNT_DETACH, self->ns_fd, NULL); - ASSERT_EQ(ret, 0); + for (i = 0; i < NUM_FAN_FDS; i++) { + self->fan_fd[i] = fanotify_init(FAN_REPORT_MNT | FAN_NONBLOCK, + 0); + ASSERT_GE(self->fan_fd[i], 0); + ret = fanotify_mark(self->fan_fd[i], FAN_MARK_ADD | + FAN_MARK_MNTNS, + FAN_MNT_ATTACH | FAN_MNT_DETACH, + self->ns_fd, NULL); + ASSERT_EQ(ret, 0); + // On fd[0] we do an extra ADD that changes nothing. + // On fd[1]/fd[2] we REMOVE/FLUSH which removes the mark. + ret = fanotify_mark(self->fan_fd[i], mark_cmds[i] | + FAN_MARK_MNTNS, + FAN_MNT_ATTACH | FAN_MNT_DETACH, + self->ns_fd, NULL); + ASSERT_EQ(ret, 0); + } self->rem = 0; } FIXTURE_TEARDOWN(fanotify) { + int i; + ASSERT_EQ(self->rem, 0); - close(self->fan_fd); + for (i = 0; i < NUM_FAN_FDS; i++) + close(self->fan_fd[i]); ASSERT_EQ(fchdir(self->orig_root), 0); @@ -123,8 +145,21 @@ static uint64_t expect_notify(struct __test_metadata *const _metadata, unsigned int thislen; if (!self->rem) { - ssize_t len = read(self->fan_fd, self->buf, sizeof(self->buf)); - ASSERT_GT(len, 0); + ssize_t len; + int i; + + for (i = NUM_FAN_FDS - 1; i >= 0; i--) { + len = read(self->fan_fd[i], self->buf, + sizeof(self->buf)); + if (i > 0) { + // Groups 1,2 should get EAGAIN + ASSERT_EQ(len, -1); + ASSERT_EQ(errno, EAGAIN); + } else { + // Group 0 should get events + ASSERT_GT(len, 0); + } + } self->rem = len; self->next = (void *) self->buf; From bef4f1156b74721b7d111114538659031119b6f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Fri, 18 Apr 2025 16:58:00 +0200 Subject: [PATCH 455/770] net: phy: marvell-88q2xxx: Enable temperature sensor for mv88q211x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The temperature sensor enabled for mv88q222x devices also functions for mv88q211x based devices. Unify the two devices probe functions to enable the sensors for all devices supported by this driver. The same oddity as for mv88q222x devices exists, the PHY link must be up for a correct temperature reading to be reported. 
# cat /sys/class/hwmon/hwmon9/temp1_input -75000 # ifconfig end5 up # cat /sys/class/hwmon/hwmon9/temp1_input 59000 Worth noting is that while the temperature register offsets and layout are the same between mv88q211x and mv88q222x devices their names in the datasheets are different. This change keeps the mv88q222x names for the mv88q211x support. Signed-off-by: Niklas Söderlund Reviewed-by: Dimitri Fedrau Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250418145800.2420751-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Paolo Abeni --- drivers/net/phy/marvell-88q2xxx.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/marvell-88q2xxx.c b/drivers/net/phy/marvell-88q2xxx.c index 23e1f0521f549..5c687164b8e06 100644 --- a/drivers/net/phy/marvell-88q2xxx.c +++ b/drivers/net/phy/marvell-88q2xxx.c @@ -828,6 +828,7 @@ static int mv88q2xxx_leds_probe(struct phy_device *phydev) static int mv88q2xxx_probe(struct phy_device *phydev) { struct mv88q2xxx_priv *priv; + int ret; priv = devm_kzalloc(&phydev->mdio.dev, sizeof(*priv), GFP_KERNEL); if (!priv) @@ -835,17 +836,6 @@ static int mv88q2xxx_probe(struct phy_device *phydev) phydev->priv = priv; - return 0; -} - -static int mv88q222x_probe(struct phy_device *phydev) -{ - int ret; - - ret = mv88q2xxx_probe(phydev); - if (ret) - return ret; - ret = mv88q2xxx_leds_probe(phydev); if (ret) return ret; @@ -1118,7 +1108,7 @@ static struct phy_driver mv88q2xxx_driver[] = { .phy_id_mask = MARVELL_PHY_ID_MASK, .name = "mv88q2220", .flags = PHY_POLL_CABLE_TEST, - .probe = mv88q222x_probe, + .probe = mv88q2xxx_probe, .get_features = mv88q2xxx_get_features, .config_aneg = mv88q2xxx_config_aneg, .aneg_done = genphy_c45_aneg_done, From f4293c2baf6faa5f1a1638bcce698ed88d0d396e Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Wed, 19 Feb 2025 20:30:37 +0000 Subject: [PATCH 456/770] netfilter: xt_IDLETIMER: convert timeouts to secs_to_jiffies() Commit b35108a51cf7 ("jiffies: Define secs_to_jiffies()") introduced secs_to_jiffies(). As the value here is a multiple of 1000, use secs_to_jiffies() instead of msecs_to_jiffies to avoid the multiplication. 
This is converted using scripts/coccinelle/misc/secs_to_jiffies.cocci with the following Coccinelle rules: @depends on patch@ expression E; @@ -msecs_to_jiffies(E * 1000) +secs_to_jiffies(E) -msecs_to_jiffies(E * MSEC_PER_SEC) +secs_to_jiffies(E) Signed-off-by: Easwar Hariharan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_IDLETIMER.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 9f54819eb52ca..9082155ee5589 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -168,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -229,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return 0; @@ -254,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -275,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -320,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -382,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); From 3ba0032afea888d0edebf5ece3c6b36417189b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 1 Apr 2025 13:57:30 +0200 Subject: [PATCH 457/770] netfilter: xt_cgroup: Make it independent from net_cls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xt_cgroup matching supports the default hierarchy since commit c38c4597e4bf3 ("netfilter: implement xt_cgroup cgroup2 path match"). The cgroup v1 matching (based on clsid) and cgroup v2 matching (based on path) are rather independent. Downgrade the Kconfig dependency to mere CONFIG_SOCK_CGROUP_DATA so that xt_cgroup can be built even without CONFIG_NET_CLS_CGROUP for path matching. Also add a message for users when they attempt to specify any clsid.
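The guard added below relies on IS_ENABLED() folding to a compile-time 0 or 1, so the check costs nothing in builds where net_cls support is present. A minimal standalone sketch of the pattern (hypothetical names, not the module's actual code):

#include <stdio.h>

/* Hypothetical stand-in for IS_ENABLED(CONFIG_CGROUP_NET_CLASSID),
 * which the kernel evaluates to a compile-time 0 or 1.
 */
#define HAVE_NET_CLASSID 0

struct match_info {
	int has_classid;
};

/* Reject classid matching when net_cls is compiled out; the branch is
 * folded away entirely in builds where HAVE_NET_CLASSID is 1.
 */
static int check_match(const struct match_info *info)
{
	if (info->has_classid && !HAVE_NET_CLASSID) {
		fprintf(stderr, "classid invalid without net_cls cgroups\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct match_info info = { .has_classid = 1 };

	return check_match(&info) ? 1 : 0;
}

Failing the checkentry hook this way makes rule insertion fail up front, with a message in the log, instead of installing a rule that can never match.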
Link: https://lists.opensuse.org/archives/list/kernel@lists.opensuse.org/thread/S23NOILB7MUIRHSKPBOQKJHVSK26GP6X/ Cc: Jan Engelhardt Cc: Florian Westphal Signed-off-by: Michal Koutný Signed-off-by: Pablo Neira Ayuso --- net/netfilter/Kconfig | 2 +- net/netfilter/xt_cgroup.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 047ba81865edf..3b2183fc7e563 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1180,7 +1180,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c65..66915bf0d89ad 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); From 08764531474578d56ba1dc000c35668dffd55721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Tue, 1 Apr 2025 13:57:31 +0200 Subject: [PATCH 458/770] net: cgroup: Guard users of sock_cgroup_classid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exclude code that relies on sock_cgroup_classid() as preparation of removal of the function. Signed-off-by: Michal Koutný Signed-off-by: Pablo Neira Ayuso --- net/ipv4/inet_diag.c | 2 +- net/netfilter/xt_cgroup.c | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 907bad776b423..1d1d6ad53f4c9 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -160,7 +160,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUP_NET_CLASSID classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. 
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 66915bf0d89ad..c437fbd59ec13 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -117,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -125,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -140,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -158,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) From 087a9eb9e5978e3ba362e1163691e41097e8ca20 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 23 Apr 2025 17:51:31 +0300 Subject: [PATCH 459/770] vxlan: vnifilter: Fix unlocked deletion of default FDB entry When a VNI is deleted from a VXLAN device in 'vnifilter' mode, the FDB entry associated with the default remote (assuming one was configured) is deleted without holding the hash lock. This is wrong and will result in a warning [1] being generated by the lockdep annotation that was added by commit ebe642067455 ("vxlan: Create wrappers for FDB lookup"). Reproducer: # ip link add vx0 up type vxlan dstport 4789 external vnifilter local 192.0.2.1 # bridge vni add vni 10010 remote 198.51.100.1 dev vx0 # bridge vni del vni 10010 dev vx0 Fix by acquiring the hash lock before the deletion and releasing it afterwards. Blame the original commit that introduced the issue rather than the one that exposed it. [1] WARNING: CPU: 3 PID: 392 at drivers/net/vxlan/vxlan_core.c:417 vxlan_find_mac+0x17f/0x1a0 [...] RIP: 0010:vxlan_find_mac+0x17f/0x1a0 [...] 
Call Trace: __vxlan_fdb_delete+0xbe/0x560 vxlan_vni_delete_group+0x2ba/0x940 vxlan_vni_del.isra.0+0x15f/0x580 vxlan_process_vni_filter+0x38b/0x7b0 vxlan_vnifilter_process+0x3bb/0x510 rtnetlink_rcv_msg+0x2f7/0xb70 netlink_rcv_skb+0x131/0x360 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x121/0x1b0 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: f9c4bb0b245c ("vxlan: vni filtering support on collect metadata device") Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250423145131.513029-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 6e6e9f05509ab..06d19e90eadb5 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -627,7 +627,11 @@ static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, * default dst remote_ip previously added for this vni */ if (!vxlan_addr_any(&vninode->remote_ip) || - !vxlan_addr_any(&dst->remote_ip)) + !vxlan_addr_any(&dst->remote_ip)) { + u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, + vninode->vni); + + spin_lock_bh(&vxlan->hash_lock[hash_index]); __vxlan_fdb_delete(vxlan, all_zeros_mac, (vxlan_addr_any(&vninode->remote_ip) ? dst->remote_ip : vninode->remote_ip), @@ -635,6 +639,8 @@ static void vxlan_vni_delete_group(struct vxlan_dev *vxlan, vninode->vni, vninode->vni, dst->remote_ifindex, true); + spin_unlock_bh(&vxlan->hash_lock[hash_index]); + } if (vxlan->dev->flags & IFF_UP) { if (vxlan_addr_multicast(&vninode->remote_ip) && From df8cf32413fa487313eb799f28f87a543480cfa0 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Sat, 19 Apr 2025 22:10:15 +0800 Subject: [PATCH 460/770] selftests: iou-zcrx: Get the page size at runtime Use the `sysconf()` API to query the page size at runtime, instead of using the hard-coded number 4096, and use `posix_memalign()` to allocate page-size-aligned memory. Signed-off-by: Haiyue Wang Reviewed-by: Simon Horman Reviewed-by: David Wei Link: https://patch.msgid.link/20250419141044.10304-1-haiyuewa@163.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/iou-zcrx.c | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index c26b4180eddd3..8aa426014c87b 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -37,8 +37,8 @@ #include -#define PAGE_SIZE (4096) -#define AREA_SIZE (8192 * PAGE_SIZE) +static long page_size; +#define AREA_SIZE (8192 * page_size) #define SEND_SIZE (512 * 4096) #define min(a, b) \ ({ \ @@ -66,7 +66,7 @@ static int cfg_oneshot_recvs; static int cfg_send_size = SEND_SIZE; static struct sockaddr_in6 cfg_addr; -static char payload[SEND_SIZE] __attribute__((aligned(PAGE_SIZE))); +static char *payload; static void *area_ptr; static void *ring_ptr; static size_t ring_size; @@ -114,8 +114,8 @@ static inline size_t get_refill_ring_size(unsigned int rq_entries) ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe); /* add space for the header (head/tail/etc.)
*/ - ring_size += PAGE_SIZE; - return ALIGN_UP(ring_size, 4096); + ring_size += page_size; + return ALIGN_UP(ring_size, page_size); } static void setup_zcrx(struct io_uring *ring) @@ -219,7 +219,7 @@ static void process_accept(struct io_uring *ring, struct io_uring_cqe *cqe) connfd = cqe->res; if (cfg_oneshot) - add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + add_recvzc_oneshot(ring, connfd, page_size); else add_recvzc(ring, connfd); } @@ -245,7 +245,7 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe) if (cfg_oneshot) { if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs) { - add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + add_recvzc_oneshot(ring, connfd, page_size); cfg_oneshot_recvs--; } } else if (!(cqe->flags & IORING_CQE_F_MORE)) { @@ -370,7 +370,7 @@ static void usage(const char *filepath) static void parse_opts(int argc, char **argv) { - const int max_payload_len = sizeof(payload) - + const int max_payload_len = SEND_SIZE - sizeof(struct ipv6hdr) - sizeof(struct tcphdr) - 40 /* max tcp options */; @@ -443,6 +443,13 @@ int main(int argc, char **argv) const char *cfg_test = argv[argc - 1]; int i; + page_size = sysconf(_SC_PAGESIZE); + if (page_size < 0) + return 1; + + if (posix_memalign((void **)&payload, page_size, SEND_SIZE)) + return 1; + parse_opts(argc, argv); for (i = 0; i < SEND_SIZE; i++) From ef7c993ae24781957c225e419cdd2ea19c818c56 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:38 -0700 Subject: [PATCH 461/770] dt-bindings: net: brcm,asp-v2.0: Remove asp-v2.0 Remove asp-v2.0 which was only supported on one SoC that never saw the light of day. Signed-off-by: Justin Chen Acked-by: Conor Dooley Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-2-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- .../bindings/net/brcm,asp-v2.0.yaml | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml b/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml index 660e2ca42daf5..1efbee2c4efd0 100644 --- a/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml +++ b/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml @@ -4,7 +4,7 @@ $id: http://devicetree.org/schemas/net/brcm,asp-v2.0.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# -title: Broadcom ASP 2.0 Ethernet controller +title: Broadcom ASP Ethernet controller maintainers: - Justin Chen @@ -23,10 +23,6 @@ properties: - enum: - brcm,bcm74165-asp - const: brcm,asp-v2.1 - - items: - - enum: - - brcm,bcm72165-asp - - const: brcm,asp-v2.0 "#address-cells": const: 1 @@ -39,11 +35,9 @@ properties: ranges: true interrupts: - minItems: 1 items: - description: RX/TX interrupt - - description: Port 0 Wake-on-LAN - - description: Port 1 Wake-on-LAN + - description: Wake-on-LAN interrupt clocks: maxItems: 1 @@ -106,16 +100,17 @@ examples: #include ethernet@9c00000 { - compatible = "brcm,bcm72165-asp", "brcm,asp-v2.0"; + compatible = "brcm,bcm74165-asp", "brcm,asp-v2.1"; reg = <0x9c00000 0x1fff14>; - interrupts = ; + interrupts-extended = <&intc GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>, + <&aon_pm_l2_intc 14>; ranges = <0x0 0x9c00000 0x1fff14>; clocks = <&scmi 14>; #address-cells = <1>; #size-cells = <1>; mdio@c614 { - compatible = "brcm,asp-v2.0-mdio"; + compatible = "brcm,asp-v2.1-mdio"; reg = <0xc614 0x8>; reg-names = "mdio"; #address-cells = <1>; @@ -127,7 +122,7 @@ examples: }; mdio@ce14 { - compatible = "brcm,asp-v2.0-mdio"; + compatible = 
"brcm,asp-v2.1-mdio"; reg = <0xce14 0x8>; reg-names = "mdio"; #address-cells = <1>; From 62c8c4656ef1ce4ad14926e459816c77de3bbb9e Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:39 -0700 Subject: [PATCH 462/770] dt-bindings: net: brcm,unimac-mdio: Remove asp-v2.0 Remove asp-v2.0 which was only supported on one SoC that never saw the light of day. Signed-off-by: Justin Chen Acked-by: Conor Dooley Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-3-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml index 63bee5b542f50..7eb6d5839f0bd 100644 --- a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml +++ b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml @@ -22,7 +22,6 @@ properties: - brcm,genet-mdio-v3 - brcm,genet-mdio-v4 - brcm,genet-mdio-v5 - - brcm,asp-v2.0-mdio - brcm,asp-v2.1-mdio - brcm,asp-v2.2-mdio - brcm,unimac-mdio From 4ad8cb76bd0d17827fd0c84707c0d72c8710b0e9 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:40 -0700 Subject: [PATCH 463/770] net: bcmasp: Remove support for asp-v2.0 The SoC that supported asp-v2.0 never saw the light of day. asp-v2.0 has quirks that makes the logic overly complicated. For example, asp-v2.0 is the only revision that has a different wake up IRQ hook up. Remove asp-v2.0 support to make supporting future HW revisions cleaner. Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-4-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 97 ++----------------- drivers/net/ethernet/broadcom/asp2/bcmasp.h | 45 ++------- .../ethernet/broadcom/asp2/bcmasp_ethtool.c | 21 +--- .../net/ethernet/broadcom/asp2/bcmasp_intf.c | 2 +- .../ethernet/broadcom/asp2/bcmasp_intf_defs.h | 3 +- 5 files changed, 22 insertions(+), 146 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index a68fab1b05f0c..1eab372966084 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -141,7 +141,7 @@ void bcmasp_flush_rx_port(struct bcmasp_intf *intf) return; } - rx_ctrl_core_wl(priv, mask, priv->hw_info->rx_ctrl_flush); + rx_ctrl_core_wl(priv, mask, ASP_RX_CTRL_FLUSH); } static void bcmasp_netfilt_hw_en_wake(struct bcmasp_priv *priv, @@ -1108,7 +1108,7 @@ static int bcmasp_get_and_request_irq(struct bcmasp_priv *priv, int i) return irq; } -static void bcmasp_init_wol_shared(struct bcmasp_priv *priv) +static void bcmasp_init_wol(struct bcmasp_priv *priv) { struct platform_device *pdev = priv->pdev; struct device *dev = &pdev->dev; @@ -1125,7 +1125,7 @@ static void bcmasp_init_wol_shared(struct bcmasp_priv *priv) device_set_wakeup_capable(&pdev->dev, 1); } -static void bcmasp_enable_wol_shared(struct bcmasp_intf *intf, bool en) +void bcmasp_enable_wol(struct bcmasp_intf *intf, bool en) { struct bcmasp_priv *priv = intf->parent; struct device *dev = &priv->pdev->dev; @@ -1154,54 +1154,12 @@ static void bcmasp_enable_wol_shared(struct bcmasp_intf *intf, bool en) } } -static void bcmasp_wol_irq_destroy_shared(struct bcmasp_priv *priv) +static void bcmasp_wol_irq_destroy(struct bcmasp_priv *priv) { if (priv->wol_irq > 0) 
free_irq(priv->wol_irq, priv); } -static void bcmasp_init_wol_per_intf(struct bcmasp_priv *priv) -{ - struct platform_device *pdev = priv->pdev; - struct device *dev = &pdev->dev; - struct bcmasp_intf *intf; - int irq; - - list_for_each_entry(intf, &priv->intfs, list) { - irq = bcmasp_get_and_request_irq(priv, intf->port + 1); - if (irq < 0) { - dev_warn(dev, "Failed to init WoL irq(port %d): %d\n", - intf->port, irq); - continue; - } - - intf->wol_irq = irq; - intf->wol_irq_enabled = false; - device_set_wakeup_capable(&pdev->dev, 1); - } -} - -static void bcmasp_enable_wol_per_intf(struct bcmasp_intf *intf, bool en) -{ - struct device *dev = &intf->parent->pdev->dev; - - if (en ^ intf->wol_irq_enabled) - irq_set_irq_wake(intf->wol_irq, en); - - intf->wol_irq_enabled = en; - device_set_wakeup_enable(dev, en); -} - -static void bcmasp_wol_irq_destroy_per_intf(struct bcmasp_priv *priv) -{ - struct bcmasp_intf *intf; - - list_for_each_entry(intf, &priv->intfs, list) { - if (intf->wol_irq > 0) - free_irq(intf->wol_irq, priv); - } -} - static void bcmasp_eee_fixup(struct bcmasp_intf *intf, bool en) { u32 reg, phy_lpi_overwrite; @@ -1220,60 +1178,22 @@ static void bcmasp_eee_fixup(struct bcmasp_intf *intf, bool en) usleep_range(50, 100); } -static struct bcmasp_hw_info v20_hw_info = { - .rx_ctrl_flush = ASP_RX_CTRL_FLUSH, - .umac2fb = UMAC2FB_OFFSET, - .rx_ctrl_fb_out_frame_count = ASP_RX_CTRL_FB_OUT_FRAME_COUNT, - .rx_ctrl_fb_filt_out_frame_count = ASP_RX_CTRL_FB_FILT_OUT_FRAME_COUNT, - .rx_ctrl_fb_rx_fifo_depth = ASP_RX_CTRL_FB_RX_FIFO_DEPTH, -}; - -static const struct bcmasp_plat_data v20_plat_data = { - .init_wol = bcmasp_init_wol_per_intf, - .enable_wol = bcmasp_enable_wol_per_intf, - .destroy_wol = bcmasp_wol_irq_destroy_per_intf, - .core_clock_select = bcmasp_core_clock_select_one, - .hw_info = &v20_hw_info, -}; - -static struct bcmasp_hw_info v21_hw_info = { - .rx_ctrl_flush = ASP_RX_CTRL_FLUSH_2_1, - .umac2fb = UMAC2FB_OFFSET_2_1, - .rx_ctrl_fb_out_frame_count = ASP_RX_CTRL_FB_OUT_FRAME_COUNT_2_1, - .rx_ctrl_fb_filt_out_frame_count = - ASP_RX_CTRL_FB_FILT_OUT_FRAME_COUNT_2_1, - .rx_ctrl_fb_rx_fifo_depth = ASP_RX_CTRL_FB_RX_FIFO_DEPTH_2_1, -}; - static const struct bcmasp_plat_data v21_plat_data = { - .init_wol = bcmasp_init_wol_shared, - .enable_wol = bcmasp_enable_wol_shared, - .destroy_wol = bcmasp_wol_irq_destroy_shared, .core_clock_select = bcmasp_core_clock_select_one, - .hw_info = &v21_hw_info, }; static const struct bcmasp_plat_data v22_plat_data = { - .init_wol = bcmasp_init_wol_shared, - .enable_wol = bcmasp_enable_wol_shared, - .destroy_wol = bcmasp_wol_irq_destroy_shared, .core_clock_select = bcmasp_core_clock_select_many, - .hw_info = &v21_hw_info, .eee_fixup = bcmasp_eee_fixup, }; static void bcmasp_set_pdata(struct bcmasp_priv *priv, const struct bcmasp_plat_data *pdata) { - priv->init_wol = pdata->init_wol; - priv->enable_wol = pdata->enable_wol; - priv->destroy_wol = pdata->destroy_wol; priv->core_clock_select = pdata->core_clock_select; priv->eee_fixup = pdata->eee_fixup; - priv->hw_info = pdata->hw_info; } static const struct of_device_id bcmasp_of_match[] = { - { .compatible = "brcm,asp-v2.0", .data = &v20_plat_data }, { .compatible = "brcm,asp-v2.1", .data = &v21_plat_data }, { .compatible = "brcm,asp-v2.2", .data = &v22_plat_data }, { /* sentinel */ }, @@ -1281,9 +1201,8 @@ static const struct of_device_id bcmasp_of_match[] = { MODULE_DEVICE_TABLE(of, bcmasp_of_match); static const struct of_device_id bcmasp_mdio_of_match[] = { - { .compatible = 
"brcm,asp-v2.2-mdio", }, { .compatible = "brcm,asp-v2.1-mdio", }, - { .compatible = "brcm,asp-v2.0-mdio", }, + { .compatible = "brcm,asp-v2.2-mdio", }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, bcmasp_mdio_of_match); @@ -1387,7 +1306,7 @@ static int bcmasp_probe(struct platform_device *pdev) } /* Check and enable WoL */ - priv->init_wol(priv); + bcmasp_init_wol(priv); /* Drop the clock reference count now and let ndo_open()/ndo_close() * manage it for us from now on. @@ -1404,7 +1323,7 @@ static int bcmasp_probe(struct platform_device *pdev) if (ret) { netdev_err(intf->ndev, "failed to register net_device: %d\n", ret); - priv->destroy_wol(priv); + bcmasp_wol_irq_destroy(priv); bcmasp_remove_intfs(priv); goto of_put_exit; } @@ -1425,7 +1344,7 @@ static void bcmasp_remove(struct platform_device *pdev) if (!priv) return; - priv->destroy_wol(priv); + bcmasp_wol_irq_destroy(priv); bcmasp_remove_intfs(priv); } diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.h b/drivers/net/ethernet/broadcom/asp2/bcmasp.h index 8fc75bcedb70f..6f49ebad4e99c 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.h +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.h @@ -53,22 +53,15 @@ #define ASP_RX_CTRL_FB_0_FRAME_COUNT 0x14 #define ASP_RX_CTRL_FB_1_FRAME_COUNT 0x18 #define ASP_RX_CTRL_FB_8_FRAME_COUNT 0x1c -/* asp2.1 diverges offsets here */ -/* ASP2.0 */ -#define ASP_RX_CTRL_FB_OUT_FRAME_COUNT 0x20 -#define ASP_RX_CTRL_FB_FILT_OUT_FRAME_COUNT 0x24 -#define ASP_RX_CTRL_FLUSH 0x28 -#define ASP_CTRL_UMAC0_FLUSH_MASK (BIT(0) | BIT(12)) -#define ASP_CTRL_UMAC1_FLUSH_MASK (BIT(1) | BIT(13)) -#define ASP_CTRL_SPB_FLUSH_MASK (BIT(8) | BIT(20)) -#define ASP_RX_CTRL_FB_RX_FIFO_DEPTH 0x30 -/* ASP2.1 */ -#define ASP_RX_CTRL_FB_9_FRAME_COUNT_2_1 0x20 -#define ASP_RX_CTRL_FB_10_FRAME_COUNT_2_1 0x24 -#define ASP_RX_CTRL_FB_OUT_FRAME_COUNT_2_1 0x28 -#define ASP_RX_CTRL_FB_FILT_OUT_FRAME_COUNT_2_1 0x2c -#define ASP_RX_CTRL_FLUSH_2_1 0x30 -#define ASP_RX_CTRL_FB_RX_FIFO_DEPTH_2_1 0x38 +#define ASP_RX_CTRL_FB_9_FRAME_COUNT 0x20 +#define ASP_RX_CTRL_FB_10_FRAME_COUNT 0x24 +#define ASP_RX_CTRL_FB_OUT_FRAME_COUNT 0x28 +#define ASP_RX_CTRL_FB_FILT_OUT_FRAME_COUNT 0x2c +#define ASP_RX_CTRL_FLUSH 0x30 +#define ASP_CTRL_UMAC0_FLUSH_MASK (BIT(0) | BIT(12)) +#define ASP_CTRL_UMAC1_FLUSH_MASK (BIT(1) | BIT(13)) +#define ASP_CTRL_SPB_FLUSH_MASK (BIT(8) | BIT(20)) +#define ASP_RX_CTRL_FB_RX_FIFO_DEPTH 0x38 #define ASP_RX_FILTER_OFFSET 0x80000 #define ASP_RX_FILTER_BLK_CTRL 0x0 @@ -345,9 +338,6 @@ struct bcmasp_intf { u32 wolopts; u8 sopass[SOPASS_MAX]; - /* Used if per intf wol irq */ - int wol_irq; - unsigned int wol_irq_enabled:1; }; #define NUM_NET_FILTERS 32 @@ -370,21 +360,9 @@ struct bcmasp_mda_filter { u8 mask[ETH_ALEN]; }; -struct bcmasp_hw_info { - u32 rx_ctrl_flush; - u32 umac2fb; - u32 rx_ctrl_fb_out_frame_count; - u32 rx_ctrl_fb_filt_out_frame_count; - u32 rx_ctrl_fb_rx_fifo_depth; -}; - struct bcmasp_plat_data { - void (*init_wol)(struct bcmasp_priv *priv); - void (*enable_wol)(struct bcmasp_intf *intf, bool en); - void (*destroy_wol)(struct bcmasp_priv *priv); void (*core_clock_select)(struct bcmasp_priv *priv, bool slow); void (*eee_fixup)(struct bcmasp_intf *priv, bool en); - struct bcmasp_hw_info *hw_info; }; struct bcmasp_priv { @@ -399,14 +377,10 @@ struct bcmasp_priv { int wol_irq; unsigned long wol_irq_enabled_mask; - void (*init_wol)(struct bcmasp_priv *priv); - void (*enable_wol)(struct bcmasp_intf *intf, bool en); - void (*destroy_wol)(struct bcmasp_priv *priv); void (*core_clock_select)(struct bcmasp_priv 
*priv, bool slow); void (*eee_fixup)(struct bcmasp_intf *intf, bool en); void __iomem *base; - struct bcmasp_hw_info *hw_info; struct list_head intfs; @@ -599,4 +573,5 @@ int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, void bcmasp_netfilt_suspend(struct bcmasp_intf *intf); +void bcmasp_enable_wol(struct bcmasp_intf *intf, bool en); #endif diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c index a537c121d3e27..991a87096e087 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c @@ -71,23 +71,6 @@ static const struct bcmasp_stats bcmasp_gstrings_stats[] = { #define BCMASP_STATS_LEN ARRAY_SIZE(bcmasp_gstrings_stats) -static u16 bcmasp_stat_fixup_offset(struct bcmasp_intf *intf, - const struct bcmasp_stats *s) -{ - struct bcmasp_priv *priv = intf->parent; - - if (!strcmp("Frames Out(Buffer)", s->stat_string)) - return priv->hw_info->rx_ctrl_fb_out_frame_count; - - if (!strcmp("Frames Out(Filters)", s->stat_string)) - return priv->hw_info->rx_ctrl_fb_filt_out_frame_count; - - if (!strcmp("RX Buffer FIFO Depth", s->stat_string)) - return priv->hw_info->rx_ctrl_fb_rx_fifo_depth; - - return s->reg_offset; -} - static int bcmasp_get_sset_count(struct net_device *dev, int string_set) { switch (string_set) { @@ -126,7 +109,7 @@ static void bcmasp_update_mib_counters(struct bcmasp_intf *intf) char *p; s = &bcmasp_gstrings_stats[i]; - offset = bcmasp_stat_fixup_offset(intf, s); + offset = s->reg_offset; switch (s->type) { case BCMASP_STAT_SOFT: continue; @@ -215,7 +198,7 @@ static int bcmasp_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol) memcpy(intf->sopass, wol->sopass, sizeof(wol->sopass)); mutex_lock(&priv->wol_lock); - priv->enable_wol(intf, !!intf->wolopts); + bcmasp_enable_wol(intf, !!intf->wolopts); mutex_unlock(&priv->wol_lock); return 0; diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 45ec1a9214a28..01de05ec3ebc6 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -1185,7 +1185,7 @@ static void bcmasp_map_res(struct bcmasp_priv *priv, struct bcmasp_intf *intf) { /* Per port */ intf->res.umac = priv->base + UMC_OFFSET(intf); - intf->res.umac2fb = priv->base + (priv->hw_info->umac2fb + + intf->res.umac2fb = priv->base + (UMAC2FB_OFFSET + (intf->port * 0x4)); intf->res.rgmii = priv->base + RGMII_OFFSET(intf); diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h index ad742612895fc..af7418348e815 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h @@ -118,8 +118,7 @@ #define UMC_PSW_MS 0x624 #define UMC_PSW_LS 0x628 -#define UMAC2FB_OFFSET_2_1 0x9f044 -#define UMAC2FB_OFFSET 0x9f03c +#define UMAC2FB_OFFSET 0x9f044 #define UMAC2FB_CFG 0x0 #define UMAC2FB_CFG_OPUT_EN BIT(0) #define UMAC2FB_CFG_VLAN_EN BIT(1) From 8c28aace88646ce0ac9567fee3ac7d808c6d0f86 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:41 -0700 Subject: [PATCH 464/770] net: phy: mdio-bcm-unimac: Remove asp-v2.0 Remove asp-v2.0 which will no longer be supported. 
Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-5-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-bcm-unimac.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c index 074d96328f41a..e2800b2e0288e 100644 --- a/drivers/net/mdio/mdio-bcm-unimac.c +++ b/drivers/net/mdio/mdio-bcm-unimac.c @@ -336,7 +336,6 @@ static SIMPLE_DEV_PM_OPS(unimac_mdio_pm_ops, static const struct of_device_id unimac_mdio_ids[] = { { .compatible = "brcm,asp-v2.2-mdio", }, { .compatible = "brcm,asp-v2.1-mdio", }, - { .compatible = "brcm,asp-v2.0-mdio", }, { .compatible = "brcm,bcm6846-mdio", }, { .compatible = "brcm,genet-mdio-v5", }, { .compatible = "brcm,genet-mdio-v4", }, From e4bf8f8a22d83ef82fc679b835072a290889b3f8 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:42 -0700 Subject: [PATCH 465/770] dt-bindings: net: brcm,asp-v2.0: Add asp-v3.0 Add asp-v3.0 support. v3.0 is a major revision that reduces the feature set for cost savings. We have a reduced number of channels and network filters. Signed-off-by: Justin Chen Acked-by: Conor Dooley Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-6-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml b/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml index 1efbee2c4efd0..a3db6d594c8c9 100644 --- a/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml +++ b/Documentation/devicetree/bindings/net/brcm,asp-v2.0.yaml @@ -15,6 +15,10 @@ description: Broadcom Ethernet controller first introduced with 72165 properties: compatible: oneOf: + - items: + - enum: + - brcm,bcm74110-asp + - const: brcm,asp-v3.0 - items: - enum: - brcm,bcm74165b0-asp From 9a8a73766b34a2e8ef1b2dfa85bfecece2cd3702 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:43 -0700 Subject: [PATCH 466/770] dt-bindings: net: brcm,unimac-mdio: Add asp-v3.0 The asp-v3.0 Ethernet controller uses a brcm unimac like its predecessor. Signed-off-by: Justin Chen Acked-by: Conor Dooley Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-7-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml index 7eb6d5839f0bd..43516dd357b8c 100644 --- a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml +++ b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.yaml @@ -24,6 +24,7 @@ properties: - brcm,genet-mdio-v5 - brcm,asp-v2.1-mdio - brcm,asp-v2.2-mdio + - brcm,asp-v3.0-mdio - brcm,unimac-mdio - brcm,bcm6846-mdio From e9f31435ee7d1dd350f5efaf9de7b0db3ad4bbfe Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:44 -0700 Subject: [PATCH 467/770] net: bcmasp: Add support for asp-v3.0 The asp-v3.0 is a major HW revision that reduced the number of channels and filters. The goal was to save cost by reducing the feature set. Changes for asp-v3.0: - Number of network filters was reduced. - Number of channels was reduced. - EDPKT stats were removed. - Fixed a bug with csum offload. The reduced limits are carried per revision via plat_data, as sketched below.
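For context on how such per-revision limits are typically wired up, here is a minimal sketch of the of_device_id .data pattern (simplified, assumed names; not the driver's actual structures):

#include <linux/mod_devicetable.h>

/* Per-revision parameters, selected by compatible string. */
struct sketch_plat_data {
	unsigned int num_mda_filters;
	unsigned int num_net_filters;
};

static const struct sketch_plat_data sketch_v30_data = {
	.num_mda_filters = 20,	/* reduced limits on v3.0 */
	.num_net_filters = 16,
};

static const struct of_device_id sketch_of_match[] = {
	{ .compatible = "brcm,asp-v3.0", .data = &sketch_v30_data },
	{ /* sentinel */ },
};

/* At probe time the driver retrieves the .data pointer for the
 * matched compatible (e.g. via of_device_get_match_data()) and sizes
 * its filter tables from it instead of a compile-time constant.
 */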
Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-8-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/asp2/bcmasp.c | 79 +++++++++++++------ drivers/net/ethernet/broadcom/asp2/bcmasp.h | 33 ++++++-- .../ethernet/broadcom/asp2/bcmasp_ethtool.c | 15 +--- .../net/ethernet/broadcom/asp2/bcmasp_intf.c | 13 ++- 4 files changed, 92 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.c b/drivers/net/ethernet/broadcom/asp2/bcmasp.c index 1eab372966084..fd35f4b4dc50b 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.c @@ -518,7 +518,7 @@ void bcmasp_netfilt_suspend(struct bcmasp_intf *intf) int ret, i; /* Write all filters to HW */ - for (i = 0; i < NUM_NET_FILTERS; i++) { + for (i = 0; i < priv->num_net_filters; i++) { /* If the filter does not match the port, skip programming. */ if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) @@ -551,7 +551,7 @@ int bcmasp_netfilt_get_all_active(struct bcmasp_intf *intf, u32 *rule_locs, struct bcmasp_priv *priv = intf->parent; int j = 0, i; - for (i = 0; i < NUM_NET_FILTERS; i++) { + for (i = 0; i < priv->num_net_filters; i++) { if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) continue; @@ -577,7 +577,7 @@ int bcmasp_netfilt_get_active(struct bcmasp_intf *intf) struct bcmasp_priv *priv = intf->parent; int cnt = 0, i; - for (i = 0; i < NUM_NET_FILTERS; i++) { + for (i = 0; i < priv->num_net_filters; i++) { if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) continue; @@ -602,7 +602,7 @@ bool bcmasp_netfilt_check_dup(struct bcmasp_intf *intf, size_t fs_size = 0; int i; - for (i = 0; i < NUM_NET_FILTERS; i++) { + for (i = 0; i < priv->num_net_filters; i++) { if (!priv->net_filters[i].claimed || priv->net_filters[i].port != intf->port) continue; @@ -670,7 +670,7 @@ struct bcmasp_net_filter *bcmasp_netfilt_get_init(struct bcmasp_intf *intf, int i, open_index = -1; /* Check whether we exceed the filter table capacity */ - if (loc != RX_CLS_LOC_ANY && loc >= NUM_NET_FILTERS) + if (loc != RX_CLS_LOC_ANY && loc >= priv->num_net_filters) return ERR_PTR(-EINVAL); /* If the filter location is busy (already claimed) and we are initializing @@ -686,7 +686,7 @@ struct bcmasp_net_filter *bcmasp_netfilt_get_init(struct bcmasp_intf *intf, /* Initialize the loop index based on the desired location or from 0 */ i = loc == RX_CLS_LOC_ANY ? 
0 : loc; - for ( ; i < NUM_NET_FILTERS; i++) { + for ( ; i < priv->num_net_filters; i++) { /* Found matching network filter */ if (!init && priv->net_filters[i].claimed && @@ -779,7 +779,7 @@ static void bcmasp_en_mda_filter(struct bcmasp_intf *intf, bool en, priv->mda_filters[i].en = en; priv->mda_filters[i].port = intf->port; - rx_filter_core_wl(priv, ((intf->channel + 8) | + rx_filter_core_wl(priv, ((intf->channel + priv->tx_chan_offset) | (en << ASP_RX_FILTER_MDA_CFG_EN_SHIFT) | ASP_RX_FILTER_MDA_CFG_UMC_SEL(intf->port)), ASP_RX_FILTER_MDA_CFG(i)); @@ -865,7 +865,7 @@ void bcmasp_disable_all_filters(struct bcmasp_intf *intf) res_count = bcmasp_total_res_mda_cnt(intf->parent); /* Disable all filters held by this port */ - for (i = res_count; i < NUM_MDA_FILTERS; i++) { + for (i = res_count; i < priv->num_mda_filters; i++) { if (priv->mda_filters[i].en && priv->mda_filters[i].port == intf->port) bcmasp_en_mda_filter(intf, 0, i); @@ -909,7 +909,7 @@ int bcmasp_set_en_mda_filter(struct bcmasp_intf *intf, unsigned char *addr, res_count = bcmasp_total_res_mda_cnt(intf->parent); - for (i = res_count; i < NUM_MDA_FILTERS; i++) { + for (i = res_count; i < priv->num_mda_filters; i++) { /* If filter not enabled or belongs to another port skip */ if (!priv->mda_filters[i].en || priv->mda_filters[i].port != intf->port) @@ -924,7 +924,7 @@ int bcmasp_set_en_mda_filter(struct bcmasp_intf *intf, unsigned char *addr, } /* Create new filter if possible */ - for (i = res_count; i < NUM_MDA_FILTERS; i++) { + for (i = res_count; i < priv->num_mda_filters; i++) { if (priv->mda_filters[i].en) continue; @@ -944,12 +944,12 @@ static void bcmasp_core_init_filters(struct bcmasp_priv *priv) /* Disable all filters and reset software view since the HW * can lose context while in deep sleep suspend states */ - for (i = 0; i < NUM_MDA_FILTERS; i++) { + for (i = 0; i < priv->num_mda_filters; i++) { rx_filter_core_wl(priv, 0x0, ASP_RX_FILTER_MDA_CFG(i)); priv->mda_filters[i].en = 0; } - for (i = 0; i < NUM_NET_FILTERS; i++) + for (i = 0; i < priv->num_net_filters; i++) rx_filter_core_wl(priv, 0x0, ASP_RX_FILTER_NET_CFG(i)); /* Top level filter enable bit should be enabled at all times, set @@ -966,18 +966,8 @@ static void bcmasp_core_init_filters(struct bcmasp_priv *priv) /* ASP core initialization */ static void bcmasp_core_init(struct bcmasp_priv *priv) { - tx_analytics_core_wl(priv, 0x0, ASP_TX_ANALYTICS_CTRL); - rx_analytics_core_wl(priv, 0x4, ASP_RX_ANALYTICS_CTRL); - - rx_edpkt_core_wl(priv, (ASP_EDPKT_HDR_SZ_128 << ASP_EDPKT_HDR_SZ_SHIFT), - ASP_EDPKT_HDR_CFG); - rx_edpkt_core_wl(priv, - (ASP_EDPKT_ENDI_BT_SWP_WD << ASP_EDPKT_ENDI_DESC_SHIFT), - ASP_EDPKT_ENDI); - rx_edpkt_core_wl(priv, 0x1b, ASP_EDPKT_BURST_BUF_PSCAL_TOUT); rx_edpkt_core_wl(priv, 0x3e8, ASP_EDPKT_BURST_BUF_WRITE_TOUT); - rx_edpkt_core_wl(priv, 0x3e8, ASP_EDPKT_BURST_BUF_READ_TOUT); rx_edpkt_core_wl(priv, ASP_EDPKT_ENABLE_EN, ASP_EDPKT_ENABLE); @@ -1020,6 +1010,18 @@ static void bcmasp_core_clock_select_one(struct bcmasp_priv *priv, bool slow) ctrl_core_wl(priv, reg, ASP_CTRL_CORE_CLOCK_SELECT); } +static void bcmasp_core_clock_select_one_ctrl2(struct bcmasp_priv *priv, bool slow) +{ + u32 reg; + + reg = ctrl2_core_rl(priv, ASP_CTRL2_CORE_CLOCK_SELECT); + if (slow) + reg &= ~ASP_CTRL2_CORE_CLOCK_SELECT_MAIN; + else + reg |= ASP_CTRL2_CORE_CLOCK_SELECT_MAIN; + ctrl2_core_wl(priv, reg, ASP_CTRL2_CORE_CLOCK_SELECT); +} + static void bcmasp_core_clock_set_ll(struct bcmasp_priv *priv, u32 clr, u32 set) { u32 reg; @@ -1180,22 +1182,43 @@ static void 
bcmasp_eee_fixup(struct bcmasp_intf *intf, bool en) static const struct bcmasp_plat_data v21_plat_data = { .core_clock_select = bcmasp_core_clock_select_one, + .num_mda_filters = 32, + .num_net_filters = 32, + .tx_chan_offset = 8, + .rx_ctrl_offset = 0x0, }; static const struct bcmasp_plat_data v22_plat_data = { .core_clock_select = bcmasp_core_clock_select_many, .eee_fixup = bcmasp_eee_fixup, + .num_mda_filters = 32, + .num_net_filters = 32, + .tx_chan_offset = 8, + .rx_ctrl_offset = 0x0, +}; + +static const struct bcmasp_plat_data v30_plat_data = { + .core_clock_select = bcmasp_core_clock_select_one_ctrl2, + .num_mda_filters = 20, + .num_net_filters = 16, + .tx_chan_offset = 0, + .rx_ctrl_offset = 0x10000, }; static void bcmasp_set_pdata(struct bcmasp_priv *priv, const struct bcmasp_plat_data *pdata) { priv->core_clock_select = pdata->core_clock_select; priv->eee_fixup = pdata->eee_fixup; + priv->num_mda_filters = pdata->num_mda_filters; + priv->num_net_filters = pdata->num_net_filters; + priv->tx_chan_offset = pdata->tx_chan_offset; + priv->rx_ctrl_offset = pdata->rx_ctrl_offset; } static const struct of_device_id bcmasp_of_match[] = { { .compatible = "brcm,asp-v2.1", .data = &v21_plat_data }, { .compatible = "brcm,asp-v2.2", .data = &v22_plat_data }, + { .compatible = "brcm,asp-v3.0", .data = &v30_plat_data }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, bcmasp_of_match); @@ -1203,6 +1226,7 @@ MODULE_DEVICE_TABLE(of, bcmasp_of_match); static const struct of_device_id bcmasp_mdio_of_match[] = { { .compatible = "brcm,asp-v2.1-mdio", }, { .compatible = "brcm,asp-v2.2-mdio", }, + { .compatible = "brcm,asp-v3.0-mdio", }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, bcmasp_mdio_of_match); @@ -1284,6 +1308,17 @@ static int bcmasp_probe(struct platform_device *pdev) * how many interfaces come up. 
*/ bcmasp_core_init(priv); + + priv->mda_filters = devm_kcalloc(dev, priv->num_mda_filters, + sizeof(*priv->mda_filters), GFP_KERNEL); + if (!priv->mda_filters) + return -ENOMEM; + + priv->net_filters = devm_kcalloc(dev, priv->num_net_filters, + sizeof(*priv->net_filters), GFP_KERNEL); + if (!priv->net_filters) + return -ENOMEM; + bcmasp_core_init_filters(priv); ports_node = of_find_node_by_name(dev->of_node, "ethernet-ports"); diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.h b/drivers/net/ethernet/broadcom/asp2/bcmasp.h index 6f49ebad4e99c..74adfdb50e11d 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp.h +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.h @@ -363,6 +363,10 @@ struct bcmasp_mda_filter { struct bcmasp_plat_data { void (*core_clock_select)(struct bcmasp_priv *priv, bool slow); void (*eee_fixup)(struct bcmasp_intf *priv, bool en); + unsigned int num_mda_filters; + unsigned int num_net_filters; + unsigned int tx_chan_offset; + unsigned int rx_ctrl_offset; }; struct bcmasp_priv { @@ -379,12 +383,16 @@ struct bcmasp_priv { void (*core_clock_select)(struct bcmasp_priv *priv, bool slow); void (*eee_fixup)(struct bcmasp_intf *intf, bool en); + unsigned int num_mda_filters; + unsigned int num_net_filters; + unsigned int tx_chan_offset; + unsigned int rx_ctrl_offset; void __iomem *base; struct list_head intfs; - struct bcmasp_mda_filter mda_filters[NUM_MDA_FILTERS]; + struct bcmasp_mda_filter *mda_filters; /* MAC destination address filters lock */ spinlock_t mda_lock; @@ -392,7 +400,7 @@ struct bcmasp_priv { /* Protects accesses to ASP_CTRL_CLOCK_CTRL */ spinlock_t clk_lock; - struct bcmasp_net_filter net_filters[NUM_NET_FILTERS]; + struct bcmasp_net_filter *net_filters; /* Network filter lock */ struct mutex net_lock; @@ -482,8 +490,8 @@ BCMASP_FP_IO_MACRO_Q(rx_edpkt_cfg); #define PKT_OFFLOAD_EPKT_IP(x) ((x) << 21) #define PKT_OFFLOAD_EPKT_TP(x) ((x) << 19) #define PKT_OFFLOAD_EPKT_LEN(x) ((x) << 16) -#define PKT_OFFLOAD_EPKT_CSUM_L3 BIT(15) -#define PKT_OFFLOAD_EPKT_CSUM_L2 BIT(14) +#define PKT_OFFLOAD_EPKT_CSUM_L4 BIT(15) +#define PKT_OFFLOAD_EPKT_CSUM_L3 BIT(14) #define PKT_OFFLOAD_EPKT_ID(x) ((x) << 12) #define PKT_OFFLOAD_EPKT_SEQ(x) ((x) << 10) #define PKT_OFFLOAD_EPKT_TS(x) ((x) << 8) @@ -515,12 +523,27 @@ BCMASP_CORE_IO_MACRO(intr2, ASP_INTR2_OFFSET); BCMASP_CORE_IO_MACRO(wakeup_intr2, ASP_WAKEUP_INTR2_OFFSET); BCMASP_CORE_IO_MACRO(tx_analytics, ASP_TX_ANALYTICS_OFFSET); BCMASP_CORE_IO_MACRO(rx_analytics, ASP_RX_ANALYTICS_OFFSET); -BCMASP_CORE_IO_MACRO(rx_ctrl, ASP_RX_CTRL_OFFSET); BCMASP_CORE_IO_MACRO(rx_filter, ASP_RX_FILTER_OFFSET); BCMASP_CORE_IO_MACRO(rx_edpkt, ASP_EDPKT_OFFSET); BCMASP_CORE_IO_MACRO(ctrl, ASP_CTRL_OFFSET); BCMASP_CORE_IO_MACRO(ctrl2, ASP_CTRL2_OFFSET); +#define BCMASP_CORE_IO_MACRO_OFFSET(name, offset) \ +static inline u32 name##_core_rl(struct bcmasp_priv *priv, \ + u32 off) \ +{ \ + u32 reg = readl_relaxed(priv->base + priv->name##_offset + \ + (offset) + off); \ + return reg; \ +} \ +static inline void name##_core_wl(struct bcmasp_priv *priv, \ + u32 val, u32 off) \ +{ \ + writel_relaxed(val, priv->base + priv->name##_offset + \ + (offset) + off); \ +} +BCMASP_CORE_IO_MACRO_OFFSET(rx_ctrl, ASP_RX_CTRL_OFFSET); + struct bcmasp_intf *bcmasp_interface_create(struct bcmasp_priv *priv, struct device_node *ndev_dn, int i); diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c index 991a87096e087..4381a4cfd8c6f 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c +++ 
b/drivers/net/ethernet/broadcom/asp2/bcmasp_ethtool.c @@ -10,7 +10,6 @@ #include "bcmasp_intf_defs.h" enum bcmasp_stat_type { - BCMASP_STAT_RX_EDPKT, BCMASP_STAT_RX_CTRL, BCMASP_STAT_RX_CTRL_PER_INTF, BCMASP_STAT_SOFT, @@ -33,8 +32,6 @@ struct bcmasp_stats { .reg_offset = offset, \ } -#define STAT_BCMASP_RX_EDPKT(str, offset) \ - STAT_BCMASP_OFFSET(str, BCMASP_STAT_RX_EDPKT, offset) #define STAT_BCMASP_RX_CTRL(str, offset) \ STAT_BCMASP_OFFSET(str, BCMASP_STAT_RX_CTRL, offset) #define STAT_BCMASP_RX_CTRL_PER_INTF(str, offset) \ @@ -42,11 +39,6 @@ struct bcmasp_stats { /* Must match the order of struct bcmasp_mib_counters */ static const struct bcmasp_stats bcmasp_gstrings_stats[] = { - /* EDPKT counters */ - STAT_BCMASP_RX_EDPKT("RX Time Stamp", ASP_EDPKT_RX_TS_COUNTER), - STAT_BCMASP_RX_EDPKT("RX PKT Count", ASP_EDPKT_RX_PKT_CNT), - STAT_BCMASP_RX_EDPKT("RX PKT Buffered", ASP_EDPKT_HDR_EXTR_CNT), - STAT_BCMASP_RX_EDPKT("RX PKT Pushed to DRAM", ASP_EDPKT_HDR_OUT_CNT), /* ASP RX control */ STAT_BCMASP_RX_CTRL_PER_INTF("Frames From Unimac", ASP_RX_CTRL_UMAC_0_FRAME_COUNT), @@ -113,9 +105,6 @@ static void bcmasp_update_mib_counters(struct bcmasp_intf *intf) switch (s->type) { case BCMASP_STAT_SOFT: continue; - case BCMASP_STAT_RX_EDPKT: - val = rx_edpkt_core_rl(intf->parent, offset); - break; case BCMASP_STAT_RX_CTRL: val = rx_ctrl_core_rl(intf->parent, offset); break; @@ -272,7 +261,7 @@ static int bcmasp_flow_get(struct bcmasp_intf *intf, struct ethtool_rxnfc *cmd) memcpy(&cmd->fs, &nfilter->fs, sizeof(nfilter->fs)); - cmd->data = NUM_NET_FILTERS; + cmd->data = intf->parent->num_net_filters; return 0; } @@ -319,7 +308,7 @@ static int bcmasp_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, break; case ETHTOOL_GRXCLSRLALL: err = bcmasp_netfilt_get_all_active(intf, rule_locs, &cmd->rule_cnt); - cmd->data = NUM_NET_FILTERS; + cmd->data = intf->parent->num_net_filters; break; default: err = -EOPNOTSUPP; diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 01de05ec3ebc6..0d61b8580d72b 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -180,14 +180,14 @@ static struct sk_buff *bcmasp_csum_offload(struct net_device *dev, case htons(ETH_P_IP): header |= PKT_OFFLOAD_HDR_SIZE_2((ip_hdrlen(skb) >> 8) & 0xf); header2 |= PKT_OFFLOAD_HDR2_SIZE_2(ip_hdrlen(skb) & 0xff); - epkt |= PKT_OFFLOAD_EPKT_IP(0) | PKT_OFFLOAD_EPKT_CSUM_L2; + epkt |= PKT_OFFLOAD_EPKT_IP(0); ip_proto = ip_hdr(skb)->protocol; header_cnt += 2; break; case htons(ETH_P_IPV6): header |= PKT_OFFLOAD_HDR_SIZE_2((IP6_HLEN >> 8) & 0xf); header2 |= PKT_OFFLOAD_HDR2_SIZE_2(IP6_HLEN & 0xff); - epkt |= PKT_OFFLOAD_EPKT_IP(1) | PKT_OFFLOAD_EPKT_CSUM_L2; + epkt |= PKT_OFFLOAD_EPKT_IP(1); ip_proto = ipv6_hdr(skb)->nexthdr; header_cnt += 2; break; @@ -198,12 +198,12 @@ static struct sk_buff *bcmasp_csum_offload(struct net_device *dev, switch (ip_proto) { case IPPROTO_TCP: header2 |= PKT_OFFLOAD_HDR2_SIZE_3(tcp_hdrlen(skb)); - epkt |= PKT_OFFLOAD_EPKT_TP(0) | PKT_OFFLOAD_EPKT_CSUM_L3; + epkt |= PKT_OFFLOAD_EPKT_TP(0) | PKT_OFFLOAD_EPKT_CSUM_L4; header_cnt++; break; case IPPROTO_UDP: header2 |= PKT_OFFLOAD_HDR2_SIZE_3(UDP_HLEN); - epkt |= PKT_OFFLOAD_EPKT_TP(1) | PKT_OFFLOAD_EPKT_CSUM_L3; + epkt |= PKT_OFFLOAD_EPKT_TP(1) | PKT_OFFLOAD_EPKT_CSUM_L4; header_cnt++; break; default: @@ -818,9 +818,7 @@ static void bcmasp_init_tx(struct bcmasp_intf *intf) /* Tx SPB */ tx_spb_ctrl_wl(intf, ((intf->channel + 8) << 
TX_SPB_CTRL_XF_BID_SHIFT), TX_SPB_CTRL_XF_CTRL2); - tx_pause_ctrl_wl(intf, (1 << (intf->channel + 8)), TX_PAUSE_MAP_VECTOR); tx_spb_top_wl(intf, 0x1e, TX_SPB_TOP_BLKOUT); - tx_spb_top_wl(intf, 0x0, TX_SPB_TOP_SPRE_BW_CTRL); tx_spb_dma_wq(intf, intf->tx_spb_dma_addr, TX_SPB_DMA_READ); tx_spb_dma_wq(intf, intf->tx_spb_dma_addr, TX_SPB_DMA_BASE); @@ -1185,7 +1183,7 @@ static void bcmasp_map_res(struct bcmasp_priv *priv, struct bcmasp_intf *intf) { /* Per port */ intf->res.umac = priv->base + UMC_OFFSET(intf); - intf->res.umac2fb = priv->base + (UMAC2FB_OFFSET + + intf->res.umac2fb = priv->base + (UMAC2FB_OFFSET + priv->rx_ctrl_offset + (intf->port * 0x4)); intf->res.rgmii = priv->base + RGMII_OFFSET(intf); @@ -1200,7 +1198,6 @@ static void bcmasp_map_res(struct bcmasp_priv *priv, struct bcmasp_intf *intf) intf->rx_edpkt_cfg = priv->base + RX_EDPKT_CFG_OFFSET(intf); } -#define MAX_IRQ_STR_LEN 64 struct bcmasp_intf *bcmasp_interface_create(struct bcmasp_priv *priv, struct device_node *ndev_dn, int i) { From 538cb5573ae79d90e48f5f195e87e5bb73519825 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Tue, 22 Apr 2025 16:36:45 -0700 Subject: [PATCH 468/770] net: phy: mdio-bcm-unimac: Add asp-v3.0 Add mdio compat string for asp-v3.0 ethernet driver. Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250422233645.1931036-9-justin.chen@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-bcm-unimac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/mdio/mdio-bcm-unimac.c b/drivers/net/mdio/mdio-bcm-unimac.c index e2800b2e0288e..b6e30bdf53257 100644 --- a/drivers/net/mdio/mdio-bcm-unimac.c +++ b/drivers/net/mdio/mdio-bcm-unimac.c @@ -334,6 +334,7 @@ static SIMPLE_DEV_PM_OPS(unimac_mdio_pm_ops, NULL, unimac_mdio_resume); static const struct of_device_id unimac_mdio_ids[] = { + { .compatible = "brcm,asp-v3.0-mdio", }, { .compatible = "brcm,asp-v2.2-mdio", }, { .compatible = "brcm,asp-v2.1-mdio", }, { .compatible = "brcm,bcm6846-mdio", }, From 39144062ea335495d659b08c9e3133ab746a0b1b Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 23 Apr 2025 00:51:47 +0100 Subject: [PATCH 469/770] rxrpc: Remove deadcode Remove three functions that are no longer used. rxrpc_get_txbuf() last use was removed by 2020's commit 5e6ef4f1017c ("rxrpc: Make the I/O thread take over the call and local processor work") rxrpc_kernel_get_epoch() last use was removed by 2020's commit 44746355ccb1 ("afs: Don't get epoch from a server because it may be ambiguous") rxrpc_kernel_set_max_life() last use was removed by 2023's commit db099c625b13 ("rxrpc: Fix timeout of a call that hasn't yet been granted a channel") Both of the rxrpc_kernel_* functions were documented. Remove that documentation as well as the code. Signed-off-by: Dr. David Alan Gilbert Acked-by: David Howells Link: https://patch.msgid.link/20250422235147.146460-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- Documentation/networking/rxrpc.rst | 24 ----------------- include/net/af_rxrpc.h | 3 --- net/rxrpc/af_rxrpc.c | 41 ------------------------------ net/rxrpc/ar-internal.h | 1 - net/rxrpc/txbuf.c | 8 ------ 5 files changed, 77 deletions(-) diff --git a/Documentation/networking/rxrpc.rst b/Documentation/networking/rxrpc.rst index fe2ea73be4417..d63e3e27dd06b 100644 --- a/Documentation/networking/rxrpc.rst +++ b/Documentation/networking/rxrpc.rst @@ -1062,30 +1062,6 @@ The kernel interface functions are as follows: first function to change. 
Note that this must be called in TASK_RUNNING state. - (#) Get remote client epoch:: - - u32 rxrpc_kernel_get_epoch(struct socket *sock, - struct rxrpc_call *call) - - This allows the epoch that's contained in packets of an incoming client - call to be queried. This value is returned. The function always - successful if the call is still in progress. It shouldn't be called once - the call has expired. Note that calling this on a local client call only - returns the local epoch. - - This value can be used to determine if the remote client has been - restarted as it shouldn't change otherwise. - - (#) Set the maximum lifespan on a call:: - - void rxrpc_kernel_set_max_life(struct socket *sock, - struct rxrpc_call *call, - unsigned long hard_timeout) - - This sets the maximum lifespan on a call to hard_timeout (which is in - jiffies). In the event of the timeout occurring, the call will be - aborted and -ETIME or -ETIMEDOUT will be returned. - (#) Apply the RXRPC_MIN_SECURITY_LEVEL sockopt to a socket from within in the kernel:: diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index f15341594cc8f..0fb4c41c9bbf5 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -88,9 +88,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3a558c1a541e1..36df0274d7b74 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -460,22 +460,6 @@ bool rxrpc_kernel_check_life(const struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_check_life); -/** - * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. - * @sock: The socket the call is on - * @call: The call to query - * - * Allow a kernel service to retrieve the epoch value from a service call to - * see if the client at the other end rebooted. - * - * Return: The epoch of the call's connection. - */ -u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) -{ - return call->conn->proto.epoch; -} -EXPORT_SYMBOL(rxrpc_kernel_get_epoch); - /** * rxrpc_kernel_set_notifications - Set table of callback operations * @sock: The socket to install table upon @@ -492,31 +476,6 @@ void rxrpc_kernel_set_notifications(struct socket *sock, } EXPORT_SYMBOL(rxrpc_kernel_set_notifications); -/** - * rxrpc_kernel_set_max_life - Set maximum lifespan on a call - * @sock: The socket the call is on - * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in ms - * - * Set the maximum lifespan of a call. The call will end with ETIME or - * ETIMEDOUT if it takes longer than this. 
- */ -void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, - unsigned long hard_timeout) -{ - ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; - - mutex_lock(&call->user_mutex); - - expect_term_by = ktime_add(ktime_get_real(), delay); - WRITE_ONCE(call->expect_term_by, expect_term_by); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); - rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); - - mutex_unlock(&call->user_mutex); -} -EXPORT_SYMBOL(rxrpc_kernel_set_max_life); - /* * connect an RxRPC socket * - this just targets it at a specific destination; no actual connection diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ca62a1db32866..5bd3922c310d6 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1503,7 +1503,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c550991d48faa..29767038691ad 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -60,14 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) -{ - int r; - - __refcount_inc(&txb->ref, &r); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); -} - void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); From ffb0c5c4cf666380b8786e87c954967cbad22ad8 Mon Sep 17 00:00:00 2001 From: Bo-Cun Chen Date: Wed, 23 Apr 2025 01:48:02 +0100 Subject: [PATCH 470/770] net: ethernet: mtk_eth_soc: convert cap_bit in mtk_eth_muxc struct to u64 With commit 51a4df60db5c2 ("net: ethernet: mtk_eth_soc: convert caps in mtk_soc_data struct to u64") the capabilities bitfield was converted to a 64-bit value, but the cap_bit field in struct mtk_eth_muxc, which is used to store a full bitfield (rather than the bit number, as the name would suggest), still holds only a 32-bit value. Change the type of cap_bit to u64 in order to avoid truncating the bitfield, which would break path selection for capabilities above the 32-bit limit. The values currently stored in the cap_bit field are MTK_ETH_MUX_GDM1_TO_GMAC1_ESW: BIT_ULL(18) | BIT_ULL(5) MTK_ETH_MUX_GMAC2_GMAC0_TO_GEPHY: BIT_ULL(19) | BIT_ULL(5) | BIT_ULL(6) MTK_ETH_MUX_U3_GMAC2_TO_QPHY: BIT_ULL(20) | BIT_ULL(5) | BIT_ULL(6) MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII: BIT_ULL(20) | BIT_ULL(5) | BIT_ULL(7) MTK_ETH_MUX_GMAC12_TO_GEPHY_SGMII: BIT_ULL(21) | BIT_ULL(5) While all those values are currently still within 32-bit boundaries, the addition of new capabilities of MT7988 as well as future SoCs like MT7987 will exceed them. Also, the use of a 32-bit 'int' type to store the result of a BIT_ULL(...) is misleading.
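The truncation hazard can be demonstrated standalone (plain C, not driver code): assigning a value with bits above bit 31 to a 32-bit field silently drops the high bits, so a subsequent capability test can never match.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t caps = 1ULL << 35;	 /* e.g. a capability above bit 31 */
	uint32_t cap_bit32 = 1ULL << 35; /* high bits silently dropped: 0 */
	uint64_t cap_bit64 = 1ULL << 35; /* bit 35 preserved */

	/* The 32-bit test can never match a capability above bit 31. */
	printf("32-bit match: %d\n", (caps & cap_bit32) != 0); /* prints 0 */
	printf("64-bit match: %d\n", (caps & cap_bit64) != 0); /* prints 1 */
	return 0;
}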
Signed-off-by: Bo-Cun Chen
Signed-off-by: Daniel Golle
Reviewed-by: Simon Horman
Link: https://patch.msgid.link/ded98b0d716c3203017a7a92151516ec2bf1abee.1745369249.git.daniel@makrotopia.org
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mediatek/mtk_eth_path.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_path.c b/drivers/net/ethernet/mediatek/mtk_eth_path.c
index 7c27a19c4d8f4..6fbfb16438a51 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_path.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_path.c
@@ -14,7 +14,7 @@

 struct mtk_eth_muxc {
 	const char	*name;
-	int		cap_bit;
+	u64		cap_bit;
 	int		(*set_path)(struct mtk_eth *eth, u64 path);
 };

From a1356ac7749cafc4e27aa62c0c4604b5dca4983e Mon Sep 17 00:00:00 2001
From: "e.kubanski"
Date: Wed, 16 Apr 2025 12:19:08 +0200
Subject: [PATCH 471/770] xsk: Fix race condition in AF_XDP generic RX path

Move rx_lock from xsk_socket to xsk_buff_pool.
Fix synchronization for shared umem mode in the generic RX path, where
multiple sockets share a single xsk_buff_pool.

The RX queue is exclusive to the xsk_socket, while the FILL queue can
be shared between multiple sockets. This could result in a race
condition where two CPU cores access the RX path of two different
sockets sharing the same umem.

Protect both queues by acquiring a spinlock in the shared
xsk_buff_pool. Lock contention may be minimized in the future by some
per-thread FQ buffering.

It's safe and necessary to move spin_lock_bh(rx_lock) after
xsk_rcv_check():
* xs->pool and spinlock_init are synchronized by
  xsk_bind() -> xsk_is_bound() memory barriers.
* xsk_rcv_check() may return true at the moment of xsk_release() or
  xsk_unbind_dev(), however this will not cause any data races or race
  conditions. xsk_unbind_dev() removes the xdp socket from all maps and
  waits for the completion of all outstanding rx operations. Packets in
  the RX path will either complete safely or be dropped.

Signed-off-by: Eryk Kubanski
Fixes: bf0bdd1343efb ("xdp: fix race on generic receive path")
Acked-by: Magnus Karlsson
Link: https://patch.msgid.link/20250416101908.10919-1-e.kubanski@partner.samsung.com
Signed-off-by: Jakub Kicinski
---
 include/net/xdp_sock.h      | 3 ---
 include/net/xsk_buff_pool.h | 2 ++
 net/xdp/xsk.c               | 6 +++---
 net/xdp/xsk_buff_pool.c     | 1 +
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index a58ae7589d121..e8bd6ddb7b127 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -71,9 +71,6 @@ struct xdp_sock {
 	 */
 	u32 tx_budget_spent;

-	/* Protects generic receive. */
-	spinlock_t rx_lock;
-
 	/* Statistics */
 	u64 rx_dropped;
 	u64 rx_queue_full;

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 1dcd4d71468a5..3b243ea70e38a 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -53,6 +53,8 @@ struct xsk_buff_pool {
 	refcount_t users;
 	struct xdp_umem *umem;
 	struct work_struct work;
+	/* Protects generic receive in shared and non-shared umem mode. */
+	spinlock_t rx_lock;
 	struct list_head free_list;
 	struct list_head xskb_list;
 	u32 heads_cnt;

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 5696af45bcf71..4abc81f33d3eb 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -338,13 +338,14 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	u32 len = xdp_get_buff_len(xdp);
 	int err;

-	spin_lock_bh(&xs->rx_lock);
 	err = xsk_rcv_check(xs, xdp, len);
 	if (!err) {
+		spin_lock_bh(&xs->pool->rx_lock);
 		err = __xsk_rcv(xs, xdp, len);
 		xsk_flush(xs);
+		spin_unlock_bh(&xs->pool->rx_lock);
 	}
-	spin_unlock_bh(&xs->rx_lock);
+
 	return err;
 }

@@ -1734,7 +1735,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
 	xs = xdp_sk(sk);
 	xs->state = XSK_READY;
 	mutex_init(&xs->mutex);
-	spin_lock_init(&xs->rx_lock);

 	INIT_LIST_HEAD(&xs->map_list);
 	spin_lock_init(&xs->map_list_lock);

diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 25a76c5ce0f12..c5181a9044add 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -89,6 +89,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	pool->addrs = umem->addrs;
 	pool->tx_metadata_len = umem->tx_metadata_len;
 	pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM;
+	spin_lock_init(&pool->rx_lock);
 	INIT_LIST_HEAD(&pool->free_list);
 	INIT_LIST_HEAD(&pool->xskb_list);
 	INIT_LIST_HEAD(&pool->xsk_tx_list);

From bf20af07909925ec0ae6cd4f3b7be0279dfa8768 Mon Sep 17 00:00:00 2001
From: "e.kubanski"
Date: Wed, 16 Apr 2025 13:29:25 +0200
Subject: [PATCH 472/770] xsk: Fix offset calculation in unaligned mode

Bring back the previous offset calculation behaviour in AF_XDP
unaligned umem mode.

In unaligned mode, the upper 16 bits should contain the data offset and
the lower 48 bits should contain only the specific chunk location
without the offset. Remove the duplication of pool->headroom in the
48-bit address.

Signed-off-by: Eryk Kubanski
Fixes: bea14124bacb ("xsk: Get rid of xdp_buff_xsk::orig_addr")
Acked-by: Magnus Karlsson
Link: https://patch.msgid.link/20250416112925.7501-1-e.kubanski@partner.samsung.com
Signed-off-by: Jakub Kicinski
---
 include/net/xsk_buff_pool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 3b243ea70e38a..cac56e6b0869b 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -240,8 +240,8 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb,
 		return orig_addr;

 	offset = xskb->xdp.data - xskb->xdp.data_hard_start;
-	orig_addr -= offset;
 	offset += pool->headroom;
+	orig_addr -= offset;
 	return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
 }

From 3a4236c379543878e957004d3415152d28955481 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 23 Apr 2025 11:25:45 +0300
Subject: [PATCH 473/770] rxrpc: rxgk: Fix some reference count leaks

These paths should call rxgk_put(gk) but they don't. In the
rxgk_construct_response() function, the "goto error;" will free the
"response" skb as well as calling rxgk_put(), so that's a bonus.
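The fix follows the usual kernel pattern of funnelling error paths
through a label that drops the reference on every exit. A minimal
standalone sketch of that pattern (struct res, res_put() and do_work()
are invented for illustration, not rxgk code):

    #include <stdlib.h>

    struct res { int refs; };

    static void res_put(struct res *r)
    {
    	if (--r->refs == 0)
    		free(r);
    }

    static int do_work(struct res *r)	/* r comes with a reference */
    {
    	int ret = -1;
    	char *buf = malloc(64);

    	if (!buf)
    		goto put_res;		/* reference dropped on every exit */

    	/* ... use r and buf ... */
    	ret = 0;
    	free(buf);
    put_res:
    	res_put(r);
    	return ret;
    }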
Fixes: 9d1d2b59341f ("rxrpc: rxgk: Implement the yfs-rxgk security class (GSSAPI)")
Signed-off-by: Dan Carpenter
Acked-by: David Howells
Link: https://patch.msgid.link/aAikCbsnnzYtVmIA@stanley.mountain
Signed-off-by: Jakub Kicinski
---
 net/rxrpc/rxgk.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c
index ba8bc201b8d38..1e19c605bcc82 100644
--- a/net/rxrpc/rxgk.c
+++ b/net/rxrpc/rxgk.c
@@ -440,8 +440,10 @@ static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 		return PTR_ERR(gk) == -ESTALE ? -EKEYREJECTED : PTR_ERR(gk);

 	ret = key_validate(call->conn->key);
-	if (ret < 0)
+	if (ret < 0) {
+		rxgk_put(gk);
 		return ret;
+	}

 	call->security_enctype = gk->krb5->etype;
 	txb->cksum = htons(gk->key_number);
@@ -483,7 +485,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,

 	hdr = kzalloc(sizeof(*hdr), GFP_NOFS);
 	if (!hdr)
-		return -ENOMEM;
+		goto put_gk;

 	hdr->epoch	= htonl(call->conn->proto.epoch);
 	hdr->cid	= htonl(call->cid);
@@ -505,6 +507,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call,
 		sp->len = len;
 	}

+put_gk:
 	rxgk_put(gk);
 	_leave(" = %d", ret);
 	return ret;
@@ -594,6 +597,7 @@ static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb)
 	call->security_enctype = gk->krb5->etype;
 	switch (call->conn->security_level) {
 	case RXRPC_SECURITY_PLAIN:
+		rxgk_put(gk);
 		return 0;
 	case RXRPC_SECURITY_AUTH:
 		return rxgk_verify_packet_integrity(call, gk, skb);
@@ -969,7 +973,7 @@ static int rxgk_construct_response(struct rxrpc_connection *conn,

 	ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len);
 	if (ret < 0)
-		return ret;
+		goto error;

 	len = authx_offset + authx_len + ret;
 	if (len != response->len) {

From eacc77a73275895eca0e3655dc6c671853500e2e Mon Sep 17 00:00:00 2001
From: Vlad Dogaru
Date: Wed, 23 Apr 2025 11:36:07 +0300
Subject: [PATCH 474/770] net/mlx5e: Use custom tunnel header for vxlan gbp

Symbolic (e.g. "vxlan") and custom (e.g. "tunnel_header_0") tunnels
cannot be combined, but the match params interface does not have fields
for matching on vxlan gbp. To match vxlan gbp, the tc_tun layer uses
tunnel_header_0.

Allow matching on both VNI and GBP by matching the VNI with a custom
tunnel header instead of the symbolic field name. Matching solely on
the VNI continues to use the symbolic field name.
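For background on the shift used in the hunk below (an illustration,
not part of the commit): in the VXLAN header the VNI occupies the upper
24 bits of the second 32-bit word, above 8 reserved bits, so the VNI
must be shifted left by 8 to line up with tunnel_header_1:

    #include <stdint.h>
    #include <stdio.h>

    /* second 32-bit word of the VXLAN header:
     * VNI (24 bits) followed by 8 reserved bits
     */
    static uint32_t vni_to_tunnel_header_1(uint32_t vni)
    {
    	return vni << 8;
    }

    int main(void)
    {
    	printf("%#x\n", vni_to_tunnel_header_1(0x123456)); /* 0x12345600 */
    	return 0;
    }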
Fixes: 74a778b4a63f ("net/mlx5: HWS, added definers handling") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250423083611.324567-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en/tc_tun_vxlan.c | 32 +++++++++++++++++-- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c index 5c762a71818db..7a18a469961db 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c @@ -165,9 +165,6 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, struct flow_match_enc_keyid enc_keyid; void *misc_c, *misc_v; - misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters); - misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters); - if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ENC_KEYID)) return 0; @@ -182,6 +179,30 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, err = mlx5e_tc_tun_parse_vxlan_gbp_option(priv, spec, f); if (err) return err; + + /* We can't mix custom tunnel headers with symbolic ones and we + * don't have a symbolic field name for GBP, so we use custom + * tunnel headers in this case. We need hardware support to + * match on custom tunnel headers, but we already know it's + * supported because the previous call successfully checked for + * that. + */ + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_5); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_5); + + /* Shift by 8 to account for the reserved bits in the vxlan + * header after the VNI. + */ + MLX5_SET(fte_match_set_misc5, misc_c, tunnel_header_1, + be32_to_cpu(enc_keyid.mask->keyid) << 8); + MLX5_SET(fte_match_set_misc5, misc_v, tunnel_header_1, + be32_to_cpu(enc_keyid.key->keyid) << 8); + + spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_5; + + return 0; } /* match on VNI is required */ @@ -195,6 +216,11 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv, return -EOPNOTSUPP; } + misc_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters); + misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + MLX5_SET(fte_match_set_misc, misc_c, vxlan_vni, be32_to_cpu(enc_keyid.mask->keyid)); MLX5_SET(fte_match_set_misc, misc_v, vxlan_vni, From 5d1a04f347e6cbf5ffe74da409a5d71fbe8c5f19 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 23 Apr 2025 11:36:08 +0300 Subject: [PATCH 475/770] net/mlx5: E-Switch, Initialize MAC Address for Default GID Initialize the source MAC address when creating the default GID entry. Since this entry is used only for loopback traffic, it only needs to be a unicast address. A zeroed-out MAC address is sufficient for this purpose. Without this fix, random bits would be assigned as the source address. If these bits formed a multicast address, the firmware would return an error, preventing the user from switching to switchdev mode: Error: mlx5_core: Failed setting eswitch to offloads. 
kernel answers: Invalid argument

Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic")
Signed-off-by: Maor Gottlieb
Signed-off-by: Mark Bloch
Reviewed-by: Michal Swiatkowski
Link: https://patch.msgid.link/20250423083611.324567-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
index a42f6cd99b744..f585ef5a34243 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c
@@ -118,8 +118,8 @@ static void mlx5_rdma_make_default_gid(struct mlx5_core_dev *dev, union ib_gid *

 static int mlx5_rdma_add_roce_addr(struct mlx5_core_dev *dev)
 {
+	u8 mac[ETH_ALEN] = {};
 	union ib_gid gid;
-	u8 mac[ETH_ALEN];

 	mlx5_rdma_make_default_gid(dev, &gid);
 	return mlx5_core_roce_gid_set(dev, 0,

From 172c034264c894518c012387f2de2f9d6443505d Mon Sep 17 00:00:00 2001
From: Jianbo Liu
Date: Wed, 23 Apr 2025 11:36:09 +0300
Subject: [PATCH 476/770] net/mlx5e: TC, Continue the attr process even if
 encap entry is invalid

Previously, the offload of a rule with header rewrite and mirror to
both internal and external destinations was skipped if the encap entry
was not valid. But it shouldn't be, because the driver will try to
offload the rule again once the neighbor is updated and the encap entry
becomes valid, to replace the old FTE added for the slow path. As the
process was skipped, the extra split attr doesn't exist at that time,
and the driver then fails to offload the rule.

To fix this issue, remove the check and continue the attr process even
if the encap entry is invalid.

Fixes: b11bde56246e ("net/mlx5e: TC, Offload rewrite and mirror to both internal and external dests")
Signed-off-by: Jianbo Liu
Reviewed-by: Cosmin Ratiu
Signed-off-by: Mark Bloch
Link: https://patch.msgid.link/20250423083611.324567-4-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 9ba99609999f4..f1d908f611349 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1750,9 +1750,6 @@ extra_split_attr_dests_needed(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr
 	    !list_is_first(&attr->list, &flow->attrs))
 		return 0;

-	if (flow_flag_test(flow, SLOW))
-		return 0;
-
 	esw_attr = attr->esw_attr;
 	if (!esw_attr->split_count ||
 	    esw_attr->split_count == esw_attr->out_count - 1)
@@ -1766,7 +1763,7 @@ extra_split_attr_dests_needed(struct mlx5e_tc_flow *flow, struct mlx5_flow_attr
 	for (i = esw_attr->split_count; i < esw_attr->out_count; i++) {
 		/* external dest with encap is considered as internal by firmware */
 		if (esw_attr->dests[i].vport == MLX5_VPORT_UPLINK &&
-		    !(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP_VALID))
+		    !(esw_attr->dests[i].flags & MLX5_ESW_DEST_ENCAP))
 			ext_dest = true;
 		else
 			int_dest  = true;

From 1c2940ec0ddf51c689ee9ab85ead85c11b77809d Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu
Date: Wed, 23 Apr 2025 11:36:10 +0300
Subject: [PATCH 477/770] net/mlx5e: Fix lock order in
 mlx5e_tx_reporter_ptpsq_unhealthy_recover

RTNL needs to be acquired before state_lock.
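As background (an illustrative pthread analogy, not kernel code): two
locks are deadlock-free only if every path acquires them in the same
order, which is why rtnl_lock() is hoisted above state_lock in the
hunk below:

    #include <pthread.h>

    static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* "rtnl" */
    static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* "state_lock" */

    static void recover(void)
    {
    	/* always outer first; another path taking inner then outer
    	 * could deadlock against this one
    	 */
    	pthread_mutex_lock(&outer);
    	pthread_mutex_lock(&inner);
    	/* ... close and reopen the channel ... */
    	pthread_mutex_unlock(&inner);
    	pthread_mutex_unlock(&outer);
    }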
Fixes: fdce06bda7e5 ("net/mlx5e: Acquire RTNL lock before RQs/SQs activation/deactivation") Signed-off-by: Cosmin Ratiu Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250423083611.324567-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 532c7fa94d172..dbd9482359e1e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -176,6 +176,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) priv = ptpsq->txqsq.priv; + rtnl_lock(); mutex_lock(&priv->state_lock); chs = &priv->channels; netdev = priv->netdev; @@ -183,22 +184,19 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); - rtnl_lock(); mlx5e_deactivate_priv_channels(priv); - rtnl_unlock(); mlx5e_ptp_close(chs->ptp); err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); - rtnl_lock(); mlx5e_activate_priv_channels(priv); - rtnl_unlock(); /* return carrier back if needed */ if (carrier_ok) netif_carrier_on(netdev); mutex_unlock(&priv->state_lock); + rtnl_unlock(); return err; } From 90538d23278a981e344d364e923162fce752afeb Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Wed, 23 Apr 2025 11:36:11 +0300 Subject: [PATCH 478/770] net/mlx5: E-switch, Fix error handling for enabling roce The cited commit assumes enabling roce always succeeds. But it is not true. Add error handling for it. Fixes: 80f09dfc237f ("net/mlx5: Eswitch, enable RoCE loopback traffic") Signed-off-by: Chris Mi Reviewed-by: Roi Dayan Reviewed-by: Maor Gottlieb Signed-off-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250423083611.324567-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 ++++- drivers/net/ethernet/mellanox/mlx5/core/rdma.c | 9 +++++---- drivers/net/ethernet/mellanox/mlx5/core/rdma.h | 4 ++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index a6a8eea5980ca..0e3a977d53329 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3533,7 +3533,9 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) int err; mutex_init(&esw->offloads.termtbl_mutex); - mlx5_rdma_enable_roce(esw->dev); + err = mlx5_rdma_enable_roce(esw->dev); + if (err) + goto err_roce; err = mlx5_esw_host_number_init(esw); if (err) @@ -3594,6 +3596,7 @@ int esw_offloads_enable(struct mlx5_eswitch *esw) esw_offloads_metadata_uninit(esw); err_metadata: mlx5_rdma_disable_roce(esw->dev); +err_roce: mutex_destroy(&esw->offloads.termtbl_mutex); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c index f585ef5a34243..5c552b71e371c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.c @@ -140,17 +140,17 @@ void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) mlx5_nic_vport_disable_roce(dev); } -void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) +int mlx5_rdma_enable_roce(struct 
mlx5_core_dev *dev) { int err; if (!MLX5_CAP_GEN(dev, roce)) - return; + return 0; err = mlx5_nic_vport_enable_roce(dev); if (err) { mlx5_core_err(dev, "Failed to enable RoCE: %d\n", err); - return; + return err; } err = mlx5_rdma_add_roce_addr(dev); @@ -165,10 +165,11 @@ void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) goto del_roce_addr; } - return; + return err; del_roce_addr: mlx5_rdma_del_roce_addr(dev); disable_roce: mlx5_nic_vport_disable_roce(dev); + return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/rdma.h b/drivers/net/ethernet/mellanox/mlx5/core/rdma.h index 750cff2a71a4b..3d9e76c3d42fb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/rdma.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/rdma.h @@ -8,12 +8,12 @@ #ifdef CONFIG_MLX5_ESWITCH -void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev); +int mlx5_rdma_enable_roce(struct mlx5_core_dev *dev); void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev); #else /* CONFIG_MLX5_ESWITCH */ -static inline void mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) {} +static inline int mlx5_rdma_enable_roce(struct mlx5_core_dev *dev) { return 0; } static inline void mlx5_rdma_disable_roce(struct mlx5_core_dev *dev) {} #endif /* CONFIG_MLX5_ESWITCH */ From 4134bb726efdc7a4a64bd0f776359cb0564564b2 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 23 Apr 2025 12:37:19 +0100 Subject: [PATCH 479/770] net: ip_gre: Fix spelling mistake "demultiplexor" -> "demultiplexer" There is a spelling mistake in a pr_info message. Fix it. Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250423113719.173539-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/gre_demux.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 6701a98d9a9ff..dafd68f3436a2 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -199,7 +199,7 @@ static const struct net_protocol net_gre_protocol = { static int __init gre_init(void) { - pr_info("GRE over IPv4 demultiplexor driver\n"); + pr_info("GRE over IPv4 demultiplexer driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); From bc2550b4e195754fbb24aac1f012d3dd9e3b4edc Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:33 +0100 Subject: [PATCH 480/770] tcp: fastopen: note that a child socket was created tcp: fastopen: note that a child socket was created This uses up the last bit in a field of tcp_sock. Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-2-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_fastopen.c | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1669d95bb0f9a..a8af71623ba7c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -385,7 +385,8 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. 
option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ + syn_fastopen_child:1; /* created TFO passive child socket */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e0e96f8fd47cb..df3eb0fb7cb3d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; + tp->syn_fastopen_child = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 1a6b1bc542451..9b83d639b5acd 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -401,6 +401,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, } NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE); + tcp_sk(child)->syn_fastopen_child = 1; return child; } NET_INC_STATS(sock_net(sk), From 2b13042d3636327eb50c8a0ee06f629d52d1b8fb Mon Sep 17 00:00:00 2001 From: Jeremy Harris Date: Wed, 23 Apr 2025 13:43:34 +0100 Subject: [PATCH 481/770] tcp: fastopen: pass TFO child indication through getsockopt tcp: fastopen: pass TFO child indication through getsockopt Note that this uses up the last bit of a field in struct tcp_info Signed-off-by: Jeremy Harris Reviewed-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20250423124334.4916-3-jgh@exim.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b0..bdac8c42fa82a 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ +#define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index df3eb0fb7cb3d..86c427f166367 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4165,6 +4165,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; + if (tp->syn_fastopen_child) + info->tcpi_options |= TCPI_OPT_TFO_CHILD; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, From d57ee99831e336576359beb26e2b140511c99106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 23 Apr 2025 17:08:08 +0200 Subject: [PATCH 482/770] net: ethernet: mtk_wed: annotate RCU release in attach() There are some sparse warnings in wifi, and it seems that it's actually possible to annotate a function pointer with __releases(), making the sparse warnings go away. In a way that also serves as documentation that rcu_read_unlock() must be called in the attach method, so add that annotation. 
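For reference, a minimal sketch of how the annotation works (assuming
the usual definitions from the kernel's compiler headers; the ops
struct here is invented): under sparse (__CHECKER__), __releases()
expands to a context attribute telling the checker the function returns
with the named lock context decreased by one:

    #ifdef __CHECKER__
    # define __releases(x)	__attribute__((context(x, 1, 0)))
    #else
    # define __releases(x)
    #endif

    struct example_ops {
    	/* called with the RCU read lock held; must drop it */
    	int (*attach)(void *dev) __releases(RCU);
    };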
Signed-off-by: Johannes Berg Link: https://patch.msgid.link/20250423150811.456205-2-johannes@sipsolutions.net Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index a476648858a66..d8949a4ed0dc9 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -192,7 +192,7 @@ struct mtk_wed_device { }; struct mtk_wed_ops { - int (*attach)(struct mtk_wed_device *dev); + int (*attach)(struct mtk_wed_device *dev) __releases(RCU); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, From f74d14a7dfb10ba908ce799d2176817d097713c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 23 Apr 2025 15:02:31 -0700 Subject: [PATCH 483/770] tools: ynl: fix the header guard name for OVPN Thorsten reports that after upgrading system headers from linux-next the YNL build breaks. I typo'ed the header guard, _H is missing. Reported-by: Thorsten Leemhuis Link: https://lore.kernel.org/59ba7a94-17b9-485f-aa6d-14e4f01a7a39@leemhuis.info Fixes: 12b196568a3a ("tools: ynl: add missing header deps") Reviewed-by: Jacob Keller Tested-by: Thorsten Leemhuis Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250423220231.1035931-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/Makefile.deps | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 8b7bf673b686f..a5e6093903fb9 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -27,7 +27,7 @@ CFLAGS_netdev:=$(call get_hdr_inc,_LINUX_NETDEV_H,netdev.h) CFLAGS_nl80211:=$(call get_hdr_inc,__LINUX_NL802121_H,nl80211.h) CFLAGS_nlctrl:=$(call get_hdr_inc,__LINUX_GENERIC_NETLINK_H,genetlink.h) CFLAGS_nfsd:=$(call get_hdr_inc,_LINUX_NFSD_NETLINK_H,nfsd_netlink.h) -CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN,ovpn.h) +CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) From f71c549b26a33fd62f1e9c7deeba738bfc73fbfc Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 24 Apr 2025 00:27:31 +0000 Subject: [PATCH 484/770] netdevsim: Mark NAPI ID on skb in nsim_rcv Previously, nsim_rcv was not marking the NAPI ID on the skb, leading to applications seeing a napi ID of 0 when using SO_INCOMING_NAPI_ID. To add to the userland confusion, netlink appears to correctly report the NAPI IDs for netdevsim queues but the resulting file descriptor from a call to accept() was reporting a NAPI ID of 0. 
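For context, a sketch of how userland observes this (assuming libc/uapi
headers recent enough to define SO_INCOMING_NAPI_ID):

    #include <stdio.h>
    #include <sys/socket.h>

    /* NAPI ID of the queue that delivered traffic for a connected or
     * accepted socket; 0 means the kernel never marked the skbs
     */
    static unsigned int incoming_napi_id(int fd)
    {
    	unsigned int napi_id = 0;
    	socklen_t len = sizeof(napi_id);

    	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID,
    		       &napi_id, &len) < 0)
    		perror("getsockopt(SO_INCOMING_NAPI_ID)");
    	return napi_id;
    }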
Signed-off-by: Joe Damato Link: https://patch.msgid.link/20250424002746.16891-2-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/netdev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index 0e0321a7ddd71..2aa999345fe12 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "netdevsim.h" @@ -357,6 +358,7 @@ static int nsim_rcv(struct nsim_rq *rq, int budget) break; skb = skb_dequeue(&rq->skb_queue); + skb_mark_napi_id(skb, &rq->napi); netif_receive_skb(skb); } From 2b6d490b82668bbd0a9201c27154890f842e985f Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 24 Apr 2025 00:27:32 +0000 Subject: [PATCH 485/770] selftests: drv-net: Factor out ksft C helpers Factor ksft C helpers to a header so they can be used by other C-based tests. Signed-off-by: Joe Damato Link: https://patch.msgid.link/20250424002746.16891-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ksft.h | 56 +++++++++++++++++++ .../selftests/drivers/net/xdp_helper.c | 49 +--------------- 2 files changed, 58 insertions(+), 47 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/ksft.h diff --git a/tools/testing/selftests/drivers/net/ksft.h b/tools/testing/selftests/drivers/net/ksft.h new file mode 100644 index 0000000000000..17dc34a612c64 --- /dev/null +++ b/tools/testing/selftests/drivers/net/ksft.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#if !defined(__NET_KSFT_H__) +#define __NET_KSFT_H__ + +#include +#include +#include + +static inline void ksft_ready(void) +{ + const char msg[7] = "ready\n"; + char *env_str; + int fd; + + env_str = getenv("KSFT_READY_FD"); + if (env_str) { + fd = atoi(env_str); + if (!fd) { + fprintf(stderr, "invalid KSFT_READY_FD = '%s'\n", + env_str); + return; + } + } else { + fd = STDOUT_FILENO; + } + + write(fd, msg, sizeof(msg)); + if (fd != STDOUT_FILENO) + close(fd); +} + +static inline void ksft_wait(void) +{ + char *env_str; + char byte; + int fd; + + env_str = getenv("KSFT_WAIT_FD"); + if (env_str) { + fd = atoi(env_str); + if (!fd) { + fprintf(stderr, "invalid KSFT_WAIT_FD = '%s'\n", + env_str); + return; + } + } else { + /* Not running in KSFT env, wait for input from STDIN instead */ + fd = STDIN_FILENO; + } + + read(fd, &byte, sizeof(byte)); + if (fd != STDIN_FILENO) + close(fd); +} + +#endif diff --git a/tools/testing/selftests/drivers/net/xdp_helper.c b/tools/testing/selftests/drivers/net/xdp_helper.c index aeed259141045..d5bb8ac33efa5 100644 --- a/tools/testing/selftests/drivers/net/xdp_helper.c +++ b/tools/testing/selftests/drivers/net/xdp_helper.c @@ -11,56 +11,11 @@ #include #include +#include "ksft.h" + #define UMEM_SZ (1U << 16) #define NUM_DESC (UMEM_SZ / 2048) -/* Move this to a common header when reused! 
*/ -static void ksft_ready(void) -{ - const char msg[7] = "ready\n"; - char *env_str; - int fd; - - env_str = getenv("KSFT_READY_FD"); - if (env_str) { - fd = atoi(env_str); - if (!fd) { - fprintf(stderr, "invalid KSFT_READY_FD = '%s'\n", - env_str); - return; - } - } else { - fd = STDOUT_FILENO; - } - - write(fd, msg, sizeof(msg)); - if (fd != STDOUT_FILENO) - close(fd); -} - -static void ksft_wait(void) -{ - char *env_str; - char byte; - int fd; - - env_str = getenv("KSFT_WAIT_FD"); - if (env_str) { - fd = atoi(env_str); - if (!fd) { - fprintf(stderr, "invalid KSFT_WAIT_FD = '%s'\n", - env_str); - return; - } - } else { - /* Not running in KSFT env, wait for input from STDIN instead */ - fd = STDIN_FILENO; - } - - read(fd, &byte, sizeof(byte)); - if (fd != STDIN_FILENO) - close(fd); -} /* this is a simple helper program that creates an XDP socket and does the * minimum necessary to get bind() to succeed. From 2593a0a1446ae4bf651b0af1c42a421f511b4839 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Thu, 24 Apr 2025 00:27:33 +0000 Subject: [PATCH 486/770] selftests: drv-net: Test that NAPI ID is non-zero Test that the SO_INCOMING_NAPI_ID of a network file descriptor is non-zero. This ensures that either the core networking stack or, in some cases like netdevsim, the driver correctly sets the NAPI ID. Signed-off-by: Joe Damato Link: https://patch.msgid.link/20250424002746.16891-4-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/.gitignore | 1 + tools/testing/selftests/drivers/net/Makefile | 6 +- .../testing/selftests/drivers/net/napi_id.py | 23 +++++ .../selftests/drivers/net/napi_id_helper.c | 83 +++++++++++++++++++ 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/napi_id.py create mode 100644 tools/testing/selftests/drivers/net/napi_id_helper.c diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore index ec746f374e858..72d2124fd5138 100644 --- a/tools/testing/selftests/drivers/net/.gitignore +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only +napi_id_helper xdp_helper diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 0c95bd944d56d..47247c2ef948b 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -6,9 +6,13 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ ../../net/net_helper.sh \ ../../net/lib.sh \ -TEST_GEN_FILES := xdp_helper +TEST_GEN_FILES := \ + napi_id_helper \ + xdp_helper \ +# end of TEST_GEN_FILES TEST_PROGS := \ + napi_id.py \ netcons_basic.sh \ netcons_fragmented_msg.sh \ netcons_overflow.sh \ diff --git a/tools/testing/selftests/drivers/net/napi_id.py b/tools/testing/selftests/drivers/net/napi_id.py new file mode 100755 index 0000000000000..356bac46ba04a --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_id.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, NetDrvEpEnv +from lib.py import bkg, cmd, rand_port, NetNSEnter + +def test_napi_id(cfg) -> None: + port = rand_port() + listen_cmd = f"{cfg.test_dir}/napi_id_helper {cfg.addr_v['4']} {port}" + + with bkg(listen_cmd, ksft_wait=3) as server: + cmd(f"echo a | socat - TCP:{cfg.addr_v['4']}:{port}", host=cfg.remote, shell=True) + + ksft_eq(0, server.ret) + +def main() -> None: + with 
NetDrvEpEnv(__file__) as cfg: + ksft_run([test_napi_id], args=(cfg,)) + ksft_exit() + +if __name__ == "__main__": + main() diff --git a/tools/testing/selftests/drivers/net/napi_id_helper.c b/tools/testing/selftests/drivers/net/napi_id_helper.c new file mode 100644 index 0000000000000..7e8e7d373b612 --- /dev/null +++ b/tools/testing/selftests/drivers/net/napi_id_helper.c @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include + +#include "ksft.h" + +int main(int argc, char *argv[]) +{ + struct sockaddr_in address; + unsigned int napi_id; + unsigned int port; + socklen_t optlen; + char buf[1024]; + int opt = 1; + int server; + int client; + int ret; + + server = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (server < 0) { + perror("socket creation failed"); + if (errno == EAFNOSUPPORT) + return -1; + return 1; + } + + port = atoi(argv[2]); + + if (setsockopt(server, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) { + perror("setsockopt"); + return 1; + } + + address.sin_family = AF_INET; + inet_pton(AF_INET, argv[1], &address.sin_addr); + address.sin_port = htons(port); + + if (bind(server, (struct sockaddr *)&address, sizeof(address)) < 0) { + perror("bind failed"); + return 1; + } + + if (listen(server, 1) < 0) { + perror("listen"); + return 1; + } + + ksft_ready(); + + client = accept(server, NULL, 0); + if (client < 0) { + perror("accept"); + return 1; + } + + optlen = sizeof(napi_id); + ret = getsockopt(client, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, + &optlen); + if (ret != 0) { + perror("getsockopt"); + return 1; + } + + read(client, buf, 1024); + + ksft_wait(); + + if (napi_id == 0) { + fprintf(stderr, "napi ID is 0\n"); + return 1; + } + + close(client); + close(server); + + return 0; +} From a32f1923c6d6e9e727d00558a15ec0af6639de19 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Apr 2025 22:15:50 +0200 Subject: [PATCH 487/770] crypto: scompress - increment scomp_scratch_users when already allocated Commit ddd0a42671c0 only increments scomp_scratch_users when it was 0, causing a panic when using ipcomp: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 619 Comm: ping Tainted: G N 6.15.0-rc3-net-00032-ga79be02bba5c #41 PREEMPT(full) Tainted: [N]=TEST Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:inflate_fast+0x5a2/0x1b90 [...] Call Trace: zlib_inflate+0x2d60/0x6620 deflate_sdecompress+0x166/0x350 scomp_acomp_comp_decomp+0x45f/0xa10 scomp_acomp_decompress+0x21/0x120 acomp_do_req_chain+0x3e5/0x4e0 ipcomp_input+0x212/0x550 xfrm_input+0x2de2/0x72f0 [...] Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Instead, let's keep the old increment, and decrement back to 0 if the scratch allocation fails. 
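A standalone sketch of the invariant the fix restores (names invented
for illustration, not the crypto API): the use count always matches the
number of users, so it is incremented up front and rolled back if the
first user's allocation fails:

    #include <stdlib.h>

    #define SCRATCH_SZ 65536

    static int scratch_users;	/* the real code holds scomp_lock here */
    static void *scratch;

    static int get_scratch(void)
    {
    	if (!scratch_users++) {		/* first user allocates */
    		scratch = malloc(SCRATCH_SZ);
    		if (!scratch) {
    			scratch_users--;	/* back to 0 on failure */
    			return -1;
    		}
    	}
    	return 0;
    }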
Fixes: ddd0a42671c0 ("crypto: scompress - Fix scratch allocation failure handling")
Signed-off-by: Sabrina Dubroca
Signed-off-by: Herbert Xu
---
 crypto/scompress.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crypto/scompress.c b/crypto/scompress.c
index 36934c78d1277..ffeedcf20b0f1 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -163,11 +163,10 @@ static int crypto_scomp_init_tfm(struct crypto_tfm *tfm)
 		if (ret)
 			goto unlock;
 	}
-	if (!scomp_scratch_users) {
+	if (!scomp_scratch_users++) {
 		ret = crypto_scomp_alloc_scratches();
 		if (ret)
-			goto unlock;
-		scomp_scratch_users++;
+			scomp_scratch_users--;
 	}
 unlock:
 	mutex_unlock(&scomp_lock);

From 9bbb8a07fd65fca0f29a869ec3f2435761a6c676 Mon Sep 17 00:00:00 2001
From: Olaf Hering
Date: Mon, 2 Dec 2024 11:19:55 +0100
Subject: [PATCH 488/770] tools/hv: update route parsing in kvp daemon

After recent changes in the VM network stack, the host fails to display
the IP addresses of the VM. As a result the "IP Addresses" column in
the "Networking" tab in the Windows Hyper-V Manager is empty.

This is caused by a change in the expected output of the
"ip route show" command. Previously the gateway address was shown in
the third column. Now the gateway addresses might be split into several
lines of output. As a result, the string "ra" instead of an IP address
is sent to the host.

To be more specific, a VM with the well-known wicked network management
tool still shows the expected output in recent openSUSE Tumbleweed
snapshots:

ip a show dev uplink;ip -4 route show;ip -6 route show
2: uplink: mtu 1500 qdisc mq state ...
    link/ether 00:15:5d:d0:93:08 brd ff:ff:ff:ff:ff:ff
    inet 1.2.3.4/22 brd 1.2.3.255 scope global uplink
       valid_lft forever preferred_lft forever
    inet6 fe80::215:5dff:fed0:9308/64 scope link proto kernel_ll
       valid_lft forever preferred_lft forever
default via 1.2.3.254 dev uplink proto dhcp
1.2.3.0/22 dev uplink proto kernel scope link src 1.2.3.4
fe80::/64 dev uplink proto kernel metric 256 pref medium
default via fe80::26fc:4e00:3b:74 dev uplink proto ra metric 1024 exp...
default via fe80::6a22:8e00:fb:14f8 dev uplink proto ra metric 1024 e...

A similar VM, but with NetworkManager as the network management tool:

ip a show dev eth0;ip -4 route show;ip -6 route show
2: eth0: mtu 1500 qdisc mq state UP...
    link/ether 00:15:5d:d0:93:0b brd ff:ff:ff:ff:ff:ff
    inet 1.2.3.8/22 brd 1.2.3.255 scope global dynamic noprefixroute ...
       valid_lft 1022sec preferred_lft 1022sec
    inet6 fe80::215:5dff:fed0:930b/64 scope link noprefixroute
       valid_lft forever preferred_lft forever
default via 1.2.3.254 dev eth0 proto dhcp src 1.2.3.8 metric 100
1.2.3.0/22 dev eth0 proto kernel scope link src 1.2.3.8 metric 100
fe80::/64 dev eth0 proto kernel metric 1024 pref medium
default proto ra metric 20100 pref medium
    nexthop via fe80::6a22:8e00:fb:14f8 dev eth0 weight 1
    nexthop via fe80::26fc:4e00:3b:74 dev eth0 weight 1

Adjust the route parsing to process the output one line at a time,
using "ip --oneline" so that each route stays on a single line. Also
use a single shell invocation to retrieve both IPv4 and IPv6
information. The actual IP addresses are expected after the "via"
keyword.
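The core of the new parsing is the validation step, restated below as a
standalone sketch of the same check performed by kvp_verify_ip_address()
in the patch:

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <stdbool.h>

    /* accept a token as a gateway only if inet_pton() can parse it,
     * so stray words like "ra" are rejected
     */
    static bool is_ip_address(const char *s)
    {
    	unsigned char buf[sizeof(struct in6_addr)];

    	return inet_pton(AF_INET, s, buf) == 1 ||
    	       inet_pton(AF_INET6, s, buf) == 1;
    }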
Signed-off-by: Olaf Hering Reviewed-by: Shradha Gupta Link: https://lore.kernel.org/r/20241202102235.9701-1-olaf@aepfle.de Signed-off-by: Wei Liu Message-ID: <20241202102235.9701-1-olaf@aepfle.de> --- tools/hv/hv_kvp_daemon.c | 108 ++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 24 deletions(-) diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c index 04ba035d67e9d..b9ce3aab15fea 100644 --- a/tools/hv/hv_kvp_daemon.c +++ b/tools/hv/hv_kvp_daemon.c @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -677,6 +678,88 @@ static void kvp_process_ipconfig_file(char *cmd, pclose(file); } +static bool kvp_verify_ip_address(const void *address_string) +{ + char verify_buf[sizeof(struct in6_addr)]; + + if (inet_pton(AF_INET, address_string, verify_buf) == 1) + return true; + if (inet_pton(AF_INET6, address_string, verify_buf) == 1) + return true; + return false; +} + +static void kvp_extract_routes(const char *line, void **output, size_t *remaining) +{ + static const char needle[] = "via "; + const char *match, *haystack = line; + + while ((match = strstr(haystack, needle))) { + const char *address, *next_char; + + /* Address starts after needle. */ + address = match + strlen(needle); + + /* The char following address is a space or end of line. */ + next_char = strpbrk(address, " \t\\"); + if (!next_char) + next_char = address + strlen(address) + 1; + + /* Enough room for address and semicolon. */ + if (*remaining >= (next_char - address) + 1) { + memcpy(*output, address, next_char - address); + /* Terminate string for verification. */ + memcpy(*output + (next_char - address), "", 1); + if (kvp_verify_ip_address(*output)) { + /* Advance output buffer. */ + *output += next_char - address; + *remaining -= next_char - address; + + /* Each address needs a trailing semicolon. */ + memcpy(*output, ";", 1); + *output += 1; + *remaining -= 1; + } + } + haystack = next_char; + } +} + +static void kvp_get_gateway(void *buffer, size_t buffer_len) +{ + static const char needle[] = "default "; + FILE *f; + void *output = buffer; + char *line = NULL; + size_t alloc_size = 0, remaining = buffer_len - 1; + ssize_t num_chars; + + /* Show route information in a single line, for each address family */ + f = popen("ip --oneline -4 route show;ip --oneline -6 route show", "r"); + if (!f) { + /* Convert buffer into C-String. */ + memcpy(output, "", 1); + return; + } + while ((num_chars = getline(&line, &alloc_size, f)) > 0) { + /* Skip short lines. */ + if (num_chars <= strlen(needle)) + continue; + /* Skip lines without default route. */ + if (memcmp(line, needle, strlen(needle))) + continue; + /* Remove trailing newline to simplify further parsing. */ + if (line[num_chars - 1] == '\n') + line[num_chars - 1] = '\0'; + /* Search routes after match. */ + kvp_extract_routes(line + strlen(needle), &output, &remaining); + } + /* Convert buffer into C-String. */ + memcpy(output, "", 1); + free(line); + pclose(f); +} + static void kvp_get_ipconfig_info(char *if_name, struct hv_kvp_ipaddr_value *buffer) { @@ -685,30 +768,7 @@ static void kvp_get_ipconfig_info(char *if_name, char *p; FILE *file; - /* - * Get the address of default gateway (ipv4). - */ - sprintf(cmd, "%s %s", "ip route show dev", if_name); - strcat(cmd, " | awk '/default/ {print $3 }'"); - - /* - * Execute the command to gather gateway info. - */ - kvp_process_ipconfig_file(cmd, (char *)buffer->gate_way, - (MAX_GATEWAY_SIZE * 2), INET_ADDRSTRLEN, 0); - - /* - * Get the address of default gateway (ipv6). 
-	 */
-	sprintf(cmd, "%s %s", "ip -f inet6 route show dev", if_name);
-	strcat(cmd, " | awk '/default/ {print $3 }'");
-
-	/*
-	 * Execute the command to gather gateway info (ipv6).
-	 */
-	kvp_process_ipconfig_file(cmd, (char *)buffer->gate_way,
-				  (MAX_GATEWAY_SIZE * 2), INET6_ADDRSTRLEN, 1);
-
+	kvp_get_gateway(buffer->gate_way, sizeof(buffer->gate_way));

 	/*
 	 * Gather the DNS state.

From 6d0417e4e1cf66fd917f06f0454958362714ef7d Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz
Date: Wed, 9 Apr 2025 16:08:48 -0400
Subject: [PATCH 489/770] Bluetooth: hci_conn: Fix not setting conn_timeout
 for Broadcast Receiver

Broadcast Receiver requires creating a PA sync, but the command just
generates a status, so this makes use of __hci_cmd_sync_status_sk to
wait for HCI_EV_LE_PA_SYNC_ESTABLISHED. Also, because of this change it
is no longer necessary to use a custom method to serialize the process
of creating the PA sync, since cmd_work_sync itself ensures only one
command would be pending, which now awaits
HCI_EV_LE_PA_SYNC_ESTABLISHED before proceeding to the next connection.

Fixes: 4a5e0ba68676 ("Bluetooth: ISO: Do not emit LE PA Create Sync if previous is pending")
Signed-off-by: Luiz Augusto von Dentz
---
 include/net/bluetooth/hci.h      |  2 +
 include/net/bluetooth/hci_core.h | 13 +++--
 include/net/bluetooth/hci_sync.h |  2 +
 net/bluetooth/hci_conn.c         | 92 +-------------------------------
 net/bluetooth/hci_event.c        |  6 +--
 net/bluetooth/hci_sync.c         | 87 ++++++++++++++++++++++++++++--
 6 files changed, 95 insertions(+), 107 deletions(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index a8586c3058c7c..8ea7a063cc651 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1931,6 +1931,8 @@ struct hci_cp_le_pa_create_sync {
 	__u8      sync_cte_type;
 } __packed;

+#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL	0x2045
+
 #define HCI_OP_LE_PA_TERM_SYNC		0x2046
 struct hci_cp_le_pa_term_sync {
 	__le16    handle;

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 5115da34f8812..f20368b9a5d20 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1113,10 +1113,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
 	return NULL;
 }

-static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev,
-							__u8 sid,
-							bdaddr_t *dst,
-							__u8 dst_type)
+static inline struct hci_conn *
+hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev)
 {
 	struct hci_conn_hash *h = &hdev->conn_hash;
 	struct hci_conn  *c;
@@ -1124,8 +1122,10 @@ static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev,
 	rcu_read_lock();

 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK || bacmp(&c->dst, dst) ||
-		    c->dst_type != dst_type || c->sid != sid)
+		if (c->type != ISO_LINK)
+			continue;
+
+		if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags))
 			continue;

 		rcu_read_unlock();
@@ -1524,7 +1524,6 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
 void hci_sco_setup(struct hci_conn *conn, __u8 status);
 bool hci_iso_setup_path(struct hci_conn *conn);
 int hci_le_create_cis_pending(struct hci_dev *hdev);
-int hci_pa_create_sync_pending(struct hci_dev *hdev);
 int hci_le_big_create_sync_pending(struct hci_dev *hdev);
 int hci_conn_check_create_cis(struct hci_conn *conn);

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 7e2cf0cca939a..93dac4c7f9e3e 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -185,3 +185,5 @@
int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params); + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7e1b53857648d..c3112ce39f672 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2064,95 +2064,6 @@ static int create_big_sync(struct hci_dev *hdev, void *data) return hci_le_create_big(conn, &conn->iso_qos); } -static void create_pa_complete(struct hci_dev *hdev, void *data, int err) -{ - bt_dev_dbg(hdev, ""); - - if (err) - bt_dev_err(hdev, "Unable to create PA: %d", err); -} - -static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) -{ - if (conn->type != ISO_LINK || conn->sid == HCI_SID_INVALID) - return false; - - return true; -} - -static int create_pa_sync(struct hci_dev *hdev, void *data) -{ - struct hci_cp_le_pa_create_sync cp = {0}; - struct hci_conn *conn; - int err = 0; - - hci_dev_lock(hdev); - - rcu_read_lock(); - - /* The spec allows only one pending LE Periodic Advertising Create - * Sync command at a time. If the command is pending now, don't do - * anything. We check for pending connections after each PA Sync - * Established event. - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2493: - * - * If the Host issues this command when another HCI_LE_Periodic_ - * Advertising_Create_Sync command is pending, the Controller shall - * return the error code Command Disallowed (0x0C). - */ - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (test_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags)) - goto unlock; - } - - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (hci_conn_check_create_pa_sync(conn)) { - struct bt_iso_qos *qos = &conn->iso_qos; - - cp.options = qos->bcast.options; - cp.sid = conn->sid; - cp.addr_type = conn->dst_type; - bacpy(&cp.addr, &conn->dst); - cp.skip = cpu_to_le16(qos->bcast.skip); - cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); - cp.sync_cte_type = qos->bcast.sync_cte_type; - - break; - } - } - -unlock: - rcu_read_unlock(); - - hci_dev_unlock(hdev); - - if (bacmp(&cp.addr, BDADDR_ANY)) { - hci_dev_set_flag(hdev, HCI_PA_SYNC); - set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); - - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (!err) - err = hci_update_passive_scan_sync(hdev); - - if (err) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); - } - } - - return err; -} - -int hci_pa_create_sync_pending(struct hci_dev *hdev) -{ - /* Queue start pa_create_sync and scan */ - return hci_cmd_sync_queue(hdev, create_pa_sync, - NULL, create_pa_complete); -} - struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) @@ -2167,10 +2078,11 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; + conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10); hci_conn_hold(conn); - hci_pa_create_sync_pending(hdev); + hci_connect_pa_sync(hdev, conn); return conn; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 5f808f0b0e9a2..ea7ccafd055a7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ 
-6378,8 +6378,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PA_SYNC); - conn = hci_conn_hash_lookup_sid(hdev, ev->sid, &ev->bdaddr, - ev->bdaddr_type); + conn = hci_conn_hash_lookup_create_pa_sync(hdev); if (!conn) { bt_dev_err(hdev, "Unable to find connection for dst %pMR sid 0x%2.2x", @@ -6418,9 +6417,6 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, } unlock: - /* Handle any other pending PA sync command */ - hci_pa_create_sync_pending(hdev); - hci_dev_unlock(hdev); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 609b035e5c904..99c116b056cee 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2693,16 +2693,16 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) /* Force address filtering if PA Sync is in progress */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { - struct hci_cp_le_pa_create_sync *sent; + struct hci_conn *conn; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_PA_CREATE_SYNC); - if (sent) { + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (conn) { struct conn_params pa; memset(&pa, 0, sizeof(pa)); - bacpy(&pa.addr, &sent->addr); - pa.addr_type = sent->addr_type; + bacpy(&pa.addr, &conn->dst); + pa.addr_type = conn->dst_type; /* Clear first since there could be addresses left * behind. @@ -6895,3 +6895,80 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } + +static void create_pa_complete(struct hci_dev *hdev, void *data, int err) +{ + bt_dev_dbg(hdev, "err %d", err); + + if (!err) + return; + + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + if (err == -ECANCELED) + return; + + hci_dev_lock(hdev); + + hci_update_passive_scan_sync(hdev); + + hci_dev_unlock(hdev); +} + +static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_le_pa_create_sync cp; + struct hci_conn *conn = data; + struct bt_iso_qos *qos = &conn->iso_qos; + int err; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) + return -EBUSY; + + /* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can + * program the address in the allow list so PA advertisements can be + * received. + */ + set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + hci_update_passive_scan_sync(hdev); + + memset(&cp, 0, sizeof(cp)); + cp.options = qos->bcast.options; + cp.sid = conn->sid; + cp.addr_type = conn->dst_type; + bacpy(&cp.addr, &conn->dst); + cp.skip = cpu_to_le16(qos->bcast.skip); + cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp.sync_cte_type = qos->bcast.sync_cte_type; + + /* The spec allows only one pending LE Periodic Advertising Create + * Sync command at a time so we forcefully wait for PA Sync Established + * event since cmd_work can only schedule one command at a time. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2493: + * + * If the Host issues this command when another HCI_LE_Periodic_ + * Advertising_Create_Sync command is pending, the Controller shall + * return the error code Command Disallowed (0x0C). 
+	 */
+	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_PA_CREATE_SYNC,
+				       sizeof(cp), &cp,
+				       HCI_EV_LE_PA_SYNC_ESTABLISHED,
+				       conn->conn_timeout, NULL);
+	if (err == -ETIMEDOUT)
+		__hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL,
+				      0, NULL, HCI_CMD_TIMEOUT);
+
+	return err;
+}
+
+int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn,
+				       create_pa_complete);
+}

From 024421cf39923927ab2b5fe895d1d922b9abe67f Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz
Date: Wed, 16 Apr 2025 15:43:32 -0400
Subject: [PATCH 490/770] Bluetooth: hci_conn: Fix not setting timeout for BIG
 Create Sync

BIG Create Sync requires the command to just generate a status, so this
makes use of __hci_cmd_sync_status_sk to wait for
HCI_EVT_LE_BIG_SYNC_ESTABLISHED. Also, because of this change it is no
longer necessary to use a custom method to serialize the process of
creating the BIG sync, since cmd_work_sync itself ensures only one
command would be pending, which now awaits
HCI_EVT_LE_BIG_SYNC_ESTABLISHED before proceeding to the next
connection.

Fixes: 42ecf1947135 ("Bluetooth: ISO: Do not emit LE BIG Create Sync if previous is pending")
Signed-off-by: Luiz Augusto von Dentz
---
 include/net/bluetooth/hci.h      |  2 +-
 include/net/bluetooth/hci_core.h |  7 ++-
 include/net/bluetooth/hci_sync.h |  1 +
 net/bluetooth/hci_conn.c         | 89 ++------------------------------
 net/bluetooth/hci_event.c        |  9 ++--
 net/bluetooth/hci_sync.c         | 63 ++++++++++++++++++++++
 net/bluetooth/iso.c              | 26 +++++-----
 7 files changed, 88 insertions(+), 109 deletions(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 8ea7a063cc651..797992019f9ee 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2832,7 +2832,7 @@ struct hci_evt_le_create_big_complete {
 	__le16  bis_handle[];
 } __packed;

-#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d
+#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d
 struct hci_evt_le_big_sync_estabilished {
 	__u8    status;
 	__u8    handle;

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index f20368b9a5d20..522d837a23fa4 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1524,7 +1524,6 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
 void hci_sco_setup(struct hci_conn *conn, __u8 status);
 bool hci_iso_setup_path(struct hci_conn *conn);
 int hci_le_create_cis_pending(struct hci_dev *hdev);
-int hci_le_big_create_sync_pending(struct hci_dev *hdev);
 int hci_conn_check_create_cis(struct hci_conn *conn);

 struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
@@ -1564,9 +1564,9 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
 				 __u8 data_len, __u8 *data);
 struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
 				    __u8 dst_type, __u8 sid, struct bt_iso_qos *qos);
-int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
-			   struct bt_iso_qos *qos,
-			   __u16 sync_handle, __u8 num_bis, __u8 bis[]);
+int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
+			     struct bt_iso_qos *qos, __u16 sync_handle,
+			     __u8 num_bis, __u8 bis[]);
 int hci_conn_check_link_mode(struct hci_conn *conn);
 int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
 int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 93dac4c7f9e3e..72558c826aa1b 100644
---
a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -187,3 +187,4 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params); int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c3112ce39f672..6533e281ada33 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2087,89 +2087,9 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, return conn; } -static bool hci_conn_check_create_big_sync(struct hci_conn *conn) -{ - if (!conn->num_bis) - return false; - - return true; -} - -static void big_create_sync_complete(struct hci_dev *hdev, void *data, int err) -{ - bt_dev_dbg(hdev, ""); - - if (err) - bt_dev_err(hdev, "Unable to create BIG sync: %d", err); -} - -static int big_create_sync(struct hci_dev *hdev, void *data) -{ - DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); - struct hci_conn *conn; - - rcu_read_lock(); - - pdu->num_bis = 0; - - /* The spec allows only one pending LE BIG Create Sync command at - * a time. If the command is pending now, don't do anything. We - * check for pending connections after each BIG Sync Established - * event. - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2586: - * - * If the Host sends this command when the Controller is in the - * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ - * Established event has not been generated, the Controller shall - * return the error code Command Disallowed (0x0C). - */ - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (test_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags)) - goto unlock; - } - - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (hci_conn_check_create_big_sync(conn)) { - struct bt_iso_qos *qos = &conn->iso_qos; - - set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); - - pdu->handle = qos->bcast.big; - pdu->sync_handle = cpu_to_le16(conn->sync_handle); - pdu->encryption = qos->bcast.encryption; - memcpy(pdu->bcode, qos->bcast.bcode, - sizeof(pdu->bcode)); - pdu->mse = qos->bcast.mse; - pdu->timeout = cpu_to_le16(qos->bcast.timeout); - pdu->num_bis = conn->num_bis; - memcpy(pdu->bis, conn->bis, conn->num_bis); - - break; - } - } - -unlock: - rcu_read_unlock(); - - if (!pdu->num_bis) - return 0; - - return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, - struct_size(pdu, bis, pdu->num_bis), pdu); -} - -int hci_le_big_create_sync_pending(struct hci_dev *hdev) -{ - /* Queue big_create_sync */ - return hci_cmd_sync_queue_once(hdev, big_create_sync, - NULL, big_create_sync_complete); -} - -int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, - struct bt_iso_qos *qos, - __u16 sync_handle, __u8 num_bis, __u8 bis[]) +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]) { int err; @@ -2186,9 +2106,10 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, hcon->num_bis = num_bis; memcpy(hcon->bis, bis, num_bis); + hcon->conn_timeout = msecs_to_jiffies(qos->bcast.timeout * 10); } - return hci_le_big_create_sync_pending(hdev); + return hci_connect_big_sync(hdev, hcon); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 
ea7ccafd055a7..6d6061111ac56 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6928,7 +6928,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABILISHED, + if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABLISHED, flex_array_size(ev, bis, ev->num_bis))) return; @@ -6999,9 +6999,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, } unlock: - /* Handle any other pending BIG sync command */ - hci_le_big_create_sync_pending(hdev); - hci_dev_unlock(hdev); } @@ -7123,8 +7120,8 @@ static const struct hci_le_ev { hci_le_create_big_complete_evt, sizeof(struct hci_evt_le_create_big_complete), HCI_MAX_EVENT_SIZE), - /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABILISHED] */ - HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABILISHED, + /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */ + HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED, hci_le_big_sync_established_evt, sizeof(struct hci_evt_le_big_sync_estabilished), HCI_MAX_EVENT_SIZE), diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 99c116b056cee..e56b1cbedab90 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -6972,3 +6972,66 @@ int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn) return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn, create_pa_complete); } + +static void create_big_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + if (hci_conn_valid(hdev, conn)) + clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); +} + +static int hci_le_big_create_sync(struct hci_dev *hdev, void *data) +{ + DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x11); + struct hci_conn *conn = data; + struct bt_iso_qos *qos = &conn->iso_qos; + int err; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + memset(cp, 0, sizeof(*cp)); + cp->handle = qos->bcast.big; + cp->sync_handle = cpu_to_le16(conn->sync_handle); + cp->encryption = qos->bcast.encryption; + memcpy(cp->bcode, qos->bcast.bcode, sizeof(cp->bcode)); + cp->mse = qos->bcast.mse; + cp->timeout = cpu_to_le16(qos->bcast.timeout); + cp->num_bis = conn->num_bis; + memcpy(cp->bis, conn->bis, conn->num_bis); + + /* The spec allows only one pending LE BIG Create Sync command at + * a time, so we forcefully wait for BIG Sync Established event since + * cmd_work can only schedule one command at a time. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2586: + * + * If the Host sends this command when the Controller is in the + * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ + * Established event has not been generated, the Controller shall + * return the error code Command Disallowed (0x0C). 
+ */ + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_BIG_CREATE_SYNC, + struct_size(cp, bis, cp->num_bis), cp, + HCI_EVT_LE_BIG_SYNC_ESTABLISHED, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + hci_le_big_terminate_sync(hdev, cp->handle); + + return err; +} + +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, + create_big_complete); +} diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3501a991f1c64..2819cda616bce 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1462,14 +1462,13 @@ static void iso_conn_big_sync(struct sock *sk) lock_sock(sk); if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); + err = hci_conn_big_create_sync(hdev, iso_pi(sk)->conn->hcon, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); if (err) - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); + bt_dev_err(hdev, "hci_big_create_sync: %d", err); } release_sock(sk); @@ -1922,7 +1921,7 @@ static void iso_conn_ready(struct iso_conn *conn) hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, - HCI_EVT_LE_BIG_SYNC_ESTABILISHED); + HCI_EVT_LE_BIG_SYNC_ESTABLISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock(&hcon->src, &hcon->dst, @@ -2113,12 +2112,11 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, - hcon, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); + err = hci_conn_big_create_sync(hdev, hcon, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); From d1af1f02ef8653dea4573e444136c8331189cd59 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Thu, 17 Apr 2025 09:18:42 +0530 Subject: [PATCH 491/770] Bluetooth: btintel_pcie: Avoid redundant buffer allocation Reuse the skb buffer provided by the PCIe driver to pass it onto the stack, instead of copying it to a new skb. Fixes: c2b636b3f788 ("Bluetooth: btintel_pcie: Add support for PCIe transport") Signed-off-by: Kiran K Reviewed-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 33 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index c1e69fcc9c4fa..efe6ad6d4dc0f 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -957,8 +957,10 @@ static int btintel_pcie_recv_event(struct hci_dev *hdev, struct sk_buff *skb) /* This is a debug event that comes from IML and OP image when it * starts execution. There is no need pass this event to stack. 
*/ - if (skb->data[2] == 0x97) + if (skb->data[2] == 0x97) { + hci_recv_diag(hdev, skb); return 0; + } } return hci_recv_frame(hdev, skb); @@ -974,7 +976,6 @@ static int btintel_pcie_recv_frame(struct btintel_pcie_data *data, u8 pkt_type; u16 plen; u32 pcie_pkt_type; - struct sk_buff *new_skb; void *pdata; struct hci_dev *hdev = data->hdev; @@ -1051,24 +1052,20 @@ static int btintel_pcie_recv_frame(struct btintel_pcie_data *data, bt_dev_dbg(hdev, "pkt_type: 0x%2.2x len: %u", pkt_type, plen); - new_skb = bt_skb_alloc(plen, GFP_ATOMIC); - if (!new_skb) { - bt_dev_err(hdev, "Failed to allocate memory for skb of len: %u", - skb->len); - ret = -ENOMEM; - goto exit_error; - } - - hci_skb_pkt_type(new_skb) = pkt_type; - skb_put_data(new_skb, skb->data, plen); + hci_skb_pkt_type(skb) = pkt_type; hdev->stat.byte_rx += plen; + skb_trim(skb, plen); if (pcie_pkt_type == BTINTEL_PCIE_HCI_EVT_PKT) - ret = btintel_pcie_recv_event(hdev, new_skb); + ret = btintel_pcie_recv_event(hdev, skb); else - ret = hci_recv_frame(hdev, new_skb); + ret = hci_recv_frame(hdev, skb); + skb = NULL; /* skb is freed in the callee */ exit_error: + if (skb) + kfree_skb(skb); + if (ret) hdev->stat.err_rx++; @@ -1202,8 +1199,6 @@ static void btintel_pcie_rx_work(struct work_struct *work) struct btintel_pcie_data *data = container_of(work, struct btintel_pcie_data, rx_work); struct sk_buff *skb; - int err; - struct hci_dev *hdev = data->hdev; if (test_bit(BTINTEL_PCIE_HWEXP_INPROGRESS, &data->flags)) { /* Unlike usb products, controller will not send hardware @@ -1224,11 +1219,7 @@ static void btintel_pcie_rx_work(struct work_struct *work) /* Process the sk_buf in queue and send to the HCI layer */ while ((skb = skb_dequeue(&data->rx_skb_q))) { - err = btintel_pcie_recv_frame(data, skb); - if (err) - bt_dev_err(hdev, "Failed to send received frame: %d", - err); - kfree_skb(skb); + btintel_pcie_recv_frame(data, skb); } } From 0317b033abcd1d8dd2798f0e2de5e84543d0bd22 Mon Sep 17 00:00:00 2001 From: En-Wei Wu Date: Mon, 21 Apr 2025 21:00:37 +0800 Subject: [PATCH 492/770] Bluetooth: btusb: avoid NULL pointer dereference in skb_dequeue() A NULL pointer dereference can occur in skb_dequeue() when processing a QCA firmware crash dump on WCN7851 (0489:e0f3). [ 93.672166] Bluetooth: hci0: ACL memdump size(589824) [ 93.672475] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 93.672517] Workqueue: hci0 hci_devcd_rx [bluetooth] [ 93.672598] RIP: 0010:skb_dequeue+0x50/0x80 The issue stems from handle_dump_pkt_qca() returning 0 even when a dump packet is successfully processed. This is because it incorrectly forwards the return value of hci_devcd_init() (which returns 0 on success). As a result, the caller (btusb_recv_acl_qca() or btusb_recv_evt_qca()) assumes the packet was not handled and passes it to hci_recv_frame(), leading to premature kfree() of the skb. Later, hci_devcd_rx() attempts to dequeue the same skb from the dump queue, resulting in a NULL pointer dereference. Fix this by: 1. Making handle_dump_pkt_qca() return 0 on success and negative errno on failure, consistent with kernel conventions. 2. Splitting dump packet detection into separate functions for ACL and event packets for better structure and readability. This ensures dump packets are properly identified and consumed, avoiding double handling and preventing NULL pointer access. 
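The crux of the fix is the calling convention: detection becomes a side-effect-free boolean predicate, and the handler returns 0 or a negative errno, so the caller can tell "not a dump packet" apart from "dump packet, already consumed" and never feeds the same skb to the normal receive path twice. A minimal standalone sketch of that dispatch shape, with a hypothetical pkt type and handler names rather than the driver's real ones:

```
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct pkt { bool is_dump; int payload; };

/* Predicate only: no side effects, so "not a dump" can never be
 * confused with a handler failure. */
static bool pkt_is_dump(const struct pkt *p)
{
	return p->is_dump;
}

/* Handler: 0 on success, negative errno on failure; the packet is
 * consumed either way, so the caller must not also pass it on. */
static int handle_dump_pkt(struct pkt *p)
{
	if (p->payload < 0)
		return -EINVAL;
	printf("dump chunk %d consumed\n", p->payload);
	return 0;
}

static int recv_pkt(struct pkt *p)
{
	if (pkt_is_dump(p))
		return handle_dump_pkt(p);	/* consumed: stop here */
	printf("normal frame %d to stack\n", p->payload);
	return 0;
}

int main(void)
{
	struct pkt dump = { .is_dump = true, .payload = 1 };
	struct pkt frame = { .is_dump = false, .payload = 2 };

	recv_pkt(&dump);
	recv_pkt(&frame);
	return 0;
}
```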
Fixes: 20981ce2d5a5 ("Bluetooth: btusb: Add WCN6855 devcoredump support") Signed-off-by: En-Wei Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 101 +++++++++++++++++++++++++++----------- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 5012b5ff92c8a..a42dedb78e0a0 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -3010,22 +3010,16 @@ static void btusb_coredump_qca(struct hci_dev *hdev) bt_dev_err(hdev, "%s: triggle crash failed (%d)", __func__, err); } -/* - * ==0: not a dump pkt. - * < 0: fails to handle a dump pkt - * > 0: otherwise. - */ +/* Return: 0 on success, negative errno on failure. */ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb) { - int ret = 1; + int ret = 0; u8 pkt_type; u8 *sk_ptr; unsigned int sk_len; u16 seqno; u32 dump_size; - struct hci_event_hdr *event_hdr; - struct hci_acl_hdr *acl_hdr; struct qca_dump_hdr *dump_hdr; struct btusb_data *btdata = hci_get_drvdata(hdev); struct usb_device *udev = btdata->udev; @@ -3035,30 +3029,14 @@ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb) sk_len = skb->len; if (pkt_type == HCI_ACLDATA_PKT) { - acl_hdr = hci_acl_hdr(skb); - if (le16_to_cpu(acl_hdr->handle) != QCA_MEMDUMP_ACL_HANDLE) - return 0; sk_ptr += HCI_ACL_HDR_SIZE; sk_len -= HCI_ACL_HDR_SIZE; - event_hdr = (struct hci_event_hdr *)sk_ptr; - } else { - event_hdr = hci_event_hdr(skb); } - if ((event_hdr->evt != HCI_VENDOR_PKT) - || (event_hdr->plen != (sk_len - HCI_EVENT_HDR_SIZE))) - return 0; - sk_ptr += HCI_EVENT_HDR_SIZE; sk_len -= HCI_EVENT_HDR_SIZE; dump_hdr = (struct qca_dump_hdr *)sk_ptr; - if ((sk_len < offsetof(struct qca_dump_hdr, data)) - || (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) - || (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE)) - return 0; - - /*it is dump pkt now*/ seqno = le16_to_cpu(dump_hdr->seqno); if (seqno == 0) { set_bit(BTUSB_HW_SSR_ACTIVE, &btdata->flags); @@ -3132,17 +3110,84 @@ static int handle_dump_pkt_qca(struct hci_dev *hdev, struct sk_buff *skb) return ret; } +/* Return: true if the ACL packet is a dump packet, false otherwise. */ +static bool acl_pkt_is_dump_qca(struct hci_dev *hdev, struct sk_buff *skb) +{ + u8 *sk_ptr; + unsigned int sk_len; + + struct hci_event_hdr *event_hdr; + struct hci_acl_hdr *acl_hdr; + struct qca_dump_hdr *dump_hdr; + + sk_ptr = skb->data; + sk_len = skb->len; + + acl_hdr = hci_acl_hdr(skb); + if (le16_to_cpu(acl_hdr->handle) != QCA_MEMDUMP_ACL_HANDLE) + return false; + + sk_ptr += HCI_ACL_HDR_SIZE; + sk_len -= HCI_ACL_HDR_SIZE; + event_hdr = (struct hci_event_hdr *)sk_ptr; + + if ((event_hdr->evt != HCI_VENDOR_PKT) || + (event_hdr->plen != (sk_len - HCI_EVENT_HDR_SIZE))) + return false; + + sk_ptr += HCI_EVENT_HDR_SIZE; + sk_len -= HCI_EVENT_HDR_SIZE; + + dump_hdr = (struct qca_dump_hdr *)sk_ptr; + if ((sk_len < offsetof(struct qca_dump_hdr, data)) || + (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) || + (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE)) + return false; + + return true; +} + +/* Return: true if the event packet is a dump packet, false otherwise. 
*/ +static bool evt_pkt_is_dump_qca(struct hci_dev *hdev, struct sk_buff *skb) +{ + u8 *sk_ptr; + unsigned int sk_len; + + struct hci_event_hdr *event_hdr; + struct qca_dump_hdr *dump_hdr; + + sk_ptr = skb->data; + sk_len = skb->len; + + event_hdr = hci_event_hdr(skb); + + if ((event_hdr->evt != HCI_VENDOR_PKT) + || (event_hdr->plen != (sk_len - HCI_EVENT_HDR_SIZE))) + return false; + + sk_ptr += HCI_EVENT_HDR_SIZE; + sk_len -= HCI_EVENT_HDR_SIZE; + + dump_hdr = (struct qca_dump_hdr *)sk_ptr; + if ((sk_len < offsetof(struct qca_dump_hdr, data)) || + (dump_hdr->vse_class != QCA_MEMDUMP_VSE_CLASS) || + (dump_hdr->msg_type != QCA_MEMDUMP_MSG_TYPE)) + return false; + + return true; +} + static int btusb_recv_acl_qca(struct hci_dev *hdev, struct sk_buff *skb) { - if (handle_dump_pkt_qca(hdev, skb)) - return 0; + if (acl_pkt_is_dump_qca(hdev, skb)) + return handle_dump_pkt_qca(hdev, skb); return hci_recv_frame(hdev, skb); } static int btusb_recv_evt_qca(struct hci_dev *hdev, struct sk_buff *skb) { - if (handle_dump_pkt_qca(hdev, skb)) - return 0; + if (evt_pkt_is_dump_qca(hdev, skb)) + return handle_dump_pkt_qca(hdev, skb); return hci_recv_frame(hdev, skb); } From 07e90048e356a29079fbc011cfc2e1fa1d1c5ac9 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 22 Apr 2025 09:21:55 +0800 Subject: [PATCH 493/770] Bluetooth: btmtksdio: Check function enabled before doing close Check the BTMTKSDIO_FUNC_ENABLED flag before doing close to prevent btmtksdio_close from being called twice. Fixes: 6ac4233afb9a ("Bluetooth: btmtksdio: Prevent enabling interrupts after IRQ handler removal") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtksdio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index edd5eead1e93b..e5a119ca72435 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -723,6 +723,10 @@ static int btmtksdio_close(struct hci_dev *hdev) { struct btmtksdio_dev *bdev = hci_get_drvdata(hdev); + /* Skip btmtksdio_close if BTMTKSDIO_FUNC_ENABLED isn't set */ + if (!test_bit(BTMTKSDIO_FUNC_ENABLED, &bdev->tx_state)) + return 0; + sdio_claim_host(bdev->func); /* Disable interrupt */ From 0b6d58bc6ea85e57de25c828444928e4a0aa79cb Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Tue, 22 Apr 2025 09:21:56 +0800 Subject: [PATCH 494/770] Bluetooth: btmtksdio: Do close if SDIO card removed without close Since the Bluetooth SDIO card may be physically removed suddenly, the driver needs to ensure btmtksdio_close is called before btmtksdio_remove to disable interrupts and the txrx workqueue.
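Taken together, the two btmtksdio patches establish one invariant: close is idempotent (a no-op unless the function is still enabled), and remove always funnels through close before tearing anything down. A minimal standalone sketch of that shape, with hypothetical names rather than the driver's:

```
#include <stdbool.h>
#include <stdio.h>

struct dev { bool func_enabled; };

/* Idempotent: skip teardown unless the function is still enabled,
 * so a second call (e.g. from remove after an explicit close) is
 * a harmless no-op. */
static int dev_close(struct dev *d)
{
	if (!d->func_enabled)
		return 0;
	d->func_enabled = false;
	printf("irqs disabled, workqueue flushed\n");
	return 0;
}

/* Remove always goes through close first, so a surprise card
 * removal cannot skip interrupt/workqueue teardown. */
static void dev_remove(struct dev *d)
{
	dev_close(d);
	printf("device unregistered\n");
}

int main(void)
{
	struct dev d = { .func_enabled = true };

	dev_close(&d);	/* normal close */
	dev_remove(&d);	/* close again: no-op, then unregister */
	return 0;
}
```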
Fixes: 6ac4233afb9a ("Bluetooth: btmtksdio: Prevent enabling interrupts after IRQ handler removal") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtksdio.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index e5a119ca72435..1d26207b2ba70 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -1447,11 +1447,15 @@ static void btmtksdio_remove(struct sdio_func *func) if (!bdev) return; + hdev = bdev->hdev; + + /* Make sure to call btmtksdio_close before removing sdio card */ + if (test_bit(BTMTKSDIO_FUNC_ENABLED, &bdev->tx_state)) + btmtksdio_close(hdev); + /* Be consistent the state in btmtksdio_probe */ pm_runtime_get_noresume(bdev->dev); - hdev = bdev->hdev; - sdio_set_drvdata(func, NULL); hci_unregister_dev(hdev); hci_free_dev(hdev); From 1c7664957e4edb234c69de2db4be1f740d2df564 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Sun, 20 Apr 2025 07:21:56 +0530 Subject: [PATCH 495/770] Bluetooth: btintel_pcie: Add additional to checks to clear TX/RX paths Due to a hardware issue, there is a possibility that the driver may miss an MSIx interrupt on the RX/TX data path. Since the TX and RX paths are independent, when a TX MSIx interrupt occurs, the driver can check the RX queue for any pending data and process it if present. The same approach applies to the RX path. Fixes: c2b636b3f788 ("Bluetooth: btintel_pcie: Add support for PCIe transport") Signed-off-by: Chandrashekar Devegowda Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index efe6ad6d4dc0f..0a759ea26fd38 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -1272,10 +1272,8 @@ static void btintel_pcie_msix_rx_handle(struct btintel_pcie_data *data) bt_dev_dbg(hdev, "RXQ: cr_hia: %u cr_tia: %u", cr_hia, cr_tia); /* Check CR_TIA and CR_HIA for change */ - if (cr_tia == cr_hia) { - bt_dev_warn(hdev, "RXQ: no new CD found"); + if (cr_tia == cr_hia) return; - } rxq = &data->rxq; @@ -1311,6 +1309,16 @@ static irqreturn_t btintel_pcie_msix_isr(int irq, void *data) return IRQ_WAKE_THREAD; } +static inline bool btintel_pcie_is_rxq_empty(struct btintel_pcie_data *data) +{ + return data->ia.cr_hia[BTINTEL_PCIE_RXQ_NUM] == data->ia.cr_tia[BTINTEL_PCIE_RXQ_NUM]; +} + +static inline bool btintel_pcie_is_txackq_empty(struct btintel_pcie_data *data) +{ + return data->ia.cr_tia[BTINTEL_PCIE_TXQ_NUM] == data->ia.cr_hia[BTINTEL_PCIE_TXQ_NUM]; +} + static irqreturn_t btintel_pcie_irq_msix_handler(int irq, void *dev_id) { struct msix_entry *entry = dev_id; @@ -1342,12 +1350,18 @@ static irqreturn_t btintel_pcie_irq_msix_handler(int irq, void *dev_id) btintel_pcie_msix_gp0_handler(data); /* For TX */ - if (intr_fh & BTINTEL_PCIE_MSIX_FH_INT_CAUSES_0) + if (intr_fh & BTINTEL_PCIE_MSIX_FH_INT_CAUSES_0) { btintel_pcie_msix_tx_handle(data); + if (!btintel_pcie_is_rxq_empty(data)) + btintel_pcie_msix_rx_handle(data); + } /* For RX */ - if (intr_fh & BTINTEL_PCIE_MSIX_FH_INT_CAUSES_1) + if (intr_fh & BTINTEL_PCIE_MSIX_FH_INT_CAUSES_1) { btintel_pcie_msix_rx_handle(data); + if (!btintel_pcie_is_txackq_empty(data)) + btintel_pcie_msix_tx_handle(data); + } /* * Before sending the interrupt the HW disables it to prevent a nested From 3908feb1bd7f319a10e18d84369a48163264cc7d Mon 
Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Thu, 24 Apr 2025 22:51:03 +0300 Subject: [PATCH 496/770] Bluetooth: L2CAP: copy RX timestamp to new fragments Copy timestamp too when allocating new skb for received fragment. Fixes missing RX timestamps with fragmentation. Fixes: 4d7ea8ee90e4 ("Bluetooth: L2CAP: Fix handling fragmented length") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 5ca7ac43c58d4..73472756618a0 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7415,6 +7415,9 @@ static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb, return -ENOMEM; /* Init rx_len */ conn->rx_len = len; + + skb_set_delivery_time(conn->rx_skb, skb->tstamp, + skb->tstamp_type); } /* Copy as much as the rx_skb can hold */ From 14ae3003e73e777c9b36385a7c86f754b50a1821 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 21 Apr 2025 09:31:34 -0700 Subject: [PATCH 497/770] Drivers: hv: Fix bad ref to hv_synic_eventring_tail when CPU goes offline When a CPU goes offline, hv_common_cpu_die() frees the hv_synic_eventring_tail memory for the CPU. But in a normal VM (i.e., not running in the root partition) the per-CPU memory has not been allocated, resulting in a bad memory reference and oops when computing the argument to kfree(). Fix this by freeing the memory only when running in the root partition. Fixes: 04df7ac39943 ("Drivers: hv: Introduce per-cpu event ring tail") Signed-off-by: Michael Kelley Reviewed-by: Nuno Das Neves Link: https://lore.kernel.org/r/20250421163134.2024-1-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20250421163134.2024-1-mhklinux@outlook.com> --- drivers/hv/hv_common.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index a7d7494feaca1..59792e00cecf3 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -566,9 +566,11 @@ int hv_common_cpu_die(unsigned int cpu) * originally allocated memory is reused in hv_common_cpu_init(). */ - synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); - kfree(*synic_eventring_tail); - *synic_eventring_tail = NULL; + if (hv_root_partition()) { + synic_eventring_tail = this_cpu_ptr(hv_synic_eventring_tail); + kfree(*synic_eventring_tail); + *synic_eventring_tail = NULL; + } return 0; } From e86e9134e1d1c90a960dd57f59ce574d27b9a124 Mon Sep 17 00:00:00 2001 From: Sean Heelan Date: Sat, 19 Apr 2025 19:59:28 +0100 Subject: [PATCH 498/770] ksmbd: fix use-after-free in kerberos authentication Setting sess->user = NULL was introduced to fix the dangling pointer created by ksmbd_free_user. However, it is possible another thread could be operating on the session and make use of sess->user after it has been passed to ksmbd_free_user but before sess->user is set to NULL. 
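The shape of the fix on the authentication side is to stop swapping out sess->user on a live session: either the session adopts the first authenticated user, or a re-authentication must resolve to the same user and the duplicate is dropped, so concurrent readers of sess->user never see it freed underneath them. A minimal sketch of that check, using hypothetical types; the real code relies on ksmbd_compare_user() and ksmbd_free_user():

```
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct user { char name[32]; };
struct session { struct user *user; };

static int users_match(const struct user *a, const struct user *b)
{
	return strcmp(a->name, b->name) == 0;
}

/* Never replace a live session's user: adopt the first one, or
 * verify the new credentials map to the same user and free the
 * duplicate instead of the original. */
static int bind_user(struct session *sess, struct user *user)
{
	if (!sess->user) {
		sess->user = user;	/* first successful auth */
		return 0;
	}
	if (!users_match(sess->user, user)) {
		free(user);
		return -EPERM;		/* different user reusing session */
	}
	free(user);			/* same user: keep the original */
	return 0;
}

int main(void)
{
	struct session sess = { .user = NULL };
	struct user *u1 = malloc(sizeof(*u1));
	struct user *u2 = malloc(sizeof(*u2));

	if (!u1 || !u2)
		return 1;
	strcpy(u1->name, "alice");
	strcpy(u2->name, "bob");

	printf("first bind: %d\n", bind_user(&sess, u1));	/* 0 */
	printf("rebind bob: %d\n", bind_user(&sess, u2));	/* -EPERM */
	free(sess.user);
	return 0;
}
```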
Cc: stable@vger.kernel.org Signed-off-by: Sean Heelan Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/auth.c | 14 +++++++++++++- fs/smb/server/smb2pdu.c | 5 ----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 83caa38497493..b3d121052408c 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -550,7 +550,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, retval = -ENOMEM; goto out; } - sess->user = user; + + if (!sess->user) { + /* First successful authentication */ + sess->user = user; + } else { + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_debug(AUTH, "different user tried to reuse session\n"); + retval = -EPERM; + ksmbd_free_user(user); + goto out; + } + ksmbd_free_user(user); + } memcpy(sess->sess_key, resp->payload, resp->session_key_len); memcpy(out_blob, resp->payload + resp->session_key_len, diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 372021b3f2632..acc05657cfc72 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1607,11 +1607,6 @@ static int krb5_authenticate(struct ksmbd_work *work, if (prev_sess_id && prev_sess_id != sess->id) destroy_previous_session(conn, sess->user, prev_sess_id); - if (sess->state == SMB2_SESSION_VALID) { - ksmbd_free_user(sess->user); - sess->user = NULL; - } - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { From 2fc9feff45d92a92cd5f96487655d5be23fb7e2b Mon Sep 17 00:00:00 2001 From: Sean Heelan Date: Mon, 21 Apr 2025 15:39:29 +0000 Subject: [PATCH 499/770] ksmbd: fix use-after-free in session logoff The sess->user object can currently be in use by another thread, for example if another connection has sent a session setup request to bind to the session being free'd. The handler for that connection could be in the smb2_sess_setup function which makes use of sess->user. Cc: stable@vger.kernel.org Signed-off-by: Sean Heelan Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index acc05657cfc72..46aa082457424 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2249,10 +2249,6 @@ int smb2_session_logoff(struct ksmbd_work *work) sess->state = SMB2_SESSION_EXPIRED; up_write(&conn->session_lock); - if (sess->user) { - ksmbd_free_user(sess->user); - sess->user = NULL; - } ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP); rsp->StructureSize = cpu_to_le16(4); From 4c2227656d9003f4d77afc76f34dd81b95e4c2c4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Apr 2025 15:36:00 +0200 Subject: [PATCH 500/770] vmxnet3: Fix malformed packet sizing in vmxnet3_process_xdp vmxnet3 driver's XDP handling is buggy for packet sizes using ring0 (that is, packet sizes between 128 - 3k bytes). We noticed MTU-related connectivity issues with Cilium's service load- balancing in case of vmxnet3 as NIC underneath. A simple curl to a HTTP backend service where the XDP LB was doing IPIP encap led to overly large packet sizes but only for *some* of the packets (e.g. HTTP GET request) while others (e.g. the prior TCP 3WHS) looked completely fine on the wire. 
In fact, the pcap recording on the backend node actually revealed that the node with the XDP LB was leaking uninitialized kernel data onto the wire for the affected packets, for example, while the packets should have been 152 bytes their actual size was 1482 bytes, so the remainder after 152 bytes was padded with whatever other data was in that page at the time (e.g. we saw user/payload data from prior processed packets). We only noticed this through an MTU issue, e.g. when the XDP LB node and the backend node both had the same MTU (e.g. 1500) then the curl request got dropped on the backend node's NIC given the packet was too large even though the IPIP-encapped packet normally would never even come close to the MTU limit. Lowering the MTU on the XDP LB (e.g. 1480) allowed to let the curl request succeed (which also indicates that the kernel ignored the padding, and thus the issue wasn't very user-visible). Commit e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") was too eager to also switch xdp_prepare_buff() from rcd->len to rbi->len. It really needs to stick to rcd->len which is the actual packet length from the descriptor. The latter we also feed into vmxnet3_process_xdp_small(), by the way, and it indicates the correct length needed to initialize the xdp->{data,data_end} parts. For e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") the relevant part was adapting xdp_init_buff() to address the warning given the xdp_data_hard_end() depends on xdp->frame_sz. With that fixed, traffic on the wire looks good again. Fixes: e127ce7699c1 ("vmxnet3: Fix missing reserved tailroom") Signed-off-by: Daniel Borkmann Tested-by: Andrew Sauber Cc: Anton Protopopov Cc: William Tu Cc: Martin Zaharinov Cc: Ronak Doshi Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250423133600.176689-1-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- drivers/net/vmxnet3/vmxnet3_xdp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c index 616ecc38d1726..5f470499e6002 100644 --- a/drivers/net/vmxnet3/vmxnet3_xdp.c +++ b/drivers/net/vmxnet3/vmxnet3_xdp.c @@ -397,7 +397,7 @@ vmxnet3_process_xdp(struct vmxnet3_adapter *adapter, xdp_init_buff(&xdp, PAGE_SIZE, &rq->xdp_rxq); xdp_prepare_buff(&xdp, page_address(page), rq->page_pool->p.offset, - rbi->len, false); + rcd->len, false); xdp_buff_clear_frags_flag(&xdp); xdp_prog = rcu_dereference(rq->adapter->xdp_bpf_prog); From 43fd0054f3569d8063b2a5b6a3987031cd0d36f6 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 24 Apr 2025 19:20:47 -0700 Subject: [PATCH 501/770] io_uring/zcrx: selftests: switch to using defer() for cleanup Switch to using defer() for putting the NIC back to the original state prior to running the selftest. 
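The appeal of defer() is that the undo action is registered at the moment the state is changed, and the registered actions unwind in reverse order even if a later setup step or the test body fails, which a single try/finally cannot express as locally. The selftest's defer() is Python; purely to illustrate the concept, here is a minimal C sketch of the same LIFO-cleanup idea, with hypothetical names and stand-in values:

```
#include <stdio.h>

#define MAX_DEFERS 8

typedef void (*undo_fn)(const char *);

static struct { undo_fn fn; const char *arg; } defers[MAX_DEFERS];
static int ndefers;

/* Register the undo right next to the change it reverts. */
static void defer(undo_fn fn, const char *arg)
{
	defers[ndefers].fn = fn;
	defers[ndefers].arg = arg;
	ndefers++;
}

/* LIFO: undo in reverse order of setup. */
static void run_defers(void)
{
	while (ndefers > 0) {
		ndefers--;
		defers[ndefers].fn(defers[ndefers].arg);
	}
}

static void ethtool_like(const char *cmd)
{
	printf("apply: %s\n", cmd);
}

int main(void)
{
	ethtool_like("tcp-data-split on");
	defer(ethtool_like, "tcp-data-split auto");
	ethtool_like("rx 64");
	defer(ethtool_like, "rx 512");	/* stand-in for the saved ring size */

	/* ... test body would run here; any exit path still unwinds ... */
	run_defers();
	return 0;
}
```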
Signed-off-by: David Wei Reviewed-by: Simon Horman Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250425022049.3474590-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/iou-zcrx.py | 61 +++++++++---------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 6a0378e06cabd..698f29cfd7ebd 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -5,7 +5,7 @@ from os import path from lib.py import ksft_run, ksft_exit from lib.py import NetDrvEpEnv -from lib.py import bkg, cmd, ethtool, wait_port_listen +from lib.py import bkg, cmd, defer, ethtool, wait_port_listen def _get_rx_ring_entries(cfg): @@ -34,22 +34,21 @@ def test_zcrx(cfg) -> None: raise KsftSkipEx('at least 2 combined channels required') rx_ring = _get_rx_ring_entries(cfg) - try: - ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) - ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) - flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) - rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" - with bkg(rx_cmd, host=cfg.remote, exit_wait=True): - wait_port_listen(9999, proto="tcp", host=cfg.remote) - cmd(tx_cmd) - finally: - ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) - ethtool(f"-X {cfg.ifname} default", host=cfg.remote) - ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) - ethtool(f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) def test_zcrx_oneshot(cfg) -> None: @@ -60,22 +59,20 @@ def test_zcrx_oneshot(cfg) -> None: raise KsftSkipEx('at least 2 combined channels required') rx_ring = _get_rx_ring_entries(cfg) - try: - ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) - ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) - ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) - flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) - - rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 4096 -z 16384" - with bkg(rx_cmd, host=cfg.remote, exit_wait=True): - wait_port_listen(9999, proto="tcp", host=cfg.remote) - cmd(tx_cmd) - finally: - ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) - ethtool(f"-X {cfg.ifname} default", host=cfg.remote) - ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) - ethtool(f"-G {cfg.ifname} 
tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 4096 -z 16384" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) def main() -> None: From 4ce3ade36f251e47fc6b0345afb73a09794fb2e9 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 24 Apr 2025 19:20:48 -0700 Subject: [PATCH 502/770] io_uring/zcrx: selftests: set hds_thresh to 0 Setting hds_thresh to 0 is required for queue reset. Signed-off-by: David Wei Reviewed-by: Simon Horman Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250425022049.3474590-3-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/hw/iou-zcrx.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 698f29cfd7ebd..0b0b6a2611596 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -8,10 +8,11 @@ from lib.py import bkg, cmd, defer, ethtool, wait_port_listen -def _get_rx_ring_entries(cfg): +def _get_current_settings(cfg): output = ethtool(f"-g {cfg.ifname}", host=cfg.remote).stdout - values = re.findall(r'RX:\s+(\d+)', output) - return int(values[1]) + rx_ring = re.findall(r'RX:\s+(\d+)', output) + hds_thresh = re.findall(r'HDS thresh:\s+(\d+)', output) + return (int(rx_ring[1]), int(hds_thresh[1])) def _get_combined_channels(cfg): @@ -32,11 +33,12 @@ def test_zcrx(cfg) -> None: combined_chans = _get_combined_channels(cfg) if combined_chans < 2: raise KsftSkipEx('at least 2 combined channels required') - rx_ring = _get_rx_ring_entries(cfg) - + (rx_ring, hds_thresh) = _get_current_settings(cfg) ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) @@ -57,10 +59,12 @@ def test_zcrx_oneshot(cfg) -> None: combined_chans = _get_combined_channels(cfg) if combined_chans < 2: raise KsftSkipEx('at least 2 combined channels required') - rx_ring = _get_rx_ring_entries(cfg) + (rx_ring, hds_thresh) = _get_current_settings(cfg) ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}", host=cfg.remote) ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", 
host=cfg.remote) ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) From 5c3524b031be98de5263d313eafef0ec9303f8b5 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 24 Apr 2025 19:20:49 -0700 Subject: [PATCH 503/770] io_uring/zcrx: selftests: add test case for rss ctx RSS contexts are used to shard work across multiple queues for an application using io_uring zero copy receive. Add a test case checking that steering flows into an RSS context works. Until I add multi-thread support to the selftest binary, this test case only has 1 queue in the RSS context. Signed-off-by: David Wei Reviewed-by: Simon Horman Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250425022049.3474590-4-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/iou-zcrx.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 0b0b6a2611596..48b3d27cf472a 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -21,12 +21,25 @@ def _get_combined_channels(cfg): return int(values[1]) +def _create_rss_ctx(cfg, chans): + output = ethtool(f"-X {cfg.ifname} context new start {chans - 1} equal 1", host=cfg.remote).stdout + values = re.search(r'New RSS context is (\d+)', output).group(1) + ctx_id = int(values) + return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}", host=cfg.remote)) + + def _set_flow_rule(cfg, chan): output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) +def _set_flow_rule_rss(cfg, chan): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + def test_zcrx(cfg) -> None: cfg.require_ipver('6') @@ -79,6 +92,34 @@ def test_zcrx_oneshot(cfg) -> None: cmd(tx_cmd) +def test_zcrx_rss(cfg) -> None: + cfg.require_ipver('6') + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + (rx_ring, hds_thresh) = _get_current_settings(cfg) + + ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) + + (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans) + flow_rule_id = _set_flow_rule_rss(cfg, ctx_id) + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") From 5ec6d7d737a491256cd37e33910f7ac1978db591 Mon Sep 17 00:00:00 2001 From: 
Vladimir Oltean Date: Fri, 25 Apr 2025 01:37:33 +0300 Subject: [PATCH 504/770] net: mscc: ocelot: delete PVID VLAN when readding it as non-PVID The following set of commands: ip link add br0 type bridge vlan_filtering 1 # vlan_default_pvid 1 is implicit ip link set swp0 master br0 bridge vlan add dev swp0 vid 1 should result in the dropping of untagged and 802.1p-tagged traffic, but we see that it continues to be accepted. Whereas, had we deleted VID 1 instead, the aforementioned dropping would have worked This is because the ANA_PORT_DROP_CFG update logic doesn't run, because ocelot_vlan_add() only calls ocelot_port_set_pvid() if the new VLAN has the BRIDGE_VLAN_INFO_PVID flag. Similar to other drivers like mt7530_port_vlan_add() which handle this case correctly, we need to test whether the VLAN we're changing used to have the BRIDGE_VLAN_INFO_PVID flag, but lost it now. That amounts to a PVID deletion and should be treated as such. Regarding blame attribution: this never worked properly since the introduction of bridge VLAN filtering in commit 7142529f1688 ("net: mscc: ocelot: add VLAN filtering"). However, there was a significant paradigm shift which aligned the ANA_PORT_DROP_CFG register with the PVID concept rather than with the native VLAN concept, and that change wasn't targeted for 'stable'. Realistically, that is as far as this fix needs to be propagated to. Fixes: be0576fed6d3 ("net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250424223734.3096202-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index ef93df5208871..08bee56aea35f 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -830,6 +830,7 @@ EXPORT_SYMBOL(ocelot_vlan_prepare); int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged) { + struct ocelot_port *ocelot_port = ocelot->ports[port]; int err; /* Ignore VID 0 added to our RX filter by the 8021q module, since @@ -849,6 +850,11 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, ocelot_bridge_vlan_find(ocelot, vid)); if (err) return err; + } else if (ocelot_port->pvid_vlan && + ocelot_bridge_vlan_find(ocelot, vid) == ocelot_port->pvid_vlan) { + err = ocelot_port_set_pvid(ocelot, port, NULL); + if (err) + return err; } /* Untagged egress vlan clasification */ From bf9de1dcd0eecd16020a677c900a70ea9b0a9714 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Apr 2025 01:37:34 +0300 Subject: [PATCH 505/770] selftests: net: bridge_vlan_aware: test untagged/8021p-tagged with and without PVID Recent discussions around commit ad1afb003939 ("vlan_dev: VLAN 0 should be treated as "no vlan tag" (802.1p packet)") have sparked the question what happens with the DSA (and possibly other switchdev) data path when the bridge says that ports should have no PVID VLAN, but the 8021q module, as the result of a NETDEV_UP event, decides it should add VID 0 to the RX filter of those bridge ports. Do those bridge ports receive packets tagged with VID 0 or not, now? We don't know, there is no test. In the veth realm, this passes trivially, because veth is not VLAN filtering and this, the 8021q module lacks the instinct to add VID 0 in the first place. 
In the realm of VLAN filtering NICs with no switchdev offload, this should also pass, because the VLAN groups of the software bridge are consulted, where it can clearly be seen that a PVID is missing, even though the packet was initially accepted by the NIC. The test only poses a challenge for switchdev drivers, which usually have to program to hardware both VLANs from RX filtering, as well as from switchdev. Especially when a switchdev port joins a VLAN-aware bridge, it is unavoidable that it gains the NETIF_F_HW_VLAN_CTAG_FILTER feature, i.e. any 8021q uppers that the bridge port may have must also be committed to the RX filtering table of the interface. When a VLAN-tagged packet is physically received by the port, it is initially indistinguishable whether it will reach the bridge data path or the 8021q upper data path. That is rather the final step of the new tests that we introduce. We need to build context up to that stage, which means the following: - we need to test that 802.1p (VID 0) tagged traffic is received in the first place (on bridge ports with a valid PVID). This is the "8021p" test. - we need to test that the usual paths of reaching a configuration with no PVID on a bridge port are all covered and they all reach the same state. Signed-off-by: Vladimir Oltean Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250424223734.3096202-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../net/forwarding/bridge_vlan_aware.sh | 96 ++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh index 90f8a244ea901..e59fba366a0a6 100755 --- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh +++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh @@ -1,7 +1,7 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid" +ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn other_tpid 8021p drop_untagged" NUM_NETIFS=4 CHECK_TC="yes" source lib.sh @@ -194,6 +194,100 @@ other_tpid() tc qdisc del dev $h2 clsact } +8021p_do() +{ + local should_fail=$1; shift + local mac=de:ad:be:ef:13:37 + + tc filter add dev $h2 ingress protocol all pref 1 handle 101 \ + flower dst_mac $mac action drop + + $MZ -q $h1 -c 1 -b $mac -a own "81:00 00:00 08:00 aa-aa-aa-aa-aa-aa-aa-aa-aa" + sleep 1 + + tc -j -s filter show dev $h2 ingress \ + | jq -e ".[] | select(.options.handle == 101) \ + | select(.options.actions[0].stats.packets == 1)" &> /dev/null + check_err_fail $should_fail $? "802.1p-tagged reception" + + tc filter del dev $h2 ingress pref 1 +} + +8021p() +{ + RET=0 + + tc qdisc add dev $h2 clsact + ip link set $h2 promisc on + + # Test that with the default_pvid, 1, packets tagged with VID 0 are + # accepted. + 8021p_do 0 + + # Test that packets tagged with VID 0 are still accepted after changing + # the default_pvid. + ip link set br0 type bridge vlan_default_pvid 10 + 8021p_do 0 + + log_test "Reception of 802.1p-tagged traffic" + + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + +send_untagged_and_8021p() +{ + ping_do $h1 192.0.2.2 + check_fail $? + + 8021p_do 1 +} + +drop_untagged() +{ + RET=0 + + tc qdisc add dev $h2 clsact + ip link set $h2 promisc on + + # Test that with no PVID, untagged and 802.1p-tagged traffic is + # dropped. 
+ ip link set br0 type bridge vlan_default_pvid 1 + + # First we reconfigure the default_pvid, 1, as a non-PVID VLAN. + bridge vlan add dev $swp1 vid 1 untagged + send_untagged_and_8021p + bridge vlan add dev $swp1 vid 1 pvid untagged + + # Next we try to delete VID 1 altogether + bridge vlan del dev $swp1 vid 1 + send_untagged_and_8021p + bridge vlan add dev $swp1 vid 1 pvid untagged + + # Set up the bridge without a default_pvid, then check that the 8021q + # module, when the bridge port goes down and then up again, does not + # accidentally re-enable untagged packet reception. + ip link set br0 type bridge vlan_default_pvid 0 + ip link set $swp1 down + ip link set $swp1 up + setup_wait + send_untagged_and_8021p + + # Remove swp1 as a bridge port and let it rejoin the bridge while it + # has no default_pvid. + ip link set $swp1 nomaster + ip link set $swp1 master br0 + send_untagged_and_8021p + + # Restore settings + ip link set br0 type bridge vlan_default_pvid 1 + + log_test "Dropping of untagged and 802.1p-tagged traffic with no PVID" + + ip link set $h2 promisc off + tc qdisc del dev $h2 clsact +} + trap cleanup EXIT setup_prepare From 765f253e28909f161b0211f85cf0431cfee7d6df Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Thu, 24 Apr 2025 16:00:28 +0200 Subject: [PATCH 506/770] Revert "rndis_host: Flag RNDIS modems as WWAN devices" This reverts commit 67d1a8956d2d62fe6b4c13ebabb57806098511d8. Since this commit has been proven to be problematic for the setup of USB-tethered ethernet connections and the related breakage is very noticeable for users it should be reverted until a fixed version of the change can be rolled out. Closes: https://lore.kernel.org/all/e0df2d85-1296-4317-b717-bd757e3ab928@heusel.eu/ Link: https://chaos.social/@gromit/114377862699921553 Link: https://bugzilla.kernel.org/show_bug.cgi?id=220002 Link: https://bugs.gentoo.org/953555 Link: https://bbs.archlinux.org/viewtopic.php?id=304892 Cc: stable@vger.kernel.org Acked-by: Lubomir Rintel Signed-off-by: Christian Heusel Link: https://patch.msgid.link/20250424-usb-tethering-fix-v1-1-b65cf97c740e@heusel.eu Signed-off-by: Jakub Kicinski --- drivers/net/usb/rndis_host.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index bb0bf14158727..7b3739b29c8f7 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -630,16 +630,6 @@ static const struct driver_info zte_rndis_info = { .tx_fixup = rndis_tx_fixup, }; -static const struct driver_info wwan_rndis_info = { - .description = "Mobile Broadband RNDIS device", - .flags = FLAG_WWAN | FLAG_POINTTOPOINT | FLAG_FRAMING_RN | FLAG_NO_SETINT, - .bind = rndis_bind, - .unbind = rndis_unbind, - .status = rndis_status, - .rx_fixup = rndis_rx_fixup, - .tx_fixup = rndis_tx_fixup, -}; - /*-------------------------------------------------------------------------*/ static const struct usb_device_id products [] = { @@ -676,11 +666,9 @@ static const struct usb_device_id products [] = { USB_INTERFACE_INFO(USB_CLASS_WIRELESS_CONTROLLER, 1, 3), .driver_info = (unsigned long) &rndis_info, }, { - /* Mobile Broadband Modem, seen in Novatel Verizon USB730L and - * Telit FN990A (RNDIS) - */ + /* Novatel Verizon USB730L */ USB_INTERFACE_INFO(USB_CLASS_MISC, 4, 1), - .driver_info = (unsigned long)&wwan_rndis_info, + .driver_info = (unsigned long) &rndis_info, }, { }, // END }; From 8548c84c004be3da4ffbe35ed0589041a4050c03 Mon Sep 17 00:00:00 2001 From: Sathesh B Edara Date: Thu, 
24 Apr 2025 06:39:44 -0700 Subject: [PATCH 507/770] octeon_ep_vf: Resolve netdevice usage count issue The netdevice usage count increases during transmit queue timeouts because netdev_hold is called in ndo_tx_timeout, scheduling a task to reinitialize the card. Although netdev_put is called at the end of the scheduled work, rtnl_unlock checks the reference count during cleanup. This could cause issues if transmit timeout is called on multiple queues. Fixes: cb7dd712189f ("octeon_ep_vf: Add driver framework and device initialization") Signed-off-by: Sathesh B Edara Link: https://patch.msgid.link/20250424133944.28128-1-sedara@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c index 18c922dd5fc64..ccb69bc5c9529 100644 --- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_main.c @@ -835,7 +835,9 @@ static void octep_vf_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct octep_vf_device *oct = netdev_priv(netdev); netdev_hold(netdev, NULL, GFP_ATOMIC); - schedule_work(&oct->tx_timeout_task); + if (!schedule_work(&oct->tx_timeout_task)) + netdev_put(netdev, NULL); + } static int octep_vf_set_mac(struct net_device *netdev, void *p) From 8f7ae5a85137b913cb97e2d24409d36548d0bab1 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Thu, 24 Apr 2025 05:55:47 -0700 Subject: [PATCH 508/770] bnxt_en: improve TX timestamping FIFO configuration Reconfiguration of netdev may trigger close/open procedure which can break FIFO status by adjusting the amount of empty slots for TX timestamps. But it is not really needed because timestamps for the packets sent over the wire still can be retrieved. On the other side, during netdev close procedure any skbs waiting for TX timestamps can be leaked because there is no cleaning procedure called. Free skbs waiting for TX timestamps when closing netdev. 
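Mechanically, the cleanup is a classic producer/consumer ring drain: walk from the consumer index to the producer index, release anything still parked, and restore the full budget of free slots. A standalone sketch of that walk, assuming a power-of-two ring size and hypothetical names modeled on txts_cons/txts_prod (not the driver's structures):

```
#include <stdio.h>

#define MAX_TX_TS 4
#define NEXT_TS(i) (((i) + 1) & (MAX_TX_TS - 1))

struct ts_ring {
	int pending[MAX_TX_TS];	/* stand-in for skbs awaiting timestamps */
	unsigned int cons, prod;
};

/* Walk cons..prod with wraparound, freeing every entry still
 * waiting for a timestamp, then mark the ring empty so the full
 * budget of slots is available again. */
static void drain_ts_ring(struct ts_ring *r)
{
	unsigned int cons = r->cons;

	while (cons != r->prod) {
		if (r->pending[cons]) {
			printf("freeing pending entry %u\n", cons);
			r->pending[cons] = 0;
		}
		cons = NEXT_TS(cons);
	}
	r->cons = cons;	/* avail slots back to MAX_TX_TS */
}

int main(void)
{
	struct ts_ring r = { .pending = { 1, 1 }, .cons = 0, .prod = 2 };

	drain_ts_ring(&r);
	return 0;
}
```

Doing this drain at close, instead of blindly resetting tx_avail at open, is what prevents both the skb leak and the FIFO accounting drift the message describes.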
Fixes: 8aa2a79e9b95 ("bnxt_en: Increase the max total outstanding PTP TX packets to 4") Reviewed-by: Michael Chan Reviewed-by: Pavan Chebbi Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20250424125547.460632-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 29 ++++++++++++++----- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 1 + 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c8e3468eee612..2c8e2c19d8548 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3414,6 +3414,9 @@ static void bnxt_free_tx_skbs(struct bnxt *bp) bnxt_free_one_tx_ring_skbs(bp, txr, i); } + + if (bp->ptp_cfg && !(bp->fw_cap & BNXT_FW_CAP_TX_TS_CMP)) + bnxt_ptp_free_txts_skbs(bp->ptp_cfg); } static void bnxt_free_one_rx_ring(struct bnxt *bp, struct bnxt_rx_ring_info *rxr) @@ -12797,8 +12800,6 @@ static int __bnxt_open_nic(struct bnxt *bp, bool irq_re_init, bool link_re_init) /* VF-reps may need to be re-opened after the PF is re-opened */ if (BNXT_PF(bp)) bnxt_vf_reps_open(bp); - if (bp->ptp_cfg && !(bp->fw_cap & BNXT_FW_CAP_TX_TS_CMP)) - WRITE_ONCE(bp->ptp_cfg->tx_avail, BNXT_MAX_TX_TS); bnxt_ptp_init_rtc(bp, true); bnxt_ptp_cfg_tstamp_filters(bp); if (BNXT_SUPPORTS_MULTI_RSS_CTX(bp)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 2d4e19b96ee74..0669d43472f51 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -794,6 +794,27 @@ static long bnxt_ptp_ts_aux_work(struct ptp_clock_info *ptp_info) return HZ; } +void bnxt_ptp_free_txts_skbs(struct bnxt_ptp_cfg *ptp) +{ + struct bnxt_ptp_tx_req *txts_req; + u16 cons = ptp->txts_cons; + + /* make sure ptp aux worker finished with + * possible BNXT_STATE_OPEN set + */ + ptp_cancel_worker_sync(ptp->ptp_clock); + + ptp->tx_avail = BNXT_MAX_TX_TS; + while (cons != ptp->txts_prod) { + txts_req = &ptp->txts_req[cons]; + if (!IS_ERR_OR_NULL(txts_req->tx_skb)) + dev_kfree_skb_any(txts_req->tx_skb); + cons = NEXT_TXTS(cons); + } + ptp->txts_cons = cons; + ptp_schedule_worker(ptp->ptp_clock, 0); +} + int bnxt_ptp_get_txts_prod(struct bnxt_ptp_cfg *ptp, u16 *prod) { spin_lock_bh(&ptp->ptp_tx_lock); @@ -1105,7 +1126,6 @@ int bnxt_ptp_init(struct bnxt *bp) void bnxt_ptp_clear(struct bnxt *bp) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; - int i; if (!ptp) return; @@ -1117,12 +1137,5 @@ void bnxt_ptp_clear(struct bnxt *bp) kfree(ptp->ptp_info.pin_config); ptp->ptp_info.pin_config = NULL; - for (i = 0; i < BNXT_MAX_TX_TS; i++) { - if (ptp->txts_req[i].tx_skb) { - dev_kfree_skb_any(ptp->txts_req[i].tx_skb); - ptp->txts_req[i].tx_skb = NULL; - } - } - bnxt_unmap_ptp_regs(bp); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index a95f05e9c579b..0481161d26ef5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -162,6 +162,7 @@ int bnxt_ptp_cfg_tstamp_filters(struct bnxt *bp); void bnxt_ptp_reapply_pps(struct bnxt *bp); int bnxt_hwtstamp_set(struct net_device *dev, struct ifreq *ifr); int bnxt_hwtstamp_get(struct net_device *dev, struct ifreq *ifr); +void bnxt_ptp_free_txts_skbs(struct bnxt_ptp_cfg *ptp); int bnxt_ptp_get_txts_prod(struct bnxt_ptp_cfg *ptp, u16 *prod); void 
bnxt_get_tx_ts_p5(struct bnxt *bp, struct sk_buff *skb, u16 prod); int bnxt_get_rx_ts_p5(struct bnxt *bp, u64 *ts, u32 pkt_ts); From 68f9d8974b545668e1be2422240b25a92e304b14 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Thu, 24 Apr 2025 12:04:44 +0800 Subject: [PATCH 509/770] rtase: Modify the condition used to detect overflow in rtase_calc_time_mitigation Fix the following compile error reported by the kernel test robot by modifying the condition used to detect overflow in rtase_calc_time_mitigation. In file included from include/linux/mdio.h:10:0, from drivers/net/ethernet/realtek/rtase/rtase_main.c:58: In function 'u16_encode_bits', inlined from 'rtase_calc_time_mitigation.constprop' at drivers/net/ ethernet/realtek/rtase/rtase_main.c:1915:13, inlined from 'rtase_init_software_variable.isra.41' at drivers/net/ ethernet/realtek/rtase/rtase_main.c:1961:13, inlined from 'rtase_init_one' at drivers/net/ethernet/realtek/ rtase/rtase_main.c:2111:2: >> include/linux/bitfield.h:178:3: error: call to '__field_overflow' declared with attribute error: value doesn't fit into mask __field_overflow(); \ ^~~~~~~~~~~~~~~~~~ include/linux/bitfield.h:198:2: note: in expansion of macro '____MAKE_OP' ____MAKE_OP(u##size,u##size,,) ^~~~~~~~~~~ include/linux/bitfield.h:200:1: note: in expansion of macro '__MAKE_OP' __MAKE_OP(16) ^~~~~~~~~ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503182158.nkAlbJWX-lkp@intel.com/ Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Justin Lai Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250424040444.5530-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 2aacc1996796d..55b8d36661530 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1925,8 +1925,8 @@ static u16 rtase_calc_time_mitigation(u32 time_us) time_us = min_t(int, time_us, RTASE_MITI_MAX_TIME); - msb = fls(time_us); - if (msb >= RTASE_MITI_COUNT_BIT_NUM) { + if (time_us > RTASE_MITI_TIME_COUNT_MASK) { + msb = fls(time_us); time_unit = msb - RTASE_MITI_COUNT_BIT_NUM; time_count = time_us >> (msb - RTASE_MITI_COUNT_BIT_NUM); } else { From 6fe0866014486736cc3ba1c6fd4606d3dbe55c9c Mon Sep 17 00:00:00 2001 From: Louis-Alexis Eyraud Date: Thu, 24 Apr 2025 10:38:48 +0200 Subject: [PATCH 510/770] net: ethernet: mtk-star-emac: fix spinlock recursion issues on rx/tx poll Use spin_lock_irqsave and spin_unlock_irqrestore instead of spin_lock and spin_unlock in mtk_star_emac driver to avoid spinlock recursion occurrence that can happen when enabling the DMA interrupts again in rx/tx poll. 
``` BUG: spinlock recursion on CPU#0, swapper/0/0 lock: 0xffff00000db9cf20, .magic: dead4ead, .owner: swapper/0/0, .owner_cpu: 0 CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.15.0-rc2-next-20250417-00001-gf6a27738686c-dirty #28 PREEMPT Hardware name: MediaTek MT8365 Open Platform EVK (DT) Call trace: show_stack+0x18/0x24 (C) dump_stack_lvl+0x60/0x80 dump_stack+0x18/0x24 spin_dump+0x78/0x88 do_raw_spin_lock+0x11c/0x120 _raw_spin_lock+0x20/0x2c mtk_star_handle_irq+0xc0/0x22c [mtk_star_emac] __handle_irq_event_percpu+0x48/0x140 handle_irq_event+0x4c/0xb0 handle_fasteoi_irq+0xa0/0x1bc handle_irq_desc+0x34/0x58 generic_handle_domain_irq+0x1c/0x28 gic_handle_irq+0x4c/0x120 do_interrupt_handler+0x50/0x84 el1_interrupt+0x34/0x68 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x6c/0x70 regmap_mmio_read32le+0xc/0x20 (P) _regmap_bus_reg_read+0x6c/0xac _regmap_read+0x60/0xdc regmap_read+0x4c/0x80 mtk_star_rx_poll+0x2f4/0x39c [mtk_star_emac] __napi_poll+0x38/0x188 net_rx_action+0x164/0x2c0 handle_softirqs+0x100/0x244 __do_softirq+0x14/0x20 ____do_softirq+0x10/0x20 call_on_irq_stack+0x24/0x64 do_softirq_own_stack+0x1c/0x40 __irq_exit_rcu+0xd4/0x10c irq_exit_rcu+0x10/0x1c el1_interrupt+0x38/0x68 el1h_64_irq_handler+0x18/0x24 el1h_64_irq+0x6c/0x70 cpuidle_enter_state+0xac/0x320 (P) cpuidle_enter+0x38/0x50 do_idle+0x1e4/0x260 cpu_startup_entry+0x34/0x3c rest_init+0xdc/0xe0 console_on_rootfs+0x0/0x6c __primary_switched+0x88/0x90 ``` Fixes: 0a8bd81fd6aa ("net: ethernet: mtk-star-emac: separate tx/rx handling with two NAPIs") Signed-off-by: Louis-Alexis Eyraud Reviewed-by: Maxime Chevallier Acked-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250424-mtk_star_emac-fix-spinlock-recursion-issue-v2-1-f3fde2e529d8@collabora.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 76f202d7f0553..23115881d8e89 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1163,6 +1163,7 @@ static int mtk_star_tx_poll(struct napi_struct *napi, int budget) struct net_device *ndev = priv->ndev; unsigned int head = ring->head; unsigned int entry = ring->tail; + unsigned long flags; while (entry != head && count < (MTK_STAR_RING_NUM_DESCS - 1)) { ret = mtk_star_tx_complete_one(priv); @@ -1182,9 +1183,9 @@ static int mtk_star_tx_poll(struct napi_struct *napi, int budget) netif_wake_queue(ndev); if (napi_complete(napi)) { - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); mtk_star_enable_dma_irq(priv, false, true); - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); } return 0; @@ -1341,6 +1342,7 @@ static int mtk_star_rx(struct mtk_star_priv *priv, int budget) static int mtk_star_rx_poll(struct napi_struct *napi, int budget) { struct mtk_star_priv *priv; + unsigned long flags; int work_done = 0; priv = container_of(napi, struct mtk_star_priv, rx_napi); @@ -1348,9 +1350,9 @@ static int mtk_star_rx_poll(struct napi_struct *napi, int budget) work_done = mtk_star_rx(priv, budget); if (work_done < budget) { napi_complete_done(napi, work_done); - spin_lock(&priv->lock); + spin_lock_irqsave(&priv->lock, flags); mtk_star_enable_dma_irq(priv, true, false); - spin_unlock(&priv->lock); + spin_unlock_irqrestore(&priv->lock, flags); } return work_done; From e54b4db35e201a9173da9cb7abc8377e12abaf87 Mon Sep 17 00:00:00 2001 From: 
Louis-Alexis Eyraud Date: Thu, 24 Apr 2025 10:38:49 +0200 Subject: [PATCH 511/770] net: ethernet: mtk-star-emac: rearm interrupts in rx_poll only when advised In the mtk_star_rx_poll function, on event processing completion, the mtk_star_emac driver calls napi_complete_done but ignores its return value and re-enables the RX DMA interrupts unconditionally. That return value indicates whether the device should avoid rearming its interrupts, so fix this behaviour by taking it into account. Fixes: 8c7bd5a454ff ("net: ethernet: mtk-star-emac: new driver") Signed-off-by: Louis-Alexis Eyraud Acked-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250424-mtk_star_emac-fix-spinlock-recursion-issue-v2-2-f3fde2e529d8@collabora.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_star_emac.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_star_emac.c b/drivers/net/ethernet/mediatek/mtk_star_emac.c index 23115881d8e89..b175119a6a7da 100644 --- a/drivers/net/ethernet/mediatek/mtk_star_emac.c +++ b/drivers/net/ethernet/mediatek/mtk_star_emac.c @@ -1348,8 +1348,7 @@ static int mtk_star_rx_poll(struct napi_struct *napi, int budget) priv = container_of(napi, struct mtk_star_priv, rx_napi); work_done = mtk_star_rx(priv, budget); - if (work_done < budget) { - napi_complete_done(napi, work_done); + if (work_done < budget && napi_complete_done(napi, work_done)) { spin_lock_irqsave(&priv->lock, flags); mtk_star_enable_dma_irq(priv, true, false); spin_unlock_irqrestore(&priv->lock, flags); From 5a2a6c428190f945c5cbf5791f72dbea83e97f66 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Tue, 15 Apr 2025 00:17:16 -0400 Subject: [PATCH 512/770] dm: always update the array size in realloc_argv on success realloc_argv() was only updating the array size if it was called with old_argv already allocated. The first time it was called to create an argv array, it would allocate the array but return the array size as zero. dm_split_args() would think that it couldn't store any arguments in the array and would call realloc_argv() again, causing it to reallocate the initial slots (this time using GFP_KERNEL) and finally return a size. Aside from being wasteful, this could cause deadlocks on targets that need to process messages without starting new IO. Instead, realloc_argv should always update the allocated array size on success.
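[Editor's note] The corrected contract is easiest to see in a small userspace analogue (a sketch, not the dm code; grow_argv and the growth constants are illustrative):

```c
#include <stdlib.h>
#include <string.h>

/* On every successful allocation, report the new capacity through
 * *size, whether or not an old array existed. Only the previously
 * valid entries are copied over.
 */
static char **grow_argv(unsigned int *size, char **old_argv)
{
	unsigned int old_size = *size;
	unsigned int new_size = old_size ? old_size * 2 : 8;
	char **argv = malloc(new_size * sizeof(*argv));

	if (argv) {
		*size = new_size;
		if (old_argv)
			memcpy(argv, old_argv, old_size * sizeof(*argv));
	}
	free(old_argv);
	return argv;
}
```

With this contract, the very first successful call already reports a usable capacity, so the caller never has to invoke the function a second time just to learn the size.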
Fixes: a0651926553c ("dm table: don't copy from a NULL pointer in realloc_argv()") Cc: stable@vger.kernel.org Signed-off-by: Benjamin Marzinski Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 53759dbbe9d60..9e175c5e0634b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -523,9 +523,10 @@ static char **realloc_argv(unsigned int *size, char **old_argv) gfp = GFP_NOIO; } argv = kmalloc_array(new_size, sizeof(*argv), gfp); - if (argv && old_argv) { - memcpy(argv, old_argv, *size * sizeof(*argv)); + if (argv) { *size = new_size; + if (old_argv) + memcpy(argv, old_argv, *size * sizeof(*argv)); } kfree(old_argv); From 5c3bf6cba7911f470afd748606be5c03a9512fcc Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 24 Apr 2025 04:22:38 +0000 Subject: [PATCH 513/770] bonding: assign random address if device address is same as bond This change addresses a MAC address conflict issue in failover scenarios, similar to the problem described in commit a951bc1e6ba5 ("bonding: correct the MAC address for 'follow' fail_over_mac policy"). In fail_over_mac=follow mode, the bonding driver expects the formerly active slave to swap MAC addresses with the newly active slave during failover. However, under certain conditions, two slaves may end up with the same MAC address, which breaks this policy: 1) ip link set eth0 master bond0 -> bond0 adopts eth0's MAC address (MAC0). 2) ip link set eth1 master bond0 -> eth1 is added as a backup with its own MAC (MAC1). 3) ip link set eth0 nomaster -> eth0 is released and restores its MAC (MAC0). -> eth1 becomes the active slave, and bond0 assigns MAC0 to eth1. 4) ip link set eth0 master bond0 -> eth0 is re-added to bond0, now both eth0 and eth1 have MAC0. This results in a MAC address conflict and violates the expected behavior of the failover policy. To fix this, we assign a random MAC address to any newly added slave if its current MAC address matches that of the bond. The original (permanent) MAC address is saved and will be restored when the device is released from the bond. This ensures that each slave has a unique MAC address during failover transitions, preserving the integrity of the fail_over_mac=follow policy. Fixes: 3915c1e8634a ("bonding: Add "follow" option to fail_over_mac") Signed-off-by: Hangbin Liu Acked-by: Jay Vosburgh Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 665b9af684fcc..d05226484c64d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2118,15 +2118,26 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, * set the master's mac address to that of the first slave */ memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); - ss.ss_family = slave_dev->type; - res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, - extack); - if (res) { - slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); - goto err_restore_mtu; - } + } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && + BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { + /* Set slave to random address to avoid duplicate mac + * address in later fail over. 
+ */ + eth_random_addr(ss.__data); + } else { + goto skip_mac_set; } + ss.ss_family = slave_dev->type; + res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, extack); + if (res) { + slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); + goto err_restore_mtu; + } + +skip_mac_set: + /* set no_addrconf flag before open to prevent IPv6 addrconf */ slave_dev->priv_flags |= IFF_NO_ADDRCONF; From f438eee2c8c93320d70cdee7630a06baaa47a304 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 24 Apr 2025 15:22:07 +0800 Subject: [PATCH 514/770] net: stmmac: dwmac-loongson: Move queue number init to common function Currently, the tx and rx queue number initialization is duplicated in loongson_gmac_data() and loongson_gnet_data(), so move it to the common function loongson_default_data(). This is a preparation for later patches. Reviewed-by: Yanteng Si Tested-by: Henry Chen Tested-by: Biao Dong Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen Signed-off-by: David S. Miller --- .../ethernet/stmicro/stmmac/dwmac-loongson.c | 40 +++++-------------- 1 file changed, 9 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 1a93787056a77..2fb7a137b312d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -83,6 +83,8 @@ struct stmmac_pci_info { static void loongson_default_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { + struct loongson_data *ld = plat->bsp_priv; + /* Get bus_id, this can be overwritten later */ plat->bus_id = pci_dev_id(pdev); @@ -116,17 +118,6 @@ static void loongson_default_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; -} - -static int loongson_gmac_data(struct pci_dev *pdev, - struct plat_stmmacenet_data *plat) -{ - struct loongson_data *ld; - int i; - - ld = plat->bsp_priv; - - loongson_default_data(pdev, plat); if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) { plat->rx_queues_to_use = CHANNEL_NUM; @@ -135,12 +126,18 @@ static int loongson_gmac_data(struct pci_dev *pdev, /* Only channel 0 supports checksum, * so turn off checksum to enable multiple channels. */ - for (i = 1; i < CHANNEL_NUM; i++) + for (int i = 1; i < CHANNEL_NUM; i++) plat->tx_queues_cfg[i].coe_unsupported = 1; } else { plat->tx_queues_to_use = 1; plat->rx_queues_to_use = 1; } +} + +static int loongson_gmac_data(struct pci_dev *pdev, + struct plat_stmmacenet_data *plat) +{ + loongson_default_data(pdev, plat); plat->phy_interface = PHY_INTERFACE_MODE_RGMII_ID; @@ -172,27 +169,8 @@ static void loongson_gnet_fix_speed(void *priv, int speed, unsigned int mode) static int loongson_gnet_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { - struct loongson_data *ld; - int i; - - ld = plat->bsp_priv; - loongson_default_data(pdev, plat); - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) { - plat->rx_queues_to_use = CHANNEL_NUM; - plat->tx_queues_to_use = CHANNEL_NUM; - - /* Only channel 0 supports checksum, - * so turn off checksum to enable multiple channels. 
- */ - for (i = 1; i < CHANNEL_NUM; i++) - plat->tx_queues_cfg[i].coe_unsupported = 1; - } else { - plat->tx_queues_to_use = 1; - plat->rx_queues_to_use = 1; - } - plat->phy_interface = PHY_INTERFACE_MODE_GMII; plat->mdio_bus_data->phy_mask = ~(u32)BIT(2); plat->fix_mac_speed = loongson_gnet_fix_speed; From 6fba40e7f61033a2350bd812c5faa1d30333eefd Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 24 Apr 2025 09:12:20 +0200 Subject: [PATCH 515/770] net: stmmac: socfpga: Enable internal GMII when using 1000BaseX Dwmac Socfpga may be used with an instance of a Lynx / Altera TSE PCS, in which case it gains support for 1000BaseX. It appears that the PCS is wired to the MAC through an internal GMII bus. Make sure that we enable the GMII_MII mode for the internal MAC when using 1000BaseX. Reviewed-by: Russell King (Oracle) Signed-off-by: Maxime Chevallier Link: https://patch.msgid.link/20250424071223.221239-2-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 59f90b123c5b8..027356033e5e0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -256,6 +256,7 @@ static int socfpga_set_phy_mode_common(int phymode, u32 *val) case PHY_INTERFACE_MODE_MII: case PHY_INTERFACE_MODE_GMII: case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_1000BASEX: *val = SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_GMII_MII; break; case PHY_INTERFACE_MODE_RMII: From 3bf19459da62113122cfb2b1305677b034529742 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 24 Apr 2025 09:12:21 +0200 Subject: [PATCH 516/770] net: stmmac: socfpga: Don't check for phy to enable the SGMII adapter The SGMII adapter needs to be enabled for both Cisco SGMII and 1000BaseX operations. It doesn't make sense to check for an attached phydev here, as we simply might not have any, in particular if we're using the 1000BaseX interface mode. Make so that we only re-enable the SGMII adapter when it's present, and when we use a phy_mode that is handled by said adapter. 
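[Editor's note] The resulting rule is compact. Sketched out of context (phy_interface stands in for priv->plat->phy_interface; the register write matches the diff below):

```c
/* Enable the SGMII adapter only for the interface modes it actually
 * handles, independent of whether a phydev is attached -- in
 * 1000BaseX mode there may be none.
 */
if (sgmii_adapter_base &&
    (phy_interface == PHY_INTERFACE_MODE_SGMII ||
     phy_interface == PHY_INTERFACE_MODE_1000BASEX))
	writew(SGMII_ADAPTER_ENABLE,
	       sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG);
```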
Signed-off-by: Maxime Chevallier Link: https://patch.msgid.link/20250424071223.221239-3-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 027356033e5e0..c832a41c17473 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -62,14 +62,13 @@ struct socfpga_dwmac { struct mdio_device *pcs_mdiodev; }; -static void socfpga_dwmac_fix_mac_speed(void *priv, int speed, unsigned int mode) +static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, int speed, + unsigned int mode) { - struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)priv; + struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)bsp_priv; + struct stmmac_priv *priv = netdev_priv(dev_get_drvdata(dwmac->dev)); void __iomem *splitter_base = dwmac->splitter_base; void __iomem *sgmii_adapter_base = dwmac->sgmii_adapter_base; - struct device *dev = dwmac->dev; - struct net_device *ndev = dev_get_drvdata(dev); - struct phy_device *phy_dev = ndev->phydev; u32 val; if (sgmii_adapter_base) @@ -96,7 +95,9 @@ static void socfpga_dwmac_fix_mac_speed(void *priv, int speed, unsigned int mode writel(val, splitter_base + EMAC_SPLITTER_CTRL_REG); } - if (phy_dev && sgmii_adapter_base) + if ((priv->plat->phy_interface == PHY_INTERFACE_MODE_SGMII || + priv->plat->phy_interface == PHY_INTERFACE_MODE_1000BASEX) && + sgmii_adapter_base) writew(SGMII_ADAPTER_ENABLE, sgmii_adapter_base + SGMII_ADAPTER_CTRL_REG); } From 8fb33581bb8a9d27e021723cbccef2936c5ad671 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 24 Apr 2025 09:12:22 +0200 Subject: [PATCH 517/770] net: stmmac: socfpga: Remove unused pcs-mdiodev field When dwmac-socfpga was converted to using the Lynx PCS (previously referred to in the driver as the Altera TSE PCS), the lynx_pcs_create_mdiodev() was used to create the pcs instance. As this function didn't exist in the early versions of the series, a local mdiodev object was stored for PCS creation. It was never used, but still made it into the driver, so remove it. Reviewed-by: Russell King (Oracle) Signed-off-by: Maxime Chevallier Link: https://patch.msgid.link/20250424071223.221239-4-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index c832a41c17473..72b50f6d72f4d 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -59,7 +59,6 @@ struct socfpga_dwmac { void __iomem *sgmii_adapter_base; bool f2h_ptp_ref_clk; const struct socfpga_dwmac_ops *ops; - struct mdio_device *pcs_mdiodev; }; static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, int speed, From 2725fc2e0b6113ea9e655b77409d6129876e45e9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 24 Apr 2025 15:22:08 +0800 Subject: [PATCH 518/770] net: stmmac: dwmac-loongson: Add new multi-chan IP core support Add a new multi-chan IP core (0x12) support which is used in Loongson- 2K3000/Loongson-3B6000M. Compared with the 0x10 core, the new 0x12 core reduces channel numbers from 8 to 4, but checksum is supported for all channels. 
Add a "multichan" flag to loongson_data, so that we can simply use a "if (ld->multichan)" condition rather than the complicated condition "if (ld->loongson_id == DWMAC_CORE_MULTICHAN_V1 || ld->loongson_id == DWMAC_CORE_MULTICHAN_V2)". Reviewed-by: Andrew Lunn Tested-by: Henry Chen Tested-by: Biao Dong Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen Reviewed-by: Yanteng Si Link: https://patch.msgid.link/20250424072209.3134762-3-chenhuacai@loongson.cn Signed-off-by: Jakub Kicinski --- .../ethernet/stmicro/stmmac/dwmac-loongson.c | 62 +++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 2fb7a137b312d..57917f26ab4d6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -68,10 +68,11 @@ #define PCI_DEVICE_ID_LOONGSON_GMAC 0x7a03 #define PCI_DEVICE_ID_LOONGSON_GNET 0x7a13 -#define DWMAC_CORE_LS_MULTICHAN 0x10 /* Loongson custom ID */ -#define CHANNEL_NUM 8 +#define DWMAC_CORE_MULTICHAN_V1 0x10 /* Loongson custom ID 0x10 */ +#define DWMAC_CORE_MULTICHAN_V2 0x12 /* Loongson custom ID 0x12 */ struct loongson_data { + u32 multichan; u32 loongson_id; struct device *dev; }; @@ -119,18 +120,29 @@ static void loongson_default_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) { - plat->rx_queues_to_use = CHANNEL_NUM; - plat->tx_queues_to_use = CHANNEL_NUM; + switch (ld->loongson_id) { + case DWMAC_CORE_MULTICHAN_V1: + ld->multichan = 1; + plat->rx_queues_to_use = 8; + plat->tx_queues_to_use = 8; /* Only channel 0 supports checksum, * so turn off checksum to enable multiple channels. */ - for (int i = 1; i < CHANNEL_NUM; i++) + for (int i = 1; i < 8; i++) plat->tx_queues_cfg[i].coe_unsupported = 1; - } else { + + break; + case DWMAC_CORE_MULTICHAN_V2: + ld->multichan = 1; + plat->rx_queues_to_use = 4; + plat->tx_queues_to_use = 4; + break; + default: + ld->multichan = 0; plat->tx_queues_to_use = 1; plat->rx_queues_to_use = 1; + break; } } @@ -328,14 +340,14 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv) return NULL; /* The Loongson GMAC and GNET devices are based on the DW GMAC - * v3.50a and v3.73a IP-cores. But the HW designers have changed the - * GMAC_VERSION.SNPSVER field to the custom 0x10 value on the - * network controllers with the multi-channels feature + * v3.50a and v3.73a IP-cores. But the HW designers have changed + * the GMAC_VERSION.SNPSVER field to the custom 0x10/0x12 value + * on the network controllers with the multi-channels feature * available to emphasize the differences: multiple DMA-channels, * AV feature and GMAC_INT_STATUS CSR flags layout. Get back the * original value so the correct HW-interface would be selected. */ - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) { + if (ld->multichan) { priv->synopsys_id = DWMAC_CORE_3_70; *dma = dwmac1000_dma_ops; dma->init_chan = loongson_dwmac_dma_init_channel; @@ -356,13 +368,13 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv) if (mac->multicast_filter_bins) mac->mcast_bits_log2 = ilog2(mac->multicast_filter_bins); - /* Loongson GMAC doesn't support the flow control. LS2K2000 - * GNET doesn't support the half-duplex link mode. + /* Loongson GMAC doesn't support the flow control. Loongson GNET + * without multi-channel doesn't support the half-duplex link mode. 
*/ if (pdev->device == PCI_DEVICE_ID_LOONGSON_GMAC) { mac->link.caps = MAC_10 | MAC_100 | MAC_1000; } else { - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) + if (ld->multichan) mac->link.caps = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 | MAC_100 | MAC_1000; else @@ -391,9 +403,11 @@ static int loongson_dwmac_msi_config(struct pci_dev *pdev, struct plat_stmmacenet_data *plat, struct stmmac_resources *res) { - int i, ret, vecs; + int i, ch_num, ret, vecs; + + ch_num = min(plat->tx_queues_to_use, plat->rx_queues_to_use); - vecs = roundup_pow_of_two(CHANNEL_NUM * 2 + 1); + vecs = roundup_pow_of_two(ch_num * 2 + 1); ret = pci_alloc_irq_vectors(pdev, vecs, vecs, PCI_IRQ_MSI); if (ret < 0) { dev_warn(&pdev->dev, "Failed to allocate MSI IRQs\n"); @@ -402,14 +416,12 @@ static int loongson_dwmac_msi_config(struct pci_dev *pdev, res->irq = pci_irq_vector(pdev, 0); - for (i = 0; i < plat->rx_queues_to_use; i++) { - res->rx_irq[CHANNEL_NUM - 1 - i] = - pci_irq_vector(pdev, 1 + i * 2); + for (i = 0; i < ch_num; i++) { + res->rx_irq[ch_num - 1 - i] = pci_irq_vector(pdev, 1 + i * 2); } - for (i = 0; i < plat->tx_queues_to_use; i++) { - res->tx_irq[CHANNEL_NUM - 1 - i] = - pci_irq_vector(pdev, 2 + i * 2); + for (i = 0; i < ch_num; i++) { + res->tx_irq[ch_num - 1 - i] = pci_irq_vector(pdev, 2 + i * 2); } plat->flags |= STMMAC_FLAG_MULTI_MSI_EN; @@ -571,7 +583,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id goto err_disable_device; /* Use the common MAC IRQ if per-channel MSIs allocation failed */ - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) + if (ld->multichan) loongson_dwmac_msi_config(pdev, plat, &res); ret = stmmac_dvr_probe(&pdev->dev, plat, &res); @@ -583,7 +595,7 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id err_plat_clear: if (dev_of_node(&pdev->dev)) loongson_dwmac_dt_clear(pdev, plat); - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) + if (ld->multichan) loongson_dwmac_msi_clear(pdev); err_disable_device: pci_disable_device(pdev); @@ -602,7 +614,7 @@ static void loongson_dwmac_remove(struct pci_dev *pdev) if (dev_of_node(&pdev->dev)) loongson_dwmac_dt_clear(pdev, priv->plat); - if (ld->loongson_id == DWMAC_CORE_LS_MULTICHAN) + if (ld->multichan) loongson_dwmac_msi_clear(pdev); pci_disable_device(pdev); From ef1179f78119c243bdc276cfb819242bc35308cd Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 24 Apr 2025 15:22:09 +0800 Subject: [PATCH 519/770] net: stmmac: dwmac-loongson: Add new GMAC's PCI device ID support Add a new GMAC's PCI device ID (0x7a23) support which is used in Loongson-2K3000/Loongson-3B6000M. The new GMAC device use external PHY, so it reuses loongson_gmac_data() as the old GMAC device (0x7a03), and the new GMAC device still doesn't support flow control now. 
Reviewed-by: Andrew Lunn Reviewed-by: Yanteng Si Tested-by: Henry Chen Tested-by: Biao Dong Signed-off-by: Baoqi Zhang Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20250424072209.3134762-4-chenhuacai@loongson.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 57917f26ab4d6..e1591e6217d4b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -66,7 +66,8 @@ DMA_STATUS_TPS | DMA_STATUS_TI | \ DMA_STATUS_MSK_COMMON_LOONGSON) -#define PCI_DEVICE_ID_LOONGSON_GMAC 0x7a03 +#define PCI_DEVICE_ID_LOONGSON_GMAC1 0x7a03 +#define PCI_DEVICE_ID_LOONGSON_GMAC2 0x7a23 #define PCI_DEVICE_ID_LOONGSON_GNET 0x7a13 #define DWMAC_CORE_MULTICHAN_V1 0x10 /* Loongson custom ID 0x10 */ #define DWMAC_CORE_MULTICHAN_V2 0x12 /* Loongson custom ID 0x12 */ @@ -371,7 +372,7 @@ static struct mac_device_info *loongson_dwmac_setup(void *apriv) /* Loongson GMAC doesn't support the flow control. Loongson GNET * without multi-channel doesn't support the half-duplex link mode. */ - if (pdev->device == PCI_DEVICE_ID_LOONGSON_GMAC) { + if (pdev->device != PCI_DEVICE_ID_LOONGSON_GNET) { mac->link.caps = MAC_10 | MAC_100 | MAC_1000; } else { if (ld->multichan) @@ -659,7 +660,8 @@ static SIMPLE_DEV_PM_OPS(loongson_dwmac_pm_ops, loongson_dwmac_suspend, loongson_dwmac_resume); static const struct pci_device_id loongson_dwmac_id_table[] = { - { PCI_DEVICE_DATA(LOONGSON, GMAC, &loongson_gmac_pci_info) }, + { PCI_DEVICE_DATA(LOONGSON, GMAC1, &loongson_gmac_pci_info) }, + { PCI_DEVICE_DATA(LOONGSON, GMAC2, &loongson_gmac_pci_info) }, { PCI_DEVICE_DATA(LOONGSON, GNET, &loongson_gnet_pci_info) }, {} }; From f04dd30f1bef1ed2e74a4050af6e5e5e3869bac3 Mon Sep 17 00:00:00 2001 From: Vishal Badole Date: Thu, 24 Apr 2025 18:32:48 +0530 Subject: [PATCH 520/770] amd-xgbe: Fix to ensure dependent features are toggled with RX checksum offload According to the XGMAC specification, enabling features such as Layer 3 and Layer 4 Packet Filtering, Split Header and Virtualized Network support automatically selects the IPC Full Checksum Offload Engine on the receive side. When RX checksum offload is disabled, these dependent features must also be disabled to prevent abnormal behavior caused by mismatched feature dependencies. Ensure that toggling RX checksum offload (disabling or enabling) properly disables or enables all dependent features, maintaining consistent and expected behavior in the network device. 
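[Editor's note] The dependency rule reduces to a symmetric toggle in the set_features path. A sketch of the shape (the hw_if callback names match the diff below; enable is an illustrative flag holding the new RXCSUM state):

```c
/* Split Header and VXLAN offload imply the RX checksum engine, so they
 * must follow NETIF_F_RXCSUM; the device is then restarted so the RX
 * buffer layout matches the new configuration.
 */
if (enable) {
	hw_if->enable_sph(pdata);
	hw_if->enable_vxlan(pdata);
	hw_if->enable_rx_csum(pdata);
} else {
	hw_if->disable_sph(pdata);
	hw_if->disable_vxlan(pdata);
	hw_if->disable_rx_csum(pdata);
}
schedule_work(&pdata->restart_work);
```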
Cc: stable@vger.kernel.org Fixes: 1a510ccf5869 ("amd-xgbe: Add support for VXLAN offload capabilities") Signed-off-by: Vishal Badole Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250424130248.428865-1-Vishal.Badole@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 9 +++++++-- drivers/net/ethernet/amd/xgbe/xgbe-dev.c | 24 +++++++++++++++++++++-- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 11 +++++++++-- drivers/net/ethernet/amd/xgbe/xgbe.h | 4 ++++ 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 230726d7b74f6..d41b58fad37bb 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -373,8 +373,13 @@ static int xgbe_map_rx_buffer(struct xgbe_prv_data *pdata, } /* Set up the header page info */ - xgbe_set_buffer_data(&rdata->rx.hdr, &ring->rx_hdr_pa, - XGBE_SKB_ALLOC_SIZE); + if (pdata->netdev->features & NETIF_F_RXCSUM) { + xgbe_set_buffer_data(&rdata->rx.hdr, &ring->rx_hdr_pa, + XGBE_SKB_ALLOC_SIZE); + } else { + xgbe_set_buffer_data(&rdata->rx.hdr, &ring->rx_hdr_pa, + pdata->rx_buf_size); + } /* Set up the buffer page info */ xgbe_set_buffer_data(&rdata->rx.buf, &ring->rx_buf_pa, diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c index f393228d41c7b..f1b0fb02b3cd1 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c @@ -320,6 +320,18 @@ static void xgbe_config_sph_mode(struct xgbe_prv_data *pdata) XGMAC_IOWRITE_BITS(pdata, MAC_RCR, HDSMS, XGBE_SPH_HDSMS_SIZE); } +static void xgbe_disable_sph_mode(struct xgbe_prv_data *pdata) +{ + unsigned int i; + + for (i = 0; i < pdata->channel_count; i++) { + if (!pdata->channel[i]->rx_ring) + break; + + XGMAC_DMA_IOWRITE_BITS(pdata->channel[i], DMA_CH_CR, SPH, 0); + } +} + static int xgbe_write_rss_reg(struct xgbe_prv_data *pdata, unsigned int type, unsigned int index, unsigned int val) { @@ -3545,8 +3557,12 @@ static int xgbe_init(struct xgbe_prv_data *pdata) xgbe_config_tx_coalesce(pdata); xgbe_config_rx_buffer_size(pdata); xgbe_config_tso_mode(pdata); - xgbe_config_sph_mode(pdata); - xgbe_config_rss(pdata); + + if (pdata->netdev->features & NETIF_F_RXCSUM) { + xgbe_config_sph_mode(pdata); + xgbe_config_rss(pdata); + } + desc_if->wrapper_tx_desc_init(pdata); desc_if->wrapper_rx_desc_init(pdata); xgbe_enable_dma_interrupts(pdata); @@ -3702,5 +3718,9 @@ void xgbe_init_function_ptrs_dev(struct xgbe_hw_if *hw_if) hw_if->disable_vxlan = xgbe_disable_vxlan; hw_if->set_vxlan_id = xgbe_set_vxlan_id; + /* For Split Header*/ + hw_if->enable_sph = xgbe_config_sph_mode; + hw_if->disable_sph = xgbe_disable_sph_mode; + DBGPR("<--xgbe_init_function_ptrs\n"); } diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index d84a310dfcd40..8e09ad8fa022a 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -2257,10 +2257,17 @@ static int xgbe_set_features(struct net_device *netdev, if (ret) return ret; - if ((features & NETIF_F_RXCSUM) && !rxcsum) + if ((features & NETIF_F_RXCSUM) && !rxcsum) { + hw_if->enable_sph(pdata); + hw_if->enable_vxlan(pdata); hw_if->enable_rx_csum(pdata); - else if (!(features & NETIF_F_RXCSUM) && rxcsum) + schedule_work(&pdata->restart_work); + } else if (!(features & NETIF_F_RXCSUM) && rxcsum) { + hw_if->disable_sph(pdata); + hw_if->disable_vxlan(pdata); 
hw_if->disable_rx_csum(pdata); + schedule_work(&pdata->restart_work); + } if ((features & NETIF_F_HW_VLAN_CTAG_RX) && !rxvlan) hw_if->enable_rx_vlan_stripping(pdata); diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index d85386cac8d16..ed5d43c16d0e2 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -865,6 +865,10 @@ struct xgbe_hw_if { void (*enable_vxlan)(struct xgbe_prv_data *); void (*disable_vxlan)(struct xgbe_prv_data *); void (*set_vxlan_id)(struct xgbe_prv_data *); + + /* For Split Header */ + void (*enable_sph)(struct xgbe_prv_data *pdata); + void (*disable_sph)(struct xgbe_prv_data *pdata); }; /* This structure represents implementation specific routines for an From 8c47d5753a119f1c986bc3ed92e9178d2624e1e8 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Fri, 25 Apr 2025 05:29:53 +0100 Subject: [PATCH 521/770] net: ethernet: mtk_eth_soc: sync mtk_clks_source_name array When removing the clock bits for clocks which aren't used by the Ethernet driver their names should also have been removed from the mtk_clks_source_name array. Remove them now as enum mtk_clks_map needs to match the mtk_clks_source_name array so the driver can make sure that all required clocks are present and correctly name missing clocks. Fixes: 887b1d1adb2e ("net: ethernet: mtk_eth_soc: drop clocks unused by Ethernet driver") Signed-off-by: Daniel Golle Reviewed-by: Simon Horman Link: https://patch.msgid.link/d075e706ff1cebc07f9ec666736d0b32782fd487.1745555321.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 47807b2023104..83068925c589d 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -269,12 +269,8 @@ static const char * const mtk_clks_source_name[] = { "ethwarp_wocpu2", "ethwarp_wocpu1", "ethwarp_wocpu0", - "top_usxgmii0_sel", - "top_usxgmii1_sel", "top_sgm0_sel", "top_sgm1_sel", - "top_xfi_phy0_xtal_sel", - "top_xfi_phy1_xtal_sel", "top_eth_gmii_sel", "top_eth_refck_50m_sel", "top_eth_sys_200m_sel", From ccc25158c22bf9aaf1242182b852bb6c73e88ac3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Apr 2025 13:27:56 +0200 Subject: [PATCH 522/770] mdio: fix CONFIG_MDIO_DEVRES selects The newly added rtl9300 driver needs MDIO_DEVRES: x86_64-linux-ld: drivers/net/mdio/mdio-realtek-rtl9300.o: in function `rtl9300_mdiobus_probe': mdio-realtek-rtl9300.c:(.text+0x941): undefined reference to `devm_mdiobus_alloc_size' x86_64-linux-ld: mdio-realtek-rtl9300.c:(.text+0x9e2): undefined reference to `__devm_mdiobus_register' Since this is a hidden symbol, it needs to be selected by each user, rather than the usual 'depends on'. I see that there are a few other drivers that accidentally use 'depends on', so fix these as well for consistency and to avoid dependency loops. 
Fixes: 37f9b2a6c086 ("net: ethernet: Add missing depends on MDIO_DEVRES") Fixes: 24e31e474769 ("net: mdio: Add RTL9300 MDIO driver") Signed-off-by: Arnd Bergmann Reviewed-by: Chris Packham Link: https://patch.msgid.link/20250425112819.1645342-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/Kconfig | 3 ++- drivers/net/mdio/Kconfig | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index 6c2779047dcd5..5367e8af1e1a0 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -73,7 +73,8 @@ config FSL_ENETC_IERB config FSL_ENETC_MDIO tristate "ENETC MDIO driver" - depends on PCI && MDIO_DEVRES && MDIO_BUS + depends on PCI && MDIO_BUS + select MDIO_DEVRES help This driver supports NXP ENETC Central MDIO controller as a PCIe physical function (PF) device. diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index 38a4901da32f3..f680ed6767973 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -66,7 +66,7 @@ config MDIO_ASPEED tristate "ASPEED MDIO bus controller" depends on ARCH_ASPEED || COMPILE_TEST depends on OF_MDIO && HAS_IOMEM - depends on MDIO_DEVRES + select MDIO_DEVRES help This module provides a driver for the independent MDIO bus controllers found in the ASPEED AST2600 SoC. This is a driver for the @@ -172,7 +172,7 @@ config MDIO_IPQ4019 tristate "Qualcomm IPQ4019 MDIO interface support" depends on HAS_IOMEM && OF_MDIO depends on COMMON_CLK - depends on MDIO_DEVRES + select MDIO_DEVRES help This driver supports the MDIO interface found in Qualcomm IPQ40xx, IPQ60xx, IPQ807x and IPQ50xx series Soc-s. @@ -181,7 +181,7 @@ config MDIO_IPQ8064 tristate "Qualcomm IPQ8064 MDIO interface support" depends on HAS_IOMEM && OF_MDIO depends on MFD_SYSCON - depends on MDIO_DEVRES + select MDIO_DEVRES help This driver supports the MDIO interface found in the network interface units of the IPQ8064 SoC @@ -189,6 +189,7 @@ config MDIO_IPQ8064 config MDIO_REALTEK_RTL9300 tristate "Realtek RTL9300 MDIO interface support" depends on MACH_REALTEK_RTL || COMPILE_TEST + select MDIO_DEVRES help This driver supports the MDIO interface found in the Realtek RTL9300 family of Ethernet switches with integrated SoC. From 34dd0fecaa02d654c447d43a7e4c72f9b18b7033 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Apr 2025 16:55:31 +0200 Subject: [PATCH 523/770] net: sched: generalize check for no-queue qdisc on TX queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "noqueue" qdisc can either be directly attached, or get default attached if net_device priv_flags has IFF_NO_QUEUE. In both cases, the allocated Qdisc structure gets it's enqueue function pointer reset to NULL by noqueue_init() via noqueue_qdisc_ops. This is a common case for software virtual net_devices. For these devices with no-queue, the transmission path in __dev_queue_xmit() will bypass the qdisc layer. Directly invoking device drivers ndo_start_xmit (via dev_hard_start_xmit). In this mode the device driver is not allowed to ask for packets to be queued (either via returning NETDEV_TX_BUSY or stopping the TXQ). The simplest and most reliable way to identify this no-queue case is by checking if enqueue == NULL. The vrf driver currently open-codes this check (!qdisc->enqueue). 
While functionally correct, this low-level detail is better encapsulated in a dedicated helper for clarity and long-term maintainability. To make this behavior more explicit and reusable, this patch introduces a new helper: qdisc_txq_has_no_queue(). The helper will also be used by the veth driver in the next patch, which introduces optional qdisc-based backpressure. This is a non-functional change. Reviewed-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/174559293172.827981.7583862632045264175.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 4 +--- include/net/sch_generic.h | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 7168b33adadb0..9a4beea6ee0c2 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -343,15 +343,13 @@ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id) static bool qdisc_tx_is_default(const struct net_device *dev) { struct netdev_queue *txq; - struct Qdisc *qdisc; if (dev->num_tx_queues > 1) return false; txq = netdev_get_tx_queue(dev, 0); - qdisc = rcu_access_pointer(txq->qdisc); - return !qdisc->enqueue; + return qdisc_txq_has_no_queue(txq); } /* Local traffic destined to local address. Reinsert the packet to rx diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd0..b6c177f7141c0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { From dc82a33297fc2c58cb0b2b008d728668d45c0f6a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 25 Apr 2025 16:55:40 +0200 Subject: [PATCH 524/770] veth: apply qdisc backpressure on full ptr_ring to reduce TX drops In production, we're seeing TX drops on veth devices when the ptr_ring fills up. This can occur when NAPI mode is enabled, though it's relatively rare. However, with threaded NAPI - which we use in production - the drops become significantly more frequent. The underlying issue is that with threaded NAPI, the consumer often runs on a different CPU than the producer. This increases the likelihood of the ring filling up before the consumer gets scheduled, especially under load, leading to drops in veth_xmit() (ndo_start_xmit()). This patch introduces backpressure by returning NETDEV_TX_BUSY when the ring is full, signaling the qdisc layer to requeue the packet. The txq (netdev queue) is stopped in this condition and restarted once veth_poll() drains entries from the ring, ensuring coordination between NAPI and qdisc. Backpressure is only enabled when a qdisc is attached. Without a qdisc, the driver retains its original behavior - dropping packets immediately when the ring is full. This avoids unexpected behavior changes in setups without a configured qdisc. With a qdisc in place (e.g. fq, sfq) this allows Active Queue Management (AQM) to fairly schedule packets across flows and reduce collateral damage from elephant flows.
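[Editor's note] The NAPI/qdisc coordination described above is a standard stop/wake pairing; condensed from the diff below (a sketch of the two halves, not complete driver code):

```c
/* Producer side (veth_xmit) on a full ring: stop the txq, then re-check
 * ring emptiness so a consumer that drained the ring concurrently
 * cannot leave the queue stopped forever.
 */
netif_tx_stop_queue(txq);
smp_mb__after_atomic();	/* order the stop against the emptiness check */
if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
	netif_tx_wake_queue(txq);

/* Consumer side (veth_xdp_rcv) after draining entries: wake the peer
 * txq if the producer had stopped it.
 */
if (unlikely(netif_tx_queue_stopped(peer_txq)))
	netif_tx_wake_queue(peer_txq);
```

The memory barrier is what makes the pairing safe: without it, the producer's stop and its emptiness re-check could be reordered against the consumer's drain, and both sides could miss the wake.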
A known limitation of this approach is that the full ring sits in front of the qdisc layer, effectively forming a FIFO buffer that introduces base latency. While AQM still improves fairness and mitigates flow dominance, the latency impact is measurable. In hardware drivers, this issue is typically addressed using BQL (Byte Queue Limits), which tracks in-flight bytes needed based on physical link rate. However, for virtual drivers like veth, there is no fixed bandwidth constraint - the bottleneck is CPU availability and the scheduler's ability to run the NAPI thread. It is unclear how effective BQL would be in this context. This patch serves as a first step toward addressing TX drops. Future work may explore adapting a BQL-like mechanism to better suit virtual devices like veth. Reported-by: Yan Zhai Signed-off-by: Jesper Dangaard Brouer Reviewed-by: Toshiaki Makita Link: https://patch.msgid.link/174559294022.827981.1282809941662942189.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/veth.c | 57 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 7bb53961c0eaf..e58a0f1b5c5b7 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -307,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq) static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb) { - if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) { - dev_kfree_skb_any(skb); - return NET_RX_DROP; - } + if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) + return NETDEV_TX_BUSY; /* signal qdisc layer */ - return NET_RX_SUCCESS; + return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */ } static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, @@ -346,11 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); struct veth_rq *rq = NULL; - int ret = NETDEV_TX_OK; + struct netdev_queue *txq; struct net_device *rcv; int length = skb->len; bool use_napi = false; - int rxq; + int ret, rxq; rcu_read_lock(); rcv = rcu_dereference(priv->peer); @@ -373,17 +371,45 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) } skb_tx_timestamp(skb); - if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { + + ret = veth_forward_skb(rcv, skb, rq, use_napi); + switch (ret) { + case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */ if (!use_napi) dev_sw_netstats_tx_add(dev, 1, length); else __veth_xdp_flush(rq); - } else { + break; + case NETDEV_TX_BUSY: + /* If a qdisc is attached to our virtual device, returning + * NETDEV_TX_BUSY is allowed. + */ + txq = netdev_get_tx_queue(dev, rxq); + + if (qdisc_txq_has_no_queue(txq)) { + dev_kfree_skb_any(skb); + goto drop; + } + /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */ + __skb_push(skb, ETH_HLEN); + /* Depend on prior success packets started NAPI consumer via + * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped, + * paired with empty check in veth_poll(). 
+ */ + netif_tx_stop_queue(txq); + smp_mb__after_atomic(); + if (unlikely(__ptr_ring_empty(&rq->xdp_ring))) + netif_tx_wake_queue(txq); + break; + case NET_RX_DROP: /* same as NET_XMIT_DROP */ drop: atomic64_inc(&priv->dropped); ret = NET_XMIT_DROP; + break; + default: + net_crit_ratelimited("%s(%s): Invalid return code(%d)", + __func__, dev->name, ret); } - rcu_read_unlock(); return ret; @@ -874,9 +900,17 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, struct veth_xdp_tx_bq *bq, struct veth_stats *stats) { + struct veth_priv *priv = netdev_priv(rq->dev); + int queue_idx = rq->xdp_rxq.queue_index; + struct netdev_queue *peer_txq; + struct net_device *peer_dev; int i, done = 0, n_xdpf = 0; void *xdpf[VETH_XDP_BATCH]; + /* NAPI functions as RCU section */ + peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held()); + peer_txq = netdev_get_tx_queue(peer_dev, queue_idx); + for (i = 0; i < budget; i++) { void *ptr = __ptr_ring_consume(&rq->xdp_ring); @@ -925,6 +959,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget, rq->stats.vs.xdp_packets += done; u64_stats_update_end(&rq->stats.syncp); + if (unlikely(netif_tx_queue_stopped(peer_txq))) + netif_tx_wake_queue(peer_txq); + return done; } From eaa2b34db021c304697b964505cd2477c7c77eb6 Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Fri, 4 Apr 2025 17:47:51 +0800 Subject: [PATCH 525/770] netfilter: conntrack: Remove redundant NFCT_ALIGN call The "nf_ct_tmpl_alloc" function had a redundant call to "NFCT_ALIGN" when aligning the pointer "p", since "NFCT_ALIGN" always gives the same result for the same input. Signed-off-by: Xuanqiang Luo Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7f8b245e287ae..de8d50af9b5b5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -531,10 +531,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) From 149a133a548158586058d14963b4e3a699d0de70 Mon Sep 17 00:00:00 2001 From: Chen Linxuan Date: Tue, 8 Apr 2025 15:35:50 +0800 Subject: [PATCH 526/770] docs: tproxy: fix formatting for nft code block The nft command snippet for redirecting traffic isn't formatted in a literal code block like the rest of the snippets. Fix the formatting inconsistency.
Signed-off-by: Chen Linxuan Reviewed-by: Bagas Sanjaya Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/tproxy.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/tproxy.rst b/Documentation/networking/tproxy.rst index 7f7c1ff6f159e..75e4990cc3db1 100644 --- a/Documentation/networking/tproxy.rst +++ b/Documentation/networking/tproxy.rst @@ -69,9 +69,9 @@ add rules like this to the iptables ruleset above:: # iptables -t mangle -A PREROUTING -p tcp --dport 80 -j TPROXY \ --tproxy-mark 0x1/0x1 --on-port 50080 -Or the following rule to nft: +Or the following rule to nft:: -# nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept + # nft add rule filter divert tcp dport 80 tproxy to :50080 meta mark set 1 accept Note that for this to work you'll have to modify the proxy to enable (SOL_IP, IP_TRANSPARENT) for the listening socket. From 0014af802193aa3547484b5db0f1a258bad28c81 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Apr 2025 15:55:53 +0200 Subject: [PATCH 527/770] netfilter: nf_tables: export set count and backend name to userspace nf_tables picks a suitable set backend implementation (bitmap, hash, rbtree..) based on the userspace requirements. Figuring out the chosen backend requires information about the set flags and the kernel version. Export this to userspace so nft can include this information in '--debug=netlink' output. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nf_tables_api.c | 26 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 49c944e78463f..7d6bc19a0153f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -394,6 +394,8 @@ enum nft_set_field_attributes { * @NFTA_SET_HANDLE: set handle (NLA_U64) * @NFTA_SET_EXPR: set expression (NLA_NESTED: nft_expr_attributes) * @NFTA_SET_EXPRESSIONS: list of expressions (NLA_NESTED: nft_list_attributes) + * @NFTA_SET_TYPE: set backend type (NLA_STRING) + * @NFTA_SET_COUNT: number of set elements (NLA_U32) */ enum nft_set_attributes { NFTA_SET_UNSPEC, @@ -415,6 +417,8 @@ enum nft_set_attributes { NFTA_SET_HANDLE, NFTA_SET_EXPR, NFTA_SET_EXPRESSIONS, + NFTA_SET_TYPE, + NFTA_SET_COUNT, __NFTA_SET_MAX }; #define NFTA_SET_MAX (__NFTA_SET_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a133e1c175ce9..b28f6730e26d6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4569,6 +4569,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { @@ -4763,6 +4765,27 @@ static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) return size; } +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. 
+ */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4843,6 +4866,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) From 59dd07db92c166ca3947d2a1bf548d57b7f03316 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 25 Apr 2025 14:10:15 +0700 Subject: [PATCH 528/770] selftests: net: move xdp_helper to net/lib Move xdp_helper to net/lib to make it easier for other selftests to use the helper. Signed-off-by: Bui Quang Minh Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20250425071018.36078-2-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/.gitignore | 1 - tools/testing/selftests/drivers/net/Makefile | 1 - tools/testing/selftests/drivers/net/napi_id_helper.c | 2 +- tools/testing/selftests/drivers/net/queues.py | 4 ++-- tools/testing/selftests/net/lib/.gitignore | 1 + tools/testing/selftests/net/lib/Makefile | 1 + tools/testing/selftests/{drivers/net => net/lib}/ksft.h | 0 tools/testing/selftests/{drivers/net => net/lib}/xdp_helper.c | 0 8 files changed, 5 insertions(+), 5 deletions(-) rename tools/testing/selftests/{drivers/net => net/lib}/ksft.h (100%) rename tools/testing/selftests/{drivers/net => net/lib}/xdp_helper.c (100%) diff --git a/tools/testing/selftests/drivers/net/.gitignore b/tools/testing/selftests/drivers/net/.gitignore index 72d2124fd5138..d634d8395d90c 100644 --- a/tools/testing/selftests/drivers/net/.gitignore +++ b/tools/testing/selftests/drivers/net/.gitignore @@ -1,3 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only napi_id_helper -xdp_helper diff --git a/tools/testing/selftests/drivers/net/Makefile b/tools/testing/selftests/drivers/net/Makefile index 47247c2ef948b..17db31aa58c94 100644 --- a/tools/testing/selftests/drivers/net/Makefile +++ b/tools/testing/selftests/drivers/net/Makefile @@ -8,7 +8,6 @@ TEST_INCLUDES := $(wildcard lib/py/*.py) \ TEST_GEN_FILES := \ napi_id_helper \ - xdp_helper \ # end of TEST_GEN_FILES TEST_PROGS := \ diff --git a/tools/testing/selftests/drivers/net/napi_id_helper.c b/tools/testing/selftests/drivers/net/napi_id_helper.c index 7e8e7d373b612..eecd610c21095 100644 --- a/tools/testing/selftests/drivers/net/napi_id_helper.c +++ b/tools/testing/selftests/drivers/net/napi_id_helper.c @@ -8,7 +8,7 @@ #include #include -#include "ksft.h" +#include "../../net/lib/ksft.h" int main(int argc, char *argv[]) { diff --git a/tools/testing/selftests/drivers/net/queues.py b/tools/testing/selftests/drivers/net/queues.py index 06abd3f233e1f..236005290a33e 100755 --- a/tools/testing/selftests/drivers/net/queues.py +++ b/tools/testing/selftests/drivers/net/queues.py @@ -26,13 +26,13 @@ def nl_get_queues(cfg, nl, qtype='rx'): def check_xsk(cfg, nl, xdp_queue_id=0) -> None: # Probe for support - xdp = cmd(f'{cfg.test_dir / "xdp_helper"} - -', fail=False) + xdp = cmd(f'{cfg.net_lib_dir / "xdp_helper"} - -', fail=False) if xdp.ret == 255: raise KsftSkipEx('AF_XDP unsupported') elif xdp.ret > 0: raise 
KsftFailEx('unable to create AF_XDP socket') - with bkg(f'{cfg.test_dir / "xdp_helper"} {cfg.ifindex} {xdp_queue_id}', + with bkg(f'{cfg.net_lib_dir / "xdp_helper"} {cfg.ifindex} {xdp_queue_id}', ksft_wait=3): rx = tx = False diff --git a/tools/testing/selftests/net/lib/.gitignore b/tools/testing/selftests/net/lib/.gitignore index 1ebc6187f421c..bbc97d6bf5564 100644 --- a/tools/testing/selftests/net/lib/.gitignore +++ b/tools/testing/selftests/net/lib/.gitignore @@ -1,2 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only csum +xdp_helper diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index c22623b9a2a5f..88c4bc4614599 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -10,6 +10,7 @@ TEST_FILES += ../../../../net/ynl TEST_GEN_FILES += csum TEST_GEN_FILES += $(patsubst %.c,%.o,$(wildcard *.bpf.c)) +TEST_GEN_FILES += xdp_helper TEST_INCLUDES := $(wildcard py/*.py sh/*.sh) diff --git a/tools/testing/selftests/drivers/net/ksft.h b/tools/testing/selftests/net/lib/ksft.h similarity index 100% rename from tools/testing/selftests/drivers/net/ksft.h rename to tools/testing/selftests/net/lib/ksft.h diff --git a/tools/testing/selftests/drivers/net/xdp_helper.c b/tools/testing/selftests/net/lib/xdp_helper.c similarity index 100% rename from tools/testing/selftests/drivers/net/xdp_helper.c rename to tools/testing/selftests/net/lib/xdp_helper.c From 5d346179e709ea688f29b450a918cbf2ead80960 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 25 Apr 2025 14:10:16 +0700 Subject: [PATCH 529/770] selftests: net: add flag to force zerocopy mode in xdp_helper This commit adds an optional -z flag to xdp_helper. When this flag is provided, the XDP socket binding is forced to be in zerocopy mode. Signed-off-by: Bui Quang Minh Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20250425071018.36078-3-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/xdp_helper.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/lib/xdp_helper.c b/tools/testing/selftests/net/lib/xdp_helper.c index d5bb8ac33efa5..6327863cafa6d 100644 --- a/tools/testing/selftests/net/lib/xdp_helper.c +++ b/tools/testing/selftests/net/lib/xdp_helper.c @@ -17,6 +17,12 @@ #define NUM_DESC (UMEM_SZ / 2048) +static void print_usage(const char *bin) +{ + fprintf(stderr, "Usage: %s ifindex queue_id [-z]\n\n" + "where:\n\t-z: force zerocopy mode", bin); +} + /* this is a simple helper program that creates an XDP socket and does the * minimum necessary to get bind() to succeed. 
* @@ -36,8 +42,8 @@ int main(int argc, char **argv) int sock_fd; int queue; - if (argc != 3) { - fprintf(stderr, "Usage: %s ifindex queue_id\n", argv[0]); + if (argc != 3 && argc != 4) { + print_usage(argv[0]); return 1; } @@ -87,6 +93,15 @@ int main(int argc, char **argv) sxdp.sxdp_queue_id = queue; sxdp.sxdp_flags = 0; + if (argc > 3) { + if (!strcmp(argv[3], "-z")) { + sxdp.sxdp_flags = XDP_ZEROCOPY; + } else { + print_usage(argv[0]); + return 1; + } + } + if (bind(sock_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0) { munmap(umem_area, UMEM_SZ); perror("bind failed"); From b2b4555cf2a6cc4d08ddfaa181687bb7a8559a51 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 25 Apr 2025 14:10:17 +0700 Subject: [PATCH 530/770] selftests: net: retry when bind returns EBUSY in xdp_helper When binding the XDP socket, we may get EBUSY because the deferred destructor of the XDP socket in the previous test has not been executed yet. If that is the case, just sleep and retry a few times. Signed-off-by: Bui Quang Minh Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20250425071018.36078-4-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/lib/xdp_helper.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/net/lib/xdp_helper.c b/tools/testing/selftests/net/lib/xdp_helper.c index 6327863cafa6d..eb025a9f35b18 100644 --- a/tools/testing/selftests/net/lib/xdp_helper.c +++ b/tools/testing/selftests/net/lib/xdp_helper.c @@ -38,6 +38,7 @@ int main(int argc, char **argv) struct sockaddr_xdp sxdp = { 0 }; int num_desc = NUM_DESC; void *umem_area; + int retry = 0; int ifindex; int sock_fd; int queue; @@ -102,11 +103,20 @@ int main(int argc, char **argv) } } - if (bind(sock_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) != 0) { - munmap(umem_area, UMEM_SZ); - perror("bind failed"); - close(sock_fd); - return 1; + while (1) { + if (bind(sock_fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0) + break; + + if (errno == EBUSY && retry < 3) { + retry++; + sleep(1); + continue; + } else { + perror("bind failed"); + munmap(umem_area, UMEM_SZ); + close(sock_fd); + return 1; + } } ksft_ready(); From c347fb0ff844f2c72fa779c76ec8b2d7385127e6 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 25 Apr 2025 14:10:18 +0700 Subject: [PATCH 531/770] selftests: net: add a virtio_net deadlock selftest The selftest reproduces the deadlock scenario in which XDP program bind/unbind, XDP socket bind/unbind, and rx ring resize race with refill_work on a virtio_net interface. Signed-off-by: Bui Quang Minh Acked-by: Michael S.
Tsirkin Link: https://patch.msgid.link/20250425071018.36078-5-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../selftests/drivers/net/hw/xsk_reconfig.py | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/xsk_reconfig.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 07cddb19ba351..5447785c286ee 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -21,6 +21,7 @@ TEST_PROGS = \ rss_ctx.py \ rss_input_xfrm.py \ tso.py \ + xsk_reconfig.py \ # TEST_FILES := \ diff --git a/tools/testing/selftests/drivers/net/hw/xsk_reconfig.py b/tools/testing/selftests/drivers/net/hw/xsk_reconfig.py new file mode 100755 index 0000000000000..d19d1d5182081 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/xsk_reconfig.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +# This is intended to be run on a virtio-net guest interface. +# The test binds the XDP socket to the interface without setting +# the fill ring to trigger delayed refill_work. This helps to +# make it easier to reproduce the deadlock when XDP program, +# XDP socket bind/unbind, rx ring resize race with refill_work on +# the buggy kernel. +# +# The Qemu command to setup virtio-net +# -netdev tap,id=hostnet1,vhost=on,script=no,downscript=no +# -device virtio-net-pci,netdev=hostnet1,iommu_platform=on,disable-legacy=on + +from lib.py import ksft_exit, ksft_run +from lib.py import KsftSkipEx, KsftFailEx +from lib.py import NetDrvEnv +from lib.py import bkg, ip, cmd, ethtool +import time + +def _get_rx_ring_entries(cfg): + output = ethtool(f"-g {cfg.ifname}", json=True) + return output[0]["rx"] + +def setup_xsk(cfg, xdp_queue_id = 0) -> bkg: + # Probe for support + xdp = cmd(f'{cfg.net_lib_dir / "xdp_helper"} - -', fail=False) + if xdp.ret == 255: + raise KsftSkipEx('AF_XDP unsupported') + elif xdp.ret > 0: + raise KsftFailEx('unable to create AF_XDP socket') + + try: + return bkg(f'{cfg.net_lib_dir / "xdp_helper"} {cfg.ifindex} ' \ + f'{xdp_queue_id} -z', ksft_wait=3) + except: + raise KsftSkipEx('Failed to bind XDP socket in zerocopy.\n' \ + 'Please consider adding iommu_platform=on ' \ + 'when setting up virtio-net-pci') + +def check_xdp_bind(cfg): + with setup_xsk(cfg): + ip(f"link set dev %s xdp obj %s sec xdp" % + (cfg.ifname, cfg.net_lib_dir / "xdp_dummy.bpf.o")) + ip(f"link set dev %s xdp off" % cfg.ifname) + +def check_rx_resize(cfg): + with setup_xsk(cfg): + rx_ring = _get_rx_ring_entries(cfg) + ethtool(f"-G %s rx %d" % (cfg.ifname, rx_ring // 2)) + ethtool(f"-G %s rx %d" % (cfg.ifname, rx_ring)) + +def main(): + with NetDrvEnv(__file__, nsim_test=False) as cfg: + ksft_run([check_xdp_bind, check_rx_resize], + args=(cfg, )) + ksft_exit() + +if __name__ == "__main__": + main() From 10c34b7d71a4ff8c06d926f1846edf8295ed75bf Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Fri, 25 Apr 2025 19:14:18 +0200 Subject: [PATCH 532/770] netlink: specs: ethtool: Remove UAPI duplication of phy-upstream enum The phy-upstream enum is already defined in the ethtool.h UAPI header and used by the ethtool userspace tool. However, the ethtool spec does not reference it, causing YNL to auto-generate a duplicate and redundant enum. Fix this by updating the spec to reference the existing UAPI enum in ethtool.h.
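For reference, the pre-existing UAPI enum the spec now points YNL at lives in include/uapi/linux/ethtool.h; paraphrased from memory, so treat this as a sketch rather than a verbatim quote of the header:

	/* include/uapi/linux/ethtool.h (paraphrased) */
	enum phy_upstream {
		PHY_UPSTREAM_MAC,	/* PHY hangs off a MAC */
		PHY_UPSTREAM_PHY,	/* PHY hangs off another PHY */
	};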
Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250425171419.947352-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 4 +++- include/uapi/linux/ethtool_netlink_generated.h | 5 ----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 655d8d10fe248..c650cd3dcb80b 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -89,8 +89,10 @@ definitions: doc: Group of short_detected states - name: phy-upstream-type - enum-name: + enum-name: phy-upstream + header: linux/ethtool.h type: enum + name-prefix: phy-upstream entries: [ mac, phy ] - name: tcp-data-split diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index fe24c3459ac0f..30c8dad6214e9 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -31,11 +31,6 @@ enum ethtool_header_flags { ETHTOOL_FLAG_STATS = 4, }; -enum { - ETHTOOL_PHY_UPSTREAM_TYPE_MAC, - ETHTOOL_PHY_UPSTREAM_TYPE_PHY, -}; - enum ethtool_tcp_data_split { ETHTOOL_TCP_DATA_SPLIT_UNKNOWN, ETHTOOL_TCP_DATA_SPLIT_DISABLED, From dfd76010f8e821b66116dec3c7d90dd2403d1396 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:38:57 -0700 Subject: [PATCH 533/770] pds_core: remove write-after-free of client_id A use-after-free error popped up in stress testing: [Mon Apr 21 21:21:33 2025] BUG: KFENCE: use-after-free write in pdsc_auxbus_dev_del+0xef/0x160 [pds_core] [Mon Apr 21 21:21:33 2025] Use-after-free write at 0x000000007013ecd1 (in kfence-#47): [Mon Apr 21 21:21:33 2025] pdsc_auxbus_dev_del+0xef/0x160 [pds_core] [Mon Apr 21 21:21:33 2025] pdsc_remove+0xc0/0x1b0 [pds_core] [Mon Apr 21 21:21:33 2025] pci_device_remove+0x24/0x70 [Mon Apr 21 21:21:33 2025] device_release_driver_internal+0x11f/0x180 [Mon Apr 21 21:21:33 2025] driver_detach+0x45/0x80 [Mon Apr 21 21:21:33 2025] bus_remove_driver+0x83/0xe0 [Mon Apr 21 21:21:33 2025] pci_unregister_driver+0x1a/0x80 The actual device uninit usually happens on a separate thread scheduled after this code runs, but there is no guarantee of order of thread execution, so this could be a problem. There's no actual need to clear the client_id at this point, so simply remove the offending code. 
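A minimal sketch of the hazard, with assumed and simplified names rather than the driver's verbatim code: auxiliary_device_uninit() drops a reference, and once the last reference is gone the device core runs the release callback, which frees the object containing aux_dev; a store through padev afterwards can therefore land in freed memory.

	#include <linux/auxiliary_bus.h>
	#include <linux/slab.h>

	struct pds_auxiliary_dev {
		struct auxiliary_device aux_dev;
		u16 client_id;
	};

	/* assumed release callback -- frees the containing object */
	static void padev_release(struct device *dev)
	{
		struct auxiliary_device *adev =
			container_of(dev, struct auxiliary_device, dev);
		struct pds_auxiliary_dev *padev =
			container_of(adev, struct pds_auxiliary_dev, aux_dev);

		kfree(padev);		/* padev's memory is gone after this */
	}

	static void padev_teardown(struct pds_auxiliary_dev *padev)
	{
		auxiliary_device_delete(&padev->aux_dev);
		auxiliary_device_uninit(&padev->aux_dev); /* may drop last ref */
		padev->client_id = 0;	/* the write-after-free being removed */
	}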
Fixes: 10659034c622 ("pds_core: add the aux client API") Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250425203857.71547-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/auxbus.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c index c9aac27883a3c..92f359f2b4492 100644 --- a/drivers/net/ethernet/amd/pds_core/auxbus.c +++ b/drivers/net/ethernet/amd/pds_core/auxbus.c @@ -186,7 +186,6 @@ void pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf, pds_client_unregister(pf, padev->client_id); auxiliary_device_delete(&padev->aux_dev); auxiliary_device_uninit(&padev->aux_dev); - padev->client_id = 0; *pd_ptr = NULL; mutex_unlock(&pf->config_lock); From f99a3fbf023e20b626be4b0f042463d598050c9a Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 25 Apr 2025 19:07:05 -0300 Subject: [PATCH 534/770] net_sched: drr: Fix double list add in class with netem as child qdisc As described in Gerrard's report [1], there are use cases where a netem child qdisc will make the parent qdisc's enqueue callback reentrant. In the case of drr, there won't be a UAF, but the code will add the same classifier to the list twice, which will cause memory corruption. In addition to checking for qlen being zero, this patch checks whether the class was already added to the active_list (cl_is_active) before adding to the list to cover for the reentrant case. [1] https://lore.kernel.org/netdev/CAHcdcOm+03OD2j6R0=YHKqmy=VgJ8xEOKuP6c7mSgnp-TEJJbw@mail.gmail.com/ Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250425220710.3964791-2-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_drr.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index e0a81d313aa76..9b6d79bd87371 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,6 +35,11 @@ struct drr_sched { struct Qdisc_class_hash clhash; }; +static bool cl_is_active(struct drr_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); @@ -337,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; - bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -347,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -357,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first) { + if (!cl_is_active(cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } From 141d34391abbb315d68556b7c67ad97885407547 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 25 Apr 2025 19:07:06 -0300 Subject: [PATCH 535/770] net_sched: hfsc: Fix a UAF vulnerability in class with netem as child qdisc As described in Gerrard's report [1], we have a UAF case when an hfsc class has a netem child qdisc. 
The crux of the issue is that hfsc is assuming that checking for cl->qdisc->q.qlen == 0 guarantees that it hasn't inserted the class in the vttree or eltree (which is not true for the netem duplicate case). This patch checks the n_active class variable to make sure that the code won't insert the class in the vttree or eltree twice, catering for the reentrant case. [1] https://lore.kernel.org/netdev/CAHcdcOm+03OD2j6R0=YHKqmy=VgJ8xEOKuP6c7mSgnp-TEJJbw@mail.gmail.com/ Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Reported-by: Gerrard Tai Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250425220710.3964791-3-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_hfsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 6c8ef826cec0b..cb8c525ea20ea 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1569,7 +1569,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (first) { + if (first && !cl->cl_nactive) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) From 1a6d0c00fa07972384b0c308c72db091d49988b6 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 25 Apr 2025 19:07:07 -0300 Subject: [PATCH 536/770] net_sched: ets: Fix double list add in class with netem as child qdisc As described in Gerrard's report [1], there are use cases where a netem child qdisc will make the parent qdisc's enqueue callback reentrant. In the case of ets, there won't be a UAF, but the code will add the same classifier to the list twice, which will cause memory corruption. In addition to checking for qlen being zero, this patch checks whether the class was already added to the active_list (cl_is_active) before doing the addition to cater for the reentrant case. 
[1] https://lore.kernel.org/netdev/CAHcdcOm+03OD2j6R0=YHKqmy=VgJ8xEOKuP6c7mSgnp-TEJJbw@mail.gmail.com/ Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250425220710.3964791-4-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_ets.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index c3bdeb14185be..2c069f0181c62 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -74,6 +74,11 @@ static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; +static bool cl_is_active(struct ets_class *cl) +{ + return !list_empty(&cl->alist); +} + static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, unsigned int *quantum, struct netlink_ext_ack *extack) @@ -416,7 +421,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; int err = 0; - bool first; cl = ets_classify(skb, sch, &err); if (!cl) { @@ -426,7 +430,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -436,7 +439,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first && !ets_class_is_strict(q, cl)) { + if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } From f139f37dcdf34b67f5bf92bc8e0f7f6b3ac63aa4 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 25 Apr 2025 19:07:08 -0300 Subject: [PATCH 537/770] net_sched: qfq: Fix double list add in class with netem as child qdisc As described in Gerrard's report [1], there are use cases where a netem child qdisc will make the parent qdisc's enqueue callback reentrant. In the case of qfq, there won't be a UAF, but the code will add the same classifier to the list twice, which will cause memory corruption. This patch checks whether the class was already added to the agg->active list (cl_is_active) before doing the addition to cater for the reentrant case. [1] https://lore.kernel.org/netdev/CAHcdcOm+03OD2j6R0=YHKqmy=VgJ8xEOKuP6c7mSgnp-TEJJbw@mail.gmail.com/ Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250425220710.3964791-5-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- net/sched/sch_qfq.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 687a932eb9b2f..bf1282cb22eba 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -202,6 +202,11 @@ struct qfq_sched { */ enum update_reason {enqueue, requeue}; +static bool cl_is_active(struct qfq_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -1215,7 +1220,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; - bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -1237,7 +1241,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } gso_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1; - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1253,8 +1256,8 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, ++sch->q.qlen; agg = cl->agg; - /* if the queue was not empty, then done here */ - if (!first) { + /* if the class is active, then done here */ + if (cl_is_active(cl)) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) From a6e1c5aa16dd5d351603c9d3ae259a069eabdcc2 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 25 Apr 2025 19:07:09 -0300 Subject: [PATCH 538/770] selftests: tc-testing: Add TDC tests that exercise reentrant enqueue behaviour Add 5 TDC tests that exercise the reentrant enqueue behaviour in drr, ets, qfq, and hfsc: - Test DRR's enqueue reentrant behaviour with netem (which caused a double list add) - Test ETS's enqueue reentrant behaviour with netem (which caused a double list add) - Test QFQ's enqueue reentrant behaviour with netem (which caused a double list add) - Test HFSC's enqueue reentrant behaviour with netem (which caused a UAF) - Test nested DRR's enqueue reentrant behaviour with netem (which caused a double list add) Acked-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Link: https://patch.msgid.link/20250425220710.3964791-6-victor@mojatatu.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index e26bbc1697832..0843f6d37e9c7 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -352,5 +352,191 @@ "$TC qdisc del dev $DUMMY handle 1:0 root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "90ec", + "name": "Test DRR's enqueue reentrant behaviour with netem", + "category": [ + "qdisc", + "drr" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root drr", + "$TC class replace dev $DUMMY parent 1:0 classid 1:1 drr", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1" + ], + "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 > /dev/null || true", + "expExitCode": "0", + "verifyCmd": "$TC -j -s qdisc ls dev $DUMMY handle 1:0", + "matchJSON": [ + { + "kind": "drr", + "handle": "1:", + "bytes": 196, + "packets": 2 + } + ], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "1f1f", + "name": "Test ETS's enqueue reentrant behaviour with netem", + "category": [ + "qdisc", + "ets" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root ets bands 2", + "$TC class replace dev $DUMMY parent 1:0 classid 1:1 ets quantum 1500", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip 
protocol 1 0xff flowid 1:1" + ], + "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 > /dev/null || true", + "expExitCode": "0", + "verifyCmd": "$TC -j -s class show dev $DUMMY", + "matchJSON": [ + { + "class": "ets", + "handle": "1:1", + "stats": { + "bytes": 196, + "packets": 2 + } + } + ], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "5e6d", + "name": "Test QFQ's enqueue reentrant behaviour with netem", + "category": [ + "qdisc", + "qfq" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root qfq", + "$TC class replace dev $DUMMY parent 1:0 classid 1:1 qfq weight 100 maxpkt 1500", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1" + ], + "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 > /dev/null || true", + "expExitCode": "0", + "verifyCmd": "$TC -j -s qdisc ls dev $DUMMY handle 1:0", + "matchJSON": [ + { + "kind": "qfq", + "handle": "1:", + "bytes": 196, + "packets": 2 + } + ], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "bf1d", + "name": "Test HFSC's enqueue reentrant behaviour with netem", + "category": [ + "qdisc", + "hfsc" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root hfsc", + "$TC class add dev $DUMMY parent 1:0 classid 1:1 hfsc ls m2 10Mbit", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 netem duplicate 100%", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip dst 10.10.10.1/32 flowid 1:1", + "$TC class add dev $DUMMY parent 1:0 classid 1:2 hfsc ls m2 10Mbit", + "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 netem duplicate 100%", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 2 u32 match ip dst 10.10.10.2/32 flowid 1:2", + "ping -c 1 10.10.10.1 -I$DUMMY > /dev/null || true", + "$TC filter del dev $DUMMY parent 1:0 protocol ip prio 1", + "$TC class del dev $DUMMY classid 1:1" + ], + "cmdUnderTest": "ping -c 1 10.10.10.2 -I$DUMMY > /dev/null || true", + "expExitCode": "0", + "verifyCmd": "$TC -j -s qdisc ls dev $DUMMY handle 1:0", + "matchJSON": [ + { + "kind": "hfsc", + "handle": "1:", + "bytes": 392, + "packets": 4 + } + ], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] + }, + { + "id": "7c3b", + "name": "Test nested DRR's enqueue reentrant behaviour with netem", + "category": [ + "qdisc", + "drr" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root drr", + "$TC class add dev $DUMMY parent 1:0 classid 1:1 drr", + "$TC filter add dev $DUMMY parent 1:0 protocol ip prio 1 u32 match ip protocol 1 0xff flowid 1:1", + "$TC qdisc add dev $DUMMY handle 2:0 parent 1:1 drr", + "$TC class add dev $DUMMY classid 2:1 parent 2:0 drr", + "$TC filter add dev $DUMMY parent 2:0 protocol ip prio 1 u32 match ip protocol 1 0xff flowid 2:1", + "$TC qdisc add dev $DUMMY parent 2:1 handle 3:0 netem duplicate 
100%" + ], + "cmdUnderTest": "ping -c 1 -I $DUMMY 10.10.10.1 > /dev/null || true", + "expExitCode": "0", + "verifyCmd": "$TC -j -s qdisc ls dev $DUMMY handle 1:0", + "matchJSON": [ + { + "kind": "drr", + "handle": "1:", + "bytes": 196, + "packets": 2 + } + ], + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] From 3ffcd7b657c9d96eb3ffe174449b4248dd7fc6a9 Mon Sep 17 00:00:00 2001 From: Paul Greenwalt Date: Fri, 25 Apr 2025 15:26:31 -0700 Subject: [PATCH 539/770] ice: fix Get Tx Topology AQ command error on E830 The Get Tx Topology AQ command (opcode 0x0418) has different read flag requirements depending on the hardware/firmware. For E810, E822, and E823 firmware the read flag must be set, and for newer hardware (E825 and E830) it must not be set. This results in failure to configure Tx topology and the following warning message during probe: DDP package does not support Tx scheduling layers switching feature - please update to the latest DDP package and try again The current implementation only handles E825-C but not E830. It is confusing as we first check ice_is_e825c() and then set the flag in the set case. Finally, we check ice_is_e825c() again and set the flag for all other hardware in both the set and get case. Instead, notice that we always need the read flag for set, but only need the read flag for get on E810, E822, and E823 firmware. Fix the logic to check the MAC type and set the read flag in get only on the older devices which require it. Fixes: ba1124f58afd ("ice: Add E830 device IDs, MAC type and registers") Signed-off-by: Paul Greenwalt Signed-off-by: Jacob Keller Reviewed-by: Michal Swiatkowski Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250425222636.3188441-2-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_ddp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ddp.c b/drivers/net/ethernet/intel/ice/ice_ddp.c index 69d5b1a28491d..59323c019544f 100644 --- a/drivers/net/ethernet/intel/ice/ice_ddp.c +++ b/drivers/net/ethernet/intel/ice/ice_ddp.c @@ -2345,15 +2345,15 @@ ice_get_set_tx_topo(struct ice_hw *hw, u8 *buf, u16 buf_size, cmd->set_flags |= ICE_AQC_TX_TOPO_FLAGS_SRC_RAM | ICE_AQC_TX_TOPO_FLAGS_LOAD_NEW; - if (hw->mac_type == ICE_MAC_GENERIC_3K_E825) - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); } else { ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_tx_topo); cmd->get_flags = ICE_AQC_TX_TOPO_GET_RAM; - } - if (hw->mac_type != ICE_MAC_GENERIC_3K_E825) - desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + if (hw->mac_type == ICE_MAC_E810 || + hw->mac_type == ICE_MAC_GENERIC) + desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD); + } status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd); if (status) From 425c5f266b2edeee0ce16fedd8466410cdcfcfe3 Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Fri, 25 Apr 2025 15:26:32 -0700 Subject: [PATCH 540/770] ice: Check VF VSI Pointer Value in ice_vc_add_fdir_fltr() As mentioned in the commit baeb705fd6a7 ("ice: always check VF VSI pointer values"), we need to perform a null pointer check on the return value of ice_get_vf_vsi() before using it. 
Fixes: 6ebbe97a4881 ("ice: Add a per-VF limit on number of FDIR filters") Signed-off-by: Xuanqiang Luo Reviewed-by: Przemek Kitszel Reviewed-by: Simon Horman Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250425222636.3188441-3-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c index 7752920d7a8ee..1cca9b2262e86 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c @@ -2097,6 +2097,11 @@ int ice_vc_add_fdir_fltr(struct ice_vf *vf, u8 *msg) pf = vf->pf; dev = ice_pf_to_dev(pf); vf_vsi = ice_get_vf_vsi(vf); + if (!vf_vsi) { + dev_err(dev, "Can not get FDIR vf_vsi for VF %u\n", vf->vf_id); + v_ret = VIRTCHNL_STATUS_ERR_PARAM; + goto err_exit; + } #define ICE_VF_MAX_FDIR_FILTERS 128 if (!ice_fdir_num_avail_fltr(&pf->hw, vf_vsi) || From 713dd6c2deca88cba0596b1e2576f7b7a8e5c59e Mon Sep 17 00:00:00 2001 From: Madhu Chittim Date: Fri, 25 Apr 2025 15:26:33 -0700 Subject: [PATCH 541/770] idpf: fix offloads support for encapsulated packets Split offloads into csum, tso and other offloads so that tunneled packets do not by default have all the offloads enabled. Stateless offloads for encapsulated packets are not yet supported in firmware/software, but the driver was advertising the same features for encapsulated packets as for non-encapsulated ones. Fix naming to clarify that the CSUM bits being checked are for Tx. Propagate the netdev features to VLAN interfaces as well. Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration") Reviewed-by: Sridhar Samudrala Signed-off-by: Madhu Chittim Tested-by: Zachary Goldstein Tested-by: Samuel Salin Signed-off-by: Tony Nguyen Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250425222636.3188441-4-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/idpf/idpf.h | 18 +++---- drivers/net/ethernet/intel/idpf/idpf_lib.c | 57 ++++++++-------------- 2 files changed, 27 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf.h b/drivers/net/ethernet/intel/idpf/idpf.h index 66544faab710a..aef0e9775a330 100644 --- a/drivers/net/ethernet/intel/idpf/idpf.h +++ b/drivers/net/ethernet/intel/idpf/idpf.h @@ -629,13 +629,13 @@ bool idpf_is_capability_ena(struct idpf_adapter *adapter, bool all, VIRTCHNL2_CAP_RX_HSPLIT_AT_L4V4 |\ VIRTCHNL2_CAP_RX_HSPLIT_AT_L4V6) -#define IDPF_CAP_RX_CSUM_L4V4 (\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV4_TCP |\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV4_UDP) +#define IDPF_CAP_TX_CSUM_L4V4 (\ + VIRTCHNL2_CAP_TX_CSUM_L4_IPV4_TCP |\ + VIRTCHNL2_CAP_TX_CSUM_L4_IPV4_UDP) -#define IDPF_CAP_RX_CSUM_L4V6 (\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV6_TCP |\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV6_UDP) +#define IDPF_CAP_TX_CSUM_L4V6 (\ + VIRTCHNL2_CAP_TX_CSUM_L4_IPV6_TCP |\ + VIRTCHNL2_CAP_TX_CSUM_L4_IPV6_UDP) #define IDPF_CAP_RX_CSUM (\ VIRTCHNL2_CAP_RX_CSUM_L3_IPV4 |\ @@ -644,11 +644,9 @@ bool idpf_is_capability_ena(struct idpf_adapter *adapter, bool all, VIRTCHNL2_CAP_RX_CSUM_L4_IPV6_TCP |\ VIRTCHNL2_CAP_RX_CSUM_L4_IPV6_UDP) -#define IDPF_CAP_SCTP_CSUM (\ +#define IDPF_CAP_TX_SCTP_CSUM (\ VIRTCHNL2_CAP_TX_CSUM_L4_IPV4_SCTP |\ - VIRTCHNL2_CAP_TX_CSUM_L4_IPV6_SCTP |\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV4_SCTP |\ - VIRTCHNL2_CAP_RX_CSUM_L4_IPV6_SCTP) + VIRTCHNL2_CAP_TX_CSUM_L4_IPV6_SCTP) #define IDPF_CAP_TUNNEL_TX_CSUM (\
VIRTCHNL2_CAP_TX_CSUM_L3_SINGLE_TUNNEL |\ diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index aa755dedb41d9..730a9c7a59f2b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -703,8 +703,10 @@ static int idpf_cfg_netdev(struct idpf_vport *vport) { struct idpf_adapter *adapter = vport->adapter; struct idpf_vport_config *vport_config; + netdev_features_t other_offloads = 0; + netdev_features_t csum_offloads = 0; + netdev_features_t tso_offloads = 0; netdev_features_t dflt_features; - netdev_features_t offloads = 0; struct idpf_netdev_priv *np; struct net_device *netdev; u16 idx = vport->idx; @@ -766,53 +768,32 @@ static int idpf_cfg_netdev(struct idpf_vport *vport) if (idpf_is_cap_ena_all(adapter, IDPF_RSS_CAPS, IDPF_CAP_RSS)) dflt_features |= NETIF_F_RXHASH; - if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_RX_CSUM_L4V4)) - dflt_features |= NETIF_F_IP_CSUM; - if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_RX_CSUM_L4V6)) - dflt_features |= NETIF_F_IPV6_CSUM; + if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_TX_CSUM_L4V4)) + csum_offloads |= NETIF_F_IP_CSUM; + if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_TX_CSUM_L4V6)) + csum_offloads |= NETIF_F_IPV6_CSUM; if (idpf_is_cap_ena(adapter, IDPF_CSUM_CAPS, IDPF_CAP_RX_CSUM)) - dflt_features |= NETIF_F_RXCSUM; - if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_SCTP_CSUM)) - dflt_features |= NETIF_F_SCTP_CRC; + csum_offloads |= NETIF_F_RXCSUM; + if (idpf_is_cap_ena_all(adapter, IDPF_CSUM_CAPS, IDPF_CAP_TX_SCTP_CSUM)) + csum_offloads |= NETIF_F_SCTP_CRC; if (idpf_is_cap_ena(adapter, IDPF_SEG_CAPS, VIRTCHNL2_CAP_SEG_IPV4_TCP)) - dflt_features |= NETIF_F_TSO; + tso_offloads |= NETIF_F_TSO; if (idpf_is_cap_ena(adapter, IDPF_SEG_CAPS, VIRTCHNL2_CAP_SEG_IPV6_TCP)) - dflt_features |= NETIF_F_TSO6; + tso_offloads |= NETIF_F_TSO6; if (idpf_is_cap_ena_all(adapter, IDPF_SEG_CAPS, VIRTCHNL2_CAP_SEG_IPV4_UDP | VIRTCHNL2_CAP_SEG_IPV6_UDP)) - dflt_features |= NETIF_F_GSO_UDP_L4; + tso_offloads |= NETIF_F_GSO_UDP_L4; if (idpf_is_cap_ena_all(adapter, IDPF_RSC_CAPS, IDPF_CAP_RSC)) - offloads |= NETIF_F_GRO_HW; - /* advertise to stack only if offloads for encapsulated packets is - * supported - */ - if (idpf_is_cap_ena(vport->adapter, IDPF_SEG_CAPS, - VIRTCHNL2_CAP_SEG_TX_SINGLE_TUNNEL)) { - offloads |= NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_GRE | - NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_PARTIAL | - NETIF_F_GSO_UDP_TUNNEL_CSUM | - NETIF_F_GSO_IPXIP4 | - NETIF_F_GSO_IPXIP6 | - 0; - - if (!idpf_is_cap_ena_all(vport->adapter, IDPF_CSUM_CAPS, - IDPF_CAP_TUNNEL_TX_CSUM)) - netdev->gso_partial_features |= - NETIF_F_GSO_UDP_TUNNEL_CSUM; - - netdev->gso_partial_features |= NETIF_F_GSO_GRE_CSUM; - offloads |= NETIF_F_TSO_MANGLEID; - } + other_offloads |= NETIF_F_GRO_HW; if (idpf_is_cap_ena(adapter, IDPF_OTHER_CAPS, VIRTCHNL2_CAP_LOOPBACK)) - offloads |= NETIF_F_LOOPBACK; + other_offloads |= NETIF_F_LOOPBACK; - netdev->features |= dflt_features; - netdev->hw_features |= dflt_features | offloads; - netdev->hw_enc_features |= dflt_features | offloads; + netdev->features |= dflt_features | csum_offloads | tso_offloads; + netdev->hw_features |= netdev->features | other_offloads; + netdev->vlan_features |= netdev->features | other_offloads; + netdev->hw_enc_features |= dflt_features | other_offloads; idpf_set_ethtool_ops(netdev); netif_set_affinity_auto(netdev); SET_NETDEV_DEV(netdev, &adapter->pdev->dev); From 
66ada7471155cb70f7c651fab69327322ab9d92b Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:34 +0200 Subject: [PATCH 542/770] net: prestera: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Furthermore, the PCI function being managed implies that it's not necessary to call pci_release_regions() manually. Remove the calls to pci_release_regions(). Replace pci_request_regions() with pcim_request_all_regions(). Signed-off-by: Philipp Stanner Acked-by: Elad Nachman Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-3-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/prestera_pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c b/drivers/net/ethernet/marvell/prestera/prestera_pci.c index 35857dc19542f..c45d108b2f6da 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c @@ -845,9 +845,9 @@ static int prestera_pci_probe(struct pci_dev *pdev, goto err_pci_enable_device; } - err = pci_request_regions(pdev, driver_name); + err = pcim_request_all_regions(pdev, driver_name); if (err) { - dev_err(&pdev->dev, "pci_request_regions failed\n"); + dev_err(&pdev->dev, "pcim_request_all_regions failed\n"); goto err_pci_request_regions; } @@ -938,7 +938,6 @@ static int prestera_pci_probe(struct pci_dev *pdev, err_pp_ioremap: err_mem_ioremap: err_dma_mask: - pci_release_regions(pdev); err_pci_request_regions: err_pci_enable_device: return err; @@ -953,7 +952,6 @@ static void prestera_pci_remove(struct pci_dev *pdev) pci_free_irq_vectors(pdev); destroy_workqueue(fw->wq); prestera_fw_uninit(fw); - pci_release_regions(pdev); } static const struct pci_device_id prestera_pci_devices[] = { From 48217b8345294e8e49402f8ec7bd1fcb5f8705a6 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:35 +0200 Subject: [PATCH 543/770] net: octeontx2: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Furthermore, the PCI function being managed implies that it's not necessary to call pci_release_regions() manually. Remove the calls to pci_release_regions(). Replace pci_request_regions() with pcim_request_all_regions(). 
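These conversions all land on the same probe() shape. A minimal sketch of the fully devres-managed pattern (hypothetical driver, names assumed) showing why no manual pci_release_regions()/pci_disable_device() unwinding remains:

	static int example_probe(struct pci_dev *pdev,
				 const struct pci_device_id *id)
	{
		void __iomem *base;
		int err;

		err = pcim_enable_device(pdev);	/* devres: disabled on detach */
		if (err)
			return err;

		/* devres: regions released automatically on detach */
		err = pcim_request_all_regions(pdev, "example");
		if (err)
			return err;	/* nothing to unwind by hand */

		base = pcim_iomap(pdev, 0, pci_resource_len(pdev, 0));
		if (!base)
			return -ENOMEM;

		pci_set_master(pdev);
		return 0;
	}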
Signed-off-by: Philipp Stanner Link: https://patch.msgid.link/20250425085740.65304-4-phasta@kernel.org Signed-off-by: Jakub Kicinski --- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 14 ++++---------- .../net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 14 ++++---------- drivers/net/ethernet/marvell/octeontx2/nic/rep.c | 12 +++++------- 3 files changed, 13 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index cfed9ec5b1579..0aee8e3861f39 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -3048,7 +3048,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - err = pci_request_regions(pdev, DRV_NAME); + err = pcim_request_all_regions(pdev, DRV_NAME); if (err) { dev_err(dev, "PCI request regions failed 0x%x\n", err); return err; @@ -3057,7 +3057,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48)); if (err) { dev_err(dev, "DMA mask config failed, abort\n"); - goto err_release_regions; + return err; } pci_set_master(pdev); @@ -3067,10 +3067,8 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES); netdev = alloc_etherdev_mqs(sizeof(*pf), qcount + qos_txqs, qcount); - if (!netdev) { - err = -ENOMEM; - goto err_release_regions; - } + if (!netdev) + return -ENOMEM; pci_set_drvdata(pdev, netdev); SET_NETDEV_DEV(netdev, &pdev->dev); @@ -3246,8 +3244,6 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_free_netdev: pci_set_drvdata(pdev, NULL); free_netdev(netdev); -err_release_regions: - pci_release_regions(pdev); return err; } @@ -3447,8 +3443,6 @@ static void otx2_remove(struct pci_dev *pdev) pci_free_irq_vectors(pf->pdev); pci_set_drvdata(pdev, NULL); free_netdev(netdev); - - pci_release_regions(pdev); } static struct pci_driver otx2_pf_driver = { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 7ef3ba477d496..fb4da816d2188 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -548,7 +548,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - err = pci_request_regions(pdev, DRV_NAME); + err = pcim_request_all_regions(pdev, DRV_NAME); if (err) { dev_err(dev, "PCI request regions failed 0x%x\n", err); return err; @@ -557,7 +557,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48)); if (err) { dev_err(dev, "DMA mask config failed, abort\n"); - goto err_release_regions; + return err; } pci_set_master(pdev); @@ -565,10 +565,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) qcount = num_online_cpus(); qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES); netdev = alloc_etherdev_mqs(sizeof(*vf), qcount + qos_txqs, qcount); - if (!netdev) { - err = -ENOMEM; - goto err_release_regions; - } + if (!netdev) + return -ENOMEM; pci_set_drvdata(pdev, netdev); SET_NETDEV_DEV(netdev, &pdev->dev); @@ -765,8 +763,6 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_free_netdev: pci_set_drvdata(pdev, NULL); free_netdev(netdev); -err_release_regions: - 
pci_release_regions(pdev); return err; } @@ -815,8 +811,6 @@ static void otx2vf_remove(struct pci_dev *pdev) pci_free_irq_vectors(vf->pdev); pci_set_drvdata(pdev, NULL); free_netdev(netdev); - - pci_release_regions(pdev); } static struct pci_driver otx2vf_driver = { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c index 7153a71dfc860..2cd3da3b68432 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/rep.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/rep.c @@ -765,7 +765,7 @@ static int rvu_rep_probe(struct pci_dev *pdev, const struct pci_device_id *id) return err; } - err = pci_request_regions(pdev, DRV_NAME); + err = pcim_request_all_regions(pdev, DRV_NAME); if (err) { dev_err(dev, "PCI request regions failed 0x%x\n", err); return err; @@ -774,7 +774,7 @@ static int rvu_rep_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(48)); if (err) { dev_err(dev, "DMA mask config failed, abort\n"); - goto err_release_regions; + goto err_set_drv_data; } pci_set_master(pdev); @@ -782,7 +782,7 @@ static int rvu_rep_probe(struct pci_dev *pdev, const struct pci_device_id *id) priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) { err = -ENOMEM; - goto err_release_regions; + goto err_set_drv_data; } pci_set_drvdata(pdev, priv); @@ -799,7 +799,7 @@ static int rvu_rep_probe(struct pci_dev *pdev, const struct pci_device_id *id) err = otx2_init_rsrc(pdev, priv); if (err) - goto err_release_regions; + goto err_set_drv_data; priv->iommu_domain = iommu_get_domain_for_dev(dev); @@ -822,9 +822,8 @@ static int rvu_rep_probe(struct pci_dev *pdev, const struct pci_device_id *id) otx2_disable_mbox_intr(priv); otx2_pfaf_mbox_destroy(priv); pci_free_irq_vectors(pdev); -err_release_regions: +err_set_drv_data: pci_set_drvdata(pdev, NULL); - pci_release_regions(pdev); return err; } @@ -844,7 +843,6 @@ static void rvu_rep_remove(struct pci_dev *pdev) otx2_pfaf_mbox_destroy(priv); pci_free_irq_vectors(priv->pdev); pci_set_drvdata(pdev, NULL); - pci_release_regions(pdev); } static struct pci_driver rvu_rep_driver = { From adc36d0914f629e12700ef223f146e1c65d01c57 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:36 +0200 Subject: [PATCH 544/770] net: tulip: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Replace pci_request_regions() with pcim_request_all_regions(). 
Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-5-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/dec/tulip/tulip_core.c | 2 +- drivers/net/ethernet/dec/tulip/winbond-840.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/dec/tulip/tulip_core.c b/drivers/net/ethernet/dec/tulip/tulip_core.c index c8c53121557f2..bec76e7bf5dd1 100644 --- a/drivers/net/ethernet/dec/tulip/tulip_core.c +++ b/drivers/net/ethernet/dec/tulip/tulip_core.c @@ -1411,7 +1411,7 @@ static int tulip_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* grab all resources from both PIO and MMIO regions, as we * don't want anyone else messing around with our hardware */ - if (pci_request_regions(pdev, DRV_NAME)) + if (pcim_request_all_regions(pdev, DRV_NAME)) return -ENODEV; ioaddr = pcim_iomap(pdev, TULIP_BAR, tulip_tbl[chip_idx].io_size); diff --git a/drivers/net/ethernet/dec/tulip/winbond-840.c b/drivers/net/ethernet/dec/tulip/winbond-840.c index 5930cdec6f2f6..e593273b28675 100644 --- a/drivers/net/ethernet/dec/tulip/winbond-840.c +++ b/drivers/net/ethernet/dec/tulip/winbond-840.c @@ -375,7 +375,7 @@ static int w840_probe1(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; SET_NETDEV_DEV(dev, &pdev->dev); - if (pci_request_regions(pdev, DRV_NAME)) + if (pcim_request_all_regions(pdev, DRV_NAME)) goto err_out_netdev; ioaddr = pci_iomap(pdev, TULIP_BAR, netdev_res_size); From 2a5a74947a2bdd8ba437f79e746c23724182c662 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:37 +0200 Subject: [PATCH 545/770] net: ethernet: natsemi: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Replace pci_request_regions() with pcim_request_all_regions(). Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-6-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/natsemi/natsemi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/natsemi/natsemi.c b/drivers/net/ethernet/natsemi/natsemi.c index 05606692e6319..dd279788cf9eb 100644 --- a/drivers/net/ethernet/natsemi/natsemi.c +++ b/drivers/net/ethernet/natsemi/natsemi.c @@ -846,7 +846,7 @@ static int natsemi_probe1(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; SET_NETDEV_DEV(dev, &pdev->dev); - i = pci_request_regions(pdev, DRV_NAME); + i = pcim_request_all_regions(pdev, DRV_NAME); if (i) goto err_pci_request_regions; From 6e5f7a5b5e0c6623742298fe0d5a0d3df6be1afc Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:38 +0200 Subject: [PATCH 546/770] net: ethernet: sis900: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. 
Replace pci_request_regions() with pcim_request_all_regions(). Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Acked-by: Daniele Venzano Link: https://patch.msgid.link/20250425085740.65304-7-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/sis/sis900.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/sis/sis900.c b/drivers/net/ethernet/sis/sis900.c index 332cbd7259009..df869f82cae8a 100644 --- a/drivers/net/ethernet/sis/sis900.c +++ b/drivers/net/ethernet/sis/sis900.c @@ -468,7 +468,7 @@ static int sis900_probe(struct pci_dev *pci_dev, SET_NETDEV_DEV(net_dev, &pci_dev->dev); /* We do a request_region() to register /proc/ioports info. */ - ret = pci_request_regions(pci_dev, "sis900"); + ret = pcim_request_all_regions(pci_dev, "sis900"); if (ret) goto err_out; From fad4d94d9ae5a67b76b934e544be91220b682ef1 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:39 +0200 Subject: [PATCH 547/770] net: mdio: thunder: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Furthermore, the PCI function being managed implies that it's not necessary to call pci_release_regions() manually. Remove the calls to pci_release_regions(). Replace pci_request_regions() with pcim_request_all_regions(). Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-8-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-thunder.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/net/mdio/mdio-thunder.c b/drivers/net/mdio/mdio-thunder.c index 1e1aa72b1eff8..a3047f7258a72 100644 --- a/drivers/net/mdio/mdio-thunder.c +++ b/drivers/net/mdio/mdio-thunder.c @@ -40,16 +40,16 @@ static int thunder_mdiobus_pci_probe(struct pci_dev *pdev, return err; } - err = pci_request_regions(pdev, KBUILD_MODNAME); + err = pcim_request_all_regions(pdev, KBUILD_MODNAME); if (err) { - dev_err(&pdev->dev, "pci_request_regions failed\n"); + dev_err(&pdev->dev, "pcim_request_all_regions failed\n"); goto err_disable_device; } nexus->bar0 = pcim_iomap(pdev, 0, pci_resource_len(pdev, 0)); if (!nexus->bar0) { err = -ENOMEM; - goto err_release_regions; + goto err_disable_device; } i = 0; @@ -107,9 +107,6 @@ static int thunder_mdiobus_pci_probe(struct pci_dev *pdev, } return 0; -err_release_regions: - pci_release_regions(pdev); - err_disable_device: pci_set_drvdata(pdev, NULL); return err; @@ -129,7 +126,6 @@ static void thunder_mdiobus_pci_remove(struct pci_dev *pdev) mdiobus_unregister(bus->mii_bus); oct_mdio_writeq(0, bus->register_base + SMI_EN); } - pci_release_regions(pdev); pci_set_drvdata(pdev, NULL); } From 06133ddc3590845c91d8bd44425b170fce11837f Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:40 +0200 Subject: [PATCH 548/770] net: thunder_bgx: Use pure PCI devres API The currently used function pci_request_regions() is one of the problematic "hybrid devres" PCI functions, which are sometimes managed through devres, and sometimes not (depending on whether pci_enable_device() or pcim_enable_device() has been called before). 
The PCI subsystem wants to remove this behavior and, therefore, needs to port all users to functions that don't have this problem. Furthermore, the PCI function being managed implies that it's not necessary to call pci_release_regions() manually. Remove the calls to pci_release_regions(). Replace pci_request_regions() with pcim_request_all_regions(). Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-9-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index 608cc6af5af1c..c9369bdd04e0e 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1605,7 +1605,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return dev_err_probe(dev, err, "Failed to enable PCI device\n"); } - err = pci_request_regions(pdev, DRV_NAME); + err = pcim_request_all_regions(pdev, DRV_NAME); if (err) { dev_err(dev, "PCI request regions failed 0x%x\n", err); goto err_disable_device; @@ -1616,7 +1616,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!bgx->reg_base) { dev_err(dev, "BGX: Cannot map CSR memory space, aborting\n"); err = -ENOMEM; - goto err_release_regions; + goto err_disable_device; } set_max_bgx_per_node(pdev); @@ -1688,8 +1688,6 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_enable: bgx_vnic[bgx->bgx_id] = NULL; pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); -err_release_regions: - pci_release_regions(pdev); err_disable_device: pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); @@ -1710,7 +1708,6 @@ static void bgx_remove(struct pci_dev *pdev) pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); bgx_vnic[bgx->bgx_id] = NULL; - pci_release_regions(pdev); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } From 1549bd06e340eaac2c317a045c493c3e9cc454cf Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Fri, 25 Apr 2025 10:57:41 +0200 Subject: [PATCH 549/770] net: thunder_bgx: Don't disable PCI device manually thunder_bgx's PCI device is enabled with pcim_enable_device(), a managed devres function which ensures that the device gets enabled on driver detach automatically. Remove the calls to pci_disable_device(). 
Signed-off-by: Philipp Stanner Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250425085740.65304-10-phasta@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index c9369bdd04e0e..3b7ad744b2dd6 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1608,7 +1608,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = pcim_request_all_regions(pdev, DRV_NAME); if (err) { dev_err(dev, "PCI request regions failed 0x%x\n", err); - goto err_disable_device; + goto err_zero_drv_data; } /* MAP configuration registers */ @@ -1616,7 +1616,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!bgx->reg_base) { dev_err(dev, "BGX: Cannot map CSR memory space, aborting\n"); err = -ENOMEM; - goto err_disable_device; + goto err_zero_drv_data; } set_max_bgx_per_node(pdev); @@ -1688,8 +1688,7 @@ static int bgx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err_enable: bgx_vnic[bgx->bgx_id] = NULL; pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); -err_disable_device: - pci_disable_device(pdev); +err_zero_drv_data: pci_set_drvdata(pdev, NULL); return err; } @@ -1708,7 +1707,6 @@ static void bgx_remove(struct pci_dev *pdev) pci_free_irq(pdev, GMPX_GMI_TX_INT, bgx); bgx_vnic[bgx->bgx_id] = NULL; - pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } From ef7d33e174564d6294edd00ca797e2b0aac76259 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Fri, 25 Apr 2025 14:40:57 +0800 Subject: [PATCH 550/770] rtase: Modify the format specifier in snprintf to %u Modify the format specifier in snprintf to %u. Signed-off-by: Justin Lai Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250425064057.30035-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/rtase/rtase_main.c b/drivers/net/ethernet/realtek/rtase/rtase_main.c index 6251548d50ffc..700e9ae8e8334 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase_main.c +++ b/drivers/net/ethernet/realtek/rtase/rtase_main.c @@ -1114,7 +1114,7 @@ static int rtase_open(struct net_device *dev) /* request other interrupts to handle multiqueue */ for (i = 1; i < tp->int_nums; i++) { ivec = &tp->int_vector[i]; - snprintf(ivec->name, sizeof(ivec->name), "%s_int%i", + snprintf(ivec->name, sizeof(ivec->name), "%s_int%u", tp->dev->name, i); ret = request_irq(ivec->irq, rtase_q_interrupt, 0, ivec->name, ivec); From a427e7f99b710f3547f0bfb91c4371acb56e1c84 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 23 Apr 2025 20:46:44 +0000 Subject: [PATCH 551/770] tools/Makefile: Add ynl target Add targets to build, clean, and install ynl headers, libynl.a, and python tooling. 
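With these hooks in place, the ynl targets are driven like any other tools/ subdirectory, roughly:

	$ make -C tools/ ynl            # build headers, libynl.a and the python tool
	$ make -C tools/ ynl_install    # install them
	$ make -C tools/ ynl_clean      # remove the build artifacts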
Signed-off-by: Joe Damato Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250423204647.190784-1-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- tools/Makefile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 5e1254eb66de1..c31cbbd12c456 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -41,6 +41,7 @@ help: @echo ' mm - misc mm tools' @echo ' wmi - WMI interface examples' @echo ' x86_energy_perf_policy - Intel energy policy tool' + @echo ' ynl - ynl headers, library, and python tool' @echo '' @echo 'You can do:' @echo ' $$ make -C tools/ _install' @@ -118,11 +119,14 @@ freefall: FORCE kvm_stat: FORCE $(call descend,kvm/$@) +ynl: FORCE + $(call descend,net/ynl) + all: acpi counter cpupower gpio hv firewire \ perf selftests bootconfig spi turbostat usb \ virtio mm bpf x86_energy_perf_policy \ tmon freefall iio objtool kvm_stat wmi \ - debugging tracing thermal thermometer thermal-engine + debugging tracing thermal thermometer thermal-engine ynl acpi_install: $(call descend,power/$(@:_install=),install) @@ -157,13 +161,16 @@ freefall_install: kvm_stat_install: $(call descend,kvm/$(@:_install=),install) +ynl_install: + $(call descend,net/$(@:_install=),install) + install: acpi_install counter_install cpupower_install gpio_install \ hv_install firewire_install iio_install \ perf_install selftests_install turbostat_install usb_install \ virtio_install mm_install bpf_install x86_energy_perf_policy_install \ tmon_install freefall_install objtool_install kvm_stat_install \ wmi_install debugging_install intel-speed-select_install \ - tracing_install thermometer_install thermal-engine_install + tracing_install thermometer_install thermal-engine_install ynl_install acpi_clean: $(call descend,power/acpi,clean) @@ -214,12 +221,15 @@ freefall_clean: build_clean: $(call descend,build,clean) +ynl_clean: + $(call descend,net/$(@:_clean=),clean) + clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ perf_clean selftests_clean turbostat_clean bootconfig_clean spi_clean usb_clean virtio_clean \ mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ freefall_clean build_clean libbpf_clean libsubcmd_clean \ gpio_clean objtool_clean leds_clean wmi_clean firmware_clean debugging_clean \ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ - sched_ext_clean + sched_ext_clean ynl_clean .PHONY: FORCE From 0d15a26b247d25cd012134bf8825128fedb15cc9 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Thu, 24 Apr 2025 15:23:16 +0530 Subject: [PATCH 552/770] net: ti: icssg-prueth: Add ICSSG FW Stats The ICSSG firmware maintains a set of stats called PA_STATS. Currently the driver only dumps 4 stats. Add support for dumping more stats. The offsets for the different stats are defined as macros in the icssg_switch_map.h file. All the offsets are for Slice0. Slice1 offsets are Slice0 + 4. The offset calculation is taken care of while reading the stats in emac_update_hardware_stats().
The statistics are documented in Documentation/networking/device_drivers/ethernet/ti/icssg_prueth.rst Reviewed-by: Simon Horman Signed-off-by: MD Danish Anwar Link: https://patch.msgid.link/20250424095316.2643573-1-danishanwar@ti.com Signed-off-by: Jakub Kicinski --- .../device_drivers/ethernet/index.rst | 1 + .../ethernet/ti/icssg_prueth.rst | 56 ++++++++++++++++++ drivers/net/ethernet/ti/icssg/icssg_common.c | 24 +++++++- drivers/net/ethernet/ti/icssg/icssg_prueth.h | 2 +- drivers/net/ethernet/ti/icssg/icssg_stats.c | 8 +-- drivers/net/ethernet/ti/icssg/icssg_stats.h | 58 ++++++++++++------- .../net/ethernet/ti/icssg/icssg_switch_map.h | 33 +++++++++++ 7 files changed, 151 insertions(+), 31 deletions(-) create mode 100644 Documentation/networking/device_drivers/ethernet/ti/icssg_prueth.rst diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst index 05d822b904b45..f9ed93c1da35d 100644 --- a/Documentation/networking/device_drivers/ethernet/index.rst +++ b/Documentation/networking/device_drivers/ethernet/index.rst @@ -55,6 +55,7 @@ Contents: ti/cpsw_switchdev ti/am65_nuss_cpsw_switchdev ti/tlan + ti/icssg_prueth wangxun/txgbe wangxun/ngbe diff --git a/Documentation/networking/device_drivers/ethernet/ti/icssg_prueth.rst b/Documentation/networking/device_drivers/ethernet/ti/icssg_prueth.rst new file mode 100644 index 0000000000000..da21ddf431bbc --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/ti/icssg_prueth.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================== +Texas Instruments ICSSG PRUETH ethernet driver +============================================== + +:Version: 1.0 + +ICSSG Firmware +============== + +Every ICSSG core has two Programmable Real-Time Units (PRUs), two auxiliary +Real-Time Transfer Units (RTUs), and two Transmit Real-Time Transfer Units +(TX_PRUs). Each one of these runs its own firmware. The firmwares combined are +referred to as the ICSSG Firmware. + +Firmware Statistics +=================== + +The ICSSG firmware maintains certain statistics which are dumped by the driver +via ``ethtool -S <interface>`` + +These statistics are as follows, + + - ``FW_RTU_PKT_DROP``: Diagnostic error counter which increments when RTU drops a locally injected packet due to port being disabled or rule violation. + - ``FW_Q0_OVERFLOW``: TX overflow counter for queue0 + - ``FW_Q1_OVERFLOW``: TX overflow counter for queue1 + - ``FW_Q2_OVERFLOW``: TX overflow counter for queue2 + - ``FW_Q3_OVERFLOW``: TX overflow counter for queue3 + - ``FW_Q4_OVERFLOW``: TX overflow counter for queue4 + - ``FW_Q5_OVERFLOW``: TX overflow counter for queue5 + - ``FW_Q6_OVERFLOW``: TX overflow counter for queue6 + - ``FW_Q7_OVERFLOW``: TX overflow counter for queue7 + - ``FW_DROPPED_PKT``: This counter is incremented when a packet is dropped at PRU because of rule violation. 
+ - ``FW_RX_ERROR``: Incremented if there was a CRC error or Min/Max frame error at PRU + - ``FW_RX_DS_INVALID``: Incremented when RTU detects Data Status invalid condition + - ``FW_TX_DROPPED_PACKET``: Counter for packets dropped via TX Port + - ``FW_TX_TS_DROPPED_PACKET``: Counter for packets with TS flag dropped via TX Port + - ``FW_INF_PORT_DISABLED``: Incremented when RX frame is dropped due to port being disabled + - ``FW_INF_SAV``: Incremented when RX frame is dropped due to Source Address violation + - ``FW_INF_SA_DL``: Incremented when RX frame is dropped due to Source Address being in the denylist + - ``FW_INF_PORT_BLOCKED``: Incremented when RX frame is dropped due to port being blocked and frame being a special frame + - ``FW_INF_DROP_TAGGED``: Incremented when RX frame is dropped for being tagged + - ``FW_INF_DROP_PRIOTAGGED``: Incremented when RX frame is dropped for being priority tagged + - ``FW_INF_DROP_NOTAG``: Incremented when RX frame is dropped for being untagged + - ``FW_INF_DROP_NOTMEMBER``: Incremented when RX frame is dropped for port not being member of VLAN + - ``FW_RX_EOF_SHORT_FRMERR``: Incremented if End Of Frame (EOF) task is scheduled without seeing RX_B1 + - ``FW_RX_B0_DROP_EARLY_EOF``: Incremented when frame is dropped due to Early EOF + - ``FW_TX_JUMBO_FRM_CUTOFF``: Incremented when frame is cut off to prevent packet size > 2000 Bytes + - ``FW_RX_EXP_FRAG_Q_DROP``: Incremented when express frame is received in the same queue as the previous fragment + - ``FW_RX_FIFO_OVERRUN``: RX fifo overrun counter + - ``FW_CUT_THR_PKT``: Incremented when a packet is forwarded using Cut-Through forwarding method + - ``FW_HOST_RX_PKT_CNT``: Number of valid packets sent by Rx PRU to Host on PSI + - ``FW_HOST_TX_PKT_CNT``: Number of valid packets copied by RTU0 to Tx queues + - ``FW_HOST_EGRESS_Q_PRE_OVERFLOW``: Host Egress Q (Pre-emptible) Overflow Counter + - ``FW_HOST_EGRESS_Q_EXP_OVERFLOW``: Host Egress Q (Express) Overflow Counter diff --git a/drivers/net/ethernet/ti/icssg/icssg_common.c b/drivers/net/ethernet/ti/icssg/icssg_common.c index b4be76e13a2fb..79f2d86acf219 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_common.c +++ b/drivers/net/ethernet/ti/icssg/icssg_common.c @@ -1318,10 +1318,28 @@ void icssg_ndo_get_stats64(struct net_device *ndev, stats->rx_over_errors = emac_get_stat_by_name(emac, "rx_over_errors"); stats->multicast = emac_get_stat_by_name(emac, "rx_multicast_frames"); - stats->rx_errors = ndev->stats.rx_errors; - stats->rx_dropped = ndev->stats.rx_dropped; + stats->rx_errors = ndev->stats.rx_errors + + emac_get_stat_by_name(emac, "FW_RX_ERROR") + + emac_get_stat_by_name(emac, "FW_RX_EOF_SHORT_FRMERR") + + emac_get_stat_by_name(emac, "FW_RX_B0_DROP_EARLY_EOF") + + emac_get_stat_by_name(emac, "FW_RX_EXP_FRAG_Q_DROP") + + emac_get_stat_by_name(emac, "FW_RX_FIFO_OVERRUN"); + stats->rx_dropped = ndev->stats.rx_dropped + + emac_get_stat_by_name(emac, "FW_DROPPED_PKT") + + emac_get_stat_by_name(emac, "FW_INF_PORT_DISABLED") + + emac_get_stat_by_name(emac, "FW_INF_SAV") + + emac_get_stat_by_name(emac, "FW_INF_SA_DL") + + emac_get_stat_by_name(emac, "FW_INF_PORT_BLOCKED") + + emac_get_stat_by_name(emac, "FW_INF_DROP_TAGGED") + + emac_get_stat_by_name(emac, "FW_INF_DROP_PRIOTAGGED") + + emac_get_stat_by_name(emac, "FW_INF_DROP_NOTAG") + + emac_get_stat_by_name(emac, "FW_INF_DROP_NOTMEMBER"); stats->tx_errors = ndev->stats.tx_errors; - stats->tx_dropped = ndev->stats.tx_dropped; + stats->tx_dropped = ndev->stats.tx_dropped + + 
emac_get_stat_by_name(emac, "FW_RTU_PKT_DROP") + + emac_get_stat_by_name(emac, "FW_TX_DROPPED_PACKET") + + emac_get_stat_by_name(emac, "FW_TX_TS_DROPPED_PACKET") + + emac_get_stat_by_name(emac, "FW_TX_JUMBO_FRM_CUTOFF"); } EXPORT_SYMBOL_GPL(icssg_ndo_get_stats64); diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index b6be4aa57a615..23c465f1ce7ff 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -54,7 +54,7 @@ #define ICSSG_MAX_RFLOWS 8 /* per slice */ -#define ICSSG_NUM_PA_STATS 4 +#define ICSSG_NUM_PA_STATS 32 #define ICSSG_NUM_MIIG_STATS 60 /* Number of ICSSG related stats */ #define ICSSG_NUM_STATS (ICSSG_NUM_MIIG_STATS + ICSSG_NUM_PA_STATS) diff --git a/drivers/net/ethernet/ti/icssg/icssg_stats.c b/drivers/net/ethernet/ti/icssg/icssg_stats.c index 6f0edae38ea24..e8241e998aa9f 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_stats.c +++ b/drivers/net/ethernet/ti/icssg/icssg_stats.c @@ -11,7 +11,6 @@ #define ICSSG_TX_PACKET_OFFSET 0xA0 #define ICSSG_TX_BYTE_OFFSET 0xEC -#define ICSSG_FW_STATS_BASE 0x0248 static u32 stats_base[] = { 0x54c, /* Slice 0 stats start */ 0xb18, /* Slice 1 stats start */ @@ -46,9 +45,8 @@ void emac_update_hardware_stats(struct prueth_emac *emac) if (prueth->pa_stats) { for (i = 0; i < ARRAY_SIZE(icssg_all_pa_stats); i++) { - reg = ICSSG_FW_STATS_BASE + - icssg_all_pa_stats[i].offset * - PRUETH_NUM_MACS + slice * sizeof(u32); + reg = icssg_all_pa_stats[i].offset + + slice * sizeof(u32); regmap_read(prueth->pa_stats, reg, &val); emac->pa_stats[i] += val; } @@ -80,7 +78,7 @@ int emac_get_stat_by_name(struct prueth_emac *emac, char *stat_name) if (emac->prueth->pa_stats) { for (i = 0; i < ARRAY_SIZE(icssg_all_pa_stats); i++) { if (!strcmp(icssg_all_pa_stats[i].name, stat_name)) - return emac->pa_stats[icssg_all_pa_stats[i].offset / sizeof(u32)]; + return emac->pa_stats[i]; } } diff --git a/drivers/net/ethernet/ti/icssg/icssg_stats.h b/drivers/net/ethernet/ti/icssg/icssg_stats.h index e88b919f532cd..5ec0b38e0c67d 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_stats.h +++ b/drivers/net/ethernet/ti/icssg/icssg_stats.h @@ -155,24 +155,10 @@ static const struct icssg_miig_stats icssg_all_miig_stats[] = { ICSSG_MIIG_STATS(tx_bytes, true), }; -/** - * struct pa_stats_regs - ICSSG Firmware maintained PA Stats register - * @fw_rx_cnt: Number of valid packets sent by Rx PRU to Host on PSI - * @fw_tx_cnt: Number of valid packets copied by RTU0 to Tx queues - * @fw_tx_pre_overflow: Host Egress Q (Pre-emptible) Overflow Counter - * @fw_tx_exp_overflow: Host Egress Q (Express) Overflow Counter - */ -struct pa_stats_regs { - u32 fw_rx_cnt; - u32 fw_tx_cnt; - u32 fw_tx_pre_overflow; - u32 fw_tx_exp_overflow; -}; - -#define ICSSG_PA_STATS(field) \ -{ \ - #field, \ - offsetof(struct pa_stats_regs, field), \ +#define ICSSG_PA_STATS(field) \ +{ \ + #field, \ + field, \ } struct icssg_pa_stats { @@ -181,10 +167,38 @@ struct icssg_pa_stats { }; static const struct icssg_pa_stats icssg_all_pa_stats[] = { - ICSSG_PA_STATS(fw_rx_cnt), - ICSSG_PA_STATS(fw_tx_cnt), - ICSSG_PA_STATS(fw_tx_pre_overflow), - ICSSG_PA_STATS(fw_tx_exp_overflow), + ICSSG_PA_STATS(FW_RTU_PKT_DROP), + ICSSG_PA_STATS(FW_Q0_OVERFLOW), + ICSSG_PA_STATS(FW_Q1_OVERFLOW), + ICSSG_PA_STATS(FW_Q2_OVERFLOW), + ICSSG_PA_STATS(FW_Q3_OVERFLOW), + ICSSG_PA_STATS(FW_Q4_OVERFLOW), + ICSSG_PA_STATS(FW_Q5_OVERFLOW), + ICSSG_PA_STATS(FW_Q6_OVERFLOW), + ICSSG_PA_STATS(FW_Q7_OVERFLOW), + ICSSG_PA_STATS(FW_DROPPED_PKT), 
+ ICSSG_PA_STATS(FW_RX_ERROR), + ICSSG_PA_STATS(FW_RX_DS_INVALID), + ICSSG_PA_STATS(FW_TX_DROPPED_PACKET), + ICSSG_PA_STATS(FW_TX_TS_DROPPED_PACKET), + ICSSG_PA_STATS(FW_INF_PORT_DISABLED), + ICSSG_PA_STATS(FW_INF_SAV), + ICSSG_PA_STATS(FW_INF_SA_DL), + ICSSG_PA_STATS(FW_INF_PORT_BLOCKED), + ICSSG_PA_STATS(FW_INF_DROP_TAGGED), + ICSSG_PA_STATS(FW_INF_DROP_PRIOTAGGED), + ICSSG_PA_STATS(FW_INF_DROP_NOTAG), + ICSSG_PA_STATS(FW_INF_DROP_NOTMEMBER), + ICSSG_PA_STATS(FW_RX_EOF_SHORT_FRMERR), + ICSSG_PA_STATS(FW_RX_B0_DROP_EARLY_EOF), + ICSSG_PA_STATS(FW_TX_JUMBO_FRM_CUTOFF), + ICSSG_PA_STATS(FW_RX_EXP_FRAG_Q_DROP), + ICSSG_PA_STATS(FW_RX_FIFO_OVERRUN), + ICSSG_PA_STATS(FW_CUT_THR_PKT), + ICSSG_PA_STATS(FW_HOST_RX_PKT_CNT), + ICSSG_PA_STATS(FW_HOST_TX_PKT_CNT), + ICSSG_PA_STATS(FW_HOST_EGRESS_Q_PRE_OVERFLOW), + ICSSG_PA_STATS(FW_HOST_EGRESS_Q_EXP_OVERFLOW), }; #endif /* __NET_TI_ICSSG_STATS_H */ diff --git a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h index 424a7e945ea84..490a9cc06fb05 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h +++ b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h @@ -231,4 +231,37 @@ /* Start of 32 bits PA_STAT counters */ #define PA_STAT_32b_START_OFFSET 0x0080 +#define FW_RTU_PKT_DROP 0x0088 +#define FW_Q0_OVERFLOW 0x0090 +#define FW_Q1_OVERFLOW 0x0098 +#define FW_Q2_OVERFLOW 0x00A0 +#define FW_Q3_OVERFLOW 0x00A8 +#define FW_Q4_OVERFLOW 0x00B0 +#define FW_Q5_OVERFLOW 0x00B8 +#define FW_Q6_OVERFLOW 0x00C0 +#define FW_Q7_OVERFLOW 0x00C8 +#define FW_DROPPED_PKT 0x00F8 +#define FW_RX_ERROR 0x0100 +#define FW_RX_DS_INVALID 0x0108 +#define FW_TX_DROPPED_PACKET 0x0110 +#define FW_TX_TS_DROPPED_PACKET 0x0118 +#define FW_INF_PORT_DISABLED 0x0120 +#define FW_INF_SAV 0x0128 +#define FW_INF_SA_DL 0x0130 +#define FW_INF_PORT_BLOCKED 0x0138 +#define FW_INF_DROP_TAGGED 0x0140 +#define FW_INF_DROP_PRIOTAGGED 0x0148 +#define FW_INF_DROP_NOTAG 0x0150 +#define FW_INF_DROP_NOTMEMBER 0x0158 +#define FW_RX_EOF_SHORT_FRMERR 0x0188 +#define FW_RX_B0_DROP_EARLY_EOF 0x0190 +#define FW_TX_JUMBO_FRM_CUTOFF 0x0198 +#define FW_RX_EXP_FRAG_Q_DROP 0x01A0 +#define FW_RX_FIFO_OVERRUN 0x01A8 +#define FW_CUT_THR_PKT 0x01B0 +#define FW_HOST_RX_PKT_CNT 0x0248 +#define FW_HOST_TX_PKT_CNT 0x0250 +#define FW_HOST_EGRESS_Q_PRE_OVERFLOW 0x0258 +#define FW_HOST_EGRESS_Q_EXP_OVERFLOW 0x0260 + #endif /* __NET_TI_ICSSG_SWITCH_MAP_H */ From 32607a332cfea5a4b2a185f3e3d605a9bf4f8df0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:18 -0400 Subject: [PATCH 553/770] ipv4: prefer multipath nexthop that matches source address With multipath routes, try to ensure that packets leave on the device that is associated with the source address. Avoid the following tcpdump example: veth0 Out IP 10.1.0.2.38640 > 10.2.0.3.8000: Flags [S] veth1 Out IP 10.1.0.2.38648 > 10.2.0.3.8000: Flags [S] Which can happen easily with the most straightforward setup: ip addr add 10.0.0.1/24 dev veth0 ip addr add 10.1.0.1/24 dev veth1 ip route add 10.2.0.3 nexthop via 10.0.0.2 dev veth0 \ nexthop via 10.1.0.2 dev veth1 This is apparently considered WAI, based on the comment in ip_route_output_key_hash_rcu: * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK It may be ok for some uses of multipath, but not all. For instance, when using two ISPs, a router may drop packets with unknown source. The behavior occurs because tcp_v4_connect makes three route lookups when establishing a connection: 1. 
ip_route_connect calls to select a source address, with saddr zero. 2. ip_route_connect calls again now that saddr and daddr are known. 3. ip_route_newports calls again after a source port is also chosen. With a route with multiple nexthops, each lookup may make a different choice depending on available entropy to fib_select_multipath. So it is possible for 1 to select the saddr from the first entry, but 3 to select the second entry. Leading to the above situation. Address this by preferring a match that matches the flowi4 saddr. This will make 2 and 3 make the same choice as 1. Continue to update the backup choice until a choice that matches saddr is found. Do this in fib_select_multipath itself, rather than passing an fl4_oif constraint, to avoid changing non-multipath route selection. Commit e6b45241c57a ("ipv4: reset flowi parameters on route connect") shows how that may cause regressions. Also read ipv4.sysctl_fib_multipath_use_neigh only once. No need to refresh in the loop. This does not happen in IPv6, which performs only one lookup. Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-2-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip_fib.h | 3 ++- net/ipv4/fib_semantics.c | 39 +++++++++++++++++++++++++-------------- net/ipv4/route.c | 2 +- 3 files changed, 28 insertions(+), 16 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index e3864b74e92a6..48bb3cf414691 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -574,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 5326f1501af02..2371f311a1e14 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2170,34 +2170,45 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? 
fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + if (use_neigh && !fib_good_nh(nexthop_nh)) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2212,7 +2223,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 49cffbe838029..e5e4c71be3afb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2154,7 +2154,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif From 65e9024643c7512ade3aedbb341e11d77ed7abc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:19 -0400 Subject: [PATCH 554/770] ip: load balance tcp connections to single dst addr and port Load balance new TCP connections across nexthops also when they connect to the same service at a single remote address and port. This affects only port-based multipath hashing: fib_multipath_hash_policy 1 or 3. Local connections must choose both a source address and port when connecting to a remote service, in ip_route_connect. This "chicken-and-egg problem" (commit 2d7192d6cbab ("ipv4: Sanitize and simplify ip_route_{connect,newports}()")) is resolved by first selecting a source address, by looking up a route using the zero wildcard source port and address. As a result multiple connections to the same destination address and port have no entropy in fib_multipath_hash. This is not a problem when forwarding, as skb-based hashing has a 4-tuple. Nor when establishing UDP connections, as autobind there selects a port before reaching ip_route_connect. Load balance also TCP, by using a random port in fib_multipath_hash. Port assignment in inet_hash_connect is not atomic with ip_route_connect. Thus ports are unpredictable, effectively random. Implementation details: Do not actually pass a random fl4_sport, as that affects not only hashing, but routing more broadly, and can match a source port based policy route, which existing wildcard port 0 will not. Instead, define a new wildcard flowi flag that is used only for hashing. Selecting a random source is equivalent to just selecting a random hash entirely. But for code clarity, follow the normal 4-tuple hash process and only update this field. fib_multipath_hash can be reached with zero sport from other code paths, so explicitly pass this flowi flag, rather than trying to infer this case in the function itself. 
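The per-family hash helpers then pick the source port key as follows (sketch
of the pattern that the IPv4 and IPv6 hunks below both follow):

	if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
		hash_keys.ports.src = (__force __be16)get_random_u16();
	else
		hash_keys.ports.src = fl4->fl4_sport;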
Signed-off-by: Willem de Bruijn Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-3-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/route.c | 13 ++++++++++--- net/ipv6/route.c | 13 ++++++++++--- net/ipv6/tcp_ipv6.c | 2 ++ 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 2a3f0c42f0925..a1839c278d873 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -39,6 +39,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c08..8e39aa822cf98 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -326,6 +326,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e5e4c71be3afb..507b2e5dec50f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2037,8 +2037,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; @@ -2093,7 +2097,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d0351e95d9161..aa6b45bd35157 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2492,8 +2492,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2547,7 +2551,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, 
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7dcb33f879ee1..e8e68a1426499 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); From 4d0dac499bf384fe3f42acc30906d304c3499dd8 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 24 Apr 2025 10:35:20 -0400 Subject: [PATCH 555/770] selftests/net: test tcp connection load balancing Verify that TCP connections use both routes when connecting multiple times to a remote service over a two nexthop multipath route. Use socat to create the connections. Use tc prio + tc filter to count routes taken, counting SYN packets across the two egress devices. Also verify that the saddr matches that of the device. To avoid flaky tests when testing inherently randomized behavior, set a low bar and pass if even a single SYN is observed on each device. Signed-off-by: Willem de Bruijn Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250424143549.669426-4-willemdebruijn.kernel@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/fib_tests.sh | 120 ++++++++++++++++++++++- tools/testing/selftests/net/lib.sh | 24 +++++ 2 files changed, 143 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 3ea6f886a2109..c58dc4ac2810c 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -11,7 +11,7 @@ TESTS="unregister down carrier nexthop suppress ipv6_notify ipv4_notify \ ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \ ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ - ipv4_mpath_list ipv6_mpath_list" + ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance" VERBOSE=0 PAUSE_ON_FAIL=no @@ -1085,6 +1085,35 @@ route_setup() set +e } +forwarding_cleanup() +{ + cleanup_ns $ns3 + + route_cleanup +} + +# extend route_setup with an ns3 reachable through ns2 over both devices +forwarding_setup() +{ + forwarding_cleanup + + route_setup + + setup_ns ns3 + + ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2 + ip -netns $ns3 link set veth5 up + ip -netns $ns2 link set veth6 up + + ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24 + ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24 + ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2 + + ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad + ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad + ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2 +} + # assumption is that basic add of a single path route works # otherwise just adding an address on an interface is 
broken ipv6_rt_add() @@ -2600,6 +2629,93 @@ ipv6_mpath_list_test() route_cleanup } +tc_set_flower_counter__saddr_syn() { + tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2" +} + +ip_mpath_balance_dep_check() +{ + if [ ! -x "$(command -v socat)" ]; then + echo "socat command not found. Skipping test" + return 1 + fi + + if [ ! -x "$(command -v jq)" ]; then + echo "jq command not found. Skipping test" + return 1 + fi +} + +ip_mpath_balance() { + local -r ipver=$1 + local -r daddr=$2 + local -r num_conn=20 + + for i in $(seq 1 $num_conn); do + ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null & + sleep 0.02 + echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000 + done + + local -r syn0="$(tc_get_flower_counter $ns1 veth1)" + local -r syn1="$(tc_get_flower_counter $ns1 veth3)" + local -r syns=$((syn0+syn1)) + + [ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)" + + [[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]] +} + +ipv4_mpath_balance_test() +{ + echo + echo "IPv4 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 172.16.105.1 \ + nexthop via 172.16.101.2 \ + nexthop via 172.16.103.2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv4.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1 + tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1 + + ip_mpath_balance -4 172.16.105.1 + + log_test $? 0 "IPv4 multipath loadbalance" + + forwarding_cleanup +} + +ipv6_mpath_balance_test() +{ + echo + echo "IPv6 multipath load balance test" + + ip_mpath_balance_dep_check || return 1 + forwarding_setup + + $IP route add 2001:db8:105::1\ + nexthop via 2001:db8:101::2 \ + nexthop via 2001:db8:103::2 + + ip netns exec $ns1 \ + sysctl -q -w net.ipv6.fib_multipath_hash_policy=1 + + tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1 + tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1 + + ip_mpath_balance -6 "[2001:db8:105::1]" + + log_test $? 
0 "IPv6 multipath loadbalance" + + forwarding_cleanup +} + ################################################################################ # usage @@ -2683,6 +2799,8 @@ do fib6_gc_test|ipv6_gc) fib6_gc_test;; ipv4_mpath_list) ipv4_mpath_list_test;; ipv6_mpath_list) ipv6_mpath_list_test;; + ipv4_mpath_balance) ipv4_mpath_balance_test;; + ipv6_mpath_balance) ipv6_mpath_balance_test;; help) echo "Test names: $TESTS"; exit 0;; esac diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index 701905eeff66d..7e1e563186253 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -270,6 +270,30 @@ tc_rule_handle_stats_get() .options.actions[0].stats$selector" } +# attach a qdisc with two children match/no-match and a flower filter to match +tc_set_flower_counter() { + local -r ns=$1 + local -r ipver=$2 + local -r dev=$3 + local -r flower_expr=$4 + + tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \ + priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + + tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo + tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo + + tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \ + flower $flower_expr classid 1:2 +} + +tc_get_flower_counter() { + local -r ns=$1 + local -r dev=$2 + + tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets +} + ret_set_ksft_status() { local ksft_status=$1; shift From fca6170f5a039543fa5f390f1895fde503b80f46 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:05:30 -0700 Subject: [PATCH 556/770] ipv4: fib: Fix fib_info_hash_alloc() allocation type In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) This was allocating many sizeof(struct hlist_head *) when it actually wanted sizeof(struct hlist_head). Luckily these are the same size. Adjust the allocation type to match the assignment. Signed-off-by: Kees Cook Reviewed-by: Simon Horman Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250426060529.work.873-kees@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2371f311a1e14..03959c60d1285 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -365,7 +365,7 @@ static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) { /* The second half is used for prefsrc */ - return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *), + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), GFP_KERNEL); } From 2eea791a75542651c97c3ca8d34d5a594867094a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:07:13 -0700 Subject: [PATCH 557/770] pds_core: Allocate pdsc_viftype_defaults copy with ARRAY_SIZE() In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) This is allocating a copy of pdsc_viftype_defaults, which is an array of struct pdsc_viftype. 
To correctly return "struct pdsc_viftype *" in the future, adjust the allocation to allocating ARRAY_SIZE-many entries. The resulting allocation size is the same. Signed-off-by: Kees Cook Reviewed-by: Shannon Nelson Link: https://patch.msgid.link/20250426060712.work.575-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 9512aa4083f05..0e408f990f0b2 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -414,7 +414,8 @@ static int pdsc_viftypes_init(struct pdsc *pdsc) { enum pds_core_vif_types vt; - pdsc->viftype_status = kzalloc(sizeof(pdsc_viftype_defaults), + pdsc->viftype_status = kcalloc(ARRAY_SIZE(pdsc_viftype_defaults), + sizeof(*pdsc->viftype_status), GFP_KERNEL); if (!pdsc->viftype_status) return -ENOMEM; From 01cbf838c775a2096b8b0704b499012d867332bb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:07:58 -0700 Subject: [PATCH 558/770] net/mlx4_core: Adjust allocation type for buddy->bits In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type is "unsigned long **", but the returned type will be "long **". These are the same size allocation (pointer size) but the types do not match. Adjust the allocation type to match the assignment. Signed-off-by: Kees Cook Reviewed-by: Simon Horman Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250426060757.work.865-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/mr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index d7444782bfdd0..698a5d1f0d7e2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -106,7 +106,7 @@ static int mlx4_buddy_init(struct mlx4_buddy *buddy, int max_order) buddy->max_order = max_order; spin_lock_init(&buddy->lock); - buddy->bits = kcalloc(buddy->max_order + 1, sizeof(long *), + buddy->bits = kcalloc(buddy->max_order + 1, sizeof(*buddy->bits), GFP_KERNEL); buddy->num_free = kcalloc(buddy->max_order + 1, sizeof(*buddy->num_free), GFP_KERNEL); From c636eed60958875e6499043bcb32f69abf24314c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:08:42 -0700 Subject: [PATCH 559/770] nfp: xsk: Adjust allocation type for nn->dp.xsk_pools In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type "struct xsk_buff_pool **", but the returned type will be "struct xsk_buff_pool ***". These are the same allocation size (pointer size), but the types don't match. Adjust the allocation type to match the assignment. 
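The same idiom applies to any array-of-pointers allocation: sizing by the
dereferenced destination keeps the element type and the element size in sync
(illustrative snippet only, not driver code; 'arr' and 'n' are made-up names):

	struct xsk_buff_pool **arr;

	/* sizeof(*arr) is one element of arr, i.e. one pointer */
	arr = kcalloc(n, sizeof(*arr), GFP_KERNEL);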
Signed-off-by: Kees Cook Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250426060841.work.016-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 95514fabadf27..28997ddab9664 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2538,7 +2538,7 @@ nfp_net_alloc(struct pci_dev *pdev, const struct nfp_dev_info *dev_info, nn->dp.num_r_vecs, num_online_cpus()); nn->max_r_vecs = nn->dp.num_r_vecs; - nn->dp.xsk_pools = kcalloc(nn->max_r_vecs, sizeof(nn->dp.xsk_pools), + nn->dp.xsk_pools = kcalloc(nn->max_r_vecs, sizeof(*nn->dp.xsk_pools), GFP_KERNEL); if (!nn->dp.xsk_pools) { err = -ENOMEM; From 5fe6530cd54b8647e71ff450f54f16b7f4c68066 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 23:18:59 -0700 Subject: [PATCH 560/770] ptp: ocp: Add const to bp->attr_group allocation type In preparation for making the kmalloc family of allocators type aware, we need to make sure that the returned type from the allocation matches the type of the variable being assigned. (Before, the allocator would always return "void *", which can be implicitly cast to any pointer type.) The assigned type is "const struct attribute_group **", but the returned type, while technically matching, will be not const qualified. As there is no general way to safely add const qualifiers, adjust the allocation type to match the assignment. Signed-off-by: Kees Cook Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250426061858.work.470-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index faf6e027f89ab..ed5968a3ea5a5 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -2372,7 +2372,7 @@ ptp_ocp_attr_group_add(struct ptp_ocp *bp, if (attr_tbl[i].cap & bp->fw_cap) count++; - bp->attr_group = kcalloc(count + 1, sizeof(struct attribute_group *), + bp->attr_group = kcalloc(count + 1, sizeof(*bp->attr_group), GFP_KERNEL); if (!bp->attr_group) return -ENOMEM; From 187e0216366f3573c894514cf41267df843efd49 Mon Sep 17 00:00:00 2001 From: David Wei Date: Sat, 26 Apr 2025 12:55:24 -0700 Subject: [PATCH 561/770] io_uring/zcrx: selftests: use rand_port() Use rand_port() and stop hard coding port 9999. 
Signed-off-by: David Wei Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250426195525.1906774-2-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/hw/iou-zcrx.py | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index 48b3d27cf472a..a19550419771d 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -5,7 +5,7 @@ from os import path from lib.py import ksft_run, ksft_exit from lib.py import NetDrvEpEnv -from lib.py import bkg, cmd, defer, ethtool, wait_port_listen +from lib.py import bkg, cmd, defer, ethtool, rand_port, wait_port_listen def _get_current_settings(cfg): @@ -28,14 +28,14 @@ def _create_rss_ctx(cfg, chans): return (ctx_id, defer(ethtool, f"-X {cfg.ifname} delete context {ctx_id}", host=cfg.remote)) -def _set_flow_rule(cfg, chan): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout +def _set_flow_rule(cfg, port, chan): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}", host=cfg.remote).stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) -def _set_flow_rule_rss(cfg, chan): - output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout +def _set_flow_rule_rss(cfg, port, chan): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {chan}", host=cfg.remote).stdout values = re.search(r'ID (\d+)', output).group(1) return int(values) @@ -47,22 +47,27 @@ def test_zcrx(cfg) -> None: if combined_chans < 2: raise KsftSkipEx('at least 2 combined channels required') (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) - flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) - rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" + rx_cmd = f"{cfg.bin_remote} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p {port} -l 12840" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): - wait_port_listen(9999, proto="tcp", host=cfg.remote) + wait_port_listen(port, proto="tcp", host=cfg.remote) cmd(tx_cmd) @@ -73,22 +78,27 @@ def test_zcrx_oneshot(cfg) -> None: if combined_chans < 2: raise KsftSkipEx('at least 2 combined channels required') (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} hds-thresh 
{hds_thresh}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) - flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + flow_rule_id = _set_flow_rule(cfg, port, combined_chans - 1) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) - rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 4096 -z 16384" + rx_cmd = f"{cfg.bin_remote} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1} -o 4" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p {port} -l 4096 -z 16384" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): - wait_port_listen(9999, proto="tcp", host=cfg.remote) + wait_port_listen(port, proto="tcp", host=cfg.remote) cmd(tx_cmd) @@ -99,24 +109,28 @@ def test_zcrx_rss(cfg) -> None: if combined_chans < 2: raise KsftSkipEx('at least 2 combined channels required') (rx_ring, hds_thresh) = _get_current_settings(cfg) + port = rand_port() ethtool(f"-G {cfg.ifname} tcp-data-split on", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} tcp-data-split auto", host=cfg.remote) + ethtool(f"-G {cfg.ifname} hds-thresh 0", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} hds-thresh {hds_thresh}", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) defer(ethtool, f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) defer(ethtool, f"-X {cfg.ifname} default", host=cfg.remote) (ctx_id, delete_ctx) = _create_rss_ctx(cfg, combined_chans) - flow_rule_id = _set_flow_rule_rss(cfg, ctx_id) + flow_rule_id = _set_flow_rule_rss(cfg, port, ctx_id) defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) - rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" - tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p 9999 -l 12840" + rx_cmd = f"{cfg.bin_remote} -s -p {port} -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_addr_v['6']} -p {port} -l 12840" with bkg(rx_cmd, host=cfg.remote, exit_wait=True): - wait_port_listen(9999, proto="tcp", host=cfg.remote) + wait_port_listen(port, proto="tcp", host=cfg.remote) cmd(tx_cmd) From 6fbb4d3f7262771c376d1176e04811645d3c0c7b Mon Sep 17 00:00:00 2001 From: David Wei Date: Sat, 26 Apr 2025 12:55:25 -0700 Subject: [PATCH 562/770] io_uring/zcrx: selftests: parse json from ethtool -g Parse JSON from ethtool -g instead of parsing text output. 
Signed-off-by: David Wei Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250426195525.1906774-3-dw@davidwei.uk Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py index a19550419771d..aef43f82edd56 100755 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -9,10 +9,8 @@ def _get_current_settings(cfg): - output = ethtool(f"-g {cfg.ifname}", host=cfg.remote).stdout - rx_ring = re.findall(r'RX:\s+(\d+)', output) - hds_thresh = re.findall(r'HDS thresh:\s+(\d+)', output) - return (int(rx_ring[1]), int(hds_thresh[1])) + output = ethtool(f"-g {cfg.ifname}", json=True, host=cfg.remote)[0] + return (output['rx'], output['hds-thresh']) def _get_combined_channels(cfg): From 2b06aa2bcfb4eee0c5dc882ff8936913b1e6d44f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Sun, 27 Apr 2025 12:59:49 -0700 Subject: [PATCH 563/770] net: phylink: Drop unused defines for SUPPORTED/ADVERTISED_INTERFACES The defines for SUPPORTED_INTERFACES and ADVERTISED_INTERFACES both appear to be unused. I couldn't find anything that actually references them in the original diff that added them and it seems like they have persisted despite using deprecated defines that aren't supposed to be used as per the ethtool.h header that defines the bits they are composed of. Since they are unused, and not supposed to be used anymore I am just dropping the lines of code since they seem to just be occupying space. Signed-off-by: Alexander Duyck Reviewed-by: Russell King (Oracle) Link: https://patch.msgid.link/174578398922.1580647.9720643128205980455.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 1bdd5d8bb5b02..0faa3d97e06b9 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -24,13 +24,6 @@ #include "sfp.h" #include "swphy.h" -#define SUPPORTED_INTERFACES \ - (SUPPORTED_TP | SUPPORTED_MII | SUPPORTED_FIBRE | \ - SUPPORTED_BNC | SUPPORTED_AUI | SUPPORTED_Backplane) -#define ADVERTISED_INTERFACES \ - (ADVERTISED_TP | ADVERTISED_MII | ADVERTISED_FIBRE | \ - ADVERTISED_BNC | ADVERTISED_AUI | ADVERTISED_Backplane) - enum { PHYLINK_DISABLE_STOPPED, PHYLINK_DISABLE_LINK, From eed848871c96d4b5a7b06307755b75abd0cc7a06 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Apr 2025 11:22:06 +0100 Subject: [PATCH 564/770] crypto/krb5: Fix change to use SG miter to use offset The recent patch to make the rfc3961 simplified code use sg_miter rather than manually walking the scatterlist to hash the contents of a buffer described by that scatterlist failed to take the starting offset into account. This is indicated by the selftests reporting: krb5: Running aes128-cts-hmac-sha256-128 mic krb5: !!! TESTFAIL crypto/krb5/selftest.c:446 krb5: MIC mismatch Fix this by calling sg_miter_skip() before doing the loop to advance by the offset. This only affects packet signing modes and not full encryption in RxGK because, for full encryption, the message digest is handled inside the authenc and krb5enc drivers. Note: Nothing in linus/master uses the krb5lib, though the bug is there. It is used by AF_RXRPC's RxGK implementation in -next, no need to backport. 
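The fix is a single call that advances the iterator to the buffer's starting
offset before the hashing loop runs (sketch of the corrected setup, matching
the hunk below):

	sg_miter_start(&miter, sg, sg_nents(sg),
		       SG_MITER_FROM_SG | SG_MITER_LOCAL);
	sg_miter_skip(&miter, offset);	/* honour the caller's starting offset */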
Fixes: da6f9bf40ac2 ("crypto: krb5 - Use SG miter instead of doing it by hand") Reported-by: Marc Dionne Signed-off-by: David Howells cc: Chuck Lever cc: Simon Horman cc: linux-afs@lists.infradead.org Acked-by: Herbert Xu Link: https://patch.msgid.link/3824017.1745835726@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- crypto/krb5/rfc3961_simplified.c | 1 + 1 file changed, 1 insertion(+) diff --git a/crypto/krb5/rfc3961_simplified.c b/crypto/krb5/rfc3961_simplified.c index 79180d28baa9f..e49cbdec7c404 100644 --- a/crypto/krb5/rfc3961_simplified.c +++ b/crypto/krb5/rfc3961_simplified.c @@ -89,6 +89,7 @@ int crypto_shash_update_sg(struct shash_desc *desc, struct scatterlist *sg, sg_miter_start(&miter, sg, sg_nents(sg), SG_MITER_FROM_SG | SG_MITER_LOCAL); + sg_miter_skip(&miter, offset); for (i = 0; i < len; i += n) { sg_miter_next(&miter); n = min(miter.length, len - i); From e7e5ae71831c44d58627a991e603845a2fed2cab Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Fri, 25 Apr 2025 16:50:47 +0100 Subject: [PATCH 565/770] net: dlink: Correct endianness handling of led_mode As its name suggests, parse_eeprom() parses EEPROM data. This is done by reading data, 16 bits at a time as follows: for (i = 0; i < 128; i++) ((__le16 *) sromdata)[i] = cpu_to_le16(read_eeprom(np, i)); sromdata is at the same memory location as psrom. And the type of psrom is a pointer to struct t_SROM. As can be seen in the loop above, data is stored in sromdata, and thus psrom, as 16-bit little-endian values. However, the integer fields of t_SROM are host byte order integers. And in the case of led_mode this leads to a little endian value being incorrectly treated as host byte order. Looking at rio_set_led_mode, this does appear to be a bug as that code masks led_mode with 0x1, 0x2 and 0x8. Logic that would be affected by a reversed byte order. This problem would only manifest on big endian hosts. Found by inspection while investigating a sparse warning regarding the crc field of t_SROM. I believe that warning is a false positive. And although I plan to send a follow-up to use little-endian types for the other integer fields of PSROM_t I do not believe that will involve any bug fixes. Compile tested only. 
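For illustration, the conversion the fix performs (hedged sketch; 'psrom' as
in parse_eeprom()):

	/* EEPROM data was stored as 16-bit little-endian values */
	np->led_mode = le16_to_cpu(psrom->led_mode);	/* now host byte order */

	/* masks such as 0x1, 0x2 and 0x8 in rio_set_led_mode() now test
	 * the intended bits on big endian hosts too
	 */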
Fixes: c3f45d322cbd ("dl2k: Add support for IP1000A-based cards") Signed-off-by: Simon Horman Link: https://patch.msgid.link/20250425-dlink-led-mode-v1-1-6bae3c36e736@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/dlink/dl2k.c | 2 +- drivers/net/ethernet/dlink/dl2k.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/dlink/dl2k.c b/drivers/net/ethernet/dlink/dl2k.c index d88fbecdab4b8..232e839a9d071 100644 --- a/drivers/net/ethernet/dlink/dl2k.c +++ b/drivers/net/ethernet/dlink/dl2k.c @@ -352,7 +352,7 @@ parse_eeprom (struct net_device *dev) eth_hw_addr_set(dev, psrom->mac_addr); if (np->chip_id == CHIP_IP1000A) { - np->led_mode = psrom->led_mode; + np->led_mode = le16_to_cpu(psrom->led_mode); return 0; } diff --git a/drivers/net/ethernet/dlink/dl2k.h b/drivers/net/ethernet/dlink/dl2k.h index 195dc6cfd8955..0e33e2eaae960 100644 --- a/drivers/net/ethernet/dlink/dl2k.h +++ b/drivers/net/ethernet/dlink/dl2k.h @@ -335,7 +335,7 @@ typedef struct t_SROM { u16 sub_system_id; /* 0x06 */ u16 pci_base_1; /* 0x08 (IP1000A only) */ u16 pci_base_2; /* 0x0a (IP1000A only) */ - u16 led_mode; /* 0x0c (IP1000A only) */ + __le16 led_mode; /* 0x0c (IP1000A only) */ u16 reserved1[9]; /* 0x0e-0x1f */ u8 mac_addr[6]; /* 0x20-0x25 */ u8 reserved2[10]; /* 0x26-0x2f */ From b23285e93bef729e67519a5209d5b7fde3b4af50 Mon Sep 17 00:00:00 2001 From: Da Xue Date: Fri, 25 Apr 2025 15:20:09 -0400 Subject: [PATCH 566/770] net: mdio: mux-meson-gxl: set reversed bit when using internal phy This bit is necessary to receive packets from the internal PHY. Without this bit set, no activity occurs on the interface. Normally u-boot sets this bit, but if u-boot is compiled without net support, the interface will be up but without any activity. If the bit is set once, it will work until the IP is powered down or reset. The vendor SDK sets this bit along with the PHY_ID bits. Signed-off-by: Da Xue Fixes: 9a24e1ff4326 ("net: mdio: add amlogic gxl mdio mux support") Link: https://patch.msgid.link/20250425192009.1439508-1-da@libre.computer Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-mux-meson-gxl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/mdio/mdio-mux-meson-gxl.c b/drivers/net/mdio/mdio-mux-meson-gxl.c index 00c66240136b1..3dd12a8c8b03e 100644 --- a/drivers/net/mdio/mdio-mux-meson-gxl.c +++ b/drivers/net/mdio/mdio-mux-meson-gxl.c @@ -17,6 +17,7 @@ #define REG2_LEDACT GENMASK(23, 22) #define REG2_LEDLINK GENMASK(25, 24) #define REG2_DIV4SEL BIT(27) +#define REG2_REVERSED BIT(28) #define REG2_ADCBYPASS BIT(30) #define REG2_CLKINSEL BIT(31) #define ETH_REG3 0x4 @@ -65,7 +66,7 @@ static void gxl_enable_internal_mdio(struct gxl_mdio_mux *priv) * The only constraint is that it must match the one in * drivers/net/phy/meson-gxl.c to properly match the PHY. */ - writel(FIELD_PREP(REG2_PHYID, EPHY_GXL_ID), + writel(REG2_REVERSED | FIELD_PREP(REG2_PHYID, EPHY_GXL_ID), priv->regs + ETH_REG2); /* Enable the internal phy */ From ebaebc5eaf431f7daeeb0a6788a8480d42136df1 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sat, 26 Apr 2025 15:12:19 +0700 Subject: [PATCH 567/770] xsk: respect the offsets when copying frags In commit 560d958c6c68 ("xsk: add generic XSk &xdp_buff -> skb conversion"), we introduced a helper to convert zerocopy xdp_buff to skb. However, in the frag copy, we mistakenly ignored the frag's offset. This commit adds the missing offset when copying frags in xdp_copy_frags_from_zc(). 
This function is not used anywhere so no backport is needed. Signed-off-by: Bui Quang Minh Link: https://patch.msgid.link/20250426081220.40689-2-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index ea819764ae39a..89111c68e5450 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -698,7 +698,8 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, nr_frags = xinfo->nr_frags; for (u32 i = 0; i < nr_frags; i++) { - u32 len = skb_frag_size(&xinfo->frags[i]); + const skb_frag_t *frag = &xinfo->frags[i]; + u32 len = skb_frag_size(frag); u32 offset, truesize = len; netmem_ref netmem; @@ -708,8 +709,8 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, return false; } - memcpy(__netmem_address(netmem), - __netmem_address(xinfo->frags[i].netmem), + memcpy(__netmem_address(netmem) + offset, + __netmem_address(frag->netmem) + skb_frag_off(frag), LARGEST_ALIGN(len)); __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len); From 7ead4405e06f67bf2163fd4708a6ce0a495b1dca Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sat, 26 Apr 2025 15:12:20 +0700 Subject: [PATCH 568/770] xsk: convert xdp_copy_frags_from_zc() to use page_pool_dev_alloc() This commit makes xdp_copy_frags_from_zc() use the page allocation API page_pool_dev_alloc() instead of page_pool_dev_alloc_netmem() to avoid possible confusion of the returned value. Suggested-by: Jakub Kicinski Signed-off-by: Bui Quang Minh Link: https://patch.msgid.link/20250426081220.40689-3-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- net/core/xdp.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 89111c68e5450..4e91c77906711 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -701,21 +701,21 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, const skb_frag_t *frag = &xinfo->frags[i]; u32 len = skb_frag_size(frag); u32 offset, truesize = len; - netmem_ref netmem; + struct page *page; - netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize); - if (unlikely(!netmem)) { + page = page_pool_dev_alloc(pp, &offset, &truesize); + if (unlikely(!page)) { sinfo->nr_frags = i; return false; } - memcpy(__netmem_address(netmem) + offset, - __netmem_address(frag->netmem) + skb_frag_off(frag), + memcpy(page_address(page) + offset, + page_address(skb_frag_page(frag)) + skb_frag_off(frag), LARGEST_ALIGN(len)); - __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len); + __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= netmem_is_pfmemalloc(netmem); + pfmemalloc |= page_is_pfmemalloc(page); } xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, From 8a558cbda51bef09773c72bf74a32047479110c7 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Fri, 4 Apr 2025 12:54:21 +0200 Subject: [PATCH 569/770] idpf: fix potential memory leak on kcalloc() failure If the rss_data->rss_key allocation fails, the function frees vport without freeing the earlier allocated q_vector_idxs. Fix it. Move from freeing in the error branch to a goto scheme. 
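The resulting unwind follows the usual kernel goto pattern, where each failure
jumps to a label that frees everything allocated before it (generic sketch
with made-up names, not the driver code):

	a = kzalloc(sizeof(*a), GFP_KERNEL);
	if (!a)
		return NULL;

	a->b = kcalloc(n, sizeof(*a->b), GFP_KERNEL);
	if (!a->b)
		goto free_a;

	return a;

free_a:
	kfree(a);
	return NULL;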
Fixes: d4d558718266 ("idpf: initialize interrupts and enable vport") Reviewed-by: Pavan Kumar Linga Reviewed-by: Aleksandr Loktionov Suggested-by: Pavan Kumar Linga Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_lib.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c index 730a9c7a59f2b..82f09b4030bce 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_lib.c +++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c @@ -1113,11 +1113,9 @@ static struct idpf_vport *idpf_vport_alloc(struct idpf_adapter *adapter, num_max_q = max(max_q->max_txq, max_q->max_rxq); vport->q_vector_idxs = kcalloc(num_max_q, sizeof(u16), GFP_KERNEL); - if (!vport->q_vector_idxs) { - kfree(vport); + if (!vport->q_vector_idxs) + goto free_vport; - return NULL; - } idpf_vport_init(vport, max_q); /* This alloc is done separate from the LUT because it's not strictly @@ -1127,11 +1125,9 @@ static struct idpf_vport *idpf_vport_alloc(struct idpf_adapter *adapter, rss_data = &adapter->vport_config[idx]->user_config.rss_data; rss_data->rss_key = kzalloc(rss_data->rss_key_size, GFP_KERNEL); - if (!rss_data->rss_key) { - kfree(vport); + if (!rss_data->rss_key) + goto free_vector_idxs; - return NULL; - } /* Initialize default rss key */ netdev_rss_key_fill((void *)rss_data->rss_key, rss_data->rss_key_size); @@ -1144,6 +1140,13 @@ static struct idpf_vport *idpf_vport_alloc(struct idpf_adapter *adapter, adapter->next_vport = idpf_get_free_slot(adapter); return vport; + +free_vector_idxs: + kfree(vport->q_vector_idxs); +free_vport: + kfree(vport); + + return NULL; } /** From ed375b182140eeb9c73609b17939c8a29b27489e Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Thu, 10 Apr 2025 13:52:23 +0200 Subject: [PATCH 570/770] idpf: protect shutdown from reset Before the referenced commit, the shutdown just called idpf_remove(); this way, IDPF_REMOVE_IN_PROG was protecting us from the serv_task rescheduling a reset. Without this flag set, the shutdown process is vulnerable to HW reset or any other triggering conditions (such as the default mailbox being destroyed). When one of the conditions checked in idpf_service_task becomes true, vc_event_task can be rescheduled during shutdown; this leads to accessing freed memory, e.g. idpf_req_rel_vector_indexes() trying to read vport->q_vector_idxs. This in turn causes the system to become defunct during e.g. systemctl kexec. Considering that using IDPF_REMOVE_IN_PROG would lead to a heavier shutdown process, instead just cancel adapter->serv_task before cancelling adapter->vc_event_task, to ensure that a reset will not be scheduled while we are doing a shutdown.
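The ordering is the key point: serv_task is the producer that can schedule vc_event_task, so cancelling the producer first guarantees the consumer cannot be re-queued after it has been cancelled. A minimal sketch of this shutdown shape, with generic names rather than the real idpf structures:

struct demo_adapter {
	struct delayed_work serv_task;		/* may schedule vc_event_task */
	struct delayed_work vc_event_task;	/* reset/virtchnl event handler */
};

static void demo_shutdown(struct demo_adapter *adapter)
{
	/* cancel the producer first so nothing re-queues the consumer */
	cancel_delayed_work_sync(&adapter->serv_task);
	cancel_delayed_work_sync(&adapter->vc_event_task);
}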
Fixes: 4c9106f4906a ("idpf: fix adapter NULL pointer dereference on reboot") Reviewed-by: Michal Swiatkowski Signed-off-by: Larysa Zaremba Reviewed-by: Simon Horman Reviewed-by: Emil Tantilov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/idpf/idpf_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index bec4a02c53733..b35713036a54a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -89,6 +89,7 @@ static void idpf_shutdown(struct pci_dev *pdev) { struct idpf_adapter *adapter = pci_get_drvdata(pdev); + cancel_delayed_work_sync(&adapter->serv_task); cancel_delayed_work_sync(&adapter->vc_event_task); idpf_vc_core_deinit(adapter); idpf_deinit_dflt_mbx(adapter); From c7d6cb96d5c33b5148f3dc76fcd30a9b8cd9e973 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 22 Apr 2025 14:03:09 -0700 Subject: [PATCH 571/770] igc: fix lock order in igc_ptp_reset Commit 1a931c4f5e68 ("igc: add lock preventing multiple simultaneous PTM transactions") added a new mutex to protect concurrent PTM transactions. This lock is acquired in igc_ptp_reset() in order to ensure the PTM registers are properly disabled after a device reset. The flow where the lock is acquired already holds a spinlock, so acquiring a mutex leads to a sleep-while-locking bug, reported both by smatch, and the kernel test robot. The critical section in igc_ptp_reset() does correctly use the readx_poll_timeout_atomic variants, but the standard PTM flow uses regular sleeping variants. This makes converting the mutex to a spinlock a bit tricky. Instead, re-order the locking in igc_ptp_reset. Acquire the mutex first, and then the tmreg_lock spinlock. This is safe because there is no other ordering dependency on these locks, as this is the only place where both locks were acquired simultaneously. Indeed, any other flow acquiring locks in that order would be wrong regardless. Signed-off-by: Jacob Keller Fixes: 1a931c4f5e68 ("igc: add lock preventing multiple simultaneous PTM transactions") Link: https://lore.kernel.org/intel-wired-lan/Z_-P-Hc1yxcw0lTB@stanley.mountain/ Link: https://lore.kernel.org/intel-wired-lan/202504211511.f7738f5d-lkp@intel.com/T/#u Reviewed-by: Przemek Kitszel Reviewed-by: Vitaly Lifshits Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_ptp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_ptp.c b/drivers/net/ethernet/intel/igc/igc_ptp.c index 612ed26a29c5d..efc7b30e42113 100644 --- a/drivers/net/ethernet/intel/igc/igc_ptp.c +++ b/drivers/net/ethernet/intel/igc/igc_ptp.c @@ -1290,6 +1290,8 @@ void igc_ptp_reset(struct igc_adapter *adapter) /* reset the tstamp_config */ igc_ptp_set_timestamp_mode(adapter, &adapter->tstamp_config); + mutex_lock(&adapter->ptm_lock); + spin_lock_irqsave(&adapter->tmreg_lock, flags); switch (adapter->hw.mac.type) { @@ -1308,7 +1310,6 @@ void igc_ptp_reset(struct igc_adapter *adapter) if (!igc_is_crosststamp_supported(adapter)) break; - mutex_lock(&adapter->ptm_lock); wr32(IGC_PCIE_DIG_DELAY, IGC_PCIE_DIG_DELAY_DEFAULT); wr32(IGC_PCIE_PHY_DELAY, IGC_PCIE_PHY_DELAY_DEFAULT); @@ -1332,7 +1333,6 @@ void igc_ptp_reset(struct igc_adapter *adapter) netdev_err(adapter->netdev, "Timeout reading IGC_PTM_STAT register\n"); igc_ptm_reset(hw); - mutex_unlock(&adapter->ptm_lock); break; default: /* No work to do. 
*/ @@ -1349,5 +1349,7 @@ void igc_ptp_reset(struct igc_adapter *adapter) out: spin_unlock_irqrestore(&adapter->tmreg_lock, flags); + mutex_unlock(&adapter->ptm_lock); + wrfl(); } From 6e0490fc36cdac696f96e57b61d93b9ae32e0f4c Mon Sep 17 00:00:00 2001 From: Chad Monroe Date: Sun, 27 Apr 2025 02:05:44 +0100 Subject: [PATCH 572/770] net: ethernet: mtk_eth_soc: fix SER panic with 4GB+ RAM If the mtk_poll_rx() function detects the MTK_RESETTING flag, it will jump to release_desc and fail to refill the high word of the SDP on the 4GB RFB. Subsequently, mtk_rx_clean will process an incorrect SDP, leading to a panic. Add patch from MediaTek's SDK to resolve this. Fixes: 2d75891ebc09 ("net: ethernet: mtk_eth_soc: support 36-bit DMA addressing on MT7988") Link: https://git01.mediatek.com/plugins/gitiles/openwrt/feeds/mtk-openwrt-feeds/+/71f47ea785699c6aa3b922d66c2bdc1a43da25b1 Signed-off-by: Chad Monroe Link: https://patch.msgid.link/4adc2aaeb0fb1b9cdc56bf21cf8e7fa328daa345.1745715843.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 83068925c589d..8fda4ce80d811 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -2248,14 +2248,18 @@ static int mtk_poll_rx(struct napi_struct *napi, int budget, ring->data[idx] = new_data; rxd->rxd1 = (unsigned int)dma_addr; release_desc: + if (MTK_HAS_CAPS(eth->soc->caps, MTK_36BIT_DMA)) { + if (unlikely(dma_addr == DMA_MAPPING_ERROR)) + addr64 = FIELD_GET(RX_DMA_ADDR64_MASK, + rxd->rxd2); + else + addr64 = RX_DMA_PREP_ADDR64(dma_addr); + } + if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) rxd->rxd2 = RX_DMA_LSO; else - rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size); - - if (MTK_HAS_CAPS(eth->soc->caps, MTK_36BIT_DMA) && - likely(dma_addr != DMA_MAPPING_ERROR)) - rxd->rxd2 |= RX_DMA_PREP_ADDR64(dma_addr); + rxd->rxd2 = RX_DMA_PREP_PLEN0(ring->buf_size) | addr64; ring->calc_idx = idx; done++; From 426d487bca38b34f39c483edfc6313a036446b33 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Apr 2025 17:48:55 +0300 Subject: [PATCH 573/770] net: dsa: felix: fix broken taprio gate states after clock jump Simplest setup to reproduce the issue: connect 2 ports of the LS1028A-RDB together (eno0 with swp0) and run: $ ip link set eno0 up && ip link set swp0 up $ tc qdisc replace dev swp0 parent root handle 100 taprio num_tc 8 \ queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 map 0 1 2 3 4 5 6 7 \ base-time 0 sched-entry S 20 300000 sched-entry S 10 200000 \ sched-entry S 20 300000 sched-entry S 48 200000 \ sched-entry S 20 300000 sched-entry S 83 200000 \ sched-entry S 40 300000 sched-entry S 00 200000 flags 2 $ ptp4l -i eno0 -f /etc/linuxptp/configs/gPTP.cfg -m & $ ptp4l -i swp0 -f /etc/linuxptp/configs/gPTP.cfg -m One will observe that the PTP state machine on swp0 starts synchronizing, then it attempts to do a clock step, and after that, it fails to recover from the condition below.
ptp4l[82.427]: selected best master clock 00049f.fffe.05f627 ptp4l[82.428]: port 1 (swp0): MASTER to UNCALIBRATED on RS_SLAVE ptp4l[83.252]: port 1 (swp0): UNCALIBRATED to SLAVE on MASTER_CLOCK_SELECTED ptp4l[83.886]: rms 4537731277 max 9075462553 freq -18518 +/- 11467 delay 818 +/- 0 ptp4l[84.170]: timed out while polling for tx timestamp ptp4l[84.171]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[84.172]: port 1 (swp0): send peer delay request failed ptp4l[84.173]: port 1 (swp0): clearing fault immediately ptp4l[84.269]: port 1 (swp0): SLAVE to LISTENING on INIT_COMPLETE ptp4l[85.303]: timed out while polling for tx timestamp ptp4l[85.304]: increasing tx_timestamp_timeout or increasing kworker priority may correct this issue, but a driver bug likely causes it ptp4l[85.305]: port 1 (swp0): send peer delay response failed ptp4l[85.306]: port 1 (swp0): clearing fault immediately ptp4l[86.304]: timed out while polling for tx timestamp A hint is given by the non-zero statistics for dropped packets which were expecting hardware TX timestamps: $ ethtool --include-statistics -T swp0 (...) Statistics: tx_pkts: 30 tx_lost: 11 tx_err: 0 We know that when PTP clock stepping takes place (from ocelot_ptp_settime64() or from ocelot_ptp_adjtime()), vsc9959_tas_clock_adjust() is called. Another interesting hint is that placing an early return in vsc9959_tas_clock_adjust(), so as to neutralize this function, fixes the issue and TX timestamps are no longer dropped. The debugging function written by me and included below is intended to read the GCL RAM, after the admin schedule became operational, through the two status registers available for this purpose: QSYS_GCL_STATUS_REG_1 and QSYS_GCL_STATUS_REG_2. static void vsc9959_print_tas_gcl(struct ocelot *ocelot) { u32 val, list_length, interval, gate_state; int i, err; err = read_poll_timeout(ocelot_read, val, !(val & QSYS_PARAM_STATUS_REG_8_CONFIG_PENDING), 10, 100000, false, ocelot, QSYS_PARAM_STATUS_REG_8); if (err) { dev_err(ocelot->dev, "Failed to wait for TAS config pending bit to clear: %pe\n", ERR_PTR(err)); return; } val = ocelot_read(ocelot, QSYS_PARAM_STATUS_REG_3); list_length = QSYS_PARAM_STATUS_REG_3_LIST_LENGTH_X(val); dev_info(ocelot->dev, "GCL length: %u\n", list_length); for (i = 0; i < list_length; i++) { ocelot_rmw(ocelot, QSYS_GCL_STATUS_REG_1_GCL_ENTRY_NUM(i), QSYS_GCL_STATUS_REG_1_GCL_ENTRY_NUM_M, QSYS_GCL_STATUS_REG_1); interval = ocelot_read(ocelot, QSYS_GCL_STATUS_REG_2); val = ocelot_read(ocelot, QSYS_GCL_STATUS_REG_1); gate_state = QSYS_GCL_STATUS_REG_1_GATE_STATE_X(val); dev_info(ocelot->dev, "GCL entry %d: states 0x%x interval %u\n", i, gate_state, interval); } } Calling it from two places: after the initial QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE performed by vsc9959_qos_port_tas_set(), and after the one done by vsc9959_tas_clock_adjust(), I notice the following difference.
From the tc-taprio process context, where the schedule was initially configured, the GCL looks like this: mscc_felix 0000:00:00.5: GCL length: 8 mscc_felix 0000:00:00.5: GCL entry 0: states 0x20 interval 300000 mscc_felix 0000:00:00.5: GCL entry 1: states 0x10 interval 200000 mscc_felix 0000:00:00.5: GCL entry 2: states 0x20 interval 300000 mscc_felix 0000:00:00.5: GCL entry 3: states 0x48 interval 200000 mscc_felix 0000:00:00.5: GCL entry 4: states 0x20 interval 300000 mscc_felix 0000:00:00.5: GCL entry 5: states 0x83 interval 200000 mscc_felix 0000:00:00.5: GCL entry 6: states 0x40 interval 300000 mscc_felix 0000:00:00.5: GCL entry 7: states 0x0 interval 200000 But from the ptp4l clock stepping process context, when the vsc9959_tas_clock_adjust() hook is called, the GCL RAM of the operational schedule now looks like this: mscc_felix 0000:00:00.5: GCL length: 8 mscc_felix 0000:00:00.5: GCL entry 0: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 1: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 2: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 3: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 4: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 5: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 6: states 0x0 interval 0 mscc_felix 0000:00:00.5: GCL entry 7: states 0x0 interval 0 I do not have a formal explanation, just experimental conclusions. It appears that after triggering QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE for a port's TAS, the GCL entry RAM is updated anyway, despite what the documentation claims: "Specify the time interval in QSYS::GCL_CFG_REG_2.TIME_INTERVAL. This triggers the actual RAM write with the gate state and the time interval for the entry number specified". We don't touch that register (through vsc9959_tas_gcl_set()) from vsc9959_tas_clock_adjust(), yet the GCL RAM is updated anyway. It seems to be updated with effectively stale memory, which in my testing can hold a variety of things, including even pieces of the previously applied schedule, for particular schedule lengths. As such, in most circumstances it is very difficult to pinpoint this issue, because the newly updated schedule would "behave strangely", but ultimately might still pass traffic to some extent, due to some gate entries still being present in the stale GCL entry RAM. It is easy to miss. With the particular schedule given at the beginning, the GCL RAM "happens" to be reproducibly rewritten with all zeroes, and this is consistent with what we see: when the time-aware shaper has gate entries with all gates closed, traffic is dropped on TX, no wonder we can't retrieve TX timestamps. Rewriting the GCL entry RAM when reapplying the new base time fixes the observed issue. 
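Condensed to its essence, the fix (shown in full in the diff below) makes the clock-adjust path rewrite the GCL entries again, just like the initial configuration path does, before triggering the config change. A rough sketch of that sequence, using only the helper and register names visible in the diff; everything around them is elided:

/* Sketch: inside the locked clock-adjust path, after reprogramming the
 * base time, rewrite every GCL entry before triggering CONFIG_CHANGE,
 * so the hardware never latches stale GCL RAM contents.
 */
for (i = 0; i < taprio->num_entries; i++)
	vsc9959_tas_gcl_set(ocelot, i, &taprio->entries[i]);

ocelot_rmw(ocelot, QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE,
	   QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE,
	   QSYS_TAS_PARAM_CFG_CTRL);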
Fixes: 8670dc33f48b ("net: dsa: felix: update base time of time-aware shaper when adjusting PTP time") Reported-by: Richie Pearn Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250426144859.3128352-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/ocelot/felix_vsc9959.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/ocelot/felix_vsc9959.c b/drivers/net/dsa/ocelot/felix_vsc9959.c index 940f1b71226d6..7b35d24c38d76 100644 --- a/drivers/net/dsa/ocelot/felix_vsc9959.c +++ b/drivers/net/dsa/ocelot/felix_vsc9959.c @@ -1543,7 +1543,7 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot) struct tc_taprio_qopt_offload *taprio; struct ocelot_port *ocelot_port; struct timespec64 base_ts; - int port; + int i, port; u32 val; mutex_lock(&ocelot->fwd_domain_lock); @@ -1575,6 +1575,9 @@ static void vsc9959_tas_clock_adjust(struct ocelot *ocelot) QSYS_PARAM_CFG_REG_3_BASE_TIME_SEC_MSB_M, QSYS_PARAM_CFG_REG_3); + for (i = 0; i < taprio->num_entries; i++) + vsc9959_tas_gcl_set(ocelot, i, &taprio->entries[i]); + ocelot_rmw(ocelot, QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE, QSYS_TAS_PARAM_CFG_CTRL_CONFIG_CHANGE, QSYS_TAS_PARAM_CFG_CTRL); From efa6eb7d77aaf5b05eed25c0ecbf7754cc325c83 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Apr 2025 17:48:56 +0300 Subject: [PATCH 574/770] selftests: net: tsn_lib: create common helper for counting received packets This snippet will be necessary for a future isochron-based test, so provide a simpler high-level interface for counting the received packets. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250426144859.3128352-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ocelot/psfp.sh | 7 +------ tools/testing/selftests/net/forwarding/tsn_lib.sh | 11 +++++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/drivers/net/ocelot/psfp.sh b/tools/testing/selftests/drivers/net/ocelot/psfp.sh index bed748dde4b06..f96a4bc7120f3 100755 --- a/tools/testing/selftests/drivers/net/ocelot/psfp.sh +++ b/tools/testing/selftests/drivers/net/ocelot/psfp.sh @@ -272,12 +272,7 @@ run_test() "" \ "${isochron_dat}" - # Count all received packets by looking at the non-zero RX timestamps - received=$(isochron report \ - --input-file "${isochron_dat}" \ - --printf-format "%u\n" --printf-args "R" | \ - grep -w -v '0' | wc -l) - + received=$(isochron_report_num_received "${isochron_dat}") if [ "${received}" = "${expected}" ]; then RET=0 else diff --git a/tools/testing/selftests/net/forwarding/tsn_lib.sh b/tools/testing/selftests/net/forwarding/tsn_lib.sh index b91bcd8008a99..19da1ccceac82 100644 --- a/tools/testing/selftests/net/forwarding/tsn_lib.sh +++ b/tools/testing/selftests/net/forwarding/tsn_lib.sh @@ -247,3 +247,14 @@ isochron_do() cpufreq_restore ${ISOCHRON_CPU} } + +isochron_report_num_received() +{ + local isochron_dat=$1; shift + + # Count all received packets by looking at the non-zero RX timestamps + isochron report \ + --input-file "${isochron_dat}" \ + --printf-format "%u\n" --printf-args "R" | \ + grep -w -v '0' | wc -l +} From f52fe6efd61f54c5cb0e19ef1fde96cf23048a70 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Apr 2025 17:48:57 +0300 Subject: [PATCH 575/770] selftests: net: tsn_lib: add window_size argument to isochron_do() Make out-of-band testing (send a packet when its traffic class gate is closed, expecting it to be delayed) more predictable by allowing the 
window size to be customized by isochron_do(). From man isochron-send, the window size alters the advance time (the delta between the transmission time of the packet, and its expected TX time when using SO_TXTIME or tc-taprio on the sender). In the absence of the argument, isochron-send defaults to maximizing the advance time (making it equal to the cycle length). The default behavior is exactly what is problematic. An advance time that is too large will make packets intended to be out-of-band still be potentially in-band with an open gate from the schedule's previous cycle. We need to allow that advance time to be reduced. Perhaps a bit confusingly, isochron_do() has a shift_time argument currently, but that does not help here. The shift time shifts both the user space wakeup time and the expected TX time by equal amounts, so it is unable to bring them closer to one another. Set the window size properly for the Ocelot PSFP selftest as well. That used to work due to a very carefully chosen SHIFT_TIME_NS. I've re-tested that the test still works properly. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250426144859.3128352-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/ocelot/psfp.sh | 1 + tools/testing/selftests/net/forwarding/tsn_lib.sh | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/tools/testing/selftests/drivers/net/ocelot/psfp.sh b/tools/testing/selftests/drivers/net/ocelot/psfp.sh index f96a4bc7120f3..8972f42dfe03e 100755 --- a/tools/testing/selftests/drivers/net/ocelot/psfp.sh +++ b/tools/testing/selftests/drivers/net/ocelot/psfp.sh @@ -266,6 +266,7 @@ run_test() "${base_time}" \ "${CYCLE_TIME_NS}" \ "${SHIFT_TIME_NS}" \ + "${GATE_DURATION_NS}" \ "${NUM_PKTS}" \ "${STREAM_VID}" \ "${STREAM_PRIO}" \ diff --git a/tools/testing/selftests/net/forwarding/tsn_lib.sh b/tools/testing/selftests/net/forwarding/tsn_lib.sh index 19da1ccceac82..bcee7960a39f2 100644 --- a/tools/testing/selftests/net/forwarding/tsn_lib.sh +++ b/tools/testing/selftests/net/forwarding/tsn_lib.sh @@ -182,6 +182,7 @@ isochron_do() local base_time=$1; shift local cycle_time=$1; shift local shift_time=$1; shift + local window_size=$1; shift local num_pkts=$1; shift local vid=$1; shift local priority=$1; shift @@ -212,6 +213,10 @@ isochron_do() extra_args="${extra_args} --shift-time=${shift_time}" fi + if ! [ -z "${window_size}" ]; then + extra_args="${extra_args} --window-size=${window_size}" + fi + if [ "${use_l2}" = "true" ]; then extra_args="${extra_args} --l2 --etype=0xdead ${vid}" receiver_extra_args="--l2 --etype=0xdead" From 4eb9da050f005fbbb7d301e8e99cfdb6e4771a0d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Apr 2025 17:48:58 +0300 Subject: [PATCH 576/770] selftests: net: tc_taprio: new test Add a forwarding path test for tc-taprio, based on isochron. This is specifically intended for NICs with an offloaded data path (switchdev/DSA) and requires taprio 'flags 2'. Also, $h1 and $h2 must support hardware timestamping, and $h1 tc-etf offload, for isochron to work. Packets received by a switch while the egress port has a taprio schedule with an open gate for the traffic class must be sent right away. Packets received by the switch while the traffic class gate is closed must be delayed until it opens. Packets received by the switch must be dropped if the gate for the traffic class never opens. Packets should pass if the maximum SDU for the traffic class allows it, and should be dropped otherwise.
The schedule should auto-update itself if clock jumps take place while taprio is installed. Repeat most of the above tests after forcing two clock jumps, one backwards (in Jan 1970) and one back into the present. Symlink it from tools/testing/selftests/drivers/net/dsa, because usually DSA ports have the same MAC address, and we need STABLE_MAC_ADDRS=yes from its forwarding.config for the test to run successfully. Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250426144859.3128352-5-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../selftests/drivers/net/dsa/tc_taprio.sh | 1 + .../selftests/net/forwarding/tc_taprio.sh | 421 ++++++++++++++++++ .../selftests/net/forwarding/tsn_lib.sh | 10 + 3 files changed, 432 insertions(+) create mode 120000 tools/testing/selftests/drivers/net/dsa/tc_taprio.sh create mode 100755 tools/testing/selftests/net/forwarding/tc_taprio.sh diff --git a/tools/testing/selftests/drivers/net/dsa/tc_taprio.sh b/tools/testing/selftests/drivers/net/dsa/tc_taprio.sh new file mode 120000 index 0000000000000..d16a65e7595d9 --- /dev/null +++ b/tools/testing/selftests/drivers/net/dsa/tc_taprio.sh @@ -0,0 +1 @@ +run_net_forwarding_test.sh \ No newline at end of file diff --git a/tools/testing/selftests/net/forwarding/tc_taprio.sh b/tools/testing/selftests/net/forwarding/tc_taprio.sh new file mode 100755 index 0000000000000..8992aeabfe0b4 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/tc_taprio.sh @@ -0,0 +1,421 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ALL_TESTS=" \ + test_clock_jump_backward \ + test_taprio_after_ptp \ + test_max_sdu \ + test_clock_jump_backward_forward \ +" +NUM_NETIFS=4 +source tc_common.sh +source lib.sh +source tsn_lib.sh + +require_command python3 + +# The test assumes the usual topology from the README, where h1 is connected to +# swp1, h2 to swp2, and swp1 and swp2 are together in a bridge. +# Additional assumption: h1 and h2 use the same PHC, and so do swp1 and swp2. +# By synchronizing h1 to swp1 via PTP, h2 is also implicitly synchronized to +# swp1 (and both to CLOCK_REALTIME). 
+h1=${NETIFS[p1]} +swp1=${NETIFS[p2]} +swp2=${NETIFS[p3]} +h2=${NETIFS[p4]} + +UDS_ADDRESS_H1="/var/run/ptp4l_h1" +UDS_ADDRESS_SWP1="/var/run/ptp4l_swp1" + +H1_IPV4="192.0.2.1" +H2_IPV4="192.0.2.2" +H1_IPV6="2001:db8:1::1" +H2_IPV6="2001:db8:1::2" + +# Tunables +NUM_PKTS=100 +STREAM_VID=10 +STREAM_PRIO_1=6 +STREAM_PRIO_2=5 +STREAM_PRIO_3=4 +# PTP uses TC 0 +ALL_GATES=$((1 << 0 | 1 << STREAM_PRIO_1 | 1 << STREAM_PRIO_2)) +# Use a conservative cycle of 10 ms to allow the test to still pass when the +# kernel has some extra overhead like lockdep etc +CYCLE_TIME_NS=10000000 +# Create two Gate Control List entries, one OPEN and one CLOSE, of equal +# durations +GATE_DURATION_NS=$((CYCLE_TIME_NS / 2)) +# Give 2/3 of the cycle time to user space and 1/3 to the kernel +FUDGE_FACTOR=$((CYCLE_TIME_NS / 3)) +# Shift the isochron base time by half the gate time, so that packets are +# always received by swp1 close to the middle of the time slot, to minimize +# inaccuracies due to network sync +SHIFT_TIME_NS=$((GATE_DURATION_NS / 2)) + +path_delay= + +h1_create() +{ + simple_if_init $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h1_destroy() +{ + simple_if_fini $h1 $H1_IPV4/24 $H1_IPV6/64 +} + +h2_create() +{ + simple_if_init $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +h2_destroy() +{ + simple_if_fini $h2 $H2_IPV4/24 $H2_IPV6/64 +} + +switch_create() +{ + local h2_mac_addr=$(mac_get $h2) + + ip link set $swp1 up + ip link set $swp2 up + + ip link add br0 type bridge vlan_filtering 1 + ip link set $swp1 master br0 + ip link set $swp2 master br0 + ip link set br0 up + + bridge vlan add dev $swp2 vid $STREAM_VID + bridge vlan add dev $swp1 vid $STREAM_VID + bridge fdb add dev $swp2 \ + $h2_mac_addr vlan $STREAM_VID static master +} + +switch_destroy() +{ + ip link del br0 +} + +ptp_setup() +{ + # Set up swp1 as a master PHC for h1, synchronized to the local + # CLOCK_REALTIME. + phc2sys_start $UDS_ADDRESS_SWP1 + ptp4l_start $h1 true $UDS_ADDRESS_H1 + ptp4l_start $swp1 false $UDS_ADDRESS_SWP1 +} + +ptp_cleanup() +{ + ptp4l_stop $swp1 + ptp4l_stop $h1 + phc2sys_stop +} + +txtime_setup() +{ + local if_name=$1 + + tc qdisc add dev $if_name clsact + # Classify PTP on TC 7 and isochron on TC 6 + tc filter add dev $if_name egress protocol 0x88f7 \ + flower action skbedit priority 7 + tc filter add dev $if_name egress protocol 802.1Q \ + flower vlan_ethtype 0xdead action skbedit priority 6 + tc qdisc add dev $if_name handle 100: parent root mqprio num_tc 8 \ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ + map 0 1 2 3 4 5 6 7 \ + hw 1 + # Set up TC 5, 6, 7 for SO_TXTIME. tc-mqprio queues count from 1. + tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_1 + 1)) etf \ + clockid CLOCK_TAI offload delta $FUDGE_FACTOR + tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_2 + 1)) etf \ + clockid CLOCK_TAI offload delta $FUDGE_FACTOR + tc qdisc replace dev $if_name parent 100:$((STREAM_PRIO_3 + 1)) etf \ + clockid CLOCK_TAI offload delta $FUDGE_FACTOR +} + +txtime_cleanup() +{ + local if_name=$1 + + tc qdisc del dev $if_name clsact + tc qdisc del dev $if_name root +} + +taprio_replace() +{ + local if_name="$1"; shift + local extra_args="$1"; shift + + # STREAM_PRIO_1 always has an open gate. + # STREAM_PRIO_2 has a gate open for GATE_DURATION_NS (half the cycle time) + # STREAM_PRIO_3 always has a closed gate. 
+ tc qdisc replace dev $if_name root stab overhead 24 taprio num_tc 8 \ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 \ + map 0 1 2 3 4 5 6 7 \ + sched-entry S $(printf "%x" $ALL_GATES) $GATE_DURATION_NS \ + sched-entry S $(printf "%x" $((ALL_GATES & ~(1 << STREAM_PRIO_2)))) $GATE_DURATION_NS \ + base-time 0 flags 0x2 $extra_args + taprio_wait_for_admin $if_name +} + +taprio_cleanup() +{ + local if_name=$1 + + tc qdisc del dev $if_name root +} + +probe_path_delay() +{ + local isochron_dat="$(mktemp)" + local received + + log_info "Probing path delay" + + isochron_do "$h1" "$h2" "$UDS_ADDRESS_H1" "" 0 \ + "$CYCLE_TIME_NS" "" "" "$NUM_PKTS" \ + "$STREAM_VID" "$STREAM_PRIO_1" "" "$isochron_dat" + + received=$(isochron_report_num_received "$isochron_dat") + if [ "$received" != "$NUM_PKTS" ]; then + echo "Cannot establish basic data path between $h1 and $h2" + exit $ksft_fail + fi + + printf "pdelay = {}\n" > isochron_data.py + isochron report --input-file "$isochron_dat" \ + --printf-format "pdelay[%u] = %d - %d\n" \ + --printf-args "qRT" \ + >> isochron_data.py + cat <<-'EOF' > isochron_postprocess.py + #!/usr/bin/env python3 + + from isochron_data import pdelay + import numpy as np + + w = np.array(list(pdelay.values())) + print("{}".format(np.max(w))) + EOF + path_delay=$(python3 ./isochron_postprocess.py) + + log_info "Path delay from $h1 to $h2 estimated at $path_delay ns" + + if [ "$path_delay" -gt "$GATE_DURATION_NS" ]; then + echo "Path delay larger than gate duration, aborting" + exit $ksft_fail + fi + + rm -f ./isochron_data.py 2> /dev/null + rm -f ./isochron_postprocess.py 2> /dev/null + rm -f "$isochron_dat" 2> /dev/null +} + +setup_prepare() +{ + vrf_prepare + + h1_create + h2_create + switch_create + + txtime_setup $h1 + + # Temporarily set up PTP just to probe the end-to-end path delay. + ptp_setup + probe_path_delay + ptp_cleanup +} + +cleanup() +{ + pre_cleanup + + isochron_recv_stop + txtime_cleanup $h1 + + switch_destroy + h2_destroy + h1_destroy + + vrf_cleanup +} + +run_test() +{ + local base_time=$1; shift + local stream_prio=$1; shift + local expected_delay=$1; shift + local should_fail=$1; shift + local test_name=$1; shift + local isochron_dat="$(mktemp)" + local received + local median_delay + + RET=0 + + # Set the shift time equal to the cycle time, which effectively + # cancels the default advance time. Packets won't be sent early in + # software, which ensures that they won't prematurely enter through + # the open gate in __test_out_of_band(). Also, the gate is open for + # long enough that this won't cause a problem in __test_in_band(). + isochron_do "$h1" "$h2" "$UDS_ADDRESS_H1" "" "$base_time" \ + "$CYCLE_TIME_NS" "$SHIFT_TIME_NS" "$GATE_DURATION_NS" \ + "$NUM_PKTS" "$STREAM_VID" "$stream_prio" "" "$isochron_dat" + + received=$(isochron_report_num_received "$isochron_dat") + [ "$received" = "$NUM_PKTS" ] + check_err_fail $should_fail $? 
"Reception of $NUM_PKTS packets" + + if [ $should_fail = 0 ] && [ "$received" = "$NUM_PKTS" ]; then + printf "pdelay = {}\n" > isochron_data.py + isochron report --input-file "$isochron_dat" \ + --printf-format "pdelay[%u] = %d - %d\n" \ + --printf-args "qRT" \ + >> isochron_data.py + cat <<-'EOF' > isochron_postprocess.py + #!/usr/bin/env python3 + + from isochron_data import pdelay + import numpy as np + + w = np.array(list(pdelay.values())) + print("{}".format(int(np.median(w)))) + EOF + median_delay=$(python3 ./isochron_postprocess.py) + + # If the condition below is true, packets were delayed by a closed gate + [ "$median_delay" -gt $((path_delay + expected_delay)) ] + check_fail $? "Median delay $median_delay is greater than expected delay $expected_delay plus path delay $path_delay" + + # If the condition below is true, packets were sent expecting them to + # hit a closed gate in the switch, but were not delayed + [ "$expected_delay" -gt 0 ] && [ "$median_delay" -lt "$expected_delay" ] + check_fail $? "Median delay $median_delay is less than expected delay $expected_delay" + fi + + log_test "$test_name" + + rm -f ./isochron_data.py 2> /dev/null + rm -f ./isochron_postprocess.py 2> /dev/null + rm -f "$isochron_dat" 2> /dev/null +} + +__test_always_open() +{ + run_test 0.000000000 $STREAM_PRIO_1 0 0 "Gate always open" +} + +__test_always_closed() +{ + run_test 0.000000000 $STREAM_PRIO_3 0 1 "Gate always closed" +} + +__test_in_band() +{ + # Send packets in-band with the OPEN gate entry + run_test 0.000000000 $STREAM_PRIO_2 0 0 "In band with gate" +} + +__test_out_of_band() +{ + # Send packets in-band with the CLOSE gate entry + run_test 0.005000000 $STREAM_PRIO_2 \ + $((GATE_DURATION_NS - SHIFT_TIME_NS)) 0 \ + "Out of band with gate" +} + +run_subtests() +{ + __test_always_open + __test_always_closed + __test_in_band + __test_out_of_band +} + +test_taprio_after_ptp() +{ + log_info "Setting up taprio after PTP" + ptp_setup + taprio_replace $swp2 + run_subtests + taprio_cleanup $swp2 + ptp_cleanup +} + +__test_under_max_sdu() +{ + # Limit max-sdu for STREAM_PRIO_1 + taprio_replace "$swp2" "max-sdu 0 0 0 0 0 0 100 0" + run_test 0.000000000 $STREAM_PRIO_1 0 0 "Under maximum SDU" +} + +__test_over_max_sdu() +{ + # Limit max-sdu for STREAM_PRIO_1 + taprio_replace "$swp2" "max-sdu 0 0 0 0 0 0 20 0" + run_test 0.000000000 $STREAM_PRIO_1 0 1 "Over maximum SDU" +} + +test_max_sdu() +{ + ptp_setup + __test_under_max_sdu + __test_over_max_sdu + taprio_cleanup $swp2 + ptp_cleanup +} + +# Perform a clock jump in the past without synchronization running, so that the +# time base remains where it was set by phc_ctl. +test_clock_jump_backward() +{ + # This is a more complex schedule specifically crafted in a way that + # has been problematic on NXP LS1028A. Not much to test with it other + # than the fact that it passes traffic. + tc qdisc replace dev $swp2 root stab overhead 24 taprio num_tc 8 \ + queues 1@0 1@1 1@2 1@3 1@4 1@5 1@6 1@7 map 0 1 2 3 4 5 6 7 \ + base-time 0 sched-entry S 20 300000 sched-entry S 10 200000 \ + sched-entry S 20 300000 sched-entry S 48 200000 \ + sched-entry S 20 300000 sched-entry S 83 200000 \ + sched-entry S 40 300000 sched-entry S 00 200000 flags 2 + + log_info "Forcing a backward clock jump" + phc_ctl $swp1 set 0 + + ping_test $h1 192.0.2.2 + taprio_cleanup $swp2 +} + +# Test that taprio tolerates clock jumps. +# Since ptp4l and phc2sys are running, it is expected for the time to +# eventually recover (through yet another clock jump). 
Isochron waits +# until that is the case. +test_clock_jump_backward_forward() +{ + log_info "Forcing a backward and a forward clock jump" + taprio_replace $swp2 + phc_ctl $swp1 set 0 + ptp_setup + ping_test $h1 192.0.2.2 + run_subtests + ptp_cleanup + taprio_cleanup $swp2 +} + +tc_offload_check +if [[ $? -ne 0 ]]; then + log_test_skip "Could not test offloaded functionality" + exit $EXIT_STATUS +fi + +trap cleanup EXIT + +setup_prepare +setup_wait +tests_run + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/tsn_lib.sh b/tools/testing/selftests/net/forwarding/tsn_lib.sh index bcee7960a39f2..08c044ff6689d 100644 --- a/tools/testing/selftests/net/forwarding/tsn_lib.sh +++ b/tools/testing/selftests/net/forwarding/tsn_lib.sh @@ -2,6 +2,8 @@ # SPDX-License-Identifier: GPL-2.0 # Copyright 2021-2022 NXP +tc_testing_scripts_dir=$(dirname $0)/../../tc-testing/scripts + REQUIRE_ISOCHRON=${REQUIRE_ISOCHRON:=yes} REQUIRE_LINUXPTP=${REQUIRE_LINUXPTP:=yes} @@ -18,6 +20,7 @@ fi if [[ "$REQUIRE_LINUXPTP" = "yes" ]]; then require_command phc2sys require_command ptp4l + require_command phc_ctl fi phc2sys_start() @@ -263,3 +266,10 @@ isochron_report_num_received() --printf-format "%u\n" --printf-args "R" | \ grep -w -v '0' | wc -l } + +taprio_wait_for_admin() +{ + local if_name="$1"; shift + + "$tc_testing_scripts_dir/taprio_wait_for_admin.sh" "$(which tc)" "$if_name" +} From b936a9b8d4a585ccb6d454921c36286bfe63e01d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 26 Apr 2025 17:32:09 +0200 Subject: [PATCH 577/770] net: ipv6: fix UDPv6 GSO segmentation with NAT If any address or port is changed, update it in all packets and recalculate checksum. Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") Signed-off-by: Felix Fietkau Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250426153210.14044-1-nbd@nbd.name Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 61 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 2c0725583be39..9a8142ccbabe4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -247,6 +247,62 @@ static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) return segs; } +static void __udpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct udphdr *uh = udp_hdr(seg); + + if (ipv6_addr_equal(oldip, newip) && *oldport == newport) + return; + + if (uh->check) { + inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32, + newip->s6_addr32, true); + + inet_proto_csum_replace2(&uh->check, seg, *oldport, newport, + false); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + + *oldip = *newip; + *oldport = newport; +} + +static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct ipv6hdr *iph; + const struct udphdr *uh; + struct ipv6hdr *iph2; + struct sk_buff *seg; + struct udphdr *uh2; + + seg = segs; + uh = udp_hdr(seg); + iph = ipv6_hdr(seg); + uh2 = udp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + uh2 = udp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &uh2->source, uh->source); + __udpv6_gso_segment_csum(seg, 
&iph2->daddr, &iph->daddr, + &uh2->dest, uh->dest); + } + + return segs; +} + static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) @@ -259,7 +315,10 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); - return is_ipv6 ? skb : __udpv4_gso_segment_list_csum(skb); + if (is_ipv6) + return __udpv6_gso_segment_list_csum(skb); + else + return __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, From aa6dcab1ea92a8a1f5a2ff7dec825f25eeebf17d Mon Sep 17 00:00:00 2001 From: Aryan Srivastava Date: Tue, 29 Apr 2025 09:49:20 +1200 Subject: [PATCH 578/770] net: phy: aquantia: fix commenting format Comment was erroneously added with /**, amend this to use /* as it is not a kernel-doc. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504262247.1UBrDBVN-lkp@intel.com/ Signed-off-by: Aryan Srivastava Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250428214920.813038-1-aryan.srivastava@alliedtelesis.co.nz Signed-off-by: Jakub Kicinski --- drivers/net/phy/aquantia/aquantia_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/phy/aquantia/aquantia_main.c b/drivers/net/phy/aquantia/aquantia_main.c index 08b1c9cc902b8..77a48635d7bf0 100644 --- a/drivers/net/phy/aquantia/aquantia_main.c +++ b/drivers/net/phy/aquantia/aquantia_main.c @@ -516,8 +516,7 @@ static int aqr105_read_status(struct phy_device *phydev) if (!phydev->link || phydev->autoneg == AUTONEG_DISABLE) return 0; - /** - * The status register is not immediately correct on line side link up. + /* The status register is not immediately correct on line side link up. * Poll periodically until it reflects the correct ON state. * Only return fail for read error, timeout defaults to OFF state. */ @@ -634,8 +633,7 @@ static int aqr107_read_status(struct phy_device *phydev) if (!phydev->link || phydev->autoneg == AUTONEG_DISABLE) return 0; - /** - * The status register is not immediately correct on line side link up. + /* The status register is not immediately correct on line side link up. * Poll periodically until it reflects the correct ON state. * Only return fail for read error, timeout defaults to OFF state. */ From 6fc54c408dc9103bf98470b3877b741c44642baa Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:39 +0100 Subject: [PATCH 579/770] igb: Link IRQs to NAPI instances Link IRQs to NAPI instances via netdev-genl API. 
This allows users to query that information via netlink: |$ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ | --dump napi-get --json='{"ifindex": 2}' |[{'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8204, | 'ifindex': 2, | 'irq': 127, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8203, | 'ifindex': 2, | 'irq': 126, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8202, | 'ifindex': 2, | 'irq': 125, | 'irq-suspend-timeout': 0}, | {'defer-hard-irqs': 0, | 'gro-flush-timeout': 0, | 'id': 8201, | 'ifindex': 2, | 'irq': 124, | 'irq-suspend-timeout': 0}] |$ cat /proc/interrupts | grep enp2s0 |123: 0 1 IR-PCI-MSIX-0000:02:00.0 0-edge enp2s0 |124: 0 7 IR-PCI-MSIX-0000:02:00.0 1-edge enp2s0-TxRx-0 |125: 0 0 IR-PCI-MSIX-0000:02:00.0 2-edge enp2s0-TxRx-1 |126: 0 5 IR-PCI-MSIX-0000:02:00.0 3-edge enp2s0-TxRx-2 |127: 0 0 IR-PCI-MSIX-0000:02:00.0 4-edge enp2s0-TxRx-3 Reviewed-by: Joe Damato Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index c646c71915f03..401af6dce2a85 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -947,6 +947,9 @@ static int igb_request_msix(struct igb_adapter *adapter) q_vector); if (err) goto err_free; + + netif_napi_set_irq(&q_vector->napi, + adapter->msix_entries[vector].vector); } igb_configure_msix(adapter); From b75a1dea500f4397a46e971cd41e613b7f18e317 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:40 +0100 Subject: [PATCH 580/770] igb: Link queues to NAPI instances Link queues to NAPI instances via netdev-genl API. This is required to use XDP/ZC busy polling. See commit 5ef44b3cb43b ("xsk: Bring back busy polling support") for details. This also allows users to query the info with netlink: |$ ./tools/net/ynl/pyynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ | --dump queue-get --json='{"ifindex": 2}' |[{'id': 0, 'ifindex': 2, 'napi-id': 8201, 'type': 'rx'}, | {'id': 1, 'ifindex': 2, 'napi-id': 8202, 'type': 'rx'}, | {'id': 2, 'ifindex': 2, 'napi-id': 8203, 'type': 'rx'}, | {'id': 3, 'ifindex': 2, 'napi-id': 8204, 'type': 'rx'}, | {'id': 0, 'ifindex': 2, 'napi-id': 8201, 'type': 'tx'}, | {'id': 1, 'ifindex': 2, 'napi-id': 8202, 'type': 'tx'}, | {'id': 2, 'ifindex': 2, 'napi-id': 8203, 'type': 'tx'}, | {'id': 3, 'ifindex': 2, 'napi-id': 8204, 'type': 'tx'}] Add rtnl locking to PCI error handlers, because netif_queue_set_napi() requires the lock held. While at __igb_open() use RCT coding style. 
Signed-off-by: Kurt Kanzenbach Reviewed-by: Joe Damato Tested-by: Sweta Kumari Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 2 ++ drivers/net/ethernet/intel/igb/igb_main.c | 43 ++++++++++++++++++++--- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 02f340280d20a..79eca385a751b 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -722,6 +722,8 @@ enum igb_boards { extern char igb_driver_name[]; +void igb_set_queue_napi(struct igb_adapter *adapter, int q_idx, + struct napi_struct *napi); int igb_xmit_xdp_ring(struct igb_adapter *adapter, struct igb_ring *ring, struct xdp_frame *xdpf); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 401af6dce2a85..0535cc72b11b7 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2099,6 +2099,22 @@ static void igb_check_swap_media(struct igb_adapter *adapter) wr32(E1000_CTRL_EXT, ctrl_ext); } +void igb_set_queue_napi(struct igb_adapter *adapter, int vector, + struct napi_struct *napi) +{ + struct igb_q_vector *q_vector = adapter->q_vector[vector]; + + if (q_vector->rx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->rx.ring->queue_index, + NETDEV_QUEUE_TYPE_RX, napi); + + if (q_vector->tx.ring) + netif_queue_set_napi(adapter->netdev, + q_vector->tx.ring->queue_index, + NETDEV_QUEUE_TYPE_TX, napi); +} + /** * igb_up - Open the interface and prepare it to handle traffic * @adapter: board private structure @@ -2106,6 +2122,7 @@ static void igb_check_swap_media(struct igb_adapter *adapter) int igb_up(struct igb_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; + struct napi_struct *napi; int i; /* hardware has been reset, we need to reload some things */ @@ -2113,8 +2130,11 @@ int igb_up(struct igb_adapter *adapter) clear_bit(__IGB_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - napi_enable(&(adapter->q_vector[i]->napi)); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igb_set_queue_napi(adapter, i, napi); + } if (adapter->flags & IGB_FLAG_HAS_MSIX) igb_configure_msix(adapter); @@ -2184,6 +2204,7 @@ void igb_down(struct igb_adapter *adapter) for (i = 0; i < adapter->num_q_vectors; i++) { if (adapter->q_vector[i]) { napi_synchronize(&adapter->q_vector[i]->napi); + igb_set_queue_napi(adapter, i, NULL); napi_disable(&adapter->q_vector[i]->napi); } } @@ -4116,8 +4137,9 @@ static int igb_sw_init(struct igb_adapter *adapter) static int __igb_open(struct net_device *netdev, bool resuming) { struct igb_adapter *adapter = netdev_priv(netdev); - struct e1000_hw *hw = &adapter->hw; struct pci_dev *pdev = adapter->pdev; + struct e1000_hw *hw = &adapter->hw; + struct napi_struct *napi; int err; int i; @@ -4169,8 +4191,11 @@ static int __igb_open(struct net_device *netdev, bool resuming) /* From here on the code is the same as igb_up() */ clear_bit(__IGB_DOWN, &adapter->state); - for (i = 0; i < adapter->num_q_vectors; i++) - napi_enable(&(adapter->q_vector[i]->napi)); + for (i = 0; i < adapter->num_q_vectors; i++) { + napi = &adapter->q_vector[i]->napi; + napi_enable(napi); + igb_set_queue_napi(adapter, i, napi); + } /* Clear any pending interrupts. 
*/ rd32(E1000_TSICR); @@ -9677,8 +9702,11 @@ static pci_ers_result_t igb_io_error_detected(struct pci_dev *pdev, if (state == pci_channel_io_perm_failure) return PCI_ERS_RESULT_DISCONNECT; + rtnl_lock(); if (netif_running(netdev)) igb_down(adapter); + rtnl_unlock(); + pci_disable_device(pdev); /* Request a slot reset. */ @@ -9737,16 +9765,21 @@ static void igb_io_resume(struct pci_dev *pdev) struct net_device *netdev = pci_get_drvdata(pdev); struct igb_adapter *adapter = netdev_priv(netdev); + rtnl_lock(); if (netif_running(netdev)) { if (!test_bit(__IGB_DOWN, &adapter->state)) { dev_dbg(&pdev->dev, "Resuming from non-fatal error, do nothing.\n"); + rtnl_unlock(); return; } + if (igb_up(adapter)) { dev_err(&pdev->dev, "igb_up failed after reset\n"); + rtnl_unlock(); return; } } + rtnl_unlock(); netif_device_attach(netdev); From fc0fb1f116e97012d90bb77a6032972564ed8fd6 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:41 +0100 Subject: [PATCH 581/770] igb: Add support for persistent NAPI config Use netif_napi_add_config() to assign persistent per-NAPI config. This is useful for preserving NAPI settings when changing queue counts or for user space programs using SO_INCOMING_NAPI_ID. Reviewed-by: Joe Damato Reviewed-by: Aleksandr Loktionov Tested-by: Rinitha S (A Contingent worker at Intel) Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 0535cc72b11b7..d9205573886eb 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1197,7 +1197,8 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter, return -ENOMEM; /* initialize NAPI */ - netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll); + netif_napi_add_config(adapter->netdev, &q_vector->napi, igb_poll, + v_idx); /* tie q_vector and adapter together */ adapter->q_vector[v_idx] = q_vector; From a22ed15c99a052c6ea015b516acfd7e02bcdb995 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 19 Mar 2025 11:26:42 +0100 Subject: [PATCH 582/770] igb: Get rid of spurious interrupts When running the igb with XDP/ZC in busy polling mode with deferral of hard interrupts, interrupts still happen from time to time. That is caused by the igb task watchdog which triggers Rx interrupts periodically. That mechanism has been introduced to overcome skb/memory allocation failures [1]. So the Rx clean functions stop processing the Rx ring in case of such failure. The task watchdog triggers Rx interrupts periodically in the hope that memory became available in the meantime. The current behavior is undesirable for real time applications, because the driver-induced Rx interrupts also trigger softirq processing. However, all real time packets should be processed by the application which uses the busy polling method. Therefore, only trigger the Rx interrupts in case of real allocation failures. Introduce a new flag for signaling that condition. Follow the same logic as in commit 8dcf2c212078 ("igc: Get rid of spurious interrupts").
[1] - https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git/commit/?id=3be507547e6177e5c808544bd6a2efa2c7f1d436 Reviewed-by: Joe Damato Reviewed-by: Aleksandr Loktionov Tested-by: Sweta Kumari Signed-off-by: Kurt Kanzenbach Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igb/igb.h | 3 ++- drivers/net/ethernet/intel/igb/igb_main.c | 29 +++++++++++++++++++---- drivers/net/ethernet/intel/igb/igb_xsk.c | 1 + 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 79eca385a751b..f34ead8243e9f 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -391,7 +391,8 @@ enum e1000_ring_flags_t { IGB_RING_FLAG_RX_LB_VLAN_BSWAP, IGB_RING_FLAG_TX_CTX_IDX, IGB_RING_FLAG_TX_DETECT_HANG, - IGB_RING_FLAG_TX_DISABLED + IGB_RING_FLAG_TX_DISABLED, + IGB_RING_FLAG_RX_ALLOC_FAILED, }; #define ring_uses_large_buffer(ring) \ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index d9205573886eb..9e9a5900e6e5a 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5755,11 +5755,29 @@ static void igb_watchdog_task(struct work_struct *work) if (adapter->flags & IGB_FLAG_HAS_MSIX) { u32 eics = 0; - for (i = 0; i < adapter->num_q_vectors; i++) - eics |= adapter->q_vector[i]->eims_value; - wr32(E1000_EICS, eics); + for (i = 0; i < adapter->num_q_vectors; i++) { + struct igb_q_vector *q_vector = adapter->q_vector[i]; + struct igb_ring *rx_ring; + + if (!q_vector->rx.ring) + continue; + + rx_ring = adapter->rx_ring[q_vector->rx.ring->queue_index]; + + if (test_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags)) { + eics |= q_vector->eims_value; + clear_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); + } + } + if (eics) + wr32(E1000_EICS, eics); } else { - wr32(E1000_ICS, E1000_ICS_RXDMT0); + struct igb_ring *rx_ring = adapter->rx_ring[0]; + + if (test_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags)) { + clear_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); + wr32(E1000_ICS, E1000_ICS_RXDMT0); + } } igb_spoof_check(adapter); @@ -9090,6 +9108,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) if (!xdp_res && !skb) { rx_ring->rx_stats.alloc_failed++; rx_buffer->pagecnt_bias++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); break; } @@ -9149,6 +9168,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, page = dev_alloc_pages(igb_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); return false; } @@ -9165,6 +9185,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, __free_pages(page, igb_rx_pg_order(rx_ring)); rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); return false; } diff --git a/drivers/net/ethernet/intel/igb/igb_xsk.c b/drivers/net/ethernet/intel/igb/igb_xsk.c index 157d43787fa0b..5cf67ba292694 100644 --- a/drivers/net/ethernet/intel/igb/igb_xsk.c +++ b/drivers/net/ethernet/intel/igb/igb_xsk.c @@ -415,6 +415,7 @@ int igb_clean_rx_irq_zc(struct igb_q_vector *q_vector, /* exit if we failed to retrieve a buffer */ if (!skb) { rx_ring->rx_stats.alloc_failed++; + set_bit(IGB_RING_FLAG_RX_ALLOC_FAILED, &rx_ring->flags); break; } From 68f37f26b0ff28c93d94084178b946136da6d140 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 21 Mar 2025 14:52:38 +0100 Subject: 
[PATCH 583/770] igc: Limit netdev_tc calls to MQPRIO Limit netdev_tc calls to MQPRIO. Currently these calls are made in igc_tsn_enable_offload() and igc_tsn_disable_offload() which are used by TAPRIO and ETF as well. However, these are only required for MQPRIO. Signed-off-by: Kurt Kanzenbach Reviewed-by: Simon Horman Tested-by: Mor Bar-Gabay Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 18 +++++++++++++++++- drivers/net/ethernet/intel/igc/igc_tsn.c | 20 -------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 5b06765a35e9a..27575a1e1777f 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -6730,13 +6730,14 @@ static int igc_tsn_enable_mqprio(struct igc_adapter *adapter, struct tc_mqprio_qopt_offload *mqprio) { struct igc_hw *hw = &adapter->hw; - int i; + int err, i; if (hw->mac.type != igc_i225) return -EOPNOTSUPP; if (!mqprio->qopt.num_tc) { adapter->strict_priority_enable = false; + netdev_reset_tc(adapter->netdev); goto apply; } @@ -6767,6 +6768,21 @@ static int igc_tsn_enable_mqprio(struct igc_adapter *adapter, igc_save_mqprio_params(adapter, mqprio->qopt.num_tc, mqprio->qopt.offset); + err = netdev_set_num_tc(adapter->netdev, adapter->num_tc); + if (err) + return err; + + for (i = 0; i < adapter->num_tc; i++) { + err = netdev_set_tc_queue(adapter->netdev, i, 1, + adapter->queue_per_tc[i]); + if (err) + return err; + } + + /* In case the card is configured with less than four queues. */ + for (; i < IGC_MAX_TX_QUEUES; i++) + adapter->queue_per_tc[i] = i; + mqprio->qopt.hw = TC_MQPRIO_HW_OFFLOAD_TCS; apply: diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c index b23bf0be007de..cbc49cdcaf6b6 100644 --- a/drivers/net/ethernet/intel/igc/igc_tsn.c +++ b/drivers/net/ethernet/intel/igc/igc_tsn.c @@ -320,9 +320,6 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter) wr32(IGC_QBVCYCLET_S, 0); wr32(IGC_QBVCYCLET, NSEC_PER_SEC); - /* Reset mqprio TC configuration. */ - netdev_reset_tc(adapter->netdev); - /* Restore the default Tx arbitration: Priority 0 has the highest * priority and is assigned to queue 0 and so on and so forth. */ @@ -394,23 +391,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) igc_tsn_set_retx_qbvfullthreshold(adapter); if (adapter->strict_priority_enable) { - int err; - - err = netdev_set_num_tc(adapter->netdev, adapter->num_tc); - if (err) - return err; - - for (i = 0; i < adapter->num_tc; i++) { - err = netdev_set_tc_queue(adapter->netdev, i, 1, - adapter->queue_per_tc[i]); - if (err) - return err; - } - - /* In case the card is configured with less than four queues. */ - for (; i < IGC_MAX_TX_QUEUES; i++) - adapter->queue_per_tc[i] = i; - /* Configure queue priorities according to the user provided * mapping. */ From 876863c3fc755730c942b4c5e1297255d81787e6 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Fri, 21 Mar 2025 14:52:39 +0100 Subject: [PATCH 584/770] igc: Change Tx mode for MQPRIO offloading The current MQPRIO offload implementation uses the legacy TSN Tx mode. In this mode the hardware uses four packet buffers and considers queue priorities. In order to harmonize the TAPRIO implementation with MQPRIO, switch to the regular TSN Tx mode. This mode also uses four packet buffers and considers queue priorities. In addition to the legacy mode, transmission is always coupled to Qbv. 
The driver already has mechanisms to use a dummy schedule of 1 second
with all gates open for ETF. Simply use this for MQPRIO too. This
reduces code and makes it easier to add support for frame preemption
later.

Tested on i225 with a real time application using the high priority
queue, iperf3 using the low priority queue, and a network TAP device.

Acked-by: Faizal Rahim
Signed-off-by: Kurt Kanzenbach
Reviewed-by: Simon Horman
Tested-by: Mor Bar-Gabay
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/igc/igc.h     |  5 ++---
 drivers/net/ethernet/intel/igc/igc_tsn.c | 19 ++-----------------
 2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
index 9bd243f23b469..859a15e4ccbab 100644
--- a/drivers/net/ethernet/intel/igc/igc.h
+++ b/drivers/net/ethernet/intel/igc/igc.h
@@ -394,12 +394,11 @@ extern char igc_driver_name[];
 #define IGC_FLAG_RX_LEGACY		BIT(16)
 #define IGC_FLAG_TSN_QBV_ENABLED	BIT(17)
 #define IGC_FLAG_TSN_QAV_ENABLED	BIT(18)
-#define IGC_FLAG_TSN_LEGACY_ENABLED	BIT(19)
-#define IGC_FLAG_TSN_PREEMPT_ENABLED	BIT(20)
+#define IGC_FLAG_TSN_PREEMPT_ENABLED	BIT(19)

 #define IGC_FLAG_TSN_ANY_ENABLED \
 	(IGC_FLAG_TSN_QBV_ENABLED | IGC_FLAG_TSN_QAV_ENABLED | \
-	 IGC_FLAG_TSN_LEGACY_ENABLED | IGC_FLAG_TSN_PREEMPT_ENABLED)
+	 IGC_FLAG_TSN_PREEMPT_ENABLED)

 #define IGC_FLAG_RSS_FIELD_IPV4_UDP	BIT(6)
 #define IGC_FLAG_RSS_FIELD_IPV6_UDP	BIT(7)
diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c
index cbc49cdcaf6b6..f22cc4d4f459d 100644
--- a/drivers/net/ethernet/intel/igc/igc_tsn.c
+++ b/drivers/net/ethernet/intel/igc/igc_tsn.c
@@ -171,18 +171,14 @@ static unsigned int igc_tsn_new_flags(struct igc_adapter *adapter)
 {
 	unsigned int new_flags = adapter->flags & ~IGC_FLAG_TSN_ANY_ENABLED;

-	if (adapter->taprio_offload_enable)
-		new_flags |= IGC_FLAG_TSN_QBV_ENABLED;
-
-	if (is_any_launchtime(adapter))
+	if (adapter->taprio_offload_enable || is_any_launchtime(adapter) ||
+	    adapter->strict_priority_enable)
 		new_flags |= IGC_FLAG_TSN_QBV_ENABLED;

 	if (is_cbs_enabled(adapter))
 		new_flags |= IGC_FLAG_TSN_QAV_ENABLED;

-	if (adapter->strict_priority_enable)
-		new_flags |= IGC_FLAG_TSN_LEGACY_ENABLED;
-
 	if (adapter->fpe.mmsv.pmac_enabled)
 		new_flags |= IGC_FLAG_TSN_PREEMPT_ENABLED;

@@ -326,7 +322,6 @@ static int igc_tsn_disable_offload(struct igc_adapter *adapter)
 	igc_tsn_tx_arb(adapter, queue_per_tc);

 	adapter->flags &= ~IGC_FLAG_TSN_QBV_ENABLED;
-	adapter->flags &= ~IGC_FLAG_TSN_LEGACY_ENABLED;

 	return 0;
 }
@@ -395,16 +390,6 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter)
 		 * mapping.
 		 */
 		igc_tsn_tx_arb(adapter, adapter->queue_per_tc);
-
-		/* Enable legacy TSN mode which will do strict priority without
-		 * any other TSN features.
-		 */
-		tqavctrl = rd32(IGC_TQAVCTRL);
-		tqavctrl |= IGC_TQAVCTRL_TRANSMIT_MODE_TSN;
-		tqavctrl &= ~IGC_TQAVCTRL_ENHANCED_QAV;
-		wr32(IGC_TQAVCTRL, tqavctrl);
-
-		return 0;
 	}

 	for (i = 0; i < adapter->num_tx_queues; i++) {

From 462cc09ac37ddde5a01477613220f0271fe49291 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Mon, 3 Mar 2025 13:06:27 +0100
Subject: [PATCH 585/770] ixgbe: create E610 specific ethtool_ops structure

E610's implementation of various ethtool ops differs from the ones
corresponding to ixgbe legacy products. Therefore create a separate
E610 ethtool_ops struct which will be filled out in the forthcoming
patches. Add an adequate ops struct based on MAC type.
This step requires slightly changing the probe flow by placing
ixgbe_set_ethtool_ops after hw.mac.type is assigned. So move the whole
netdev assignment block to after hw.mac.type is known. This step doesn't
have any additional impact on the probing sequence.

Suggested-by: Aleksandr Loktionov
Reviewed-by: Aleksandr Loktionov
Signed-off-by: Jedrzej Jagielski
Reviewed-by: Simon Horman
Tested-by: Bharath R
Signed-off-by: Tony Nguyen
---
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c  | 52 ++++++++++++++++++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 10 ++--
 2 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index c86103eccc8aa..83d9ee3941e58 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3650,7 +3650,57 @@ static const struct ethtool_ops ixgbe_ethtool_ops = {
 	.set_link_ksettings = ixgbe_set_link_ksettings,
 };

+static const struct ethtool_ops ixgbe_ethtool_ops_e610 = {
+	.supported_coalesce_params = ETHTOOL_COALESCE_USECS,
+	.get_drvinfo = ixgbe_get_drvinfo,
+	.get_regs_len = ixgbe_get_regs_len,
+	.get_regs = ixgbe_get_regs,
+	.get_wol = ixgbe_get_wol,
+	.set_wol = ixgbe_set_wol,
+	.nway_reset = ixgbe_nway_reset,
+	.get_link = ethtool_op_get_link,
+	.get_eeprom_len = ixgbe_get_eeprom_len,
+	.get_eeprom = ixgbe_get_eeprom,
+	.set_eeprom = ixgbe_set_eeprom,
+	.get_ringparam = ixgbe_get_ringparam,
+	.set_ringparam = ixgbe_set_ringparam,
+	.get_pause_stats = ixgbe_get_pause_stats,
+	.get_pauseparam = ixgbe_get_pauseparam,
+	.set_pauseparam = ixgbe_set_pauseparam,
+	.get_msglevel = ixgbe_get_msglevel,
+	.set_msglevel = ixgbe_set_msglevel,
+	.self_test = ixgbe_diag_test,
+	.get_strings = ixgbe_get_strings,
+	.set_phys_id = ixgbe_set_phys_id,
+	.get_sset_count = ixgbe_get_sset_count,
+	.get_ethtool_stats = ixgbe_get_ethtool_stats,
+	.get_coalesce = ixgbe_get_coalesce,
+	.set_coalesce = ixgbe_set_coalesce,
+	.get_rxnfc = ixgbe_get_rxnfc,
+	.set_rxnfc = ixgbe_set_rxnfc,
+	.get_rxfh_indir_size = ixgbe_rss_indir_size,
+	.get_rxfh_key_size = ixgbe_get_rxfh_key_size,
+	.get_rxfh = ixgbe_get_rxfh,
+	.set_rxfh = ixgbe_set_rxfh,
+	.get_eee = ixgbe_get_eee,
+	.set_eee = ixgbe_set_eee,
+	.get_channels = ixgbe_get_channels,
+	.set_channels = ixgbe_set_channels,
+	.get_priv_flags = ixgbe_get_priv_flags,
+	.set_priv_flags = ixgbe_set_priv_flags,
+	.get_ts_info = ixgbe_get_ts_info,
+	.get_module_info = ixgbe_get_module_info,
+	.get_module_eeprom = ixgbe_get_module_eeprom,
+	.get_link_ksettings = ixgbe_get_link_ksettings,
+	.set_link_ksettings = ixgbe_set_link_ksettings,
+};
+
 void ixgbe_set_ethtool_ops(struct net_device *netdev)
 {
-	netdev->ethtool_ops = &ixgbe_ethtool_ops;
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
+
+	if (adapter->hw.mac.type == ixgbe_mac_e610)
+		netdev->ethtool_ops = &ixgbe_ethtool_ops_e610;
+	else
+		netdev->ethtool_ops = &ixgbe_ethtool_ops;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index cdfafc477ee0d..40daab34e4fed 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -11433,11 +11433,6 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_ioremap;
 	}

-	netdev->netdev_ops = &ixgbe_netdev_ops;
-	ixgbe_set_ethtool_ops(netdev);
-	netdev->watchdog_timeo = 5 * HZ;
-	strscpy(netdev->name, pci_name(pdev), sizeof(netdev->name));
-
 	/* Setup hw api */
 	hw->mac.ops = *ii->mac_ops;
 	hw->mac.type = ii->mac;
@@ -11467,6 +11462,11 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hw->phy.mdio.mdio_read = ixgbe_mdio_read;
 	hw->phy.mdio.mdio_write = ixgbe_mdio_write;

+	netdev->netdev_ops = &ixgbe_netdev_ops;
+	ixgbe_set_ethtool_ops(netdev);
+	netdev->watchdog_timeo = 5 * HZ;
+	strscpy(netdev->name, pci_name(pdev), sizeof(netdev->name));
+
 	/* setup the private structure */
 	err = ixgbe_sw_init(adapter, ii);
 	if (err)

From 451c6bc923e2217d1085e8c9ce88fe1b7a30a1db Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Mon, 3 Mar 2025 13:06:28 +0100
Subject: [PATCH 586/770] ixgbe: add support for ACPI WOL for E610

Currently only APM (Advanced Power Management) is supported by the
ixgbe driver. It works for magic packets only, since for other wake-up
sources the E610 adapter utilizes a different feature.

Add an E610 specific implementation of the ixgbe_set_wol() callback.
When any of broadcast/multicast/unicast wake-up is set, disable APM and
configure ACPI (Advanced Configuration and Power Interface).
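For example (with a hypothetical interface name), the new ACPI path is
taken when non-magic wake-up sources are requested via ethtool, while
plain magic-packet wake-up keeps using APM:

    # unicast/multicast/broadcast wake-up -> APM disabled, ACPI configured
    ethtool -s eth0 wol umb

    # magic packet only -> handled by APM as before
    ethtool -s eth0 wol g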
Reviewed-by: Michal Swiatkowski
Reviewed-by: Aleksandr Loktionov
Signed-off-by: Jedrzej Jagielski
Reviewed-by: Simon Horman
Tested-by: Bharath R
Signed-off-by: Tony Nguyen
---
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 46 ++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 83d9ee3941e58..abc8c279192a9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -2365,6 +2365,50 @@ static int ixgbe_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 	return 0;
 }

+static int ixgbe_set_wol_acpi(struct net_device *netdev,
+			      struct ethtool_wolinfo *wol)
+{
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 grc;
+
+	if (ixgbe_wol_exclusion(adapter, wol))
+		return wol->wolopts ? -EOPNOTSUPP : 0;
+
+	/* disable APM wakeup */
+	grc = IXGBE_READ_REG(hw, IXGBE_GRC_X550EM_a);
+	grc &= ~IXGBE_GRC_APME;
+	IXGBE_WRITE_REG(hw, IXGBE_GRC_X550EM_a, grc);
+
+	/* erase existing filters */
+	IXGBE_WRITE_REG(hw, IXGBE_WUFC, 0);
+	adapter->wol = 0;
+
+	if (wol->wolopts & WAKE_UCAST)
+		adapter->wol |= IXGBE_WUFC_EX;
+	if (wol->wolopts & WAKE_MCAST)
+		adapter->wol |= IXGBE_WUFC_MC;
+	if (wol->wolopts & WAKE_BCAST)
+		adapter->wol |= IXGBE_WUFC_BC;
+
+	IXGBE_WRITE_REG(hw, IXGBE_WUC, IXGBE_WUC_PME_EN);
+	IXGBE_WRITE_REG(hw, IXGBE_WUFC, adapter->wol);
+
+	hw->wol_enabled = adapter->wol;
+	device_set_wakeup_enable(&adapter->pdev->dev, adapter->wol);
+
+	return 0;
+}
+
+static int ixgbe_set_wol_e610(struct net_device *netdev,
+			      struct ethtool_wolinfo *wol)
+{
+	if (wol->wolopts & (WAKE_UCAST | WAKE_MCAST | WAKE_BCAST))
+		return ixgbe_set_wol_acpi(netdev, wol);
+	else
+		return ixgbe_set_wol(netdev, wol);
+}
+
 static int ixgbe_nway_reset(struct net_device *netdev)
 {
 	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
@@ -3656,7 +3700,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = {
 	.get_regs_len = ixgbe_get_regs_len,
 	.get_regs = ixgbe_get_regs,
 	.get_wol = ixgbe_get_wol,
-	.set_wol = ixgbe_set_wol,
+	.set_wol = ixgbe_set_wol_e610,
 	.nway_reset = ixgbe_nway_reset,
 	.get_link = ethtool_op_get_link,
 	.get_eeprom_len = ixgbe_get_eeprom_len,

From 7f58648dbc53826d9480e45468717466d5b37343 Mon Sep 17 00:00:00 2001
From: Jedrzej Jagielski
Date: Mon, 3 Mar 2025 13:06:29 +0100
Subject: [PATCH 587/770] ixgbe: apply different rules for setting FC on E610

The E610 device doesn't support disabling FC autonegotiation. Create a
dedicated E610 .set_pauseparam() implementation and assign it to
ixgbe_ethtool_ops_e610.

Reviewed-by: Aleksandr Loktionov
Signed-off-by: Jedrzej Jagielski
Reviewed-by: Simon Horman
Tested-by: Bharath R
Signed-off-by: Tony Nguyen
---
 .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 57 ++++++++++++++++---
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index abc8c279192a9..435f3fc3cec34 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -564,6 +564,22 @@ static void ixgbe_get_pauseparam(struct net_device *netdev,
 	}
 }

+static void ixgbe_set_pauseparam_finalize(struct net_device *netdev,
+					  struct ixgbe_fc_info *fc)
+{
+	struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev);
+	struct ixgbe_hw *hw = &adapter->hw;
+
+	/* If the thing changed then we'll update and use new autoneg.
*/ + if (memcmp(fc, &hw->fc, sizeof(*fc))) { + hw->fc = *fc; + if (netif_running(netdev)) + ixgbe_reinit_locked(adapter); + else + ixgbe_reset(adapter); + } +} + static int ixgbe_set_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *pause) { @@ -592,15 +608,40 @@ static int ixgbe_set_pauseparam(struct net_device *netdev, else fc.requested_mode = ixgbe_fc_none; - /* if the thing changed then we'll update and use new autoneg */ - if (memcmp(&fc, &hw->fc, sizeof(struct ixgbe_fc_info))) { - hw->fc = fc; - if (netif_running(netdev)) - ixgbe_reinit_locked(adapter); - else - ixgbe_reset(adapter); + ixgbe_set_pauseparam_finalize(netdev, &fc); + + return 0; +} + +static int ixgbe_set_pauseparam_e610(struct net_device *netdev, + struct ethtool_pauseparam *pause) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_fc_info fc = hw->fc; + + if (!ixgbe_device_supports_autoneg_fc(hw)) + return -EOPNOTSUPP; + + if (pause->autoneg == AUTONEG_DISABLE) { + netdev_info(netdev, + "Cannot disable autonegotiation on this device.\n"); + return -EOPNOTSUPP; } + fc.disable_fc_autoneg = false; + + if (pause->rx_pause && pause->tx_pause) + fc.requested_mode = ixgbe_fc_full; + else if (pause->rx_pause) + fc.requested_mode = ixgbe_fc_rx_pause; + else if (pause->tx_pause) + fc.requested_mode = ixgbe_fc_tx_pause; + else + fc.requested_mode = ixgbe_fc_none; + + ixgbe_set_pauseparam_finalize(netdev, &fc); + return 0; } @@ -3710,7 +3751,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { .set_ringparam = ixgbe_set_ringparam, .get_pause_stats = ixgbe_get_pause_stats, .get_pauseparam = ixgbe_get_pauseparam, - .set_pauseparam = ixgbe_set_pauseparam, + .set_pauseparam = ixgbe_set_pauseparam_e610, .get_msglevel = ixgbe_get_msglevel, .set_msglevel = ixgbe_set_msglevel, .self_test = ixgbe_diag_test, From 4bf2d11902efe9a3699cb0339f6a34cdef4e9aa6 Mon Sep 17 00:00:00 2001 From: Jedrzej Jagielski Date: Mon, 3 Mar 2025 13:06:30 +0100 Subject: [PATCH 588/770] ixgbe: add E610 .set_phys_id() callback implementation Legacy implementation of .set_phys_id() ethtool callback is not applicable for E610 device. Add new implementation which uses 0x06E9 command by calling ixgbe_aci_set_port_id_led(). Reviewed-by: Aleksandr Loktionov Reviewed-by: Michal Swiatkowski Signed-off-by: Jedrzej Jagielski Reviewed-by: Simon Horman Tested-by: Bharath R Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c | 29 +++++++++++++++++++ drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h | 1 + .../net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 22 +++++++++++++- .../ethernet/intel/ixgbe/ixgbe_type_e610.h | 14 +++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c index 9ada35f7d8f76..71ea25de1bac7 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.c @@ -1484,6 +1484,35 @@ static int ixgbe_start_hw_e610(struct ixgbe_hw *hw) return 0; } +/** + * ixgbe_aci_set_port_id_led - set LED value for the given port + * @hw: pointer to the HW struct + * @orig_mode: set LED original mode + * + * Set LED value for the given port (0x06E9) + * + * Return: the exit code of the operation. 
+ */ +int ixgbe_aci_set_port_id_led(struct ixgbe_hw *hw, bool orig_mode) +{ + struct ixgbe_aci_cmd_set_port_id_led *cmd; + struct ixgbe_aci_desc desc; + + cmd = &desc.params.set_port_id_led; + + ixgbe_fill_dflt_direct_cmd_desc(&desc, ixgbe_aci_opc_set_port_id_led); + + cmd->lport_num = (u8)hw->bus.func; + cmd->lport_num_valid = IXGBE_ACI_PORT_ID_PORT_NUM_VALID; + + if (orig_mode) + cmd->ident_mode = IXGBE_ACI_PORT_IDENT_LED_ORIG; + else + cmd->ident_mode = IXGBE_ACI_PORT_IDENT_LED_BLINK; + + return ixgbe_aci_send_cmd(hw, &desc, NULL, 0); +} + /** * ixgbe_get_media_type_e610 - Gets media type * @hw: pointer to the HW struct diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h index 30bc1f1b2549b..bb31d65bd1c8f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_e610.h @@ -36,6 +36,7 @@ int ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, struct ixgbe_link_status *link); int ixgbe_aci_set_event_mask(struct ixgbe_hw *hw, u8 port_num, u16 mask); int ixgbe_configure_lse(struct ixgbe_hw *hw, bool activate, u16 mask); +int ixgbe_aci_set_port_id_led(struct ixgbe_hw *hw, bool orig_mode); enum ixgbe_media_type ixgbe_get_media_type_e610(struct ixgbe_hw *hw); int ixgbe_setup_link_e610(struct ixgbe_hw *hw, ixgbe_link_speed speed, bool autoneg_wait); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 435f3fc3cec34..d8a919ab7027a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2491,6 +2491,26 @@ static int ixgbe_set_phys_id(struct net_device *netdev, return 0; } +static int ixgbe_set_phys_id_e610(struct net_device *netdev, + enum ethtool_phys_id_state state) +{ + struct ixgbe_adapter *adapter = ixgbe_from_netdev(netdev); + bool led_active; + + switch (state) { + case ETHTOOL_ID_ACTIVE: + led_active = true; + break; + case ETHTOOL_ID_INACTIVE: + led_active = false; + break; + default: + return -EOPNOTSUPP; + } + + return ixgbe_aci_set_port_id_led(&adapter->hw, !led_active); +} + static int ixgbe_get_coalesce(struct net_device *netdev, struct ethtool_coalesce *ec, struct kernel_ethtool_coalesce *kernel_coal, @@ -3756,7 +3776,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops_e610 = { .set_msglevel = ixgbe_set_msglevel, .self_test = ixgbe_diag_test, .get_strings = ixgbe_get_strings, - .set_phys_id = ixgbe_set_phys_id, + .set_phys_id = ixgbe_set_phys_id_e610, .get_sset_count = ixgbe_get_sset_count, .get_ethtool_stats = ixgbe_get_ethtool_stats, .get_coalesce = ixgbe_get_coalesce, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h index bea94e5ccb730..09df67f03cf47 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_type_e610.h @@ -223,6 +223,7 @@ enum ixgbe_aci_opc { ixgbe_aci_opc_write_mdio = 0x06E5, ixgbe_aci_opc_set_gpio_by_func = 0x06E6, ixgbe_aci_opc_get_gpio_by_func = 0x06E7, + ixgbe_aci_opc_set_port_id_led = 0x06E9, ixgbe_aci_opc_set_gpio = 0x06EC, ixgbe_aci_opc_get_gpio = 0x06ED, ixgbe_aci_opc_sff_eeprom = 0x06EE, @@ -808,6 +809,18 @@ struct ixgbe_aci_cmd_get_link_topo_pin { u8 rsvd[7]; }; +/* Set Port Identification LED (direct, 0x06E9) */ +struct ixgbe_aci_cmd_set_port_id_led { + u8 lport_num; + u8 lport_num_valid; + u8 ident_mode; + u8 rsvd[13]; +}; + +#define IXGBE_ACI_PORT_ID_PORT_NUM_VALID BIT(0) +#define 
IXGBE_ACI_PORT_IDENT_LED_ORIG	0
+#define IXGBE_ACI_PORT_IDENT_LED_BLINK	BIT(0)
+
 /* Read/Write SFF EEPROM command (indirect 0x06EE) */
 struct ixgbe_aci_cmd_sff_eeprom {
 	u8 lport_num;
@@ -985,6 +998,7 @@ struct ixgbe_aci_desc {
 		struct ixgbe_aci_cmd_restart_an restart_an;
 		struct ixgbe_aci_cmd_get_link_status get_link_status;
 		struct ixgbe_aci_cmd_set_event_mask set_event_mask;
+		struct ixgbe_aci_cmd_set_port_id_led set_port_id_led;
 		struct ixgbe_aci_cmd_get_link_topo get_link_topo;
 		struct ixgbe_aci_cmd_get_link_topo_pin get_link_topo_pin;
 		struct ixgbe_aci_cmd_sff_eeprom read_write_sff_param;

From fe259a1bb26ec78842c975d992331705b0c2c2e8 Mon Sep 17 00:00:00 2001
From: Slawomir Mrozowicz
Date: Fri, 11 Apr 2025 15:06:26 +0200
Subject: [PATCH 589/770] ixgbe: devlink: add devlink region support for E610

Provide support for the following devlink cmds:
-DEVLINK_CMD_REGION_GET
-DEVLINK_CMD_REGION_NEW
-DEVLINK_CMD_REGION_DEL
-DEVLINK_CMD_REGION_READ

The ixgbe devlink region implementation, similarly to the ice one, lets
the user create snapshots of the contents of the Non-Volatile Memory,
the contents of the Shadow RAM, and the capabilities of the device.

For both the NVM and SRAM regions, provide a .read() handler to let the
user read their contents without the need to create full snapshots.
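As an illustrative sketch of such a direct read (the PCI address is made
up, and an iproute2 release that supports reading a region without a
snapshot is assumed), the .read() handlers allow e.g.:

    $ devlink region read pci/0000:01:00.0/shadow-ram address 0 length 16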
Reviewed-by: Przemek Kitszel
Signed-off-by: Slawomir Mrozowicz
Co-developed-by: Jedrzej Jagielski
Signed-off-by: Jedrzej Jagielski
Tested-by: Bharath R
Signed-off-by: Tony Nguyen
---
 Documentation/networking/devlink/ixgbe.rst    |  49 +++
 drivers/net/ethernet/intel/ixgbe/Makefile     |   3 +-
 .../ethernet/intel/ixgbe/devlink/devlink.h    |   2 +
 .../net/ethernet/intel/ixgbe/devlink/region.c | 290 ++++++++++++++++++
 drivers/net/ethernet/intel/ixgbe/ixgbe.h      |   3 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   3 +
 6 files changed, 349 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/intel/ixgbe/devlink/region.c

diff --git a/Documentation/networking/devlink/ixgbe.rst b/Documentation/networking/devlink/ixgbe.rst
index 3fce291348fac..c27d1436c70ef 100644
--- a/Documentation/networking/devlink/ixgbe.rst
+++ b/Documentation/networking/devlink/ixgbe.rst
@@ -120,3 +120,52 @@ EMP firmware image.

 The driver does not currently support reloading the driver via
 ``DEVLINK_RELOAD_ACTION_DRIVER_REINIT``.
+
+Regions
+=======
+
+The ``ixgbe`` driver implements the following regions for accessing internal
+device data.
+
+.. list-table:: regions implemented
+    :widths: 15 85
+
+    * - Name
+      - Description
+    * - ``nvm-flash``
+      - The contents of the entire flash chip, sometimes referred to as
+        the device's Non Volatile Memory.
+    * - ``shadow-ram``
+      - The contents of the Shadow RAM, which is loaded from the beginning
+        of the flash. Although the contents are primarily from the flash,
+        this area also contains data generated during device boot which is
+        not stored in flash.
+    * - ``device-caps``
+      - The contents of the device firmware's capabilities buffer. Useful to
+        determine the current state and configuration of the device.
+
+Both the ``nvm-flash`` and ``shadow-ram`` regions can be accessed without a
+snapshot. The ``device-caps`` region requires a snapshot as the contents are
+sent by firmware and can't be split into separate reads.
+
+Users can request an immediate capture of a snapshot for all three regions
+via the ``DEVLINK_CMD_REGION_NEW`` command.
+
+.. code:: shell
+
+    $ devlink region show
+    pci/0000:01:00.0/nvm-flash: size 10485760 snapshot [] max 1
+    pci/0000:01:00.0/device-caps: size 4096 snapshot [] max 10
+
+    $ devlink region new pci/0000:01:00.0/nvm-flash snapshot 1
+
+    $ devlink region dump pci/0000:01:00.0/nvm-flash snapshot 1
+    0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30
+    0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8
+    0000000000000020 0016 0bb8 0016 1720 0000 0000 c00f 3ffc
+    0000000000000030 bada cce5 bada cce5 bada cce5 bada cce5
+
+    $ devlink region read pci/0000:01:00.0/nvm-flash snapshot 1 address 0 length 16
+    0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30
+
+    $ devlink region delete pci/0000:01:00.0/device-caps snapshot 1
diff --git a/drivers/net/ethernet/intel/ixgbe/Makefile b/drivers/net/ethernet/intel/ixgbe/Makefile
index ce447540d146e..2e7738f41c58a 100644
--- a/drivers/net/ethernet/intel/ixgbe/Makefile
+++ b/drivers/net/ethernet/intel/ixgbe/Makefile
@@ -10,7 +10,8 @@ obj-$(CONFIG_IXGBE) += ixgbe.o
 ixgbe-y := ixgbe_main.o ixgbe_common.o ixgbe_ethtool.o \
            ixgbe_82599.o ixgbe_82598.o ixgbe_phy.o ixgbe_sriov.o \
            ixgbe_mbx.o ixgbe_x540.o ixgbe_x550.o ixgbe_lib.o ixgbe_ptp.o \
-           ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o
+           ixgbe_xsk.o ixgbe_e610.o devlink/devlink.o ixgbe_fw_update.o \
+           devlink/region.o

 ixgbe-$(CONFIG_IXGBE_DCB) += ixgbe_dcb.o ixgbe_dcb_82598.o \
                              ixgbe_dcb_82599.o ixgbe_dcb_nl.o
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h
index 0b27653a34072..381558058048f 100644
--- a/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/devlink.h
@@ -6,5 +6,7 @@
 struct ixgbe_adapter *ixgbe_allocate_devlink(struct device *dev);
 int ixgbe_devlink_register_port(struct ixgbe_adapter *adapter);
+void ixgbe_devlink_init_regions(struct ixgbe_adapter *adapter);
+void ixgbe_devlink_destroy_regions(struct ixgbe_adapter *adapter);

 #endif /* _IXGBE_DEVLINK_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/devlink/region.c b/drivers/net/ethernet/intel/ixgbe/devlink/region.c
new file mode 100644
index 0000000000000..76f6571c3c342
--- /dev/null
+++ b/drivers/net/ethernet/intel/ixgbe/devlink/region.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025, Intel Corporation. */
+
+#include "ixgbe.h"
+#include "devlink.h"
+
+#define IXGBE_DEVLINK_READ_BLK_SIZE (1024 * 1024)
+
+static const struct devlink_region_ops ixgbe_nvm_region_ops;
+static const struct devlink_region_ops ixgbe_sram_region_ops;
+
+static int ixgbe_devlink_parse_region(struct ixgbe_hw *hw,
+				      const struct devlink_region_ops *ops,
+				      bool *read_shadow_ram, u32 *nvm_size)
+{
+	if (ops == &ixgbe_nvm_region_ops) {
+		*read_shadow_ram = false;
+		*nvm_size = hw->flash.flash_size;
+	} else if (ops == &ixgbe_sram_region_ops) {
+		*read_shadow_ram = true;
+		*nvm_size = hw->flash.sr_words * 2u;
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/**
+ * ixgbe_devlink_nvm_snapshot - Capture a snapshot of the NVM content
+ * @devlink: the devlink instance
+ * @ops: the devlink region being snapshotted
+ * @extack: extended ACK response structure
+ * @data: on exit points to snapshot data buffer
+ *
+ * This function is called in response to the DEVLINK_CMD_REGION_NEW cmd.
+ *
+ * Capture a snapshot of the whole requested NVM region.
+ *
+ * No need to worry about freeing @data, the devlink core takes care of it.
+ *
+ * Return: 0 on success, -EOPNOTSUPP for unsupported regions, -EBUSY when
+ * cannot lock NVM, -ENOMEM when cannot alloc mem and -EIO when error
+ * occurs during reading.
+ */
+static int ixgbe_devlink_nvm_snapshot(struct devlink *devlink,
+				      const struct devlink_region_ops *ops,
+				      struct netlink_ext_ack *extack, u8 **data)
+{
+	struct ixgbe_adapter *adapter = devlink_priv(devlink);
+	struct ixgbe_hw *hw = &adapter->hw;
+	bool read_shadow_ram;
+	u8 *nvm_data, *buf;
+	u32 nvm_size, left;
+	u8 num_blks;
+	int err;
+
+	err = ixgbe_devlink_parse_region(hw, ops, &read_shadow_ram, &nvm_size);
+	if (err)
+		return err;
+
+	nvm_data = kvzalloc(nvm_size, GFP_KERNEL);
+	if (!nvm_data)
+		return -ENOMEM;
+
+	num_blks = DIV_ROUND_UP(nvm_size, IXGBE_DEVLINK_READ_BLK_SIZE);
+	buf = nvm_data;
+	left = nvm_size;
+
+	for (int i = 0; i < num_blks; i++) {
+		u32 read_sz = min_t(u32, IXGBE_DEVLINK_READ_BLK_SIZE, left);
+
+		/* Need to acquire the NVM lock during each loop run because
+		 * the total period of reading the whole NVM is longer than the
+		 * maximum period the lock can be held, defined by
+		 * IXGBE_NVM_TIMEOUT.
+		 */
+		err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Failed to acquire NVM semaphore");
+			kvfree(nvm_data);
+			return -EBUSY;
+		}
+
+		err = ixgbe_read_flat_nvm(hw, i * IXGBE_DEVLINK_READ_BLK_SIZE,
+					  &read_sz, buf, read_shadow_ram);
+		if (err) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Failed to read RAM content");
+			ixgbe_release_nvm(hw);
+			kvfree(nvm_data);
+			return -EIO;
+		}
+
+		ixgbe_release_nvm(hw);
+
+		buf += read_sz;
+		left -= read_sz;
+	}
+
+	*data = nvm_data;
+	return 0;
+}
+
+/**
+ * ixgbe_devlink_devcaps_snapshot - Capture a snapshot of device capabilities
+ * @devlink: the devlink instance
+ * @ops: the devlink region being snapshotted
+ * @extack: extended ACK response structure
+ * @data: on exit points to snapshot data buffer
+ *
+ * This function is called in response to the DEVLINK_CMD_REGION_NEW for
+ * the device-caps devlink region.
+ *
+ * Capture a snapshot of the device capabilities reported by firmware.
+ *
+ * No need to worry about freeing @data, the devlink core takes care of it.
+ *
+ * Return: 0 on success, -ENOMEM when cannot alloc mem, or return code of
+ * the reading operation.
+ */
+static int ixgbe_devlink_devcaps_snapshot(struct devlink *devlink,
+					  const struct devlink_region_ops *ops,
+					  struct netlink_ext_ack *extack,
+					  u8 **data)
+{
+	struct ixgbe_adapter *adapter = devlink_priv(devlink);
+	struct ixgbe_aci_cmd_list_caps_elem *caps;
+	struct ixgbe_hw *hw = &adapter->hw;
+	int err;
+
+	caps = kvzalloc(IXGBE_ACI_MAX_BUFFER_SIZE, GFP_KERNEL);
+	if (!caps)
+		return -ENOMEM;
+
+	err = ixgbe_aci_list_caps(hw, caps, IXGBE_ACI_MAX_BUFFER_SIZE, NULL,
+				  ixgbe_aci_opc_list_dev_caps);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Failed to read device capabilities");
+		kvfree(caps);
+		return err;
+	}
+
+	*data = (u8 *)caps;
+	return 0;
+}
+
+/**
+ * ixgbe_devlink_nvm_read - Read a portion of NVM flash content
+ * @devlink: the devlink instance
+ * @ops: the devlink region to snapshot
+ * @extack: extended ACK response structure
+ * @offset: the offset to start at
+ * @size: the amount to read
+ * @data: the data buffer to read into
+ *
+ * This function is called in response to DEVLINK_CMD_REGION_READ to directly
+ * read a section of the NVM contents.
+ *
+ * Read from either the nvm-flash region or the shadow-ram region.
+ * + * Return: 0 on success, -EOPNOTSUPP for unsupported regions, -EBUSY when + * cannot lock NVM, -ERANGE when buffer limit exceeded and -EIO when error + * occurs during reading. + */ +static int ixgbe_devlink_nvm_read(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data) +{ + struct ixgbe_adapter *adapter = devlink_priv(devlink); + struct ixgbe_hw *hw = &adapter->hw; + bool read_shadow_ram; + u32 nvm_size; + int err; + + err = ixgbe_devlink_parse_region(hw, ops, &read_shadow_ram, &nvm_size); + if (err) + return err; + + if (offset + size > nvm_size) { + NL_SET_ERR_MSG_MOD(extack, "Cannot read beyond the region size"); + return -ERANGE; + } + + err = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to acquire NVM semaphore"); + return -EBUSY; + } + + err = ixgbe_read_flat_nvm(hw, (u32)offset, &size, data, read_shadow_ram); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to read NVM contents"); + ixgbe_release_nvm(hw); + return -EIO; + } + + ixgbe_release_nvm(hw); + return 0; +} + +static const struct devlink_region_ops ixgbe_nvm_region_ops = { + .name = "nvm-flash", + .destructor = kvfree, + .snapshot = ixgbe_devlink_nvm_snapshot, + .read = ixgbe_devlink_nvm_read, +}; + +static const struct devlink_region_ops ixgbe_sram_region_ops = { + .name = "shadow-ram", + .destructor = kvfree, + .snapshot = ixgbe_devlink_nvm_snapshot, + .read = ixgbe_devlink_nvm_read, +}; + +static const struct devlink_region_ops ixgbe_devcaps_region_ops = { + .name = "device-caps", + .destructor = kvfree, + .snapshot = ixgbe_devlink_devcaps_snapshot, +}; + +/** + * ixgbe_devlink_init_regions - Initialize devlink regions + * @adapter: adapter instance + * + * Create devlink regions used to enable access to dump the contents of the + * flash memory of the device. + */ +void ixgbe_devlink_init_regions(struct ixgbe_adapter *adapter) +{ + struct devlink *devlink = adapter->devlink; + struct device *dev = &adapter->pdev->dev; + u64 nvm_size, sram_size; + + if (adapter->hw.mac.type != ixgbe_mac_e610) + return; + + nvm_size = adapter->hw.flash.flash_size; + adapter->nvm_region = devl_region_create(devlink, &ixgbe_nvm_region_ops, + 1, nvm_size); + if (IS_ERR(adapter->nvm_region)) { + dev_err(dev, + "Failed to create NVM devlink region, err %ld\n", + PTR_ERR(adapter->nvm_region)); + adapter->nvm_region = NULL; + } + + sram_size = adapter->hw.flash.sr_words * 2u; + adapter->sram_region = devl_region_create(devlink, &ixgbe_sram_region_ops, + 1, sram_size); + if (IS_ERR(adapter->sram_region)) { + dev_err(dev, + "Failed to create shadow-ram devlink region, err %ld\n", + PTR_ERR(adapter->sram_region)); + adapter->sram_region = NULL; + } + + adapter->devcaps_region = devl_region_create(devlink, + &ixgbe_devcaps_region_ops, + 10, IXGBE_ACI_MAX_BUFFER_SIZE); + if (IS_ERR(adapter->devcaps_region)) { + dev_err(dev, + "Failed to create device-caps devlink region, err %ld\n", + PTR_ERR(adapter->devcaps_region)); + adapter->devcaps_region = NULL; + } +} + +/** + * ixgbe_devlink_destroy_regions - Destroy devlink regions + * @adapter: adapter instance + * + * Remove previously created regions for this adapter instance. 
+ */ +void ixgbe_devlink_destroy_regions(struct ixgbe_adapter *adapter) +{ + if (adapter->hw.mac.type != ixgbe_mac_e610) + return; + + if (adapter->nvm_region) + devl_region_destroy(adapter->nvm_region); + + if (adapter->sram_region) + devl_region_destroy(adapter->sram_region); + + if (adapter->devcaps_region) + devl_region_destroy(adapter->devcaps_region); +} diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index 23c2e2c2649ce..47311b134a7a9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -616,6 +616,9 @@ struct ixgbe_adapter { struct mii_bus *mii_bus; struct devlink *devlink; struct devlink_port devlink_port; + struct devlink_region *nvm_region; + struct devlink_region *sram_region; + struct devlink_region *devcaps_region; unsigned long state; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 40daab34e4fed..03d31e5b131dc 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -11317,6 +11317,7 @@ static int ixgbe_recovery_probe(struct ixgbe_adapter *adapter) ixgbe_devlink_register_port(adapter); SET_NETDEV_DEVLINK_PORT(adapter->netdev, &adapter->devlink_port); + ixgbe_devlink_init_regions(adapter); devl_register(adapter->devlink); devl_unlock(adapter->devlink); @@ -11824,6 +11825,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_netdev; + ixgbe_devlink_init_regions(adapter); devl_register(adapter->devlink); devl_unlock(adapter->devlink); return 0; @@ -11882,6 +11884,7 @@ static void ixgbe_remove(struct pci_dev *pdev) netdev = adapter->netdev; devl_lock(adapter->devlink); devl_unregister(adapter->devlink); + ixgbe_devlink_destroy_regions(adapter); ixgbe_dbg_adapter_exit(adapter); set_bit(__IXGBE_REMOVING, &adapter->state); From 508d374b8dc018a72f2d77dbd5658e0d2e576679 Mon Sep 17 00:00:00 2001 From: Mateusz Polchlopek Date: Sat, 1 Mar 2025 20:02:44 +0100 Subject: [PATCH 590/770] idpf: assign extracted ptype to struct libeth_rqe_info field Assign the ptype extracted from qword to the ptype field of struct libeth_rqe_info. Remove the now excess ptype param of idpf_rx_singleq_extract_fields(), idpf_rx_singleq_extract_base_fields() and idpf_rx_singleq_extract_flex_fields(). Suggested-by: Alexander Lobakin Reviewed-by: Przemek Kitszel Reviewed-by: Alexander Lobakin Signed-off-by: Mateusz Polchlopek Tested-by: Samuel Salin Signed-off-by: Tony Nguyen --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index eae1b6f474e62..2e356dd108127 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -891,7 +891,6 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, * idpf_rx_singleq_extract_base_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. 
@@ -901,21 +900,20 @@ bool idpf_rx_singleq_buf_hw_alloc_all(struct idpf_rx_queue *rx_q, */ static void idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { u64 qword; qword = le64_to_cpu(rx_desc->base_wb.qword1.status_error_ptype_len); fields->len = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_LEN_PBUF_M, qword); - *ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); + fields->ptype = FIELD_GET(VIRTCHNL2_RX_BASE_DESC_QW1_PTYPE_M, qword); } /** * idpf_rx_singleq_extract_flex_fields - Extract fields from the Rx descriptor * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * * Decode the Rx descriptor and extract relevant information including the * size and Rx packet type. @@ -925,12 +923,12 @@ idpf_rx_singleq_extract_base_fields(const union virtchnl2_rx_desc *rx_desc, */ static void idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { fields->len = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PKT_LEN_M, le16_to_cpu(rx_desc->flex_nic_wb.pkt_len)); - *ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, - le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); + fields->ptype = FIELD_GET(VIRTCHNL2_RX_FLEX_DESC_PTYPE_M, + le16_to_cpu(rx_desc->flex_nic_wb.ptype_flex_flags0)); } /** @@ -938,18 +936,17 @@ idpf_rx_singleq_extract_flex_fields(const union virtchnl2_rx_desc *rx_desc, * @rx_q: Rx descriptor queue * @rx_desc: the descriptor to process * @fields: storage for extracted values - * @ptype: pointer that will store packet type * */ static void idpf_rx_singleq_extract_fields(const struct idpf_rx_queue *rx_q, const union virtchnl2_rx_desc *rx_desc, - struct libeth_rqe_info *fields, u32 *ptype) + struct libeth_rqe_info *fields) { if (rx_q->rxdids == VIRTCHNL2_RXDID_1_32B_BASE_M) - idpf_rx_singleq_extract_base_fields(rx_desc, fields, ptype); + idpf_rx_singleq_extract_base_fields(rx_desc, fields); else - idpf_rx_singleq_extract_flex_fields(rx_desc, fields, ptype); + idpf_rx_singleq_extract_flex_fields(rx_desc, fields); } /** @@ -972,7 +969,6 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) struct libeth_rqe_info fields = { }; union virtchnl2_rx_desc *rx_desc; struct idpf_rx_buf *rx_buf; - u32 ptype; /* get the Rx desc from Rx queue based on 'next_to_clean' */ rx_desc = &rx_q->rx[ntc]; @@ -993,7 +989,7 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) */ dma_rmb(); - idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields, &ptype); + idpf_rx_singleq_extract_fields(rx_q, rx_desc, &fields); rx_buf = &rx_q->rx_buf[ntc]; if (!libeth_rx_sync_for_cpu(rx_buf, fields.len)) @@ -1037,7 +1033,8 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) total_rx_bytes += skb->len; /* protocol */ - idpf_rx_singleq_process_skb_fields(rx_q, skb, rx_desc, ptype); + idpf_rx_singleq_process_skb_fields(rx_q, skb, rx_desc, + fields.ptype); /* send completed skb up the stack */ napi_gro_receive(rx_q->pp->p.napi, skb); From 8e36fcaa494d7e3fefabb187c4931f107e010cf0 Mon Sep 17 00:00:00 2001 From: Ruben Wauters Date: Mon, 28 Apr 2025 22:51:09 +0100 Subject: [PATCH 591/770] tools: ynl: fix typo in info string replaces formmated with formatted also corrects grammar by replacing a with an, and capitalises RST Signed-off-by: Ruben Wauters Reviewed-by: Donald Hunter Link: 
https://patch.msgid.link/20250428215541.6029-1-rubenru09@aol.com
Signed-off-by: Jakub Kicinski
---
 tools/net/ynl/pyynl/ynl_gen_rst.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/net/ynl/pyynl/ynl_gen_rst.py b/tools/net/ynl/pyynl/ynl_gen_rst.py
index 6c56d0d726b41..0cb6348e28d3b 100755
--- a/tools/net/ynl/pyynl/ynl_gen_rst.py
+++ b/tools/net/ynl/pyynl/ynl_gen_rst.py
@@ -392,7 +392,7 @@ def parse_arguments() -> argparse.Namespace:


 def parse_yaml_file(filename: str) -> str:
-    """Transform the YAML specified by filename into a rst-formmated string"""
+    """Transform the YAML specified by filename into an RST-formatted string"""
     with open(filename, "r", encoding="utf-8") as spec_file:
         yaml_data = yaml.safe_load(spec_file)
         content = parse_yaml(yaml_data)

From c058c5f8b6e424261c4974b39f2596e9d521f13f Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski
Date: Wed, 9 Apr 2025 08:29:45 +0200
Subject: [PATCH 592/770] idpf: remove unreachable code from setting mailbox

Remove code that isn't reached. There is no need to check for
adapter->req_vec_chunks, because if it isn't set idpf_set_mb_vec_id()
won't be called.

There is only one path in which idpf_set_mb_vec_id() is called:
idpf_intr_req() -> idpf_send_alloc_vectors_msg() ->
adapter->req_vec_chunks is allocated here, otherwise an error is
returned and idpf_intr_req() exits with an error.

idpf_set_mb_vec_id() becomes a one-liner and is called only once.
Remove it and set the mailbox vector index directly.

Reviewed-by: Aleksandr Loktionov
Reviewed-by: Michal Kubiak
Reviewed-by: Pavan Kumar Linga
Signed-off-by: Michal Swiatkowski
Reviewed-by: Simon Horman
Tested-by: Samuel Salin
Signed-off-by: Tony Nguyen
---
 drivers/net/ethernet/intel/idpf/idpf_lib.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index aa755dedb41d9..0e428dc476ed7 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -143,22 +143,6 @@ static int idpf_mb_intr_req_irq(struct idpf_adapter *adapter)
 	return 0;
 }

-/**
- * idpf_set_mb_vec_id - Set vector index for mailbox
- * @adapter: adapter structure to access the vector chunks
- *
- * The first vector id in the requested vector chunks from the CP is for
- * the mailbox
- */
-static void idpf_set_mb_vec_id(struct idpf_adapter *adapter)
-{
-	if (adapter->req_vec_chunks)
-		adapter->mb_vector.v_idx =
-			le16_to_cpu(adapter->caps.mailbox_vector_id);
-	else
-		adapter->mb_vector.v_idx = 0;
-}
-
 /**
  * idpf_mb_intr_init - Initialize the mailbox interrupt
  * @adapter: adapter structure to store the mailbox vector
@@ -349,7 +333,7 @@ int idpf_intr_req(struct idpf_adapter *adapter)
 		goto free_irq;
 	}

-	idpf_set_mb_vec_id(adapter);
+	adapter->mb_vector.v_idx = le16_to_cpu(adapter->caps.mailbox_vector_id);

 	vecids = kcalloc(total_vecs, sizeof(u16), GFP_KERNEL);
 	if (!vecids) {

From 144530c15ec7fa95b29812d86f4be527338ea204 Mon Sep 17 00:00:00 2001
From: Shannon Nelson
Date: Fri, 25 Apr 2025 13:46:16 -0700
Subject: [PATCH 593/770] pds_core: remove extra name description

Fix the kernel-doc complaint

include/linux/pds/pds_adminq.h:481: warning: Excess struct member 'name'
description in 'pds_core_lif_getattr_comp'

Reviewed-by: Simon Horman
Signed-off-by: Shannon Nelson
Signed-off-by: David S.
Miller --- include/linux/pds/pds_adminq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index ddd111f04ca02..339156113fa5d 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -463,7 +463,6 @@ struct pds_core_lif_getattr_cmd { * @rsvd: Word boundary padding * @comp_index: Index in the descriptor ring for which this is the completion * @state: LIF state (enum pds_core_lif_state) - * @name: LIF name string, 0 terminated * @features: Features (enum pds_core_hw_features) * @rsvd2: Word boundary padding * @color: Color bit From 7c4f4c4fa9b6fbb7e483bebd02f7b9cbc20ca5cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:17 -0700 Subject: [PATCH 594/770] pds_core: smaller adminq poll starting interval Shorten the adminq poll starting interval in order to notice any transaction errors more quickly. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/pds_core/adminq.c | 4 ++-- include/linux/pds/pds_adminq.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index 506f682d15c10..097bb092bdb8c 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -225,7 +225,7 @@ int pdsc_adminq_post(struct pdsc *pdsc, union pds_core_adminq_comp *comp, bool fast_poll) { - unsigned long poll_interval = 1; + unsigned long poll_interval = 200; unsigned long poll_jiffies; unsigned long time_limit; unsigned long time_start; @@ -252,7 +252,7 @@ int pdsc_adminq_post(struct pdsc *pdsc, time_limit = time_start + HZ * pdsc->devcmd_timeout; do { /* Timeslice the actual wait to catch IO errors etc early */ - poll_jiffies = msecs_to_jiffies(poll_interval); + poll_jiffies = usecs_to_jiffies(poll_interval); remaining = wait_for_completion_timeout(wc, poll_jiffies); if (remaining) break; diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index 339156113fa5d..40ff0ec2b8792 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -4,7 +4,7 @@ #ifndef _PDS_CORE_ADMINQ_H_ #define _PDS_CORE_ADMINQ_H_ -#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256 +#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256000 /* usecs */ enum pds_core_adminq_flags { PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ From 6828208a45c1839e94a87500e0721618d03783e7 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Fri, 25 Apr 2025 13:46:18 -0700 Subject: [PATCH 595/770] pds_core: init viftype default in declaration Initialize the .enabled field of the FWCTL viftype default in the declaration rather than as a bit of code as it is always to be enabled and needs no logic around it. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Signed-off-by: David S. 
Miller --- drivers/net/ethernet/amd/pds_core/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 0e408f990f0b2..076dfe2910c77 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -402,6 +402,7 @@ static int pdsc_core_init(struct pdsc *pdsc) static struct pdsc_viftype pdsc_viftype_defaults[] = { [PDS_DEV_TYPE_FWCTL] = { .name = PDS_DEV_TYPE_FWCTL_STR, + .enabled = true, .vif_id = PDS_DEV_TYPE_FWCTL, .dl_id = -1 }, [PDS_DEV_TYPE_VDPA] = { .name = PDS_DEV_TYPE_VDPA_STR, @@ -432,9 +433,6 @@ static int pdsc_viftypes_init(struct pdsc *pdsc) /* See what the Core device has for support */ vt_support = !!le16_to_cpu(pdsc->dev_ident.vif_types[vt]); - if (vt == PDS_DEV_TYPE_FWCTL) - pdsc->viftype_status[vt].enabled = true; - dev_dbg(pdsc->dev, "VIF %s is %ssupported\n", pdsc->viftype_status[vt].name, vt_support ? "" : "not "); From 9ab7a709c926c16b4433cf02d04fcbcf35aaab2b Mon Sep 17 00:00:00 2001 From: Shravya KN Date: Mon, 28 Apr 2025 15:58:56 -0700 Subject: [PATCH 596/770] bnxt_en: Fix error handling path in bnxt_init_chip() WARN_ON() is triggered in __flush_work() if bnxt_init_chip() fails because we call cancel_work_sync() on dim work that has not been initialized. WARNING: CPU: 37 PID: 5223 at kernel/workqueue.c:4201 __flush_work.isra.0+0x212/0x230 The driver relies on the BNXT_STATE_NAPI_DISABLED bit to check if dim work has already been cancelled. But in the bnxt_open() path, BNXT_STATE_NAPI_DISABLED is not set and this causes the error path to think that it needs to cancel the uninitalized dim work. Fix it by setting BNXT_STATE_NAPI_DISABLED during initialization. The bit will be cleared when we enable NAPI and initialize dim work. Fixes: 40452969a506 ("bnxt_en: Fix DIM shutdown") Suggested-by: Somnath Kotur Reviewed-by: Somnath Kotur Signed-off-by: Shravya KN Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2c8e2c19d8548..c4bccc683597c 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -11602,6 +11602,9 @@ static void bnxt_init_napi(struct bnxt *bp) poll_fn = bnxt_poll_p5; else if (BNXT_CHIP_TYPE_NITRO_A0(bp)) cp_nr_rings--; + + set_bit(BNXT_STATE_NAPI_DISABLED, &bp->state); + for (i = 0; i < cp_nr_rings; i++) { bnapi = bp->bnapi[i]; netif_napi_add_config_locked(bp->dev, &bnapi->napi, poll_fn, From 8e6cc9045380f3f0c48ebda2bda5e1abe263388d Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 28 Apr 2025 15:58:57 -0700 Subject: [PATCH 597/770] bnxt_en: Fix ethtool selftest output in one of the failure cases When RDMA driver is loaded, running offline self test is not supported and driver returns failure early. But it is not clearing the input buffer and hence the application prints some junk characters for individual test results. Fix it by clearing the buffer before returning. Fixes: 895621f1c816 ("bnxt_en: Don't support offline self test when RoCE driver is loaded") Reviewed-by: Somnath Kotur Signed-off-by: Kalesh AP Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 48dd5922e4dd8..7be37976f3e49 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4991,6 +4991,7 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, if (!bp->num_tests || !BNXT_PF(bp)) return; + memset(buf, 0, sizeof(u64) * bp->num_tests); if (etest->flags & ETH_TEST_FL_OFFLINE && bnxt_ulp_registered(bp->edev)) { etest->flags |= ETH_TEST_FL_FAILED; @@ -4998,7 +4999,6 @@ static void bnxt_self_test(struct net_device *dev, struct ethtool_test *etest, return; } - memset(buf, 0, sizeof(u64) * bp->num_tests); if (!netif_running(dev)) { etest->flags |= ETH_TEST_FL_FAILED; return; From a63db07e4ecd45b027718168faf7d798bb47bf58 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Mon, 28 Apr 2025 15:58:58 -0700 Subject: [PATCH 598/770] bnxt_en: Add missing skb_mark_for_recycle() in bnxt_rx_vlan() If bnxt_rx_vlan() fails because the VLAN protocol ID is invalid, the SKB is freed but we're missing the call to recycle it. This may cause the warning: "page_pool_release_retry() stalled pool shutdown" Add the missing skb_mark_for_recycle() in bnxt_rx_vlan(). Fixes: 86b05508f775 ("bnxt_en: Use the unified RX page pool buffers for XDP and non-XDP") Reviewed-by: Kalesh AP Reviewed-by: Pavan Chebbi Signed-off-by: Somnath Kotur Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index c4bccc683597c..cfc9ccab39bf8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2015,6 +2015,7 @@ static struct sk_buff *bnxt_rx_vlan(struct sk_buff *skb, u8 cmp_type, } return skb; vlan_err: + skb_mark_for_recycle(skb); dev_kfree_skb(skb); return NULL; } From 1ae04e489dd757e1e61999362f33e7c554c3b9e3 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Mon, 28 Apr 2025 15:58:59 -0700 Subject: [PATCH 599/770] bnxt_en: call pci_alloc_irq_vectors() after bnxt_reserve_rings() On some architectures (e.g. ARM), calling pci_alloc_irq_vectors() will immediately cause the MSIX table to be written. This will not work if we haven't called bnxt_reserve_rings() to properly map the MSIX table to the MSIX vectors reserved by FW. Fix the FW error recovery path to delay the bnxt_init_int_mode() -> pci_alloc_irq_vectors() call by removing it from bnxt_hwrm_if_change(). bnxt_request_irq() later in the code path will call it and by then the MSIX table is properly mapped. Fixes: 4343838ca5eb ("bnxt_en: Replace deprecated PCI MSIX APIs") Suggested-by: Michael Chan Signed-off-by: Kashyap Desai Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index cfc9ccab39bf8..3b2ea36f25a2a 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12399,13 +12399,8 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) set_bit(BNXT_STATE_ABORT_ERR, &bp->state); return rc; } + /* IRQ will be initialized later in bnxt_request_irq()*/ bnxt_clear_int_mode(bp); - rc = bnxt_init_int_mode(bp); - if (rc) { - clear_bit(BNXT_STATE_FW_RESET_DET, &bp->state); - netdev_err(bp->dev, "init int mode failed\n"); - return rc; - } } rc = bnxt_cancel_reservations(bp, fw_reset); } From c2d20a3814d1b57dea6db82229edd702bde9c878 Mon Sep 17 00:00:00 2001 From: Kashyap Desai Date: Mon, 28 Apr 2025 15:59:00 -0700 Subject: [PATCH 600/770] bnxt_en: delay pci_alloc_irq_vectors() in the AER path This patch is similar to the last patch to delay the pci_alloc_irq_vectors() call in the AER path until after calling bnxt_reserve_rings(). bnxt_reserve_rings() needs to properly map the MSIX table first before we call pci_alloc_irq_vectors() which may immediately write to the MSIX table in some architectures. Move the bnxt_init_int_mode() call from bnxt_io_slot_reset() to bnxt_io_resume() after calling bnxt_reserve_rings(). With this change, the AER path may call bnxt_open() -> bnxt_hwrm_if_change() with bp->irq_tbl set to NULL. bp->irq_tbl is cleared when we call bnxt_clear_int_mode() in bnxt_io_slot_reset(). So we cannot use !bp->irq_tbl to detect aborted FW reset. Add a new BNXT_FW_RESET_STATE_ABORT to detect aborted FW reset in bnxt_hwrm_if_change(). Signed-off-by: Kashyap Desai Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 17 +++++++++++------ drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 3b2ea36f25a2a..78e496b0ec262 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -12325,12 +12325,15 @@ static int bnxt_hwrm_if_change(struct bnxt *bp, bool up) { struct hwrm_func_drv_if_change_output *resp; struct hwrm_func_drv_if_change_input *req; - bool fw_reset = !bp->irq_tbl; bool resc_reinit = false; bool caps_change = false; int rc, retry = 0; + bool fw_reset; u32 flags = 0; + fw_reset = (bp->fw_reset_state == BNXT_FW_RESET_STATE_ABORT); + bp->fw_reset_state = 0; + if (!(bp->fw_cap & BNXT_FW_CAP_IF_CHANGE)) return 0; @@ -14833,7 +14836,7 @@ static void bnxt_fw_reset_abort(struct bnxt *bp, int rc) clear_bit(BNXT_STATE_IN_FW_RESET, &bp->state); if (bp->fw_reset_state != BNXT_FW_RESET_STATE_POLL_VF) bnxt_dl_health_fw_status_update(bp, false); - bp->fw_reset_state = 0; + bp->fw_reset_state = BNXT_FW_RESET_STATE_ABORT; netif_close(bp->dev); } @@ -16931,10 +16934,9 @@ static pci_ers_result_t bnxt_io_slot_reset(struct pci_dev *pdev) if (!err) result = PCI_ERS_RESULT_RECOVERED; + /* IRQ will be initialized later in bnxt_io_resume */ bnxt_ulp_irq_stop(bp); bnxt_clear_int_mode(bp); - err = bnxt_init_int_mode(bp); - bnxt_ulp_irq_restart(bp, err); } reset_exit: @@ -16963,10 +16965,13 @@ static void bnxt_io_resume(struct pci_dev *pdev) err = bnxt_hwrm_func_qcaps(bp); if (!err) { - if (netif_running(netdev)) + if (netif_running(netdev)) { err = bnxt_open(netdev); - else + } else { err = bnxt_reserve_rings(bp, true); + if (!err) + err = bnxt_init_int_mode(bp); + } } if (!err) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 21726cf565866..bc8b3b7e915d3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -2614,6 +2614,7 @@ struct bnxt { #define BNXT_FW_RESET_STATE_POLL_FW 4 #define BNXT_FW_RESET_STATE_OPENING 5 #define BNXT_FW_RESET_STATE_POLL_FW_DOWN 6 +#define BNXT_FW_RESET_STATE_ABORT 7 u16 fw_reset_min_dsecs; #define BNXT_DFLT_FW_RST_MIN_DSECS 20 From ea9376cf68230e05492f22ca45d329f16e262c7b Mon Sep 17 00:00:00 2001 From: Shruti Parab Date: Mon, 28 Apr 2025 15:59:01 -0700 Subject: [PATCH 601/770] bnxt_en: Fix coredump logic to free allocated buffer When handling HWRM_DBG_COREDUMP_LIST FW command in bnxt_hwrm_dbg_dma_data(), the allocated buffer info->dest_buf is not freed in the error path. In the normal path, info->dest_buf is assigned to coredump->data and it will eventually be freed after the coredump is collected. Free info->dest_buf immediately inside bnxt_hwrm_dbg_dma_data() in the error path. Fixes: c74751f4c392 ("bnxt_en: Return error if FW returns more data than dump length") Reported-by: Michael Chan Reviewed-by: Kalesh AP Signed-off-by: Shruti Parab Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c index 5576e7cf84631..9a74793648f69 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c @@ -116,6 +116,11 @@ static int bnxt_hwrm_dbg_dma_data(struct bnxt *bp, void *msg, memcpy(info->dest_buf + off, dma_buf, len); } else { rc = -ENOBUFS; + if (cmn_req->req_type == + cpu_to_le16(HWRM_DBG_COREDUMP_LIST)) { + kfree(info->dest_buf); + info->dest_buf = NULL; + } break; } } From 6b87bd94f34370bbf1dfa59352bed8efab5bf419 Mon Sep 17 00:00:00 2001 From: Shruti Parab Date: Mon, 28 Apr 2025 15:59:02 -0700 Subject: [PATCH 602/770] bnxt_en: Fix out-of-bound memcpy() during ethtool -w When retrieving the FW coredump using ethtool, it can sometimes cause memory corruption: BUG: KFENCE: memory corruption in __bnxt_get_coredump+0x3ef/0x670 [bnxt_en] Corrupted memory at 0x000000008f0f30e8 [ ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ] (in kfence-#45): __bnxt_get_coredump+0x3ef/0x670 [bnxt_en] ethtool_get_dump_data+0xdc/0x1a0 __dev_ethtool+0xa1e/0x1af0 dev_ethtool+0xa8/0x170 dev_ioctl+0x1b5/0x580 sock_do_ioctl+0xab/0xf0 sock_ioctl+0x1ce/0x2e0 __x64_sys_ioctl+0x87/0xc0 do_syscall_64+0x5c/0xf0 entry_SYSCALL_64_after_hwframe+0x78/0x80 ... This happens when copying the coredump segment list in bnxt_hwrm_dbg_dma_data() with the HWRM_DBG_COREDUMP_LIST FW command. The info->dest_buf buffer is allocated based on the number of coredump segments returned by the FW. The segment list is then DMA'ed by the FW and the length of the DMA is returned by FW. The driver then copies this DMA'ed segment list to info->dest_buf. In some cases, this DMA length may exceed the info->dest_buf length and cause the above BUG condition. Fix it by capping the copy length to not exceed the length of info->dest_buf. The extra DMA data contains no useful information. This code path is shared for the HWRM_DBG_COREDUMP_LIST and the HWRM_DBG_COREDUMP_RETRIEVE FW commands. The buffering is different for these 2 FW commands. To simplify the logic, we need to move the line to adjust the buffer length for HWRM_DBG_COREDUMP_RETRIEVE up, so that the new check to cap the copy length will work for both commands. Fixes: c74751f4c392 ("bnxt_en: Return error if FW returns more data than dump length") Reviewed-by: Kalesh AP Signed-off-by: Shruti Parab Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnxt/bnxt_coredump.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c index 9a74793648f69..a000d3f630bd3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_coredump.c @@ -110,10 +110,19 @@ static int bnxt_hwrm_dbg_dma_data(struct bnxt *bp, void *msg, } } + if (cmn_req->req_type == + cpu_to_le16(HWRM_DBG_COREDUMP_RETRIEVE)) + info->dest_buf_size += len; + if (info->dest_buf) { if ((info->seg_start + off + len) <= BNXT_COREDUMP_BUF_LEN(info->buf_len)) { - memcpy(info->dest_buf + off, dma_buf, len); + u16 copylen = min_t(u16, len, + info->dest_buf_size - off); + + memcpy(info->dest_buf + off, dma_buf, copylen); + if (copylen < len) + break; } else { rc = -ENOBUFS; if (cmn_req->req_type == @@ -125,10 +134,6 @@ static int bnxt_hwrm_dbg_dma_data(struct bnxt *bp, void *msg, } } - if (cmn_req->req_type == - cpu_to_le16(HWRM_DBG_COREDUMP_RETRIEVE)) - info->dest_buf_size += len; - if (!(cmn_resp->flags & HWRM_DBG_CMN_FLAGS_MORE)) break; From 02e8be5a032cae0f4ca33c6053c44d83cf4acc93 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 28 Apr 2025 15:59:03 -0700 Subject: [PATCH 603/770] bnxt_en: Fix ethtool -d byte order for 32-bit values For version 1 register dump that includes the PCIe stats, the existing code incorrectly assumes that all PCIe stats are 64-bit values. Fix it by using an array containing the starting and ending index of the 32-bit values. The loop in bnxt_get_regs() will use the array to do proper endian swap for the 32-bit values. Fixes: b5d600b027eb ("bnxt_en: Add support for 'ethtool -d'") Reviewed-by: Shruti Parab Reviewed-by: Kalesh AP Reviewed-by: Andy Gospodarek Signed-off-by: Michael Chan Signed-off-by: David S. 
Miller --- .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 38 ++++++++++++++++--- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 7be37976f3e49..f5d490bf997e3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -2062,6 +2062,17 @@ static int bnxt_get_regs_len(struct net_device *dev) return reg_len; } +#define BNXT_PCIE_32B_ENTRY(start, end) \ + { offsetof(struct pcie_ctx_hw_stats, start), \ + offsetof(struct pcie_ctx_hw_stats, end) } + +static const struct { + u16 start; + u16 end; +} bnxt_pcie_32b_entries[] = { + BNXT_PCIE_32B_ENTRY(pcie_ltssm_histogram[0], pcie_ltssm_histogram[3]), +}; + static void bnxt_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p) { @@ -2094,12 +2105,27 @@ static void bnxt_get_regs(struct net_device *dev, struct ethtool_regs *regs, req->pcie_stat_host_addr = cpu_to_le64(hw_pcie_stats_addr); rc = hwrm_req_send(bp, req); if (!rc) { - __le64 *src = (__le64 *)hw_pcie_stats; - u64 *dst = (u64 *)(_p + BNXT_PXP_REG_LEN); - int i; - - for (i = 0; i < sizeof(*hw_pcie_stats) / sizeof(__le64); i++) - dst[i] = le64_to_cpu(src[i]); + u8 *dst = (u8 *)(_p + BNXT_PXP_REG_LEN); + u8 *src = (u8 *)hw_pcie_stats; + int i, j; + + for (i = 0, j = 0; i < sizeof(*hw_pcie_stats); ) { + if (i >= bnxt_pcie_32b_entries[j].start && + i <= bnxt_pcie_32b_entries[j].end) { + u32 *dst32 = (u32 *)(dst + i); + + *dst32 = le32_to_cpu(*(__le32 *)(src + i)); + i += 4; + if (i > bnxt_pcie_32b_entries[j].end && + j < ARRAY_SIZE(bnxt_pcie_32b_entries) - 1) + j++; + } else { + u64 *dst64 = (u64 *)(dst + i); + + *dst64 = le64_to_cpu(*(__le64 *)(src + i)); + i += 8; + } + } } hwrm_req_drop(bp, req); } From 4f79eaa2ceac86a0e0f304b0bab556cca5bf4f30 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 30 Apr 2025 15:56:34 -0700 Subject: [PATCH 604/770] kbuild: Properly disable -Wunterminated-string-initialization for clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang and GCC have different behaviors around disabling warnings included in -Wall and -Wextra and the order in which flags are specified, which is exposed by clang's new support for -Wunterminated-string-initialization. $ cat test.c const char foo[3] = "FOO"; const char bar[3] __attribute__((__nonstring__)) = "BAR"; $ clang -fsyntax-only -Wextra test.c test.c:1:21: warning: initializer-string for character array is too long, array size is 3 but initializer has size 4 (including the null terminating character); did you mean to use the 'nonstring' attribute? [-Wunterminated-string-initialization] 1 | const char foo[3] = "FOO"; | ^~~~~ $ clang -fsyntax-only -Wextra -Wno-unterminated-string-initialization test.c $ clang -fsyntax-only -Wno-unterminated-string-initialization -Wextra test.c test.c:1:21: warning: initializer-string for character array is too long, array size is 3 but initializer has size 4 (including the null terminating character); did you mean to use the 'nonstring' attribute? 
[-Wunterminated-string-initialization] 1 | const char foo[3] = "FOO"; | ^~~~~ $ gcc -fsyntax-only -Wextra test.c test.c:1:21: warning: initializer-string for array of ‘char’ truncates NUL terminator but destination lacks ‘nonstring’ attribute (4 chars into 3 available) [-Wunterminated-string-initialization] 1 | const char foo[3] = "FOO"; | ^~~~~ $ gcc -fsyntax-only -Wextra -Wno-unterminated-string-initialization test.c $ gcc -fsyntax-only -Wno-unterminated-string-initialization -Wextra test.c Move -Wextra up right below -Wall in Makefile.extrawarn to ensure these flags are at the beginning of the warning options list. Move the couple of warning options that have been added to the main Makefile since commit e88ca24319e4 ("kbuild: consolidate warning flags in scripts/Makefile.extrawarn") to scripts/Makefile.extrawarn after -Wall / -Wextra to ensure they get properly disabled for all compilers. Fixes: 9d7a0577c9db ("gcc-15: disable '-Wunterminated-string-initialization' entirely for now") Link: https://github.com/llvm/llvm-project/issues/10359 Signed-off-by: Nathan Chancellor Signed-off-by: Linus Torvalds --- Makefile | 7 ------- scripts/Makefile.extrawarn | 9 ++++++++- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 5aa9ee52a765b..94be5dfb81fb3 100644 --- a/Makefile +++ b/Makefile @@ -1052,13 +1052,6 @@ NOSTDINC_FLAGS += -nostdinc # perform bounds checking. KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) -#Currently, disable -Wstringop-overflow for GCC 11, globally. -KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-disable-warning, stringop-overflow) -KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) - -#Currently, disable -Wunterminated-string-initialization as broken -KBUILD_CFLAGS += $(call cc-disable-warning, unterminated-string-initialization) - # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 2d6e59561c9d0..d88acdf408552 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -8,6 +8,7 @@ # Default set of warnings, always enabled KBUILD_CFLAGS += -Wall +KBUILD_CFLAGS += -Wextra KBUILD_CFLAGS += -Wundef KBUILD_CFLAGS += -Werror=implicit-function-declaration KBUILD_CFLAGS += -Werror=implicit-int @@ -56,6 +57,13 @@ KBUILD_CFLAGS += -Wno-pointer-sign # globally built with -Wcast-function-type. KBUILD_CFLAGS += $(call cc-option, -Wcast-function-type) +# Currently, disable -Wstringop-overflow for GCC 11, globally. +KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-disable-warning, stringop-overflow) +KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) + +# Currently, disable -Wunterminated-string-initialization as broken +KBUILD_CFLAGS += $(call cc-disable-warning, unterminated-string-initialization) + # The allocators already balk at large sizes, so silence the compiler # warnings for bounds checks involving those possible values. 
While # -Wno-alloc-size-larger-than would normally be used here, earlier versions @@ -82,7 +90,6 @@ KBUILD_CFLAGS += $(call cc-option,-Werror=designated-init) # Warn if there is an enum types mismatch KBUILD_CFLAGS += $(call cc-option,-Wenum-conversion) -KBUILD_CFLAGS += -Wextra KBUILD_CFLAGS += -Wunused # From 927069d5c40c1cfa7b2d13cfc6d7d58bc6f85c50 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 30 Apr 2025 10:03:43 -0700 Subject: [PATCH 605/770] bnxt_en: fix module unload sequence Recent updates to the PTP part of bnxt changed the way PTP FIFO is cleared, skbs waiting for TX timestamps are now cleared during ndo_close() call. To do clearing procedure, the ptp structure must exist and point to a valid address. Module destroy sequence had ptp clear code running before netdev close causing invalid memory access and kernel crash. Change the sequence to destroy ptp structure after device close. Fixes: 8f7ae5a85137 ("bnxt_en: improve TX timestamping FIFO configuration") Reported-by: Taehee Yoo Closes: https://lore.kernel.org/netdev/CAMArcTWDe2cd41=ub=zzvYifaYcYv-N-csxfqxUvejy_L0D6UQ@mail.gmail.com/ Signed-off-by: Vadim Fedorenko Reviewed-by: Jacob Keller Reviewed-by: Michael Chan Tested-by: Taehee Yoo Link: https://patch.msgid.link/20250430170343.759126-1-vadfed@meta.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 78e496b0ec262..86a5de44b6f3f 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -16006,8 +16006,8 @@ static void bnxt_remove_one(struct pci_dev *pdev) bnxt_rdma_aux_device_del(bp); - bnxt_ptp_clear(bp); unregister_netdev(dev); + bnxt_ptp_clear(bp); bnxt_rdma_aux_device_uninit(bp); From f920436a44295ca791ebb6dae3f4190142eec703 Mon Sep 17 00:00:00 2001 From: Jibin Zhang Date: Tue, 29 Apr 2025 09:59:48 +0800 Subject: [PATCH 606/770] net: use sock_gen_put() when sk_state is TCP_TIME_WAIT It is possible for a pointer of type struct inet_timewait_sock to be returned from the functions __inet_lookup_established() and __inet6_lookup_established(). This can cause a crash when the returned pointer is of type struct inet_timewait_sock and sock_put() is called on it. The following is a crash call stack that shows sk->sk_wmem_alloc being accessed in sk_free() during the call to sock_put() on a struct inet_timewait_sock pointer. To avoid this issue, use sock_gen_put() instead of sock_put() when sk->sk_state is TCP_TIME_WAIT. 
mrdump.ko ipanic() + 120 vmlinux notifier_call_chain(nr_to_call=-1, nr_calls=0) + 132 vmlinux atomic_notifier_call_chain(val=0) + 56 vmlinux panic() + 344 vmlinux add_taint() + 164 vmlinux end_report() + 136 vmlinux kasan_report(size=0) + 236 vmlinux report_tag_fault() + 16 vmlinux do_tag_recovery() + 16 vmlinux __do_kernel_fault() + 88 vmlinux do_bad_area() + 28 vmlinux do_tag_check_fault() + 60 vmlinux do_mem_abort() + 80 vmlinux el1_abort() + 56 vmlinux el1h_64_sync_handler() + 124 vmlinux > 0xFFFFFFC080011294() vmlinux __lse_atomic_fetch_add_release(v=0xF2FFFF82A896087C) vmlinux __lse_atomic_fetch_sub_release(v=0xF2FFFF82A896087C) vmlinux arch_atomic_fetch_sub_release(i=1, v=0xF2FFFF82A896087C) + 8 vmlinux raw_atomic_fetch_sub_release(i=1, v=0xF2FFFF82A896087C) + 8 vmlinux atomic_fetch_sub_release(i=1, v=0xF2FFFF82A896087C) + 8 vmlinux __refcount_sub_and_test(i=1, r=0xF2FFFF82A896087C, oldp=0) + 8 vmlinux __refcount_dec_and_test(r=0xF2FFFF82A896087C, oldp=0) + 8 vmlinux refcount_dec_and_test(r=0xF2FFFF82A896087C) + 8 vmlinux sk_free(sk=0xF2FFFF82A8960700) + 28 vmlinux sock_put() + 48 vmlinux tcp6_check_fraglist_gro() + 236 vmlinux tcp6_gro_receive() + 624 vmlinux ipv6_gro_receive() + 912 vmlinux dev_gro_receive() + 1116 vmlinux napi_gro_receive() + 196 ccmni.ko ccmni_rx_callback() + 208 ccmni.ko ccmni_queue_recv_skb() + 388 ccci_dpmaif.ko dpmaif_rxq_push_thread() + 1088 vmlinux kthread() + 268 vmlinux 0xFFFFFFC08001F30C() Fixes: c9d1d23e5239 ("net: add heuristic for enabling TCP fraglist GRO") Signed-off-by: Jibin Zhang Signed-off-by: Shiming Cheng Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250429020412.14163-1-shiming.cheng@mediatek.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_offload.c | 2 +- net/ipv6/tcpv6_offload.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 934f777f29d36..d293087b426df 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -439,7 +439,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) - sock_put(sk); + sock_gen_put(sk); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index d9b11fe41bf0c..a8a04f441e788 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -42,7 +42,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) - sock_put(sk); + sock_gen_put(sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ } From e98386d79a23c57cf179fe4138322e277aa3aa74 Mon Sep 17 00:00:00 2001 From: Sagi Maimon Date: Tue, 29 Apr 2025 10:33:20 +0300 Subject: [PATCH 607/770] ptp: ocp: Fix NULL dereference in Adva board SMA sysfs operations On Adva boards, SMA sysfs store/get operations can call __handle_signal_outputs() or __handle_signal_inputs() while the `irig` and `dcf` pointers are uninitialized, leading to a NULL pointer dereference in __handle_signal() and causing a kernel crash. Adva boards don't use `irig` or `dcf` functionality, so add Adva-specific callbacks `ptp_ocp_sma_adva_set_outputs()` and `ptp_ocp_sma_adva_set_inputs()` that avoid invoking `irig` or `dcf` input/output routines. 
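To make the failure mode concrete, here is a deliberately simplified, hypothetical sketch (the field and register names are illustrative stand-ins, not the actual ptp_ocp definitions) of how a shared output handler ends up dereferencing pointers that Adva boards never initialize:

/* Hypothetical sketch of the crash described above; names are
 * illustrative, not the real ptp_ocp layout.
 */
static void __handle_signal_outputs(struct ptp_ocp *bp, u32 val)
{
	/* Only boards with IRIG/DCF hardware populate these pointers;
	 * on Adva boards they remain NULL after probe.
	 */
	iowrite32(val, &bp->irig_out->ctrl);	/* NULL dereference on Adva */
	iowrite32(val, &bp->dcf_out->ctrl);	/* NULL dereference on Adva */
}

The Adva-specific callbacks in the diff below avoid the problem structurally: they only program the SMA selector GPIOs and never enter the IRIG/DCF paths.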
Fixes: ef61f5528fca ("ptp: ocp: add Adva timecard support") Signed-off-by: Sagi Maimon Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250429073320.33277-1-maimon.sagi@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 52 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index faf6e027f89ab..2ccdca4f69604 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -2578,12 +2578,60 @@ static const struct ocp_sma_op ocp_fb_sma_op = { .set_output = ptp_ocp_sma_fb_set_output, }; +static int +ptp_ocp_sma_adva_set_output(struct ptp_ocp *bp, int sma_nr, u32 val) +{ + u32 reg, mask, shift; + unsigned long flags; + u32 __iomem *gpio; + + gpio = sma_nr > 2 ? &bp->sma_map1->gpio2 : &bp->sma_map2->gpio2; + shift = sma_nr & 1 ? 0 : 16; + + mask = 0xffff << (16 - shift); + + spin_lock_irqsave(&bp->lock, flags); + + reg = ioread32(gpio); + reg = (reg & mask) | (val << shift); + + iowrite32(reg, gpio); + + spin_unlock_irqrestore(&bp->lock, flags); + + return 0; +} + +static int +ptp_ocp_sma_adva_set_inputs(struct ptp_ocp *bp, int sma_nr, u32 val) +{ + u32 reg, mask, shift; + unsigned long flags; + u32 __iomem *gpio; + + gpio = sma_nr > 2 ? &bp->sma_map2->gpio1 : &bp->sma_map1->gpio1; + shift = sma_nr & 1 ? 0 : 16; + + mask = 0xffff << (16 - shift); + + spin_lock_irqsave(&bp->lock, flags); + + reg = ioread32(gpio); + reg = (reg & mask) | (val << shift); + + iowrite32(reg, gpio); + + spin_unlock_irqrestore(&bp->lock, flags); + + return 0; +} + static const struct ocp_sma_op ocp_adva_sma_op = { .tbl = { ptp_ocp_adva_sma_in, ptp_ocp_adva_sma_out }, .init = ptp_ocp_sma_fb_init, .get = ptp_ocp_sma_fb_get, - .set_inputs = ptp_ocp_sma_fb_set_inputs, - .set_output = ptp_ocp_sma_fb_set_output, + .set_inputs = ptp_ocp_sma_adva_set_inputs, + .set_output = ptp_ocp_sma_adva_set_output, }; static int From 2d52e2e38b85c8b7bc00dca55c2499f46f8c8198 Mon Sep 17 00:00:00 2001 From: Thangaraj Samynathan Date: Tue, 29 Apr 2025 10:55:27 +0530 Subject: [PATCH 608/770] net: lan743x: Fix memleak issue when GSO enabled Always map the `skb` to the LS descriptor. Previously skb was mapped to EXT descriptor when the number of fragments is zero with GSO enabled. 
Mapping the skb to EXT descriptor prevents it from being freed, leading to a memory leak. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Thangaraj Samynathan Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250429052527.10031-1-thangaraj.s@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 8 ++++++-- drivers/net/ethernet/microchip/lan743x_main.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 23760b613d3ec..e2d6bfb5d6933 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1815,6 +1815,7 @@ static void lan743x_tx_frame_add_lso(struct lan743x_tx *tx, if (nr_frags <= 0) { tx->frame_data0 |= TX_DESC_DATA0_LS_; tx->frame_data0 |= TX_DESC_DATA0_IOC_; + tx->frame_last = tx->frame_first; } tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; tx_descriptor->data0 = cpu_to_le32(tx->frame_data0); @@ -1884,6 +1885,7 @@ static int lan743x_tx_frame_add_fragment(struct lan743x_tx *tx, tx->frame_first = 0; tx->frame_data0 = 0; tx->frame_tail = 0; + tx->frame_last = 0; return -ENOMEM; } @@ -1924,16 +1926,18 @@ static void lan743x_tx_frame_end(struct lan743x_tx *tx, TX_DESC_DATA0_DTYPE_DATA_) { tx->frame_data0 |= TX_DESC_DATA0_LS_; tx->frame_data0 |= TX_DESC_DATA0_IOC_; + tx->frame_last = tx->frame_tail; } - tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; - buffer_info = &tx->buffer_info[tx->frame_tail]; + tx_descriptor = &tx->ring_cpu_ptr[tx->frame_last]; + buffer_info = &tx->buffer_info[tx->frame_last]; buffer_info->skb = skb; if (time_stamp) buffer_info->flags |= TX_BUFFER_INFO_FLAG_TIMESTAMP_REQUESTED; if (ignore_sync) buffer_info->flags |= TX_BUFFER_INFO_FLAG_IGNORE_SYNC; + tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; tx_descriptor->data0 = cpu_to_le32(tx->frame_data0); tx->frame_tail = lan743x_tx_next_index(tx, tx->frame_tail); tx->last_tail = tx->frame_tail; diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index 7f73d66854bee..db5fc73e41cca 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -980,6 +980,7 @@ struct lan743x_tx { u32 frame_first; u32 frame_data0; u32 frame_tail; + u32 frame_last; struct lan743x_tx_buffer_info *buffer_info; From a179aad12badc43201cbf45d1e8ed2c1383c76b9 Mon Sep 17 00:00:00 2001 From: Mattias Barthel Date: Tue, 29 Apr 2025 11:08:26 +0200 Subject: [PATCH 609/770] net: fec: ERR007885 Workaround for conventional TX Activate TX hang workaround also in fec_enet_txq_submit_skb() when TSO is not enabled. Errata: ERR007885 Symptoms: NETDEV WATCHDOG: eth0 (fec): transmit queue 0 timed out commit 37d6017b84f7 ("net: fec: Workaround for imx6sx enet tx hang when enable three queues") There is a TDAR race condition for multiQ when the software sets TDAR and the UDMA clears TDAR simultaneously or in a small window (2-4 cycles). This will cause the udma_tx and udma_tx_arbiter state machines to hang. So the workaround is to check the TDAR status four times: if TDAR has been cleared by hardware, then write TDAR; otherwise don't set TDAR.
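Condensed, the workaround pattern applied in the diff below looks like this (identifiers taken from the driver; the comments spell out the erratum rationale):

/* ERR007885: writing 0 to reg_desc_active sets TDAR. The race is only
 * hit when software sets TDAR in the 2-4 cycle window where the uDMA
 * is clearing it. Reading the register four times covers that window:
 * if any read returns 0, hardware has fully cleared TDAR and it is
 * safe to set it again; if all four reads see it still set, the DMA is
 * still active and will pick up the new descriptors without a fresh
 * trigger, so the write is skipped.
 */
if (!(fep->quirks & FEC_QUIRK_ERR007885) ||	/* unaffected SoCs */
    !readl(txq->bd.reg_desc_active) ||
    !readl(txq->bd.reg_desc_active) ||
    !readl(txq->bd.reg_desc_active) ||
    !readl(txq->bd.reg_desc_active))
	writel(0, txq->bd.reg_desc_active);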
Fixes: 53bb20d1faba ("net: fec: add variable reg_desc_active to speed things up") Signed-off-by: Mattias Barthel Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250429090826.3101258-1-mattiasbarthel@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index a86cfebedaa8b..17e9bddb9ddd5 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -714,7 +714,12 @@ static int fec_enet_txq_submit_skb(struct fec_enet_priv_tx_q *txq, txq->bd.cur = bdp; /* Trigger transmission start */ - writel(0, txq->bd.reg_desc_active); + if (!(fep->quirks & FEC_QUIRK_ERR007885) || + !readl(txq->bd.reg_desc_active) || + !readl(txq->bd.reg_desc_active) || + !readl(txq->bd.reg_desc_active) || + !readl(txq->bd.reg_desc_active)) + writel(0, txq->bd.reg_desc_active); return 0; } From 34f42736b325287a7b2ce37e415838f539767bda Mon Sep 17 00:00:00 2001 From: Sathesh B Edara Date: Tue, 29 Apr 2025 04:46:24 -0700 Subject: [PATCH 610/770] octeon_ep: Fix host hang issue during device reboot When the host loses heartbeat messages from the device, the driver calls the device-specific ndo_stop function, which frees the resources. If the driver is unloaded in this scenario, it calls ndo_stop again, attempting to free resources that have already been freed, leading to a host hang issue. To resolve this, dev_close should be called instead of the device-specific stop function. dev_close internally calls ndo_stop to stop the network interface and performs additional cleanup tasks. During the driver unload process, if the device is already down, ndo_stop is not called. Fixes: 5cb96c29aa0e ("octeon_ep: add heartbeat monitor") Signed-off-by: Sathesh B Edara Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250429114624.19104-1-sedara@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeon_ep/octep_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c index 0a679e95196fe..24499bb36c005 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_main.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_main.c @@ -1223,7 +1223,7 @@ static void octep_hb_timeout_task(struct work_struct *work) miss_cnt); rtnl_lock(); if (netif_running(oct->netdev)) - octep_stop(oct->netdev); + dev_close(oct->netdev); rtnl_unlock(); } From ef2383d078edcbe3055032436b16cdf206f26de2 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 30 Apr 2025 17:30:49 +0800 Subject: [PATCH 611/770] net: hns3: store rx VLAN tag offload state for VF The VF driver fails to store the rx VLAN tag strip state when the user changes the rx VLAN tag offload state, and it defaults to enabling rx VLAN tag stripping when the VF device is re-initialized after a reset. So if the user disables rx VLAN tag offload and triggers a reset, the HW will still strip the VLAN tag from the packet and fill it into the RX BD, but the VF driver will ignore it because rx VLAN tag offload is disabled. This may cause the rx VLAN tag to be dropped.
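The essence of the fix, reduced to its two moving parts (both taken directly from the diff that follows): cache the user's choice when it is applied, and replay the cached value instead of a hard-coded true on the reset path.

/* Condensed from the diff below: */
static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable)
{
	struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle);
	int ret;

	ret = hclgevf_en_hw_strip_rxvtag_cmd(hdev, enable); /* mailbox to PF */
	if (ret)
		return ret;

	hdev->rxvtag_strip_en = enable;	/* remember the user's setting */
	return 0;
}

/* first probe:  hclgevf_init_vlan_config(hdev, true);
 * after reset:  hclgevf_init_vlan_config(hdev, hdev->rxvtag_strip_en);
 */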
Fixes: b2641e2ad456 ("net: hns3: Add support of hardware rx-vlan-offload to HNS3 VF driver") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250430093052.2400464-2-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 25 ++++++++++++++----- .../hisilicon/hns3/hns3vf/hclgevf_main.h | 1 + 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 9ba767740a043..dada42e7e0ec9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1292,9 +1292,8 @@ static void hclgevf_sync_vlan_filter(struct hclgevf_dev *hdev) rtnl_unlock(); } -static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) +static int hclgevf_en_hw_strip_rxvtag_cmd(struct hclgevf_dev *hdev, bool enable) { - struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); struct hclge_vf_to_pf_msg send_msg; hclgevf_build_send_msg(&send_msg, HCLGE_MBX_SET_VLAN, @@ -1303,6 +1302,19 @@ static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) return hclgevf_send_mbx_msg(hdev, &send_msg, false, NULL, 0); } +static int hclgevf_en_hw_strip_rxvtag(struct hnae3_handle *handle, bool enable) +{ + struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); + int ret; + + ret = hclgevf_en_hw_strip_rxvtag_cmd(hdev, enable); + if (ret) + return ret; + + hdev->rxvtag_strip_en = enable; + return 0; +} + static int hclgevf_reset_tqp(struct hnae3_handle *handle) { #define HCLGEVF_RESET_ALL_QUEUE_DONE 1U @@ -2204,12 +2216,13 @@ static int hclgevf_rss_init_hw(struct hclgevf_dev *hdev) tc_valid, tc_size); } -static int hclgevf_init_vlan_config(struct hclgevf_dev *hdev) +static int hclgevf_init_vlan_config(struct hclgevf_dev *hdev, + bool rxvtag_strip_en) { struct hnae3_handle *nic = &hdev->nic; int ret; - ret = hclgevf_en_hw_strip_rxvtag(nic, true); + ret = hclgevf_en_hw_strip_rxvtag(nic, rxvtag_strip_en); if (ret) { dev_err(&hdev->pdev->dev, "failed to enable rx vlan offload, ret = %d\n", ret); @@ -2879,7 +2892,7 @@ static int hclgevf_reset_hdev(struct hclgevf_dev *hdev) if (ret) return ret; - ret = hclgevf_init_vlan_config(hdev); + ret = hclgevf_init_vlan_config(hdev, hdev->rxvtag_strip_en); if (ret) { dev_err(&hdev->pdev->dev, "failed(%d) to initialize VLAN config\n", ret); @@ -2994,7 +3007,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) goto err_config; } - ret = hclgevf_init_vlan_config(hdev); + ret = hclgevf_init_vlan_config(hdev, true); if (ret) { dev_err(&hdev->pdev->dev, "failed(%d) to initialize VLAN config\n", ret); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h index cccef32284616..0208425ab594f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.h @@ -253,6 +253,7 @@ struct hclgevf_dev { int *vector_irq; bool gro_en; + bool rxvtag_strip_en; unsigned long vlan_del_fail_bmap[BITS_TO_LONGS(VLAN_N_VID)]; From 8e6b9c6ea5a55045eed6526d8ee49e93192d1a58 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Wed, 30 Apr 2025 17:30:50 +0800 Subject: [PATCH 612/770] net: hns3: fix an interrupt residual problem When a VF is passed through to a VM and the VM is killed, the reported interrupt may not have been handled; it will remain, and won't be
cleared by the nic engine even with an FLR or TQP reset. When the VM restarts, the interrupt of the first vector may be dropped by the second enable_irq in vfio, see the issue below: https://gitlab.com/qemu-project/qemu/-/issues/2884#note_2423361621 We notice that vfio has always behaved this way, and the interrupt is a residue of the nic engine, so we fix the problem by moving the vector enable process out of the enable_irq loop. Fixes: 08a100689d4b ("net: hns3: re-organize vector handle") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250430093052.2400464-3-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 82 +++++++++---------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 9ff797fb36c45..b03b8758c7774 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -473,20 +473,14 @@ static void hns3_mask_vector_irq(struct hns3_enet_tqp_vector *tqp_vector, writel(mask_en, tqp_vector->mask_addr); } -static void hns3_vector_enable(struct hns3_enet_tqp_vector *tqp_vector) +static void hns3_irq_enable(struct hns3_enet_tqp_vector *tqp_vector) { napi_enable(&tqp_vector->napi); enable_irq(tqp_vector->vector_irq); - - /* enable vector */ - hns3_mask_vector_irq(tqp_vector, 1); } -static void hns3_vector_disable(struct hns3_enet_tqp_vector *tqp_vector) +static void hns3_irq_disable(struct hns3_enet_tqp_vector *tqp_vector) { - /* disable vector */ - hns3_mask_vector_irq(tqp_vector, 0); - disable_irq(tqp_vector->vector_irq); napi_disable(&tqp_vector->napi); cancel_work_sync(&tqp_vector->rx_group.dim.work); @@ -707,11 +701,42 @@ static int hns3_set_rx_cpu_rmap(struct net_device *netdev) return 0; } +static void hns3_enable_irqs_and_tqps(struct net_device *netdev) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; + u16 i; + + for (i = 0; i < priv->vector_num; i++) + hns3_irq_enable(&priv->tqp_vector[i]); + + for (i = 0; i < priv->vector_num; i++) + hns3_mask_vector_irq(&priv->tqp_vector[i], 1); + + for (i = 0; i < h->kinfo.num_tqps; i++) + hns3_tqp_enable(h->kinfo.tqp[i]); +} + +static void hns3_disable_irqs_and_tqps(struct net_device *netdev) +{ + struct hns3_nic_priv *priv = netdev_priv(netdev); + struct hnae3_handle *h = priv->ae_handle; + u16 i; + + for (i = 0; i < h->kinfo.num_tqps; i++) + hns3_tqp_disable(h->kinfo.tqp[i]); + + for (i = 0; i < priv->vector_num; i++) + hns3_mask_vector_irq(&priv->tqp_vector[i], 0); + + for (i = 0; i < priv->vector_num; i++) + hns3_irq_disable(&priv->tqp_vector[i]); +} + static int hns3_nic_net_up(struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hnae3_handle *h = priv->ae_handle; - int i, j; int ret; ret = hns3_nic_reset_all_ring(h); @@ -720,23 +745,13 @@ static int hns3_nic_net_up(struct net_device *netdev) clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); - /* enable the vectors */ - for (i = 0; i < priv->vector_num; i++) - hns3_vector_enable(&priv->tqp_vector[i]); - - /* enable rcb */ - for (j = 0; j < h->kinfo.num_tqps; j++) - hns3_tqp_enable(h->kinfo.tqp[j]); + hns3_enable_irqs_and_tqps(netdev); /* start the ae_dev */ ret = h->ae_algo->ops->start ?
h->ae_algo->ops->start(h) : 0; if (ret) { set_bit(HNS3_NIC_STATE_DOWN, &priv->state); - while (j--) - hns3_tqp_disable(h->kinfo.tqp[j]); - - for (j = i - 1; j >= 0; j--) - hns3_vector_disable(&priv->tqp_vector[j]); + hns3_disable_irqs_and_tqps(netdev); } return ret; @@ -823,17 +838,9 @@ static void hns3_reset_tx_queue(struct hnae3_handle *h) static void hns3_nic_net_down(struct net_device *netdev) { struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = hns3_get_handle(netdev); const struct hnae3_ae_ops *ops; - int i; - /* disable vectors */ - for (i = 0; i < priv->vector_num; i++) - hns3_vector_disable(&priv->tqp_vector[i]); - - /* disable rcb */ - for (i = 0; i < h->kinfo.num_tqps; i++) - hns3_tqp_disable(h->kinfo.tqp[i]); + hns3_disable_irqs_and_tqps(netdev); /* stop ae_dev */ ops = priv->ae_handle->ae_algo->ops; @@ -5864,8 +5871,6 @@ int hns3_set_channels(struct net_device *netdev, void hns3_external_lb_prepare(struct net_device *ndev, bool if_running) { struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; - int i; if (!if_running) return; @@ -5876,11 +5881,7 @@ void hns3_external_lb_prepare(struct net_device *ndev, bool if_running) netif_carrier_off(ndev); netif_tx_disable(ndev); - for (i = 0; i < priv->vector_num; i++) - hns3_vector_disable(&priv->tqp_vector[i]); - - for (i = 0; i < h->kinfo.num_tqps; i++) - hns3_tqp_disable(h->kinfo.tqp[i]); + hns3_disable_irqs_and_tqps(ndev); /* delay ring buffer clearing to hns3_reset_notify_uninit_enet * during reset process, because driver may not be able @@ -5896,7 +5897,6 @@ void hns3_external_lb_restore(struct net_device *ndev, bool if_running) { struct hns3_nic_priv *priv = netdev_priv(ndev); struct hnae3_handle *h = priv->ae_handle; - int i; if (!if_running) return; @@ -5912,11 +5912,7 @@ void hns3_external_lb_restore(struct net_device *ndev, bool if_running) clear_bit(HNS3_NIC_STATE_DOWN, &priv->state); - for (i = 0; i < priv->vector_num; i++) - hns3_vector_enable(&priv->tqp_vector[i]); - - for (i = 0; i < h->kinfo.num_tqps; i++) - hns3_tqp_enable(h->kinfo.tqp[i]); + hns3_enable_irqs_and_tqps(ndev); netif_tx_wake_all_queues(ndev); From e317aebeefcb3b0c71f2305af3c22871ca6b3833 Mon Sep 17 00:00:00 2001 From: Hao Lan Date: Wed, 30 Apr 2025 17:30:51 +0800 Subject: [PATCH 613/770] net: hns3: fixed debugfs tm_qset size The size of the tm_qset file of debugfs is limited to 64 KB, which is too small in the scenario with 1280 qsets. The size needs to be expanded to 1 MB. 
Fixes: 5e69ea7ee2a6 ("net: hns3: refactor the debugfs process") Signed-off-by: Hao Lan Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Link: https://patch.msgid.link/20250430093052.2400464-4-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 09749e9f73986..4e5d8bc39a1bf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -61,7 +61,7 @@ static struct hns3_dbg_cmd_info hns3_dbg_cmd[] = { .name = "tm_qset", .cmd = HNAE3_DBG_CMD_TM_QSET, .dentry = HNS3_DBG_DENTRY_TM, - .buf_len = HNS3_DBG_READ_LEN, + .buf_len = HNS3_DBG_READ_LEN_1MB, .init = hns3_dbg_common_file_init, }, { From 4971394d9d624f91689d766f31ce668d169d9959 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Wed, 30 Apr 2025 17:30:52 +0800 Subject: [PATCH 614/770] net: hns3: defer calling ptp_clock_register() Currently ptp_clock_register() is called before the related PTP resources are ready. This may cause unexpected results if an upper layer calls the PTP API during that time window. Fix it by moving the ptp_clock_register() call to the end of the function. Fixes: 0bf5eb788512 ("net: hns3: add support for PTP") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250430093052.2400464-5-shaojijie@huawei.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index 59cc9221185f2..ec581d4b696f5 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -440,6 +440,13 @@ static int hclge_ptp_create_clock(struct hclge_dev *hdev) ptp->info.settime64 = hclge_ptp_settime; ptp->info.n_alarm = 0; + + spin_lock_init(&ptp->lock); + ptp->io_base = hdev->hw.hw.io_base + HCLGE_PTP_REG_OFFSET; + ptp->ts_cfg.rx_filter = HWTSTAMP_FILTER_NONE; + ptp->ts_cfg.tx_type = HWTSTAMP_TX_OFF; + hdev->ptp = ptp; + ptp->clock = ptp_clock_register(&ptp->info, &hdev->pdev->dev); if (IS_ERR(ptp->clock)) { dev_err(&hdev->pdev->dev, @@ -451,12 +458,6 @@ return -ENODEV; } - spin_lock_init(&ptp->lock); - ptp->io_base = hdev->hw.hw.io_base + HCLGE_PTP_REG_OFFSET; - ptp->ts_cfg.rx_filter = HWTSTAMP_FILTER_NONE; - ptp->ts_cfg.tx_type = HWTSTAMP_TX_OFF; - hdev->ptp = ptp; - return 0; } From 55f362885951b2d00fd7fbb02ef0227deea572c2 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 30 Apr 2025 15:30:40 +0200 Subject: [PATCH 615/770] net: vertexcom: mse102x: Fix possible stuck of SPI interrupt The MSE102x doesn't provide any SPI commands for interrupt handling. So in case the interrupt fired before the driver requests the IRQ, the interrupt will never fire again. In order to fix this, always poll for pending packets after opening the interface.
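An illustrative timeline of the race being closed here (a sketch of the reasoning, not driver code; the two poll lines at the end mirror the diff below):

/*
 * device:  packet arrives -> asserts the interrupt line
 * driver:  interface not open yet, IRQ not requested -> event is lost
 * driver:  mse102x_net_open() -> request_threaded_irq()
 * device:  line still asserted, and with no SPI command to ack or
 *          re-trigger it, the handler would never run again
 *
 * Draining any pending frame once, right after open, clears the
 * condition and re-arms the interrupt for subsequent packets:
 */
mutex_lock(&mses->lock);
mse102x_rx_pkt_spi(mse);	/* consume pending frame(s) */
mutex_unlock(&mses->lock);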
Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250430133043.7722-2-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 89dc4c401a8de..92ebf16331598 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -509,6 +509,7 @@ static irqreturn_t mse102x_irq(int irq, void *_mse) static int mse102x_net_open(struct net_device *ndev) { struct mse102x_net *mse = netdev_priv(ndev); + struct mse102x_net_spi *mses = to_mse102x_spi(mse); int ret; ret = request_threaded_irq(ndev->irq, NULL, mse102x_irq, IRQF_ONESHOT, @@ -524,6 +525,13 @@ static int mse102x_net_open(struct net_device *ndev) netif_carrier_on(ndev); + /* The SPI interrupt can stuck in case of pending packet(s). + * So poll for possible packet(s) to re-arm the interrupt. + */ + mutex_lock(&mses->lock); + mse102x_rx_pkt_spi(mse); + mutex_unlock(&mses->lock); + netif_dbg(mse, ifup, ndev, "network device up\n"); return 0; From 74987089ec678b4018dba0a609e9f4bf6ef7f4ad Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 30 Apr 2025 15:30:41 +0200 Subject: [PATCH 616/770] net: vertexcom: mse102x: Fix LEN_MASK The LEN_MASK for CMD_RTS doesn't match the actual length field of the parameter word. Bit 11 is reserved, so adjust LEN_MASK accordingly. Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250430133043.7722-3-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 92ebf16331598..3edf2c3753f0e 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -33,7 +33,7 @@ #define CMD_CTR (0x2 << CMD_SHIFT) #define CMD_MASK GENMASK(15, CMD_SHIFT) -#define LEN_MASK GENMASK(CMD_SHIFT - 1, 0) +#define LEN_MASK GENMASK(CMD_SHIFT - 2, 0) #define DET_CMD_LEN 4 #define DET_SOF_LEN 2 From d4dda902dac194e3231a1ed0f76c6c3b6340ba8a Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 30 Apr 2025 15:30:42 +0200 Subject: [PATCH 617/770] net: vertexcom: mse102x: Add range check for CMD_RTS Since there is no protection in the SPI protocol against electrical interference, the driver shouldn't blindly trust the length payload of CMD_RTS. So introduce a bounds check for incoming frames.
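The arithmetic behind the LEN_MASK fix above and the range check introduced here, assuming CMD_SHIFT is 12 (its definition is not visible in these hunks, but bit 11 being the topmost reserved bit implies it):

/* Assuming CMD_SHIFT == 12:
 *
 *   old: LEN_MASK = GENMASK(CMD_SHIFT - 1, 0) = GENMASK(11, 0) = 0x0fff
 *        -> a flipped reserved bit 11 inflates rxlen by 2048 bytes
 *   new: LEN_MASK = GENMASK(CMD_SHIFT - 2, 0) = GENMASK(10, 0) = 0x07ff
 *        -> the length field is confined to its defined bits
 *
 * The range check added in the diff below then rejects what masking
 * alone cannot catch:
 */
rxlen = cmd_resp & LEN_MASK;
if (rxlen < ETH_ZLEN || rxlen > VLAN_ETH_FRAME_LEN)	/* 60..1518 bytes */
	return;	/* counted as invalid_len and dropped */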
Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250430133043.7722-4-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 3edf2c3753f0e..2c06d1d05164f 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -337,8 +338,9 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) } rxlen = cmd_resp & LEN_MASK; - if (!rxlen) { - net_dbg_ratelimited("%s: No frame length defined\n", __func__); + if (rxlen < ETH_ZLEN || rxlen > VLAN_ETH_FRAME_LEN) { + net_dbg_ratelimited("%s: Invalid frame length: %d\n", __func__, + rxlen); mse->stats.invalid_len++; return; } From ee512922ddd7d64afe2b28830a88f19063217649 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 30 Apr 2025 15:30:43 +0200 Subject: [PATCH 618/770] net: vertexcom: mse102x: Fix RX error handling In case the CMD_RTS got corrupted by interference, the MSE102x doesn't allow a retransmission of the command. Instead, the Ethernet frame must be shifted out of the SPI FIFO. Since the actual length is unknown, assume the maximum possible value. Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250430133043.7722-5-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 2c06d1d05164f..e4d993f313740 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -263,7 +263,7 @@ static int mse102x_tx_frame_spi(struct mse102x_net *mse, struct sk_buff *txp, } static int mse102x_rx_frame_spi(struct mse102x_net *mse, u8 *buff, - unsigned int frame_len) + unsigned int frame_len, bool drop) { struct mse102x_net_spi *mses = to_mse102x_spi(mse); struct spi_transfer *xfer = &mses->spi_xfer; @@ -281,6 +281,9 @@ static int mse102x_rx_frame_spi(struct mse102x_net *mse, u8 *buff, netdev_err(mse->ndev, "%s: spi_sync() failed: %d\n", __func__, ret); mse->stats.xfer_err++; + } else if (drop) { + netdev_dbg(mse->ndev, "%s: Drop frame\n", __func__); + ret = -EINVAL; + } else if (*sof != cpu_to_be16(DET_SOF)) { netdev_dbg(mse->ndev, "%s: SPI start of frame is invalid (0x%04x)\n", __func__, *sof); @@ -308,6 +311,7 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) struct sk_buff *skb; unsigned int rxalign; unsigned int rxlen; + bool drop = false; __be16 rx = 0; u16 cmd_resp; u8 *rxpkt; @@ -330,7 +334,8 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) net_dbg_ratelimited("%s: Unexpected response (0x%04x)\n", __func__, cmd_resp); mse->stats.invalid_rts++; - return; + drop = true; + goto drop; } net_dbg_ratelimited("%s: Unexpected response to first CMD\n", @@ -342,9 +347,16 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) net_dbg_ratelimited("%s: Invalid frame length: %d\n", __func__, rxlen); mse->stats.invalid_len++; - return; + drop = true; } + /* In case of a invalid CMD_RTS, the frame must be consumed anyway.
+ * So assume the maximum possible frame length. + */ +drop: + if (drop) + rxlen = VLAN_ETH_FRAME_LEN; + rxalign = ALIGN(rxlen + DET_SOF_LEN + DET_DFT_LEN, 4); skb = netdev_alloc_skb_ip_align(mse->ndev, rxalign); if (!skb) @@ -355,7 +367,7 @@ static void mse102x_rx_pkt_spi(struct mse102x_net *mse) * They are copied, but ignored. */ rxpkt = skb_put(skb, rxlen) - DET_SOF_LEN; - if (mse102x_rx_frame_spi(mse, rxpkt, rxlen)) { + if (mse102x_rx_frame_spi(mse, rxpkt, rxlen, drop)) { mse->ndev->stats.rx_errors++; dev_kfree_skb(skb); return; From 0454b9057e983455cd1011d2c8e1b3697ece14c9 Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 30 Apr 2025 13:32:04 +0800 Subject: [PATCH 619/770] dt-bindings: net: sun8i-emac: Add A523 EMAC0 compatible Allwinner A523 SoC variant (A527/T527) contains an "EMAC0" Ethernet MAC compatible to the A64 version. Reviewed-by: Andre Przywara Acked-by: Krzysztof Kozlowski Signed-off-by: Yixun Lan Link: https://patch.msgid.link/20250430-01-sun55i-emac0-v3-2-6fc000bbccbd@gentoo.org Signed-off-by: Jakub Kicinski --- .../devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml index 7fe0352dff0f8..7b6a2fde81753 100644 --- a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml +++ b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml @@ -23,6 +23,7 @@ properties: - allwinner,sun20i-d1-emac - allwinner,sun50i-h6-emac - allwinner,sun50i-h616-emac0 + - allwinner,sun55i-a523-emac0 - const: allwinner,sun50i-a64-emac reg: From c76bab22e920ba45665ed1e9f9600a9b561a4f5d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 30 Apr 2025 08:48:01 +0300 Subject: [PATCH 620/770] selftests: drv-net: rss_input_xfrm: Check test prerequisites before running Ensure the following prerequisites before executing the test: 1. 'socat' is installed on the remote host. 2. Python version supports socket.SO_INCOMING_CPU (available since v3.11). Skip the test if either prerequisite is not met. Reviewed-by: Nimrod Oren Signed-off-by: Gal Pressman Link: https://patch.msgid.link/20250430054801.750646-1-gal@nvidia.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index 53bb08cc29ec3..f439c434ba361 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -32,6 +32,11 @@ def test_rss_input_xfrm(cfg, ipver): if multiprocessing.cpu_count() < 2: raise KsftSkipEx("Need at least two CPUs to test symmetric RSS hash") + cfg.require_cmd("socat", remote=True) + + if not hasattr(socket, "SO_INCOMING_CPU"): + raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") + input_xfrm = cfg.ethnl.rss_get( {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') From 7840e4d6f48a75413470935ebdc4bab4fc0c035e Mon Sep 17 00:00:00 2001 From: Daniel Braunwarth Date: Tue, 29 Apr 2025 13:33:37 +0200 Subject: [PATCH 621/770] net: phy: realtek: Add support for WOL magic packet on RTL8211F The RTL8211F supports multiple WOL modes. This patch adds support for magic packets. The PHY notifies the system via the INTB/PMEB pin when a WOL event occurs. 
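A worked example of the address packing performed by rtl8211f_set_wol() in the diff below: the MAC is written to three 16-bit registers in page 0xd8c, low byte in the low half of each word. For the illustrative address 00:11:22:33:44:55:

/* mac_addr[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 } */
w0 = mac_addr[1] << 8 | mac_addr[0];	/* 0x1100 -> PHYSICAL_ADDR_WORD0 */
w1 = mac_addr[3] << 8 | mac_addr[2];	/* 0x3322 -> PHYSICAL_ADDR_WORD1 */
w2 = mac_addr[5] << 8 | mac_addr[4];	/* 0x5544 -> PHYSICAL_ADDR_WORD2 */

With the patch applied, magic-packet wake can then be toggled from userspace in the usual way, e.g. `ethtool -s eth0 wol g`.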
Signed-off-by: Daniel Braunwarth Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250429-realtek_wol-v2-1-8f84def1ef2c@kuka.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/realtek/realtek_main.c | 69 ++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/net/phy/realtek/realtek_main.c b/drivers/net/phy/realtek/realtek_main.c index 893c824796715..05c4f4d394a5f 100644 --- a/drivers/net/phy/realtek/realtek_main.c +++ b/drivers/net/phy/realtek/realtek_main.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,24 @@ #define RTL8211F_INSR 0x1d +/* RTL8211F WOL interrupt configuration */ +#define RTL8211F_INTBCR_PAGE 0xd40 +#define RTL8211F_INTBCR 0x16 +#define RTL8211F_INTBCR_INTB_PMEB BIT(5) + +/* RTL8211F WOL settings */ +#define RTL8211F_WOL_SETTINGS_PAGE 0xd8a +#define RTL8211F_WOL_SETTINGS_EVENTS 16 +#define RTL8211F_WOL_EVENT_MAGIC BIT(12) +#define RTL8211F_WOL_SETTINGS_STATUS 17 +#define RTL8211F_WOL_STATUS_RESET (BIT(15) | 0x1fff) + +/* RTL8211F Unique phyiscal and multicast address (WOL) */ +#define RTL8211F_PHYSICAL_ADDR_PAGE 0xd8c +#define RTL8211F_PHYSICAL_ADDR_WORD0 16 +#define RTL8211F_PHYSICAL_ADDR_WORD1 17 +#define RTL8211F_PHYSICAL_ADDR_WORD2 18 + #define RTL8211F_LEDCR 0x10 #define RTL8211F_LEDCR_MODE BIT(15) #define RTL8211F_LEDCR_ACT_TXRX BIT(4) @@ -123,6 +142,7 @@ struct rtl821x_priv { u16 phycr2; bool has_phycr2; struct clk *clk; + u32 saved_wolopts; }; static int rtl821x_read_page(struct phy_device *phydev) @@ -354,6 +374,53 @@ static irqreturn_t rtl8211f_handle_interrupt(struct phy_device *phydev) return IRQ_HANDLED; } +static void rtl8211f_get_wol(struct phy_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = WAKE_MAGIC; + if (phy_read_paged(dev, RTL8211F_WOL_SETTINGS_PAGE, RTL8211F_WOL_SETTINGS_EVENTS) + & RTL8211F_WOL_EVENT_MAGIC) + wol->wolopts = WAKE_MAGIC; +} + +static int rtl8211f_set_wol(struct phy_device *dev, struct ethtool_wolinfo *wol) +{ + const u8 *mac_addr = dev->attached_dev->dev_addr; + int oldpage; + + oldpage = phy_save_page(dev); + if (oldpage < 0) + goto err; + + if (wol->wolopts & WAKE_MAGIC) { + /* Store the device address for the magic packet */ + rtl821x_write_page(dev, RTL8211F_PHYSICAL_ADDR_PAGE); + __phy_write(dev, RTL8211F_PHYSICAL_ADDR_WORD0, mac_addr[1] << 8 | (mac_addr[0])); + __phy_write(dev, RTL8211F_PHYSICAL_ADDR_WORD1, mac_addr[3] << 8 | (mac_addr[2])); + __phy_write(dev, RTL8211F_PHYSICAL_ADDR_WORD2, mac_addr[5] << 8 | (mac_addr[4])); + + /* Enable magic packet matching and reset WOL status */ + rtl821x_write_page(dev, RTL8211F_WOL_SETTINGS_PAGE); + __phy_write(dev, RTL8211F_WOL_SETTINGS_EVENTS, RTL8211F_WOL_EVENT_MAGIC); + __phy_write(dev, RTL8211F_WOL_SETTINGS_STATUS, RTL8211F_WOL_STATUS_RESET); + + /* Enable the WOL interrupt */ + rtl821x_write_page(dev, RTL8211F_INTBCR_PAGE); + __phy_set_bits(dev, RTL8211F_INTBCR, RTL8211F_INTBCR_INTB_PMEB); + } else { + /* Disable the WOL interrupt */ + rtl821x_write_page(dev, RTL8211F_INTBCR_PAGE); + __phy_clear_bits(dev, RTL8211F_INTBCR, RTL8211F_INTBCR_INTB_PMEB); + + /* Disable magic packet matching and reset WOL status */ + rtl821x_write_page(dev, RTL8211F_WOL_SETTINGS_PAGE); + __phy_write(dev, RTL8211F_WOL_SETTINGS_EVENTS, 0); + __phy_write(dev, RTL8211F_WOL_SETTINGS_STATUS, RTL8211F_WOL_STATUS_RESET); + } + +err: + return phy_restore_page(dev, oldpage, 0); +} + static int rtl8211_config_aneg(struct phy_device *phydev) { int ret; @@ -1400,6 +1467,8 @@ static struct phy_driver realtek_drvs[] = { 
.read_status = rtlgen_read_status, .config_intr = &rtl8211f_config_intr, .handle_interrupt = rtl8211f_handle_interrupt, + .set_wol = rtl8211f_set_wol, + .get_wol = rtl8211f_get_wol, .suspend = rtl821x_suspend, .resume = rtl821x_resume, .read_page = rtl821x_read_page, From 7a4f15cadc5670dbc5fc01d7d75ee8b9443b64fe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 28 Apr 2025 12:16:06 -0700 Subject: [PATCH 622/770] r8152: use SHA-256 library API instead of crypto_shash API This user of SHA-256 does not support any other algorithm, so the crypto_shash abstraction provides no value. Just use the SHA-256 library API instead, which is much simpler and easier to use. Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20250428191606.856198-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/usb/Kconfig | 4 +--- drivers/net/usb/r8152.c | 46 +++++++---------------------------------- 2 files changed, 8 insertions(+), 42 deletions(-) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 3c360d4f06352..370b32fc25880 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -101,9 +101,7 @@ config USB_RTL8152 select MII select PHYLIB select CRC32 - select CRYPTO - select CRYPTO_HASH - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 help This option adds support for Realtek RTL8152 based USB 2.0 10/100 Ethernet adapters and RTL8153 based USB 3.0 10/100/1000 diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 2cab046749a92..67f5d30ffcbab 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include @@ -4628,48 +4628,16 @@ static bool rtl8152_is_fw_mac_ok(struct r8152 *tp, struct fw_mac *mac) static long rtl8152_fw_verify_checksum(struct r8152 *tp, struct fw_header *fw_hdr, size_t size) { - unsigned char checksum[sizeof(fw_hdr->checksum)]; - struct crypto_shash *alg; - struct shash_desc *sdesc; - size_t len; - long rc; - - alg = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(alg)) { - rc = PTR_ERR(alg); - goto out; - } - - if (crypto_shash_digestsize(alg) != sizeof(fw_hdr->checksum)) { - rc = -EFAULT; - dev_err(&tp->intf->dev, "digestsize incorrect (%u)\n", - crypto_shash_digestsize(alg)); - goto free_shash; - } + u8 checksum[sizeof(fw_hdr->checksum)]; - len = sizeof(*sdesc) + crypto_shash_descsize(alg); - sdesc = kmalloc(len, GFP_KERNEL); - if (!sdesc) { - rc = -ENOMEM; - goto free_shash; - } - sdesc->tfm = alg; - - len = size - sizeof(fw_hdr->checksum); - rc = crypto_shash_digest(sdesc, fw_hdr->version, len, checksum); - kfree(sdesc); - if (rc) - goto free_shash; + BUILD_BUG_ON(sizeof(checksum) != SHA256_DIGEST_SIZE); + sha256(fw_hdr->version, size - sizeof(checksum), checksum); - if (memcmp(fw_hdr->checksum, checksum, sizeof(fw_hdr->checksum))) { + if (memcmp(fw_hdr->checksum, checksum, sizeof(checksum))) { dev_err(&tp->intf->dev, "checksum fail\n"); - rc = -EFAULT; + return -EFAULT; } - -free_shash: - crypto_free_shash(alg); -out: - return rc; + return 0; } static long rtl8152_check_firmware(struct r8152 *tp, struct rtl_fw *rtl_fw) From 51cf06ddafc91e017beecfaf81b3b082033b0b93 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Sun, 27 Apr 2025 02:01:29 +0100 Subject: [PATCH 623/770] net: ethernet: mtk_eth_soc: add support for MT7988 internal 2.5G PHY The MediaTek MT7988 SoC comes with a single built-in Ethernet PHY for 2500Base-T/1000Base-T/100Base-TX/10Base-T link partners in addition to the built-in 1GE switch.
The built-in PHY only supports full duplex. Add muxes allowing to select GMAC2->2.5G PHY path and add basic support for XGMAC as the built-in 2.5G PHY is internally connected via XGMII. The XGMAC features will also be used by 5GBase-R, 10GBase-R and USXGMII SerDes modes which are going to be added once support for standalone PCS drivers is in place. In order to make use of the built-in 2.5G PHY the appropriate PHY driver as well as (proprietary) PHY firmware has to be present as well. Signed-off-by: Daniel Golle Link: https://patch.msgid.link/9072cefbff6db969720672ec98ed5cef65e8218c.1745715380.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_path.c | 43 +++++++ drivers/net/ethernet/mediatek/mtk_eth_soc.c | 119 +++++++++++++++++-- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 57 ++++++++- 3 files changed, 203 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_path.c b/drivers/net/ethernet/mediatek/mtk_eth_path.c index 6fbfb16438a51..b4c01e2878f6f 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_path.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_path.c @@ -31,6 +31,8 @@ static const char *mtk_eth_path_name(u64 path) return "gmac2_rgmii"; case MTK_ETH_PATH_GMAC2_SGMII: return "gmac2_sgmii"; + case MTK_ETH_PATH_GMAC2_2P5GPHY: + return "gmac2_2p5gphy"; case MTK_ETH_PATH_GMAC2_GEPHY: return "gmac2_gephy"; case MTK_ETH_PATH_GDM1_ESW: @@ -127,6 +129,29 @@ static int set_mux_u3_gmac2_to_qphy(struct mtk_eth *eth, u64 path) return 0; } +static int set_mux_gmac2_to_2p5gphy(struct mtk_eth *eth, u64 path) +{ + int ret; + + if (path == MTK_ETH_PATH_GMAC2_2P5GPHY) { + ret = regmap_clear_bits(eth->ethsys, ETHSYS_SYSCFG0, + SYSCFG0_SGMII_GMAC2_V2); + if (ret) + return ret; + + /* Setup mux to 2p5g PHY */ + ret = regmap_clear_bits(eth->infra, TOP_MISC_NETSYS_PCS_MUX, + MUX_G2_USXGMII_SEL); + if (ret) + return ret; + + dev_dbg(eth->dev, "path %s in %s updated\n", + mtk_eth_path_name(path), __func__); + } + + return 0; +} + static int set_mux_gmac1_gmac2_to_sgmii_rgmii(struct mtk_eth *eth, u64 path) { unsigned int val = 0; @@ -209,6 +234,10 @@ static const struct mtk_eth_muxc mtk_eth_muxc[] = { .name = "mux_u3_gmac2_to_qphy", .cap_bit = MTK_ETH_MUX_U3_GMAC2_TO_QPHY, .set_path = set_mux_u3_gmac2_to_qphy, + }, { + .name = "mux_gmac2_to_2p5gphy", + .cap_bit = MTK_ETH_MUX_GMAC2_TO_2P5GPHY, + .set_path = set_mux_gmac2_to_2p5gphy, }, { .name = "mux_gmac1_gmac2_to_sgmii_rgmii", .cap_bit = MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII, @@ -260,6 +289,20 @@ int mtk_gmac_sgmii_path_setup(struct mtk_eth *eth, int mac_id) return mtk_eth_mux_setup(eth, path); } +int mtk_gmac_2p5gphy_path_setup(struct mtk_eth *eth, int mac_id) +{ + u64 path = 0; + + if (mac_id == MTK_GMAC2_ID) + path = MTK_ETH_PATH_GMAC2_2P5GPHY; + + if (!path) + return -EINVAL; + + /* Setup proper MUXes along the path */ + return mtk_eth_mux_setup(eth, path); +} + int mtk_gmac_gephy_path_setup(struct mtk_eth *eth, int mac_id) { u64 path = 0; diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 8fda4ce80d811..217355d79bbb7 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -503,7 +503,7 @@ static void mtk_gmac0_rgmii_adjust(struct mtk_eth *eth, static void mtk_setup_bridge_switch(struct mtk_eth *eth) { /* Force Port1 XGMAC Link Up */ - mtk_m32(eth, 0, MTK_XGMAC_FORCE_LINK(MTK_GMAC1_ID), + mtk_m32(eth, 0, MTK_XGMAC_FORCE_MODE(MTK_GMAC1_ID), MTK_XGMAC_STS(MTK_GMAC1_ID)); 
/* Adjust GSW bridge IPG to 11 */ @@ -532,6 +532,26 @@ static struct phylink_pcs *mtk_mac_select_pcs(struct phylink_config *config, return NULL; } +static int mtk_mac_prepare(struct phylink_config *config, unsigned int mode, + phy_interface_t iface) +{ + struct mtk_mac *mac = container_of(config, struct mtk_mac, + phylink_config); + struct mtk_eth *eth = mac->hw; + + if (mtk_interface_mode_is_xgmii(eth, iface) && + mac->id != MTK_GMAC1_ID) { + mtk_m32(mac->hw, XMAC_MCR_TRX_DISABLE, + XMAC_MCR_TRX_DISABLE, MTK_XMAC_MCR(mac->id)); + + mtk_m32(mac->hw, MTK_XGMAC_FORCE_MODE(mac->id) | + MTK_XGMAC_FORCE_LINK(mac->id), + MTK_XGMAC_FORCE_MODE(mac->id), MTK_XGMAC_STS(mac->id)); + } + + return 0; +} + static void mtk_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) { @@ -573,6 +593,12 @@ static void mtk_mac_config(struct phylink_config *config, unsigned int mode, } break; case PHY_INTERFACE_MODE_INTERNAL: + if (mac->id == MTK_GMAC2_ID && + MTK_HAS_CAPS(eth->soc->caps, MTK_2P5GPHY)) { + err = mtk_gmac_2p5gphy_path_setup(eth, mac->id); + if (err) + goto init_err; + } break; default: goto err_phy; @@ -644,12 +670,12 @@ static void mtk_mac_config(struct phylink_config *config, unsigned int mode, } /* Setup gmac */ - if (mtk_is_netsys_v3_or_greater(eth) && - mac->interface == PHY_INTERFACE_MODE_INTERNAL) { + if (mtk_interface_mode_is_xgmii(eth, state->interface)) { mtk_w32(mac->hw, MTK_GDMA_XGDM_SEL, MTK_GDMA_EG_CTRL(mac->id)); mtk_w32(mac->hw, MAC_MCR_FORCE_LINK_DOWN, MTK_MAC_MCR(mac->id)); - mtk_setup_bridge_switch(eth); + if (mac->id == MTK_GMAC1_ID) + mtk_setup_bridge_switch(eth); } return; @@ -696,10 +722,19 @@ static void mtk_mac_link_down(struct phylink_config *config, unsigned int mode, { struct mtk_mac *mac = container_of(config, struct mtk_mac, phylink_config); - u32 mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); - mcr &= ~(MAC_MCR_TX_EN | MAC_MCR_RX_EN | MAC_MCR_FORCE_LINK); - mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); + if (!mtk_interface_mode_is_xgmii(mac->hw, interface)) { + /* GMAC modes */ + mtk_m32(mac->hw, + MAC_MCR_TX_EN | MAC_MCR_RX_EN | MAC_MCR_FORCE_LINK, 0, + MTK_MAC_MCR(mac->id)); + } else if (mac->id != MTK_GMAC1_ID) { + /* XGMAC except for built-in switch */ + mtk_m32(mac->hw, XMAC_MCR_TRX_DISABLE, XMAC_MCR_TRX_DISABLE, + MTK_XMAC_MCR(mac->id)); + mtk_m32(mac->hw, MTK_XGMAC_FORCE_LINK(mac->id), 0, + MTK_XGMAC_STS(mac->id)); + } } static void mtk_set_queue_speed(struct mtk_eth *eth, unsigned int idx, @@ -771,13 +806,12 @@ static void mtk_set_queue_speed(struct mtk_eth *eth, unsigned int idx, mtk_w32(eth, val, soc->reg_map->qdma.qtx_sch + ofs); } -static void mtk_mac_link_up(struct phylink_config *config, - struct phy_device *phy, - unsigned int mode, phy_interface_t interface, - int speed, int duplex, bool tx_pause, bool rx_pause) +static void mtk_gdm_mac_link_up(struct mtk_mac *mac, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, + bool rx_pause) { - struct mtk_mac *mac = container_of(config, struct mtk_mac, - phylink_config); u32 mcr; mcr = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id)); @@ -811,6 +845,56 @@ static void mtk_mac_link_up(struct phylink_config *config, mtk_w32(mac->hw, mcr, MTK_MAC_MCR(mac->id)); } +static void mtk_xgdm_mac_link_up(struct mtk_mac *mac, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, + bool rx_pause) +{ + u32 mcr; + + if (mac->id == MTK_GMAC1_ID) + return; + + /* Eliminate 
the interference(before link-up) caused by PHY noise */ + mtk_m32(mac->hw, XMAC_LOGIC_RST, 0, MTK_XMAC_LOGIC_RST(mac->id)); + mdelay(20); + mtk_m32(mac->hw, XMAC_GLB_CNTCLR, XMAC_GLB_CNTCLR, + MTK_XMAC_CNT_CTRL(mac->id)); + + mtk_m32(mac->hw, MTK_XGMAC_FORCE_LINK(mac->id), + MTK_XGMAC_FORCE_LINK(mac->id), MTK_XGMAC_STS(mac->id)); + + mcr = mtk_r32(mac->hw, MTK_XMAC_MCR(mac->id)); + mcr &= ~(XMAC_MCR_FORCE_TX_FC | XMAC_MCR_FORCE_RX_FC | + XMAC_MCR_TRX_DISABLE); + /* Configure pause modes - + * phylink will avoid these for half duplex + */ + if (tx_pause) + mcr |= XMAC_MCR_FORCE_TX_FC; + if (rx_pause) + mcr |= XMAC_MCR_FORCE_RX_FC; + + mtk_w32(mac->hw, mcr, MTK_XMAC_MCR(mac->id)); +} + +static void mtk_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, bool tx_pause, bool rx_pause) +{ + struct mtk_mac *mac = container_of(config, struct mtk_mac, + phylink_config); + + if (mtk_interface_mode_is_xgmii(mac->hw, interface)) + mtk_xgdm_mac_link_up(mac, phy, mode, interface, speed, duplex, + tx_pause, rx_pause); + else + mtk_gdm_mac_link_up(mac, phy, mode, interface, speed, duplex, + tx_pause, rx_pause); +} + static void mtk_mac_disable_tx_lpi(struct phylink_config *config) { struct mtk_mac *mac = container_of(config, struct mtk_mac, @@ -828,6 +912,9 @@ static int mtk_mac_enable_tx_lpi(struct phylink_config *config, u32 timer, struct mtk_eth *eth = mac->hw; u32 val; + if (mtk_interface_mode_is_xgmii(eth, mac->interface)) + return -EOPNOTSUPP; + /* Tx idle timer in ms */ timer = DIV_ROUND_UP(timer, 1000); @@ -858,6 +945,7 @@ static int mtk_mac_enable_tx_lpi(struct phylink_config *config, u32 timer, } static const struct phylink_mac_ops mtk_phylink_ops = { + .mac_prepare = mtk_mac_prepare, .mac_select_pcs = mtk_mac_select_pcs, .mac_config = mtk_mac_config, .mac_finish = mtk_mac_finish, @@ -4763,6 +4851,11 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) mac->phylink = phylink; + if (MTK_HAS_CAPS(mac->hw->soc->caps, MTK_2P5GPHY) && + id == MTK_GMAC2_ID) + __set_bit(PHY_INTERFACE_MODE_INTERNAL, + mac->phylink_config.supported_interfaces); + SET_NETDEV_DEV(eth->netdev[id], eth->dev); eth->netdev[id]->watchdog_timeo = 5 * HZ; eth->netdev[id]->netdev_ops = &mtk_netdev_ops; diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 88ef2e9c50fc1..2d4b9964d3dbc 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -431,7 +431,8 @@ /* XMAC status registers */ #define MTK_XGMAC_STS(x) (((x) == MTK_GMAC3_ID) ? 0x1001C : 0x1000C) -#define MTK_XGMAC_FORCE_LINK(x) (((x) == MTK_GMAC2_ID) ? BIT(31) : BIT(15)) +#define MTK_XGMAC_FORCE_MODE(x) (((x) == MTK_GMAC2_ID) ? BIT(31) : BIT(15)) +#define MTK_XGMAC_FORCE_LINK(x) (((x) == MTK_GMAC2_ID) ? 
BIT(27) : BIT(11)) #define MTK_USXGMII_PCS_LINK BIT(8) #define MTK_XGMAC_RX_FC BIT(5) #define MTK_XGMAC_TX_FC BIT(4) @@ -524,6 +525,21 @@ #define INTF_MODE_RGMII_1000 (TRGMII_MODE | TRGMII_CENTRAL_ALIGNED) #define INTF_MODE_RGMII_10_100 0 +/* XFI Mac control registers */ +#define MTK_XMAC_BASE(x) (0x12000 + (((x) - 1) * 0x1000)) +#define MTK_XMAC_MCR(x) (MTK_XMAC_BASE(x)) +#define XMAC_MCR_TRX_DISABLE 0xf +#define XMAC_MCR_FORCE_TX_FC BIT(5) +#define XMAC_MCR_FORCE_RX_FC BIT(4) + +/* XFI Mac logic reset registers */ +#define MTK_XMAC_LOGIC_RST(x) (MTK_XMAC_BASE(x) + 0x10) +#define XMAC_LOGIC_RST BIT(0) + +/* XFI Mac count global control */ +#define MTK_XMAC_CNT_CTRL(x) (MTK_XMAC_BASE(x) + 0x100) +#define XMAC_GLB_CNTCLR BIT(0) + /* GPIO port control registers for GMAC 2*/ #define GPIO_OD33_CTRL8 0x4c0 #define GPIO_BIAS_CTRL 0xed0 @@ -587,6 +603,10 @@ #define GEPHY_MAC_SEL BIT(1) /* Top misc registers */ +#define TOP_MISC_NETSYS_PCS_MUX 0x0 +#define NETSYS_PCS_MUX_MASK GENMASK(1, 0) +#define MUX_G2_USXGMII_SEL BIT(1) + #define USB_PHY_SWITCH_REG 0x218 #define QPHY_SEL_MASK GENMASK(1, 0) #define SGMII_QPHY_SEL 0x2 @@ -951,6 +971,7 @@ enum mkt_eth_capabilities { MTK_RGMII_BIT = 0, MTK_TRGMII_BIT, MTK_SGMII_BIT, + MTK_2P5GPHY_BIT, MTK_ESW_BIT, MTK_GEPHY_BIT, MTK_MUX_BIT, @@ -971,6 +992,7 @@ enum mkt_eth_capabilities { MTK_ETH_MUX_GDM1_TO_GMAC1_ESW_BIT, MTK_ETH_MUX_GMAC2_GMAC0_TO_GEPHY_BIT, MTK_ETH_MUX_U3_GMAC2_TO_QPHY_BIT, + MTK_ETH_MUX_GMAC2_TO_2P5GPHY_BIT, MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII_BIT, MTK_ETH_MUX_GMAC12_TO_GEPHY_SGMII_BIT, @@ -980,6 +1002,7 @@ enum mkt_eth_capabilities { MTK_ETH_PATH_GMAC1_SGMII_BIT, MTK_ETH_PATH_GMAC2_RGMII_BIT, MTK_ETH_PATH_GMAC2_SGMII_BIT, + MTK_ETH_PATH_GMAC2_2P5GPHY_BIT, MTK_ETH_PATH_GMAC2_GEPHY_BIT, MTK_ETH_PATH_GDM1_ESW_BIT, }; @@ -988,6 +1011,7 @@ enum mkt_eth_capabilities { #define MTK_RGMII BIT_ULL(MTK_RGMII_BIT) #define MTK_TRGMII BIT_ULL(MTK_TRGMII_BIT) #define MTK_SGMII BIT_ULL(MTK_SGMII_BIT) +#define MTK_2P5GPHY BIT_ULL(MTK_2P5GPHY_BIT) #define MTK_ESW BIT_ULL(MTK_ESW_BIT) #define MTK_GEPHY BIT_ULL(MTK_GEPHY_BIT) #define MTK_MUX BIT_ULL(MTK_MUX_BIT) @@ -1010,6 +1034,8 @@ enum mkt_eth_capabilities { BIT_ULL(MTK_ETH_MUX_GMAC2_GMAC0_TO_GEPHY_BIT) #define MTK_ETH_MUX_U3_GMAC2_TO_QPHY \ BIT_ULL(MTK_ETH_MUX_U3_GMAC2_TO_QPHY_BIT) +#define MTK_ETH_MUX_GMAC2_TO_2P5GPHY \ + BIT_ULL(MTK_ETH_MUX_GMAC2_TO_2P5GPHY_BIT) #define MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII \ BIT_ULL(MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII_BIT) #define MTK_ETH_MUX_GMAC12_TO_GEPHY_SGMII \ @@ -1021,6 +1047,7 @@ enum mkt_eth_capabilities { #define MTK_ETH_PATH_GMAC1_SGMII BIT_ULL(MTK_ETH_PATH_GMAC1_SGMII_BIT) #define MTK_ETH_PATH_GMAC2_RGMII BIT_ULL(MTK_ETH_PATH_GMAC2_RGMII_BIT) #define MTK_ETH_PATH_GMAC2_SGMII BIT_ULL(MTK_ETH_PATH_GMAC2_SGMII_BIT) +#define MTK_ETH_PATH_GMAC2_2P5GPHY BIT_ULL(MTK_ETH_PATH_GMAC2_2P5GPHY_BIT) #define MTK_ETH_PATH_GMAC2_GEPHY BIT_ULL(MTK_ETH_PATH_GMAC2_GEPHY_BIT) #define MTK_ETH_PATH_GDM1_ESW BIT_ULL(MTK_ETH_PATH_GDM1_ESW_BIT) @@ -1030,6 +1057,7 @@ enum mkt_eth_capabilities { #define MTK_GMAC2_RGMII (MTK_ETH_PATH_GMAC2_RGMII | MTK_RGMII) #define MTK_GMAC2_SGMII (MTK_ETH_PATH_GMAC2_SGMII | MTK_SGMII) #define MTK_GMAC2_GEPHY (MTK_ETH_PATH_GMAC2_GEPHY | MTK_GEPHY) +#define MTK_GMAC2_2P5GPHY (MTK_ETH_PATH_GMAC2_2P5GPHY | MTK_2P5GPHY) #define MTK_GDM1_ESW (MTK_ETH_PATH_GDM1_ESW | MTK_ESW) /* MUXes present on SoCs */ @@ -1049,6 +1077,10 @@ enum mkt_eth_capabilities { (MTK_ETH_MUX_GMAC1_GMAC2_TO_SGMII_RGMII | MTK_MUX | \ MTK_SHARED_SGMII) +/* 2: GMAC2 -> 2P5GPHY */ 
+#define MTK_MUX_GMAC2_TO_2P5GPHY \
+	(MTK_ETH_MUX_GMAC2_TO_2P5GPHY | MTK_MUX | MTK_INFRA)
+
 /* 0: GMACx -> GEPHY, 1: GMACx -> SGMII where x is 1 or 2 */
 #define MTK_MUX_GMAC12_TO_GEPHY_SGMII \
 	(MTK_ETH_MUX_GMAC12_TO_GEPHY_SGMII | MTK_MUX)
@@ -1084,8 +1116,9 @@ enum mkt_eth_capabilities {
 		      MTK_MUX_GMAC12_TO_GEPHY_SGMII | MTK_QDMA | \
 		      MTK_RSTCTRL_PPE1 | MTK_SRAM)
 
-#define MT7988_CAPS  (MTK_36BIT_DMA | MTK_GDM1_ESW | MTK_QDMA | \
-		      MTK_RSTCTRL_PPE1 | MTK_RSTCTRL_PPE2 | MTK_SRAM)
+#define MT7988_CAPS  (MTK_36BIT_DMA | MTK_GDM1_ESW | MTK_GMAC2_2P5GPHY | \
+		      MTK_MUX_GMAC2_TO_2P5GPHY | MTK_QDMA | MTK_RSTCTRL_PPE1 | \
+		      MTK_RSTCTRL_PPE2 | MTK_SRAM)
 
 struct mtk_tx_dma_desc_info {
 	dma_addr_t	addr;
@@ -1437,6 +1470,23 @@ static inline u32 mtk_get_ib2_multicast_mask(struct mtk_eth *eth)
 	return MTK_FOE_IB2_MULTICAST;
 }
 
+static inline bool mtk_interface_mode_is_xgmii(struct mtk_eth *eth,
+					       phy_interface_t interface)
+{
+	if (!mtk_is_netsys_v3_or_greater(eth))
+		return false;
+
+	switch (interface) {
+	case PHY_INTERFACE_MODE_INTERNAL:
+	case PHY_INTERFACE_MODE_USXGMII:
+	case PHY_INTERFACE_MODE_10GBASER:
+	case PHY_INTERFACE_MODE_5GBASER:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /* read the hardware status register */
 void mtk_stats_update_mac(struct mtk_mac *mac);
 
@@ -1445,6 +1495,7 @@ u32 mtk_r32(struct mtk_eth *eth, unsigned reg);
 u32 mtk_m32(struct mtk_eth *eth, u32 mask, u32 set, unsigned int reg);
 
 int mtk_gmac_sgmii_path_setup(struct mtk_eth *eth, int mac_id);
+int mtk_gmac_2p5gphy_path_setup(struct mtk_eth *eth, int mac_id);
 int mtk_gmac_gephy_path_setup(struct mtk_eth *eth, int mac_id);
 int mtk_gmac_rgmii_path_setup(struct mtk_eth *eth, int mac_id);

From a3e1c0ad835702555d90565584ab6f723adf7f94 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit
Date: Tue, 29 Apr 2025 08:04:46 +0200
Subject: [PATCH 624/770] net: phy: factor out provider part from mdio_bus.c

After 52358dd63e34 ("net: phy: remove function stubs") there's a problem
if CONFIG_MDIO_BUS is set, but CONFIG_PHYLIB is not. mdiobus_scan() uses
phylib functions like get_phy_device(). Bringing back the stub wouldn't
make much sense, because it would allow mdiobus_scan() to compile, but
the function would be unusable. The stub returned NULL, and we have the
following in mdiobus_scan():

	phydev = get_phy_device(bus, addr, c45);
	if (IS_ERR(phydev))
		return phydev;

So calling mdiobus_scan() w/o CONFIG_PHYLIB would cause a crash later in
mdiobus_scan(). In general the PHYLIB functionality isn't optional here.
Consequently, MDIO bus providers depend on PHYLIB. Therefore factor it
out and build it together with the libphy core modules.

In addition make all MDIO bus providers under /drivers/net/mdio depend
on PHYLIB. The same applies to the enetc MDIO bus provider.

Note that PHYLIB selects MDIO_DEVRES, therefore we can omit this here.
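To make the failure mode concrete, here is a minimal sketch (editorial
illustration, not part of the patch) of why a NULL-returning stub slips
past the error handling quoted above:

	/* Hypothetical CONFIG_PHYLIB=n stub: returns NULL, which is not
	 * an ERR_PTR() value, so IS_ERR(NULL) evaluates to false.
	 */
	phydev = get_phy_device(bus, addr, c45);
	if (IS_ERR(phydev))		/* not taken for NULL */
		return phydev;
	/* phydev is NULL here; subsequent use dereferences it */
	of_mdiobus_link_mdiodev(bus, &phydev->mdio);

Since NULL lies outside the error-pointer range, the caller proceeds as
if probing had succeeded.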
Fixes: 52358dd63e34 ("net: phy: remove function stubs") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504270639.mT0lh2o1-lkp@intel.com/ Reviewed-by: Jacob Keller Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c74772a9-dab6-44bf-a657-389df89d85c2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/Kconfig | 3 +- drivers/net/mdio/Kconfig | 16 +- drivers/net/phy/Makefile | 2 +- drivers/net/phy/mdio_bus.c | 462 +----------------- drivers/net/phy/mdio_bus_provider.c | 484 +++++++++++++++++++ include/linux/phy.h | 1 + 6 files changed, 494 insertions(+), 474 deletions(-) create mode 100644 drivers/net/phy/mdio_bus_provider.c diff --git a/drivers/net/ethernet/freescale/enetc/Kconfig b/drivers/net/ethernet/freescale/enetc/Kconfig index 5367e8af1e1a0..7250d3bbf6bb2 100644 --- a/drivers/net/ethernet/freescale/enetc/Kconfig +++ b/drivers/net/ethernet/freescale/enetc/Kconfig @@ -73,8 +73,7 @@ config FSL_ENETC_IERB config FSL_ENETC_MDIO tristate "ENETC MDIO driver" - depends on PCI && MDIO_BUS - select MDIO_DEVRES + depends on PCI && PHYLIB help This driver supports NXP ENETC Central MDIO controller as a PCIe physical function (PF) device. diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig index f680ed6767973..107236cd6bd94 100644 --- a/drivers/net/mdio/Kconfig +++ b/drivers/net/mdio/Kconfig @@ -19,30 +19,25 @@ config MDIO_BUS reflects whether the mdio_bus/mdio_device code is built as a loadable module or built-in. +if PHYLIB + config FWNODE_MDIO - def_tristate PHYLIB - depends on (ACPI || OF) || COMPILE_TEST + def_tristate (ACPI || OF) || COMPILE_TEST select FIXED_PHY help FWNODE MDIO bus (Ethernet PHY) accessors config OF_MDIO - def_tristate PHYLIB - depends on OF - depends on PHYLIB + def_tristate OF select FIXED_PHY help OpenFirmware MDIO bus (Ethernet PHY) accessors config ACPI_MDIO - def_tristate PHYLIB - depends on ACPI - depends on PHYLIB + def_tristate ACPI help ACPI MDIO bus (Ethernet PHY) accessors -if MDIO_BUS - config MDIO_DEVRES tristate @@ -57,7 +52,6 @@ config MDIO_SUN4I config MDIO_XGENE tristate "APM X-Gene SoC MDIO bus controller" depends on ARCH_XGENE || COMPILE_TEST - depends on PHYLIB help This module provides a driver for the MDIO busses found in the APM X-Gene SoC's. 
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 23ce205ae91d8..631859d444518 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -3,7 +3,7 @@ libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ linkmode.o phy_link_topology.o \ - phy_package.o phy_caps.o + phy_package.o phy_caps.o mdio_bus_provider.o mdio-bus-y += mdio_bus.o mdio_device.o ifdef CONFIG_MDIO_DEVICE diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index ede596c1a69d1..f5ccbe33a3841 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -8,17 +8,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include #include #include #include #include #include -#include #include #include -#include #include #include #include @@ -27,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -37,8 +33,6 @@ #define CREATE_TRACE_POINTS #include -#include "mdio-boardinfo.h" - static int mdiobus_register_gpiod(struct mdio_device *mdiodev) { /* Deassert the optional reset signal */ @@ -136,45 +130,6 @@ bool mdiobus_is_registered_device(struct mii_bus *bus, int addr) } EXPORT_SYMBOL(mdiobus_is_registered_device); -/** - * mdiobus_alloc_size - allocate a mii_bus structure - * @size: extra amount of memory to allocate for private storage. - * If non-zero, then bus->priv is points to that memory. - * - * Description: called by a bus driver to allocate an mii_bus - * structure to fill in. - */ -struct mii_bus *mdiobus_alloc_size(size_t size) -{ - struct mii_bus *bus; - size_t aligned_size = ALIGN(sizeof(*bus), NETDEV_ALIGN); - size_t alloc_size; - int i; - - /* If we alloc extra space, it should be aligned */ - if (size) - alloc_size = aligned_size + size; - else - alloc_size = sizeof(*bus); - - bus = kzalloc(alloc_size, GFP_KERNEL); - if (!bus) - return NULL; - - bus->state = MDIOBUS_ALLOCATED; - if (size) - bus->priv = (void *)bus + aligned_size; - - /* Initialise the interrupts to polling and 64-bit seqcounts */ - for (i = 0; i < PHY_MAX_ADDR; i++) { - bus->irq[i] = PHY_POLL; - u64_stats_init(&bus->stats[i].syncp); - } - - return bus; -} -EXPORT_SYMBOL(mdiobus_alloc_size); - /** * mdiobus_release - mii_bus device release callback * @d: the target struct device that contains the mii_bus @@ -403,11 +358,12 @@ static const struct attribute_group *mdio_bus_groups[] = { NULL, }; -static struct class mdio_bus_class = { +const struct class mdio_bus_class = { .name = "mdio_bus", .dev_release = mdiobus_release, .dev_groups = mdio_bus_groups, }; +EXPORT_SYMBOL_GPL(mdio_bus_class); /** * mdio_find_bus - Given the name of a mdiobus, find the mii_bus. @@ -451,422 +407,8 @@ struct mii_bus *of_mdio_find_bus(struct device_node *mdio_bus_np) return d ? to_mii_bus(d) : NULL; } EXPORT_SYMBOL(of_mdio_find_bus); - -/* Walk the list of subnodes of a mdio bus and look for a node that - * matches the mdio device's address with its 'reg' property. If - * found, set the of_node pointer for the mdio device. This allows - * auto-probed phy devices to be supplied with information passed in - * via DT. - * If a PHY package is found, PHY is searched also there. 
- */ -static int of_mdiobus_find_phy(struct device *dev, struct mdio_device *mdiodev, - struct device_node *np) -{ - struct device_node *child; - - for_each_available_child_of_node(np, child) { - int addr; - - if (of_node_name_eq(child, "ethernet-phy-package")) { - /* Validate PHY package reg presence */ - if (!of_property_present(child, "reg")) { - of_node_put(child); - return -EINVAL; - } - - if (!of_mdiobus_find_phy(dev, mdiodev, child)) { - /* The refcount for the PHY package will be - * incremented later when PHY join the Package. - */ - of_node_put(child); - return 0; - } - - continue; - } - - addr = of_mdio_parse_addr(dev, child); - if (addr < 0) - continue; - - if (addr == mdiodev->addr) { - device_set_node(dev, of_fwnode_handle(child)); - /* The refcount on "child" is passed to the mdio - * device. Do _not_ use of_node_put(child) here. - */ - return 0; - } - } - - return -ENODEV; -} - -static void of_mdiobus_link_mdiodev(struct mii_bus *bus, - struct mdio_device *mdiodev) -{ - struct device *dev = &mdiodev->dev; - - if (dev->of_node || !bus->dev.of_node) - return; - - of_mdiobus_find_phy(dev, mdiodev, bus->dev.of_node); -} -#else /* !IS_ENABLED(CONFIG_OF_MDIO) */ -static inline void of_mdiobus_link_mdiodev(struct mii_bus *mdio, - struct mdio_device *mdiodev) -{ -} #endif -/** - * mdiobus_create_device - create a full MDIO device given - * a mdio_board_info structure - * @bus: MDIO bus to create the devices on - * @bi: mdio_board_info structure describing the devices - * - * Returns 0 on success or < 0 on error. - */ -static int mdiobus_create_device(struct mii_bus *bus, - struct mdio_board_info *bi) -{ - struct mdio_device *mdiodev; - int ret = 0; - - mdiodev = mdio_device_create(bus, bi->mdio_addr); - if (IS_ERR(mdiodev)) - return -ENODEV; - - strscpy(mdiodev->modalias, bi->modalias, - sizeof(mdiodev->modalias)); - mdiodev->bus_match = mdio_device_bus_match; - mdiodev->dev.platform_data = (void *)bi->platform_data; - - ret = mdio_device_register(mdiodev); - if (ret) - mdio_device_free(mdiodev); - - return ret; -} - -static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) -{ - struct phy_device *phydev = ERR_PTR(-ENODEV); - struct fwnode_handle *fwnode; - char node_name[16]; - int err; - - phydev = get_phy_device(bus, addr, c45); - if (IS_ERR(phydev)) - return phydev; - - /* For DT, see if the auto-probed phy has a corresponding child - * in the bus node, and set the of_node pointer in this case. - */ - of_mdiobus_link_mdiodev(bus, &phydev->mdio); - - /* Search for a swnode for the phy in the swnode hierarchy of the bus. - * If there is no swnode for the phy provided, just ignore it. - */ - if (dev_fwnode(&bus->dev) && !dev_fwnode(&phydev->mdio.dev)) { - snprintf(node_name, sizeof(node_name), "ethernet-phy@%d", - addr); - fwnode = fwnode_get_named_child_node(dev_fwnode(&bus->dev), - node_name); - if (fwnode) - device_set_node(&phydev->mdio.dev, fwnode); - } - - err = phy_device_register(phydev); - if (err) { - phy_device_free(phydev); - return ERR_PTR(-ENODEV); - } - - return phydev; -} - -/** - * mdiobus_scan_c22 - scan one address on a bus for C22 MDIO devices. - * @bus: mii_bus to scan - * @addr: address on bus to scan - * - * This function scans one address on the MDIO bus, looking for - * devices which can be identified using a vendor/product ID in - * registers 2 and 3. Not all MDIO devices have such registers, but - * PHY devices typically do. Hence this function assumes anything - * found is a PHY, or can be treated as a PHY. 
Other MDIO devices, - * such as switches, will probably not be found during the scan. - */ -struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr) -{ - return mdiobus_scan(bus, addr, false); -} -EXPORT_SYMBOL(mdiobus_scan_c22); - -/** - * mdiobus_scan_c45 - scan one address on a bus for C45 MDIO devices. - * @bus: mii_bus to scan - * @addr: address on bus to scan - * - * This function scans one address on the MDIO bus, looking for - * devices which can be identified using a vendor/product ID in - * registers 2 and 3. Not all MDIO devices have such registers, but - * PHY devices typically do. Hence this function assumes anything - * found is a PHY, or can be treated as a PHY. Other MDIO devices, - * such as switches, will probably not be found during the scan. - */ -static struct phy_device *mdiobus_scan_c45(struct mii_bus *bus, int addr) -{ - return mdiobus_scan(bus, addr, true); -} - -static int mdiobus_scan_bus_c22(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - if ((bus->phy_mask & BIT(i)) == 0) { - struct phy_device *phydev; - - phydev = mdiobus_scan_c22(bus, i); - if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) - return PTR_ERR(phydev); - } - } - return 0; -} - -static int mdiobus_scan_bus_c45(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - if ((bus->phy_mask & BIT(i)) == 0) { - struct phy_device *phydev; - - /* Don't scan C45 if we already have a C22 device */ - if (bus->mdio_map[i]) - continue; - - phydev = mdiobus_scan_c45(bus, i); - if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) - return PTR_ERR(phydev); - } - } - return 0; -} - -/* There are some C22 PHYs which do bad things when where is a C45 - * transaction on the bus, like accepting a read themselves, and - * stomping over the true devices reply, to performing a write to - * themselves which was intended for another device. Now that C22 - * devices have been found, see if any of them are bad for C45, and if we - * should skip the C45 scan. - */ -static bool mdiobus_prevent_c45_scan(struct mii_bus *bus) -{ - int i; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - struct phy_device *phydev; - u32 oui; - - phydev = mdiobus_get_phy(bus, i); - if (!phydev) - continue; - oui = phydev->phy_id >> 10; - - if (oui == MICREL_OUI) - return true; - } - return false; -} - -/** - * __mdiobus_register - bring up all the PHYs on a given bus and attach them to bus - * @bus: target mii_bus - * @owner: module containing bus accessor functions - * - * Description: Called by a bus driver to bring up all the PHYs - * on a given bus, and attach them to the bus. Drivers should use - * mdiobus_register() rather than __mdiobus_register() unless they - * need to pass a specific owner module. MDIO devices which are not - * PHYs will not be brought up by this function. They are expected - * to be explicitly listed in DT and instantiated by of_mdiobus_register(). - * - * Returns 0 on success or < 0 on error. 
- */ -int __mdiobus_register(struct mii_bus *bus, struct module *owner) -{ - struct mdio_device *mdiodev; - struct gpio_desc *gpiod; - bool prevent_c45_scan; - int i, err; - - if (!bus || !bus->name) - return -EINVAL; - - /* An access method always needs both read and write operations */ - if (!!bus->read != !!bus->write || !!bus->read_c45 != !!bus->write_c45) - return -EINVAL; - - /* At least one method is mandatory */ - if (!bus->read && !bus->read_c45) - return -EINVAL; - - if (bus->parent && bus->parent->of_node) - bus->parent->of_node->fwnode.flags |= - FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD; - - WARN(bus->state != MDIOBUS_ALLOCATED && - bus->state != MDIOBUS_UNREGISTERED, - "%s: not in ALLOCATED or UNREGISTERED state\n", bus->id); - - bus->owner = owner; - bus->dev.parent = bus->parent; - bus->dev.class = &mdio_bus_class; - bus->dev.groups = NULL; - dev_set_name(&bus->dev, "%s", bus->id); - - /* If the bus state is allocated, we're registering a fresh bus - * that may have a fwnode associated with it. Grab a reference - * to the fwnode. This will be dropped when the bus is released. - * If the bus was set to unregistered, it means that the bus was - * previously registered, and we've already grabbed a reference. - */ - if (bus->state == MDIOBUS_ALLOCATED) - fwnode_handle_get(dev_fwnode(&bus->dev)); - - /* We need to set state to MDIOBUS_UNREGISTERED to correctly release - * the device in mdiobus_free() - * - * State will be updated later in this function in case of success - */ - bus->state = MDIOBUS_UNREGISTERED; - - err = device_register(&bus->dev); - if (err) { - pr_err("mii_bus %s failed to register\n", bus->id); - return -EINVAL; - } - - mutex_init(&bus->mdio_lock); - mutex_init(&bus->shared_lock); - - /* assert bus level PHY GPIO reset */ - gpiod = devm_gpiod_get_optional(&bus->dev, "reset", GPIOD_OUT_HIGH); - if (IS_ERR(gpiod)) { - err = dev_err_probe(&bus->dev, PTR_ERR(gpiod), - "mii_bus %s couldn't get reset GPIO\n", - bus->id); - device_del(&bus->dev); - return err; - } else if (gpiod) { - bus->reset_gpiod = gpiod; - fsleep(bus->reset_delay_us); - gpiod_set_value_cansleep(gpiod, 0); - if (bus->reset_post_delay_us > 0) - fsleep(bus->reset_post_delay_us); - } - - if (bus->reset) { - err = bus->reset(bus); - if (err) - goto error_reset_gpiod; - } - - if (bus->read) { - err = mdiobus_scan_bus_c22(bus); - if (err) - goto error; - } - - prevent_c45_scan = mdiobus_prevent_c45_scan(bus); - - if (!prevent_c45_scan && bus->read_c45) { - err = mdiobus_scan_bus_c45(bus); - if (err) - goto error; - } - - mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); - - bus->state = MDIOBUS_REGISTERED; - dev_dbg(&bus->dev, "probed\n"); - return 0; - -error: - for (i = 0; i < PHY_MAX_ADDR; i++) { - mdiodev = bus->mdio_map[i]; - if (!mdiodev) - continue; - - mdiodev->device_remove(mdiodev); - mdiodev->device_free(mdiodev); - } -error_reset_gpiod: - /* Put PHYs in RESET to save power */ - if (bus->reset_gpiod) - gpiod_set_value_cansleep(bus->reset_gpiod, 1); - - device_del(&bus->dev); - return err; -} -EXPORT_SYMBOL(__mdiobus_register); - -void mdiobus_unregister(struct mii_bus *bus) -{ - struct mdio_device *mdiodev; - int i; - - if (WARN_ON_ONCE(bus->state != MDIOBUS_REGISTERED)) - return; - bus->state = MDIOBUS_UNREGISTERED; - - for (i = 0; i < PHY_MAX_ADDR; i++) { - mdiodev = bus->mdio_map[i]; - if (!mdiodev) - continue; - - if (mdiodev->reset_gpio) - gpiod_put(mdiodev->reset_gpio); - - mdiodev->device_remove(mdiodev); - mdiodev->device_free(mdiodev); - } - - /* Put PHYs in RESET to 
save power */ - if (bus->reset_gpiod) - gpiod_set_value_cansleep(bus->reset_gpiod, 1); - - device_del(&bus->dev); -} -EXPORT_SYMBOL(mdiobus_unregister); - -/** - * mdiobus_free - free a struct mii_bus - * @bus: mii_bus to free - * - * This function releases the reference to the underlying device - * object in the mii_bus. If this is the last reference, the mii_bus - * will be freed. - */ -void mdiobus_free(struct mii_bus *bus) -{ - /* For compatibility with error handling in drivers. */ - if (bus->state == MDIOBUS_ALLOCATED) { - kfree(bus); - return; - } - - WARN(bus->state != MDIOBUS_UNREGISTERED, - "%s: not in UNREGISTERED state\n", bus->id); - bus->state = MDIOBUS_RELEASED; - - put_device(&bus->dev); -} -EXPORT_SYMBOL(mdiobus_free); - static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) { preempt_disable(); diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c new file mode 100644 index 0000000000000..65850e36284de --- /dev/null +++ b/drivers/net/phy/mdio_bus_provider.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* MDIO Bus provider interface + * + * Author: Andy Fleming + * + * Copyright (c) 2004 Freescale Semiconductor, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mdio-boardinfo.h" + +/** + * mdiobus_alloc_size - allocate a mii_bus structure + * @size: extra amount of memory to allocate for private storage. + * If non-zero, then bus->priv is points to that memory. + * + * Description: called by a bus driver to allocate an mii_bus + * structure to fill in. + */ +struct mii_bus *mdiobus_alloc_size(size_t size) +{ + struct mii_bus *bus; + size_t aligned_size = ALIGN(sizeof(*bus), NETDEV_ALIGN); + size_t alloc_size; + int i; + + /* If we alloc extra space, it should be aligned */ + if (size) + alloc_size = aligned_size + size; + else + alloc_size = sizeof(*bus); + + bus = kzalloc(alloc_size, GFP_KERNEL); + if (!bus) + return NULL; + + bus->state = MDIOBUS_ALLOCATED; + if (size) + bus->priv = (void *)bus + aligned_size; + + /* Initialise the interrupts to polling and 64-bit seqcounts */ + for (i = 0; i < PHY_MAX_ADDR; i++) { + bus->irq[i] = PHY_POLL; + u64_stats_init(&bus->stats[i].syncp); + } + + return bus; +} +EXPORT_SYMBOL(mdiobus_alloc_size); + +#if IS_ENABLED(CONFIG_OF_MDIO) +/* Walk the list of subnodes of a mdio bus and look for a node that + * matches the mdio device's address with its 'reg' property. If + * found, set the of_node pointer for the mdio device. This allows + * auto-probed phy devices to be supplied with information passed in + * via DT. + * If a PHY package is found, PHY is searched also there. + */ +static int of_mdiobus_find_phy(struct device *dev, struct mdio_device *mdiodev, + struct device_node *np) +{ + struct device_node *child; + + for_each_available_child_of_node(np, child) { + int addr; + + if (of_node_name_eq(child, "ethernet-phy-package")) { + /* Validate PHY package reg presence */ + if (!of_property_present(child, "reg")) { + of_node_put(child); + return -EINVAL; + } + + if (!of_mdiobus_find_phy(dev, mdiodev, child)) { + /* The refcount for the PHY package will be + * incremented later when PHY join the Package. 
+ */ + of_node_put(child); + return 0; + } + + continue; + } + + addr = of_mdio_parse_addr(dev, child); + if (addr < 0) + continue; + + if (addr == mdiodev->addr) { + device_set_node(dev, of_fwnode_handle(child)); + /* The refcount on "child" is passed to the mdio + * device. Do _not_ use of_node_put(child) here. + */ + return 0; + } + } + + return -ENODEV; +} + +static void of_mdiobus_link_mdiodev(struct mii_bus *bus, + struct mdio_device *mdiodev) +{ + struct device *dev = &mdiodev->dev; + + if (dev->of_node || !bus->dev.of_node) + return; + + of_mdiobus_find_phy(dev, mdiodev, bus->dev.of_node); +} +#endif + +/** + * mdiobus_create_device - create a full MDIO device given + * a mdio_board_info structure + * @bus: MDIO bus to create the devices on + * @bi: mdio_board_info structure describing the devices + * + * Returns 0 on success or < 0 on error. + */ +static int mdiobus_create_device(struct mii_bus *bus, + struct mdio_board_info *bi) +{ + struct mdio_device *mdiodev; + int ret = 0; + + mdiodev = mdio_device_create(bus, bi->mdio_addr); + if (IS_ERR(mdiodev)) + return -ENODEV; + + strscpy(mdiodev->modalias, bi->modalias, + sizeof(mdiodev->modalias)); + mdiodev->bus_match = mdio_device_bus_match; + mdiodev->dev.platform_data = (void *)bi->platform_data; + + ret = mdio_device_register(mdiodev); + if (ret) + mdio_device_free(mdiodev); + + return ret; +} + +static struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr, bool c45) +{ + struct phy_device *phydev = ERR_PTR(-ENODEV); + struct fwnode_handle *fwnode; + char node_name[16]; + int err; + + phydev = get_phy_device(bus, addr, c45); + if (IS_ERR(phydev)) + return phydev; + +#if IS_ENABLED(CONFIG_OF_MDIO) + /* For DT, see if the auto-probed phy has a corresponding child + * in the bus node, and set the of_node pointer in this case. + */ + of_mdiobus_link_mdiodev(bus, &phydev->mdio); +#endif + + /* Search for a swnode for the phy in the swnode hierarchy of the bus. + * If there is no swnode for the phy provided, just ignore it. + */ + if (dev_fwnode(&bus->dev) && !dev_fwnode(&phydev->mdio.dev)) { + snprintf(node_name, sizeof(node_name), "ethernet-phy@%d", + addr); + fwnode = fwnode_get_named_child_node(dev_fwnode(&bus->dev), + node_name); + if (fwnode) + device_set_node(&phydev->mdio.dev, fwnode); + } + + err = phy_device_register(phydev); + if (err) { + phy_device_free(phydev); + return ERR_PTR(-ENODEV); + } + + return phydev; +} + +/** + * mdiobus_scan_c22 - scan one address on a bus for C22 MDIO devices. + * @bus: mii_bus to scan + * @addr: address on bus to scan + * + * This function scans one address on the MDIO bus, looking for + * devices which can be identified using a vendor/product ID in + * registers 2 and 3. Not all MDIO devices have such registers, but + * PHY devices typically do. Hence this function assumes anything + * found is a PHY, or can be treated as a PHY. Other MDIO devices, + * such as switches, will probably not be found during the scan. + */ +struct phy_device *mdiobus_scan_c22(struct mii_bus *bus, int addr) +{ + return mdiobus_scan(bus, addr, false); +} +EXPORT_SYMBOL(mdiobus_scan_c22); + +/** + * mdiobus_scan_c45 - scan one address on a bus for C45 MDIO devices. + * @bus: mii_bus to scan + * @addr: address on bus to scan + * + * This function scans one address on the MDIO bus, looking for + * devices which can be identified using a vendor/product ID in + * registers 2 and 3. Not all MDIO devices have such registers, but + * PHY devices typically do. 
Hence this function assumes anything + * found is a PHY, or can be treated as a PHY. Other MDIO devices, + * such as switches, will probably not be found during the scan. + */ +static struct phy_device *mdiobus_scan_c45(struct mii_bus *bus, int addr) +{ + return mdiobus_scan(bus, addr, true); +} + +static int mdiobus_scan_bus_c22(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + if ((bus->phy_mask & BIT(i)) == 0) { + struct phy_device *phydev; + + phydev = mdiobus_scan_c22(bus, i); + if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) + return PTR_ERR(phydev); + } + } + return 0; +} + +static int mdiobus_scan_bus_c45(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + if ((bus->phy_mask & BIT(i)) == 0) { + struct phy_device *phydev; + + /* Don't scan C45 if we already have a C22 device */ + if (bus->mdio_map[i]) + continue; + + phydev = mdiobus_scan_c45(bus, i); + if (IS_ERR(phydev) && (PTR_ERR(phydev) != -ENODEV)) + return PTR_ERR(phydev); + } + } + return 0; +} + +/* There are some C22 PHYs which do bad things when where is a C45 + * transaction on the bus, like accepting a read themselves, and + * stomping over the true devices reply, to performing a write to + * themselves which was intended for another device. Now that C22 + * devices have been found, see if any of them are bad for C45, and if we + * should skip the C45 scan. + */ +static bool mdiobus_prevent_c45_scan(struct mii_bus *bus) +{ + int i; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + struct phy_device *phydev; + u32 oui; + + phydev = mdiobus_get_phy(bus, i); + if (!phydev) + continue; + oui = phydev->phy_id >> 10; + + if (oui == MICREL_OUI) + return true; + } + return false; +} + +/** + * __mdiobus_register - bring up all the PHYs on a given bus and attach them to bus + * @bus: target mii_bus + * @owner: module containing bus accessor functions + * + * Description: Called by a bus driver to bring up all the PHYs + * on a given bus, and attach them to the bus. Drivers should use + * mdiobus_register() rather than __mdiobus_register() unless they + * need to pass a specific owner module. MDIO devices which are not + * PHYs will not be brought up by this function. They are expected + * to be explicitly listed in DT and instantiated by of_mdiobus_register(). + * + * Returns 0 on success or < 0 on error. + */ +int __mdiobus_register(struct mii_bus *bus, struct module *owner) +{ + struct mdio_device *mdiodev; + struct gpio_desc *gpiod; + bool prevent_c45_scan; + int i, err; + + if (!bus || !bus->name) + return -EINVAL; + + /* An access method always needs both read and write operations */ + if (!!bus->read != !!bus->write || !!bus->read_c45 != !!bus->write_c45) + return -EINVAL; + + /* At least one method is mandatory */ + if (!bus->read && !bus->read_c45) + return -EINVAL; + + if (bus->parent && bus->parent->of_node) + bus->parent->of_node->fwnode.flags |= + FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD; + + WARN(bus->state != MDIOBUS_ALLOCATED && + bus->state != MDIOBUS_UNREGISTERED, + "%s: not in ALLOCATED or UNREGISTERED state\n", bus->id); + + bus->owner = owner; + bus->dev.parent = bus->parent; + bus->dev.class = &mdio_bus_class; + bus->dev.groups = NULL; + dev_set_name(&bus->dev, "%s", bus->id); + + /* If the bus state is allocated, we're registering a fresh bus + * that may have a fwnode associated with it. Grab a reference + * to the fwnode. This will be dropped when the bus is released. 
+ * If the bus was set to unregistered, it means that the bus was + * previously registered, and we've already grabbed a reference. + */ + if (bus->state == MDIOBUS_ALLOCATED) + fwnode_handle_get(dev_fwnode(&bus->dev)); + + /* We need to set state to MDIOBUS_UNREGISTERED to correctly release + * the device in mdiobus_free() + * + * State will be updated later in this function in case of success + */ + bus->state = MDIOBUS_UNREGISTERED; + + err = device_register(&bus->dev); + if (err) { + pr_err("mii_bus %s failed to register\n", bus->id); + return -EINVAL; + } + + mutex_init(&bus->mdio_lock); + mutex_init(&bus->shared_lock); + + /* assert bus level PHY GPIO reset */ + gpiod = devm_gpiod_get_optional(&bus->dev, "reset", GPIOD_OUT_HIGH); + if (IS_ERR(gpiod)) { + err = dev_err_probe(&bus->dev, PTR_ERR(gpiod), + "mii_bus %s couldn't get reset GPIO\n", + bus->id); + device_del(&bus->dev); + return err; + } else if (gpiod) { + bus->reset_gpiod = gpiod; + fsleep(bus->reset_delay_us); + gpiod_set_value_cansleep(gpiod, 0); + if (bus->reset_post_delay_us > 0) + fsleep(bus->reset_post_delay_us); + } + + if (bus->reset) { + err = bus->reset(bus); + if (err) + goto error_reset_gpiod; + } + + if (bus->read) { + err = mdiobus_scan_bus_c22(bus); + if (err) + goto error; + } + + prevent_c45_scan = mdiobus_prevent_c45_scan(bus); + + if (!prevent_c45_scan && bus->read_c45) { + err = mdiobus_scan_bus_c45(bus); + if (err) + goto error; + } + + mdiobus_setup_mdiodev_from_board_info(bus, mdiobus_create_device); + + bus->state = MDIOBUS_REGISTERED; + dev_dbg(&bus->dev, "probed\n"); + return 0; + +error: + for (i = 0; i < PHY_MAX_ADDR; i++) { + mdiodev = bus->mdio_map[i]; + if (!mdiodev) + continue; + + mdiodev->device_remove(mdiodev); + mdiodev->device_free(mdiodev); + } +error_reset_gpiod: + /* Put PHYs in RESET to save power */ + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); + + device_del(&bus->dev); + return err; +} +EXPORT_SYMBOL(__mdiobus_register); + +void mdiobus_unregister(struct mii_bus *bus) +{ + struct mdio_device *mdiodev; + int i; + + if (WARN_ON_ONCE(bus->state != MDIOBUS_REGISTERED)) + return; + bus->state = MDIOBUS_UNREGISTERED; + + for (i = 0; i < PHY_MAX_ADDR; i++) { + mdiodev = bus->mdio_map[i]; + if (!mdiodev) + continue; + + if (mdiodev->reset_gpio) + gpiod_put(mdiodev->reset_gpio); + + mdiodev->device_remove(mdiodev); + mdiodev->device_free(mdiodev); + } + + /* Put PHYs in RESET to save power */ + if (bus->reset_gpiod) + gpiod_set_value_cansleep(bus->reset_gpiod, 1); + + device_del(&bus->dev); +} +EXPORT_SYMBOL(mdiobus_unregister); + +/** + * mdiobus_free - free a struct mii_bus + * @bus: mii_bus to free + * + * This function releases the reference to the underlying device + * object in the mii_bus. If this is the last reference, the mii_bus + * will be freed. + */ +void mdiobus_free(struct mii_bus *bus) +{ + /* For compatibility with error handling in drivers. 
*/ + if (bus->state == MDIOBUS_ALLOCATED) { + kfree(bus); + return; + } + + WARN(bus->state != MDIOBUS_UNREGISTERED, + "%s: not in UNREGISTERED state\n", bus->id); + bus->state = MDIOBUS_RELEASED; + + put_device(&bus->dev); +} +EXPORT_SYMBOL(mdiobus_free); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3beaf225ee881..d62d292024bc2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2062,6 +2062,7 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct netlink_ext_ack *extack); extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; struct mdio_board_info { const char *bus_id; From 14a0087e7236228d56bfa3fab7084c19fcb513fb Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Tue, 29 Apr 2025 15:24:53 +0200 Subject: [PATCH 625/770] ipv6: sr: switch to GFP_ATOMIC flag to allocate memory during seg6local LWT setup Recent updates to the locking mechanism that protects IPv6 routing tables [1] have affected the SRv6 networking subsystem. Such changes cause problems with some SRv6 Endpoints behaviors, like End.B6.Encaps and also impact SRv6 counters. Starting from commit 169fd62799e8 ("ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE."), the inet6_rtm_newroute() function no longer needs to acquire the RTNL lock for creating and configuring IPv6 routes and set up lwtunnels. The RTNL lock can be avoided because the ip6_route_add() function finishes setting up a new route in a section protected by RCU. This makes sure that no dev/nexthops can disappear during the operation. Because of this, the steps for setting up lwtunnels - i.e., calling lwtunnel_build_state() - are now done in a RCU lock section and not under the RTNL lock anymore. However, creating and configuring a lwtunnel instance in an RCU-protected section can be problematic when that tunnel needs to allocate memory using the GFP_KERNEL flag. For example, the following trace shows what happens when an SRv6 End.B6.Encaps behavior is instantiated after commit 169fd62799e8 ("ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE."): [ 3061.219696] BUG: sleeping function called from invalid context at ./include/linux/sched/mm.h:321 [ 3061.226136] in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 445, name: ip [ 3061.232101] preempt_count: 0, expected: 0 [ 3061.235414] RCU nest depth: 1, expected: 0 [ 3061.238622] 1 lock held by ip/445: [ 3061.241458] #0: ffffffff83ec64a0 (rcu_read_lock){....}-{1:3}, at: ip6_route_add+0x41/0x1e0 [ 3061.248520] CPU: 1 UID: 0 PID: 445 Comm: ip Not tainted 6.15.0-rc3-micro-vm-dev-00590-ge527e891492d #2058 PREEMPT(full) [ 3061.248532] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 3061.248549] Call Trace: [ 3061.248620] [ 3061.248633] dump_stack_lvl+0xa9/0xc0 [ 3061.248846] __might_resched+0x218/0x360 [ 3061.248871] __kmalloc_node_track_caller_noprof+0x332/0x4e0 [ 3061.248889] ? rcu_is_watching+0x3a/0x70 [ 3061.248902] ? parse_nla_srh+0x56/0xa0 [ 3061.248938] kmemdup_noprof+0x1c/0x40 [ 3061.248952] parse_nla_srh+0x56/0xa0 [ 3061.248969] seg6_local_build_state+0x2e0/0x580 [ 3061.248992] ? __lock_acquire+0xaff/0x1cd0 [ 3061.249013] ? do_raw_spin_lock+0x111/0x1d0 [ 3061.249027] ? __pfx_seg6_local_build_state+0x10/0x10 [ 3061.249068] ? lwtunnel_build_state+0xe1/0x3a0 [ 3061.249274] lwtunnel_build_state+0x10d/0x3a0 [ 3061.249303] fib_nh_common_init+0xce/0x1e0 [ 3061.249337] ? __pfx_fib_nh_common_init+0x10/0x10 [ 3061.249352] ? in6_dev_get+0xaf/0x1f0 [ 3061.249369] ? 
__rcu_read_unlock+0x64/0x2e0 [ 3061.249392] fib6_nh_init+0x290/0xc30 [ 3061.249422] ? __pfx_fib6_nh_init+0x10/0x10 [ 3061.249447] ? __lock_acquire+0xaff/0x1cd0 [ 3061.249459] ? _raw_spin_unlock_irqrestore+0x22/0x70 [ 3061.249624] ? ip6_route_info_create+0x423/0x520 [ 3061.249641] ? rcu_is_watching+0x3a/0x70 [ 3061.249683] ip6_route_info_create_nh+0x190/0x390 [ 3061.249715] ip6_route_add+0x71/0x1e0 [ 3061.249730] ? __pfx_inet6_rtm_newroute+0x10/0x10 [ 3061.249743] inet6_rtm_newroute+0x426/0xc50 [ 3061.249764] ? avc_has_perm_noaudit+0x13d/0x360 [ 3061.249853] ? __pfx_inet6_rtm_newroute+0x10/0x10 [ 3061.249905] ? __lock_acquire+0xaff/0x1cd0 [ 3061.249962] ? rtnetlink_rcv_msg+0x52f/0x890 [ 3061.249996] ? __pfx_inet6_rtm_newroute+0x10/0x10 [ 3061.250012] rtnetlink_rcv_msg+0x551/0x890 [ 3061.250040] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 3061.250065] ? __lock_acquire+0xaff/0x1cd0 [ 3061.250092] netlink_rcv_skb+0xbd/0x1f0 [ 3061.250108] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 3061.250124] ? __pfx_netlink_rcv_skb+0x10/0x10 [ 3061.250179] ? netlink_deliver_tap+0x10b/0x700 [ 3061.250210] netlink_unicast+0x2e7/0x410 [ 3061.250232] ? __pfx_netlink_unicast+0x10/0x10 [ 3061.250241] ? __lock_acquire+0xaff/0x1cd0 [ 3061.250280] netlink_sendmsg+0x366/0x670 [ 3061.250306] ? __pfx_netlink_sendmsg+0x10/0x10 [ 3061.250313] ? find_held_lock+0x2d/0xa0 [ 3061.250344] ? import_ubuf+0xbc/0xf0 [ 3061.250370] ? __pfx_netlink_sendmsg+0x10/0x10 [ 3061.250381] __sock_sendmsg+0x13e/0x150 [ 3061.250420] ____sys_sendmsg+0x33d/0x450 [ 3061.250442] ? __pfx_____sys_sendmsg+0x10/0x10 [ 3061.250453] ? __pfx_copy_msghdr_from_user+0x10/0x10 [ 3061.250489] ? __pfx_slab_free_after_rcu_debug+0x10/0x10 [ 3061.250514] ___sys_sendmsg+0xe5/0x160 [ 3061.250530] ? __pfx____sys_sendmsg+0x10/0x10 [ 3061.250568] ? __lock_acquire+0xaff/0x1cd0 [ 3061.250617] ? find_held_lock+0x2d/0xa0 [ 3061.250678] ? __virt_addr_valid+0x199/0x340 [ 3061.250704] ? preempt_count_sub+0xf/0xc0 [ 3061.250736] __sys_sendmsg+0xca/0x140 [ 3061.250750] ? __pfx___sys_sendmsg+0x10/0x10 [ 3061.250786] ? syscall_exit_to_user_mode+0xa2/0x1e0 [ 3061.250825] do_syscall_64+0x62/0x140 [ 3061.250844] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 3061.250855] RIP: 0033:0x7f0b042ef914 [ 3061.250868] Code: 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 48 8d 05 e9 5d 0c 00 8b 00 85 c0 75 13 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 41 54 41 89 d4 55 48 89 f5 53 [ 3061.250876] RSP: 002b:00007ffc2d113ef8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 3061.250885] RAX: ffffffffffffffda RBX: 00000000680f93fa RCX: 00007f0b042ef914 [ 3061.250891] RDX: 0000000000000000 RSI: 00007ffc2d113f60 RDI: 0000000000000003 [ 3061.250897] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000008 [ 3061.250902] R10: fffffffffffff26d R11: 0000000000000246 R12: 0000000000000001 [ 3061.250907] R13: 000055a961f8a520 R14: 000055a961f63eae R15: 00007ffc2d115270 [ 3061.250952] To solve this issue, we replace the GFP_KERNEL flag with the GFP_ATOMIC one in those SRv6 Endpoints that need to allocate memory during the setup phase. This change makes sure that memory allocations are handled in a way that works with RCU critical sections. 
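The constraint at play can be summarized with a short sketch (simplified
and illustrative only, not the patched code itself):

	rcu_read_lock();	/* taken by ip6_route_add() */
	...
	/* GFP_KERNEL may sleep, which is forbidden in this section;
	 * GFP_ATOMIC never sleeps, but is likelier to fail, which the
	 * existing -ENOMEM error path already covers.
	 */
	slwt->srh = kmemdup(srh, len, GFP_ATOMIC);
	if (!slwt->srh)
		return -ENOMEM;
	...
	rcu_read_unlock();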
[1] - https://lore.kernel.org/all/20250418000443.43734-1-kuniyu@amazon.com/ Fixes: 169fd62799e8 ("ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE.") Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Link: https://patch.msgid.link/20250429132453.31605-1-andrea.mayer@uniroma2.it Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_local.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ac1dbd492c22d..ee5e448cc7a87 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1671,7 +1671,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt, if (!seg6_validate_srh(srh, len, false)) return -EINVAL; - slwt->srh = kmemdup(srh, len, GFP_KERNEL); + slwt->srh = kmemdup(srh, len, GFP_ATOMIC); if (!slwt->srh) return -ENOMEM; @@ -1911,7 +1911,7 @@ static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt, if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) return -EINVAL; - slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_ATOMIC); if (!slwt->bpf.name) return -ENOMEM; @@ -1994,7 +1994,7 @@ static int parse_nla_counters(struct nlattr **attrs, return -EINVAL; /* counters are always zero initialized */ - pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL); + pcounters = seg6_local_alloc_pcpu_counters(GFP_ATOMIC); if (!pcounters) return -ENOMEM; From 630cb33ccfcd04563598d0f0edd96c94ddf3352d Mon Sep 17 00:00:00 2001 From: Alexey Charkov Date: Wed, 30 Apr 2025 14:42:45 +0400 Subject: [PATCH 626/770] dt-bindings: net: via-rhine: Convert to YAML Rewrite the textual description for the VIA Rhine platform Ethernet controller as YAML schema, and switch the filename to follow the compatible string. 
These are used in several VIA/WonderMedia SoCs Signed-off-by: Alexey Charkov Reviewed-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20250430-rhine-binding-v2-1-4290156c0f57@gmail.com Signed-off-by: Jakub Kicinski --- .../bindings/net/via,vt8500-rhine.yaml | 41 +++++++++++++++++++ .../devicetree/bindings/net/via-rhine.txt | 17 -------- 2 files changed, 41 insertions(+), 17 deletions(-) create mode 100644 Documentation/devicetree/bindings/net/via,vt8500-rhine.yaml delete mode 100644 Documentation/devicetree/bindings/net/via-rhine.txt diff --git a/Documentation/devicetree/bindings/net/via,vt8500-rhine.yaml b/Documentation/devicetree/bindings/net/via,vt8500-rhine.yaml new file mode 100644 index 0000000000000..e663d5a2f0147 --- /dev/null +++ b/Documentation/devicetree/bindings/net/via,vt8500-rhine.yaml @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/via,vt8500-rhine.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: VIA Rhine 10/100 Network Controller + +description: + VIA's Ethernet controller integrated into VIA VT8500, + WonderMedia WM8950 and related SoCs + +maintainers: + - Alexey Charkov + +allOf: + - $ref: ethernet-controller.yaml# + +properties: + compatible: + const: via,vt8500-rhine + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + +required: + - reg + - interrupts + +unevaluatedProperties: false + +examples: + - | + ethernet@d8004000 { + compatible = "via,vt8500-rhine"; + reg = <0xd8004000 0x100>; + interrupts = <10>; + }; diff --git a/Documentation/devicetree/bindings/net/via-rhine.txt b/Documentation/devicetree/bindings/net/via-rhine.txt deleted file mode 100644 index 334eca2bf937c..0000000000000 --- a/Documentation/devicetree/bindings/net/via-rhine.txt +++ /dev/null @@ -1,17 +0,0 @@ -* VIA Rhine 10/100 Network Controller - -Required properties: -- compatible : Should be "via,vt8500-rhine" for integrated - Rhine controllers found in VIA VT8500, WonderMedia WM8950 - and similar. These are listed as 1106:3106 rev. 0x84 on the - virtual PCI bus under vendor-provided kernels -- reg : Address and length of the io space -- interrupts : Should contain the controller interrupt line - -Examples: - -ethernet@d8004000 { - compatible = "via,vt8500-rhine"; - reg = <0xd8004000 0x100>; - interrupts = <10>; -}; From a6471da7745aa16b1534848877ce73739668992b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:46:53 -0700 Subject: [PATCH 627/770] tools: ynl-gen: fix comment about nested struct dict The dict stores struct objects (of class Struct), not just a trivial set with directions. 
Reviewed-by: Donald Hunter
Reviewed-by: Jacob Keller
Signed-off-by: Jakub Kicinski
Link: https://patch.msgid.link/20250429154704.2613851-2-kuba@kernel.org
Signed-off-by: Paolo Abeni
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 9613a61350037..077aacd5f33a3 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -1015,7 +1015,7 @@ def resolve(self):
         # dict space-name -> 'request': set(attrs), 'reply': set(attrs)
         self.root_sets = dict()
 
-        # dict space-name -> set('request', 'reply')
+        # dict space-name -> Struct
         self.pure_nested_structs = dict()
 
         self._mark_notify()

From 2286905f1b33cafa25e508f20bfceb47e20e85d8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 29 Apr 2025 08:46:54 -0700
Subject: [PATCH 628/770] tools: ynl-gen: factor out free_needs_iter for a struct

Instead of walking the entries in the code gen, add a method for the
struct class to return whether any of the members need an iterator.

Reviewed-by: Donald Hunter
Reviewed-by: Jacob Keller
Signed-off-by: Jakub Kicinski
Link: https://patch.msgid.link/20250429154704.2613851-3-kuba@kernel.org
Signed-off-by: Paolo Abeni
---
 tools/net/ynl/pyynl/ynl_gen_c.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py
index 077aacd5f33a3..90f7fe6b623bd 100755
--- a/tools/net/ynl/pyynl/ynl_gen_c.py
+++ b/tools/net/ynl/pyynl/ynl_gen_c.py
@@ -809,6 +809,12 @@ def set_inherited(self, new_inherited):
             raise Exception("Inheriting different members not supported")
         self.inherited = [c_lower(x) for x in sorted(self._inherited)]
 
+    def free_needs_iter(self):
+        for _, attr in self.attr_list:
+            if attr.free_needs_iter():
+                return True
+        return False
+
 
 class EnumEntry(SpecEnumEntry):
     def __init__(self, enum_set, yaml, prev, value_start):
@@ -2156,11 +2162,9 @@ def print_wrapped_type(ri):
 
 
 def _free_type_members_iter(ri, struct):
-    for _, attr in struct.member_list():
-        if attr.free_needs_iter():
-            ri.cw.p('unsigned int i;')
-            ri.cw.nl()
-            break
+    if struct.free_needs_iter():
+        ri.cw.p('unsigned int i;')
+        ri.cw.nl()
 
 
 def _free_type_members(ri, var, struct, ref=''):

From d12a7be02524a6f4d8d5d57920001ea63a93f28d Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Tue, 29 Apr 2025 08:46:55 -0700
Subject: [PATCH 629/770] tools: ynl-gen: fill in missing empty attr lists

The C codegen refers to op attribute lists all over the place, without
checking if they are present, even though the attribute list is
technically an optional property. Add them automatically at init if
missing so that we don't have to make specs longer.
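As an illustration, a hypothetical fixed-header-only op (sketch, not
taken from a real spec) can now simply omit the lists:

  do:
    request:
      value: 33    # no 'attributes' key required any more

and the generator proceeds as if 'attributes: []' had been spelled out
for each mode and direction.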
Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-4-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 90f7fe6b623bd..898c41a7a81fe 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -938,6 +938,14 @@ def new_attr(self, elem, value): class Operation(SpecOperation): def __init__(self, family, yaml, req_value, rsp_value): + # Fill in missing operation properties (for fixed hdr-only msgs) + for mode in ['do', 'dump', 'event']: + for direction in ['request', 'reply']: + try: + yaml[mode][direction].setdefault('attributes', []) + except KeyError: + pass + super().__init__(family, yaml, req_value, rsp_value) self.render_name = c_lower(family.ident_name + '_' + self.name) From fe7d57e040f7d189e3bd030f311bb2011a0cc35a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:46:56 -0700 Subject: [PATCH 630/770] tools: ynl: let classic netlink requests specify extra nlflags Classic netlink makes extensive use of flags. Support specifying them the same way as attributes are specified (using a helper), for example: rt_link_newlink_req_set_nlflags(req, NLM_F_CREATE | NLM_F_ECHO); Wrap the code up in a RenderInfo predicate. I think that some genetlink families may want this, too. It should be easy to add a spec property later. Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-5-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/lib/ynl-priv.h | 2 +- tools/net/ynl/lib/ynl.c | 4 ++-- tools/net/ynl/pyynl/ynl_gen_c.py | 21 ++++++++++++++++++++- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/tools/net/ynl/lib/ynl-priv.h b/tools/net/ynl/lib/ynl-priv.h index 634eb16548b93..5debb09491e7b 100644 --- a/tools/net/ynl/lib/ynl-priv.h +++ b/tools/net/ynl/lib/ynl-priv.h @@ -94,7 +94,7 @@ struct ynl_ntf_base_type { unsigned char data[] __attribute__((aligned(8))); }; -struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id); +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id, __u16 flags); struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id); struct nlmsghdr * diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index 70f899a540076..c16f01372ca32 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -451,9 +451,9 @@ ynl_gemsg_start(struct ynl_sock *ys, __u32 id, __u16 flags, return nlh; } -struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id) +struct nlmsghdr *ynl_msg_start_req(struct ynl_sock *ys, __u32 id, __u16 flags) { - return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK); + return ynl_msg_start(ys, id, NLM_F_REQUEST | NLM_F_ACK | flags); } struct nlmsghdr *ynl_msg_start_dump(struct ynl_sock *ys, __u32 id) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 898c41a7a81fe..c035abb8ae1c5 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1294,6 +1294,9 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): def type_empty(self, key): return len(self.struct[key].attr_list) == 0 and self.fixed_hdr is None + def needs_nlflags(self, direction): + return self.op_mode == 'do' and direction == 'request' and self.family.is_classic() + class CodeWriter: 
def __init__(self, nlib, out_file=None, overwrite=True): @@ -1924,7 +1927,7 @@ def print_req(ri): ri.cw.write_func_lvar(local_vars) if ri.family.is_classic(): - ri.cw.p(f"nlh = ynl_msg_start_req(ys, {ri.op.enum_name});") + ri.cw.p(f"nlh = ynl_msg_start_req(ys, {ri.op.enum_name}, req->_nlmsg_flags);") else: ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") @@ -2053,6 +2056,16 @@ def print_free_prototype(ri, direction, suffix=';'): ri.cw.write_func_prot('void', f"{name}_free", [f"struct {struct_name} *{arg}"], suffix=suffix) +def print_nlflags_set(ri, direction): + name = op_prefix(ri, direction) + ri.cw.write_func_prot(f'static inline void', f"{name}_set_nlflags", + [f"struct {name} *req", "__u16 nl_flags"]) + ri.cw.block_start() + ri.cw.p('req->_nlmsg_flags = nl_flags;') + ri.cw.block_end() + ri.cw.nl() + + def _print_type(ri, direction, struct): suffix = f'_{ri.type_name}{direction_to_suffix[direction]}' if not direction and ri.type_name_conflict: @@ -2063,6 +2076,9 @@ def _print_type(ri, direction, struct): ri.cw.block_start(line=f"struct {ri.family.c_name}{suffix}") + if ri.needs_nlflags(direction): + ri.cw.p('__u16 _nlmsg_flags;') + ri.cw.nl() if ri.fixed_hdr: ri.cw.p(ri.fixed_hdr + ' _hdr;') ri.cw.nl() @@ -2102,6 +2118,9 @@ def print_type_helpers(ri, direction, deref=False): print_free_prototype(ri, direction) ri.cw.nl() + if ri.needs_nlflags(direction): + print_nlflags_set(ri, direction) + if ri.ku_space == 'user' and direction == 'request': for _, attr in ri.struct[direction].member_list(): attr.setter(ri, ri.attr_set, direction, deref=deref) From bbfb3c557c66899480c679b7958908722b821ac3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:46:57 -0700 Subject: [PATCH 631/770] tools: ynl-gen: support using dump types for ntf Classic Netlink has GET callbacks with no doit support, just dumps. Support using their responses in notifications. If notification points at a type which only has a dump - use the dump's type. Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-6-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index c035abb8ae1c5..0febbb3912e3c 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1281,7 +1281,7 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): self.struct = dict() if op_mode == 'notify': - op_mode = 'do' + op_mode = 'do' if 'do' in op else 'dump' for op_dir in ['request', 'reply']: if op: type_list = [] From 49398830a4aa650c00b402477042c90db7d6214c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:46:58 -0700 Subject: [PATCH 632/770] tools: ynl-gen: support CRUD-like notifications for classic Netlink Allow CRUD-style notification where the notification is more like the response to the request, which can optionally be looped back onto the requesting socket. Since the notification and request are different ops in the spec, for example: - name: delrule doc: Remove an existing FIB rule attribute-set: fib-rule-attrs do: request: value: 33 attributes: *fib-rule-all - name: delrule-ntf doc: Notify a rule deletion value: 33 notify: getrule We need to find the request by ID. 
Ideally we'd detect this model from the spec properties, rather than assume that it's what all classic netlink families do. But maybe that'd cause this model to spread, and it's easy to get wrong. For now assume CRUD == classic. Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-7-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 0febbb3912e3c..31e904f1a2f05 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -2787,7 +2787,11 @@ def render_uapi(family, cw): def _render_user_ntf_entry(ri, op): - ri.cw.block_start(line=f"[{op.enum_name}] = ") + if not ri.family.is_classic(): + ri.cw.block_start(line=f"[{op.enum_name}] = ") + else: + crud_op = ri.family.req_by_value[op.rsp_value] + ri.cw.block_start(line=f"[{crud_op.enum_name}] = ") ri.cw.p(f".alloc_sz\t= sizeof({type_name(ri, 'event')}),") ri.cw.p(f".cb\t\t= {op_prefix(ri, 'reply', deref=True)}_parse,") ri.cw.p(f".policy\t\t= &{ri.struct['reply'].render_name}_nest,") From 0ea8cf56cc20ed51cfbef7c6410874ac7fd07642 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:46:59 -0700 Subject: [PATCH 633/770] tools: ynl-gen: multi-attr: type gen for string Add support for multi-attr strings (needed for link alt_names). We record the length of individual strings in a len member; to do the same for multi-attr, create a struct ynl_string in ynl.h and use it as a layer holding both the string and its length. Since strings may be of arbitrary length, dynamically allocate each individual one. Adjust arg_member and struct_member to avoid a space between the double pointers, to get "type **name;" rather than "type * *name;". Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-8-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/lib/ynl.h | 13 +++++++++++++ tools/net/ynl/pyynl/ynl_gen_c.py | 29 +++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 59256e258130b..6b8a625aaa5ff 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -85,6 +85,19 @@ struct ynl_sock { unsigned char raw_buf[]; }; +/** + * struct ynl_string - parsed individual string + * @len: length of the string (excluding terminating character) + * @str: value of the string + * + * Parsed and nul-terminated string. This struct is only used for arrays of + * strings. Non-array string members are placed directly in respective types.
+ */ +struct ynl_string { + unsigned int len; + char str[]; +}; + struct ynl_sock * ynl_sock_create(const struct ynl_family *yf, struct ynl_error *e); void ynl_sock_destroy(struct ynl_sock *ys); diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 31e904f1a2f05..895bc1ca95058 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -175,7 +175,8 @@ def free(self, ri, var, ref): def arg_member(self, ri): member = self._complex_member_type(ri) if member: - arg = [member + ' *' + self.c_name] + spc = ' ' if member[-1] != '*' else '' + arg = [member + spc + '*' + self.c_name] if self.presence_type() == 'count': arg += ['unsigned int n_' + self.c_name] return arg @@ -189,7 +190,8 @@ def struct_member(self, ri): ptr = '*' if self.is_multi_val() else '' if self.is_recursive_for_op(ri): ptr = '*' - ri.cw.p(f"{member} {ptr}{self.c_name};") + spc = ' ' if member[-1] != '*' else '' + ri.cw.p(f"{member}{spc}{ptr}{self.c_name};") return members = self.arg_member(ri) for one in members: @@ -638,6 +640,8 @@ def presence_type(self): def _complex_member_type(self, ri): if 'type' not in self.attr or self.attr['type'] == 'nest': return self.nested_struct_type + elif self.attr['type'] == 'string': + return 'struct ynl_string *' elif self.attr['type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' return scalar_pfx + self.attr['type'] @@ -645,12 +649,18 @@ def _complex_member_type(self, ri): raise Exception(f"Sub-type {self.attr['type']} not supported yet") def free_needs_iter(self): - return 'type' not in self.attr or self.attr['type'] == 'nest' + return self.attr['type'] in {'nest', 'string'} def _free_lines(self, ri, var, ref): lines = [] if self.attr['type'] in scalars: lines += [f"free({var}->{ref}{self.c_name});"] + elif self.attr['type'] == 'string': + lines += [ + f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)", + f"free({var}->{ref}{self.c_name}[i]);", + f"free({var}->{ref}{self.c_name});", + ] elif 'type' not in self.attr or self.attr['type'] == 'nest': lines += [ f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)", @@ -675,6 +685,9 @@ def attr_put(self, ri, var): put_type = self.type ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") ri.cw.p(f"ynl_attr_put_{put_type}(nlh, {self.enum_name}, {var}->{self.c_name}[i]);") + elif self.attr['type'] == 'string': + ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") + ri.cw.p(f"ynl_attr_put_str(nlh, {self.enum_name}, {var}->{self.c_name}[i]->str);") elif 'type' not in self.attr or self.attr['type'] == 'nest': ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") self._attr_put_line(ri, var, f"{self.nested_render_name}_put(nlh, " + @@ -1834,8 +1847,16 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p('return YNL_PARSE_CB_ERROR;') elif aspec.type in scalars: ri.cw.p(f"dst->{aspec.c_name}[i] = ynl_attr_get_{aspec.type}(attr);") + elif aspec.type == 'string': + ri.cw.p('unsigned int len;') + ri.cw.nl() + ri.cw.p('len = strnlen(ynl_attr_get_str(attr), ynl_attr_data_len(attr));') + ri.cw.p(f'dst->{aspec.c_name}[i] = malloc(sizeof(struct ynl_string) + len + 1);') + ri.cw.p(f"dst->{aspec.c_name}[i]->len = len;") + ri.cw.p(f"memcpy(dst->{aspec.c_name}[i]->str, ynl_attr_get_str(attr), len);") + ri.cw.p(f"dst->{aspec.c_name}[i]->str[len] = 0;") else: - raise Exception('Nest parsing type not supported yet') + raise Exception(f'Nest parsing of type {aspec.type} not supported yet') ri.cw.p('i++;') ri.cw.block_end() ri.cw.block_end() From 
3456084d6361f4c835b73a0bce04fa5894bf9c65 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:47:00 -0700 Subject: [PATCH 634/770] tools: ynl-gen: multi-attr: support binary types with struct Binary types with struct are fixed size and relatively easy to handle for multi-attr. Declare the member as a pointer. Count the members, allocate an array, copy in the data. Allow the netlink attr to be smaller or larger than our view of the struct in case the build headers are newer or older than the running kernel. Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-9-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 895bc1ca95058..a969762d557bf 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -640,6 +640,8 @@ def presence_type(self): def _complex_member_type(self, ri): if 'type' not in self.attr or self.attr['type'] == 'nest': return self.nested_struct_type + elif self.attr['type'] == 'binary' and 'struct' in self.attr: + return None # use arg_member() elif self.attr['type'] == 'string': return 'struct ynl_string *' elif self.attr['type'] in scalars: @@ -648,6 +650,12 @@ def _complex_member_type(self, ri): else: raise Exception(f"Sub-type {self.attr['type']} not supported yet") + def arg_member(self, ri): + if self.type == 'binary' and 'struct' in self.attr: + return [f'struct {c_lower(self.attr["struct"])} *{self.c_name}', + f'unsigned int n_{self.c_name}'] + return super().arg_member(ri) + def free_needs_iter(self): return self.attr['type'] in {'nest', 'string'} @@ -655,6 +663,8 @@ def _free_lines(self, ri, var, ref): lines = [] if self.attr['type'] in scalars: lines += [f"free({var}->{ref}{self.c_name});"] + elif self.attr['type'] == 'binary' and 'struct' in self.attr: + lines += [f"free({var}->{ref}{self.c_name});"] elif self.attr['type'] == 'string': lines += [ f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)", @@ -685,6 +695,9 @@ def attr_put(self, ri, var): put_type = self.type ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") ri.cw.p(f"ynl_attr_put_{put_type}(nlh, {self.enum_name}, {var}->{self.c_name}[i]);") + elif self.attr['type'] == 'binary' and 'struct' in self.attr: + ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") + ri.cw.p(f"ynl_attr_put(nlh, {self.enum_name}, &{var}->{self.c_name}[i], sizeof(struct {c_lower(self.attr['struct'])}));") elif self.attr['type'] == 'string': ri.cw.p(f"for (i = 0; i < {var}->n_{self.c_name}; i++)") ri.cw.p(f"ynl_attr_put_str(nlh, {self.enum_name}, {var}->{self.c_name}[i]->str);") @@ -1847,6 +1860,12 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p('return YNL_PARSE_CB_ERROR;') elif aspec.type in scalars: ri.cw.p(f"dst->{aspec.c_name}[i] = ynl_attr_get_{aspec.type}(attr);") + elif aspec.type == 'binary' and 'struct' in aspec: + ri.cw.p('size_t len = ynl_attr_data_len(attr);') + ri.cw.nl() + ri.cw.p(f'if (len > sizeof(dst->{aspec.c_name}[0]))') + ri.cw.p(f'len = sizeof(dst->{aspec.c_name}[0]);') + ri.cw.p(f"memcpy(&dst->{aspec.c_name}[i], ynl_attr_data(attr), len);") elif aspec.type == 'string': ri.cw.p('unsigned int len;') ri.cw.nl() From 18b1886447d619fcd7515dcbda55b1903560e266 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:47:01 -0700 Subject: [PATCH 635/770] tools: ynl-gen:
array-nest: support put for scalar C codegen supports ArrayNest AKA indexed-array carrying scalars, but only for the netlink -> struct parsing. Support rendering from struct to netlink. Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-10-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index a969762d557bf..a4e65da19696b 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -747,6 +747,23 @@ def _attr_get(self, ri, var): '}'] return get_lines, None, local_vars + def attr_put(self, ri, var): + ri.cw.p(f'array = ynl_attr_nest_start(nlh, {self.enum_name});') + if self.sub_type in scalars: + put_type = self.sub_type + ri.cw.block_start(line=f'for (i = 0; i < {var}->n_{self.c_name}; i++)') + ri.cw.p(f"ynl_attr_put_{put_type}(nlh, i, {var}->{self.c_name}[i]);") + ri.cw.block_end() + else: + raise Exception(f"Put for ArrayNest sub-type {self.attr['sub-type']} not supported, yet") + ri.cw.p('ynl_attr_nest_end(nlh, array);') + + def _setter_lines(self, ri, member, presence): + # For multi-attr we have a count, not presence, hack up the presence + presence = presence[:-(len('_present.') + len(self.c_name))] + "n_" + self.c_name + return [f"{member} = {self.c_name};", + f"{presence} = n_{self.c_name};"] + class TypeNestTypeValue(Type): def _complex_member_type(self, ri): @@ -1728,10 +1745,15 @@ def put_req_nested(ri, struct): local_vars.append('struct nlattr *nest;') init_lines.append("nest = ynl_attr_nest_start(nlh, attr_type);") + has_anest = False + has_count = False for _, arg in struct.member_list(): - if arg.presence_type() == 'count': - local_vars.append('unsigned int i;') - break + has_anest |= arg.type == 'indexed-array' + has_count |= arg.presence_type() == 'count' + if has_anest: + local_vars.append('struct nlattr *array;') + if has_count: + local_vars.append('unsigned int i;') put_req_nested_prototype(ri, struct, suffix='') ri.cw.block_start() From 5f7804dd8326d9ffb25553481c33ceaaedb4eb6b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:47:02 -0700 Subject: [PATCH 636/770] tools: ynl-gen: array-nest: support binary array with exact-len IPv6 addresses are expressed as binary arrays since we don't have u128. Since they are not variable length, however, they are relatively easy to represent as an array of known size. 
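For example, an indexed-array attribute with sub-type binary and an exact-len check of 16 (a list of IPv6 addresses) renders, roughly, as the following struct members (member name invented for the example):

	unsigned char (*addrs)[16];	/* n_addrs entries of 16 bytes each */
	unsigned int n_addrs;

Each dst->addrs[i] is then a plain 16-byte buffer which the parse loop can memcpy() into directly.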
Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-11-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index a4e65da19696b..2d185c7ea16c3 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -183,10 +183,10 @@ def arg_member(self, ri): raise Exception(f"Struct member not implemented for class type {self.type}") def struct_member(self, ri): - if self.is_multi_val(): - ri.cw.p(f"unsigned int n_{self.c_name};") member = self._complex_member_type(ri) if member: + if self.is_multi_val(): + ri.cw.p(f"unsigned int n_{self.c_name};") ptr = '*' if self.is_multi_val() else '' if self.is_recursive_for_op(ri): ptr = '*' @@ -728,12 +728,22 @@ def _complex_member_type(self, ri): elif self.attr['sub-type'] in scalars: scalar_pfx = '__' if ri.ku_space == 'user' else '' return scalar_pfx + self.attr['sub-type'] + elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + return None # use arg_member() else: raise Exception(f"Sub-type {self.attr['sub-type']} not supported yet") + def arg_member(self, ri): + if self.sub_type == 'binary' and 'exact-len' in self.checks: + return [f'unsigned char (*{self.c_name})[{self.checks["exact-len"]}]', + f'unsigned int n_{self.c_name}'] + return super().arg_member(ri) + def _attr_typol(self): if self.attr['sub-type'] in scalars: return f'.type = YNL_PT_U{c_upper(self.sub_type[1:])}, ' + elif self.attr['sub-type'] == 'binary' and 'exact-len' in self.checks: + return f'.type = YNL_PT_BINARY, .len = {self.checks["exact-len"]}, ' else: return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' @@ -754,6 +764,9 @@ def attr_put(self, ri, var): ri.cw.block_start(line=f'for (i = 0; i < {var}->n_{self.c_name}; i++)') ri.cw.p(f"ynl_attr_put_{put_type}(nlh, i, {var}->{self.c_name}[i]);") ri.cw.block_end() + elif self.sub_type == 'binary' and 'exact-len' in self.checks: + ri.cw.p(f'for (i = 0; i < {var}->n_{self.c_name}; i++)') + ri.cw.p(f"ynl_attr_put(nlh, i, {var}->{self.c_name}[i], {self.checks['exact-len']});") else: raise Exception(f"Put for ArrayNest sub-type {self.attr['sub-type']} not supported, yet") ri.cw.p('ynl_attr_nest_end(nlh, array);') @@ -964,7 +977,7 @@ def new_attr(self, elem, value): elif elem['type'] == 'nest': t = TypeNest(self.family, self, elem, value) elif elem['type'] == 'indexed-array' and 'sub-type' in elem: - if elem["sub-type"] in ['nest', 'u32']: + if elem["sub-type"] in ['binary', 'nest', 'u32']: t = TypeArrayNest(self.family, self, elem, value) else: raise Exception(f'new_attr: unsupported sub-type {elem["sub-type"]}') @@ -1786,7 +1799,7 @@ def _multi_parse(ri, struct, init_lines, local_vars): needs_parg = False for arg, aspec in struct.member_list(): if aspec['type'] == 'indexed-array' and 'sub-type' in aspec: - if aspec["sub-type"] == 'nest': + if aspec["sub-type"] in {'binary', 'nest'}: local_vars.append(f'const struct nlattr *attr_{aspec.c_name};') array_nests.add(arg) elif aspec['sub-type'] in scalars: @@ -1859,6 +1872,9 @@ def _multi_parse(ri, struct, init_lines, local_vars): ri.cw.p('return YNL_PARSE_CB_ERROR;') elif aspec.sub_type in scalars: ri.cw.p(f"dst->{aspec.c_name}[i] = ynl_attr_get_{aspec.sub_type}(attr);") + elif aspec.sub_type == 'binary' and 'exact-len' in aspec.checks: + # Length is validated by 
typol + ri.cw.p(f'memcpy(dst->{aspec.c_name}[i], ynl_attr_data(attr), {aspec.checks["exact-len"]});') else: raise Exception(f"Nest parsing type not supported in {aspec['name']}") ri.cw.p('i++;') From 18d574c8dd3e075afb546408fae7edd6a14c9873 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:47:03 -0700 Subject: [PATCH 637/770] tools: ynl-gen: don't init enum checks for classic netlink rt-link has a vlan-protocols enum with: name: 8021q value: 33024 name: 8021ad value: 34984 It's nice to have, since it converts the values to strings in Python. For C, however, the codegen is trying to use enums to generate strict policy checks. Parsing such sparse enums is not possible via policies. Since for classic netlink we don't support kernel codegen and policy generation - skip the auto-generation of checks from enums. Reviewed-by: Donald Hunter Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-12-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/pyynl/ynl_gen_c.py | 46 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 2d185c7ea16c3..1dd9580086cd5 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -357,26 +357,10 @@ def __init__(self, family, attr_set, attr, value): if 'byte-order' in attr: self.byte_order_comment = f" /* {attr['byte-order']} */" - if 'enum' in self.attr: - enum = self.family.consts[self.attr['enum']] - low, high = enum.value_range() - if 'min' not in self.checks: - if low != 0 or self.type[0] == 's': - self.checks['min'] = low - if 'max' not in self.checks: - self.checks['max'] = high - - if 'min' in self.checks and 'max' in self.checks: - if self.get_limit('min') > self.get_limit('max'): - raise Exception(f'Invalid limit for "{self.name}" min: {self.get_limit("min")} max: {self.get_limit("max")}') - self.checks['range'] = True - - low = min(self.get_limit('min', 0), self.get_limit('max', 0)) - high = max(self.get_limit('min', 0), self.get_limit('max', 0)) - if low < 0 and self.type[0] == 'u': - raise Exception(f'Invalid limit for "{self.name}" negative limit for unsigned type') - if low < -32768 or high > 32767: - self.checks['full-range'] = True + # Classic families have some funny enums, don't bother + # computing checks, since we only need them for kernel policies + if not family.is_classic(): + self._init_checks() # Added by resolve(): self.is_bitfield = None @@ -401,6 +385,28 @@ def resolve(self): else: self.type_name = '__' + self.type + def _init_checks(self): + if 'enum' in self.attr: + enum = self.family.consts[self.attr['enum']] + low, high = enum.value_range() + if 'min' not in self.checks: + if low != 0 or self.type[0] == 's': + self.checks['min'] = low + if 'max' not in self.checks: + self.checks['max'] = high + + if 'min' in self.checks and 'max' in self.checks: + if self.get_limit('min') > self.get_limit('max'): + raise Exception(f'Invalid limit for "{self.name}" min: {self.get_limit("min")} max: {self.get_limit("max")}') + self.checks['range'] = True + + low = min(self.get_limit('min', 0), self.get_limit('max', 0)) + high = max(self.get_limit('min', 0), self.get_limit('max', 0)) + if low < 0 and self.type[0] == 'u': + raise Exception(f'Invalid limit for "{self.name}" negative limit for unsigned type') + if low < -32768 or high > 32767: + self.checks['full-range'] = True + def _attr_policy(self, policy): if 'flags-mask' in self.checks or 
self.is_bitfield: if self.is_bitfield: From 777c8029b551caafde89e0440c40127b8b5ba70e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 08:47:04 -0700 Subject: [PATCH 638/770] tools: ynl: allow fixed-header to be specified per op rtnetlink has a variety of ops with different fixed headers. Detect when an op's fixed header is not the same as the family's, and use sizeof() directly. For reverse parsing we need to pass the fixed header len along with the policy (in the socket state). Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250429154704.2613851-13-kuba@kernel.org Signed-off-by: Paolo Abeni --- tools/net/ynl/lib/ynl.c | 8 ++++---- tools/net/ynl/lib/ynl.h | 1 + tools/net/ynl/pyynl/ynl_gen_c.py | 17 ++++++++++++++++- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tools/net/ynl/lib/ynl.c b/tools/net/ynl/lib/ynl.c index c16f01372ca32..d263f6f40ad57 100644 --- a/tools/net/ynl/lib/ynl.c +++ b/tools/net/ynl/lib/ynl.c @@ -191,12 +191,12 @@ ynl_ext_ack_check(struct ynl_sock *ys, const struct nlmsghdr *nlh, n = snprintf(bad_attr, sizeof(bad_attr), "%sbad attribute: ", str ? " (" : ""); - start = ynl_nlmsg_data_offset(ys->nlh, ys->family->hdr_len); + start = ynl_nlmsg_data_offset(ys->nlh, ys->req_hdr_len); end = ynl_nlmsg_end_addr(ys->nlh); off = ys->err.attr_offs; off -= sizeof(struct nlmsghdr); - off -= ys->family->hdr_len; + off -= ys->req_hdr_len; n += ynl_err_walk(ys, start, end, off, ys->req_policy, &bad_attr[n], sizeof(bad_attr) - n, NULL); @@ -216,14 +216,14 @@ ynl_ext_ack_check(struct ynl_sock *ys, const struct nlmsghdr *nlh, n = snprintf(miss_attr, sizeof(miss_attr), "%smissing attribute: ", bad_attr[0] ? ", " : (str ? " (" : "")); - start = ynl_nlmsg_data_offset(ys->nlh, ys->family->hdr_len); + start = ynl_nlmsg_data_offset(ys->nlh, ys->req_hdr_len); end = ynl_nlmsg_end_addr(ys->nlh); nest_pol = ys->req_policy; if (tb[NLMSGERR_ATTR_MISS_NEST]) { off = ynl_attr_get_u32(tb[NLMSGERR_ATTR_MISS_NEST]); off -= sizeof(struct nlmsghdr); - off -= ys->family->hdr_len; + off -= ys->req_hdr_len; n += ynl_err_walk(ys, start, end, off, ys->req_policy, &miss_attr[n], sizeof(miss_attr) - n, diff --git a/tools/net/ynl/lib/ynl.h b/tools/net/ynl/lib/ynl.h index 6b8a625aaa5ff..32efeb2248293 100644 --- a/tools/net/ynl/lib/ynl.h +++ b/tools/net/ynl/lib/ynl.h @@ -80,6 +80,7 @@ struct ynl_sock { struct nlmsghdr *nlh; const struct ynl_policy_nest *req_policy; + size_t req_hdr_len; unsigned char *tx_buf; unsigned char *rx_buf; unsigned char raw_buf[]; diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 1dd9580086cd5..09b87c9a69081 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -1311,8 +1311,15 @@ def __init__(self, cw, family, ku_space, op, op_mode, attr_set=None): self.op = op self.fixed_hdr = None + self.fixed_hdr_len = 'ys->family->hdr_len' if op and op.fixed_header: self.fixed_hdr = 'struct ' + c_lower(op.fixed_header) + if op.fixed_header != family.fixed_header: + if family.is_classic(): + self.fixed_hdr_len = f"sizeof({self.fixed_hdr})" + else: + raise Exception(f"Per-op fixed header not supported, yet") + # 'do' and 'dump' response parsing is identical self.type_consistent = True @@ -1799,6 +1806,11 @@ def _multi_parse(ri, struct, init_lines, local_vars): if ri.fixed_hdr: local_vars += ['void *hdr;'] iter_line = "ynl_attr_for_each(attr, nlh, yarg->ys->family->hdr_len)" if ri.op.fixed_header != ri.family.fixed_header: if
ri.family.is_classic(): + iter_line = f"ynl_attr_for_each(attr, nlh, sizeof({ri.fixed_hdr}))" + else: + raise Exception(f"Per-op fixed header not supported, yet") array_nests = set() multi_attrs = set() @@ -2016,6 +2028,7 @@ def print_req(ri): ri.cw.p(f"nlh = ynl_gemsg_start_req(ys, {ri.nl.get_family_id()}, {ri.op.enum_name}, 1);") ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;") + ri.cw.p(f"ys->req_hdr_len = {ri.fixed_hdr_len};") if 'reply' in ri.op[ri.op_mode]: ri.cw.p(f"yrs.yarg.rsp_policy = &{ri.struct['reply'].render_name}_nest;") ri.cw.nl() @@ -2095,6 +2108,7 @@ def print_dump(ri): if "request" in ri.op[ri.op_mode]: ri.cw.p(f"ys->req_policy = &{ri.struct['request'].render_name}_nest;") + ri.cw.p(f"ys->req_hdr_len = {ri.fixed_hdr_len};") ri.cw.nl() for _, attr in ri.struct["request"].member_list(): attr.attr_put(ri, "req") @@ -2914,7 +2928,8 @@ def render_user_family(family, cw, prototype): cw.p(f'.is_classic\t= true,') cw.p(f'.classic_id\t= {family.get("protonum")},') if family.is_classic(): - cw.p(f'.hdr_len\t= sizeof(struct {c_lower(family.fixed_header)}),') + if family.fixed_header: + cw.p(f'.hdr_len\t= sizeof(struct {c_lower(family.fixed_header)}),') elif family.fixed_header: cw.p(f'.hdr_len\t= sizeof(struct genlmsghdr) + sizeof(struct {c_lower(family.fixed_header)}),') else: From 299fe91517e1bf51050c4266ebf77c347b5667bd Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Sun, 27 Apr 2025 15:40:34 +0200 Subject: [PATCH 639/770] Documentation: networking: expand and clarify EEE_GET/EEE_SET documentation Improve the documentation for ETHTOOL_MSG_EEE_GET and ETHTOOL_MSG_EEE_SET to provide accurate descriptions of all netlink attributes involved. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- Documentation/networking/ethtool-netlink.rst | 111 ++++++++++++++++--- include/uapi/linux/ethtool.h | 3 + 2 files changed, 96 insertions(+), 18 deletions(-) diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index b6e9af4d0f1b9..78ee481437a47 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1215,20 +1215,16 @@ Kernel response contents: ===================================== ====== ========================== ``ETHTOOL_A_EEE_HEADER`` nested request header - ``ETHTOOL_A_EEE_MODES_OURS`` bool supported/advertised modes - ``ETHTOOL_A_EEE_MODES_PEER`` bool peer advertised link modes + ``ETHTOOL_A_EEE_MODES_OURS`` bitset supported/advertised modes + ``ETHTOOL_A_EEE_MODES_PEER`` bitset peer advertised link modes ``ETHTOOL_A_EEE_ACTIVE`` bool EEE is actively used ``ETHTOOL_A_EEE_ENABLED`` bool EEE is enabled - ``ETHTOOL_A_EEE_TX_LPI_ENABLED`` bool Tx lpi enabled - ``ETHTOOL_A_EEE_TX_LPI_TIMER`` u32 Tx lpi timeout (in us) + ``ETHTOOL_A_EEE_TX_LPI_ENABLED`` bool Tx LPI enabled + ``ETHTOOL_A_EEE_TX_LPI_TIMER`` u32 Tx LPI timeout (in us) ===================================== ====== ========================== -In ``ETHTOOL_A_EEE_MODES_OURS``, mask consists of link modes for which EEE is -enabled, value of link modes for which EEE is advertised. Link modes for which -peer advertises EEE are listed in ``ETHTOOL_A_EEE_MODES_PEER`` (no mask). The -netlink interface allows reporting EEE status for all link modes but only -first 32 are provided by the ``ethtool_ops`` callback. - +For detailed explanation of each attribute, see the ``EEE Attributes`` +section. 
EEE_SET ======= @@ -1239,17 +1235,96 @@ Request contents: ===================================== ====== ========================== ``ETHTOOL_A_EEE_HEADER`` nested request header - ``ETHTOOL_A_EEE_MODES_OURS`` bool advertised modes + ``ETHTOOL_A_EEE_MODES_OURS`` bitset advertised modes ``ETHTOOL_A_EEE_ENABLED`` bool EEE is enabled - ``ETHTOOL_A_EEE_TX_LPI_ENABLED`` bool Tx lpi enabled - ``ETHTOOL_A_EEE_TX_LPI_TIMER`` u32 Tx lpi timeout (in us) + ``ETHTOOL_A_EEE_TX_LPI_ENABLED`` bool Tx LPI enabled + ``ETHTOOL_A_EEE_TX_LPI_TIMER`` u32 Tx LPI timeout (in us) ===================================== ====== ========================== -``ETHTOOL_A_EEE_MODES_OURS`` is used to either list link modes to advertise -EEE for (if there is no mask) or specify changes to the list (if there is -a mask). The netlink interface allows reporting EEE status for all link modes -but only first 32 can be set at the moment as that is what the ``ethtool_ops`` -callback supports. +For detailed explanation of each attribute, see the ``EEE Attributes`` +section. + +EEE Attributes +============== + +Limitations: + +The netlink interface allows configuring all link modes up to +``__ETHTOOL_LINK_MODE_MASK_NBITS``, but if the driver relies on legacy +``ethtool_ops``, only the first 32 link modes are supported. + +The following structure is used for the ioctl interface (``ETHTOOL_GEEE`` and +``ETHTOOL_SEEE``): + +.. kernel-doc:: include/uapi/linux/ethtool.h + :identifiers: ethtool_eee + +Mapping between netlink attributes and struct fields: + + ================================ ================================ + Netlink attribute struct ethtool_eee field + ================================ ================================ + ``ETHTOOL_A_EEE_MODES_OURS`` advertised + ``ETHTOOL_A_EEE_MODES_PEER`` lp_advertised + ``ETHTOOL_A_EEE_ACTIVE`` eee_active + ``ETHTOOL_A_EEE_ENABLED`` eee_enabled + ``ETHTOOL_A_EEE_TX_LPI_ENABLED`` tx_lpi_enabled + ``ETHTOOL_A_EEE_TX_LPI_TIMER`` tx_lpi_timer + ================================ ================================ + + +``ETHTOOL_A_EEE_MODES_OURS`` (bitset) +------------------------------------- +- Value: link modes that the driver intends to advertise for EEE. +- Mask: subset of link modes supported for EEE by the interface. + +The advertised EEE capabilities are maintained in software state and persist +across toggling EEE on or off. If ``ETHTOOL_A_EEE_ENABLED`` is false, the PHY +does not advertise EEE, but the configured value is reported. + +``ETHTOOL_A_EEE_MODES_PEER`` (bitset) +------------------------------------- +- Value: link modes that the link partner advertises for EEE. +- Mask: empty. + +This value is typically reported by the hardware and may represent only a +subset of the actual capabilities supported and advertised by the link partner. +The local hardware may not be able to detect or represent all EEE-capable modes +of the peer. + +``ETHTOOL_A_EEE_ACTIVE`` (bool) +------------------------------- +Indicates whether EEE is currently active on the link. EEE is considered active +if: + + - ``ETHTOOL_A_EEE_ENABLED`` is true, + - Autonegotiation is enabled, + - The current link mode is EEE-capable, + - Both the local advertisement and the peer advertisement include this link + mode. + +``ETHTOOL_A_EEE_ENABLED`` (bool) +-------------------------------- +A software-controlled flag. + +When ``ETHTOOL_A_EEE_ENABLED`` is set to true and autonegotiation is active, +the kernel programs the EEE advertisement settings into the PHY hardware +registers. 
This enables negotiation of EEE capability with the link partner. + +When ``ETHTOOL_A_EEE_ENABLED`` is set to false, EEE advertisement is disabled. +The PHY will not include EEE capability in its autonegotiation pages, and EEE +will not be negotiated even if it remains configured in software state. + +``ETHTOOL_A_EEE_TX_LPI_ENABLED`` (bool) +--------------------------------------- +Controls whether the system may enter the Low Power Idle (LPI) state after +transmission has stopped. + +``ETHTOOL_A_EEE_TX_LPI_TIMER`` (u32) +------------------------------------ +Defines the delay in microseconds after the last transmitted frame before the +MAC may enter the Low Power Idle (LPI) state. This value applies globally to +all link modes. A higher timer value delays LPI entry. TSINFO_GET diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 84833cca29fec..c596618633bcb 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -366,6 +366,9 @@ struct ethtool_eeprom { * its tx lpi (after reaching 'idle' state). Effective only when eee * was negotiated and tx_lpi_enabled was set. * @reserved: Reserved for future use; see the note on reserved space. + * + * More detailed documentation can be found in + * Documentation/networking/ethtool-netlink.rst section "EEE Attributes". */ struct ethtool_eee { __u32 cmd; From 2f203f462b90235d23365989fec8089099aa18c8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:31 +0200 Subject: [PATCH 640/770] net: usb: lan78xx: Improve error handling in PHY initialization Ensure that return values from `lan78xx_write_reg()`, `lan78xx_read_reg()`, and `phy_find_first()` are properly checked and propagated. Use `ERR_PTR(ret)` for error reporting in `lan7801_phy_init()` and replace `-EIO` with `-ENODEV` where appropriate to provide more accurate error codes. 
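The idiom, as a minimal sketch (helper name made up for illustration):

	static struct phy_device *example_get_phy(struct lan78xx_net *dev)
	{
		struct phy_device *phydev = phy_find_first(dev->mdiobus);

		if (!phydev)
			return ERR_PTR(-ENODEV);	/* nothing on the MDIO bus */
		return phydev;
	}

	/* at the call site */
	phydev = example_get_phy(dev);
	if (IS_ERR(phydev))
		return PTR_ERR(phydev);

Encoding the negative errno in the returned pointer lets callers report why PHY setup failed instead of collapsing every failure into NULL.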
Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 47 ++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index e4f1663b62047..19db18cf0504d 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2510,14 +2510,13 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) { - u32 buf; - int ret; struct fixed_phy_status fphy_status = { .link = 1, .speed = SPEED_1000, .duplex = DUPLEX_FULL, }; struct phy_device *phydev; + int ret; phydev = phy_find_first(dev->mdiobus); if (!phydev) { @@ -2525,30 +2524,40 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); if (IS_ERR(phydev)) { netdev_err(dev->net, "No PHY/fixed_PHY found\n"); - return NULL; + return ERR_PTR(-ENODEV); } netdev_dbg(dev->net, "Registered FIXED PHY\n"); dev->interface = PHY_INTERFACE_MODE_RGMII; ret = lan78xx_write_reg(dev, MAC_RGMII_ID, MAC_RGMII_ID_TXC_DELAY_EN_); + if (ret < 0) + return ERR_PTR(ret); + ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00); - ret = lan78xx_read_reg(dev, HW_CFG, &buf); - buf |= HW_CFG_CLK125_EN_; - buf |= HW_CFG_REFCLK25_EN_; - ret = lan78xx_write_reg(dev, HW_CFG, buf); + if (ret < 0) + return ERR_PTR(ret); + + ret = lan78xx_update_reg(dev, HW_CFG, HW_CFG_CLK125_EN_ | + HW_CFG_REFCLK25_EN_, + HW_CFG_CLK125_EN_ | HW_CFG_REFCLK25_EN_); + if (ret < 0) + return ERR_PTR(ret); } else { if (!phydev->drv) { netdev_err(dev->net, "no PHY driver found\n"); - return NULL; + return ERR_PTR(-EINVAL); } dev->interface = PHY_INTERFACE_MODE_RGMII_ID; /* The PHY driver is responsible to configure proper RGMII * interface delays. Disable RGMII delays on MAC side. 
*/ - lan78xx_write_reg(dev, MAC_RGMII_ID, 0); + ret = lan78xx_write_reg(dev, MAC_RGMII_ID, 0); + if (ret < 0) + return ERR_PTR(ret); phydev->is_internal = false; } + return phydev; } @@ -2562,9 +2571,10 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) switch (dev->chipid) { case ID_REV_CHIP_ID_7801_: phydev = lan7801_phy_init(dev); - if (!phydev) { - netdev_err(dev->net, "lan7801: PHY Init Failed"); - return -EIO; + if (IS_ERR(phydev)) { + netdev_err(dev->net, "lan7801: failed to init PHY: %pe\n", + phydev); + return PTR_ERR(phydev); } break; @@ -2573,7 +2583,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) phydev = phy_find_first(dev->mdiobus); if (!phydev) { netdev_err(dev->net, "no PHY found\n"); - return -EIO; + return -ENODEV; } phydev->is_internal = true; dev->interface = PHY_INTERFACE_MODE_GMII; @@ -2581,7 +2591,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) default: netdev_err(dev->net, "Unknown CHIP ID found\n"); - return -EIO; + return -ENODEV; } /* if phyirq is not set, use polling mode in phylib */ @@ -2633,7 +2643,10 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) sizeof(u32)); if (len >= 0) { /* Ensure the appropriate LEDs are enabled */ - lan78xx_read_reg(dev, HW_CFG, ®); + ret = lan78xx_read_reg(dev, HW_CFG, ®); + if (ret < 0) + return ret; + reg &= ~(HW_CFG_LED0_EN_ | HW_CFG_LED1_EN_ | HW_CFG_LED2_EN_ | @@ -2642,7 +2655,9 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) (len > 1) * HW_CFG_LED1_EN_ | (len > 2) * HW_CFG_LED2_EN_ | (len > 3) * HW_CFG_LED3_EN_; - lan78xx_write_reg(dev, HW_CFG, reg); + ret = lan78xx_write_reg(dev, HW_CFG, reg); + if (ret < 0) + return ret; } } From c237ad89956656d27ff5af5005dc63caf43beb39 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:32 +0200 Subject: [PATCH 641/770] net: usb: lan78xx: remove explicit check for missing PHY driver RGMII timing correctness relies on the PHY providing internal delays. This is typically ensured via PHY driver, strap pins, or PCB layout. Explicitly checking for a PHY driver here is unnecessary and non-standard. This logic applies to all MACs, not just LAN78xx, and should be left to phylib, phylink, or platform configuration. Drop the check and rely on standard subsystem behavior. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 19db18cf0504d..9c0658227bde5 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2543,10 +2543,6 @@ static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) if (ret < 0) return ERR_PTR(ret); } else { - if (!phydev->drv) { - netdev_err(dev->net, "no PHY driver found\n"); - return ERR_PTR(-EINVAL); - } dev->interface = PHY_INTERFACE_MODE_RGMII_ID; /* The PHY driver is responsible to configure proper RGMII * interface delays. Disable RGMII delays on MAC side. From b0dbc187da0cab2c3d3f87c74b9a8a0f7ba7511d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:33 +0200 Subject: [PATCH 642/770] net: usb: lan78xx: refactor PHY init to separate detection and MAC configuration Split out PHY detection into lan78xx_get_phy() and MAC-side setup into lan78xx_mac_prepare_for_phy(), making the main lan78xx_phy_init() cleaner and easier to follow. This improves separation of concerns and prepares the code for a future transition to phylink. 
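The resulting init path, in rough sketch form (helper names as introduced below):

	lan78xx_phy_init()
	  -> lan78xx_get_phy()               /* probe MDIO bus, set dev->interface */
	  -> lan78xx_mac_prepare_for_phy()   /* program MAC-side RGMII delay regs */
	  -> existing IRQ, advertisement and LED setup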
Fixed PHY registration and interface selection are now handled in lan78xx_get_phy(), while MAC-side delay configuration is done in lan78xx_mac_prepare_for_phy(). The fixed PHY fallback is preserved for setups like EVB-KSZ9897-1, where LAN7801 connects directly to a KSZ switch without a standard PHY or device tree support. No functional changes intended. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 174 ++++++++++++++++++++++++++++---------- 1 file changed, 128 insertions(+), 46 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 9c0658227bde5..7f1ecc415d53d 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2508,53 +2508,145 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } -static struct phy_device *lan7801_phy_init(struct lan78xx_net *dev) +/** + * lan78xx_register_fixed_phy() - Register a fallback fixed PHY + * @dev: LAN78xx device + * + * Registers a fixed PHY with 1 Gbps full duplex. This is used in special cases + * like EVB-KSZ9897-1, where LAN7801 acts as a USB-to-Ethernet interface to a + * switch without a visible PHY. + * + * Return: pointer to the registered fixed PHY, or ERR_PTR() on error. + */ +static struct phy_device *lan78xx_register_fixed_phy(struct lan78xx_net *dev) { struct fixed_phy_status fphy_status = { .link = 1, .speed = SPEED_1000, .duplex = DUPLEX_FULL, }; + + netdev_info(dev->net, + "No PHY found on LAN7801 – registering fixed PHY (e.g. EVB-KSZ9897-1)\n"); + + return fixed_phy_register(PHY_POLL, &fphy_status, NULL); +} + +/** + * lan78xx_get_phy() - Probe or register PHY device and set interface mode + * @dev: LAN78xx device structure + * + * This function attempts to find a PHY on the MDIO bus. If no PHY is found + * and the chip is LAN7801, it registers a fixed PHY as fallback. It also + * sets dev->interface based on chip ID and detected PHY type. + * + * Return: a valid PHY device pointer, or ERR_PTR() on failure. + */ +static struct phy_device *lan78xx_get_phy(struct lan78xx_net *dev) +{ struct phy_device *phydev; - int ret; + /* Attempt to locate a PHY on the MDIO bus */ phydev = phy_find_first(dev->mdiobus); - if (!phydev) { - netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n"); - phydev = fixed_phy_register(PHY_POLL, &fphy_status, NULL); - if (IS_ERR(phydev)) { - netdev_err(dev->net, "No PHY/fixed_PHY found\n"); - return ERR_PTR(-ENODEV); + + switch (dev->chipid) { + case ID_REV_CHIP_ID_7801_: + if (phydev) { + /* External RGMII PHY detected */ + dev->interface = PHY_INTERFACE_MODE_RGMII_ID; + phydev->is_internal = false; + + if (!phydev->drv) + netdev_warn(dev->net, + "PHY driver not found – assuming RGMII delays are on PCB or strapped for the PHY\n"); + + return phydev; } - netdev_dbg(dev->net, "Registered FIXED PHY\n"); + dev->interface = PHY_INTERFACE_MODE_RGMII; + /* No PHY found – fallback to fixed PHY (e.g. 
KSZ switch board) */ + return lan78xx_register_fixed_phy(dev); + + case ID_REV_CHIP_ID_7800_: + case ID_REV_CHIP_ID_7850_: + if (!phydev) + return ERR_PTR(-ENODEV); + + /* These use internal GMII-connected PHY */ + dev->interface = PHY_INTERFACE_MODE_GMII; + phydev->is_internal = true; + return phydev; + + default: + netdev_err(dev->net, "Unknown CHIP ID: 0x%08x\n", dev->chipid); + return ERR_PTR(-ENODEV); + } +} + +/** + * lan78xx_mac_prepare_for_phy() - Preconfigure MAC-side interface settings + * @dev: LAN78xx device + * + * Configure MAC-side registers according to dev->interface, which should be + * set by lan78xx_get_phy(). + * + * - For PHY_INTERFACE_MODE_RGMII: + * Enable MAC-side TXC delay. This mode seems to be used in a special setup + * without a real PHY, likely on EVB-KSZ9897-1. In that design, LAN7801 is + * connected to the KSZ9897 switch, and the link timing is expected to be + * hardwired (e.g. via strapping or board layout). No devicetree support is + * assumed here. + * + * - For PHY_INTERFACE_MODE_RGMII_ID: + * Disable MAC-side delay and rely on the PHY driver to provide delay. + * + * - For GMII, no MAC-specific config is needed. + * + * Return: 0 on success or a negative error code. + */ +static int lan78xx_mac_prepare_for_phy(struct lan78xx_net *dev) +{ + int ret; + + switch (dev->interface) { + case PHY_INTERFACE_MODE_RGMII: + /* Enable MAC-side TX clock delay */ ret = lan78xx_write_reg(dev, MAC_RGMII_ID, MAC_RGMII_ID_TXC_DELAY_EN_); if (ret < 0) - return ERR_PTR(ret); + return ret; ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00); if (ret < 0) - return ERR_PTR(ret); + return ret; - ret = lan78xx_update_reg(dev, HW_CFG, HW_CFG_CLK125_EN_ | - HW_CFG_REFCLK25_EN_, + ret = lan78xx_update_reg(dev, HW_CFG, + HW_CFG_CLK125_EN_ | HW_CFG_REFCLK25_EN_, HW_CFG_CLK125_EN_ | HW_CFG_REFCLK25_EN_); if (ret < 0) - return ERR_PTR(ret); - } else { - dev->interface = PHY_INTERFACE_MODE_RGMII_ID; - /* The PHY driver is responsible to configure proper RGMII - * interface delays. Disable RGMII delays on MAC side. 
- */ + return ret; + + break; + + case PHY_INTERFACE_MODE_RGMII_ID: + /* Disable MAC-side TXC delay, PHY provides it */ ret = lan78xx_write_reg(dev, MAC_RGMII_ID, 0); if (ret < 0) - return ERR_PTR(ret); + return ret; - phydev->is_internal = false; + break; + + case PHY_INTERFACE_MODE_GMII: + /* No MAC-specific configuration required */ + break; + + default: + netdev_warn(dev->net, "Unsupported interface mode: %d\n", + dev->interface); + break; } - return phydev; + return 0; } static int lan78xx_phy_init(struct lan78xx_net *dev) @@ -2564,31 +2656,13 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) u32 mii_adv; struct phy_device *phydev; - switch (dev->chipid) { - case ID_REV_CHIP_ID_7801_: - phydev = lan7801_phy_init(dev); - if (IS_ERR(phydev)) { - netdev_err(dev->net, "lan7801: failed to init PHY: %pe\n", - phydev); - return PTR_ERR(phydev); - } - break; - - case ID_REV_CHIP_ID_7800_: - case ID_REV_CHIP_ID_7850_: - phydev = phy_find_first(dev->mdiobus); - if (!phydev) { - netdev_err(dev->net, "no PHY found\n"); - return -ENODEV; - } - phydev->is_internal = true; - dev->interface = PHY_INTERFACE_MODE_GMII; - break; + phydev = lan78xx_get_phy(dev); + if (IS_ERR(phydev)) + return PTR_ERR(phydev); - default: - netdev_err(dev->net, "Unknown CHIP ID found\n"); - return -ENODEV; - } + ret = lan78xx_mac_prepare_for_phy(dev); + if (ret < 0) + goto free_phy; /* if phyirq is not set, use polling mode in phylib */ if (dev->domain_data.phyirq > 0) @@ -2662,6 +2736,14 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) dev->fc_autoneg = phydev->autoneg; return 0; + +free_phy: + if (phy_is_pseudo_fixed_link(phydev)) { + fixed_phy_unregister(phydev); + phy_device_free(phydev); + } + + return ret; } static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size) From a6b06a0049c0e785edebd5c3573d227c687932aa Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:34 +0200 Subject: [PATCH 643/770] net: usb: lan78xx: move LED DT configuration to helper Extract the LED enable logic based on the "microchip,led-modes" property into a new helper function lan78xx_configure_leds_from_dt(). This simplifies lan78xx_phy_init() and improves modularity. No functional changes intended. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 72 +++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 7f1ecc415d53d..07530eef82cb6 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2649,6 +2649,49 @@ static int lan78xx_mac_prepare_for_phy(struct lan78xx_net *dev) return 0; } +/** + * lan78xx_configure_leds_from_dt() - Configure LED enables based on DT + * @dev: LAN78xx device + * @phydev: PHY device (must be valid) + * + * Reads "microchip,led-modes" property from the PHY's DT node and enables + * the corresponding number of LEDs by writing to HW_CFG. + * + * This helper preserves the original logic, enabling up to 4 LEDs. + * If the property is not present, this function does nothing. + * + * Return: 0 on success or a negative error code. 
+ */ +static int lan78xx_configure_leds_from_dt(struct lan78xx_net *dev, + struct phy_device *phydev) +{ + struct device_node *np = phydev->mdio.dev.of_node; + u32 reg; + int len, ret; + + if (!np) + return 0; + + len = of_property_count_elems_of_size(np, "microchip,led-modes", + sizeof(u32)); + if (len < 0) + return 0; + + ret = lan78xx_read_reg(dev, HW_CFG, ®); + if (ret < 0) + return ret; + + reg &= ~(HW_CFG_LED0_EN_ | HW_CFG_LED1_EN_ | + HW_CFG_LED2_EN_ | HW_CFG_LED3_EN_); + + reg |= (len > 0) * HW_CFG_LED0_EN_ | + (len > 1) * HW_CFG_LED1_EN_ | + (len > 2) * HW_CFG_LED2_EN_ | + (len > 3) * HW_CFG_LED3_EN_; + + return lan78xx_write_reg(dev, HW_CFG, reg); +} + static int lan78xx_phy_init(struct lan78xx_net *dev) { __ETHTOOL_DECLARE_LINK_MODE_MASK(fc) = { 0, }; @@ -2704,32 +2747,9 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) phy_support_eee(phydev); - if (phydev->mdio.dev.of_node) { - u32 reg; - int len; - - len = of_property_count_elems_of_size(phydev->mdio.dev.of_node, - "microchip,led-modes", - sizeof(u32)); - if (len >= 0) { - /* Ensure the appropriate LEDs are enabled */ - ret = lan78xx_read_reg(dev, HW_CFG, ®); - if (ret < 0) - return ret; - - reg &= ~(HW_CFG_LED0_EN_ | - HW_CFG_LED1_EN_ | - HW_CFG_LED2_EN_ | - HW_CFG_LED3_EN_); - reg |= (len > 0) * HW_CFG_LED0_EN_ | - (len > 1) * HW_CFG_LED1_EN_ | - (len > 2) * HW_CFG_LED2_EN_ | - (len > 3) * HW_CFG_LED3_EN_; - ret = lan78xx_write_reg(dev, HW_CFG, reg); - if (ret < 0) - return ret; - } - } + ret = lan78xx_configure_leds_from_dt(dev, phydev); + if (ret) + goto free_phy; genphy_config_aneg(phydev); From ecf472a7e2bbdb9f1c6c6cad57f63266a550f2e0 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:35 +0200 Subject: [PATCH 644/770] net: usb: lan78xx: Extract PHY interrupt acknowledgment to helper Move the PHY interrupt acknowledgment logic from lan78xx_link_reset() to a new helper function lan78xx_phy_int_ack(). This simplifies the code and prepares for reusing the acknowledgment logic independently from the full link reset process, such as when using phylink. No functional change intended. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 07530eef82cb6..de2b429e906e8 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1636,6 +1636,20 @@ static int lan78xx_mac_reset(struct lan78xx_net *dev) return ret; } +/** + * lan78xx_phy_int_ack - Acknowledge PHY interrupt + * @dev: pointer to the LAN78xx device structure + * + * This function acknowledges the PHY interrupt by setting the + * INT_STS_PHY_INT_ bit in the interrupt status register (INT_STS). + * + * Return: 0 on success or a negative error code on failure. 
+ */ +static int lan78xx_phy_int_ack(struct lan78xx_net *dev) +{ + return lan78xx_write_reg(dev, INT_STS, INT_STS_PHY_INT_); +} + static int lan78xx_link_reset(struct lan78xx_net *dev) { struct phy_device *phydev = dev->net->phydev; @@ -1644,7 +1658,7 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) u32 buf; /* clear LAN78xx interrupt status */ - ret = lan78xx_write_reg(dev, INT_STS, INT_STS_PHY_INT_); + ret = lan78xx_phy_int_ack(dev); if (unlikely(ret < 0)) return ret; From 7877f3212f82238376da1cbab938dd8a13750b1c Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:36 +0200 Subject: [PATCH 645/770] net: usb: lan78xx: Refactor USB link power configuration into helper Move the USB link power configuration logic from lan78xx_link_reset() to a new helper function lan78xx_configure_usb(). This simplifies the main link reset path and isolates USB-specific logic. The new function handles U1/U2 enablement based on Ethernet link speed, but only for SuperSpeed-capable devices (LAN7800 and LAN7801). LAN7850, a High-Speed-only device, is explicitly excluded. A warning is logged if SuperSpeed is reported unexpectedly for LAN7850. Add a forward declaration for lan78xx_configure_usb() as preparation for the upcoming phylink conversion, where it will also be used from the mac_link_up() callback. Open questions remain: - Why is the 1000 Mbps configuration split into two steps (U2 disable, then U1 enable), unlike the single-step config used for 10/100 Mbps? - U1/U2 behavior appears to depend on proper EEPROM configuration. There are known devices in the field without EEPROM. Should the driver enforce safe defaults in such cases? Due to lack of USB subsystem expertise, no changes were made to this logic beyond structural refactoring. Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 90 +++++++++++++++++++++++++-------------- 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index de2b429e906e8..bff53324c70a2 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1650,12 +1650,13 @@ static int lan78xx_phy_int_ack(struct lan78xx_net *dev) return lan78xx_write_reg(dev, INT_STS, INT_STS_PHY_INT_); } +static int lan78xx_configure_usb(struct lan78xx_net *dev, int speed); + static int lan78xx_link_reset(struct lan78xx_net *dev) { struct phy_device *phydev = dev->net->phydev; struct ethtool_link_ksettings ecmd; int ladv, radv, ret, link; - u32 buf; /* clear LAN78xx interrupt status */ ret = lan78xx_phy_int_ack(dev); @@ -1681,36 +1682,9 @@ static int lan78xx_link_reset(struct lan78xx_net *dev) phy_ethtool_ksettings_get(phydev, &ecmd); - if (dev->udev->speed == USB_SPEED_SUPER) { - if (ecmd.base.speed == 1000) { - /* disable U2 */ - ret = lan78xx_read_reg(dev, USB_CFG1, &buf); - if (ret < 0) - return ret; - buf &= ~USB_CFG1_DEV_U2_INIT_EN_; - ret = lan78xx_write_reg(dev, USB_CFG1, buf); - if (ret < 0) - return ret; - /* enable U1 */ - ret = lan78xx_read_reg(dev, USB_CFG1, &buf); - if (ret < 0) - return ret; - buf |= USB_CFG1_DEV_U1_INIT_EN_; - ret = lan78xx_write_reg(dev, USB_CFG1, buf); - if (ret < 0) - return ret; - } else { - /* enable U1 & U2 */ - ret = lan78xx_read_reg(dev, USB_CFG1, &buf); - if (ret < 0) - return ret; - buf |= USB_CFG1_DEV_U2_INIT_EN_; - buf |= USB_CFG1_DEV_U1_INIT_EN_; - ret = lan78xx_write_reg(dev, USB_CFG1, buf); - if (ret < 0) - return ret; - } - } + ret = lan78xx_configure_usb(dev, ecmd.base.speed); + if (ret < 0) + return ret; ladv = 
phy_read(phydev, MII_ADVERTISE); if (ladv < 0) @@ -2522,6 +2496,60 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } +/** + * lan78xx_configure_usb - Configure USB link power settings + * @dev: pointer to the LAN78xx device structure + * @speed: negotiated Ethernet link speed (in Mbps) + * + * This function configures U1/U2 link power management for SuperSpeed + * USB devices based on the current Ethernet link speed. It uses the + * USB_CFG1 register to enable or disable U1 and U2 low-power states. + * + * Note: Only LAN7800 and LAN7801 support SuperSpeed (USB 3.x). + * LAN7850 is a High-Speed-only (USB 2.0) device and is skipped. + * + * Return: 0 on success or a negative error code on failure. + */ +static int lan78xx_configure_usb(struct lan78xx_net *dev, int speed) +{ + u32 mask, val; + int ret; + + /* Only configure USB settings for SuperSpeed devices */ + if (dev->udev->speed != USB_SPEED_SUPER) + return 0; + + /* LAN7850 does not support USB 3.x */ + if (dev->chipid == ID_REV_CHIP_ID_7850_) { + netdev_warn_once(dev->net, "Unexpected SuperSpeed for LAN7850 (USB 2.0 only)\n"); + return 0; + } + + switch (speed) { + case SPEED_1000: + /* Disable U2, enable U1 */ + ret = lan78xx_update_reg(dev, USB_CFG1, + USB_CFG1_DEV_U2_INIT_EN_, 0); + if (ret < 0) + return ret; + + return lan78xx_update_reg(dev, USB_CFG1, + USB_CFG1_DEV_U1_INIT_EN_, + USB_CFG1_DEV_U1_INIT_EN_); + + case SPEED_100: + case SPEED_10: + /* Enable both U1 and U2 */ + mask = USB_CFG1_DEV_U1_INIT_EN_ | USB_CFG1_DEV_U2_INIT_EN_; + val = mask; + return lan78xx_update_reg(dev, USB_CFG1, mask, val); + + default: + netdev_warn(dev->net, "Unsupported link speed: %d\n", speed); + return -EINVAL; + } +} + /** * lan78xx_register_fixed_phy() - Register a fallback fixed PHY * @dev: LAN78xx device From de0a4d875279124c97e3f4fc468517bb33b877c0 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:37 +0200 Subject: [PATCH 646/770] net: usb: lan78xx: Extract flow control configuration to helper Move flow control register configuration from lan78xx_update_flowcontrol() into a new helper function lan78xx_configure_flowcontrol(). This separates hardware-specific programming from policy logic and simplifies the upcoming phylink integration. The values used in this initial version of lan78xx_configure_flowcontrol() are taken over as-is from the original implementation to avoid functional changes. While they may not be optimal for all USB and link speed combinations, they are known to work reliably. Optimization of pause time and thresholds based on runtime conditions can be done in a separate follow-up patch. The forward declaration of lan78xx_configure_flowcontrol() will also be removed later during the phylink conversion. 
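As a worked example of the encoding (values from the existing FLOW_ON_SS/FLOW_OFF_SS defines, shown purely for illustration), enabling both directions on a SuperSpeed link gives:

	flow     = FLOW_CR_TX_FCEN_ | FLOW_CR_RX_FCEN_ | 65535;
	fct_flow = FLOW_CTRL_THRESHOLD(FLOW_ON_SS, FLOW_OFF_SS);
			/* ON: 9216 / 512 = 18 units, OFF: 4096 / 512 = 8 units */

and the maximum pause request at 1 Gbps works out to 65535 quanta * 512 ns, roughly 33.6 ms.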
Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 105 +++++++++++++++++++++++++++++++------- 1 file changed, 87 insertions(+), 18 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index bff53324c70a2..58e3589e3b891 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1554,10 +1554,12 @@ static void lan78xx_set_multicast(struct net_device *netdev) schedule_work(&pdata->set_multicast); } +static int lan78xx_configure_flowcontrol(struct lan78xx_net *dev, + bool tx_pause, bool rx_pause); + static int lan78xx_update_flowcontrol(struct lan78xx_net *dev, u8 duplex, u16 lcladv, u16 rmtadv) { - u32 flow = 0, fct_flow = 0; u8 cap; if (dev->fc_autoneg) @@ -1565,27 +1567,13 @@ static int lan78xx_update_flowcontrol(struct lan78xx_net *dev, u8 duplex, else cap = dev->fc_request_control; - if (cap & FLOW_CTRL_TX) - flow |= (FLOW_CR_TX_FCEN_ | 0xFFFF); - - if (cap & FLOW_CTRL_RX) - flow |= FLOW_CR_RX_FCEN_; - - if (dev->udev->speed == USB_SPEED_SUPER) - fct_flow = FLOW_CTRL_THRESHOLD(FLOW_ON_SS, FLOW_OFF_SS); - else if (dev->udev->speed == USB_SPEED_HIGH) - fct_flow = FLOW_CTRL_THRESHOLD(FLOW_ON_HS, FLOW_OFF_HS); - netif_dbg(dev, link, dev->net, "rx pause %s, tx pause %s", (cap & FLOW_CTRL_RX ? "enabled" : "disabled"), (cap & FLOW_CTRL_TX ? "enabled" : "disabled")); - lan78xx_write_reg(dev, FCT_FLOW, fct_flow); - - /* threshold value should be set before enabling flow */ - lan78xx_write_reg(dev, FLOW, flow); - - return 0; + return lan78xx_configure_flowcontrol(dev, + cap & FLOW_CTRL_TX, + cap & FLOW_CTRL_RX); } static void lan78xx_rx_urb_submit_all(struct lan78xx_net *dev); @@ -2550,6 +2538,87 @@ static int lan78xx_configure_usb(struct lan78xx_net *dev, int speed) } } +/** + * lan78xx_configure_flowcontrol - Set MAC and FIFO flow control configuration + * @dev: pointer to the LAN78xx device structure + * @tx_pause: enable transmission of pause frames + * @rx_pause: enable reception of pause frames + * + * This function configures the LAN78xx flow control settings by writing + * to the FLOW and FCT_FLOW registers. The pause time is set to the + * maximum allowed value (65535 quanta). FIFO thresholds are selected + * based on USB speed. + * + * The Pause Time field is measured in units of 512-bit times (quanta): + * - At 1 Gbps: 1 quanta = 512 ns → max ~33.6 ms pause + * - At 100 Mbps: 1 quanta = 5.12 µs → max ~335 ms pause + * - At 10 Mbps: 1 quanta = 51.2 µs → max ~3.3 s pause + * + * Flow control thresholds (FCT_FLOW) are used to trigger pause/resume: + * - RXUSED is the number of bytes used in the RX FIFO + * - Flow is turned ON when RXUSED ≥ FLOW_ON threshold + * - Flow is turned OFF when RXUSED ≤ FLOW_OFF threshold + * - Both thresholds are encoded in units of 512 bytes (rounded up) + * + * Thresholds differ by USB speed because available USB bandwidth + * affects how fast packets can be drained from the RX FIFO: + * - USB 3.x (SuperSpeed): + * FLOW_ON = 9216 bytes → 18 units + * FLOW_OFF = 4096 bytes → 8 units + * - USB 2.0 (High-Speed): + * FLOW_ON = 8704 bytes → 17 units + * FLOW_OFF = 1024 bytes → 2 units + * + * Note: The FCT_FLOW register must be configured before enabling TX pause + * (i.e., before setting FLOW_CR_TX_FCEN_), as required by the hardware. + * + * Return: 0 on success or a negative error code on failure. 
+ */ +static int lan78xx_configure_flowcontrol(struct lan78xx_net *dev, + bool tx_pause, bool rx_pause) +{ + /* Use maximum pause time: 65535 quanta (512-bit times) */ + const u32 pause_time_quanta = 65535; + u32 fct_flow = 0; + u32 flow = 0; + int ret; + + /* Prepare MAC flow control bits */ + if (tx_pause) + flow |= FLOW_CR_TX_FCEN_ | pause_time_quanta; + + if (rx_pause) + flow |= FLOW_CR_RX_FCEN_; + + /* Select RX FIFO thresholds based on USB speed + * + * FCT_FLOW layout: + * bits [6:0] FLOW_ON threshold (RXUSED ≥ ON → assert pause) + * bits [14:8] FLOW_OFF threshold (RXUSED ≤ OFF → deassert pause) + * thresholds are expressed in units of 512 bytes + */ + switch (dev->udev->speed) { + case USB_SPEED_SUPER: + fct_flow = FLOW_CTRL_THRESHOLD(FLOW_ON_SS, FLOW_OFF_SS); + break; + case USB_SPEED_HIGH: + fct_flow = FLOW_CTRL_THRESHOLD(FLOW_ON_HS, FLOW_OFF_HS); + break; + default: + netdev_warn(dev->net, "Unsupported USB speed: %d\n", + dev->udev->speed); + return -EINVAL; + } + + /* Step 1: Write FIFO thresholds before enabling pause frames */ + ret = lan78xx_write_reg(dev, FCT_FLOW, fct_flow); + if (ret < 0) + return ret; + + /* Step 2: Enable MAC pause functionality */ + return lan78xx_write_reg(dev, FLOW, flow); +} + /** * lan78xx_register_fixed_phy() - Register a fallback fixed PHY * @dev: LAN78xx device From c44c87841552c26d9ddafef9354354c1a256621d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:38 +0200 Subject: [PATCH 647/770] net: usb: lan78xx: Convert to PHYLINK for improved PHY and MAC management Convert the LAN78xx USB Ethernet driver to use the PHYLINK framework for managing PHY and MAC interactions. This improves consistency with other network drivers, simplifies pause frame handling, and enables cleaner suspend/resume support. Key changes: - Replace all PHYLIB-based logic with PHYLINK equivalents: - Replace phy_connect()/phy_disconnect() with phylink_connect_phy() - Replace phy_start()/phy_stop() with phylink_start()/phylink_stop() - Replace pauseparam handling with phylink_ethtool_get/set_pauseparam() - Introduce lan78xx_phylink_setup() to configure PHYLINK - Add phylink MAC operations: - lan78xx_mac_config() - lan78xx_mac_link_up() - lan78xx_mac_link_down() - Remove legacy link state handling: - lan78xx_link_status_change() - lan78xx_link_reset() - Handle fixed-link fallback for LAN7801 using phylink_set_fixed_link() - Replace deprecated flow control handling with phylink-managed logic Power management: - Switch suspend/resume paths to use phylink_suspend()/phylink_resume() - Ensure proper use of rtnl_lock() where required - Note: full runtime testing of power management is currently limited due to hardware setup constraints Note: Conversion of EEE (Energy Efficient Ethernet) handling to the PHYLINK-managed API will be done in a follow-up patch. For now, the legacy EEE enable logic is preserved in mac_link_up(). 
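For readers unfamiliar with phylink, the lifecycle the driver adopts is
roughly the following (simplified sketch using the stock phylink entry
points, error handling omitted; this is not new driver code):

	phylink = phylink_create(&cfg, fwnode, phy_mode, &mac_ops);
	phylink_connect_phy(phylink, phydev);	/* probe/init */

	phylink_start(phylink);			/* ndo_open, under rtnl */
	phylink_stop(phylink);			/* ndo_stop, under rtnl */

	phylink_disconnect_phy(phylink);	/* teardown */
	phylink_destroy(phylink);

phylink then calls mac_config()/mac_link_up()/mac_link_down() as the
link resolves, which is where the MAC_CR, flow control and USB LPM
programming now lives.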
Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/Kconfig | 3 +- drivers/net/usb/lan78xx.c | 551 ++++++++++++++++++-------------------- 2 files changed, 258 insertions(+), 296 deletions(-) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 370b32fc25880..0a678e31cfaaa 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -113,9 +113,8 @@ config USB_RTL8152 config USB_LAN78XX tristate "Microchip LAN78XX Based USB Ethernet Adapters" select MII - select PHYLIB + select PHYLINK select MICROCHIP_PHY - select FIXED_PHY select CRC32 help This option adds support for Microchip LAN78XX based USB 2 diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 58e3589e3b891..bbc218def7810 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -384,7 +385,7 @@ struct skb_data { /* skb->cb is one of these */ #define EVENT_RX_HALT 1 #define EVENT_RX_MEMORY 2 #define EVENT_STS_SPLIT 3 -#define EVENT_LINK_RESET 4 +#define EVENT_PHY_INT_ACK 4 #define EVENT_RX_PAUSED 5 #define EVENT_DEV_WAKING 6 #define EVENT_DEV_ASLEEP 7 @@ -455,7 +456,6 @@ struct lan78xx_net { unsigned long data[5]; - int link_on; u8 mdix_ctrl; u32 chipid; @@ -463,13 +463,13 @@ struct lan78xx_net { struct mii_bus *mdiobus; phy_interface_t interface; - int fc_autoneg; - u8 fc_request_control; - int delta; struct statstage stats; struct irq_domain_data domain_data; + + struct phylink *phylink; + struct phylink_config phylink_config; }; /* use ethtool to change the level for any given device */ @@ -1554,28 +1554,6 @@ static void lan78xx_set_multicast(struct net_device *netdev) schedule_work(&pdata->set_multicast); } -static int lan78xx_configure_flowcontrol(struct lan78xx_net *dev, - bool tx_pause, bool rx_pause); - -static int lan78xx_update_flowcontrol(struct lan78xx_net *dev, u8 duplex, - u16 lcladv, u16 rmtadv) -{ - u8 cap; - - if (dev->fc_autoneg) - cap = mii_resolve_flowctrl_fdx(lcladv, rmtadv); - else - cap = dev->fc_request_control; - - netif_dbg(dev, link, dev->net, "rx pause %s, tx pause %s", - (cap & FLOW_CTRL_RX ? "enabled" : "disabled"), - (cap & FLOW_CTRL_TX ? 
"enabled" : "disabled")); - - return lan78xx_configure_flowcontrol(dev, - cap & FLOW_CTRL_TX, - cap & FLOW_CTRL_RX); -} - static void lan78xx_rx_urb_submit_all(struct lan78xx_net *dev); static int lan78xx_mac_reset(struct lan78xx_net *dev) @@ -1638,75 +1616,6 @@ static int lan78xx_phy_int_ack(struct lan78xx_net *dev) return lan78xx_write_reg(dev, INT_STS, INT_STS_PHY_INT_); } -static int lan78xx_configure_usb(struct lan78xx_net *dev, int speed); - -static int lan78xx_link_reset(struct lan78xx_net *dev) -{ - struct phy_device *phydev = dev->net->phydev; - struct ethtool_link_ksettings ecmd; - int ladv, radv, ret, link; - - /* clear LAN78xx interrupt status */ - ret = lan78xx_phy_int_ack(dev); - if (unlikely(ret < 0)) - return ret; - - mutex_lock(&phydev->lock); - phy_read_status(phydev); - link = phydev->link; - mutex_unlock(&phydev->lock); - - if (!link && dev->link_on) { - dev->link_on = false; - - /* reset MAC */ - ret = lan78xx_mac_reset(dev); - if (ret < 0) - return ret; - - timer_delete(&dev->stat_monitor); - } else if (link && !dev->link_on) { - dev->link_on = true; - - phy_ethtool_ksettings_get(phydev, &ecmd); - - ret = lan78xx_configure_usb(dev, ecmd.base.speed); - if (ret < 0) - return ret; - - ladv = phy_read(phydev, MII_ADVERTISE); - if (ladv < 0) - return ladv; - - radv = phy_read(phydev, MII_LPA); - if (radv < 0) - return radv; - - netif_dbg(dev, link, dev->net, - "speed: %u duplex: %d anadv: 0x%04x anlpa: 0x%04x", - ecmd.base.speed, ecmd.base.duplex, ladv, radv); - - ret = lan78xx_update_flowcontrol(dev, ecmd.base.duplex, ladv, - radv); - if (ret < 0) - return ret; - - if (!timer_pending(&dev->stat_monitor)) { - dev->delta = 1; - mod_timer(&dev->stat_monitor, - jiffies + STAT_UPDATE_TIMER); - } - - lan78xx_rx_urb_submit_all(dev); - - local_bh_disable(); - napi_schedule(&dev->napi); - local_bh_enable(); - } - - return 0; -} - /* some work can't be done in tasklets, so we use keventd * * NOTE: annoying asymmetry: if it's active, schedule_work() fails, @@ -1733,7 +1642,7 @@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) if (intdata & INT_ENP_PHY_INT) { netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); - lan78xx_defer_kevent(dev, EVENT_LINK_RESET); + lan78xx_defer_kevent(dev, EVENT_PHY_INT_ACK); if (dev->domain_data.phyirq > 0) generic_handle_irq_safe(dev->domain_data.phyirq); @@ -2015,63 +1924,16 @@ static void lan78xx_get_pause(struct net_device *net, struct ethtool_pauseparam *pause) { struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - struct ethtool_link_ksettings ecmd; - - phy_ethtool_ksettings_get(phydev, &ecmd); - - pause->autoneg = dev->fc_autoneg; - - if (dev->fc_request_control & FLOW_CTRL_TX) - pause->tx_pause = 1; - if (dev->fc_request_control & FLOW_CTRL_RX) - pause->rx_pause = 1; + phylink_ethtool_get_pauseparam(dev->phylink, pause); } static int lan78xx_set_pause(struct net_device *net, struct ethtool_pauseparam *pause) { struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - struct ethtool_link_ksettings ecmd; - int ret; - - phy_ethtool_ksettings_get(phydev, &ecmd); - - if (pause->autoneg && !ecmd.base.autoneg) { - ret = -EINVAL; - goto exit; - } - - dev->fc_request_control = 0; - if (pause->rx_pause) - dev->fc_request_control |= FLOW_CTRL_RX; - - if (pause->tx_pause) - dev->fc_request_control |= FLOW_CTRL_TX; - - if (ecmd.base.autoneg) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(fc) = { 0, }; - u32 mii_adv; - - linkmode_clear_bit(ETHTOOL_LINK_MODE_Pause_BIT, - 
ecmd.link_modes.advertising); - linkmode_clear_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - ecmd.link_modes.advertising); - mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control); - mii_adv_to_linkmode_adv_t(fc, mii_adv); - linkmode_or(ecmd.link_modes.advertising, fc, - ecmd.link_modes.advertising); - phy_ethtool_ksettings_set(phydev, &ecmd); - } - - dev->fc_autoneg = pause->autoneg; - - ret = 0; -exit: - return ret; + return phylink_ethtool_set_pauseparam(dev->phylink, pause); } static int lan78xx_get_regs_len(struct net_device *netdev) @@ -2332,26 +2194,6 @@ static void lan78xx_remove_mdio(struct lan78xx_net *dev) mdiobus_free(dev->mdiobus); } -static void lan78xx_link_status_change(struct net_device *net) -{ - struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - u32 data; - int ret; - - ret = lan78xx_read_reg(dev, MAC_CR, &data); - if (ret < 0) - return; - - if (phydev->enable_tx_lpi) - data |= MAC_CR_EEE_EN_; - else - data &= ~MAC_CR_EEE_EN_; - lan78xx_write_reg(dev, MAC_CR, data); - - phy_print_status(phydev); -} - static int irq_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hwirq) { @@ -2484,6 +2326,75 @@ static void lan78xx_remove_irq_domain(struct lan78xx_net *dev) dev->domain_data.irqdomain = NULL; } +static void lan78xx_mac_config(struct phylink_config *config, unsigned int mode, + const struct phylink_link_state *state) +{ + struct net_device *net = to_net_dev(config->dev); + struct lan78xx_net *dev = netdev_priv(net); + u32 mac_cr = 0; + int ret; + + /* Check if the mode is supported */ + if (mode != MLO_AN_FIXED && mode != MLO_AN_PHY) { + netdev_err(net, "Unsupported negotiation mode: %u\n", mode); + return; + } + + switch (state->interface) { + case PHY_INTERFACE_MODE_GMII: + mac_cr |= MAC_CR_GMII_EN_; + break; + case PHY_INTERFACE_MODE_RGMII: + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_TXID: + case PHY_INTERFACE_MODE_RGMII_RXID: + break; + default: + netdev_warn(net, "Unsupported interface mode: %d\n", + state->interface); + return; + } + + ret = lan78xx_update_reg(dev, MAC_CR, MAC_CR_GMII_EN_, mac_cr); + if (ret < 0) + netdev_err(net, "Failed to config MAC with error %pe\n", + ERR_PTR(ret)); +} + +static void lan78xx_mac_link_down(struct phylink_config *config, + unsigned int mode, phy_interface_t interface) +{ + struct net_device *net = to_net_dev(config->dev); + struct lan78xx_net *dev = netdev_priv(net); + int ret; + + /* MAC reset will not de-assert TXEN/RXEN, we need to stop them + * manually before reset. TX and RX should be disabled before running + * link_up sequence. + */ + ret = lan78xx_stop_tx_path(dev); + if (ret < 0) + goto link_down_fail; + + ret = lan78xx_stop_rx_path(dev); + if (ret < 0) + goto link_down_fail; + + /* MAC reset seems to not affect MAC configuration, no idea if it is + * really needed, but it was done in previous driver version. So, leave + * it here. 
+ */ + ret = lan78xx_mac_reset(dev); + if (ret < 0) + goto link_down_fail; + + return; + +link_down_fail: + netdev_err(dev->net, "Failed to set MAC down with error %pe\n", + ERR_PTR(ret)); +} + /** * lan78xx_configure_usb - Configure USB link power settings * @dev: pointer to the LAN78xx device structure @@ -2619,28 +2530,100 @@ static int lan78xx_configure_flowcontrol(struct lan78xx_net *dev, return lan78xx_write_reg(dev, FLOW, flow); } +static void lan78xx_mac_link_up(struct phylink_config *config, + struct phy_device *phy, + unsigned int mode, phy_interface_t interface, + int speed, int duplex, + bool tx_pause, bool rx_pause) +{ + struct net_device *net = to_net_dev(config->dev); + struct lan78xx_net *dev = netdev_priv(net); + u32 mac_cr = 0; + int ret; + + switch (speed) { + case SPEED_1000: + mac_cr |= MAC_CR_SPEED_1000_; + break; + case SPEED_100: + mac_cr |= MAC_CR_SPEED_100_; + break; + case SPEED_10: + mac_cr |= MAC_CR_SPEED_10_; + break; + default: + netdev_err(dev->net, "Unsupported speed %d\n", speed); + return; + } + + if (duplex == DUPLEX_FULL) + mac_cr |= MAC_CR_FULL_DUPLEX_; + + /* make sure TXEN and RXEN are disabled before reconfiguring MAC */ + ret = lan78xx_update_reg(dev, MAC_CR, MAC_CR_SPEED_MASK_ | + MAC_CR_FULL_DUPLEX_ | MAC_CR_EEE_EN_, mac_cr); + if (ret < 0) + goto link_up_fail; + + ret = lan78xx_configure_flowcontrol(dev, tx_pause, rx_pause); + if (ret < 0) + goto link_up_fail; + + ret = lan78xx_configure_usb(dev, speed); + if (ret < 0) + goto link_up_fail; + + lan78xx_rx_urb_submit_all(dev); + + ret = lan78xx_flush_rx_fifo(dev); + if (ret < 0) + goto link_up_fail; + + ret = lan78xx_flush_tx_fifo(dev); + if (ret < 0) + goto link_up_fail; + + ret = lan78xx_start_tx_path(dev); + if (ret < 0) + goto link_up_fail; + + ret = lan78xx_start_rx_path(dev); + if (ret < 0) + goto link_up_fail; + + return; +link_up_fail: + netdev_err(dev->net, "Failed to set MAC up with error %pe\n", + ERR_PTR(ret)); +} + +static const struct phylink_mac_ops lan78xx_phylink_mac_ops = { + .mac_config = lan78xx_mac_config, + .mac_link_down = lan78xx_mac_link_down, + .mac_link_up = lan78xx_mac_link_up, +}; + /** - * lan78xx_register_fixed_phy() - Register a fallback fixed PHY + * lan78xx_set_fixed_link() - Set fixed link configuration for LAN7801 * @dev: LAN78xx device * - * Registers a fixed PHY with 1 Gbps full duplex. This is used in special cases - * like EVB-KSZ9897-1, where LAN7801 acts as a USB-to-Ethernet interface to a - * switch without a visible PHY. + * Use fixed link configuration with 1 Gbps full duplex. This is used in special + * cases like EVB-KSZ9897-1, where LAN7801 acts as a USB-to-Ethernet interface + * to a switch without a visible PHY. * * Return: pointer to the registered fixed PHY, or ERR_PTR() on error. */ -static struct phy_device *lan78xx_register_fixed_phy(struct lan78xx_net *dev) +static int lan78xx_set_fixed_link(struct lan78xx_net *dev) { - struct fixed_phy_status fphy_status = { - .link = 1, + struct phylink_link_state state = { .speed = SPEED_1000, .duplex = DUPLEX_FULL, }; netdev_info(dev->net, - "No PHY found on LAN7801 – registering fixed PHY (e.g. EVB-KSZ9897-1)\n"); + "No PHY found on LAN7801 – using fixed link instead (e.g. EVB-KSZ9897-1)\n"); - return fixed_phy_register(PHY_POLL, &fphy_status, NULL); + return phylink_set_fixed_link(dev->phylink, &state); } /** @@ -2676,7 +2659,7 @@ static struct phy_device *lan78xx_get_phy(struct lan78xx_net *dev) dev->interface = PHY_INTERFACE_MODE_RGMII; /* No PHY found – fallback to fixed PHY (e.g. 
KSZ switch board) */ - return lan78xx_register_fixed_phy(dev); + return NULL; case ID_REV_CHIP_ID_7800_: case ID_REV_CHIP_ID_7850_: @@ -2803,20 +2786,63 @@ static int lan78xx_configure_leds_from_dt(struct lan78xx_net *dev, return lan78xx_write_reg(dev, HW_CFG, reg); } +static int lan78xx_phylink_setup(struct lan78xx_net *dev) +{ + struct phylink_config *pc = &dev->phylink_config; + struct phylink *phylink; + + pc->dev = &dev->net->dev; + pc->type = PHYLINK_NETDEV; + pc->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10 | + MAC_100 | MAC_1000FD; + pc->mac_managed_pm = true; + + if (dev->chipid == ID_REV_CHIP_ID_7801_) + phy_interface_set_rgmii(pc->supported_interfaces); + else + __set_bit(PHY_INTERFACE_MODE_GMII, pc->supported_interfaces); + + phylink = phylink_create(pc, dev->net->dev.fwnode, + dev->interface, &lan78xx_phylink_mac_ops); + if (IS_ERR(phylink)) + return PTR_ERR(phylink); + + dev->phylink = phylink; + + return 0; +} + static int lan78xx_phy_init(struct lan78xx_net *dev) { - __ETHTOOL_DECLARE_LINK_MODE_MASK(fc) = { 0, }; - int ret; - u32 mii_adv; struct phy_device *phydev; + int ret; phydev = lan78xx_get_phy(dev); + /* phydev can be NULL if no PHY is found and the chip is LAN7801, + * which will use a fixed link later. + * If an error occurs, return the error code immediately. + */ if (IS_ERR(phydev)) return PTR_ERR(phydev); + ret = lan78xx_phylink_setup(dev); + if (ret < 0) + return ret; + + /* If no PHY is found, set up a fixed link. It is very specific to + * the LAN7801 and is used in special cases like EVB-KSZ9897-1 where + * LAN7801 acts as a USB-to-Ethernet interface to a switch without + * a visible PHY. + */ + if (!phydev) { + ret = lan78xx_set_fixed_link(dev); + if (ret < 0) + return ret; + } + ret = lan78xx_mac_prepare_for_phy(dev); if (ret < 0) - goto free_phy; + return ret; /* if phyirq is not set, use polling mode in phylib */ if (dev->domain_data.phyirq > 0) @@ -2825,56 +2851,25 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) phydev->irq = PHY_POLL; netdev_dbg(dev->net, "phydev->irq = %d\n", phydev->irq); - /* set to AUTOMDIX */ - phydev->mdix = ETH_TP_MDI_AUTO; - - ret = phy_connect_direct(dev->net, phydev, - lan78xx_link_status_change, - dev->interface); + ret = phylink_connect_phy(dev->phylink, phydev); if (ret) { - netdev_err(dev->net, "can't attach PHY to %s\n", - dev->mdiobus->id); - if (dev->chipid == ID_REV_CHIP_ID_7801_) { - if (phy_is_pseudo_fixed_link(phydev)) { - fixed_phy_unregister(phydev); - phy_device_free(phydev); - } - } - return -EIO; + netdev_err(dev->net, "can't attach PHY to %s, error %pe\n", + dev->mdiobus->id, ERR_PTR(ret)); + return ret; } - /* MAC doesn't support 1000T Half */ - phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT); - - /* support both flow controls */ - dev->fc_request_control = (FLOW_CTRL_RX | FLOW_CTRL_TX); - linkmode_clear_bit(ETHTOOL_LINK_MODE_Pause_BIT, - phydev->advertising); - linkmode_clear_bit(ETHTOOL_LINK_MODE_Asym_Pause_BIT, - phydev->advertising); - mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control); - mii_adv_to_linkmode_adv_t(fc, mii_adv); - linkmode_or(phydev->advertising, fc, phydev->advertising); - phy_support_eee(phydev); - ret = lan78xx_configure_leds_from_dt(dev, phydev); - if (ret) - goto free_phy; - - genphy_config_aneg(phydev); - - dev->fc_autoneg = phydev->autoneg; - - return 0; + return lan78xx_configure_leds_from_dt(dev, phydev); +} -free_phy: - if (phy_is_pseudo_fixed_link(phydev)) { - fixed_phy_unregister(phydev); - phy_device_free(phydev); +static 
void lan78xx_phy_uninit(struct lan78xx_net *dev) +{ + if (dev->phylink) { + phylink_disconnect_phy(dev->phylink); + phylink_destroy(dev->phylink); + dev->phylink = NULL; } - - return ret; } static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size) @@ -3213,7 +3208,6 @@ static int lan78xx_reset(struct lan78xx_net *dev) unsigned long timeout; int ret; u32 buf; - u8 sig; ret = lan78xx_read_reg(dev, HW_CFG, &buf); if (ret < 0) @@ -3370,22 +3364,12 @@ static int lan78xx_reset(struct lan78xx_net *dev) if (ret < 0) return ret; + buf &= ~(MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_); + /* LAN7801 only has RGMII mode */ - if (dev->chipid == ID_REV_CHIP_ID_7801_) { + if (dev->chipid == ID_REV_CHIP_ID_7801_) buf &= ~MAC_CR_GMII_EN_; - /* Enable Auto Duplex and Auto speed */ - buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; - } - if (dev->chipid == ID_REV_CHIP_ID_7800_ || - dev->chipid == ID_REV_CHIP_ID_7850_) { - ret = lan78xx_read_raw_eeprom(dev, 0, 1, &sig); - if (!ret && sig != EEPROM_INDICATOR) { - /* Implies there is no external eeprom. Set mac speed */ - netdev_info(dev->net, "No External EEPROM. Setting MAC Speed\n"); - buf |= MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_; - } - } ret = lan78xx_write_reg(dev, MAC_CR, buf); if (ret < 0) return ret; @@ -3435,9 +3419,11 @@ static int lan78xx_open(struct net_device *net) mutex_lock(&dev->dev_mutex); - phy_start(net->phydev); + lan78xx_init_stats(dev); + + napi_enable(&dev->napi); - netif_dbg(dev, ifup, dev->net, "phy initialised successfully"); + set_bit(EVENT_DEV_OPEN, &dev->flags); /* for Link Check */ if (dev->urb_intr) { @@ -3449,31 +3435,9 @@ static int lan78xx_open(struct net_device *net) } } - ret = lan78xx_flush_rx_fifo(dev); - if (ret < 0) - goto done; - ret = lan78xx_flush_tx_fifo(dev); - if (ret < 0) - goto done; - - ret = lan78xx_start_tx_path(dev); - if (ret < 0) - goto done; - ret = lan78xx_start_rx_path(dev); - if (ret < 0) - goto done; - - lan78xx_init_stats(dev); - - set_bit(EVENT_DEV_OPEN, &dev->flags); + phylink_start(dev->phylink); netif_start_queue(net); - - dev->link_on = false; - - napi_enable(&dev->napi); - - lan78xx_defer_kevent(dev, EVENT_LINK_RESET); done: mutex_unlock(&dev->dev_mutex); @@ -3541,12 +3505,7 @@ static int lan78xx_stop(struct net_device *net) net->stats.rx_packets, net->stats.tx_packets, net->stats.rx_errors, net->stats.tx_errors); - /* ignore errors that occur stopping the Tx and Rx data paths */ - lan78xx_stop_tx_path(dev); - lan78xx_stop_rx_path(dev); - - if (net->phydev) - phy_stop(net->phydev); + phylink_stop(dev->phylink); usb_kill_urb(dev->urb_intr); @@ -3556,7 +3515,7 @@ static int lan78xx_stop(struct net_device *net) */ clear_bit(EVENT_TX_HALT, &dev->flags); clear_bit(EVENT_RX_HALT, &dev->flags); - clear_bit(EVENT_LINK_RESET, &dev->flags); + clear_bit(EVENT_PHY_INT_ACK, &dev->flags); clear_bit(EVENT_STAT_UPDATE, &dev->flags); cancel_delayed_work_sync(&dev->wq); @@ -4480,14 +4439,14 @@ static void lan78xx_delayedwork(struct work_struct *work) } } - if (test_bit(EVENT_LINK_RESET, &dev->flags)) { + if (test_bit(EVENT_PHY_INT_ACK, &dev->flags)) { int ret = 0; - clear_bit(EVENT_LINK_RESET, &dev->flags); - if (lan78xx_link_reset(dev) < 0) { - netdev_info(dev->net, "link reset failed (%d)\n", - ret); - } + clear_bit(EVENT_PHY_INT_ACK, &dev->flags); + ret = lan78xx_phy_int_ack(dev); + if (ret) + netdev_info(dev->net, "PHY INT ack failed (%pe)\n", + ERR_PTR(ret)); } if (test_bit(EVENT_STAT_UPDATE, &dev->flags)) { @@ -4561,32 +4520,29 @@ static void lan78xx_disconnect(struct usb_interface *intf) 
struct lan78xx_net *dev; struct usb_device *udev; struct net_device *net; - struct phy_device *phydev; dev = usb_get_intfdata(intf); usb_set_intfdata(intf, NULL); if (!dev) return; - netif_napi_del(&dev->napi); - udev = interface_to_usbdev(intf); net = dev->net; + rtnl_lock(); + phylink_stop(dev->phylink); + phylink_disconnect_phy(dev->phylink); + rtnl_unlock(); + + netif_napi_del(&dev->napi); + unregister_netdev(net); timer_shutdown_sync(&dev->stat_monitor); set_bit(EVENT_DEV_DISCONNECT, &dev->flags); cancel_delayed_work_sync(&dev->wq); - phydev = net->phydev; - - phy_disconnect(net->phydev); - - if (phy_is_pseudo_fixed_link(phydev)) { - fixed_phy_unregister(phydev); - phy_device_free(phydev); - } + phylink_destroy(dev->phylink); usb_scuttle_anchored_urbs(&dev->deferred); @@ -4670,7 +4626,6 @@ static int lan78xx_probe(struct usb_interface *intf, goto out1; } - /* netdev_printk() needs this */ SET_NETDEV_DEV(netdev, &intf->dev); dev = netdev_priv(netdev); @@ -4784,12 +4739,12 @@ static int lan78xx_probe(struct usb_interface *intf, ret = lan78xx_phy_init(dev); if (ret < 0) - goto free_urbs; + goto phy_uninit; ret = register_netdev(netdev); if (ret != 0) { netif_err(dev, probe, netdev, "couldn't register the device\n"); - goto out8; + goto phy_uninit; } usb_set_intfdata(intf, dev); @@ -4804,8 +4759,8 @@ static int lan78xx_probe(struct usb_interface *intf, return 0; -out8: - phy_disconnect(netdev->phydev); +phy_uninit: + lan78xx_phy_uninit(dev); free_urbs: usb_free_urb(dev->urb_intr); out5: @@ -5140,6 +5095,10 @@ static int lan78xx_suspend(struct usb_interface *intf, pm_message_t message) spin_unlock_irq(&dev->txq.lock); } + rtnl_lock(); + phylink_suspend(dev->phylink, false); + rtnl_unlock(); + /* stop RX */ ret = lan78xx_stop_rx_path(dev); if (ret < 0) @@ -5367,11 +5326,15 @@ static int lan78xx_reset_resume(struct usb_interface *intf) if (ret < 0) return ret; - phy_start(dev->net->phydev); - ret = lan78xx_resume(intf); + if (ret < 0) + return ret; - return ret; + rtnl_lock(); + phylink_resume(dev->phylink); + rtnl_unlock(); + + return 0; } static const struct usb_device_id products[] = { From 3fcd6281a617c91fa958abecb854e899137cc1f7 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:39 +0200 Subject: [PATCH 648/770] net: usb: lan78xx: Use ethtool_op_get_link to reflect current link status Replace the custom lan78xx_get_link implementation with the standard ethtool_op_get_link helper, which uses netif_carrier_ok to reflect the current link status accurately. 
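For reference, the stock helper simply reports the carrier state that
phylink now keeps in sync (its long-standing implementation in
net/ethtool/ioctl.c):

	u32 ethtool_op_get_link(struct net_device *dev)
	{
		return netif_carrier_ok(dev) ? 1 : 0;
	}

so the mutex-protected PHY register poll removed below is no longer
needed.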
Signed-off-by: Oleksij Rempel Reviewed-by: Maxime Chevallier Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index bbc218def7810..f10925b2bcaa2 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1839,18 +1839,6 @@ static int lan78xx_set_eee(struct net_device *net, struct ethtool_keee *edata) return ret; } -static u32 lan78xx_get_link(struct net_device *net) -{ - u32 link; - - mutex_lock(&net->phydev->lock); - phy_read_status(net->phydev); - link = net->phydev->link; - mutex_unlock(&net->phydev->lock); - - return link; -} - static void lan78xx_get_drvinfo(struct net_device *net, struct ethtool_drvinfo *info) { @@ -1970,7 +1958,7 @@ lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs, } static const struct ethtool_ops lan78xx_ethtool_ops = { - .get_link = lan78xx_get_link, + .get_link = ethtool_op_get_link, .nway_reset = phy_ethtool_nway_reset, .get_drvinfo = lan78xx_get_drvinfo, .get_msglevel = lan78xx_get_msglevel, From c17a6af6576c5347d2e349e8320cd2f11f597dc9 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:40 +0200 Subject: [PATCH 649/770] net: usb: lan78xx: port link settings to phylink API Refactor lan78xx_get_link_ksettings and lan78xx_set_link_ksettings to use the phylink API (phylink_ethtool_ksettings_get and phylink_ethtool_ksettings_set) instead of directly interfacing with the PHY. This change simplifies the code and ensures better integration with the phylink framework for link management. Additionally, the explicit calls to usb_autopm_get_interface() and usb_autopm_put_interface() have been removed. These were originally needed to manage USB power management during register accesses. However, lan78xx_mdiobus_read() and lan78xx_mdiobus_write() already handle USB auto power management internally, ensuring that the interface remains active when necessary. Since there are no other direct register accesses in these functions that require explicit power management handling, the extra calls have become redundant and are no longer needed. 
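The redundancy can be seen from the shape of the MDIO accessors
(condensed sketch of the existing helper; the real function also takes
additional locks and does full error unwinding):

	static int lan78xx_mdiobus_read(struct mii_bus *bus, int phy_id, int idx)
	{
		struct lan78xx_net *dev = bus->priv;
		int ret;

		ret = usb_autopm_get_interface(dev->intf); /* wake the device */
		if (ret < 0)
			return ret;

		/* ... MII register access over USB ... */

		usb_autopm_put_interface(dev->intf);
		return ret;
	}

Every PHY access funnelled through phylink ends up in these accessors,
so the interface is already awake for the duration of the transfer.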
Signed-off-by: Oleksij Rempel Reviewed-by: Maxime Chevallier Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f10925b2bcaa2..45e9158655b7b 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1866,46 +1866,16 @@ static int lan78xx_get_link_ksettings(struct net_device *net, struct ethtool_link_ksettings *cmd) { struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - int ret; - - ret = usb_autopm_get_interface(dev->intf); - if (ret < 0) - return ret; - phy_ethtool_ksettings_get(phydev, cmd); - - usb_autopm_put_interface(dev->intf); - - return ret; + return phylink_ethtool_ksettings_get(dev->phylink, cmd); } static int lan78xx_set_link_ksettings(struct net_device *net, const struct ethtool_link_ksettings *cmd) { struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - int ret = 0; - int temp; - - ret = usb_autopm_get_interface(dev->intf); - if (ret < 0) - return ret; - - /* change speed & duplex */ - ret = phy_ethtool_ksettings_set(phydev, cmd); - if (!cmd->base.autoneg) { - /* force link down */ - temp = phy_read(phydev, MII_BMCR); - phy_write(phydev, MII_BMCR, temp | BMCR_LOOPBACK); - mdelay(1); - phy_write(phydev, MII_BMCR, temp); - } - - usb_autopm_put_interface(dev->intf); - - return ret; + return phylink_ethtool_ksettings_set(dev->phylink, cmd); } static void lan78xx_get_pause(struct net_device *net, From f5be05f438fd6f959a9896d1f83c69d3d8397ae8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:41 +0200 Subject: [PATCH 650/770] net: usb: lan78xx: Integrate EEE support with phylink LPI API Refactor Energy-Efficient Ethernet (EEE) support in the LAN78xx driver to fully integrate with the phylink Low Power Idle (LPI) API. This includes: - Replacing direct calls to `phy_ethtool_get_eee` and `phy_ethtool_set_eee` with `phylink_ethtool_get_eee` and `phylink_ethtool_set_eee`. - Implementing `.mac_enable_tx_lpi` and `.mac_disable_tx_lpi` to control LPI transitions via phylink. - Configuring `lpi_timer_default` to align with recommended values from LAN7800 documentation. 
- ensure EEE is disabled on controller reset Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 123 ++++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 44 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 45e9158655b7b..ba6a6cda779eb 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -1789,54 +1789,15 @@ static int lan78xx_set_wol(struct net_device *netdev, static int lan78xx_get_eee(struct net_device *net, struct ethtool_keee *edata) { struct lan78xx_net *dev = netdev_priv(net); - struct phy_device *phydev = net->phydev; - int ret; - u32 buf; - - ret = usb_autopm_get_interface(dev->intf); - if (ret < 0) - return ret; - - ret = phy_ethtool_get_eee(phydev, edata); - if (ret < 0) - goto exit; - ret = lan78xx_read_reg(dev, MAC_CR, &buf); - if (buf & MAC_CR_EEE_EN_) { - /* EEE_TX_LPI_REQ_DLY & tx_lpi_timer are same uSec unit */ - ret = lan78xx_read_reg(dev, EEE_TX_LPI_REQ_DLY, &buf); - edata->tx_lpi_timer = buf; - } else { - edata->tx_lpi_timer = 0; - } - - ret = 0; -exit: - usb_autopm_put_interface(dev->intf); - - return ret; + return phylink_ethtool_get_eee(dev->phylink, edata); } static int lan78xx_set_eee(struct net_device *net, struct ethtool_keee *edata) { struct lan78xx_net *dev = netdev_priv(net); - int ret; - u32 buf; - - ret = usb_autopm_get_interface(dev->intf); - if (ret < 0) - return ret; - ret = phy_ethtool_set_eee(net->phydev, edata); - if (ret < 0) - goto out; - - buf = (u32)edata->tx_lpi_timer; - ret = lan78xx_write_reg(dev, EEE_TX_LPI_REQ_DLY, buf); -out: - usb_autopm_put_interface(dev->intf); - - return ret; + return phylink_ethtool_set_eee(dev->phylink, edata); } static void lan78xx_get_drvinfo(struct net_device *net, @@ -2555,10 +2516,62 @@ static void lan78xx_mac_link_up(struct phylink_config *config, ERR_PTR(ret)); } +/** + * lan78xx_mac_eee_enable - Enable or disable MAC-side EEE support + * @dev: LAN78xx device + * @enable: true to enable EEE, false to disable + * + * This function sets or clears the MAC_CR_EEE_EN_ bit to control Energy + * Efficient Ethernet (EEE) operation. According to current understanding + * of the LAN7800 documentation, this bit can be modified while TX and RX + * are enabled. No explicit requirement was found to disable data paths + * before changing this bit. + * + * Return: 0 on success or a negative error code + */ +static int lan78xx_mac_eee_enable(struct lan78xx_net *dev, bool enable) +{ + u32 mac_cr = 0; + + if (enable) + mac_cr |= MAC_CR_EEE_EN_; + + return lan78xx_update_reg(dev, MAC_CR, MAC_CR_EEE_EN_, mac_cr); +} + +static void lan78xx_mac_disable_tx_lpi(struct phylink_config *config) +{ + struct net_device *net = to_net_dev(config->dev); + struct lan78xx_net *dev = netdev_priv(net); + + lan78xx_mac_eee_enable(dev, false); +} + +static int lan78xx_mac_enable_tx_lpi(struct phylink_config *config, u32 timer, + bool tx_clk_stop) +{ + struct net_device *net = to_net_dev(config->dev); + struct lan78xx_net *dev = netdev_priv(net); + int ret; + + /* Software should only change this field when Energy Efficient + * Ethernet Enable (EEEEN) is cleared. We ensure that by clearing + * EEEEN during probe, and phylink itself guarantees that + * mac_disable_tx_lpi() will have been previously called. 
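+	 *
+	 * The probe-time clearing mentioned above is part of this patch:
+	 * further down, lan78xx_reset() masks MAC_CR_EEE_EN_ out of MAC_CR
+	 * alongside the AUTO_DUPLEX/AUTO_SPEED bits.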
+ */ + ret = lan78xx_write_reg(dev, EEE_TX_LPI_REQ_DLY, timer); + if (ret < 0) + return ret; + + return lan78xx_mac_eee_enable(dev, true); +} + static const struct phylink_mac_ops lan78xx_phylink_mac_ops = { .mac_config = lan78xx_mac_config, .mac_link_down = lan78xx_mac_link_down, .mac_link_up = lan78xx_mac_link_up, + .mac_disable_tx_lpi = lan78xx_mac_disable_tx_lpi, + .mac_enable_tx_lpi = lan78xx_mac_enable_tx_lpi, }; /** @@ -2754,12 +2767,36 @@ static int lan78xx_phylink_setup(struct lan78xx_net *dev) pc->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE | MAC_10 | MAC_100 | MAC_1000FD; pc->mac_managed_pm = true; + pc->lpi_capabilities = MAC_100FD | MAC_1000FD; + /* + * Default TX LPI (Low Power Idle) request delay count is set to 50us. + * + * Source: LAN7800 Documentation, DS00001992H, Section 15.1.57, Page 204. + * + * Reasoning: + * According to the application note in the LAN7800 documentation, a + * zero delay may negatively impact the TX data path’s ability to + * support Gigabit operation. A value of 50us is recommended as a + * reasonable default when the part operates at Gigabit speeds, + * balancing stability and power efficiency in EEE mode. This delay can + * be increased based on performance testing, as EEE is designed for + * scenarios with mostly idle links and occasional bursts of full + * bandwidth transmission. The goal is to ensure reliable Gigabit + * performance without overly aggressive power optimization during + * inactive periods. + */ + pc->lpi_timer_default = 50; + pc->eee_enabled_default = true; if (dev->chipid == ID_REV_CHIP_ID_7801_) phy_interface_set_rgmii(pc->supported_interfaces); else __set_bit(PHY_INTERFACE_MODE_GMII, pc->supported_interfaces); + memcpy(dev->phylink_config.lpi_interfaces, + dev->phylink_config.supported_interfaces, + sizeof(dev->phylink_config.lpi_interfaces)); + phylink = phylink_create(pc, dev->net->dev.fwnode, dev->interface, &lan78xx_phylink_mac_ops); if (IS_ERR(phylink)) @@ -2816,8 +2853,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) return ret; } - phy_support_eee(phydev); - return lan78xx_configure_leds_from_dt(dev, phydev); } @@ -3322,7 +3357,7 @@ static int lan78xx_reset(struct lan78xx_net *dev) if (ret < 0) return ret; - buf &= ~(MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_); + buf &= ~(MAC_CR_AUTO_DUPLEX_ | MAC_CR_AUTO_SPEED_ | MAC_CR_EEE_EN_); /* LAN7801 only has RGMII mode */ if (dev->chipid == ID_REV_CHIP_ID_7801_) From f8ad4ad13095fc6514a140d5008ae69663e8111a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 28 Apr 2025 15:05:42 +0200 Subject: [PATCH 651/770] net: usb: lan78xx: remove unused struct members Remove unused members from struct lan78xx_net, including: driver_priv suspend_count mdix_ctrl These fields are no longer used in the driver and can be safely removed as part of a cleanup. 
Signed-off-by: Oleksij Rempel Signed-off-by: NipaLocal --- drivers/net/usb/lan78xx.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index ba6a6cda779eb..647ab9d66334c 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -414,7 +414,6 @@ struct lan78xx_net { struct net_device *net; struct usb_device *udev; struct usb_interface *intf; - void *driver_priv; unsigned int tx_pend_data_len; size_t n_tx_urbs; @@ -449,15 +448,12 @@ struct lan78xx_net { unsigned long flags; wait_queue_head_t *wait; - unsigned char suspend_count; unsigned int maxpacket; struct timer_list stat_monitor; unsigned long data[5]; - u8 mdix_ctrl; - u32 chipid; u32 chiprev; struct mii_bus *mdiobus; From 39a7af51d825b34d4466b2ebd23fa253baa4136e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 28 Apr 2025 16:29:54 -0700 Subject: [PATCH 652/770] sch_htb: make htb_deactivate() idempotent Alan reported a NULL pointer dereference in htb_next_rb_node() after we made htb_qlen_notify() idempotent. It turns out in the following case it introduced some regression: htb_dequeue_tree(): |-> fq_codel_dequeue() |-> qdisc_tree_reduce_backlog() |-> htb_qlen_notify() |-> htb_deactivate() |-> htb_next_rb_node() |-> htb_deactivate() For htb_next_rb_node(), after calling the 1st htb_deactivate(), the clprio[prio]->ptr could be already set to NULL, which means htb_next_rb_node() is vulnerable here. For htb_deactivate(), although we checked qlen before calling it, in case of qlen==0 after qdisc_tree_reduce_backlog(), we may call it again which triggers the warning inside. To fix the issues here, we need to: 1) Make htb_deactivate() idempotent, that is, simply return if we already call it before. 2) Make htb_next_rb_node() safe against ptr==NULL. Many thanks to Alan for testing and for the reproducer. Fixes: 5ba8b837b522 ("sch_htb: make htb_qlen_notify() idempotent") Reported-by: Alan J. 
Wylie Signed-off-by: Cong Wang Signed-off-by: NipaLocal --- net/sched/sch_htb.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4b9a639b642e1..14bf71f570570 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1485,8 +1486,6 @@ static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (!cl->prio_activity) - return; htb_deactivate(qdisc_priv(sch), cl); } @@ -1740,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1949,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { From c925553597ccdfb48e967d5cd1fecdcf8a88e0ce Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 28 Apr 2025 16:29:55 -0700 Subject: [PATCH 653/770] selftests/tc-testing: Add a test case to cover basic HTB+FQ_CODEL case Integrate the reproducer from Alan into TC selftests and use scapy to generate TCP traffic instead of relying on ping command. Cc: Alan J. 
Wylie Signed-off-by: Cong Wang Signed-off-by: NipaLocal --- .../tc-testing/tc-tests/infra/qdiscs.json | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index 0843f6d37e9c7..a951c0d33cd2d 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -538,5 +538,40 @@ "$TC qdisc del dev $DUMMY handle 1:0 root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "62c4", + "name": "Test HTB with FQ_CODEL - basic functionality", + "category": [ + "qdisc", + "htb", + "fq_codel" + ], + "plugins": { + "requires": [ + "nsPlugin", + "scapyPlugin" + ] + }, + "setup": [ + "$TC qdisc add dev $DEV1 root handle 1: htb default 11", + "$TC class add dev $DEV1 parent 1: classid 1:1 htb rate 10kbit", + "$TC class add dev $DEV1 parent 1:1 classid 1:11 htb rate 10kbit prio 0 quantum 1486", + "$TC qdisc add dev $DEV1 parent 1:11 fq_codel quantum 300 noecn", + "sleep 0.5" + ], + "scapy": { + "iface": "$DEV0", + "count": 5, + "packet": "Ether()/IP(dst='10.10.10.1', src='10.10.10.10')/TCP(sport=12345, dport=80)" + }, + "cmdUnderTest": "$TC -s qdisc show dev $DEV1", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DEV1 | grep -A 5 'qdisc fq_codel'", + "matchPattern": "Sent [0-9]+ bytes [0-9]+ pkt", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DEV1 handle 1: root" + ] } ] From 434c428f41976c92b442b992c9ae9c7ac37bf005 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:37 +0000 Subject: [PATCH 654/770] netmem: add niov->type attribute to distinguish different net_iov types Later patches in the series adds TX net_iovs where there is no pp associated, so we can't rely on niov->pp->mp_ops to tell what is the type of the net_iov. Add a type enum to the net_iov which tells us the net_iov type. Signed-off-by: Mina Almasry Signed-off-by: NipaLocal --- include/net/netmem.h | 11 ++++++++++- io_uring/zcrx.c | 1 + net/core/devmem.c | 3 ++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/net/netmem.h b/include/net/netmem.h index c61d5b21e7b42..973fdbcfef38b 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -20,8 +20,17 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); */ #define NET_IOV 0x01UL +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, + + /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass. 
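+	 * (struct net_iov overlays fields of struct page, so its layout is
+	 * pinned by those asserts; the leading member must stay the size of
+	 * an unsigned long, hence the forced enum width.)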
+ */ + NET_IOV_MAX = ULONG_MAX +}; + struct net_iov { - unsigned long __unused_padding; + enum net_iov_type type; unsigned long pp_magic; struct page_pool *pp; struct net_iov_area *owner; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f304..a07ad38935c86 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -259,6 +259,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->owner = &area->nia; area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); + niov->type = NET_IOV_IOURING; } area->free_count = nr_iovs; diff --git a/net/core/devmem.c b/net/core/devmem.c index 6e27a47d04935..f5c3a7e6dbb7b 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,7 +30,7 @@ static const struct memory_provider_ops dmabuf_devmem_ops; bool net_is_devmem_iov(struct net_iov *niov) { - return niov->pp->mp_ops == &dmabuf_devmem_ops; + return niov->type == NET_IOV_DMABUF; } static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, @@ -266,6 +266,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); From 414ef3cb20ad1723b46acca021bb45d569a5bbb8 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:38 +0000 Subject: [PATCH 655/770] net: add get_netmem/put_netmem support Currently net_iovs support only pp ref counts, and do not support a page ref equivalent. This is fine for the RX path as net_iovs are used exclusively with the pp and only pp refcounting is needed there. The TX path however does not use pp ref counts, thus, support for get_page/put_page equivalent is needed for netmem. Support get_netmem/put_netmem. Check the type of the netmem before passing it to page or net_iov specific code to obtain a page ref equivalent. For dmabuf net_iovs, we obtain a ref on the underlying binding. This ensures the entire binding doesn't disappear until all the net_iovs have been put_netmem'ed. We do not need to track the refcount of individual dmabuf net_iovs as we don't allocate/free them from a pool similar to what the buddy allocator does for pages. This code is written to be extensible by other net_iov implementers. 
get_netmem/put_netmem will check the type of the netmem and route it to the correct helper: pages -> [get|put]_page() dmabuf net_iovs -> net_devmem_[get|put]_net_iov() new net_iovs -> new helpers Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- include/linux/skbuff_ref.h | 4 ++-- include/net/netmem.h | 3 +++ net/core/devmem.c | 10 ++++++++++ net/core/devmem.h | 20 ++++++++++++++++++++ net/core/skbuff.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488a..9e49372ef1a05 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** diff --git a/include/net/netmem.h b/include/net/netmem.h index 973fdbcfef38b..ecb6b29c93f61 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -273,4 +273,7 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + #endif /* _NET_NETMEM_H */ diff --git a/net/core/devmem.c b/net/core/devmem.c index f5c3a7e6dbb7b..dca2ff7cf6923 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -295,6 +295,16 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, return ERR_PTR(err); } +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 7fc158d527293..946f2e0157467 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -29,6 +29,10 @@ struct net_devmem_dmabuf_binding { * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. 
*/ refcount_t ref; @@ -111,6 +115,9 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) __net_devmem_dmabuf_binding_free(binding); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); @@ -120,6 +127,19 @@ bool net_is_devmem_iov(struct net_iov *niov); #else struct net_devmem_dmabuf_binding; +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + static inline void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d73ad79fe739d..00c22bce98e44 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -89,6 +89,7 @@ #include #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -7313,3 +7314,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); From ba0e67af10e1a4145c75f2766f5176dda90c38b2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 29 Apr 2025 03:26:39 +0000 Subject: [PATCH 656/770] net: devmem: TCP tx netlink api Add bind-tx netlink call to attach dmabuf for TX; queue is not required, only ifindex and dmabuf fd for attachment. 
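The handler wired up below is deliberately a stub; the real binding
logic lands in the TX-path patch that follows. As a rough sketch, the
eventual doit has to do something along these lines (modelled on the
existing bind-rx handler; names and ordering illustrative):

	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_IFINDEX) ||
	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
		return -EINVAL;

	ifindex = nla_get_u32(info->attrs[NETDEV_A_DMABUF_IFINDEX]);
	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);

	/* look up the netdev by ifindex, bind the dma-buf for TX and
	 * reply with the new NETDEV_A_DMABUF_ID
	 */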
Signed-off-by: Stanislav Fomichev Signed-off-by: Mina Almasry Signed-off-by: NipaLocal --- Documentation/netlink/specs/netdev.yaml | 12 ++++++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 13 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 34 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index f5e0750ab71db..c0ef6d0d77865 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -743,6 +743,18 @@ operations: - defer-hard-irqs - gro-flush-timeout - irq-suspend-timeout + - + name: bind-tx + doc: Bind dmabuf to netdev for TX + attribute-set: dmabuf + do: + request: + attributes: + - ifindex + - fd + reply: + attributes: + - id kernel-family: headers: [ "net/netdev_netlink.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7600bf62dbdf0..7eb9571786b83 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -219,6 +219,7 @@ enum { NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 739f7b6506a6a..4fc44587f4936 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, }; +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 17d39fd64c948..cf3fad74511f5 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2a1a399fc15d2..919ce851f1174 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -937,6 +937,12 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) return err; } +/* stub */ +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { INIT_LIST_HEAD(&priv->bindings); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7600bf62dbdf0..7eb9571786b83 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -219,6 +219,7 @@ enum { 
NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, NETDEV_CMD_NAPI_SET, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) From b7bc7b482d0567e5802025d8fbad62b0516145b4 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:40 +0000 Subject: [PATCH 657/770] net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. Additional to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf, Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev . A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- include/linux/skbuff.h | 17 +++- include/net/sock.h | 1 + io_uring/zcrx.c | 2 +- net/core/datagram.c | 48 +++++++++- net/core/devmem.c | 118 ++++++++++++++++++++---- net/core/devmem.h | 61 +++++++++--- net/core/netdev-genl.c | 71 +++++++++++++- net/core/skbuff.c | 18 ++-- net/core/sock.c | 6 ++ net/ipv4/ip_output.c | 3 +- net/ipv4/tcp.c | 50 ++++++++-- net/ipv6/ip6_output.c | 3 +- net/vmw_vsock/virtio_transport_common.c | 5 +- 13 files changed, 341 insertions(+), 62 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index beb084ee4f4d2..5d62e8e77546c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1710,13 +1710,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); +struct net_devmem_dmabuf_binding; + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1724,12 +1727,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct 
ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3700,6 +3705,10 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/net/sock.h b/include/net/sock.h index e223102337c77..f266757a37c87 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1851,6 +1851,7 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a07ad38935c86..0c64129f5d50a 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -810,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, return io_zcrx_copy_frag(req, ifq, frag, off, len); niov = netmem_to_net_iov(frag->netmem); - if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; diff --git a/net/core/datagram.c b/net/core/datagram.c index f0634f0cb8346..042a7dceb85ad 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -63,6 +63,8 @@ #include #include +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -691,9 +693,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. 
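+	 *
+	 * For example, an iov_base of (void *)4096 does not point at user
+	 * memory here; it selects byte offset 4096 within the bound
+	 * dma-buf.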
+ */ + if (iov_iter_type(from) != ITER_IOVEC) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -701,6 +743,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (unlikely(binding)) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -734,7 +778,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/devmem.c b/net/core/devmem.c index dca2ff7cf6923..88065e5ef6511 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -52,8 +53,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -71,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -117,6 +122,13 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. 
+ */ + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -131,8 +143,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -166,8 +176,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack) +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; @@ -189,13 +200,6 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->dev = dev; - - err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, - binding, xa_limit_32b, &id_alloc_next, - GFP_KERNEL); - if (err < 0) - goto err_free_binding; - xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); refcount_set(&binding->ref, 1); @@ -206,26 +210,36 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_id; + goto err_free_binding; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... 
*/ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -270,24 +284,32 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; } virtual += len; } + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + return binding; err_free_chunks: gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); err_detach: dma_buf_detach(dmabuf, binding->attachment); -err_free_id: - xa_erase(&net_devmem_dmabuf_bindings, binding->id); err_free_binding: kfree(binding); err_put_dmabuf: @@ -295,6 +317,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + void net_devmem_get_net_iov(struct net_iov *niov) { net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); @@ -305,6 +342,49 @@ void net_devmem_put_net_iov(struct net_iov *niov) net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); } +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 946f2e0157467..67168aae5e5b3 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -23,8 +23,9 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. 
* * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in @@ -32,7 +33,10 @@ struct net_devmem_dmabuf_binding { * * net_devmem_get_net_iov() on dmabuf net_iovs will increment this * reference, making sure that the binding remains alive until all the - * net_iovs are no longer used. + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -48,6 +52,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -64,14 +76,17 @@ struct dmabuf_genpool_chunk_owner { dma_addr_t base_dma_addr; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack); +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -100,10 +115,10 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -112,7 +127,8 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) if (!refcount_dec_and_test(&binding->ref)) return; - __net_devmem_dmabuf_binding_free(binding); + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -123,6 +139,11 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; @@ -140,18 +161,23 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) { } -static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, + enum dma_data_direction direction, struct netlink_ext_ack *extack) { return 
ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -190,6 +216,19 @@ static inline bool net_is_devmem_iov(struct net_iov *niov) { return false; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 919ce851f1174..db0e9a6a4badc 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -876,7 +876,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -937,10 +938,74 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) return err; } -/* stub */ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { - return 0; + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + list_add(&binding->list, &priv->bindings); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; } void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 00c22bce98e44..4159107f1666c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1655,7 +1655,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1670,7 +1671,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = 
NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1693,7 +1694,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1723,7 +1724,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1738,7 +1740,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1842,7 +1844,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1861,7 +1864,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; diff --git a/net/core/sock.c b/net/core/sock.c index b64df2463300b..9dd2989040357 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3017,6 +3017,12 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) return -EPERM; sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6e18d7ec50624..a2705d454fd64 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1014,7 +1014,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 86c427f166367..b4ba21717e254 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1059,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1066,11 +1067,24 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + bool sockc_valid = true; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags), + .dmabuf_id = 0 }; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + 
if (unlikely(err)) + /* Don't return error until MSG_FASTOPEN has been + * processed; that may succeed even if the cmsg is + * invalid. + */ + sockc_valid = false; + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1078,7 +1092,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + sockc_valid && !!sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1087,12 +1102,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (sockc_valid && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (sockc_valid && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1131,14 +1161,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { - err = -EINVAL; - goto out_err; - } - } + if (!sockc_valid) + goto out_err; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -1258,7 +1282,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1339,6 +1364,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1356,6 +1383,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ef052ccd93800..7bd29a9ff0db8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1524,7 +1524,8 @@ static int __ip6_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d880965..6e7b727c781c8 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = 
msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, { if (zcopy) return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, - len); + &info->msg->msg_iter, len, NULL); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } From 2b1c0ab538d0564528961c21dd9bce52997a56bd Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:41 +0000 Subject: [PATCH 658/770] net: add devmem TCP TX documentation Add documentation outlining the usage and details of the devmem TCP TX API. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- Documentation/networking/devmem.rst | 150 +++++++++++++++++++++++++++- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst index eb678ca454968..a6cd7236bfbd2 100644 --- a/Documentation/networking/devmem.rst +++ b/Documentation/networking/devmem.rst @@ -62,15 +62,15 @@ More Info https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ -Interface -========= +RX Interface +============ Example ------- -tools/testing/selftests/net/ncdevmem.c:do_server shows an example of setting up -the RX path of this API. +./tools/testing/selftests/drivers/net/hw/ncdevmem:do_server shows an example of +setting up the RX path of this API. NIC Setup @@ -235,6 +235,148 @@ can be less than the tokens provided by the user in case of: (a) an internal kernel leak bug. (b) the user passed more than 1024 frags. +TX Interface +============ + + +Example +------- + +./tools/testing/selftests/drivers/net/hw/ncdevmem:do_client shows an example of +setting up the TX path of this API. + + +NIC Setup +--------- + +The user must bind a TX dmabuf to a given NIC using the netlink API:: + + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + + tx_dmabuf_id = rsp->id; + + +The netlink API returns a dmabuf_id: a unique ID that refers to this dmabuf +that has been bound. + +The user can unbind the dmabuf from the netdevice by closing the netlink socket +that established the binding. We do this so that the binding is automatically +unbound even if the userspace process crashes. + +Note that any reasonably well-behaved dmabuf from any exporter should work with +devmem TCP, even if the dmabuf is not actually backed by devmem. An example of +this is udmabuf, which wraps user memory (non-devmem) in a dmabuf. + +Socket Setup +------------ + +The user application must use MSG_ZEROCOPY flag when sending devmem TCP. Devmem +cannot be copied by the kernel, so the semantics of the devmem TX are similar +to the semantics of MSG_ZEROCOPY:: + + setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + +It is also recommended that the user binds the TX socket to the same interface +the dma-buf has been bound to via SO_BINDTODEVICE:: + + setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname) + 1); + + +Sending data +------------ + +Devmem data is sent using the SCM_DEVMEM_DMABUF cmsg. 
+ +The user should create a msghdr where, + +* iov_base is set to the offset into the dmabuf to start sending from +* iov_len is set to the number of bytes to be sent from the dmabuf + +The user passes the dma-buf id to send from via the dmabuf_tx_cmsg.dmabuf_id. + +The example below sends 1024 bytes from offset 100 into the dmabuf, and 2048 +from offset 2000 into the dmabuf. The dmabuf to send from is tx_dmabuf_id:: + + char ctrl_data[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))]; + struct dmabuf_tx_cmsg ddmabuf; + struct msghdr msg = {}; + struct cmsghdr *cmsg; + struct iovec iov[2]; + + iov[0].iov_base = (void*)100; + iov[0].iov_len = 1024; + iov[1].iov_base = (void*)2000; + iov[1].iov_len = 2048; + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct dmabuf_tx_cmsg)); + + ddmabuf.dmabuf_id = tx_dmabuf_id; + + *((struct dmabuf_tx_cmsg *)CMSG_DATA(cmsg)) = ddmabuf; + + sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + + +Reusing TX dmabufs +------------------ + +Similar to MSG_ZEROCOPY with regular memory, the user should not modify the +contents of the dma-buf while a send operation is in progress. This is because +the kernel does not keep a copy of the dmabuf contents. Instead, the kernel +will pin and send data from the buffer available to the userspace. + +Just as in MSG_ZEROCOPY, the kernel notifies the userspace of send completions +using MSG_ERRQUEUE:: + + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + int retries = 10; + __u32 hi, lo; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + serr = (void *)CMSG_DATA(cm); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stdout, "tx complete [%d,%d]\n", lo, hi); + } + } + +After the associated sendmsg has been completed, the dmabuf can be reused by +the userspace. + + Implementation & Caveats ======================== From 07a63931ad4a16835892e7d7b05155b5e056facc Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:42 +0000 Subject: [PATCH 659/770] net: enable driver support for netmem TX Drivers need to make sure not to pass netmem dma-addrs to the dma-mapping API in order to support netmem TX. Add netmem_dma_*() helpers that enable special handling of netmem dma-addrs, which drivers can use. Document in netmem.rst what drivers need to do to support netmem TX. 
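For illustration only, a minimal sketch of how a driver's TX map/unmap path might use these helpers; the 'pkt' bookkeeping struct and its 'dma'/'len' unmap fields are assumptions for the example, not part of this patch:

	/* Map a TX frag: net_iov frags resolve to the stored dmabuf
	 * dma-addr, real pages go through the regular dma-mapping API.
	 */
	dma_addr_t addr = skb_frag_dma_map(dev, frag, 0, len, DMA_TO_DEVICE);

	/* Record the unmap addr: for net_iov frags a zero addr is stored
	 * so the completion path knows there is nothing to unmap.
	 */
	netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt, dma, addr);

	/* On TX completion: delegates to dma_unmap_page_attrs() only for
	 * non-zero (page-backed) recorded addrs.
	 */
	netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma),
				    dma_unmap_len(pkt, len),
				    DMA_TO_DEVICE, 0);
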
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- .../networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 5 ++++ Documentation/networking/netmem.rst | 23 +++++++++++++++++-- include/linux/netdevice.h | 2 ++ include/net/netmem.h | 20 ++++++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index ca8605eb82ffc..c69cc89c958e0 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -10,6 +10,7 @@ Type Name fastpath_tx_acce =================================== =========================== =================== =================== =================================================================================== unsigned_long:32 priv_flags read_mostly __dev_queue_xmit(tx) unsigned_long:1 lltx read_mostly HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx) +unsigned long:1 netmem_tx:1; read_mostly char name[16] struct netdev_name_node* name_node struct dev_ifalias* ifalias diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index 5014f7cc1398b..02bd7536fc0ca 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -188,3 +188,8 @@ Redundancy) frames from one port to another in hardware. This should be set for devices which duplicate outgoing HSR (High-availability Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically frames in hardware. + +* netmem-tx + +This should be set for devices which support netmem TX. See +Documentation/networking/netmem.rst diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst index 7de21ddb54129..b63aded463370 100644 --- a/Documentation/networking/netmem.rst +++ b/Documentation/networking/netmem.rst @@ -19,8 +19,8 @@ Benefits of Netmem : * Simplified Development: Drivers interact with a consistent API, regardless of the underlying memory implementation. -Driver Requirements -=================== +Driver RX Requirements +====================== 1. The driver must support page_pool. @@ -77,3 +77,22 @@ Driver Requirements that purpose, but be mindful that some netmem types might have longer circulation times, such as when userspace holds a reference in zerocopy scenarios. + +Driver TX Requirements +====================== + +1. The Driver must not pass the netmem dma_addr to any of the dma-mapping APIs + directly. This is because netmem dma_addrs may come from a source like + dma-buf that is not compatible with the dma-mapping APIs. + + Helpers like netmem_dma_unmap_page_attrs() & netmem_dma_unmap_addr_set() + should be used in lieu of dma_unmap_page[_attrs](), dma_unmap_addr_set(). + The netmem variants will handle netmem dma_addrs correctly regardless of the + source, delegating to the dma-mapping APIs when appropriate. + + Not all dma-mapping APIs have netmem equivalents at the moment. If your + driver relies on a missing netmem API, feel free to add and propose to + netdev@, or reach out to the maintainers and/or almasrymina@google.com for + help adding the netmem API. + +2. 
Driver should declare support by setting `netdev->netmem_tx = true` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0321fd952f708..a661820a26c44 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1772,6 +1772,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2087,6 +2088,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; diff --git a/include/net/netmem.h b/include/net/netmem.h index ecb6b29c93f61..386164fb9c185 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,6 +8,7 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include #include #include @@ -276,4 +277,23 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) void get_netmem(netmem_ref netmem); void put_netmem(netmem_ref netmem); +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ From a0effe1fb8522156dac44ee21277f1a916c957df Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:43 +0000 Subject: [PATCH 660/770] gve: add netmem TX support to GVE DQO-RDA mode Use netmem_dma_*() helpers in gve_tx_dqo.c DQO-RDA paths to enable netmem TX support in that mode. Declare support for netmem TX in GVE DQO-RDA mode. 
Signed-off-by: Mina Almasry Acked-by: Harshitha Ramamurthy Signed-off-by: NipaLocal --- drivers/net/ethernet/google/gve/gve_main.c | 3 +++ drivers/net/ethernet/google/gve/gve_tx_dqo.c | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 446e4b6fd3f17..e1ffbd561fac6 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2659,6 +2659,9 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto abort_with_wq; + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) + dev->netmem_tx = true; + err = register_netdev(dev); if (err) goto abort_with_gve_init; diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index 2eba868d80370..a27f1574a7337 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -660,7 +660,8 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, goto err; dma_unmap_len_set(pkt, len[pkt->num_bufs], len); - dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); + netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt, + dma[pkt->num_bufs], addr); ++pkt->num_bufs; gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr, @@ -1038,8 +1039,9 @@ static void gve_unmap_packet(struct device *dev, dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); for (i = 1; i < pkt->num_bufs; i++) { - dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), - dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); + netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), + DMA_TO_DEVICE, 0); } pkt->num_bufs = 0; } From 1dd7b3244e65990757c99113a86e9322a362c541 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:44 +0000 Subject: [PATCH 661/770] net: check for driver support in netmem TX We should not enable netmem TX for drivers that don't declare support. Check for driver netmem TX support during devmem TX binding and fail if the driver does not have the functionality. Check for driver support in validate_xmit_skb as well. 
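As a rough sketch of the driver-facing contract (an assumption-level example, not taken from this patch): a driver whose TX paths already handle net_iov frags via the netmem_dma_*() helpers opts in with a single flag at probe time:

	/* Declare netmem TX support; otherwise TX dmabuf binding fails
	 * with -EOPNOTSUPP and unreadable skbs are dropped in
	 * validate_xmit_skb().
	 */
	netdev->netmem_tx = true;
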
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- net/core/dev.c | 34 ++++++++++++++++++++++++++++++++-- net/core/devmem.h | 6 ++++++ net/core/netdev-genl.c | 7 +++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index d1a8cad0c99c4..66f0c122de80e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3896,12 +3896,42 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } EXPORT_SYMBOL(skb_csum_hwoffload_help); +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } + +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; - if (!skb_frags_readable(skb)) - goto out_kfree_skb; + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); diff --git a/net/core/devmem.h b/net/core/devmem.h index 67168aae5e5b3..919e6ed28fdcd 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -229,6 +229,12 @@ net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, { return NULL; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index db0e9a6a4badc..119f4fbc0c944 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -982,6 +982,13 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, info->extack); if (IS_ERR(binding)) { From b98c53beaf800612b400e77eaf6d0a5ca1e90091 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Tue, 29 Apr 2025 03:26:45 +0000 Subject: [PATCH 662/770] selftests: ncdevmem: Implement devmem TCP TX Add support for devmem TX in ncdevmem. This is a combination of the ncdevmem from the devmem TCP series RFCv1, which included the TX path, and work by Stan to include the netlink API, refactored on top of his generic memory_provider support. 
Signed-off-by: Mina Almasry Signed-off-by: Stanislav Fomichev Acked-by: Stanislav Fomichev Signed-off-by: NipaLocal --- .../selftests/drivers/net/hw/devmem.py | 26 +- .../selftests/drivers/net/hw/ncdevmem.c | 300 +++++++++++++++++- 2 files changed, 311 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 3947e91571159..7fc686cf47a2d 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +from os import path from lib.py import ksft_run, ksft_exit from lib.py import ksft_eq, KsftSkipEx from lib.py import NetDrvEpEnv @@ -10,8 +11,7 @@ def require_devmem(cfg): if not hasattr(cfg, "_devmem_probed"): - port = rand_port() - probe_command = f"./ncdevmem -f {cfg.ifname}" + probe_command = f"{cfg.bin_local} -f {cfg.ifname}" cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 cfg._devmem_probed = True @@ -25,7 +25,7 @@ def check_rx(cfg) -> None: require_devmem(cfg) port = rand_port() - listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}" + listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}" with bkg(listen_cmd) as socat: wait_port_listen(port) @@ -34,9 +34,27 @@ def check_rx(cfg) -> None: ksft_eq(socat.stdout.strip(), "hello\nworld") +@ksft_disruptive +def check_tx(cfg) -> None: + cfg.require_ipver("6") + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"socat -U - TCP6-LISTEN:{port}" + + with bkg(listen_cmd, exit_wait=True) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_remote} -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") + + def main() -> None: with NetDrvEpEnv(__file__) as cfg: - ksft_run([check_rx], + cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run([check_rx, check_tx], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 2bf14ac2b8c62..f801a1b3545f0 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -9,22 +9,31 @@ * ncdevmem -s [-c ] -f eth1 -l -p 5201 * * On client: - * echo -n "hello\nworld" | nc -s 5201 -p 5201 + * echo -n "hello\nworld" | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 * - * Test data validation: + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. + * + * Test data validation (devmem TCP on RX only): * * On server: * ncdevmem -s [-c ] -f eth1 -l -p 5201 -v 7 * * On client: * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ - * tr \\n \\0 | \ - * head -c 5G | \ + * head -c 1G | \ * nc 5201 -p 5201 * + * Test data validation (devmem TCP on RX and TX, validation happens on RX): * - * Note this is compatible with regular netcat. i.e. the sender or receiver can - * be replaced with regular netcat to test the RX or TX path in isolation. 
+ * On server: + * ncdevmem -s [-c ] -l -p 5201 -v 8 -f eth1 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06\\x07) | \ + * head -c 1M | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 */ #define _GNU_SOURCE #define __EXPORTED_HEADERS__ @@ -40,15 +49,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -79,6 +91,8 @@ static int num_queues = -1; static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; +static uint32_t tx_dmabuf_id; +static int waittime_ms = 500; struct memory_buffer { int fd; @@ -92,6 +106,8 @@ struct memory_buffer { struct memory_provider { struct memory_buffer *(*alloc)(size_t size); void (*free)(struct memory_buffer *ctx); + void (*memcpy_to_device)(struct memory_buffer *dst, size_t off, + void *src, int n); void (*memcpy_from_device)(void *dst, struct memory_buffer *src, size_t off, int n); }; @@ -152,6 +168,20 @@ static void udmabuf_free(struct memory_buffer *ctx) free(ctx); } +static void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, + void *src, int n) +{ + struct dma_buf_sync sync = {}; + + sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); + + memcpy(dst->buf_mem + off, src, n); + + sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); +} + static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, size_t off, int n) { @@ -169,6 +199,7 @@ static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, static struct memory_provider udmabuf_memory_provider = { .alloc = udmabuf_alloc, .free = udmabuf_free, + .memcpy_to_device = udmabuf_memcpy_to_device, .memcpy_from_device = udmabuf_memcpy_from_device, }; @@ -187,14 +218,16 @@ void validate_buffer(void *line, size_t size) { static unsigned char seed = 1; unsigned char *ptr = line; - int errors = 0; + unsigned char expected; + static int errors; size_t i; for (i = 0; i < size; i++) { - if (ptr[i] != seed) { + expected = seed ? 
seed : '\n'; + if (ptr[i] != expected) { fprintf(stderr, "Failed validation: expected=%u, actual=%u, index=%lu\n", - seed, ptr[i], i); + expected, ptr[i], i); errors++; if (errors > 20) error(1, 0, "validation failed."); @@ -393,6 +426,49 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, return -1; } +static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + struct ynl_sock **ys) +{ + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + if (!rsp) { + perror("netdev_bind_tx"); + goto err_close; + } + + if (!rsp->_present.id) { + perror("id not present"); + goto err_close; + } + + fprintf(stderr, "got tx dmabuf id=%d\n", rsp->id); + tx_dmabuf_id = rsp->id; + + netdev_bind_tx_req_free(req); + netdev_bind_tx_rsp_free(rsp); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_tx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + static void enable_reuseaddr(int fd) { int opt = 1; @@ -431,7 +507,7 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) return 0; } -int do_server(struct memory_buffer *mem) +static int do_server(struct memory_buffer *mem) { char ctrl_data[sizeof(int) * 20000]; struct netdev_queue_id *queues; @@ -685,6 +761,206 @@ void run_devmem_tests(void) provider->free(mem); } +static uint64_t gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000ULL); +} + +static int do_poll(int fd) +{ + struct pollfd pfd; + int ret; + + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, waittime_ms); + if (ret == -1) + error(1, errno, "poll"); + + return ret && (pfd.revents & POLLERR); +} + +static void wait_compl(int fd) +{ + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + __u32 hi, lo; + int ret; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) + continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret < 0) { + if (errno == EAGAIN) + continue; + error(1, errno, "recvmsg(MSG_ERRQUEUE)"); + return; + } + if (msg.msg_flags & MSG_CTRUNC) + error(1, 0, "MSG_CTRUNC\n"); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_IP && + cm->cmsg_level != SOL_IPV6) + continue; + if (cm->cmsg_level == SOL_IP && + cm->cmsg_type != IP_RECVERR) + continue; + if (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type != IPV6_RECVERR) + continue; + + serr = (void *)CMSG_DATA(cm); + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "wrong origin %u", serr->ee_origin); + if (serr->ee_errno != 0) + error(1, 0, "wrong errno %d", serr->ee_errno); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stderr, "tx complete [%d,%d]\n", lo, hi); + return; + } + } + + error(1, 0, "did not receive tx completion"); +} + +static int do_client(struct memory_buffer *mem) +{ + char ctrl_data[CMSG_SPACE(sizeof(__u32))]; + struct sockaddr_in6 server_sin; + struct sockaddr_in6 client_sin; + struct ynl_sock *ys = NULL; + struct msghdr msg = {}; 
+ ssize_t line_size = 0; + struct cmsghdr *cmsg; + struct iovec iov[2]; + char *line = NULL; + unsigned long mid; + size_t len = 0; + int socket_fd; + __u32 ddmabuf; + int opt = 1; + int ret; + + ret = parse_address(server_ip, atoi(port), &server_sin); + if (ret < 0) + error(1, 0, "parse server address"); + + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (socket_fd < 0) + error(1, socket_fd, "create socket"); + + enable_reuseaddr(socket_fd); + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, + strlen(ifname) + 1); + if (ret) + error(1, errno, "bindtodevice"); + + if (bind_tx_queue(ifindex, mem->fd, &ys)) + error(1, 0, "Failed to bind\n"); + + if (client_ip) { + ret = parse_address(client_ip, atoi(port), &client_sin); + if (ret < 0) + error(1, 0, "parse client address"); + + ret = bind(socket_fd, &client_sin, sizeof(client_sin)); + if (ret) + error(1, errno, "bind"); + } + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + if (ret) + error(1, errno, "set sock opt"); + + fprintf(stderr, "Connect to %s %d (via %s)\n", server_ip, + ntohs(server_sin.sin6_port), ifname); + + ret = connect(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) + error(1, errno, "connect"); + + while (1) { + free(line); + line = NULL; + line_size = getline(&line, &len, stdin); + + if (line_size < 0) + break; + + mid = (line_size / 2) + 1; + + iov[0].iov_base = (void *)1; + iov[0].iov_len = mid; + iov[1].iov_base = (void *)(mid + 2); + iov[1].iov_len = line_size - mid; + + provider->memcpy_to_device(mem, (size_t)iov[0].iov_base, line, + iov[0].iov_len); + provider->memcpy_to_device(mem, (size_t)iov[1].iov_base, + line + iov[0].iov_len, + iov[1].iov_len); + + fprintf(stderr, + "read line_size=%ld iov[0].iov_base=%lu, iov[0].iov_len=%lu, iov[1].iov_base=%lu, iov[1].iov_len=%lu\n", + line_size, (unsigned long)iov[0].iov_base, + iov[0].iov_len, (unsigned long)iov[1].iov_base, + iov[1].iov_len); + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + + ddmabuf = tx_dmabuf_id; + + *((__u32 *)CMSG_DATA(cmsg)) = ddmabuf; + + ret = sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + if (ret < 0) + error(1, errno, "Failed sendmsg"); + + fprintf(stderr, "sendmsg_ret=%d\n", ret); + + if (ret != line_size) + error(1, errno, "Did not send all bytes"); + + wait_compl(socket_fd); + } + + fprintf(stderr, "%s: tx ok\n", TEST_PREFIX); + + free(line); + close(socket_fd); + + if (ys) + ynl_sock_destroy(ys); + + return 0; +} + int main(int argc, char *argv[]) { struct memory_buffer *mem; @@ -728,6 +1004,8 @@ int main(int argc, char *argv[]) ifindex = if_nametoindex(ifname); + fprintf(stderr, "using ifindex=%u\n", ifindex); + if (!server_ip && !client_ip) { if (start_queue < 0 && num_queues < 0) { num_queues = rxq_num(ifindex); @@ -778,7 +1056,7 @@ int main(int argc, char *argv[]) error(1, 0, "Missing -p argument\n"); mem = provider->alloc(getpagesize() * NUM_PAGES); - ret = is_server ? do_server(mem) : 1; + ret = is_server ? do_server(mem) : do_client(mem); provider->free(mem); return ret; From b924b07e8df75f3552c878bc73a4e0bad20db261 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 29 Apr 2025 10:08:04 -0700 Subject: [PATCH 663/770] selftests: net: exit cleanly on SIGTERM / timeout ksft runner sends 2 SIGTERMs in a row if a test runs out of time. 
Handle this in a similar way to how we handle SIGINT - cleanup and stop running further tests. Because we get 2 signals we need a bit of logic to ignore the subsequent one; they come immediately one after the other (due to commit 9616cb34b08e ("kselftest/runner.sh: Propagate SIGTERM to runner child")). This change makes sure we run cleanup (scheduled defer()s) and also print a stack trace on SIGTERM, which doesn't happen by default. Tests occasionally hang in NIPA and it's impossible to tell what they are waiting for or doing. Signed-off-by: Jakub Kicinski Reviewed-by: Willem de Bruijn Signed-off-by: NipaLocal --- tools/testing/selftests/net/lib/py/ksft.py | 25 +++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/lib/py/ksft.py b/tools/testing/selftests/net/lib/py/ksft.py index 3cfad0fd45703..1b815768bf8ac 100644 --- a/tools/testing/selftests/net/lib/py/ksft.py +++ b/tools/testing/selftests/net/lib/py/ksft.py @@ -3,6 +3,7 @@ import builtins import functools import inspect +import signal import sys import time import traceback @@ -26,6 +27,10 @@ class KsftXfailEx(Exception): pass +class KsftTerminate(KeyboardInterrupt): + pass + + def ksft_pr(*objs, **kwargs): print("#", *objs, **kwargs) @@ -193,6 +198,17 @@ def get_bool(env, name): return env +def _ksft_intr(signum, frame): + # ksft runner.sh sends 2 SIGTERMs in a row on a timeout + # if we don't ignore the second one it will stop us from handling cleanup + global term_cnt + term_cnt += 1 + if term_cnt == 1: + raise KsftTerminate() + else: + ksft_pr(f"Ignoring SIGTERM (cnt: {term_cnt}), already exiting...") + + def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases = cases or [] @@ -205,6 +221,10 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cases.append(value) break + global term_cnt + term_cnt = 0 + prev_sigterm = signal.signal(signal.SIGTERM, _ksft_intr) + totals = {"pass": 0, "fail": 0, "skip": 0, "xfail": 0} print("TAP version 13") @@ -229,11 +249,12 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): cnt_key = 'xfail' except BaseException as e: stop |= isinstance(e, KeyboardInterrupt) + stop |= isinstance(e, KsftTerminate) tb = traceback.format_exc() for line in tb.strip().split('\n'): ksft_pr("Exception|", line) if stop: - ksft_pr("Stopping tests due to KeyboardInterrupt.") + ksft_pr(f"Stopping tests due to {type(e).__name__}.") KSFT_RESULT = False cnt_key = 'fail' @@ -248,6 +269,8 @@ def ksft_run(cases=None, globs=None, case_pfx=None, args=()): if stop: break + signal.signal(signal.SIGTERM, prev_sigterm) + print( f"# Totals: pass:{totals['pass']} fail:{totals['fail']} xfail:{totals['xfail']} xpass:0 skip:{totals['skip']} error:0" ) From f56b444784222da4d06ba4f67308d63589cdc3e2 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:00 +0200 Subject: [PATCH 664/770] net: dsa: b53: allow leaky reserved multicast Allow reserved multicast to ignore VLAN membership so STP and other management protocols work without a PVID VLAN configured when using a VLAN-aware bridge. 
Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index e5ba718979064..62866165ad03c 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -373,9 +373,11 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, b53_read8(dev, B53_VLAN_PAGE, B53_VLAN_CTRL5, &vc5); } + vc1 &= ~VC1_RX_MCST_FWD_EN; + if (enable) { vc0 |= VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID; - vc1 |= VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN; + vc1 |= VC1_RX_MCST_UNTAG_EN; vc4 &= ~VC4_ING_VID_CHECK_MASK; if (enable_filtering) { vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; @@ -393,7 +395,7 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, } else { vc0 &= ~(VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID); - vc1 &= ~(VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN); + vc1 &= ~VC1_RX_MCST_UNTAG_EN; vc4 &= ~VC4_ING_VID_CHECK_MASK; vc5 &= ~VC5_DROP_VTABLE_MISS; From ef60c2cc77d26eaedae35922eaf9d114428f60e5 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:01 +0200 Subject: [PATCH 665/770] net: dsa: b53: keep CPU port always tagged again The Broadcom management header does not carry the original VLAN tag state information, just the ingress port, so for untagged frames we do not know from which VLAN they originated. Therefore keep the CPU port always tagged except for VLAN 0. Fixes the following setup: $ ip link add br0 type bridge vlan_filtering 1 $ ip link set sw1p1 master br0 $ bridge vlan add dev br0 pvid untagged self $ ip link add sw1p2.10 link sw1p2 type vlan id 10 Where VID 10 would stay untagged on the CPU port. Fixes: 2c32a3d3c233 ("net: dsa: b53: Do not force CPU to be always tagged") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 62866165ad03c..9d4fb54b4ced6 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1135,6 +1135,11 @@ static int b53_setup(struct dsa_switch *ds) */ ds->untag_bridge_pvid = dev->tag_protocol == DSA_TAG_PROTO_NONE; + /* The switch does not tell us the original VLAN for untagged + * packets, so keep the CPU port always tagged. + */ + ds->untag_vlan_aware_bridge_pvid = true; + ret = b53_reset_switch(dev); if (ret) { dev_err(ds->dev, "failed to reset switch\n"); @@ -1545,6 +1550,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port, if (vlan->vid == 0 && vlan->vid == b53_default_pvid(dev)) untagged = true; + if (vlan->vid > 0 && dsa_is_cpu_port(ds, port)) + untagged = false; + vl->members |= BIT(port); if (untagged && !b53_vlan_port_needs_forced_tagged(ds, port)) vl->untag |= BIT(port); From b823c614fa05ddb69f6ecb358a4dc086313bdf73 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:02 +0200 Subject: [PATCH 666/770] net: dsa: b53: fix clearing PVID of a port Currently the PVID of a port is only set when adding/updating VLANs with PVID set or removing VLANs, but not when clearing the PVID flag of a VLAN. E.g. 
the following flow

$ ip link add br0 type bridge vlan_filtering 1
$ ip link set sw1p1 master br0
$ bridge vlan add dev sw1p1 vid 10 pvid untagged
$ bridge vlan add dev sw1p1 vid 10 untagged

would keep the PVID set as 10, despite the flag being cleared. Fix this
by checking if we need to unset the PVID on vlan updates.

Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support")
Signed-off-by: Jonas Gorski
Signed-off-by: NipaLocal
---
 drivers/net/dsa/b53/b53_common.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 9d4fb54b4ced6..65d74c455c573 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1537,12 +1537,21 @@ int b53_vlan_add(struct dsa_switch *ds, int port,
 	bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED;
 	bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID;
 	struct b53_vlan *vl;
+	u16 old_pvid, new_pvid;
 	int err;
 
 	err = b53_vlan_prepare(ds, port, vlan);
 	if (err)
 		return err;
 
+	b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &old_pvid);
+	if (pvid)
+		new_pvid = vlan->vid;
+	else if (!pvid && vlan->vid == old_pvid)
+		new_pvid = b53_default_pvid(dev);
+	else
+		new_pvid = old_pvid;
+
 	vl = &dev->vlans[vlan->vid];
 
 	b53_get_vlan_entry(dev, vlan->vid, vl);
@@ -1562,9 +1571,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port,
 	b53_set_vlan_entry(dev, vlan->vid, vl);
 	b53_fast_age_vlan(dev, vlan->vid);
 
-	if (pvid && !dsa_is_cpu_port(ds, port)) {
+	if (!dsa_is_cpu_port(ds, port) && new_pvid != old_pvid) {
 		b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port),
-			    vlan->vid);
+			    new_pvid);
 		b53_fast_age_vlan(dev, vlan->vid);
 	}

From 46d063938f60881c69fe5b9f7eeac08c58edc914 Mon Sep 17 00:00:00 2001
From: Jonas Gorski
Date: Tue, 29 Apr 2025 22:17:03 +0200
Subject: [PATCH 667/770] net: dsa: b53: fix flushing old pvid VLAN on pvid
 change

Presumably the intention here was to flush the VLAN of the old pvid,
not the added VLAN again, which we already flushed before.

Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support")
Signed-off-by: Jonas Gorski
Signed-off-by: NipaLocal
---
 drivers/net/dsa/b53/b53_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 65d74c455c573..c67c0b5fbc1be 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1574,7 +1574,7 @@ int b53_vlan_add(struct dsa_switch *ds, int port,
 	if (!dsa_is_cpu_port(ds, port) && new_pvid != old_pvid) {
 		b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port),
 			    new_pvid);
-		b53_fast_age_vlan(dev, vlan->vid);
+		b53_fast_age_vlan(dev, old_pvid);
 	}
 
 	return 0;

From 54a66d0dc89d86ededaad519c4778456b708b573 Mon Sep 17 00:00:00 2001
From: Jonas Gorski
Date: Tue, 29 Apr 2025 22:17:04 +0200
Subject: [PATCH 668/770] net: dsa: b53: fix VLAN ID for untagged vlan on
 bridge leave

The untagged default VLAN is added to the default vlan, which may be 1,
but we modify the VLAN 0 entry on bridge leave. Fix this to use the
correct VLAN entry for the default pvid.
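
For reference, the default PVID is chip dependent, which is why
indexing dev->vlans[0] is not always correct. A sketch of the existing
helper from the same file (quoted from memory; the exact shape may
differ between kernel versions):

static u16 b53_default_pvid(struct b53_device *dev)
{
	/* older 5325/5365 switches use VID 1 as the default PVID */
	if (is5325(dev) || is5365(dev))
		return 1;
	else
		return 0;
}
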
Fixes: fea83353177a ("net: dsa: b53: Fix default VLAN ID")
Signed-off-by: Jonas Gorski
Signed-off-by: NipaLocal
---
 drivers/net/dsa/b53/b53_common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index c67c0b5fbc1be..c60b552b945ce 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(b53_br_join);
 void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge)
 {
 	struct b53_device *dev = ds->priv;
-	struct b53_vlan *vl = &dev->vlans[0];
+	struct b53_vlan *vl;
 	s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index;
 	unsigned int i;
 	u16 pvlan, reg, pvid;
@@ -2012,6 +2012,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge)
 	dev->ports[port].vlan_ctl_mask = pvlan;
 
 	pvid = b53_default_pvid(dev);
+	vl = &dev->vlans[pvid];
 
 	/* Make this port join all VLANs without VLAN entries */
 	if (is58xx(dev)) {

From 3700a6ff3a2de4abb34888162cdbe84971650091 Mon Sep 17 00:00:00 2001
From: Jonas Gorski
Date: Tue, 29 Apr 2025 22:17:05 +0200
Subject: [PATCH 669/770] net: dsa: b53: always rejoin default untagged VLAN
 on bridge leave

While JOIN_ALL_VLAN allows joining all VLANs, we still need to keep the
default VLAN enabled so that untagged traffic stays untagged. So rejoin
the default VLAN even for switches with JOIN_ALL_VLAN support.

Fixes: 48aea33a77ab ("net: dsa: b53: Add JOIN_ALL_VLAN support")
Signed-off-by: Jonas Gorski
Signed-off-by: NipaLocal
---
 drivers/net/dsa/b53/b53_common.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index c60b552b945ce..4871e117f5ef0 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -2021,12 +2021,12 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge)
 		if (!(reg & BIT(cpu_port)))
 			reg |= BIT(cpu_port);
 		b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
-	} else {
-		b53_get_vlan_entry(dev, pvid, vl);
-		vl->members |= BIT(port) | BIT(cpu_port);
-		vl->untag |= BIT(port) | BIT(cpu_port);
-		b53_set_vlan_entry(dev, pvid, vl);
 	}
+
+	b53_get_vlan_entry(dev, pvid, vl);
+	vl->members |= BIT(port) | BIT(cpu_port);
+	vl->untag |= BIT(port) | BIT(cpu_port);
+	b53_set_vlan_entry(dev, pvid, vl);
 }
 EXPORT_SYMBOL(b53_br_leave);

From 2a90fc1aced87febe3beacc0a735986069ef4378 Mon Sep 17 00:00:00 2001
From: Jonas Gorski
Date: Tue, 29 Apr 2025 22:17:06 +0200
Subject: [PATCH 670/770] net: dsa: b53: do not allow to configure VLAN 0

Since we cannot set forwarding destinations per VLAN, we should not
have a VLAN 0 configured, as it would allow untagged traffic to work
across ports on VLAN-aware bridges regardless of whether a PVID
untagged VLAN exists.

So remove the VLAN 0 on join, and re-add it on leave. But only do so if
we have a VLAN-aware bridge, as without it, untagged traffic would
become tagged with VID 0 on a VLAN-unaware bridge.
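
A sketch of the leak this prevents (port names are assumptions):

$ ip link add br0 type bridge vlan_filtering 1
$ ip link set sw1p1 master br0
$ ip link set sw1p2 master br0
$ bridge vlan del dev sw1p1 vid 1
$ bridge vlan del dev sw1p2 vid 1

With no PVID untagged VLAN left on the ports, untagged traffic between
sw1p1 and sw1p2 should be dropped, but the VLAN 0 entry kept it
flowing.
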
Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 36 ++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 4871e117f5ef0..0b28791cca526 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1544,6 +1544,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port, if (err) return err; + if (vlan->vid == 0) + return 0; + b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &old_pvid); if (pvid) new_pvid = vlan->vid; @@ -1556,10 +1559,7 @@ int b53_vlan_add(struct dsa_switch *ds, int port, b53_get_vlan_entry(dev, vlan->vid, vl); - if (vlan->vid == 0 && vlan->vid == b53_default_pvid(dev)) - untagged = true; - - if (vlan->vid > 0 && dsa_is_cpu_port(ds, port)) + if (dsa_is_cpu_port(ds, port)) untagged = false; vl->members |= BIT(port); @@ -1589,6 +1589,9 @@ int b53_vlan_del(struct dsa_switch *ds, int port, struct b53_vlan *vl; u16 pvid; + if (vlan->vid == 0) + return 0; + b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); vl = &dev->vlans[vlan->vid]; @@ -1935,8 +1938,9 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge, bool *tx_fwd_offload, struct netlink_ext_ack *extack) { struct b53_device *dev = ds->priv; + struct b53_vlan *vl; s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index; - u16 pvlan, reg; + u16 pvlan, reg, pvid; unsigned int i; /* On 7278, port 7 which connects to the ASP should only receive @@ -1945,6 +1949,9 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge, if (dev->chip_id == BCM7278_DEVICE_ID && port == 7) return -EINVAL; + pvid = b53_default_pvid(dev); + vl = &dev->vlans[pvid]; + /* Make this port leave the all VLANs join since we will have proper * VLAN entries from now on */ @@ -1956,6 +1963,15 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge, b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg); } + if (ds->vlan_filtering) { + b53_get_vlan_entry(dev, pvid, vl); + vl->members &= ~BIT(port); + if (vl->members == BIT(cpu_port)) + vl->members &= ~BIT(cpu_port); + vl->untag = vl->members; + b53_set_vlan_entry(dev, pvid, vl); + } + b53_read16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), &pvlan); b53_for_each_port(dev, i) { @@ -2023,10 +2039,12 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg); } - b53_get_vlan_entry(dev, pvid, vl); - vl->members |= BIT(port) | BIT(cpu_port); - vl->untag |= BIT(port) | BIT(cpu_port); - b53_set_vlan_entry(dev, pvid, vl); + if (ds->vlan_filtering) { + b53_get_vlan_entry(dev, pvid, vl); + vl->members |= BIT(port) | BIT(cpu_port); + vl->untag |= BIT(port) | BIT(cpu_port); + b53_set_vlan_entry(dev, pvid, vl); + } } EXPORT_SYMBOL(b53_br_leave); From c8b400d4be2ea91831705e54892cd948eb9c413e Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:07 +0200 Subject: [PATCH 671/770] net: dsa: b53: do not program vlans when vlan filtering is off Documentation/networking/switchdev.rst says: - with VLAN filtering turned off: the bridge is strictly VLAN unaware and its data path will process all Ethernet frames as if they are VLAN-untagged. The bridge VLAN database can still be modified, but the modifications should have no effect while VLAN filtering is turned off. 
This breaks if we immediately apply the VLAN configuration, so skip
writing it when vlan_filtering is off.

Fixes: 0ee2af4ebbe3 ("net: dsa: set configure_vlan_while_not_filtering to true by default")
Signed-off-by: Jonas Gorski
Signed-off-by: NipaLocal
---
 drivers/net/dsa/b53/b53_common.c | 48 +++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 0b28791cca526..ee2f1be626184 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -1547,6 +1547,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port,
 	if (vlan->vid == 0)
 		return 0;
 
+	if (!ds->vlan_filtering)
+		return 0;
+
 	b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &old_pvid);
 	if (pvid)
 		new_pvid = vlan->vid;
@@ -1592,6 +1595,9 @@ int b53_vlan_del(struct dsa_switch *ds, int port,
 	if (vlan->vid == 0)
 		return 0;
 
+	if (!ds->vlan_filtering)
+		return 0;
+
 	b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid);
 
 	vl = &dev->vlans[vlan->vid];
@@ -1952,18 +1958,20 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge,
 	pvid = b53_default_pvid(dev);
 	vl = &dev->vlans[pvid];
 
-	/* Make this port leave the all VLANs join since we will have proper
-	 * VLAN entries from now on
-	 */
-	if (is58xx(dev)) {
-		b53_read16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, &reg);
-		reg &= ~BIT(port);
-		if ((reg & BIT(cpu_port)) == BIT(cpu_port))
-			reg &= ~BIT(cpu_port);
-		b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
-	}
-
 	if (ds->vlan_filtering) {
+		/* Make this port leave the all VLANs join since we will have
+		 * proper VLAN entries from now on
+		 */
+		if (is58xx(dev)) {
+			b53_read16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN,
+				   &reg);
+			reg &= ~BIT(port);
+			if ((reg & BIT(cpu_port)) == BIT(cpu_port))
+				reg &= ~BIT(cpu_port);
+			b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN,
+				    reg);
+		}
+
 		b53_get_vlan_entry(dev, pvid, vl);
 		vl->members &= ~BIT(port);
 		if (vl->members == BIT(cpu_port))
@@ -2030,16 +2038,16 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge)
 	pvid = b53_default_pvid(dev);
 	vl = &dev->vlans[pvid];
 
-	/* Make this port join all VLANs without VLAN entries */
-	if (is58xx(dev)) {
-		b53_read16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, &reg);
-		reg |= BIT(port);
-		if (!(reg & BIT(cpu_port)))
-			reg |= BIT(cpu_port);
-		b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
-	}
-
 	if (ds->vlan_filtering) {
+		/* Make this port join all VLANs without VLAN entries */
+		if (is58xx(dev)) {
+			b53_read16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, &reg);
+			reg |= BIT(port);
+			if (!(reg & BIT(cpu_port)))
+				reg |= BIT(cpu_port);
+			b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg);
+		}
+
 		b53_get_vlan_entry(dev, pvid, vl);
 		vl->members |= BIT(port) | BIT(cpu_port);
 		vl->untag |= BIT(port) | BIT(cpu_port);

From ba9ae282265b6f5bd09c657a89cf37082c9c0fea Mon Sep 17 00:00:00 2001
From: Jonas Gorski
Date: Tue, 29 Apr 2025 22:17:08 +0200
Subject: [PATCH 672/770] net: dsa: b53: fix toggling vlan_filtering

To allow runtime switching between vlan aware and vlan non-aware mode,
we need to properly keep track of any bridge VLAN configuration.
Likewise, we need to know when we actually switch between both modes,
to not have to rewrite the full VLAN table every time we update the
VLANs.

So keep track of the current vlan_filtering mode, and on changes, apply
the appropriate VLAN configuration.
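
E.g. the following sequence (a sketch; sw1p1 is an assumption) relies
on the recorded configuration being replayed:

$ ip link add br0 type bridge vlan_filtering 0
$ ip link set sw1p1 master br0
$ bridge vlan add dev sw1p1 vid 10
$ ip link set br0 type bridge vlan_filtering 1

The VID 10 entry is only recorded in software while vlan_filtering is
off, and must be programmed into the hardware VLAN table once filtering
is switched on.
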
Fixes: 0ee2af4ebbe3 ("net: dsa: set configure_vlan_while_not_filtering to true by default") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 104 ++++++++++++++++++++++--------- drivers/net/dsa/b53/b53_priv.h | 2 + 2 files changed, 75 insertions(+), 31 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index ee2f1be626184..0a7749b416f77 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -763,6 +763,22 @@ static bool b53_vlan_port_needs_forced_tagged(struct dsa_switch *ds, int port) return dev->tag_protocol == DSA_TAG_PROTO_NONE && dsa_is_cpu_port(ds, port); } +static bool b53_vlan_port_may_join_untagged(struct dsa_switch *ds, int port) +{ + struct b53_device *dev = ds->priv; + struct dsa_port *dp; + + if (!dev->vlan_filtering) + return true; + + dp = dsa_to_port(ds, port); + + if (dsa_port_is_cpu(dp)) + return true; + + return dp->bridge == NULL; +} + int b53_configure_vlan(struct dsa_switch *ds) { struct b53_device *dev = ds->priv; @@ -781,7 +797,7 @@ int b53_configure_vlan(struct dsa_switch *ds) b53_do_vlan_op(dev, VTA_CMD_CLEAR); } - b53_enable_vlan(dev, -1, dev->vlan_enabled, ds->vlan_filtering); + b53_enable_vlan(dev, -1, dev->vlan_enabled, dev->vlan_filtering); /* Create an untagged VLAN entry for the default PVID in case * CONFIG_VLAN_8021Q is disabled and there are no calls to @@ -789,26 +805,39 @@ int b53_configure_vlan(struct dsa_switch *ds) * entry. Do this only when the tagging protocol is not * DSA_TAG_PROTO_NONE */ + v = &dev->vlans[def_vid]; b53_for_each_port(dev, i) { - v = &dev->vlans[def_vid]; - v->members |= BIT(i); + if (!b53_vlan_port_may_join_untagged(ds, i)) + continue; + + vl.members |= BIT(i); if (!b53_vlan_port_needs_forced_tagged(ds, i)) - v->untag = v->members; - b53_write16(dev, B53_VLAN_PAGE, - B53_VLAN_PORT_DEF_TAG(i), def_vid); + vl.untag = vl.members; + b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(i), + def_vid); } + b53_set_vlan_entry(dev, def_vid, &vl); - /* Upon initial call we have not set-up any VLANs, but upon - * system resume, we need to restore all VLAN entries. - */ - for (vid = def_vid; vid < dev->num_vlans; vid++) { - v = &dev->vlans[vid]; + if (dev->vlan_filtering) { + /* Upon initial call we have not set-up any VLANs, but upon + * system resume, we need to restore all VLAN entries. 
+ */ + for (vid = def_vid + 1; vid < dev->num_vlans; vid++) { + v = &dev->vlans[vid]; - if (!v->members) - continue; + if (!v->members) + continue; + + b53_set_vlan_entry(dev, vid, v); + b53_fast_age_vlan(dev, vid); + } - b53_set_vlan_entry(dev, vid, v); - b53_fast_age_vlan(dev, vid); + b53_for_each_port(dev, i) { + if (!dsa_is_cpu_port(ds, i)) + b53_write16(dev, B53_VLAN_PAGE, + B53_VLAN_PORT_DEF_TAG(i), + dev->ports[i].pvid); + } } return 0; @@ -1127,7 +1156,9 @@ EXPORT_SYMBOL(b53_setup_devlink_resources); static int b53_setup(struct dsa_switch *ds) { struct b53_device *dev = ds->priv; + struct b53_vlan *vl; unsigned int port; + u16 pvid; int ret; /* Request bridge PVID untagged when DSA_TAG_PROTO_NONE is set @@ -1146,6 +1177,15 @@ static int b53_setup(struct dsa_switch *ds) return ret; } + /* setup default vlan for filtering mode */ + pvid = b53_default_pvid(dev); + vl = &dev->vlans[pvid]; + b53_for_each_port(dev, port) { + vl->members |= BIT(port); + if (!b53_vlan_port_needs_forced_tagged(ds, port)) + vl->untag |= BIT(port); + } + b53_reset_mib(dev); ret = b53_apply_config(dev); @@ -1499,7 +1539,10 @@ int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering, { struct b53_device *dev = ds->priv; - b53_enable_vlan(dev, port, dev->vlan_enabled, vlan_filtering); + if (dev->vlan_filtering != vlan_filtering) { + dev->vlan_filtering = vlan_filtering; + b53_apply_config(dev); + } return 0; } @@ -1524,7 +1567,7 @@ static int b53_vlan_prepare(struct dsa_switch *ds, int port, if (vlan->vid >= dev->num_vlans) return -ERANGE; - b53_enable_vlan(dev, port, true, ds->vlan_filtering); + b53_enable_vlan(dev, port, true, dev->vlan_filtering); return 0; } @@ -1547,21 +1590,17 @@ int b53_vlan_add(struct dsa_switch *ds, int port, if (vlan->vid == 0) return 0; - if (!ds->vlan_filtering) - return 0; - - b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &old_pvid); + old_pvid = dev->ports[port].pvid; if (pvid) new_pvid = vlan->vid; else if (!pvid && vlan->vid == old_pvid) new_pvid = b53_default_pvid(dev); else new_pvid = old_pvid; + dev->ports[port].pvid = new_pvid; vl = &dev->vlans[vlan->vid]; - b53_get_vlan_entry(dev, vlan->vid, vl); - if (dsa_is_cpu_port(ds, port)) untagged = false; @@ -1571,6 +1610,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port, else vl->untag &= ~BIT(port); + if (!dev->vlan_filtering) + return 0; + b53_set_vlan_entry(dev, vlan->vid, vl); b53_fast_age_vlan(dev, vlan->vid); @@ -1595,23 +1637,22 @@ int b53_vlan_del(struct dsa_switch *ds, int port, if (vlan->vid == 0) return 0; - if (!ds->vlan_filtering) - return 0; - - b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); + pvid = dev->ports[port].pvid; vl = &dev->vlans[vlan->vid]; - b53_get_vlan_entry(dev, vlan->vid, vl); - vl->members &= ~BIT(port); if (pvid == vlan->vid) pvid = b53_default_pvid(dev); + dev->ports[port].pvid = pvid; if (untagged && !b53_vlan_port_needs_forced_tagged(ds, port)) vl->untag &= ~(BIT(port)); + if (!dev->vlan_filtering) + return 0; + b53_set_vlan_entry(dev, vlan->vid, vl); b53_fast_age_vlan(dev, vlan->vid); @@ -1958,7 +1999,7 @@ int b53_br_join(struct dsa_switch *ds, int port, struct dsa_bridge bridge, pvid = b53_default_pvid(dev); vl = &dev->vlans[pvid]; - if (ds->vlan_filtering) { + if (dev->vlan_filtering) { /* Make this port leave the all VLANs join since we will have * proper VLAN entries from now on */ @@ -2038,7 +2079,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) pvid = b53_default_pvid(dev); vl = &dev->vlans[pvid]; - if 
(ds->vlan_filtering) { + if (dev->vlan_filtering) { /* Make this port join all VLANs without VLAN entries */ if (is58xx(dev)) { b53_read16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, ®); @@ -2803,6 +2844,7 @@ struct b53_device *b53_switch_alloc(struct device *base, ds->ops = &b53_switch_ops; ds->phylink_mac_ops = &b53_phylink_mac_ops; dev->vlan_enabled = true; + dev->vlan_filtering = false; /* Let DSA handle the case were multiple bridges span the same switch * device and different VLAN awareness settings are requested, which * would be breaking filtering semantics for any of the other bridge diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 0166c37a13a7f..4636e27fd1ee9 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -96,6 +96,7 @@ struct b53_pcs { struct b53_port { u16 vlan_ctl_mask; + u16 pvid; struct ethtool_keee eee; }; @@ -147,6 +148,7 @@ struct b53_device { unsigned int num_vlans; struct b53_vlan *vlans; bool vlan_enabled; + bool vlan_filtering; unsigned int num_ports; struct b53_port *ports; From 3af5c1cd98f8b6a22b79852adfbc30819137e61c Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:09 +0200 Subject: [PATCH 673/770] net: dsa: b53: fix learning on VLAN unaware bridges When VLAN filtering is off, we configure the switch to forward, but not learn on VLAN table misses. This effectively disables learning while not filtering. Fix this by switching to forward and learn. Setting the learning disable register will still control whether learning actually happens. Fixes: dad8d7c6452b ("net: dsa: b53: Properly account for VLAN filtering") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 0a7749b416f77..a2c0b44fc6bef 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -383,7 +383,7 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; vc5 |= VC5_DROP_VTABLE_MISS; } else { - vc4 |= VC4_ING_VID_VIO_FWD << VC4_ING_VID_CHECK_S; + vc4 |= VC4_NO_ING_VID_CHK << VC4_ING_VID_CHECK_S; vc5 &= ~VC5_DROP_VTABLE_MISS; } From d1d7aef5fda8d9d22c798220acb1193d805af50c Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:10 +0200 Subject: [PATCH 674/770] net: dsa: b53: do not set learning and unicast/multicast on up When a port gets set up, b53 disables learning and enables the port for flooding. This can undo any bridge configuration on the port. E.g. the following flow would disable learning on a port: $ ip link add br0 type bridge $ ip link set sw1p1 master br0 <- enables learning for sw1p1 $ ip link set br0 up $ ip link set sw1p1 up <- disables learning again Fix this by populating dsa_switch_ops::port_setup(), and set up initial config there. 
Fixes: f9b3827ee66c ("net: dsa: b53: Support setting learning on port") Signed-off-by: Jonas Gorski Signed-off-by: NipaLocal --- drivers/net/dsa/b53/b53_common.c | 21 +++++++++++++-------- drivers/net/dsa/b53/b53_priv.h | 1 + drivers/net/dsa/bcm_sf2.c | 1 + 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index a2c0b44fc6bef..9eb39cfa5fb27 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -578,6 +578,18 @@ static void b53_eee_enable_set(struct dsa_switch *ds, int port, bool enable) b53_write16(dev, B53_EEE_PAGE, B53_EEE_EN_CTRL, reg); } +int b53_setup_port(struct dsa_switch *ds, int port) +{ + struct b53_device *dev = ds->priv; + + b53_port_set_ucast_flood(dev, port, true); + b53_port_set_mcast_flood(dev, port, true); + b53_port_set_learning(dev, port, false); + + return 0; +} +EXPORT_SYMBOL(b53_setup_port); + int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) { struct b53_device *dev = ds->priv; @@ -590,10 +602,6 @@ int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy) cpu_port = dsa_to_port(ds, port)->cpu_dp->index; - b53_port_set_ucast_flood(dev, port, true); - b53_port_set_mcast_flood(dev, port, true); - b53_port_set_learning(dev, port, false); - if (dev->ops->irq_enable) ret = dev->ops->irq_enable(dev, port); if (ret) @@ -724,10 +732,6 @@ static void b53_enable_cpu_port(struct b53_device *dev, int port) b53_write8(dev, B53_CTRL_PAGE, B53_PORT_CTRL(port), port_ctrl); b53_brcm_hdr_setup(dev->ds, port); - - b53_port_set_ucast_flood(dev, port, true); - b53_port_set_mcast_flood(dev, port, true); - b53_port_set_learning(dev, port, false); } static void b53_enable_mib(struct b53_device *dev) @@ -2387,6 +2391,7 @@ static const struct dsa_switch_ops b53_switch_ops = { .phy_read = b53_phy_read16, .phy_write = b53_phy_write16, .phylink_get_caps = b53_phylink_get_caps, + .port_setup = b53_setup_port, .port_enable = b53_enable_port, .port_disable = b53_disable_port, .support_eee = b53_support_eee, diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 4636e27fd1ee9..2cf3e6a81e378 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -384,6 +384,7 @@ enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port, enum dsa_tag_protocol mprot); void b53_mirror_del(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); +int b53_setup_port(struct dsa_switch *ds, int port); int b53_enable_port(struct dsa_switch *ds, int port, struct phy_device *phy); void b53_disable_port(struct dsa_switch *ds, int port); void b53_brcm_hdr_setup(struct dsa_switch *ds, int port); diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index fa2bf3fa90191..454a8c7fd7eea 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1230,6 +1230,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = { .resume = bcm_sf2_sw_resume, .get_wol = bcm_sf2_sw_get_wol, .set_wol = bcm_sf2_sw_set_wol, + .port_setup = b53_setup_port, .port_enable = bcm_sf2_port_setup, .port_disable = bcm_sf2_port_disable, .support_eee = b53_support_eee, From 35665a3f869655dba1da74d7fe492ed89904ca99 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 30 Apr 2025 08:57:22 +0000 Subject: [PATCH 675/770] net: Introduce direct data placement tcp offload This commit introduces direct data placement (DDP) offload for TCP. 
The motivation is saving compute resources/cycles that are spent
copying data from SKBs to the block layer buffers and on CRC
calculation/verification for received PDUs (Protocol Data Units).

The DDP capability is accompanied by new net_device operations that
configure hardware contexts. There is a context per socket, and a
context per DDP operation. Additionally, a resynchronization routine is
used to assist the hardware in handling TCP OOO and continuing the
offload. Furthermore, we let the offloading driver advertise its max hw
sectors/segments.

The interface includes the following net-device ddp operations:

1. sk_add - add offload for the queue represented by socket+config pair
2. sk_del - remove the offload for the socket/queue
3. ddp_setup - request copy offload for buffers associated with an IO
4. ddp_teardown - release offload resources for that IO
5. limits - query NIC driver for quirks and limitations (e.g. max number
   of scatter gather entries per IO)
6. set_caps - request ULP DDP capabilities enablement
7. get_caps - request current ULP DDP capabilities
8. get_stats - query NIC driver for ULP DDP stats

Using this interface, the NIC hardware will scatter TCP payload
directly to the BIO pages according to the command_id. To maintain the
correctness of the network stack, the driver is expected to construct
SKBs that point to the BIO pages.

The SKB passed to the network stack from the driver represents data as
it is on the wire, while it is pointing directly to data in destination
buffers. As a result, data from page frags should not be copied out to
the linear part. To avoid needless copies, such as when using
skb_condense, we mark the skb->no_condense bit. In addition, the
skb->ulp_crc will be used by the upper layers to determine if CRC
re-calculation is required. The two separate skb indications are needed
to avoid false-positive GRO flushing events.

Follow-up patches will use this interface for DDP in NVMe-TCP.

Capability bits stored in net_device allow drivers to report which ULP
DDP capabilities a device supports. Control over these capabilities
will be exposed to userspace in later patches.

Signed-off-by: Boris Pismenny
Signed-off-by: Ben Ben-Ishay
Signed-off-by: Or Gerlitz
Signed-off-by: Yoray Zack
Signed-off-by: Shai Malin
Signed-off-by: Aurelien Aptel
Signed-off-by: NipaLocal
---
 include/linux/netdevice.h          |   5 +
 include/linux/skbuff.h             |  50 +++++
 include/net/inet_connection_sock.h |   6 +
 include/net/tcp.h                  |   3 +-
 include/net/ulp_ddp.h              | 326 +++++++++++++++++++++++
 net/Kconfig                        |  20 ++
 net/core/Makefile                  |   1 +
 net/core/skbuff.c                  |   4 +-
 net/core/ulp_ddp.c                 |  54 +++++
 net/ipv4/tcp_input.c               |   2 +
 net/ipv4/tcp_offload.c             |   1 +
 11 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 include/net/ulp_ddp.h
 create mode 100644 net/core/ulp_ddp.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a661820a26c44..89efeb5070d0e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1397,6 +1397,8 @@ struct netdev_net_notifier {
  *	struct kernel_hwtstamp_config *kernel_config,
  *	struct netlink_ext_ack *extack);
  *	Change the hardware timestamping parameters for NIC device.
+ * struct ulp_ddp_dev_ops *ulp_ddp_ops; + * ULP DDP operations (see include/net/ulp_ddp.h) */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1652,6 +1654,9 @@ struct net_device_ops { */ const struct net_shaper_ops *net_shaper_ops; #endif +#if IS_ENABLED(CONFIG_ULP_DDP) + const struct ulp_ddp_dev_ops *ulp_ddp_ops; +#endif }; /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5d62e8e77546c..58ee4554bf088 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -851,6 +851,8 @@ enum skb_tstamp_type { * @slow_gro: state present at GRO time, slower prepare step required * @tstamp_type: When set, skb->tstamp has the * delivery_time clock base of skb->tstamp. + * @no_condense: When set, don't condense fragments (DDP offloaded) + * @ulp_crc: CRC offloaded * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation. @@ -1028,6 +1030,10 @@ struct sk_buff { __u8 csum_not_inet:1; #endif __u8 unreadable:1; +#ifdef CONFIG_ULP_DDP + __u8 no_condense:1; + __u8 ulp_crc:1; +#endif #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif @@ -5268,5 +5274,49 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter, ssize_t maxsize, gfp_t gfp); +static inline bool skb_is_no_condense(const struct sk_buff *skb) +{ +#ifdef CONFIG_ULP_DDP + return skb->no_condense; +#else + return 0; +#endif +} + +static inline bool skb_is_ulp_crc(const struct sk_buff *skb) +{ +#ifdef CONFIG_ULP_DDP + return skb->ulp_crc; +#else + return 0; +#endif +} + +static inline bool skb_cmp_ulp_crc(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ +#ifdef CONFIG_ULP_DDP + return skb1->ulp_crc != skb2->ulp_crc; +#else + return 0; +#endif +} + +static inline void skb_copy_no_condense(struct sk_buff *to, + const struct sk_buff *from) +{ +#ifdef CONFIG_ULP_DDP + to->no_condense = from->no_condense; +#endif +} + +static inline void skb_copy_ulp_crc(struct sk_buff *to, + const struct sk_buff *from) +{ +#ifdef CONFIG_ULP_DDP + to->ulp_crc = from->ulp_crc; +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 1735db332aab5..65cca0d4d6c2e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -63,6 +63,8 @@ struct inet_connection_sock_af_ops { * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook + * @icsk_ulp_ddp_data ULP direct data placement private data * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -92,6 +94,10 @@ struct inet_connection_sock { const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; +#ifdef CONFIG_ULP_DDP + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops; + void __rcu *icsk_ulp_ddp_data; +#endif unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, diff --git a/include/net/tcp.h b/include/net/tcp.h index 5078ad868feef..d0b0e73ad14b6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1148,7 +1148,8 @@ static inline bool tcp_skb_can_collapse_rx(const 
struct sk_buff *to, const struct sk_buff *from) { return likely(mptcp_skb_can_collapse(to, from) && - !skb_cmp_decrypted(to, from)); + !skb_cmp_decrypted(to, from) && + !skb_cmp_ulp_crc(to, from)); } /* Events passed to congestion control interface */ diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h new file mode 100644 index 0000000000000..7b32bb9e2a08c --- /dev/null +++ b/include/net/ulp_ddp.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * ulp_ddp.h + * Author: Boris Pismenny + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#ifndef _ULP_DDP_H +#define _ULP_DDP_H + +#include +#include +#include + +enum ulp_ddp_type { + ULP_DDP_NVME = 1, +}; + +/** + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations + * + * @full_ccid_range: true if the driver supports the full CID range + */ +struct nvme_tcp_ddp_limits { + bool full_ccid_range; +}; + +/** + * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp + * protocol limits. + * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.). + * + * @type: type of this limits struct + * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit) + * @io_threshold: minimum payload size required to offload + * @tls: support for ULP over TLS + * @nvmeotcp: NVMe-TCP specific limits + */ +struct ulp_ddp_limits { + enum ulp_ddp_type type; + int max_ddp_sgl_len; + int io_threshold; + bool tls:1; + union { + struct nvme_tcp_ddp_limits nvmeotcp; + }; +}; + +/** + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue + * + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0) + * @cpda: controller pdu data alignment (dwords, 0's based) + * @dgst: digest types enabled (header or data, see + * enum nvme_tcp_digest_option). + * The netdev will offload crc if it is supported. + * @queue_size: number of nvme-tcp IO queue elements + */ +struct nvme_tcp_ddp_config { + u16 pfv; + u8 cpda; + u8 dgst; + int queue_size; +}; + +/** + * struct ulp_ddp_config - Generic ulp ddp configuration + * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.). + * + * @type: type of this config struct + * @nvmeotcp: NVMe-TCP specific config + * @affinity_hint: cpu core running the IO thread for this socket + */ +struct ulp_ddp_config { + enum ulp_ddp_type type; + int affinity_hint; + union { + struct nvme_tcp_ddp_config nvmeotcp; + }; +}; + +/** + * struct ulp_ddp_io - ulp ddp configuration for an IO request. + * + * @command_id: identifier on the wire associated with these buffers + * @nents: number of entries in the sg_table + * @sg_table: describing the buffers for this IO request + * @first_sgl: first SGL in sg_table + */ +struct ulp_ddp_io { + u32 command_id; + int nents; + struct sg_table sg_table; + struct scatterlist first_sgl[SG_CHUNK_SIZE]; +}; + +/** + * struct ulp_ddp_stats - ULP DDP offload statistics + * @rx_nvmeotcp_sk_add: number of sockets successfully prepared for offloading. + * @rx_nvmeotcp_sk_add_fail: number of sockets that failed to be prepared + * for offloading. + * @rx_nvmeotcp_sk_del: number of sockets where offloading has been removed. + * @rx_nvmeotcp_ddp_setup: number of NVMeTCP PDU successfully prepared for + * Direct Data Placement. + * @rx_nvmeotcp_ddp_setup_fail: number of PDUs that failed DDP preparation. + * @rx_nvmeotcp_ddp_teardown: number of PDUs done with DDP. + * @rx_nvmeotcp_drop: number of PDUs dropped. + * @rx_nvmeotcp_resync: number of resync. + * @rx_nvmeotcp_packets: number of offloaded PDUs. 
+ * @rx_nvmeotcp_bytes: number of offloaded bytes. + */ +struct ulp_ddp_stats { + u64 rx_nvmeotcp_sk_add; + u64 rx_nvmeotcp_sk_add_fail; + u64 rx_nvmeotcp_sk_del; + u64 rx_nvmeotcp_ddp_setup; + u64 rx_nvmeotcp_ddp_setup_fail; + u64 rx_nvmeotcp_ddp_teardown; + u64 rx_nvmeotcp_drop; + u64 rx_nvmeotcp_resync; + u64 rx_nvmeotcp_packets; + u64 rx_nvmeotcp_bytes; + + /* + * add new stats at the end and keep in sync with + * Documentation/netlink/specs/ulp_ddp.yaml + */ +}; + +#define ULP_DDP_CAP_COUNT 1 + +struct ulp_ddp_dev_caps { + DECLARE_BITMAP(active, ULP_DDP_CAP_COUNT); + DECLARE_BITMAP(hw, ULP_DDP_CAP_COUNT); +}; + +struct netlink_ext_ack; + +/** + * struct ulp_ddp_dev_ops - operations used by an upper layer protocol + * to configure ddp offload + * + * @limits: query ulp driver limitations and quirks. + * @sk_add: add offload for the queue represented by socket+config + * pair. this function is used to configure either copy, crc + * or both offloads. + * @sk_del: remove offload from the socket, and release any device + * related resources. + * @setup: request copy offload for buffers associated with a + * command_id in ulp_ddp_io. + * @teardown: release offload resources association between buffers + * and command_id in ulp_ddp_io. + * @resync: respond to the driver's resync_request. Called only if + * resync is successful. + * @set_caps: set device ULP DDP capabilities. + * returns a negative error code or zero. + * @get_caps: get device ULP DDP capabilities. + * @get_stats: query ULP DDP statistics. + */ +struct ulp_ddp_dev_ops { + int (*limits)(struct net_device *netdev, + struct ulp_ddp_limits *limits); + int (*sk_add)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_config *config); + void (*sk_del)(struct net_device *netdev, + struct sock *sk); + int (*setup)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io); + void (*teardown)(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io, + void *ddp_ctx); + void (*resync)(struct net_device *netdev, + struct sock *sk, u32 seq); + int (*set_caps)(struct net_device *dev, unsigned long *bits, + struct netlink_ext_ack *extack); + void (*get_caps)(struct net_device *dev, + struct ulp_ddp_dev_caps *caps); + int (*get_stats)(struct net_device *dev, + struct ulp_ddp_stats *stats); +}; + +#define ULP_DDP_RESYNC_PENDING BIT(0) + +/** + * struct ulp_ddp_ulp_ops - Interface to register upper layer + * Direct Data Placement (DDP) TCP offload. + * @resync_request: NIC requests ulp to indicate if @seq is the start + * of a message. + * @ddp_teardown_done: NIC driver informs the ulp that teardown is done, + * used for async completions. 
+ */ +struct ulp_ddp_ulp_ops { + bool (*resync_request)(struct sock *sk, u32 seq, u32 flags); + void (*ddp_teardown_done)(void *ddp_ctx); +}; + +/** + * struct ulp_ddp_ctx - Generic ulp ddp context + * + * @type: type of this context struct + * @buf: protocol-specific context struct + */ +struct ulp_ddp_ctx { + enum ulp_ddp_type type; + unsigned char buf[]; +}; + +static inline struct ulp_ddp_ctx *ulp_ddp_get_ctx(struct sock *sk) +{ +#ifdef CONFIG_ULP_DDP + struct inet_connection_sock *icsk = inet_csk(sk); + + return (__force struct ulp_ddp_ctx *)icsk->icsk_ulp_ddp_data; +#else + return NULL; +#endif +} + +static inline void ulp_ddp_set_ctx(struct sock *sk, void *ctx) +{ +#ifdef CONFIG_ULP_DDP + struct inet_connection_sock *icsk = inet_csk(sk); + + rcu_assign_pointer(icsk->icsk_ulp_ddp_data, ctx); +#endif +} + +static inline int ulp_ddp_setup(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io) +{ +#ifdef CONFIG_ULP_DDP + return netdev->netdev_ops->ulp_ddp_ops->setup(netdev, sk, io); +#else + return -EOPNOTSUPP; +#endif +} + +static inline void ulp_ddp_teardown(struct net_device *netdev, + struct sock *sk, + struct ulp_ddp_io *io, + void *ddp_ctx) +{ +#ifdef CONFIG_ULP_DDP + netdev->netdev_ops->ulp_ddp_ops->teardown(netdev, sk, io, ddp_ctx); +#endif +} + +static inline void ulp_ddp_resync(struct net_device *netdev, + struct sock *sk, + u32 seq) +{ +#ifdef CONFIG_ULP_DDP + netdev->netdev_ops->ulp_ddp_ops->resync(netdev, sk, seq); +#endif +} + +static inline int ulp_ddp_get_limits(struct net_device *netdev, + struct ulp_ddp_limits *limits, + enum ulp_ddp_type type) +{ +#ifdef CONFIG_ULP_DDP + limits->type = type; + return netdev->netdev_ops->ulp_ddp_ops->limits(netdev, limits); +#else + return -EOPNOTSUPP; +#endif +} + +static inline bool ulp_ddp_cap_turned_on(unsigned long *old, + unsigned long *new, + int bit_nr) +{ + return !test_bit(bit_nr, old) && test_bit(bit_nr, new); +} + +static inline bool ulp_ddp_cap_turned_off(unsigned long *old, + unsigned long *new, + int bit_nr) +{ + return test_bit(bit_nr, old) && !test_bit(bit_nr, new); +} + +#ifdef CONFIG_ULP_DDP + +int ulp_ddp_sk_add(struct net_device *netdev, + netdevice_tracker *tracker, + gfp_t gfp, + struct sock *sk, + struct ulp_ddp_config *config, + const struct ulp_ddp_ulp_ops *ops); + +void ulp_ddp_sk_del(struct net_device *netdev, + netdevice_tracker *tracker, + struct sock *sk); + +bool ulp_ddp_is_cap_active(struct net_device *netdev, int cap_bit_nr); + +#else + +static inline int ulp_ddp_sk_add(struct net_device *netdev, + netdevice_tracker *tracker, + gfp_t gfp, + struct sock *sk, + struct ulp_ddp_config *config, + const struct ulp_ddp_ulp_ops *ops) +{ + return -EOPNOTSUPP; +} + +static inline void ulp_ddp_sk_del(struct net_device *netdev, + netdevice_tracker *tracker, + struct sock *sk) +{} + +static inline bool ulp_ddp_is_cap_active(struct net_device *netdev, + int cap_bit_nr) +{ + return false; +} + +#endif + +#endif /* _ULP_DDP_H */ diff --git a/net/Kconfig b/net/Kconfig index 202cc595e5e6f..5059db6d41cf7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -537,4 +537,24 @@ config NET_TEST If unsure, say N. +config ULP_DDP + bool "ULP direct data placement offload" + help + This feature provides a generic infrastructure for Direct + Data Placement (DDP) offload for Upper Layer Protocols (ULP, + such as NVMe-TCP). + + If the ULP and NIC driver supports it, the ULP code can + request the NIC to place ULP response data directly + into application memory, avoiding a costly copy. 
+ + This infrastructure also allows for offloading the ULP data + integrity checks (e.g. data digest) that would otherwise + require another costly pass on the data we managed to avoid + copying. + + For more information, see + . + + endif # if NET diff --git a/net/core/Makefile b/net/core/Makefile index b2a76ce33932c..6d817870d7c3a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o +obj-$(CONFIG_ULP_DDP) += ulp_ddp.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4159107f1666c..f47d7eab1a284 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -6915,7 +6916,8 @@ void skb_condense(struct sk_buff *skb) { if (skb->data_len) { if (skb->data_len > skb->end - skb->tail || - skb_cloned(skb) || !skb_frags_readable(skb)) + skb_cloned(skb) || !skb_frags_readable(skb) || + skb_is_no_condense(skb)) return; /* Nice, we can free page frag(s) right now */ diff --git a/net/core/ulp_ddp.c b/net/core/ulp_ddp.c new file mode 100644 index 0000000000000..f8ebd729e1198 --- /dev/null +++ b/net/core/ulp_ddp.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * ulp_ddp.c + * Author: Aurelien Aptel + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ + +#include + +int ulp_ddp_sk_add(struct net_device *netdev, + netdevice_tracker *tracker, + gfp_t gfp, + struct sock *sk, + struct ulp_ddp_config *config, + const struct ulp_ddp_ulp_ops *ops) +{ + int ret; + + /* put in ulp_ddp_sk_del() */ + netdev_hold(netdev, tracker, gfp); + + ret = netdev->netdev_ops->ulp_ddp_ops->sk_add(netdev, sk, config); + if (ret) { + dev_put(netdev); + return ret; + } + + inet_csk(sk)->icsk_ulp_ddp_ops = ops; + + return 0; +} +EXPORT_SYMBOL_GPL(ulp_ddp_sk_add); + +void ulp_ddp_sk_del(struct net_device *netdev, + netdevice_tracker *tracker, + struct sock *sk) +{ + netdev->netdev_ops->ulp_ddp_ops->sk_del(netdev, sk); + inet_csk(sk)->icsk_ulp_ddp_ops = NULL; + netdev_put(netdev, tracker); +} +EXPORT_SYMBOL_GPL(ulp_ddp_sk_del); + +bool ulp_ddp_is_cap_active(struct net_device *netdev, int cap_bit_nr) +{ + struct ulp_ddp_dev_caps caps; + + if (!netdev->netdev_ops->ulp_ddp_ops) + return false; + netdev->netdev_ops->ulp_ddp_ops->get_caps(netdev, &caps); + return test_bit(cap_bit_nr, caps.active); +} +EXPORT_SYMBOL_GPL(ulp_ddp_is_cap_active); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a35018e2d0ba2..ba4f40aab8263 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5499,6 +5499,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); skb_copy_decrypted(nskb, skb); + skb_copy_no_condense(nskb, skb); + skb_copy_ulp_crc(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; if (list) __skb_queue_before(list, skb, nskb); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index d293087b426df..77ae71ea25f6f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -353,6 +353,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); flush |= skb_cmp_decrypted(p, skb); + flush |= skb_cmp_ulp_crc(p, skb); if 
(unlikely(NAPI_GRO_CB(p)->is_flist)) { flush |= (__force int)(flags ^ tcp_flag_word(th2)); From 3b99d5dd8a7879dd445bb4b50a9dbb85e821b26d Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 30 Apr 2025 08:57:23 +0000 Subject: [PATCH 676/770] netlink: add new family to manage ULP_DDP enablement and stats Add a new netlink family to get/set ULP DDP capabilities on a network device and to retrieve statistics. The messages use the genetlink infrastructure and are specified in a YAML file which was used to generate some of the files in this commit: ./tools/net/ynl/ynl-gen-c.py --mode kernel \ --spec ./Documentation/netlink/specs/ulp_ddp.yaml --header \ -o net/core/ulp_ddp_gen_nl.h ./tools/net/ynl/ynl-gen-c.py --mode kernel \ --spec ./Documentation/netlink/specs/ulp_ddp.yaml --source \ -o net/core/ulp_ddp_gen_nl.c ./tools/net/ynl/ynl-gen-c.py --mode uapi \ --spec ./Documentation/netlink/specs/ulp_ddp.yaml --header \ > include/uapi/linux/ulp_ddp.h Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Jiri Pirko Signed-off-by: NipaLocal --- Documentation/netlink/specs/ulp_ddp.yaml | 172 +++++++++++ include/net/ulp_ddp.h | 3 +- include/uapi/linux/ulp_ddp.h | 61 ++++ net/core/Makefile | 2 +- net/core/ulp_ddp_gen_nl.c | 75 +++++ net/core/ulp_ddp_gen_nl.h | 30 ++ net/core/ulp_ddp_nl.c | 346 +++++++++++++++++++++++ 7 files changed, 687 insertions(+), 2 deletions(-) create mode 100644 Documentation/netlink/specs/ulp_ddp.yaml create mode 100644 include/uapi/linux/ulp_ddp.h create mode 100644 net/core/ulp_ddp_gen_nl.c create mode 100644 net/core/ulp_ddp_gen_nl.h create mode 100644 net/core/ulp_ddp_nl.c diff --git a/Documentation/netlink/specs/ulp_ddp.yaml b/Documentation/netlink/specs/ulp_ddp.yaml new file mode 100644 index 0000000000000..27a0b905ec286 --- /dev/null +++ b/Documentation/netlink/specs/ulp_ddp.yaml @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Author: Aurelien Aptel +# +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# + +name: ulp_ddp + +protocol: genetlink + +doc: Netlink protocol to manage ULP DPP on network devices. + +definitions: + - + type: enum + name: cap + render-max: true + entries: + - nvme-tcp + - nvme-tcp-ddgst-rx + +attribute-sets: + - + name: stats + attributes: + - + name: ifindex + doc: Interface index of the net device. + type: u32 + - + name: rx-nvme-tcp-sk-add + doc: Sockets successfully configured for NVMeTCP offloading. + type: uint + - + name: rx-nvme-tcp-sk-add-fail + doc: Sockets failed to be configured for NVMeTCP offloading. + type: uint + - + name: rx-nvme-tcp-sk-del + doc: Sockets with NVMeTCP offloading configuration removed. + type: uint + - + name: rx-nvme-tcp-setup + doc: NVMe-TCP IOs successfully configured for Rx Direct Data Placement. + type: uint + - + name: rx-nvme-tcp-setup-fail + doc: NVMe-TCP IOs failed to be configured for Rx Direct Data Placement. + type: uint + - + name: rx-nvme-tcp-teardown + doc: NVMe-TCP IOs with Rx Direct Data Placement configuration removed. + type: uint + - + name: rx-nvme-tcp-drop + doc: Packets failed the NVMeTCP offload validation. + type: uint + - + name: rx-nvme-tcp-resync + doc: > + NVMe-TCP resync operations were processed due to Rx TCP packets + re-ordering. + type: uint + - + name: rx-nvme-tcp-packets + doc: TCP packets successfully processed by the NVMeTCP offload. + type: uint + - + name: rx-nvme-tcp-bytes + doc: Bytes were successfully processed by the NVMeTCP offload. 
+ type: uint + - + name: caps + attributes: + - + name: ifindex + doc: Interface index of the net device. + type: u32 + - + name: hw + doc: Bitmask of the capabilities supported by the device. + type: uint + enum: cap + enum-as-flags: true + - + name: active + doc: Bitmask of the capabilities currently enabled on the device. + type: uint + enum: cap + enum-as-flags: true + - + name: wanted + doc: > + New active bit values of the capabilities we want to set on the + device. + type: uint + enum: cap + enum-as-flags: true + - + name: wanted_mask + doc: Bitmask of the meaningful bits in the wanted field. + type: uint + enum: cap + enum-as-flags: true + +operations: + list: + - + name: caps-get + doc: Get ULP DDP capabilities. + attribute-set: caps + do: + request: + attributes: + - ifindex + reply: + attributes: + - ifindex + - hw + - active + pre: ulp_ddp_get_netdev + post: ulp_ddp_put_netdev + - + name: stats-get + doc: Get ULP DDP stats. + attribute-set: stats + do: + request: + attributes: + - ifindex + reply: + attributes: + - ifindex + - rx-nvme-tcp-sk-add + - rx-nvme-tcp-sk-add-fail + - rx-nvme-tcp-sk-del + - rx-nvme-tcp-setup + - rx-nvme-tcp-setup-fail + - rx-nvme-tcp-teardown + - rx-nvme-tcp-drop + - rx-nvme-tcp-resync + - rx-nvme-tcp-packets + - rx-nvme-tcp-bytes + pre: ulp_ddp_get_netdev + post: ulp_ddp_put_netdev + - + name: caps-set + doc: Set ULP DDP capabilities. + attribute-set: caps + do: + request: + attributes: + - ifindex + - wanted + - wanted_mask + reply: + attributes: + - ifindex + - hw + - active + pre: ulp_ddp_get_netdev + post: ulp_ddp_put_netdev + - + name: caps-set-ntf + doc: Notification for change in ULP DDP capabilities. + notify: caps-get + +mcast-groups: + list: + - + name: mgmt diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h index 7b32bb9e2a08c..2e54d19c22f6a 100644 --- a/include/net/ulp_ddp.h +++ b/include/net/ulp_ddp.h @@ -10,6 +10,7 @@ #include #include #include +#include enum ulp_ddp_type { ULP_DDP_NVME = 1, @@ -126,7 +127,7 @@ struct ulp_ddp_stats { */ }; -#define ULP_DDP_CAP_COUNT 1 +#define ULP_DDP_CAP_COUNT (ULP_DDP_CAP_MAX + 1) struct ulp_ddp_dev_caps { DECLARE_BITMAP(active, ULP_DDP_CAP_COUNT); diff --git a/include/uapi/linux/ulp_ddp.h b/include/uapi/linux/ulp_ddp.h new file mode 100644 index 0000000000000..dbf6399d3aef7 --- /dev/null +++ b/include/uapi/linux/ulp_ddp.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ulp_ddp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ULP_DDP_H +#define _UAPI_LINUX_ULP_DDP_H + +#define ULP_DDP_FAMILY_NAME "ulp_ddp" +#define ULP_DDP_FAMILY_VERSION 1 + +enum ulp_ddp_cap { + ULP_DDP_CAP_NVME_TCP, + ULP_DDP_CAP_NVME_TCP_DDGST_RX, + + /* private: */ + __ULP_DDP_CAP_MAX, + ULP_DDP_CAP_MAX = (__ULP_DDP_CAP_MAX - 1) +}; + +enum { + ULP_DDP_A_STATS_IFINDEX = 1, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_ADD, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_ADD_FAIL, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_DEL, + ULP_DDP_A_STATS_RX_NVME_TCP_SETUP, + ULP_DDP_A_STATS_RX_NVME_TCP_SETUP_FAIL, + ULP_DDP_A_STATS_RX_NVME_TCP_TEARDOWN, + ULP_DDP_A_STATS_RX_NVME_TCP_DROP, + ULP_DDP_A_STATS_RX_NVME_TCP_RESYNC, + ULP_DDP_A_STATS_RX_NVME_TCP_PACKETS, + ULP_DDP_A_STATS_RX_NVME_TCP_BYTES, + + __ULP_DDP_A_STATS_MAX, + ULP_DDP_A_STATS_MAX = (__ULP_DDP_A_STATS_MAX - 1) +}; + +enum { + ULP_DDP_A_CAPS_IFINDEX = 1, + ULP_DDP_A_CAPS_HW, + ULP_DDP_A_CAPS_ACTIVE, + ULP_DDP_A_CAPS_WANTED, + ULP_DDP_A_CAPS_WANTED_MASK, + + 
__ULP_DDP_A_CAPS_MAX, + ULP_DDP_A_CAPS_MAX = (__ULP_DDP_A_CAPS_MAX - 1) +}; + +enum { + ULP_DDP_CMD_CAPS_GET = 1, + ULP_DDP_CMD_STATS_GET, + ULP_DDP_CMD_CAPS_SET, + ULP_DDP_CMD_CAPS_SET_NTF, + + __ULP_DDP_CMD_MAX, + ULP_DDP_CMD_MAX = (__ULP_DDP_CMD_MAX - 1) +}; + +#define ULP_DDP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_ULP_DDP_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 6d817870d7c3a..128133dd2a4dc 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,7 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o -obj-$(CONFIG_ULP_DDP) += ulp_ddp.o +obj-$(CONFIG_ULP_DDP) += ulp_ddp.o ulp_ddp_nl.o ulp_ddp_gen_nl.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/ulp_ddp_gen_nl.c b/net/core/ulp_ddp_gen_nl.c new file mode 100644 index 0000000000000..5675193ad8ca7 --- /dev/null +++ b/net/core/ulp_ddp_gen_nl.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ulp_ddp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "ulp_ddp_gen_nl.h" + +#include + +/* ULP_DDP_CMD_CAPS_GET - do */ +static const struct nla_policy ulp_ddp_caps_get_nl_policy[ULP_DDP_A_CAPS_IFINDEX + 1] = { + [ULP_DDP_A_CAPS_IFINDEX] = { .type = NLA_U32, }, +}; + +/* ULP_DDP_CMD_STATS_GET - do */ +static const struct nla_policy ulp_ddp_stats_get_nl_policy[ULP_DDP_A_STATS_IFINDEX + 1] = { + [ULP_DDP_A_STATS_IFINDEX] = { .type = NLA_U32, }, +}; + +/* ULP_DDP_CMD_CAPS_SET - do */ +static const struct nla_policy ulp_ddp_caps_set_nl_policy[ULP_DDP_A_CAPS_WANTED_MASK + 1] = { + [ULP_DDP_A_CAPS_IFINDEX] = { .type = NLA_U32, }, + [ULP_DDP_A_CAPS_WANTED] = NLA_POLICY_MASK(NLA_UINT, 0x3), + [ULP_DDP_A_CAPS_WANTED_MASK] = NLA_POLICY_MASK(NLA_UINT, 0x3), +}; + +/* Ops table for ulp_ddp */ +static const struct genl_split_ops ulp_ddp_nl_ops[] = { + { + .cmd = ULP_DDP_CMD_CAPS_GET, + .pre_doit = ulp_ddp_get_netdev, + .doit = ulp_ddp_nl_caps_get_doit, + .post_doit = ulp_ddp_put_netdev, + .policy = ulp_ddp_caps_get_nl_policy, + .maxattr = ULP_DDP_A_CAPS_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = ULP_DDP_CMD_STATS_GET, + .pre_doit = ulp_ddp_get_netdev, + .doit = ulp_ddp_nl_stats_get_doit, + .post_doit = ulp_ddp_put_netdev, + .policy = ulp_ddp_stats_get_nl_policy, + .maxattr = ULP_DDP_A_STATS_IFINDEX, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = ULP_DDP_CMD_CAPS_SET, + .pre_doit = ulp_ddp_get_netdev, + .doit = ulp_ddp_nl_caps_set_doit, + .post_doit = ulp_ddp_put_netdev, + .policy = ulp_ddp_caps_set_nl_policy, + .maxattr = ULP_DDP_A_CAPS_WANTED_MASK, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group ulp_ddp_nl_mcgrps[] = { + [ULP_DDP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family ulp_ddp_nl_family __ro_after_init = { + .name = ULP_DDP_FAMILY_NAME, + .version = ULP_DDP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = ulp_ddp_nl_ops, + .n_split_ops = ARRAY_SIZE(ulp_ddp_nl_ops), + .mcgrps = ulp_ddp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(ulp_ddp_nl_mcgrps), +}; diff --git a/net/core/ulp_ddp_gen_nl.h b/net/core/ulp_ddp_gen_nl.h new file mode 100644 index 0000000000000..368433cfa8679 --- /dev/null +++ b/net/core/ulp_ddp_gen_nl.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) 
OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/ulp_ddp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_ULP_DDP_GEN_H +#define _LINUX_ULP_DDP_GEN_H + +#include +#include + +#include + +int ulp_ddp_get_netdev(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); +void +ulp_ddp_put_netdev(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int ulp_ddp_nl_caps_get_doit(struct sk_buff *skb, struct genl_info *info); +int ulp_ddp_nl_stats_get_doit(struct sk_buff *skb, struct genl_info *info); +int ulp_ddp_nl_caps_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + ULP_DDP_NLGRP_MGMT, +}; + +extern struct genl_family ulp_ddp_nl_family; + +#endif /* _LINUX_ULP_DDP_GEN_H */ diff --git a/net/core/ulp_ddp_nl.c b/net/core/ulp_ddp_nl.c new file mode 100644 index 0000000000000..ff598c9559701 --- /dev/null +++ b/net/core/ulp_ddp_nl.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ulp_ddp_nl.c + * Author: Aurelien Aptel + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + */ +#include +#include "ulp_ddp_gen_nl.h" + +#define ULP_DDP_STATS_CNT (sizeof(struct ulp_ddp_stats) / sizeof(u64)) + +struct ulp_ddp_reply_context { + struct net_device *dev; + netdevice_tracker tracker; + struct ulp_ddp_dev_caps caps; + struct ulp_ddp_stats stats; +}; + +static size_t ulp_ddp_reply_size(int cmd) +{ + size_t len = 0; + + BUILD_BUG_ON(ULP_DDP_CAP_COUNT > 64); + + /* ifindex */ + len += nla_total_size(sizeof(u32)); + + switch (cmd) { + case ULP_DDP_CMD_CAPS_GET: + case ULP_DDP_CMD_CAPS_SET: + case ULP_DDP_CMD_CAPS_SET_NTF: + /* hw */ + len += nla_total_size_64bit(sizeof(u64)); + + /* active */ + len += nla_total_size_64bit(sizeof(u64)); + break; + case ULP_DDP_CMD_STATS_GET: + /* stats */ + len += nla_total_size_64bit(sizeof(u64)) * ULP_DDP_STATS_CNT; + break; + } + + return len; +} + +/* pre_doit */ +int ulp_ddp_get_netdev(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct ulp_ddp_reply_context *ctx; + u32 ifindex; + + if (GENL_REQ_ATTR_CHECK(info, ULP_DDP_A_CAPS_IFINDEX)) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ifindex = nla_get_u32(info->attrs[ULP_DDP_A_CAPS_IFINDEX]); + ctx->dev = netdev_get_by_index(genl_info_net(info), + ifindex, + &ctx->tracker, + GFP_KERNEL); + if (!ctx->dev) { + kfree(ctx); + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[ULP_DDP_A_CAPS_IFINDEX], + "Network interface does not exist"); + return -ENODEV; + } + + if (!ctx->dev->netdev_ops->ulp_ddp_ops) { + netdev_put(ctx->dev, &ctx->tracker); + kfree(ctx); + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[ULP_DDP_A_CAPS_IFINDEX], + "Network interface does not support ULP DDP"); + return -EOPNOTSUPP; + } + + info->user_ptr[0] = ctx; + return 0; +} + +/* post_doit */ +void ulp_ddp_put_netdev(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct ulp_ddp_reply_context *ctx = info->user_ptr[0]; + + netdev_put(ctx->dev, &ctx->tracker); + kfree(ctx); +} + +static int ulp_ddp_prepare_context(struct ulp_ddp_reply_context *ctx, int cmd) +{ + const struct ulp_ddp_dev_ops *ops = ctx->dev->netdev_ops->ulp_ddp_ops; + + switch (cmd) { + case ULP_DDP_CMD_CAPS_GET: + case ULP_DDP_CMD_CAPS_SET: + case ULP_DDP_CMD_CAPS_SET_NTF: + ops->get_caps(ctx->dev, &ctx->caps); + break; + case ULP_DDP_CMD_STATS_GET: + ops->get_stats(ctx->dev, &ctx->stats); + break; + 
} + + return 0; +} + +static int ulp_ddp_write_reply(struct sk_buff *rsp, + struct ulp_ddp_reply_context *ctx, + int cmd, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + switch (cmd) { + case ULP_DDP_CMD_CAPS_GET: + case ULP_DDP_CMD_CAPS_SET: + case ULP_DDP_CMD_CAPS_SET_NTF: + if (nla_put_u32(rsp, ULP_DDP_A_CAPS_IFINDEX, + ctx->dev->ifindex) || + nla_put_uint(rsp, ULP_DDP_A_CAPS_HW, ctx->caps.hw[0]) || + nla_put_uint(rsp, ULP_DDP_A_CAPS_ACTIVE, + ctx->caps.active[0])) + goto err_cancel_msg; + break; + case ULP_DDP_CMD_STATS_GET: + if (nla_put_u32(rsp, ULP_DDP_A_STATS_IFINDEX, + ctx->dev->ifindex) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_ADD, + ctx->stats.rx_nvmeotcp_sk_add) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_ADD_FAIL, + ctx->stats.rx_nvmeotcp_sk_add_fail) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_SK_DEL, + ctx->stats.rx_nvmeotcp_sk_del) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_SETUP, + ctx->stats.rx_nvmeotcp_ddp_setup) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_SETUP_FAIL, + ctx->stats.rx_nvmeotcp_ddp_setup_fail) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_TEARDOWN, + ctx->stats.rx_nvmeotcp_ddp_teardown) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_DROP, + ctx->stats.rx_nvmeotcp_drop) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_RESYNC, + ctx->stats.rx_nvmeotcp_resync) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_PACKETS, + ctx->stats.rx_nvmeotcp_packets) || + nla_put_uint(rsp, + ULP_DDP_A_STATS_RX_NVME_TCP_BYTES, + ctx->stats.rx_nvmeotcp_bytes)) + goto err_cancel_msg; + } + genlmsg_end(rsp, hdr); + + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + + return -EMSGSIZE; +} + +int ulp_ddp_nl_caps_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct ulp_ddp_reply_context *ctx = info->user_ptr[0]; + struct sk_buff *rsp; + int ret = 0; + + ret = ulp_ddp_prepare_context(ctx, ULP_DDP_CMD_CAPS_GET); + if (ret) + return ret; + + rsp = genlmsg_new(ulp_ddp_reply_size(ULP_DDP_CMD_CAPS_GET), GFP_KERNEL); + if (!rsp) + return -EMSGSIZE; + + ret = ulp_ddp_write_reply(rsp, ctx, ULP_DDP_CMD_CAPS_GET, info); + if (ret) + goto err_rsp; + + return genlmsg_reply(rsp, info); + +err_rsp: + nlmsg_free(rsp); + return ret; +} + +static void ulp_ddp_nl_notify_dev(struct ulp_ddp_reply_context *ctx) +{ + struct genl_info info; + struct sk_buff *ntf; + int ret; + + if (!genl_has_listeners(&ulp_ddp_nl_family, dev_net(ctx->dev), + ULP_DDP_NLGRP_MGMT)) + return; + + genl_info_init_ntf(&info, &ulp_ddp_nl_family, ULP_DDP_CMD_CAPS_SET_NTF); + ntf = genlmsg_new(ulp_ddp_reply_size(ULP_DDP_CMD_CAPS_SET_NTF), + GFP_KERNEL); + if (!ntf) + return; + + ret = ulp_ddp_write_reply(ntf, ctx, ULP_DDP_CMD_CAPS_SET_NTF, &info); + if (ret) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&ulp_ddp_nl_family, dev_net(ctx->dev), ntf, + 0, ULP_DDP_NLGRP_MGMT, GFP_KERNEL); +} + +static int ulp_ddp_apply_bits(struct ulp_ddp_reply_context *ctx, + unsigned long *req_wanted, + unsigned long *req_mask, + struct genl_info *info, + bool *notify) +{ + DECLARE_BITMAP(old_active, ULP_DDP_CAP_COUNT); + DECLARE_BITMAP(new_active, ULP_DDP_CAP_COUNT); + const struct ulp_ddp_dev_ops *ops; + struct ulp_ddp_dev_caps caps; + int ret; + + ops = ctx->dev->netdev_ops->ulp_ddp_ops; + ops->get_caps(ctx->dev, &caps); + + /* new_active = (old_active & ~req_mask) | (wanted & req_mask) + * new_active &= caps_hw + */ + bitmap_copy(old_active, caps.active, 
ULP_DDP_CAP_COUNT); + bitmap_and(req_wanted, req_wanted, req_mask, ULP_DDP_CAP_COUNT); + bitmap_andnot(new_active, old_active, req_mask, ULP_DDP_CAP_COUNT); + bitmap_or(new_active, new_active, req_wanted, ULP_DDP_CAP_COUNT); + bitmap_and(new_active, new_active, caps.hw, ULP_DDP_CAP_COUNT); + if (!bitmap_equal(old_active, new_active, ULP_DDP_CAP_COUNT)) { + ret = ops->set_caps(ctx->dev, new_active, info->extack); + if (ret) + return ret; + ops->get_caps(ctx->dev, &caps); + bitmap_copy(new_active, caps.active, ULP_DDP_CAP_COUNT); + } + + /* notify if capabilities were changed */ + *notify = !bitmap_equal(old_active, new_active, ULP_DDP_CAP_COUNT); + + return 0; +} + +int ulp_ddp_nl_caps_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ulp_ddp_reply_context *ctx = info->user_ptr[0]; + unsigned long wanted, wanted_mask; + struct sk_buff *rsp; + bool notify = false; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, ULP_DDP_A_CAPS_WANTED) || + GENL_REQ_ATTR_CHECK(info, ULP_DDP_A_CAPS_WANTED_MASK)) + return -EINVAL; + + rsp = genlmsg_new(ulp_ddp_reply_size(ULP_DDP_CMD_CAPS_SET), GFP_KERNEL); + if (!rsp) + return -EMSGSIZE; + + wanted = nla_get_uint(info->attrs[ULP_DDP_A_CAPS_WANTED]); + wanted_mask = nla_get_uint(info->attrs[ULP_DDP_A_CAPS_WANTED_MASK]); + + rtnl_lock(); + ret = ulp_ddp_apply_bits(ctx, &wanted, &wanted_mask, info, ¬ify); + rtnl_unlock(); + if (ret) + goto err_rsp; + + ret = ulp_ddp_prepare_context(ctx, ULP_DDP_CMD_CAPS_SET); + if (ret) + goto err_rsp; + + ret = ulp_ddp_write_reply(rsp, ctx, ULP_DDP_CMD_CAPS_SET, info); + if (ret) + goto err_rsp; + + ret = genlmsg_reply(rsp, info); + if (notify) + ulp_ddp_nl_notify_dev(ctx); + + return ret; + +err_rsp: + nlmsg_free(rsp); + + return ret; +} + +int ulp_ddp_nl_stats_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct ulp_ddp_reply_context *ctx = info->user_ptr[0]; + struct sk_buff *rsp; + int ret = 0; + + ret = ulp_ddp_prepare_context(ctx, ULP_DDP_CMD_STATS_GET); + if (ret) + return ret; + + rsp = genlmsg_new(ulp_ddp_reply_size(ULP_DDP_CMD_STATS_GET), + GFP_KERNEL); + if (!rsp) + return -EMSGSIZE; + + ret = ulp_ddp_write_reply(rsp, ctx, ULP_DDP_CMD_STATS_GET, info); + if (ret) + goto err_rsp; + + return genlmsg_reply(rsp, info); + +err_rsp: + nlmsg_free(rsp); + return ret; +} + +static int __init ulp_ddp_init(void) +{ + return genl_register_family(&ulp_ddp_nl_family); +} + +subsys_initcall(ulp_ddp_init); From e80233f9a179229981f595b6b1c2a976d3d87bb8 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 30 Apr 2025 08:57:24 +0000 Subject: [PATCH 677/770] iov_iter: skip copy if src == dst for direct data placement When using direct data placement (DDP) the NIC could write the payload directly into the destination buffer and constructs SKBs such that they point to this data. To skip copies when SKB data already resides in the destination buffer we check if (src == dst), and skip the copy when it's true. 
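As a rough illustration of the check (a minimal userspace sketch, not the
patch itself; all names here are made up):

	#include <stdbool.h>
	#include <string.h>

	/* With DDP, the NIC may have DMA-placed the payload at dst already,
	 * in which case the skb fragment aliases the destination buffer.
	 */
	static void copy_payload(void *dst, const void *src, size_t len,
				 bool ddp_enabled)
	{
		if (ddp_enabled && dst == src)
			return;	/* data is already in place, skip the copy */
		memcpy(dst, src, len);
	}
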
Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Chaitanya Kulkarni Reviewed-by: Max Gurtovoy Signed-off-by: NipaLocal --- lib/iov_iter.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index bc9391e55d57e..f9132281a88c8 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -62,7 +62,14 @@ static __always_inline size_t memcpy_to_iter(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { - memcpy(iter_to, from + progress, len); + /* + * When using direct data placement (DDP) the hardware writes + * data directly to the destination buffer, and constructs + * IOVs such that they point to this data. + * Thus, when the src == dst we skip the memcpy. + */ + if (!(IS_ENABLED(CONFIG_ULP_DDP) && iter_to == from + progress)) + memcpy(iter_to, from + progress, len); return 0; } From bade1b2550108d0290e9d0ba58f4c5be1db1694b Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 30 Apr 2025 08:57:25 +0000 Subject: [PATCH 678/770] net/tls,core: export get_netdev_for_sock * remove netdev_sk_get_lowest_dev() from net/core * move get_netdev_for_sock() from net/tls to net/core * update existing users in net/tls/tls_device.c get_netdev_for_sock() is a utility that is used to obtain the net_device structure from a connected socket. Later patches will use this for nvme-tcp DDP and DDP DDGST offloads. Suggested-by: Christoph Hellwig Signed-off-by: Ben Ben-Ishay Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Sagi Grimberg Signed-off-by: NipaLocal --- include/linux/netdevice.h | 5 +++-- net/core/dev.c | 32 ++++++++++++++++++++------------ net/tls/tls_device.c | 31 +++++++++---------------------- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 89efeb5070d0e..36b8f2351c512 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3386,8 +3386,9 @@ void free_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); -struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, - struct sock *sk); +struct net_device *get_netdev_for_sock(struct sock *sk, + netdevice_tracker *tracker, + gfp_t gfp); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, diff --git a/net/core/dev.c b/net/core/dev.c index 66f0c122de80e..52d746e418af5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9087,27 +9087,35 @@ static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev, } /** - * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket - * @dev: device + * get_netdev_for_sock - Get the lowest device in socket * @sk: the socket + * @tracker: tracking object for the acquired reference + * @gfp: allocation flags for the tracker * - * %NULL is returned if no lower device is found. + * Assumes that the socket is already connected. + * Returns the lower device or %NULL if no lower device is found. 
*/ - -struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, - struct sock *sk) +struct net_device *get_netdev_for_sock(struct sock *sk, + netdevice_tracker *tracker, + gfp_t gfp) { - struct net_device *lower; + struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev, *lower; - lower = netdev_sk_get_lower_dev(dev, sk); - while (lower) { + if (unlikely(!dst)) + return NULL; + + dev = dst->dev; + while ((lower = netdev_sk_get_lower_dev(dev, sk))) dev = lower; - lower = netdev_sk_get_lower_dev(dev, sk); - } + if (is_vlan_dev(dev)) + dev = vlan_dev_real_dev(dev); + netdev_hold(dev, tracker, gfp); + dst_release(dst); return dev; } -EXPORT_SYMBOL(netdev_sk_get_lowest_dev); +EXPORT_SYMBOL_GPL(get_netdev_for_sock); static void netdev_adjacent_add_links(struct net_device *dev) { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52f..150410ee2c6c1 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -120,22 +120,6 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) tls_device_free_ctx(ctx); } -/* We assume that the socket is already connected */ -static struct net_device *get_netdev_for_sock(struct sock *sk) -{ - struct dst_entry *dst = sk_dst_get(sk); - struct net_device *netdev = NULL; - - if (likely(dst)) { - netdev = netdev_sk_get_lowest_dev(dst->dev, sk); - dev_hold(netdev); - } - - dst_release(dst); - - return netdev; -} - static void destroy_record(struct tls_record_info *record) { int i; @@ -1060,6 +1044,7 @@ int tls_set_device_offload(struct sock *sk) struct tls_offload_context_tx *offload_ctx; const struct tls_cipher_desc *cipher_desc; struct tls_crypto_info *crypto_info; + netdevice_tracker netdev_tracker; struct tls_prot_info *prot; struct net_device *netdev; struct tls_context *ctx; @@ -1072,7 +1057,7 @@ int tls_set_device_offload(struct sock *sk) if (ctx->priv_ctx_tx) return -EEXIST; - netdev = get_netdev_for_sock(sk); + netdev = get_netdev_for_sock(sk, &netdev_tracker, GFP_KERNEL); if (!netdev) { pr_err_ratelimited("%s: netdev not found\n", __func__); return -EINVAL; @@ -1166,7 +1151,7 @@ int tls_set_device_offload(struct sock *sk) * by the netdev's xmit function. 
*/
	smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
-	dev_put(netdev);
+	netdev_put(netdev, &netdev_tracker);

	return 0;

@@ -1180,7 +1165,7 @@ free_marker_record:
	kfree(start_marker_record);
 release_netdev:
-	dev_put(netdev);
+	netdev_put(netdev, &netdev_tracker);
	return rc;
 }

@@ -1188,13 +1173,15 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 {
	struct tls12_crypto_info_aes_gcm_128 *info;
	struct tls_offload_context_rx *context;
+	netdevice_tracker netdev_tracker;
	struct net_device *netdev;
+	int rc = 0;

	if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
		return -EOPNOTSUPP;

-	netdev = get_netdev_for_sock(sk);
+	netdev = get_netdev_for_sock(sk, &netdev_tracker, GFP_KERNEL);
	if (!netdev) {
		pr_err_ratelimited("%s: netdev not found\n", __func__);
		return -EINVAL;
	}

@@ -1243,7 +1230,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
	tls_device_attach(ctx, sk, netdev);
	up_read(&device_offload_lock);

-	dev_put(netdev);
+	netdev_put(netdev, &netdev_tracker);

	return 0;

@@ -1256,7 +1243,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
 release_lock:
	up_read(&device_offload_lock);
 release_netdev:
-	dev_put(netdev);
+	netdev_put(netdev, &netdev_tracker);
	return rc;
 }

From bdbdc5f4244e759505168ca554c5a6f196d5fa81 Mon Sep 17 00:00:00 2001
From: Boris Pismenny
Date: Wed, 30 Apr 2025 08:57:26 +0000
Subject: [PATCH 679/770] nvme-tcp: Add DDP offload control path

This commit introduces direct data placement offload to NVMe-TCP. There is
a context per queue, which is established after the handshake using the
sk_add/del NDOs.

Additionally, a resynchronization routine is used to assist hardware
recovery from TCP OOO, and continue the offload. Resynchronization
operates as follows:

1. TCP OOO causes the NIC HW to stop the offload

2. NIC HW identifies a PDU header at some TCP sequence number, and asks
NVMe-TCP to confirm it. This request is delivered from the NIC driver to
NVMe-TCP by first finding the socket for the packet that triggered the
request, and then finding the nvme_tcp_queue that is used by this routine.
Finally, the request is recorded in the nvme_tcp_queue.

3. When NVMe-TCP observes the requested TCP sequence, it will compare it
with the PDU header TCP sequence, and report the result to the NIC driver
(resync), which will update the HW, and resume offload when all is
successful.

Some HW implementations such as ConnectX-7 assume linear CCID (0...N-1 for
a queue of size N), where the Linux nvme driver uses part of the 16 bit
CCID for a generation counter. To address that, we use the existing quirk
in the nvme layer when the HW driver advertises that the device does not
support the full 16 bit CCID range.

Furthermore, we let the offloading driver advertise the max hw
sectors/segments via ulp_ddp_limits.

A follow-up patch introduces the data-path changes required for this
offload.

Socket operations need a netdev reference. This reference is dropped on
NETDEV_GOING_DOWN events to allow the device to go down in a follow-up
patch.
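As a sketch of the bookkeeping in step 2 above (simplified from the patch
below; RESYNC_PENDING stands in for ULP_DDP_RESYNC_PENDING and the helper
names are illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	#define RESYNC_PENDING 1ULL

	/* NIC driver side: record the speculative PDU header position */
	static void resync_record(_Atomic uint64_t *state, uint32_t seq)
	{
		atomic_store(state, ((uint64_t)seq << 32) | RESYNC_PENDING);
	}

	/* NVMe-TCP side: check for a pending request and fetch its seq */
	static bool resync_pending(_Atomic uint64_t *state, uint32_t *seq)
	{
		uint64_t val = atomic_load(state);

		if (!(val & RESYNC_PENDING))
			return false;
		*seq = (uint32_t)(val >> 32);
		return true;
	}
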
Signed-off-by: Boris Pismenny Signed-off-by: Ben Ben-Ishay Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Signed-off-by: Max Gurtovoy Reviewed-by: Chaitanya Kulkarni Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: NipaLocal --- drivers/nvme/host/tcp.c | 269 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 258 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 72d260201d8cf..fc9debd649004 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -20,6 +20,10 @@ #include #include +#ifdef CONFIG_ULP_DDP +#include +#endif + #include "nvme.h" #include "fabrics.h" @@ -55,6 +59,16 @@ MODULE_PARM_DESC(tls_handshake_timeout, static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; +#ifdef CONFIG_ULP_DDP +/* NVMeTCP direct data placement and data digest offload will not + * happen if this parameter false (default), regardless of what the + * underlying netdev capabilities are. + */ +static bool ddp_offload; +module_param(ddp_offload, bool, 0644); +MODULE_PARM_DESC(ddp_offload, "Enable or disable NVMeTCP direct data placement support"); +#endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC /* lockdep can detect a circular dependency of the form * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock @@ -129,6 +143,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_LIVE = 1, NVME_TCP_Q_POLLING = 2, NVME_TCP_Q_IO_CPU_SET = 3, + NVME_TCP_Q_OFF_DDP = 4, }; enum nvme_tcp_recv_state { @@ -156,6 +171,18 @@ struct nvme_tcp_queue { size_t ddgst_remaining; unsigned int nr_cqe; +#ifdef CONFIG_ULP_DDP + /* + * resync_tcp_seq is a speculative PDU header tcp seq number (with + * an additional flag in the lower 32 bits) that the HW send to + * the SW, for the SW to verify. + * - The 32 high bits store the seq number + * - The 32 low bits are used as a flag to know if a request + * is pending (ULP_DDP_RESYNC_PENDING). 
+ */ + atomic64_t resync_tcp_seq; +#endif + /* send state */ struct nvme_tcp_request *request; @@ -197,6 +224,14 @@ struct nvme_tcp_ctrl { struct delayed_work connect_work; struct nvme_tcp_request async_req; u32 io_queues[HCTX_MAX_TYPES]; + + struct net_device *ddp_netdev; + netdevice_tracker netdev_tracker; + netdevice_tracker netdev_ddp_tracker; + u32 ddp_threshold; +#ifdef CONFIG_ULP_DDP + struct ulp_ddp_limits ddp_limits; +#endif }; static LIST_HEAD(nvme_tcp_ctrl_list); @@ -335,6 +370,178 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, return nvme_tcp_pdu_data_left(req) <= len; } +#ifdef CONFIG_ULP_DDP + +static struct net_device * +nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) +{ + struct net_device *netdev; + int ret; + + if (!ddp_offload) + return NULL; + + rtnl_lock(); + /* netdev ref is put in nvme_tcp_stop_admin_queue() */ + netdev = get_netdev_for_sock(ctrl->queues[0].sock->sk, &ctrl->netdev_tracker, GFP_KERNEL); + rtnl_unlock(); + if (!netdev) { + dev_dbg(ctrl->ctrl.device, "netdev not found\n"); + return NULL; + } + + if (!ulp_ddp_is_cap_active(netdev, ULP_DDP_CAP_NVME_TCP)) + goto err; + + ret = ulp_ddp_get_limits(netdev, &ctrl->ddp_limits, ULP_DDP_NVME); + if (ret) + goto err; + + if (nvme_tcp_tls_configured(&ctrl->ctrl) && !ctrl->ddp_limits.tls) + goto err; + + return netdev; +err: + netdev_put(netdev, &ctrl->netdev_tracker); + return NULL; +} + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); +static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { + .resync_request = nvme_tcp_resync_request, +}; + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + struct ulp_ddp_config config = {.type = ULP_DDP_NVME}; + int ret; + + config.affinity_hint = queue->io_cpu == WORK_CPU_UNBOUND ? + queue->sock->sk->sk_incoming_cpu : queue->io_cpu; + config.nvmeotcp.pfv = NVME_TCP_PFV_1_0; + config.nvmeotcp.cpda = 0; + config.nvmeotcp.dgst = + queue->hdr_digest ? NVME_TCP_HDR_DIGEST_ENABLE : 0; + config.nvmeotcp.dgst |= + queue->data_digest ? NVME_TCP_DATA_DIGEST_ENABLE : 0; + config.nvmeotcp.queue_size = queue->ctrl->ctrl.sqsize + 1; + + ret = ulp_ddp_sk_add(queue->ctrl->ddp_netdev, + &queue->ctrl->netdev_ddp_tracker, + GFP_KERNEL, + queue->sock->sk, + &config, + &nvme_tcp_ddp_ulp_ops); + if (ret) + return ret; + + set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + + return 0; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{ + clear_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + ulp_ddp_sk_del(queue->ctrl->ddp_netdev, + &queue->ctrl->netdev_ddp_tracker, + queue->sock->sk); +} + +static void nvme_tcp_ddp_apply_limits(struct nvme_tcp_ctrl *ctrl) +{ + ctrl->ctrl.max_segments = ctrl->ddp_limits.max_ddp_sgl_len; + ctrl->ctrl.max_hw_sectors = + ctrl->ddp_limits.max_ddp_sgl_len << (NVME_CTRL_PAGE_SHIFT - SECTOR_SHIFT); + ctrl->ddp_threshold = ctrl->ddp_limits.io_threshold; + + /* offloading HW doesn't support full ccid range, apply the quirk */ + ctrl->ctrl.quirks |= + ctrl->ddp_limits.nvmeotcp.full_ccid_range ? 0 : NVME_QUIRK_SKIP_CID_GEN; +} + +/* In presence of packet drops or network packet reordering, the device may lose + * synchronization between the TCP stream and the L5P framing, and require a + * resync with the kernel's TCP stack. + * + * - NIC HW identifies a PDU header at some TCP sequence number, + * and asks NVMe-TCP to confirm it. 
+ * - When NVMe-TCP observes the requested TCP sequence, it will compare + * it with the PDU header TCP sequence, and report the result to the + * NIC driver + */ +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{ + u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset; + struct net_device *netdev = queue->ctrl->ddp_netdev; + u64 pdu_val = (pdu_seq << 32) | ULP_DDP_RESYNC_PENDING; + u64 resync_val; + u32 resync_seq; + + resync_val = atomic64_read(&queue->resync_tcp_seq); + /* Lower 32 bit flags. Check validity of the request */ + if ((resync_val & ULP_DDP_RESYNC_PENDING) == 0) + return; + + /* + * Obtain and check requested sequence number: is this PDU header + * before the request? + */ + resync_seq = resync_val >> 32; + if (before(pdu_seq, resync_seq)) + return; + + /* + * The atomic operation guarantees that we don't miss any NIC driver + * resync requests submitted after the above checks. + */ + if (atomic64_cmpxchg(&queue->resync_tcp_seq, pdu_val, + pdu_val & ~ULP_DDP_RESYNC_PENDING) != + atomic64_read(&queue->resync_tcp_seq)) + ulp_ddp_resync(netdev, queue->sock->sk, pdu_seq); +} + +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags) +{ + struct nvme_tcp_queue *queue = sk->sk_user_data; + + /* + * "seq" (TCP seq number) is what the HW assumes is the + * beginning of a PDU. The nvme-tcp layer needs to store the + * number along with the "flags" (ULP_DDP_RESYNC_PENDING) to + * indicate that a request is pending. + */ + atomic64_set(&queue->resync_tcp_seq, (((uint64_t)seq << 32) | flags)); + + return true; +} + +#else + +static struct net_device * +nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) +{ + return NULL; +} + +static void nvme_tcp_ddp_apply_limits(struct nvme_tcp_ctrl *ctrl) +{} + +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) +{ + return 0; +} + +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) +{} + +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, + struct sk_buff *skb, unsigned int offset) +{} + +#endif + static void nvme_tcp_init_iter(struct nvme_tcp_request *req, unsigned int dir) { @@ -817,6 +1024,9 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining); int ret; + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_resync_response(queue, skb, *offset); + ret = skb_copy_bits(skb, *offset, &pdu[queue->pdu_offset], rcv_len); if (unlikely(ret)) @@ -1944,6 +2154,8 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) kernel_sock_shutdown(queue->sock, SHUT_RDWR); nvme_tcp_restore_sock_ops(queue); cancel_work_sync(&queue->io_work); + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_unoffload_socket(queue); } static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) @@ -1965,6 +2177,20 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) mutex_unlock(&queue->queue_lock); } +static void nvme_tcp_stop_admin_queue(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + + nvme_tcp_stop_queue(nctrl, 0); + + /* + * We are called twice by nvme_tcp_teardown_admin_queue() + * Set ddp_netdev to NULL to avoid putting it twice + */ + netdev_put(ctrl->ddp_netdev, &ctrl->netdev_tracker); + ctrl->ddp_netdev = NULL; +} + static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) { write_lock_bh(&queue->sock->sk->sk_callback_lock); @@ -1994,17 +2220,35 @@ 
static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) if (idx) { nvme_tcp_set_queue_io_cpu(queue); ret = nvmf_connect_io_queue(nctrl, idx); - } else + if (ret) + goto err; + + if (ctrl->ddp_netdev) { + ret = nvme_tcp_offload_socket(queue); + if (ret) { + dev_info(nctrl->device, + "failed to setup offload on queue %d ret=%d\n", + idx, ret); + } + } + } else { ret = nvmf_connect_admin_queue(nctrl); + if (ret) + goto err; + + ctrl->ddp_netdev = nvme_tcp_get_ddp_netdev_with_limits(ctrl); + if (ctrl->ddp_netdev) + nvme_tcp_ddp_apply_limits(ctrl); - if (!ret) { - set_bit(NVME_TCP_Q_LIVE, &queue->flags); - } else { - if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) - __nvme_tcp_stop_queue(queue); - dev_err(nctrl->device, - "failed to connect queue: %d ret=%d\n", idx, ret); } + + set_bit(NVME_TCP_Q_LIVE, &queue->flags); + return 0; +err: + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + __nvme_tcp_stop_queue(queue); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", idx, ret); return ret; } @@ -2260,7 +2504,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); out_stop_queue: - nvme_tcp_stop_queue(ctrl, 0); + nvme_tcp_stop_admin_queue(ctrl); nvme_cancel_admin_tagset(ctrl); out_cleanup_tagset: if (new) @@ -2275,7 +2519,7 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, { nvme_quiesce_admin_queue(ctrl); blk_sync_queue(ctrl->admin_q); - nvme_tcp_stop_queue(ctrl, 0); + nvme_tcp_stop_admin_queue(ctrl); nvme_cancel_admin_tagset(ctrl); if (remove) { nvme_unquiesce_admin_queue(ctrl); @@ -2618,7 +2862,10 @@ static void nvme_tcp_complete_timed_out(struct request *rq) struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + if (nvme_tcp_admin_queue(req->queue)) + nvme_tcp_stop_admin_queue(ctrl); + else + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); nvmf_complete_timed_out_request(rq); } From fb90f2662cdd280c460435f3f59025aeebe7db52 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Wed, 30 Apr 2025 08:57:27 +0000 Subject: [PATCH 680/770] nvme-tcp: Add DDP data-path Introduce the NVMe-TCP DDP data-path offload. Using this interface, the NIC hardware will scatter TCP payload directly to the BIO pages according to the command_id in the PDU. To maintain the correctness of the network stack, the driver is expected to construct SKBs that point to the BIO pages. The data-path interface contains two routines: setup/teardown. The setup provides the mapping from command_id to the request buffers, while the teardown removes this mapping. For efficiency, we introduce an asynchronous nvme completion, which is split between NVMe-TCP and the NIC driver as follows: NVMe-TCP performs the specific completion, while NIC driver performs the generic mq_blk completion. 
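A condensed sketch of the split completion (simplified from the patch below;
the stubs stand in for the blk-mq and NIC driver calls):

	#include <stdbool.h>

	struct io_req {
		int status;
		bool offloaded;	/* NIC still holds the DDP mapping */
	};

	static void generic_complete(struct io_req *req) { /* blk-mq side */ }
	static void nic_teardown(struct io_req *req) { /* NIC driver side */ }

	/* NVMe-TCP records the result, then defers to the NIC driver */
	static void ulp_complete(struct io_req *req, int status)
	{
		req->status = status;
		if (req->offloaded) {
			nic_teardown(req);	/* teardown_done() finishes it */
			return;
		}
		generic_complete(req);
	}

	/* invoked by the NIC driver once the HW mapping is really gone */
	static void teardown_done(struct io_req *req)
	{
		generic_complete(req);
	}
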
Signed-off-by: Boris Pismenny Signed-off-by: Ben Ben-Ishay Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Chaitanya Kulkarni Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: NipaLocal --- drivers/nvme/host/tcp.c | 136 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 131 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index fc9debd649004..498b524b8eef6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -129,6 +129,9 @@ struct nvme_tcp_request { struct llist_node lentry; __le32 ddgst; + /* ddp async completion */ + union nvme_result result; + struct bio *curr_bio; struct iov_iter iter; @@ -136,6 +139,11 @@ struct nvme_tcp_request { size_t offset; size_t data_sent; enum nvme_tcp_send_state state; + +#ifdef CONFIG_ULP_DDP + bool offloaded; + struct ulp_ddp_io ddp; +#endif }; enum nvme_tcp_queue_flags { @@ -372,6 +380,25 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req, #ifdef CONFIG_ULP_DDP +static bool nvme_tcp_is_ddp_offloaded(const struct nvme_tcp_request *req) +{ + return req->offloaded; +} + +static int nvme_tcp_ddp_alloc_sgl(struct nvme_tcp_request *req, struct request *rq) +{ + int ret; + + req->ddp.sg_table.sgl = req->ddp.first_sgl; + ret = sg_alloc_table_chained(&req->ddp.sg_table, + blk_rq_nr_phys_segments(rq), + req->ddp.sg_table.sgl, SG_CHUNK_SIZE); + if (ret) + return -ENOMEM; + req->ddp.nents = blk_rq_map_sg(rq, req->ddp.sg_table.sgl); + return 0; +} + static struct net_device * nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) { @@ -407,10 +434,68 @@ nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) } static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); +static void nvme_tcp_ddp_teardown_done(void *ddp_ctx); static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { .resync_request = nvme_tcp_resync_request, + .ddp_teardown_done = nvme_tcp_ddp_teardown_done, }; +static void nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct net_device *netdev = queue->ctrl->ddp_netdev; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + ulp_ddp_teardown(netdev, queue->sock->sk, &req->ddp, rq); + sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE); +} + +static void nvme_tcp_ddp_teardown_done(void *ddp_ctx) +{ + struct request *rq = ddp_ctx; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + if (!nvme_try_complete_req(rq, req->status, req->result)) + nvme_complete_rq(rq); +} + +static void nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue, + struct request *rq) +{ + struct net_device *netdev = queue->ctrl->ddp_netdev; + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + int ret; + + if (rq_data_dir(rq) != READ || + queue->ctrl->ddp_threshold > blk_rq_payload_bytes(rq)) + return; + + /* + * DDP offload is best-effort, errors are ignored. 
+ */ + + req->ddp.command_id = nvme_cid(rq); + ret = nvme_tcp_ddp_alloc_sgl(req, rq); + if (ret) + goto err; + + ret = ulp_ddp_setup(netdev, queue->sock->sk, &req->ddp); + if (ret) { + sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE); + goto err; + } + + /* if successful, sg table is freed in nvme_tcp_teardown_ddp() */ + req->offloaded = true; + + return; +err: + WARN_ONCE(ret, "ddp setup failed (queue 0x%x, cid 0x%x, ret=%d)", + nvme_tcp_queue_id(queue), + nvme_cid(rq), + ret); +} + static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) { struct ulp_ddp_config config = {.type = ULP_DDP_NVME}; @@ -519,6 +604,11 @@ static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags) #else +static bool nvme_tcp_is_ddp_offloaded(const struct nvme_tcp_request *req) +{ + return false; +} + static struct net_device * nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) { @@ -536,6 +626,14 @@ static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) {} +static void nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue, + struct request *rq) +{} + +static void nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue, + struct request *rq) +{} + static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int offset) {} @@ -817,6 +915,26 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } +static void nvme_tcp_complete_request(struct request *rq, + __le16 status, + union nvme_result result, + __u16 command_id) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + req->status = status; + req->result = result; + + if (nvme_tcp_is_ddp_offloaded(req)) { + /* complete when teardown is confirmed to be done */ + nvme_tcp_teardown_ddp(req->queue, rq); + return; + } + + if (!nvme_try_complete_req(rq, status, result)) + nvme_complete_rq(rq); +} + static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, struct nvme_completion *cqe) { @@ -836,10 +954,9 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) req->status = cqe->status; - if (!nvme_try_complete_req(rq, req->status, cqe->result)) - nvme_complete_rq(rq); + nvme_tcp_complete_request(rq, req->status, cqe->result, + cqe->command_id); queue->nr_cqe++; - return 0; } @@ -1093,10 +1210,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, static inline void nvme_tcp_end_request(struct request *rq, u16 status) { + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_queue *queue = req->queue; + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; union nvme_result res = {}; - if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) - nvme_complete_rq(rq); + nvme_tcp_complete_request(rq, cpu_to_le16(status << 1), res, + pdu->command_id); } static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, @@ -2941,6 +3061,9 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, if (ret) return ret; +#ifdef CONFIG_ULP_DDP + req->offloaded = false; +#endif req->state = NVME_TCP_SEND_CMD_PDU; req->status = cpu_to_le16(NVME_SC_SUCCESS); req->offset = 0; @@ -2979,6 +3102,9 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return ret; } + if (test_bit(NVME_TCP_Q_OFF_DDP, &queue->flags)) + nvme_tcp_setup_ddp(queue, rq); + return 0; } From 33dd863e4519ac9286d0f28e6e282522d1ec18f5 Mon Sep 17 
00:00:00 2001 From: Yoray Zack Date: Wed, 30 Apr 2025 08:57:28 +0000 Subject: [PATCH 681/770] nvme-tcp: RX DDGST offload Enable rx side of DDGST offload when supported. At the end of the capsule, check if all the skb bits are on, and if not recalculate the DDGST in SW and check it. Signed-off-by: Yoray Zack Signed-off-by: Boris Pismenny Signed-off-by: Ben Ben-Ishay Signed-off-by: Or Gerlitz Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Chaitanya Kulkarni Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: NipaLocal --- drivers/nvme/host/tcp.c | 96 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 498b524b8eef6..81d97fe5b2c87 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -152,6 +152,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_POLLING = 2, NVME_TCP_Q_IO_CPU_SET = 3, NVME_TCP_Q_OFF_DDP = 4, + NVME_TCP_Q_OFF_DDGST_RX = 5, }; enum nvme_tcp_recv_state { @@ -189,6 +190,7 @@ struct nvme_tcp_queue { * is pending (ULP_DDP_RESYNC_PENDING). */ atomic64_t resync_tcp_seq; + bool ddp_ddgst_valid; #endif /* send state */ @@ -433,6 +435,46 @@ nvme_tcp_get_ddp_netdev_with_limits(struct nvme_tcp_ctrl *ctrl) return NULL; } +static bool nvme_tcp_ddp_ddgst_ok(struct nvme_tcp_queue *queue) +{ + return queue->ddp_ddgst_valid; +} + +static void nvme_tcp_ddp_ddgst_update(struct nvme_tcp_queue *queue, + struct sk_buff *skb) +{ + if (queue->ddp_ddgst_valid) + queue->ddp_ddgst_valid = skb_is_ulp_crc(skb); +} + +static void nvme_tcp_ddp_ddgst_recalc(struct ahash_request *hash, + struct request *rq, + __le32 *ddgst) +{ + struct nvme_tcp_request *req; + + req = blk_mq_rq_to_pdu(rq); + if (!req->offloaded) { + /* if we have DDGST_RX offload but DDP was skipped + * because it's under the min IO threshold then the + * request won't have an SGL, so we need to make it + * here + */ + if (nvme_tcp_ddp_alloc_sgl(req, rq)) + return; + } + + ahash_request_set_crypt(hash, req->ddp.sg_table.sgl, (u8 *)ddgst, + req->data_len); + crypto_ahash_digest(hash); + if (!req->offloaded) { + /* without DDP, ddp_teardown() won't be called, so + * free the table here + */ + sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE); + } +} + static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags); static void nvme_tcp_ddp_teardown_done(void *ddp_ctx); static const struct ulp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops = { @@ -521,6 +563,10 @@ static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) return ret; set_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + if (queue->data_digest && + ulp_ddp_is_cap_active(queue->ctrl->ddp_netdev, + ULP_DDP_CAP_NVME_TCP_DDGST_RX)) + set_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags); return 0; } @@ -528,6 +574,7 @@ static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue) static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue) { clear_bit(NVME_TCP_Q_OFF_DDP, &queue->flags); + clear_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags); ulp_ddp_sk_del(queue->ctrl->ddp_netdev, &queue->ctrl->netdev_ddp_tracker, queue->sock->sk); @@ -638,6 +685,20 @@ static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int offset) {} +static bool nvme_tcp_ddp_ddgst_ok(struct nvme_tcp_queue *queue) +{ + return false; +} + +static void nvme_tcp_ddp_ddgst_update(struct nvme_tcp_queue *queue, + struct sk_buff *skb) +{} + +static void nvme_tcp_ddp_ddgst_recalc(struct ahash_request *hash, + struct request 
*rq,
+					      __le32 *ddgst)
+{}
+
 #endif

 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
@@ -904,6 +965,9 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
	queue->pdu_offset = 0;
	queue->data_remaining = -1;
	queue->ddgst_remaining = 0;
+#ifdef CONFIG_ULP_DDP
+	queue->ddp_ddgst_valid = true;
+#endif
 }

 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
@@ -1227,6 +1291,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

+	if (test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
+		nvme_tcp_ddp_ddgst_update(queue, skb);
+
	while (true) {
		int recv_len, ret;

@@ -1255,7 +1322,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
		recv_len = min_t(size_t, recv_len,
				iov_iter_count(&req->iter));

-		if (queue->data_digest)
+		if (queue->data_digest &&
+		    !test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
				&req->iter, recv_len, queue->rcv_hash);
		else
@@ -1297,8 +1365,11 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
	char *ddgst = (char *)&queue->recv_ddgst;
	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	struct request *rq;
	int ret;

+	if (test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags))
+		nvme_tcp_ddp_ddgst_update(queue, skb);
	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
	if (unlikely(ret))
		return ret;
@@ -1309,9 +1380,25 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
	if (queue->ddgst_remaining)
		return 0;

+	rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
+			    pdu->command_id);
+
+	if (test_bit(NVME_TCP_Q_OFF_DDGST_RX, &queue->flags)) {
+		/*
+		 * If HW successfully offloaded the digest
+		 * verification, we can skip it
+		 */
+		if (nvme_tcp_ddp_ddgst_ok(queue))
+			goto ddgst_valid;
+		/*
+		 * Otherwise we have to recalculate and verify the
+		 * digest with the software-fallback
+		 */
+		nvme_tcp_ddp_ddgst_recalc(queue->rcv_hash, rq,
+					  &queue->exp_ddgst);
+	}
+
	if (queue->recv_ddgst != queue->exp_ddgst) {
-		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
-						    pdu->command_id);
		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

		req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
@@ -1322,9 +1409,8 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
			le32_to_cpu(queue->exp_ddgst));
	}

+ddgst_valid:
	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
-		struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
-						    pdu->command_id);
		struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);

		nvme_tcp_end_request(rq, le16_to_cpu(req->status));

From 07c797c6c0910e6e582992a0f25624da6f108446 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Wed, 30 Apr 2025 08:57:29 +0000
Subject: [PATCH 682/770] nvme-tcp: Deal with netdevice DOWN events

For ddp setup/teardown and resync, the offloading logic uses HW resources
at the NIC driver such as SQ and CQ. These resources are destroyed when
the netdevice goes down and hence we must stop using them before the NIC
driver destroys them.

Use netdevice notifier for that matter -- offloaded connections are
stopped before the stack continues to call the NIC driver close ndo.

We use the existing recovery flow which has the advantage of resuming the
offload once the connection is re-established.
This also buys us proper handling for the UNREGISTER event b/c our offloading starts in the UP state, and down is always there between up to unregister. Signed-off-by: Or Gerlitz Signed-off-by: Boris Pismenny Signed-off-by: Ben Ben-Ishay Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: NipaLocal --- drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 81d97fe5b2c87..bf57a88a984b4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -246,6 +246,7 @@ struct nvme_tcp_ctrl { static LIST_HEAD(nvme_tcp_ctrl_list); static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); +static struct notifier_block nvme_tcp_netdevice_nb; static struct workqueue_struct *nvme_tcp_wq; static const struct blk_mq_ops nvme_tcp_mq_ops; static const struct blk_mq_ops nvme_tcp_admin_mq_ops; @@ -3454,6 +3455,32 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, return ERR_PTR(ret); } +static int nvme_tcp_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ +#ifdef CONFIG_ULP_DDP + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct nvme_tcp_ctrl *ctrl; + + if (event != NETDEV_GOING_DOWN) + return NOTIFY_DONE; + + mutex_lock(&nvme_tcp_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) { + if (ndev == ctrl->ddp_netdev) + nvme_tcp_error_recovery(&ctrl->ctrl); + } + mutex_unlock(&nvme_tcp_ctrl_mutex); + flush_workqueue(nvme_reset_wq); + /* + * The associated controllers teardown has completed, + * ddp contexts were also torn down so we should be + * safe to continue... + */ +#endif + return NOTIFY_DONE; +} + static struct nvmf_transport_ops nvme_tcp_transport = { .name = "tcp", .module = THIS_MODULE, @@ -3471,6 +3498,7 @@ static int __init nvme_tcp_init_module(void) { unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; int cpu; + int ret; BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); @@ -3490,9 +3518,19 @@ static int __init nvme_tcp_init_module(void) for_each_possible_cpu(cpu) atomic_set(&nvme_tcp_cpu_queues[cpu], 0); + nvme_tcp_netdevice_nb.notifier_call = nvme_tcp_netdev_event; + ret = register_netdevice_notifier(&nvme_tcp_netdevice_nb); + if (ret) { + pr_err("failed to register netdev notifier\n"); + goto out_free_workqueue; + } nvmf_register_transport(&nvme_tcp_transport); return 0; + +out_free_workqueue: + destroy_workqueue(nvme_tcp_wq); + return ret; } static void __exit nvme_tcp_cleanup_module(void) @@ -3500,6 +3538,7 @@ static void __exit nvme_tcp_cleanup_module(void) struct nvme_tcp_ctrl *ctrl; nvmf_unregister_transport(&nvme_tcp_transport); + unregister_netdevice_notifier(&nvme_tcp_netdevice_nb); mutex_lock(&nvme_tcp_ctrl_mutex); list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) From da981d87fff421dc79bfd2f2c9a3f996f72b9501 Mon Sep 17 00:00:00 2001 From: Yoray Zack Date: Wed, 30 Apr 2025 08:57:30 +0000 Subject: [PATCH 683/770] Documentation: add ULP DDP offload documentation Document the new ULP DDP API and add it under "networking". Use NVMe-TCP implementation as an example. 
Signed-off-by: Boris Pismenny
Signed-off-by: Ben Ben-Ishay
Signed-off-by: Or Gerlitz
Signed-off-by: Yoray Zack
Signed-off-by: Shai Malin
Signed-off-by: Aurelien Aptel
Reviewed-by: Bagas Sanjaya
Signed-off-by: NipaLocal
---
 Documentation/networking/index.rst           |   1 +
 Documentation/networking/ulp-ddp-offload.rst | 372 +++++++++++++++++++
 2 files changed, 373 insertions(+)
 create mode 100644 Documentation/networking/ulp-ddp-offload.rst

diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index ac90b82f3ce95..41ab7d1f9c0ec 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -120,6 +120,7 @@ Contents:
   tc-queue-filters
   tcp_ao
   tcp-thin
+  ulp-ddp-offload
   team
   timestamping
   tipc
diff --git a/Documentation/networking/ulp-ddp-offload.rst b/Documentation/networking/ulp-ddp-offload.rst
new file mode 100644
index 0000000000000..4133e5094ff54
--- /dev/null
+++ b/Documentation/networking/ulp-ddp-offload.rst
@@ -0,0 +1,372 @@
+.. SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+=================================
+ULP direct data placement offload
+=================================
+
+Overview
+========
+
+The Linux kernel ULP direct data placement (DDP) offload infrastructure
+provides tagged request-response protocols, such as NVMe-TCP, the ability to
+place response data directly in pre-registered buffers according to header
+tags. DDP is particularly useful for data-intensive pipelined protocols whose
+responses may be reordered.
+
+For example, in NVMe-TCP numerous read requests are sent together and each
+request is tagged using the PDU header CID field. Receiving servers process
+requests as fast as possible and sometimes responses for smaller requests
+bypass responses to larger requests, e.g., 4KB reads bypass 1GB reads.
+Thereafter, clients correlate responses to requests using PDU header CID tags.
+The processing of each response requires copying data from SKBs to read
+request destination buffers; the offload avoids this copy. The offload is
+oblivious to destination buffers which can reside either in userspace
+(O_DIRECT) or in kernel pagecache.
+
+Request TCP byte-stream:
+
+.. parsed-literal::
+
+ +---------------+-------+---------------+-------+---------------+-------+
+ | PDU hdr CID=1 | Req 1 | PDU hdr CID=2 | Req 2 | PDU hdr CID=3 | Req 3 |
+ +---------------+-------+---------------+-------+---------------+-------+
+
+Response TCP byte-stream:
+
+.. parsed-literal::
+
+ +---------------+--------+---------------+--------+---------------+--------+
+ | PDU hdr CID=2 | Resp 2 | PDU hdr CID=3 | Resp 3 | PDU hdr CID=1 | Resp 1 |
+ +---------------+--------+---------------+--------+---------------+--------+
+
+The driver builds SKB page fragments that point to destination buffers.
+Consequently, SKBs represent the original data on the wire, which enables
+*transparent* inter-operation with the network stack. To avoid copies between
+SKBs and destination buffers, the layer-5 protocol (L5P) will check
+``if (src == dst)`` for SKB page fragments; success indicates that data is
+already placed there by NIC hardware and the copy should be skipped.
+
+In addition, L5P might have DDGST which ensures data integrity over
+the network. If not offloaded, ULP DDP might not be efficient as L5P
+will need to go over the data and calculate it by itself, cancelling
+out the benefits of the DDP copy skip. ULP DDP has support for Rx/Tx
+DDGST offload. On the receive side the NIC will verify DDGST for
+received PDUs and update SKB->ulp_ddp and SKB->ulp_crc bits. If all the SKBs
+making up an L5P PDU have crc on, L5P will skip calculating and
+verifying the DDGST for the corresponding PDU. On the Tx side, the NIC
+will be responsible for calculating and filling the DDGST fields in
+the sent PDUs.
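+
+As a minimal sketch of the receive-side aggregation, assuming a per-PDU
+validity flag next to the per-SKB crc bit described above (the struct and
+helper names here are illustrative, modeled on the NVMe-TCP code in this
+series):
+
+.. code-block:: c
+
+    static void pdu_ddgst_update(struct pdu_rx_state *pdu,
+                                 struct sk_buff *skb)
+    {
+        /* one SKB without crc forces a software DDGST fallback */
+        if (pdu->ddgst_valid)
+            pdu->ddgst_valid = skb_is_ulp_crc(skb);
+    }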
+Offloading does require NIC hardware to track L5P protocol framing, similarly
+to RX TLS offload (see Documentation/networking/tls-offload.rst). NIC hardware
+will parse PDU headers, extract fields such as operation type, length, tag
+identifier, etc., and only offload segments that correspond to tags registered
+with the NIC, see the :ref:`buf_reg` section.
+
+Device configuration
+====================
+
+During driver initialization the driver sets the ULP DDP operations
+for the :c:type:`struct net_device <net_device>` via
+`netdev->netdev_ops->ulp_ddp_ops`.
+
+The :c:member:`get_caps` operation returns the ULP DDP capabilities
+enabled and/or supported by the device to the caller. The current list
+of capabilities is represented as a bitset:
+
+.. code-block:: c
+
+    enum ulp_ddp_cap {
+        ULP_DDP_CAP_NVME_TCP,
+        ULP_DDP_CAP_NVME_TCP_DDGST,
+    };
+
+The enablement of capabilities can be controlled via the
+:c:member:`set_caps` operation. This operation is exposed to userspace
+via netlink. See Documentation/netlink/specs/ulp_ddp.yaml for more
+details.
+
+Later, after the L5P completes its handshake, the L5P queries the
+driver for its runtime limitations via the :c:member:`limits` operation:
+
+.. code-block:: c
+
+    int (*limits)(struct net_device *netdev,
+                  struct ulp_ddp_limits *lim);
+
+
+All L5Ps share a common set of limits and parameters (:c:type:`struct ulp_ddp_limits <ulp_ddp_limits>`):
+
+.. code-block:: c
+
+    /**
+     * struct ulp_ddp_limits - Generic ulp ddp limits: tcp ddp
+     * protocol limits.
+     * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.).
+     *
+     * @type: type of this limits struct
+     * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit)
+     * @io_threshold: minimum payload size required to offload
+     * @tls: support for ULP over TLS
+     * @nvmeotcp: NVMe-TCP specific limits
+     */
+    struct ulp_ddp_limits {
+        enum ulp_ddp_type type;
+        int max_ddp_sgl_len;
+        int io_threshold;
+        bool tls:1;
+        union {
+            /* ... protocol-specific limits ... */
+            struct nvme_tcp_ddp_limits nvmeotcp;
+        };
+    };
+
+But each L5P can also add protocol-specific limits, e.g.:
+
+.. code-block:: c
+
+    /**
+     * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
+     *
+     * @full_ccid_range: true if the driver supports the full CID range
+     */
+    struct nvme_tcp_ddp_limits {
+        bool full_ccid_range;
+    };
+
+Once the L5P has made sure the device is supported, the offload
+operations are installed on the socket.
+
+If offload installation fails, then the connection is handled by software as if
+offload was not attempted.
+
+To request offload for a socket `sk`, the L5P calls :c:member:`sk_add`:
+
+.. code-block:: c
+
+    int (*sk_add)(struct net_device *netdev,
+                  struct sock *sk,
+                  struct ulp_ddp_config *config);
+
+The function returns 0 on success. In case of failure, L5P software should
+fall back to normal non-offloaded operations. The `config` parameter indicates
+the L5P type and any metadata relevant for that protocol. For example, in
+NVMe-TCP the following config is used:
+
+.. code-block:: c
+
+    /**
+     * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
+     *
+     * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
+     * @cpda: controller pdu data alignment (dwords, 0's based)
+     * @dgst: digest types enabled.
+     *        The netdev will offload crc if L5P data digest is supported.
+     * @queue_size: number of nvme-tcp IO queue elements
+     */
+    struct nvme_tcp_ddp_config {
+        u16 pfv;
+        u8 cpda;
+        u8 dgst;
+        int queue_size;
+    };
+
+When offload is not needed anymore, e.g., when the socket is being released,
+the L5P calls :c:member:`sk_del` to release device contexts:
+
+.. code-block:: c
+
+    void (*sk_del)(struct net_device *netdev,
+                   struct sock *sk);
+
+Normal operation
+================
+
+At the very least, the device maintains the following state for each connection:
+
+ * 5-tuple
+ * expected TCP sequence number
+ * mapping between tags and corresponding buffers
+ * current offset within PDU, PDU length, current PDU tag
+
+NICs should not assume any correlation between PDUs and TCP packets.
+If TCP packets arrive in-order, offload will place PDU payloads
+directly inside corresponding registered buffers. NIC offload should
+not delay packets. If offload is not possible, then the packet is
+passed as-is to software. To perform offload on incoming packets
+without buffering packets in the NIC, the NIC stores some inter-packet
+state, such as partial PDU headers.
+
+RX data-path
+------------
+
+After the device validates TCP checksums, it can perform DDP offload. The
+packet is steered to the DDP offload context according to the 5-tuple.
+Thereafter, the expected TCP sequence number is checked against the packet
+TCP sequence number. If there is a match, offload is performed: the PDU payload
+is DMA written to the corresponding destination buffer according to the PDU header
+tag. The data should be DMAed only once, and the NIC receive ring will only
+store the remaining TCP and PDU headers.
+
+We remark that a single TCP packet may have numerous PDUs embedded inside. NICs
+can choose to offload one or more of these PDUs according to various
+trade-offs. Possibly, offloading such small PDUs is of little value, and it is
+better to leave it to software.
+
+Upon receiving a DDP offloaded packet, the driver reconstructs the original SKB
+using page frags, while pointing to the destination buffers whenever possible.
+This method enables seamless integration with the network stack, which can
+inspect and modify packet fields transparently to the offload.
+
+.. _buf_reg:
+
+Destination buffer registration
+-------------------------------
+
+To register the mapping between tags and destination buffers for a socket
+`sk`, the L5P calls :c:member:`setup` of :c:type:`struct ulp_ddp_dev_ops
+<ulp_ddp_dev_ops>`:
+
+.. code-block:: c
+
+    int (*setup)(struct net_device *netdev,
+                 struct sock *sk,
+                 struct ulp_ddp_io *io);
+
+
+The `io` provides the buffer via scatter-gather list (`sg_table`) and
+corresponding tag (`command_id`):
+
+.. code-block:: c
+
+    /**
+     * struct ulp_ddp_io - tcp ddp configuration for an IO request.
+     *
+     * @command_id: identifier on the wire associated with these buffers
+     * @nents: number of entries in the sg_table
+     * @sg_table: describing the buffers for this IO request
+     * @first_sgl: first SGL in sg_table
+     */
+    struct ulp_ddp_io {
+        u32 command_id;
+        int nents;
+        struct sg_table sg_table;
+        struct scatterlist first_sgl[SG_CHUNK_SIZE];
+    };
+
+After the buffers have been consumed by the L5P, to release the NIC mapping of
+buffers the L5P calls :c:member:`teardown` of :c:type:`struct
+ulp_ddp_dev_ops <ulp_ddp_dev_ops>`:
+
+.. code-block:: c
+
+    void (*teardown)(struct net_device *netdev,
+                     struct sock *sk,
+                     struct ulp_ddp_io *io,
+                     void *ddp_ctx);
+
+`teardown` receives the same `io` context and an additional opaque
+`ddp_ctx` that is used for asynchronous teardown, see the :ref:`async_release`
+section.
+
+.. _async_release:
+
+Asynchronous teardown
+---------------------
+
+To tear down the association between tags and buffers and allow tag reuse, NIC
+HW is called by the NIC driver during `teardown`. This operation may be
+performed either synchronously or asynchronously. In asynchronous teardown,
+`teardown` returns immediately without unmapping NIC HW buffers. Later,
+when the unmapping completes by NIC HW, the NIC driver will call up to L5P
+using :c:member:`ddp_teardown_done` of :c:type:`struct ulp_ddp_ulp_ops <ulp_ddp_ulp_ops>`:
+
+.. code-block:: c
+
+    void (*ddp_teardown_done)(void *ddp_ctx);
+
+The `ddp_ctx` parameter passed in `ddp_teardown_done` is the same one provided
+in `teardown`, and it is used to carry some context about the buffers
+and tags that are released.
+
+Resync handling
+===============
+
+RX
+--
+In presence of packet drops or network packet reordering, the device may lose
+synchronization between the TCP stream and the L5P framing, and require a
+resync with the kernel's TCP stack. When the device is out of sync, no offload
+takes place, and packets are passed as-is to software. Resync is very similar
+to TLS offload (see documentation at Documentation/networking/tls-offload.rst).
+
+If only packets with L5P data are lost or reordered, then resynchronization may
+be avoided by NIC HW that keeps tracking PDU headers. If, however, PDU headers
+are reordered, then resynchronization is necessary.
+
+To resynchronize hardware during traffic, we use a handshake between hardware
+and software. The NIC HW searches for a sequence of bytes that identifies L5P
+headers (i.e., magic pattern). For example, in NVMe-TCP, the PDU operation
+type can be used for this purpose. Using the PDU header length field, the NIC
+HW will continue to find and match magic patterns in subsequent PDU headers. If
+the pattern is missing in an expected position, then searching for the pattern
+starts anew.
+
+The NIC will not resume offload when the magic pattern is first identified.
+Instead, it will request L5P software to confirm that indeed this is a PDU
+header. To request confirmation the NIC driver calls up to L5P using
+:c:member:`resync_request` of :c:type:`struct ulp_ddp_ulp_ops <ulp_ddp_ulp_ops>`:
+
+.. code-block:: c
+
+    bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
+
+The `seq` parameter contains the TCP sequence of the last byte in the PDU header.
+The `flags` parameter contains a flag (`ULP_DDP_RESYNC_PENDING`) indicating whether
+a request is pending or not.
+L5P software will respond to this request after observing the packet containing
+TCP sequence `seq` in-order. If the PDU header is indeed there, then L5P
+software calls the NIC driver using the :c:member:`resync` function of
+the :c:type:`struct ulp_ddp_dev_ops <ulp_ddp_dev_ops>` inside the :c:type:`struct
+net_device <net_device>` while passing the same `seq` to confirm it is a PDU
+header.
+
+.. code-block:: c
+
+    void (*resync)(struct net_device *netdev,
+                   struct sock *sk, u32 seq);
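+
+A condensed sketch of the L5P side of this handshake (mirroring the
+NVMe-TCP implementation in this series; the pending-sequence helpers are
+illustrative, and ``before()`` is the kernel's TCP sequence comparison):
+
+.. code-block:: c
+
+    bool my_ulp_resync_request(struct sock *sk, u32 seq, u32 flags)
+    {
+        /* remember the speculative PDU header position */
+        record_pending_resync(sk, seq, flags);
+        return true;
+    }
+
+    /* called when software parses a PDU header at TCP sequence pdu_seq */
+    void my_ulp_check_resync(struct net_device *netdev, struct sock *sk,
+                             u32 pdu_seq)
+    {
+        u32 seq;
+
+        if (pending_resync(sk, &seq) && !before(pdu_seq, seq))
+            ulp_ddp_resync(netdev, sk, pdu_seq);
+    }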
+Statistics
+==========
+
+Per L5P protocol, the NIC driver must report statistics for the above
+netdevice operations and packets processed by offload.
+These statistics are per-device and can be retrieved from userspace
+via netlink (see Documentation/netlink/specs/ulp_ddp.yaml).
+
+For example, NVMe-TCP offload reports:
+
+ * ``rx_nvme_tcp_sk_add`` - number of NVMe-TCP Rx offload contexts created.
+ * ``rx_nvme_tcp_sk_add_fail`` - number of NVMe-TCP Rx offload context creation
+   failures.
+ * ``rx_nvme_tcp_sk_del`` - number of NVMe-TCP Rx offload contexts destroyed.
+ * ``rx_nvme_tcp_setup`` - number of DDP buffers mapped.
+ * ``rx_nvme_tcp_setup_fail`` - number of DDP buffer mappings that failed.
+ * ``rx_nvme_tcp_teardown`` - number of DDP buffers unmapped.
+ * ``rx_nvme_tcp_drop`` - number of packets dropped in the driver due to fatal
+   errors.
+ * ``rx_nvme_tcp_resync`` - number of packets with resync requests.
+ * ``rx_nvme_tcp_packets`` - number of packets that used offload.
+ * ``rx_nvme_tcp_bytes`` - number of bytes placed in DDP buffers.
+
+NIC requirements
+================
+
+NIC hardware should meet the following requirements to provide this offload:
+
+ * Offload must never buffer TCP packets.
+ * Offload must never modify TCP packet headers.
+ * Offload must never reorder TCP packets within a flow.
+ * Offload must never drop TCP packets.
+ * Offload must not depend on any TCP fields beyond the
+   5-tuple and TCP sequence number.

From 0f5da109edb0ecb6f85b0ee8b55e81b2c3834970 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Wed, 30 Apr 2025 08:57:31 +0000
Subject: [PATCH 684/770] net/mlx5e: Rename from tls to transport static params

The static params structure is used in TLS, but also in other transports
we're offloading, like nvmeotcp:

- Rename the relevant structures/fields
- Create a common file for the appropriate transports
- Apply changes in the TLS code

No functional change here.

Signed-off-by: Or Gerlitz
Signed-off-by: Ben Ben-Ishay
Signed-off-by: Aurelien Aptel
Reviewed-by: Tariq Toukan
Signed-off-by: NipaLocal
---
 .../mlx5/core/en_accel/common_utils.h         | 34 +++++++++++++++
 .../mellanox/mlx5/core/en_accel/ktls.c        |  3 +-
 .../mellanox/mlx5/core/en_accel/ktls_rx.c     |  6 +--
 .../mellanox/mlx5/core/en_accel/ktls_tx.c     |  9 ++--
 .../mellanox/mlx5/core/en_accel/ktls_txrx.c   | 41 +++++++++----------
 .../mellanox/mlx5/core/en_accel/ktls_utils.h  | 17 +-------
 include/linux/mlx5/device.h                   |  8 ++--
 include/linux/mlx5/mlx5_ifc.h                 |  8 +++-
 8 files changed, 75 insertions(+), 51 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/common_utils.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/common_utils.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/common_utils.h
new file mode 100644
index 0000000000000..b3efed9161672
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/common_utils.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
*/ +#ifndef __MLX5E_COMMON_UTILS_H__ +#define __MLX5E_COMMON_UTILS_H__ + +#include "en.h" + +struct mlx5e_set_transport_static_params_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_wqe_umr_ctrl_seg uctrl; + struct mlx5_mkey_seg mkc; + struct mlx5_wqe_transport_static_params_seg params; +}; + +/* macros for transport_static_params handling */ +#define MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS \ + (DIV_ROUND_UP(sizeof(struct mlx5e_set_transport_static_params_wqe), \ + MLX5_SEND_WQE_BB)) + +#define MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi) \ + ((struct mlx5e_set_transport_static_params_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, \ + sizeof(struct mlx5e_set_transport_static_params_wqe))) + +#define MLX5E_TRANSPORT_STATIC_PARAMS_WQE_SZ \ + (sizeof(struct mlx5e_set_transport_static_params_wqe)) + +#define MLX5E_TRANSPORT_STATIC_PARAMS_DS_CNT \ + (DIV_ROUND_UP(MLX5E_TRANSPORT_STATIC_PARAMS_WQE_SZ, MLX5_SEND_WQE_DS)) + +#define MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE \ + (MLX5_ST_SZ_BYTES(transport_static_params) / MLX5_SEND_WQE_DS) + +#endif /* __MLX5E_COMMON_UTILS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c index e3e57c849436a..6787440c8b411 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.c @@ -100,7 +100,8 @@ bool mlx5e_is_ktls_rx(struct mlx5_core_dev *mdev) return false; /* Check the possibility to post the required ICOSQ WQEs. */ - if (WARN_ON_ONCE(max_sq_wqebbs < MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS)) + if (WARN_ON_ONCE(max_sq_wqebbs + < MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS)) return false; if (WARN_ON_ONCE(max_sq_wqebbs < MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS)) return false; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index 65ccb33edafb7..3c501466634c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -136,16 +136,16 @@ static struct mlx5_wqe_ctrl_seg * post_static_params(struct mlx5e_icosq *sq, struct mlx5e_ktls_offload_context_rx *priv_rx) { - struct mlx5e_set_tls_static_params_wqe *wqe; + struct mlx5e_set_transport_static_params_wqe *wqe; struct mlx5e_icosq_wqe_info wi; u16 pi, num_wqebbs; - num_wqebbs = MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS; + num_wqebbs = MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS; if (unlikely(!mlx5e_icosq_can_post_wqe(sq, num_wqebbs))) return ERR_PTR(-ENOSPC); pi = mlx5e_icosq_get_next_pi(sq, num_wqebbs); - wqe = MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); + wqe = MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); mlx5e_ktls_build_static_params(wqe, sq->pc, sq->sqn, &priv_rx->crypto_info, mlx5e_tir_get_tirn(&priv_rx->tir), mlx5_crypto_dek_get_id(priv_rx->dek), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 3db31cc107192..cce596d083a2b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -33,7 +33,8 @@ u16 mlx5e_ktls_get_stop_room(struct mlx5_core_dev *mdev, struct mlx5e_params *pa num_dumps = mlx5e_ktls_dumps_num_wqes(params, MAX_SKB_FRAGS, TLS_MAX_PAYLOAD_SIZE); - stop_room += mlx5e_stop_room_for_wqe(mdev, MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS); + stop_room += mlx5e_stop_room_for_wqe(mdev, + MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS); 
stop_room += mlx5e_stop_room_for_wqe(mdev, MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS); stop_room += num_dumps * mlx5e_stop_room_for_wqe(mdev, MLX5E_KTLS_DUMP_WQEBBS); stop_room += 1; /* fence nop */ @@ -550,12 +551,12 @@ post_static_params(struct mlx5e_txqsq *sq, struct mlx5e_ktls_offload_context_tx *priv_tx, bool fence) { - struct mlx5e_set_tls_static_params_wqe *wqe; + struct mlx5e_set_transport_static_params_wqe *wqe; u16 pi, num_wqebbs; - num_wqebbs = MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS; + num_wqebbs = MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS; pi = mlx5e_txqsq_get_next_pi(sq, num_wqebbs); - wqe = MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); + wqe = MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); mlx5e_ktls_build_static_params(wqe, sq->pc, sq->sqn, &priv_tx->crypto_info, priv_tx->tisn, mlx5_crypto_dek_get_id(priv_tx->dek), diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c index 570a912dd6faf..fb21ee417048d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_txrx.c @@ -8,10 +8,6 @@ enum { MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2 = 0x2, }; -enum { - MLX5E_ENCRYPTION_STANDARD_TLS = 0x1, -}; - #define EXTRACT_INFO_FIELDS do { \ salt = info->salt; \ rec_seq = info->rec_seq; \ @@ -20,7 +16,7 @@ enum { } while (0) static void -fill_static_params(struct mlx5_wqe_tls_static_params_seg *params, +fill_static_params(struct mlx5_wqe_transport_static_params_seg *params, union mlx5e_crypto_info *crypto_info, u32 key_id, u32 resync_tcp_sn) { @@ -53,25 +49,26 @@ fill_static_params(struct mlx5_wqe_tls_static_params_seg *params, return; } - gcm_iv = MLX5_ADDR_OF(tls_static_params, ctx, gcm_iv); - initial_rn = MLX5_ADDR_OF(tls_static_params, ctx, initial_record_number); + gcm_iv = MLX5_ADDR_OF(transport_static_params, ctx, gcm_iv); + initial_rn = MLX5_ADDR_OF(transport_static_params, ctx, + initial_record_number); memcpy(gcm_iv, salt, salt_sz); memcpy(initial_rn, rec_seq, rec_seq_sz); tls_version = MLX5E_STATIC_PARAMS_CONTEXT_TLS_1_2; - MLX5_SET(tls_static_params, ctx, tls_version, tls_version); - MLX5_SET(tls_static_params, ctx, const_1, 1); - MLX5_SET(tls_static_params, ctx, const_2, 2); - MLX5_SET(tls_static_params, ctx, encryption_standard, - MLX5E_ENCRYPTION_STANDARD_TLS); - MLX5_SET(tls_static_params, ctx, resync_tcp_sn, resync_tcp_sn); - MLX5_SET(tls_static_params, ctx, dek_index, key_id); + MLX5_SET(transport_static_params, ctx, tls_version, tls_version); + MLX5_SET(transport_static_params, ctx, const_1, 1); + MLX5_SET(transport_static_params, ctx, const_2, 2); + MLX5_SET(transport_static_params, ctx, acc_type, + MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_TLS); + MLX5_SET(transport_static_params, ctx, resync_tcp_sn, resync_tcp_sn); + MLX5_SET(transport_static_params, ctx, dek_index, key_id); } void -mlx5e_ktls_build_static_params(struct mlx5e_set_tls_static_params_wqe *wqe, +mlx5e_ktls_build_static_params(struct mlx5e_set_transport_static_params_wqe *wqe, u16 pc, u32 sqn, union mlx5e_crypto_info *crypto_info, u32 tis_tir_num, u32 key_id, u32 resync_tcp_sn, @@ -80,19 +77,19 @@ mlx5e_ktls_build_static_params(struct mlx5e_set_tls_static_params_wqe *wqe, struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; u8 opmod = direction == TLS_OFFLOAD_CTX_DIR_TX ? 
- MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS : - MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS; - -#define STATIC_PARAMS_DS_CNT DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_DS) + MLX5_OPC_MOD_TRANSPORT_TIS_STATIC_PARAMS : + MLX5_OPC_MOD_TRANSPORT_TIR_STATIC_PARAMS; - cseg->opmod_idx_opcode = cpu_to_be32((pc << 8) | MLX5_OPCODE_UMR | (opmod << 24)); + cseg->opmod_idx_opcode = cpu_to_be32((pc << 8) | MLX5_OPCODE_UMR | + (opmod << 24)); cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | - STATIC_PARAMS_DS_CNT); + MLX5E_TRANSPORT_STATIC_PARAMS_DS_CNT); cseg->fm_ce_se = fence ? MLX5_FENCE_MODE_INITIATOR_SMALL : 0; cseg->tis_tir_num = cpu_to_be32(tis_tir_num << 8); ucseg->flags = MLX5_UMR_INLINE; - ucseg->bsf_octowords = cpu_to_be16(MLX5_ST_SZ_BYTES(tls_static_params) / 16); + ucseg->bsf_octowords = + cpu_to_be16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); fill_static_params(&wqe->params, crypto_info, key_id, resync_tcp_sn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h index 3d79cd3798907..5e2d186778aac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_utils.h @@ -6,6 +6,7 @@ #include #include "en.h" +#include "en_accel/common_utils.h" enum { MLX5E_TLS_PROGRESS_PARAMS_AUTH_STATE_NO_OFFLOAD = 0, @@ -33,13 +34,6 @@ union mlx5e_crypto_info { struct tls12_crypto_info_aes_gcm_256 crypto_info_256; }; -struct mlx5e_set_tls_static_params_wqe { - struct mlx5_wqe_ctrl_seg ctrl; - struct mlx5_wqe_umr_ctrl_seg uctrl; - struct mlx5_mkey_seg mkc; - struct mlx5_wqe_tls_static_params_seg params; -}; - struct mlx5e_set_tls_progress_params_wqe { struct mlx5_wqe_ctrl_seg ctrl; struct mlx5_wqe_tls_progress_params_seg params; @@ -50,19 +44,12 @@ struct mlx5e_get_tls_progress_params_wqe { struct mlx5_seg_get_psv psv; }; -#define MLX5E_TLS_SET_STATIC_PARAMS_WQEBBS \ - (DIV_ROUND_UP(sizeof(struct mlx5e_set_tls_static_params_wqe), MLX5_SEND_WQE_BB)) - #define MLX5E_TLS_SET_PROGRESS_PARAMS_WQEBBS \ (DIV_ROUND_UP(sizeof(struct mlx5e_set_tls_progress_params_wqe), MLX5_SEND_WQE_BB)) #define MLX5E_KTLS_GET_PROGRESS_WQEBBS \ (DIV_ROUND_UP(sizeof(struct mlx5e_get_tls_progress_params_wqe), MLX5_SEND_WQE_BB)) -#define MLX5E_TLS_FETCH_SET_STATIC_PARAMS_WQE(sq, pi) \ - ((struct mlx5e_set_tls_static_params_wqe *)\ - mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_set_tls_static_params_wqe))) - #define MLX5E_TLS_FETCH_SET_PROGRESS_PARAMS_WQE(sq, pi) \ ((struct mlx5e_set_tls_progress_params_wqe *)\ mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_set_tls_progress_params_wqe))) @@ -76,7 +63,7 @@ struct mlx5e_get_tls_progress_params_wqe { mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_dump_wqe))) void -mlx5e_ktls_build_static_params(struct mlx5e_set_tls_static_params_wqe *wqe, +mlx5e_ktls_build_static_params(struct mlx5e_set_transport_static_params_wqe *wqe, u16 pc, u32 sqn, union mlx5e_crypto_info *crypto_info, u32 tis_tir_num, u32 key_id, u32 resync_tcp_sn, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6822cfa5f4ad3..b071df6e4e53e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -457,8 +457,8 @@ enum { }; enum { - MLX5_OPC_MOD_TLS_TIS_STATIC_PARAMS = 0x1, - MLX5_OPC_MOD_TLS_TIR_STATIC_PARAMS = 0x2, + MLX5_OPC_MOD_TRANSPORT_TIS_STATIC_PARAMS = 0x1, + MLX5_OPC_MOD_TRANSPORT_TIR_STATIC_PARAMS = 0x2, }; enum { @@ -466,8 +466,8 @@ enum { MLX5_OPC_MOD_TLS_TIR_PROGRESS_PARAMS = 0x2, }; -struct 
mlx5_wqe_tls_static_params_seg {
-	u8     ctx[MLX5_ST_SZ_BYTES(tls_static_params)];
+struct mlx5_wqe_transport_static_params_seg {
+	u8     ctx[MLX5_ST_SZ_BYTES(transport_static_params)];
 };
 
 struct mlx5_wqe_tls_progress_params_seg {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 2c09df4ee5742..0e348b2065a89 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -12901,12 +12901,16 @@ enum {
 	MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_PURPOSE_MACSEC = 0x4,
 };
 
-struct mlx5_ifc_tls_static_params_bits {
+enum {
+	MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_TLS = 0x1,
+};
+
+struct mlx5_ifc_transport_static_params_bits {
 	u8         const_2[0x2];
 	u8         tls_version[0x4];
 	u8         const_1[0x2];
 	u8         reserved_at_8[0x14];
-	u8         encryption_standard[0x4];
+	u8         acc_type[0x4];
 
 	u8         reserved_at_20[0x20];

From 5019d3f82a4e1e6523530ef99e57a9e4e960f2d9 Mon Sep 17 00:00:00 2001
From: Or Gerlitz
Date: Wed, 30 Apr 2025 08:57:32 +0000
Subject: [PATCH 685/770] net/mlx5e: Refactor ico sq polling to get budget

The mlx5e driver uses ICO SQs for internal control operations which are
not visible to the network stack, such as UMR mapping for striding RQ
(MPWQ), among other cases.

The upcoming nvmeotcp offload uses an ico sq for umr mapping as part of
the offload. As a pre-step for nvmeotcp ico sqs, which have their own
napi and need to comply with a budget, add the budget as a parameter to
the polling of cqs related to ico sqs.

The polling already stops after a limit is reached, so just have the
caller provide this limit as the budget.

Additionally, we move the mdev pointer directly onto the icosq
structure. This provides better separation between channels and ICO SQs
for use-cases where they are not tightly coupled (such as the upcoming
nvmeotcp code).

No functional change here.
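
For illustration, a dedicated poller such as the nvmeotcp queue handler
added later in this series could use the new budget parameter roughly as
follows (a sketch only; the poll function name is hypothetical):

    static int nvmeotcp_qh_napi_poll(struct napi_struct *napi, int budget)
    {
            struct mlx5e_nvmeotcp_queue_handler *qh;
            int work_done;

            /* this napi is owned by the queue handler, not a channel */
            qh = container_of(napi, struct mlx5e_nvmeotcp_queue_handler, napi);
            work_done = mlx5e_poll_ico_cq(qh->cq, budget);
            if (work_done < budget && napi_complete_done(napi, work_done))
                    mlx5e_cq_arm(qh->cq);
            return work_done;
    }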
Signed-off-by: Or Gerlitz Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 5 ++--- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 4 ++-- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 32ed4963b8ada..d87ab31dd0af0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -546,6 +546,7 @@ struct mlx5e_icosq { /* control path */ struct mlx5_wq_ctrl wq_ctrl; struct mlx5e_channel *channel; + struct mlx5_core_dev *mdev; struct work_struct recover_work; } ____cacheline_aligned_in_smp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index e75759533ae0c..89807aaa870d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -46,7 +46,7 @@ static int mlx5e_query_rq_state(struct mlx5_core_dev *dev, u32 rqn, u8 *state) static int mlx5e_wait_for_icosq_flush(struct mlx5e_icosq *icosq) { - struct mlx5_core_dev *dev = icosq->channel->mdev; + struct mlx5_core_dev *dev = icosq->mdev; unsigned long exp_time; exp_time = jiffies + msecs_to_jiffies(mlx5_tout_ms(dev, FLUSH_ON_ERROR)); @@ -91,7 +91,7 @@ static int mlx5e_rx_reporter_err_icosq_cqe_recover(void *ctx) rq = &icosq->channel->rq; if (test_bit(MLX5E_RQ_STATE_ENABLED, &icosq->channel->xskrq.state)) xskrq = &icosq->channel->xskrq; - mdev = icosq->channel->mdev; + mdev = icosq->mdev; dev = icosq->channel->netdev; err = mlx5_core_query_sq_state(mdev, icosq->sqn, &state); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index e837c21d3d213..0d31d012954dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -83,7 +83,7 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq); void mlx5e_completion_event(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe); void mlx5e_cq_error_event(struct mlx5_core_cq *mcq, enum mlx5_event event); int mlx5e_napi_poll(struct napi_struct *napi, int budget); -int mlx5e_poll_ico_cq(struct mlx5e_cq *cq); +int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget); /* RX */ INDIRECT_CALLABLE_DECLARE(bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 3506024c24539..de25401547d24 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1539,6 +1539,7 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, int err; sq->channel = c; + sq->mdev = mdev; sq->uar_map = mdev->mlx5e_res.hw_objs.bfreg.map; sq->reserved_room = param->stop_room; @@ -1997,11 +1998,9 @@ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) static void mlx5e_close_icosq(struct mlx5e_icosq *sq) { - struct mlx5e_channel *c = sq->channel; - if (sq->ktls_resync) mlx5e_ktls_rx_resync_destroy_resp_list(sq->ktls_resync); - mlx5e_destroy_sq(c->mdev, sq->sqn); + mlx5e_destroy_sq(sq->mdev, sq->sqn); 
mlx5e_free_icosq_descs(sq);
 	mlx5e_free_icosq(sq);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5fd70b4d55beb..c4c612ab62c4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -988,7 +988,7 @@ static void mlx5e_handle_shampo_hd_umr(struct mlx5e_shampo_umr umr,
 	mlx5e_shampo_fill_umr(rq, umr.len);
 }
 
-int mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
+int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget)
 {
 	struct mlx5e_icosq *sq = container_of(cq, struct mlx5e_icosq, cq);
 	struct mlx5_cqe64 *cqe;
@@ -1063,7 +1063,7 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq)
 				wi->wqe_type);
 		}
 	} while (!last_wqe);
-	} while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
+	} while ((++i < budget) && (cqe = mlx5_cqwq_get_cqe(&cq->wq)));
 
 	sq->cc = sqcc;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 76108299ea57d..af0ae65cb87cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -179,8 +179,8 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 
 	busy |= work_done == budget;
 
-	mlx5e_poll_ico_cq(&c->icosq.cq);
-	if (mlx5e_poll_ico_cq(&c->async_icosq.cq))
+	mlx5e_poll_ico_cq(&c->icosq.cq, MLX5E_TX_CQ_POLL_BUDGET);
+	if (mlx5e_poll_ico_cq(&c->async_icosq.cq, MLX5E_TX_CQ_POLL_BUDGET))
 		/* Don't clear the flag if nothing was polled to prevent
 		 * queueing more WQEs and overflowing the async ICOSQ.
 		 */

From 8b122b106d924a8dd0d2195afff59bf4ea2725e3 Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay
Date: Wed, 30 Apr 2025 08:57:33 +0000
Subject: [PATCH 686/770] net/mlx5: Add NVMEoTCP caps, HW bits, 128B CQE and
 enumerations

Add the necessary infrastructure for NVMEoTCP offload:
- Create the mlx5_cqe128 structure for NVMEoTCP offload. The new
  structure consists of the regular mlx5_cqe64 plus the NVMEoTCP data
  information for offloaded packets.
- Add nvmetcp fields to mlx5_cqe64; these fields define the type of
  data that the additional NVMEoTCP part represents.
- Add the nvmeotcp_zero_copy_en + nvmeotcp_crc_en bits to the TIR, to
  identify the NVMEoTCP offload flow, and a tag_buffer_id that will be
  used by the connected nvmeotcp_queues.
- Add a new capability to HCA_CAP that represents the NVMEoTCP offload
  ability.
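
For illustration, the RX completion path can classify such CQEs with the
new helpers (a sketch only, not code from this patch):

    if (cqe_is_nvmeotcp(cqe)) {
            struct mlx5e_cqe128 *cqe128 =
                    container_of(cqe, struct mlx5e_cqe128, cqe64);

            /* cqe128->ccid and cqe128->ccoff describe where HW placed
             * the payload for this command capsule
             */
    }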
Signed-off-by: Ben Ben-Ishay Signed-off-by: Or Gerlitz Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++ include/linux/mlx5/device.h | 51 ++++++++++++- include/linux/mlx5/mlx5_ifc.h | 77 +++++++++++++++++++- include/linux/mlx5/qp.h | 1 + 4 files changed, 130 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 57476487e31fe..a1b437b91c4cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -294,6 +294,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, nvmeotcp)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_DEV_NVMEOTCP); + if (err) + return err; + } + return 0; } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b071df6e4e53e..4ec55b8881a9b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -265,6 +265,7 @@ enum { enum { MLX5_MKEY_MASK_LEN = 1ull << 0, MLX5_MKEY_MASK_PAGE_SIZE = 1ull << 1, + MLX5_MKEY_MASK_XLT_OCT_SIZE = 1ull << 2, MLX5_MKEY_MASK_START_ADDR = 1ull << 6, MLX5_MKEY_MASK_PD = 1ull << 7, MLX5_MKEY_MASK_EN_RINVAL = 1ull << 8, @@ -821,7 +822,11 @@ struct mlx5_err_cqe { struct mlx5_cqe64 { u8 tls_outer_l3_tunneled; - u8 rsvd0; + u8 rsvd16bit:4; + u8 nvmeotcp_zc:1; + u8 nvmeotcp_ddgst:1; + u8 nvmeotcp_resync:1; + u8 rsvd23bit:1; __be16 wqe_id; union { struct { @@ -870,6 +875,19 @@ struct mlx5_cqe64 { u8 op_own; }; +struct mlx5e_cqe128 { + __be16 cclen; + __be16 hlen; + union { + __be32 resync_tcp_sn; + __be32 ccoff; + }; + __be16 ccid; + __be16 rsvd8; + u8 rsvd12[52]; + struct mlx5_cqe64 cqe64; +}; + struct mlx5_mini_cqe8 { union { __be32 rx_hash_result; @@ -905,6 +923,28 @@ enum { #define MLX5_MINI_CQE_ARRAY_SIZE 8 +static inline bool cqe_is_nvmeotcp_resync(struct mlx5_cqe64 *cqe) +{ + return cqe->nvmeotcp_resync; +} + +static inline bool cqe_is_nvmeotcp_crcvalid(struct mlx5_cqe64 *cqe) +{ + return cqe->nvmeotcp_ddgst; +} + +static inline bool cqe_is_nvmeotcp_zc(struct mlx5_cqe64 *cqe) +{ + return cqe->nvmeotcp_zc; +} + +/* check if cqe is zc or crc or resync */ +static inline bool cqe_is_nvmeotcp(struct mlx5_cqe64 *cqe) +{ + return cqe_is_nvmeotcp_zc(cqe) || cqe_is_nvmeotcp_crcvalid(cqe) || + cqe_is_nvmeotcp_resync(cqe); +} + static inline u8 mlx5_get_cqe_format(struct mlx5_cqe64 *cqe) { return (cqe->op_own >> 2) & 0x3; @@ -1245,6 +1285,7 @@ enum mlx5_cap_type { MLX5_CAP_VDPA_EMULATION = 0x13, MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, + MLX5_CAP_DEV_NVMEOTCP = 0x19, MLX5_CAP_CRYPTO = 0x1a, MLX5_CAP_SHAMPO = 0x1d, MLX5_CAP_MACSEC = 0x1f, @@ -1486,6 +1527,14 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_SHAMPO(mdev, cap) \ MLX5_GET(shampo_cap, mdev->caps.hca[MLX5_CAP_SHAMPO]->cur, cap) +#define MLX5_CAP_DEV_NVMEOTCP(mdev, cap)\ + MLX5_GET(nvmeotcp_cap, \ + (mdev)->caps.hca[MLX5_CAP_DEV_NVMEOTCP]->cur, cap) + +#define MLX5_CAP64_DEV_NVMEOTCP(mdev, cap)\ + MLX5_GET64(nvmeotcp_cap, \ + (mdev)->caps.hca[MLX5_CAP_DEV_NVMEOTCP]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 0e348b2065a89..c4f957e5fe949 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1598,6 +1598,20 @@ enum { MLX5_STEERING_FORMAT_CONNECTX_8 = 3, }; +struct mlx5_ifc_nvmeotcp_cap_bits { + u8 zerocopy[0x1]; + u8 crc_rx[0x1]; + u8 crc_tx[0x1]; + u8 
reserved_at_3[0x15]; + u8 version[0x8]; + + u8 reserved_at_20[0x13]; + u8 log_max_nvmeotcp_tag_buffer_table[0x5]; + u8 reserved_at_38[0x3]; + u8 log_max_nvmeotcp_tag_buffer_size[0x5]; + u8 reserved_at_40[0x7c0]; +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x6]; u8 page_request_disable[0x1]; @@ -1625,7 +1639,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 event_cap[0x1]; u8 reserved_at_91[0x2]; u8 isolate_vl_tc_new[0x1]; - u8 reserved_at_94[0x4]; + u8 reserved_at_94[0x2]; + u8 nvmeotcp[0x1]; + u8 reserved_at_97[0x1]; u8 prio_tag_required[0x1]; u8 reserved_at_99[0x2]; u8 log_max_qp[0x5]; @@ -3772,6 +3788,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; + struct mlx5_ifc_nvmeotcp_cap_bits nvmeotcp_cap; u8 reserved_at_0[0x8000]; }; @@ -4024,7 +4041,9 @@ struct mlx5_ifc_tirc_bits { u8 disp_type[0x4]; u8 tls_en[0x1]; - u8 reserved_at_25[0x1b]; + u8 nvmeotcp_zero_copy_en[0x1]; + u8 nvmeotcp_crc_en[0x1]; + u8 reserved_at_27[0x19]; u8 reserved_at_40[0x40]; @@ -4055,7 +4074,8 @@ struct mlx5_ifc_tirc_bits { struct mlx5_ifc_rx_hash_field_select_bits rx_hash_field_selector_inner; - u8 reserved_at_2c0[0x4c0]; + u8 nvmeotcp_tag_buffer_table_id[0x20]; + u8 reserved_at_2e0[0x4a0]; }; enum { @@ -12505,6 +12525,8 @@ enum { MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = BIT_ULL(0xc), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_IPSEC = BIT_ULL(0x13), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_SAMPLER = BIT_ULL(0x20), + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_NVMEOTCP_TAG_BUFFER_TABLE = + BIT_ULL(0x21), MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = BIT_ULL(0x24), }; @@ -12516,6 +12538,7 @@ enum { MLX5_GENERAL_OBJECT_TYPES_ENCRYPTION_KEY = 0xc, MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, + MLX5_GENERAL_OBJECT_TYPES_NVMEOTCP_TAG_BUFFER_TABLE = 0x21, MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, MLX5_GENERAL_OBJECT_TYPES_INT_KEK = 0x47, @@ -12890,6 +12913,21 @@ struct mlx5_ifc_query_sampler_obj_out_bits { struct mlx5_ifc_sampler_obj_bits sampler_object; }; +struct mlx5_ifc_nvmeotcp_tag_buf_table_obj_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x1b]; + u8 log_tag_buffer_table_size[0x5]; +}; + +struct mlx5_ifc_create_nvmeotcp_tag_buf_table_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_nvmeotcp_tag_buf_table_obj_bits + nvmeotcp_tag_buf_table_obj; +}; + enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_128 = 0x0, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_KEY_SIZE_256 = 0x1, @@ -12903,6 +12941,13 @@ enum { enum { MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_TLS = 0x1, + MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP = 0x2, + MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP_WITH_TLS = 0x3, +}; + +enum { + MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR = 0x0, + MLX5_TRANSPORT_STATIC_PARAMS_TI_TARGET = 0x1, }; struct mlx5_ifc_transport_static_params_bits { @@ -12925,7 +12970,20 @@ struct mlx5_ifc_transport_static_params_bits { u8 reserved_at_100[0x8]; u8 dek_index[0x18]; - u8 reserved_at_120[0xe0]; + u8 reserved_at_120[0x14]; + + u8 cccid_ttag[0x1]; + u8 ti[0x1]; + u8 zero_copy_en[0x1]; + u8 ddgst_offload_en[0x1]; + u8 hdgst_offload_en[0x1]; + u8 ddgst_en[0x1]; + u8 hddgst_en[0x1]; + u8 pda[0x5]; + + u8 nvme_resync_tcp_sn[0x20]; + + u8 reserved_at_160[0xa0]; }; struct mlx5_ifc_tls_progress_params_bits { @@ -13283,4 +13341,15 @@ struct 
mlx5_ifc_mrtcq_reg_bits {
 	u8         reserved_at_80[0x180];
 };
 
+struct mlx5_ifc_nvmeotcp_progress_params_bits {
+	u8         next_pdu_tcp_sn[0x20];
+
+	u8         hw_resync_tcp_sn[0x20];
+
+	u8         pdu_tracker_state[0x2];
+	u8         offloading_state[0x2];
+	u8         reserved_at_44[0xc];
+	u8         cccid_ttag[0x10];
+};
+
 #endif /* MLX5_IFC_H */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index fc7eeff99a8a6..10267ddf1bfe8 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -228,6 +228,7 @@ struct mlx5_wqe_ctrl_seg {
 #define MLX5_WQE_CTRL_OPCODE_MASK 0xff
 #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
+#define MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT 8
 
 enum {
 	MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4,

From c456e410511138c543d64b4aa0631dd895049d4e Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay
Date: Wed, 30 Apr 2025 08:57:34 +0000
Subject: [PATCH 687/770] net/mlx5e: NVMEoTCP, offload initialization

This commit introduces the driver structures and initialization blocks
for NVMEoTCP offload. The mlx5 nvmeotcp structures are:

- queue (mlx5e_nvmeotcp_queue) - pairs 1:1 with nvmeotcp driver queues
  and deals with the offloading parts. The mlx5e queue is accessed in
  the ddp ops: initialized on sk_add, used in ddp setup, teardown,
  resync, and in the fast path when dealing with packets, and destroyed
  in the sk_del op.

- queue entry (nvmeotcp_queue_entry) - pairs 1:1 with offloaded IO from
  that queue. Keeps pointers to the SG elements describing the buffers
  used for the IO and the ddp context of it.

- queue handler (mlx5e_nvmeotcp_queue_handler) - we use an icosq per
  NVMe-TCP queue for UMR mapping as part of the ddp offload. Those
  dedicated SQs are unique in the sense that they are driven directly
  by the NVMe-TCP layer to submit and invalidate ddp requests. Since
  the life-cycle of these icosqs is not tied to the channels, we create
  dedicated napi contexts for polling them, such that channels can be
  re-created during offloading. The queue handler has pointers to the
  cq associated with the queue's sq and to the napi context.

- main offload context (mlx5e_nvmeotcp) - has ida and hash table
  instances. Each offloaded queue gets an ID from the ida instance, and
  the pairs are kept in the hash table. The id is programmed as a flow
  tag to be set by HW on the completion (cqe) of all packets related to
  this queue (by 5-tuple steering). The fast path which deals with
  packets uses the flow tag to access the hash table and retrieve the
  queue for processing.

We query nvmeotcp capabilities to see if the offload can be supported,
and use 128B CQEs when this happens. By default, the offload is off,
but it can be enabled with `ethtool --ulp-ddp nvme-tcp-ddp on`.
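
For illustration, the fast path can resolve the offload queue from the
flow tag roughly as follows (a sketch only; extracting `flow_tag` from
the completion entry is omitted):

    struct mlx5e_nvmeotcp_queue *queue;

    /* flow_tag was set by HW on the CQE via 5-tuple steering */
    queue = rhashtable_lookup_fast(&priv->nvmeotcp->queue_hash,
                                   &flow_tag, rhash_queues);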
Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- .../net/ethernet/mellanox/mlx5/core/Kconfig | 11 + .../net/ethernet/mellanox/mlx5/core/Makefile | 2 + drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 + .../net/ethernet/mellanox/mlx5/core/en/fs.h | 4 +- .../ethernet/mellanox/mlx5/core/en/params.c | 13 +- .../ethernet/mellanox/mlx5/core/en/params.h | 3 + .../mellanox/mlx5/core/en_accel/en_accel.h | 3 + .../mellanox/mlx5/core/en_accel/fs_tcp.h | 2 +- .../mellanox/mlx5/core/en_accel/nvmeotcp.c | 228 ++++++++++++++++++ .../mellanox/mlx5/core/en_accel/nvmeotcp.h | 125 ++++++++++ .../ethernet/mellanox/mlx5/core/en_ethtool.c | 6 + .../net/ethernet/mellanox/mlx5/core/en_fs.c | 4 +- .../net/ethernet/mellanox/mlx5/core/en_main.c | 17 ++ .../net/ethernet/mellanox/mlx5/core/main.c | 1 + 14 files changed, 414 insertions(+), 9 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig index 6ec7d6e0181de..8e64ec593725d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig +++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig @@ -165,6 +165,17 @@ config MLX5_EN_TLS help Build support for TLS cryptography-offload acceleration in the NIC. +config MLX5_EN_NVMEOTCP + bool "NVMEoTCP acceleration" + depends on ULP_DDP + depends on MLX5_CORE_EN + default y + help + Build support for NVMEoTCP acceleration in the NIC. + This includes Direct Data Placement and CRC offload. + Note: Support for hardware with this capability needs to be selected + for this option to become available. 
+ config MLX5_SW_STEERING bool "Mellanox Technologies software-managed steering" depends on MLX5_CORE_EN && MLX5_ESWITCH diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d292e6a9e22c3..7e01096bdde0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -110,6 +110,8 @@ mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/ktls_stats.o \ en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \ en_accel/ktls_tx.o en_accel/ktls_rx.o +mlx5_core-$(CONFIG_MLX5_EN_NVMEOTCP) += en_accel/fs_tcp.o en_accel/nvmeotcp.o + # # SW Steering # diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index d87ab31dd0af0..4051f23c0fc4a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -316,6 +316,7 @@ struct mlx5e_params { int hard_mtu; bool ptp_rx; __be32 terminate_lkey_be; + bool nvmeotcp; }; static inline u8 mlx5e_get_dcb_num_tc(struct mlx5e_params *params) @@ -931,6 +932,9 @@ struct mlx5e_priv { #endif #ifdef CONFIG_MLX5_EN_TLS struct mlx5e_tls *tls; +#endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + struct mlx5e_nvmeotcp *nvmeotcp; #endif struct devlink_health_reporter *tx_reporter; struct devlink_health_reporter *rx_reporter; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index b5c3a2a9d2a59..88b1d01f6c195 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -77,7 +77,7 @@ enum { MLX5E_INNER_TTC_FT_LEVEL, MLX5E_FS_TT_UDP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_FS_TT_ANY_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, -#ifdef CONFIG_MLX5_EN_TLS +#if defined(CONFIG_MLX5_EN_TLS) || defined(CONFIG_MLX5_EN_NVMEOTCP) MLX5E_ACCEL_FS_TCP_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, #endif #ifdef CONFIG_MLX5_EN_ARFS @@ -182,7 +182,7 @@ struct mlx5e_fs_any *mlx5e_fs_get_any(struct mlx5e_flow_steering *fs); void mlx5e_fs_set_any(struct mlx5e_flow_steering *fs, struct mlx5e_fs_any *any); struct mlx5e_fs_udp *mlx5e_fs_get_udp(struct mlx5e_flow_steering *fs); void mlx5e_fs_set_udp(struct mlx5e_flow_steering *fs, struct mlx5e_fs_udp *udp); -#ifdef CONFIG_MLX5_EN_TLS +#if defined(CONFIG_MLX5_EN_TLS) || defined(CONFIG_MLX5_EN_NVMEOTCP) struct mlx5e_accel_fs_tcp *mlx5e_fs_get_accel_tcp(struct mlx5e_flow_steering *fs); void mlx5e_fs_set_accel_tcp(struct mlx5e_flow_steering *fs, struct mlx5e_accel_fs_tcp *accel_tcp); #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c index 58ec5e44aa7ad..674bd729971cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c @@ -826,7 +826,8 @@ static void mlx5e_build_common_cq_param(struct mlx5_core_dev *mdev, void *cqc = param->cqc; MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); - if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) + if (MLX5_CAP_GEN(mdev, cqe_128_always) && + (cache_line_size() >= 128 || param->force_cqe128)) MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); } @@ -857,6 +858,10 @@ static void mlx5e_build_rx_cq_param(struct mlx5_core_dev *mdev, void *cqc = param->cqc; u8 log_cq_size; + /* nvme-tcp offload mandates 128 byte cqes */ + param->force_cqe128 |= IS_ENABLED(CONFIG_MLX5_EN_NVMEOTCP) && + params->nvmeotcp; + switch (params->rq_wq_type) { case 
MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: hw_stridx = MLX5_CAP_GEN(mdev, mini_cqe_resp_stride_index); @@ -1207,9 +1212,9 @@ static u8 mlx5e_build_async_icosq_log_wq_sz(struct mlx5_core_dev *mdev) return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE; } -static void mlx5e_build_icosq_param(struct mlx5_core_dev *mdev, - u8 log_wq_size, - struct mlx5e_sq_param *param) +void mlx5e_build_icosq_param(struct mlx5_core_dev *mdev, + u8 log_wq_size, + struct mlx5e_sq_param *param) { void *sqc = param->sqc; void *wq = MLX5_ADDR_OF(sqc, sqc, wq); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h index bd5877acc5b1e..7349478e4c30e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h @@ -17,6 +17,7 @@ struct mlx5e_cq_param { struct mlx5_wq_param wq; u16 eq_ix; u8 cq_period_mode; + bool force_cqe128; }; struct mlx5e_rq_param { @@ -140,6 +141,8 @@ void mlx5e_build_xdpsq_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, struct mlx5e_sq_param *param); +void mlx5e_build_icosq_param(struct mlx5_core_dev *mdev, + u8 log_wq_size, struct mlx5e_sq_param *param); int mlx5e_build_channel_param(struct mlx5_core_dev *mdev, struct mlx5e_params *params, struct mlx5e_channel_param *cparam); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h index 33e32584b07f5..ee35925d987f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/en_accel.h @@ -40,6 +40,7 @@ #include "en_accel/ktls.h" #include "en_accel/ktls_txrx.h" #include +#include "en_accel/nvmeotcp.h" #include "en.h" #include "en/txrx.h" @@ -208,11 +209,13 @@ static inline void mlx5e_accel_tx_finish(struct mlx5e_txqsq *sq, static inline int mlx5e_accel_init_rx(struct mlx5e_priv *priv) { + mlx5e_nvmeotcp_init_rx(priv); return mlx5e_ktls_init_rx(priv); } static inline void mlx5e_accel_cleanup_rx(struct mlx5e_priv *priv) { + mlx5e_nvmeotcp_cleanup_rx(priv); mlx5e_ktls_cleanup_rx(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h index 7e899c716267e..6714644986a1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h @@ -6,7 +6,7 @@ #include "en/fs.h" -#ifdef CONFIG_MLX5_EN_TLS +#if defined(CONFIG_MLX5_EN_TLS) || defined(CONFIG_MLX5_EN_NVMEOTCP) int mlx5e_accel_fs_tcp_create(struct mlx5e_flow_steering *fs); void mlx5e_accel_fs_tcp_destroy(struct mlx5e_flow_steering *fs); struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_flow_steering *fs, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c new file mode 100644 index 0000000000000..ca9c3aaf941fc --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+
+#include
+#include
+#include "en_accel/nvmeotcp.h"
+#include "en_accel/fs_tcp.h"
+#include "en/txrx.h"
+
+#define MAX_NUM_NVMEOTCP_QUEUES	(4000)
+#define MIN_NUM_NVMEOTCP_QUEUES	(1)
+
+static const struct rhashtable_params rhash_queues = {
+	.key_len = sizeof(int),
+	.key_offset = offsetof(struct mlx5e_nvmeotcp_queue, id),
+	.head_offset = offsetof(struct mlx5e_nvmeotcp_queue, hash),
+	.automatic_shrinking = true,
+	.min_size = MIN_NUM_NVMEOTCP_QUEUES,
+	.max_size = MAX_NUM_NVMEOTCP_QUEUES,
+};
+
+static int
+mlx5e_nvmeotcp_offload_limits(struct net_device *netdev,
+			      struct ulp_ddp_limits *limits)
+{
+	return 0;
+}
+
+static int
+mlx5e_nvmeotcp_queue_init(struct net_device *netdev,
+			  struct sock *sk,
+			  struct ulp_ddp_config *config)
+{
+	return 0;
+}
+
+static void
+mlx5e_nvmeotcp_queue_teardown(struct net_device *netdev,
+			      struct sock *sk)
+{
+}
+
+static int
+mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev,
+			 struct sock *sk,
+			 struct ulp_ddp_io *ddp)
+{
+	return 0;
+}
+
+static void
+mlx5e_nvmeotcp_ddp_teardown(struct net_device *netdev,
+			    struct sock *sk,
+			    struct ulp_ddp_io *ddp,
+			    void *ddp_ctx)
+{
+}
+
+static void
+mlx5e_nvmeotcp_ddp_resync(struct net_device *netdev,
+			  struct sock *sk, u32 seq)
+{
+}
+
+int set_ulp_ddp_nvme_tcp(struct net_device *netdev, bool enable)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	struct mlx5e_params new_params;
+	int err = 0;
+
+	/* There may be offloaded queues when a netlink callback to
+	 * disable the feature is made. Hence, we can't destroy the
+	 * tcp flow-table since it may be referenced by the offload
+	 * related flows and we'll keep the 128B CQEs on the channel
+	 * RQs. Also, since we don't deref/destroy the fs tcp table
+	 * when the feature is disabled, we don't ref it again if the
+	 * feature is enabled multiple times.
+	 */
+	if (!enable || priv->nvmeotcp->enabled)
+		return 0;
+
+	err = mlx5e_accel_fs_tcp_create(priv->fs);
+	if (err)
+		return err;
+
+	new_params = priv->channels.params;
+	new_params.nvmeotcp = enable;
+	err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true);
+	if (err)
+		goto fs_tcp_destroy;
+
+	priv->nvmeotcp->enabled = true;
+	return 0;
+
+fs_tcp_destroy:
+	mlx5e_accel_fs_tcp_destroy(priv->fs);
+	return err;
+}
+
+static int mlx5e_ulp_ddp_set_caps(struct net_device *netdev,
+				  unsigned long *new_caps,
+				  struct netlink_ext_ack *extack)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	DECLARE_BITMAP(old_caps, ULP_DDP_CAP_COUNT);
+	struct mlx5e_params *params;
+	int ret = 0;
+	int nvme = -1;
+
+	mutex_lock(&priv->state_lock);
+	params = &priv->channels.params;
+	bitmap_copy(old_caps, priv->nvmeotcp->ddp_caps.active,
+		    ULP_DDP_CAP_COUNT);
+
+	/* always handle nvme-tcp-ddp and nvme-tcp-ddgst-rx together
+	 * (all or nothing)
+	 */
+
+	if (ulp_ddp_cap_turned_on(old_caps, new_caps, ULP_DDP_CAP_NVME_TCP) &&
+	    ulp_ddp_cap_turned_on(old_caps, new_caps,
+				  ULP_DDP_CAP_NVME_TCP_DDGST_RX)) {
+		if (MLX5E_GET_PFLAG(params, MLX5E_PFLAG_RX_CQE_COMPRESS)) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "NVMe-TCP offload not supported when CQE compress is active. Disable rx_cqe_compress ethtool private flag first");
+			goto out;
+		}
+
+		if (netdev->features & (NETIF_F_LRO | NETIF_F_GRO_HW)) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "NVMe-TCP offload not supported when HW_GRO/LRO is active. Disable rx-gro-hw ethtool feature first");
+			goto out;
+		}
+		nvme = 1;
+	} else if (ulp_ddp_cap_turned_off(old_caps, new_caps,
+					  ULP_DDP_CAP_NVME_TCP) &&
+		   ulp_ddp_cap_turned_off(old_caps, new_caps,
+					  ULP_DDP_CAP_NVME_TCP_DDGST_RX)) {
+		nvme = 0;
+	}
+
+	if (nvme >= 0) {
+		ret = set_ulp_ddp_nvme_tcp(netdev, nvme);
+		if (ret)
+			goto out;
+		change_bit(ULP_DDP_CAP_NVME_TCP,
+			   priv->nvmeotcp->ddp_caps.active);
+		change_bit(ULP_DDP_CAP_NVME_TCP_DDGST_RX,
+			   priv->nvmeotcp->ddp_caps.active);
+	}
+
+out:
+	mutex_unlock(&priv->state_lock);
+	return ret;
+}
+
+static void mlx5e_ulp_ddp_get_caps(struct net_device *dev,
+				   struct ulp_ddp_dev_caps *caps)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+
+	mutex_lock(&priv->state_lock);
+	memcpy(caps, &priv->nvmeotcp->ddp_caps, sizeof(*caps));
+	mutex_unlock(&priv->state_lock);
+}
+
+const struct ulp_ddp_dev_ops mlx5e_nvmeotcp_ops = {
+	.limits = mlx5e_nvmeotcp_offload_limits,
+	.sk_add = mlx5e_nvmeotcp_queue_init,
+	.sk_del = mlx5e_nvmeotcp_queue_teardown,
+	.setup = mlx5e_nvmeotcp_ddp_setup,
+	.teardown = mlx5e_nvmeotcp_ddp_teardown,
+	.resync = mlx5e_nvmeotcp_ddp_resync,
+	.set_caps = mlx5e_ulp_ddp_set_caps,
+	.get_caps = mlx5e_ulp_ddp_get_caps,
+};
+
+void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv)
+{
+	if (priv->nvmeotcp && priv->nvmeotcp->enabled)
+		mlx5e_accel_fs_tcp_destroy(priv->fs);
+}
+
+int mlx5e_nvmeotcp_init(struct mlx5e_priv *priv)
+{
+	struct mlx5e_nvmeotcp *nvmeotcp = NULL;
+	int ret = 0;
+
+	if (!(MLX5_CAP_GEN(priv->mdev, nvmeotcp) &&
+	      MLX5_CAP_DEV_NVMEOTCP(priv->mdev, zerocopy) &&
+	      MLX5_CAP_DEV_NVMEOTCP(priv->mdev, crc_rx) &&
+	      MLX5_CAP_GEN(priv->mdev, cqe_128_always)))
+		return 0;
+
+	nvmeotcp = kzalloc(sizeof(*nvmeotcp), GFP_KERNEL);
+
+	if (!nvmeotcp)
+		return -ENOMEM;
+
+	ida_init(&nvmeotcp->queue_ids);
+	ret = rhashtable_init(&nvmeotcp->queue_hash, &rhash_queues);
+	if (ret)
+		goto err_ida;
+
+	/* report ULP DDP as supported, but don't enable it by default */
+	set_bit(ULP_DDP_CAP_NVME_TCP, nvmeotcp->ddp_caps.hw);
+	set_bit(ULP_DDP_CAP_NVME_TCP_DDGST_RX, nvmeotcp->ddp_caps.hw);
+	nvmeotcp->enabled = false;
+	priv->nvmeotcp = nvmeotcp;
+	return 0;
+
+err_ida:
+	ida_destroy(&nvmeotcp->queue_ids);
+	kfree(nvmeotcp);
+	return ret;
+}
+
+void mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv)
+{
+	struct mlx5e_nvmeotcp *nvmeotcp = priv->nvmeotcp;
+
+	if (!nvmeotcp)
+		return;
+
+	rhashtable_destroy(&nvmeotcp->queue_hash);
+	ida_destroy(&nvmeotcp->queue_ids);
+	kfree(nvmeotcp);
+	priv->nvmeotcp = NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h
new file mode 100644
index 0000000000000..f6432a3737cd4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.
*/ +#ifndef __MLX5E_NVMEOTCP_H__ +#define __MLX5E_NVMEOTCP_H__ + +#ifdef CONFIG_MLX5_EN_NVMEOTCP + +#include +#include "en.h" +#include "en/params.h" + +struct mlx5e_nvmeotcp_queue_entry { + struct mlx5e_nvmeotcp_queue *queue; + u32 sgl_length; + u32 klm_mkey; + struct scatterlist *sgl; + u32 ccid_gen; + u64 size; + + /* for the ddp invalidate done callback */ + void *ddp_ctx; + struct ulp_ddp_io *ddp; +}; + +struct mlx5e_nvmeotcp_queue_handler { + struct napi_struct napi; + struct mlx5e_cq *cq; +}; + +/** + * struct mlx5e_nvmeotcp_queue - mlx5 metadata for NVMEoTCP queue + * @ulp_ddp_ctx: Generic ulp ddp context + * @tir: Destination TIR created for NVMEoTCP offload + * @fh: Flow handle representing the 5-tuple steering for this flow + * @id: Flow tag ID used to identify this queue + * @size: NVMEoTCP queue depth + * @ccid_gen: Generation ID for the CCID, used to avoid conflicts in DDP + * @max_klms_per_wqe: Number of KLMs per DDP operation + * @hash: Hash table of queues mapped by @id + * @pda: Padding alignment + * @tag_buf_table_id: Tag buffer table for CCIDs + * @dgst: Digest supported (header and/or data) + * @sq: Send queue used for posting umrs + * @ref_count: Reference count for this structure + * @after_resync_cqe: Indicate if resync occurred + * @ccid_table: Table holding metadata for each CC (Command Capsule) + * @ccid: ID of the current CC + * @ccsglidx: Index within the scatter-gather list (SGL) of the current CC + * @ccoff: Offset within the current CC + * @ccoff_inner: Current offset within the @ccsglidx element + * @channel_ix: Channel IX for this nvmeotcp_queue + * @sk: The socket used by the NVMe-TCP queue + * @crc_rx: CRC Rx offload indication for this queue + * @priv: mlx5e netdev priv + * @static_params_done: Async completion structure for the initial umr + * mapping synchronization + * @sq_lock: Spin lock for the icosq + * @qh: Completion queue handler for processing umr completions + */ +struct mlx5e_nvmeotcp_queue { + struct ulp_ddp_ctx ulp_ddp_ctx; + struct mlx5e_tir tir; + struct mlx5_flow_handle *fh; + int id; + u32 size; + /* needed when the upper layer + * immediately reuses CCID + some packet loss happens + */ + u32 ccid_gen; + u32 max_klms_per_wqe; + struct rhash_head hash; + int pda; + u32 tag_buf_table_id; + u8 dgst; + struct mlx5e_icosq sq; + + /* data-path section cache aligned */ + refcount_t ref_count; + /* for MASK HW resync cqe */ + bool after_resync_cqe; + struct mlx5e_nvmeotcp_queue_entry *ccid_table; + /* current ccid fields */ + int ccid; + int ccsglidx; + off_t ccoff; + int ccoff_inner; + + u32 channel_ix; + struct sock *sk; + u8 crc_rx:1; + /* for ddp invalidate flow */ + struct mlx5e_priv *priv; + /* end of data-path section */ + + struct completion static_params_done; + /* spin lock for the ico sq, ULP can issue requests + * from multiple contexts + */ + spinlock_t sq_lock; + struct mlx5e_nvmeotcp_queue_handler qh; +}; + +struct mlx5e_nvmeotcp { + struct ida queue_ids; + struct rhashtable queue_hash; + struct ulp_ddp_dev_caps ddp_caps; + bool enabled; +}; + +int mlx5e_nvmeotcp_init(struct mlx5e_priv *priv); +int set_ulp_ddp_nvme_tcp(struct net_device *netdev, bool enable); +void mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv); +static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} +void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv); +extern const struct ulp_ddp_dev_ops mlx5e_nvmeotcp_ops; +#else + +static inline int mlx5e_nvmeotcp_init(struct mlx5e_priv *priv) { return 0; } +static inline void 
mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv) {} +static inline int set_ulp_ddp_nvme_tcp(struct net_device *dev, bool en) +{ return -EOPNOTSUPP; } +static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} +static inline void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv) {} +#endif +#endif /* __MLX5E_NVMEOTCP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index fdf9e9bb99ace..c52911dd8f0f4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -41,6 +41,7 @@ #include "en/ptp.h" #include "lib/clock.h" #include "en/fs_ethtool.h" +#include "en_accel/nvmeotcp.h" #define LANES_UNKNOWN 0 #define MAX_LANES 8 @@ -2149,6 +2150,11 @@ int mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool new_val return -EINVAL; } + if (priv->channels.params.nvmeotcp) { + netdev_warn(priv->netdev, "Can't set CQE compression after ULP DDP NVMe-TCP offload\n"); + return -EINVAL; + } + new_params = priv->channels.params; MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_COMPRESS, new_val); if (rx_filter) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c index 05058710d2c79..07d2122ee88ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c @@ -62,7 +62,7 @@ struct mlx5e_flow_steering { #ifdef CONFIG_MLX5_EN_ARFS struct mlx5e_arfs_tables *arfs; #endif -#ifdef CONFIG_MLX5_EN_TLS +#if defined(CONFIG_MLX5_EN_TLS) || defined(CONFIG_MLX5_EN_NVMEOTCP) struct mlx5e_accel_fs_tcp *accel_tcp; #endif struct mlx5e_fs_udp *udp; @@ -1554,7 +1554,7 @@ void mlx5e_fs_set_any(struct mlx5e_flow_steering *fs, struct mlx5e_fs_any *any) fs->any = any; } -#ifdef CONFIG_MLX5_EN_TLS +#if defined(CONFIG_MLX5_EN_TLS) || defined(CONFIG_MLX5_EN_NVMEOTCP) struct mlx5e_accel_fs_tcp *mlx5e_fs_get_accel_tcp(struct mlx5e_flow_steering *fs) { return fs->accel_tcp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index de25401547d24..3cd1513e4c66e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -53,6 +53,7 @@ #include "en_accel/macsec.h" #include "en_accel/en_accel.h" #include "en_accel/ktls.h" +#include "en_accel/nvmeotcp.h" #include "lib/vxlan.h" #include "lib/clock.h" #include "en/port.h" @@ -4427,6 +4428,13 @@ static netdev_features_t mlx5e_fix_features(struct net_device *netdev, netdev->netns_immutable = false; } + if (features & (NETIF_F_LRO | NETIF_F_GRO_HW)) { + if (params->nvmeotcp) { + netdev_warn(netdev, "Disabling HW-GRO/LRO, not supported after ULP DDP NVMe-TCP offload\n"); + features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); + } + } + mutex_unlock(&priv->state_lock); return features; @@ -5186,6 +5194,9 @@ const struct net_device_ops mlx5e_netdev_ops = { .ndo_has_offload_stats = mlx5e_has_offload_stats, .ndo_get_offload_stats = mlx5e_get_offload_stats, #endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + .ulp_ddp_ops = &mlx5e_nvmeotcp_ops, +#endif }; void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu) @@ -5665,6 +5676,11 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev, if (err) mlx5_core_err(mdev, "TLS initialization failed, %d\n", err); + err = mlx5e_nvmeotcp_init(priv); + if (err) + mlx5_core_err(mdev, "NVMEoTCP initialization failed, %d\n", + err); + 
mlx5e_health_create_reporters(priv);
 
 	/* If netdev is already registered (e.g. move from uplink to nic profile),
@@ -5685,6 +5701,7 @@ static int mlx5e_nic_init(struct mlx5_core_dev *mdev,
 static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
 {
 	mlx5e_health_destroy_reporters(priv);
+	mlx5e_nvmeotcp_cleanup(priv);
 	mlx5e_ktls_cleanup(priv);
 	mlx5e_fs_cleanup(priv->fs);
 	debugfs_remove_recursive(priv->dfs_root);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 41e8660c819c0..2c2422b1cb03a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1804,6 +1804,7 @@ static const int types[] = {
 	MLX5_CAP_CRYPTO,
 	MLX5_CAP_SHAMPO,
 	MLX5_CAP_ADV_RDMA,
+	MLX5_CAP_DEV_NVMEOTCP,
 };
 
 static void mlx5_hca_caps_free(struct mlx5_core_dev *dev)

From 040119431cf569ff747f58705e29307586960539 Mon Sep 17 00:00:00 2001
From: Boris Pismenny
Date: Wed, 30 Apr 2025 08:57:35 +0000
Subject: [PATCH 688/770] net/mlx5e: TCP flow steering for nvme-tcp
 acceleration

Both nvme-tcp and tls acceleration require tcp flow steering. Add a
reference counter to share the TCP flow steering structure.

Signed-off-by: Boris Pismenny
Signed-off-by: Ben Ben-Ishay
Signed-off-by: Or Gerlitz
Signed-off-by: Yoray Zack
Signed-off-by: Aurelien Aptel
Reviewed-by: Tariq Toukan
Signed-off-by: NipaLocal
---
 .../ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c  | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
index 4f83e3172767a..f5c67f9cb2f9d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
@@ -14,6 +14,7 @@ enum accel_fs_tcp_type {
 struct mlx5e_accel_fs_tcp {
 	struct mlx5e_flow_table tables[ACCEL_FS_TCP_NUM_TYPES];
 	struct mlx5_flow_handle *default_rules[ACCEL_FS_TCP_NUM_TYPES];
+	refcount_t user_count;
 };
 
 static enum mlx5_traffic_types fs_accel2tt(enum accel_fs_tcp_type i)
@@ -361,6 +362,9 @@ void mlx5e_accel_fs_tcp_destroy(struct mlx5e_flow_steering *fs)
 	if (!accel_tcp)
 		return;
 
+	if (!refcount_dec_and_test(&accel_tcp->user_count))
+		return;
+
 	accel_fs_tcp_disable(fs);
 
 	for (i = 0; i < ACCEL_FS_TCP_NUM_TYPES; i++)
@@ -372,12 +376,17 @@ void mlx5e_accel_fs_tcp_destroy(struct mlx5e_flow_steering *fs)
 
 int mlx5e_accel_fs_tcp_create(struct mlx5e_flow_steering *fs)
 {
-	struct mlx5e_accel_fs_tcp *accel_tcp;
+	struct mlx5e_accel_fs_tcp *accel_tcp = mlx5e_fs_get_accel_tcp(fs);
 	int i, err;
 
 	if (!MLX5_CAP_FLOWTABLE_NIC_RX(mlx5e_fs_get_mdev(fs), ft_field_support.outer_ip_version))
 		return -EOPNOTSUPP;
 
+	if (accel_tcp) {
+		refcount_inc(&accel_tcp->user_count);
+		return 0;
+	}
+
 	accel_tcp = kzalloc(sizeof(*accel_tcp), GFP_KERNEL);
 	if (!accel_tcp)
 		return -ENOMEM;
@@ -393,6 +402,7 @@ int mlx5e_accel_fs_tcp_create(struct mlx5e_flow_steering *fs)
 	if (err)
 		goto err_destroy_tables;
 
+	refcount_set(&accel_tcp->user_count, 1);
 	return 0;
 
 err_destroy_tables:

From ac353da9c5e1cb31a1c10479882b171f877adc17 Mon Sep 17 00:00:00 2001
From: Ben Ben-Ishay
Date: Wed, 30 Apr 2025 08:57:36 +0000
Subject: [PATCH 689/770] net/mlx5e: NVMEoTCP, use KLM UMRs for buffer
 registration

NVMEoTCP offload uses buffer registration for the ddp operation. Every
request comprises an SG list whose elements may have various sizes, thus
the appropriate way to perform buffer registration is with KLM UMRs.

UMR stands for user-mode memory registration; it is a mechanism to alter
the address translation properties of an MKEY by posting a work queue
element (WQE) on a send queue. An MKEY (memory key) is used to describe
a region in memory that can later be used by HW. KLM stands for
{Key, Length, MemVa}; a KLM_MKEY is an indirect MKEY that makes it
possible to map multiple memory spaces of different sizes into one
unified MKEY. A KLM UMR is a UMR that is used to update a KLM_MKEY.

Nothing needs to be done on memory registration completion, and this
notification is expensive, so we add a wrapper that can ring the
doorbell without generating one.
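
For illustration, posting the registration WQEs can then ring the
doorbell while suppressing the completion (this call appears in the
nvmeotcp code below):

    /* pass 0 instead of MLX5_WQE_CTRL_CQ_UPDATE: no CQE on completion */
    __mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, sq->doorbell_cseg, 0);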
Signed-off-by: Ben Ben-Ishay
Signed-off-by: Boris Pismenny
Signed-off-by: Or Gerlitz
Signed-off-by: Yoray Zack
Signed-off-by: Aurelien Aptel
Reviewed-by: Tariq Toukan
Signed-off-by: NipaLocal
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  21 +++
 .../net/ethernet/mellanox/mlx5/core/en/txrx.h |  16 +-
 .../mellanox/mlx5/core/en_accel/nvmeotcp.c    | 149 ++++++++++++++++++
 .../mlx5/core/en_accel/nvmeotcp_utils.h       |  25 +++
 .../net/ethernet/mellanox/mlx5/core/en_rx.c   |   4 +
 5 files changed, 212 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 4051f23c0fc4a..77a358720bd57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -146,6 +146,27 @@ struct page_pool;
 #define MLX5E_TX_XSK_POLL_BUDGET       64
 #define MLX5E_SQ_RECOVER_MIN_INTERVAL  500 /* msecs */
 
+#define MLX5E_KLM_UMR_WQE_SZ(sgl_len)\
+	(sizeof(struct mlx5e_umr_wqe) +\
+	(sizeof(struct mlx5_klm) * (sgl_len)))
+
+#define MLX5E_KLM_UMR_WQEBBS(klm_entries) \
+	(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_BB))
+
+#define MLX5E_KLM_UMR_DS_CNT(klm_entries)\
+	(DIV_ROUND_UP(MLX5E_KLM_UMR_WQE_SZ(klm_entries), MLX5_SEND_WQE_DS))
+
+#define MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size)\
+	(((wqe_size) - sizeof(struct mlx5e_umr_wqe)) / sizeof(struct mlx5_klm))
+
+#define MLX5E_KLM_ENTRIES_PER_WQE(wqe_size)\
+	ALIGN_DOWN(MLX5E_KLM_MAX_ENTRIES_PER_WQE(wqe_size), \
+		   MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)
+
+#define MLX5E_MAX_KLM_PER_WQE(mdev) \
+	MLX5E_KLM_ENTRIES_PER_WQE(MLX5_SEND_WQE_BB * \
+				  mlx5e_get_max_sq_aligned_wqebbs(mdev))
+
 #define mlx5e_state_dereference(priv, p) \
 	rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 0d31d012954dd..a4d30eb9f571e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -71,6 +71,9 @@ enum mlx5e_icosq_wqe_type {
 	MLX5E_ICOSQ_WQE_SET_PSV_TLS,
 	MLX5E_ICOSQ_WQE_GET_PSV_TLS,
 #endif
+#ifdef CONFIG_MLX5_EN_NVMEOTCP
+	MLX5E_ICOSQ_WQE_UMR_NVMEOTCP,
+#endif
 };
 
 /* General */
@@ -290,10 +293,10 @@ static inline u16 mlx5e_icosq_get_next_pi(struct mlx5e_icosq *sq, u16 size)
 }
 
 static inline void
-mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map,
-		struct mlx5_wqe_ctrl_seg *ctrl)
+__mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map,
+		  struct mlx5_wqe_ctrl_seg *ctrl, u8 cq_update)
 {
-	ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
+	ctrl->fm_ce_se |= cq_update;
 
 	/* ensure wqe is visible to device before updating doorbell record */
 	dma_wmb();
@@ -307,6 +310,13 @@ mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map,
mlx5_write64((__be32 *)ctrl, uar_map); } +static inline void +mlx5e_notify_hw(struct mlx5_wq_cyc *wq, u16 pc, void __iomem *uar_map, + struct mlx5_wqe_ctrl_seg *ctrl) +{ + __mlx5e_notify_hw(wq, pc, uar_map, ctrl, MLX5_WQE_CTRL_CQ_UPDATE); +} + static inline void mlx5e_cq_arm(struct mlx5e_cq *cq) { struct mlx5_core_cq *mcq; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c index ca9c3aaf941fc..f5df4b41c3ef9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -4,6 +4,7 @@ #include #include #include "en_accel/nvmeotcp.h" +#include "en_accel/nvmeotcp_utils.h" #include "en_accel/fs_tcp.h" #include "en/txrx.h" @@ -19,6 +20,146 @@ static const struct rhashtable_params rhash_queues = { .max_size = MAX_NUM_NVMEOTCP_QUEUES, }; +static void +fill_nvmeotcp_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_umr_wqe *wqe, u16 ccid, + u32 klm_entries, u16 klm_offset) +{ + struct scatterlist *sgl_mkey; + u32 lkey, i; + + lkey = queue->priv->mdev->mlx5e_res.hw_objs.mkey; + for (i = 0; i < klm_entries; i++) { + sgl_mkey = &queue->ccid_table[ccid].sgl[i + klm_offset]; + wqe->inline_klms[i].bcount = cpu_to_be32(sg_dma_len(sgl_mkey)); + wqe->inline_klms[i].key = cpu_to_be32(lkey); + wqe->inline_klms[i].va = cpu_to_be64(sgl_mkey->dma_address); + } + + for (; i < ALIGN(klm_entries, MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT); + i++) { + wqe->inline_klms[i].bcount = 0; + wqe->inline_klms[i].key = 0; + wqe->inline_klms[i].va = 0; + } +} + +static void +build_nvmeotcp_klm_umr(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_umr_wqe *wqe, u16 ccid, int klm_entries, + u32 klm_offset, u32 len, enum wqe_type klm_type) +{ + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->hdr.uctrl; + struct mlx5_wqe_ctrl_seg *cseg = &wqe->hdr.ctrl; + struct mlx5_mkey_seg *mkc = &wqe->hdr.mkc; + u32 sqn = queue->sq.sqn; + u16 pc = queue->sq.pc; + u32 ds_cnt; + u8 opc_mod; + u32 id; + + ds_cnt = + MLX5E_KLM_UMR_DS_CNT(ALIGN(klm_entries, + MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)); + + if (klm_type == BSF_KLM_UMR) { + id = mlx5e_tir_get_tirn(&queue->tir) << + MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT; + opc_mod = MLX5_OPC_MOD_TRANSPORT_TIR_STATIC_PARAMS; + } else { + id = queue->ccid_table[ccid].klm_mkey; + opc_mod = MLX5_CTRL_SEGMENT_OPC_MOD_UMR_UMR; + } + + cseg->opmod_idx_opcode = + cpu_to_be32((pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_UMR | (opc_mod) << 24); + cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); + cseg->general_id = cpu_to_be32(id); + + if (klm_type == KLM_UMR && !klm_offset) { + ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_XLT_OCT_SIZE | + MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_FREE); + mkc->xlt_oct_size = + cpu_to_be32(ALIGN(len, + MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)); + mkc->len = cpu_to_be64(queue->ccid_table[ccid].size); + } + + ucseg->flags = MLX5_UMR_INLINE | MLX5_UMR_TRANSLATION_OFFSET_EN; + ucseg->xlt_octowords = + cpu_to_be16(ALIGN(klm_entries, + MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)); + ucseg->xlt_offset = cpu_to_be16(klm_offset); + fill_nvmeotcp_klm_wqe(queue, wqe, ccid, klm_entries, klm_offset); +} + +static void +mlx5e_nvmeotcp_fill_wi(struct mlx5e_icosq *sq, u32 wqebbs, u16 pi) +{ + struct mlx5e_icosq_wqe_info *wi = &sq->db.wqe_info[pi]; + + memset(wi, 0, sizeof(*wi)); + + wi->num_wqebbs = wqebbs; + wi->wqe_type = MLX5E_ICOSQ_WQE_UMR_NVMEOTCP; +} + +static u32 +post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, + enum 
wqe_type wqe_type, + u16 ccid, + u32 klm_length, + u32 klm_offset) +{ + struct mlx5e_icosq *sq = &queue->sq; + u32 wqebbs, cur_klm_entries; + struct mlx5e_umr_wqe *wqe; + u16 pi, wqe_sz; + + cur_klm_entries = min_t(int, queue->max_klms_per_wqe, + klm_length - klm_offset); + wqe_sz = + MLX5E_KLM_UMR_WQE_SZ(ALIGN(cur_klm_entries, + MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)); + wqebbs = DIV_ROUND_UP(wqe_sz, MLX5_SEND_WQE_BB); + pi = mlx5e_icosq_get_next_pi(sq, wqebbs); + wqe = MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi); + mlx5e_nvmeotcp_fill_wi(sq, wqebbs, pi); + build_nvmeotcp_klm_umr(queue, wqe, ccid, cur_klm_entries, klm_offset, + klm_length, wqe_type); + sq->pc += wqebbs; + sq->doorbell_cseg = &wqe->hdr.ctrl; + return cur_klm_entries; +} + +static void +mlx5e_nvmeotcp_post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, + enum wqe_type wqe_type, u16 ccid, u32 klm_length) +{ + struct mlx5e_icosq *sq = &queue->sq; + u32 klm_offset = 0, wqes, i; + + wqes = DIV_ROUND_UP(klm_length, queue->max_klms_per_wqe); + + spin_lock_bh(&queue->sq_lock); + + for (i = 0; i < wqes; i++) + klm_offset += post_klm_wqe(queue, wqe_type, ccid, klm_length, + klm_offset); + + /* not asking for completion on ddp_setup UMRs */ + if (wqe_type == KLM_UMR) + __mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, + sq->doorbell_cseg, 0); + else + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, + sq->doorbell_cseg); + + spin_unlock_bh(&queue->sq_lock); +} + static int mlx5e_nvmeotcp_offload_limits(struct net_device *netdev, struct ulp_ddp_limits *limits) @@ -45,6 +186,14 @@ mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev, struct sock *sk, struct ulp_ddp_io *ddp) { + struct mlx5e_nvmeotcp_queue *queue; + + queue = container_of(ulp_ddp_get_ctx(sk), + struct mlx5e_nvmeotcp_queue, ulp_ddp_ctx); + + /* Placeholder - map_sg and initializing the count */ + + mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_UMR, ddp->command_id, 0); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h new file mode 100644 index 0000000000000..6ef92679c5d0a --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
*/ +#ifndef __MLX5E_NVMEOTCP_UTILS_H__ +#define __MLX5E_NVMEOTCP_UTILS_H__ + +#include "en.h" + +#define MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi) \ + ((struct mlx5e_umr_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, sizeof(struct mlx5e_umr_wqe))) + +#define MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIR_PROGRESS_PARAMS 0x4 + +#define MLX5_CTRL_SEGMENT_OPC_MOD_UMR_TIR_PARAMS 0x2 +#define MLX5_CTRL_SEGMENT_OPC_MOD_UMR_UMR 0x0 + +enum wqe_type { + KLM_UMR, + BSF_KLM_UMR, + SET_PSV_UMR, + BSF_UMR, + KLM_INV_UMR, +}; + +#endif /* __MLX5E_NVMEOTCP_UTILS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index c4c612ab62c4c..5370d238c8182 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1056,6 +1056,10 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget) case MLX5E_ICOSQ_WQE_GET_PSV_TLS: mlx5e_ktls_handle_get_psv_completion(wi, sq); break; +#endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP: + break; #endif default: netdev_WARN_ONCE(cq->netdev, From a4586f7a529541a660ebbfa1fb8a562797207219 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 30 Apr 2025 08:57:37 +0000 Subject: [PATCH 690/770] net/mlx5e: NVMEoTCP, queue init/teardown Add the ddp ops of sk_add, sk_del, and offload limits. When nvme-tcp establishes a new queue/connection, the sk_add op is called. We allocate a hardware context to offload operations for this queue: - use a steering rule based on the connection 5-tuple to mark packets of this queue/connection with a flow-tag in their completion (cqe) - use a dedicated TIR to identify the queue and maintain the HW context - use a dedicated ICOSQ to maintain the HW context by UMR postings - use a dedicated tag buffer for buffer registration - maintain static and progress HW contexts by posting the proper WQEs. When nvme-tcp tears down a queue/connection, the sk_del op is called. We tear down the queue and free the corresponding contexts. The offload limits we advertise deal with the maximum SG list supported. The error unwinding on init failure follows the usual reverse-order pattern (sketched below).
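The init path allocates several HW objects in sequence and must release them in reverse order when a later step fails. A minimal, self-contained C sketch of that unwinding pattern follows; the stub functions stand in for tag-buffer, ICOSQ, and TIR setup and are not the driver code:

#include <stdio.h>

/* Stubs standing in for the real setup steps; 0 means success. */
static int create_tag_buf(void) { return 0; }
static int open_icosq(void) { return 0; }
static int create_tir(void) { return -1; /* simulate a failure */ }
static void destroy_tag_buf(void) { puts("destroy tag buf"); }
static void close_icosq(void) { puts("close icosq"); }

static int queue_init(void)
{
	int err;

	err = create_tag_buf();
	if (err)
		return err;
	err = open_icosq();
	if (err)
		goto err_tag_buf;
	err = create_tir();
	if (err)
		goto err_icosq;
	return 0;

err_icosq:	/* unwind in reverse order of setup */
	close_icosq();
err_tag_buf:
	destroy_tag_buf();
	return err;
}

int main(void)
{
	printf("queue_init: %d\n", queue_init());
	return 0;
}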
[Re-enabled calling open/close icosq out of en_main.c] Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 + .../ethernet/mellanox/mlx5/core/en/rx_res.c | 30 + .../ethernet/mellanox/mlx5/core/en/rx_res.h | 5 + .../net/ethernet/mellanox/mlx5/core/en/tir.c | 16 + .../net/ethernet/mellanox/mlx5/core/en/tir.h | 3 + .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 6 + .../mellanox/mlx5/core/en_accel/nvmeotcp.c | 596 +++++++++++++++++- .../mellanox/mlx5/core/en_accel/nvmeotcp.h | 4 + .../mlx5/core/en_accel/nvmeotcp_utils.h | 43 ++ .../net/ethernet/mellanox/mlx5/core/en_main.c | 8 +- .../net/ethernet/mellanox/mlx5/core/en_rx.c | 15 +- 11 files changed, 720 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 77a358720bd57..20cc66ac4e929 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1066,6 +1066,10 @@ bool mlx5e_reset_rx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period bool dim_enabled, bool keep_dim_state); struct mlx5e_sq_param; +int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_sq_param *param, struct mlx5e_icosq *sq, + work_func_t recover_work_func); +void mlx5e_close_icosq(struct mlx5e_icosq *sq); int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool, struct mlx5e_xdpsq *sq, bool is_redirect); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c index 5fcbe47337b08..dd05ad30ea66c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c @@ -679,6 +679,36 @@ struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res * return mlx5e_rss_get_hash(res->rss[0]); } +int mlx5e_rx_res_nvmeotcp_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, + bool crc_rx, + u32 tag_buf_id, struct mlx5e_tir *tir) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_tir_builder *builder; + u32 rqtn; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + rqtn = mlx5e_rx_res_get_rqtn_direct(res, rxq); + + mlx5e_tir_builder_build_rqt(builder, + res->mdev->mlx5e_res.hw_objs.td.tdn, rqtn, + inner_ft_support); + mlx5e_tir_builder_build_direct(builder); + mlx5e_tir_builder_build_nvmeotcp(builder, crc_rx, tag_buf_id); + down_read(&res->pkt_merge_param_sem); + mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + err = mlx5e_tir_init(tir, builder, res->mdev, false); + up_read(&res->pkt_merge_param_sem); + + mlx5e_tir_builder_free(builder); + + return err; +} + int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, struct mlx5e_tir *tir) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h index 3e09d91281af9..d9eccf6e1025b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h @@ -74,4 +74,9 @@ struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res * /* Accel TIRs */ int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, 
unsigned int rxq, struct mlx5e_tir *tir); + +int mlx5e_rx_res_nvmeotcp_tir_create(struct mlx5e_rx_res *res, + unsigned int rxq, bool crc_rx, + u32 tag_buf_id, struct mlx5e_tir *tir); + #endif /* __MLX5_EN_RX_RES_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c index 19499072f67f6..4b742036fdfb0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.c @@ -146,6 +146,22 @@ void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder) MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8); } +void mlx5e_tir_builder_build_nvmeotcp(struct mlx5e_tir_builder *builder, + bool crc_rx, + u32 tag_buf_id) +{ + void *tirc = mlx5e_tir_builder_get_tirc(builder); + + WARN_ON(builder->modify); + + MLX5_SET(tirc, tirc, nvmeotcp_zero_copy_en, 1); + MLX5_SET(tirc, tirc, nvmeotcp_tag_buffer_table_id, tag_buf_id); + MLX5_SET(tirc, tirc, nvmeotcp_crc_en, !!crc_rx); + MLX5_SET(tirc, tirc, self_lb_block, + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST | + MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST); +} + void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder) { void *tirc = mlx5e_tir_builder_get_tirc(builder); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h index e8df3aaf6562f..5451fcc69b9b9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tir.h @@ -36,6 +36,9 @@ void mlx5e_tir_builder_build_rss(struct mlx5e_tir_builder *builder, bool inner); void mlx5e_tir_builder_build_direct(struct mlx5e_tir_builder *builder); void mlx5e_tir_builder_build_tls(struct mlx5e_tir_builder *builder); +void mlx5e_tir_builder_build_nvmeotcp(struct mlx5e_tir_builder *builder, + bool crc_rx, + u32 tag_buf_id); struct mlx5_core_dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index a4d30eb9f571e..528e5c97f5a43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -73,6 +73,7 @@ enum mlx5e_icosq_wqe_type { #endif #ifdef CONFIG_MLX5_EN_NVMEOTCP MLX5E_ICOSQ_WQE_UMR_NVMEOTCP, + MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP, #endif }; @@ -258,6 +259,11 @@ struct mlx5e_icosq_wqe_info { struct { struct mlx5e_ktls_rx_resync_buf *buf; } tls_get_params; +#endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + struct { + struct mlx5e_nvmeotcp_queue *queue; + } nvmeotcp_q; #endif }; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c index f5df4b41c3ef9..91865d1450a93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -3,6 +3,7 @@ #include #include +#include #include "en_accel/nvmeotcp.h" #include "en_accel/nvmeotcp_utils.h" #include "en_accel/fs_tcp.h" @@ -11,6 +12,11 @@ #define MAX_NUM_NVMEOTCP_QUEUES (4000) #define MIN_NUM_NVMEOTCP_QUEUES (1) +/* Max PDU data will be 512K */ +#define MLX5E_NVMEOTCP_MAX_SEGMENTS (128) +#define MLX5E_NVMEOTCP_IO_THRESHOLD (32 * 1024) +#define MLX5E_NVMEOTCP_FULL_CCID_RANGE (0) + static const struct rhashtable_params rhash_queues = { .key_len = sizeof(int), .key_offset = offsetof(struct mlx5e_nvmeotcp_queue, id), @@ -20,6 +26,96 @@ static const struct rhashtable_params rhash_queues = { .max_size = MAX_NUM_NVMEOTCP_QUEUES, }; +static u32 mlx5e_get_max_sgl(struct 
mlx5_core_dev *mdev) +{ + return min_t(u32, + MLX5E_NVMEOTCP_MAX_SEGMENTS, + 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size)); +} + +static u32 +mlx5e_get_channel_ix_from_io_cpu(struct mlx5e_params *params, u32 io_cpu) +{ + int num_channels = params->num_channels; + u32 channel_ix = io_cpu; + + if (channel_ix >= num_channels) + channel_ix = channel_ix % num_channels; + + return channel_ix; +} + +static +int mlx5e_create_nvmeotcp_tag_buf_table(struct mlx5_core_dev *mdev, + struct mlx5e_nvmeotcp_queue *queue, + u8 log_table_size) +{ + u32 in[MLX5_ST_SZ_DW(create_nvmeotcp_tag_buf_table_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + u64 general_obj_types; + void *obj; + int err; + + obj = MLX5_ADDR_OF(create_nvmeotcp_tag_buf_table_in, in, + nvmeotcp_tag_buf_table_obj); + + general_obj_types = MLX5_CAP_GEN_64(mdev, general_obj_types); + if (!(general_obj_types & + MLX5_HCA_CAP_GENERAL_OBJECT_TYPES_NVMEOTCP_TAG_BUFFER_TABLE)) + return -EINVAL; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_NVMEOTCP_TAG_BUFFER_TABLE); + MLX5_SET(nvmeotcp_tag_buf_table_obj, obj, + log_tag_buffer_table_size, log_table_size); + + err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); + if (!err) + queue->tag_buf_table_id = MLX5_GET(general_obj_out_cmd_hdr, + out, obj_id); + return err; +} + +static +void mlx5_destroy_nvmeotcp_tag_buf_table(struct mlx5_core_dev *mdev, u32 uid) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_GENERAL_OBJECT_TYPES_NVMEOTCP_TAG_BUFFER_TABLE); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, uid); + + mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + +static void +fill_nvmeotcp_bsf_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_umr_wqe *wqe, u16 ccid, u32 klm_entries, + u16 klm_offset) +{ + u32 i; + + /* BSF_KLM_UMR is used to update the tag_buffer. To spare the + * need to update both mkc.length and tag_buffer[i].len in two + * different UMRs we initialize the tag_buffer[*].len to the + * maximum size of an entry so the HW check will pass and the + * validity of the MKEY len will be checked against the + * updated mkey context field. 
+ */ + for (i = 0; i < klm_entries; i++) { + u32 lkey = queue->ccid_table[i + klm_offset].klm_mkey; + + wqe->inline_klms[i].bcount = cpu_to_be32(U32_MAX); + wqe->inline_klms[i].key = cpu_to_be32(lkey); + wqe->inline_klms[i].va = 0; + } +} + static void fill_nvmeotcp_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, struct mlx5e_umr_wqe *wqe, u16 ccid, @@ -92,18 +188,159 @@ build_nvmeotcp_klm_umr(struct mlx5e_nvmeotcp_queue *queue, cpu_to_be16(ALIGN(klm_entries, MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT)); ucseg->xlt_offset = cpu_to_be16(klm_offset); - fill_nvmeotcp_klm_wqe(queue, wqe, ccid, klm_entries, klm_offset); + if (klm_type == BSF_KLM_UMR) + fill_nvmeotcp_bsf_klm_wqe(queue, wqe, ccid, klm_entries, + klm_offset); + else + fill_nvmeotcp_klm_wqe(queue, wqe, ccid, klm_entries, + klm_offset); +} + +static void +fill_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5_seg_nvmeotcp_progress_params *params, + u32 seq) +{ + void *ctx = params->ctx; + + params->tir_num = cpu_to_be32(mlx5e_tir_get_tirn(&queue->tir)); + + MLX5_SET(nvmeotcp_progress_params, ctx, next_pdu_tcp_sn, seq); + MLX5_SET(nvmeotcp_progress_params, ctx, pdu_tracker_state, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START); +} + +void +build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_set_nvmeotcp_progress_params_wqe *wqe, + u32 seq) +{ + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + u32 sqn = queue->sq.sqn; + u16 pc = queue->sq.pc; + u8 opc_mod; + + memset(wqe, 0, MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQE_SZ); + opc_mod = MLX5_CTRL_SEGMENT_OPC_MOD_UMR_NVMEOTCP_TIR_PROGRESS_PARAMS; + cseg->opmod_idx_opcode = + cpu_to_be32((pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_SET_PSV | (opc_mod << 24)); + cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + PROGRESS_PARAMS_DS_CNT); + fill_nvmeotcp_progress_params(queue, &wqe->params, seq); } static void -mlx5e_nvmeotcp_fill_wi(struct mlx5e_icosq *sq, u32 wqebbs, u16 pi) +fill_nvmeotcp_static_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5_wqe_transport_static_params_seg *params, + u32 resync_seq, bool ddgst_offload_en) +{ + void *ctx = params->ctx; + + MLX5_SET(transport_static_params, ctx, const_1, 1); + MLX5_SET(transport_static_params, ctx, const_2, 2); + MLX5_SET(transport_static_params, ctx, acc_type, + MLX5_TRANSPORT_STATIC_PARAMS_ACC_TYPE_NVMETCP); + MLX5_SET(transport_static_params, ctx, nvme_resync_tcp_sn, resync_seq); + MLX5_SET(transport_static_params, ctx, pda, queue->pda); + MLX5_SET(transport_static_params, ctx, ddgst_en, + !!(queue->dgst & NVME_TCP_DATA_DIGEST_ENABLE)); + MLX5_SET(transport_static_params, ctx, ddgst_offload_en, + ddgst_offload_en); + MLX5_SET(transport_static_params, ctx, hddgst_en, + !!(queue->dgst & NVME_TCP_HDR_DIGEST_ENABLE)); + MLX5_SET(transport_static_params, ctx, hdgst_offload_en, 0); + MLX5_SET(transport_static_params, ctx, ti, + MLX5_TRANSPORT_STATIC_PARAMS_TI_INITIATOR); + MLX5_SET(transport_static_params, ctx, cccid_ttag, 1); + MLX5_SET(transport_static_params, ctx, zero_copy_en, 1); +} + +void +build_nvmeotcp_static_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_set_transport_static_params_wqe *wqe, + u32 resync_seq, bool crc_rx) +{ + u8 opc_mod = MLX5_OPC_MOD_TRANSPORT_TIR_STATIC_PARAMS; + struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl; + struct mlx5_wqe_ctrl_seg *cseg = &wqe->ctrl; + u32 sqn = queue->sq.sqn; + u16 pc = queue->sq.pc; + + memset(wqe, 0, MLX5E_TRANSPORT_STATIC_PARAMS_WQE_SZ); + + cseg->opmod_idx_opcode = + cpu_to_be32((pc << 
MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | + MLX5_OPCODE_UMR | (opc_mod) << 24); + cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + MLX5E_TRANSPORT_STATIC_PARAMS_DS_CNT); + cseg->imm = cpu_to_be32(mlx5e_tir_get_tirn(&queue->tir) + << MLX5_WQE_CTRL_TIR_TIS_INDEX_SHIFT); + + ucseg->flags = MLX5_UMR_INLINE; + ucseg->bsf_octowords = + cpu_to_be16(MLX5E_TRANSPORT_STATIC_PARAMS_OCTWORD_SIZE); + fill_nvmeotcp_static_params(queue, &wqe->params, resync_seq, crc_rx); +} + +static void +mlx5e_nvmeotcp_fill_wi(struct mlx5e_nvmeotcp_queue *nvmeotcp_queue, + struct mlx5e_icosq *sq, u32 wqebbs, u16 pi, + enum wqe_type type) { struct mlx5e_icosq_wqe_info *wi = &sq->db.wqe_info[pi]; memset(wi, 0, sizeof(*wi)); wi->num_wqebbs = wqebbs; - wi->wqe_type = MLX5E_ICOSQ_WQE_UMR_NVMEOTCP; + switch (type) { + case SET_PSV_UMR: + wi->wqe_type = MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP; + wi->nvmeotcp_q.queue = nvmeotcp_queue; + break; + default: + /* cases where no further action is required upon + * completion, such as ddp setup + */ + wi->wqe_type = MLX5E_ICOSQ_WQE_UMR_NVMEOTCP; + break; + } +} + +static void +mlx5e_nvmeotcp_rx_post_static_params_wqe(struct mlx5e_nvmeotcp_queue *queue, + u32 resync_seq) +{ + struct mlx5e_set_transport_static_params_wqe *wqe; + struct mlx5e_icosq *sq = &queue->sq; + u16 pi, wqebbs; + + spin_lock_bh(&queue->sq_lock); + wqebbs = MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS; + pi = mlx5e_icosq_get_next_pi(sq, wqebbs); + wqe = MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); + mlx5e_nvmeotcp_fill_wi(NULL, sq, wqebbs, pi, BSF_UMR); + build_nvmeotcp_static_params(queue, wqe, resync_seq, queue->crc_rx); + sq->pc += wqebbs; + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl); + spin_unlock_bh(&queue->sq_lock); +} + +static void +mlx5e_nvmeotcp_rx_post_progress_params_wqe(struct mlx5e_nvmeotcp_queue *queue, + u32 seq) +{ + struct mlx5e_set_nvmeotcp_progress_params_wqe *wqe; + struct mlx5e_icosq *sq = &queue->sq; + u16 pi, wqebbs; + + wqebbs = MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS; + pi = mlx5e_icosq_get_next_pi(sq, wqebbs); + wqe = MLX5E_NVMEOTCP_FETCH_PROGRESS_PARAMS_WQE(sq, pi); + mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, SET_PSV_UMR); + build_nvmeotcp_progress_params(queue, wqe, seq); + sq->pc += wqebbs; + mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl); } static u32 @@ -126,7 +363,7 @@ post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, wqebbs = DIV_ROUND_UP(wqe_sz, MLX5_SEND_WQE_BB); pi = mlx5e_icosq_get_next_pi(sq, wqebbs); wqe = MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi); - mlx5e_nvmeotcp_fill_wi(sq, wqebbs, pi); + mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, wqe_type); build_nvmeotcp_klm_umr(queue, wqe, ccid, cur_klm_entries, klm_offset, klm_length, wqe_type); sq->pc += wqebbs; @@ -160,11 +397,256 @@ mlx5e_nvmeotcp_post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, spin_unlock_bh(&queue->sq_lock); } +static int mlx5e_create_nvmeotcp_mkey(struct mlx5_core_dev *mdev, + u8 access_mode, + u32 translation_octword_size, + u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + void *mkc; + u32 *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, translations_octword_size, translation_octword_size); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, pd, 
mdev->mlx5e_res.hw_objs.pdn); + + err = mlx5_core_create_mkey(mdev, mkey, in, inlen); + + kvfree(in); + return err; +} + static int mlx5e_nvmeotcp_offload_limits(struct net_device *netdev, struct ulp_ddp_limits *limits) { + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + + if (limits->type != ULP_DDP_NVME) + return -EOPNOTSUPP; + + limits->max_ddp_sgl_len = mlx5e_get_max_sgl(mdev); + limits->io_threshold = MLX5E_NVMEOTCP_IO_THRESHOLD; + limits->tls = false; + limits->nvmeotcp.full_ccid_range = MLX5E_NVMEOTCP_FULL_CCID_RANGE; + return 0; +} + +static int mlx5e_nvmeotcp_queue_handler_poll(struct napi_struct *napi, + int budget) +{ + struct mlx5e_nvmeotcp_queue_handler *qh; + int work_done; + + qh = container_of(napi, struct mlx5e_nvmeotcp_queue_handler, napi); + + work_done = mlx5e_poll_ico_cq(qh->cq, budget); + + if (work_done == budget || !napi_complete_done(napi, work_done)) + goto out; + + mlx5e_cq_arm(qh->cq); + +out: + return work_done; +} + +static void +mlx5e_nvmeotcp_destroy_icosq(struct mlx5e_icosq *sq) +{ + mlx5e_close_icosq(sq); + mlx5e_close_cq(&sq->cq); +} + +static void mlx5e_nvmeotcp_icosq_err_cqe_work(struct work_struct *recover_work) +{ + struct mlx5e_icosq *sq = container_of(recover_work, struct mlx5e_icosq, + recover_work); + + /* Not implemented yet. */ + + netdev_warn(sq->channel->netdev, + "nvmeotcp icosq recovery is not implemented\n"); +} + +static int +mlx5e_nvmeotcp_build_icosq(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_priv *priv, int io_cpu) +{ + u16 max_sgl, max_klm_per_wqe, max_umr_per_ccid, sgl_rest, wqebbs_rest; + struct mlx5e_channel *c = priv->channels.c[queue->channel_ix]; + struct mlx5e_sq_param icosq_param = {}; + struct mlx5e_create_cq_param ccp = {}; + struct dim_cq_moder icocq_moder = {}; + struct mlx5e_icosq *icosq; + int err = -ENOMEM; + u16 log_icosq_sz; + u32 max_wqebbs; + + icosq = &queue->sq; + max_sgl = mlx5e_get_max_sgl(priv->mdev); + max_klm_per_wqe = queue->max_klms_per_wqe; + max_umr_per_ccid = max_sgl / max_klm_per_wqe; + sgl_rest = max_sgl % max_klm_per_wqe; + wqebbs_rest = sgl_rest ? 
MLX5E_KLM_UMR_WQEBBS(sgl_rest) : 0; + max_wqebbs = (MLX5E_KLM_UMR_WQEBBS(max_klm_per_wqe) * + max_umr_per_ccid + wqebbs_rest) * queue->size; + log_icosq_sz = order_base_2(max_wqebbs); + + mlx5e_build_icosq_param(priv->mdev, log_icosq_sz, &icosq_param); + ccp.napi = &queue->qh.napi; + ccp.ch_stats = &priv->channel_stats[queue->channel_ix]->ch; + ccp.node = cpu_to_node(io_cpu); + ccp.ix = queue->channel_ix; + + err = mlx5e_open_cq(priv->mdev, icocq_moder, &icosq_param.cqp, &ccp, + &icosq->cq); + if (err) + goto err_nvmeotcp_sq; + err = mlx5e_open_icosq(c, &priv->channels.params, &icosq_param, icosq, + mlx5e_nvmeotcp_icosq_err_cqe_work); + if (err) + goto close_cq; + + spin_lock_init(&queue->sq_lock); return 0; + +close_cq: + mlx5e_close_cq(&icosq->cq); +err_nvmeotcp_sq: + return err; +} + +static void +mlx5e_nvmeotcp_destroy_rx(struct mlx5e_priv *priv, + struct mlx5e_nvmeotcp_queue *queue, + struct mlx5_core_dev *mdev) +{ + int i; + + mlx5e_accel_fs_del_sk(queue->fh); + + for (i = 0; i < queue->size; i++) + mlx5_core_destroy_mkey(mdev, queue->ccid_table[i].klm_mkey); + + mlx5e_tir_destroy(&queue->tir); + mlx5_destroy_nvmeotcp_tag_buf_table(mdev, queue->tag_buf_table_id); + + mlx5e_deactivate_icosq(&queue->sq); + napi_disable(&queue->qh.napi); + mlx5e_nvmeotcp_destroy_icosq(&queue->sq); + netif_napi_del(&queue->qh.napi); +} + +static int +mlx5e_nvmeotcp_queue_rx_init(struct mlx5e_nvmeotcp_queue *queue, + struct ulp_ddp_config *config, + struct net_device *netdev) +{ + struct nvme_tcp_ddp_config *nvme_config = &config->nvmeotcp; + u8 log_queue_size = order_base_2(nvme_config->queue_size); + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct sock *sk = queue->sk; + int err, max_sgls, i; + + if (nvme_config->queue_size > + BIT(MLX5_CAP_DEV_NVMEOTCP(mdev, log_max_nvmeotcp_tag_buffer_size))) + return -EINVAL; + + err = mlx5e_create_nvmeotcp_tag_buf_table(mdev, queue, log_queue_size); + if (err) + return err; + + queue->qh.cq = &queue->sq.cq; + netif_napi_add(priv->netdev, &queue->qh.napi, + mlx5e_nvmeotcp_queue_handler_poll); + + mutex_lock(&priv->state_lock); + err = mlx5e_nvmeotcp_build_icosq(queue, priv, config->affinity_hint); + mutex_unlock(&priv->state_lock); + if (err) + goto del_napi; + + napi_enable(&queue->qh.napi); + mlx5e_activate_icosq(&queue->sq); + + /* initializes queue->tir */ + err = mlx5e_rx_res_nvmeotcp_tir_create(priv->rx_res, queue->channel_ix, + queue->crc_rx, + queue->tag_buf_table_id, + &queue->tir); + if (err) + goto destroy_icosq; + + mlx5e_nvmeotcp_rx_post_static_params_wqe(queue, 0); + mlx5e_nvmeotcp_rx_post_progress_params_wqe(queue, + tcp_sk(sk)->copied_seq); + + queue->ccid_table = kcalloc(queue->size, + sizeof(struct mlx5e_nvmeotcp_queue_entry), + GFP_KERNEL); + if (!queue->ccid_table) { + err = -ENOMEM; + goto destroy_tir; + } + + max_sgls = mlx5e_get_max_sgl(mdev); + for (i = 0; i < queue->size; i++) { + err = + mlx5e_create_nvmeotcp_mkey(mdev, + MLX5_MKC_ACCESS_MODE_KLMS, + max_sgls, + &queue->ccid_table[i].klm_mkey); + if (err) + goto free_ccid_table; + } + + mlx5e_nvmeotcp_post_klm_wqe(queue, BSF_KLM_UMR, 0, queue->size); + + if (!(WARN_ON(!wait_for_completion_timeout(&queue->static_params_done, + msecs_to_jiffies(3000))))) + queue->fh = + mlx5e_accel_fs_add_sk(priv->fs, sk, + mlx5e_tir_get_tirn(&queue->tir), + queue->id); + + if (IS_ERR_OR_NULL(queue->fh)) { + err = -EINVAL; + goto destroy_mkeys; + } + + return 0; + +destroy_mkeys: + while ((i--)) + mlx5_core_destroy_mkey(mdev, queue->ccid_table[i].klm_mkey); 
+free_ccid_table: + kfree(queue->ccid_table); +destroy_tir: + mlx5e_tir_destroy(&queue->tir); +destroy_icosq: + mlx5e_deactivate_icosq(&queue->sq); + napi_disable(&queue->qh.napi); + mlx5e_nvmeotcp_destroy_icosq(&queue->sq); +del_napi: + netif_napi_del(&queue->qh.napi); + mlx5_destroy_nvmeotcp_tag_buf_table(mdev, queue->tag_buf_table_id); + + return err; } static int @@ -172,13 +654,92 @@ mlx5e_nvmeotcp_queue_init(struct net_device *netdev, struct sock *sk, struct ulp_ddp_config *config) { + struct nvme_tcp_ddp_config *nvme_config = &config->nvmeotcp; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_nvmeotcp_queue *queue; + int queue_id, err; + + if (config->type != ULP_DDP_NVME) { + err = -EOPNOTSUPP; + goto out; + } + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + err = -ENOMEM; + goto out; + } + + queue_id = ida_simple_get(&priv->nvmeotcp->queue_ids, + MIN_NUM_NVMEOTCP_QUEUES, + MAX_NUM_NVMEOTCP_QUEUES, + GFP_KERNEL); + if (queue_id < 0) { + err = -ENOSPC; + goto free_queue; + } + + queue->crc_rx = !!(nvme_config->dgst & NVME_TCP_DATA_DIGEST_ENABLE); + queue->ulp_ddp_ctx.type = ULP_DDP_NVME; + queue->sk = sk; + queue->id = queue_id; + queue->dgst = nvme_config->dgst; + queue->pda = nvme_config->cpda; + queue->channel_ix = + mlx5e_get_channel_ix_from_io_cpu(&priv->channels.params, + config->affinity_hint); + queue->size = nvme_config->queue_size; + queue->max_klms_per_wqe = MLX5E_MAX_KLM_PER_WQE(mdev); + queue->priv = priv; + init_completion(&queue->static_params_done); + + err = mlx5e_nvmeotcp_queue_rx_init(queue, config, netdev); + if (err) + goto remove_queue_id; + + err = rhashtable_insert_fast(&priv->nvmeotcp->queue_hash, &queue->hash, + rhash_queues); + if (err) + goto destroy_rx; + + write_lock_bh(&sk->sk_callback_lock); + ulp_ddp_set_ctx(sk, queue); + write_unlock_bh(&sk->sk_callback_lock); + refcount_set(&queue->ref_count, 1); return 0; + +destroy_rx: + mlx5e_nvmeotcp_destroy_rx(priv, queue, mdev); +remove_queue_id: + ida_simple_remove(&priv->nvmeotcp->queue_ids, queue_id); +free_queue: + kfree(queue); +out: + return err; } static void mlx5e_nvmeotcp_queue_teardown(struct net_device *netdev, struct sock *sk) { + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5_core_dev *mdev = priv->mdev; + struct mlx5e_nvmeotcp_queue *queue; + + queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, + ulp_ddp_ctx); + + WARN_ON(refcount_read(&queue->ref_count) != 1); + mlx5e_nvmeotcp_destroy_rx(priv, queue, mdev); + + rhashtable_remove_fast(&priv->nvmeotcp->queue_hash, &queue->hash, + rhash_queues); + ida_simple_remove(&priv->nvmeotcp->queue_ids, queue->id); + write_lock_bh(&sk->sk_callback_lock); + ulp_ddp_set_ctx(sk, NULL); + write_unlock_bh(&sk->sk_callback_lock); + mlx5e_nvmeotcp_put_queue(queue); } static int @@ -197,6 +758,13 @@ mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev, return 0; } +void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi) +{ + struct mlx5e_nvmeotcp_queue *queue = wi->nvmeotcp_q.queue; + + complete(&queue->static_params_done); +} + static void mlx5e_nvmeotcp_ddp_teardown(struct net_device *netdev, struct sock *sk, @@ -211,6 +779,26 @@ mlx5e_nvmeotcp_ddp_resync(struct net_device *netdev, { } +struct mlx5e_nvmeotcp_queue * +mlx5e_nvmeotcp_get_queue(struct mlx5e_nvmeotcp *nvmeotcp, int id) +{ + struct mlx5e_nvmeotcp_queue *queue; + + queue = rhashtable_lookup_fast(&nvmeotcp->queue_hash, + &id, rhash_queues); + if (!IS_ERR_OR_NULL(queue)) + 
refcount_inc(&queue->ref_count); + return queue; +} + +void mlx5e_nvmeotcp_put_queue(struct mlx5e_nvmeotcp_queue *queue) +{ + if (refcount_dec_and_test(&queue->ref_count)) { + kfree(queue->ccid_table); + kfree(queue); + } +} + int set_ulp_ddp_nvme_tcp(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h index f6432a3737cd4..4850c19e18c7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h @@ -110,6 +110,10 @@ struct mlx5e_nvmeotcp { int mlx5e_nvmeotcp_init(struct mlx5e_priv *priv); int set_ulp_ddp_nvme_tcp(struct net_device *netdev, bool enable); void mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv); +struct mlx5e_nvmeotcp_queue * +mlx5e_nvmeotcp_get_queue(struct mlx5e_nvmeotcp *nvmeotcp, int id); +void mlx5e_nvmeotcp_put_queue(struct mlx5e_nvmeotcp_queue *queue); +void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi); static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv); extern const struct ulp_ddp_dev_ops mlx5e_nvmeotcp_ops; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h index 6ef92679c5d0a..fbcb9430ae461 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_utils.h @@ -4,6 +4,36 @@ #define __MLX5E_NVMEOTCP_UTILS_H__ #include "en.h" +#include "en_accel/nvmeotcp.h" +#include "en_accel/common_utils.h" + +enum { + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_START = 0, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_TRACKING = 1, + MLX5E_NVMEOTCP_PROGRESS_PARAMS_PDU_TRACKER_STATE_SEARCHING = 2, +}; + +struct mlx5_seg_nvmeotcp_progress_params { + __be32 tir_num; + u8 ctx[MLX5_ST_SZ_BYTES(nvmeotcp_progress_params)]; +}; + +struct mlx5e_set_nvmeotcp_progress_params_wqe { + struct mlx5_wqe_ctrl_seg ctrl; + struct mlx5_seg_nvmeotcp_progress_params params; +}; + +/* macros for wqe handling */ +#define MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQE_SZ \ + (sizeof(struct mlx5e_set_nvmeotcp_progress_params_wqe)) + +#define MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS \ + (DIV_ROUND_UP(MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQE_SZ, MLX5_SEND_WQE_BB)) + +#define MLX5E_NVMEOTCP_FETCH_PROGRESS_PARAMS_WQE(sq, pi) \ + ((struct mlx5e_set_nvmeotcp_progress_params_wqe *)\ + mlx5e_fetch_wqe(&(sq)->wq, pi, \ + sizeof(struct mlx5e_set_nvmeotcp_progress_params_wqe))) #define MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi) \ ((struct mlx5e_umr_wqe *)\ @@ -14,6 +44,9 @@ #define MLX5_CTRL_SEGMENT_OPC_MOD_UMR_TIR_PARAMS 0x2 #define MLX5_CTRL_SEGMENT_OPC_MOD_UMR_UMR 0x0 +#define PROGRESS_PARAMS_DS_CNT \ + DIV_ROUND_UP(MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQE_SZ, MLX5_SEND_WQE_DS) + enum wqe_type { KLM_UMR, BSF_KLM_UMR, @@ -22,4 +55,14 @@ enum wqe_type { KLM_INV_UMR, }; +void +build_nvmeotcp_progress_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_set_nvmeotcp_progress_params_wqe *w, + u32 seq); + +void +build_nvmeotcp_static_params(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_set_transport_static_params_wqe *wqe, + u32 resync_seq, bool crc_rx); + #endif /* __MLX5E_NVMEOTCP_UTILS_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 
3cd1513e4c66e..2e590eb4e2bdc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1951,9 +1951,9 @@ bool mlx5e_reset_tx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period return reset; } -static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, - struct mlx5e_sq_param *param, struct mlx5e_icosq *sq, - work_func_t recover_work_func) +int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params, + struct mlx5e_sq_param *param, struct mlx5e_icosq *sq, + work_func_t recover_work_func) { struct mlx5e_create_sq_param csp = {}; int err; @@ -1997,7 +1997,7 @@ void mlx5e_deactivate_icosq(struct mlx5e_icosq *icosq) synchronize_net(); /* Sync with NAPI. */ } -static void mlx5e_close_icosq(struct mlx5e_icosq *sq) +void mlx5e_close_icosq(struct mlx5e_icosq *sq) { if (sq->ktls_resync) mlx5e_ktls_rx_resync_destroy_resp_list(sq->ktls_resync); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 5370d238c8182..38c8825d86782 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -53,6 +53,7 @@ #include "en_accel/macsec.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/ktls_txrx.h" +#include "en_accel/nvmeotcp.h" #include "en/xdp.h" #include "en/xsk/rx.h" #include "en/health.h" @@ -947,16 +948,23 @@ void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq) ci = mlx5_wq_cyc_ctr2ix(&sq->wq, sqcc); wi = &sq->db.wqe_info[ci]; sqcc += wi->num_wqebbs; -#ifdef CONFIG_MLX5_EN_TLS switch (wi->wqe_type) { +#ifdef CONFIG_MLX5_EN_TLS case MLX5E_ICOSQ_WQE_SET_PSV_TLS: mlx5e_ktls_handle_ctx_completion(wi); break; case MLX5E_ICOSQ_WQE_GET_PSV_TLS: mlx5e_ktls_handle_get_psv_completion(wi, sq); break; - } #endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP: + mlx5e_nvmeotcp_ctx_complete(wi); + break; +#endif + default: + break; + } } sq->cc = sqcc; } @@ -1060,6 +1068,9 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget) #ifdef CONFIG_MLX5_EN_NVMEOTCP case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP: break; + case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP: + mlx5e_nvmeotcp_ctx_complete(wi); + break; #endif default: netdev_WARN_ONCE(cq->netdev, From 497945a035ee6659487f9631f68edbbf9160f02b Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 30 Apr 2025 08:57:38 +0000 Subject: [PATCH 691/770] net/mlx5e: NVMEoTCP, ddp setup and resync NVMEoTCP offload uses buffer registration for every NVME request to perform direct data placement. This is achieved by creating a NIC HW mapping between the CCID (command capsule ID) and the set of buffers that compose the request. The registration is implemented via an MKEY, for which we do fast/async mapping using a KLM UMR WQE. The buffer registration takes place when the ULP calls the ddp_setup op, which is done before it sends the corresponding request to the other side (e.g. an nvmf target). We don't wait for the registration to complete before returning to the ULP; the reasoning is that the HW mapping should be in place fast enough compared to the RTT it takes for the request to be responded to. If this doesn't happen, some IO may not be ddp-offloaded, but that doesn't stop the overall offloading session. When the offloading HW gets out of sync with the protocol session, a hardware/software handshake takes place to resync.
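Conceptually, the ddp_setup flow described above installs an entry in a per-queue table indexed by CCID; the HW later uses that entry to place arriving payload directly into the registered buffers. A self-contained sketch of that bookkeeping, with illustrative types and names rather than the driver's:

#include <stdint.h>
#include <stdio.h>

#define QUEUE_DEPTH 8

/* Illustrative per-CCID entry; the real driver also stores the SG
 * list, its total size, and the KLM MKEY used for placement. */
struct ccid_entry {
	uint32_t mkey;	/* HW mapping handle */
	uint32_t size;	/* total bytes covered */
	int in_use;
};

static struct ccid_entry ccid_table[QUEUE_DEPTH];

/* ddp_setup-like step: record the mapping for this command id; the
 * real driver then posts a KLM UMR without waiting for completion. */
static int ddp_setup(uint16_t ccid, uint32_t mkey, uint32_t size)
{
	if (ccid >= QUEUE_DEPTH || ccid_table[ccid].in_use)
		return -1;
	ccid_table[ccid].mkey = mkey;
	ccid_table[ccid].size = size;
	ccid_table[ccid].in_use = 1;
	return 0;
}

int main(void)
{
	printf("setup ccid 3: %d\n", ddp_setup(3, 0xabcd, 16384));
	printf("double setup: %d\n", ddp_setup(3, 0xbeef, 4096));
	return 0;
}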
The ddp_resync op is the part of the handshake where the SW confirms to the HW that it has indeed correctly identified a PDU header at a certain TCP sequence number. This allows the HW to resume the offload. The first part of the handshake occurs when the HW identifies such a sequence number in an arriving packet. A special mark is made on the completion (cqe) and then the mlx5 driver invokes the ddp resync_request callback advertised by the ULP in the ddp context - this is done in a downstream patch. Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- .../mellanox/mlx5/core/en_accel/nvmeotcp.c | 150 +++++++++++++++++- 1 file changed, 148 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c index 91865d1450a93..48dd242af2bbc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -742,19 +742,159 @@ mlx5e_nvmeotcp_queue_teardown(struct net_device *netdev, mlx5e_nvmeotcp_put_queue(queue); } +static bool +mlx5e_nvmeotcp_validate_small_sgl_suffix(struct scatterlist *sg, int sg_len, + int mtu) +{ + int i, hole_size, hole_len, chunk_size = 0; + + for (i = 1; i < sg_len; i++) + chunk_size += sg_dma_len(&sg[i]); + + if (chunk_size >= mtu) + return true; + + hole_size = mtu - chunk_size - 1; + hole_len = DIV_ROUND_UP(hole_size, PAGE_SIZE); + + if (sg_len + hole_len > MAX_SKB_FRAGS) + return false; + + return true; +} + +static bool +mlx5e_nvmeotcp_validate_big_sgl_suffix(struct scatterlist *sg, int sg_len, + int mtu) +{ + int i, j, last_elem, window_idx, window_size = MAX_SKB_FRAGS - 1; + int chunk_size = 0; + + last_elem = sg_len - window_size; + window_idx = window_size; + + for (j = 1; j < window_size; j++) + chunk_size += sg_dma_len(&sg[j]); + + for (i = 1; i <= last_elem; i++, window_idx++) { + chunk_size += sg_dma_len(&sg[window_idx]); + if (chunk_size < mtu - 1) + return false; + + chunk_size -= sg_dma_len(&sg[i]); + } + + return true; +} + +/* This function makes sure that the middle/suffix of a PDU SGL meets the + * restriction of MAX_SKB_FRAGS. There are two cases here: + * 1. sg_len < MAX_SKB_FRAGS - the extreme case here is a packet that consists + * of one byte from the first SG element + the rest of the SGL and the remaining + * space of the packet will be scattered to the WQE and will be pointed by + * SKB frags. + * 2. sg_len >= MAX_SKB_FRAGS - the extreme case here is a packet that consists + * of one byte from a middle SG element + 15 continuous SG elements + one byte + * from a sequential SG element or the rest of the packet.
+ */ +static bool +mlx5e_nvmeotcp_validate_sgl_suffix(struct scatterlist *sg, int sg_len, int mtu) +{ + int ret; + + if (sg_len < MAX_SKB_FRAGS) + ret = mlx5e_nvmeotcp_validate_small_sgl_suffix(sg, sg_len, mtu); + else + ret = mlx5e_nvmeotcp_validate_big_sgl_suffix(sg, sg_len, mtu); + + return ret; +} + +static bool +mlx5e_nvmeotcp_validate_sgl_prefix(struct scatterlist *sg, int sg_len, int mtu) +{ + int i, hole_size, hole_len, tmp_len, chunk_size = 0; + + tmp_len = min_t(int, sg_len, MAX_SKB_FRAGS); + + for (i = 0; i < tmp_len; i++) + chunk_size += sg_dma_len(&sg[i]); + + if (chunk_size >= mtu) + return true; + + hole_size = mtu - chunk_size; + hole_len = DIV_ROUND_UP(hole_size, PAGE_SIZE); + + if (tmp_len + hole_len > MAX_SKB_FRAGS) + return false; + + return true; +} + +/* This function is responsible for ensuring that a PDU can be offloaded. + * A PDU is offloaded by building a non-linear SKB such that each SGL element + * is placed in a frag, thus this function should ensure that no packet that + * represents part of the PDU exceeds MAX_SKB_FRAGS SGL entries. + * In addition, NVMEoTCP offload has a one-PDU-offload-per-packet restriction. + * A packet can start with a new PDU, in which case we should check that the + * prefix of the PDU meets the requirement, or a packet can start in the + * middle of an SG element, in which case we should check that the suffix of + * the PDU meets the requirement. + */ +static bool +mlx5e_nvmeotcp_validate_sgl(struct scatterlist *sg, int sg_len, int mtu) +{ + int max_hole_frags; + + max_hole_frags = DIV_ROUND_UP(mtu, PAGE_SIZE); + if (sg_len + max_hole_frags <= MAX_SKB_FRAGS) + return true; + + if (!mlx5e_nvmeotcp_validate_sgl_prefix(sg, sg_len, mtu) || + !mlx5e_nvmeotcp_validate_sgl_suffix(sg, sg_len, mtu)) + return false; + + return true; +} + static int mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev, struct sock *sk, struct ulp_ddp_io *ddp) { + struct scatterlist *sg = ddp->sg_table.sgl; + struct mlx5e_nvmeotcp_queue_entry *nvqt; struct mlx5e_nvmeotcp_queue *queue; + struct mlx5_core_dev *mdev; + int i, size = 0, count = 0; queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, ulp_ddp_ctx); + mdev = queue->priv->mdev; + count = dma_map_sg(mdev->device, ddp->sg_table.sgl, ddp->nents, + DMA_FROM_DEVICE); - /* Placeholder - map_sg and initializing the count */ + + if (count <= 0) + return -EINVAL; + + if (WARN_ON(count > mlx5e_get_max_sgl(mdev))) + return -ENOSPC; + + if (!mlx5e_nvmeotcp_validate_sgl(sg, count, READ_ONCE(netdev->mtu))) + return -EOPNOTSUPP; + + for (i = 0; i < count; i++) + size += sg_dma_len(&sg[i]); + + nvqt = &queue->ccid_table[ddp->command_id]; + nvqt->size = size; + nvqt->ddp = ddp; + nvqt->sgl = sg; + nvqt->ccid_gen++; + nvqt->sgl_length = count; + mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_UMR, ddp->command_id, count); - mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_UMR, ddp->command_id, 0); return 0; } @@ -777,6 +917,12 @@ mlx5e_nvmeotcp_ddp_resync(struct net_device *netdev, struct sock *sk, u32 seq) { + struct mlx5e_nvmeotcp_queue *queue = + container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, + ulp_ddp_ctx); + + queue->after_resync_cqe = 1; + mlx5e_nvmeotcp_rx_post_static_params_wqe(queue, seq); } struct mlx5e_nvmeotcp_queue * From 227c420911bc5a971607cbae54c39c1d5f049b55 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 30 Apr 2025 08:57:39 +0000 Subject: [PATCH 692/770] net/mlx5e: NVMEoTCP, async ddp invalidation After the ULP has consumed the buffers of the offloaded request, it calls the ddp_teardown op to
release the NIC mapping for them and allow the NIC to reuse the HW contexts associated with offloading this IO. We do a fast/async un-mapping via a UMR WQE. In this case, the ULP holds off completing the request towards the upper/application layers until the HW unmapping is done. When the corresponding CQE is received, a notification is done via the teardown_done ddp callback advertised by the ULP in the ddp context. Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 4 ++ .../mellanox/mlx5/core/en_accel/nvmeotcp.c | 58 +++++++++++++++++-- .../mellanox/mlx5/core/en_accel/nvmeotcp.h | 1 + .../net/ethernet/mellanox/mlx5/core/en_rx.c | 6 ++ 4 files changed, 63 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index 528e5c97f5a43..b9264a7fe5872 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -73,6 +73,7 @@ enum mlx5e_icosq_wqe_type { #endif #ifdef CONFIG_MLX5_EN_NVMEOTCP MLX5E_ICOSQ_WQE_UMR_NVMEOTCP, + MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE, MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP, #endif }; @@ -264,6 +265,9 @@ struct mlx5e_icosq_wqe_info { struct { struct mlx5e_nvmeotcp_queue *queue; } nvmeotcp_q; + struct { + struct mlx5e_nvmeotcp_queue_entry *entry; + } nvmeotcp_qe; #endif }; }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c index 48dd242af2bbc..639a9187d88c5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -173,6 +173,13 @@ build_nvmeotcp_klm_umr(struct mlx5e_nvmeotcp_queue *queue, cseg->qpn_ds = cpu_to_be32((sqn << MLX5_WQE_CTRL_QPN_SHIFT) | ds_cnt); cseg->general_id = cpu_to_be32(id); + if (!klm_entries) { /* this is invalidate */ + ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + ucseg->flags = MLX5_UMR_INLINE; + mkc->status = MLX5_MKEY_STATUS_FREE; + return; + } + if (klm_type == KLM_UMR && !klm_offset) { ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_XLT_OCT_SIZE | MLX5_MKEY_MASK_LEN | @@ -285,8 +292,8 @@ build_nvmeotcp_static_params(struct mlx5e_nvmeotcp_queue *queue, static void mlx5e_nvmeotcp_fill_wi(struct mlx5e_nvmeotcp_queue *nvmeotcp_queue, - struct mlx5e_icosq *sq, u32 wqebbs, u16 pi, - enum wqe_type type) + struct mlx5e_icosq *sq, u32 wqebbs, + u16 pi, u16 ccid, enum wqe_type type) { struct mlx5e_icosq_wqe_info *wi = &sq->db.wqe_info[pi]; @@ -298,6 +305,10 @@ mlx5e_nvmeotcp_fill_wi(struct mlx5e_nvmeotcp_queue *nvmeotcp_queue, wi->wqe_type = MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP; wi->nvmeotcp_q.queue = nvmeotcp_queue; break; + case KLM_INV_UMR: + wi->wqe_type = MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE; + wi->nvmeotcp_qe.entry = &nvmeotcp_queue->ccid_table[ccid]; + break; default: /* cases where no further action is required upon * completion, such as ddp setup @@ -319,7 +330,7 @@ mlx5e_nvmeotcp_rx_post_static_params_wqe(struct mlx5e_nvmeotcp_queue *queue, wqebbs = MLX5E_TRANSPORT_SET_STATIC_PARAMS_WQEBBS; pi = mlx5e_icosq_get_next_pi(sq, wqebbs); wqe = MLX5E_TRANSPORT_FETCH_SET_STATIC_PARAMS_WQE(sq, pi); - mlx5e_nvmeotcp_fill_wi(NULL, sq, wqebbs, pi, BSF_UMR); + mlx5e_nvmeotcp_fill_wi(NULL, sq, wqebbs, pi, 0, BSF_UMR);
build_nvmeotcp_static_params(queue, wqe, resync_seq, queue->crc_rx); sq->pc += wqebbs; mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl); @@ -337,7 +348,7 @@ mlx5e_nvmeotcp_rx_post_progress_params_wqe(struct mlx5e_nvmeotcp_queue *queue, wqebbs = MLX5E_NVMEOTCP_PROGRESS_PARAMS_WQEBBS; pi = mlx5e_icosq_get_next_pi(sq, wqebbs); wqe = MLX5E_NVMEOTCP_FETCH_PROGRESS_PARAMS_WQE(sq, pi); - mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, SET_PSV_UMR); + mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, 0, SET_PSV_UMR); build_nvmeotcp_progress_params(queue, wqe, seq); sq->pc += wqebbs; mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl); @@ -363,7 +374,7 @@ post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, wqebbs = DIV_ROUND_UP(wqe_sz, MLX5_SEND_WQE_BB); pi = mlx5e_icosq_get_next_pi(sq, wqebbs); wqe = MLX5E_NVMEOTCP_FETCH_KLM_WQE(sq, pi); - mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, wqe_type); + mlx5e_nvmeotcp_fill_wi(queue, sq, wqebbs, pi, ccid, wqe_type); build_nvmeotcp_klm_umr(queue, wqe, ccid, cur_klm_entries, klm_offset, klm_length, wqe_type); sq->pc += wqebbs; @@ -378,7 +389,10 @@ mlx5e_nvmeotcp_post_klm_wqe(struct mlx5e_nvmeotcp_queue *queue, struct mlx5e_icosq *sq = &queue->sq; u32 klm_offset = 0, wqes, i; - wqes = DIV_ROUND_UP(klm_length, queue->max_klms_per_wqe); + if (wqe_type == KLM_INV_UMR) + wqes = 1; + else + wqes = DIV_ROUND_UP(klm_length, queue->max_klms_per_wqe); spin_lock_bh(&queue->sq_lock); @@ -905,12 +919,44 @@ void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi) complete(&queue->static_params_done); } +void mlx5e_nvmeotcp_ddp_inv_done(struct mlx5e_icosq_wqe_info *wi) +{ + struct mlx5e_nvmeotcp_queue_entry *q_entry = wi->nvmeotcp_qe.entry; + struct mlx5e_nvmeotcp_queue *queue = q_entry->queue; + struct mlx5_core_dev *mdev = queue->priv->mdev; + struct ulp_ddp_io *ddp = q_entry->ddp; + const struct ulp_ddp_ulp_ops *ulp_ops; + + dma_unmap_sg(mdev->device, ddp->sg_table.sgl, + ddp->nents, DMA_FROM_DEVICE); + + q_entry->sgl_length = 0; + + ulp_ops = inet_csk(queue->sk)->icsk_ulp_ddp_ops; + if (ulp_ops && ulp_ops->ddp_teardown_done) + ulp_ops->ddp_teardown_done(q_entry->ddp_ctx); +} + static void mlx5e_nvmeotcp_ddp_teardown(struct net_device *netdev, struct sock *sk, struct ulp_ddp_io *ddp, void *ddp_ctx) { + struct mlx5e_nvmeotcp_queue_entry *q_entry; + struct mlx5e_nvmeotcp_queue *queue; + + queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, + ulp_ddp_ctx); + q_entry = &queue->ccid_table[ddp->command_id]; + WARN_ONCE(q_entry->sgl_length == 0, + "Invalidation of empty sgl (CID 0x%x, queue 0x%x)\n", + ddp->command_id, queue->id); + + q_entry->ddp_ctx = ddp_ctx; + q_entry->queue = queue; + + mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_INV_UMR, ddp->command_id, 0); } static void diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h index 4850c19e18c7d..67805adc6fdf8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h @@ -113,6 +113,7 @@ void mlx5e_nvmeotcp_cleanup(struct mlx5e_priv *priv); struct mlx5e_nvmeotcp_queue * mlx5e_nvmeotcp_get_queue(struct mlx5e_nvmeotcp *nvmeotcp, int id); void mlx5e_nvmeotcp_put_queue(struct mlx5e_nvmeotcp_queue *queue); +void mlx5e_nvmeotcp_ddp_inv_done(struct mlx5e_icosq_wqe_info *wi); void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi); static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} void 
mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 38c8825d86782..8af51d1886bde 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -958,6 +958,9 @@ void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq) break; #endif #ifdef CONFIG_MLX5_EN_NVMEOTCP + case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE: + mlx5e_nvmeotcp_ddp_inv_done(wi); + break; case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP: mlx5e_nvmeotcp_ctx_complete(wi); break; @@ -1068,6 +1071,9 @@ int mlx5e_poll_ico_cq(struct mlx5e_cq *cq, int budget) #ifdef CONFIG_MLX5_EN_NVMEOTCP case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP: break; + case MLX5E_ICOSQ_WQE_UMR_NVMEOTCP_INVALIDATE: + mlx5e_nvmeotcp_ddp_inv_done(wi); + break; case MLX5E_ICOSQ_WQE_SET_PSV_NVMEOTCP: mlx5e_nvmeotcp_ctx_complete(wi); break; From 924876dc5c7e154841c01ba8bfd5ac692ccc1a5d Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Wed, 30 Apr 2025 08:57:40 +0000 Subject: [PATCH 693/770] net/mlx5e: NVMEoTCP, data-path for DDP+DDGST offload This patch implements the data-path for direct data placement (DDP) and DDGST offloads. NVMEoTCP DDP constructs an SKB from each CQE, while pointing at NVME destination buffers. In turn, this enables the offload, as the NVMe-TCP layer will skip the copy when src == dst. Additionally, this patch adds support for DDGST (CRC32) offload. HW will report DDGST offload only if it has not encountered an error in the received packet. We pass this indication in skb->ulp_crc up the stack to NVMe-TCP to skip computing the DDGST if all corresponding SKBs were verified by HW. This patch also handles context resynchronization requests made by NIC HW. The resync request is passed to the NVMe-TCP layer to be handled at a later point in time. Finally, we also use the skb->no_condense bit to avoid skb_condense. This is critical as every SKB that uses DDP has a hole that fits perfectly with skb_condense's policy, but filling this hole is counter-productive as the data there already resides in its destination buffer. 
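The copy-elision that DDP enables can be pictured with a minimal model: when the NIC has already placed the payload into the destination buffer, source and destination coincide and the copy is skipped. A self-contained C sketch of that idea, not the nvme-tcp code:

#include <stdio.h>
#include <string.h>

/* Toy receive step: copy payload from the skb frag into the request
 * buffer unless DDP already landed it there (src == dst). */
static void recv_to_buffer(const char *src, char *dst, size_t len)
{
	if (src == dst) {
		puts("DDP hit: data already in place, copy skipped");
		return;
	}
	memcpy(dst, src, len);
	puts("no DDP: payload copied");
}

int main(void)
{
	char buf[16] = "payload";
	char other[16];

	recv_to_buffer(buf, buf, 7);	/* offloaded case */
	recv_to_buffer(buf, other, 7);	/* fallback case */
	return 0;
}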
Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../net/ethernet/mellanox/mlx5/core/en/txrx.h | 6 + .../mlx5/core/en_accel/nvmeotcp_rxtx.c | 359 ++++++++++++++++++ .../mlx5/core/en_accel/nvmeotcp_rxtx.h | 39 ++ .../net/ethernet/mellanox/mlx5/core/en_rx.c | 46 ++- 5 files changed, 437 insertions(+), 15 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 7e01096bdde0f..c661d46cd5209 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -110,7 +110,7 @@ mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/ktls_stats.o \ en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \ en_accel/ktls_tx.o en_accel/ktls_rx.o -mlx5_core-$(CONFIG_MLX5_EN_NVMEOTCP) += en_accel/fs_tcp.o en_accel/nvmeotcp.o +mlx5_core-$(CONFIG_MLX5_EN_NVMEOTCP) += en_accel/fs_tcp.o en_accel/nvmeotcp.o en_accel/nvmeotcp_rxtx.o # # SW Steering diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h index b9264a7fe5872..12770cef4560a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h @@ -595,4 +595,10 @@ static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz)); } + +static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) +{ + return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags]; +} + #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c new file mode 100644 index 0000000000000..2b7f0bf7270c3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
+ +#include +#include "en_accel/nvmeotcp_rxtx.h" +#include +#include "en/txrx.h" + +#define MLX5E_TC_FLOW_ID_MASK 0x00ffffff + +static struct mlx5e_frag_page *mlx5e_get_frag(struct mlx5e_rq *rq, + struct mlx5_cqe64 *cqe) +{ + struct mlx5e_frag_page *fp; + + if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) { + u16 wqe_id = be16_to_cpu(cqe->wqe_id); + u16 stride_ix = mpwrq_get_cqe_stride_index(cqe); + u32 wqe_offset = stride_ix << rq->mpwqe.log_stride_sz; + u32 page_idx = wqe_offset >> rq->mpwqe.page_shift; + struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, wqe_id); + union mlx5e_alloc_units *au = &wi->alloc_units; + + fp = &au->frag_pages[page_idx]; + } else { + /* Legacy */ + struct mlx5_wq_cyc *wq = &rq->wqe.wq; + u16 ci = mlx5_wq_cyc_ctr2ix(wq, be16_to_cpu(cqe->wqe_counter)); + struct mlx5e_wqe_frag_info *wi = get_frag(rq, ci); + + fp = wi->frag_page; + } + + return fp; +} + +static void nvmeotcp_update_resync(struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_cqe128 *cqe128) +{ + const struct ulp_ddp_ulp_ops *ulp_ops; + u32 seq; + + seq = be32_to_cpu(cqe128->resync_tcp_sn); + ulp_ops = inet_csk(queue->sk)->icsk_ulp_ddp_ops; + if (ulp_ops && ulp_ops->resync_request) + ulp_ops->resync_request(queue->sk, seq, ULP_DDP_RESYNC_PENDING); +} + +static void mlx5e_nvmeotcp_advance_sgl_iter(struct mlx5e_nvmeotcp_queue *queue) +{ + struct mlx5e_nvmeotcp_queue_entry *nqe; + + nqe = &queue->ccid_table[queue->ccid]; + queue->ccoff += nqe->sgl[queue->ccsglidx].length; + queue->ccoff_inner = 0; + queue->ccsglidx++; +} + +static void +mlx5e_nvmeotcp_add_skb_frag(struct net_device *netdev, struct sk_buff *skb, + struct mlx5e_nvmeotcp_queue *queue, + struct mlx5e_nvmeotcp_queue_entry *nqe, u32 fragsz) +{ + dma_sync_single_for_cpu(&netdev->dev, + nqe->sgl[queue->ccsglidx].offset + + queue->ccoff_inner, + fragsz, DMA_FROM_DEVICE); + + page_ref_inc(compound_head(sg_page(&nqe->sgl[queue->ccsglidx]))); + + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + sg_page(&nqe->sgl[queue->ccsglidx]), + nqe->sgl[queue->ccsglidx].offset + queue->ccoff_inner, + fragsz, + fragsz); +} + +static void +mlx5_nvmeotcp_add_tail_nonlinear(struct sk_buff *skb, skb_frag_t *org_frags, + int org_nr_frags, int frag_index) +{ + while (org_nr_frags != frag_index) { + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + skb_frag_page(&org_frags[frag_index]), + skb_frag_off(&org_frags[frag_index]), + skb_frag_size(&org_frags[frag_index]), + skb_frag_size(&org_frags[frag_index])); + frag_index++; + } +} + +static void +mlx5_nvmeotcp_add_tail(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, + struct mlx5e_nvmeotcp_queue *queue, struct sk_buff *skb, + int offset, int len) +{ + struct mlx5e_frag_page *frag_page = mlx5e_get_frag(rq, cqe); + + frag_page->frags++; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, + virt_to_page(skb->data), offset, len, len); +} + +static void mlx5_nvmeotcp_trim_nonlinear(struct sk_buff *skb, + skb_frag_t *org_frags, + int *frag_index, int remaining) +{ + unsigned int frag_size; + int nr_frags; + + /* skip @remaining bytes in frags */ + *frag_index = 0; + while (remaining) { + frag_size = skb_frag_size(&skb_shinfo(skb)->frags[*frag_index]); + if (frag_size > remaining) { + skb_frag_off_add(&skb_shinfo(skb)->frags[*frag_index], + remaining); + skb_frag_size_sub(&skb_shinfo(skb)->frags[*frag_index], + remaining); + remaining = 0; + } else { + remaining -= frag_size; + skb_frag_unref(skb, *frag_index); + *frag_index += 1; + } + } + + /* save original frags for the tail and unref */ + nr_frags = 
skb_shinfo(skb)->nr_frags; + memcpy(&org_frags[*frag_index], &skb_shinfo(skb)->frags[*frag_index], + (nr_frags - *frag_index) * sizeof(skb_frag_t)); + + /* remove frags from skb */ + skb_shinfo(skb)->nr_frags = 0; + skb->len -= skb->data_len; + skb->truesize -= skb->data_len; + skb->data_len = 0; +} + +static bool +mlx5e_nvmeotcp_rebuild_rx_skb_nonlinear(struct mlx5e_rq *rq, + struct sk_buff *skb, + struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) +{ + int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0; + struct net_device *netdev = rq->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_nvmeotcp_queue_entry *nqe; + skb_frag_t org_frags[MAX_SKB_FRAGS]; + struct mlx5e_nvmeotcp_queue *queue; + int org_nr_frags, frag_index; + struct mlx5e_cqe128 *cqe128; + u32 queue_id; + + queue_id = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); + queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id); + if (unlikely(!queue)) { + dev_kfree_skb_any(skb); + return false; + } + + cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64); + if (cqe_is_nvmeotcp_resync(cqe)) { + nvmeotcp_update_resync(queue, cqe128); + mlx5e_nvmeotcp_put_queue(queue); + return true; + } + + /* If a resync occurred in the previous cqe, + * the current cqe.crcvalid bit may not be valid, + * so we will treat it as 0 + */ + if (unlikely(queue->after_resync_cqe) && + cqe_is_nvmeotcp_crcvalid(cqe)) { + skb->ulp_crc = 0; + queue->after_resync_cqe = 0; + } else { + if (queue->crc_rx) + skb->ulp_crc = cqe_is_nvmeotcp_crcvalid(cqe); + } + + skb->no_condense = cqe_is_nvmeotcp_zc(cqe); + if (!cqe_is_nvmeotcp_zc(cqe)) { + mlx5e_nvmeotcp_put_queue(queue); + return true; + } + + /* cc ddp from cqe */ + ccid = be16_to_cpu(cqe128->ccid); + ccoff = be32_to_cpu(cqe128->ccoff); + cclen = be16_to_cpu(cqe128->cclen); + hlen = be16_to_cpu(cqe128->hlen); + + /* carve a hole in the skb for DDP data */ + org_nr_frags = skb_shinfo(skb)->nr_frags; + mlx5_nvmeotcp_trim_nonlinear(skb, org_frags, &frag_index, cclen); + nqe = &queue->ccid_table[ccid]; + + /* packet starts new ccid? 
*/ + if (queue->ccid != ccid || queue->ccid_gen != nqe->ccid_gen) { + queue->ccid = ccid; + queue->ccoff = 0; + queue->ccoff_inner = 0; + queue->ccsglidx = 0; + queue->ccid_gen = nqe->ccid_gen; + } + + /* skip inside cc until the ccoff in the cqe */ + while (queue->ccoff + queue->ccoff_inner < ccoff) { + remaining = nqe->sgl[queue->ccsglidx].length - + queue->ccoff_inner; + fragsz = min_t(off_t, remaining, + ccoff - (queue->ccoff + queue->ccoff_inner)); + + if (fragsz == remaining) + mlx5e_nvmeotcp_advance_sgl_iter(queue); + else + queue->ccoff_inner += fragsz; + } + + /* adjust the skb according to the cqe cc */ + while (to_copy < cclen) { + remaining = nqe->sgl[queue->ccsglidx].length - + queue->ccoff_inner; + fragsz = min_t(int, remaining, cclen - to_copy); + + mlx5e_nvmeotcp_add_skb_frag(netdev, skb, queue, nqe, fragsz); + to_copy += fragsz; + if (fragsz == remaining) + mlx5e_nvmeotcp_advance_sgl_iter(queue); + else + queue->ccoff_inner += fragsz; + } + + if (cqe_bcnt > hlen + cclen) { + remaining = cqe_bcnt - hlen - cclen; + mlx5_nvmeotcp_add_tail_nonlinear(skb, org_frags, + org_nr_frags, + frag_index); + } + + mlx5e_nvmeotcp_put_queue(queue); + return true; +} + +static bool +mlx5e_nvmeotcp_rebuild_rx_skb_linear(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) +{ + int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0; + struct net_device *netdev = rq->netdev; + struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_nvmeotcp_queue_entry *nqe; + struct mlx5e_nvmeotcp_queue *queue; + struct mlx5e_cqe128 *cqe128; + u32 queue_id; + + queue_id = (be32_to_cpu(cqe->sop_drop_qpn) & MLX5E_TC_FLOW_ID_MASK); + queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id); + if (unlikely(!queue)) { + dev_kfree_skb_any(skb); + return false; + } + + cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64); + if (cqe_is_nvmeotcp_resync(cqe)) { + nvmeotcp_update_resync(queue, cqe128); + mlx5e_nvmeotcp_put_queue(queue); + return true; + } + + /* If a resync occurred in the previous cqe, + * the current cqe.crcvalid bit may not be valid, + * so we will treat it as 0 + */ + if (unlikely(queue->after_resync_cqe) && + cqe_is_nvmeotcp_crcvalid(cqe)) { + skb->ulp_crc = 0; + queue->after_resync_cqe = 0; + } else { + if (queue->crc_rx) + skb->ulp_crc = cqe_is_nvmeotcp_crcvalid(cqe); + } + + skb->no_condense = cqe_is_nvmeotcp_zc(cqe); + if (!cqe_is_nvmeotcp_zc(cqe)) { + mlx5e_nvmeotcp_put_queue(queue); + return true; + } + + /* cc ddp from cqe */ + ccid = be16_to_cpu(cqe128->ccid); + ccoff = be32_to_cpu(cqe128->ccoff); + cclen = be16_to_cpu(cqe128->cclen); + hlen = be16_to_cpu(cqe128->hlen); + + /* carve a hole in the skb for DDP data */ + skb_trim(skb, hlen); + nqe = &queue->ccid_table[ccid]; + + /* packet starts new ccid? 
*/ + if (queue->ccid != ccid || queue->ccid_gen != nqe->ccid_gen) { + queue->ccid = ccid; + queue->ccoff = 0; + queue->ccoff_inner = 0; + queue->ccsglidx = 0; + queue->ccid_gen = nqe->ccid_gen; + } + + /* skip inside cc until the ccoff in the cqe */ + while (queue->ccoff + queue->ccoff_inner < ccoff) { + remaining = nqe->sgl[queue->ccsglidx].length - + queue->ccoff_inner; + fragsz = min_t(off_t, remaining, + ccoff - (queue->ccoff + queue->ccoff_inner)); + + if (fragsz == remaining) + mlx5e_nvmeotcp_advance_sgl_iter(queue); + else + queue->ccoff_inner += fragsz; + } + + /* adjust the skb according to the cqe cc */ + while (to_copy < cclen) { + remaining = nqe->sgl[queue->ccsglidx].length - + queue->ccoff_inner; + fragsz = min_t(int, remaining, cclen - to_copy); + + mlx5e_nvmeotcp_add_skb_frag(netdev, skb, queue, nqe, fragsz); + to_copy += fragsz; + if (fragsz == remaining) + mlx5e_nvmeotcp_advance_sgl_iter(queue); + else + queue->ccoff_inner += fragsz; + } + + if (cqe_bcnt > hlen + cclen) { + remaining = cqe_bcnt - hlen - cclen; + mlx5_nvmeotcp_add_tail(rq, cqe, queue, skb, + offset_in_page(skb->data) + + hlen + cclen, remaining); + } + + mlx5e_nvmeotcp_put_queue(queue); + return true; +} + +bool +mlx5e_nvmeotcp_rebuild_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) +{ + if (skb->data_len) + return mlx5e_nvmeotcp_rebuild_rx_skb_nonlinear(rq, skb, cqe, + cqe_bcnt); + else + return mlx5e_nvmeotcp_rebuild_rx_skb_linear(rq, skb, cqe, + cqe_bcnt); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.h new file mode 100644 index 0000000000000..5f208cc10bfa4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. 
*/ +#ifndef __MLX5E_NVMEOTCP_RXTX_H__ +#define __MLX5E_NVMEOTCP_RXTX_H__ + +#ifdef CONFIG_MLX5_EN_NVMEOTCP + +#include +#include "en_accel/nvmeotcp.h" + +bool +mlx5e_nvmeotcp_rebuild_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt); + +static inline int mlx5_nvmeotcp_get_headlen(struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) +{ + struct mlx5e_cqe128 *cqe128; + + if (!cqe_is_nvmeotcp_zc(cqe)) + return cqe_bcnt; + + cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64); + return be16_to_cpu(cqe128->hlen); +} + +#else + +static inline bool +mlx5e_nvmeotcp_rebuild_rx_skb(struct mlx5e_rq *rq, struct sk_buff *skb, + struct mlx5_cqe64 *cqe, u32 cqe_bcnt) +{ return true; } + +static inline int mlx5_nvmeotcp_get_headlen(struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) +{ return cqe_bcnt; } + +#endif /* CONFIG_MLX5_EN_NVMEOTCP */ +#endif /* __MLX5E_NVMEOTCP_RXTX_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 8af51d1886bde..95639d845757c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -53,7 +53,7 @@ #include "en_accel/macsec.h" #include "en_accel/ipsec_rxtx.h" #include "en_accel/ktls_txrx.h" -#include "en_accel/nvmeotcp.h" +#include "en_accel/nvmeotcp_rxtx.h" #include "en/xdp.h" #include "en/xsk/rx.h" #include "en/health.h" @@ -336,10 +336,6 @@ static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq, mlx5e_page_release_fragmented(rq, frag->frag_page); } -static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix) -{ - return &rq->wqe.frags[ix << rq->wqe.info.log_num_frags]; -} static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe, u16 ix) @@ -1560,7 +1556,7 @@ static inline void mlx5e_handle_csum(struct net_device *netdev, #define MLX5E_CE_BIT_MASK 0x80 -static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, +static inline bool mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, u32 cqe_bcnt, struct mlx5e_rq *rq, struct sk_buff *skb) @@ -1571,6 +1567,14 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, skb->mac_len = ETH_HLEN; + if (IS_ENABLED(CONFIG_MLX5_EN_NVMEOTCP) && cqe_is_nvmeotcp(cqe)) { + bool ret = mlx5e_nvmeotcp_rebuild_rx_skb(rq, skb, cqe, + cqe_bcnt); + + if (unlikely(!ret)) + return ret; + } + if (unlikely(get_cqe_tls_offload(cqe))) mlx5e_ktls_handle_rx_skb(rq, skb, cqe, &cqe_bcnt); @@ -1617,6 +1621,8 @@ static inline void mlx5e_build_rx_skb(struct mlx5_cqe64 *cqe, if (unlikely(mlx5e_skb_is_multicast(skb))) stats->mcast_packets++; + + return true; } static void mlx5e_shampo_complete_rx_cqe(struct mlx5e_rq *rq, @@ -1638,7 +1644,7 @@ static void mlx5e_shampo_complete_rx_cqe(struct mlx5e_rq *rq, } } -static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, +static inline bool mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe, u32 cqe_bcnt, struct sk_buff *skb) @@ -1647,7 +1653,7 @@ static inline void mlx5e_complete_rx_cqe(struct mlx5e_rq *rq, stats->packets++; stats->bytes += cqe_bcnt; - mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb); + return mlx5e_build_rx_skb(cqe, cqe_bcnt, rq, skb); } static inline @@ -1861,7 +1867,8 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) goto wq_cyc_pop; } - mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb))) + goto wq_cyc_pop; if (mlx5e_cqe_regb_chain(cqe)) if (!mlx5e_tc_update_skb_nic(cqe, skb)) { @@ -1908,7 +1915,8 @@ 
static void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe) goto wq_cyc_pop; } - mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb))) + goto wq_cyc_pop; if (rep->vlan && skb_vlan_tag_present(skb)) skb_vlan_pop(skb); @@ -1957,7 +1965,8 @@ static void mlx5e_handle_rx_cqe_mpwrq_rep(struct mlx5e_rq *rq, struct mlx5_cqe64 if (!skb) goto mpwrq_cqe_out; - mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb))) + goto mpwrq_cqe_out; mlx5e_rep_tc_receive(cqe, rq, skb); @@ -1997,13 +2006,19 @@ mlx5e_shampo_fill_skb_data(struct sk_buff *skb, struct mlx5e_rq *rq, } while (data_bcnt); } +static u16 mlx5e_get_headlen_hint(struct mlx5_cqe64 *cqe, u32 cqe_bcnt) +{ + return min_t(u32, MLX5E_RX_MAX_HEAD, + mlx5_nvmeotcp_get_headlen(cqe, cqe_bcnt)); +} + static struct sk_buff * mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, struct mlx5_cqe64 *cqe, u16 cqe_bcnt, u32 head_offset, u32 page_idx) { struct mlx5e_frag_page *frag_page = &wi->alloc_units.frag_pages[page_idx]; - u16 headlen = min_t(u16, MLX5E_RX_MAX_HEAD, cqe_bcnt); + u16 headlen = mlx5e_get_headlen_hint(cqe, cqe_bcnt); struct mlx5e_frag_page *head_page = frag_page; u32 frag_offset = head_offset; u32 byte_cnt = cqe_bcnt; @@ -2429,7 +2444,8 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq if (!skb) goto mpwrq_cqe_out; - mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb))) + goto mpwrq_cqe_out; if (mlx5e_cqe_regb_chain(cqe)) if (!mlx5e_tc_update_skb_nic(cqe, skb)) { @@ -2762,7 +2778,9 @@ static void mlx5e_trap_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe if (!skb) goto wq_cyc_pop; - mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb); + if (unlikely(!mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb))) + goto wq_cyc_pop; + skb_push(skb, ETH_HLEN); mlx5_devlink_trap_report(rq->mdev, trap_id, skb, From 47760b097d6c3dad3760b2890864d2b6e8f0c7cd Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Wed, 30 Apr 2025 08:57:41 +0000 Subject: [PATCH 694/770] net/mlx5e: NVMEoTCP, statistics NVMEoTCP offload statistics include both control and data path statistics: counters for the netdev ddp ops, offloaded packets/bytes, resync and dropped packets. Expose the statistics using ulp_ddp_ops->get_stats() instead of the regular statistics flow.
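For illustration, the per-counter mapping scheme that the new nvmeotcp_stats.c file builds on can be reduced to the following sketch (struct and field names here are placeholders, not the driver's): each table entry pairs an offset in the generic stats struct with an offset in the driver's own counters, so one loop fills every counter and adding a counter is just a new table entry.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-ins for the generic and driver stats structs */
struct ulp_stats { uint64_t rx_setup; uint64_t rx_teardown; };
struct drv_stats { uint64_t setup; uint64_t teardown; };

/* Pair a field in the generic struct with the driver's counter */
struct counter_map { size_t dst_off; size_t src_off; };

#define CNT(dst_fld, src_fld) \
	{ offsetof(struct ulp_stats, dst_fld), \
	  offsetof(struct drv_stats, src_fld) }

static const struct counter_map map[] = {
	CNT(rx_setup, setup),
	CNT(rx_teardown, teardown),
};

/* One generic loop copies every mapped counter */
static void fill_stats(struct ulp_stats *dst, const struct drv_stats *src)
{
	for (size_t i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		*(uint64_t *)((char *)dst + map[i].dst_off) =
			*(const uint64_t *)((const char *)src + map[i].src_off);
}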
Signed-off-by: Ben Ben-Ishay Signed-off-by: Boris Pismenny Signed-off-by: Or Gerlitz Signed-off-by: Yoray Zack Signed-off-by: Shai Malin Signed-off-by: Aurelien Aptel Reviewed-by: Tariq Toukan Signed-off-by: NipaLocal --- .../net/ethernet/mellanox/mlx5/core/Makefile | 3 +- .../mellanox/mlx5/core/en_accel/nvmeotcp.c | 54 ++++++++++++--- .../mellanox/mlx5/core/en_accel/nvmeotcp.h | 17 +++++ .../mlx5/core/en_accel/nvmeotcp_rxtx.c | 11 ++- .../mlx5/core/en_accel/nvmeotcp_stats.c | 69 +++++++++++++++++++ .../ethernet/mellanox/mlx5/core/en_stats.h | 9 +++ 6 files changed, 151 insertions(+), 12 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_stats.c diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index c661d46cd5209..89d07eca52d23 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -110,7 +110,8 @@ mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/ktls_stats.o \ en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \ en_accel/ktls_tx.o en_accel/ktls_rx.o -mlx5_core-$(CONFIG_MLX5_EN_NVMEOTCP) += en_accel/fs_tcp.o en_accel/nvmeotcp.o en_accel/nvmeotcp_rxtx.o +mlx5_core-$(CONFIG_MLX5_EN_NVMEOTCP) += en_accel/fs_tcp.o en_accel/nvmeotcp.o \ + en_accel/nvmeotcp_rxtx.o en_accel/nvmeotcp_stats.o # # SW Steering diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c index 639a9187d88c5..c4cf48141f023 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.c @@ -670,9 +670,15 @@ mlx5e_nvmeotcp_queue_init(struct net_device *netdev, { struct nvme_tcp_ddp_config *nvme_config = &config->nvmeotcp; struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_nvmeotcp_sw_stats *sw_stats; struct mlx5_core_dev *mdev = priv->mdev; struct mlx5e_nvmeotcp_queue *queue; int queue_id, err; + u32 channel_ix; + + channel_ix = mlx5e_get_channel_ix_from_io_cpu(&priv->channels.params, + config->affinity_hint); + sw_stats = &priv->nvmeotcp->sw_stats; if (config->type != ULP_DDP_NVME) { err = -EOPNOTSUPP; @@ -700,12 +706,11 @@ mlx5e_nvmeotcp_queue_init(struct net_device *netdev, queue->id = queue_id; queue->dgst = nvme_config->dgst; queue->pda = nvme_config->cpda; - queue->channel_ix = - mlx5e_get_channel_ix_from_io_cpu(&priv->channels.params, - config->affinity_hint); + queue->channel_ix = channel_ix; queue->size = nvme_config->queue_size; queue->max_klms_per_wqe = MLX5E_MAX_KLM_PER_WQE(mdev); queue->priv = priv; + queue->sw_stats = sw_stats; init_completion(&queue->static_params_done); err = mlx5e_nvmeotcp_queue_rx_init(queue, config, netdev); @@ -717,6 +722,7 @@ mlx5e_nvmeotcp_queue_init(struct net_device *netdev, if (err) goto destroy_rx; + atomic64_inc(&sw_stats->rx_nvmeotcp_sk_add); write_lock_bh(&sk->sk_callback_lock); ulp_ddp_set_ctx(sk, queue); write_unlock_bh(&sk->sk_callback_lock); @@ -730,6 +736,7 @@ mlx5e_nvmeotcp_queue_init(struct net_device *netdev, free_queue: kfree(queue); out: + atomic64_inc(&sw_stats->rx_nvmeotcp_sk_add_fail); return err; } @@ -744,6 +751,8 @@ mlx5e_nvmeotcp_queue_teardown(struct net_device *netdev, queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, ulp_ddp_ctx); + atomic64_inc(&queue->sw_stats->rx_nvmeotcp_sk_del); + WARN_ON(refcount_read(&queue->ref_count) != 1); mlx5e_nvmeotcp_destroy_rx(priv, queue, mdev); @@ -878,25 +887,34 @@ 
mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev, struct ulp_ddp_io *ddp) { struct scatterlist *sg = ddp->sg_table.sgl; + struct mlx5e_nvmeotcp_sw_stats *sw_stats; struct mlx5e_nvmeotcp_queue_entry *nvqt; struct mlx5e_nvmeotcp_queue *queue; struct mlx5_core_dev *mdev; int i, size = 0, count = 0; + int ret = 0; queue = container_of(ulp_ddp_get_ctx(sk), struct mlx5e_nvmeotcp_queue, ulp_ddp_ctx); + sw_stats = queue->sw_stats; mdev = queue->priv->mdev; count = dma_map_sg(mdev->device, ddp->sg_table.sgl, ddp->nents, DMA_FROM_DEVICE); - if (count <= 0) - return -EINVAL; + if (count <= 0) { + ret = -EINVAL; + goto ddp_setup_fail; + } - if (WARN_ON(count > mlx5e_get_max_sgl(mdev))) - return -ENOSPC; + if (WARN_ON(count > mlx5e_get_max_sgl(mdev))) { + ret = -ENOSPC; + goto ddp_setup_fail; + } - if (!mlx5e_nvmeotcp_validate_sgl(sg, count, READ_ONCE(netdev->mtu))) - return -EOPNOTSUPP; + if (!mlx5e_nvmeotcp_validate_sgl(sg, count, READ_ONCE(netdev->mtu))) { + ret = -EOPNOTSUPP; + goto ddp_setup_fail; + } for (i = 0; i < count; i++) size += sg_dma_len(&sg[i]); @@ -908,8 +926,14 @@ mlx5e_nvmeotcp_ddp_setup(struct net_device *netdev, nvqt->ccid_gen++; nvqt->sgl_length = count; mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_UMR, ddp->command_id, count); - + atomic64_inc(&sw_stats->rx_nvmeotcp_ddp_setup); return 0; + +ddp_setup_fail: + dma_unmap_sg(mdev->device, ddp->sg_table.sgl, ddp->nents, + DMA_FROM_DEVICE); + atomic64_inc(&sw_stats->rx_nvmeotcp_ddp_setup_fail); + return ret; } void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi) @@ -957,6 +981,7 @@ mlx5e_nvmeotcp_ddp_teardown(struct net_device *netdev, q_entry->queue = queue; mlx5e_nvmeotcp_post_klm_wqe(queue, KLM_INV_UMR, ddp->command_id, 0); + atomic64_inc(&queue->sw_stats->rx_nvmeotcp_ddp_teardown); } static void @@ -991,6 +1016,14 @@ void mlx5e_nvmeotcp_put_queue(struct mlx5e_nvmeotcp_queue *queue) } } +static int mlx5e_ulp_ddp_get_stats(struct net_device *dev, + struct ulp_ddp_stats *stats) +{ + struct mlx5e_priv *priv = netdev_priv(dev); + + return mlx5e_nvmeotcp_get_stats(priv, stats); +} + int set_ulp_ddp_nvme_tcp(struct net_device *netdev, bool enable) { struct mlx5e_priv *priv = netdev_priv(netdev); @@ -1101,6 +1134,7 @@ const struct ulp_ddp_dev_ops mlx5e_nvmeotcp_ops = { .resync = mlx5e_nvmeotcp_ddp_resync, .set_caps = mlx5e_ulp_ddp_set_caps, .get_caps = mlx5e_ulp_ddp_get_caps, + .get_stats = mlx5e_ulp_ddp_get_stats, }; void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h index 67805adc6fdf8..54b0b39825b93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp.h @@ -9,6 +9,15 @@ #include "en.h" #include "en/params.h" +struct mlx5e_nvmeotcp_sw_stats { + atomic64_t rx_nvmeotcp_sk_add; + atomic64_t rx_nvmeotcp_sk_add_fail; + atomic64_t rx_nvmeotcp_sk_del; + atomic64_t rx_nvmeotcp_ddp_setup; + atomic64_t rx_nvmeotcp_ddp_setup_fail; + atomic64_t rx_nvmeotcp_ddp_teardown; +}; + struct mlx5e_nvmeotcp_queue_entry { struct mlx5e_nvmeotcp_queue *queue; u32 sgl_length; @@ -52,6 +61,7 @@ struct mlx5e_nvmeotcp_queue_handler { * @sk: The socket used by the NVMe-TCP queue * @crc_rx: CRC Rx offload indication for this queue * @priv: mlx5e netdev priv + * @sw_stats: Global software statistics for nvmeotcp offload * @static_params_done: Async completion structure for the initial umr * mapping synchronization * @sq_lock: Spin lock for 
the icosq @@ -90,6 +100,7 @@ struct mlx5e_nvmeotcp_queue { u8 crc_rx:1; /* for ddp invalidate flow */ struct mlx5e_priv *priv; + struct mlx5e_nvmeotcp_sw_stats *sw_stats; /* end of data-path section */ struct completion static_params_done; @@ -101,6 +112,7 @@ struct mlx5e_nvmeotcp_queue { }; struct mlx5e_nvmeotcp { + struct mlx5e_nvmeotcp_sw_stats sw_stats; struct ida queue_ids; struct rhashtable queue_hash; struct ulp_ddp_dev_caps ddp_caps; @@ -117,6 +129,8 @@ void mlx5e_nvmeotcp_ddp_inv_done(struct mlx5e_icosq_wqe_info *wi); void mlx5e_nvmeotcp_ctx_complete(struct mlx5e_icosq_wqe_info *wi); static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv); +int mlx5e_nvmeotcp_get_stats(struct mlx5e_priv *priv, + struct ulp_ddp_stats *stats); extern const struct ulp_ddp_dev_ops mlx5e_nvmeotcp_ops; #else @@ -126,5 +140,8 @@ static inline int set_ulp_ddp_nvme_tcp(struct net_device *dev, bool en) { return -EOPNOTSUPP; } static inline void mlx5e_nvmeotcp_init_rx(struct mlx5e_priv *priv) {} static inline void mlx5e_nvmeotcp_cleanup_rx(struct mlx5e_priv *priv) {} +static inline int mlx5e_nvmeotcp_get_stats(struct mlx5e_priv *priv, + struct ulp_ddp_stats *stats) +{ return 0; } #endif #endif /* __MLX5E_NVMEOTCP_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c index 2b7f0bf7270c3..ec621d886a398 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_rxtx.c @@ -146,6 +146,7 @@ mlx5e_nvmeotcp_rebuild_rx_skb_nonlinear(struct mlx5e_rq *rq, int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0; struct net_device *netdev = rq->netdev; struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rq_stats *stats = rq->stats; struct mlx5e_nvmeotcp_queue_entry *nqe; skb_frag_t org_frags[MAX_SKB_FRAGS]; struct mlx5e_nvmeotcp_queue *queue; @@ -157,12 +158,14 @@ mlx5e_nvmeotcp_rebuild_rx_skb_nonlinear(struct mlx5e_rq *rq, queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id); if (unlikely(!queue)) { dev_kfree_skb_any(skb); + stats->nvmeotcp_drop++; return false; } cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64); if (cqe_is_nvmeotcp_resync(cqe)) { nvmeotcp_update_resync(queue, cqe128); + stats->nvmeotcp_resync++; mlx5e_nvmeotcp_put_queue(queue); return true; } @@ -239,7 +242,8 @@ mlx5e_nvmeotcp_rebuild_rx_skb_nonlinear(struct mlx5e_rq *rq, org_nr_frags, frag_index); } - + stats->nvmeotcp_packets++; + stats->nvmeotcp_bytes += cclen; mlx5e_nvmeotcp_put_queue(queue); return true; } @@ -251,6 +255,7 @@ mlx5e_nvmeotcp_rebuild_rx_skb_linear(struct mlx5e_rq *rq, struct sk_buff *skb, int ccoff, cclen, hlen, ccid, remaining, fragsz, to_copy = 0; struct net_device *netdev = rq->netdev; struct mlx5e_priv *priv = netdev_priv(netdev); + struct mlx5e_rq_stats *stats = rq->stats; struct mlx5e_nvmeotcp_queue_entry *nqe; struct mlx5e_nvmeotcp_queue *queue; struct mlx5e_cqe128 *cqe128; @@ -260,12 +265,14 @@ mlx5e_nvmeotcp_rebuild_rx_skb_linear(struct mlx5e_rq *rq, struct sk_buff *skb, queue = mlx5e_nvmeotcp_get_queue(priv->nvmeotcp, queue_id); if (unlikely(!queue)) { dev_kfree_skb_any(skb); + stats->nvmeotcp_drop++; return false; } cqe128 = container_of(cqe, struct mlx5e_cqe128, cqe64); if (cqe_is_nvmeotcp_resync(cqe)) { nvmeotcp_update_resync(queue, cqe128); + stats->nvmeotcp_resync++; mlx5e_nvmeotcp_put_queue(queue); return true; } @@ -342,6 +349,8 @@ 
mlx5e_nvmeotcp_rebuild_rx_skb_linear(struct mlx5e_rq *rq, struct sk_buff *skb, hlen + cclen, remaining); } + stats->nvmeotcp_packets++; + stats->nvmeotcp_bytes += cclen; mlx5e_nvmeotcp_put_queue(queue); return true; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_stats.c new file mode 100644 index 0000000000000..7b0dad4fa272f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/nvmeotcp_stats.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. + +#include "en_accel/nvmeotcp.h" + +struct ulp_ddp_counter_map { + size_t eth_offset; + size_t mlx_offset; +}; + +#define DECLARE_ULP_SW_STAT(fld) \ + { offsetof(struct ulp_ddp_stats, fld), \ + offsetof(struct mlx5e_nvmeotcp_sw_stats, fld) } + +#define DECLARE_ULP_RQ_STAT(fld) \ + { offsetof(struct ulp_ddp_stats, rx_ ## fld), \ + offsetof(struct mlx5e_rq_stats, fld) } + +#define READ_CTR_ATOMIC64(ptr, dsc, i) \ + atomic64_read((atomic64_t *)((char *)(ptr) + (dsc)[i].mlx_offset)) + +#define READ_CTR(ptr, desc, i) \ + (*((u64 *)((char *)(ptr) + (desc)[i].mlx_offset))) + +#define SET_ULP_STAT(ptr, desc, i, val) \ + (*(u64 *)((char *)(ptr) + (desc)[i].eth_offset) = (val)) + +/* Global counters */ +static const struct ulp_ddp_counter_map sw_stats_desc[] = { + DECLARE_ULP_SW_STAT(rx_nvmeotcp_sk_add), + DECLARE_ULP_SW_STAT(rx_nvmeotcp_sk_del), + DECLARE_ULP_SW_STAT(rx_nvmeotcp_ddp_setup), + DECLARE_ULP_SW_STAT(rx_nvmeotcp_ddp_setup_fail), + DECLARE_ULP_SW_STAT(rx_nvmeotcp_ddp_teardown), +}; + +/* Per-rx-queue counters */ +static const struct ulp_ddp_counter_map rq_stats_desc[] = { + DECLARE_ULP_RQ_STAT(nvmeotcp_drop), + DECLARE_ULP_RQ_STAT(nvmeotcp_resync), + DECLARE_ULP_RQ_STAT(nvmeotcp_packets), + DECLARE_ULP_RQ_STAT(nvmeotcp_bytes), +}; + +int mlx5e_nvmeotcp_get_stats(struct mlx5e_priv *priv, + struct ulp_ddp_stats *stats) +{ + unsigned int i, ch, n = 0; + + if (!priv->nvmeotcp) + return 0; + + for (i = 0; i < ARRAY_SIZE(sw_stats_desc); i++, n++) + SET_ULP_STAT(stats, sw_stats_desc, i, + READ_CTR_ATOMIC64(&priv->nvmeotcp->sw_stats, + sw_stats_desc, i)); + + for (i = 0; i < ARRAY_SIZE(rq_stats_desc); i++, n++) { + u64 sum = 0; + + for (ch = 0; ch < priv->stats_nch; ch++) + sum += READ_CTR(&priv->channel_stats[ch]->rq, + rq_stats_desc, i); + + SET_ULP_STAT(stats, rq_stats_desc, i, sum); + } + + return n; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h index def5dea1463db..e350d5d04c215 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h @@ -132,6 +132,9 @@ void mlx5e_stats_ts_get(struct mlx5e_priv *priv, struct ethtool_ts_stats *ts_stats); void mlx5e_get_link_ext_stats(struct net_device *dev, struct ethtool_link_ext_stats *stats); +struct ulp_ddp_stats; +void mlx5e_stats_ulp_ddp_get(struct mlx5e_priv *priv, + struct ulp_ddp_stats *stats); /* Concrete NIC Stats */ @@ -406,6 +409,12 @@ struct mlx5e_rq_stats { u64 tls_resync_res_skip; u64 tls_err; #endif +#ifdef CONFIG_MLX5_EN_NVMEOTCP + u64 nvmeotcp_drop; + u64 nvmeotcp_resync; + u64 nvmeotcp_packets; + u64 nvmeotcp_bytes; +#endif }; struct mlx5e_sq_stats { From 5d329bd084d26fd8092f7e42485e659a91448f39 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 30 Apr 2025 13:02:40 +0300 Subject: [PATCH 695/770] ipv4: Honor "ignore_routes_with_linkdown" sysctl in nexthop selection Commit 
32607a332cfe ("ipv4: prefer multipath nexthop that matches source address") changed IPv4 nexthop selection to prefer a nexthop whose nexthop device is assigned the specified source address for locally generated traffic. While the selection honors the "fib_multipath_use_neigh" sysctl and will not choose a nexthop with an invalid neighbour, it does not honor the "ignore_routes_with_linkdown" sysctl and can choose a nexthop without a carrier: $ sysctl net.ipv4.conf.all.ignore_routes_with_linkdown net.ipv4.conf.all.ignore_routes_with_linkdown = 1 $ ip route show 198.51.100.0/24 198.51.100.0/24 nexthop via 192.0.2.2 dev dummy1 weight 1 nexthop via 192.0.2.18 dev dummy2 weight 1 dead linkdown $ ip route get 198.51.100.1 from 192.0.2.17 198.51.100.1 from 192.0.2.17 via 192.0.2.18 dev dummy2 uid 0 Solve this by skipping over nexthops whose assigned hash upper bound is minus one, which is the value assigned to nexthops that do not have a carrier when the "ignore_routes_with_linkdown" sysctl is set. In practice, this probably does not matter a lot as the initial route lookup for the source address would not choose a nexthop that does not have a carrier in the first place, but the change does make the code clearer. Signed-off-by: Ido Schimmel Reviewed-by: Willem de Bruijn Reviewed-by: David Ahern Signed-off-by: NipaLocal --- net/ipv4/fib_semantics.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 03959c60d1285..dabe2b7044ab8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -2188,7 +2188,14 @@ void fib_select_multipath(struct fib_result *res, int hash, saddr = fl4 ? fl4->saddr : 0; change_nexthops(fi) { - if (use_neigh && !fib_good_nh(nexthop_nh)) + int nh_upper_bound; + + /* Nexthops without a carrier are assigned an upper bound of + * minus one when "ignore_routes_with_linkdown" is set. + */ + nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); + if (nh_upper_bound == -1 || + (use_neigh && !fib_good_nh(nexthop_nh))) continue; if (!found) { @@ -2197,7 +2204,7 @@ void fib_select_multipath(struct fib_result *res, int hash, found = !saddr || nexthop_nh->nh_saddr == saddr; } - if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) + if (hash > nh_upper_bound) continue; if (!saddr || nexthop_nh->nh_saddr == saddr) { From 9498266d9cdc43f4a0b7643937bce459acaca911 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:19 +0200 Subject: [PATCH 696/770] dt-bindings: dpll: Add DPLL device and pin Add a common DT schema for a DPLL device and its associated pins. The DPLL (digital phase-locked loop) is a device used for precise clock synchronization in networking and telecom hardware. The device includes one or more DPLLs (channels) and one or more physical input/output pins. Each DPLL channel is used either to provide a pulse-per-second signal or to drive an Ethernet equipment clock.
The input and output pins have the following properties: * label: specifies the board label * connection type: specifies the pin's usage depending on wiring * list of supported or allowed frequencies: depending on how and where the pin is connected * embedded sync capability: indicates whether the pin supports this Reviewed-by: Krzysztof Kozlowski Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- .../devicetree/bindings/dpll/dpll-device.yaml | 76 +++++++++++++++++++ .../devicetree/bindings/dpll/dpll-pin.yaml | 45 +++++++++++ MAINTAINERS | 2 + 3 files changed, 123 insertions(+) create mode 100644 Documentation/devicetree/bindings/dpll/dpll-device.yaml create mode 100644 Documentation/devicetree/bindings/dpll/dpll-pin.yaml diff --git a/Documentation/devicetree/bindings/dpll/dpll-device.yaml b/Documentation/devicetree/bindings/dpll/dpll-device.yaml new file mode 100644 index 0000000000000..fb8d7a9a3693f --- /dev/null +++ b/Documentation/devicetree/bindings/dpll/dpll-device.yaml @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dpll/dpll-device.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Digital Phase-Locked Loop (DPLL) Device + +maintainers: + - Ivan Vecera + +description: + A Digital Phase-Locked Loop (DPLL) device is used for precise clock + synchronization in networking and telecom hardware. The device can + have one or more channels (DPLLs) and one or more physical input and + output pins. Each DPLL channel can either produce a pulse-per-second signal + or drive an Ethernet equipment clock. The type of each channel can be + indicated by the dpll-types property. + +properties: + $nodename: + pattern: "^dpll(@.*)?$" + + "#address-cells": + const: 0 + + "#size-cells": + const: 0 + + dpll-types: + description: List of DPLL channel types, one per DPLL instance. + $ref: /schemas/types.yaml#/definitions/non-unique-string-array + items: + enum: [pps, eec] + + input-pins: + type: object + description: DPLL input pins + unevaluatedProperties: false + + properties: + "#address-cells": + const: 1 + "#size-cells": + const: 0 + + patternProperties: + "^pin@[0-9a-f]+$": + $ref: /schemas/dpll/dpll-pin.yaml + unevaluatedProperties: false + + required: + - "#address-cells" + - "#size-cells" + + output-pins: + type: object + description: DPLL output pins + unevaluatedProperties: false + + properties: + "#address-cells": + const: 1 + "#size-cells": + const: 0 + + patternProperties: + "^pin@[0-9a-f]+$": + $ref: /schemas/dpll/dpll-pin.yaml + unevaluatedProperties: false + + required: + - "#address-cells" + - "#size-cells" + +additionalProperties: true diff --git a/Documentation/devicetree/bindings/dpll/dpll-pin.yaml b/Documentation/devicetree/bindings/dpll/dpll-pin.yaml new file mode 100644 index 0000000000000..51db93b77306f --- /dev/null +++ b/Documentation/devicetree/bindings/dpll/dpll-pin.yaml @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dpll/dpll-pin.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: DPLL Pin + +maintainers: + - Ivan Vecera + +description: | + The DPLL pin is either a physical input or output pin that is provided + by a DPLL (Digital Phase-Locked Loop) device. The pin is identified by + its physical order number that is stored in the reg property and can have + an additional set of properties like supported (allowed) frequencies, + label, type and may support embedded sync.
+ + Note that the pin in this context has nothing to do with pinctrl. + +properties: + reg: + description: Hardware index of the DPLL pin. + maxItems: 1 + + connection-type: + description: Connection type of the pin + $ref: /schemas/types.yaml#/definitions/string + enum: [ext, gnss, int, mux, synce] + + esync-control: + description: Indicates whether the pin supports embedded sync functionality. + type: boolean + + label: + description: String exposed as the pin board label + $ref: /schemas/types.yaml#/definitions/string + + supported-frequencies-hz: + description: List of supported frequencies for this pin, expressed in Hz. + +required: + - reg + +additionalProperties: false diff --git a/MAINTAINERS b/MAINTAINERS index 5c31814c96874..23e71450cd841 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7196,6 +7196,8 @@ M: Arkadiusz Kubalewski M: Jiri Pirko L: netdev@vger.kernel.org S: Supported +F: Documentation/devicetree/bindings/dpll/dpll-device.yaml +F: Documentation/devicetree/bindings/dpll/dpll-pin.yaml F: Documentation/driver-api/dpll.rst F: drivers/dpll/* F: include/linux/dpll.h From 51836af0543311da2d7ced614c9c151facd789a6 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:20 +0200 Subject: [PATCH 697/770] dt-bindings: dpll: Add support for Microchip Azurite chip family Add DT bindings for Microchip Azurite DPLL chip family. These chips provide up to 5 independent DPLL channels, 10 differential or single-ended inputs and 10 differential or 20 single-ended outputs. They can be connected via I2C or SPI busses. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- .../bindings/dpll/microchip,zl30731.yaml | 115 ++++++++++++++++++ 1 file changed, 115 insertions(+) create mode 100644 Documentation/devicetree/bindings/dpll/microchip,zl30731.yaml diff --git a/Documentation/devicetree/bindings/dpll/microchip,zl30731.yaml b/Documentation/devicetree/bindings/dpll/microchip,zl30731.yaml new file mode 100644 index 0000000000000..17747f754b845 --- /dev/null +++ b/Documentation/devicetree/bindings/dpll/microchip,zl30731.yaml @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dpll/microchip,zl30731.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip Azurite DPLL device + +maintainers: + - Ivan Vecera + +description: + Microchip Azurite DPLL (ZL3073x) is a family of DPLL devices that + provides up to 5 independent DPLL channels, up to 10 differential or + single-ended inputs and 10 differential or 20 single-ended outputs. + These devices support both I2C and SPI interfaces. 
+ +properties: + compatible: + enum: + - microchip,zl30731 + - microchip,zl30732 + - microchip,zl30733 + - microchip,zl30734 + - microchip,zl30735 + + reg: + maxItems: 1 + +required: + - compatible + - reg + +allOf: + - $ref: /schemas/dpll/dpll-device.yaml# + - $ref: /schemas/spi/spi-peripheral-props.yaml# + +unevaluatedProperties: false + +examples: + - | + i2c { + #address-cells = <1>; + #size-cells = <0>; + + dpll@70 { + compatible = "microchip,zl30732"; + reg = <0x70>; + dpll-types = "pps", "eec"; + + input-pins { + #address-cells = <1>; + #size-cells = <0>; + + pin@0 { /* REF0P */ + reg = <0>; + connection-type = "ext"; + label = "Input 0"; + supported-frequencies-hz = /bits/ 64 <1 1000>; + }; + }; + + output-pins { + #address-cells = <1>; + #size-cells = <0>; + + pin@3 { /* OUT1N */ + reg = <3>; + connection-type = "gnss"; + esync-control; + label = "Output 1"; + supported-frequencies-hz = /bits/ 64 <1 10000>; + }; + }; + }; + }; + - | + spi { + #address-cells = <1>; + #size-cells = <0>; + + dpll@70 { + compatible = "microchip,zl30731"; + reg = <0x70>; + spi-max-frequency = <12500000>; + + dpll-types = "pps"; + + input-pins { + #address-cells = <1>; + #size-cells = <0>; + + pin@0 { /* REF0P */ + reg = <0>; + connection-type = "ext"; + label = "Input 0"; + supported-frequencies-hz = /bits/ 64 <1 1000>; + }; + }; + + output-pins { + #address-cells = <1>; + #size-cells = <0>; + + pin@3 { /* OUT1N */ + reg = <3>; + connection-type = "gnss"; + esync-control; + label = "Output 1"; + supported-frequencies-hz = /bits/ 64 <1 10000>; + }; + }; + }; + }; +... From 4ec882ce5504da96c03d7df361c45e82caf46859 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:21 +0200 Subject: [PATCH 698/770] mfd: Add Microchip ZL3073x support Add a base MFD driver for the Microchip Azurite ZL3073x chip family. These chips provide DPLL and PHC (PTP) functionality and can be connected over an I2C or SPI bus. The MFD driver handles basic communication and synchronization over the bus, as well as common functionality that is used by the DPLL driver (part 2) and by the PTP driver (to be added later). The chip family has the following characteristics: * up to 5 separate DPLL units (channels) * 5 synthesizers * 10 input pins (references) * 10 outputs * 20 output pins (each output pin pair shares one output) * Each reference and output can operate in either differential or single-ended mode (differential mode uses 2 pins) * Each output is connected to one of the synthesizers * Each synthesizer is driven by one of the DPLL units The device uses 7-bit addresses and 8-bit values. It exposes 8-, 16-, 32- and 48-bit registers in the address range <0x000,0x77F>. Due to the 7-bit addressing, the range is organized into pages of 128 bytes, with each page containing a page selector register at address 0x7F. For reading/writing multi-byte registers, the device supports bulk transfers.
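To make the paging scheme concrete, here is a sketch of how a flat register address in <0x000,0x77F> splits into a page and an in-page offset through the selector at 0x7F. The constants come from the commit message and the helper name is illustrative; the driver itself lets regmap's range configuration perform this split transparently, as the diff below shows.

#include <linux/regmap.h>

#define ZL_SKETCH_PAGE_LEN	0x80	/* 128-byte pages */
#define ZL_SKETCH_PAGE_SEL	0x7f	/* page selector register */

/* Illustrative only: bulk-read a multi-byte register at a flat
 * address by writing its page to the selector first. Multi-byte
 * values are big-endian on the wire.
 */
static int zl_sketch_bulk_read(struct regmap *map, unsigned int addr,
			       void *buf, size_t len)
{
	int rc;

	rc = regmap_write(map, ZL_SKETCH_PAGE_SEL, addr / ZL_SKETCH_PAGE_LEN);
	if (rc)
		return rc;

	return regmap_bulk_read(map, addr % ZL_SKETCH_PAGE_LEN, buf, len);
}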
Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- MAINTAINERS | 9 + drivers/mfd/Kconfig | 30 ++ drivers/mfd/Makefile | 5 + drivers/mfd/zl3073x-core.c | 472 +++++++++++++++++++++++++++++++ drivers/mfd/zl3073x-i2c.c | 66 +++++ drivers/mfd/zl3073x-regs.h | 17 ++ drivers/mfd/zl3073x-spi.c | 66 +++++ drivers/mfd/zl3073x.h | 31 ++ include/linux/mfd/zl3073x-regs.h | 66 +++++ include/linux/mfd/zl3073x.h | 35 +++ 10 files changed, 797 insertions(+) create mode 100644 drivers/mfd/zl3073x-core.c create mode 100644 drivers/mfd/zl3073x-i2c.c create mode 100644 drivers/mfd/zl3073x-regs.h create mode 100644 drivers/mfd/zl3073x-spi.c create mode 100644 drivers/mfd/zl3073x.h create mode 100644 include/linux/mfd/zl3073x-regs.h create mode 100644 include/linux/mfd/zl3073x.h diff --git a/MAINTAINERS b/MAINTAINERS index 23e71450cd841..deff813bf966b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16059,6 +16059,15 @@ L: linux-wireless@vger.kernel.org S: Supported F: drivers/net/wireless/microchip/ +MICROCHIP ZL3073X DRIVER +M: Ivan Vecera +M: Prathosh Satish +L: netdev@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/dpll/microchip,zl30731.yaml +F: drivers/mfd/zl3073x* +F: include/linux/mfd/zl3073x.h + MICROSEMI MIPS SOCS M: Alexandre Belloni M: UNGLinuxDriver@microchip.com diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 22b9363100394..7d7902ec1d89a 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2422,5 +2422,35 @@ config MFD_UPBOARD_FPGA To compile this driver as a module, choose M here: the module will be called upboard-fpga. +config MFD_ZL3073X_CORE + tristate + select MFD_CORE + +config MFD_ZL3073X_I2C + tristate "Microchip Azurite DPLL/PTP/SyncE with I2C" + depends on I2C + select MFD_ZL3073X_CORE + select REGMAP_I2C + help + Say Y here if you want to build I2C support for the Microchip + Azurite DPLL/PTP/SyncE chip family. + + To compile this driver as a module, choose M here: the module + will be called zl3073x_i2c and you will also get zl3073x for + the core module. + +config MFD_ZL3073X_SPI + tristate "Microchip Azurite DPLL/PTP/SyncE with SPI" + depends on SPI + select MFD_ZL3073X_CORE + select REGMAP_SPI + help + Say Y here if you want to build SPI support for the Microchip + Azurite DPLL/PTP/SyncE chip family. + + To compile this driver as a module, choose M here: the module + will be called zl3073x_spi and you will also get zl3073x for + the core module. 
+ endmenu endif diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index 948cbdf42a18b..76e2babc1538f 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -290,3 +290,8 @@ obj-$(CONFIG_MFD_RSMU_I2C) += rsmu_i2c.o rsmu_core.o obj-$(CONFIG_MFD_RSMU_SPI) += rsmu_spi.o rsmu_core.o obj-$(CONFIG_MFD_UPBOARD_FPGA) += upboard-fpga.o + +zl3073x-y := zl3073x-core.o +obj-$(CONFIG_MFD_ZL3073X_CORE) += zl3073x.o +obj-$(CONFIG_MFD_ZL3073X_I2C) += zl3073x-i2c.o +obj-$(CONFIG_MFD_ZL3073X_SPI) += zl3073x-spi.o diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c new file mode 100644 index 0000000000000..c408aafb0f8ac --- /dev/null +++ b/drivers/mfd/zl3073x-core.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zl3073x.h" +#include "zl3073x-regs.h" + +/* Chip IDs for zl30731 */ +static const u16 zl30731_ids[] = { + 0x0E93, + 0x1E93, + 0x2E93, +}; + +/* Chip IDs for zl30732 */ +static const u16 zl30732_ids[] = { + 0x0E30, + 0x0E94, + 0x1E94, + 0x1F60, + 0x2E94, + 0x3FC4, +}; + +/* Chip IDs for zl30733 */ +static const u16 zl30733_ids[] = { + 0x0E95, + 0x1E95, + 0x2E95, +}; + +/* Chip IDs for zl30734 */ +static const u16 zl30734_ids[] = { + 0x0E96, + 0x1E96, + 0x2E96, +}; + +/* Chip IDs for zl30735 */ +static const u16 zl30735_ids[] = { + 0x0E97, + 0x1E97, + 0x2E97, +}; + +const struct zl3073x_chip_info zl3073x_chip_info[] = { + [ZL30731] = { + .ids = zl30731_ids, + .num_ids = ARRAY_SIZE(zl30731_ids), + .num_channels = 1, + }, + [ZL30732] = { + .ids = zl30732_ids, + .num_ids = ARRAY_SIZE(zl30732_ids), + .num_channels = 2, + }, + [ZL30733] = { + .ids = zl30733_ids, + .num_ids = ARRAY_SIZE(zl30733_ids), + .num_channels = 3, + }, + [ZL30734] = { + .ids = zl30734_ids, + .num_ids = ARRAY_SIZE(zl30734_ids), + .num_channels = 4, + }, + [ZL30735] = { + .ids = zl30735_ids, + .num_ids = ARRAY_SIZE(zl30735_ids), + .num_channels = 5, + }, +}; +EXPORT_SYMBOL_NS_GPL(zl3073x_chip_info, "ZL3073X"); + +#define ZL_RANGE_OFFSET 0x80 +#define ZL_PAGE_SIZE 0x80 +#define ZL_NUM_PAGES 15 +#define ZL_PAGE_SEL 0x7F +#define ZL_PAGE_SEL_MASK GENMASK(3, 0) +#define ZL_NUM_REGS (ZL_NUM_PAGES * ZL_PAGE_SIZE) + +/* Regmap range configuration */ +static const struct regmap_range_cfg zl3073x_regmap_range = { + .range_min = ZL_RANGE_OFFSET, + .range_max = ZL_RANGE_OFFSET + ZL_NUM_REGS - 1, + .selector_reg = ZL_PAGE_SEL, + .selector_mask = ZL_PAGE_SEL_MASK, + .selector_shift = 0, + .window_start = 0, + .window_len = ZL_PAGE_SIZE, +}; + +static bool +zl3073x_is_volatile_reg(struct device *dev __maybe_unused, unsigned int reg) +{ + /* Only page selector is non-volatile */ + return reg != ZL_PAGE_SEL; +} + +static const struct regmap_config zl3073x_regmap_config = { + .reg_bits = 8, + .val_bits = 8, + .max_register = ZL_RANGE_OFFSET + ZL_NUM_REGS - 1, + .ranges = &zl3073x_regmap_range, + .num_ranges = 1, + .cache_type = REGCACHE_MAPLE, + .volatile_reg = zl3073x_is_volatile_reg, +}; + +static bool +zl3073x_check_reg(struct zl3073x_dev *zldev, unsigned int reg, size_t size) +{ + /* Check the index is in valid range for indexed register */ + if (ZL_REG_OFFSET(reg) > ZL_REG_MAX_OFFSET(reg)) { + dev_err(zldev->dev, "Index out of range for reg 0x%04lx\n", + ZL_REG_ADDR(reg)); + return false; + } + /* Check the requested size corresponds to register size */ + if (ZL_REG_SIZE(reg) != size) { + dev_err(zldev->dev, "Invalid size %zu for reg 0x%04lx\n", + size, ZL_REG_ADDR(reg)); + return false; + } + 
+ return true; +} + +static int +zl3073x_read_reg(struct zl3073x_dev *zldev, unsigned int reg, void *val, + size_t size) +{ + int rc; + + if (!zl3073x_check_reg(zldev, reg, size)) + return -EINVAL; + + /* Map the register address to virtual range */ + reg = ZL_REG_ADDR(reg) + ZL_RANGE_OFFSET; + + rc = regmap_bulk_read(zldev->regmap, reg, val, size); + if (rc) { + dev_err(zldev->dev, "Failed to read reg 0x%04x: %pe\n", reg, + ERR_PTR(rc)); + return rc; + } + + return 0; +} + +static int +zl3073x_write_reg(struct zl3073x_dev *zldev, unsigned int reg, const void *val, + size_t size) +{ + int rc; + + if (!zl3073x_check_reg(zldev, reg, size)) + return -EINVAL; + + /* Map the register address to virtual range */ + reg = ZL_REG_ADDR(reg) + ZL_RANGE_OFFSET; + + rc = regmap_bulk_write(zldev->regmap, reg, val, size); + if (rc) { + dev_err(zldev->dev, "Failed to write reg 0x%04x: %pe\n", reg, + ERR_PTR(rc)); + return rc; + } + + return 0; +} + +/** + * zl3073x_read_u8 - read value from 8bit register + * @zldev: zl3073x device pointer + * @reg: register to read from + * @val: pointer to the read value + * + * Reads value from given 8bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_read_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 *val) +{ + return zl3073x_read_reg(zldev, reg, val, sizeof(*val)); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_read_u8, "ZL3073X"); + +/** + * zl3073x_write_u8 - write value to 8bit register + * @zldev: zl3073x device pointer + * @reg: register to write to + * @val: value to write + * + * Writes value into given 8bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_write_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 val) +{ + return zl3073x_write_reg(zldev, reg, &val, sizeof(val)); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_write_u8, "ZL3073X"); + +/** + * zl3073x_read_u16 - read value from 16bit register + * @zldev: zl3073x device pointer + * @reg: register to read from + * @val: pointer to the read value + * + * Reads value from given 16bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_read_u16(struct zl3073x_dev *zldev, unsigned int reg, u16 *val) +{ + int rc; + + rc = zl3073x_read_reg(zldev, reg, val, sizeof(*val)); + if (!rc) + be16_to_cpus(val); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_read_u16, "ZL3073X"); + +/** + * zl3073x_write_u16 - write value to 16bit register + * @zldev: zl3073x device pointer + * @reg: register to write to + * @val: value to write + * + * Writes value into given 16bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_write_u16(struct zl3073x_dev *zldev, unsigned int reg, u16 val) +{ + cpu_to_be16s(&val); + + return zl3073x_write_reg(zldev, reg, &val, sizeof(val)); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_write_u16, "ZL3073X"); + +/** + * zl3073x_read_u32 - read value from 32bit register + * @zldev: zl3073x device pointer + * @reg: register to read from + * @val: pointer to the read value + * + * Reads value from given 32bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_read_u32(struct zl3073x_dev *zldev, unsigned int reg, u32 *val) +{ + int rc; + + rc = zl3073x_read_reg(zldev, reg, val, sizeof(*val)); + if (!rc) + be32_to_cpus(val); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_read_u32, "ZL3073X"); + +/** + * zl3073x_write_u32 - write value to 32bit register + * @zldev: zl3073x device pointer + * @reg: register to write to + * @val: value to write + * + * Writes value into given 32bit register.
+ * + * Returns: 0 on success, <0 on error + */ +int zl3073x_write_u32(struct zl3073x_dev *zldev, unsigned int reg, u32 val) +{ + cpu_to_be32s(&val); + + return zl3073x_write_reg(zldev, reg, &val, sizeof(val)); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_write_u32, "ZL3073X"); + +/** + * zl3073x_read_u48 - read value from 48bit register + * @zldev: zl3073x device pointer + * @reg: register to read from + * @val: pointer to the read value + * + * Reads value from given 48bit register. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_read_u48(struct zl3073x_dev *zldev, unsigned int reg, u64 *val) +{ + u8 buf[6]; + int rc; + + rc = zl3073x_read_reg(zldev, reg, buf, sizeof(buf)); + if (!rc) + *val = get_unaligned_be48(buf); + + return rc; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_read_u48, "ZL3073X"); + +/** + * zl3073x_write_u48 - write value to 48bit register + * @zldev: zl3073x device pointer + * @reg: register to write to + * @val: value to write + * + * Writes value into given 48bit register. + * The value must be from the interval <S48_MIN, U48_MAX>. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_write_u48(struct zl3073x_dev *zldev, unsigned int reg, u64 val) +{ + u8 buf[6]; + + /* Check the value belongs to <S48_MIN, U48_MAX>. + * Any negative value >= S48_MIN has bits 47..63 set. + */ + if (val > GENMASK_ULL(47, 0) && val < GENMASK_ULL(63, 47)) { + dev_err(zldev->dev, "Value 0x%0llx out of range\n", val); + return -EINVAL; + } + + put_unaligned_be48(val, buf); + + return zl3073x_write_reg(zldev, reg, buf, sizeof(buf)); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_write_u48, "ZL3073X"); + +/** + * zl3073x_poll_zero_u8 - wait for register to be cleared by device + * @zldev: zl3073x device pointer + * @reg: register to poll (has to be 8bit register) + * @mask: bit mask for polling + * + * Waits for bits specified by @mask in register @reg value to be cleared + * by the device. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_poll_zero_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 mask) +{ + /* Register polling sleep & timeout */ +#define ZL_POLL_SLEEP_US 10 +#define ZL_POLL_TIMEOUT_US 2000000 + unsigned int val; + + /* Check the register is 8bit */ + if (ZL_REG_SIZE(reg) != 1) { + dev_err(zldev->dev, "Invalid reg 0x%04lx size for polling\n", + ZL_REG_ADDR(reg)); + return -EINVAL; + } + + /* Map the register address to virtual range */ + reg = ZL_REG_ADDR(reg) + ZL_RANGE_OFFSET; + + return regmap_read_poll_timeout(zldev->regmap, reg, val, !(val & mask), + ZL_POLL_SLEEP_US, ZL_POLL_TIMEOUT_US); +} +EXPORT_SYMBOL_NS_GPL(zl3073x_poll_zero_u8, "ZL3073X"); + +/** + * zl3073x_devm_alloc - allocates zl3073x device structure + * @dev: pointer to device structure + * + * Allocates zl3073x device structure as device resource. + * + * Return: pointer to zl3073x device on success, error pointer on error + */ +struct zl3073x_dev *zl3073x_devm_alloc(struct device *dev) +{ + struct zl3073x_dev *zldev; + + zldev = devm_kzalloc(dev, sizeof(*zldev), GFP_KERNEL); + if (!zldev) + return ERR_PTR(-ENOMEM); + + zldev->dev = dev; + dev_set_drvdata(zldev->dev, zldev); + + return zldev; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_devm_alloc, "ZL3073X"); + +/** + * zl3073x_dev_init_regmap_config - initialize regmap config + * @regmap_cfg: regmap_config structure to fill + * + * Initializes regmap config common for I2C and SPI.
+ */ +void zl3073x_dev_init_regmap_config(struct regmap_config *regmap_cfg) +{ + *regmap_cfg = zl3073x_regmap_config; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_dev_init_regmap_config, "ZL3073X"); + +/** + * zl3073x_dev_probe - initialize zl3073x device + * @zldev: pointer to zl3073x device + * @chip_info: chip info based on compatible + * + * Common initialization of zl3073x device structure. + * + * Returns: 0 on success, <0 on error + */ +int zl3073x_dev_probe(struct zl3073x_dev *zldev, + const struct zl3073x_chip_info *chip_info) +{ + u16 id, revision, fw_ver; + unsigned int i; + u32 cfg_ver; + int rc; + + /* Read chip ID */ + rc = zl3073x_read_u16(zldev, ZL_REG_ID, &id); + if (rc) + return rc; + + /* Check it matches */ + for (i = 0; i < chip_info->num_ids; i++) { + if (id == chip_info->ids[i]) + break; + } + + if (i == chip_info->num_ids) { + return dev_err_probe(zldev->dev, -ENODEV, + "Unknown or non-matching chip ID: 0x%0x\n", + id); + } + + /* Read revision, firmware version and custom config version */ + rc = zl3073x_read_u16(zldev, ZL_REG_REVISION, &revision); + if (rc) + return rc; + rc = zl3073x_read_u16(zldev, ZL_REG_FW_VER, &fw_ver); + if (rc) + return rc; + rc = zl3073x_read_u32(zldev, ZL_REG_CUSTOM_CONFIG_VER, &cfg_ver); + if (rc) + return rc; + + dev_dbg(zldev->dev, "ChipID(%X), ChipRev(%X), FwVer(%u)\n", id, + revision, fw_ver); + dev_dbg(zldev->dev, "Custom config version: %lu.%lu.%lu.%lu\n", + FIELD_GET(GENMASK(31, 24), cfg_ver), + FIELD_GET(GENMASK(23, 16), cfg_ver), + FIELD_GET(GENMASK(15, 8), cfg_ver), + FIELD_GET(GENMASK(7, 0), cfg_ver)); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(zl3073x_dev_probe, "ZL3073X"); + +MODULE_AUTHOR("Ivan Vecera "); +MODULE_DESCRIPTION("Microchip ZL3073x core driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/zl3073x-i2c.c b/drivers/mfd/zl3073x-i2c.c new file mode 100644 index 0000000000000..da8bbd702d76c --- /dev/null +++ b/drivers/mfd/zl3073x-i2c.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include "zl3073x.h" + +static int zl3073x_i2c_probe(struct i2c_client *client) +{ + struct regmap_config regmap_cfg; + struct device *dev = &client->dev; + struct zl3073x_dev *zldev; + + zldev = zl3073x_devm_alloc(dev); + if (IS_ERR(zldev)) + return PTR_ERR(zldev); + + zl3073x_dev_init_regmap_config(&regmap_cfg); + + zldev->regmap = devm_regmap_init_i2c(client, &regmap_cfg); + if (IS_ERR(zldev->regmap)) { + dev_err_probe(dev, PTR_ERR(zldev->regmap), + "Failed to initialize regmap\n"); + return PTR_ERR(zldev->regmap); + } + + return zl3073x_dev_probe(zldev, i2c_get_match_data(client)); +} + +static const struct i2c_device_id zl3073x_i2c_id[] = { + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30731] }, + { "zl30732", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30732] }, + { "zl30733", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30733] }, + { "zl30734", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30734] }, + { "zl30735", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30735] }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(i2c, zl3073x_i2c_id); + +static const struct of_device_id zl3073x_i2c_of_match[] = { + { .compatible = "microchip,zl30731", .data = &zl3073x_chip_info[ZL30731] }, + { .compatible = "microchip,zl30732", .data = &zl3073x_chip_info[ZL30732] }, + { .compatible = "microchip,zl30733", .data = &zl3073x_chip_info[ZL30733] }, + { .compatible = "microchip,zl30734", .data = &zl3073x_chip_info[ZL30734] }, + { .compatible =
"microchip,zl30735", .data = &zl3073x_chip_info[ZL30735] }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, zl3073x_i2c_of_match); + +static struct i2c_driver zl3073x_i2c_driver = { + .driver = { + .name = "zl3073x-i2c", + .of_match_table = zl3073x_i2c_of_match, + }, + .probe = zl3073x_i2c_probe, + .id_table = zl3073x_i2c_id, +}; +module_i2c_driver(zl3073x_i2c_driver); + +MODULE_AUTHOR("Ivan Vecera "); +MODULE_DESCRIPTION("Microchip ZL3073x I2C driver"); +MODULE_IMPORT_NS("ZL3073X"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/zl3073x-regs.h b/drivers/mfd/zl3073x-regs.h new file mode 100644 index 0000000000000..6bb7ea1ef0b52 --- /dev/null +++ b/drivers/mfd/zl3073x-regs.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ZL3073X_REGS_H +#define __ZL3073X_REGS_H + +#include + +/************************** + * Register Page 0, General + **************************/ + +#define ZL_REG_ID ZL_REG(0, 0x01, 2) +#define ZL_REG_REVISION ZL_REG(0, 0x03, 2) +#define ZL_REG_FW_VER ZL_REG(0, 0x05, 2) +#define ZL_REG_CUSTOM_CONFIG_VER ZL_REG(0, 0x07, 4) + +#endif /* __ZL3073X_REGS_H */ diff --git a/drivers/mfd/zl3073x-spi.c b/drivers/mfd/zl3073x-spi.c new file mode 100644 index 0000000000000..962b6845c0325 --- /dev/null +++ b/drivers/mfd/zl3073x-spi.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include +#include "zl3073x.h" + +static int zl3073x_spi_probe(struct spi_device *spi) +{ + struct regmap_config regmap_cfg; + struct device *dev = &spi->dev; + struct zl3073x_dev *zldev; + + zldev = zl3073x_devm_alloc(dev); + if (IS_ERR(zldev)) + return PTR_ERR(zldev); + + zl3073x_dev_init_regmap_config(®map_cfg); + + zldev->regmap = devm_regmap_init_spi(spi, ®map_cfg); + if (IS_ERR(zldev->regmap)) { + dev_err_probe(dev, PTR_ERR(zldev->regmap), + "Failed to initialize regmap\n"); + return PTR_ERR(zldev->regmap); + } + + return zl3073x_dev_probe(zldev, spi_get_device_match_data(spi)); +} + +static const struct spi_device_id zl3073x_spi_id[] = { + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30731] }, + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30732] }, + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30733] }, + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30734] }, + { "zl30731", .driver_data = (kernel_ulong_t)&zl3073x_chip_info[ZL30735] }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(spi, zl3073x_spi_id); + +static const struct of_device_id zl3073x_spi_of_match[] = { + { .compatible = "microchip,zl30731", .data = &zl3073x_chip_info[ZL30731] }, + { .compatible = "microchip,zl30732", .data = &zl3073x_chip_info[ZL30732] }, + { .compatible = "microchip,zl30733", .data = &zl3073x_chip_info[ZL30733] }, + { .compatible = "microchip,zl30734", .data = &zl3073x_chip_info[ZL30734] }, + { .compatible = "microchip,zl30735", .data = &zl3073x_chip_info[ZL30735] }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, zl3073x_spi_of_match); + +static struct spi_driver zl3073x_spi_driver = { + .driver = { + .name = "zl3073x-spi", + .of_match_table = zl3073x_spi_of_match, + }, + .probe = zl3073x_spi_probe, + .id_table = zl3073x_spi_id, +}; +module_spi_driver(zl3073x_spi_driver); + +MODULE_AUTHOR("Ivan Vecera "); +MODULE_DESCRIPTION("Microchip ZL3073x SPI driver"); +MODULE_IMPORT_NS("ZL3073X"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/zl3073x.h b/drivers/mfd/zl3073x.h new file mode 100644 index 0000000000000..3a2fea61cf579 --- /dev/null +++ 
b/drivers/mfd/zl3073x.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __ZL3073X_CORE_H +#define __ZL3073X_CORE_H + +struct device; +struct regmap_config; +struct zl3073x_dev; + +enum zl3073x_chip_type { + ZL30731, + ZL30732, + ZL30733, + ZL30734, + ZL30735, +}; + +struct zl3073x_chip_info { + const u16 *ids; + size_t num_ids; + int num_channels; +}; + +extern const struct zl3073x_chip_info zl3073x_chip_info[]; + +struct zl3073x_dev *zl3073x_devm_alloc(struct device *dev); +void zl3073x_dev_init_regmap_config(struct regmap_config *regmap_cfg); +int zl3073x_dev_probe(struct zl3073x_dev *zldev, + const struct zl3073x_chip_info *chip_info); + +#endif /* __ZL3073X_CORE_H */ diff --git a/include/linux/mfd/zl3073x-regs.h b/include/linux/mfd/zl3073x-regs.h new file mode 100644 index 0000000000000..cd2baed244eb0 --- /dev/null +++ b/include/linux/mfd/zl3073x-regs.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_ZL3073X_REGS_H +#define __LINUX_MFD_ZL3073X_REGS_H + +#include +#include + +/* + * Register address structure: + * =========================== + * 25 19 18 16 15 7 6 0 + * +------------------------------------------+ + * | max_offset | size | page | page_offset | + * +------------------------------------------+ + * + * page_offset ... <0x00..0x7F> + * page .......... HW page number + * size .......... register byte size (1, 2, 4 or 6) + * max_offset .... maximal offset for indexed registers + * (for non-indexed regs max_offset == page_offset) + */ + +#define ZL_REG_OFFSET_MASK GENMASK(6, 0) +#define ZL_REG_PAGE_MASK GENMASK(15, 7) +#define ZL_REG_SIZE_MASK GENMASK(18, 16) +#define ZL_REG_MAX_OFFSET_MASK GENMASK(25, 19) +#define ZL_REG_ADDR_MASK GENMASK(15, 0) + +#define ZL_REG_OFFSET(_reg) FIELD_GET(ZL_REG_OFFSET_MASK, _reg) +#define ZL_REG_PAGE(_reg) FIELD_GET(ZL_REG_PAGE_MASK, _reg) +#define ZL_REG_MAX_OFFSET(_reg) FIELD_GET(ZL_REG_MAX_OFFSET_MASK, _reg) +#define ZL_REG_SIZE(_reg) FIELD_GET(ZL_REG_SIZE_MASK, _reg) +#define ZL_REG_ADDR(_reg) FIELD_GET(ZL_REG_ADDR_MASK, _reg) + +/** + * ZL_REG_IDX - define indexed register + * @_idx: index of register to access + * @_page: register page + * @_offset: register offset in page + * @_size: register byte size (1, 2, 4 or 6) + * @_items: number of register indices + * @_stride: stride between items in bytes + * + * All parameters except @_idx should be constant. + */ +#define ZL_REG_IDX(_idx, _page, _offset, _size, _items, _stride) \ + (FIELD_PREP(ZL_REG_OFFSET_MASK, \ + (_offset) + (_idx) * (_stride)) | \ + FIELD_PREP_CONST(ZL_REG_PAGE_MASK, _page) | \ + FIELD_PREP_CONST(ZL_REG_SIZE_MASK, _size) | \ + FIELD_PREP_CONST(ZL_REG_MAX_OFFSET_MASK, \ + (_offset) + ((_items) - 1) * (_stride))) + +/** + * ZL_REG - define simple (non-indexed) register + * @_page: register page + * @_offset: register offset in page + * @_size: register byte size (1, 2, 4 or 6) + * + * All parameters should be constant. 
+ */ +#define ZL_REG(_page, _offset, _size) \ + ZL_REG_IDX(0, _page, _offset, _size, 1, 0) + +#endif /* __LINUX_MFD_ZL3073X_REGS_H */ diff --git a/include/linux/mfd/zl3073x.h b/include/linux/mfd/zl3073x.h new file mode 100644 index 0000000000000..8df10b82e3c21 --- /dev/null +++ b/include/linux/mfd/zl3073x.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __LINUX_MFD_ZL3073X_H +#define __LINUX_MFD_ZL3073X_H + +#include + +struct device; +struct regmap; + +/** + * struct zl3073x_dev - zl3073x device + * @dev: pointer to device + * @regmap: regmap to access device registers + */ +struct zl3073x_dev { + struct device *dev; + struct regmap *regmap; +}; + +/********************** + * Registers operations + **********************/ + +int zl3073x_poll_zero_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 mask); +int zl3073x_read_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 *val); +int zl3073x_read_u16(struct zl3073x_dev *zldev, unsigned int reg, u16 *val); +int zl3073x_read_u32(struct zl3073x_dev *zldev, unsigned int reg, u32 *val); +int zl3073x_read_u48(struct zl3073x_dev *zldev, unsigned int reg, u64 *val); +int zl3073x_write_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 val); +int zl3073x_write_u16(struct zl3073x_dev *zldev, unsigned int reg, u16 val); +int zl3073x_write_u32(struct zl3073x_dev *zldev, unsigned int reg, u32 val); +int zl3073x_write_u48(struct zl3073x_dev *zldev, unsigned int reg, u64 val); + +#endif /* __LINUX_MFD_ZL3073X_H */ From 32231a47ad35fab668409f4ad8ccd903991f75c0 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:22 +0200 Subject: [PATCH 699/770] mfd: zl3073x: Add support for devlink device info Use devlink_alloc() to allocate zl3073x_dev structure, register the device as a devlink device, and add devlink callback to provide device info. Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- Documentation/networking/devlink/index.rst | 1 + Documentation/networking/devlink/zl3073x.rst | 37 +++++++ drivers/mfd/Kconfig | 2 + drivers/mfd/zl3073x-core.c | 108 ++++++++++++++++++- 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 Documentation/networking/devlink/zl3073x.rst diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst index 8319f43b5933d..250ae71f40236 100644 --- a/Documentation/networking/devlink/index.rst +++ b/Documentation/networking/devlink/index.rst @@ -98,3 +98,4 @@ parameters, info versions, and other features it supports. iosm octeontx2 sfc + zl3073x diff --git a/Documentation/networking/devlink/zl3073x.rst b/Documentation/networking/devlink/zl3073x.rst new file mode 100644 index 0000000000000..9a6744fb2e866 --- /dev/null +++ b/Documentation/networking/devlink/zl3073x.rst @@ -0,0 +1,37 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +zl3073x devlink support +======================= + +This document describes the devlink features implemented by the ``zl3073x`` +device driver. + +Info versions +============= + +The ``zl3073x`` driver reports the following versions + +.. 
list-table:: devlink info versions implemented + :widths: 5 5 5 90 + + * - Name + - Type + - Example + - Description + * - ``asic.id`` + - fixed + - 1E94 + - Chip identification number + * - ``asic.rev`` + - fixed + - 300 + - Chip revision number + * - ``fw`` + - running + - 7006 + - Firmware version number + * - ``cfg.custom_ver`` + - running + - 1.3.0.1 + - Device configuration version customized by OEM diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 7d7902ec1d89a..e4eca15af175d 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -2424,6 +2424,8 @@ config MFD_UPBOARD_FPGA config MFD_ZL3073X_CORE tristate + depends on NET + select NET_DEVLINK select MFD_CORE config MFD_ZL3073X_I2C diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c index c408aafb0f8ac..0795506825108 100644 --- a/drivers/mfd/zl3073x-core.c +++ b/drivers/mfd/zl3073x-core.c @@ -8,8 +8,11 @@ #include #include #include +#include #include +#include #include +#include #include "zl3073x.h" #include "zl3073x-regs.h" @@ -375,6 +378,83 @@ int zl3073x_poll_zero_u8(struct zl3073x_dev *zldev, unsigned int reg, u8 mask) } EXPORT_SYMBOL_NS_GPL(zl3073x_poll_zero_u8, "ZL3073X"); +/** + * zl3073x_devlink_info_get - Devlink device info callback + * @devlink: devlink structure pointer + * @req: devlink request pointer to store information + * @extack: netlink extack pointer to report errors + * + * Return: 0 on success, <0 on error + */ +static int zl3073x_devlink_info_get(struct devlink *devlink, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct zl3073x_dev *zldev = devlink_priv(devlink); + u16 id, revision, fw_ver; + char buf[16]; + u32 cfg_ver; + int rc; + + rc = zl3073x_read_u16(zldev, ZL_REG_ID, &id); + if (rc) + return rc; + + snprintf(buf, sizeof(buf), "%X", id); + rc = devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, + buf); + if (rc) + return rc; + + rc = zl3073x_read_u16(zldev, ZL_REG_REVISION, &revision); + if (rc) + return rc; + + snprintf(buf, sizeof(buf), "%X", revision); + rc = devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_ASIC_REV, + buf); + if (rc) + return rc; + + rc = zl3073x_read_u16(zldev, ZL_REG_FW_VER, &fw_ver); + if (rc) + return rc; + + snprintf(buf, sizeof(buf), "%u", fw_ver); + rc = devlink_info_version_running_put(req, + DEVLINK_INFO_VERSION_GENERIC_FW, + buf); + if (rc) + return rc; + + rc = zl3073x_read_u32(zldev, ZL_REG_CUSTOM_CONFIG_VER, &cfg_ver); + if (rc) + return rc; + + /* No custom config version */ + if (cfg_ver == U32_MAX) + return 0; + + snprintf(buf, sizeof(buf), "%lu.%lu.%lu.%lu", + FIELD_GET(GENMASK(31, 24), cfg_ver), + FIELD_GET(GENMASK(23, 16), cfg_ver), + FIELD_GET(GENMASK(15, 8), cfg_ver), + FIELD_GET(GENMASK(7, 0), cfg_ver)); + + return devlink_info_version_running_put(req, "cfg.custom_ver", buf); +} + +static const struct devlink_ops zl3073x_devlink_ops = { + .info_get = zl3073x_devlink_info_get, +}; + +static void zl3073x_devlink_free(void *ptr) +{ + devlink_free(ptr); +} + /** * zl3073x_devm_alloc - allocates zl3073x device structure * @dev: pointer to device structure @@ -386,11 +466,19 @@ EXPORT_SYMBOL_NS_GPL(zl3073x_poll_zero_u8, "ZL3073X"); struct zl3073x_dev *zl3073x_devm_alloc(struct device *dev) { struct zl3073x_dev *zldev; + struct devlink *devlink; + int rc; - zldev = devm_kzalloc(dev, sizeof(*zldev), GFP_KERNEL); - if (!zldev) + devlink = devlink_alloc(&zl3073x_devlink_ops, sizeof(*zldev), dev); + if (!devlink) return ERR_PTR(-ENOMEM); + /* Add devres action to free 
devlink device */ + rc = devm_add_action_or_reset(dev, zl3073x_devlink_free, devlink); + if (rc) + return ERR_PTR(rc); + + zldev = devlink_priv(devlink); zldev->dev = dev; dev_set_drvdata(zldev->dev, zldev); @@ -410,6 +498,11 @@ void zl3073x_dev_init_regmap_config(struct regmap_config *regmap_cfg) } EXPORT_SYMBOL_NS_GPL(zl3073x_dev_init_regmap_config, "ZL3073X"); +static void zl3073x_devlink_unregister(void *ptr) +{ + devlink_unregister(ptr); +} + /** * zl3073x_dev_probe - initialize zl3073x device * @zldev: pointer to zl3073x device @@ -423,6 +516,7 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, const struct zl3073x_chip_info *chip_info) { u16 id, revision, fw_ver; + struct devlink *devlink; unsigned int i; u32 cfg_ver; int rc; @@ -463,6 +557,16 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, FIELD_GET(GENMASK(15, 8), cfg_ver), FIELD_GET(GENMASK(7, 0), cfg_ver)); + /* Register the device as devlink device */ + devlink = priv_to_devlink(zldev); + devlink_register(devlink); + + /* Add devres action to unregister devlink device */ + rc = devm_add_action_or_reset(zldev->dev, zl3073x_devlink_unregister, + devlink); + if (rc) + return rc; + return 0; } EXPORT_SYMBOL_NS_GPL(zl3073x_dev_probe, "ZL3073X"); From 53967199f2c83ba1a99aef938159cca7e8483805 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:23 +0200 Subject: [PATCH 700/770] mfd: zl3073x: Protect operations requiring multiple register accesses Registers located on page 10 and above are called mailbox-type registers. Each page represents a mailbox and is used to read from and write to configuration of a specific object (DPLL, output, reference or synth). Each mailbox page contains a mask register, which selects an index of the target object to interact with and a semaphore register, which indicates the requested operation. The remaining registers within the page are latch registers, which are populated by the firmware during read operations or by the driver prior to write operations. Operations with these registers requires multiple register reads, writes and polls and all of them need to be done atomically. So add multiop_lock mutex to protect such operations and check the mutex is held by the caller when it's accessing registers from page 10 and above. Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- drivers/mfd/zl3073x-core.c | 14 ++++++++++++++ include/linux/mfd/zl3073x.h | 3 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c index 0795506825108..a12cfc7eb6ff9 100644 --- a/drivers/mfd/zl3073x-core.c +++ b/drivers/mfd/zl3073x-core.c @@ -121,6 +121,12 @@ static const struct regmap_config zl3073x_regmap_config = { static bool zl3073x_check_reg(struct zl3073x_dev *zldev, unsigned int reg, size_t size) { + /* Check that multiop lock is held when accessing registers + * from page 10 and above. + */ + if (ZL_REG_PAGE(reg) >= 10) + lockdep_assert_held(&zldev->multiop_lock); + /* Check the index is in valid range for indexed register */ if (ZL_REG_OFFSET(reg) > ZL_REG_MAX_OFFSET(reg)) { dev_err(zldev->dev, "Index out of range for reg 0x%04lx\n", @@ -557,6 +563,14 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, FIELD_GET(GENMASK(15, 8), cfg_ver), FIELD_GET(GENMASK(7, 0), cfg_ver)); + /* Initialize mutex for operations where multiple reads, writes + * and/or polls are required to be done atomically. 
+ */ + rc = devm_mutex_init(zldev->dev, &zldev->multiop_lock); + if (rc) + return dev_err_probe(zldev->dev, rc, + "Failed to initialize mutex\n"); + /* Register the device as devlink device */ devlink = priv_to_devlink(zldev); devlink_register(devlink); diff --git a/include/linux/mfd/zl3073x.h b/include/linux/mfd/zl3073x.h index 8df10b82e3c21..a42a275577c49 100644 --- a/include/linux/mfd/zl3073x.h +++ b/include/linux/mfd/zl3073x.h @@ -3,6 +3,7 @@ #ifndef __LINUX_MFD_ZL3073X_H #define __LINUX_MFD_ZL3073X_H +#include #include struct device; @@ -12,10 +13,12 @@ struct regmap; * struct zl3073x_dev - zl3073x device * @dev: pointer to device * @regmap: regmap to access device registers + * @multiop_lock: to serialize multiple register operations */ struct zl3073x_dev { struct device *dev; struct regmap *regmap; + struct mutex multiop_lock; }; /********************** From a55695bdcd8453b4ce8f13947041514e1d6a902a Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:24 +0200 Subject: [PATCH 701/770] mfd: zl3073x: Fetch invariants during probe Several configuration parameters will remain constant at runtime, so we can load them during probe to avoid excessive reads from the hardware. These parameters will be frequently accessed by the DPLL sub-device driver (in follow-up series), and later by the PHC/PTP sub-device driver. Read the following parameters from the device during probe and store them for later use: * frequencies of the synthesizers and their associated DPLL channels * enablement and type (single-ended or differential) of input pins * associated synthesizers, signal format, and enablement status of outputs Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- drivers/mfd/zl3073x-core.c | 251 +++++++++++++++++++++++++++++++ drivers/mfd/zl3073x-regs.h | 37 +++++ include/linux/mfd/zl3073x-regs.h | 22 +++ include/linux/mfd/zl3073x.h | 151 +++++++++++++++++++ 4 files changed, 461 insertions(+) diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c index a12cfc7eb6ff9..135fb9ba7f190 100644 --- a/drivers/mfd/zl3073x-core.c +++ b/drivers/mfd/zl3073x-core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -504,6 +505,251 @@ void zl3073x_dev_init_regmap_config(struct regmap_config *regmap_cfg) } EXPORT_SYMBOL_NS_GPL(zl3073x_dev_init_regmap_config, "ZL3073X"); +static int +zl3073x_mb_op(struct zl3073x_dev *zldev, unsigned int op_reg, u8 op_val, + unsigned int mask_reg, u16 mask_val) +{ + int rc; + + /* Set mask for the operation */ + rc = zl3073x_write_u16(zldev, mask_reg, mask_val); + if (rc) + return rc; + + /* Trigger the operation */ + rc = zl3073x_write_u8(zldev, op_reg, op_val); + if (rc) + return rc; + + /* Wait for the operation to actually finish */ + return zl3073x_poll_zero_u8(zldev, op_reg, op_val); +} + +/** + * zl3073x_input_state_fetch - get input state + * @zldev: pointer to zl3073x_dev structure + * @index: input pin index to fetch state for + * + * Function fetches information for the given input reference that are + * invariant and stores them for later use. + * + * Return: 0 on success, <0 on error + */ +static int +zl3073x_input_state_fetch(struct zl3073x_dev *zldev, u8 index) +{ + struct zl3073x_input *input; + u8 ref_config; + int rc; + + input = &zldev->input[index]; + + /* If the input is differential then the configuration for N-pin + * reference is ignored and P-pin config is used for both. 
+ */ + if (zl3073x_is_n_pin(index) && + zl3073x_input_is_diff(zldev, index - 1)) { + input->enabled = zl3073x_input_is_enabled(zldev, index - 1); + input->diff = true; + + return 0; + } + + guard(mutex)(&zldev->multiop_lock); + + /* Read reference configuration */ + rc = zl3073x_mb_op(zldev, ZL_REG_REF_MB_SEM, ZL_REF_MB_SEM_RD, + ZL_REG_REF_MB_MASK, BIT(index)); + if (rc) + return rc; + + /* Read ref_config register */ + rc = zl3073x_read_u8(zldev, ZL_REG_REF_CONFIG, &ref_config); + if (rc) + return rc; + + input->enabled = FIELD_GET(ZL_REF_CONFIG_ENABLE, ref_config); + input->diff = FIELD_GET(ZL_REF_CONFIG_DIFF_EN, ref_config); + + dev_dbg(zldev->dev, "INPUT%u is %s and configured as %s\n", index, + input->enabled ? "enabled" : "disabled", + input->diff ? "differential" : "single-ended"); + + return rc; +} + +/** + * zl3073x_output_state_fetch - get output state + * @zldev: pointer to zl3073x_dev structure + * @index: output index to fetch state for + * + * Function fetches information for the given output (not output pin) + * that are invariant and stores them for later use. + * + * Return: 0 on success, <0 on error + */ +static int +zl3073x_output_state_fetch(struct zl3073x_dev *zldev, u8 index) +{ + struct zl3073x_output *output; + u8 output_ctrl, output_mode; + int rc; + + output = &zldev->output[index]; + + /* Read output configuration */ + rc = zl3073x_read_u8(zldev, ZL_REG_OUTPUT_CTRL(index), &output_ctrl); + if (rc) + return rc; + + /* Store info about output enablement and synthesizer the output + * is connected to. + */ + output->enabled = FIELD_GET(ZL_OUTPUT_CTRL_EN, output_ctrl); + output->synth = FIELD_GET(ZL_OUTPUT_CTRL_SYNTH_SEL, output_ctrl); + + dev_dbg(zldev->dev, "OUTPUT%u is %s, connected to SYNTH%u\n", + index, output->enabled ? "enabled" : "disabled", output->synth); + + guard(mutex)(&zldev->multiop_lock); + + /* Read output configuration */ + rc = zl3073x_mb_op(zldev, ZL_REG_OUTPUT_MB_SEM, ZL_OUTPUT_MB_SEM_RD, + ZL_REG_OUTPUT_MB_MASK, BIT(index)); + if (rc) + return rc; + + /* Read output_mode */ + rc = zl3073x_read_u8(zldev, ZL_REG_OUTPUT_MODE, &output_mode); + if (rc) + return rc; + + /* Extract and store output signal format */ + output->signal_format = FIELD_GET(ZL_OUTPUT_MODE_SIGNAL_FORMAT, + output_mode); + + dev_dbg(zldev->dev, "OUTPUT%u has signal format 0x%02x\n", index, + output->signal_format); + + return rc; +} + +/** + * zl3073x_synth_state_fetch - get synth state + * @zldev: pointer to zl3073x_dev structure + * @index: synth index to fetch state for + * + * Function fetches information for the given synthesizer that are + * invariant and stores them for later use. 
+ * + * Return: 0 on success, <0 on error + */ +static int +zl3073x_synth_state_fetch(struct zl3073x_dev *zldev, u8 index) +{ + u16 base, num, denom; + u8 synth_ctrl; + u32 mult; + int rc; + + /* Read synth control register */ + rc = zl3073x_read_u8(zldev, ZL_REG_SYNTH_CTRL(index), &synth_ctrl); + if (rc) + return rc; + + /* Extract and store DPLL channel the synth is driven by */ + zldev->synth[index].dpll = FIELD_GET(ZL_SYNTH_CTRL_DPLL_SEL, + synth_ctrl); + + dev_dbg(zldev->dev, "SYNTH%u is connected to DPLL%u\n", index, + zldev->synth[index].dpll); + + guard(mutex)(&zldev->multiop_lock); + + /* Read synth configuration */ + rc = zl3073x_mb_op(zldev, ZL_REG_SYNTH_MB_SEM, ZL_SYNTH_MB_SEM_RD, + ZL_REG_SYNTH_MB_MASK, BIT(index)); + if (rc) + return rc; + + /* The output frequency is determined by the following formula: + * base * multiplier * numerator / denominator + * + * Read registers with these values + */ + rc = zl3073x_read_u16(zldev, ZL_REG_SYNTH_FREQ_BASE, &base); + if (rc) + return rc; + + rc = zl3073x_read_u32(zldev, ZL_REG_SYNTH_FREQ_MULT, &mult); + if (rc) + return rc; + + rc = zl3073x_read_u16(zldev, ZL_REG_SYNTH_FREQ_M, &num); + if (rc) + return rc; + + rc = zl3073x_read_u16(zldev, ZL_REG_SYNTH_FREQ_N, &denom); + if (rc) + return rc; + + /* Check denominator for zero to avoid div by 0 */ + if (!denom) { + dev_err(zldev->dev, + "Zero divisor for SYNTH%u retrieved from device\n", + index); + return -EINVAL; + } + + /* Compute and store synth frequency */ + zldev->synth[index].freq = mul_u64_u32_div(mul_u32_u32(base, mult), + num, denom); + + dev_dbg(zldev->dev, "SYNTH%u frequency: %llu Hz\n", index, + zldev->synth[index].freq); + + return rc; +} + +static int +zl3073x_dev_state_fetch(struct zl3073x_dev *zldev) +{ + int rc; + u8 i; + + for (i = 0; i < ZL3073X_NUM_INPUTS; i++) { + rc = zl3073x_input_state_fetch(zldev, i); + if (rc) { + dev_err(zldev->dev, + "Failed to fetch input state: %pe\n", + ERR_PTR(rc)); + return rc; + } + } + + for (i = 0; i < ZL3073X_NUM_SYNTHS; i++) { + rc = zl3073x_synth_state_fetch(zldev, i); + if (rc) { + dev_err(zldev->dev, + "Failed to fetch synth state: %pe\n", + ERR_PTR(rc)); + return rc; + } + } + + for (i = 0; i < ZL3073X_NUM_OUTPUTS; i++) { + rc = zl3073x_output_state_fetch(zldev, i); + if (rc) { + dev_err(zldev->dev, + "Failed to fetch output state: %pe\n", + ERR_PTR(rc)); + return rc; + } + } + + return rc; +} + static void zl3073x_devlink_unregister(void *ptr) { devlink_unregister(ptr); @@ -571,6 +817,11 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, return dev_err_probe(zldev->dev, rc, "Failed to initialize mutex\n"); + /* Fetch device state */ + rc = zl3073x_dev_state_fetch(zldev); + if (rc) + return rc; + /* Register the device as devlink device */ devlink = priv_to_devlink(zldev); devlink_register(devlink); diff --git a/drivers/mfd/zl3073x-regs.h b/drivers/mfd/zl3073x-regs.h index 6bb7ea1ef0b52..dcd278acab4ba 100644 --- a/drivers/mfd/zl3073x-regs.h +++ b/drivers/mfd/zl3073x-regs.h @@ -14,4 +14,41 @@ #define ZL_REG_FW_VER ZL_REG(0, 0x05, 2) #define ZL_REG_CUSTOM_CONFIG_VER ZL_REG(0, 0x07, 4) +/*********************************** + * Register Page 9, Synth and Output + ***********************************/ + +#define ZL_REG_SYNTH_CTRL(_idx) \ + ZL_REG_IDX(_idx, 9, 0x00, 1, ZL3073X_NUM_SYNTHS, 1) +#define ZL_SYNTH_CTRL_EN BIT(0) +#define ZL_SYNTH_CTRL_DPLL_SEL GENMASK(6, 4) + +#define ZL_REG_OUTPUT_CTRL(_idx) \ + ZL_REG_IDX(_idx, 9, 0x28, 1, ZL3073X_NUM_OUTPUTS, 1) +#define ZL_OUTPUT_CTRL_EN BIT(0) +#define ZL_OUTPUT_CTRL_SYNTH_SEL 
GENMASK(6, 4) + +/******************************* + * Register Page 10, Ref Mailbox + *******************************/ + +#define ZL_REG_REF_CONFIG ZL_REG(10, 0x0d, 1) +#define ZL_REF_CONFIG_ENABLE BIT(0) +#define ZL_REF_CONFIG_DIFF_EN BIT(2) + +/********************************* + * Register Page 13, Synth Mailbox + *********************************/ + +#define ZL_REG_SYNTH_MB_MASK ZL_REG(13, 0x02, 2) + +#define ZL_REG_SYNTH_MB_SEM ZL_REG(13, 0x04, 1) +#define ZL_SYNTH_MB_SEM_WR BIT(0) +#define ZL_SYNTH_MB_SEM_RD BIT(1) + +#define ZL_REG_SYNTH_FREQ_BASE ZL_REG(13, 0x06, 2) +#define ZL_REG_SYNTH_FREQ_MULT ZL_REG(13, 0x08, 4) +#define ZL_REG_SYNTH_FREQ_M ZL_REG(13, 0x0c, 2) +#define ZL_REG_SYNTH_FREQ_N ZL_REG(13, 0x0e, 2) + #endif /* __ZL3073X_REGS_H */ diff --git a/include/linux/mfd/zl3073x-regs.h b/include/linux/mfd/zl3073x-regs.h index cd2baed244eb0..d94bd389f9a7f 100644 --- a/include/linux/mfd/zl3073x-regs.h +++ b/include/linux/mfd/zl3073x-regs.h @@ -63,4 +63,26 @@ #define ZL_REG(_page, _offset, _size) \ ZL_REG_IDX(0, _page, _offset, _size, 1, 0) +/******************************* + * Register Page 10, Ref Mailbox + *******************************/ + +#define ZL_REG_REF_MB_MASK ZL_REG(10, 0x02, 2) + +#define ZL_REG_REF_MB_SEM ZL_REG(10, 0x04, 1) +#define ZL_REF_MB_SEM_WR BIT(0) +#define ZL_REF_MB_SEM_RD BIT(1) + +/********************************** + * Register Page 14, Output Mailbox + **********************************/ +#define ZL_REG_OUTPUT_MB_MASK ZL_REG(14, 0x02, 2) + +#define ZL_REG_OUTPUT_MB_SEM ZL_REG(14, 0x04, 1) +#define ZL_OUTPUT_MB_SEM_WR BIT(0) +#define ZL_OUTPUT_MB_SEM_RD BIT(1) + +#define ZL_REG_OUTPUT_MODE ZL_REG(14, 0x05, 1) +#define ZL_OUTPUT_MODE_SIGNAL_FORMAT GENMASK(7, 4) + #endif /* __LINUX_MFD_ZL3073X_REGS_H */ diff --git a/include/linux/mfd/zl3073x.h b/include/linux/mfd/zl3073x.h index a42a275577c49..da4b7ae6a89ec 100644 --- a/include/linux/mfd/zl3073x.h +++ b/include/linux/mfd/zl3073x.h @@ -9,16 +9,63 @@ struct device; struct regmap; +/* + * Hardware limits for ZL3073x chip family + */ +#define ZL3073X_NUM_INPUTS 10 +#define ZL3073X_NUM_OUTPUTS 10 +#define ZL3073X_NUM_SYNTHS 5 + +/** + * struct zl3073x_input - input invariant info + * @enabled: input is enabled or disabled + * @diff: true if input is differential + */ +struct zl3073x_input { + bool enabled; + bool diff; +}; + +/** + * struct zl3073x_output - output invariant info + * @enabled: output is enabled or disabled + * @synth: synthesizer the output is connected to + * @signal_format: output signal format + */ +struct zl3073x_output { + bool enabled; + u8 synth; + u8 signal_format; +}; + +/** + * struct zl3073x_synth - synthesizer invariant info + * @freq: synthesizer frequency + * @dpll: ID of DPLL the synthesizer is driven by + */ +struct zl3073x_synth { + u64 freq; + u8 dpll; +}; + /** * struct zl3073x_dev - zl3073x device * @dev: pointer to device * @regmap: regmap to access device registers * @multiop_lock: to serialize multiple register operations + * @input: array of inputs' invariants + * @output: array of outputs' invariants + * @synth: array of synthesizers' invariants */ struct zl3073x_dev { struct device *dev; struct regmap *regmap; struct mutex multiop_lock; + + /* Invariants */ + struct zl3073x_input input[ZL3073X_NUM_INPUTS]; + struct zl3073x_output output[ZL3073X_NUM_OUTPUTS]; + struct zl3073x_synth synth[ZL3073X_NUM_SYNTHS]; }; /********************** @@ -35,4 +82,108 @@ int zl3073x_write_u16(struct zl3073x_dev *zldev, unsigned int reg, u16 val); int zl3073x_write_u32(struct zl3073x_dev 
*zldev, unsigned int reg, u32 val);
 int zl3073x_write_u48(struct zl3073x_dev *zldev, unsigned int reg, u64 val);
 
+static inline
+bool zl3073x_is_n_pin(u8 index)
+{
+	/* P-pin indices are even while N-pin indices are odd */
+	return index & 1;
+}
+
+static inline
+bool zl3073x_is_p_pin(u8 index)
+{
+	return !zl3073x_is_n_pin(index);
+}
+
+/**
+ * zl3073x_input_is_diff - check if the given input ref is differential
+ * @zldev: pointer to zl3073x device
+ * @index: input index
+ *
+ * Return: true if input is differential, false if input is single-ended
+ */
+static inline
+bool zl3073x_input_is_diff(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->input[index].diff;
+}
+
+/**
+ * zl3073x_input_is_enabled - check if the given input ref is enabled
+ * @zldev: pointer to zl3073x device
+ * @index: input index
+ *
+ * Return: true if input is enabled, false if input is disabled
+ */
+static inline
+bool zl3073x_input_is_enabled(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->input[index].enabled;
+}
+
+/**
+ * zl3073x_output_is_enabled - check if the given output is enabled
+ * @zldev: pointer to zl3073x device
+ * @index: output index
+ *
+ * Return: true if output is enabled, false if output is disabled
+ */
+static inline
+bool zl3073x_output_is_enabled(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->output[index].enabled;
+}
+
+/**
+ * zl3073x_output_signal_format_get - get output signal format
+ * @zldev: pointer to zl3073x device
+ * @index: output index
+ *
+ * Return: signal format of given output
+ */
+static inline
+u8 zl3073x_output_signal_format_get(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->output[index].signal_format;
+}
+
+/**
+ * zl3073x_output_synth_get - get synth connected to given output
+ * @zldev: pointer to zl3073x device
+ * @index: output index
+ *
+ * Return: index of synth connected to given output.
+ */
+static inline
+u8 zl3073x_output_synth_get(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->output[index].synth;
+}
+
+/**
+ * zl3073x_synth_dpll_get - get DPLL ID the synth is driven by
+ * @zldev: pointer to zl3073x device
+ * @index: synth index
+ *
+ * Return: ID of DPLL the given synthesizer is driven by
+ */
+static inline
+u8 zl3073x_synth_dpll_get(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->synth[index].dpll;
+}
+
+/**
+ * zl3073x_synth_freq_get - get synth current freq
+ * @zldev: pointer to zl3073x device
+ * @index: synth index
+ *
+ * Return: frequency of given synthesizer
+ */
+static inline
+u64 zl3073x_synth_freq_get(struct zl3073x_dev *zldev, u8 index)
+{
+	return zldev->synth[index].freq;
+}
+
 #endif /* __LINUX_MFD_ZL3073X_H */

From 97901ceea33ec95134130124ca46d4730eeb540c Mon Sep 17 00:00:00 2001
From: Ivan Vecera
Date: Wed, 30 Apr 2025 12:11:25 +0200
Subject: [PATCH 702/770] mfd: zl3073x: Add clock_id field

Add a .clock_id field to the zl3073x_dev structure; it will be used by
later commits introducing the DPLL driver. The clock ID is required for
DPLL device registration.

To generate this ID, use the chip ID read during device initialization.
In cases where multiple zl3073x-based chips are present, the chip ID is
shifted and the lower bits are filled with a unique value - the I2C
device address for I2C connections or the chip-select value for SPI
connections.
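A minimal sketch of the resulting composition, not part of the patch
itself; the helper name and the example values are illustrative only:

	/* Illustrative sketch: compose the clock ID from the 16-bit chip
	 * ID and the bus-specific 8-bit device ID described above.
	 */
	static u64 zl3073x_make_clock_id(u16 chip_id, u8 dev_id)
	{
		return ((u64)chip_id << 8) | dev_id;
	}

	/* e.g. chip ID 0x1e94 at I2C address 0x70 -> clock ID 0x1e9470 */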
Signed-off-by: Ivan Vecera v5->v6: * no change Signed-off-by: NipaLocal --- drivers/mfd/zl3073x-core.c | 6 +++++- drivers/mfd/zl3073x-i2c.c | 4 +++- drivers/mfd/zl3073x-spi.c | 4 +++- drivers/mfd/zl3073x.h | 2 +- include/linux/mfd/zl3073x.h | 2 ++ 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c index 135fb9ba7f190..050dc57c90c32 100644 --- a/drivers/mfd/zl3073x-core.c +++ b/drivers/mfd/zl3073x-core.c @@ -759,13 +759,14 @@ static void zl3073x_devlink_unregister(void *ptr) * zl3073x_dev_probe - initialize zl3073x device * @zldev: pointer to zl3073x device * @chip_info: chip info based on compatible + * @dev_id: device ID to be used as part of clock ID * * Common initialization of zl3073x device structure. * * Returns: 0 on success, <0 on error */ int zl3073x_dev_probe(struct zl3073x_dev *zldev, - const struct zl3073x_chip_info *chip_info) + const struct zl3073x_chip_info *chip_info, u8 dev_id) { u16 id, revision, fw_ver; struct devlink *devlink; @@ -809,6 +810,9 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, FIELD_GET(GENMASK(15, 8), cfg_ver), FIELD_GET(GENMASK(7, 0), cfg_ver)); + /* Use chip ID and given dev ID as clock ID */ + zldev->clock_id = ((u64)id << 8) | dev_id; + /* Initialize mutex for operations where multiple reads, writes * and/or polls are required to be done atomically. */ diff --git a/drivers/mfd/zl3073x-i2c.c b/drivers/mfd/zl3073x-i2c.c index da8bbd702d76c..e00277f87de92 100644 --- a/drivers/mfd/zl3073x-i2c.c +++ b/drivers/mfd/zl3073x-i2c.c @@ -27,7 +27,9 @@ static int zl3073x_i2c_probe(struct i2c_client *client) return PTR_ERR(zldev->regmap); } - return zl3073x_dev_probe(zldev, i2c_get_match_data(client)); + /* Initialize device and use I2C address as dev ID */ + return zl3073x_dev_probe(zldev, i2c_get_match_data(client), + client->addr); } static const struct i2c_device_id zl3073x_i2c_id[] = { diff --git a/drivers/mfd/zl3073x-spi.c b/drivers/mfd/zl3073x-spi.c index 962b6845c0325..368001ae19db9 100644 --- a/drivers/mfd/zl3073x-spi.c +++ b/drivers/mfd/zl3073x-spi.c @@ -27,7 +27,9 @@ static int zl3073x_spi_probe(struct spi_device *spi) return PTR_ERR(zldev->regmap); } - return zl3073x_dev_probe(zldev, spi_get_device_match_data(spi)); + /* Initialize device and use SPI chip select value as dev ID */ + return zl3073x_dev_probe(zldev, spi_get_device_match_data(spi), + spi_get_chipselect(spi, 0)); } static const struct spi_device_id zl3073x_spi_id[] = { diff --git a/drivers/mfd/zl3073x.h b/drivers/mfd/zl3073x.h index 3a2fea61cf579..abd1ab9a56ded 100644 --- a/drivers/mfd/zl3073x.h +++ b/drivers/mfd/zl3073x.h @@ -26,6 +26,6 @@ extern const struct zl3073x_chip_info zl3073x_chip_info[]; struct zl3073x_dev *zl3073x_devm_alloc(struct device *dev); void zl3073x_dev_init_regmap_config(struct regmap_config *regmap_cfg); int zl3073x_dev_probe(struct zl3073x_dev *zldev, - const struct zl3073x_chip_info *chip_info); + const struct zl3073x_chip_info *chip_info, u8 dev_id); #endif /* __ZL3073X_CORE_H */ diff --git a/include/linux/mfd/zl3073x.h b/include/linux/mfd/zl3073x.h index da4b7ae6a89ec..1512e8c7cc7bb 100644 --- a/include/linux/mfd/zl3073x.h +++ b/include/linux/mfd/zl3073x.h @@ -53,6 +53,7 @@ struct zl3073x_synth { * @dev: pointer to device * @regmap: regmap to access device registers * @multiop_lock: to serialize multiple register operations + * @clock_id: clock id of the device * @input: array of inputs' invariants * @output: array of outputs' invariants * @synth: array of synthesizers' invariants @@ -61,6 
+62,7 @@ struct zl3073x_dev { struct device *dev; struct regmap *regmap; struct mutex multiop_lock; + u64 clock_id; /* Invariants */ struct zl3073x_input input[ZL3073X_NUM_INPUTS]; From f1368f32b228a1500ffb5f4d96ca8af4f94fdaea Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 30 Apr 2025 12:11:26 +0200 Subject: [PATCH 703/770] mfd: zl3073x: Register DPLL sub-device during init Register DPLL sub-devices to expose the functionality provided by ZL3073x chip family. Each sub-device represents one of the available DPLL channels. Signed-off-by: Ivan Vecera Signed-off-by: NipaLocal --- drivers/mfd/zl3073x-core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/mfd/zl3073x-core.c b/drivers/mfd/zl3073x-core.c index 050dc57c90c32..3e665cdf228f1 100644 --- a/drivers/mfd/zl3073x-core.c +++ b/drivers/mfd/zl3073x-core.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -755,6 +756,14 @@ static void zl3073x_devlink_unregister(void *ptr) devlink_unregister(ptr); } +static const struct mfd_cell zl3073x_dpll_cells[] = { + MFD_CELL_BASIC("zl3073x-dpll", NULL, NULL, 0, 0), + MFD_CELL_BASIC("zl3073x-dpll", NULL, NULL, 0, 1), + MFD_CELL_BASIC("zl3073x-dpll", NULL, NULL, 0, 2), + MFD_CELL_BASIC("zl3073x-dpll", NULL, NULL, 0, 3), + MFD_CELL_BASIC("zl3073x-dpll", NULL, NULL, 0, 4), +}; + /** * zl3073x_dev_probe - initialize zl3073x device * @zldev: pointer to zl3073x device @@ -826,6 +835,16 @@ int zl3073x_dev_probe(struct zl3073x_dev *zldev, if (rc) return rc; + /* Add DPLL sub-device cell for each DPLL channel */ + rc = devm_mfd_add_devices(zldev->dev, PLATFORM_DEVID_AUTO, + zl3073x_dpll_cells, chip_info->num_channels, + NULL, 0, NULL); + if (rc) { + dev_err_probe(zldev->dev, rc, + "Failed to add DPLL sub-device\n"); + return rc; + } + /* Register the device as devlink device */ devlink = priv_to_devlink(zldev); devlink_register(devlink); From 29e02a2b6a835920dda6b5cd52ea7d69ec6a2a1d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:41 +0200 Subject: [PATCH 704/770] net: page_pool: Don't recycle into cache on PREEMPT_RT With preemptible softirq and no per-CPU locking in local_bh_disable() on PREEMPT_RT the consumer can be preempted while a skb is returned. Avoid the race by disabling the recycle into the cache on PREEMPT_RT. Cc: Jesper Dangaard Brouer Cc: Ilias Apalodimas Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/core/page_pool.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 2b76848659418..974f3eef2efaf 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -839,6 +839,10 @@ static bool page_pool_napi_local(const struct page_pool *pool) const struct napi_struct *napi; u32 cpuid; + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + if (unlikely(!in_softirq())) return false; From c616c5eb871bfdf6bd4f5635c7061a4dc8a210ef Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:42 +0200 Subject: [PATCH 705/770] net: dst_cache: Use nested-BH locking for dst_cache::cache dst_cache::cache is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. 
This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/core/dst_cache.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 70c634b9e7b02..93a04d18e5054 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -65,10 +66,15 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; From 77cbe3996b92d08067d7e5820438e2de2857e04b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:43 +0200 Subject: [PATCH 706/770] ipv4/route: Use this_cpu_inc() for stats on PREEMPT_RT The statistics are 
incremented with raw_cpu_inc() assuming it always happens with bottom half disabled. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this is no longer true. Use this_cpu_inc() on PREEMPT_RT for the increment to not worry about preemption. Cc: David Ahern Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/ipv4/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 507b2e5dec50f..fccb05fb3a794 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) From 9c760e1c212640b1f1212c68adc38a3527846759 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:44 +0200 Subject: [PATCH 707/770] ipv6: sr: Use nested-BH locking for hmac_storage hmac_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: David Ahern Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/ipv6/seg6_hmac.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index bbf5b84a70fca..f78ecb6ad8383 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -40,7 +40,14 @@ #include #include -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + +static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) From bc584c3c026def08689a973e33ca96512fdb654a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:45 +0200 Subject: [PATCH 708/770] xdp: Use nested-BH locking for system_page_pool system_page_pool is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a page_pool member (original system_page_pool) and a local_lock_t and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
Cc: Andrew Lunn Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jesper Dangaard Brouer Cc: John Fastabend Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- include/linux/netdevice.h | 7 ++++++- net/core/dev.c | 15 ++++++++++----- net/core/xdp.c | 11 +++++++++-- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 36b8f2351c512..3d81fb112a17c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3509,7 +3509,12 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); -DECLARE_PER_CPU(struct page_pool *, system_page_pool); + +struct page_pool_bh { + struct page_pool *pool; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) diff --git a/net/core/dev.c b/net/core/dev.c index 52d746e418af5..2e072e9082edd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -462,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -5322,7 +5324,10 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -12730,7 +12735,7 @@ static int net_page_pool_create(int cpuid) return err; } - per_cpu(system_page_pool, cpuid) = pp_ptr; + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } @@ -12860,13 +12865,13 @@ static int __init net_dev_init(void) for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/xdp.c b/net/core/xdp.c index 4e91c77906711..d8cb529ff6308 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -739,10 +739,10 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, */ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) { - struct page_pool *pp = this_cpu_read(system_page_pool); const struct xdp_rxq_info *rxq = xdp->rxq; u32 len = xdp->data_end - xdp->data_meta; u32 truesize = xdp->frame_sz; + struct page_pool *pp; struct sk_buff *skb; int metalen; void *data; @@ -750,13 +750,18 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) if (!IS_ENABLED(CONFIG_PAGE_POOL)) return NULL; + local_lock_nested_bh(&system_page_pool.bh_lock); + pp = this_cpu_read(system_page_pool.pool); data = page_pool_dev_alloc_va(pp, &truesize); - if (unlikely(!data)) + if (unlikely(!data)) { + local_unlock_nested_bh(&system_page_pool.bh_lock); return NULL; + } skb = napi_build_skb(data, truesize); if (unlikely(!skb)) { page_pool_free_va(pp, data, true); + local_unlock_nested_bh(&system_page_pool.bh_lock); return NULL; } @@ -775,9 +780,11 @@ struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) 
if (unlikely(xdp_buff_has_frags(xdp)) && unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + local_unlock_nested_bh(&system_page_pool.bh_lock); napi_consume_skb(skb, true); return NULL; } + local_unlock_nested_bh(&system_page_pool.bh_lock); xsk_buff_free(xdp); From e7724b370cdedd1b952a73d9e06e8388bb185fd7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:46 +0200 Subject: [PATCH 709/770] netfilter: nf_dup{4, 6}: Move duplication check to task_struct nf_skb_duplicated is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Due to the recursion involved, the simplest change is to make it a per-task variable. Move the per-CPU variable nf_skb_duplicated to task_struct and name it in_nf_duplicate. Add it to the existing bitfield so it doesn't use additional memory. Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Acked-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- include/linux/netfilter.h | 11 ----------- include/linux/sched.h | 1 + net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv4/netfilter/nf_dup_ipv4.c | 6 +++--- net/ipv6/netfilter/ip6_tables.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 6 +++--- net/netfilter/core.c | 3 --- 7 files changed, 9 insertions(+), 22 deletions(-) diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 2b8aac2c70ada..892d12823ed4b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -497,17 +497,6 @@ struct nf_defrag_hook { extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; -/* - * nf_skb_duplicated - TEE target has sent a packet - * - * When a xtables target sends a packet, the OUTPUT and POSTROUTING - * hooks are traversed again, i.e. nft and xtables are invoked recursively. - * - * This is used by xtables TEE target to prevent the duplicated skb from - * being duplicated again. - */ -DECLARE_PER_CPU(bool, nf_skb_duplicated); - /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac19828934..52d9c52dc8f27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1044,6 +1044,7 @@ struct task_struct { /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif + unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27fa..23c8deff8095a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv, * but it is no problem since absolute verdict is issued by these. 
*/ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd5..ed08fb78cfa8c 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, struct iphdr *iph; local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->ttl; if (nf_dup_ipv4_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae72..d585ac3c11133 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8a..b903c62c00c9e 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c813..11a702065bab5 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); From 8cab8a62cd909926317eba2944c460b8c6a664f9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:47 +0200 Subject: [PATCH 710/770] netfilter: nft_inner: Use nested-BH locking for nft_pcpu_tun_ctx nft_pcpu_tun_ctx is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Make a struct with a nft_inner_tun_ctx member (original nft_pcpu_tun_ctx) and a local_lock_t and use local_lock_nested_bh() for locking. 
This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT.

Cc: Pablo Neira Ayuso
Cc: Jozsef Kadlecsik
Cc: netfilter-devel@vger.kernel.org
Cc: coreteam@netfilter.org
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/netfilter/nft_inner.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c
index 817ab978d24a1..c4569d4b92285 100644
--- a/net/netfilter/nft_inner.c
+++ b/net/netfilter/nft_inner.c
@@ -23,7 +23,14 @@
 #include
 #include

-static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx);
+struct nft_inner_tun_ctx_locked {
+	struct nft_inner_tun_ctx ctx;
+	local_lock_t bh_lock;
+};
+
+static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = {
+	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};

 /* Same layout as nft_expr but it embeds the private expression data area. */
 struct __nft_expr {
@@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt,
 	struct nft_inner_tun_ctx *this_cpu_tun_ctx;

 	local_bh_disable();
-	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
 	if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) {
+		local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
 		local_bh_enable();
 		return false;
 	}
 	*tun_ctx = *this_cpu_tun_ctx;
+	local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
 	local_bh_enable();

 	return true;
@@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt,
 	struct nft_inner_tun_ctx *this_cpu_tun_ctx;

 	local_bh_disable();
-	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx);
+	local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
+	this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx);
 	if (this_cpu_tun_ctx->cookie != tun_ctx->cookie)
 		*this_cpu_tun_ctx = *tun_ctx;
+	local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock);
 	local_bh_enable();
 }

From 8397f2da13d3c7e09e6d8eb8cad66b25f42e117a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:48 +0200
Subject: [PATCH 711/770] netfilter: nf_dup_netdev: Move the recursion counter
 to struct netdev_xmit

nf_dup_skb_recursion is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking.

Move nf_dup_skb_recursion to struct netdev_xmit, provide wrappers.
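For illustration, the wrapper pair has roughly this shape (condensed from
the diff below); call sites simply dereference the returned pointer under
both configurations:

	#ifndef CONFIG_PREEMPT_RT
	static u8 *nf_get_nf_dup_skb_recursion(void)
	{
		/* BH is disabled here, so softnet_data is stable. */
		return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
	}
	#else
	static u8 *nf_get_nf_dup_skb_recursion(void)
	{
		/* On PREEMPT_RT the context may be preempted; count per task. */
		return &current->net_xmit.nf_dup_skb_recursion;
	}
	#endif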
Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: netfilter-devel@vger.kernel.org Cc: coreteam@netfilter.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- include/linux/netdevice_xmit.h | 3 +++ net/netfilter/nf_dup_netdev.c | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 38325e0702968..3bbbc1a9860a3 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -8,6 +8,9 @@ struct netdev_xmit { #ifdef CONFIG_NET_EGRESS u8 skip_txqueue; #endif +#if IS_ENABLED(CONFIG_NF_DUP_NETDEV) + u8 nf_dup_skb_recursion; +#endif }; #endif diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0d..fab8b9011098f 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return ¤t->net_xmit.nf_dup_skb_recursion; +} + +#endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); From 7e0a19c3edaf19f7d8ec9f1cf894ff9faab77d6a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:49 +0200 Subject: [PATCH 712/770] xfrm: Use nested-BH locking for nat_keepalive_sk_ipv[46] nat_keepalive_sk_ipv[46] is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Use sock_bh_locked which has a sock pointer and a local_lock_t. Use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
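For illustration, struct sock_bh_locked pairs the socket with its lock, and
the send path then looks roughly like this (a condensed sketch of the hunks
below):

	struct sock_bh_locked {
		struct sock *sock;
		local_lock_t bh_lock;
	};

	local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
	sk = this_cpu_read(nat_keepalive_sk_ipv4.sock);
	/* ... build and transmit the keepalive with sk ... */
	local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);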
Cc: Steffen Klassert
Cc: Herbert Xu
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/xfrm/xfrm_nat_keepalive.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c
index 82f0a301683f0..ebf95d48e86c1 100644
--- a/net/xfrm/xfrm_nat_keepalive.c
+++ b/net/xfrm/xfrm_nat_keepalive.c
@@ -9,9 +9,13 @@
 #include
 #include

-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = {
+	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
 #if IS_ENABLED(CONFIG_IPV6)
-static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6);
+static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = {
+	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
 #endif

 struct nat_keepalive {
@@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb,

 	skb_dst_set(skb, &rt->dst);

-	sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4);
+	local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
+	sk = this_cpu_read(nat_keepalive_sk_ipv4.sock);
 	sock_net_set(sk, net);
 	err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos);
 	sock_net_set(sk, &init_net);
+	local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock);
 	return err;
 }

@@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb,
 	fl6.fl6_sport = ka->encap_sport;
 	fl6.fl6_dport = ka->encap_dport;

-	sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6);
+	local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
+	sk = this_cpu_read(nat_keepalive_sk_ipv6.sock);
 	sock_net_set(sk, net);
 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL);
-	if (IS_ERR(dst))
+	if (IS_ERR(dst)) {
+		local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
 		return PTR_ERR(dst);
+	}

 	skb_dst_set(skb, dst);
 	err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0);
 	sock_net_set(sk, &init_net);
+	local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock);
 	return err;
 }
 #endif
@@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work)
 			   (ctx.next_run - ctx.now) * HZ);
 }

-static int nat_keepalive_sk_init(struct sock * __percpu *socks,
+static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks,
 				 unsigned short family)
 {
 	struct sock *sk;
@@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks,
 		if (err < 0)
 			goto err;

-		*per_cpu_ptr(socks, i) = sk;
+		per_cpu_ptr(socks, i)->sock = sk;
 	}

 	return 0;
 err:
 	for_each_possible_cpu(i)
-		inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+		inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
 	return err;
 }

-static void nat_keepalive_sk_fini(struct sock * __percpu *socks)
+static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks)
 {
 	int i;

 	for_each_possible_cpu(i)
-		inet_ctl_sock_destroy(*per_cpu_ptr(socks, i));
+		inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock);
 }

 void xfrm_nat_keepalive_state_updated(struct xfrm_state *x)

From 41077de6f20314bf05c482df1da5d60c3fdf7e8e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:50 +0200
Subject: [PATCH 713/770] openvswitch: Merge three per-CPU structures into one

exec_actions_level is a per-CPU integer allocated at compile time. action_fifos and flow_keys are per-CPU pointers and have their data allocated at module init time. There is no gain in keeping them separate: once the module is loaded, all of the structures are allocated.

Merge the three per-CPU variables into ovs_pcpu_storage, adapt callers.
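For illustration, after the merge a single per-CPU object carries all three
pieces of state (a sketch matching the diff below):

	struct ovs_pcpu_storage {
		struct action_fifo action_fifos;	/* was: alloc_percpu() at init */
		struct action_flow_keys flow_keys;	/* was: alloc_percpu() at init */
		int exec_level;				/* was: DEFINE_PER_CPU(int) */
	};
	static DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage);

	/* One lookup replaces three separate per-CPU accesses: */
	struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage);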
Cc: Aaron Conole Cc: Eelco Chaudron Cc: Ilya Maximets Cc: dev@openvswitch.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/openvswitch/actions.c | 49 +++++++++++++------------------------- net/openvswitch/datapath.c | 9 +------ net/openvswitch/datapath.h | 3 --- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 61fea7baae5d5..bebaf16ba8af6 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -78,17 +78,22 @@ struct action_flow_keys { struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; }; -static struct action_fifo __percpu *action_fifos; -static struct action_flow_keys __percpu *flow_keys; -static DEFINE_PER_CPU(int, exec_actions_level); +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + int exec_level; +}; + +static DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. */ static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) { - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); - int level = this_cpu_read(exec_actions_level); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; + int level = ovs_pcpu->exec_level; struct sw_flow_key *key = NULL; if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { @@ -132,10 +137,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, const struct nlattr *actions, const int actions_len) { - struct action_fifo *fifo; + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); struct deferred_action *da; - fifo = this_cpu_ptr(action_fifos); da = action_fifo_put(fifo); if (da) { da->skb = skb; @@ -1609,13 +1613,13 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, if (actions) { /* Sample action */ if (clone_flow_key) - __this_cpu_inc(exec_actions_level); + __this_cpu_inc(ovs_pcpu_storage.exec_level); err = do_execute_actions(dp, skb, clone, actions, len); if (clone_flow_key) - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage.exec_level); } else { /* Recirc action */ clone->recirc_id = recirc_id; ovs_dp_process_packet(skb, clone); @@ -1651,7 +1655,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, static void process_deferred_actions(struct datapath *dp) { - struct action_fifo *fifo = this_cpu_ptr(action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); /* Do not touch the FIFO in case there is no deferred actions. 
 	 */
 	if (action_fifo_is_empty(fifo))
@@ -1682,7 +1686,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 {
 	int err, level;

-	level = __this_cpu_inc_return(exec_actions_level);
+	level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level);
 	if (unlikely(level > OVS_RECURSION_LIMIT)) {
 		net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
 				     ovs_dp_name(dp));
@@ -1699,27 +1703,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		process_deferred_actions(dp);

 out:
-	__this_cpu_dec(exec_actions_level);
+	__this_cpu_dec(ovs_pcpu_storage.exec_level);
 	return err;
 }
-
-int action_fifos_init(void)
-{
-	action_fifos = alloc_percpu(struct action_fifo);
-	if (!action_fifos)
-		return -ENOMEM;
-
-	flow_keys = alloc_percpu(struct action_flow_keys);
-	if (!flow_keys) {
-		free_percpu(action_fifos);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-void action_fifos_exit(void)
-{
-	free_percpu(action_fifos);
-	free_percpu(flow_keys);
-}
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 5d548eda742df..aaa6277bb49c2 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2729,13 +2729,9 @@ static int __init dp_init(void)

 	pr_info("Open vSwitch switching datapath\n");

-	err = action_fifos_init();
-	if (err)
-		goto error;
-
 	err = ovs_internal_dev_rtnl_link_register();
 	if (err)
-		goto error_action_fifos_exit;
+		goto error;

 	err = ovs_flow_init();
 	if (err)
@@ -2778,8 +2774,6 @@ static int __init dp_init(void)
 	ovs_flow_exit();
 error_unreg_rtnl_link:
 	ovs_internal_dev_rtnl_link_unregister();
-error_action_fifos_exit:
-	action_fifos_exit();
 error:
 	return err;
 }
@@ -2795,7 +2789,6 @@ static void dp_cleanup(void)
 	ovs_vport_exit();
 	ovs_flow_exit();
 	ovs_internal_dev_rtnl_link_unregister();
-	action_fifos_exit();
 }

 module_init(dp_init);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 384ca77f4e794..a126407926058 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -281,9 +281,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,

 void ovs_dp_notify_wq(struct work_struct *work);

-int action_fifos_init(void);
-void action_fifos_exit(void);
-
 /* 'KEY' must not have any bits set outside of the 'MASK' */
 #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
 #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))

From 0b6328afff4e38553418cd5cdab9933ed464eb20 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:51 +0200
Subject: [PATCH 714/770] openvswitch: Use nested-BH locking for
 ovs_pcpu_storage

ovs_pcpu_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. The data structure can be referenced recursively, and there is a recursion counter to avoid too many recursions.

Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. Add an owner of the struct, which is the current task, and acquire the lock only if the structure is not owned by the current task.
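For illustration, the recursion-aware acquisition looks roughly like this
(condensed from the datapath.c hunk below):

	struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage);
	bool locked = false;

	if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) {
		local_lock_nested_bh(&ovs_pcpu_storage.bh_lock);
		ovs_pcpu->owner = current;	/* recursive callers skip the lock */
		locked = true;
	}
	/* ... execute actions, possibly re-entering this path ... */
	if (locked) {
		ovs_pcpu->owner = NULL;
		local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock);
	}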
Cc: Aaron Conole
Cc: Eelco Chaudron
Cc: Ilya Maximets
Cc: dev@openvswitch.org
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/openvswitch/actions.c  | 31 ++-----------------------------
 net/openvswitch/datapath.c | 24 ++++++++++++++++++++++++
 net/openvswitch/datapath.h | 33 +++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 61fea7baae5d5..bebaf16ba8af6 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -39,15 +39,6 @@
 #include "flow_netlink.h"
 #include "openvswitch_trace.h"

-struct deferred_action {
-	struct sk_buff *skb;
-	const struct nlattr *actions;
-	int actions_len;
-
-	/* Store pkt_key clone when creating deferred action. */
-	struct sw_flow_key pkt_key;
-};
-
 #define MAX_L2_LEN	(VLAN_ETH_HLEN + 3 * MPLS_HLEN)
 struct ovs_frag_data {
 	unsigned long dst;
@@ -64,28 +55,10 @@ struct ovs_frag_data {

 static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);

-#define DEFERRED_ACTION_FIFO_SIZE 10
-#define OVS_RECURSION_LIMIT 5
-#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
-struct action_fifo {
-	int head;
-	int tail;
-	/* Deferred action fifo queue storage. */
-	struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
-};
-
-struct action_flow_keys {
-	struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
-};
-
-struct ovs_pcpu_storage {
-	struct action_fifo action_fifos;
-	struct action_flow_keys flow_keys;
-	int exec_level;
+DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = {
+	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };

-static DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage);
-
 /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
  * space. Return NULL if out of key spaces.
  */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index aaa6277bb49c2..b8f766978466d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -244,11 +244,13 @@ void ovs_dp_detach_port(struct vport *p)
 /* Must be called with rcu_read_lock. */
 void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
 {
+	struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage);
 	const struct vport *p = OVS_CB(skb)->input_vport;
 	struct datapath *dp = p->dp;
 	struct sw_flow *flow;
 	struct sw_flow_actions *sf_acts;
 	struct dp_stats_percpu *stats;
+	bool ovs_pcpu_locked = false;
 	u64 *stats_counter;
 	u32 n_mask_hit;
 	u32 n_cache_hit;
@@ -290,10 +292,26 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)

 	ovs_flow_stats_update(flow, key->tp.flags, skb);
 	sf_acts = rcu_dereference(flow->sf_acts);
+	/* This path can be invoked recursively: Use the current task to
+	 * identify recursive invocation - the lock must be acquired only once.
+	 * Even with disabled bottom halves this can be preempted on PREEMPT_RT.
+	 * Limit the locking to RT to avoid assigning `owner' if it can be
+	 * avoided.
+ */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); + } stats_counter = &stats->n_hit; @@ -671,7 +689,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); local_bh_enable(); rcu_read_unlock(); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a126407926058..4a665c3cfa906 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -173,6 +173,39 @@ struct ovs_net { bool xt_label; }; +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + int actions_len; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) + +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +struct action_flow_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + int exec_level; + struct task_struct *owner; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); + /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. From a4faf8045a87764c58557c0ce3cecbfa0b4286c4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:52 +0200 Subject: [PATCH 715/770] openvswitch: Move ovs_frag_data_storage into the struct ovs_pcpu_storage ovs_frag_data_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Move ovs_frag_data_storage into the struct ovs_pcpu_storage which already provides locking for the structure. 
Cc: Aaron Conole
Cc: Eelco Chaudron
Cc: Ilya Maximets
Cc: dev@openvswitch.org
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/openvswitch/actions.c  | 20 ++------------------
 net/openvswitch/datapath.h | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index f4996c11aefac..4d20eadd77ceb 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -39,22 +39,6 @@
 #include "flow_netlink.h"
 #include "openvswitch_trace.h"

-#define MAX_L2_LEN	(VLAN_ETH_HLEN + 3 * MPLS_HLEN)
-struct ovs_frag_data {
-	unsigned long dst;
-	struct vport *vport;
-	struct ovs_skb_cb cb;
-	__be16 inner_protocol;
-	u16 network_offset;	/* valid only for MPLS */
-	u16 vlan_tci;
-	__be16 vlan_proto;
-	unsigned int l2_len;
-	u8 mac_proto;
-	u8 l2_data[MAX_L2_LEN];
-};
-
-static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
-
 DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
@@ -771,7 +755,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
 static int ovs_vport_output(struct net *net, struct sock *sk,
 			    struct sk_buff *skb)
 {
-	struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
+	struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data);
 	struct vport *vport = data->vport;

 	if (skb_cow_head(skb, data->l2_len) < 0) {
@@ -823,7 +807,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb,
 	unsigned int hlen = skb_network_offset(skb);
 	struct ovs_frag_data *data;

-	data = this_cpu_ptr(&ovs_frag_data_storage);
+	data = this_cpu_ptr(&ovs_pcpu_storage.frag_data);
 	data->dst = skb->_skb_refdst;
 	data->vport = vport;
 	data->cb = *OVS_CB(skb);
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 4a665c3cfa906..1b5348b0f5594 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include

 #include "conntrack.h"
 #include "flow.h"
@@ -173,6 +174,20 @@ struct ovs_net {
 	bool xt_label;
 };

+#define MAX_L2_LEN	(VLAN_ETH_HLEN + 3 * MPLS_HLEN)
+struct ovs_frag_data {
+	unsigned long dst;
+	struct vport *vport;
+	struct ovs_skb_cb cb;
+	__be16 inner_protocol;
+	u16 network_offset;	/* valid only for MPLS */
+	u16 vlan_tci;
+	__be16 vlan_proto;
+	unsigned int l2_len;
+	u8 mac_proto;
+	u8 l2_data[MAX_L2_LEN];
+};
+
 struct deferred_action {
@@ -200,6 +215,7 @@ struct action_flow_keys {
 struct ovs_pcpu_storage {
 	struct action_fifo action_fifos;
 	struct action_flow_keys flow_keys;
+	struct ovs_frag_data frag_data;
 	int exec_level;
 	struct task_struct *owner;
 	local_lock_t bh_lock;

From fd07c69e7162374c522ffa09016bc5c0ed986c65 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:53 +0200
Subject: [PATCH 716/770] net/sched: act_mirred: Move the recursion counter
 to struct netdev_xmit

mirred_nest_level is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking.

Move mirred_nest_level to struct netdev_xmit as u8, provide wrappers.
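For illustration, the call site stays identical under both configurations;
only the wrappers differ. A sketch, with a hypothetical 'drop' label
standing in for the real error handling:

	nest_level = tcf_mirred_nest_level_inc_return();
	if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
		/* refuse to mirror/redirect any deeper */
		goto drop;
	}
	/* ... mirror or redirect the skb ... */
	tcf_mirred_nest_level_dec();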
Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- include/linux/netdevice_xmit.h | 3 +++ net/sched/act_mirred.c | 28 +++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 3bbbc1a9860a3..4793ec42b1faa 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -11,6 +11,9 @@ struct netdev_xmit { #if IS_ENABLED(CONFIG_NF_DUP_NETDEV) u8 nf_dup_skb_recursion; #endif +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) + u8 sched_mirred_nest; +#endif }; #endif diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5b38143659249..5f01f567c934d 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } From 3b4155ba6bbe7b9d9a90c2b26c20d63d26eff7c9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:54 +0200 Subject: [PATCH 717/770] net/sched: Use nested-BH locking for sch_frag_data_storage sch_frag_data_storage is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking. Add local_lock_t to the struct and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. 
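For illustration, the lock is taken around the whole fragmentation call, so
the xmit callback can assert that its caller holds it (condensed from the
diff below):

	local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
	sch_frag_prepare_frag(skb, xmit);	/* stashes L2 data per CPU */
	ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
	local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);

	/* ... and in the callback: */
	static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
	{
		struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);

		lockdep_assert_held(&data->bh_lock);
		/* ... restore the stashed L2 header and transmit ... */
	}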
Cc: Jamal Hadi Salim
Cc: Cong Wang
Cc: Jiri Pirko
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/sched/sch_frag.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c
index ce63414185fd6..d1d87dce7f3f7 100644
--- a/net/sched/sch_frag.c
+++ b/net/sched/sch_frag.c
@@ -16,14 +16,18 @@ struct sch_frag_data {
 	unsigned int l2_len;
 	u8 l2_data[VLAN_ETH_HLEN];
 	int (*xmit)(struct sk_buff *skb);
+	local_lock_t bh_lock;
 };

-static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage);
+static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = {
+	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};

 static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage);

+	lockdep_assert_held(&data->bh_lock);
 	if (skb_cow_head(skb, data->l2_len) < 0) {
 		kfree_skb(skb);
 		return -ENOMEM;
@@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
 		struct rtable sch_frag_rt = { 0 };
 		unsigned long orig_dst;

+		local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
 		sch_frag_prepare_frag(skb, xmit);
 		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
 			 DST_OBSOLETE_NONE, DST_NOCOUNT);
@@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,
 		IPCB(skb)->frag_max_size = mru;

 		ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit);
+		local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
 		refdst_drop(orig_dst);
 	} else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) {
 		unsigned long orig_dst;
 		struct rt6_info sch_frag_rt;

+		local_lock_nested_bh(&sch_frag_data_storage.bh_lock);
 		sch_frag_prepare_frag(skb, xmit);
 		memset(&sch_frag_rt, 0, sizeof(sch_frag_rt));
 		dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL,
@@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb,

 		ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb,
 					       sch_frag_xmit);
+		local_unlock_nested_bh(&sch_frag_data_storage.bh_lock);
 		refdst_drop(orig_dst);
 	} else {
 		net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n",

From 5d7c24cd82cc8dc588abbe70976bca22bdbabaf7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:55 +0200
Subject: [PATCH 718/770] mptcp: Use nested-BH locking for
 mptcp_delegated_actions

mptcp_delegated_actions is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking.

Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT.
Cc: Matthieu Baerts Cc: Mat Martineau Cc: Geliang Tang Cc: mptcp@lists.linux.dev Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/mptcp/protocol.c | 4 +++- net/mptcp/protocol.h | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 26ffa06c21e8d..9c3a124b45f56 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -46,7 +46,9 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); -DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); +DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7aa38d74fef6b..3dd11dd3ba16e 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -479,6 +479,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk) struct mptcp_delegated_action { struct napi_struct napi; + local_lock_t bh_lock; struct list_head head; }; @@ -670,9 +671,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); @@ -684,11 +687,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; - if (list_empty(&delegated->head)) + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); + if (list_empty(&delegated->head)) { + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; + } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } From f53f03f4db1b9d2aad711eb453b3223035a89df1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 30 Apr 2025 14:47:56 +0200 Subject: [PATCH 719/770] rds: Disable only bottom halves in rds_page_remainder_alloc() rds_page_remainder_alloc() is invoked from a preemptible context or a tasklet. There is no need to disable interrupts for locking. Use local_bh_disable() instead of local_irq_save() for locking. 
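For illustration, the critical section shrinks from IRQs-off to BH-off
(a sketch; the per-CPU remainder is never touched from hard-IRQ context):

	/* before: also blocks hard interrupts, which never touch this data */
	local_irq_save(flags);
	/* ... carve a region from the per-CPU page remainder ... */
	local_irq_restore(flags);

	/* after: bottom halves off is sufficient, and cheaper */
	local_bh_disable();
	/* ... carve a region from the per-CPU page remainder ... */
	local_bh_enable();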
Cc: Allison Henderson
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/rds/page.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/rds/page.c b/net/rds/page.c
index 7cc57e098ddb9..e0dd4f62ea47a 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -69,7 +69,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 			     gfp_t gfp)
 {
 	struct rds_page_remainder *rem;
-	unsigned long flags;
 	struct page *page;
 	int ret;

@@ -88,7 +87,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 	}

 	rem = &per_cpu(rds_page_remainders, get_cpu());
-	local_irq_save(flags);
+	local_bh_disable();

 	while (1) {
 		/* avoid a tiny region getting stuck by tossing it */
@@ -116,13 +115,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		}

 		/* alloc if there is nothing for us to use */
-		local_irq_restore(flags);
+		local_bh_enable();
 		put_cpu();

 		page = alloc_page(gfp);

 		rem = &per_cpu(rds_page_remainders, get_cpu());
-		local_irq_save(flags);
+		local_bh_disable();

 		if (!page) {
 			ret = -ENOMEM;
@@ -140,7 +139,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		rem->r_offset = 0;
 	}

-	local_irq_restore(flags);
+	local_bh_enable();
 	put_cpu();
 out:
 	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,

From 956ee2112131515c03d977dcf4ac05c27e8ea76c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:57 +0200
Subject: [PATCH 720/770] rds: Acquire per-CPU pointer within BH disabled
 section

rds_page_remainder_alloc() obtains the current CPU with get_cpu() while disabling preemption. Then the CPU number is used to access the per-CPU data structure via per_cpu().

This can be optimized by relying on local_bh_disable() to provide a stable CPU number and to avoid migration, and then using this_cpu_ptr() to retrieve the data structure.

Cc: Allison Henderson
Cc: linux-rdma@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: NipaLocal
---
 net/rds/page.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/rds/page.c b/net/rds/page.c
index e0dd4f62ea47a..58a8548a915a9 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -86,8 +86,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 		goto out;
 	}

-	rem = &per_cpu(rds_page_remainders, get_cpu());
 	local_bh_disable();
+	rem = this_cpu_ptr(&rds_page_remainders);

 	while (1) {
 		/* avoid a tiny region getting stuck by tossing it */
@@ -116,12 +116,11 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,

 		/* alloc if there is nothing for us to use */
 		local_bh_enable();
-		put_cpu();

 		page = alloc_page(gfp);

-		rem = &per_cpu(rds_page_remainders, get_cpu());
 		local_bh_disable();
+		rem = this_cpu_ptr(&rds_page_remainders);

 		if (!page) {
 			ret = -ENOMEM;
@@ -140,7 +139,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
 	}

 	local_bh_enable();
-	put_cpu();
 out:
 	rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
 		 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,

From f044f8a12216ddf83c277e94328b1277f4f977a6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Wed, 30 Apr 2025 14:47:58 +0200
Subject: [PATCH 721/770] rds: Use nested-BH locking for rds_page_remainder

rds_page_remainder is a per-CPU variable and relies on disabled BH for its locking. Without per-CPU locking in local_bh_disable() on PREEMPT_RT this data structure requires explicit locking.
Add a local_lock_t to the data structure and use local_lock_nested_bh() for locking. This change adds only lockdep coverage and does not alter the functional behaviour for !PREEMPT_RT. Cc: Allison Henderson Cc: linux-rdma@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: NipaLocal --- net/rds/page.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/rds/page.c b/net/rds/page.c index 58a8548a915a9..afb151eac271c 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -40,10 +40,12 @@ struct rds_page_remainder { struct page *r_page; unsigned long r_offset; + local_lock_t bh_lock; }; -static -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /** * rds_page_remainder_alloc - build up regions of a message. @@ -87,6 +89,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); while (1) { @@ -115,11 +118,13 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } /* alloc if there is nothing for us to use */ + local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); page = alloc_page(gfp); local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); if (!page) { @@ -138,6 +143,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem->r_offset = 0; } + local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, From fdf5440c3ef3c4625f13161b867310949e28af75 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 30 Apr 2025 17:38:02 +0300 Subject: [PATCH 722/770] net: phy: Refactor fwnode_get_phy_node() Refactor to check if the fwnode we got is correct and return if so, otherwise do additional checks. Using same pattern in all conditionals makes it slightly easier to read and understand. 
Signed-off-by: Andy Shevchenko
Signed-off-by: NipaLocal
---
 drivers/net/phy/phy_device.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index f85c172c446c5..2eb735e68dd82 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -3265,12 +3265,12 @@ struct fwnode_handle *fwnode_get_phy_node(const struct fwnode_handle *fwnode)

 	/* Only phy-handle is used for ACPI */
 	phy_node = fwnode_find_reference(fwnode, "phy-handle", 0);
-	if (is_acpi_node(fwnode) || !IS_ERR(phy_node))
+	if (!IS_ERR(phy_node) || is_acpi_node(fwnode))
 		return phy_node;
 	phy_node = fwnode_find_reference(fwnode, "phy", 0);
-	if (IS_ERR(phy_node))
-		phy_node = fwnode_find_reference(fwnode, "phy-device", 0);
-	return phy_node;
+	if (!IS_ERR(phy_node))
+		return phy_node;
+	return fwnode_find_reference(fwnode, "phy-device", 0);
 }
 EXPORT_SYMBOL_GPL(fwnode_get_phy_node);

From e6aa5cd3e19aa1c4012acc1f0cfc015955564c3f Mon Sep 17 00:00:00 2001
From: Pedro Falcato
Date: Wed, 30 Apr 2025 16:45:41 +0100
Subject: [PATCH 723/770] mptcp: Align mptcp_inet6_sk with other protocols

Ever since commit f5f80e32de12 ("ipv6: remove hard coded limitation on ipv6_pinfo"), protocols stopped using the old "obj_size - sizeof(struct ipv6_pinfo)" way of grabbing ipv6_pinfo, which severely restricted struct layout and caused fun, hard-to-see issues.

However, mptcp_inet6_sk wasn't fixed (unlike tcp_inet6_sk). Do so. The non-cloned sockets already do the right thing using ipv6_pinfo_offset + the generic IPv6 code.

Signed-off-by: Pedro Falcato
Signed-off-by: NipaLocal
---
 net/mptcp/protocol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9c3a124b45f56..0749733ea897b 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3144,9 +3144,9 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
 static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
 {
-	unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo);
+	struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk);

-	return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
+	return &msk6->np;
 }

 static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk)

From 86071b9e96d969df519d1f0100af84ef94b34207 Mon Sep 17 00:00:00 2001
From: Andrew Lunn
Date: Wed, 30 Apr 2025 11:21:35 -0500
Subject: [PATCH 724/770] dt-bindings: net: ethernet-controller: Add
 informative text about RGMII delays

Device Tree and Ethernet MAC driver writers often misunderstand RGMII delays. Rewrite the Normative section in terms of the PCB: does the PCB add the 2ns delay? This meaning was previously implied by the definition, but it was often wrongly interpreted due to the ambiguous wording and to reading the definition from the wrong perspective. The new definition concentrates clearly on the hardware, and should be less ambiguous.

Add an Informative section to the end of the binding describing in detail what the four RGMII delays mean. This expands on just the PCB meaning, adding in the implications for the MAC and PHY. Additionally, when the MAC or PHY needs to add a delay, which is software configuration, describe how Linux does this, in the hope of reducing errors. Make it clear that other users of the device tree binding may implement the software configuration in other ways while still conforming to the binding.
Fixes: 9d3de3c58347 ("dt-bindings: net: Add YAML schemas for the generic Ethernet options")
Signed-off-by: Andrew Lunn
Acked-by: Conor Dooley
Signed-off-by: NipaLocal
---
 .../bindings/net/ethernet-controller.yaml     | 97 +++++++++++++++++--
 1 file changed, 90 insertions(+), 7 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
index 61e51ea58a985..b72bea2a1c7c0 100644
--- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -74,19 +74,17 @@ properties:
       - rev-rmii
       - moca

-      # RX and TX delays are added by the MAC when required
+      # RX and TX delays are provided by the PCB. See below
       - rgmii

-      # RGMII with internal RX and TX delays provided by the PHY,
-      # the MAC should not add the RX or TX delays in this case
+      # RX and TX delays are not provided by the PCB. This is the most
+      # frequent case. See below
      - rgmii-id

-      # RGMII with internal RX delay provided by the PHY, the MAC
-      # should not add an RX delay in this case
+      # TX delay is provided by the PCB. See below
      - rgmii-rxid

-      # RGMII with internal TX delay provided by the PHY, the MAC
-      # should not add an TX delay in this case
+      # RX delay is provided by the PCB. See below
      - rgmii-txid
      - rtbi
      - smii
@@ -286,4 +284,89 @@ allOf:

 additionalProperties: true

+# Informative
+# ===========
+#
+# 'phy-modes' & 'phy-connection-type' properties 'rgmii', 'rgmii-id',
+# 'rgmii-rxid', and 'rgmii-txid' are frequently used wrongly by
+# developers. This informative section clarifies their usage.
+#
+# The RGMII specification requires a 2ns delay between the data and
+# clock signals on the RGMII bus. How this delay is implemented is not
+# specified.
+#
+# One option is to make the clock traces on the PCB longer than the
+# data traces. A sufficient difference in length can provide the 2ns
+# delay. If both the RX and TX delays are implemented in this manner,
+# 'rgmii' should be used, indicating that the PCB adds the delays.
+#
+# If the PCB does not add these delays via extra long traces,
+# 'rgmii-id' should be used. Here, 'id' refers to 'internal delay',
+# where either the MAC or PHY adds the delay.
+#
+# If only one of the two delays is implemented via extra long clock
+# lines, either 'rgmii-rxid' or 'rgmii-txid' should be used,
+# indicating the MAC or PHY should implement one of the delays
+# internally, while the PCB implements the other delay.
+#
+# Device Tree describes hardware, and in this case, it describes the
+# PCB between the MAC and the PHY: whether or not the PCB implements
+# the delays.
+#
+# In practice, very few PCBs make use of extra long clock lines. Hence
+# any RGMII phy mode other than 'rgmii-id' is probably wrong, and is
+# unlikely to be accepted during review without details provided in
+# the commit description and comments in the .dts file.
+#
+# When the PCB does not implement the delays, the MAC or PHY must. As
+# such, this is software configuration, and so not described in Device
+# Tree.
+#
+# The following describes how Linux implements the configuration of
+# the MAC and PHY to add these delays when the PCB does not. As stated
+# above, developers often get this wrong, and the aim of this section
+# is to reduce the frequency of these errors by Linux developers. Other
+# users of the Device Tree may implement it differently, and still be
+# consistent with both the normative and informative description
+# above.
+#
+# By default in Linux, when using phylib/phylink, the MAC is expected
+# to read the 'phy-mode' from Device Tree, not implement any delays,
+# and pass the value to the PHY. The PHY will then implement delays as
+# specified by the 'phy-mode'. The PHY should always be reconfigured
+# to implement the needed delays, replacing any setting performed by
+# strapping or the bootloader, etc.
+#
+# Experience to date is that all PHYs which implement RGMII also
+# implement the ability to add or not add the needed delays. Hence
+# this default is expected to work in all cases. Ignoring this default
+# is likely to be questioned by reviewers, and requires a strong
+# argument to be accepted.
+#
+# There are a small number of cases where the MAC has hard-coded
+# delays which cannot be disabled. The 'phy-mode' only describes the
+# PCB. The inability to disable the delays in the MAC does not change
+# the meaning of 'phy-mode'. It does however mean that a 'phy-mode' of
+# 'rgmii' is now invalid: it cannot be supported, since the PCB and
+# the MAC or PHY both adding delays cannot result in a functional
+# link. Thus the MAC should report a fatal error for any modes which
+# cannot be supported. When the MAC implements the delay, it must
+# ensure that the PHY does not also implement the same delay. So it
+# must modify the phy-mode it passes to the PHY, removing the delay it
+# has added. Failure to remove the delay will result in a
+# non-functioning link.
+#
+# Sometimes there is a need to fine tune the delays. Often the MAC or
+# PHY can perform this fine tuning. In the MAC node, the Device Tree
+# properties 'rx-internal-delay-ps' and 'tx-internal-delay-ps' should
+# be used to indicate fine tuning performed by the MAC. The values
+# expected here are small. A value of 2000ps, i.e. 2ns, and a phy-mode
+# of 'rgmii' will not be accepted by reviewers.
+#
+# If the PHY is to perform fine tuning, the properties
+# 'rx-internal-delay-ps' and 'tx-internal-delay-ps' in the PHY node
+# should be used. When the PHY is implementing delays, e.g. 'rgmii-id',
+# these properties should have a value near to 2000ps. If the PCB is
+# implementing delays, e.g. 'rgmii', a small value can be used to fine
+# tune the delay added by the PCB.
 ...

From 04ab8c34ee77287c0791ce31f3187ca83e77c0b0 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 30 Apr 2025 09:37:58 -0700
Subject: [PATCH 725/770] virtio-net: don't re-enable refill work too early
 when NAPI is disabled

Commit 4bc12818b363 ("virtio-net: disable delayed refill when pausing rx") fixed a deadlock between reconfig paths and refill work trying to disable the same NAPI instance. The refill work can't run in parallel with reconfig because trying to double-disable a NAPI instance causes a stall under the instance lock, which the reconfig path needs to re-enable the NAPI and therefore unblock the stalled thread.

There are two cases where we re-enable refill too early. One is in the virtnet_set_queues() handler. We call it when installing XDP:

	virtnet_rx_pause_all(vi);
	...
	virtnet_napi_tx_disable(..);
	...
	virtnet_set_queues(..);
	...
	virtnet_rx_resume_all(..);

We want the work to be disabled until we call virtnet_rx_resume_all(), but virtnet_set_queues() kicks it before the NAPIs are re-enabled. The other is a more trivial mis-ordering in __virtnet_rx_resume(), found by code inspection.

Taking the spin lock in virtnet_set_queues() (requested during review) may be unnecessary as we are under rtnl_lock and so are all paths writing to ->refill_enabled.
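For illustration, __virtnet_rx_resume() now remembers the refill decision
and acts on it only after NAPI is back (condensed from the diff below):

	bool schedule_refill = false;

	if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
		schedule_refill = true;
	if (running)
		virtnet_napi_enable(rq);
	if (schedule_refill)
		schedule_delayed_work(&vi->refill, 0);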
Reviewed-by: Bui Quang Minh
Fixes: 4bc12818b363 ("virtio-net: disable delayed refill when pausing rx")
Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()")
Signed-off-by: Jakub Kicinski
Signed-off-by: NipaLocal
---
 drivers/net/virtio_net.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 848fab51dfa1b..b5549d542c022 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3383,12 +3383,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
 {
 	bool running = netif_running(vi->dev);
+	bool schedule_refill = false;

 	if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
-		schedule_delayed_work(&vi->refill, 0);
-
+		schedule_refill = true;
 	if (running)
 		virtnet_napi_enable(rq);
+
+	if (schedule_refill)
+		schedule_delayed_work(&vi->refill, 0);
 }

 static void virtnet_rx_resume_all(struct virtnet_info *vi)
@@ -3728,8 +3731,10 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
 succ:
 	vi->curr_queue_pairs = queue_pairs;
 	/* virtnet_open() will refill when device is going to up. */
-	if (dev->flags & IFF_UP)
+	spin_lock_bh(&vi->refill_lock);
+	if (dev->flags & IFF_UP && vi->refill_enabled)
 		schedule_delayed_work(&vi->refill, 0);
+	spin_unlock_bh(&vi->refill_lock);

 	return 0;
 }

From 9146c89eb7311fe16d259e284e4db7e64f1e7d66 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Wed, 30 Apr 2025 09:38:36 -0700
Subject: [PATCH 726/770] virtio-net: free xsk_buffs on error in
 virtnet_xsk_pool_enable()

The selftests added to our CI by Bui Quang Minh recently reveal that there is a memory leak on the error path of virtnet_xsk_pool_enable():

unreferenced object 0xffff88800a68a000 (size 2048):
  comm "xdp_helper", pid 318, jiffies 4294692778
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
backtrace (crc 0): __kvmalloc_node_noprof+0x402/0x570 virtnet_xsk_pool_enable+0x293/0x6a0 (drivers/net/virtio_net.c:5882) xp_assign_dev+0x369/0x670 (net/xdp/xsk_buff_pool.c:226) xsk_bind+0x6a5/0x1ae0 __sys_bind+0x15e/0x230 __x64_sys_bind+0x72/0xb0 do_syscall_64+0xc1/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Acked-by: Jason Wang Fixes: e9f3962441c0 ("virtio_net: xsk: rx: support fill with xsk buffer") Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- drivers/net/virtio_net.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b5549d542c022..f9e3e628ec4df 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -5890,8 +5890,10 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len, DMA_TO_DEVICE, 0); - if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) - return -ENOMEM; + if (virtqueue_dma_mapping_error(sq->vq, hdr_dma)) { + err = -ENOMEM; + goto err_free_buffs; + } err = xsk_pool_dma_map(pool, dma_dev, 0); if (err) @@ -5919,6 +5921,8 @@ static int virtnet_xsk_pool_enable(struct net_device *dev, err_xsk_map: virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len, DMA_TO_DEVICE, 0); +err_free_buffs: + kvfree(rq->xsk_buffs); return err; } From 054c6add642d2585dca2ff177a073345a24fa1bb Mon Sep 17 00:00:00 2001 From: Jon Kohler Date: Wed, 30 Apr 2025 11:29:20 -0700 Subject: [PATCH 727/770] xdp: add xdp_skb_reserve_put helper Add helper for calling skb_{put|reserve} to reduce repetitive pattern across various drivers. Plumb into tap and tun to start. No functional change intended. Signed-off-by: Jon Kohler Signed-off-by: NipaLocal --- drivers/net/tap.c | 3 +-- drivers/net/tun.c | 3 +-- include/net/xdp.h | 8 ++++++++ net/core/xdp.c | 3 +-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index d4ece538f1b23..54ce492da5e9f 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -1062,8 +1062,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) goto err; } - skb_reserve(skb, xdp->data - xdp->data_hard_start); - skb_put(skb, xdp->data_end - xdp->data); + xdp_skb_reserve_put(xdp, skb); skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_header(skb); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7babd1e9a378b..30701ad5c27dd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2415,8 +2415,7 @@ static int tun_xdp_one(struct tun_struct *tun, goto out; } - skb_reserve(skb, xdp->data - xdp->data_hard_start); - skb_put(skb, xdp->data_end - xdp->data); + xdp_skb_reserve_put(xdp, skb); /* The externally provided xdp_buff may have no metadata support, which * is marked by xdp->data_meta being xdp->data + 1. 
This will lead to a diff --git a/include/net/xdp.h b/include/net/xdp.h index b40f1f96cb117..6fe3b41351f63 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -345,6 +345,14 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); +static __always_inline +void xdp_skb_reserve_put(const struct xdp_buff *xdp, + struct sk_buff *skb) +{ + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); +} + static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/net/core/xdp.c b/net/core/xdp.c index d8cb529ff6308..8f796fdf3295c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -647,8 +647,7 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdp->data - xdp->data_hard_start); - __skb_put(skb, xdp->data_end - xdp->data); + xdp_skb_reserve_put(xdp, skb); metalen = xdp->data - xdp->data_meta; if (metalen > 0) From 12444e0b6d06f8df7712ecdc815a79e5ea14a90c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Thu, 1 May 2025 01:24:02 +0100 Subject: [PATCH 728/770] strparser: Remove unused __strp_unpause The last use of __strp_unpause() was removed in 2022 by commit 84c61fe1a75b ("tls: rx: do not use the standard strparser") Remove it. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- include/net/strparser.h | 2 -- net/strparser/strparser.c | 13 ------------- 2 files changed, 15 deletions(-) diff --git a/include/net/strparser.h b/include/net/strparser.h index 0a83010b3a64a..0ed73e364faa2 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 95696f42647ec..d946bfb424c7f 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -485,19 +485,6 @@ int strp_init(struct strparser *strp, struct sock *sk, } EXPORT_SYMBOL_GPL(strp_init); -/* Sock process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp) -{ - strp->paused = 0; - - if (strp->need_bytes) { - if (strp_peek_len(strp) < strp->need_bytes) - return; - } - strp_read_sock(strp); -} -EXPORT_SYMBOL_GPL(__strp_unpause); - void strp_unpause(struct strparser *strp) { strp->paused = 0; From 020023649cd30a682246106bfda3dbd31f0fb06f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 30 Apr 2025 17:53:29 -0700 Subject: [PATCH 729/770] ipv6: Restore fib6_config validation for SIOCADDRT. syzkaller reported out-of-bounds read in ipv6_addr_prefix(), where the prefix length was over 128. The cited commit accidentally removed some fib6_config validation from the ioctl path. Let's restore the validation. 
[0]: BUG: KASAN: slab-out-of-bounds in ip6_route_info_create (./include/net/ipv6.h:616 net/ipv6/route.c:3814) Read of size 1 at addr ff11000138020ad4 by task repro/261 CPU: 3 UID: 0 PID: 261 Comm: repro Not tainted 6.15.0-rc3-00614-g0d15a26b247d #87 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) print_report (mm/kasan/report.c:409 mm/kasan/report.c:521) kasan_report (mm/kasan/report.c:636) ip6_route_info_create (./include/net/ipv6.h:616 net/ipv6/route.c:3814) ip6_route_add (net/ipv6/route.c:3902) ipv6_route_ioctl (net/ipv6/route.c:4523) inet6_ioctl (net/ipv6/af_inet6.c:577) sock_do_ioctl (net/socket.c:1190) sock_ioctl (net/socket.c:1314) __x64_sys_ioctl (fs/ioctl.c:51 fs/ioctl.c:906 fs/ioctl.c:892 fs/ioctl.c:892) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f518fb2de5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 73 9f 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007fff14f38d18 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f518fb2de5d RDX: 00000000200015c0 RSI: 000000000000890b RDI: 0000000000000003 RBP: 00007fff14f38d30 R08: 0000000000000800 R09: 0000000000000800 R10: 0000000000000000 R11: 0000000000000202 R12: 00007fff14f38e48 R13: 0000000000401136 R14: 0000000000403df0 R15: 00007f518fd3c000 Fixes: fa76c1674f2e ("ipv6: Move some validation from ip6_route_info_create() to rtm_to_fib6_config().") Reported-by: syzkaller Reported-by: Yi Lai Closes: https://lore.kernel.org/netdev/aBAcKDEFoN%2FLntBF@ly-workstation/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: David Ahern Signed-off-by: NipaLocal --- net/ipv6/route.c | 97 +++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index aa6b45bd35157..44300962230bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4503,6 +4503,53 @@ void rt6_purge_dflt_routers(struct net *net) rcu_read_unlock(); } +static int fib6_config_validate(struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + /* RTF_PCPU is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); + goto errout; + } + + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto errout; + } + + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto errout; + } + + if (cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto errout; + } + +#ifdef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); + goto errout; + } + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; + } +#else + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); + goto errout; + } +#endif + return 0; +errout: + return -EINVAL; +} + static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) @@ -4540,6 +4587,10 @@ int 
ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) switch (cmd) { case SIOCADDRT: + err = fib6_config_validate(&cfg, NULL); + if (err) + break; + /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; @@ -5274,48 +5325,6 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, } } - if (newroute) { - /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto errout; - } - - /* RTF_CACHE is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_CACHE) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto errout; - } - - if (cfg->fc_type > RTN_MAX) { - NL_SET_ERR_MSG(extack, "Invalid route type"); - goto errout; - } - - if (cfg->fc_dst_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto errout; - } - -#ifdef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto errout; - } - - if (cfg->fc_nh_id && cfg->fc_src_len) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); - goto errout; - } -#else - if (cfg->fc_src_len) { - NL_SET_ERR_MSG(extack, - "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto errout; - } -#endif - } - err = 0; errout: return err; @@ -5710,6 +5719,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + err = fib6_config_validate(&cfg, extack); + if (err) + return err; + if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; From 3b6e24ebdd49f94e9c0b3e34d26ca700c5881d9e Mon Sep 17 00:00:00 2001 From: Jon Kohler Date: Wed, 30 Apr 2025 19:04:28 -0700 Subject: [PATCH 730/770] vhost/net: Defer TX queue re-enable until after sendmsg In handle_tx_copy, TX batching processes packets below ~PAGE_SIZE and batches up to 64 messages before calling sock->sendmsg. Currently, when there are no more messages on the ring to dequeue, handle_tx_copy re-enables kicks on the ring *before* firing off the batch sendmsg. However, sock->sendmsg incurs a non-zero delay, especially if it needs to wake up a thread (e.g., another vhost worker). If the guest submits additional messages immediately after the last ring check and disablement, it triggers an EPT_MISCONFIG vmexit to attempt to kick the vhost worker. This may happen while the worker is still processing the sendmsg, leading to wasteful exit(s). This is particularly problematic for single-threaded guest submission threads, as they must exit, wait for the exit to be processed (potentially involving a TTWU), and then resume. In scenarios like a constant stream of UDP messages, this results in a sawtooth pattern where the submitter frequently vmexits, and the vhost-net worker alternates between sleeping and waking. A common solution is to configure vhost-net busy polling via userspace (e.g., qemu poll-us). However, treating the sendmsg as the "busy" period by keeping kicks disabled during the final sendmsg and performing one additional ring check afterward provides a significant performance improvement without any excess busy poll cycles. If messages are found in the ring after the final sendmsg, requeue the TX handler. This ensures fairness for the RX handler and allows vhost_run_work_list to cond_resched() as needed. 
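To make the reordering concrete, here is a rough pseudocode sketch of the TX handler's new tail; all helper names are illustrative stand-ins, not the driver's actual control flow:

	/* Pseudocode sketch of the ordering change; hypothetical helpers. */
	static void tx_handler_tail(void)
	{
		for (;;) {
			if (ring_empty())
				break;		/* kicks stay disabled; no re-enable here */
			batch_packet();
			if (batch_full())
				flush_batch();	/* sock->sendmsg() */
		}

		flush_batch();			/* final sendmsg with kicks still off */

		if (work_arrived_meanwhile())
			requeue_handler();	/* stay fair to RX and other workers */
		else
			reenable_kicks();	/* only now re-open the kick window */
	}

The guest's last-moment submissions are therefore absorbed by the post-sendmsg ring check instead of triggering an immediate vmexit.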
Test Case TX VM: taskset -c 2 iperf3 -c rx-ip-here -t 60 -p 5200 -b 0 -u -i 5 RX VM: taskset -c 2 iperf3 -s -p 5200 -D 6.12.0, each worker backed by tun interface with IFF_NAPI setup. Note: TCP side is largely unchanged as that was copy bound 6.12.0 unpatched EPT_MISCONFIG/second: 5411 Datagrams/second: ~382k Interval Transfer Bitrate Lost/Total Datagrams 0.00-30.00 sec 15.5 GBytes 4.43 Gbits/sec 0/11481630 (0%) sender 6.12.0 patched EPT_MISCONFIG/second: 58 (~93x reduction) Datagrams/second: ~650k (~1.7x increase) Interval Transfer Bitrate Lost/Total Datagrams 0.00-30.00 sec 26.4 GBytes 7.55 Gbits/sec 0/19554720 (0%) sender Acked-by: Jason Wang Signed-off-by: Jon Kohler Acked-by: Michael S. Tsirkin Signed-off-by: NipaLocal --- drivers/vhost/net.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index b9b9e9d409518..7cbfc7d718b3f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -755,10 +755,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) int err; int sent_pkts = 0; bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); + bool busyloop_intr; do { - bool busyloop_intr = false; - + busyloop_intr = false; if (nvq->done_idx == VHOST_NET_BATCH) vhost_tx_batch(net, nvq, sock, &msg); @@ -769,13 +769,10 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { - if (unlikely(busyloop_intr)) { - vhost_poll_queue(&vq->poll); - } else if (unlikely(vhost_enable_notify(&net->dev, - vq))) { - vhost_disable_notify(&net->dev, vq); - continue; - } + /* Kicks are disabled at this point, break loop and + * process any remaining batched packets. Queue will + * be re-enabled afterwards. + */ break; } @@ -825,7 +822,22 @@ static void handle_tx_copy(struct vhost_net *net, struct socket *sock) ++nvq->done_idx; } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); + /* Kicks are still disabled, dispatch any remaining batched msgs. */ vhost_tx_batch(net, nvq, sock, &msg); + + if (unlikely(busyloop_intr)) + /* If interrupted while doing busy polling, requeue the + * handler to be fair handle_rx as well as other tasks + * waiting on cpu. + */ + vhost_poll_queue(&vq->poll); + else + /* All of our work has been completed; however, before + * leaving the TX handler, do one last check for work, + * and requeue handler if necessary. If there is no work, + * queue will be reenabled. + */ + vhost_net_busy_poll_try_queue(net, vq); } static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) From 5c18b1a7cc980256e82dffadc726b827eed60088 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 1 May 2025 10:05:22 +0200 Subject: [PATCH 731/770] vsock/virtio: Linger on unsent data Currently vsock's lingering effectively boils down to waiting (or timing out) until packets are consumed or dropped by the peer; be it by receiving the data, closing or shutting down the connection. To align with the semantics described in the SO_LINGER section of man socket(7) and to mimic AF_INET's behaviour more closely, change the logic of a lingering close(): instead of waiting for all data to be handled, block until data is considered sent from the vsock's transport point of view. That is until worker picks the packets for processing and decrements virtio_vsock_sock::bytes_unsent down to 0. 
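For reference, the semantics being aligned with are the standard SO_LINGER ones; a minimal userspace example on an AF_VSOCK stream socket (hypothetical, error handling omitted for brevity):

	#include <sys/socket.h>
	#include <linux/vm_sockets.h>
	#include <unistd.h>

	int main(void)
	{
		struct linger lin = { .l_onoff = 1, .l_linger = 5 /* seconds */ };
		int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lin, sizeof(lin));
		/* ... connect(), send() ... */
		close(fd);	/* with this change: returns once the transport
				 * reports zero unsent bytes, or after 5 seconds */
		return 0;
	}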
Note that (some interpretation of) lingering was always limited to transports that called virtio_transport_wait_close() on transport release. This does not change, i.e. under Hyper-V and VMCI no lingering would be observed. The implementation does not adhere strictly to the man page's interpretation of SO_LINGER: shutdown() will not trigger the lingering. This follows AF_INET. Signed-off-by: Michal Luczaj Signed-off-by: NipaLocal --- net/vmw_vsock/virtio_transport_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 6e7b727c781c8..f2f1b166731b1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1195,12 +1195,14 @@ static void virtio_transport_wait_close(struct sock *sk, long timeout) { if (timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct vsock_sock *vsk = vsock_sk(sk); add_wait_queue(sk_sleep(sk), &wait); do { if (sk_wait_event(sk, &timeout, - sock_flag(sk, SOCK_DONE), &wait)) + virtio_transport_unsent_bytes(vsk) == 0, + &wait)) break; } while (!signal_pending(current) && timeout); From c8c51ea47c5244f971ba295c4d468a8264484af0 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 1 May 2025 10:05:23 +0200 Subject: [PATCH 732/770] vsock: Move lingering logic to af_vsock core Lingering should be transport-independent in the long run. In preparation for supporting other transports, as well as the linger on shutdown(), move code to core. Generalize by querying vsock_transport::unsent_bytes(), guard against the callback being unimplemented. Do not pass sk_lingertime explicitly. Pull SOCK_LINGER check into vsock_linger(). Flatten the function. Remove the nested block by inverting the condition: return early on !timeout. Suggested-by: Stefano Garzarella Signed-off-by: Michal Luczaj Signed-off-by: NipaLocal --- include/net/af_vsock.h | 1 + net/vmw_vsock/af_vsock.c | 30 +++++++++++++++++++++++++ net/vmw_vsock/virtio_transport_common.c | 23 ++----------------- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9e85424c83435..d56e6e1351589 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -221,6 +221,7 @@ void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fc6afbc8d6806..a31ad6b141cd3 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1013,6 +1013,36 @@ static int vsock_getname(struct socket *sock, return err; } +void vsock_linger(struct sock *sk) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + ssize_t (*unsent)(struct vsock_sock *vsk); + struct vsock_sock *vsk = vsock_sk(sk); + long timeout; + + if (!sock_flag(sk, SOCK_LINGER)) + return; + + timeout = sk->sk_lingertime; + if (!timeout) + return; + + /* unsent_bytes() may be unimplemented. 
*/ + unsent = vsk->transport->unsent_bytes; + if (!unsent) + return; + + add_wait_queue(sk_sleep(sk), &wait); + + do { + if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait)) + break; + } while (!signal_pending(current) && timeout); + + remove_wait_queue(sk_sleep(sk), &wait); +} +EXPORT_SYMBOL_GPL(vsock_linger); + static int vsock_shutdown(struct socket *sock, int mode) { int err; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index f2f1b166731b1..7897fd970dd86 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1191,25 +1191,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk) vsock_remove_sock(vsk); } -static void virtio_transport_wait_close(struct sock *sk, long timeout) -{ - if (timeout) { - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct vsock_sock *vsk = vsock_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - - do { - if (sk_wait_event(sk, &timeout, - virtio_transport_unsent_bytes(vsk) == 0, - &wait)) - break; - } while (!signal_pending(current) && timeout); - - remove_wait_queue(sk_sleep(sk), &wait); - } -} - static void virtio_transport_cancel_close_work(struct vsock_sock *vsk, bool cancel_timeout) { @@ -1279,8 +1260,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk) if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK) (void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK); - if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) - virtio_transport_wait_close(sk, sk->sk_lingertime); + if (!(current->flags & PF_EXITING)) + vsock_linger(sk); if (sock_flag(sk, SOCK_DONE)) { return true; From 0c7f75a1f7885e0486e7cfbe2809c78b13766d01 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 1 May 2025 10:05:24 +0200 Subject: [PATCH 733/770] vsock/test: Expand linger test to ensure close() does not misbehave There was an issue with SO_LINGER: instead of blocking until all queued messages for the socket have been successfully sent (or the linger timeout has been reached), close() would block until packets were handled by the peer. Add a check to alert on close() lingering when it should not. Signed-off-by: Michal Luczaj Signed-off-by: NipaLocal --- tools/testing/vsock/vsock_test.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c index d0f6d253ac72d..82d0bc20dfa75 100644 --- a/tools/testing/vsock/vsock_test.c +++ b/tools/testing/vsock/vsock_test.c @@ -1788,13 +1788,16 @@ static void test_stream_connect_retry_server(const struct test_opts *opts) close(fd); } +#define LINGER_TIMEOUT 1 /* seconds */ + static void test_stream_linger_client(const struct test_opts *opts) { struct linger optval = { .l_onoff = 1, - .l_linger = 1 + .l_linger = LINGER_TIMEOUT }; - int fd; + int bytes_unsent, fd; + time_t ts; fd = vsock_stream_connect(opts->peer_cid, opts->peer_port); if (fd < 0) { @@ -1807,7 +1810,28 @@ static void test_stream_linger_client(const struct test_opts *opts) exit(EXIT_FAILURE); } + /* Byte left unread to expose any incorrect behaviour. */ + send_byte(fd, 1, 0); + + /* Reuse LINGER_TIMEOUT to wait for bytes_unsent == 0. 
*/ + timeout_begin(LINGER_TIMEOUT); + do { + if (ioctl(fd, SIOCOUTQ, &bytes_unsent) < 0) { + perror("ioctl(SIOCOUTQ)"); + exit(EXIT_FAILURE); + } + timeout_check("ioctl(SIOCOUTQ) == 0"); + } while (bytes_unsent != 0); + timeout_end(); + + ts = current_nsec(); close(fd); + if ((current_nsec() - ts) / NSEC_PER_SEC > 0) { + fprintf(stderr, "Unexpected lingering on close()\n"); + exit(EXIT_FAILURE); + } + + control_writeln("DONE"); } static void test_stream_linger_server(const struct test_opts *opts) @@ -1820,7 +1844,7 @@ static void test_stream_linger_server(const struct test_opts *opts) exit(EXIT_FAILURE); } - vsock_wait_remote_close(fd); + control_expectln("DONE"); close(fd); } From d9c6ee778b631ff46e69542186d9fe274b6535f9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:01 +0100 Subject: [PATCH 734/770] net: stmmac: use a local variable for priv->phylink_config Use a local variable for priv->phylink_config in stmmac_phy_setup() which makes the code a bit easier to read, allowing some lines to be merged. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f59a2363f1506..fe7f9e3a92a56 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1259,19 +1259,22 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; int mode = priv->plat->phy_interface; + struct phylink_config *config; struct fwnode_handle *fwnode; struct phylink_pcs *pcs; struct phylink *phylink; - priv->phylink_config.dev = &priv->dev->dev; - priv->phylink_config.type = PHYLINK_NETDEV; - priv->phylink_config.mac_managed_pm = true; + config = &priv->phylink_config; + + config->dev = &priv->dev->dev; + config->type = PHYLINK_NETDEV; + config->mac_managed_pm = true; /* Stmmac always requires an RX clock for hardware initialization */ - priv->phylink_config.mac_requires_rxc = true; + config->mac_requires_rxc = true; if (!(priv->plat->flags & STMMAC_FLAG_RX_CLK_RUNS_IN_LPI)) - priv->phylink_config.eee_rx_clk_stop_enable = true; + config->eee_rx_clk_stop_enable = true; /* Set the default transmit clock stop bit based on the platform glue */ priv->tx_lpi_clk_stop = priv->plat->flags & @@ -1279,13 +1282,12 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) mdio_bus_data = priv->plat->mdio_bus_data; if (mdio_bus_data) - priv->phylink_config.default_an_inband = - mdio_bus_data->default_an_inband; + config->default_an_inband = mdio_bus_data->default_an_inband; /* Set the platform/firmware specified interface mode. Note, phylink * deals with the PHY interface mode, not the MAC interface mode. */ - __set_bit(mode, priv->phylink_config.supported_interfaces); + __set_bit(mode, config->supported_interfaces); /* If we have an xpcs, it defines which PHY interfaces are supported. 
*/ if (priv->hw->xpcs) @@ -1294,29 +1296,26 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) pcs = priv->hw->phylink_pcs; if (pcs) - phy_interface_or(priv->phylink_config.supported_interfaces, - priv->phylink_config.supported_interfaces, + phy_interface_or(config->supported_interfaces, + config->supported_interfaces, pcs->supported_interfaces); if (priv->dma_cap.eee) { /* Assume all supported interfaces also support LPI */ - memcpy(priv->phylink_config.lpi_interfaces, - priv->phylink_config.supported_interfaces, - sizeof(priv->phylink_config.lpi_interfaces)); + memcpy(config->lpi_interfaces, config->supported_interfaces, + sizeof(config->lpi_interfaces)); /* All full duplex speeds above 100Mbps are supported */ - priv->phylink_config.lpi_capabilities = ~(MAC_1000FD - 1) | - MAC_100FD; - priv->phylink_config.lpi_timer_default = eee_timer * 1000; - priv->phylink_config.eee_enabled_default = true; + config->lpi_capabilities = ~(MAC_1000FD - 1) | MAC_100FD; + config->lpi_timer_default = eee_timer * 1000; + config->eee_enabled_default = true; } fwnode = priv->plat->port_node; if (!fwnode) fwnode = dev_fwnode(priv->device); - phylink = phylink_create(&priv->phylink_config, fwnode, - mode, &stmmac_phylink_mac_ops); + phylink = phylink_create(config, fwnode, mode, &stmmac_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); From 79a36013be24c75a9387c9e8f4aa09225f296453 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:06 +0100 Subject: [PATCH 735/770] net: stmmac: use priv->plat->phy_interface directly Avoid using a local variable for priv->plat->phy_interface as this may be modified in the .get_interfaces() method added in a future commit. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index fe7f9e3a92a56..ac6ab121eb330 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1258,7 +1258,6 @@ static int stmmac_init_phy(struct net_device *dev) static int stmmac_phy_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; - int mode = priv->plat->phy_interface; struct phylink_config *config; struct fwnode_handle *fwnode; struct phylink_pcs *pcs; @@ -1287,7 +1286,7 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) /* Set the platform/firmware specified interface mode. Note, phylink * deals with the PHY interface mode, not the MAC interface mode. */ - __set_bit(mode, config->supported_interfaces); + __set_bit(priv->plat->phy_interface, config->supported_interfaces); /* If we have an xpcs, it defines which PHY interfaces are supported. 
*/ if (priv->hw->xpcs) @@ -1315,7 +1314,8 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) if (!fwnode) fwnode = dev_fwnode(priv->device); - phylink = phylink_create(config, fwnode, mode, &stmmac_phylink_mac_ops); + phylink = phylink_create(config, fwnode, priv->plat->phy_interface, + &stmmac_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); From b5feafe3e588054c3dc94843c4ac2ecda5b1d9b0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:11 +0100 Subject: [PATCH 736/770] net: stmmac: add get_interfaces() platform method Add a get_interfaces() platform method to allow platforms to indicate to phylink which interface modes they support - which then allows phylink to validate on initialisation that the configured PHY interface mode is actually supported. Signed-off-by: Russell King (Oracle) Signed-off-by: NipaLocal --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 16 +++++++++++++--- include/linux/stmmac.h | 2 ++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ac6ab121eb330..5f66f816a2496 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1283,10 +1283,20 @@ static int stmmac_phy_setup(struct stmmac_priv *priv) if (mdio_bus_data) config->default_an_inband = mdio_bus_data->default_an_inband; - /* Set the platform/firmware specified interface mode. Note, phylink - * deals with the PHY interface mode, not the MAC interface mode. + /* Get the PHY interface modes (at the PHY end of the link) that + * are supported by the platform. */ - __set_bit(priv->plat->phy_interface, config->supported_interfaces); + if (priv->plat->get_interfaces) + priv->plat->get_interfaces(priv, priv->plat->bsp_priv, + config->supported_interfaces); + + /* Set the platform/firmware specified interface mode if the + * supported interfaces have not already been provided using + * phy_interface as a last resort. + */ + if (phy_interface_empty(config->supported_interfaces)) + __set_bit(priv->plat->phy_interface, + config->supported_interfaces); /* If we have an xpcs, it defines which PHY interfaces are supported. */ if (priv->hw->xpcs) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8aed09d65b4a1..537bced69c46e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -233,6 +233,8 @@ struct plat_stmmacenet_data { u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; + void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, + unsigned long *interfaces); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); From 5035ef279ee216a176a9f99dfd57a284f67b7502 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:16 +0100 Subject: [PATCH 737/770] net: stmmac: intel: move phy_interface init to tgl_common_data() Move the initialisation of plat->phy_interface to tgl_common_data() as all callers set this same interface mode. This moves it to a single location to make the change to get_interfaces() more obvious. 
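For orientation, the callback these conversions build toward (added in the previous patch) simply fills a bitmap of PHY-side interface modes; a minimal hypothetical platform implementation might look like the following, where only get_interfaces() itself and the PHY_INTERFACE_MODE_*/__set_bit() names come from the kernel:

	/* Hypothetical platform glue callback; illustrative sketch only. */
	static void example_get_interfaces(struct stmmac_priv *priv, void *bsp_priv,
					   unsigned long *interfaces)
	{
		__set_bit(PHY_INTERFACE_MODE_SGMII, interfaces);
		__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
	}

	/* wired up from the platform code:
	 *	plat->get_interfaces = example_get_interfaces;
	 */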
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 96c893dd262f8..4b263a3c65a43 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -929,6 +929,7 @@ static int tgl_common_data(struct pci_dev *pdev, plat->rx_queues_to_use = 6; plat->tx_queues_to_use = 4; plat->clk_ptp_rate = 204800000; + plat->phy_interface = PHY_INTERFACE_MODE_SGMII; plat->speed_mode_2500 = intel_speed_mode_2500; plat->safety_feat_cfg->tsoee = 1; @@ -948,7 +949,6 @@ static int tgl_sgmii_phy0_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { plat->bus_id = 1; - plat->phy_interface = PHY_INTERFACE_MODE_SGMII; plat->serdes_powerup = intel_serdes_powerup; plat->serdes_powerdown = intel_serdes_powerdown; return tgl_common_data(pdev, plat); @@ -962,7 +962,6 @@ static int tgl_sgmii_phy1_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { plat->bus_id = 2; - plat->phy_interface = PHY_INTERFACE_MODE_SGMII; plat->serdes_powerup = intel_serdes_powerup; plat->serdes_powerdown = intel_serdes_powerdown; return tgl_common_data(pdev, plat); @@ -976,7 +975,6 @@ static int adls_sgmii_phy0_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { plat->bus_id = 1; - plat->phy_interface = PHY_INTERFACE_MODE_SGMII; /* SerDes power up and power down are done in BIOS for ADL */ @@ -991,7 +989,6 @@ static int adls_sgmii_phy1_data(struct pci_dev *pdev, struct plat_stmmacenet_data *plat) { plat->bus_id = 2; - plat->phy_interface = PHY_INTERFACE_MODE_SGMII; /* SerDes power up and power down are done in BIOS for ADL */ From e9c936a88818e7f359e8685bf941d454addc3d45 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:21 +0100 Subject: [PATCH 738/770] net: stmmac: intel: convert speed_mode_2500() to get_interfaces() TGL platforms support either SGMII or 2500BASE-X, which is determined by reading a SERDES register. Thus, plat->phy_interface (and phylink's supported_interfaces) depend on this. Use the new .get_interfaces() method to set both plat->phy_interface and the supported_interfaces bitmap. This removes the only user of the .speed_mode_2500() method. 
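As a side cleanup, the conversion below also drops the open-coded mask-and-shift in favour of FIELD_GET(), which derives the shift from the mask at compile time; for example, with SERDES_LINK_MODE_MASK = GENMASK(2, 1):

	#include <linux/bitfield.h>

	/* FIELD_GET(GENMASK(2, 1), 0x4) == 0x2, i.e. (0x4 & 0x6) >> 1 */

which is why the separate SERDES_LINK_MODE_SHIFT define can be removed.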
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- .../net/ethernet/stmicro/stmmac/dwmac-intel.c | 30 ++++++++++--------- .../net/ethernet/stmicro/stmmac/dwmac-intel.h | 1 - 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 4b263a3c65a43..9a47015254bbe 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -284,25 +284,28 @@ static void intel_serdes_powerdown(struct net_device *ndev, void *intel_data) } } -static void intel_speed_mode_2500(struct net_device *ndev, void *intel_data) +static void tgl_get_interfaces(struct stmmac_priv *priv, void *bsp_priv, + unsigned long *interfaces) { - struct intel_priv_data *intel_priv = intel_data; - struct stmmac_priv *priv = netdev_priv(ndev); - int serdes_phy_addr = 0; - u32 data = 0; - - serdes_phy_addr = intel_priv->mdio_adhoc_addr; + struct intel_priv_data *intel_priv = bsp_priv; + phy_interface_t interface; + int data; /* Determine the link speed mode: 2.5Gbps/1Gbps */ - data = mdiobus_read(priv->mii, serdes_phy_addr, - SERDES_GCR); + data = mdiobus_read(priv->mii, intel_priv->mdio_adhoc_addr, SERDES_GCR); + if (data < 0) + return; - if (((data & SERDES_LINK_MODE_MASK) >> SERDES_LINK_MODE_SHIFT) == - SERDES_LINK_MODE_2G5) { + if (FIELD_GET(SERDES_LINK_MODE_MASK, data) == SERDES_LINK_MODE_2G5) { dev_info(priv->device, "Link Speed Mode: 2.5Gbps\n"); - priv->plat->phy_interface = PHY_INTERFACE_MODE_2500BASEX; priv->plat->mdio_bus_data->default_an_inband = false; + interface = PHY_INTERFACE_MODE_2500BASEX; + } else { + interface = PHY_INTERFACE_MODE_SGMII; } + + __set_bit(interface, interfaces); + priv->plat->phy_interface = interface; } /* Program PTP Clock Frequency for different variant of @@ -929,8 +932,7 @@ static int tgl_common_data(struct pci_dev *pdev, plat->rx_queues_to_use = 6; plat->tx_queues_to_use = 4; plat->clk_ptp_rate = 204800000; - plat->phy_interface = PHY_INTERFACE_MODE_SGMII; - plat->speed_mode_2500 = intel_speed_mode_2500; + plat->get_interfaces = tgl_get_interfaces; plat->safety_feat_cfg->tsoee = 1; plat->safety_feat_cfg->mrxpee = 0; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h index a12f8e65f89f1..7511c224b3125 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.h @@ -21,7 +21,6 @@ #define SERDES_RATE_MASK GENMASK(9, 8) #define SERDES_PCLK_MASK GENMASK(14, 12) /* PCLK rate to PHY */ #define SERDES_LINK_MODE_MASK GENMASK(2, 1) -#define SERDES_LINK_MODE_SHIFT 1 #define SERDES_PWR_ST_SHIFT 4 #define SERDES_PWR_ST_P0 0x0 #define SERDES_PWR_ST_P3 0x3 From 51b796f04be6734cbf5c4527d6216e71d8335492 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 1 May 2025 12:45:27 +0100 Subject: [PATCH 739/770] net: stmmac: remove speed_mode_2500() method Remove the speed_mode_2500() platform method which is no longer used or necessary, being superseded by the more flexible get_interfaces() method. 
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 --- include/linux/stmmac.h | 1 - 2 files changed, 4 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5f66f816a2496..28b62bd73e231 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7736,9 +7736,6 @@ int stmmac_dvr_probe(struct device *device, goto error_mdio_register; } - if (priv->plat->speed_mode_2500) - priv->plat->speed_mode_2500(ndev, priv->plat->bsp_priv); - ret = stmmac_pcs_setup(ndev); if (ret) goto error_pcs_setup; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 537bced69c46e..26ddf95d23f95 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -241,7 +241,6 @@ struct plat_stmmacenet_data { int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); - void (*speed_mode_2500)(struct net_device *ndev, void *priv); int (*mac_finish)(struct net_device *ndev, void *priv, unsigned int mode, From a54ca5ea1f3def0399efe55bab60467a9f4d3cfd Mon Sep 17 00:00:00 2001 From: Dave Marquardt Date: Thu, 1 May 2025 14:49:42 -0500 Subject: [PATCH 740/770] net: ibmveth: Indented struct ibmveth_adapter correctly Made struct ibmveth_adapter follow indentation rules Signed-off-by: Dave Marquardt Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- drivers/net/ethernet/ibm/ibmveth.h | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h index 8468e2c59d7a4..0f72ce54e7cfb 100644 --- a/drivers/net/ethernet/ibm/ibmveth.h +++ b/drivers/net/ethernet/ibm/ibmveth.h @@ -134,38 +134,38 @@ struct ibmveth_rx_q { }; struct ibmveth_adapter { - struct vio_dev *vdev; - struct net_device *netdev; - struct napi_struct napi; - unsigned int mcastFilterSize; - void * buffer_list_addr; - void * filter_list_addr; - void *tx_ltb_ptr[IBMVETH_MAX_QUEUES]; - unsigned int tx_ltb_size; - dma_addr_t tx_ltb_dma[IBMVETH_MAX_QUEUES]; - dma_addr_t buffer_list_dma; - dma_addr_t filter_list_dma; - struct ibmveth_buff_pool rx_buff_pool[IBMVETH_NUM_BUFF_POOLS]; - struct ibmveth_rx_q rx_queue; - int rx_csum; - int large_send; - bool is_active_trunk; - - u64 fw_ipv6_csum_support; - u64 fw_ipv4_csum_support; - u64 fw_large_send_support; - /* adapter specific stats */ - u64 replenish_task_cycles; - u64 replenish_no_mem; - u64 replenish_add_buff_failure; - u64 replenish_add_buff_success; - u64 rx_invalid_buffer; - u64 rx_no_buffer; - u64 tx_map_failed; - u64 tx_send_failed; - u64 tx_large_packets; - u64 rx_large_packets; - /* Ethtool settings */ + struct vio_dev *vdev; + struct net_device *netdev; + struct napi_struct napi; + unsigned int mcastFilterSize; + void *buffer_list_addr; + void *filter_list_addr; + void *tx_ltb_ptr[IBMVETH_MAX_QUEUES]; + unsigned int tx_ltb_size; + dma_addr_t tx_ltb_dma[IBMVETH_MAX_QUEUES]; + dma_addr_t buffer_list_dma; + dma_addr_t filter_list_dma; + struct ibmveth_buff_pool rx_buff_pool[IBMVETH_NUM_BUFF_POOLS]; + struct ibmveth_rx_q rx_queue; + int rx_csum; + int large_send; + bool is_active_trunk; + + u64 fw_ipv6_csum_support; + u64 fw_ipv4_csum_support; + u64 fw_large_send_support; + /* adapter specific stats */ + u64 
replenish_task_cycles; + u64 replenish_no_mem; + u64 replenish_add_buff_failure; + u64 replenish_add_buff_success; + u64 rx_invalid_buffer; + u64 rx_no_buffer; + u64 tx_map_failed; + u64 tx_send_failed; + u64 tx_large_packets; + u64 rx_large_packets; + /* Ethtool settings */ u8 duplex; u32 speed; }; From 94ab26dc8e837fab8e62b59291456c57ceee830c Mon Sep 17 00:00:00 2001 From: Dave Marquardt Date: Thu, 1 May 2025 14:49:43 -0500 Subject: [PATCH 741/770] net: ibmveth: Reset the adapter when unexpected states are detected Reset the adapter through new function ibmveth_reset, called in WARN_ON situations. Removed conflicting and unneeded forward declaration. Signed-off-by: Dave Marquardt Signed-off-by: NipaLocal --- drivers/net/ethernet/ibm/ibmveth.c | 118 ++++++++++++++++++++++++----- drivers/net/ethernet/ibm/ibmveth.h | 1 + 2 files changed, 100 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 04192190bebab..cff494739bc99 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -39,8 +39,6 @@ #include "ibmveth.h" static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance); -static void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter, - bool reuse); static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev); static struct kobj_type ktype_veth_pool; @@ -231,7 +229,10 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, index = pool->free_map[free_index]; skb = NULL; - BUG_ON(index == IBM_VETH_INVALID_MAP); + if (WARN_ON(index == IBM_VETH_INVALID_MAP)) { + schedule_work(&adapter->work); + goto bad_index_failure; + } /* are we allocating a new buffer or recycling an old one */ if (pool->skbuff[index]) @@ -300,6 +301,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, DMA_FROM_DEVICE); dev_kfree_skb_any(pool->skbuff[index]); pool->skbuff[index] = NULL; +bad_index_failure: adapter->replenish_add_buff_failure++; mb(); @@ -370,20 +372,36 @@ static void ibmveth_free_buffer_pool(struct ibmveth_adapter *adapter, } } -/* remove a buffer from a pool */ -static void ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, - u64 correlator, bool reuse) +/** + * ibmveth_remove_buffer_from_pool - remove a buffer from a pool + * @adapter: adapter instance + * @correlator: identifies pool and index + * @reuse: whether to reuse buffer + * + * Return: + * * %0 - success + * * %-EINVAL - correlator maps to pool or index out of range + * * %-EFAULT - pool and index map to null skb + */ +static int ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, + u64 correlator, bool reuse) { unsigned int pool = correlator >> 32; unsigned int index = correlator & 0xffffffffUL; unsigned int free_index; struct sk_buff *skb; - BUG_ON(pool >= IBMVETH_NUM_BUFF_POOLS); - BUG_ON(index >= adapter->rx_buff_pool[pool].size); + if (WARN_ON(pool >= IBMVETH_NUM_BUFF_POOLS) || + WARN_ON(index >= adapter->rx_buff_pool[pool].size)) { + schedule_work(&adapter->work); + return -EINVAL; + } skb = adapter->rx_buff_pool[pool].skbuff[index]; - BUG_ON(skb == NULL); + if (WARN_ON(!skb)) { + schedule_work(&adapter->work); + return -EFAULT; + } /* if we are going to reuse the buffer then keep the pointers around * but mark index as available. 
replenish will see the skb pointer and @@ -411,6 +429,8 @@ static void ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, mb(); atomic_dec(&(adapter->rx_buff_pool[pool].available)); + + return 0; } /* get the current buffer on the rx queue */ @@ -420,24 +440,44 @@ static inline struct sk_buff *ibmveth_rxq_get_buffer(struct ibmveth_adapter *ada unsigned int pool = correlator >> 32; unsigned int index = correlator & 0xffffffffUL; - BUG_ON(pool >= IBMVETH_NUM_BUFF_POOLS); - BUG_ON(index >= adapter->rx_buff_pool[pool].size); + if (WARN_ON(pool >= IBMVETH_NUM_BUFF_POOLS) || + WARN_ON(index >= adapter->rx_buff_pool[pool].size)) { + schedule_work(&adapter->work); + return NULL; + } return adapter->rx_buff_pool[pool].skbuff[index]; } -static void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter, - bool reuse) +/** + * ibmveth_rxq_harvest_buffer - Harvest buffer from pool + * + * @adapter: pointer to adapter + * @reuse: whether to reuse buffer + * + * Context: called from ibmveth_poll + * + * Return: + * * %0 - success + * * other - non-zero return from ibmveth_remove_buffer_from_pool + */ +static int ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter, + bool reuse) { u64 cor; + int rc; cor = adapter->rx_queue.queue_addr[adapter->rx_queue.index].correlator; - ibmveth_remove_buffer_from_pool(adapter, cor, reuse); + rc = ibmveth_remove_buffer_from_pool(adapter, cor, reuse); + if (unlikely(rc)) + return rc; if (++adapter->rx_queue.index == adapter->rx_queue.num_slots) { adapter->rx_queue.index = 0; adapter->rx_queue.toggle = !adapter->rx_queue.toggle; } + + return 0; } static void ibmveth_free_tx_ltb(struct ibmveth_adapter *adapter, int idx) @@ -709,6 +749,35 @@ static int ibmveth_close(struct net_device *netdev) return 0; } +/** + * ibmveth_reset - Handle scheduled reset work + * + * @w: pointer to work_struct embedded in adapter structure + * + * Context: This routine acquires rtnl_mutex and disables its NAPI through + * ibmveth_close. It can't be called directly in a context that has + * already acquired rtnl_mutex or disabled its NAPI, or directly from + * a poll routine. 
+ * + * Return: void + */ +static void ibmveth_reset(struct work_struct *w) +{ + struct ibmveth_adapter *adapter = container_of(w, struct ibmveth_adapter, work); + struct net_device *netdev = adapter->netdev; + + netdev_dbg(netdev, "reset starting\n"); + + rtnl_lock(); + + dev_close(adapter->netdev); + dev_open(adapter->netdev, NULL); + + rtnl_unlock(); + + netdev_dbg(netdev, "reset complete\n"); +} + static int ibmveth_set_link_ksettings(struct net_device *dev, const struct ethtool_link_ksettings *cmd) { @@ -1324,7 +1393,8 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) wmb(); /* suggested by larson1 */ adapter->rx_invalid_buffer++; netdev_dbg(netdev, "recycling invalid buffer\n"); - ibmveth_rxq_harvest_buffer(adapter, true); + if (unlikely(ibmveth_rxq_harvest_buffer(adapter, true))) + break; } else { struct sk_buff *skb, *new_skb; int length = ibmveth_rxq_frame_length(adapter); @@ -1334,6 +1404,8 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) __sum16 iph_check = 0; skb = ibmveth_rxq_get_buffer(adapter); + if (unlikely(!skb)) + break; /* if the large packet bit is set in the rx queue * descriptor, the mss will be written by PHYP eight @@ -1357,10 +1429,12 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) if (rx_flush) ibmveth_flush_buffer(skb->data, length + offset); - ibmveth_rxq_harvest_buffer(adapter, true); + if (unlikely(ibmveth_rxq_harvest_buffer(adapter, true))) + break; skb = new_skb; } else { - ibmveth_rxq_harvest_buffer(adapter, false); + if (unlikely(ibmveth_rxq_harvest_buffer(adapter, false))) + break; skb_reserve(skb, offset); } @@ -1407,7 +1481,10 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) * then check once more to make sure we are done. */ lpar_rc = h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_ENABLE); - BUG_ON(lpar_rc != H_SUCCESS); + if (WARN_ON(lpar_rc != H_SUCCESS)) { + schedule_work(&adapter->work); + goto out; + } if (ibmveth_rxq_pending_buffer(adapter) && napi_schedule(napi)) { lpar_rc = h_vio_signal(adapter->vdev->unit_address, @@ -1428,7 +1505,7 @@ static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance) if (napi_schedule_prep(&adapter->napi)) { lpar_rc = h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE); - BUG_ON(lpar_rc != H_SUCCESS); + WARN_ON(lpar_rc != H_SUCCESS); __napi_schedule(&adapter->napi); } return IRQ_HANDLED; @@ -1670,6 +1747,7 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->vdev = dev; adapter->netdev = netdev; + INIT_WORK(&adapter->work, ibmveth_reset); adapter->mcastFilterSize = be32_to_cpu(*mcastFilterSize_p); ibmveth_init_link_settings(netdev); @@ -1762,6 +1840,8 @@ static void ibmveth_remove(struct vio_dev *dev) struct ibmveth_adapter *adapter = netdev_priv(netdev); int i; + cancel_work_sync(&adapter->work); + for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) kobject_put(&adapter->rx_buff_pool[i].kobj); diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h index 0f72ce54e7cfb..b0a2460ec9f9a 100644 --- a/drivers/net/ethernet/ibm/ibmveth.h +++ b/drivers/net/ethernet/ibm/ibmveth.h @@ -137,6 +137,7 @@ struct ibmveth_adapter { struct vio_dev *vdev; struct net_device *netdev; struct napi_struct napi; + struct work_struct work; unsigned int mcastFilterSize; void *buffer_list_addr; void *filter_list_addr; From 4433f33b87bab49cdd3df157f856c2af17287302 Mon Sep 17 00:00:00 2001 From: Dave Marquardt Date: Thu, 1 May 2025 14:49:44 -0500 Subject: [PATCH 742/770] net: ibmveth: added KUnit tests for 
some buffer pool functions Added KUnit tests for ibmveth_remove_buffer_from_pool and ibmveth_rxq_get_buffer under new IBMVETH_KUNIT_TEST config option. Signed-off-by: Dave Marquardt Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- drivers/net/ethernet/ibm/Kconfig | 13 +++ drivers/net/ethernet/ibm/ibmveth.c | 129 +++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) diff --git a/drivers/net/ethernet/ibm/Kconfig b/drivers/net/ethernet/ibm/Kconfig index c0c112d95b891..4f4b23465c478 100644 --- a/drivers/net/ethernet/ibm/Kconfig +++ b/drivers/net/ethernet/ibm/Kconfig @@ -27,6 +27,19 @@ config IBMVETH To compile this driver as a module, choose M here. The module will be called ibmveth. +config IBMVETH_KUNIT_TEST + bool "KUnit test for IBM LAN Virtual Ethernet support" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on KUNIT=y && IBMVETH=y + default KUNIT_ALL_TESTS + help + This builds unit tests for the IBM LAN Virtual Ethernet driver. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + source "drivers/net/ethernet/ibm/emac/Kconfig" config EHEA diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index cff494739bc99..45143467286e5 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -2042,3 +2042,132 @@ static void __exit ibmveth_module_exit(void) module_init(ibmveth_module_init); module_exit(ibmveth_module_exit); + +#ifdef CONFIG_IBMVETH_KUNIT_TEST +#include + +/** + * ibmveth_reset_kunit - reset routine for running in KUnit environment + * + * @w: pointer to work_struct embedded in adapter structure + * + * Context: Called in the KUnit environment. Does nothing. + * + * Return: void + */ +static void ibmveth_reset_kunit(struct work_struct *w) +{ + netdev_dbg(NULL, "reset_kunit starting\n"); + netdev_dbg(NULL, "reset_kunit complete\n"); +} + +/** + * ibmveth_remove_buffer_from_pool_test - unit test for some of + * ibmveth_remove_buffer_from_pool + * @test: pointer to kunit structure + * + * Tests the error returns from ibmveth_remove_buffer_from_pool. + * ibmveth_remove_buffer_from_pool also calls WARN_ON, so dmesg should be + * checked to see that these warnings happened. 
+ * + * Return: void + */ +static void ibmveth_remove_buffer_from_pool_test(struct kunit *test) +{ + struct ibmveth_adapter *adapter = kunit_kzalloc(test, sizeof(*adapter), GFP_KERNEL); + struct ibmveth_buff_pool *pool; + u64 correlator; + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, adapter); + + INIT_WORK(&adapter->work, ibmveth_reset_kunit); + + /* Set sane values for buffer pools */ + for (int i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) + ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i, + pool_count[i], pool_size[i], + pool_active[i]); + + pool = &adapter->rx_buff_pool[0]; + pool->skbuff = kunit_kcalloc(test, pool->size, sizeof(void *), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pool->skbuff); + + correlator = ((u64)IBMVETH_NUM_BUFF_POOLS << 32) | 0; + KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, false)); + KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, true)); + + correlator = ((u64)0 << 32) | adapter->rx_buff_pool[0].size; + KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, false)); + KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, true)); + + correlator = (u64)0 | 0; + pool->skbuff[0] = NULL; + KUNIT_EXPECT_EQ(test, -EFAULT, ibmveth_remove_buffer_from_pool(adapter, correlator, false)); + KUNIT_EXPECT_EQ(test, -EFAULT, ibmveth_remove_buffer_from_pool(adapter, correlator, true)); + + flush_work(&adapter->work); +} + +/** + * ibmveth_rxq_get_buffer_test - unit test for ibmveth_rxq_get_buffer + * @test: pointer to kunit structure + * + * Tests ibmveth_rxq_get_buffer. ibmveth_rxq_get_buffer also calls WARN_ON for + * the NULL returns, so dmesg should be checked to see that these warnings + * happened. + * + * Return: void + */ +static void ibmveth_rxq_get_buffer_test(struct kunit *test) +{ + struct ibmveth_adapter *adapter = kunit_kzalloc(test, sizeof(*adapter), GFP_KERNEL); + struct sk_buff *skb = kunit_kzalloc(test, sizeof(*skb), GFP_KERNEL); + struct ibmveth_buff_pool *pool; + + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, adapter); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + INIT_WORK(&adapter->work, ibmveth_reset_kunit); + + adapter->rx_queue.queue_len = 1; + adapter->rx_queue.index = 0; + adapter->rx_queue.queue_addr = kunit_kzalloc(test, sizeof(struct ibmveth_rx_q_entry), + GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, adapter->rx_queue.queue_addr); + + /* Set sane values for buffer pools */ + for (int i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) + ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i, + pool_count[i], pool_size[i], + pool_active[i]); + + pool = &adapter->rx_buff_pool[0]; + pool->skbuff = kunit_kcalloc(test, pool->size, sizeof(void *), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pool->skbuff); + + adapter->rx_queue.queue_addr[0].correlator = (u64)IBMVETH_NUM_BUFF_POOLS << 32 | 0; + KUNIT_EXPECT_PTR_EQ(test, NULL, ibmveth_rxq_get_buffer(adapter)); + + adapter->rx_queue.queue_addr[0].correlator = (u64)0 << 32 | adapter->rx_buff_pool[0].size; + KUNIT_EXPECT_PTR_EQ(test, NULL, ibmveth_rxq_get_buffer(adapter)); + + pool->skbuff[0] = skb; + adapter->rx_queue.queue_addr[0].correlator = (u64)0 << 32 | 0; + KUNIT_EXPECT_PTR_EQ(test, skb, ibmveth_rxq_get_buffer(adapter)); + + flush_work(&adapter->work); +} + +static struct kunit_case ibmveth_test_cases[] = { + KUNIT_CASE(ibmveth_remove_buffer_from_pool_test), + KUNIT_CASE(ibmveth_rxq_get_buffer_test), + {} +}; + +static struct kunit_suite ibmveth_test_suite = { + .name = 
"ibmveth-kunit-test", + .test_cases = ibmveth_test_cases, +}; + +kunit_test_suite(ibmveth_test_suite); +#endif From 069977b383dd5116804316a275e879be5a210310 Mon Sep 17 00:00:00 2001 From: Ruben Wauters Date: Thu, 1 May 2025 21:23:55 +0100 Subject: [PATCH 743/770] ipv4: ip_tunnel: Replace strcpy use with strscpy Use of strcpy is decpreated, replaces the use of strcpy with strscpy as recommended. strscpy was chosen as it requires a NUL terminated non-padded string, which is the case here. I am aware there is an explicit bounds check above the second instance, however using strscpy protects against buffer overflows in any future code, and there is no good reason I can see to not use it. I have also replaced the scrscpy above that had 3 params with the version using 2 params. These are functionally equivalent, but it is cleaner to have both using 2 params. Signed-off-by: Ruben Wauters Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- net/ipv4/ip_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 3913ec89ad207..678b8f96e3e9c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -243,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - strcpy(name, ops->kind); + strscpy(name, ops->kind); strcat(name, "%d"); } From 6999739f1438a42e44dcb563ac07d461846548c1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:29:57 -0700 Subject: [PATCH 744/770] fbnic: Fix initialization of mailbox descriptor rings Address to issues with the FW mailbox descriptor initialization. We need to reverse the order of accesses when we invalidate an entry versus writing an entry. When writing an entry we write upper and then lower as the lower 32b contain the valid bit that makes the entire address valid. However for invalidation we should write it in the reverse order so that the upper is marked invalid before we update it. Without this change we may see FW attempt to access pages with the upper 32b of the address set to 0 which will likely result in DMAR faults due to write access failures on mailbox shutdown. Fixes: da3cde08209e ("eth: fbnic: Add FW communication mechanism") Signed-off-by: Alexander Duyck Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 32 ++++++++++++++++------ 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index 88db3dacb9405..c4956f0a741e8 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -17,11 +17,29 @@ static void __fbnic_mbx_wr_desc(struct fbnic_dev *fbd, int mbx_idx, { u32 desc_offset = FBNIC_IPC_MBX(mbx_idx, desc_idx); + /* Write the upper 32b and then the lower 32b. Doing this the + * FW can then read lower, upper, lower to verify that the state + * of the descriptor wasn't changed mid-transaction. + */ fw_wr32(fbd, desc_offset + 1, upper_32_bits(desc)); fw_wrfl(fbd); fw_wr32(fbd, desc_offset, lower_32_bits(desc)); } +static void __fbnic_mbx_invalidate_desc(struct fbnic_dev *fbd, int mbx_idx, + int desc_idx, u32 desc) +{ + u32 desc_offset = FBNIC_IPC_MBX(mbx_idx, desc_idx); + + /* For initialization we write the lower 32b of the descriptor first. 
+ * This way we can set the state to mark it invalid before we clear the + * upper 32b. + */ + fw_wr32(fbd, desc_offset, desc); + fw_wrfl(fbd); + fw_wr32(fbd, desc_offset + 1, 0); +} + static u64 __fbnic_mbx_rd_desc(struct fbnic_dev *fbd, int mbx_idx, int desc_idx) { u32 desc_offset = FBNIC_IPC_MBX(mbx_idx, desc_idx); @@ -41,21 +59,17 @@ static void fbnic_mbx_init_desc_ring(struct fbnic_dev *fbd, int mbx_idx) * solid stop for the firmware to hit when it is done looping * through the ring. */ - __fbnic_mbx_wr_desc(fbd, mbx_idx, 0, 0); - - fw_wrfl(fbd); + __fbnic_mbx_invalidate_desc(fbd, mbx_idx, 0, 0); /* We then fill the rest of the ring starting at the end and moving * back toward descriptor 0 with skip descriptors that have no * length nor address, and tell the firmware that they can skip * them and just move past them to the one we initialized to 0. */ - for (desc_idx = FBNIC_IPC_MBX_DESC_LEN; --desc_idx;) { - __fbnic_mbx_wr_desc(fbd, mbx_idx, desc_idx, - FBNIC_IPC_MBX_DESC_FW_CMPL | - FBNIC_IPC_MBX_DESC_HOST_CMPL); - fw_wrfl(fbd); - } + for (desc_idx = FBNIC_IPC_MBX_DESC_LEN; --desc_idx;) + __fbnic_mbx_invalidate_desc(fbd, mbx_idx, desc_idx, + FBNIC_IPC_MBX_DESC_FW_CMPL | + FBNIC_IPC_MBX_DESC_HOST_CMPL); } void fbnic_mbx_init(struct fbnic_dev *fbd) From c4642ae33fa2e6dc11ecae3a4102191d0674510b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:30:03 -0700 Subject: [PATCH 745/770] fbnic: Gate AXI read/write enabling on FW mailbox In order to prevent the device from throwing spurious writes and/or reads at us we need to gate the AXI fabric interface to the PCIe until such time as we know the FW is in a known good state. To accomplish this we use the mailbox as a mechanism for us to recognize that the FW has acknowledged our presence and is no longer sending any stale message data to us. We start in fbnic_mbx_init by calling fbnic_mbx_reset_desc_ring function, disabling the DMA in both directions, and then invalidating all the descriptors in each ring. We then poll the mailbox in fbnic_mbx_poll_tx_ready and when the interrupt is set by the FW we pick it up and mark the mailboxes as ready, while also enabling the DMA. Once we have completed all the transactions and need to shut down we call into fbnic_mbx_clean which will in turn call fbnic_mbx_reset_desc_ring for each ring and shut down the DMA and once again invalidate the descriptors. 
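In outline, the intended lifecycle is the following (function names taken from the patch text, flow condensed and not verbatim driver code):

	/* Condensed call flow; illustrative only. */
	fbnic_mbx_init()
	  -> fbnic_mbx_reset_desc_ring()   /* gate DMA (FLUSH), invalidate ring */
	fbnic_mbx_poll_tx_ready()          /* wait for FW to ack via interrupt */
	  -> fbnic_mbx_init_desc_ring()    /* ungate DMA (BME) once FW is ready */
	...
	fbnic_mbx_clean()
	  -> fbnic_mbx_reset_desc_ring()   /* gate DMA again on shutdown */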
Fixes: 3646153161f1 ("eth: fbnic: Add register init to set PCIe/Ethernet device config") Fixes: da3cde08209e ("eth: fbnic: Add FW communication mechanism") Signed-off-by: Alexander Duyck Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 2 ++ drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 38 +++++++++++++++++---- drivers/net/ethernet/meta/fbnic/fbnic_mac.c | 6 ---- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 0c217c195c6a3..36393a17d92d2 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -818,8 +818,10 @@ enum { /* PUL User Registers */ #define FBNIC_CSR_START_PUL_USER 0x31000 /* CSR section delimiter */ #define FBNIC_PUL_OB_TLP_HDR_AW_CFG 0x3103d /* 0xc40f4 */ +#define FBNIC_PUL_OB_TLP_HDR_AW_CFG_FLUSH CSR_BIT(19) #define FBNIC_PUL_OB_TLP_HDR_AW_CFG_BME CSR_BIT(18) #define FBNIC_PUL_OB_TLP_HDR_AR_CFG 0x3103e /* 0xc40f8 */ +#define FBNIC_PUL_OB_TLP_HDR_AR_CFG_FLUSH CSR_BIT(19) #define FBNIC_PUL_OB_TLP_HDR_AR_CFG_BME CSR_BIT(18) #define FBNIC_PUL_USER_OB_RD_TLP_CNT_31_0 \ 0x3106e /* 0xc41b8 */ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index c4956f0a741e8..f4749bfd840c5 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -51,10 +51,26 @@ static u64 __fbnic_mbx_rd_desc(struct fbnic_dev *fbd, int mbx_idx, int desc_idx) return desc; } -static void fbnic_mbx_init_desc_ring(struct fbnic_dev *fbd, int mbx_idx) +static void fbnic_mbx_reset_desc_ring(struct fbnic_dev *fbd, int mbx_idx) { int desc_idx; + /* Disable DMA transactions from the device, + * and flush any transactions triggered during cleaning + */ + switch (mbx_idx) { + case FBNIC_IPC_MBX_RX_IDX: + wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AW_CFG, + FBNIC_PUL_OB_TLP_HDR_AW_CFG_FLUSH); + break; + case FBNIC_IPC_MBX_TX_IDX: + wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AR_CFG, + FBNIC_PUL_OB_TLP_HDR_AR_CFG_FLUSH); + break; + } + + wrfl(fbd); + /* Initialize first descriptor to all 0s. Doing this gives us a * solid stop for the firmware to hit when it is done looping * through the ring. @@ -90,7 +106,7 @@ void fbnic_mbx_init(struct fbnic_dev *fbd) wr32(fbd, FBNIC_INTR_CLEAR(0), 1u << FBNIC_FW_MSIX_ENTRY); for (i = 0; i < FBNIC_IPC_MBX_INDICES; i++) - fbnic_mbx_init_desc_ring(fbd, i); + fbnic_mbx_reset_desc_ring(fbd, i); } static int fbnic_mbx_map_msg(struct fbnic_dev *fbd, int mbx_idx, @@ -155,7 +171,7 @@ static void fbnic_mbx_clean_desc_ring(struct fbnic_dev *fbd, int mbx_idx) { int i; - fbnic_mbx_init_desc_ring(fbd, mbx_idx); + fbnic_mbx_reset_desc_ring(fbd, mbx_idx); for (i = FBNIC_IPC_MBX_DESC_LEN; i--;) fbnic_mbx_unmap_and_free_msg(fbd, mbx_idx, i); @@ -354,7 +370,7 @@ static int fbnic_fw_xmit_cap_msg(struct fbnic_dev *fbd) return (err == -EOPNOTSUPP) ? 
0 : err; } -static void fbnic_mbx_postinit_desc_ring(struct fbnic_dev *fbd, int mbx_idx) +static void fbnic_mbx_init_desc_ring(struct fbnic_dev *fbd, int mbx_idx) { struct fbnic_fw_mbx *mbx = &fbd->mbx[mbx_idx]; @@ -366,10 +382,18 @@ switch (mbx_idx) { case FBNIC_IPC_MBX_RX_IDX: + /* Enable DMA writes from the device */ + wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AW_CFG, + FBNIC_PUL_OB_TLP_HDR_AW_CFG_BME); + /* Make sure we have a page for the FW to write to */ fbnic_mbx_alloc_rx_msgs(fbd); break; case FBNIC_IPC_MBX_TX_IDX: + /* Enable DMA reads from the device */ + wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AR_CFG, + FBNIC_PUL_OB_TLP_HDR_AR_CFG_BME); + /* Force version to 1 if we successfully requested an update * from the firmware. This should be overwritten once we get * the actual version from the firmware in the capabilities @@ -386,7 +410,7 @@ static void fbnic_mbx_postinit(struct fbnic_dev *fbd) { int i; - /* We only need to do this on the first interrupt following init. + /* We only need to do this on the first interrupt following reset. * this primes the mailbox so that we will have cleared all the * skip descriptors. */ @@ -396,7 +420,7 @@ static void fbnic_mbx_postinit(struct fbnic_dev *fbd) wr32(fbd, FBNIC_INTR_CLEAR(0), 1u << FBNIC_FW_MSIX_ENTRY); for (i = 0; i < FBNIC_IPC_MBX_INDICES; i++) - fbnic_mbx_postinit_desc_ring(fbd, i); + fbnic_mbx_init_desc_ring(fbd, i); } /** @@ -894,7 +918,7 @@ int fbnic_mbx_poll_tx_ready(struct fbnic_dev *fbd) * avoid the mailbox getting stuck closed if the interrupt * is reset. */ - fbnic_mbx_init_desc_ring(fbd, FBNIC_IPC_MBX_TX_IDX); + fbnic_mbx_reset_desc_ring(fbd, FBNIC_IPC_MBX_TX_IDX); msleep(200); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c index 14291401f4632..dde4a37116e20 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c @@ -79,12 +79,6 @@ static void fbnic_mac_init_axi(struct fbnic_dev *fbd) fbnic_init_readrq(fbd, FBNIC_QM_RNI_RBP_CTL, cls, readrq); fbnic_init_mps(fbd, FBNIC_QM_RNI_RDE_CTL, cls, mps); fbnic_init_mps(fbd, FBNIC_QM_RNI_RCM_CTL, cls, mps); - - /* Enable XALI AR/AW outbound */ - wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AW_CFG, - FBNIC_PUL_OB_TLP_HDR_AW_CFG_BME); - wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AR_CFG, - FBNIC_PUL_OB_TLP_HDR_AR_CFG_BME); } static void fbnic_mac_init_qm(struct fbnic_dev *fbd) From 51ea680d3e0875b0f9424e66d1dab084539afdd4 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:30:10 -0700 Subject: [PATCH 746/770] fbnic: Add additional handling of IRQs We have two issues that need to be addressed in our IRQ handling. One is the fact that we can end up double-freeing IRQs in the event of an exception handling error such as a PCIe reset/recovery that fails. To prevent that from becoming an issue we can use the msix_vector values to indicate that we have successfully requested/freed the IRQ by only setting or clearing them when we have completed the given action. The other issue is that we have several potential races in our IRQ path due to us manipulating the mask before the vector has been truly disabled. In order to handle that in the case of the FW mailbox we need to not auto-enable the IRQ and instead will be enabling/disabling it separately. In the case of the PCS vector we can mitigate this by unmapping it and synchronizing the IRQ before we clear the mask. 
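Distilled, the guard against the double free is the usual owner-token pattern; a sketch with hypothetical handler/flags/name arguments (the real code follows in the diff below):

	/* Record the vector only after the request succeeds ... */
	err = request_irq(vector, handler, flags, name, fbd);
	if (err)
		return err;
	fbd->fw_msix_vector = vector;

	/* ... and treat a zero vector as "nothing to free". */
	if (!fbd->fw_msix_vector)
		return;
	free_irq(fbd->fw_msix_vector, fbd);
	fbd->fw_msix_vector = 0;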
The general order of operations after this change is now to request the interrupt, poll the FW mailbox to ready, and then enable the interrupt. For the shutdown we do the reverse where we disable the interrupt, flush any pending Tx, and then free the IRQ. I am renaming the enable/disable to request/free to be consistent with the IRQ calls being used. We may see additions in the future to enable/disable the IRQs versus request/free them for certain use cases. Fixes: da3cde08209e ("eth: fbnic: Add FW communication mechanism") Fixes: 69684376eed5 ("eth: fbnic: Add link detection") Signed-off-by: Alexander Duyck Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic.h | 8 +- drivers/net/ethernet/meta/fbnic/fbnic_irq.c | 142 ++++++++++++------ .../net/ethernet/meta/fbnic/fbnic_netdev.c | 5 +- drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 14 +- 4 files changed, 110 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 80d54edaac557..ad01ed05d78bb 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -157,14 +157,14 @@ struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev); void fbnic_devlink_register(struct fbnic_dev *fbd); void fbnic_devlink_unregister(struct fbnic_dev *fbd); -int fbnic_fw_enable_mbx(struct fbnic_dev *fbd); -void fbnic_fw_disable_mbx(struct fbnic_dev *fbd); +int fbnic_fw_request_mbx(struct fbnic_dev *fbd); +void fbnic_fw_free_mbx(struct fbnic_dev *fbd); void fbnic_hwmon_register(struct fbnic_dev *fbd); void fbnic_hwmon_unregister(struct fbnic_dev *fbd); -int fbnic_pcs_irq_enable(struct fbnic_dev *fbd); -void fbnic_pcs_irq_disable(struct fbnic_dev *fbd); +int fbnic_pcs_request_irq(struct fbnic_dev *fbd); +void fbnic_pcs_free_irq(struct fbnic_dev *fbd); void fbnic_napi_name_irqs(struct fbnic_dev *fbd); int fbnic_napi_request_irq(struct fbnic_dev *fbd, diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c index 1bbc0e56f3a03..1c88a2bf3a7a7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_irq.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_irq.c @@ -19,69 +19,105 @@ static irqreturn_t fbnic_fw_msix_intr(int __always_unused irq, void *data) return IRQ_HANDLED; } +static int __fbnic_fw_enable_mbx(struct fbnic_dev *fbd, int vector) +{ + int err; + + /* Initialize mailbox and attempt to poll it into ready state */ + fbnic_mbx_init(fbd); + err = fbnic_mbx_poll_tx_ready(fbd); + if (err) { + dev_warn(fbd->dev, "FW mailbox did not enter ready state\n"); + return err; + } + + /* Enable interrupt and unmask the vector */ + enable_irq(vector); + fbnic_wr32(fbd, FBNIC_INTR_MASK_CLEAR(0), 1u << FBNIC_FW_MSIX_ENTRY); + + return 0; +} + /** - * fbnic_fw_enable_mbx - Configure and initialize Firmware Mailbox + * fbnic_fw_request_mbx - Configure and initialize Firmware Mailbox * @fbd: Pointer to device to initialize * - * This function will initialize the firmware mailbox rings, enable the IRQ - * and initialize the communication between the Firmware and the host. The - * firmware is expected to respond to the initialization by sending an - * interrupt essentially notifying the host that it has seen the - * initialization and is now synced up. + * This function will allocate the IRQ and then reinitialize the mailbox + * starting communication between the host and firmware. * * Return: non-zero on failure.
**/ -int fbnic_fw_enable_mbx(struct fbnic_dev *fbd) +int fbnic_fw_request_mbx(struct fbnic_dev *fbd) { - u32 vector = fbd->fw_msix_vector; - int err; + struct pci_dev *pdev = to_pci_dev(fbd->dev); + int vector, err; + + WARN_ON(fbd->fw_msix_vector); + + vector = pci_irq_vector(pdev, FBNIC_FW_MSIX_ENTRY); + if (vector < 0) + return vector; /* Request the IRQ for FW Mailbox vector. */ err = request_threaded_irq(vector, NULL, &fbnic_fw_msix_intr, - IRQF_ONESHOT, dev_name(fbd->dev), fbd); + IRQF_ONESHOT | IRQF_NO_AUTOEN, + dev_name(fbd->dev), fbd); if (err) return err; /* Initialize mailbox and attempt to poll it into ready state */ - fbnic_mbx_init(fbd); - err = fbnic_mbx_poll_tx_ready(fbd); - if (err) { - dev_warn(fbd->dev, "FW mailbox did not enter ready state\n"); + err = __fbnic_fw_enable_mbx(fbd, vector); + if (err) free_irq(vector, fbd); - return err; - } - /* Enable interrupts */ - fbnic_wr32(fbd, FBNIC_INTR_MASK_CLEAR(0), 1u << FBNIC_FW_MSIX_ENTRY); + fbd->fw_msix_vector = vector; - return 0; + return err; } /** - * fbnic_fw_disable_mbx - Disable mailbox and place it in standby state - * @fbd: Pointer to device to disable + * fbnic_fw_disable_mbx - Temporarily place mailbox in standby state + * @fbd: Pointer to device * - * This function will disable the mailbox interrupt, free any messages still - * in the mailbox and place it into a standby state. The firmware is - * expected to see the update and assume that the host is in the reset state. + * Shutdown the mailbox by notifying the firmware to stop sending us logs, mask + * and synchronize the IRQ, and then clean up the rings. **/ -void fbnic_fw_disable_mbx(struct fbnic_dev *fbd) +static void fbnic_fw_disable_mbx(struct fbnic_dev *fbd) { - /* Disable interrupt and free vector */ - fbnic_wr32(fbd, FBNIC_INTR_MASK_SET(0), 1u << FBNIC_FW_MSIX_ENTRY); + /* Disable interrupt and synchronize the IRQ */ + disable_irq(fbd->fw_msix_vector); - /* Free the vector */ - free_irq(fbd->fw_msix_vector, fbd); + /* Mask the vector */ + fbnic_wr32(fbd, FBNIC_INTR_MASK_SET(0), 1u << FBNIC_FW_MSIX_ENTRY); /* Make sure disabling logs message is sent, must be done here to * avoid risk of completing without a running interrupt. */ fbnic_mbx_flush_tx(fbd); - - /* Reset the mailboxes to the initialized state */ fbnic_mbx_clean(fbd); } +/** + * fbnic_fw_free_mbx - Disable mailbox and place it in standby state + * @fbd: Pointer to device to disable + * + * This function will disable the mailbox interrupt, free any messages still + * in the mailbox and place it into a disabled state. The firmware is + * expected to see the update and assume that the host is in the reset state. + **/ +void fbnic_fw_free_mbx(struct fbnic_dev *fbd) +{ + /* Vector has already been freed */ + if (!fbd->fw_msix_vector) + return; + + fbnic_fw_disable_mbx(fbd); + + /* Free the vector */ + free_irq(fbd->fw_msix_vector, fbd); + fbd->fw_msix_vector = 0; +} + static irqreturn_t fbnic_pcs_msix_intr(int __always_unused irq, void *data) { struct fbnic_dev *fbd = data; @@ -101,7 +137,7 @@ static irqreturn_t fbnic_pcs_msix_intr(int __always_unused irq, void *data) } /** - * fbnic_pcs_irq_enable - Configure the MAC to enable it to advertise link + * fbnic_pcs_request_irq - Configure the PCS to enable it to advertise link * @fbd: Pointer to device to initialize * * This function provides basic bringup for the MAC/PCS IRQ. For now the IRQ @@ -109,41 +145,61 @@ static irqreturn_t fbnic_pcs_msix_intr(int __always_unused irq, void *data) * * Return: non-zero on failure. 
**/ -int fbnic_pcs_irq_enable(struct fbnic_dev *fbd) +int fbnic_pcs_request_irq(struct fbnic_dev *fbd) { - u32 vector = fbd->pcs_msix_vector; - int err; + struct pci_dev *pdev = to_pci_dev(fbd->dev); + int vector, err; - /* Request the IRQ for MAC link vector. - * Map MAC cause to it, and unmask it + WARN_ON(fbd->pcs_msix_vector); + + vector = pci_irq_vector(pdev, FBNIC_PCS_MSIX_ENTRY); + if (vector < 0) + return vector; + + /* Request the IRQ for PCS link vector. + * Map PCS cause to it, and unmask it */ err = request_irq(vector, &fbnic_pcs_msix_intr, 0, fbd->netdev->name, fbd); if (err) return err; + /* Map and enable interrupt, unmask vector after link is configured */ fbnic_wr32(fbd, FBNIC_INTR_MSIX_CTRL(FBNIC_INTR_MSIX_CTRL_PCS_IDX), FBNIC_PCS_MSIX_ENTRY | FBNIC_INTR_MSIX_CTRL_ENABLE); + fbd->pcs_msix_vector = vector; + return 0; } /** - * fbnic_pcs_irq_disable - Teardown the MAC IRQ to prepare for stopping + * fbnic_pcs_free_irq - Teardown the PCS IRQ to prepare for stopping * @fbd: Pointer to device that is stopping * - * This function undoes the work done in fbnic_pcs_irq_enable and prepares + * This function undoes the work done in fbnic_pcs_request_irq and prepares * the device to no longer receive traffic on the host interface. **/ -void fbnic_pcs_irq_disable(struct fbnic_dev *fbd) +void fbnic_pcs_free_irq(struct fbnic_dev *fbd) { + /* Vector has already been freed */ + if (!fbd->pcs_msix_vector) + return; + /* Disable interrupt */ fbnic_wr32(fbd, FBNIC_INTR_MSIX_CTRL(FBNIC_INTR_MSIX_CTRL_PCS_IDX), FBNIC_PCS_MSIX_ENTRY); + fbnic_wrfl(fbd); + + /* Synchronize IRQ to prevent race that would unmask vector */ + synchronize_irq(fbd->pcs_msix_vector); + + /* Mask the vector */ fbnic_wr32(fbd, FBNIC_INTR_MASK_SET(0), 1u << FBNIC_PCS_MSIX_ENTRY); /* Free the vector */ free_irq(fbd->pcs_msix_vector, fbd); + fbd->pcs_msix_vector = 0; } void fbnic_synchronize_irq(struct fbnic_dev *fbd, int nr) @@ -226,9 +282,6 @@ void fbnic_free_irqs(struct fbnic_dev *fbd) { struct pci_dev *pdev = to_pci_dev(fbd->dev); - fbd->pcs_msix_vector = 0; - fbd->fw_msix_vector = 0; - fbd->num_irqs = 0; pci_free_irq_vectors(pdev); @@ -254,8 +307,5 @@ int fbnic_alloc_irqs(struct fbnic_dev *fbd) fbd->num_irqs = num_irqs; - fbd->pcs_msix_vector = pci_irq_vector(pdev, FBNIC_PCS_MSIX_ENTRY); - fbd->fw_msix_vector = pci_irq_vector(pdev, FBNIC_FW_MSIX_ENTRY); - return 0; } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index d699f58dda211..aefc5661fbaf1 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -44,9 +44,10 @@ int __fbnic_open(struct fbnic_net *fbn) if (err) goto time_stop; - err = fbnic_pcs_irq_enable(fbd); + err = fbnic_pcs_request_irq(fbd); if (err) goto time_stop; + /* Pull the BMC config and initialize the RPC */ fbnic_bmc_rpc_init(fbd); fbnic_rss_reinit(fbd, fbn); @@ -82,7 +83,7 @@ static int fbnic_stop(struct net_device *netdev) struct fbnic_net *fbn = netdev_priv(netdev); fbnic_down(fbn); - fbnic_pcs_irq_disable(fbn->fbd); + fbnic_pcs_free_irq(fbn->fbd); fbnic_time_stop(fbn); fbnic_fw_xmit_ownership_msg(fbn->fbd, false); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 1f76ebdd6ad18..70a852b3e99d0 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -283,7 +283,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto free_irqs; } - err 
= fbnic_fw_enable_mbx(fbd); + err = fbnic_fw_request_mbx(fbd); if (err) { dev_err(&pdev->dev, "Firmware mailbox initialization failure\n"); @@ -364,7 +364,7 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_hwmon_unregister(fbd); fbnic_dbg_fbd_exit(fbd); fbnic_devlink_unregister(fbd); - fbnic_fw_disable_mbx(fbd); + fbnic_fw_free_mbx(fbd); fbnic_free_irqs(fbd); fbnic_devlink_free(fbd); @@ -388,7 +388,7 @@ static int fbnic_pm_suspend(struct device *dev) rtnl_unlock(); null_uc_addr: - fbnic_fw_disable_mbx(fbd); + fbnic_fw_free_mbx(fbd); /* Free the IRQs so they aren't trying to occupy sleeping CPUs */ fbnic_free_irqs(fbd); @@ -421,7 +421,7 @@ static int __fbnic_pm_resume(struct device *dev) fbd->mac->init_regs(fbd); /* Re-enable mailbox */ - err = fbnic_fw_enable_mbx(fbd); + err = fbnic_fw_request_mbx(fbd); if (err) goto err_free_irqs; @@ -439,15 +439,15 @@ static int __fbnic_pm_resume(struct device *dev) if (netif_running(netdev)) { err = __fbnic_open(fbn); if (err) - goto err_disable_mbx; + goto err_free_mbx; } rtnl_unlock(); return 0; -err_disable_mbx: +err_free_mbx: rtnl_unlock(); - fbnic_fw_disable_mbx(fbd); + fbnic_fw_free_mbx(fbd); err_free_irqs: fbnic_free_irqs(fbd); err_invalidate_uc_addr: From dc414a6807a2c07593763822aef4d0a17779fc77 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:30:16 -0700 Subject: [PATCH 747/770] fbnic: Actually flush_tx instead of stalling out The fbnic_mbx_flush_tx function had a number of issues. First, we were waiting 200ms for the firmware to process the packets. We can drop this to 20ms and in almost all cases this should be more than enough time. So by changing this we can significantly reduce shutdown time. Second, we were not making sure that the Tx path was actually shut off. As such we could still have packets added while we were flushing the mailbox. To prevent that we can now clear the ready flag for the Tx side and it should stay down since the interrupt is disabled. Third, we kept re-reading the tail due to the second issue. The tail should not move after we have started the flush so we can just read it once while we are holding the mailbox Tx lock. By doing that we are guaranteed that the value should be consistent. Fourth, we were keeping a count of descriptors cleaned due to the second and third issues called out. That count is not a valid reason to be exiting the cleanup, and with the tail only being read once we shouldn't see any cases where the tail moves after the disable so the tracking of count can be dropped. Fifth, we were using attempts * sleep time to determine how long we would wait in our polling loop to flush out the Tx. This can be very imprecise. In order to tighten up the timing we are shifting over to using a deadline of jiffies + 10 * HZ + 1 to determine when we should stop polling, as this should be accurate to within one sleep cycle for the total amount of time spent polling.
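For reference, the deadline pattern described above takes roughly the following shape. This is a minimal sketch, not the driver code; poll_done() is a hypothetical stand-in for the "head == tail" check performed in fbnic_mbx_flush_tx().

#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Poll until poll_done() reports completion or the deadline expires.
 * A single deadline bounds the whole loop, so the total time spent
 * polling is accurate to within one sleep cycle regardless of how
 * long the individual iterations take.
 */
static int poll_with_deadline(void *ctx, bool (*poll_done)(void *))
{
	unsigned long timeout = jiffies + 10 * HZ + 1;

	do {
		if (poll_done(ctx))
			return 0;
		msleep(20);	/* short sleeps keep exit latency low */
	} while (time_is_after_jiffies(timeout));

	return -ETIMEDOUT;
}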
Fixes: da3cde08209e ("eth: fbnic: Add FW communication mechanism") Signed-off-by: Alexander Duyck Reviewed-by: Simon Horman Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 31 +++++++++++----------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index f4749bfd840c5..d019191d6ae97 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -930,35 +930,36 @@ int fbnic_mbx_poll_tx_ready(struct fbnic_dev *fbd) void fbnic_mbx_flush_tx(struct fbnic_dev *fbd) { + unsigned long timeout = jiffies + 10 * HZ + 1; struct fbnic_fw_mbx *tx_mbx; - int attempts = 50; - u8 count = 0; - - /* Nothing to do if there is no mailbox */ - if (!fbnic_fw_present(fbd)) - return; + u8 tail; /* Record current Rx stats */ tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX]; - /* Nothing to do if mailbox never got to ready */ - if (!tx_mbx->ready) - return; + spin_lock_irq(&fbd->fw_tx_lock); + + /* Clear ready to prevent any further attempts to transmit */ + tx_mbx->ready = false; + + /* Read tail to determine the last tail state for the ring */ + tail = tx_mbx->tail; + + spin_unlock_irq(&fbd->fw_tx_lock); /* Give firmware time to process packet, - * we will wait up to 10 seconds which is 50 waits of 200ms. + * we will wait up to 10 seconds which is 500 waits of 20ms. */ do { u8 head = tx_mbx->head; - if (head == tx_mbx->tail) + /* Tx ring is empty once head == tail */ + if (head == tail) break; - msleep(200); + msleep(20); fbnic_mbx_process_tx_msgs(fbd); - - count += (tx_mbx->head - head) % FBNIC_IPC_MBX_DESC_LEN; - } while (count < FBNIC_IPC_MBX_DESC_LEN && --attempts); + } while (time_is_after_jiffies(timeout)); } void fbnic_get_fw_ver_commit_str(struct fbnic_dev *fbd, char *fw_version, From 0d607f6c1ac3cc7791ba534d5dc757e258217b45 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:30:22 -0700 Subject: [PATCH 748/770] fbnic: Cleanup handling of completions There was an issue in that if we were to shutdown we could be left with a completion in flight as the mailbox went away. To address that I have added an fbnic_mbx_evict_all_cmpl function that is meant to essentially create a "broken pipe" type response so that all callers will receive an error indicating that the connection has been broken as a result of us shutting down the mailbox. In addition the naming was inconsistent between the creation and clearing of completions. Since the cmpl seems to be the common suffix to use for the handling of cmpl_data I went through and renamed fbnic_fw_clear_compl to fbnic_fw_clear_cmpl. Fixes: 378e5cc1c6c6 ("eth: fbnic: hwmon: Add completion infrastructure for firmware requests") Signed-off-by: Alexander Duyck Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 22 ++++++++++++++++++++- drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 2 +- drivers/net/ethernet/meta/fbnic/fbnic_mac.c | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index d019191d6ae97..efc0176f1a9a1 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -928,6 +928,23 @@ int fbnic_mbx_poll_tx_ready(struct fbnic_dev *fbd) return attempts ? 
0 : -ETIMEDOUT; } +static void __fbnic_fw_evict_cmpl(struct fbnic_fw_completion *cmpl_data) +{ + cmpl_data->result = -EPIPE; + complete(&cmpl_data->done); +} + +static void fbnic_mbx_evict_all_cmpl(struct fbnic_dev *fbd) +{ + struct fbnic_fw_completion *cmpl_data; + + cmpl_data = fbd->cmpl_data; + if (cmpl_data) + __fbnic_fw_evict_cmpl(cmpl_data); + + memset(&fbd->cmpl_data, 0, sizeof(fbd->cmpl_data)); +} + void fbnic_mbx_flush_tx(struct fbnic_dev *fbd) { unsigned long timeout = jiffies + 10 * HZ + 1; @@ -945,6 +962,9 @@ void fbnic_mbx_flush_tx(struct fbnic_dev *fbd) /* Read tail to determine the last tail state for the ring */ tail = tx_mbx->tail; + /* Flush any completions as we are no longer processing Rx */ + fbnic_mbx_evict_all_cmpl(fbd); + spin_unlock_irq(&fbd->fw_tx_lock); /* Give firmware time to process packet, @@ -983,7 +1003,7 @@ void fbnic_fw_init_cmpl(struct fbnic_fw_completion *fw_cmpl, kref_init(&fw_cmpl->ref_count); } -void fbnic_fw_clear_compl(struct fbnic_dev *fbd) +void fbnic_fw_clear_cmpl(struct fbnic_dev *fbd) { unsigned long flags; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index a3618e7826c25..2d5e0ff1982c2 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -69,7 +69,7 @@ int fbnic_fw_xmit_tsene_read_msg(struct fbnic_dev *fbd, struct fbnic_fw_completion *cmpl_data); void fbnic_fw_init_cmpl(struct fbnic_fw_completion *cmpl_data, u32 msg_type); -void fbnic_fw_clear_compl(struct fbnic_dev *fbd); +void fbnic_fw_clear_cmpl(struct fbnic_dev *fbd); void fbnic_fw_put_cmpl(struct fbnic_fw_completion *cmpl_data); #define fbnic_mk_full_fw_ver_str(_rev_id, _delim, _commit, _str, _str_sz) \ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c index dde4a37116e20..7e54f82535f65 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_mac.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_mac.c @@ -744,7 +744,7 @@ static int fbnic_mac_get_sensor_asic(struct fbnic_dev *fbd, int id, *val = *sensor; exit_cleanup: - fbnic_fw_clear_compl(fbd); + fbnic_fw_clear_cmpl(fbd); exit_free: fbnic_fw_put_cmpl(fw_cmpl); From 3bbde15a33286035faba49fef781fd93c23bea3d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 1 May 2025 16:30:30 -0700 Subject: [PATCH 749/770] fbnic: Pull fbnic_fw_xmit_cap_msg use out of interrupt context This change pulls the call to fbnic_fw_xmit_cap_msg out of fbnic_mbx_init_desc_ring and instead places it in the polling function for getting the Tx ready. Doing that, we avoid the potential issue of a later interrupt from the firmware causing the message to be sent from interrupt context. In addition we can add additional verification to the poll_tx_ready function to make sure that the mailbox is actually ready by verifying that it has populated the capabilities from the firmware. This is important as the link config relies on this and we were previously delaying this until the open call was made, which would force the capabilities message to be processed then. This resolves potential issues with the link state being inconsistent between the netdev being registered and the open call being made. Lastly we can make the overall mailbox poll-to-ready more reliable/responsive by reducing the overall sleep time and using a jiffies-based timeout method instead of relying on X number of sleeps/"attempts".
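The deferred-error idiom used by the new poll loop can be sketched generically as follows. This is a simplified illustration assuming a hypothetical done() predicate; the real loop in the diff below also resets the descriptor ring and checks tx_mbx->ready.

#include <linux/jiffies.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/types.h>

/* Record the timeout but let the loop condition decide: if the awaited
 * condition became true on the same pass that the deadline expired,
 * success wins and the error is never reported.
 */
static int poll_deferred_error(void *ctx, bool (*done)(void *))
{
	unsigned long timeout = jiffies + 10 * HZ + 1;
	int err = 0;

	while (!done(ctx)) {
		if (err)
			return err;

		msleep(20);

		if (!time_is_after_jiffies(timeout))
			err = -ETIMEDOUT;	/* reported on the next pass */
	}

	return 0;
}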
Fixes: 20d2e88cc746 ("eth: fbnic: Add initial messaging to notify FW of our presence") Signed-off-by: Alexander Duyck Signed-off-by: NipaLocal --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 98 +++++++++++----------- 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index efc0176f1a9a1..0452ea573d4a0 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -95,6 +95,9 @@ void fbnic_mbx_init(struct fbnic_dev *fbd) /* Initialize lock to protect Tx ring */ spin_lock_init(&fbd->fw_tx_lock); + /* Reset FW Capabilities */ + memset(&fbd->fw_cap, 0, sizeof(fbd->fw_cap)); + /* Reinitialize mailbox memory */ for (i = 0; i < FBNIC_IPC_MBX_INDICES; i++) memset(&fbd->mbx[i], 0, sizeof(struct fbnic_fw_mbx)); @@ -352,32 +355,10 @@ static int fbnic_fw_xmit_simple_msg(struct fbnic_dev *fbd, u32 msg_type) return err; } -/** - * fbnic_fw_xmit_cap_msg - Allocate and populate a FW capabilities message - * @fbd: FBNIC device structure - * - * Return: NULL on failure to allocate, error pointer on error, or pointer - * to new TLV test message. - * - * Sends a single TLV header indicating the host wants the firmware to - * confirm the capabilities and version. - **/ -static int fbnic_fw_xmit_cap_msg(struct fbnic_dev *fbd) -{ - int err = fbnic_fw_xmit_simple_msg(fbd, FBNIC_TLV_MSG_ID_HOST_CAP_REQ); - - /* Return 0 if we are not calling this on ASIC */ - return (err == -EOPNOTSUPP) ? 0 : err; -} - static void fbnic_mbx_init_desc_ring(struct fbnic_dev *fbd, int mbx_idx) { struct fbnic_fw_mbx *mbx = &fbd->mbx[mbx_idx]; - /* This is a one time init, so just exit if it is completed */ - if (mbx->ready) - return; - mbx->ready = true; switch (mbx_idx) { @@ -393,34 +374,22 @@ static void fbnic_mbx_init_desc_ring(struct fbnic_dev *fbd, int mbx_idx) /* Enable DMA reads from the device */ wr32(fbd, FBNIC_PUL_OB_TLP_HDR_AR_CFG, FBNIC_PUL_OB_TLP_HDR_AR_CFG_BME); - - /* Force version to 1 if we successfully requested an update - * from the firmware. This should be overwritten once we get - * the actual version from the firmware in the capabilities - * request message. - */ - if (!fbnic_fw_xmit_cap_msg(fbd) && - !fbd->fw_cap.running.mgmt.version) - fbd->fw_cap.running.mgmt.version = 1; break; } } -static void fbnic_mbx_postinit(struct fbnic_dev *fbd) +static bool fbnic_mbx_event(struct fbnic_dev *fbd) { - int i; - /* We only need to do this on the first interrupt following reset. * this primes the mailbox so that we will have cleared all the * skip descriptors. 
*/ if (!(rd32(fbd, FBNIC_INTR_STATUS(0)) & (1u << FBNIC_FW_MSIX_ENTRY))) - return; + return false; wr32(fbd, FBNIC_INTR_CLEAR(0), 1u << FBNIC_FW_MSIX_ENTRY); - for (i = 0; i < FBNIC_IPC_MBX_INDICES; i++) - fbnic_mbx_init_desc_ring(fbd, i); + return true; } /** @@ -897,7 +866,7 @@ static void fbnic_mbx_process_rx_msgs(struct fbnic_dev *fbd) void fbnic_mbx_poll(struct fbnic_dev *fbd) { - fbnic_mbx_postinit(fbd); + fbnic_mbx_event(fbd); fbnic_mbx_process_tx_msgs(fbd); fbnic_mbx_process_rx_msgs(fbd); @@ -905,27 +874,62 @@ void fbnic_mbx_poll(struct fbnic_dev *fbd) int fbnic_mbx_poll_tx_ready(struct fbnic_dev *fbd) { - struct fbnic_fw_mbx *tx_mbx; - int attempts = 50; + struct fbnic_fw_mbx *tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX]; + unsigned long timeout = jiffies + 10 * HZ + 1; + int err, i; - /* Immediate fail if BAR4 isn't there */ - if (!fbnic_fw_present(fbd)) - return -ENODEV; + do { + if (!time_is_after_jiffies(timeout)) + return -ETIMEDOUT; - tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX]; - while (!tx_mbx->ready && --attempts) { /* Force the firmware to trigger an interrupt response to * avoid the mailbox getting stuck closed if the interrupt * is reset. */ fbnic_mbx_reset_desc_ring(fbd, FBNIC_IPC_MBX_TX_IDX); - msleep(200); + /* Immediate fail if BAR4 went away */ + if (!fbnic_fw_present(fbd)) + return -ENODEV; + + msleep(20); + } while (!fbnic_mbx_event(fbd)); + + /* FW has shown signs of life. Enable DMA and start Tx/Rx */ + for (i = 0; i < FBNIC_IPC_MBX_INDICES; i++) + fbnic_mbx_init_desc_ring(fbd, i); + /* Request an update from the firmware. This should overwrite + * mgmt.version once we get the actual version from the firmware + * in the capabilities request message. + */ + err = fbnic_fw_xmit_simple_msg(fbd, FBNIC_TLV_MSG_ID_HOST_CAP_REQ); + if (err) + goto clean_mbx; + + /* Poll until we get a current management firmware version, use "1" + * to indicate we entered the polling state waiting for a response + */ + for (fbd->fw_cap.running.mgmt.version = 1; + fbd->fw_cap.running.mgmt.version < MIN_FW_VERSION_CODE;) { + if (!tx_mbx->ready) + err = -ENODEV; + if (err) + goto clean_mbx; + + msleep(20); fbnic_mbx_poll(fbd); + + /* set err, but wait till mgmt.version check to report it */ + if (!time_is_after_jiffies(timeout)) + err = -ETIMEDOUT; } - return attempts ? 0 : -ETIMEDOUT; + return 0; +clean_mbx: + /* Cleanup Rx buffers and disable mailbox */ + fbnic_mbx_clean(fbd); + return err; } static void __fbnic_fw_evict_cmpl(struct fbnic_fw_completion *cmpl_data) From 96ae1f3392ab114cf20ea5b3350d521f0f2e8d84 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 2 May 2025 00:38:15 +0100 Subject: [PATCH 750/770] sctp: Remove unused sctp_assoc_del_peer and sctp_chunk_iif sctp_assoc_del_peer() last use was removed in 2015 by commit 73e6742027f5 ("sctp: Do not try to search for the transport twice") which now uses rm_peer instead of del_peer. sctp_chunk_iif() last use was removed in 2016 by commit 1f45f78f8e51 ("sctp: allow GSO frags to access the chunk too") Remove them. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: NipaLocal --- include/net/sctp/sm.h | 1 - include/net/sctp/structs.h | 2 -- net/sctp/associola.c | 18 ------------------ net/sctp/sm_make_chunk.c | 8 -------- 4 files changed, 29 deletions(-) diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb24..3bfd261a53cc9 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index dcd288fa1bb6f..1ad7ce71d0a78 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -2152,8 +2152,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 760152e751c7c..5793d71852b83 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -736,24 +736,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } -/* Delete a transport address from an association. */ -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr) -{ - struct list_head *pos; - struct list_head *temp; - struct sctp_transport *transport; - - list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { - transport = list_entry(pos, struct sctp_transport, transports); - if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { - /* Do book keeping for removing the peer and free it. */ - sctp_assoc_rm_peer(asoc, transport); - break; - } - } -} - /* Lookup a transport by address. */ struct sctp_transport *sctp_assoc_lookup_paddr( const struct sctp_association *asoc, diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f80208edd6a5c..3ead591c72fd3 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -115,14 +115,6 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) skb->destructor = sctp_control_release_owner; } -/* What was the inbound interface for this chunk? 
*/ -int sctp_chunk_iif(const struct sctp_chunk *chunk) -{ - struct sk_buff *skb = chunk->skb; - - return SCTP_INPUT_CB(skb)->af->skb_iif(skb); -} - /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of From 7499672a0ec95d4f2e3fbfa4e2aa2c9a753dc4c7 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Fri, 2 May 2025 12:22:20 +0800 Subject: [PATCH 751/770] selftests: iou-zcrx: Clean up build warnings for error format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up two build warnings: [1]: iou-zcrx.c: In function ‘process_recvzc’: iou-zcrx.c:263:37: warning: too many arguments for format [-Wformat-extra-args] 263 | error(1, 0, "payload mismatch at ", i); | ^~~~~~~~~~~~~~~~~~~~~~ [2]: iou-zcrx.c: In function ‘run_client’: iou-zcrx.c:357:47: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘ssize_t’ {aka ‘long int’} [-Wformat=] 357 | error(1, 0, "send(): %d", sent); | ~^ ~~~~ | | | | int ssize_t {aka long int} | %ld Signed-off-by: Haiyue Wang Signed-off-by: NipaLocal --- tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c index 8aa426014c87b..6eb95983a7e98 100644 --- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -260,7 +260,7 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe) for (i = 0; i < n; i++) { if (*(data + i) != payload[(received + i)]) - error(1, 0, "payload mismatch at ", i); + error(1, 0, "payload mismatch at %d", i); } received += n; @@ -354,7 +354,7 @@ static void run_client(void) chunk = min_t(ssize_t, cfg_payload_len, to_send); res = send(fd, src, chunk, 0); if (res < 0) - error(1, 0, "send(): %d", sent); + error(1, 0, "send(): %ld", sent); sent += res; to_send -= res; } From 0d5eaf3cc66f205ae11a79d06ed53bc74fa29381 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:41 +0200 Subject: [PATCH 752/770] dt-bindings: net: Add MTIP L2 switch description This patch provides a description of the MTIP L2 switch available in some NXP SoCs, e.g. the imx287. Signed-off-by: Lukasz Majewski Reviewed-by: Stefan Wahren Signed-off-by: NipaLocal --- .../bindings/net/nxp,imx28-mtip-switch.yaml | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/nxp,imx28-mtip-switch.yaml diff --git a/Documentation/devicetree/bindings/net/nxp,imx28-mtip-switch.yaml b/Documentation/devicetree/bindings/net/nxp,imx28-mtip-switch.yaml new file mode 100644 index 0000000000000..35f1fe268de7b --- /dev/null +++ b/Documentation/devicetree/bindings/net/nxp,imx28-mtip-switch.yaml @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/net/nxp,imx28-mtip-switch.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: NXP SoC Ethernet Switch Controller (L2 MoreThanIP switch) + +maintainers: + - Lukasz Majewski + +description: + The 2-port switch ethernet subsystem provides ethernet packet (L2) + communication and can be configured as an ethernet switch. It provides the + reduced media independent interface (RMII) and the management data input + output (MDIO) for physical layer device (PHY) management.
+ +properties: + compatible: + const: nxp,imx28-mtip-switch + + reg: + maxItems: 1 + + phy-supply: + description: + Regulator that powers Ethernet PHYs. + + clocks: + items: + - description: Register accessing clock + - description: Bus access clock + - description: Output clock for external device - e.g. PHY source clock + - description: IEEE1588 timer clock + + clock-names: + items: + - const: ipg + - const: ahb + - const: enet_out + - const: ptp + + interrupts: + items: + - description: Switch interrupt + - description: ENET0 interrupt + - description: ENET1 interrupt + + interrupt-names: + items: + - const: enet_switch + - const: enet0 + - const: enet1 + + pinctrl-names: true + + ethernet-ports: + type: object + $ref: ethernet-switch.yaml#/$defs/ethernet-ports/patternProperties + additionalProperties: true + + patternProperties: + '^ethernet-port@[12]$': + type: object + additionalProperties: true + properties: + reg: + items: + - enum: [1, 2] + description: MTIP L2 switch port number + + required: + - reg + - label + - phy-mode + - phy-handle + + mdio: + type: object + $ref: mdio.yaml# + unevaluatedProperties: false + description: + Specifies the mdio bus in the switch, used as a container for phy nodes. + +required: + - compatible + - reg + - clocks + - clock-names + - interrupts + - interrupt-names + - mdio + - ethernet-ports + +unevaluatedProperties: false + +examples: + - | + #include + #include + switch@800f0000 { + compatible = "nxp,imx28-mtip-switch"; + reg = <0x800f0000 0x20000>; + pinctrl-names = "default"; + pinctrl-0 = <&mac0_pins_a>, <&mac1_pins_a>; + phy-supply = <®_fec_3v3>; + interrupts = <100>, <101>, <102>; + interrupt-names = "enet_switch", "enet0", "enet1"; + clocks = <&clks 57>, <&clks 57>, <&clks 64>, <&clks 35>; + clock-names = "ipg", "ahb", "enet_out", "ptp"; + + ethernet-ports { + #address-cells = <1>; + #size-cells = <0>; + + mtip_port1: ethernet-port@1 { + reg = <1>; + label = "lan0"; + local-mac-address = [ 00 00 00 00 00 00 ]; + phy-mode = "rmii"; + phy-handle = <ðphy0>; + }; + + mtip_port2: ethernet-port@2 { + reg = <2>; + label = "lan1"; + local-mac-address = [ 00 00 00 00 00 00 ]; + phy-mode = "rmii"; + phy-handle = <ðphy1>; + }; + }; + + mdio_sw: mdio { + #address-cells = <1>; + #size-cells = <0>; + + reset-gpios = <&gpio2 13 GPIO_ACTIVE_LOW>; + reset-delay-us = <25000>; + reset-post-delay-us = <10000>; + + ethphy0: ethernet-phy@0 { + reg = <0>; + }; + + ethphy1: ethernet-phy@1 { + reg = <1>; + }; + }; + }; From caa5b742ba3996286a1f7da5d5ab81299e70ef71 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:42 +0200 Subject: [PATCH 753/770] ARM: dts: nxp: mxs: Adjust the imx28.dtsi L2 switch description The current range of 'reg' property is too small to allow full control of the L2 switch on imx287. As this IP block also uses ENET-MAC blocks for its operation, the address range for it must be included as well. Moreover, some SoC common properties (like compatible, clocks, interrupts numbers) have been moved to this node. 
Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Reviewed-by: Stefan Wahren Signed-off-by: NipaLocal --- arch/arm/boot/dts/nxp/mxs/imx28.dtsi | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/nxp/mxs/imx28.dtsi b/arch/arm/boot/dts/nxp/mxs/imx28.dtsi index bbea8b77386f1..8aff2e87980ed 100644 --- a/arch/arm/boot/dts/nxp/mxs/imx28.dtsi +++ b/arch/arm/boot/dts/nxp/mxs/imx28.dtsi @@ -1321,8 +1321,13 @@ status = "disabled"; }; - eth_switch: switch@800f8000 { - reg = <0x800f8000 0x8000>; + eth_switch: switch@800f0000 { + compatible = "nxp,imx28-mtip-switch"; + reg = <0x800f0000 0x20000>; + interrupts = <100>, <101>, <102>; + interrupt-names = "enet_switch", "enet0", "enet1"; + clocks = <&clks 57>, <&clks 57>, <&clks 64>, <&clks 35>; + clock-names = "ipg", "ahb", "enet_out", "ptp"; status = "disabled"; }; }; From e1b20f50ce00b9dae83d37affe7758fa1127221d Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:43 +0200 Subject: [PATCH 754/770] ARM: dts: nxp: mxs: Adjust XEA board's DTS to support L2 switch The description is similar to the one used with the new CPSW driver. Signed-off-by: Lukasz Majewski Reviewed-by: Andrew Lunn Reviewed-by: Stefan Wahren Signed-off-by: NipaLocal --- arch/arm/boot/dts/nxp/mxs/imx28-xea.dts | 56 +++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/arm/boot/dts/nxp/mxs/imx28-xea.dts b/arch/arm/boot/dts/nxp/mxs/imx28-xea.dts index 6c5e6856648af..69032b29d7679 100644 --- a/arch/arm/boot/dts/nxp/mxs/imx28-xea.dts +++ b/arch/arm/boot/dts/nxp/mxs/imx28-xea.dts @@ -5,6 +5,7 @@ */ /dts-v1/; +#include #include "imx28-lwe.dtsi" / { @@ -90,6 +91,61 @@ gpio = <&gpio0 2 0>; }; +ð_switch { + pinctrl-names = "default"; + pinctrl-0 = <&mac0_pins_a>, <&mac1_pins_a>; + phy-supply = <®_fec_3v3>; + status = "okay"; + + ethernet-ports { + #address-cells = <1>; + #size-cells = <0>; + + mtip_port1: ethernet-port@1 { + reg = <1>; + label = "lan0"; + local-mac-address = [ 00 00 00 00 00 00 ]; + phy-mode = "rmii"; + phy-handle = <ðphy0>; + }; + + mtip_port2: ethernet-port@2 { + reg = <2>; + label = "lan1"; + local-mac-address = [ 00 00 00 00 00 00 ]; + phy-mode = "rmii"; + phy-handle = <ðphy1>; + }; + }; + + mdio_sw: mdio { + #address-cells = <1>; + #size-cells = <0>; + + reset-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>; + reset-delay-us = <25000>; + reset-post-delay-us = <10000>; + + ethphy0: ethernet-phy@0 { + reg = <0>; + smsc,disable-energy-detect; + /* + * Both PHYs (i.e. 0,1) have the same, single GPIO, + * line to handle both, their interrupts (OR'ed) + */ + interrupt-parent = <&gpio4>; + interrupts = <13 IRQ_TYPE_LEVEL_LOW>; + }; + + ethphy1: ethernet-phy@1 { + reg = <1>; + smsc,disable-energy-detect; + interrupt-parent = <&gpio4>; + interrupts = <13 IRQ_TYPE_LEVEL_LOW>; + }; + }; +}; + &spi2_pins_a { fsl,pinmux-ids = < MX28_PAD_SSP2_SCK__SSP2_SCK From 38a54509d01c15ff7ae8aeabf70d9e2186a18f32 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:44 +0200 Subject: [PATCH 755/770] net: mtip: The L2 switch driver for imx287 This patch series provides support for More Than IP L2 switch embedded in the imx287 SoC. This is a two port switch (placed between uDMA[01] and MAC-NET[01]), which can be used for offloading the network traffic. It can be used interchangeably with current FEC driver - to be more specific: one can use either of it, depending on the requirements. The biggest difference is the usage of DMA - when FEC is used, separate DMAs are available for each ENET-MAC block. 
However, with the switch enabled, only DMA0 is used to send/receive data to/from the switch (and the switch then forwards it to the respective ports). Signed-off-by: Lukasz Majewski Reviewed-by: Stefan Wahren Reviewed-by: Andrew Lunn Signed-off-by: NipaLocal --- MAINTAINERS | 7 + drivers/net/ethernet/freescale/Kconfig | 1 + drivers/net/ethernet/freescale/Makefile | 1 + drivers/net/ethernet/freescale/mtipsw/Kconfig | 13 + .../net/ethernet/freescale/mtipsw/Makefile | 4 + .../net/ethernet/freescale/mtipsw/mtipl2sw.c | 1978 +++++++++++++++++ .../net/ethernet/freescale/mtipsw/mtipl2sw.h | 771 +++++++ .../ethernet/freescale/mtipsw/mtipl2sw_br.c | 120 + .../ethernet/freescale/mtipsw/mtipl2sw_mgnt.c | 436 ++++ 9 files changed, 3331 insertions(+) create mode 100644 drivers/net/ethernet/freescale/mtipsw/Kconfig create mode 100644 drivers/net/ethernet/freescale/mtipsw/Makefile create mode 100644 drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c create mode 100644 drivers/net/ethernet/freescale/mtipsw/mtipl2sw.h create mode 100644 drivers/net/ethernet/freescale/mtipsw/mtipl2sw_br.c create mode 100644 drivers/net/ethernet/freescale/mtipsw/mtipl2sw_mgnt.c diff --git a/MAINTAINERS b/MAINTAINERS index deff813bf966b..3e075163913e5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9459,6 +9459,13 @@ S: Maintained F: Documentation/devicetree/bindings/i2c/i2c-mpc.yaml F: drivers/i2c/busses/i2c-mpc.c +FREESCALE MTIP ETHERNET SWITCH DRIVER +M: Lukasz Majewski +L: netdev@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/net/nxp,imx28-mtip-switch.yaml +F: drivers/net/ethernet/freescale/mtipsw/* + FREESCALE QORIQ DPAA ETHERNET DRIVER M: Madalin Bucur L: netdev@vger.kernel.org diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index a2d7300925a82..056a11c3a74ef 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -60,6 +60,7 @@ config FEC_MPC52xx_MDIO source "drivers/net/ethernet/freescale/fs_enet/Kconfig" source "drivers/net/ethernet/freescale/fman/Kconfig" +source "drivers/net/ethernet/freescale/mtipsw/Kconfig" config FSL_PQ_MDIO tristate "Freescale PQ MDIO" diff --git a/drivers/net/ethernet/freescale/Makefile b/drivers/net/ethernet/freescale/Makefile index de7b318422330..0e6cacb0948a1 100644 --- a/drivers/net/ethernet/freescale/Makefile +++ b/drivers/net/ethernet/freescale/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_FSL_DPAA_ETH) += dpaa/ obj-$(CONFIG_FSL_DPAA2_ETH) += dpaa2/ obj-y += enetc/ +obj-y += mtipsw/ diff --git a/drivers/net/ethernet/freescale/mtipsw/Kconfig b/drivers/net/ethernet/freescale/mtipsw/Kconfig new file mode 100644 index 0000000000000..a6fbdb59854fe --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +config FEC_MTIP_L2SW + tristate "MoreThanIP L2 switch support for the FEC driver" + depends on OF + depends on NET_SWITCHDEV + depends on BRIDGE + depends on SOC_IMX28 || COMPILE_TEST + help + This enables support for the MoreThanIP L2 switch on i.MX + SoCs (e.g. iMX287). It offloads bridging to this IP block's + hardware and allows switch management with standard Linux tools. + This switch driver can be used interchangeably with the already + available FEC driver, depending on the use case's requirements.
diff --git a/drivers/net/ethernet/freescale/mtipsw/Makefile b/drivers/net/ethernet/freescale/mtipsw/Makefile new file mode 100644 index 0000000000000..81e2b0e03e6c3 --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_FEC_MTIP_L2SW) += nxp-mtipl2sw.o +nxp-mtipl2sw-objs := mtipl2sw.o mtipl2sw_mgnt.o mtipl2sw_br.o diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c new file mode 100644 index 0000000000000..4e52c796fd374 --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c @@ -0,0 +1,1978 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * L2 switch Controller (Ethernet L2 switch) driver for MTIP block. + * + * Copyright (C) 2025 DENX Software Engineering GmbH + * Lukasz Majewski + * + * Based on a previous work by: + * + * Copyright 2010-2012 Freescale Semiconductor, Inc. + * Alison Wang (b18965@freescale.com) + * Jason Jin (Jason.jin@freescale.com) + * + * Copyright (C) 2010-2013 Freescale Semiconductor, Inc. All Rights Reserved. + * Shrek Wu (B16972@freescale.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mtipl2sw.h" + +static void swap_buffer(void *bufaddr, int len) +{ + int i; + unsigned int *buf = bufaddr; + + for (i = 0; i < len; i += 4, buf++) + swab32s(buf); +} + +struct mtip_devinfo { + u32 quirks; +}; + +static void mtip_enet_init(struct switch_enet_private *fep, int port) +{ + void __iomem *enet_addr = fep->enet_addr; + u32 mii_speed, holdtime, reg; + + if (port == 2) + enet_addr += MCF_ESW_ENET_PORT_OFFSET; + + reg = MCF_FEC_RCR_PROM | MCF_FEC_RCR_MII_MODE | + MCF_FEC_RCR_MAX_FL(1522); + + if (fep->phy_interface[port - 1] == PHY_INTERFACE_MODE_RMII) + reg |= MCF_FEC_RCR_RMII_MODE; + + writel(reg, enet_addr + MCF_FEC_RCR); + + writel(MCF_FEC_TCR_FDEN, enet_addr + MCF_FEC_TCR); + writel(MCF_FEC_ECR_ETHER_EN, enet_addr + MCF_FEC_ECR); + + mii_speed = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 5000000); + mii_speed--; + + holdtime = DIV_ROUND_UP(clk_get_rate(fep->clk_ipg), 100000000) - 1; + + fep->phy_speed = mii_speed << 1 | holdtime << 8; + + writel(fep->phy_speed, enet_addr + MCF_FEC_MSCR); +} + +static void mtip_setup_mac(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + unsigned char *iap, mac_addr[ETH_ALEN]; + + /* Use MAC address from DTS */ + iap = &fep->mac[priv->portnum - 1][0]; + + /* Use MAC address set by bootloader */ + if (!is_valid_ether_addr(iap)) { + *((__be32 *)&mac_addr[0]) = + cpu_to_be32(readl(fep->enet_addr + MCF_FEC_PALR)); + *((__be16 *)&mac_addr[4]) = + cpu_to_be16(readl(fep->enet_addr + + MCF_FEC_PAUR) >> 16); + iap = &mac_addr[0]; + } + + /* Use random MAC address */ + if (!is_valid_ether_addr(iap)) { + eth_hw_addr_random(dev); + dev_info(&fep->pdev->dev, "Using random MAC address: %pM\n", + dev->dev_addr); + iap = (unsigned char *)dev->dev_addr; + } + + /* Adjust MAC if using macaddr (and increment if needed) */ + eth_hw_addr_gen(dev, iap, priv->portnum - 1); +} + +/** + * crc8_calc - calculate CRC for MAC storage + * + * @pmacaddress: A 6-byte array with the MAC address. The first byte is + * the first byte transmitted. + * + * Calculate Galois Field Arithmetic CRC for Polynom x^8+x^2+x+1. 
+ * It omits the final shift in of 8 zeroes a "normal" CRC would do + * (getting the remainder). + * + * Examples (hexadecimal values):
+ * 10-11-12-13-14-15 => CRC=0xc2 + * 10-11-cc-dd-ee-00 => CRC=0xe6 + * + * Return: The 8-bit CRC in bits 7:0 + */ +static int crc8_calc(unsigned char *pmacaddress) +{ + int byt; /* byte index */ + int bit; /* bit index */ + int crc = 0x12; + int inval; + + for (byt = 0; byt < ETH_ALEN; byt++) { + inval = (((int)pmacaddress[byt]) & 0xFF); + /* shift bit 0 to bit 8 so all our bits + * travel through bit 8 + * (simplifies below calc) + */ + inval <<= 8; + + for (bit = 0; bit < 8; bit++) { + /* next input bit comes into d7 after shift */ + crc |= inval & 0x100; + if (crc & 0x01) + /* before shift */ + crc ^= 0x1C0; + + crc >>= 1; + inval >>= 1; + } + } + /* upper bits are clean as we shifted in zeroes! */ + return crc; +} + +static void mtip_read_atable(struct switch_enet_private *fep, int index, + u32 *read_lo, u32 *read_hi) +{ + struct addr_table64b_entry __iomem *atable_base = + fep->hwentry->mtip_table64b_entry; + + *read_lo = readl(&atable_base[index].lo); + *read_hi = readl(&atable_base[index].hi); +} + +static void mtip_write_atable(struct switch_enet_private *fep, int index, + u32 write_lo, u32 write_hi) +{ + struct addr_table64b_entry __iomem *atable_base = + fep->hwentry->mtip_table64b_entry; + + writel(write_lo, &atable_base[index].lo); + writel(write_hi, &atable_base[index].hi); +} + +/** + * mtip_portinfofifo_read - Read element from receive FIFO + * + * @fep: Structure describing switch + * + * Read one element from the HW receive FIFO (Queue) + * if available and return it. + * + * Return: mtip_port_info or NULL if no data is available. + */ +static +struct mtip_port_info *mtip_portinfofifo_read(struct switch_enet_private *fep) +{ + struct mtip_port_info *info = &fep->g_info; + u32 reg; + + reg = readl(fep->hwp + ESW_LSR); + if (reg == 0) { + dev_dbg(&fep->pdev->dev, "%s: ESW_LSR = 0x%x\n", __func__, reg); + return NULL; + } + + /* read word from FIFO */ + info->maclo = readl(fep->hwp + ESW_LREC0); + if (info->maclo == 0) { + dev_dbg(&fep->pdev->dev, "%s: mac lo 0x%x\n", __func__, + info->maclo); + return NULL; + } + + /* read 2nd word from FIFO */ + reg = readl(fep->hwp + ESW_LREC1); + info->machi = reg & 0xFFFF; + info->hash = (reg >> 16) & 0xFF; + info->port = (reg >> 24) & 0xF; + + return info; +} + +static void mtip_atable_get_entry_port_number(struct switch_enet_private *fep, + unsigned char *mac_addr, u8 *port) +{ + int block_index, block_index_end, entry; + u32 mac_addr_lo, mac_addr_hi; + u32 read_lo, read_hi; + + mac_addr_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] << 16) | + (mac_addr[1] << 8) | mac_addr[0]); + mac_addr_hi = (u32)((mac_addr[5] << 8) | (mac_addr[4])); + + block_index = GET_BLOCK_PTR(crc8_calc(mac_addr)); + block_index_end = block_index + ATABLE_ENTRY_PER_SLOT; + + /* now search all the entries in the selected block */ + for (entry = block_index; entry < block_index_end; entry++) { + mtip_read_atable(fep, entry, &read_lo, &read_hi); + *port = MTIP_PORT_FORWARDING_INIT; + + if (read_lo == mac_addr_lo && + ((read_hi & 0x0000FFFF) == + (mac_addr_hi & 0x0000FFFF))) { + /* found the correct address */ + if ((read_hi & (1 << 16)) && (!(read_hi & (1 << 17)))) + *port = AT_EXTRACT_PORT(read_hi); + break; + } + } + + dev_dbg(&fep->pdev->dev, "%s: MAC: %pM PORT: 0x%x\n", __func__, + mac_addr, *port); +} + +/* Clear complete MAC Look Up Table */ +void mtip_clear_atable(struct switch_enet_private *fep) +{ + int index; + + for (index = 0; index < MTIP_ATABLE_MEM_NUM_ENTRIES; index++) + mtip_write_atable(fep, index, 0, 0); +} + +/** + * mtip_update_atable_static - 
Update switch static address table + * + * @mac_addr: Pointer to the array containing MAC address to + * be put as static entry + * @port: Port bitmask numbers to be added in static entry, + * valid values are 1-7 + * @priority: The priority for the static entry in table + * + * @fep: Pointer to the structure describing the switch + * + * Updates MAC address lookup table with a static entry. + * + * Searches if the MAC address is already there in the block and replaces + * the older entry with the new one. If MAC address is not there then puts + * a new entry in the first empty slot available in the block. + * + * Return: 0 for a successful update else -ENOSPC when no slot available + */ +static int mtip_update_atable_static(unsigned char *mac_addr, unsigned int port, + unsigned int priority, + struct switch_enet_private *fep) +{ + unsigned long block_index, entry, index_end; + u32 write_lo, write_hi, read_lo, read_hi; + + write_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] << 16) | + (mac_addr[1] << 8) | mac_addr[0]); + write_hi = (u32)(0 | (port << AT_SENTRY_PORTMASK_shift) | + (priority << AT_SENTRY_PRIO_shift) | + (AT_ENTRY_TYPE_STATIC << AT_ENTRY_TYPE_shift) | + (AT_ENTRY_RECORD_VALID << AT_ENTRY_VALID_shift) | + (mac_addr[5] << 8) | (mac_addr[4])); + + block_index = GET_BLOCK_PTR(crc8_calc(mac_addr)); + index_end = block_index + ATABLE_ENTRY_PER_SLOT; + /* Now search all the entries in the selected block */ + for (entry = block_index; entry < index_end; entry++) { + mtip_read_atable(fep, entry, &read_lo, &read_hi); + /* MAC address matched, so update the + * existing entry + * even if its a dynamic one + */ + if (read_lo == write_lo && + ((read_hi & 0x0000FFFF) == + (write_hi & 0x0000FFFF))) { + mtip_write_atable(fep, entry, write_lo, write_hi); + return 0; + } else if (!(read_hi & (1 << 16))) { + /* Fill this empty slot (valid bit zero), + * assuming no holes in the block + */ + mtip_write_atable(fep, entry, write_lo, write_hi); + fep->at_curr_entries++; + return 0; + } + } + + /* No space available for this static entry */ + return -ENOSPC; +} + +static bool mtip_update_atable_dynamic1(u32 write_lo, u32 write_hi, + int block_index, unsigned int port, + unsigned int curr_time, + struct switch_enet_private *fep) +{ + unsigned long entry, index_end; + int time, timeold, indexold; + u32 read_lo, read_hi; + unsigned long conf; + + /* prepare update port and timestamp */ + conf = AT_ENTRY_RECORD_VALID << AT_ENTRY_VALID_shift; + conf |= AT_ENTRY_TYPE_DYNAMIC << AT_ENTRY_TYPE_shift; + conf |= curr_time << AT_DENTRY_TIME_shift; + conf |= port << AT_DENTRY_PORT_shift; + conf |= write_hi; + + /* linear search through all slot + * entries and update if found + */ + index_end = block_index + ATABLE_ENTRY_PER_SLOT; + /* Now search all the entries in the selected block */ + for (entry = block_index; entry < index_end; entry++) { + mtip_read_atable(fep, entry, &read_lo, &read_hi); + if (read_lo == write_lo && + ((read_hi & 0x0000FFFF) == + (write_hi & 0x0000FFFF))) { + /* found correct address, + * update timestamp. + */ + mtip_write_atable(fep, entry, write_lo, conf); + + return false; + } else if (!(read_hi & (1 << 16))) { + /* slot is empty, then use it + * for new entry + * Note: There are no holes, + * therefore cannot be any + * more that need to be compared. + */ + mtip_write_atable(fep, entry, write_lo, conf); + /* statistics (we do it between writing + * .hi an .lo due to + * hardware limitation... 
+ */ + fep->at_curr_entries++; + /* newly inserted */ + + return true; + } + } + + /* No more entry available in block overwrite oldest */ + timeold = 0; + indexold = 0; + for (entry = block_index; entry < index_end; entry++) { + mtip_read_atable(fep, entry, &read_lo, &read_hi); + time = AT_EXTRACT_TIMESTAMP(read_hi); + dev_dbg(&fep->pdev->dev, "%s : time %x currtime %x\n", + __func__, time, curr_time); + time = TIMEDELTA(curr_time, time); + if (time > timeold) { + /* is it older ? */ + timeold = time; + indexold = entry; + } + } + + mtip_write_atable(fep, indexold, write_lo, conf); + + /* Statistics (do it inbetween writing to .lo and .hi */ + fep->at_block_overflows++; + dev_err(&fep->pdev->dev, "%s update time, at_block_overflows %x\n", + __func__, fep->at_block_overflows); + /* newly inserted */ + return true; +} + +/* dynamicms MAC address table learn and migration */ +static void +mtip_atable_dynamicms_learn_migration(struct switch_enet_private *fep, + int curr_time, unsigned char *mac, + u8 *rx_port) +{ + u8 port = MTIP_PORT_FORWARDING_INIT; + struct mtip_port_info *port_info; + u32 rx_mac_lo, rx_mac_hi; + unsigned long flags; + int index; + + spin_lock_irqsave(&fep->learn_lock, flags); + + if (mac && is_valid_ether_addr(mac)) { + rx_mac_lo = (u32)((mac[3] << 24) | (mac[2] << 16) | + (mac[1] << 8) | mac[0]); + rx_mac_hi = (u32)((mac[5] << 8) | (mac[4])); + } + + port_info = mtip_portinfofifo_read(fep); + while (port_info) { + /* get block index from lookup table */ + index = GET_BLOCK_PTR(port_info->hash); + mtip_update_atable_dynamic1(port_info->maclo, port_info->machi, + index, port_info->port, + curr_time, fep); + + if (mac && is_valid_ether_addr(mac) && + port == MTIP_PORT_FORWARDING_INIT) { + if (rx_mac_lo == port_info->maclo && + rx_mac_hi == port_info->machi) { + /* The newly learned MAC is the source of + * our filtered frame. 
+ */ + port = (u8)port_info->port; + } + } + port_info = mtip_portinfofifo_read(fep); + } + + if (rx_port) + *rx_port = port; + + spin_unlock_irqrestore(&fep->learn_lock, flags); +} + +static void mtip_aging_timer(struct timer_list *t) +{ + struct switch_enet_private *fep = from_timer(fep, t, timer_aging); + + fep->curr_time = mtip_timeincrement(fep->curr_time); + + mod_timer(&fep->timer_aging, + jiffies + msecs_to_jiffies(LEARNING_AGING_INTERVAL)); +} + +static void esw_mac_addr_static(struct switch_enet_private *fep) +{ + int i; + + for (i = 0; i < SWITCH_EPORT_NUMBER; i++) + mtip_update_atable_static((unsigned char *) + fep->ndev[i]->dev_addr, 7, 7, fep); +} + +static void mtip_config_switch(struct switch_enet_private *fep) +{ + esw_mac_addr_static(fep); + + writel(0, fep->hwp + ESW_BKLR); + + /* Do NOT disable learning */ + mtip_port_learning_config(fep, 0, 0, 0); + mtip_port_learning_config(fep, 1, 0, 0); + mtip_port_learning_config(fep, 2, 0, 0); + + /* Disable blocking */ + mtip_port_blocking_config(fep, 0, 0); + mtip_port_blocking_config(fep, 1, 0); + mtip_port_blocking_config(fep, 2, 0); + + writel(MCF_ESW_IMR_TXF | MCF_ESW_IMR_RXF, + fep->hwp + ESW_IMR); + + mtip_port_enable_config(fep, 0, 1, 1); + mtip_port_enable_config(fep, 1, 1, 1); + mtip_port_enable_config(fep, 2, 1, 1); + + mtip_port_broadcast_config(fep, 0, 1); + mtip_port_broadcast_config(fep, 1, 1); + mtip_port_broadcast_config(fep, 2, 1); + + /* Disable multicast receive on port 0 (MGNT) */ + mtip_port_multicast_config(fep, 0, 0); + mtip_port_multicast_config(fep, 1, 1); + mtip_port_multicast_config(fep, 2, 1); + + /* Setup VLANs to provide port separation */ + if (!fep->br_offload) + mtip_switch_en_port_separation(fep); +} + +static netdev_tx_t mtip_start_xmit_port(struct sk_buff *skb, + struct net_device *dev, int port) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + unsigned short status; + unsigned long flags; + struct cbd_t *bdp; + void *bufaddr; + + spin_lock_irqsave(&fep->hw_lock, flags); + + if (!fep->link[0] && !fep->link[1]) { + /* Link is down or autonegotiation is in progress. */ + netif_stop_queue(dev); + spin_unlock_irqrestore(&fep->hw_lock, flags); + return NETDEV_TX_BUSY; + } + + /* Fill in a Tx ring entry */ + bdp = fep->cur_tx; + + status = bdp->cbd_sc; + + if (status & BD_ENET_TX_READY) { + /* All transmit buffers are full. Bail out. + * This should not happen, since dev->tbusy should be set. + */ + dev_err(&fep->pdev->dev, "%s: tx queue full!.\n", dev->name); + spin_unlock_irqrestore(&fep->hw_lock, flags); + return NETDEV_TX_BUSY; + } + + /* Clear all of the status flags */ + status &= ~BD_ENET_TX_STATS; + + /* Set buffer length and buffer pointer */ + bufaddr = skb->data; + bdp->cbd_datlen = skb->len; + + /* On some FEC implementations data must be aligned on + * 4-byte boundaries. Use bounce buffers to copy data + * and get it aligned. + */ + if ((unsigned long)bufaddr & MTIP_ALIGNMENT) { + unsigned int index; + + index = bdp - fep->tx_bd_base; + memcpy(fep->tx_bounce[index], + (void *)skb->data, skb->len); + bufaddr = fep->tx_bounce[index]; + } + + if (fep->quirks & FEC_QUIRK_SWAP_FRAME) + swap_buffer(bufaddr, skb->len); + + /* Save skb pointer. */ + fep->tx_skbuff[fep->skb_cur] = skb; + + dev->stats.tx_bytes += skb->len; + fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK; + + /* Push the data cache so the CPM does not get stale memory + * data. 
+ */ + bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr, + MTIP_SWITCH_TX_FRSIZE, + DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) { + dev_err(&fep->pdev->dev, + "Failed to map descriptor tx buffer\n"); + dev->stats.tx_errors++; + dev->stats.tx_dropped++; + dev_kfree_skb_any(skb); + goto err; + } + + /* Send it on its way. Tell FEC it's ready, interrupt when done, + * it's the last BD of the frame, and to put the CRC on the end. + */ + + status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR + | BD_ENET_TX_LAST | BD_ENET_TX_TC); + bdp->cbd_sc = status; + + netif_trans_update(dev); + skb_tx_timestamp(skb); + + /* For port separation - force sending via specified port */ + if (!fep->br_offload && port != 0) + mtip_forced_forward(fep, port, 1); + + /* Trigger transmission start */ + writel(MCF_ESW_TDAR_X_DES_ACTIVE, fep->hwp + ESW_TDAR); + + /* If this was the last BD in the ring, + * start at the beginning again. + */ + if (status & BD_ENET_TX_WRAP) + bdp = fep->tx_bd_base; + else + bdp++; + + if (bdp == fep->dirty_tx) { + fep->tx_full = 1; + netif_stop_queue(dev); + } + + fep->cur_tx = bdp; + err: + spin_unlock_irqrestore(&fep->hw_lock, flags); + + return NETDEV_TX_OK; +} + +static netdev_tx_t mtip_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + + return mtip_start_xmit_port(skb, dev, priv->portnum); +} + +static void mtip_configure_enet_mii(struct switch_enet_private *fep, int port) +{ + struct phy_device *phydev = fep->phy_dev[port - 1]; + struct net_device *dev = fep->ndev[port - 1]; + void __iomem *enet_addr = fep->enet_addr; + int duplex = fep->full_duplex[port - 1]; + u32 rcr; + + if (port == 2) + enet_addr += MCF_ESW_ENET_PORT_OFFSET; + + /* ECR */ + writel(MCF_FEC_ECR_MAGIC_ENA, enet_addr + MCF_FEC_ECR); + + /* EMRBR */ + writel(PKT_MAXBLR_SIZE, enet_addr + MCF_FEC_EMRBR); + + /* set the receive and transmit BDs ring base to + * hardware registers(ERDSR & ETDSR) + */ + writel(fep->bd_dma, enet_addr + MCF_FEC_ERDSR); + writel((unsigned long)fep->bd_dma + sizeof(struct cbd_t) * RX_RING_SIZE, + enet_addr + MCF_FEC_ETDSR); + + writel(fep->phy_speed, enet_addr + MCF_FEC_MSCR); + + /* EIR */ + writel(0, enet_addr + MCF_FEC_EIR); + + /* IAUR */ + writel(0, enet_addr + MCF_FEC_IAUR); + + /* IALR */ + writel(0, enet_addr + MCF_FEC_IALR); + + /* GAUR */ + writel(0, enet_addr + MCF_FEC_GAUR); + + /* GALR */ + writel(0, enet_addr + MCF_FEC_GALR); + + /* EMRBR */ + writel(PKT_MAXBLR_SIZE, enet_addr + MCF_FEC_EMRBR); + + /* EIMR */ + writel(0, enet_addr + MCF_FEC_EIMR); + + /* PALR PAUR */ + /* Set the station address for the ENET Adapter */ + writel(dev->dev_addr[3] | + dev->dev_addr[2] << 8 | + dev->dev_addr[1] << 16 | + dev->dev_addr[0] << 24, enet_addr + MCF_FEC_PALR); + writel(dev->dev_addr[5] << 16 | + (dev->dev_addr[4] + (unsigned char)(0)) << 24, + enet_addr + MCF_FEC_PAUR); + + /* RCR */ + rcr = readl(enet_addr + MCF_FEC_RCR); + if (phydev && phydev->speed == SPEED_100) + rcr &= ~MCF_FEC_RCR_RMII_10BASET; + else + rcr |= MCF_FEC_RCR_RMII_10BASET; + + if (duplex == DUPLEX_FULL) + rcr &= ~MCF_FEC_RCR_DRT; + else + rcr |= MCF_FEC_RCR_DRT; + + writel(rcr, enet_addr + MCF_FEC_RCR); + + /* TCR */ + if (duplex == DUPLEX_FULL) + writel(0x1C, enet_addr + MCF_FEC_TCR); + else + writel(0x18, enet_addr + MCF_FEC_TCR); + + /* ECR */ + writel(readl(enet_addr + MCF_FEC_ECR) | MCF_FEC_ECR_ETHER_EN, + enet_addr + MCF_FEC_ECR); +} + +/* This function is called to start or restart the FEC during a link 
* change. This only happens when switching between half and full
+ * duplex.
+ */
+static void mtip_switch_restart(struct net_device *dev, int duplex0,
+				int duplex1)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	int i;
+
+	/* Perform a reset and wait for it to complete. */
+	writel(MCF_ESW_MODE_SW_RST, fep->hwp + ESW_MODE);
+
+	/* The documentation specifies a delay of 10us for the switch
+	 * to perform its internal SW reset.
+	 */
+	udelay(10);
+	writel(MCF_ESW_MODE_STATRST, fep->hwp + ESW_MODE);
+	writel(MCF_ESW_MODE_SW_EN, fep->hwp + ESW_MODE);
+
+	/* Management port configuration:
+	 * make port 0 the management port
+	 */
+	writel(0, fep->hwp + ESW_BMPC);
+
+	/* Clear any outstanding interrupt */
+	writel(0xFFFFFFFF, fep->hwp + ESW_ISR);
+
+	/* Set backpressure threshold to minimize frames discarded
+	 * due to congestion.
+	 */
+	writel(P0BC_THRESHOLD, fep->hwp + ESW_P0BCT);
+
+	/* Set maximum receive buffer size */
+	writel(PKT_MAXBLR_SIZE, fep->hwp + ESW_MRBR);
+
+	/* Set receive and transmit descriptor base */
+	writel(fep->bd_dma, fep->hwp + ESW_RDSR);
+	writel((unsigned long)fep->bd_dma +
+	       sizeof(struct cbd_t) * RX_RING_SIZE,
+	       fep->hwp + ESW_TDSR);
+
+	fep->cur_tx = fep->tx_bd_base;
+	fep->cur_rx = fep->rx_bd_base;
+	fep->dirty_tx = fep->cur_tx;
+
+	/* Reset SKB transmit buffers */
+	fep->skb_cur = 0;
+	fep->skb_dirty = 0;
+	for (i = 0; i <= TX_RING_MOD_MASK; i++) {
+		if (fep->tx_skbuff[i]) {
+			dev_kfree_skb_any(fep->tx_skbuff[i]);
+			fep->tx_skbuff[i] = NULL;
+		}
+	}
+
+	fep->full_duplex[0] = duplex0;
+	fep->full_duplex[1] = duplex1;
+
+	mtip_configure_enet_mii(fep, 1);
+	mtip_configure_enet_mii(fep, 2);
+	mtip_clear_atable(fep);
+
+	/* And last, enable the transmit and receive processing */
+	writel(MCF_ESW_RDAR_R_DES_ACTIVE, fep->hwp + ESW_RDAR);
+
+	/* Enable interrupts we wish to service */
+	writel(0xFFFFFFFF, fep->hwp + ESW_ISR);
+	writel(MCF_ESW_IMR_TXF | MCF_ESW_IMR_RXF,
+	       fep->hwp + ESW_IMR);
+
+	mtip_config_switch(fep);
+}
+
+static void mtip_timeout(struct net_device *dev, unsigned int txqueue)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	struct cbd_t *bdp;
+	int i;
+
+	dev->stats.tx_errors++;
+
+	if (IS_ENABLED(CONFIG_SWITCH_DEBUG)) {
+		dev_info(&dev->dev, "%s: transmit timed out.\n", dev->name);
+		dev_info(&dev->dev,
+			 "Ring data: cur_tx %lx%s, dirty_tx %lx cur_rx: %lx\n",
+			 (unsigned long)fep->cur_tx,
+			 fep->tx_full ?
" (full)" : "", + (unsigned long)fep->dirty_tx, + (unsigned long)fep->cur_rx); + + bdp = fep->tx_bd_base; + dev_info(&dev->dev, " tx: %u buffers\n", TX_RING_SIZE); + for (i = 0; i < TX_RING_SIZE; i++) { + dev_info(&dev->dev, " %08lx: %04x %04x %08x\n", + (kernel_ulong_t)bdp, bdp->cbd_sc, + bdp->cbd_datlen, (int)bdp->cbd_bufaddr); + bdp++; + } + + bdp = fep->rx_bd_base; + dev_info(&dev->dev, " rx: %lu buffers\n", + (unsigned long)RX_RING_SIZE); + for (i = 0 ; i < RX_RING_SIZE; i++) { + dev_info(&dev->dev, " %08lx: %04x %04x %08x\n", + (kernel_ulong_t)bdp, + bdp->cbd_sc, bdp->cbd_datlen, + (int)bdp->cbd_bufaddr); + bdp++; + } + } + + rtnl_lock(); + if (netif_device_present(dev) || netif_running(dev)) { + napi_disable(&fep->napi); + netif_tx_lock_bh(dev); + mtip_switch_restart(dev, fep->full_duplex[0], + fep->full_duplex[1]); + netif_tx_wake_all_queues(dev); + netif_tx_unlock_bh(dev); + napi_enable(&fep->napi); + } + rtnl_unlock(); +} + +static irqreturn_t mtip_interrupt(int irq, void *ptr_fep) +{ + struct switch_enet_private *fep = ptr_fep; + irqreturn_t ret = IRQ_NONE; + u32 int_events, int_imask; + + /* Get the interrupt events that caused us to be here */ + int_events = readl(fep->hwp + ESW_ISR); + writel(int_events, fep->hwp + ESW_ISR); + + if (int_events & (MCF_ESW_ISR_RXF | MCF_ESW_ISR_TXF)) { + ret = IRQ_HANDLED; + /* Disable the RX interrupt */ + if (napi_schedule_prep(&fep->napi)) { + int_imask = readl(fep->hwp + ESW_IMR); + int_imask &= ~MCF_ESW_IMR_RXF; + writel(int_imask, fep->hwp + ESW_IMR); + __napi_schedule(&fep->napi); + } + } + + return ret; +} + +static void mtip_switch_tx(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + unsigned short status; + struct sk_buff *skb; + unsigned long flags; + struct cbd_t *bdp; + + spin_lock_irqsave(&fep->hw_lock, flags); + bdp = fep->dirty_tx; + + while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) { + if (bdp == fep->cur_tx && fep->tx_full == 0) + break; + + dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr, + MTIP_SWITCH_TX_FRSIZE, DMA_TO_DEVICE); + bdp->cbd_bufaddr = 0; + skb = fep->tx_skbuff[fep->skb_dirty]; + /* Check for errors */ + if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC | + BD_ENET_TX_RL | BD_ENET_TX_UN | + BD_ENET_TX_CSL)) { + dev->stats.tx_errors++; + if (status & BD_ENET_TX_HB) /* No heartbeat */ + dev->stats.tx_heartbeat_errors++; + if (status & BD_ENET_TX_LC) /* Late collision */ + dev->stats.tx_window_errors++; + if (status & BD_ENET_TX_RL) /* Retrans limit */ + dev->stats.tx_aborted_errors++; + if (status & BD_ENET_TX_UN) /* Underrun */ + dev->stats.tx_fifo_errors++; + if (status & BD_ENET_TX_CSL) /* Carrier lost */ + dev->stats.tx_carrier_errors++; + } else { + dev->stats.tx_packets++; + } + + if (status & BD_ENET_TX_READY) + dev_err(&fep->pdev->dev, + "Enet xmit interrupt and TX_READY.\n"); + + /* Deferred means some collisions occurred during transmit, + * but we eventually sent the packet OK. + */ + if (status & BD_ENET_TX_DEF) + dev->stats.collisions++; + + /* Free the sk buffer associated with this last transmit */ + dev_consume_skb_irq(skb); + fep->tx_skbuff[fep->skb_dirty] = NULL; + fep->skb_dirty = (fep->skb_dirty + 1) & TX_RING_MOD_MASK; + + /* Update pointer to next buffer descriptor to be transmitted */ + if (status & BD_ENET_TX_WRAP) + bdp = fep->tx_bd_base; + else + bdp++; + + /* Since we have freed up a buffer, the ring is no longer + * full. 
+ */ + if (fep->tx_full) { + fep->tx_full = 0; + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + } + } + fep->dirty_tx = bdp; + spin_unlock_irqrestore(&fep->hw_lock, flags); +} + +/* During a receive, the cur_rx points to the current incoming buffer. + * When we update through the ring, if the next incoming buffer has + * not been given to the system, we just set the empty indicator, + * effectively tossing the packet. + */ +static int mtip_switch_rx(struct net_device *dev, int budget, int *port) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + u8 *data, rx_port = MTIP_PORT_FORWARDING_INIT; + struct switch_enet_private *fep = priv->fep; + unsigned short status, pkt_len; + struct net_device *pndev; + struct ethhdr *eth_hdr; + int pkt_received = 0; + struct sk_buff *skb; + unsigned long flags; + struct cbd_t *bdp; + + spin_lock_irqsave(&fep->hw_lock, flags); + + /* First, grab all of the stats for the incoming packet. + * These get messed up if we get called due to a busy condition. + */ + bdp = fep->cur_rx; + + while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) { + if (pkt_received >= budget) + break; + + pkt_received++; + /* Since we have allocated space to hold a complete frame, + * the last indicator should be set. + */ + if ((status & BD_ENET_RX_LAST) == 0) + dev_warn_ratelimited(&dev->dev, + "SWITCH ENET: rcv is not +last\n"); + + if (!fep->usage_count) + goto rx_processing_done; + + /* Check for errors. */ + if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO | + BD_ENET_RX_CR | BD_ENET_RX_OV)) { + dev->stats.rx_errors++; + if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH)) { + /* Frame too long or too short. */ + dev->stats.rx_length_errors++; + } + if (status & BD_ENET_RX_NO) /* Frame alignment */ + dev->stats.rx_frame_errors++; + if (status & BD_ENET_RX_CR) /* CRC Error */ + dev->stats.rx_crc_errors++; + if (status & BD_ENET_RX_OV) /* FIFO overrun */ + dev->stats.rx_fifo_errors++; + } + + /* Report late collisions as a frame error. + * On this error, the BD is closed, but we don't know what we + * have in the buffer. So, just drop this frame on the floor. + */ + if (status & BD_ENET_RX_CL) { + dev->stats.rx_errors++; + dev->stats.rx_frame_errors++; + goto rx_processing_done; + } + + /* Process the incoming frame */ + pkt_len = bdp->cbd_datlen; + data = (__u8 *)__va(bdp->cbd_bufaddr); + + dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr, + bdp->cbd_datlen, DMA_FROM_DEVICE); + + if (fep->quirks & FEC_QUIRK_SWAP_FRAME) + swap_buffer(data, pkt_len); + + if (data) { + eth_hdr = (struct ethhdr *)data; + mtip_atable_get_entry_port_number(fep, + eth_hdr->h_source, + &rx_port); + if (rx_port == MTIP_PORT_FORWARDING_INIT) + mtip_atable_dynamicms_learn_migration(fep, + fep->curr_time, + eth_hdr->h_source, + &rx_port); + } + + if (!fep->br_offload && (rx_port == 1 || rx_port == 2)) + pndev = fep->ndev[rx_port - 1]; + else + pndev = dev; + + *port = rx_port; + pndev->stats.rx_packets++; + pndev->stats.rx_bytes += pkt_len; + + /* This does 16 byte alignment, exactly what we need. + * The packet length includes FCS, but we don't want to + * include that when passing upstream as it messes up + * bridging applications. 
+ */ + skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN); + if (unlikely(!skb)) { + dev_dbg(&fep->pdev->dev, + "%s: Memory squeeze, dropping packet.\n", + pndev->name); + pndev->stats.rx_dropped++; + goto err_mem; + } else { + skb_reserve(skb, NET_IP_ALIGN); + skb_put(skb, pkt_len); /* Make room */ + skb_copy_to_linear_data(skb, data, pkt_len); + skb->protocol = eth_type_trans(skb, pndev); + napi_gro_receive(&fep->napi, skb); + } + + bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, data, + bdp->cbd_datlen, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(&fep->pdev->dev, + bdp->cbd_bufaddr))) { + dev_err(&fep->pdev->dev, + "Failed to map descriptor rx buffer\n"); + pndev->stats.rx_errors++; + pndev->stats.rx_dropped++; + dev_kfree_skb_any(skb); + goto err_mem; + } + + rx_processing_done: + /* Clear the status flags for this buffer */ + status &= ~BD_ENET_RX_STATS; + + /* Mark the buffer empty */ + status |= BD_ENET_RX_EMPTY; + bdp->cbd_sc = status; + + /* Update BD pointer to next entry */ + if (status & BD_ENET_RX_WRAP) + bdp = fep->rx_bd_base; + else + bdp++; + + /* Doing this here will keep the FEC running while we process + * incoming frames. On a heavily loaded network, we should be + * able to keep up at the expense of system resources. + */ + writel(MCF_ESW_RDAR_R_DES_ACTIVE, fep->hwp + ESW_RDAR); + } /* while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) */ + + fep->cur_rx = bdp; + spin_unlock_irqrestore(&fep->hw_lock, flags); + + return pkt_received; + + err_mem: + spin_unlock_irqrestore(&fep->hw_lock, flags); + return -ENOMEM; +} + +static void mtip_adjust_link(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + struct phy_device *phy_dev; + int status_change = 0, idx; + unsigned long flags; + + spin_lock_irqsave(&fep->hw_lock, flags); + + idx = priv->portnum - 1; + phy_dev = fep->phy_dev[idx]; + + /* Duplex link change */ + if (phy_dev->link && fep->full_duplex[idx] != phy_dev->duplex) { + if (idx == 0) + mtip_switch_restart(dev, phy_dev->duplex, + fep->full_duplex[!idx]); + else + mtip_switch_restart(dev, fep->full_duplex[!idx], + phy_dev->duplex); + status_change = 1; + } + + /* Link on or off change */ + if (phy_dev->link != fep->link[idx]) { + fep->link[idx] = phy_dev->link; + if (phy_dev->link) { + if (idx == 0) + mtip_switch_restart(dev, phy_dev->duplex, + fep->full_duplex[!idx]); + else + mtip_switch_restart(dev, fep->full_duplex[!idx], + phy_dev->duplex); + /* if link becomes up and tx be stopped, start it */ + if (netif_queue_stopped(dev)) { + netif_start_queue(dev); + netif_wake_queue(dev); + } + } + status_change = 1; + } + + spin_unlock_irqrestore(&fep->hw_lock, flags); + + if (status_change) + phy_print_status(phy_dev); +} + +static int mtip_mdio_wait(struct switch_enet_private *fep) +{ + uint ievent = 0; + int ret; + + ret = readl_poll_timeout_atomic(fep->enet_addr + MCF_FEC_EIR, ievent, + ievent & MCF_ENET_MII, 2, 30000); + if (!ret) + writel(MCF_ENET_MII, fep->enet_addr + MCF_FEC_EIR); + + return ret; +} + +static int mtip_mdio_read(struct mii_bus *bus, int mii_id, int regnum) +{ + struct switch_enet_private *fep = bus->priv; + int ret; + + /* start a read op */ + writel(FEC_MMFR_ST | FEC_MMFR_OP_READ | + FEC_MMFR_PA(mii_id) | FEC_MMFR_RA(regnum) | + FEC_MMFR_TA, fep->enet_addr + MCF_FEC_MII_DATA); + + /* wait for end of transfer */ + ret = mtip_mdio_wait(fep); + if (ret) { + dev_err(&fep->pdev->dev, "MTIP: MDIO (%s:%d) read timeout\n", + bus->id, mii_id); + return ret; + } + + 
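+	/* The read data comes back in the low 16 bits of MMFR
+	 * (ST|OP|PA|RA|TA|DATA management frame); FEC_MMFR_DATA()
+	 * masks out everything above the DATA field.
+	 */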
/* return value */
+	return FEC_MMFR_DATA(readl(fep->enet_addr + MCF_FEC_MII_DATA));
+}
+
+static int mtip_mdio_write(struct mii_bus *bus, int mii_id, int regnum,
+			   u16 value)
+{
+	struct switch_enet_private *fep = bus->priv;
+	int ret;
+
+	/* start a write op */
+	writel(FEC_MMFR_ST | FEC_MMFR_OP_WRITE |
+	       FEC_MMFR_PA(mii_id) | FEC_MMFR_RA(regnum) |
+	       FEC_MMFR_TA | FEC_MMFR_DATA(value),
+	       fep->enet_addr + MCF_FEC_MII_DATA);
+
+	/* wait for end of transfer */
+	ret = mtip_mdio_wait(fep);
+	if (ret)
+		dev_err(&fep->pdev->dev, "MTIP: MDIO (%s:%d) write timeout\n",
+			bus->id, mii_id);
+
+	return ret;
+}
+
+static int mtip_mii_probe(struct net_device *dev)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	struct phy_device *phy_dev;
+	int port_idx = priv->portnum - 1;
+
+	/* Without a PHY node there is nothing to connect to; bail out
+	 * instead of dereferencing a NULL phy_dev below.
+	 */
+	if (!fep->phy_np[port_idx])
+		return -ENODEV;
+
+	phy_dev = of_phy_connect(dev, fep->phy_np[port_idx],
+				 &mtip_adjust_link, 0,
+				 fep->phy_interface[port_idx]);
+	if (!phy_dev) {
+		netdev_err(dev, "Unable to connect to phy\n");
+		return -ENODEV;
+	}
+
+	phy_set_max_speed(phy_dev, 100);
+	fep->phy_dev[port_idx] = phy_dev;
+	fep->link[port_idx] = 0;
+	fep->full_duplex[port_idx] = 0;
+
+	dev_dbg(&dev->dev,
+		"MTIP PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
+		fep->phy_dev[port_idx]->drv->name,
+		phydev_name(fep->phy_dev[port_idx]),
+		fep->phy_dev[port_idx]->irq);
+
+	return 0;
+}
+
+static int mtip_mdiobus_reset(struct mii_bus *bus)
+{
+	if (!bus->reset_gpiod) {
+		dev_err(&bus->dev, "Reset GPIO pin not provided!\n");
+		return -EINVAL;
+	}
+
+	gpiod_set_value_cansleep(bus->reset_gpiod, 0);
+
+	/* Extra time to allow:
+	 * 1. the GPIO RESET pin to go high, so it does not read "LOW"
+	 *    while it is not yet configured;
+	 * 2. the ENET CLK to stabilize before GPIO RESET is asserted.
+	 */
+	usleep_range(200, 300);
+
+	gpiod_set_value_cansleep(bus->reset_gpiod, 1);
+	usleep_range(bus->reset_delay_us, bus->reset_delay_us + 1000);
+	gpiod_set_value_cansleep(bus->reset_gpiod, 0);
+
+	if (bus->reset_post_delay_us > 0)
+		usleep_range(bus->reset_post_delay_us,
+			     bus->reset_post_delay_us + 1000);
+
+	return 0;
+}
+
+static int mtip_mii_init(struct switch_enet_private *fep,
+			 struct platform_device *pdev)
+{
+	struct device_node *node;
+	int err = -ENXIO;
+
+	/* Clear MMFR to avoid generating an MII event when writing MSCR.
+ * MII event generation condition: + * - writing MSCR: + * - mmfr[31:0]_not_zero & mscr[7:0]_is_zero & + * mscr_reg_data_in[7:0] != 0 + * - writing MMFR: + * - mscr[7:0]_not_zero + */ + writel(0, fep->hwp + MCF_FEC_MII_DATA); + /* Clear any pending transaction complete indication */ + writel(MCF_ENET_MII, fep->enet_addr + MCF_FEC_EIR); + + fep->mii_bus = mdiobus_alloc(); + if (!fep->mii_bus) { + err = -ENOMEM; + goto err_out; + } + + fep->mii_bus->name = "mtip_mii_bus"; + fep->mii_bus->read = mtip_mdio_read; + fep->mii_bus->write = mtip_mdio_write; + fep->mii_bus->reset = mtip_mdiobus_reset; + snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%x", 0); + fep->mii_bus->priv = fep; + fep->mii_bus->parent = &pdev->dev; + + node = of_get_child_by_name(pdev->dev.of_node, "mdio"); + if (node) + dev_err(&fep->pdev->dev, "%s: PHY name: %s\n", + __func__, node->name); + + err = of_mdiobus_register(fep->mii_bus, node); + if (node) + of_node_put(node); + if (err) + goto err_out_free_mdiobus; + + return 0; + +err_out_free_mdiobus: + mdiobus_free(fep->mii_bus); +err_out: + return err; +} + +static void mtip_mii_remove(struct switch_enet_private *fep) +{ + int i; + + for (i = 0; i < SWITCH_EPORT_NUMBER; i++) { + if (fep->phy_np[i]) + of_node_put(fep->phy_np[i]); + + if (fep->phy_dev[i]) + phy_disconnect(fep->phy_dev[i]); + } + + mdiobus_unregister(fep->mii_bus); + mdiobus_free(fep->mii_bus); +} + +static void mtip_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + + strscpy(info->driver, fep->pdev->dev.driver->name, + sizeof(info->driver)); + strscpy(info->bus_info, dev_name(&dev->dev), + sizeof(info->bus_info)); +} + +static void mtip_free_buffers(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + struct sk_buff *skb; + struct cbd_t *bdp; + int i; + + bdp = fep->rx_bd_base; + for (i = 0; i < RX_RING_SIZE; i++) { + skb = fep->rx_skbuff[i]; + + if (bdp->cbd_bufaddr) + dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr, + MTIP_SWITCH_RX_FRSIZE, + DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb(skb); + bdp++; + } + + bdp = fep->tx_bd_base; + for (i = 0; i < TX_RING_SIZE; i++) + kfree(fep->tx_bounce[i]); +} + +static int mtip_alloc_buffers(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + struct sk_buff *skb; + struct cbd_t *bdp; + int i; + + bdp = fep->rx_bd_base; + for (i = 0; i < RX_RING_SIZE; i++) { + skb = netdev_alloc_skb(dev, MTIP_SWITCH_RX_FRSIZE); + if (!skb) + goto err; + + fep->rx_skbuff[i] = skb; + + bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, skb->data, + MTIP_SWITCH_RX_FRSIZE, + DMA_FROM_DEVICE); + if (unlikely(dma_mapping_error(&fep->pdev->dev, + bdp->cbd_bufaddr))) { + dev_err(&fep->pdev->dev, + "Failed to map descriptor rx buffer\n"); + dev_kfree_skb_any(skb); + goto err; + } + + bdp->cbd_sc = BD_ENET_RX_EMPTY; + bdp++; + } + + /* Set the last buffer to wrap. */ + bdp--; + bdp->cbd_sc |= BD_SC_WRAP; + + bdp = fep->tx_bd_base; + for (i = 0; i < TX_RING_SIZE; i++) { + fep->tx_bounce[i] = kmalloc(MTIP_SWITCH_TX_FRSIZE, GFP_KERNEL); + + bdp->cbd_sc = 0; + bdp->cbd_bufaddr = 0; + bdp++; + } + + /* Set the last buffer to wrap. 
*/ + bdp--; + bdp->cbd_sc |= BD_SC_WRAP; + + return 0; + + err: + mtip_free_buffers(dev); + return -ENOMEM; +} + +static int mtip_rx_napi(struct napi_struct *napi, int budget) +{ + struct mtip_ndev_priv *priv = netdev_priv(napi->dev); + struct switch_enet_private *fep = priv->fep; + int pkts, port; + + pkts = mtip_switch_rx(napi->dev, budget, &port); + if (pkts == -ENOMEM) { + napi_complete(napi); + return 0; + } + + if (!fep->br_offload && + (port == 1 || port == 2) && fep->ndev[port - 1]) + mtip_switch_tx(fep->ndev[port - 1]); + else + mtip_switch_tx(napi->dev); + + if (pkts < budget) { + napi_complete_done(napi, pkts); + /* Set default interrupt mask for L2 switch */ + writel(MCF_ESW_IMR_RXF | MCF_ESW_IMR_TXF, + fep->hwp + ESW_IMR); + } + return pkts; +} + +static int mtip_open(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + int ret, port_idx = priv->portnum - 1; + + if (fep->usage_count == 0) { + ret = clk_enable(fep->clk_ipg); + if (ret) { + dev_err(&fep->pdev->dev, + "Cannot enable switch IPG clock\n"); + return ret; + } + + netif_napi_add(dev, &fep->napi, mtip_rx_napi); + + ret = mtip_alloc_buffers(dev); + if (ret) + goto mtip_alloc_buffers_err; + } + + fep->link[port_idx] = 0; + + /* Probe and connect to PHY when open the interface, if already + * NOT done in the switch driver probe (or when the device is + * re-opened). + */ + ret = mtip_mii_probe(dev); + if (ret) + goto mtip_mii_probe_err; + + phy_start(fep->phy_dev[port_idx]); + + if (fep->usage_count == 0) { + napi_enable(&fep->napi); + mtip_switch_restart(dev, 1, 1); + + fep->curr_time = 0; + netif_start_queue(dev); + } + + fep->usage_count++; + return 0; + + mtip_mii_probe_err: + mtip_free_buffers(dev); + mtip_alloc_buffers_err: + if (fep->usage_count == 0) { + netif_napi_del(&fep->napi); + clk_disable(fep->clk_ipg); + } + return ret; +}; + +static int mtip_close(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + int idx = priv->portnum - 1; + + fep->link[idx] = 0; + + if (fep->phy_dev[idx]) { + phy_stop(fep->phy_dev[idx]); + netif_stop_queue(dev); + phy_disconnect(fep->phy_dev[idx]); + fep->phy_dev[idx] = NULL; + } + + if (fep->usage_count == 1) { + napi_disable(&fep->napi); + netif_napi_del(&fep->napi); + mtip_free_buffers(dev); + clk_disable(fep->clk_ipg); + } + + fep->usage_count--; + + return 0; +} + +#define FEC_HASH_BITS 6 /* #bits in hash */ +static void mtip_set_multicast_list(struct net_device *dev) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + unsigned int hash_high = 0, hash_low = 0, crc; + struct switch_enet_private *fep = priv->fep; + void __iomem *enet_addr = fep->enet_addr; + struct netdev_hw_addr *ha; + unsigned char hash; + + if (priv->portnum == 2) + enet_addr += MCF_ESW_ENET_PORT_OFFSET; + + if (dev->flags & IFF_PROMISC) { + /* Promisc mode is required for switch - it is + * already enabled during driver's probe. 
+ */ + dev_dbg(&dev->dev, "%s: IFF_PROMISC\n", __func__); + return; + } + + if (dev->flags & IFF_ALLMULTI) { + dev_dbg(&dev->dev, "%s: IFF_ALLMULTI\n", __func__); + + /* Allow all multicast addresses */ + writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH); + writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW); + + return; + } + + netdev_for_each_mc_addr(ha, dev) { + /* Calculate crc32 value of mac address */ + crc = ether_crc_le(dev->addr_len, ha->addr); + + /* Only upper 6 bits (FEC_HASH_BITS) are used + * which point to specific bit in the hash registers + */ + hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3F; + + if (hash > 31) + hash_high |= 1 << (hash - 32); + else + hash_low |= 1 << hash; + } + + writel(hash_high, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH); + writel(hash_low, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW); +} + +static int mtip_set_mac_address(struct net_device *dev, void *p) +{ + struct mtip_ndev_priv *priv = netdev_priv(dev); + struct switch_enet_private *fep = priv->fep; + void __iomem *enet_addr = fep->enet_addr; + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + eth_hw_addr_set(dev, addr->sa_data); + + if (priv->portnum == 2) + enet_addr += MCF_ESW_ENET_PORT_OFFSET; + + writel(dev->dev_addr[3] | (dev->dev_addr[2] << 8) | + (dev->dev_addr[1] << 16) | (dev->dev_addr[0] << 24), + enet_addr + MCF_FEC_PALR); + writel((dev->dev_addr[5] << 16) | (dev->dev_addr[4] << 24), + enet_addr + MCF_FEC_PAUR); + + return mtip_update_atable_static((unsigned char *)dev->dev_addr, + 7, 7, fep); +} + +static const struct ethtool_ops mtip_ethtool_ops = { + .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, + .get_drvinfo = mtip_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_ts_info = ethtool_op_get_ts_info, +}; + +static const struct net_device_ops mtip_netdev_ops = { + .ndo_open = mtip_open, + .ndo_stop = mtip_close, + .ndo_start_xmit = mtip_start_xmit, + .ndo_set_rx_mode = mtip_set_multicast_list, + .ndo_tx_timeout = mtip_timeout, + .ndo_set_mac_address = mtip_set_mac_address, +}; + +bool mtip_is_switch_netdev_port(const struct net_device *ndev) +{ + return ndev->netdev_ops == &mtip_netdev_ops; +} + +static int mtip_switch_dma_init(struct switch_enet_private *fep) +{ + struct cbd_t *bdp, *cbd_base; + int ret, i; + + /* Check mask of the streaming and coherent API */ + ret = dma_set_mask_and_coherent(&fep->pdev->dev, DMA_BIT_MASK(32)); + if (ret < 0) { + dev_err(&fep->pdev->dev, "No suitable DMA available\n"); + return ret; + } + + /* Allocate memory for buffer descriptors */ + cbd_base = dma_alloc_coherent(&fep->pdev->dev, PAGE_SIZE, &fep->bd_dma, + GFP_KERNEL); + if (!cbd_base) + return -ENOMEM; + + /* Set receive and transmit descriptor base */ + fep->rx_bd_base = cbd_base; + fep->tx_bd_base = cbd_base + RX_RING_SIZE; + + /* Initialize the receive buffer descriptors */ + bdp = fep->rx_bd_base; + for (i = 0; i < RX_RING_SIZE; i++) { + bdp->cbd_sc = 0; + bdp++; + } + + /* Set the last buffer to wrap */ + bdp--; + bdp->cbd_sc |= BD_SC_WRAP; + + /* ...and the same for transmit */ + bdp = fep->tx_bd_base; + for (i = 0; i < TX_RING_SIZE; i++) { + /* Initialize the BD for every fragment in the page */ + bdp->cbd_sc = 0; + bdp->cbd_bufaddr = 0; + bdp++; + } + + /* Set the last buffer to wrap */ + bdp--; + bdp->cbd_sc |= BD_SC_WRAP; + + return 0; +} + +static void mtip_ndev_cleanup(struct switch_enet_private *fep) +{ + int i; + + for (i = 0; i < SWITCH_EPORT_NUMBER; i++) { 
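+		/* Unregister and free each per-port net_device */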
+		if (fep->ndev[i]) {
+			unregister_netdev(fep->ndev[i]);
+			free_netdev(fep->ndev[i]);
+		}
+	}
+}
+
+static int mtip_ndev_init(struct switch_enet_private *fep,
+			  struct platform_device *pdev)
+{
+	struct mtip_ndev_priv *priv;
+	int i, ret = 0;
+
+	for (i = 0; i < SWITCH_EPORT_NUMBER; i++) {
+		fep->ndev[i] = alloc_netdev(sizeof(struct mtip_ndev_priv),
+					    fep->ndev_name[i], NET_NAME_USER,
+					    ether_setup);
+		if (!fep->ndev[i]) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		fep->ndev[i]->ethtool_ops = &mtip_ethtool_ops;
+		fep->ndev[i]->netdev_ops = &mtip_netdev_ops;
+		SET_NETDEV_DEV(fep->ndev[i], &pdev->dev);
+
+		priv = netdev_priv(fep->ndev[i]);
+		priv->dev = fep->ndev[i];
+		priv->fep = fep;
+		priv->portnum = i + 1;
+		fep->ndev[i]->irq = fep->irq;
+
+		mtip_setup_mac(fep->ndev[i]);
+
+		ret = register_netdev(fep->ndev[i]);
+		if (ret) {
+			dev_err(&fep->ndev[i]->dev,
+				"%s: ndev %s register err: %d\n", __func__,
+				fep->ndev[i]->name, ret);
+			break;
+		}
+		dev_dbg(&fep->ndev[i]->dev, "%s: MTIP eth L2 switch %pM\n",
+			fep->ndev[i]->name, fep->ndev[i]->dev_addr);
+	}
+
+	if (ret)
+		mtip_ndev_cleanup(fep);
+
+	return ret;
+}
+
+static int mtip_parse_of(struct switch_enet_private *fep,
+			 struct device_node *np)
+{
+	struct device_node *p;
+	unsigned int port_num;
+	int ret = 0;
+
+	p = of_find_node_by_name(np, "ethernet-ports");
+
+	for_each_available_child_of_node_scoped(p, port) {
+		if (of_property_read_u32(port, "reg", &port_num))
+			continue;
+
+		if (port_num < 1 || port_num > SWITCH_EPORT_NUMBER) {
+			dev_err(&fep->pdev->dev,
+				"%s: The switch supports up to %d ports!\n",
+				__func__, SWITCH_EPORT_NUMBER);
+			ret = -EINVAL;
+			goto of_get_err;
+		}
+
+		fep->n_ports = port_num;
+		ret = of_get_mac_address(port, &fep->mac[port_num - 1][0]);
+		if (ret)
+			dev_dbg(&fep->pdev->dev,
+				"of_get_mac_address(%pOF) failed (%d)!\n",
+				port, ret);
+
+		ret = of_property_read_string(port, "label",
+					      &fep->ndev_name[port_num - 1]);
+		if (ret < 0) {
+			dev_err(&fep->pdev->dev,
+				"%s: Cannot get ethernet port name (%d)!\n",
+				__func__, ret);
+			goto of_get_err;
+		}
+
+		ret = of_get_phy_mode(port, &fep->phy_interface[port_num - 1]);
+		if (ret < 0) {
+			dev_err(&fep->pdev->dev,
+				"%s: Cannot get PHY mode (%d)!\n", __func__,
+				ret);
+			goto of_get_err;
+		}
+
+		fep->phy_np[port_num - 1] = of_parse_phandle(port,
+							     "phy-handle", 0);
+	}
+
+ of_get_err:
+	of_node_put(p);
+
+	return ret;
+}
+
+static int mtip_sw_learning(void *arg)
+{
+	struct switch_enet_private *fep = arg;
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		/* check learning record valid */
+		mtip_atable_dynamicms_learn_migration(fep, fep->curr_time,
+						      NULL, NULL);
+		schedule_timeout(HZ / 100);
+	}
+
+	return 0;
+}
+
+static void mtip_mii_unregister(struct switch_enet_private *fep)
+{
+	mdiobus_unregister(fep->mii_bus);
+	mdiobus_free(fep->mii_bus);
+}
+
+static const struct mtip_devinfo mtip_imx28_l2switch_info = {
+	.quirks = FEC_QUIRK_BUG_CAPTURE | FEC_QUIRK_SINGLE_MDIO |
+		  FEC_QUIRK_SWAP_FRAME,
+};
+
+static const struct of_device_id mtipl2_of_match[] = {
+	{ .compatible = "nxp,imx28-mtip-switch",
+	  .data = &mtip_imx28_l2switch_info},
+	{ /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, mtipl2_of_match);
+
+static int mtip_sw_probe(struct platform_device *pdev)
+{
+	struct device_node *np = pdev->dev.of_node;
+	const struct of_device_id *of_id;
+	struct switch_enet_private *fep;
+	struct mtip_devinfo *dev_info;
+	int ret;
+
+	fep = devm_kzalloc(&pdev->dev, sizeof(*fep), GFP_KERNEL);
+	if (!fep)
+		return -ENOMEM;
+
+	of_id = of_match_node(mtipl2_of_match, pdev->dev.of_node);
+	if (of_id) {
+		dev_info = (struct mtip_devinfo *)of_id->data;
+		if (dev_info)
+			fep->quirks = dev_info->quirks;
+	}
+
+	fep->pdev = pdev;
+	platform_set_drvdata(pdev, fep);
+
+	fep->enet_addr = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(fep->enet_addr))
+		return PTR_ERR(fep->enet_addr);
+
+	fep->irq = platform_get_irq_byname(pdev, "enet_switch");
+	if (fep->irq < 0)
+		return fep->irq;
+
+	ret = mtip_parse_of(fep, np);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "%s: OF parse error (%d)!\n", __func__,
+			ret);
+		return ret;
+	}
+
+	/* Create an Ethernet device instance.
+	 * The switch lookup address memory starts at 0x800FC000
+	 */
+	fep->hwp_enet = fep->enet_addr;
+	fep->hwp = fep->enet_addr + ENET_SWI_PHYS_ADDR_OFFSET;
+	fep->hwentry = (struct mtip_addr_table __iomem *)
+		       (fep->hwp + MCF_ESW_LOOKUP_MEM_OFFSET);
+
+	ret = devm_regulator_get_enable_optional(&pdev->dev, "phy");
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret,
+				     "Unable to get and enable 'phy'\n");
+
+	fep->clk_ipg = devm_clk_get_enabled(&pdev->dev, "ipg");
+	if (IS_ERR(fep->clk_ipg))
+		return dev_err_probe(&pdev->dev, PTR_ERR(fep->clk_ipg),
+				     "Unable to acquire 'ipg' clock\n");
+
+	fep->clk_ahb = devm_clk_get_enabled(&pdev->dev, "ahb");
+	if (IS_ERR(fep->clk_ahb))
+		return dev_err_probe(&pdev->dev, PTR_ERR(fep->clk_ahb),
+				     "Unable to acquire 'ahb' clock\n");
+
+	fep->clk_enet_out = devm_clk_get_optional_enabled(&pdev->dev,
+							  "enet_out");
+	if (IS_ERR(fep->clk_enet_out))
+		return dev_err_probe(&pdev->dev, PTR_ERR(fep->clk_enet_out),
+				     "Unable to acquire 'enet_out' clock\n");
+
+	/* setup MII interface for external switch ports */
+	mtip_enet_init(fep, 1);
+	mtip_enet_init(fep, 2);
+
+	spin_lock_init(&fep->learn_lock);
+	spin_lock_init(&fep->hw_lock);
+	spin_lock_init(&fep->mii_lock);
+
+	ret = devm_request_irq(&pdev->dev, fep->irq, mtip_interrupt, 0,
+			       dev_name(&pdev->dev), fep);
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret,
+				     "Could not alloc IRQ\n");
+
+	ret = mtip_register_notifiers(fep);
+	if (ret)
+		return ret;
+
+	ret = mtip_ndev_init(fep, pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "%s: Failed to create virtual ndev (%d)\n",
+			__func__, ret);
+		goto ndev_init_err;
+	}
+
+	ret = mtip_switch_dma_init(fep);
+	if (ret) {
+		dev_err(&pdev->dev, "%s: ethernet switch init fail (%d)!\n",
+			__func__, ret);
+		goto dma_init_err;
+	}
+
+	ret = mtip_mii_init(fep, pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "%s: Cannot init phy bus (%d)!\n", __func__,
+			ret);
+		goto mii_init_err;
+	}
+	/* setup timer for learning aging function */
+	timer_setup(&fep->timer_aging, mtip_aging_timer, 0);
+	mod_timer(&fep->timer_aging,
+		  jiffies + msecs_to_jiffies(LEARNING_AGING_INTERVAL));
+
+	fep->task = kthread_run(mtip_sw_learning, fep, "mtip_l2sw_learning");
+	if (IS_ERR(fep->task)) {
+		ret = PTR_ERR(fep->task);
+		dev_err(&pdev->dev, "%s: learning kthread_run error (%d)!\n",
+			__func__, ret);
+		goto task_learning_err;
+	}
+
+	return 0;
+
+ task_learning_err:
+	timer_delete_sync(&fep->timer_aging);
+	mtip_mii_unregister(fep);
+ mii_init_err:
+ dma_init_err:
+	mtip_ndev_cleanup(fep);
+ ndev_init_err:
+	mtip_unregister_notifiers(fep);
+
+	return ret;
+}
+
+static void mtip_sw_remove(struct platform_device *pdev)
+{
+	struct switch_enet_private *fep = platform_get_drvdata(pdev);
+
+	mtip_unregister_notifiers(fep);
+	mtip_ndev_cleanup(fep);
+
+	mtip_mii_remove(fep);
+
+	kthread_stop(fep->task);
+	timer_delete_sync(&fep->timer_aging);
+	platform_set_drvdata(pdev, NULL);
+}
+
+static struct platform_driver mtipl2plat_driver = {
+	
.driver = { + .name = "mtipl2sw", + .of_match_table = mtipl2_of_match, + .suppress_bind_attrs = true, + }, + .probe = mtip_sw_probe, + .remove = mtip_sw_remove, +}; + +module_platform_driver(mtipl2plat_driver); + +MODULE_AUTHOR("Lukasz Majewski "); +MODULE_DESCRIPTION("Driver for MTIP L2 on SOC switch"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.h b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.h new file mode 100644 index 0000000000000..33cb20cd1360b --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.h @@ -0,0 +1,771 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 DENX Software Engineering GmbH + * Lukasz Majewski + */ + +#ifndef __MTIP_L2SWITCH_H_ +#define __MTIP_L2SWITCH_H_ + +#include +#include +#include +#include +#include +#include + +#define PKT_MAXBUF_SIZE 1518 +#define PKT_MINBUF_SIZE 64 +#define PKT_MAXBLR_SIZE 1520 + +/* The number of Tx and Rx buffers. These are allocated from the page + * pool. The code may assume these are power of two, so it is best + * to keep them that size. + * We don't need to allocate pages for the transmitter. We just use + * the skbuffer directly. + */ +#define MTIP_SWITCH_RX_PAGES 8 +#define MTIP_SWITCH_RX_FRSIZE 2048 +#define MTIP_SWITCH_RX_FRPPG (PAGE_SIZE / MTIP_SWITCH_RX_FRSIZE) +#define RX_RING_SIZE (MTIP_SWITCH_RX_FRPPG * MTIP_SWITCH_RX_PAGES) +#define MTIP_SWITCH_TX_FRSIZE 2048 +#define MTIP_SWITCH_TX_FRPPG (PAGE_SIZE / MTIP_SWITCH_TX_FRSIZE) + +#define TX_RING_SIZE 16 /* Must be power of two */ +#define TX_RING_MOD_MASK 15 /* for this to work */ + +#define SWITCH_EPORT_NUMBER 2 + +#if (((RX_RING_SIZE + TX_RING_SIZE) * 8) > PAGE_SIZE) +#error "L2SWITCH: descriptor ring size constants too large" +#endif + +#define ESW_REVISION (0x000) +#define ESW_SCRATCH (0x004) +#define ESW_PER (0x008) +#define ESW_VLANV (0x010) +#define ESW_DBCR (0x014) +#define ESW_DMCR (0x018) +#define ESW_BKLR (0x01C) +#define ESW_BMPC (0x020) +#define ESW_MODE (0x024) +#define ESW_VIMSEL (0x028) +#define ESW_VOMSEL (0x02C) +#define ESW_VIMEN (0x030) +#define ESW_VID (0x034) + +#define ESW_MCR (0x040) +#define ESW_EGMAP (0x044) +#define ESW_INGMAP (0x048) +#define ESW_INGSAL (0x04C) +#define ESW_INGSAH (0x050) +#define ESW_INGDAL (0x054) +#define ESW_INGDAH (0x058) +#define ESW_ENGSAL (0x05C) +#define ESW_ENGSAH (0x060) +#define ESW_ENGDAL (0x064) +#define ESW_ENGDAH (0x068) +#define ESW_MCVAL (0x06C) + +#define ESW_MMSR (0x080) +#define ESW_LMT (0x084) +#define ESW_LFC (0x088) +#define ESW_PCSR (0x08C) +#define ESW_IOSR (0x090) +#define ESW_QWT (0x094) + +#define ESW_P0BCT (0x09C) + +#define ESW_P0FFEN (0x0BC) + +#define ESW_PSNP_BASE (0x0C0) +#define ESW_PSNP(x) (ESW_PSNP_BASE + (4 * (x))) + +#define ESW_IPSNP_BASE (0x0EC) +#define ESW_IPSNP(x) (ESW_IPSNP_BASE + (4 * (x))) + +#define ESW_PVRES_BASE (0x100) +#define ESW_PVRES(x) (ESW_PVRES_BASE + (4 * (x))) + +#define ESW_IPRES (0x140) + +#define ESW_PRES_BASE (0x180) +#define ESW_PRES(x) (ESW_PRES_BASE + (4 * (x))) + +#define ESW_PID_BASE (0x200) +#define ESW_PID(x) (ESW_PID_BASE + (4 * (x))) + +#define ESW_VRES_BASE (0x280) +#define ESW_VRES(x) (ESW_VRES_BASE + (4 * (x))) + +#define ESW_DISCN (0x300) +#define ESW_DISCB (0x304) +#define ESW_NDISCN (0x308) +#define ESW_NDISCB (0x30C) + +#define ESW_ISR (0x400) +#define ESW_IMR (0x404) +#define ESW_RDSR (0x408) +#define ESW_TDSR (0x40C) +#define ESW_MRBR (0x410) +#define ESW_RDAR (0x414) +#define ESW_TDAR (0x418) + +#define ESW_LREC0 (0x500) +#define ESW_LREC1 (0x504) +#define ESW_LSR 
(0x508) + +struct addr_table64b_entry { + u32 lo; /* lower 32 bits */ + u32 hi; /* upper 32 bits */ +}; + +struct mtip_addr_table { + struct addr_table64b_entry mtip_table64b_entry[2048]; +}; + +#define MCF_ESW_LOOKUP_MEM_OFFSET 0x4000 +#define MCF_ESW_ENET_PORT_OFFSET 0x4000 +#define ENET_SWI_PHYS_ADDR_OFFSET 0x8000 +#define MCF_ESW_PER (0x08) +#define MCF_ESW_DBCR (0x14) +#define MCF_ESW_IMR (0x404) + +#define MCF_FEC_BASE_ADDR (fep->enet_addr) +#define MCF_FEC_EIR (0x04) +#define MCF_FEC_EIMR (0x08) +#define MCF_FEC_MMFR (0x40) +#define MCF_FEC_MSCR (0x44) + +#define MCF_FEC_RCR (0x84) +#define MCF_FEC_TCR (0xC4) +#define MCF_FEC_ECR (0x24) + +#define MCF_FEC_PALR (0xE4) +#define MCF_FEC_PAUR (0xE8) + +#define MCF_FEC_ERDSR (0x180) +#define MCF_FEC_ETDSR (0x184) + +#define MCF_FEC_IAUR (0x118) +#define MCF_FEC_IALR (0x11C) + +#define MCF_FEC_GAUR (0x120) +#define MCF_FEC_GALR (0x124) + +#define MCF_FEC_EMRBR (0x188) + +#define MCF_FEC_RCR_DRT BIT(1) +#define MCF_FEC_RCR_MII_MODE BIT(2) +#define MCF_FEC_RCR_PROM BIT(3) +#define MCF_FEC_RCR_FCE BIT(5) +#define MCF_FEC_RCR_RMII_MODE BIT(8) +#define MCF_FEC_RCR_RMII_10BASET BIT(9) +#define MCF_FEC_RCR_MAX_FL(x) (((x) & 0x00003FFF) << 16) +#define MCF_FEC_RCR_CRC_FWD BIT(14) +#define MCF_FEC_RCR_NO_LGTH_CHECK BIT(30) +#define MCF_FEC_TCR_FDEN BIT(2) + +#define MCF_FEC_ECR_RESET BIT(0) +#define MCF_FEC_ECR_ETHER_EN BIT(1) +#define MCF_FEC_ECR_MAGIC_ENA BIT(2) +#define MCF_FEC_ECR_ENA_1588 BIT(4) + +#define MTIP_ALIGNMENT 0xf +#define MCF_ENET_MII BIT(23) + +/* FEC MII MMFR bits definition */ +#define FEC_MMFR_ST BIT(30) +#define FEC_MMFR_OP_READ BIT(29) +#define FEC_MMFR_OP_WRITE BIT(28) +#define FEC_MMFR_PA(v) (((v) & 0x1F) << 23) +#define FEC_MMFR_RA(v) (((v) & 0x1F) << 18) +#define FEC_MMFR_TA (2 << 16) +#define FEC_MMFR_DATA(v) ((v) & 0xffff) + +/* Port 0 backpressure congestion threshold */ +#define P0BC_THRESHOLD 0x40 +#define LEARNING_AGING_INTERVAL 100 +/* Info received from Hardware Learning FIFO, + * holding MAC address and corresponding Hash Value and + * port number where the frame was received (disassembled). + */ +struct mtip_port_info { + /* MAC lower 32 bits (first byte is 7:0). */ + u32 maclo; + /* MAC upper 16 bits (47:32). */ + u32 machi; + /* the hash value for this MAC address. */ + u32 hash; + /* the port number this MAC address is associated with. */ + u32 port; +}; + +/* Define the buffer descriptor structure. */ +struct cbd_t { + u16 cbd_datlen; /* Data length */ + u16 cbd_sc; /* Control and status info */ + u32 cbd_bufaddr; /* Buffer address */ +}; + +/* The switch buffer descriptors track the ring buffers. The rx_bd_base and + * tx_bd_base always point to the base of the buffer descriptors. The + * cur_rx and cur_tx point to the currently available buffer. + * The dirty_tx tracks the current buffer that is being sent by the + * controller. The cur_tx and dirty_tx are equal under both completely + * empty and completely full conditions. The empty/ready indicator in + * the buffer descriptor determines the actual condition. 
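+ * For example: with cur_tx == dirty_tx, the ring is empty when the
+ * descriptor at dirty_tx has BD_ENET_TX_READY cleared, and full when
+ * that bit is still set (the driver mirrors the full case in tx_full).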
+ */ +struct switch_enet_private { + /* Base addresses for HW registers of the switch device */ + void __iomem *hwp_enet, *hwp, *enet_addr; + struct mtip_addr_table __iomem *hwentry; + + struct platform_device *pdev; + + /* Switch internals */ + struct mtip_port_info g_info; + + /* Clocks */ + struct clk *clk_ipg; + struct clk *clk_ahb; + struct clk *clk_enet_out; + + /* skbuff */ + unsigned char *tx_bounce[TX_RING_SIZE]; + struct sk_buff *tx_skbuff[TX_RING_SIZE]; + struct sk_buff *rx_skbuff[RX_RING_SIZE]; + ushort skb_cur; + ushort skb_dirty; + + /* DMA */ + dma_addr_t bd_dma; + struct cbd_t *rx_bd_base; /* Address of Rx and Tx buffers. */ + struct cbd_t *tx_bd_base; + struct cbd_t *cur_rx, *cur_tx; /* The next free ring entry */ + struct cbd_t *dirty_tx; /* The ring entries to be free()ed. */ + uint tx_full; + + /* Locking */ + spinlock_t hw_lock; /* Lock for HW configuration */ + spinlock_t mii_lock; /* Lock for MII operation */ + spinlock_t learn_lock; /* Lock for learning DB adjustments */ + + /* NAPI support */ + struct napi_struct napi; + + /* Timer for Aging */ + struct timer_list timer_aging; + struct task_struct *task; + int at_block_overflows; + int at_curr_entries; + int curr_time; + + /* PHY and MDIO */ + struct mii_bus *mii_bus; + struct phy_device *phy_dev[SWITCH_EPORT_NUMBER]; + uint phy_speed; + int link[SWITCH_EPORT_NUMBER]; + int full_duplex[SWITCH_EPORT_NUMBER]; + phy_interface_t phy_interface[SWITCH_EPORT_NUMBER]; + struct device_node *phy_np[SWITCH_EPORT_NUMBER]; + + /* IRQ number */ + int irq; + + /* lan[01] ports */ + int n_ports; + const char *ndev_name[SWITCH_EPORT_NUMBER]; + struct net_device *ndev[SWITCH_EPORT_NUMBER]; + unsigned char mac[SWITCH_EPORT_NUMBER][ETH_ALEN]; + + /* Switch state */ + u8 br_members; /* Bit field with active members */ + u8 br_offload; /* Bridge in-HW offloading flag */ + int usage_count; /* Number of configured ports */ + + /* Driver related */ + u32 quirks; +}; + +struct mtip_ndev_priv { + int portnum; + struct net_device *dev; + struct net_device_stats stats; + struct net_device *master_dev; + struct switch_enet_private *fep; +}; + +#define MCF_FEC_MII_DATA 0x040 /* MII manage frame reg */ +#define MCF_FEC_GRP_HASH_TABLE_HIGH 0x120 /* High 32bits hash table */ +#define MCF_FEC_GRP_HASH_TABLE_LOW 0x124 /* Low 32bits hash table */ + +#define BD_SC_EMPTY ((ushort)0x8000) /* Receive is empty */ +#define BD_SC_READY ((ushort)0x8000) /* Transmit is ready */ +#define BD_SC_WRAP ((ushort)0x2000) /* Last buffer descriptor */ +#define BD_SC_INTRPT ((ushort)0x1000) /* Interrupt on change */ +#define BD_SC_CM ((ushort)0x0200) /* Continuous mode */ +#define BD_SC_ID ((ushort)0x0100) /* Rec'd too many idles */ +#define BD_SC_P ((ushort)0x0100) /* xmt preamble */ +#define BD_SC_BR ((ushort)0x0020) /* Break received */ +#define BD_SC_FR ((ushort)0x0010) /* Framing error */ +#define BD_SC_PR ((ushort)0x0008) /* Parity error */ +#define BD_SC_OV ((ushort)0x0002) /* Overrun */ +#define BD_SC_CD ((ushort)0x0001) + +/* Buffer descriptor control/status used by Ethernet receive. 
*/ +#define BD_ENET_RX_EMPTY ((ushort)0x8000) +#define BD_ENET_RX_WRAP ((ushort)0x2000) +#define BD_ENET_RX_INTR ((ushort)0x1000) +#define BD_ENET_RX_LAST ((ushort)0x0800) +#define BD_ENET_RX_FIRST ((ushort)0x0400) +#define BD_ENET_RX_MISS ((ushort)0x0100) +#define BD_ENET_RX_LG ((ushort)0x0020) +#define BD_ENET_RX_NO ((ushort)0x0010) +#define BD_ENET_RX_SH ((ushort)0x0008) +#define BD_ENET_RX_CR ((ushort)0x0004) +#define BD_ENET_RX_OV ((ushort)0x0002) +#define BD_ENET_RX_CL ((ushort)0x0001) +/* All status bits */ +#define BD_ENET_RX_STATS ((ushort)0x013f) + +/* Buffer descriptor control/status used by Ethernet transmit.*/ +#define BD_ENET_TX_READY ((ushort)0x8000) +#define BD_ENET_TX_PAD ((ushort)0x4000) +#define BD_ENET_TX_WRAP ((ushort)0x2000) +#define BD_ENET_TX_INTR ((ushort)0x1000) +#define BD_ENET_TX_LAST ((ushort)0x0800) +#define BD_ENET_TX_TC ((ushort)0x0400) +#define BD_ENET_TX_DEF ((ushort)0x0200) +#define BD_ENET_TX_HB ((ushort)0x0100) +#define BD_ENET_TX_LC ((ushort)0x0080) +#define BD_ENET_TX_RL ((ushort)0x0040) +#define BD_ENET_TX_RCMASK ((ushort)0x003c) +#define BD_ENET_TX_UN ((ushort)0x0002) +#define BD_ENET_TX_CSL ((ushort)0x0001) +/* All status bits */ +#define BD_ENET_TX_STATS ((ushort)0x03ff) + +/* Copy from validation code */ +#define RX_BUFFER_SIZE 256 +#define TX_BUFFER_SIZE 256 + +#define TX_BD_R BIT(15) +#define TX_BD_TO1 BIT(14) +#define TX_BD_W BIT(13) +#define TX_BD_TO2 BIT(12) +#define TX_BD_L BIT(11) +#define TX_BD_TC BIT(10) + +#define TX_BD_INT BIT(30) +#define TX_BD_TS BIT(29) +#define TX_BD_PINS BIT(28) +#define TX_BD_IINS BIT(27) +#define TX_BD_TXE BIT(15) +#define TX_BD_UE BIT(13) +#define TX_BD_EE BIT(12) +#define TX_BD_FE BIT(11) +#define TX_BD_LCE BIT(10) +#define TX_BD_OE BIT(9) +#define TX_BD_TSE BIT(8) +#define TX_BD_BDU BIT(31) + +#define RX_BD_E BIT(15) +#define RX_BD_R01 BIT(14) +#define RX_BD_W BIT(13) +#define RX_BD_R02 BIT(12) +#define RX_BD_L BIT(11) +#define RX_BD_M BIT(8) +#define RX_BD_BC BIT(7) +#define RX_BD_MC BIT(6) +#define RX_BD_LG BIT(5) +#define RX_BD_NO BIT(4) +#define RX_BD_CR BIT(2) +#define RX_BD_OV BIT(1) +#define RX_BD_TR BIT(0) + +#define RX_BD_ME BIT(31) +#define RX_BD_PE 0x04000000 +#define RX_BD_CE 0x02000000 +#define RX_BD_UC 0x01000000 +#define RX_BD_INT 0x00800000 +#define RX_BD_ICE BIT(5) +#define RX_BD_PCR BIT(4) +#define RX_BD_VLAN BIT(2) +#define RX_BD_IPV6 BIT(1) +#define RX_BD_FRAG BIT(0) +#define RX_BD_BDU BIT(31) +/****************************************************************************/ + +/* Address Table size in bytes(2048 64bit entry ) */ +#define MTIP_ATABLE_MEM_SIZE (2048 * 8) +/* How many 64-bit elements fit in the address table */ +#define MTIP_ATABLE_MEM_NUM_ENTRIES (2048) +/* Address Table Maximum number of entries in each Slot */ +#define ATABLE_ENTRY_PER_SLOT 8 +/* log2(ATABLE_ENTRY_PER_SLOT)*/ +#define ATABLE_ENTRY_PER_SLOT_bits 3 +/* entry size in byte */ +#define ATABLE_ENTRY_SIZE 8 +/* slot size in byte */ +#define ATABLE_SLOT_SIZE (ATABLE_ENTRY_PER_SLOT * ATABLE_ENTRY_SIZE) +/* width of timestamp variable (bits) within address table entry */ +#define AT_DENTRY_TIMESTAMP_WIDTH 10 +/* number of bits for port number storage */ +#define AT_DENTRY_PORT_WIDTH 4 +/* number of bits for port bitmask number storage */ +#define AT_SENTRY_PORT_WIDTH 11 +/* address table static entry port bitmask start address bit */ +#define AT_SENTRY_PORTMASK_shift 21 +/* address table static entry priority start address bit */ +#define AT_SENTRY_PRIO_shift 18 +/* address table dynamic entry port start address bit 
*/ +#define AT_DENTRY_PORT_shift 28 +/* address table dynamic entry timestamp start address bit */ +#define AT_DENTRY_TIME_shift 18 +/* address table entry record type start address bit */ +#define AT_ENTRY_TYPE_shift 17 +/* address table entry record type bit: 1 static, 0 dynamic */ +#define AT_ENTRY_TYPE_STATIC 1 +#define AT_ENTRY_TYPE_DYNAMIC 0 +/* address table entry record valid start address bit */ +#define AT_ENTRY_VALID_shift 16 +#define AT_ENTRY_RECORD_VALID 1 + +/* return block corresponding to the 8 bit hash value calculated */ +#define GET_BLOCK_PTR(hash) ((hash) << 3) +#define AT_EXTRACT_TIMESTAMP(x) \ + (((x) >> AT_DENTRY_TIME_shift) & ((1 << AT_DENTRY_TIMESTAMP_WIDTH) - 1)) +#define AT_EXTRACT_PORT(x) \ + (((x) >> AT_DENTRY_PORT_shift) & ((1 << AT_DENTRY_PORT_WIDTH) - 1)) +#define TIMEDELTA(newtime, oldtime) \ + (((newtime) - (oldtime)) & \ + ((1 << AT_DENTRY_TIMESTAMP_WIDTH) - 1)) + +/* increment time value respecting modulo. */ +static inline int mtip_timeincrement(int time) +{ + return (time + 1) & ((1 << AT_DENTRY_TIMESTAMP_WIDTH) - 1); +} + +/* ------------------------------------------------------------------------- */ +/* Bit definitions and macros for MCF_ESW_REVISION */ +#define MCF_MTIP_REVISION_CORE_REVISION(x) ((x) & 0x0000FFFF) +#define MCF_MTIP_REVISION_CUSTOMER_REVISION(x) (((x) & 0xFFFF0000) >> 16) + +/* Bit definitions and macros for MCF_ESW_PER */ +#define MCF_ESW_PER_TE0 BIT(0) +#define MCF_ESW_PER_TE1 BIT(1) +#define MCF_ESW_PER_TE2 BIT(2) +#define MCF_ESW_PER_RE0 BIT(16) +#define MCF_ESW_PER_RE1 BIT(17) +#define MCF_ESW_PER_RE2 BIT(18) + +/* Bit definitions and macros for MCF_ESW_VLANV */ +#define MCF_ESW_VLANV_VV0 BIT(0) +#define MCF_ESW_VLANV_VV1 BIT(1) +#define MCF_ESW_VLANV_VV2 BIT(2) +#define MCF_ESW_VLANV_DU0 BIT(16) +#define MCF_ESW_VLANV_DU1 BIT(17) +#define MCF_ESW_VLANV_DU2 BIT(18) + +/* Bit definitions and macros for MCF_ESW_DBCR */ +#define MCF_ESW_DBCR_P0 BIT(0) +#define MCF_ESW_DBCR_P1 BIT(1) +#define MCF_ESW_DBCR_P2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_DMCR */ +#define MCF_ESW_DMCR_P0 BIT(0) +#define MCF_ESW_DMCR_P1 BIT(1) +#define MCF_ESW_DMCR_P2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_BKLR */ +#define MCF_ESW_BKLR_BE0 BIT(0) +#define MCF_ESW_BKLR_BE1 BIT(1) +#define MCF_ESW_BKLR_BE2 BIT(2) +#define MCF_ESW_BKLR_LD0 BIT(16) +#define MCF_ESW_BKLR_LD1 BIT(17) +#define MCF_ESW_BKLR_LD2 BIT(18) + +/* Bit definitions and macros for MCF_ESW_BMPC */ +#define MCF_ESW_BMPC_PORT(x) (((x) & 0x0000000F) << 0) +#define MCF_ESW_BMPC_MSG_TX BIT(5) +#define MCF_ESW_BMPC_EN BIT(6) +#define MCF_ESW_BMPC_DIS BIT(7) +#define MCF_ESW_BMPC_PRIORITY(x) (((x) & 0x00000007) << 13) +#define MCF_ESW_BMPC_PORTMASK(x) (((x) & 0x00000007) << 16) + +/* Bit definitions and macros for MCF_ESW_MODE */ +#define MCF_ESW_MODE_SW_RST BIT(0) +#define MCF_ESW_MODE_SW_EN BIT(1) +#define MCF_ESW_MODE_STOP BIT(7) +#define MCF_ESW_MODE_CRC_TRAN BIT(8) +#define MCF_ESW_MODE_P0CT BIT(9) +#define MCF_ESW_MODE_STATRST BIT(31) + +/* Bit definitions and macros for MCF_ESW_VIMSEL */ +#define MCF_ESW_VIMSEL_IM0(x) (((x) & 0x00000003) << 0) +#define MCF_ESW_VIMSEL_IM1(x) (((x) & 0x00000003) << 2) +#define MCF_ESW_VIMSEL_IM2(x) (((x) & 0x00000003) << 4) + +/* Bit definitions and macros for MCF_ESW_VOMSEL */ +#define MCF_ESW_VOMSEL_OM0(x) (((x) & 0x00000003) << 0) +#define MCF_ESW_VOMSEL_OM1(x) (((x) & 0x00000003) << 2) +#define MCF_ESW_VOMSEL_OM2(x) (((x) & 0x00000003) << 4) + +/* Bit definitions and macros for MCF_ESW_VIMEN */ +#define MCF_ESW_VIMEN_EN0 BIT(0) 
+#define MCF_ESW_VIMEN_EN1 BIT(1) +#define MCF_ESW_VIMEN_EN2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_VID */ +#define MCF_ESW_VID_TAG(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_MCR */ +#define MCF_ESW_MCR_PORT(x) (((x) & 0x0000000F) << 0) +#define MCF_ESW_MCR_MEN BIT(4) +#define MCF_ESW_MCR_INGMAP BIT(5) +#define MCF_ESW_MCR_EGMAP BIT(6) +#define MCF_ESW_MCR_INGSA BIT(7) +#define MCF_ESW_MCR_INGDA BIT(8) +#define MCF_ESW_MCR_EGSA BIT(9) +#define MCF_ESW_MCR_EGDA BIT(10) + +/* Bit definitions and macros for MCF_ESW_EGMAP */ +#define MCF_ESW_EGMAP_EG0 BIT(0) +#define MCF_ESW_EGMAP_EG1 BIT(1) +#define MCF_ESW_EGMAP_EG2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_INGMAP */ +#define MCF_ESW_INGMAP_ING0 BIT(0) +#define MCF_ESW_INGMAP_ING1 BIT(1) +#define MCF_ESW_INGMAP_ING2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_INGSAL */ +#define MCF_ESW_INGSAL_ADDLOW(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_INGSAH */ +#define MCF_ESW_INGSAH_ADDHIGH(x) (((x) & 0x0000FFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_INGDAL */ +#define MCF_ESW_INGDAL_ADDLOW(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_INGDAH */ +#define MCF_ESW_INGDAH_ADDHIGH(x) (((x) & 0x0000FFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_ENGSAL */ +#define MCF_ESW_ENGSAL_ADDLOW(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_ENGSAH */ +#define MCF_ESW_ENGSAH_ADDHIGH(x) (((x) & 0x0000FFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_ENGDAL */ +#define MCF_ESW_ENGDAL_ADDLOW(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_ENGDAH */ +#define MCF_ESW_ENGDAH_ADDHIGH(x) (((x) & 0x0000FFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_MCVAL */ +#define MCF_ESW_MCVAL_COUNT(x) (((x) & 0x000000FF) << 0) + +/* Bit definitions and macros for MCF_ESW_MMSR */ +#define MCF_ESW_MMSR_BUSY BIT(0) +#define MCF_ESW_MMSR_NOCELL BIT(1) +#define MCF_ESW_MMSR_MEMFULL BIT(2) +#define MCF_ESW_MMSR_MFLATCH BIT(3) +#define MCF_ESW_MMSR_DQ_GRNT BIT(6) +#define MCF_ESW_MMSR_CELLS_AVAIL(x) (((x) & 0x000000FF) << 16) + +/* Bit definitions and macros for MCF_ESW_LMT */ +#define MCF_ESW_LMT_THRESH(x) (((x) & 0x000000FF) << 0) + +/* Bit definitions and macros for MCF_ESW_LFC */ +#define MCF_ESW_LFC_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_PCSR */ +#define MCF_ESW_PCSR_PC0 BIT(0) +#define MCF_ESW_PCSR_PC1 BIT(1) +#define MCF_ESW_PCSR_PC2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_IOSR */ +#define MCF_ESW_IOSR_OR0 BIT(0) +#define MCF_ESW_IOSR_OR1 BIT(1) +#define MCF_ESW_IOSR_OR2 BIT(2) + +/* Bit definitions and macros for MCF_ESW_QWT */ +#define MCF_ESW_QWT_Q0WT(x) (((x) & 0x0000001F) << 0) +#define MCF_ESW_QWT_Q1WT(x) (((x) & 0x0000001F) << 8) +#define MCF_ESW_QWT_Q2WT(x) (((x) & 0x0000001F) << 16) +#define MCF_ESW_QWT_Q3WT(x) (((x) & 0x0000001F) << 24) + +/* Bit definitions and macros for MCF_ESW_P0BCT */ +#define MCF_ESW_P0BCT_THRESH(x) (((x) & 0x000000FF) << 0) + +/* Bit definitions and macros for MCF_ESW_P0FFEN */ +#define MCF_ESW_P0FFEN_FEN BIT(0) +#define MCF_ESW_P0FFEN_FD(x) (((x) & 0x00000003) << 2) + +/* Bit definitions and macros for MCF_ESW_PSNP */ +#define MCF_ESW_PSNP_EN BIT(0) +#define MCF_ESW_PSNP_MODE(x) (((x) & 0x00000003) << 1) +#define MCF_ESW_PSNP_CD BIT(3) +#define MCF_ESW_PSNP_CS BIT(4) +#define MCF_ESW_PSNP_PORT_COMPARE(x) (((x) & 0x0000FFFF) << 16) + +/* Bit definitions and macros for MCF_ESW_IPSNP */ 
+#define MCF_ESW_IPSNP_EN BIT(0) +#define MCF_ESW_IPSNP_MODE(x) (((x) & 0x00000003) << 1) +#define MCF_ESW_IPSNP_PROTOCOL(x) (((x) & 0x000000FF) << 8) + +/* Bit definitions and macros for MCF_ESW_PVRES */ +#define MCF_ESW_PVRES_PRI0(x) (((x) & 0x00000007) << 0) +#define MCF_ESW_PVRES_PRI1(x) (((x) & 0x00000007) << 3) +#define MCF_ESW_PVRES_PRI2(x) (((x) & 0x00000007) << 6) +#define MCF_ESW_PVRES_PRI3(x) (((x) & 0x00000007) << 9) +#define MCF_ESW_PVRES_PRI4(x) (((x) & 0x00000007) << 12) +#define MCF_ESW_PVRES_PRI5(x) (((x) & 0x00000007) << 15) +#define MCF_ESW_PVRES_PRI6(x) (((x) & 0x00000007) << 18) +#define MCF_ESW_PVRES_PRI7(x) (((x) & 0x00000007) << 21) + +/* Bit definitions and macros for MCF_ESW_IPRES */ +#define MCF_ESW_IPRES_ADDRESS(x) (((x) & 0x000000FF) << 0) +#define MCF_ESW_IPRES_IPV4SEL BIT(8) +#define MCF_ESW_IPRES_PRI0(x) (((x) & 0x00000003) << 9) +#define MCF_ESW_IPRES_PRI1(x) (((x) & 0x00000003) << 11) +#define MCF_ESW_IPRES_PRI2(x) (((x) & 0x00000003) << 13) +#define MCF_ESW_IPRES_READ BIT(31) + +/* Bit definitions and macros for MCF_ESW_PRES */ +#define MCF_ESW_PRES_VLAN BIT(0) +#define MCF_ESW_PRES_IP BIT(1) +#define MCF_ESW_PRES_MAC BIT(2) +#define MCF_ESW_PRES_DFLT_PRI(x) (((x) & 0x00000007) << 4) + +/* Bit definitions and macros for MCF_ESW_PID */ +#define MCF_ESW_PID_VLANID(x) (((x) & 0x0000FFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_VRES */ +#define MCF_ESW_VRES_P0 BIT(0) +#define MCF_ESW_VRES_P1 BIT(1) +#define MCF_ESW_VRES_P2 BIT(2) +#define MCF_ESW_VRES_VLANID(x) (((x) & 0x00000FFF) << 3) + +/* Bit definitions and macros for MCF_ESW_DISCN */ +#define MCF_ESW_DISCN_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_DISCB */ +#define MCF_ESW_DISCB_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_NDISCN */ +#define MCF_ESW_NDISCN_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_NDISCB */ +#define MCF_ESW_NDISCB_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_POQC */ +#define MCF_ESW_POQC_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_PMVID */ +#define MCF_ESW_PMVID_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_PMVTAG */ +#define MCF_ESW_PMVTAG_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_PBL */ +#define MCF_ESW_PBL_COUNT(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_ISR */ +#define MCF_ESW_ISR_EBERR BIT(0) +#define MCF_ESW_ISR_RXB BIT(1) +#define MCF_ESW_ISR_RXF BIT(2) +#define MCF_ESW_ISR_TXB BIT(3) +#define MCF_ESW_ISR_TXF BIT(4) +#define MCF_ESW_ISR_QM BIT(5) +#define MCF_ESW_ISR_OD0 BIT(6) +#define MCF_ESW_ISR_OD1 BIT(7) +#define MCF_ESW_ISR_OD2 BIT(8) +#define MCF_ESW_ISR_LRN BIT(9) + +/* Bit definitions and macros for MCF_ESW_IMR */ +#define MCF_ESW_IMR_EBERR BIT(0) +#define MCF_ESW_IMR_RXB BIT(1) +#define MCF_ESW_IMR_RXF BIT(2) +#define MCF_ESW_IMR_TXB BIT(3) +#define MCF_ESW_IMR_TXF BIT(4) +#define MCF_ESW_IMR_QM BIT(5) +#define MCF_ESW_IMR_OD0 BIT(6) +#define MCF_ESW_IMR_OD1 BIT(7) +#define MCF_ESW_IMR_OD2 BIT(8) +#define MCF_ESW_IMR_LRN BIT(9) + +/* Bit definitions and macros for MCF_ESW_RDSR */ +#define MCF_ESW_RDSR_ADDRESS(x) (((x) & 0x3FFFFFFF) << 2) + +/* Bit definitions and macros for MCF_ESW_TDSR */ +#define MCF_ESW_TDSR_ADDRESS(x) (((x) & 0x3FFFFFFF) << 2) + +/* Bit definitions and macros for MCF_ESW_MRBR */ +#define MCF_ESW_MRBR_SIZE(x) (((x) & 0x000003FF) << 4) + +/* Bit definitions and 
macros for MCF_ESW_RDAR */ +#define MCF_ESW_RDAR_R_DES_ACTIVE BIT(24) + +/* Bit definitions and macros for MCF_ESW_TDAR */ +#define MCF_ESW_TDAR_X_DES_ACTIVE BIT(24) + +/* Bit definitions and macros for MCF_ESW_LREC0 */ +#define MCF_ESW_LREC0_MACADDR0(x) (((x) & 0xFFFFFFFF) << 0) + +/* Bit definitions and macros for MCF_ESW_LREC1 */ +#define MCF_ESW_LREC1_MACADDR1(x) (((x) & 0x0000FFFF) << 0) +#define MCF_ESW_LREC1_HASH(x) (((x) & 0x000000FF) << 16) +#define MCF_ESW_LREC1_SWPORT(x) (((x) & 0x00000003) << 24) + +/* Bit definitions and macros for MCF_ESW_LSR */ +#define MCF_ESW_LSR_DA BIT(0) + +/* QUIRKS */ +/* Controller needs driver to swap frame */ +#define FEC_QUIRK_SWAP_FRAME BIT(1) +/* ENET Block Guide/ Chapter for the iMX6SX (PELE) address one issue: + * After set ENET_ATCR[Capture], there need some time cycles before the counter + * value is capture in the register clock domain. + * The wait-time-cycles is at least 6 clock cycles of the slower clock between + * the register clock and the 1588 clock. The 1588 ts_clk is fixed to 25Mhz, + * register clock is 66Mhz, so the wait-time-cycles must be greater than 240ns + * (40ns * 6). + */ +#define FEC_QUIRK_BUG_CAPTURE BIT(10) +/* Controller has only one MDIO bus */ +#define FEC_QUIRK_SINGLE_MDIO BIT(11) + +#define MTIP_PORT_FORWARDING_INIT 0xFF + +/* Switch Management functions */ +int mtip_vlan_input_process(struct switch_enet_private *fep, + int port, int mode, unsigned short port_vlanid, + int vlan_verify_en, int vlan_domain_num, + int vlan_domain_port); +int mtip_set_vlan_verification(struct switch_enet_private *fep, int port, + int vlan_domain_verify_en, + int vlan_discard_unknown_en); +int mtip_port_multicast_config(struct switch_enet_private *fep, int port, + bool enable); +int mtip_vlan_output_process(struct switch_enet_private *fep, int port, + int mode); +void mtip_switch_en_port_separation(struct switch_enet_private *fep); +void mtip_switch_dis_port_separation(struct switch_enet_private *fep); +int mtip_port_broadcast_config(struct switch_enet_private *fep, + int port, bool enable); +int mtip_forced_forward(struct switch_enet_private *fep, int port, bool enable); +int mtip_port_learning_config(struct switch_enet_private *fep, int port, + bool disable, bool irq_adj); +int mtip_port_blocking_config(struct switch_enet_private *fep, int port, + bool enable); +bool mtip_is_switch_netdev_port(const struct net_device *ndev); +int mtip_register_notifiers(struct switch_enet_private *fep); +void mtip_unregister_notifiers(struct switch_enet_private *fep); +int mtip_port_enable_config(struct switch_enet_private *fep, int port, + bool tx_en, bool rx_en); +void mtip_clear_atable(struct switch_enet_private *fep); +#endif /* __MTIP_L2SWITCH_H_ */ diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_br.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_br.c new file mode 100644 index 0000000000000..edfd95a7790db --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_br.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * L2 switch Controller driver for MTIP block - bridge network interface + * + * Copyright (C) 2025 DENX Software Engineering GmbH + * Lukasz Majewski + */ + +#include +#include +#include + +#include "mtipl2sw.h" + +static int mtip_ndev_port_link(struct net_device *ndev, + struct net_device *br_ndev, + struct netlink_ext_ack *extack) +{ + struct mtip_ndev_priv *priv = netdev_priv(ndev), *other_priv; + struct switch_enet_private *fep = priv->fep; + struct net_device *other_ndev; + + /* Check if 
one port of the MTIP switch is already bridged */ + if (fep->br_members && !fep->br_offload) { + /* Get the second bridge ndev */ + other_ndev = fep->ndev[fep->br_members - 1]; + other_priv = netdev_priv(other_ndev); + if (other_priv->master_dev != br_ndev) { + NL_SET_ERR_MSG_MOD(extack, + "L2 offloading only possible for the same bridge!"); + return -EOPNOTSUPP; + } + + fep->br_offload = 1; + mtip_switch_dis_port_separation(fep); + mtip_clear_atable(fep); + } + + if (!priv->master_dev) + priv->master_dev = br_ndev; + + fep->br_members |= BIT(priv->portnum - 1); + + dev_dbg(&ndev->dev, + "%s: ndev: %s br: %s fep: %p members: 0x%x offload: %d\n", + __func__, ndev->name, br_ndev->name, fep, fep->br_members, + fep->br_offload); + + return NOTIFY_DONE; +} + +static void mtip_netdevice_port_unlink(struct net_device *ndev) +{ + struct mtip_ndev_priv *priv = netdev_priv(ndev); + struct switch_enet_private *fep = priv->fep; + + dev_dbg(&ndev->dev, "%s: ndev: %s members: 0x%x\n", __func__, + ndev->name, fep->br_members); + + fep->br_members &= ~BIT(priv->portnum - 1); + priv->master_dev = NULL; + + if (fep->br_members && fep->br_offload) { + fep->br_offload = 0; + mtip_switch_en_port_separation(fep); + mtip_clear_atable(fep); + } +} + +/* netdev notifier */ +static int mtip_netdevice_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct netdev_notifier_changeupper_info *info = ptr; + struct netlink_ext_ack *extack; + int ret = NOTIFY_DONE; + + if (!mtip_is_switch_netdev_port(ndev)) + return NOTIFY_DONE; + + extack = netdev_notifier_info_to_extack(&info->info); + + switch (event) { + case NETDEV_CHANGEUPPER: + if (!netif_is_bridge_master(info->upper_dev)) + break; + + if (info->linking) + ret = mtip_ndev_port_link(ndev, info->upper_dev, + extack); + else + mtip_netdevice_port_unlink(ndev); + + break; + default: + return NOTIFY_DONE; + } + + return notifier_from_errno(ret); +} + +static struct notifier_block mtip_netdevice_nb __read_mostly = { + .notifier_call = mtip_netdevice_event, +}; + +int mtip_register_notifiers(struct switch_enet_private *fep) +{ + int ret = register_netdevice_notifier(&mtip_netdevice_nb); + + if (ret) + dev_err(&fep->pdev->dev, "can't register netdevice notifier\n"); + + return ret; +} + +void mtip_unregister_notifiers(struct switch_enet_private *fep) +{ + unregister_netdevice_notifier(&mtip_netdevice_nb); +} diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_mgnt.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_mgnt.c new file mode 100644 index 0000000000000..1e9f6a5c3c954 --- /dev/null +++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw_mgnt.c @@ -0,0 +1,436 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * L2 switch Controller driver for MTIP block - switch MGNT + * + * Copyright (C) 2025 DENX Software Engineering GmbH + * Lukasz Majewski + * + * Based on a previous work by: + * + * Copyright 2010-2012 Freescale Semiconductor, Inc. + * Alison Wang (b18965@freescale.com) + * Jason Jin (Jason.jin@freescale.com) + * + * Copyright (C) 2010-2013 Freescale Semiconductor, Inc. All Rights Reserved.
+ * Shrek Wu (B16972@freescale.com) + */ + +#include +#include +#include + +#include "mtipl2sw.h" + +int mtip_vlan_input_process(struct switch_enet_private *fep, + int port, int mode, unsigned short port_vlanid, + int vlan_verify_en, int vlan_domain_num, + int vlan_domain_port) +{ + /* Only modes from 1 to 4 are valid*/ + if (mode < 0 || mode > 4) { + dev_err(&fep->pdev->dev, + "%s: VLAN input processing mode (%d) not supported\n", + __func__, mode); + return -EINVAL; + } + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported!\n", + __func__, port); + return -EINVAL; + } + + if (vlan_verify_en == 1 && + (vlan_domain_num < 0 || vlan_domain_num > 32)) { + dev_err(&fep->pdev->dev, "%s: Domain out of range\n", __func__); + return -EINVAL; + } + + writel(MCF_ESW_PID_VLANID(port_vlanid), fep->hwp + ESW_PID(port)); + if (port == 0) { + if (vlan_verify_en == 1) + writel(MCF_ESW_VRES_VLANID(port_vlanid) | + MCF_ESW_VRES_P0, + fep->hwp + ESW_VRES(vlan_domain_num)); + + writel(readl(fep->hwp + ESW_VIMEN) | MCF_ESW_VIMEN_EN0, + fep->hwp + ESW_VIMEN); + writel(readl(fep->hwp + ESW_VIMSEL) | MCF_ESW_VIMSEL_IM0(mode), + fep->hwp + ESW_VIMSEL); + } else if (port == 1) { + if (vlan_verify_en == 1) + writel(MCF_ESW_VRES_VLANID(port_vlanid) | + MCF_ESW_VRES_P1, + fep->hwp + ESW_VRES(vlan_domain_num)); + + writel(readl(fep->hwp + ESW_VIMEN) | MCF_ESW_VIMEN_EN1, + fep->hwp + ESW_VIMEN); + writel(readl(fep->hwp + ESW_VIMSEL) | MCF_ESW_VIMSEL_IM1(mode), + fep->hwp + ESW_VIMSEL); + } else if (port == 2) { + if (vlan_verify_en == 1) + writel(MCF_ESW_VRES_VLANID(port_vlanid) | + MCF_ESW_VRES_P2, + fep->hwp + ESW_VRES(vlan_domain_num)); + + writel(readl(fep->hwp + ESW_VIMEN) | MCF_ESW_VIMEN_EN2, + fep->hwp + ESW_VIMEN); + writel(readl(fep->hwp + ESW_VIMSEL) | MCF_ESW_VIMSEL_IM2(mode), + fep->hwp + ESW_VIMSEL); + } + + return 0; +} + +int mtip_vlan_output_process(struct switch_enet_private *fep, int port, + int mode) +{ + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported!\n", + __func__, port); + return -EINVAL; + } + + if (port == 0) { + writel(readl(fep->hwp + ESW_VOMSEL) | MCF_ESW_VOMSEL_OM0(mode), + fep->hwp + ESW_VOMSEL); + } else if (port == 1) { + writel(readl(fep->hwp + ESW_VOMSEL) | MCF_ESW_VOMSEL_OM1(mode), + fep->hwp + ESW_VOMSEL); + } else if (port == 2) { + writel(readl(fep->hwp + ESW_VOMSEL) | MCF_ESW_VOMSEL_OM2(mode), + fep->hwp + ESW_VOMSEL); + } + + return 0; +} + +int mtip_set_vlan_verification(struct switch_enet_private *fep, int port, + int vlan_domain_verify_en, + int vlan_discard_unknown_en) +{ + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported!\n", + __func__, port); + return -EINVAL; + } + + if (vlan_domain_verify_en == 1) { + if (port == 0) + writel(readl(fep->hwp + ESW_VLANV) | MCF_ESW_VLANV_VV0, + fep->hwp + ESW_VLANV); + else if (port == 1) + writel(readl(fep->hwp + ESW_VLANV) | MCF_ESW_VLANV_VV1, + fep->hwp + ESW_VLANV); + else if (port == 2) + writel(readl(fep->hwp + ESW_VLANV) | MCF_ESW_VLANV_VV2, + fep->hwp + ESW_VLANV); + } else if (vlan_domain_verify_en == 0) { + if (port == 0) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_VV0, + fep->hwp + ESW_VLANV); + else if (port == 1) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_VV1, + fep->hwp + ESW_VLANV); + else if (port == 2) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_VV2, + fep->hwp + ESW_VLANV); + } + + if (vlan_discard_unknown_en == 1) { + if (port == 0) + writel(readl(fep->hwp + ESW_VLANV) | 
MCF_ESW_VLANV_DU0, + fep->hwp + ESW_VLANV); + else if (port == 1) + writel(readl(fep->hwp + ESW_VLANV) | MCF_ESW_VLANV_DU1, + fep->hwp + ESW_VLANV); + else if (port == 2) + writel(readl(fep->hwp + ESW_VLANV) | MCF_ESW_VLANV_DU2, + fep->hwp + ESW_VLANV); + } else if (vlan_discard_unknown_en == 0) { + if (port == 0) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_DU0, + fep->hwp + ESW_VLANV); + else if (port == 1) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_DU1, + fep->hwp + ESW_VLANV); + else if (port == 2) + writel(readl(fep->hwp + ESW_VLANV) & ~MCF_ESW_VLANV_DU2, + fep->hwp + ESW_VLANV); + } + + dev_dbg(&fep->pdev->dev, "%s: ESW_VLANV %#x\n", __func__, + readl(fep->hwp + ESW_VLANV)); + + return 0; +} + +int mtip_port_multicast_config(struct switch_enet_private *fep, + int port, bool enable) +{ + u32 reg = 0; + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported\n", + __func__, port); + return -EINVAL; + } + + reg = readl(fep->hwp + ESW_DMCR); + if (enable) { + if (port == 0) + reg |= MCF_ESW_DMCR_P0; + else if (port == 1) + reg |= MCF_ESW_DMCR_P1; + else if (port == 2) + reg |= MCF_ESW_DMCR_P2; + } else { + if (port == 0) + reg &= ~MCF_ESW_DMCR_P0; + else if (port == 1) + reg &= ~MCF_ESW_DMCR_P1; + else if (port == 2) + reg &= ~MCF_ESW_DMCR_P2; + } + + writel(reg, fep->hwp + ESW_DMCR); + return 0; +} + +/* enable or disable port n tx or rx + * tx_en 0 disable port n tx + * tx_en 1 enable port n tx + * rx_en 0 disable port n rx + * rx_en 1 enable port n rx + */ +int mtip_port_enable_config(struct switch_enet_private *fep, int port, + bool tx_en, bool rx_en) +{ + u32 reg = 0; + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported\n", + __func__, port); + return -EINVAL; + } + + reg = readl(fep->hwp + ESW_PER); + if (tx_en) { + if (port == 0) + reg |= MCF_ESW_PER_TE0; + else if (port == 1) + reg |= MCF_ESW_PER_TE1; + else if (port == 2) + reg |= MCF_ESW_PER_TE2; + } else { + if (port == 0) + reg &= (~MCF_ESW_PER_TE0); + else if (port == 1) + reg &= (~MCF_ESW_PER_TE1); + else if (port == 2) + reg &= (~MCF_ESW_PER_TE2); + } + + if (rx_en) { + if (port == 0) + reg |= MCF_ESW_PER_RE0; + else if (port == 1) + reg |= MCF_ESW_PER_RE1; + else if (port == 2) + reg |= MCF_ESW_PER_RE2; + } else { + if (port == 0) + reg &= (~MCF_ESW_PER_RE0); + else if (port == 1) + reg &= (~MCF_ESW_PER_RE1); + else if (port == 2) + reg &= (~MCF_ESW_PER_RE2); + } + + writel(reg, fep->hwp + ESW_PER); + return 0; +} + +void mtip_switch_en_port_separation(struct switch_enet_private *fep) +{ + u32 reg; + + mtip_vlan_input_process(fep, 0, 3, 0x10, 1, 0, 0); + mtip_vlan_input_process(fep, 1, 3, 0x11, 1, 1, 0); + mtip_vlan_input_process(fep, 2, 3, 0x12, 1, 2, 0); + + reg = readl(fep->hwp + ESW_VRES(0)); + writel(reg | MCF_ESW_VRES_P1 | MCF_ESW_VRES_P2, + fep->hwp + ESW_VRES(0)); + + reg = readl(fep->hwp + ESW_VRES(1)); + writel(reg | MCF_ESW_VRES_P0, fep->hwp + ESW_VRES(1)); + + reg = readl(fep->hwp + ESW_VRES(2)); + writel(reg | MCF_ESW_VRES_P0, fep->hwp + ESW_VRES(2)); + + dev_dbg(&fep->pdev->dev, "%s: VRES0: 0x%x\n", + __func__, readl(fep->hwp + ESW_VRES(0))); + dev_dbg(&fep->pdev->dev, "%s: VRES1: 0x%x\n", __func__, + readl(fep->hwp + ESW_VRES(1))); + dev_dbg(&fep->pdev->dev, "%s: VRES2: 0x%x\n", __func__, + readl(fep->hwp + ESW_VRES(2))); + + mtip_set_vlan_verification(fep, 0, 1, 0); + mtip_set_vlan_verification(fep, 1, 1, 0); + mtip_set_vlan_verification(fep, 2, 1, 0); + + mtip_vlan_output_process(fep, 0, 2); + 
mtip_vlan_output_process(fep, 1, 2); + mtip_vlan_output_process(fep, 2, 2); +} + +void mtip_switch_dis_port_separation(struct switch_enet_private *fep) +{ + writel(0, fep->hwp + ESW_PID(0)); + writel(0, fep->hwp + ESW_PID(1)); + writel(0, fep->hwp + ESW_PID(2)); + + writel(0, fep->hwp + ESW_VRES(0)); + writel(0, fep->hwp + ESW_VRES(1)); + writel(0, fep->hwp + ESW_VRES(2)); + + writel(0, fep->hwp + ESW_VIMEN); + writel(0, fep->hwp + ESW_VIMSEL); + writel(0, fep->hwp + ESW_VLANV); + writel(0, fep->hwp + ESW_VOMSEL); +} + +int mtip_port_broadcast_config(struct switch_enet_private *fep, + int port, bool enable) +{ + u32 reg = 0; + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported\n", + __func__, port); + return -EINVAL; + } + + reg = readl(fep->hwp + ESW_DBCR); + if (enable) { + if (port == 0) + reg |= MCF_ESW_DBCR_P0; + else if (port == 1) + reg |= MCF_ESW_DBCR_P1; + else if (port == 2) + reg |= MCF_ESW_DBCR_P2; + } else { + if (port == 0) + reg &= ~MCF_ESW_DBCR_P0; + else if (port == 1) + reg &= ~MCF_ESW_DBCR_P1; + else if (port == 2) + reg &= ~MCF_ESW_DBCR_P2; + } + + writel(reg, fep->hwp + ESW_DBCR); + return 0; +} + +/* The frame is forwarded to the forced destination ports. + * It only replaces the MAC lookup function; + * all other filtering (e.g. VLAN verification) acts as normal. + */ +int mtip_forced_forward(struct switch_enet_private *fep, int port, bool enable) +{ + u32 reg = 0; + + if (port & ~GENMASK(1, 0)) { + dev_err(&fep->pdev->dev, + "%s: Forced forward for port(s): 0x%x not supported!\n", + __func__, port); + return -EINVAL; + } + + /* Enable Forced forwarding for port(s) */ + reg |= MCF_ESW_P0FFEN_FD(port & GENMASK(1, 0)); + + if (enable) + reg |= MCF_ESW_P0FFEN_FEN; + else + reg &= ~MCF_ESW_P0FFEN_FEN; + + writel(reg, fep->hwp + ESW_P0FFEN); + return 0; +} + +int mtip_port_learning_config(struct switch_enet_private *fep, int port, + bool disable, bool irq_adj) +{ + u32 reg = 0; + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported\n", + __func__, port); + return -EINVAL; + } + + reg = readl(fep->hwp + ESW_BKLR); + if (disable) { + if (irq_adj) + writel(readl(fep->hwp + ESW_IMR) & ~MCF_ESW_IMR_LRN, + fep->hwp + ESW_IMR); + + if (port == 0) + reg |= MCF_ESW_BKLR_LD0; + else if (port == 1) + reg |= MCF_ESW_BKLR_LD1; + else if (port == 2) + reg |= MCF_ESW_BKLR_LD2; + } else { + if (irq_adj) + writel(readl(fep->hwp + ESW_IMR) | MCF_ESW_IMR_LRN, + fep->hwp + ESW_IMR); + + if (port == 0) + reg &= ~MCF_ESW_BKLR_LD0; + else if (port == 1) + reg &= ~MCF_ESW_BKLR_LD1; + else if (port == 2) + reg &= ~MCF_ESW_BKLR_LD2; + } + + writel(reg, fep->hwp + ESW_BKLR); + dev_dbg(&fep->pdev->dev, "%s ESW_BKLR %#x, ESW_IMR %#x\n", __func__, + readl(fep->hwp + ESW_BKLR), readl(fep->hwp + ESW_IMR)); + + return 0; +} + +int mtip_port_blocking_config(struct switch_enet_private *fep, int port, + bool enable) +{ + u32 reg = 0; + + if (port < 0 || port > 2) { + dev_err(&fep->pdev->dev, "%s: Port (%d) not supported\n", + __func__, port); + return -EINVAL; + } + + reg = readl(fep->hwp + ESW_BKLR); + if (enable) { + if (port == 0) + reg |= MCF_ESW_BKLR_BE0; + else if (port == 1) + reg |= MCF_ESW_BKLR_BE1; + else if (port == 2) + reg |= MCF_ESW_BKLR_BE2; + } else { + if (port == 0) + reg &= ~MCF_ESW_BKLR_BE0; + else if (port == 1) + reg &= ~MCF_ESW_BKLR_BE1; + else if (port == 2) + reg &= ~MCF_ESW_BKLR_BE2; + } + + writel(reg, fep->hwp + ESW_BKLR); + return 0; +} From 0215aa56c4f0077ed601afcb7dcd07b64762e915 Mon Sep 17 00:00:00 2001 From:
Lukasz Majewski Date: Fri, 2 May 2025 09:44:45 +0200 Subject: [PATCH 756/770] ARM: mxs_defconfig: Enable CONFIG_NFS_FSCACHE CONFIG_NETFS_SUPPORT can no longer be enabled by the user directly, so fscache support now has to be pulled in by enabling CONFIG_NFS_FSCACHE. This patch fixes a potential performance regression for NFS on the mxs devices. Signed-off-by: Lukasz Majewski Reviewed-by: Stefan Wahren Suggested-by: Stefan Wahren Signed-off-by: NipaLocal --- arch/arm/configs/mxs_defconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index c76d66135abb1..22f7639f61fe2 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -138,8 +138,6 @@ CONFIG_PWM_MXS=y CONFIG_NVMEM_MXS_OCOTP=y CONFIG_EXT4_FS=y # CONFIG_DNOTIFY is not set -CONFIG_NETFS_SUPPORT=m -CONFIG_FSCACHE=y CONFIG_FSCACHE_STATS=y CONFIG_CACHEFILES=m CONFIG_VFAT_FS=y @@ -155,6 +153,7 @@ CONFIG_NFS_FS=y CONFIG_NFS_V3_ACL=y CONFIG_NFS_V4=y CONFIG_ROOT_NFS=y +CONFIG_NFS_FSCACHE=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_850=y CONFIG_NLS_ISO8859_1=y From 79af5fece22a3a32ba1587d3df65f81a025abb4f Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:46 +0200 Subject: [PATCH 757/770] ARM: mxs_defconfig: Update mxs_defconfig to 6.15-rc1 This file is the updated version of mxs_defconfig for v6.15-rc1 linux-next. Detailed description of removed configuration entries: -CONFIG_MTD_M25P80=y -> it has been replaced by MTD_SPI_NOR (which is enabled) -CONFIG_SMSC_PHY=y -> it is enabled implicitly by USB_NET_SMSC95XX -CONFIG_GPIO_SYSFS=y -> it has been deprecated by moving to EXPERT, and its replacement GPIO_CDEV is enabled by default Signed-off-by: Lukasz Majewski Reviewed-by: Stefan Wahren Suggested-by: Stefan Wahren Signed-off-by: NipaLocal --- arch/arm/configs/mxs_defconfig | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index 22f7639f61fe2..b1a31cb914c83 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -32,9 +32,6 @@ CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y CONFIG_SYN_COOKIES=y -# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -# CONFIG_INET_XFRM_MODE_TUNNEL is not set -# CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set CONFIG_CAN=m @@ -45,7 +42,6 @@ CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y CONFIG_MTD_BLOCK=y CONFIG_MTD_DATAFLASH=y -CONFIG_MTD_M25P80=y CONFIG_MTD_SST25L=y CONFIG_MTD_RAW_NAND=y CONFIG_MTD_NAND_GPMI_NAND=y @@ -60,7 +56,6 @@ CONFIG_ENC28J60=y CONFIG_ICPLUS_PHY=y CONFIG_MICREL_PHY=y CONFIG_REALTEK_PHY=y -CONFIG_SMSC_PHY=y CONFIG_CAN_FLEXCAN=m CONFIG_USB_USBNET=y CONFIG_USB_NET_SMSC95XX=y @@ -77,13 +72,11 @@ CONFIG_SERIAL_AMBA_PL011=y CONFIG_SERIAL_AMBA_PL011_CONSOLE=y CONFIG_SERIAL_MXS_AUART=y # CONFIG_HW_RANDOM is not set -# CONFIG_I2C_COMPAT is not set CONFIG_I2C_CHARDEV=y CONFIG_I2C_MXS=y CONFIG_SPI=y CONFIG_SPI_GPIO=m CONFIG_SPI_MXS=y -CONFIG_GPIO_SYSFS=y # CONFIG_HWMON is not set CONFIG_WATCHDOG=y CONFIG_STMP3XXX_RTC_WATCHDOG=y From a0348b385725ad3fd3fcc35135acc227e7b4b1b6 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Fri, 2 May 2025 09:44:47 +0200 Subject: [PATCH 758/770] ARM: mxs_defconfig: Enable CONFIG_FEC_MTIP_L2SW to support MTIP L2 switch This patch enables support for the More Than IP (MTIP) L2 switch available on some imx28[7] devices. Moreover, it also enables CONFIG_NET_SWITCHDEV and CONFIG_BRIDGE, which this driver requires for correct operation.
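For illustration, a minimal bridge setup that exercises the driver's offload path (interface and bridge names here are examples, not fixed by the driver):

  ip link add name br0 type bridge
  ip link set dev lan0 master br0
  ip link set dev lan1 master br0

Enslaving the first switch port only records the bridge membership in fep->br_members; the second NETDEV_CHANGEUPPER event finds an existing member on the same bridge, sets br_offload, disables port separation and clears the learned-address table, enabling hardware forwarding between the two ports.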
Signed-off-by: Lukasz Majewski Reviewed-by: Stefan Wahren Signed-off-by: NipaLocal --- arch/arm/configs/mxs_defconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig index b1a31cb914c83..ef45562222744 100644 --- a/arch/arm/configs/mxs_defconfig +++ b/arch/arm/configs/mxs_defconfig @@ -34,6 +34,8 @@ CONFIG_IP_PNP_DHCP=y CONFIG_SYN_COOKIES=y # CONFIG_INET_DIAG is not set # CONFIG_IPV6 is not set +CONFIG_BRIDGE=y +CONFIG_NET_SWITCHDEV=y CONFIG_CAN=m # CONFIG_WIRELESS is not set CONFIG_DEVTMPFS=y @@ -52,6 +54,7 @@ CONFIG_EEPROM_AT24=y CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y CONFIG_NETDEVICES=y +CONFIG_FEC_MTIP_L2SW=y CONFIG_ENC28J60=y CONFIG_ICPLUS_PHY=y CONFIG_MICREL_PHY=y From 0c0134523cd838628bc76516846de35b4ec84c89 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 May 2025 10:52:39 +0200 Subject: [PATCH 759/770] net: ethtool: Introduce per-PHY DUMP operations ethnl commands that target a phy_device need a DUMP implementation that will fill the reply for every PHY behind a netdev. We therefore need to iterate over dev->link_topo to list them. When multiple PHYs are behind the same netdev, it's also useful to perform DUMP with a filter on a given netdev, to get the capabilities of every PHY. Implement dedicated genl ->start(), ->dumpit() and ->done() operations for PHY-targeting commands, allowing filtered dumps and using a dump context that keeps track of the PHY iteration for multi-message dumps. PSE-PD and PLCA are converted to this new set of ops along the way. Signed-off-by: Maxime Chevallier Signed-off-by: NipaLocal --- net/ethtool/netlink.c | 203 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 194 insertions(+), 9 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 977beeaaa2f99..a58570c68db27 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -357,6 +357,18 @@ struct ethnl_dump_ctx { unsigned long pos_ifindex; }; +/** + * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks + * @ethnl_ctx: generic ethnl context + * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev + * @pos_phyindex: iterator position for multi-msg DUMP + */ +struct ethnl_perphy_dump_ctx { + struct ethnl_dump_ctx ethnl_ctx; + unsigned int ifindex; + unsigned long pos_phyindex; +}; + static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops, @@ -407,6 +419,12 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) return (struct ethnl_dump_ctx *)cb->ctx; } +static struct ethnl_perphy_dump_ctx * +ethnl_perphy_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_perphy_dump_ctx *)cb->ctx; +} + /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into @@ -662,6 +680,173 @@ static int ethnl_default_start(struct netlink_callback *cb) return ret; } +/* per-PHY ->start() handler for GET requests */ +static int ethnl_perphy_start(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx; + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*phy_ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops =
ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + /* Unlike per-dev dump, don't ignore dev. The dump handler + * will notice it and dump PHYs from given dev. We only keep track of + * the dev's ifindex, .dumpit() will grab and release the netdev itself. + */ + ret = ethnl_default_parse(req_info, &info->info, ops, false); + if (req_info->dev) { + phy_ctx->ifindex = req_info->dev->ifindex; + netdev_put(req_info->dev, &req_info->dev_tracker); + req_info->dev = NULL; + } + if (ret < 0) + goto free_reply_data; + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +static int ethnl_perphy_dump_one_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net_device *dev = ethnl_ctx->req_info->dev; + struct phy_device_node *pdn; + int ret; + + if (!dev->link_topo) + return 0; + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + ethnl_ctx->req_info->phy_index = ctx->pos_phyindex; + + /* We can re-use the original dump_one as ->prepare_data in + * commands use ethnl_req_get_phydev(), which gets the PHY from + * the req_info->phy_index + */ + ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info); + if (ret) + return ret; + } + + ctx->pos_phyindex = 0; + + return 0; +} + +static int ethnl_perphy_dump_all_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net *net = sock_net(skb->sk); + netdevice_tracker dev_tracker; + struct net_device *dev; + int ret = 0; + + rcu_read_lock(); + for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) { + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); + rcu_read_unlock(); + + /* per-PHY commands use ethnl_req_get_phydev(), which needs the + * net_device in the req_info + */ + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, info); + + rcu_read_lock(); + netdev_put(dev, &dev_tracker); + ethnl_ctx->req_info->dev = NULL; + + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +/* per-PHY ->dumpit() handler for GET requests. 
*/ +static int ethnl_perphy_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + int ret = 0; + + if (ctx->ifindex) { + netdevice_tracker dev_tracker; + struct net_device *dev; + + dev = netdev_get_by_index(genl_info_net(&info->info), + ctx->ifindex, &dev_tracker, + GFP_KERNEL); + if (!dev) + return -ENODEV; + + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb)); + + if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len)) + ret = skb->len; + + netdev_put(dev, &dev_tracker); + } else { + ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb)); + } + + return ret; +} + +/* per-PHY ->done() handler for GET requests */ +static int ethnl_perphy_done(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + + kfree(ethnl_ctx->reply_data); + kfree(ethnl_ctx->req_info); + + return 0; +} + /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { @@ -1200,9 +1385,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, @@ -1224,9 +1409,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, @@ -1240,9 +1425,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, From fde0d407e4c46ceb410120d9c628c494bfb23457 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 May 2025 10:52:40 +0200 Subject: [PATCH 760/770] net: ethtool: phy: Convert the PHY_GET command to generic phy dump Now that we have an infrastructure in ethnl for perphy DUMPs, we can get rid of the custom ->doit and ->dumpit to deal with PHY listing commands. As most of the code was custom, this basically means re-writing how we deal with PHY listing. 
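As a usage illustration (the YNL CLI path and the interface name below are assumptions for this sketch, not part of this patch), a filtered per-PHY dump can then be issued from userspace with:

  ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/ethtool.yaml \
          --dump phy-get --json '{"header": {"dev-name": "eth0"}}'

Without the header nest the dump walks every netdev; with it, ethnl_perphy_start() records the ifindex and ethnl_perphy_dumpit() only iterates the PHYs in that device's link topology.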
Signed-off-by: Maxime Chevallier Signed-off-by: NipaLocal --- net/ethtool/netlink.c | 9 +- net/ethtool/netlink.h | 4 - net/ethtool/phy.c | 342 ++++++++++++------------------------------ 3 files changed, 101 insertions(+), 254 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index a58570c68db27..c8ed5a6f1e921 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -412,6 +412,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_MM_SET] = ðnl_mm_request_ops, [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, + [ETHTOOL_MSG_PHY_GET] = ðnl_phy_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1456,10 +1457,10 @@ static const struct genl_ops ethtool_genl_ops[] = { }, { .cmd = ETHTOOL_MSG_PHY_GET, - .doit = ethnl_phy_doit, - .start = ethnl_phy_start, - .dumpit = ethnl_phy_dumpit, - .done = ethnl_phy_done, + .doit = ethnl_default_doit, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index ec6ab5443a6f2..91b953924af38 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -499,10 +499,6 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_start(struct netlink_callback *cb); -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); -int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_done(struct netlink_callback *cb); int ethnl_tsinfo_start(struct netlink_callback *cb); int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_tsinfo_done(struct netlink_callback *cb); diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index 1f590e8d75ed6..68372bef4b2fe 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -12,304 +12,154 @@ #include struct phy_req_info { - struct ethnl_req_info base; - struct phy_device_node *pdn; + struct ethnl_req_info base; }; -#define PHY_REQINFO(__req_base) \ - container_of(__req_base, struct phy_req_info, base) +struct phy_reply_data { + struct ethnl_reply_data base; + u32 phyindex; + char *drvname; + char *name; + unsigned int upstream_type; + char *upstream_sfp_name; + unsigned int upstream_index; + char *downstream_sfp_name; +}; + +#define PHY_REPDATA(__reply_base) \ + container_of(__reply_base, struct phy_reply_data, base) const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = { [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; -/* Caller holds rtnl */ -static ssize_t -ethnl_phy_reply_size(const struct ethnl_req_info *req_base, - struct netlink_ext_ack *extack) +static int phy_reply_size(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); size_t size = 0; - ASSERT_RTNL(); - /* ETHTOOL_A_PHY_INDEX */ size += nla_total_size(sizeof(u32)); /* ETHTOOL_A_DRVNAME */ - if (phydev->drv) - size += nla_total_size(strlen(phydev->drv->name) + 1); + if 
(rep_data->drvname) + size += nla_total_size(strlen(rep_data->drvname) + 1); /* ETHTOOL_A_NAME */ - size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1); + size += nla_total_size(strlen(rep_data->name) + 1); /* ETHTOOL_A_PHY_UPSTREAM_TYPE */ size += nla_total_size(sizeof(u32)); - if (phy_on_sfp(phydev)) { - const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus); - - /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ - if (upstream_sfp_name) - size += nla_total_size(strlen(upstream_sfp_name) + 1); + /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ + if (rep_data->upstream_sfp_name) + size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1); - /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + if (rep_data->upstream_index) size += nla_total_size(sizeof(u32)); - } /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */ - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name) - size += nla_total_size(strlen(sfp_name) + 1); - } + if (rep_data->downstream_sfp_name) + size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1); return size; } -static int -ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) +static int phy_prepare_data(const struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + const struct genl_info *info) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; - enum phy_upstream ptype; + struct phy_link_topology *topo = reply_data->dev->link_topo; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); + struct nlattr **tb = info->attrs; + struct phy_device_node *pdn; + struct phy_device *phydev; - ptype = pdn->upstream_type; + /* RTNL is held by the caller */ + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, + info->extack); + if (IS_ERR_OR_NULL(phydev)) + return -EOPNOTSUPP; - if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) || - nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) || - nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype)) - return -EMSGSIZE; + pdn = xa_load(&topo->phys, phydev->phyindex); + if (!pdn) + return -EOPNOTSUPP; - if (phydev->drv && - nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name)) - return -EMSGSIZE; + rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + rep_data->upstream_type = pdn->upstream_type; - if (ptype == PHY_UPSTREAM_PHY) { + if (pdn->upstream_type == PHY_UPSTREAM_PHY) { struct phy_device *upstream = pdn->upstream.phydev; - const char *sfp_upstream_name; - - /* Parent index */ - if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex)) - return -EMSGSIZE; - - if (pdn->parent_sfp_bus) { - sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus); - if (sfp_upstream_name && - nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, - sfp_upstream_name)) - return -EMSGSIZE; - } - } - - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name && - nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, - sfp_name)) - return -EMSGSIZE; + rep_data->upstream_index = upstream->phyindex; } - return 0; -} - -static int ethnl_phy_parse_request(struct ethnl_req_info *req_base, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - struct phy_link_topology *topo = req_base->dev->link_topo; - struct phy_req_info *req_info = 
PHY_REQINFO(req_base); - struct phy_device *phydev; + if (pdn->parent_sfp_bus) + rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), + GFP_KERNEL); - phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PHY_HEADER, - extack); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (!topo) - return 0; - - req_info->pdn = xa_load(&topo->phys, phydev->phyindex); + if (phydev->sfp_bus) + rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), + GFP_KERNEL); return 0; } -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info) +static int phy_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct sk_buff *rskb; - void *reply_payload; - int reply_len; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info.base, - tb[ETHTOOL_A_PHY_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - - rtnl_lock(); - netdev_lock_ops(req_info.base.dev); - - ret = ethnl_phy_parse_request(&req_info.base, tb, info->extack); - if (ret < 0) - goto err_unlock; - - /* No PHY, return early */ - if (!req_info.pdn) - goto err_unlock; - - ret = ethnl_phy_reply_size(&req_info.base, info->extack); - if (ret < 0) - goto err_unlock; - reply_len = ret + ethnl_reply_header_size(); - - rskb = ethnl_reply_init(reply_len, req_info.base.dev, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_A_PHY_HEADER, - info, &reply_payload); - if (!rskb) { - ret = -ENOMEM; - goto err_unlock; - } - - ret = ethnl_phy_fill_reply(&req_info.base, rskb); - if (ret) - goto err_free_msg; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - netdev_unlock_ops(req_info.base.dev); - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - genlmsg_end(rskb, reply_payload); - - return genlmsg_reply(rskb, info); - -err_free_msg: - nlmsg_free(rskb); -err_unlock: - netdev_unlock_ops(req_info.base.dev); - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - return ret; -} - -struct ethnl_phy_dump_ctx { - struct phy_req_info *phy_req_info; - unsigned long ifindex; - unsigned long phy_index; -}; - -int ethnl_phy_start(struct netlink_callback *cb) -{ - const struct genl_info *info = genl_info_dump(cb); - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - int ret; - - BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - - ctx->phy_req_info = kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL); - if (!ctx->phy_req_info) - return -ENOMEM; - - ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base, - info->attrs[ETHTOOL_A_PHY_HEADER], - sock_net(cb->skb->sk), cb->extack, - false); - ctx->ifindex = 0; - ctx->phy_index = 0; - - if (ret) - kfree(ctx->phy_req_info); + if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) || + nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) || + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type)) + return -EMSGSIZE; - return ret; -} + if (rep_data->drvname && + nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname)) + return -EMSGSIZE; -int ethnl_phy_done(struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + if (rep_data->upstream_index && + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, + rep_data->upstream_index)) + return -EMSGSIZE; - if (ctx->phy_req_info->base.dev) - ethnl_parse_header_dev_put(&ctx->phy_req_info->base); + if (rep_data->upstream_sfp_name && + nla_put_string(skb, 
ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + rep_data->upstream_sfp_name)) + return -EMSGSIZE; - kfree(ctx->phy_req_info); + if (rep_data->downstream_sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + rep_data->downstream_sfp_name)) + return -EMSGSIZE; return 0; } -static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev, - struct netlink_callback *cb) +static void phy_cleanup_data(struct ethnl_reply_data *reply_data) { - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct phy_req_info *pri = ctx->phy_req_info; - struct phy_device_node *pdn; - int ret = 0; - void *ehdr; - - if (!dev->link_topo) - return 0; - - xa_for_each_start(&dev->link_topo->phys, ctx->phy_index, pdn, ctx->phy_index) { - ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_PHY_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - break; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_PHY_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - pri->pdn = pdn; - ret = ethnl_phy_fill_reply(&pri->base, skb); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - genlmsg_end(skb, ehdr); - } + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - return ret; + kfree(rep_data->drvname); + kfree(rep_data->name); + kfree(rep_data->upstream_sfp_name); + kfree(rep_data->downstream_sfp_name); } -int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct net *net = sock_net(skb->sk); - struct net_device *dev; - int ret = 0; - - rtnl_lock(); - - if (ctx->phy_req_info->base.dev) { - dev = ctx->phy_req_info->base.dev; - netdev_lock_ops(dev); - ret = ethnl_phy_dump_one_dev(skb, dev, cb); - netdev_unlock_ops(dev); - } else { - for_each_netdev_dump(net, dev, ctx->ifindex) { - netdev_lock_ops(dev); - ret = ethnl_phy_dump_one_dev(skb, dev, cb); - netdev_unlock_ops(dev); - if (ret) - break; - - ctx->phy_index = 0; - } - } - rtnl_unlock(); - - return ret; -} +const struct ethnl_request_ops ethnl_phy_request_ops = { + .request_cmd = ETHTOOL_MSG_PHY_GET, + .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHY_HEADER, + .req_info_size = sizeof(struct phy_req_info), + .reply_data_size = sizeof(struct phy_reply_data), + + .prepare_data = phy_prepare_data, + .reply_size = phy_reply_size, + .fill_reply = phy_fill_reply, + .cleanup_data = phy_cleanup_data, +}; From f11f0e43caff27928994fa872c8ad5bfe90cd21a Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 May 2025 10:52:41 +0200 Subject: [PATCH 761/770] net: ethtool: netlink: Use netdev_hold for dumpit() operations Move away from dev_hold and use netdev_hold with a local reftracker when performing a DUMP on each netdev. 
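A minimal sketch of the tracked-reference pattern this patch moves to (a standalone example, not code from this patch): every hold is paired with its own netdevice_tracker, so with CONFIG_NET_DEV_REFCNT_TRACKER enabled a leaked or mismatched reference is attributed to its exact call site:

	netdevice_tracker tracker;
	struct net_device *dev;

	/* take a tracked reference instead of a bare dev_hold() */
	dev = netdev_get_by_index(net, ifindex, &tracker, GFP_KERNEL);
	if (!dev)
		return -ENODEV;
	/* ... sleepable work with dev ... */
	netdev_put(dev, &tracker);	/* release via the same tracker */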
Signed-off-by: Maxime Chevallier Signed-off-by: NipaLocal --- net/ethtool/netlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index c8ed5a6f1e921..9de828df46cd3 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -603,18 +603,19 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); + netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { - dev_hold(dev); + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); - dev_put(dev); + netdev_put(dev, &dev_tracker); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) From 40fe5ab96def519363d780aee7ae9c85b4c6755e Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 2 May 2025 16:12:35 +0530 Subject: [PATCH 762/770] net: ti: icssg-prueth: add TAPRIO offload support The Time-Aware Shaper (TAS) is a key feature of the Enhanced Scheduled Traffic (EST) mechanism defined in IEEE 802.1Q-2018. This patch adds TAS support for the ICSSG driver by interacting with the ICSSG firmware to manage gate control lists, cycle times, and other TAS parameters. The firmware maintains active and shadow lists. The driver updates the operating list using the API `tas_update_oper_list()`, which: - Updates firmware list pointers via `tas_update_fw_list_pointers`. - Writes gate masks, window end times, and clears unused entries in the shadow list. - Updates gate close times and Max SDU values for each queue. - Triggers list changes using `tas_set_trigger_list_change`, which - Computes cycle count (base-time / cycle-time) and extend (base-time % cycle-time) - Writes cycle time, cycle count, and extend values to firmware memory. - A base-time in the past, or a base-time that is not a multiple of cycle-time, is taken care of by the firmware. The driver just writes these variables for the firmware, and the firmware takes care of the scheduling. - If base-time is not a multiple of cycle-time, the value of extend (base-time % cycle-time) is used by the firmware to extend the last cycle. - Sets `config_change` and `config_pending` flags to notify the firmware of the new shadow list and its readiness for activation. - Sends the `ICSSG_EMAC_PORT_TAS_TRIGGER` r30 command to ask the firmware to swap the active and shadow lists. - Waits for the firmware to clear the `config_change` flag before completing the update and returning successfully. This implementation ensures seamless TAS functionality by offloading scheduling complexities to the firmware.
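A worked example of the cycle count/extend computation above (values invented for illustration, and ignoring the IEP wrap-around adjustment the driver applies to cycle-time):

	u64 base_time  = 1000000250;	/* ns, admin base-time */
	u32 cycle_time = 1000000;	/* ns, admin cycle-time */

	u32 change_cycle_count = base_time / cycle_time;	/* = 1000 */
	u32 extend = base_time % cycle_time;			/* = 250 ns */

The firmware switches to the shadow list at cycle 1000 and extends the last cycle before the change by 250 ns, so the new schedule starts exactly at base-time.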
Signed-off-by: Roger Quadros Signed-off-by: Vignesh Raghavendra Reviewed-by: Simon Horman Signed-off-by: MD Danish Anwar Signed-off-by: NipaLocal --- drivers/net/ethernet/ti/Kconfig | 1 + drivers/net/ethernet/ti/Makefile | 2 +- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 7 + drivers/net/ethernet/ti/icssg/icssg_prueth.h | 2 + drivers/net/ethernet/ti/icssg/icssg_qos.c | 310 ++++++++++++++++++ drivers/net/ethernet/ti/icssg/icssg_qos.h | 112 +++++++ .../net/ethernet/ti/icssg/icssg_switch_map.h | 6 + 7 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 drivers/net/ethernet/ti/icssg/icssg_qos.c create mode 100644 drivers/net/ethernet/ti/icssg/icssg_qos.h diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index a07c910c497a9..a69a2220b20ed 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -192,6 +192,7 @@ config TI_ICSSG_PRUETH depends on NET_SWITCHDEV depends on ARCH_K3 && OF && TI_K3_UDMA_GLUE_LAYER depends on PTP_1588_CLOCK_OPTIONAL + depends on NET_SCH_TAPRIO || NET_SCH_TAPRIO=n help Support dual Gigabit Ethernet ports over the ICSSG PRU Subsystem. This subsystem is available starting with the AM65 platform. diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile index cbcf448069240..d0ff793a86394 100644 --- a/drivers/net/ethernet/ti/Makefile +++ b/drivers/net/ethernet/ti/Makefile @@ -32,7 +32,7 @@ ti-am65-cpsw-nuss-$(CONFIG_TI_K3_AM65_CPSW_SWITCHDEV) += am65-cpsw-switchdev.o obj-$(CONFIG_TI_K3_AM65_CPTS) += am65-cpts.o obj-$(CONFIG_TI_ICSSG_PRUETH) += icssg-prueth.o icssg.o -icssg-prueth-y := icssg/icssg_prueth.o icssg/icssg_switchdev.o +icssg-prueth-y := icssg/icssg_prueth.o icssg/icssg_switchdev.o icssg/icssg_qos.o obj-$(CONFIG_TI_ICSSG_PRUETH_SR1) += icssg-prueth-sr1.o icssg.o icssg-prueth-sr1-y := icssg/icssg_prueth_sr1.o diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 443f90fa65575..a6f3e0f797a62 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -453,6 +453,7 @@ static u64 prueth_iep_gettime(void *clockops_data, struct ptp_system_timestamp * ts = ((u64)hi_rollover_count) << 23 | iepcount_hi; ts = ts * (u64)IEP_DEFAULT_CYCLE_TIME_NS + iepcount_lo; + ts += readl(prueth->shram.va + TIMESYNC_CYCLE_EXTN_TIME); return ts; } @@ -491,6 +492,9 @@ static void prueth_iep_settime(void *clockops_data, u64 ns) usleep_range(500, 1000); } + /* Clear the Cycle extension adjustments */ + writel(0, emac->dram.va + TIMESYNC_CYCLE_EXTN_TIME); + dev_err(emac->prueth->dev, "settime timeout\n"); } @@ -1160,6 +1164,7 @@ static const struct net_device_ops emac_netdev_ops = { .ndo_vlan_rx_kill_vid = emac_ndo_vlan_rx_del_vid, .ndo_bpf = emac_ndo_bpf, .ndo_xdp_xmit = emac_xdp_xmit, + .ndo_setup_tc = icssg_qos_ndo_setup_tc, }; static int prueth_netdev_init(struct prueth *prueth, @@ -1297,6 +1302,8 @@ static int prueth_netdev_init(struct prueth *prueth, HRTIMER_MODE_REL_PINNED); prueth->emac[mac] = emac; + icssg_qos_tas_init(ndev); + return 0; free: diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 23c465f1ce7ff..b88d6ea527faa 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -41,6 +41,7 @@ #include "icssg_config.h" #include "icss_iep.h" #include "icssg_switch_map.h" +#include "icssg_qos.h" #define PRUETH_MAX_MTU (2000 - ETH_HLEN - ETH_FCS_LEN) #define 
PRUETH_MIN_PKT_SIZE (VLAN_ETH_ZLEN) @@ -240,6 +241,7 @@ struct prueth_emac { struct netdev_hw_addr_list vlan_mcast_list[MAX_VLAN_ID]; struct bpf_prog *xdp_prog; struct xdp_attachment_info xdpi; + struct prueth_qos qos; }; /* The buf includes headroom compatible with both skb and xdpf */ diff --git a/drivers/net/ethernet/ti/icssg/icssg_qos.c b/drivers/net/ethernet/ti/icssg/icssg_qos.c new file mode 100644 index 0000000000000..b42c896675d3e --- /dev/null +++ b/drivers/net/ethernet/ti/icssg/icssg_qos.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Texas Instruments ICSSG PRUETH QoS submodule + * Copyright (C) 2023 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#include +#include "icssg_prueth.h" +#include "icssg_switch_map.h" + +static void tas_update_fw_list_pointers(struct prueth_emac *emac) +{ + struct tas_config *tas = &emac->qos.tas.config; + + if ((readb(tas->active_list)) == TAS_LIST0) { + tas->fw_active_list = emac->dram.va + TAS_GATE_MASK_LIST0; + tas->fw_shadow_list = emac->dram.va + TAS_GATE_MASK_LIST1; + } else { + tas->fw_active_list = emac->dram.va + TAS_GATE_MASK_LIST1; + tas->fw_shadow_list = emac->dram.va + TAS_GATE_MASK_LIST0; + } +} + +static void tas_update_maxsdu_table(struct prueth_emac *emac) +{ + struct tas_config *tas = &emac->qos.tas.config; + u16 __iomem *max_sdu_tbl_ptr; + u8 gate_idx; + + max_sdu_tbl_ptr = emac->dram.va + TAS_QUEUE_MAX_SDU_LIST; + + /* In the tc-taprio UAPI, a max-sdu value of 0 is special and means "no + * maxSDU limit for this TX queue". The firmware doesn't treat max-sdu + * value of 0 specially. As a result the driver needs to change the + * max-sdu value of 0 to PRUETH_MAX_MTU so that the firmware can treat + * it accordingly. + */ + for (gate_idx = 0; gate_idx < TAS_MAX_NUM_QUEUES; gate_idx++) { + if (!tas->max_sdu_table.max_sdu[gate_idx]) + tas->max_sdu_table.max_sdu[gate_idx] = PRUETH_MAX_MTU; + writew(tas->max_sdu_table.max_sdu[gate_idx], &max_sdu_tbl_ptr[gate_idx]); + } +} + +static void tas_reset(struct prueth_emac *emac) +{ + struct tas_config *tas = &emac->qos.tas.config; + int i; + + for (i = 0; i < TAS_MAX_NUM_QUEUES; i++) + tas->max_sdu_table.max_sdu[i] = PRUETH_MAX_MTU; + + tas_update_maxsdu_table(emac); + + memset_io(tas->fw_active_list, 0, sizeof(*tas->fw_active_list)); + memset_io(tas->fw_shadow_list, 0, sizeof(*tas->fw_shadow_list)); +} + +static int tas_set_state(struct prueth_emac *emac, enum tas_state state) +{ + struct tas_config *tas = &emac->qos.tas.config; + int ret; + + if (tas->state == state) + return 0; + + switch (state) { + case TAS_STATE_RESET: + tas_reset(emac); + ret = icssg_set_port_state(emac, ICSSG_EMAC_PORT_TAS_RESET); + break; + case TAS_STATE_ENABLE: + ret = icssg_set_port_state(emac, ICSSG_EMAC_PORT_TAS_ENABLE); + break; + case TAS_STATE_DISABLE: + ret = icssg_set_port_state(emac, ICSSG_EMAC_PORT_TAS_DISABLE); + break; + } + + if (!ret) + tas->state = state; + + return ret; +} + +static int tas_set_trigger_list_change(struct prueth_emac *emac) +{ + struct tc_taprio_qopt_offload *admin_list = emac->qos.tas.taprio_admin; + struct tas_config *tas = &emac->qos.tas.config; + u32 change_cycle_count; + u32 cycle_time; + u64 base_time; + u32 extend; + + /* IEP clock has a hardware errata due to which it wraps around exactly + * once every taprio cycle. 
To compensate for that, adjust cycle time + * by the wrap around time which is stored in emac->iep->def_inc + */ + cycle_time = admin_list->cycle_time - emac->iep->def_inc; + base_time = admin_list->base_time; + + change_cycle_count = base_time / cycle_time; + extend = base_time % cycle_time; + + writel(cycle_time, emac->dram.va + TAS_ADMIN_CYCLE_TIME); + writel(change_cycle_count, emac->dram.va + TAS_CONFIG_CHANGE_CYCLE_COUNT); + writeb(admin_list->num_entries, emac->dram.va + TAS_ADMIN_LIST_LENGTH); + writel(extend, emac->dram.va + TAS_CONFIG_CYCLE_EXTEND); + + /* config_change cleared by f/w to ack reception of new shadow list */ + writeb(1, &tas->config_list->config_change); + /* config_pending cleared by f/w when new shadow list is copied to active list */ + writeb(1, &tas->config_list->config_pending); + + return icssg_set_port_state(emac, ICSSG_EMAC_PORT_TAS_TRIGGER); +} + +static int tas_update_oper_list(struct prueth_emac *emac) +{ + struct tc_taprio_qopt_offload *admin_list = emac->qos.tas.taprio_admin; + struct tas_config *tas = &emac->qos.tas.config; + u32 tas_acc_gate_close_time = 0; + u8 idx, gate_idx, val; + int ret; + + tas_update_fw_list_pointers(emac); + + for (idx = 0; idx < admin_list->num_entries; idx++) { + writeb(admin_list->entries[idx].gate_mask, + &tas->fw_shadow_list->gate_mask_list[idx]); + tas_acc_gate_close_time += admin_list->entries[idx].interval; + + /* extend last entry till end of cycle time */ + if (idx == admin_list->num_entries - 1) + writel(admin_list->cycle_time, + &tas->fw_shadow_list->win_end_time_list[idx]); + else + writel(tas_acc_gate_close_time, + &tas->fw_shadow_list->win_end_time_list[idx]); + } + + /* clear remaining entries */ + for (idx = admin_list->num_entries; idx < TAS_MAX_CMD_LISTS; idx++) { + writeb(0, &tas->fw_shadow_list->gate_mask_list[idx]); + writel(0, &tas->fw_shadow_list->win_end_time_list[idx]); + } + + /* update the Array of gate close time for each queue in each window */ + for (idx = 0 ; idx < admin_list->num_entries; idx++) { + /* On Linux, only PRUETH_MAX_TX_QUEUES are supported per port */ + for (gate_idx = 0; gate_idx < PRUETH_MAX_TX_QUEUES; gate_idx++) { + u8 gate_mask_list_idx = readb(&tas->fw_shadow_list->gate_mask_list[idx]); + u32 gate_close_time = 0; + + if (gate_mask_list_idx & BIT(gate_idx)) + gate_close_time = readl(&tas->fw_shadow_list->win_end_time_list[idx]); + + writel(gate_close_time, + &tas->fw_shadow_list->gate_close_time_list[idx][gate_idx]); + } + } + + /* Update the maxsdu table for firmware */ + tas_update_maxsdu_table(emac); + + /* tell f/w to swap active & shadow list */ + ret = tas_set_trigger_list_change(emac); + if (ret) { + netdev_err(emac->ndev, "failed to swap f/w config list: %d\n", ret); + return ret; + } + + /* Wait for completion */ + ret = readb_poll_timeout(&tas->config_list->config_change, val, !val, + USEC_PER_MSEC, 10 * USEC_PER_MSEC); + if (ret) { + netdev_err(emac->ndev, "TAS list change completion time out\n"); + return ret; + } + + return 0; +} + +static int emac_taprio_replace(struct net_device *ndev, + struct tc_taprio_qopt_offload *taprio) +{ + struct prueth_emac *emac = netdev_priv(ndev); + int ret; + + if (taprio->cycle_time_extension) { + NL_SET_ERR_MSG_MOD(taprio->extack, "Cycle time extension not supported"); + return -EOPNOTSUPP; + } + + if (taprio->cycle_time > TAS_MAX_CYCLE_TIME) { + NL_SET_ERR_MSG_FMT_MOD(taprio->extack, "cycle_time %llu is more than max supported cycle_time", + taprio->cycle_time); + return -EINVAL; + } + + if (taprio->cycle_time < 
TAS_MIN_CYCLE_TIME) { + NL_SET_ERR_MSG_FMT_MOD(taprio->extack, "cycle_time %llu is less than min supported cycle_time %d", + taprio->cycle_time, TAS_MIN_CYCLE_TIME); + return -EINVAL; + } + + if (taprio->num_entries > TAS_MAX_CMD_LISTS) { + NL_SET_ERR_MSG_FMT_MOD(taprio->extack, "num_entries %lu is more than max supported entries %d", + taprio->num_entries, TAS_MAX_CMD_LISTS); + return -EINVAL; + } + + if (emac->qos.tas.taprio_admin) + taprio_offload_free(emac->qos.tas.taprio_admin); + + emac->qos.tas.taprio_admin = taprio_offload_get(taprio); + ret = tas_update_oper_list(emac); + if (ret) + goto clear_taprio; + + ret = tas_set_state(emac, TAS_STATE_ENABLE); + if (ret) + goto clear_taprio; + + return 0; + +clear_taprio: + emac->qos.tas.taprio_admin = NULL; + taprio_offload_free(taprio); + + return ret; +} + +static int emac_taprio_destroy(struct net_device *ndev, + struct tc_taprio_qopt_offload *taprio) +{ + struct prueth_emac *emac = netdev_priv(ndev); + int ret; + + ret = tas_set_state(emac, TAS_STATE_DISABLE); + if (ret) + return ret; + + return tas_set_state(emac, TAS_STATE_RESET); +} + +static int emac_setup_taprio(struct net_device *ndev, void *type_data) +{ + struct tc_taprio_qopt_offload *taprio = type_data; + int ret; + + switch (taprio->cmd) { + case TAPRIO_CMD_REPLACE: + ret = emac_taprio_replace(ndev, taprio); + break; + case TAPRIO_CMD_DESTROY: + ret = emac_taprio_destroy(ndev, taprio); + break; + default: + ret = -EOPNOTSUPP; + } + + return ret; +} + +static int emac_tc_query_caps(struct net_device *ndev, void *type_data) +{ + struct tc_query_caps_base *base = type_data; + + switch (base->type) { + case TC_SETUP_QDISC_TAPRIO: { + struct tc_taprio_caps *caps = base->caps; + + caps->gate_mask_per_txq = true; + + return 0; + } + default: + return -EOPNOTSUPP; + } +} + +int icssg_qos_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type, + void *type_data) +{ + switch (type) { + case TC_SETUP_QDISC_TAPRIO: + return emac_setup_taprio(ndev, type_data); + case TC_QUERY_CAPS: + return emac_tc_query_caps(ndev, type_data); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(icssg_qos_ndo_setup_tc); + +void icssg_qos_tas_init(struct net_device *ndev) +{ + struct prueth_emac *emac = netdev_priv(ndev); + struct tas_config *tas; + + tas = &emac->qos.tas.config; + + tas->config_list = emac->dram.va + TAS_CONFIG_CHANGE_TIME; + tas->active_list = emac->dram.va + TAS_ACTIVE_LIST_INDEX; + + tas_update_fw_list_pointers(emac); + + tas_set_state(emac, TAS_STATE_RESET); +} +EXPORT_SYMBOL_GPL(icssg_qos_tas_init); diff --git a/drivers/net/ethernet/ti/icssg/icssg_qos.h b/drivers/net/ethernet/ti/icssg/icssg_qos.h new file mode 100644 index 0000000000000..60cd50c661b69 --- /dev/null +++ b/drivers/net/ethernet/ti/icssg/icssg_qos.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2023 Texas Instruments Incorporated - http://www.ti.com/ + */ + +#ifndef __NET_TI_ICSSG_QOS_H +#define __NET_TI_ICSSG_QOS_H + +#include +#include +#include + +/* Maximum number of gate command entries in each list. 
+#define TAS_MAX_CMD_LISTS	(16)
+
+/* Maximum number of transmit queues supported by implementation */
+#define TAS_MAX_NUM_QUEUES	(8)
+
+/* Minimum cycle time supported by implementation (in ns) */
+#define TAS_MIN_CYCLE_TIME	(1000000)
+
+/* Maximum cycle time supported by implementation (in ns) */
+#define TAS_MAX_CYCLE_TIME	(4000000000)
+
+/* Minimum TAS window duration supported by implementation (in ns) */
+#define TAS_MIN_WINDOW_DURATION	(10000)
+
+/**
+ * enum tas_list_num - TAS list number
+ * @TAS_LIST0: TAS list number is 0
+ * @TAS_LIST1: TAS list number is 1
+ */
+enum tas_list_num {
+	TAS_LIST0 = 0,
+	TAS_LIST1 = 1
+};
+
+/**
+ * enum tas_state - State of TAS in firmware
+ * @TAS_STATE_DISABLE: TAS state machine is disabled.
+ * @TAS_STATE_ENABLE: TAS state machine is enabled.
+ * @TAS_STATE_RESET: TAS state machine is reset.
+ */
+enum tas_state {
+	TAS_STATE_DISABLE = 0,
+	TAS_STATE_ENABLE = 1,
+	TAS_STATE_RESET = 2,
+};
+
+/**
+ * struct tas_config_list - Config state machine variables
+ * @config_change_time: New list is copied at this time
+ * @config_change_error_counter: Incremented if admin->BaseTime < current time
+ *				 and TAS_enabled is true
+ * @config_pending: True if list update is pending
+ * @config_change: Set to true when the application triggers updating of the
+ *		   admin list to the active list, cleared when configChangeTime
+ *		   is updated
+ */
+struct tas_config_list {
+	u64 config_change_time;
+	u32 config_change_error_counter;
+	u8 config_pending;
+	u8 config_change;
+} __packed;
+
+/* Max SDU table. See IEEE Std 802.1Q-2018 12.29.1.1 */
+struct tas_max_sdu_table {
+	u16 max_sdu[TAS_MAX_NUM_QUEUES];
+};
+
+/**
+ * struct tas_firmware_list - TAS List Structure based on firmware memory map
+ * @gate_mask_list: Window gate mask list
+ * @win_end_time_list: Window end time list
+ * @gate_close_time_list: Array of gate close times for each queue in each window
+ */
+struct tas_firmware_list {
+	u8 gate_mask_list[TAS_MAX_CMD_LISTS];
+	u32 win_end_time_list[TAS_MAX_CMD_LISTS];
+	u32 gate_close_time_list[TAS_MAX_CMD_LISTS][TAS_MAX_NUM_QUEUES];
+} __packed;
+
+/**
+ * struct tas_config - Main Time Aware Shaper Handle
+ * @state: TAS state
+ * @max_sdu_table: Max SDU table
+ * @config_list: Config change variables
+ * @active_list: Currently active list
+ * @fw_active_list: Active List pointer, used by firmware
+ * @fw_shadow_list: Shadow List pointer, used by driver
+ */
+struct tas_config {
+	enum tas_state state;
+	struct tas_max_sdu_table max_sdu_table;
+	struct tas_config_list __iomem *config_list;
+	u8 __iomem *active_list;
+	struct tas_firmware_list __iomem *fw_active_list;
+	struct tas_firmware_list __iomem *fw_shadow_list;
+};
+
+struct prueth_qos_tas {
+	struct tc_taprio_qopt_offload *taprio_admin;
+	struct tas_config config;
+};
+
+struct prueth_qos {
+	struct prueth_qos_tas tas;
+};
+
+void icssg_qos_tas_init(struct net_device *ndev);
+int icssg_qos_ndo_setup_tc(struct net_device *ndev, enum tc_setup_type type,
+			   void *type_data);
+#endif /* __NET_TI_ICSSG_QOS_H */
diff --git a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h
index 490a9cc06fb05..e659a66bb7d7f 100644
--- a/drivers/net/ethernet/ti/icssg/icssg_switch_map.h
+++ b/drivers/net/ethernet/ti/icssg/icssg_switch_map.h
@@ -46,6 +46,9 @@
 /* Same as P2_PORT_DF_VLAN_OFFSET */
 #define EMAC_ICSSG_SWITCH_PORT2_DEFAULT_VLAN_OFFSET	P2_PORT_DF_VLAN_OFFSET
 
+/* Time adjustment to be done due to Cycle extension */
+#define TIMESYNC_CYCLE_EXTN_TIME	0x0028
+
 /* VLAN-FID Table offset. 4096 VIDs. 2B per VID = 8KB = 0x2000 */
 #define VLAN_STATIC_REG_TABLE_OFFSET	0x0100
 
@@ -177,6 +180,9 @@
 /* Stores the table used for priority mapping. 1B per PCP/Queue */
 #define PORT_Q_PRIORITY_MAPPING_OFFSET	0x003C
 
+/* Memory to store time value to be used for Cycle extension */
+#define TAS_CONFIG_CYCLE_EXTEND	0x00A4
+
 /* Used to notify the FW of the current link speed */
 #define PORT_LINK_SPEED_OFFSET	0x00A8
 

From 36129aeb2d615082682d143dfe8cd3ed1f5020b6 Mon Sep 17 00:00:00 2001
From: fedora Cloud User
Date: Thu, 1 Feb 2024 06:34:18 -0800
Subject: [PATCH 763/770] forwarding: set timeout to 3 hours

tc_actions.sh keeps hanging the forwarding tests.

sdf@: tdc & tdc-dbg started intermittently failing around Sep 25th

Signed-off-by: NipaLocal
---
 tools/testing/selftests/net/forwarding/settings | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/settings b/tools/testing/selftests/net/forwarding/settings
index e7b9417537fbc..ff3de4936a009 100644
--- a/tools/testing/selftests/net/forwarding/settings
+++ b/tools/testing/selftests/net/forwarding/settings
@@ -1 +1,2 @@
-timeout=0
+timeout=10800
+profile=1

From 567089f092b26ced8ec4d3ebfcf3ab4fc7f21423 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski
Date: Sat, 17 Aug 2024 10:53:12 -0700
Subject: [PATCH 764/770] profile patch

Signed-off-by: Jakub Kicinski
Signed-off-by: NipaLocal
---
 tools/testing/selftests/kselftest/prefix.pl | 8 ++++++++
 tools/testing/selftests/kselftest/runner.sh | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/kselftest/prefix.pl b/tools/testing/selftests/kselftest/prefix.pl
index 12a7f4ca2684d..b8e4d697e3b3a 100755
--- a/tools/testing/selftests/kselftest/prefix.pl
+++ b/tools/testing/selftests/kselftest/prefix.pl
@@ -4,12 +4,15 @@ # to have unbuffering forced with "stdbuf -i0 -o0 -e0 $cmd".
use strict; use IO::Handle; +use Time::HiRes qw( time ); binmode STDIN; binmode STDOUT; STDOUT->autoflush(1); +my $start_time = time(); +my $prev_time = $start_time; my $needed = 1; while (1) { my $char; @@ -17,6 +20,11 @@ exit 0 if ($bytes == 0); if ($needed) { print "# "; + if ($ENV{kselftest_profile}) { + my $now = time(); + printf("%.2f [+%.2f] ", $now - $start_time, $now - $prev_time); + $prev_time = $now; + } $needed = 0; } print $char; diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 2c3c58e65a419..e65bffb9981f8 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -89,7 +89,7 @@ run_one() fi field=$(echo "$line" | cut -d= -f1) value=$(echo "$line" | cut -d= -f2-) - eval "kselftest_$field"="$value" + eval "export kselftest_$field"="$value" done < "$settings" fi From 8646a9c038baad0d94f45035ac08baa822fad086 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 29 Aug 2024 19:57:39 -0700 Subject: [PATCH 765/770] tc_action dbg Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- .../selftests/net/forwarding/tc_actions.sh | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh index ea89e558672db..5823d94b19e9b 100755 --- a/tools/testing/selftests/net/forwarding/tc_actions.sh +++ b/tools/testing/selftests/net/forwarding/tc_actions.sh @@ -223,19 +223,32 @@ mirred_egress_to_ingress_tcp_test() ip_proto icmp \ action drop - ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & - local rpid=$! - ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 - wait -n $rpid - cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2 - check_err $? "server output check failed" + echo P2 $MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \ -t icmp "ping,id=42,seq=5" -q + echo P2.1 tc_check_packets "dev $h1 egress" 101 10 check_err $? "didn't mirred redirect ICMP" + echo P2.2 tc_check_packets "dev $h1 ingress" 102 10 check_err $? "didn't drop mirred ICMP" + echo P2.3 + + echo P1 + + ip vrf exec v$h1 ncat --recv-only -w10 -l -p 12345 -o $mirred_e2i_tf2 & + local rpid=$! + sleep 0.2 + echo P1.1 + ip vrf exec v$h1 ncat -w1 --send-only 192.0.2.2 12345 <$mirred_e2i_tf1 + echo P1.2 + sleep 0.2 + wait -n $rpid + cmp -s $mirred_e2i_tf1 $mirred_e2i_tf2 + check_err $? 
"server output check failed" + + echo P3 tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower @@ -359,4 +372,5 @@ else tests_run fi +dmesg exit $EXIT_STATUS From 542661ca758a68c5f491359fa50906c9ba534f78 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 3 Nov 2024 16:10:42 -0800 Subject: [PATCH 766/770] selftests: net: enable profiling Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- tools/testing/selftests/net/settings | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings index ed8418e8217a0..a38764182822e 100644 --- a/tools/testing/selftests/net/settings +++ b/tools/testing/selftests/net/settings @@ -1 +1,2 @@ timeout=3600 +profile=1 From 0859459099a80d7c659915c201aee37fbf8f737a Mon Sep 17 00:00:00 2001 From: NipaLocal Date: Tue, 11 Mar 2025 22:49:47 -0700 Subject: [PATCH 767/770] drv: net: add timeout Signed-off-by: NipaLocal --- tools/testing/selftests/drivers/net/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/drivers/net/settings diff --git a/tools/testing/selftests/drivers/net/settings b/tools/testing/selftests/drivers/net/settings new file mode 100644 index 0000000000000..a953c96aa16e1 --- /dev/null +++ b/tools/testing/selftests/drivers/net/settings @@ -0,0 +1 @@ +timeout=180 From ade56cf7efa2f942cc4ac02802a7632b5872866c Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 31 Mar 2025 07:13:26 -0700 Subject: [PATCH 768/770] dbg: tests: bonding: print info on failure Signed-off-by: NipaLocal --- .../net/bonding/bond_macvlan_ipvlan.sh | 42 ++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh index c4711272fe45d..c7c9c8c5f1fd5 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_macvlan_ipvlan.sh @@ -30,9 +30,49 @@ check_connection() local message=${3} RET=0 + ip netns exec ${ns} nstat >/dev/null + ip netns exec ${xvlan2_ns} nstat >/dev/null ip netns exec ${ns} ping ${target} -c 4 -i 0.1 &>/dev/null - check_err $? "ping failed" + R=$? 
+ N=$( ip netns exec ${ns} nstat) + N2=$(ip netns exec ${xvlan2_ns} nstat) + check_err $R "ping failed" log_test "${bond_mode}/${xvlan_type}_${xvlan_mode}: ${message}" + + if [ $R -ne 0 ]; then + echo "===" + echo $O + echo + echo $N + echo + echo $N2 + echo + echo 'local' + ip link + echo + ip addr + echo + ip neigh + echo + ip route + echo 'ns' + ip -s -s -netns ${ns} link + echo + ip -netns ${ns} addr + echo + ip -netns ${ns} neigh + echo + ip -netns ${ns} route + echo 'X ns 2' + ip -s -s -netns ${xvlan2_ns} link + echo + ip -netns ${xvlan2_ns} addr + echo + ip -netns ${xvlan2_ns} neigh + echo + ip -netns ${xvlan2_ns} route + echo "=<=" + fi } xvlan_over_bond() From 095dc3d47b58d3f657b8a88435d37810da2d913c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 31 Mar 2025 07:11:00 -0700 Subject: [PATCH 769/770] config: set preempt Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- kernel/configs/debug.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config index e81327d2cd639..b0673704c4df9 100644 --- a/kernel/configs/debug.config +++ b/kernel/configs/debug.config @@ -24,6 +24,11 @@ CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_FRAME_WARN=2048 CONFIG_SECTION_MISMATCH_WARN_ONLY=y # +# General Setup +# +CONFIG_PREEMPT=y +CONFIG_DEBUG_PREEMPT=y +# # Generic Kernel Debugging Instruments # # CONFIG_UBSAN_ALIGNMENT is not set From b48471422e11dba8d8944d74c690830dd054ae0c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Apr 2025 15:57:58 -0700 Subject: [PATCH 770/770] disable cirrus kunit Signed-off-by: Jakub Kicinski Signed-off-by: NipaLocal --- drivers/firmware/cirrus/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/firmware/cirrus/Kconfig b/drivers/firmware/cirrus/Kconfig index 0a883091259a2..1fdf948535957 100644 --- a/drivers/firmware/cirrus/Kconfig +++ b/drivers/firmware/cirrus/Kconfig @@ -12,7 +12,6 @@ config FW_CS_DSP_KUNIT_TEST_UTILS config FW_CS_DSP_KUNIT_TEST tristate "KUnit tests for Cirrus Logic cs_dsp" if !KUNIT_ALL_TESTS depends on KUNIT && REGMAP - default KUNIT_ALL_TESTS select FW_CS_DSP select FW_CS_DSP_KUNIT_TEST_UTILS help
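
A note on how the profiling pieces in patches 763, 764 and 766 fit together: a
"profile=1" line in a selftest settings file is parsed by runner.sh, which
(after the runner.sh change in patch 764) exports it to child processes as
kselftest_profile=1; prefix.pl then prepends to every line of test output the
seconds elapsed since the test started plus the delta since the previous line.
Below is a minimal, line-buffered Perl sketch of that prefixing behaviour, for
illustration only: the real prefix.pl reads one byte at a time so partial
lines are flushed immediately, and the filename prefix-sketch.pl is made up
for this example.

	#!/usr/bin/env perl
	# Line-buffered sketch of the timestamp prefixing that patch 764
	# adds to tools/testing/selftests/kselftest/prefix.pl.
	use strict;
	use warnings;
	use IO::Handle;
	use Time::HiRes qw(time);

	STDOUT->autoflush(1);

	my $start_time = time();
	my $prev_time  = $start_time;

	while (my $line = <STDIN>) {
		print "# ";
		# Timestamps are emitted only when the runner exported
		# kselftest_profile, i.e. a settings file said "profile=1".
		if ($ENV{kselftest_profile}) {
			my $now = time();
			# seconds since start, plus delta since previous line
			printf("%.2f [+%.2f] ", $now - $start_time,
			       $now - $prev_time);
			$prev_time = $now;
		}
		print $line;
	}

Example run (timing values will vary):

	$ printf 'TAP version 13\nok 1\n' | kselftest_profile=1 perl prefix-sketch.pl
	# 0.00 [+0.00] TAP version 13
	# 0.00 [+0.00] ok 1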