From 7851f340d117e5d75ca2ba1c26055f0a75c8e75c Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 28 Feb 2025 16:17:37 +0000 Subject: [PATCH 01/34] BACKPORT: net: page_pool: rename page_pool_alloc_netmem to *_netmems page_pool_alloc_netmem (without an s) was the mirror of page_pool_alloc_pages (with an s), which was confusing. Rename to page_pool_alloc_netmems so it's the mirror of page_pool_alloc_pages. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- include/net/page_pool/types.h | 2 +- net/core/page_pool.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe39d..8f564fe9eb9a9a 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -242,7 +242,7 @@ struct page_pool { }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp); +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d213536..77de79c1933b25 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -574,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; @@ -590,11 +590,11 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } -EXPORT_SYMBOL(page_pool_alloc_netmem); +EXPORT_SYMBOL(page_pool_alloc_netmems); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { - return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); @@ -956,7 +956,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, } if (!netmem) { - netmem = page_pool_alloc_netmem(pool, gfp); + netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; From 69db6a9d200b4a06bc24973e14ea434d333fbcd5 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 28 Feb 2025 16:22:28 +0000 Subject: [PATCH 02/34] BACKPORT: net: page_pool: create page_pool_alloc_netmem Create page_pool_alloc_netmem to be the mirror of page_pool_alloc. This enables drivers that want currently use page_pool_alloc to transition to netmem by converting the call sites to page_pool_alloc_netmem. 
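As a rough illustration of the conversion (a sketch only; the pool, offset and size variables are placeholders, not taken from a real driver in this series), a page_pool_alloc() call site would change along these lines while keeping the same offset/size semantics:

    /* before: page-backed allocation */
    struct page *page;

    page = page_pool_alloc(pool, &offset, &size, GFP_ATOMIC);
    if (unlikely(!page))
        return -ENOMEM;

    /* after: netmem-backed allocation; 0 is the "null" netmem_ref */
    netmem_ref netmem;

    netmem = page_pool_alloc_netmem(pool, &offset, &size, GFP_ATOMIC);
    if (unlikely(!netmem))
        return -ENOMEM;

The driver then tracks the buffer as a netmem_ref instead of a struct page pointer.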
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- include/net/page_pool/helpers.h | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 60a5347922becc..898897035092c0 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -116,22 +116,22 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, return page_pool_alloc_frag(pool, offset, size, gfp); } -static inline struct page *page_pool_alloc(struct page_pool *pool, - unsigned int *offset, - unsigned int *size, gfp_t gfp) +static inline netmem_ref page_pool_alloc_netmem(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page; + netmem_ref netmem; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; - return page_pool_alloc_pages(pool, gfp); + return page_pool_alloc_netmems(pool, gfp); } - page = page_pool_alloc_frag(pool, offset, *size, gfp); - if (unlikely(!page)) - return NULL; + netmem = page_pool_alloc_frag_netmem(pool, offset, *size, gfp); + if (unlikely(!netmem)) + return 0; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize @@ -142,7 +142,14 @@ static inline struct page *page_pool_alloc(struct page_pool *pool, pool->frag_offset = max_size; } - return page; + return netmem; +} + +static inline struct page *page_pool_alloc(struct page_pool *pool, + unsigned int *offset, + unsigned int *size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmem(pool, offset, size, gfp)); } /** From 3b7e186bce5de1cc2e1e2c0c093631deec1b7a44 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 28 Feb 2025 16:29:10 +0000 Subject: [PATCH 03/34] BACKPORT: page_pool: Set `dma_sync` to false for devmem memory provider Move the `dma_map` and `dma_sync` checks to `page_pool_init` to make them generic. Set dma_sync to false for devmem memory provider because the dma_sync APIs should not be used for dma_buf backed devmem memory provider. Cc: Jason Gunthorpe Signed-off-by: Samiullah Khawaja Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- net/core/devmem.c | 9 ++++----- net/core/page_pool.c | 3 +++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee1135..3ebdeed2bf188d 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -331,11 +331,10 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. 
+ */ + pool->dma_sync = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 77de79c1933b25..528dd4d18eab41 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -287,6 +287,9 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_priv) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + err = mp_dmabuf_devmem_init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, From 893dfbf91c07422fc1085de57393bc69194493d2 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 28 Feb 2025 16:40:48 +0000 Subject: [PATCH 04/34] BACKPORT: page_pool: disable sync for cpu for dmabuf memory provider dmabuf dma-addresses should not be dma_sync'd for CPU/device. Typically its the driver responsibility to dma_sync for CPU, but the driver should not dma_sync for CPU if the netmem is actually coming from a dmabuf memory provider. The page_pool already exposes a helper for dma_sync_for_cpu: page_pool_dma_sync_for_cpu. Upgrade this existing helper to handle netmem, and have it skip dma_sync if the memory is from a dmabuf memory provider. Drivers should migrate to using this helper when adding support for netmem. Also minimize the impact on the dma syncing performance for pages. Special case the dma-sync path for pages to not go through the overhead checks for dma-syncing and conversion to netmem. Cc: Alexander Lobakin Cc: Jason Gunthorpe Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- include/net/page_pool/helpers.h | 35 ++++++++++++++++++++++++++++----- include/net/page_pool/types.h | 3 ++- net/core/devmem.c | 1 + net/core/page_pool.c | 1 + 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 898897035092c0..35dfeb05362ef1 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -425,7 +425,21 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - return page_pool_get_dma_addr_netmem(page_to_netmem((struct page *)page)); + dma_addr_t ret = page->dma_addr; + + if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) + ret <<= PAGE_SHIFT; + + return ret; +} + +static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const dma_addr_t dma_addr, + u32 offset, u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, dma_addr, + offset + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); } /** @@ -444,10 +458,21 @@ static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, const struct page *page, u32 offset, u32 dma_sync_size) { - dma_sync_single_range_for_cpu(pool->p.dev, - page_pool_get_dma_addr(page), - offset + pool->p.offset, dma_sync_size, - page_pool_get_dma_dir(pool)); + __page_pool_dma_sync_for_cpu(pool, page_pool_get_dma_addr(page), offset, + dma_sync_size); +} + +static inline void +page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, + const netmem_ref netmem, u32 offset, + u32 dma_sync_size) +{ + if (!pool->dma_sync_for_cpu) + return; + + __page_pool_dma_sync_for_cpu(pool, + page_pool_get_dma_addr_netmem(netmem), + offset, dma_sync_size); } static inline bool page_pool_put(struct page_pool *pool) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 8f564fe9eb9a9a..19924504efca0c 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -164,7 +164,8 
@@ struct page_pool { bool has_init_callback:1; /* slow::init_callback is set */ bool dma_map:1; /* Perform DMA mapping */ - bool dma_sync:1; /* Perform DMA sync */ + bool dma_sync:1; /* Perform DMA sync for device */ + bool dma_sync_for_cpu:1; /* Perform DMA sync for cpu */ #ifdef CONFIG_PAGE_POOL_STATS bool system:1; /* This is a global percpu pool */ #endif diff --git a/net/core/devmem.c b/net/core/devmem.c index 3ebdeed2bf188d..0b6ed7525b22ac 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -335,6 +335,7 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) * dma_sync_for_cpu/device. Force disable dma_sync. */ pool->dma_sync = false; + pool->dma_sync_for_cpu = false; if (pool->p.order != 0) return -E2BIG; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 528dd4d18eab41..dbef64d841e5c6 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -201,6 +201,7 @@ static int page_pool_init(struct page_pool *pool, memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) From 63881ea90f9a5d74d3f095a28b7be73f7f3e3e6e Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 28 Feb 2025 16:44:16 +0000 Subject: [PATCH 05/34] BACKPORT: net: Document netmem driver support Document expectations from drivers looking to add support for device memory tcp or other netmem based features. Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- Documentation/networking/index.rst | 1 + Documentation/networking/netmem.rst | 79 +++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 Documentation/networking/netmem.rst diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 803dfc1efb7518..86aefb5938afcf 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -85,6 +85,7 @@ Contents: netdevices netfilter-sysctl netif-msg + netmem nexthop-group-resilient nf_conntrack-sysctl nf_flowtable diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst new file mode 100644 index 00000000000000..7de21ddb54129f --- /dev/null +++ b/Documentation/networking/netmem.rst @@ -0,0 +1,79 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================================== +Netmem Support for Network Drivers +================================== + +This document outlines the requirements for network drivers to support netmem, +an abstract memory type that enables features like device memory TCP. By +supporting netmem, drivers can work with various underlying memory types +with little to no modification. + +Benefits of Netmem : + +* Flexibility: Netmem can be backed by different memory types (e.g., struct + page, DMA-buf), allowing drivers to support various use cases such as device + memory TCP. +* Future-proof: Drivers with netmem support are ready for upcoming + features that rely on it. +* Simplified Development: Drivers interact with a consistent API, + regardless of the underlying memory implementation. + +Driver Requirements +=================== + +1. The driver must support page_pool. + +2. The driver must support the tcp-data-split ethtool option. + +3. The driver must use the page_pool netmem APIs for payload memory. The netmem + APIs currently 1-to-1 correspond with page APIs. 
Conversion to netmem should + be achievable by switching the page APIs to netmem APIs and tracking memory + via netmem_refs in the driver rather than struct page * : + + - page_pool_alloc -> page_pool_alloc_netmem + - page_pool_get_dma_addr -> page_pool_get_dma_addr_netmem + - page_pool_put_page -> page_pool_put_netmem + + Not all page APIs have netmem equivalents at the moment. If your driver + relies on a missing netmem API, feel free to add and propose to netdev@, or + reach out to the maintainers and/or almasrymina@google.com for help adding + the netmem API. + +4. The driver must use the following PP_FLAGS: + + - PP_FLAG_DMA_MAP: netmem is not dma-mappable by the driver. The driver + must delegate the dma mapping to the page_pool, which knows when + dma-mapping is (or is not) appropriate. + - PP_FLAG_DMA_SYNC_DEV: netmem dma addr is not necessarily dma-syncable + by the driver. The driver must delegate the dma syncing to the page_pool, + which knows when dma-syncing is (or is not) appropriate. + - PP_FLAG_ALLOW_UNREADABLE_NETMEM. The driver must specify this flag iff + tcp-data-split is enabled. + +5. The driver must not assume the netmem is readable and/or backed by pages. + The netmem returned by the page_pool may be unreadable, in which case + netmem_address() will return NULL. The driver must correctly handle + unreadable netmem, i.e. don't attempt to handle its contents when + netmem_address() is NULL. + + Ideally, drivers should not have to check the underlying netmem type via + helpers like netmem_is_net_iov() or convert the netmem to any of its + underlying types via netmem_to_page() or netmem_to_net_iov(). In most cases, + netmem or page_pool helpers that abstract this complexity are provided + (and more can be added). + +6. The driver must use page_pool_dma_sync_netmem_for_cpu() in lieu of + dma_sync_single_range_for_cpu(). For some memory providers, dma_syncing for + CPU will be done by the page_pool, for others (particularly dmabuf memory + provider), dma syncing for CPU is the responsibility of the userspace using + dmabuf APIs. The driver must delegate the entire dma-syncing operation to + the page_pool which will do it correctly. + +7. Avoid implementing driver-specific recycling on top of the page_pool. Drivers + cannot hold onto a struct page to do their own recycling as the netmem may + not be backed by a struct page. However, you may hold onto a page_pool + reference with page_pool_fragment_netmem() or page_pool_ref_netmem() for + that purpose, but be mindful that some netmem types might have longer + circulation times, such as when userspace holds a reference in zerocopy + scenarios. From 098641fd1f2505c3ae0542f5b5a9534b88c25a4b Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Mon, 3 Mar 2025 15:09:44 +0000 Subject: [PATCH 06/34] BACKPORT: net: add get_netmem/put_netmem support Currently net_iovs support only pp ref counts, and do not support a page ref equivalent. This is fine for the RX path as net_iovs are used exclusively with the pp and only pp refcounting is needed there. The TX path however does not use pp ref counts, thus, support for get_page/put_page equivalent is needed for netmem. Support get_netmem/put_netmem. Check the type of the netmem before passing it to page or net_iov specific code to obtain a page ref equivalent. For dmabuf net_iovs, we obtain a ref on the underlying binding. This ensures the entire binding doesn't disappear until all the net_iovs have been put_netmem'ed. 
We do not need to track the refcount of individual dmabuf net_iovs as we don't allocate/free them from a pool similar to what the buddy allocator does for pages. This code is written to be extensible by other net_iov implementers. get_netmem/put_netmem will check the type of the netmem and route it to the correct helper: pages -> [get|put]_page() dmabuf net_iovs -> net_devmem_[get|put]_net_iov() new net_iovs -> new helpers The function net_iov_binding was renamed to net_devmem_iov_binding as a part of io uring zero-copy by Pavel in kernel 6.14: https://lore.kernel.org/all/20250204215622.695511-3-dw@davidwei.uk/ Hence, use the old API `net_iov_binding` that exists in 6.12. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- include/linux/skbuff_ref.h | 4 ++-- include/net/netmem.h | 3 +++ net/core/devmem.c | 10 ++++++++++ net/core/devmem.h | 20 ++++++++++++++++++++ net/core/skbuff.c | 30 ++++++++++++++++++++++++++++++ 5 files changed, 65 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h index 0f3c58007488ab..9e49372ef1a05e 100644 --- a/include/linux/skbuff_ref.h +++ b/include/linux/skbuff_ref.h @@ -17,7 +17,7 @@ */ static inline void __skb_frag_ref(skb_frag_t *frag) { - get_page(skb_frag_page(frag)); + get_netmem(skb_frag_netmem(frag)); } /** @@ -40,7 +40,7 @@ static inline void skb_page_unref(netmem_ref netmem, bool recycle) if (recycle && napi_pp_put_page(netmem)) return; #endif - put_page(netmem_to_page(netmem)); + put_netmem(netmem); } /** diff --git a/include/net/netmem.h b/include/net/netmem.h index 8a6e20be4b9d30..b2b0779653eb7e 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -171,4 +171,7 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) return __netmem_clear_lsb(netmem)->dma_addr; } +void get_netmem(netmem_ref netmem); +void put_netmem(netmem_ref netmem); + #endif /* _NET_NETMEM_H */ diff --git a/net/core/devmem.c b/net/core/devmem.c index 0b6ed7525b22ac..c2a79598188bde 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -322,6 +322,16 @@ void dev_dmabuf_uninstall(struct net_device *dev) } } +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_iov_binding(niov)); +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482f5..8b51caff5a0eae 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -27,6 +27,10 @@ struct net_devmem_dmabuf_binding { * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. 
*/ refcount_t ref; @@ -119,6 +123,9 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) __net_devmem_dmabuf_binding_free(binding); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); @@ -126,6 +133,19 @@ void net_devmem_free_dmabuf(struct net_iov *ppiov); #else struct net_devmem_dmabuf_binding; +static inline void +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + static inline void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 61a950f13a91c7..bb829ae915242b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -89,6 +89,7 @@ #include #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -7294,3 +7295,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) { + /* Assume any net_iov is devmem and route it to + * net_devmem_get_net_iov. As new net_iov types are added they + * need to be checked here. + */ + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) { + /* Assume any net_iov is devmem and route it to + * net_devmem_put_net_iov. As new net_iov types are added they + * need to be checked here. + */ + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); From 64302e6e1d88df37e31c62c97f1433f79b964a57 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Mon, 3 Mar 2025 15:37:34 +0000 Subject: [PATCH 07/34] BACKPORT: net: devmem: TCP tx netlink api Add bind-tx netlink call to attach dmabuf for TX; queue is not required, only ifindex and dmabuf fd for attachment. 
Signed-off-by: Stanislav Fomichev Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- Documentation/netlink/specs/netdev.yaml | 12 ++++++++++++ include/uapi/linux/netdev.h | 1 + net/core/netdev-genl-gen.c | 13 +++++++++++++ net/core/netdev-genl-gen.h | 1 + net/core/netdev-genl.c | 6 ++++++ tools/include/uapi/linux/netdev.h | 1 + 6 files changed, 34 insertions(+) diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml index 08412c279297bf..8d44493cc89bee 100644 --- a/Documentation/netlink/specs/netdev.yaml +++ b/Documentation/netlink/specs/netdev.yaml @@ -676,6 +676,18 @@ operations: reply: attributes: - id + - + name: bind-tx + doc: Bind dmabuf to netdev for TX + attribute-set: dmabuf + do: + request: + attributes: + - ifindex + - fd + reply: + attributes: + - id kernel-family: headers: [ "linux/list.h"] diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7c308f04e7a063..56f7b87f8bd0ca 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -199,6 +199,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, NETDEV_CMD_BIND_RX, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 8614988fc67b9a..b9bbd8a19ad9bd 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -87,6 +87,12 @@ static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), }; +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -171,6 +177,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_DMABUF_FD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index 8cda334fd04296..dc742b6410e91f 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -33,6 +33,7 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index ad426b3a03b526..1eae1d4dfa3839 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -839,6 +839,12 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) return err; } +/* stub */ +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + return 0; +} + void netdev_nl_sock_priv_init(struct list_head *priv) { INIT_LIST_HEAD(priv); diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h index 7c308f04e7a063..56f7b87f8bd0ca 100644 --- a/tools/include/uapi/linux/netdev.h +++ b/tools/include/uapi/linux/netdev.h @@ -199,6 +199,7 @@ enum { NETDEV_CMD_NAPI_GET, NETDEV_CMD_QSTATS_GET, 
NETDEV_CMD_BIND_RX, + NETDEV_CMD_BIND_TX, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) From 84f16e1bfcb880a3930bf99a6680d2a464a4be2b Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 4 Mar 2025 21:57:38 +0000 Subject: [PATCH 08/34] BACKPORT: net: devmem: Implement TX path Augment dmabuf binding to be able to handle TX. Additional to all the RX binding, we also create tx_vec needed for the TX path. Provide API for sendmsg to be able to send dmabufs bound to this device: - Provide a new dmabuf_tx_cmsg which includes the dmabuf to send from. - MSG_ZEROCOPY with SCM_DEVMEM_DMABUF cmsg indicates send from dma-buf. Devmem is uncopyable, so piggyback off the existing MSG_ZEROCOPY implementation, while disabling instances where MSG_ZEROCOPY falls back to copying. We additionally pipe the binding down to the new zerocopy_fill_skb_from_devmem which fills a TX skb with net_iov netmems instead of the traditional page netmems. We also special case skb_frag_dma_map to return the dma-address of these dmabuf net_iovs instead of attempting to map pages. The TX path may release the dmabuf in a context where we cannot wait. This happens when the user unbinds a TX dmabuf while there are still references to its netmems in the TX path. In that case, the netmems will be put_netmem'd from a context where we can't unmap the dmabuf, Resolve this by making __net_devmem_dmabuf_binding_free schedule_work'd. Based on work by Stanislav Fomichev . A lot of the meat of the implementation came from devmem TCP RFC v1[1], which included the TX path, but Stan did all the rebasing on top of netmem/net_iov. The net_iov_owners were generalized as part of the io_uring zero-copy rx implementation in 6.14: https://lore.kernel.org/all/20250204215622.695511-4-dw@davidwei.uk/ Hence, struct net_devmem_dmabuf_binding itself has all the contents that were later refactored within the `net_iov_owner` abstraction. Use the members of struct net_devmem_dmabuf_binding directly. 
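On the userspace side, a devmem send then looks like a regular MSG_ZEROCOPY send with one extra cmsg naming the bound dmabuf, and with iov_base interpreted as an offset into that dmabuf. A condensed sketch (socket_fd, tx_dmabuf_id, offset and len are placeholders; the documentation patch later in this series carries the complete example):

    struct dmabuf_tx_cmsg ddmabuf = { .dmabuf_id = tx_dmabuf_id };
    char ctrl[CMSG_SPACE(sizeof(ddmabuf))] = {};
    struct iovec iov = { .iov_base = (void *)offset, .iov_len = len };
    struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                          .msg_control = ctrl,
                          .msg_controllen = sizeof(ctrl) };
    struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

    cm->cmsg_level = SOL_SOCKET;
    cm->cmsg_type = SCM_DEVMEM_DMABUF;
    cm->cmsg_len = CMSG_LEN(sizeof(ddmabuf));
    memcpy(CMSG_DATA(cm), &ddmabuf, sizeof(ddmabuf));

    sendmsg(socket_fd, &msg, MSG_ZEROCOPY);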
Cc: Stanislav Fomichev Signed-off-by: Kaiyuan Zhang Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- include/linux/skbuff.h | 18 +++- include/net/sock.h | 1 + net/core/datagram.c | 48 ++++++++++- net/core/devmem.c | 107 ++++++++++++++++++++++-- net/core/devmem.h | 64 +++++++++++--- net/core/netdev-genl.c | 63 +++++++++++++- net/core/skbuff.c | 18 ++-- net/core/sock.c | 5 ++ net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 50 ++++++++--- net/ipv6/ip6_output.c | 2 +- net/vmw_vsock/virtio_transport_common.c | 4 +- 12 files changed, 331 insertions(+), 51 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39f1d16f362887..492f23695ba68e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1698,13 +1698,16 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg); + struct ubuf_info *uarg, bool devmem); + +struct net_devmem_dmabuf_binding; void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length); + size_t length, + struct net_devmem_dmabuf_binding *binding); int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1712,12 +1715,14 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len) { - return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len); + return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len, + NULL); } int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg); + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding); /* Internal */ #define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB))) @@ -3651,6 +3656,11 @@ static inline dma_addr_t skb_frag_dma_map(struct device *dev, size_t offset, size_t size, enum dma_data_direction dir) { + if (skb_frag_is_net_iov(frag)) { + return netmem_to_net_iov(frag->netmem)->dma_addr + offset + + frag->offset; + } + return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } diff --git a/include/net/sock.h b/include/net/sock.h index fa055cf1785efd..84bcf1740e9250 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1804,6 +1804,7 @@ struct sockcm_cookie { u64 transmit_time; u32 mark; u32 tsflags; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece46..09c74a1d836b66 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -63,6 +63,8 @@ #include #include +#include "devmem.h" + /* * Is a socket 'connection oriented' ? */ @@ -692,9 +694,49 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. 
+ */ + if (iov_iter_type(from) != ITER_IOVEC) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -702,6 +744,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (unlikely(binding)) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -735,7 +779,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/devmem.c b/net/core/devmem.c index c2a79598188bde..df6ff563919883 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "devmem.h" @@ -44,8 +45,10 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -63,8 +66,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -109,6 +114,14 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase + */ + + synchronize_net(); + if (binding->list.next) list_del(&binding->list); @@ -122,8 +135,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -174,8 +185,9 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack) +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack 
*extack) { struct net_devmem_dmabuf_binding *binding; static u32 id_alloc_next; @@ -218,22 +230,33 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... */ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); + if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -277,6 +300,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, niov->owner = owner; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->base_virtual / PAGE_SIZE + i] = niov; } virtual += len; @@ -288,6 +313,8 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); @@ -302,6 +329,21 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, return ERR_PTR(err); } +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + struct net_devmem_dmabuf_binding *binding; + + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); + + return binding; +} + void dev_dmabuf_uninstall(struct net_device *dev) { struct net_devmem_dmabuf_binding *binding; @@ -332,6 +374,53 @@ void net_devmem_put_net_iov(struct net_iov *niov) net_devmem_dmabuf_binding_put(net_iov_binding(niov)); } +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. 
+ */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; + } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + size_t idx; + + if (virt_addr >= binding->dmabuf->size) + return NULL; + + idx = virt_addr / PAGE_SIZE; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[idx]; +} + /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) diff --git a/net/core/devmem.h b/net/core/devmem.h index 8b51caff5a0eae..3334a85292696b 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -21,8 +21,9 @@ struct net_devmem_dmabuf_binding { /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undos itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in @@ -30,7 +31,10 @@ struct net_devmem_dmabuf_binding { * * net_devmem_get_net_iov() on dmabuf net_iovs will increment this * reference, making sure that the binding remains alive until all the - * net_iovs are no longer used. + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -46,6 +50,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. 
+ */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -68,15 +80,17 @@ struct dmabuf_genpool_chunk_owner { struct net_devmem_dmabuf_binding *binding; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, - struct netlink_ext_ack *extack); +net_devmem_bind_dmabuf(struct net_device *dev, enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); void dev_dmabuf_uninstall(struct net_device *dev); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_iov_owner(const struct net_iov *niov) @@ -108,10 +122,10 @@ static inline u32 net_iov_binding_id(const struct net_iov *niov) return net_iov_owner(niov)->binding->id; } -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -119,8 +133,9 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) { if (!refcount_dec_and_test(&binding->ref)) return; - - __net_devmem_dmabuf_binding_free(binding); + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -129,6 +144,11 @@ void net_devmem_put_net_iov(struct net_iov *niov); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); #else struct net_devmem_dmabuf_binding; @@ -147,17 +167,24 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) } static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +__net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -195,6 +222,19 @@ static inline u32 net_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 
1eae1d4dfa3839..eff2e7ce4a1d75 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -782,7 +782,7 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -839,10 +839,67 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) return err; } -/* stub */ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) { - return 0; + struct net_devmem_dmabuf_binding *binding; + struct list_head *sock_binding_list; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + sock_binding_list = genl_sk_priv_get(&netdev_nl_family, + NETLINK_CB(skb).sk); + if (IS_ERR(sock_binding_list)) + return PTR_ERR(sock_binding_list); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + rtnl_lock(); + + netdev = __dev_get_by_index(genl_info_net(info), ifindex); + if (!netdev || !netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock; + } + + list_add(&binding->list, sock_binding_list); + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + rtnl_unlock(); + + return genlmsg_reply(rsp, info); + +err_unlock: + rtnl_unlock(); +err_genlmsg_free: + nlmsg_free(rsp); + return err; } void netdev_nl_sock_priv_init(struct list_head *priv) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bb829ae915242b..1b524c81f40a33 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1699,7 +1699,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1714,7 +1715,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1737,7 +1738,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1767,7 +1768,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1782,7 +1784,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock 
*sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ -1886,7 +1888,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1905,7 +1908,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; diff --git a/net/core/sock.c b/net/core/sock.c index a83f64a1d96a29..1ab81cbe89bd4b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2940,6 +2940,11 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 49811c9281d424..3587e82a70ca5d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1023,7 +1023,7 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68cb6a966b18b8..c1bd9d04448f16 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1050,6 +1050,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { + struct net_devmem_dmabuf_binding *binding = NULL; struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; @@ -1057,11 +1058,24 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; + bool sockc_valid = true; int zc = 0; long timeo; flags = msg->msg_flags; + sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags), + .dmabuf_id = 0 }; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + /* Don't return error until MSG_FASTOPEN has been + * processed; that may succeed even if the cmsg is + * invalid. 
+ */ + sockc_valid = false; + } + if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; @@ -1069,7 +1083,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); - uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb), + sockc_valid && !!sockc.dmabuf_id); if (!uarg) { err = -ENOBUFS; goto out_err; @@ -1078,12 +1093,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; + + if (sockc_valid && sockc.dmabuf_id) { + binding = net_devmem_get_binding(sk, sockc.dmabuf_id); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + binding = NULL; + goto out_err; + } + } } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } + if (sockc_valid && sockc.dmabuf_id && + (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) { + err = -EINVAL; + goto out_err; + } + if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { @@ -1122,14 +1152,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); - if (msg->msg_controllen) { - err = sock_cmsg_send(sk, msg, &sockc); - if (unlikely(err)) { - err = -EINVAL; - goto out_err; - } - } + if (!sockc_valid) + goto out_err; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); @@ -1247,7 +1271,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto wait_for_space; } - err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); + err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg, + binding); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; @@ -1328,6 +1353,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); + if (binding) + net_devmem_dmabuf_binding_put(binding); return copied + copied_syn; do_error: @@ -1345,6 +1372,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } + if (binding) + net_devmem_dmabuf_binding_put(binding); + return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 434ddf263b88a3..ad4632c3e795ba 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1519,7 +1519,7 @@ static int __ip6_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 7f7de6d8809655..9bfaa530ebb3d1 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk, uarg = msg_zerocopy_realloc(sk_vsock(vsk), iter->count, - NULL); + NULL, false); if (!uarg) return -1; @@ -108,7 +108,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, if (zcopy) return 
__zerocopy_sg_from_iter(info->msg, NULL, skb, &info->msg->msg_iter, - len); + len, NULL); return memcpy_from_msg(skb_put(skb, len), info->msg, len); } From 597150e006a95f9febd895255ee10245d023fdf0 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Tue, 4 Mar 2025 22:07:16 +0000 Subject: [PATCH 09/34] BACKPORT: net: add devmem TCP TX documentation Add documentation outlining the usage and details of the devmem TCP TX API. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- Documentation/networking/devmem.rst | 143 +++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst index d9536364533149..cf0904a3397aff 100644 --- a/Documentation/networking/devmem.rst +++ b/Documentation/networking/devmem.rst @@ -62,7 +62,7 @@ More Info https://lore.kernel.org/netdev/20240831004313.3713467-1-almasrymina@google.com/ -Interface +Rx Interface ========= @@ -235,6 +235,147 @@ can be less than the tokens provided by the user in case of: (a) an internal kernel leak bug. (b) the user passed more than 1024 frags. +TX Interface +============ + + +Example +------- + +./tools/testing/selftests/drivers/net/hw/ncdevmem:do_client shows an example of +setting up the TX path of this API. + + +NIC Setup +--------- + +The user must bind a TX dmabuf to a given NIC using the netlink API:: + + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + + tx_dmabuf_id = rsp->id; + + +The netlink API returns a dmabuf_id: a unique ID that refers to this dmabuf +that has been bound. + +The user can unbind the dmabuf from the netdevice by closing the netlink socket +that established the binding. We do this so that the binding is automatically +unbound even if the userspace process crashes. + +Note that any reasonably well-behaved dmabuf from any exporter should work with +devmem TCP, even if the dmabuf is not actually backed by devmem. An example of +this is udmabuf, which wraps user memory (non-devmem) in a dmabuf. + +Socket Setup +------------ + +The user application must use MSG_ZEROCOPY flag when sending devmem TCP. Devmem +cannot be copied by the kernel, so the semantics of the devmem TX are similar +to the semantics of MSG_ZEROCOPY:: + + setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + +It is also recommended that the user binds the TX socket to the same interface +the dma-buf has been bound to via SO_BINDTODEVICE:: + + setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, strlen(ifname) + 1); + + +Sending data +------------ + +Devmem data is sent using the SCM_DEVMEM_DMABUF cmsg. + +The user should create a msghdr where, + +* iov_base is set to the offset into the dmabuf to start sending from +* iov_len is set to the number of bytes to be sent from the dmabuf + +The user passes the dma-buf id to send from via the dmabuf_tx_cmsg.dmabuf_id. + +The example below sends 1024 bytes from offset 100 into the dmabuf, and 2048 +from offset 2000 into the dmabuf. 
The dmabuf to send from is tx_dmabuf_id:: + + char ctrl_data[CMSG_SPACE(sizeof(struct dmabuf_tx_cmsg))]; + struct dmabuf_tx_cmsg ddmabuf; + struct msghdr msg = {}; + struct cmsghdr *cmsg; + struct iovec iov[2]; + + iov[0].iov_base = (void*)100; + iov[0].iov_len = 1024; + iov[1].iov_base = (void*)2000; + iov[1].iov_len = 2048; + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct dmabuf_tx_cmsg)); + + ddmabuf.dmabuf_id = tx_dmabuf_id; + + *((struct dmabuf_tx_cmsg *)CMSG_DATA(cmsg)) = ddmabuf; + + sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + + +Reusing TX dmabufs +------------------ + +Similar to MSG_ZEROCOPY with regular memory, the user should not modify the +contents of the dma-buf while a send operation is in progress. This is because +the kernel does not keep a copy of the dmabuf contents. Instead, the kernel +will pin and send data from the buffer available to the userspace. + +Just as in MSG_ZEROCOPY, the kernel notifies the userspace of send completions +using MSG_ERRQUEUE:: + + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + int retries = 10; + __u32 hi, lo; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + serr = (void *)CMSG_DATA(cm); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stdout, "tx complete [%d,%d]\n", lo, hi); + } + } + +After the associated sendmsg has been completed, the dmabuf can be reused by +the userspace. + Implementation & Caveats ======================== From f7cf06fd214aaba9c2d5afb6a3d113b2e5915ebd Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 07:15:18 +0000 Subject: [PATCH 10/34] BACKPORT: net: enable driver support for netmem TX Drivers need to make sure not to pass netmem dma-addrs to the dma-mapping API in order to support netmem TX. Add helpers and netmem_dma_*() helpers that enables special handling of netmem dma-addrs that drivers can use. Document in netmem.rst what drivers need to do to support netmem TX. 
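As a rough sketch of the intended driver-side usage (modeled on the gve conversion later in this series; pkt, frag and the dma/len bookkeeping are driver-specific placeholders):

    /* map side: record the dma addr only for page-backed netmem, 0 otherwise */
    netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt,
                              dma[pkt->num_bufs], addr);

    /* unmap side: netmem_dma_unmap_page_attrs() bails out on a 0 addr, so
     * dma-buf backed frags never reach the dma-mapping API
     */
    netmem_dma_unmap_page_attrs(dev, dma_unmap_addr(pkt, dma[i]),
                                dma_unmap_len(pkt, len[i]),
                                DMA_TO_DEVICE, 0);

The point of the pairing is that the decision is made once at map time: net_iov frags record a 0 address, and the unmap helper treats 0 as "nothing to unmap".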
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- .../networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 5 ++++ Documentation/networking/netmem.rst | 23 +++++++++++++++++-- include/linux/netdevice.h | 2 ++ include/net/netmem.h | 20 ++++++++++++++++ 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 22b07c814f4a45..2d43d80af55ae4 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -9,6 +9,7 @@ Type Name fastpath_tx_access ..struct ..net_device unsigned_long:32 priv_flags read_mostly - __dev_queue_xmit(tx) unsigned_long:1 lltx read_mostly - HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx) +unsigned long:1 netmem_tx:1; read_mostly char name[16] - - struct_netdev_name_node* name_node struct_dev_ifalias* ifalias diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index 5014f7cc1398ba..02bd7536fc0ca2 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -188,3 +188,8 @@ Redundancy) frames from one port to another in hardware. This should be set for devices which duplicate outgoing HSR (High-availability Seamless Redundancy) or PRP (Parallel Redundancy Protocol) tags automatically frames in hardware. + +* netmem-tx + +This should be set for devices which support netmem TX. See +Documentation/networking/netmem.rst diff --git a/Documentation/networking/netmem.rst b/Documentation/networking/netmem.rst index 7de21ddb54129f..fa541a75c1f86c 100644 --- a/Documentation/networking/netmem.rst +++ b/Documentation/networking/netmem.rst @@ -19,8 +19,8 @@ Benefits of Netmem : * Simplified Development: Drivers interact with a consistent API, regardless of the underlying memory implementation. -Driver Requirements -=================== +Driver Rx Requirements +====================== 1. The driver must support page_pool. @@ -77,3 +77,22 @@ Driver Requirements that purpose, but be mindful that some netmem types might have longer circulation times, such as when userspace holds a reference in zerocopy scenarios. + +Driver TX Requirements +====================== + +1. The Driver must not pass the netmem dma_addr to any of the dma-mapping APIs + directly. This is because netmem dma_addrs may come from a source like + dma-buf that is not compatible with the dma-mapping APIs. + + Helpers like netmem_dma_unmap_page_attrs() & netmem_dma_unmap_addr_set() + should be used in lieu of dma_unmap_page[_attrs](), dma_unmap_addr_set(). + The netmem variants will handle netmem dma_addrs correctly regardless of the + source, delegating to the dma-mapping APIs when appropriate. + + Not all dma-mapping APIs have netmem equivalents at the moment. If your + driver relies on a missing netmem API, feel free to add and propose to + netdev@, or reach out to the maintainers and/or almasrymina@google.com for + help adding the netmem API. + +2. Driver should declare support by setting `netdev->netmem_tx = true` diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 35b886385f3298..f9a0093d41ad23 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1723,6 +1723,7 @@ enum netdev_reg_state { * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. 
Mainly used by logical interfaces, such as * bonding and tunnels + * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2024,6 +2025,7 @@ struct net_device { struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; + unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; diff --git a/include/net/netmem.h b/include/net/netmem.h index b2b0779653eb7e..9f4e436d7ca62a 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,6 +8,7 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include #include #include @@ -174,4 +175,23 @@ static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) void get_netmem(netmem_ref netmem); void put_netmem(netmem_ref netmem); +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); +} + #endif /* _NET_NETMEM_H */ From 9dea192c6bde04744321dfee1898d1ce39efbd65 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 07:44:21 +0000 Subject: [PATCH 11/34] BACKPORT: gve: add netmem TX support to GVE DQO-RDA mode Use netmem_dma_*() helpers in gve_tx_dqo.c DQO-RDA paths to enable netmem TX support in that mode. Declare support for netmem TX in GVE DQO-RDA mode. Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve_main.c | 4 ++++ drivers/net/ethernet/google/gve/gve_tx_dqo.c | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 862c4575701fec..8673b10326243b 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2669,6 +2669,10 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev_info(&pdev->dev, "GVE version %s\n", gve_version_str); dev_info(&pdev->dev, "GVE queue format %d\n", (int)priv->queue_format); + + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) + dev->netmem_tx = true; + gve_clear_probe_in_progress(priv); queue_work(priv->gve_wq, &priv->service_task); return 0; diff --git a/drivers/net/ethernet/google/gve/gve_tx_dqo.c b/drivers/net/ethernet/google/gve/gve_tx_dqo.c index f879426cb5523a..8127cbef3e94a6 100644 --- a/drivers/net/ethernet/google/gve/gve_tx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_tx_dqo.c @@ -667,7 +667,8 @@ static int gve_tx_add_skb_no_copy_dqo(struct gve_tx_ring *tx, goto err; dma_unmap_len_set(pkt, len[pkt->num_bufs], len); - dma_unmap_addr_set(pkt, dma[pkt->num_bufs], addr); + netmem_dma_unmap_addr_set(skb_frag_netmem(frag), pkt, + dma[pkt->num_bufs], addr); ++pkt->num_bufs; gve_tx_fill_pkt_desc_dqo(tx, desc_idx, skb, len, addr, @@ -1045,8 +1046,9 @@ static void gve_unmap_packet(struct device *dev, dma_unmap_single(dev, dma_unmap_addr(pkt, dma[0]), dma_unmap_len(pkt, len[0]), DMA_TO_DEVICE); for (i = 1; i < pkt->num_bufs; i++) { - dma_unmap_page(dev, dma_unmap_addr(pkt, dma[i]), - dma_unmap_len(pkt, len[i]), DMA_TO_DEVICE); + netmem_dma_unmap_page_attrs(dev, 
dma_unmap_addr(pkt, dma[i]), + dma_unmap_len(pkt, len[i]), + DMA_TO_DEVICE, 0); } pkt->num_bufs = 0; } From 26de6108f56bd79a77143d514a1ae50b188cde82 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 07:51:33 +0000 Subject: [PATCH 12/34] BACKPORT: net: check for driver support in netmem TX We should not enable netmem TX for drivers that don't declare support. Check for driver netmem TX support during devmem TX binding and fail if the driver does not have the functionality. Check for driver support in validate_xmit_skb as well. Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- net/core/dev.c | 3 +++ net/core/netdev-genl.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index c761f862bc5a2d..ac5c81d378c792 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3765,6 +3765,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device skb = validate_xmit_xfrm(skb, features, again); + if (!skb_frags_readable(skb) && !dev->netmem_tx) + goto out_kfree_skb; + return skb; out_kfree_skb: diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index eff2e7ce4a1d75..32503a32fb89a3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -879,6 +879,13 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock; + } + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, info->extack); if (IS_ERR(binding)) { From 2a3c9534206170406567a65fbe00b96b5d983631 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 09:47:51 +0000 Subject: [PATCH 13/34] BACLPORT: selftests: ncdevmem: Redirect all non-payload output to stderr That should make it possible to do expected payload validation on the caller side. Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 61 +++++++++++++------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 64d6805381c50c..9245d3f158dd3e 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -88,7 +88,6 @@ void print_nonzero_bytes(void *ptr, size_t size) for (i = 0; i < size; i++) putchar(p[i]); - printf("\n"); } void validate_buffer(void *line, size_t size) @@ -120,7 +119,7 @@ void validate_buffer(void *line, size_t size) char command[256]; \ memset(command, 0, sizeof(command)); \ snprintf(command, sizeof(command), cmd, ##__VA_ARGS__); \ - printf("Running: %s\n", command); \ + fprintf(stderr, "Running: %s\n", command); \ system(command); \ }) @@ -128,22 +127,22 @@ static int reset_flow_steering(void) { int ret = 0; - ret = run_command("sudo ethtool -K %s ntuple off", ifname); + ret = run_command("sudo ethtool -K %s ntuple off >&2", ifname); if (ret) return ret; - return run_command("sudo ethtool -K %s ntuple on", ifname); + return run_command("sudo ethtool -K %s ntuple on >&2", ifname); } static int configure_headersplit(bool on) { - return run_command("sudo ethtool -G %s tcp-data-split %s", ifname, + return run_command("sudo ethtool -G %s tcp-data-split %s >&2", ifname, on ? 
"on" : "off"); } static int configure_rss(void) { - return run_command("sudo ethtool -X %s equal %d", ifname, start_queue); + return run_command("sudo ethtool -X %s equal %d >&2", ifname, start_queue); } static int configure_channels(unsigned int rx, unsigned int tx) @@ -153,7 +152,7 @@ static int configure_channels(unsigned int rx, unsigned int tx) static int configure_flow_steering(void) { - return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d", + return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d >&2", ifname, client_ip, server_ip, port, port, start_queue); } @@ -187,7 +186,7 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, goto err_close; } - printf("got dmabuf id=%d\n", rsp->id); + fprintf(stderr, "got dmabuf id=%d\n", rsp->id); dmabuf_id = rsp->id; netdev_bind_rx_req_free(req); @@ -314,8 +313,8 @@ int do_server(void) if (ret) error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); - printf("binding to address %s:%d\n", server_ip, - ntohs(server_sin.sin_port)); + fprintf(stderr, "binding to address %s:%d\n", server_ip, + ntohs(server_sin.sin_port)); ret = bind(socket_fd, &server_sin, sizeof(server_sin)); if (ret) @@ -329,14 +328,14 @@ int do_server(void) inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, sizeof(buffer)); - printf("Waiting or connection on %s:%d\n", buffer, - ntohs(server_sin.sin_port)); + fprintf(stderr, "Waiting or connection on %s:%d\n", buffer, + ntohs(server_sin.sin_port)); client_fd = accept(socket_fd, &client_addr, &client_addr_len); inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, sizeof(buffer)); - printf("Got connection from %s:%d\n", buffer, - ntohs(client_addr.sin_port)); + fprintf(stderr, "Got connection from %s:%d\n", buffer, + ntohs(client_addr.sin_port)); while (1) { struct iovec iov = { .iov_base = iobuf, @@ -349,14 +348,13 @@ int do_server(void) ssize_t ret; is_devmem = false; - printf("\n\n"); msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = ctrl_data; msg.msg_controllen = sizeof(ctrl_data); ret = recvmsg(client_fd, &msg, MSG_SOCK_DEVMEM); - printf("recvmsg ret=%ld\n", ret); + fprintf(stderr, "recvmsg ret=%ld\n", ret); if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) continue; if (ret < 0) { @@ -364,7 +362,7 @@ int do_server(void) continue; } if (ret == 0) { - printf("client exited\n"); + fprintf(stderr, "client exited\n"); goto cleanup; } @@ -373,7 +371,7 @@ int do_server(void) if (cm->cmsg_level != SOL_SOCKET || (cm->cmsg_type != SCM_DEVMEM_DMABUF && cm->cmsg_type != SCM_DEVMEM_LINEAR)) { - fprintf(stdout, "skipping non-devmem cmsg\n"); + fprintf(stderr, "skipping non-devmem cmsg\n"); continue; } @@ -384,7 +382,7 @@ int do_server(void) /* TODO: process data copied from skb's linear * buffer. */ - fprintf(stdout, + fprintf(stderr, "SCM_DEVMEM_LINEAR. 
dmabuf_cmsg->frag_size=%u\n", dmabuf_cmsg->frag_size); @@ -395,12 +393,13 @@ int do_server(void) token.token_count = 1; total_received += dmabuf_cmsg->frag_size; - printf("received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", - dmabuf_cmsg->frag_offset >> PAGE_SHIFT, - dmabuf_cmsg->frag_offset % getpagesize(), - dmabuf_cmsg->frag_offset, dmabuf_cmsg->frag_size, - dmabuf_cmsg->frag_token, total_received, - dmabuf_cmsg->dmabuf_id); + fprintf(stderr, + "received frag_page=%llu, in_page_offset=%llu, frag_offset=%llu, frag_size=%u, token=%u, total_received=%lu, dmabuf_id=%u\n", + dmabuf_cmsg->frag_offset >> PAGE_SHIFT, + dmabuf_cmsg->frag_offset % getpagesize(), + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size, dmabuf_cmsg->frag_token, + total_received, dmabuf_cmsg->dmabuf_id); if (dmabuf_cmsg->dmabuf_id != dmabuf_id) error(1, 0, @@ -438,15 +437,15 @@ int do_server(void) if (!is_devmem) error(1, 0, "flow steering error\n"); - printf("total_received=%lu\n", total_received); + fprintf(stderr, "total_received=%lu\n", total_received); } - fprintf(stdout, "%s: ok\n", TEST_PREFIX); + fprintf(stderr, "%s: ok\n", TEST_PREFIX); - fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", page_aligned_frags, non_page_aligned_frags); - fprintf(stdout, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", + fprintf(stderr, "page_aligned_frags=%lu, non_page_aligned_frags=%lu\n", page_aligned_frags, non_page_aligned_frags); cleanup: @@ -551,7 +550,7 @@ int main(int argc, char *argv[]) ifname = optarg; break; case '?': - printf("unknown option: %c\n", optopt); + fprintf(stderr, "unknown option: %c\n", optopt); break; } } @@ -559,7 +558,7 @@ int main(int argc, char *argv[]) ifindex = if_nametoindex(ifname); for (; optind < argc; optind++) - printf("extra arguments: %s\n", argv[optind]); + fprintf(stderr, "extra arguments: %s\n", argv[optind]); run_devmem_tests(); From e2fe2cb72fbc988d8e4927c1ad369fabd9d66d66 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 09:52:11 +0000 Subject: [PATCH 14/34] BACKPORT: selftests: ncdevmem: Separate out dmabuf provider So we can plug the other ones in the future if needed. 
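As a rough sketch of how another provider could be plugged in later, with hypothetical my_* names; a real implementation would still have to hand back a dma-buf fd in ctx->fd, since that fd is what gets bound to the queues:

    static struct memory_buffer *my_alloc(size_t size)
    {
            struct memory_buffer *ctx;

            ctx = malloc(sizeof(*ctx));
            if (!ctx)
                    error(1, ENOMEM, "malloc failed");

            ctx->size = size;
            /* export the provider's backing memory as a dma-buf here */
            ctx->fd = -1;   /* placeholder only */
            return ctx;
    }

    static void my_free(struct memory_buffer *ctx)
    {
            close(ctx->fd);
            free(ctx);
    }

    static void my_memcpy_from_device(void *dst, struct memory_buffer *src,
                                      size_t off, int n)
    {
            /* provider-specific readback of n bytes at offset off into dst */
    }

    static struct memory_provider my_memory_provider = {
            .alloc = my_alloc,
            .free = my_free,
            .memcpy_from_device = my_memcpy_from_device,
    };

Switching ncdevmem over is then just a matter of repointing the static provider pointer at &my_memory_provider.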
Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 203 +++++++++++++++---------- 1 file changed, 119 insertions(+), 84 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 9245d3f158dd3e..3e7ef2eedd60b4 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -71,17 +71,101 @@ static char *ifname = "eth1"; static unsigned int ifindex; static unsigned int dmabuf_id; -void print_bytes(void *ptr, size_t size) +struct memory_buffer { + int fd; + size_t size; + + int devfd; + int memfd; + char *buf_mem; +}; + +struct memory_provider { + struct memory_buffer *(*alloc)(size_t size); + void (*free)(struct memory_buffer *ctx); + void (*memcpy_from_device)(void *dst, struct memory_buffer *src, + size_t off, int n); +}; + +static struct memory_buffer *udmabuf_alloc(size_t size) { - unsigned char *p = ptr; - int i; + struct udmabuf_create create; + struct memory_buffer *ctx; + int ret; - for (i = 0; i < size; i++) - printf("%02hhX ", p[i]); - printf("\n"); + ctx = malloc(sizeof(*ctx)); + if (!ctx) + error(1, ENOMEM, "malloc failed"); + + ctx->size = size; + + ctx->devfd = open("/dev/udmabuf", O_RDWR); + if (ctx->devfd < 0) + error(1, errno, + "%s: [skip,no-udmabuf: Unable to access DMA buffer device file]\n", + TEST_PREFIX); + + ctx->memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); + if (ctx->memfd < 0) + error(1, errno, "%s: [skip,no-memfd]\n", TEST_PREFIX); + + ret = fcntl(ctx->memfd, F_ADD_SEALS, F_SEAL_SHRINK); + if (ret < 0) + error(1, errno, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); + + ret = ftruncate(ctx->memfd, size); + if (ret == -1) + error(1, errno, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); + + memset(&create, 0, sizeof(create)); + + create.memfd = ctx->memfd; + create.offset = 0; + create.size = size; + ctx->fd = ioctl(ctx->devfd, UDMABUF_CREATE, &create); + if (ctx->fd < 0) + error(1, errno, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); + + ctx->buf_mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + ctx->fd, 0); + if (ctx->buf_mem == MAP_FAILED) + error(1, errno, "%s: [FAIL, map udmabuf]\n", TEST_PREFIX); + + return ctx; +} + +static void udmabuf_free(struct memory_buffer *ctx) +{ + munmap(ctx->buf_mem, ctx->size); + close(ctx->fd); + close(ctx->memfd); + close(ctx->devfd); + free(ctx); } -void print_nonzero_bytes(void *ptr, size_t size) +static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, + size_t off, int n) +{ + struct dma_buf_sync sync = {}; + + sync.flags = DMA_BUF_SYNC_START; + ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync); + + memcpy(dst, src->buf_mem + off, n); + + sync.flags = DMA_BUF_SYNC_END; + ioctl(src->fd, DMA_BUF_IOCTL_SYNC, &sync); +} + +static struct memory_provider udmabuf_memory_provider = { + .alloc = udmabuf_alloc, + .free = udmabuf_free, + .memcpy_from_device = udmabuf_memcpy_from_device, +}; + +static struct memory_provider *provider = &udmabuf_memory_provider; + +static void print_nonzero_bytes(void *ptr, size_t size) { unsigned char *p = ptr; unsigned int i; @@ -201,42 +285,7 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, return -1; } -static void create_udmabuf(int *devfd, int *memfd, int *buf, size_t dmabuf_size) -{ - struct udmabuf_create create; - int ret; - - *devfd = open("/dev/udmabuf", O_RDWR); - if (*devfd < 0) { - error(70, 0, - "%s: [skip,no-udmabuf: Unable to access 
DMA buffer device file]\n", - TEST_PREFIX); - } - - *memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING); - if (*memfd < 0) - error(70, 0, "%s: [skip,no-memfd]\n", TEST_PREFIX); - - /* Required for udmabuf */ - ret = fcntl(*memfd, F_ADD_SEALS, F_SEAL_SHRINK); - if (ret < 0) - error(73, 0, "%s: [skip,fcntl-add-seals]\n", TEST_PREFIX); - - ret = ftruncate(*memfd, dmabuf_size); - if (ret == -1) - error(74, 0, "%s: [FAIL,memfd-truncate]\n", TEST_PREFIX); - - memset(&create, 0, sizeof(create)); - - create.memfd = *memfd; - create.offset = 0; - create.size = dmabuf_size; - *buf = ioctl(*devfd, UDMABUF_CREATE, &create); - if (*buf < 0) - error(75, 0, "%s: [FAIL, create udmabuf]\n", TEST_PREFIX); -} - -int do_server(void) +int do_server(struct memory_buffer *mem) { char ctrl_data[sizeof(int) * 20000]; struct netdev_queue_id *queues; @@ -244,23 +293,18 @@ int do_server(void) struct sockaddr_in client_addr; struct sockaddr_in server_sin; size_t page_aligned_frags = 0; - int devfd, memfd, buf, ret; size_t total_received = 0; socklen_t client_addr_len; bool is_devmem = false; - char *buf_mem = NULL; + char *tmp_mem = NULL; struct ynl_sock *ys; - size_t dmabuf_size; char iobuf[819200]; char buffer[256]; int socket_fd; int client_fd; size_t i = 0; int opt = 1; - - dmabuf_size = getpagesize() * NUM_PAGES; - - create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); + int ret; if (reset_flow_steering()) error(1, 0, "Failed to reset flow steering\n"); @@ -284,13 +328,12 @@ int do_server(void) queues[i].id = start_queue + i; } - if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) error(1, 0, "Failed to bind\n"); - buf_mem = mmap(NULL, dmabuf_size, PROT_READ | PROT_WRITE, MAP_SHARED, - buf, 0); - if (buf_mem == MAP_FAILED) - error(1, 0, "mmap()"); + tmp_mem = malloc(mem->size); + if (!tmp_mem) + error(1, ENOMEM, "malloc failed"); server_sin.sin_family = AF_INET; server_sin.sin_port = htons(atoi(port)); @@ -341,7 +384,6 @@ int do_server(void) struct iovec iov = { .iov_base = iobuf, .iov_len = sizeof(iobuf) }; struct dmabuf_cmsg *dmabuf_cmsg = NULL; - struct dma_buf_sync sync = { 0 }; struct cmsghdr *cm = NULL; struct msghdr msg = { 0 }; struct dmabuf_token token; @@ -410,22 +452,16 @@ int do_server(void) else page_aligned_frags++; - sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_START; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + provider->memcpy_from_device(tmp_mem, mem, + dmabuf_cmsg->frag_offset, + dmabuf_cmsg->frag_size); if (do_validation) - validate_buffer( - ((unsigned char *)buf_mem) + - dmabuf_cmsg->frag_offset, - dmabuf_cmsg->frag_size); + validate_buffer(tmp_mem, + dmabuf_cmsg->frag_size); else - print_nonzero_bytes( - ((unsigned char *)buf_mem) + - dmabuf_cmsg->frag_offset, - dmabuf_cmsg->frag_size); - - sync.flags = DMA_BUF_SYNC_READ | DMA_BUF_SYNC_END; - ioctl(buf, DMA_BUF_IOCTL_SYNC, &sync); + print_nonzero_bytes(tmp_mem, + dmabuf_cmsg->frag_size); ret = setsockopt(client_fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &token, @@ -450,12 +486,9 @@ int do_server(void) cleanup: - munmap(buf_mem, dmabuf_size); + free(tmp_mem); close(client_fd); close(socket_fd); - close(buf); - close(memfd); - close(devfd); ynl_sock_destroy(ys); return 0; @@ -464,14 +497,11 @@ int do_server(void) void run_devmem_tests(void) { struct netdev_queue_id *queues; - int devfd, memfd, buf; + struct memory_buffer *mem; struct ynl_sock *ys; - size_t dmabuf_size; size_t i = 0; - dmabuf_size = getpagesize() * NUM_PAGES; - - create_udmabuf(&devfd, &memfd, &buf, dmabuf_size); 
+ mem = provider->alloc(getpagesize() * NUM_PAGES); /* Configure RSS to divert all traffic from our devmem queues */ if (configure_rss()) @@ -482,7 +512,7 @@ void run_devmem_tests(void) if (configure_headersplit(1)) error(1, 0, "Failed to configure header split\n"); - if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) error(1, 0, "Binding empty queues array should have failed\n"); for (i = 0; i < num_queues; i++) { @@ -495,7 +525,7 @@ void run_devmem_tests(void) if (configure_headersplit(0)) error(1, 0, "Failed to configure header split\n"); - if (!bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + if (!bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) error(1, 0, "Configure dmabuf with header split off should have failed\n"); if (configure_headersplit(1)) @@ -508,7 +538,7 @@ void run_devmem_tests(void) queues[i].id = start_queue + i; } - if (bind_rx_queue(ifindex, buf, queues, num_queues, &ys)) + if (bind_rx_queue(ifindex, mem->fd, queues, num_queues, &ys)) error(1, 0, "Failed to bind\n"); /* Deactivating a bound queue should not be legal */ @@ -517,11 +547,15 @@ void run_devmem_tests(void) /* Closing the netlink socket does an implicit unbind */ ynl_sock_destroy(ys); + + provider->free(mem); } int main(int argc, char *argv[]) { + struct memory_buffer *mem; int is_server = 0, opt; + int ret; while ((opt = getopt(argc, argv, "ls:c:p:v:q:t:f:")) != -1) { switch (opt) { @@ -562,8 +596,9 @@ int main(int argc, char *argv[]) run_devmem_tests(); - if (is_server) - return do_server(); + mem = provider->alloc(getpagesize() * NUM_PAGES); + ret = is_server ? do_server(mem) : 1; + provider->free(mem); - return 0; + return ret; } From 0736ab7867884fc5aef1b2d37e176f75a6c9d0f9 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 09:54:01 +0000 Subject: [PATCH 15/34] BACKPORT: selftests: ncdevmem: Unify error handling There is a bunch of places where error() calls look out of place. Use the same error(1, errno, ...) pattern everywhere. 
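For reference, a couple of lines showing the intended pattern; error(3) prints the message, appends strerror(errnum) when errnum is non-zero, and exits the process with the given status:

    ret = bind(socket_fd, &server_sin, sizeof(server_sin));
    if (ret)
            error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX);

    /* for purely logical failures, pass 0 so no errno text is appended */
    if (!is_devmem)
            error(1, 0, "flow steering error\n");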
Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 3e7ef2eedd60b4..4733d1a0aab5de 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -339,33 +339,33 @@ int do_server(struct memory_buffer *mem) server_sin.sin_port = htons(atoi(port)); ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); - if (socket < 0) - error(79, 0, "%s: [FAIL, create socket]\n", TEST_PREFIX); + if (ret < 0) + error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); - if (socket < 0) - error(errno, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); + if (socket_fd < 0) + error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); if (ret) - error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); + error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); if (ret) - error(errno, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); + error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); fprintf(stderr, "binding to address %s:%d\n", server_ip, ntohs(server_sin.sin_port)); ret = bind(socket_fd, &server_sin, sizeof(server_sin)); if (ret) - error(errno, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); + error(1, errno, "%s: [FAIL, bind]\n", TEST_PREFIX); ret = listen(socket_fd, 1); if (ret) - error(errno, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); + error(1, errno, "%s: [FAIL, listen]\n", TEST_PREFIX); client_addr_len = sizeof(client_addr); From 67a1f624305fb59a64f8b3cfade57a9b88d56349 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 09:57:51 +0000 Subject: [PATCH 16/34] BACKPORT: selftests: ncdevmem: Make client_ip optional Support 3-tuple filtering by making client_ip optional. When -c is not passed, don't specify src-ip/src-port in the filter. Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 4733d1a0aab5de..faa9dce121c727 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -62,7 +62,7 @@ */ static char *server_ip = "192.168.1.4"; -static char *client_ip = "192.168.1.2"; +static char *client_ip; static char *port = "5201"; static size_t do_validation; static int start_queue = 8; @@ -236,8 +236,14 @@ static int configure_channels(unsigned int rx, unsigned int tx) static int configure_flow_steering(void) { - return run_command("sudo ethtool -N %s flow-type tcp4 src-ip %s dst-ip %s src-port %s dst-port %s queue %d >&2", - ifname, client_ip, server_ip, port, port, start_queue); + return run_command("sudo ethtool -N %s flow-type tcp4 %s %s dst-ip %s %s %s dst-port %s queue %d >&2", + ifname, + client_ip ? "src-ip" : "", + client_ip ?: "", + server_ip, + client_ip ? "src-port" : "", + client_ip ? 
port : "", + port, start_queue); } static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, From 379da3ee37b0890c7e0dc913ecf597f9667c9013 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 09:59:19 +0000 Subject: [PATCH 17/34] BACKPORT: selftests: ncdevmem: Remove default arguments To make it clear what's required and what's not. Also, some of the values don't seem like a good defaults; for example eth1. Move the invocation comment to the top, add missing -s to the client and cleanup the client invocation a bit to make more readable. Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 61 ++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index faa9dce121c727..0feeca56c04973 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -1,4 +1,31 @@ // SPDX-License-Identifier: GPL-2.0 +/* + * tcpdevmem netcat. Works similarly to netcat but does device memory TCP + * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. + * + * Usage: + * + * On server: + * ncdevmem -s [-c ] -f eth1 -l -p 5201 + * + * On client: + * echo -n "hello\nworld" | nc -s 5201 -p 5201 + * + * Test data validation: + * + * On server: + * ncdevmem -s [-c ] -f eth1 -l -p 5201 -v 7 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ + * tr \\n \\0 | \ + * head -c 5G | \ + * nc 5201 -p 5201 + * + * + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. + */ #define _GNU_SOURCE #define __EXPORTED_HEADERS__ @@ -42,32 +69,13 @@ #define MSG_SOCK_DEVMEM 0x2000000 #endif -/* - * tcpdevmem netcat. Works similarly to netcat but does device memory TCP - * instead of regular TCP. Uses udmabuf to mock a dmabuf provider. - * - * Usage: - * - * On server: - * ncdevmem -s -c -f eth1 -l -p 5201 -v 7 - * - * On client: - * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ - * tr \\n \\0 | \ - * head -c 5G | \ - * nc 5201 -p 5201 - * - * Note this is compatible with regular netcat. i.e. the sender or receiver can - * be replaced with regular netcat to test the RX or TX path in isolation. - */ - -static char *server_ip = "192.168.1.4"; +static char *server_ip; static char *client_ip; -static char *port = "5201"; +static char *port; static size_t do_validation; static int start_queue = 8; static int num_queues = 8; -static char *ifname = "eth1"; +static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; @@ -595,6 +603,15 @@ int main(int argc, char *argv[]) } } + if (!server_ip) + error(1, 0, "Missing -s argument\n"); + + if (!port) + error(1, 0, "Missing -p argument\n"); + + if (!ifname) + error(1, 0, "Missing -f argument\n"); + ifindex = if_nametoindex(ifname); for (; optind < argc; optind++) From e6619f042f3ab2f3bf34a73b51ed2e993261ba3e Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:01:01 +0000 Subject: [PATCH 18/34] BACKPORT: selftests: ncdevmem: Switch to AF_INET6 Use dualstack socket to support both v4 and v6. v4-mapped-v6 address can be used to do v4. 
Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 97 ++++++++++++++++++-------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 0feeca56c04973..645ef0bb63ec2f 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -242,13 +242,26 @@ static int configure_channels(unsigned int rx, unsigned int tx) return run_command("sudo ethtool -L %s rx %u tx %u", ifname, rx, tx); } -static int configure_flow_steering(void) +static int configure_flow_steering(struct sockaddr_in6 *server_sin) { - return run_command("sudo ethtool -N %s flow-type tcp4 %s %s dst-ip %s %s %s dst-port %s queue %d >&2", + const char *type = "tcp6"; + const char *server_addr; + char buf[40]; + + inet_ntop(AF_INET6, &server_sin->sin6_addr, buf, sizeof(buf)); + server_addr = buf; + + if (IN6_IS_ADDR_V4MAPPED(&server_sin->sin6_addr)) { + type = "tcp4"; + server_addr = strrchr(server_addr, ':') + 1; + } + + return run_command("sudo ethtool -N %s flow-type %s %s %s dst-ip %s %s %s dst-port %s queue %d >&2", ifname, + type, client_ip ? "src-ip" : "", client_ip ?: "", - server_ip, + server_addr, client_ip ? "src-port" : "", client_ip ? port : "", port, start_queue); @@ -299,13 +312,51 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, return -1; } +static void enable_reuseaddr(int fd) +{ + int opt = 1; + int ret; + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)); + if (ret) + error(1, errno, "%s: [FAIL, SO_REUSEPORT]\n", TEST_PREFIX); + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + if (ret) + error(1, errno, "%s: [FAIL, SO_REUSEADDR]\n", TEST_PREFIX); +} + +static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) +{ + int ret; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + + ret = inet_pton(sin6->sin6_family, str, &sin6->sin6_addr); + if (ret != 1) { + /* fallback to plain IPv4 */ + ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]); + if (ret != 1) + return -1; + + /* add ::ffff prefix */ + sin6->sin6_addr.s6_addr32[0] = 0; + sin6->sin6_addr.s6_addr32[1] = 0; + sin6->sin6_addr.s6_addr16[4] = 0; + sin6->sin6_addr.s6_addr16[5] = 0xffff; + } + + return 0; +} + int do_server(struct memory_buffer *mem) { char ctrl_data[sizeof(int) * 20000]; struct netdev_queue_id *queues; size_t non_page_aligned_frags = 0; - struct sockaddr_in client_addr; - struct sockaddr_in server_sin; + struct sockaddr_in6 client_addr; + struct sockaddr_in6 server_sin; size_t page_aligned_frags = 0; size_t total_received = 0; socklen_t client_addr_len; @@ -317,9 +368,12 @@ int do_server(struct memory_buffer *mem) int socket_fd; int client_fd; size_t i = 0; - int opt = 1; int ret; + ret = parse_address(server_ip, atoi(port), &server_sin); + if (ret < 0) + error(1, 0, "parse server address"); + if (reset_flow_steering()) error(1, 0, "Failed to reset flow steering\n"); @@ -328,7 +382,7 @@ int do_server(struct memory_buffer *mem) error(1, 0, "Failed to configure rss\n"); /* Flow steer our devmem flows to start_queue */ - if (configure_flow_steering()) + if (configure_flow_steering(&server_sin)) error(1, 0, "Failed to configure flow steering\n"); sleep(1); @@ -349,29 +403,14 @@ int do_server(struct memory_buffer *mem) if (!tmp_mem) error(1, ENOMEM, "malloc failed"); - server_sin.sin_family = AF_INET; 
- server_sin.sin_port = htons(atoi(port)); - - ret = inet_pton(server_sin.sin_family, server_ip, &server_sin.sin_addr); - if (ret < 0) - error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); - - socket_fd = socket(server_sin.sin_family, SOCK_STREAM, 0); + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); if (socket_fd < 0) error(1, errno, "%s: [FAIL, create socket]\n", TEST_PREFIX); - ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEPORT, &opt, - sizeof(opt)); - if (ret) - error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); - - ret = setsockopt(socket_fd, SOL_SOCKET, SO_REUSEADDR, &opt, - sizeof(opt)); - if (ret) - error(1, errno, "%s: [FAIL, set sock opt]\n", TEST_PREFIX); + enable_reuseaddr(socket_fd); fprintf(stderr, "binding to address %s:%d\n", server_ip, - ntohs(server_sin.sin_port)); + ntohs(server_sin.sin6_port)); ret = bind(socket_fd, &server_sin, sizeof(server_sin)); if (ret) @@ -383,16 +422,16 @@ int do_server(struct memory_buffer *mem) client_addr_len = sizeof(client_addr); - inet_ntop(server_sin.sin_family, &server_sin.sin_addr, buffer, + inet_ntop(AF_INET6, &server_sin.sin6_addr, buffer, sizeof(buffer)); fprintf(stderr, "Waiting or connection on %s:%d\n", buffer, - ntohs(server_sin.sin_port)); + ntohs(server_sin.sin6_port)); client_fd = accept(socket_fd, &client_addr, &client_addr_len); - inet_ntop(client_addr.sin_family, &client_addr.sin_addr, buffer, + inet_ntop(AF_INET6, &client_addr.sin6_addr, buffer, sizeof(buffer)); fprintf(stderr, "Got connection from %s:%d\n", buffer, - ntohs(client_addr.sin_port)); + ntohs(client_addr.sin6_port)); while (1) { struct iovec iov = { .iov_base = iobuf, From c0e9efc1a87be20b07502c0e2782376b21437f82 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:02:03 +0000 Subject: [PATCH 19/34] BACKPORT: selftests: ncdevmem: Properly reset flow steering ntuple off/on might be not enough to do it on all NICs. Add a bunch of shell crap to explicitly remove the rules. Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 645ef0bb63ec2f..ad6de8e0e97bcb 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -217,13 +217,18 @@ void validate_buffer(void *line, size_t size) static int reset_flow_steering(void) { - int ret = 0; - - ret = run_command("sudo ethtool -K %s ntuple off >&2", ifname); - if (ret) - return ret; - - return run_command("sudo ethtool -K %s ntuple on >&2", ifname); + /* Depending on the NIC, toggling ntuple off and on might not + * be allowed. Additionally, attempting to delete existing filters + * will fail if no filters are present. Therefore, do not enforce + * the exit status. + */ + + run_command("sudo ethtool -K %s ntuple off >&2", ifname); + run_command("sudo ethtool -K %s ntuple on >&2", ifname); + run_command( + "sudo ethtool -n %s | grep 'Filter:' | awk '{print $2}' | xargs -n1 ethtool -N %s delete >&2", + ifname, ifname); + return 0; } static int configure_headersplit(bool on) From 9a93c8882e90ee0be20408957dd3cd8e1352e5c7 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:03:40 +0000 Subject: [PATCH 20/34] BACKPORT: selftests: ncdevmem: Use YNL to enable TCP header split In the next patch the hard-coded queue numbers are gonna be removed. 
So introduce some initial support for ethtool YNL and use it to enable header split. Also, tcp-data-split requires latest ethtool which is unlikely to be present in the distros right now. (ideally, we should not shell out to ethtool at all). Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/ncdevmem.c | 57 +++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2c4b6e404a7c7f..edcca4c3f21e23 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -114,7 +114,7 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk # YNL build -YNL_GENS := netdev +YNL_GENS := ethtool netdev include ynl.mk $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index ad6de8e0e97bcb..9ca2da3a2f63bc 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -55,10 +55,12 @@ #include #include #include +#include #include #include #include "netdev-user.h" +#include "ethtool-user.h" #include #define PAGE_SHIFT 12 @@ -231,10 +233,58 @@ static int reset_flow_steering(void) return 0; } +static const char *tcp_data_split_str(int val) +{ + switch (val) { + case 0: + return "off"; + case 1: + return "auto"; + case 2: + return "on"; + default: + return "?"; + } +} + static int configure_headersplit(bool on) { - return run_command("sudo ethtool -G %s tcp-data-split %s >&2", ifname, - on ? "on" : "off"); + struct ethtool_rings_get_req *get_req; + struct ethtool_rings_get_rsp *get_rsp; + struct ethtool_rings_set_req *req; + struct ynl_error yerr; + struct ynl_sock *ys; + int ret; + + ys = ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = ethtool_rings_set_req_alloc(); + ethtool_rings_set_req_set_header_dev_index(req, ifindex); + /* 0 - off, 1 - auto, 2 - on */ + ethtool_rings_set_req_set_tcp_data_split(req, on ? 2 : 0); + ret = ethtool_rings_set(ys, req); + if (ret < 0) + fprintf(stderr, "YNL failed: %s\n", ys->err.msg); + ethtool_rings_set_req_free(req); + + if (ret == 0) { + get_req = ethtool_rings_get_req_alloc(); + ethtool_rings_get_req_set_header_dev_index(get_req, ifindex); + get_rsp = ethtool_rings_get(ys, get_req); + ethtool_rings_get_req_free(get_req); + if (get_rsp) + fprintf(stderr, "TCP header split: %s\n", + tcp_data_split_str(get_rsp->tcp_data_split)); + ethtool_rings_get_rsp_free(get_rsp); + } + + ynl_sock_destroy(ys); + + return ret; } static int configure_rss(void) @@ -382,6 +432,9 @@ int do_server(struct memory_buffer *mem) if (reset_flow_steering()) error(1, 0, "Failed to reset flow steering\n"); + if (configure_headersplit(1)) + error(1, 0, "Failed to enable TCP header split\n"); + /* Configure RSS to divert all traffic from our devmem queues */ if (configure_rss()) error(1, 0, "Failed to configure rss\n"); From 37d688260829c228fd1511ef252ddff5f6cc6956 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:04:39 +0000 Subject: [PATCH 21/34] BACKPORT: selftests: ncdevmem: Remove hard-coded queue numbers Use single last queue of the device and probe it dynamically. 
Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 40 ++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 9ca2da3a2f63bc..1ea62c129ddc25 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -75,8 +75,8 @@ static char *server_ip; static char *client_ip; static char *port; static size_t do_validation; -static int start_queue = 8; -static int num_queues = 8; +static int start_queue = -1; +static int num_queues = 1; static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; @@ -208,6 +208,33 @@ void validate_buffer(void *line, size_t size) fprintf(stdout, "Validated buffer\n"); } +static int rxq_num(int ifindex) +{ + struct ethtool_channels_get_req *req; + struct ethtool_channels_get_rsp *rsp; + struct ynl_error yerr; + struct ynl_sock *ys; + int num = -1; + + ys = ynl_sock_create(&ynl_ethtool_family, &yerr); + if (!ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = ethtool_channels_get_req_alloc(); + ethtool_channels_get_req_set_header_dev_index(req, ifindex); + rsp = ethtool_channels_get(ys, req); + if (rsp) + num = rsp->rx_count + rsp->combined_count; + ethtool_channels_get_req_free(req); + ethtool_channels_get_rsp_free(rsp); + + ynl_sock_destroy(ys); + + return num; +} + #define run_command(cmd, ...) \ ({ \ char command[256]; \ @@ -711,6 +738,15 @@ int main(int argc, char *argv[]) ifindex = if_nametoindex(ifname); + if (start_queue < 0) { + start_queue = rxq_num(ifindex) - 1; + + if (start_queue < 0) + error(1, 0, "couldn't detect number of queues\n"); + + fprintf(stderr, "using queues %d..%d\n", start_queue, start_queue + num_queues); + } + for (; optind < argc; optind++) fprintf(stderr, "extra arguments: %s\n", argv[optind]); From 6f19d8076ee551a620411761ffe242ee056e71fd Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:06:08 +0000 Subject: [PATCH 22/34] BACKPORT: selftests: ncdevmem: Run selftest when none of the -s or -c has been provided This will be used as a 'probe' mode in the selftest to check whether the device supports the devmem or not. Use hard-coded queue layout (two last queues) and prevent user from passing custom -q and/or -t. 
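With illustrative numbers, the queue layout the hunks below end up with:

    /* probe mode (no -s/-c): bind the upper half of the rx queues,
     * e.g. rxq_num() == 16  ->  start_queue = 8, num_queues = 8
     */
    num_queues = rxq_num(ifindex);
    start_queue = num_queues / 2;
    num_queues /= 2;

    /* server mode keeps defaulting to the single last queue,
     * e.g. rxq_num() == 16  ->  start_queue = 15, num_queues = 1
     */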
Reviewed-by: Mina Almasry Reviewed-by: Joe Damato Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/net/ncdevmem.c | 49 ++++++++++++++++++++------ 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/net/ncdevmem.c index 1ea62c129ddc25..8e502a1f8f9bc1 100644 --- a/tools/testing/selftests/net/ncdevmem.c +++ b/tools/testing/selftests/net/ncdevmem.c @@ -76,7 +76,7 @@ static char *client_ip; static char *port; static size_t do_validation; static int start_queue = -1; -static int num_queues = 1; +static int num_queues = -1; static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; @@ -727,19 +727,38 @@ int main(int argc, char *argv[]) } } - if (!server_ip) - error(1, 0, "Missing -s argument\n"); - - if (!port) - error(1, 0, "Missing -p argument\n"); - if (!ifname) error(1, 0, "Missing -f argument\n"); ifindex = if_nametoindex(ifname); - if (start_queue < 0) { - start_queue = rxq_num(ifindex) - 1; + if (!server_ip && !client_ip) { + if (start_queue < 0 && num_queues < 0) { + num_queues = rxq_num(ifindex); + if (num_queues < 0) + error(1, 0, "couldn't detect number of queues\n"); + if (num_queues < 2) + error(1, 0, + "number of device queues is too low\n"); + /* make sure can bind to multiple queues */ + start_queue = num_queues / 2; + num_queues /= 2; + } + + if (start_queue < 0 || num_queues < 0) + error(1, 0, "Both -t and -q are required\n"); + + run_devmem_tests(); + return 0; + } + + if (start_queue < 0 && num_queues < 0) { + num_queues = rxq_num(ifindex); + if (num_queues < 2) + error(1, 0, "number of device queues is too low\n"); + + num_queues = 1; + start_queue = rxq_num(ifindex) - num_queues; if (start_queue < 0) error(1, 0, "couldn't detect number of queues\n"); @@ -750,7 +769,17 @@ int main(int argc, char *argv[]) for (; optind < argc; optind++) fprintf(stderr, "extra arguments: %s\n", argv[optind]); - run_devmem_tests(); + if (start_queue < 0) + error(1, 0, "Missing -t argument\n"); + + if (num_queues < 0) + error(1, 0, "Missing -q argument\n"); + + if (!server_ip) + error(1, 0, "Missing -s argument\n"); + + if (!port) + error(1, 0, "Missing -p argument\n"); mem = provider->alloc(getpagesize() * NUM_PAGES); ret = is_server ? do_server(mem) : 1; From 29a372b98a886951ca7effd6e354e457b058b549 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:18:46 +0000 Subject: [PATCH 23/34] BACKPORT: selftests: ncdevmem: Move ncdevmem under drivers/net/hw This is where all the tests that depend on the HW functionality live in and this is where the automated test is gonna be added in the next patch. 
Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/drivers/net/hw/.gitignore | 1 + tools/testing/selftests/drivers/net/hw/Makefile | 8 ++++++++ .../testing/selftests/{net => drivers/net/hw}/ncdevmem.c | 0 tools/testing/selftests/net/.gitignore | 1 - tools/testing/selftests/net/Makefile | 6 ------ 5 files changed, 9 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/drivers/net/hw/.gitignore rename tools/testing/selftests/{net => drivers/net/hw}/ncdevmem.c (100%) diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore new file mode 100644 index 00000000000000..e9fe6ede681ab8 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -0,0 +1 @@ +ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index c9f2f48fc30f4d..182348f4bd4001 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -26,4 +26,12 @@ TEST_INCLUDES := \ ../../../net/forwarding/tc_common.sh \ # +# YNL files, must be before "include ..lib.mk" +YNL_GEN_FILES := ncdevmem +TEST_GEN_FILES += $(YNL_GEN_FILES) + include ../../../lib.mk + +# YNL build +YNL_GENS := ethtool netdev +include ../../../net/ynl.mk diff --git a/tools/testing/selftests/net/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c similarity index 100% rename from tools/testing/selftests/net/ncdevmem.c rename to tools/testing/selftests/drivers/net/hw/ncdevmem.c diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 59fe07ee2df9d9..cec3e1296aa271 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -18,7 +18,6 @@ ipv6_flowlabel_mgr log.txt msg_oob msg_zerocopy -ncdevmem netlink-dumps nettest psock_fanout diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index edcca4c3f21e23..0466ce43409fa2 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -101,8 +101,6 @@ TEST_PROGS += ipv6_route_update_soft_lockup.sh # YNL files, must be before "include ..lib.mk" EXTRA_CLEAN += $(OUTPUT)/libynl.a -YNL_GEN_FILES := ncdevmem -TEST_GEN_FILES += $(YNL_GEN_FILES) TEST_FILES := settings TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh @@ -113,10 +111,6 @@ TEST_INCLUDES := forwarding/lib.sh include ../lib.mk -# YNL build -YNL_GENS := ethtool netdev -include ynl.mk - $(OUTPUT)/epoll_busy_poll: LDLIBS += -lcap $(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma $(OUTPUT)/tcp_mmap: LDLIBS += -lpthread -lcrypto From 79da72d4890260bd81abfd179caeeb639ae1ceb1 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 10:21:21 +0000 Subject: [PATCH 24/34] BACKPORT: selftests: ncdevmem: Add automated test Only RX side for now and small message to test the setup. In the future, we can extend it to TX side and to testing both sides with a couple of megs of data. 
make \ -C tools/testing/selftests \ TARGETS="drivers/hw/net" \ install INSTALL_PATH=~/tmp/ksft scp ~/tmp/ksft ${HOST}: scp ~/tmp/ksft ${PEER}: cfg+="NETIF=${DEV}\n" cfg+="LOCAL_V6=${HOST_IP}\n" cfg+="REMOTE_V6=${PEER_IP}\n" cfg+="REMOTE_TYPE=ssh\n" cfg+="REMOTE_ARGS=root@${PEER}\n" echo -e "$cfg" | ssh root@${HOST} "cat > ksft/drivers/net/net.config" ssh root@${HOST} "cd ksft && ./run_kselftest.sh -t drivers/net:devmem.py" Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- .../testing/selftests/drivers/net/hw/Makefile | 1 + .../selftests/drivers/net/hw/devmem.py | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/hw/devmem.py diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index 182348f4bd4001..1c6a77480923c4 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -3,6 +3,7 @@ TEST_PROGS = \ csum.py \ devlink_port_split.py \ + devmem.py \ ethtool.sh \ ethtool_extended_state.sh \ ethtool_mm.sh \ diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py new file mode 100755 index 00000000000000..1223f0f5c10c34 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +from lib.py import ksft_run, ksft_exit +from lib.py import ksft_eq, KsftSkipEx +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, rand_port, wait_port_listen +from lib.py import ksft_disruptive + + +def require_devmem(cfg): + if not hasattr(cfg, "_devmem_probed"): + port = rand_port() + probe_command = f"./ncdevmem -f {cfg.ifname}" + cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 + cfg._devmem_probed = True + + if not cfg._devmem_supported: + raise KsftSkipEx("Test requires devmem support") + + +@ksft_disruptive +def check_rx(cfg) -> None: + cfg.require_v6() + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.v6} -p {port}" + + with bkg(listen_cmd) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| socat -u - TCP6:[{cfg.v6}]:{port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + ksft_run([check_rx], + args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() From 6bc9789602dfdc90f788c40f248e027c4647e08d Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 11:09:59 +0000 Subject: [PATCH 25/34] BACKPORT: ncdevmem doesn't need libmnl, remove the unnecessary include Since YNL doesn't depend on libmnl either, any more, it's actually possible to build selftests without having libmnl installed. 
Note: This is required to let the ncdevmem test to build for 6.12 Signed-off-by: Jakub Kicinski Signed-off-by: Pranjal Shrivastava --- tools/testing/selftests/drivers/net/hw/ncdevmem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index 8e502a1f8f9bc1..a63b2bfd295f16 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include From 9abf1452fc77e7899562389f97c300607828ca5a Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 11:37:19 +0000 Subject: [PATCH 26/34] BACKPORT: selftests: ncdevmem: Implement devmem TCP TX Add support for devmem TX in ncdevmem. This is a combination of the ncdevmem from the devmem TCP series RFCv1 which included the TX path, and work by Stan to include the netlink API and refactored on top of his generic memory_provider support. Signed-off-by: Mina Almasry Signed-off-by: Stanislav Fomichev Acked-by: Stanislav Fomichev Signed-off-by: Pranjal Shrivastava --- .../selftests/drivers/net/hw/devmem.py | 24 +- .../selftests/drivers/net/hw/ncdevmem.c | 299 +++++++++++++++++- 2 files changed, 308 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/drivers/net/hw/devmem.py b/tools/testing/selftests/drivers/net/hw/devmem.py index 1223f0f5c10c34..39696aaf219d47 100755 --- a/tools/testing/selftests/drivers/net/hw/devmem.py +++ b/tools/testing/selftests/drivers/net/hw/devmem.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: GPL-2.0 +from os import path from lib.py import ksft_run, ksft_exit from lib.py import ksft_eq, KsftSkipEx from lib.py import NetDrvEpEnv @@ -10,8 +11,7 @@ def require_devmem(cfg): if not hasattr(cfg, "_devmem_probed"): - port = rand_port() - probe_command = f"./ncdevmem -f {cfg.ifname}" + probe_command = f"{cfg.bin_local} -f {cfg.ifname}" cfg._devmem_supported = cmd(probe_command, fail=False, shell=True).ret == 0 cfg._devmem_probed = True @@ -25,7 +25,7 @@ def check_rx(cfg) -> None: require_devmem(cfg) port = rand_port() - listen_cmd = f"./ncdevmem -l -f {cfg.ifname} -s {cfg.v6} -p {port}" + listen_cmd = f"{cfg.bin_local} -l -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}" with bkg(listen_cmd) as socat: wait_port_listen(port) @@ -33,10 +33,26 @@ def check_rx(cfg) -> None: ksft_eq(socat.stdout.strip(), "hello\nworld") +@ksft_disruptive +def check_tx(cfg) -> None: + cfg.require_ipver("6") + require_devmem(cfg) + + port = rand_port() + listen_cmd = f"socat -U - TCP6-LISTEN:{port}" + + with bkg(listen_cmd, exit_wait=True) as socat: + wait_port_listen(port) + cmd(f"echo -e \"hello\\nworld\"| {cfg.bin_remote} -f {cfg.ifname} -s {cfg.addr_v['6']} -p {port}", host=cfg.remote, shell=True) + + ksft_eq(socat.stdout.strip(), "hello\nworld") def main() -> None: with NetDrvEpEnv(__file__) as cfg: - ksft_run([check_rx], + cfg.bin_local = path.abspath(path.dirname(__file__) + "/ncdevmem") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run([check_rx, check_tx], args=(cfg, )) ksft_exit() diff --git a/tools/testing/selftests/drivers/net/hw/ncdevmem.c b/tools/testing/selftests/drivers/net/hw/ncdevmem.c index a63b2bfd295f16..fe8a5ceaa98038 100644 --- a/tools/testing/selftests/drivers/net/hw/ncdevmem.c +++ b/tools/testing/selftests/drivers/net/hw/ncdevmem.c @@ -9,22 +9,30 @@ * ncdevmem -s [-c ] -f eth1 -l -p 5201 * * On client: - * echo -n "hello\nworld" | nc -s 5201 -p 
5201 + * echo -n "hello\nworld" | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 + * Note this is compatible with regular netcat. i.e. the sender or receiver can + * be replaced with regular netcat to test the RX or TX path in isolation. * - * Test data validation: + * Test data validation (devmem TCP on RX only): * * On server: * ncdevmem -s [-c ] -f eth1 -l -p 5201 -v 7 * * On client: * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06) | \ - * tr \\n \\0 | \ - * head -c 5G | \ + * head -c 1G | \ * nc 5201 -p 5201 * + * Test data validation (devmem TCP on RX and TX, validation happens on RX): * - * Note this is compatible with regular netcat. i.e. the sender or receiver can - * be replaced with regular netcat to test the RX or TX path in isolation. + * On server: + * ncdevmem -s [-c ] -l -p 5201 -v 8 -f eth1 + * + * On client: + * yes $(echo -e \\x01\\x02\\x03\\x04\\x05\\x06\\x07) | \ + * head -c 1M | \ + * ncdevmem -s [-c ] -p 5201 -f eth1 */ #define _GNU_SOURCE #define __EXPORTED_HEADERS__ @@ -40,15 +48,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -79,6 +90,8 @@ static int num_queues = -1; static char *ifname; static unsigned int ifindex; static unsigned int dmabuf_id; +static uint32_t tx_dmabuf_id; +static int waittime_ms = 500; struct memory_buffer { int fd; @@ -92,6 +105,8 @@ struct memory_buffer { struct memory_provider { struct memory_buffer *(*alloc)(size_t size); void (*free)(struct memory_buffer *ctx); + void (*memcpy_to_device)(struct memory_buffer *dst, size_t off, + void *src, int n); void (*memcpy_from_device)(void *dst, struct memory_buffer *src, size_t off, int n); }; @@ -152,6 +167,20 @@ static void udmabuf_free(struct memory_buffer *ctx) free(ctx); } +static void udmabuf_memcpy_to_device(struct memory_buffer *dst, size_t off, + void *src, int n) +{ + struct dma_buf_sync sync = {}; + + sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); + + memcpy(dst->buf_mem + off, src, n); + + sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE; + ioctl(dst->fd, DMA_BUF_IOCTL_SYNC, &sync); +} + static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, size_t off, int n) { @@ -169,6 +198,7 @@ static void udmabuf_memcpy_from_device(void *dst, struct memory_buffer *src, static struct memory_provider udmabuf_memory_provider = { .alloc = udmabuf_alloc, .free = udmabuf_free, + .memcpy_to_device = udmabuf_memcpy_to_device, .memcpy_from_device = udmabuf_memcpy_from_device, }; @@ -187,14 +217,16 @@ void validate_buffer(void *line, size_t size) { static unsigned char seed = 1; unsigned char *ptr = line; - int errors = 0; + unsigned char expected; + static int errors; size_t i; for (i = 0; i < size; i++) { - if (ptr[i] != seed) { + expected = seed ? 
seed : '\n'; + if (ptr[i] != expected) { fprintf(stderr, "Failed validation: expected=%u, actual=%u, index=%lu\n", - seed, ptr[i], i); + expected, ptr[i], i); errors++; if (errors > 20) error(1, 0, "validation failed."); @@ -393,6 +425,49 @@ static int bind_rx_queue(unsigned int ifindex, unsigned int dmabuf_fd, return -1; } +static int bind_tx_queue(unsigned int ifindex, unsigned int dmabuf_fd, + struct ynl_sock **ys) +{ + struct netdev_bind_tx_req *req = NULL; + struct netdev_bind_tx_rsp *rsp = NULL; + struct ynl_error yerr; + + *ys = ynl_sock_create(&ynl_netdev_family, &yerr); + if (!*ys) { + fprintf(stderr, "YNL: %s\n", yerr.msg); + return -1; + } + + req = netdev_bind_tx_req_alloc(); + netdev_bind_tx_req_set_ifindex(req, ifindex); + netdev_bind_tx_req_set_fd(req, dmabuf_fd); + + rsp = netdev_bind_tx(*ys, req); + if (!rsp) { + perror("netdev_bind_tx"); + goto err_close; + } + + if (!rsp->_present.id) { + perror("id not present"); + goto err_close; + } + + fprintf(stderr, "got tx dmabuf id=%d\n", rsp->id); + tx_dmabuf_id = rsp->id; + + netdev_bind_tx_req_free(req); + netdev_bind_tx_rsp_free(rsp); + + return 0; + +err_close: + fprintf(stderr, "YNL failed: %s\n", (*ys)->err.msg); + netdev_bind_tx_req_free(req); + ynl_sock_destroy(*ys); + return -1; +} + static void enable_reuseaddr(int fd) { int opt = 1; @@ -431,7 +506,7 @@ static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) return 0; } -int do_server(struct memory_buffer *mem) +static int do_server(struct memory_buffer *mem) { char ctrl_data[sizeof(int) * 20000]; struct netdev_queue_id *queues; @@ -688,6 +763,206 @@ void run_devmem_tests(void) provider->free(mem); } +static uint64_t gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000ULL) + (tv.tv_usec / 1000ULL); +} + +static int do_poll(int fd) +{ + struct pollfd pfd; + int ret; + + pfd.revents = 0; + pfd.fd = fd; + + ret = poll(&pfd, 1, waittime_ms); + if (ret == -1) + error(1, errno, "poll"); + + return ret && (pfd.revents & POLLERR); +} + +static void wait_compl(int fd) +{ + int64_t tstop = gettimeofday_ms() + waittime_ms; + char control[CMSG_SPACE(100)] = {}; + struct sock_extended_err *serr; + struct msghdr msg = {}; + struct cmsghdr *cm; + __u32 hi, lo; + int ret; + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + while (gettimeofday_ms() < tstop) { + if (!do_poll(fd)) + continue; + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret < 0) { + if (errno == EAGAIN) + continue; + error(1, errno, "recvmsg(MSG_ERRQUEUE)"); + return; + } + if (msg.msg_flags & MSG_CTRUNC) + error(1, 0, "MSG_CTRUNC\n"); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + if (cm->cmsg_level != SOL_IP && + cm->cmsg_level != SOL_IPV6) + continue; + if (cm->cmsg_level == SOL_IP && + cm->cmsg_type != IP_RECVERR) + continue; + if (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type != IPV6_RECVERR) + continue; + + serr = (void *)CMSG_DATA(cm); + if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY) + error(1, 0, "wrong origin %u", serr->ee_origin); + if (serr->ee_errno != 0) + error(1, 0, "wrong errno %d", serr->ee_errno); + + hi = serr->ee_data; + lo = serr->ee_info; + + fprintf(stderr, "tx complete [%d,%d]\n", lo, hi); + return; + } + } + + error(1, 0, "did not receive tx completion"); +} + +static int do_client(struct memory_buffer *mem) +{ + char ctrl_data[CMSG_SPACE(sizeof(__u32))]; + struct sockaddr_in6 server_sin; + struct sockaddr_in6 client_sin; + struct ynl_sock *ys = NULL; + struct msghdr msg = {}; 
+ ssize_t line_size = 0; + struct cmsghdr *cmsg; + struct iovec iov[2]; + char *line = NULL; + unsigned long mid; + size_t len = 0; + int socket_fd; + __u32 ddmabuf; + int opt = 1; + int ret; + + ret = parse_address(server_ip, atoi(port), &server_sin); + if (ret < 0) + error(1, 0, "parse server address"); + + socket_fd = socket(AF_INET6, SOCK_STREAM, 0); + if (socket_fd < 0) + error(1, socket_fd, "create socket"); + + enable_reuseaddr(socket_fd); + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, + strlen(ifname) + 1); + if (ret) + error(1, errno, "bindtodevice"); + + if (bind_tx_queue(ifindex, mem->fd, &ys)) + error(1, 0, "Failed to bind\n"); + + if (client_ip) { + ret = parse_address(client_ip, atoi(port), &client_sin); + if (ret < 0) + error(1, 0, "parse client address"); + + ret = bind(socket_fd, &client_sin, sizeof(client_sin)); + if (ret) + error(1, errno, "bind"); + } + + ret = setsockopt(socket_fd, SOL_SOCKET, SO_ZEROCOPY, &opt, sizeof(opt)); + if (ret) + error(1, errno, "set sock opt"); + + fprintf(stderr, "Connect to %s %d (via %s)\n", server_ip, + ntohs(server_sin.sin6_port), ifname); + + ret = connect(socket_fd, &server_sin, sizeof(server_sin)); + if (ret) + error(1, errno, "connect"); + + while (1) { + free(line); + line = NULL; + line_size = getline(&line, &len, stdin); + + if (line_size < 0) + break; + + mid = (line_size / 2) + 1; + + iov[0].iov_base = (void *)1; + iov[0].iov_len = mid; + iov[1].iov_base = (void *)(mid + 2); + iov[1].iov_len = line_size - mid; + + provider->memcpy_to_device(mem, (size_t)iov[0].iov_base, line, + iov[0].iov_len); + provider->memcpy_to_device(mem, (size_t)iov[1].iov_base, + line + iov[0].iov_len, + iov[1].iov_len); + + fprintf(stderr, + "read line_size=%ld iov[0].iov_base=%lu, iov[0].iov_len=%lu, iov[1].iov_base=%lu, iov[1].iov_len=%lu\n", + line_size, (unsigned long)iov[0].iov_base, + iov[0].iov_len, (unsigned long)iov[1].iov_base, + iov[1].iov_len); + + msg.msg_iov = iov; + msg.msg_iovlen = 2; + + msg.msg_control = ctrl_data; + msg.msg_controllen = sizeof(ctrl_data); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_DEVMEM_DMABUF; + cmsg->cmsg_len = CMSG_LEN(sizeof(__u32)); + + ddmabuf = tx_dmabuf_id; + + *((__u32 *)CMSG_DATA(cmsg)) = ddmabuf; + + ret = sendmsg(socket_fd, &msg, MSG_ZEROCOPY); + if (ret < 0) + error(1, errno, "Failed sendmsg"); + + fprintf(stderr, "sendmsg_ret=%d\n", ret); + + if (ret != line_size) + error(1, errno, "Did not send all bytes"); + + wait_compl(socket_fd); + } + + fprintf(stderr, "%s: tx ok\n", TEST_PREFIX); + + free(line); + close(socket_fd); + + if (ys) + ynl_sock_destroy(ys); + + return 0; +} + int main(int argc, char *argv[]) { struct memory_buffer *mem; @@ -731,6 +1006,8 @@ int main(int argc, char *argv[]) ifindex = if_nametoindex(ifname); + fprintf(stderr, "using ifindex=%u\n", ifindex); + if (!server_ip && !client_ip) { if (start_queue < 0 && num_queues < 0) { num_queues = rxq_num(ifindex); @@ -781,7 +1058,7 @@ int main(int argc, char *argv[]) error(1, 0, "Missing -p argument\n"); mem = provider->alloc(getpagesize() * NUM_PAGES); - ret = is_server ? do_server(mem) : 1; + ret = is_server ? 
do_server(mem) : do_client(mem); provider->free(mem); return ret; From d81e9664977c277b69250fea7a6b51c53a3187dd Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 12:40:13 +0000 Subject: [PATCH 27/34] BACKPORT: gve: Add RSS cache for non RSS device option scenario Not all the devices have the capability for the driver to query for the registered RSS configuration. The driver can discover this by checking the relevant device option during setup. If it cannot, the driver needs to store the RSS config cache and directly return such cache when queried by the ethtool. RSS config is inited when driver probes. Also the default RSS config will be adjusted when there is RX queue count change. At this point, only keys of GVE_RSS_KEY_SIZE and indirection tables of GVE_RSS_INDIR_SIZE are supported. Signed-off-by: Ziwei Xiao Reviewed-by: Harshitha Ramamurthy Reviewed-by: Willem de Bruijn Signed-off-by: Praveen Kaligineedi Signed-off-by: Jeroen de Borst Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve.h | 16 +++- drivers/net/ethernet/google/gve/gve_adminq.c | 66 +++++++++++--- drivers/net/ethernet/google/gve/gve_ethtool.c | 60 +++++++++++-- drivers/net/ethernet/google/gve/gve_main.c | 86 ++++++++++++++++++- 4 files changed, 205 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 8ddd366d9fde54..242c8b1db310cf 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -65,6 +65,9 @@ #define GVE_FLOW_RULE_IDS_CACHE_SIZE \ (GVE_ADMINQ_BUFFER_SIZE / sizeof(((struct gve_adminq_queried_flow_rule *)0)->location)) +#define GVE_RSS_KEY_SIZE 40 +#define GVE_RSS_INDIR_SIZE 128 + #define GVE_XDP_ACTIONS 5 #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 @@ -666,6 +669,7 @@ struct gve_rx_alloc_rings_cfg { u16 packet_buffer_size; bool raw_addressing; bool enable_header_split; + bool reset_rss; /* Allocated resources are returned here */ struct gve_rx_ring *rx; @@ -716,6 +720,11 @@ struct gve_flow_rules_cache { u32 rule_ids_cache_num; }; +struct gve_rss_config { + u8 *hash_key; + u32 *hash_lut; +}; + struct gve_priv { struct net_device *dev; struct gve_tx_ring *tx; /* array of tx_cfg.num_queues */ @@ -836,6 +845,8 @@ struct gve_priv { u16 rss_key_size; u16 rss_lut_size; + bool cache_rss_config; + struct gve_rss_config rss_config; }; enum gve_service_task_flags_bit { @@ -1184,13 +1195,16 @@ int gve_adjust_config(struct gve_priv *priv, struct gve_rx_alloc_rings_cfg *rx_alloc_cfg); int gve_adjust_queues(struct gve_priv *priv, struct gve_queue_config new_rx_config, - struct gve_queue_config new_tx_config); + struct gve_queue_config new_tx_config, + bool reset_rss); /* flow steering rule */ int gve_get_flow_rule_entry(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_get_flow_rule_ids(struct gve_priv *priv, struct ethtool_rxnfc *cmd, u32 *rule_locs); int gve_add_flow_rule(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_del_flow_rule(struct gve_priv *priv, struct ethtool_rxnfc *cmd); int gve_flow_rules_reset(struct gve_priv *priv); +/* RSS config */ +int gve_init_rss_config(struct gve_priv *priv, u16 num_queues); /* report stats handling */ void gve_handle_report_stats(struct gve_priv *priv); /* exported by ethtool.c */ diff --git a/drivers/net/ethernet/google/gve/gve_adminq.c b/drivers/net/ethernet/google/gve/gve_adminq.c index 060e0e6749380f..b7585a0f4221b1 100644 --- a/drivers/net/ethernet/google/gve/gve_adminq.c +++ 
b/drivers/net/ethernet/google/gve/gve_adminq.c @@ -885,6 +885,15 @@ static void gve_set_default_desc_cnt(struct gve_priv *priv, priv->min_rx_desc_cnt = priv->rx_desc_cnt; } +static void gve_set_default_rss_sizes(struct gve_priv *priv) +{ + if (!gve_is_gqi(priv)) { + priv->rss_key_size = GVE_RSS_KEY_SIZE; + priv->rss_lut_size = GVE_RSS_INDIR_SIZE; + priv->cache_rss_config = true; + } +} + static void gve_enable_supported_features(struct gve_priv *priv, u32 supported_features_mask, const struct gve_device_option_jumbo_frames @@ -968,6 +977,10 @@ static void gve_enable_supported_features(struct gve_priv *priv, be16_to_cpu(dev_op_rss_config->hash_key_size); priv->rss_lut_size = be16_to_cpu(dev_op_rss_config->hash_lut_size); + priv->cache_rss_config = false; + dev_dbg(&priv->pdev->dev, + "RSS device option enabled with key size of %u, lut size of %u.\n", + priv->rss_key_size, priv->rss_lut_size); } } @@ -1052,6 +1065,8 @@ int gve_adminq_describe_device(struct gve_priv *priv) /* set default descriptor counts */ gve_set_default_desc_cnt(priv, descriptor); + gve_set_default_rss_sizes(priv); + /* DQO supports LRO. */ if (!gve_is_gqi(priv)) priv->dev->hw_features |= NETIF_F_LRO; @@ -1290,8 +1305,9 @@ int gve_adminq_reset_flow_rules(struct gve_priv *priv) int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *rxfh) { + const u32 *hash_lut_to_config = NULL; + const u8 *hash_key_to_config = NULL; dma_addr_t lut_bus = 0, key_bus = 0; - u16 key_size = 0, lut_size = 0; union gve_adminq_command cmd; __be32 *lut = NULL; u8 hash_alg = 0; @@ -1301,7 +1317,7 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r switch (rxfh->hfunc) { case ETH_RSS_HASH_NO_CHANGE: - break; + fallthrough; case ETH_RSS_HASH_TOP: hash_alg = ETH_RSS_HASH_TOP; break; @@ -1310,27 +1326,46 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r } if (rxfh->indir) { - lut_size = priv->rss_lut_size; + if (rxfh->indir_size != priv->rss_lut_size) + return -EINVAL; + + hash_lut_to_config = rxfh->indir; + } else if (priv->cache_rss_config) { + hash_lut_to_config = priv->rss_config.hash_lut; + } + + if (hash_lut_to_config) { lut = dma_alloc_coherent(&priv->pdev->dev, - lut_size * sizeof(*lut), + priv->rss_lut_size * sizeof(*lut), &lut_bus, GFP_KERNEL); if (!lut) return -ENOMEM; for (i = 0; i < priv->rss_lut_size; i++) - lut[i] = cpu_to_be32(rxfh->indir[i]); + lut[i] = cpu_to_be32(hash_lut_to_config[i]); } if (rxfh->key) { - key_size = priv->rss_key_size; + if (rxfh->key_size != priv->rss_key_size) { + err = -EINVAL; + goto out; + } + + hash_key_to_config = rxfh->key; + } else if (priv->cache_rss_config) { + hash_key_to_config = priv->rss_config.hash_key; + } + + if (hash_key_to_config) { key = dma_alloc_coherent(&priv->pdev->dev, - key_size, &key_bus, GFP_KERNEL); + priv->rss_key_size, + &key_bus, GFP_KERNEL); if (!key) { err = -ENOMEM; goto out; } - memcpy(key, rxfh->key, key_size); + memcpy(key, hash_key_to_config, priv->rss_key_size); } /* Zero-valued fields in the cmd.configure_rss instruct the device to @@ -1344,8 +1379,10 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r BIT(GVE_RSS_HASH_TCPV6) | BIT(GVE_RSS_HASH_UDPV6)), .hash_alg = hash_alg, - .hash_key_size = cpu_to_be16(key_size), - .hash_lut_size = cpu_to_be16(lut_size), + .hash_key_size = + cpu_to_be16((key_bus) ? priv->rss_key_size : 0), + .hash_lut_size = + cpu_to_be16((lut_bus) ? 
priv->rss_lut_size : 0), .hash_key_addr = cpu_to_be64(key_bus), .hash_lut_addr = cpu_to_be64(lut_bus), }; @@ -1355,11 +1392,11 @@ int gve_adminq_configure_rss(struct gve_priv *priv, struct ethtool_rxfh_param *r out: if (lut) dma_free_coherent(&priv->pdev->dev, - lut_size * sizeof(*lut), + priv->rss_lut_size * sizeof(*lut), lut, lut_bus); if (key) dma_free_coherent(&priv->pdev->dev, - key_size, key, key_bus); + priv->rss_key_size, key, key_bus); return err; } @@ -1463,12 +1500,15 @@ static int gve_adminq_process_rss_query(struct gve_priv *priv, rxfh->hfunc = descriptor->hash_alg; rss_info_addr = (void *)(descriptor + 1); - if (rxfh->key) + if (rxfh->key) { + rxfh->key_size = priv->rss_key_size; memcpy(rxfh->key, rss_info_addr, priv->rss_key_size); + } rss_info_addr += priv->rss_key_size; lut = (__be32 *)rss_info_addr; if (rxfh->indir) { + rxfh->indir_size = priv->rss_lut_size; for (i = 0; i < priv->rss_lut_size; i++) rxfh->indir[i] = be32_to_cpu(lut[i]); } diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c b/drivers/net/ethernet/google/gve/gve_ethtool.c index bdfc6e77b2af56..a572f1e059342e 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -482,6 +482,7 @@ static int gve_set_channels(struct net_device *netdev, struct ethtool_channels old_settings; int new_tx = cmd->tx_count; int new_rx = cmd->rx_count; + bool reset_rss = false; gve_get_channels(netdev, &old_settings); @@ -498,16 +499,14 @@ static int gve_set_channels(struct net_device *netdev, return -EINVAL; } - if (!netif_running(netdev)) { - priv->tx_cfg.num_queues = new_tx; - priv->rx_cfg.num_queues = new_rx; - return 0; - } + if (new_rx != priv->rx_cfg.num_queues && + priv->cache_rss_config && !netif_is_rxfh_configured(netdev)) + reset_rss = true; new_tx_cfg.num_queues = new_tx; new_rx_cfg.num_queues = new_rx; - return gve_adjust_queues(priv, new_rx_cfg, new_tx_cfg); + return gve_adjust_queues(priv, new_rx_cfg, new_tx_cfg, reset_rss); } static void gve_get_ringparam(struct net_device *netdev, @@ -855,6 +854,25 @@ static u32 gve_get_rxfh_indir_size(struct net_device *netdev) return priv->rss_lut_size; } +static void gve_get_rss_config_cache(struct gve_priv *priv, + struct ethtool_rxfh_param *rxfh) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + rxfh->hfunc = ETH_RSS_HASH_TOP; + + if (rxfh->key) { + rxfh->key_size = priv->rss_key_size; + memcpy(rxfh->key, rss_config->hash_key, priv->rss_key_size); + } + + if (rxfh->indir) { + rxfh->indir_size = priv->rss_lut_size; + memcpy(rxfh->indir, rss_config->hash_lut, + priv->rss_lut_size * sizeof(*rxfh->indir)); + } +} + static int gve_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh) { struct gve_priv *priv = netdev_priv(netdev); @@ -862,18 +880,46 @@ static int gve_get_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rx if (!priv->rss_key_size || !priv->rss_lut_size) return -EOPNOTSUPP; + if (priv->cache_rss_config) { + gve_get_rss_config_cache(priv, rxfh); + return 0; + } + return gve_adminq_query_rss_config(priv, rxfh); } +static void gve_set_rss_config_cache(struct gve_priv *priv, + struct ethtool_rxfh_param *rxfh) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + if (rxfh->key) + memcpy(rss_config->hash_key, rxfh->key, priv->rss_key_size); + + if (rxfh->indir) + memcpy(rss_config->hash_lut, rxfh->indir, + priv->rss_lut_size * sizeof(*rxfh->indir)); +} + static int gve_set_rxfh(struct net_device *netdev, struct ethtool_rxfh_param *rxfh, struct netlink_ext_ack 
*extack) { struct gve_priv *priv = netdev_priv(netdev); + int err; if (!priv->rss_key_size || !priv->rss_lut_size) return -EOPNOTSUPP; - return gve_adminq_configure_rss(priv, rxfh); + err = gve_adminq_configure_rss(priv, rxfh); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Fail to configure RSS config"); + return err; + } + + if (priv->cache_rss_config) + gve_set_rss_config_cache(priv, rxfh); + + return 0; } const struct ethtool_ops gve_ethtool_ops = { diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 8673b10326243b..99a4827e98552d 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -184,6 +184,43 @@ static void gve_free_flow_rule_caches(struct gve_priv *priv) flow_rules_cache->rules_cache = NULL; } +static int gve_alloc_rss_config_cache(struct gve_priv *priv) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + if (!priv->cache_rss_config) + return 0; + + rss_config->hash_key = kcalloc(priv->rss_key_size, + sizeof(rss_config->hash_key[0]), + GFP_KERNEL); + if (!rss_config->hash_key) + return -ENOMEM; + + rss_config->hash_lut = kcalloc(priv->rss_lut_size, + sizeof(rss_config->hash_lut[0]), + GFP_KERNEL); + if (!rss_config->hash_lut) + goto free_rss_key_cache; + + return 0; + +free_rss_key_cache: + kfree(rss_config->hash_key); + rss_config->hash_key = NULL; + return -ENOMEM; +} + +static void gve_free_rss_config_cache(struct gve_priv *priv) +{ + struct gve_rss_config *rss_config = &priv->rss_config; + + kfree(rss_config->hash_key); + kfree(rss_config->hash_lut); + + memset(rss_config, 0, sizeof(*rss_config)); +} + static int gve_alloc_counter_array(struct gve_priv *priv) { priv->counter_array = @@ -575,9 +612,12 @@ static int gve_setup_device_resources(struct gve_priv *priv) err = gve_alloc_flow_rule_caches(priv); if (err) return err; - err = gve_alloc_counter_array(priv); + err = gve_alloc_rss_config_cache(priv); if (err) goto abort_with_flow_rule_caches; + err = gve_alloc_counter_array(priv); + if (err) + goto abort_with_rss_config_cache; err = gve_alloc_notify_blocks(priv); if (err) goto abort_with_counter; @@ -611,6 +651,12 @@ static int gve_setup_device_resources(struct gve_priv *priv) } } + err = gve_init_rss_config(priv, priv->rx_cfg.num_queues); + if (err) { + dev_err(&priv->pdev->dev, "Failed to init RSS config"); + goto abort_with_ptype_lut; + } + err = gve_adminq_report_stats(priv, priv->stats_report_len, priv->stats_report_bus, GVE_STATS_REPORT_TIMER_PERIOD); @@ -629,6 +675,8 @@ static int gve_setup_device_resources(struct gve_priv *priv) gve_free_notify_blocks(priv); abort_with_counter: gve_free_counter_array(priv); +abort_with_rss_config_cache: + gve_free_rss_config_cache(priv); abort_with_flow_rule_caches: gve_free_flow_rule_caches(priv); @@ -669,6 +717,7 @@ static void gve_teardown_device_resources(struct gve_priv *priv) priv->ptype_lut_dqo = NULL; gve_free_flow_rule_caches(priv); + gve_free_rss_config_cache(priv); gve_free_counter_array(priv); gve_free_notify_blocks(priv); gve_free_stats_report(priv); @@ -1390,6 +1439,12 @@ static int gve_queues_start(struct gve_priv *priv, if (err) goto stop_and_free_rings; + if (rx_alloc_cfg->reset_rss) { + err = gve_init_rss_config(priv, priv->rx_cfg.num_queues); + if (err) + goto reset; + } + err = gve_register_qpls(priv); if (err) goto reset; @@ -1786,6 +1841,26 @@ static int gve_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } +int gve_init_rss_config(struct gve_priv *priv, u16 num_queues) +{ + struct 
gve_rss_config *rss_config = &priv->rss_config; + struct ethtool_rxfh_param rxfh = {0}; + u16 i; + + if (!priv->cache_rss_config) + return 0; + + for (i = 0; i < priv->rss_lut_size; i++) + rss_config->hash_lut[i] = + ethtool_rxfh_indir_default(i, num_queues); + + netdev_rss_key_fill(rss_config->hash_key, priv->rss_key_size); + + rxfh.hfunc = ETH_RSS_HASH_TOP; + + return gve_adminq_configure_rss(priv, &rxfh); +} + int gve_flow_rules_reset(struct gve_priv *priv) { if (!priv->max_flow_rules) @@ -1834,7 +1909,8 @@ int gve_adjust_config(struct gve_priv *priv, int gve_adjust_queues(struct gve_priv *priv, struct gve_queue_config new_rx_config, - struct gve_queue_config new_tx_config) + struct gve_queue_config new_tx_config, + bool reset_rss) { struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0}; struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0}; @@ -1847,6 +1923,7 @@ int gve_adjust_queues(struct gve_priv *priv, tx_alloc_cfg.qcfg = &new_tx_config; rx_alloc_cfg.qcfg_tx = &new_tx_config; rx_alloc_cfg.qcfg = &new_rx_config; + rx_alloc_cfg.reset_rss = reset_rss; tx_alloc_cfg.num_rings = new_tx_config.num_queues; /* Add dedicated XDP TX queues if enabled. */ @@ -1858,6 +1935,11 @@ int gve_adjust_queues(struct gve_priv *priv, return err; } /* Set the config for the next up. */ + if (reset_rss) { + err = gve_init_rss_config(priv, new_rx_config.num_queues); + if (err) + return err; + } priv->tx_cfg = new_tx_config; priv->rx_cfg = new_rx_config; From 57d3ecdfd539648d5890e1f6d3f9b08f5b406110 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 12:51:00 +0000 Subject: [PATCH 28/34] BACKPORT: gve: move DQO rx buffer management related code to a new file In preparation for the upcoming page pool adoption for DQO raw addressing mode, move RX buffer management code to a new file. In the follow on patches, page pool code will be added to this file. No functional change, just movement of code. 
Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: Harshitha Ramamurthy Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/Makefile | 3 +- drivers/net/ethernet/google/gve/gve.h | 18 ++ .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 230 ++++++++++++++++++ drivers/net/ethernet/google/gve/gve_rx_dqo.c | 225 ----------------- 4 files changed, 250 insertions(+), 226 deletions(-) create mode 100644 drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c diff --git a/drivers/net/ethernet/google/gve/Makefile b/drivers/net/ethernet/google/gve/Makefile index 9ed07080b38a25..4520f1c07a638f 100644 --- a/drivers/net/ethernet/google/gve/Makefile +++ b/drivers/net/ethernet/google/gve/Makefile @@ -1,4 +1,5 @@ # Makefile for the Google virtual Ethernet (gve) driver obj-$(CONFIG_GVE) += gve.o -gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o +gve-objs := gve_main.o gve_tx.o gve_tx_dqo.o gve_rx.o gve_rx_dqo.o gve_ethtool.o gve_adminq.o gve_utils.o gve_flow_rule.o \ + gve_buffer_mgmt_dqo.o diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 242c8b1db310cf..bc20f85867b3f8 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -1184,6 +1184,24 @@ void gve_rx_stop_ring_gqi(struct gve_priv *priv, int idx); u16 gve_get_pkt_buf_size(const struct gve_priv *priv, bool enable_hplit); bool gve_header_split_supported(const struct gve_priv *priv); int gve_set_hsplit_config(struct gve_priv *priv, u8 tcp_data_split); +/* rx buffer handling */ +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs); +void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, + bool free_page); +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx); +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list); +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state); +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx); +int gve_alloc_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c new file mode 100644 index 00000000000000..8e50f0e4bb2ecb --- /dev/null +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: (GPL-2.0 OR MIT) +/* Google virtual Ethernet (gve) driver + * + * Copyright (C) 2015-2024 Google, Inc. 
+ */ + +#include "gve.h" +#include "gve_utils.h" + +int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) +{ + return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; +} + +void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, + bool free_page) +{ + page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); + if (free_page) + gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, + DMA_FROM_DEVICE); + bs->page_info.page = NULL; +} + +struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = rx->dqo.free_buf_states; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from free list */ + rx->dqo.free_buf_states = buf_state->next; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + return buf_state->next == buffer_id; +} + +void gve_free_buf_state(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = rx->dqo.free_buf_states; + rx->dqo.free_buf_states = buffer_id; +} + +struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, + struct gve_index_list *list) +{ + struct gve_rx_buf_state_dqo *buf_state; + s16 buffer_id; + + buffer_id = list->head; + if (unlikely(buffer_id == -1)) + return NULL; + + buf_state = &rx->dqo.buf_states[buffer_id]; + + /* Remove buf_state from list */ + list->head = buf_state->next; + if (buf_state->next == -1) + list->tail = -1; + + /* Point buf_state to itself to mark it as allocated */ + buf_state->next = buffer_id; + + return buf_state; +} + +void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, + struct gve_rx_buf_state_dqo *buf_state) +{ + s16 buffer_id = buf_state - rx->dqo.buf_states; + + buf_state->next = -1; + + if (list->head == -1) { + list->head = buffer_id; + list->tail = buffer_id; + } else { + int tail = list->tail; + + rx->dqo.buf_states[tail].next = buffer_id; + list->tail = buffer_id; + } +} + +struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_state_dqo *buf_state; + int i; + + /* Recycled buf states are immediately usable. */ + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); + if (likely(buf_state)) + return buf_state; + + if (unlikely(rx->dqo.used_buf_states.head == -1)) + return NULL; + + /* Used buf states are only usable when ref count reaches 0, which means + * no SKBs refer to them. + * + * Search a limited number before giving up. + */ + for (i = 0; i < 5; i++) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) { + rx->dqo.used_buf_states_cnt--; + return buf_state; + } + + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + } + + /* For QPL, we cannot allocate any new buffers and must + * wait for the existing ones to be available. + */ + if (rx->dqo.qpl) + return NULL; + + /* If there are no free buf states discard an entry from + * `used_buf_states` so it can be used. 
+ */ + if (unlikely(rx->dqo.free_buf_states == -1)) { + buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); + if (gve_buf_ref_cnt(buf_state) == 0) + return buf_state; + + gve_free_page_dqo(rx->gve, buf_state, true); + gve_free_buf_state(rx, buf_state); + } + + return NULL; +} + +int gve_alloc_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + u32 idx; + + if (!rx->dqo.qpl) { + int err; + + err = gve_alloc_page(priv, &priv->pdev->dev, + &buf_state->page_info.page, + &buf_state->addr, + DMA_FROM_DEVICE, GFP_ATOMIC); + if (err) + return err; + } else { + idx = rx->dqo.next_qpl_page_idx; + if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { + net_err_ratelimited("%s: Out of QPL pages\n", + priv->dev->name); + return -ENOMEM; + } + buf_state->page_info.page = rx->dqo.qpl->pages[idx]; + buf_state->addr = rx->dqo.qpl->page_buses[idx]; + rx->dqo.next_qpl_page_idx++; + } + buf_state->page_info.page_offset = 0; + buf_state->page_info.page_address = + page_address(buf_state->page_info.page); + buf_state->last_single_ref_offset = 0; + + /* The page already has 1 ref. */ + page_ref_add(buf_state->page_info.page, INT_MAX - 1); + buf_state->page_info.pagecnt_bias = INT_MAX; + + return 0; +} + +void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + const u16 data_buffer_size = priv->data_buffer_size_dqo; + int pagecount; + + /* Can't reuse if we only fit one buffer per page */ + if (data_buffer_size * 2 > PAGE_SIZE) + goto mark_used; + + pagecount = gve_buf_ref_cnt(buf_state); + + /* Record the offset when we have a single remaining reference. + * + * When this happens, we know all of the other offsets of the page are + * usable. + */ + if (pagecount == 1) { + buf_state->last_single_ref_offset = + buf_state->page_info.page_offset; + } + + /* Use the next buffer sized chunk in the page. */ + buf_state->page_info.page_offset += data_buffer_size; + buf_state->page_info.page_offset &= (PAGE_SIZE - 1); + + /* If we wrap around to the same offset without ever dropping to 1 + * reference, then we don't know if this offset was ever freed. 
+ */ + if (buf_state->page_info.page_offset == + buf_state->last_single_ref_offset) { + goto mark_used; + } + + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + return; + +mark_used: + gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); + rx->dqo.used_buf_states_cnt++; +} diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 1154c1d8f66f05..b343be2fb118ad 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -16,189 +16,6 @@ #include #include -static int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) -{ - return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; -} - -static void gve_free_page_dqo(struct gve_priv *priv, - struct gve_rx_buf_state_dqo *bs, - bool free_page) -{ - page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); - if (free_page) - gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, - DMA_FROM_DEVICE); - bs->page_info.page = NULL; -} - -static struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = rx->dqo.free_buf_states; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from free list */ - rx->dqo.free_buf_states = buf_state->next; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static bool gve_buf_state_is_allocated(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - return buf_state->next == buffer_id; -} - -static void gve_free_buf_state(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = rx->dqo.free_buf_states; - rx->dqo.free_buf_states = buffer_id; -} - -static struct gve_rx_buf_state_dqo * -gve_dequeue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list) -{ - struct gve_rx_buf_state_dqo *buf_state; - s16 buffer_id; - - buffer_id = list->head; - if (unlikely(buffer_id == -1)) - return NULL; - - buf_state = &rx->dqo.buf_states[buffer_id]; - - /* Remove buf_state from list */ - list->head = buf_state->next; - if (buf_state->next == -1) - list->tail = -1; - - /* Point buf_state to itself to mark it as allocated */ - buf_state->next = buffer_id; - - return buf_state; -} - -static void gve_enqueue_buf_state(struct gve_rx_ring *rx, - struct gve_index_list *list, - struct gve_rx_buf_state_dqo *buf_state) -{ - s16 buffer_id = buf_state - rx->dqo.buf_states; - - buf_state->next = -1; - - if (list->head == -1) { - list->head = buffer_id; - list->tail = buffer_id; - } else { - int tail = list->tail; - - rx->dqo.buf_states[tail].next = buffer_id; - list->tail = buffer_id; - } -} - -static struct gve_rx_buf_state_dqo * -gve_get_recycled_buf_state(struct gve_rx_ring *rx) -{ - struct gve_rx_buf_state_dqo *buf_state; - int i; - - /* Recycled buf states are immediately usable. */ - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.recycled_buf_states); - if (likely(buf_state)) - return buf_state; - - if (unlikely(rx->dqo.used_buf_states.head == -1)) - return NULL; - - /* Used buf states are only usable when ref count reaches 0, which means - * no SKBs refer to them. - * - * Search a limited number before giving up. 
- */ - for (i = 0; i < 5; i++) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) { - rx->dqo.used_buf_states_cnt--; - return buf_state; - } - - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - } - - /* For QPL, we cannot allocate any new buffers and must - * wait for the existing ones to be available. - */ - if (rx->dqo.qpl) - return NULL; - - /* If there are no free buf states discard an entry from - * `used_buf_states` so it can be used. - */ - if (unlikely(rx->dqo.free_buf_states == -1)) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) - return buf_state; - - gve_free_page_dqo(rx->gve, buf_state, true); - gve_free_buf_state(rx, buf_state); - } - - return NULL; -} - -static int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - struct gve_priv *priv = rx->gve; - u32 idx; - - if (!rx->dqo.qpl) { - int err; - - err = gve_alloc_page(priv, &priv->pdev->dev, - &buf_state->page_info.page, - &buf_state->addr, - DMA_FROM_DEVICE, GFP_ATOMIC); - if (err) - return err; - } else { - idx = rx->dqo.next_qpl_page_idx; - if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { - net_err_ratelimited("%s: Out of QPL pages\n", - priv->dev->name); - return -ENOMEM; - } - buf_state->page_info.page = rx->dqo.qpl->pages[idx]; - buf_state->addr = rx->dqo.qpl->page_buses[idx]; - rx->dqo.next_qpl_page_idx++; - } - buf_state->page_info.page_offset = 0; - buf_state->page_info.page_address = - page_address(buf_state->page_info.page); - buf_state->last_single_ref_offset = 0; - - /* The page already has 1 ref. */ - page_ref_add(buf_state->page_info.page, INT_MAX - 1); - buf_state->page_info.pagecnt_bias = INT_MAX; - - return 0; -} - static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx) { struct device *hdev = &priv->pdev->dev; @@ -557,48 +374,6 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) rx->fill_cnt += num_posted; } -static void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) -{ - const u16 data_buffer_size = priv->data_buffer_size_dqo; - int pagecount; - - /* Can't reuse if we only fit one buffer per page */ - if (data_buffer_size * 2 > PAGE_SIZE) - goto mark_used; - - pagecount = gve_buf_ref_cnt(buf_state); - - /* Record the offset when we have a single remaining reference. - * - * When this happens, we know all of the other offsets of the page are - * usable. - */ - if (pagecount == 1) { - buf_state->last_single_ref_offset = - buf_state->page_info.page_offset; - } - - /* Use the next buffer sized chunk in the page. */ - buf_state->page_info.page_offset += data_buffer_size; - buf_state->page_info.page_offset &= (PAGE_SIZE - 1); - - /* If we wrap around to the same offset without ever dropping to 1 - * reference, then we don't know if this offset was ever freed. 
- */ - if (buf_state->page_info.page_offset == - buf_state->last_single_ref_offset) { - goto mark_used; - } - - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); - return; - -mark_used: - gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); - rx->dqo.used_buf_states_cnt++; -} - static void gve_rx_skb_csum(struct sk_buff *skb, const struct gve_rx_compl_desc_dqo *desc, struct gve_ptype ptype) From 95f5af6b314de0ea11e20ab7fff94f9dc57272ba Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 12:53:27 +0000 Subject: [PATCH 29/34] BACKPORT: gve: adopt page pool for DQ RDA mode For DQ queue format in raw DMA addressing(RDA) mode, implement page pool recycling of buffers by leveraging a few helper functions. DQ QPL mode will continue to use the exisiting recycling logic. This is because in QPL mode, the pages come from a constant set of pages that the driver pre-allocates and registers with the device. Reviewed-by: Praveen Kaligineedi Reviewed-by: Shailend Chand Reviewed-by: Willem de Bruijn Signed-off-by: Harshitha Ramamurthy Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/Kconfig | 1 + drivers/net/ethernet/google/gve/gve.h | 22 ++- .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 180 +++++++++++++----- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 89 ++++----- 4 files changed, 198 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/google/Kconfig b/drivers/net/ethernet/google/Kconfig index 8641a00f8e6385..564862a57124e2 100644 --- a/drivers/net/ethernet/google/Kconfig +++ b/drivers/net/ethernet/google/Kconfig @@ -18,6 +18,7 @@ if NET_VENDOR_GOOGLE config GVE tristate "Google Virtual NIC (gVNIC) support" depends on (PCI_MSI && (X86 || CPU_LITTLE_ENDIAN)) + select PAGE_POOL help This driver supports Google Virtual NIC (gVNIC)" diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index bc20f85867b3f8..216d6e157befbf 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "gve_desc.h" @@ -60,6 +61,8 @@ #define GVE_DEFAULT_RX_BUFFER_OFFSET 2048 +#define GVE_PAGE_POOL_SIZE_MULTIPLIER 4 + #define GVE_FLOW_RULES_CACHE_SIZE \ (GVE_ADMINQ_BUFFER_SIZE / sizeof(struct gve_adminq_queried_flow_rule)) #define GVE_FLOW_RULE_IDS_CACHE_SIZE \ @@ -105,6 +108,7 @@ struct gve_rx_slot_page_info { struct page *page; void *page_address; u32 page_offset; /* offset to write to in page */ + unsigned int buf_size; int pagecnt_bias; /* expected pagecnt if only the driver has a ref */ u16 pad; /* adjustment for rx padding */ u8 can_flip; /* tracks if the networking stack is using the page */ @@ -276,6 +280,8 @@ struct gve_rx_ring { /* Address info of the buffers for header-split */ struct gve_header_buf hdr_bufs; + + struct page_pool *page_pool; } dqo; }; @@ -1198,10 +1204,22 @@ struct gve_rx_buf_state_dqo *gve_dequeue_buf_state(struct gve_rx_ring *rx, void gve_enqueue_buf_state(struct gve_rx_ring *rx, struct gve_index_list *list, struct gve_rx_buf_state_dqo *buf_state); struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx); -int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state); void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state); +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct); +int gve_alloc_qpl_page_dqo(struct 
gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state); +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state); +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc); +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx); + /* Reset */ void gve_schedule_reset(struct gve_priv *priv); int gve_reset(struct gve_priv *priv, bool attempt_teardown); diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c index 8e50f0e4bb2ecb..05bf1f80a79cfa 100644 --- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -12,16 +12,6 @@ int gve_buf_ref_cnt(struct gve_rx_buf_state_dqo *bs) return page_count(bs->page_info.page) - bs->page_info.pagecnt_bias; } -void gve_free_page_dqo(struct gve_priv *priv, struct gve_rx_buf_state_dqo *bs, - bool free_page) -{ - page_ref_sub(bs->page_info.page, bs->page_info.pagecnt_bias - 1); - if (free_page) - gve_free_page(&priv->pdev->dev, bs->page_info.page, bs->addr, - DMA_FROM_DEVICE); - bs->page_info.page = NULL; -} - struct gve_rx_buf_state_dqo *gve_alloc_buf_state(struct gve_rx_ring *rx) { struct gve_rx_buf_state_dqo *buf_state; @@ -128,56 +118,28 @@ struct gve_rx_buf_state_dqo *gve_get_recycled_buf_state(struct gve_rx_ring *rx) gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); } - /* For QPL, we cannot allocate any new buffers and must - * wait for the existing ones to be available. - */ - if (rx->dqo.qpl) - return NULL; - - /* If there are no free buf states discard an entry from - * `used_buf_states` so it can be used. - */ - if (unlikely(rx->dqo.free_buf_states == -1)) { - buf_state = gve_dequeue_buf_state(rx, &rx->dqo.used_buf_states); - if (gve_buf_ref_cnt(buf_state) == 0) - return buf_state; - - gve_free_page_dqo(rx->gve, buf_state, true); - gve_free_buf_state(rx, buf_state); - } - return NULL; } -int gve_alloc_page_dqo(struct gve_rx_ring *rx, - struct gve_rx_buf_state_dqo *buf_state) +int gve_alloc_qpl_page_dqo(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) { struct gve_priv *priv = rx->gve; u32 idx; - if (!rx->dqo.qpl) { - int err; - - err = gve_alloc_page(priv, &priv->pdev->dev, - &buf_state->page_info.page, - &buf_state->addr, - DMA_FROM_DEVICE, GFP_ATOMIC); - if (err) - return err; - } else { - idx = rx->dqo.next_qpl_page_idx; - if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { - net_err_ratelimited("%s: Out of QPL pages\n", - priv->dev->name); - return -ENOMEM; - } - buf_state->page_info.page = rx->dqo.qpl->pages[idx]; - buf_state->addr = rx->dqo.qpl->page_buses[idx]; - rx->dqo.next_qpl_page_idx++; + idx = rx->dqo.next_qpl_page_idx; + if (idx >= gve_get_rx_pages_per_qpl_dqo(priv->rx_desc_cnt)) { + net_err_ratelimited("%s: Out of QPL pages\n", + priv->dev->name); + return -ENOMEM; } + buf_state->page_info.page = rx->dqo.qpl->pages[idx]; + buf_state->addr = rx->dqo.qpl->page_buses[idx]; + rx->dqo.next_qpl_page_idx++; buf_state->page_info.page_offset = 0; buf_state->page_info.page_address = page_address(buf_state->page_info.page); + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; buf_state->last_single_ref_offset = 0; /* The page already has 1 ref. 
*/ @@ -187,6 +149,16 @@ int gve_alloc_page_dqo(struct gve_rx_ring *rx, return 0; } +void gve_free_qpl_page_dqo(struct gve_rx_buf_state_dqo *buf_state) +{ + if (!buf_state->page_info.page) + return; + + page_ref_sub(buf_state->page_info.page, + buf_state->page_info.pagecnt_bias - 1); + buf_state->page_info.page = NULL; +} + void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state) { @@ -228,3 +200,113 @@ void gve_try_recycle_buf(struct gve_priv *priv, struct gve_rx_ring *rx, gve_enqueue_buf_state(rx, &rx->dqo.used_buf_states, buf_state); rx->dqo.used_buf_states_cnt++; } + +void gve_free_to_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + bool allow_direct) +{ + struct page *page = buf_state->page_info.page; + + if (!page) + return; + + page_pool_put_page(page->pp, page, buf_state->page_info.buf_size, + allow_direct); + buf_state->page_info.page = NULL; +} + +static int gve_alloc_from_page_pool(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + struct gve_priv *priv = rx->gve; + struct page *page; + + buf_state->page_info.buf_size = priv->data_buffer_size_dqo; + page = page_pool_alloc(rx->dqo.page_pool, + &buf_state->page_info.page_offset, + &buf_state->page_info.buf_size, GFP_ATOMIC); + + if (!page) + return -ENOMEM; + + buf_state->page_info.page = page; + buf_state->page_info.page_address = page_address(page); + buf_state->addr = page_pool_get_dma_addr(page); + + return 0; +} + +struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, + struct gve_rx_ring *rx) +{ + u32 ntfy_id = gve_rx_idx_to_ntfy(priv, rx->q_num); + struct page_pool_params pp = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, + .order = 0, + .pool_size = GVE_PAGE_POOL_SIZE_MULTIPLIER * priv->rx_desc_cnt, + .dev = &priv->pdev->dev, + .netdev = priv->dev, + .napi = &priv->ntfy_blocks[ntfy_id].napi, + .max_len = PAGE_SIZE, + .dma_dir = DMA_FROM_DEVICE, + }; + + return page_pool_create(&pp); +} + +void gve_free_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + gve_free_to_page_pool(rx, buf_state, true); + gve_free_buf_state(rx, buf_state); + } else { + gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, + buf_state); + } +} + +void gve_reuse_buffer(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state) +{ + if (rx->dqo.page_pool) { + buf_state->page_info.page = NULL; + gve_free_buf_state(rx, buf_state); + } else { + gve_dec_pagecnt_bias(&buf_state->page_info); + gve_try_recycle_buf(rx->gve, rx, buf_state); + } +} + +int gve_alloc_buffer(struct gve_rx_ring *rx, struct gve_rx_desc_dqo *desc) +{ + struct gve_rx_buf_state_dqo *buf_state; + + if (rx->dqo.page_pool) { + buf_state = gve_alloc_buf_state(rx); + if (WARN_ON_ONCE(!buf_state)) + return -ENOMEM; + + if (gve_alloc_from_page_pool(rx, buf_state)) + goto free_buf_state; + } else { + buf_state = gve_get_recycled_buf_state(rx); + if (unlikely(!buf_state)) { + buf_state = gve_alloc_buf_state(rx); + if (unlikely(!buf_state)) + return -ENOMEM; + + if (unlikely(gve_alloc_qpl_page_dqo(rx, buf_state))) + goto free_buf_state; + } + } + desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); + desc->buf_addr = cpu_to_le64(buf_state->addr + + buf_state->page_info.page_offset); + + return 0; + +free_buf_state: + gve_free_buf_state(rx, buf_state); + return -ENOMEM; +} diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 
b343be2fb118ad..8ac0047f1ada11 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -95,8 +95,10 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } } @@ -138,9 +140,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, for (i = 0; i < rx->dqo.num_buf_states; i++) { struct gve_rx_buf_state_dqo *bs = &rx->dqo.buf_states[i]; - /* Only free page for RDA. QPL pages are freed in gve_main. */ - if (bs->page_info.page) - gve_free_page_dqo(priv, bs, !rx->dqo.qpl); + + if (rx->dqo.page_pool) + gve_free_to_page_pool(rx, bs, false); + else + gve_free_qpl_page_dqo(bs); } if (rx->dqo.qpl) { @@ -167,6 +171,11 @@ void gve_rx_free_ring_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, kvfree(rx->dqo.buf_states); rx->dqo.buf_states = NULL; + if (rx->dqo.page_pool) { + page_pool_destroy(rx->dqo.page_pool); + rx->dqo.page_pool = NULL; + } + gve_rx_free_hdr_bufs(priv, rx); netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx); @@ -199,6 +208,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int idx) { struct device *hdev = &priv->pdev->dev; + struct page_pool *pool; int qpl_page_cnt; size_t size; u32 qpl_id; @@ -212,8 +222,7 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, rx->gve = priv; rx->q_num = idx; - rx->dqo.num_buf_states = cfg->raw_addressing ? - min_t(s16, S16_MAX, buffer_queue_slots * 4) : + rx->dqo.num_buf_states = cfg->raw_addressing ? buffer_queue_slots : gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); rx->dqo.buf_states = kvcalloc(rx->dqo.num_buf_states, sizeof(rx->dqo.buf_states[0]), @@ -241,7 +250,13 @@ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, if (!rx->dqo.bufq.desc_ring) goto err; - if (!cfg->raw_addressing) { + if (cfg->raw_addressing) { + pool = gve_rx_create_page_pool(priv, rx); + if (IS_ERR(pool)) + goto err; + + rx->dqo.page_pool = pool; + } else { qpl_id = gve_get_rx_qpl_id(cfg->qcfg_tx, rx->q_num); qpl_page_cnt = gve_get_rx_pages_per_qpl_dqo(cfg->ring_size); @@ -338,26 +353,14 @@ void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx) num_avail_slots = min_t(u32, num_avail_slots, complq->num_free_slots); while (num_posted < num_avail_slots) { struct gve_rx_desc_dqo *desc = &bufq->desc_ring[bufq->tail]; - struct gve_rx_buf_state_dqo *buf_state; - - buf_state = gve_get_recycled_buf_state(rx); - if (unlikely(!buf_state)) { - buf_state = gve_alloc_buf_state(rx); - if (unlikely(!buf_state)) - break; - - if (unlikely(gve_alloc_page_dqo(rx, buf_state))) { - u64_stats_update_begin(&rx->statss); - rx->rx_buf_alloc_fail++; - u64_stats_update_end(&rx->statss); - gve_free_buf_state(rx, buf_state); - break; - } + + if (unlikely(gve_alloc_buffer(rx, desc))) { + u64_stats_update_begin(&rx->statss); + rx->rx_buf_alloc_fail++; + u64_stats_update_end(&rx->statss); + break; } - desc->buf_id = cpu_to_le16(buf_state - rx->dqo.buf_states); - desc->buf_addr = cpu_to_le64(buf_state->addr + - buf_state->page_info.page_offset); if (rx->dqo.hdr_bufs.data) desc->header_buf_addr = cpu_to_le64(rx->dqo.hdr_bufs.addr + @@ -488,6 +491,9 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (!skb) return -1; + if (rx->dqo.page_pool) + skb_mark_for_recycle(skb); + if (rx->ctx.skb_tail == rx->ctx.skb_head) 
skb_shinfo(rx->ctx.skb_head)->frag_list = skb; else @@ -498,7 +504,7 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (rx->ctx.skb_tail != rx->ctx.skb_head) { rx->ctx.skb_head->len += buf_len; rx->ctx.skb_head->data_len += buf_len; - rx->ctx.skb_head->truesize += priv->data_buffer_size_dqo; + rx->ctx.skb_head->truesize += buf_state->page_info.buf_size; } /* Trigger ondemand page allocation if we are running low on buffers */ @@ -508,13 +514,8 @@ static int gve_rx_append_frags(struct napi_struct *napi, skb_add_rx_frag(rx->ctx.skb_tail, num_frags, buf_state->page_info.page, buf_state->page_info.page_offset, - buf_len, priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); - - /* Advances buffer page-offset if page is partially used. - * Marks buffer as used if page is full. - */ - gve_try_recycle_buf(priv, rx, buf_state); + buf_len, buf_state->page_info.buf_size); + gve_reuse_buffer(rx, buf_state); return 0; } @@ -548,8 +549,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, } if (unlikely(compl_desc->rx_error)) { - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return -EINVAL; } @@ -573,6 +573,9 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, if (unlikely(!rx->ctx.skb_head)) goto error; rx->ctx.skb_tail = rx->ctx.skb_head; + + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); } else { unsplit = 1; } @@ -609,8 +612,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, rx->rx_copybreak_pkt++; u64_stats_update_end(&rx->statss); - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, - buf_state); + gve_free_buffer(rx, buf_state); return 0; } @@ -625,16 +627,17 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, return 0; } + if (rx->dqo.page_pool) + skb_mark_for_recycle(rx->ctx.skb_head); + skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, buf_state->page_info.page_offset, buf_len, - priv->data_buffer_size_dqo); - gve_dec_pagecnt_bias(&buf_state->page_info); - - gve_try_recycle_buf(priv, rx, buf_state); + buf_state->page_info.buf_size); + gve_reuse_buffer(rx, buf_state); return 0; error: - gve_enqueue_buf_state(rx, &rx->dqo.recycled_buf_states, buf_state); + gve_free_buffer(rx, buf_state); return -ENOMEM; } From 04ba396999a7205b0c9d0191f35033e7919417c1 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 12:55:20 +0000 Subject: [PATCH 30/34] BACKPORT: gve: add support for basic queue stats Implement netdev_stats_ops to export basic per-queue stats. With page pool support for DQO added in the previous patches, rx-alloc-fail captures failures in page pool allocations as well since the rx_buf_alloc_fail stat tracked in the driver is incremented when gve_alloc_buffer returns error. 
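For context on why gve_get_base_stats() added by this patch only zero-initializes its fields: the qstats core reports device-wide totals as the base stats plus the sum over the currently active queues. A rough model of that aggregation (an illustration of the assumed core behaviour, not the netdev core's actual code; "dev" here is the gve net_device):

	struct netdev_queue_stats_rx rx_total = {}, rx_q;
	struct netdev_queue_stats_tx tx_total = {};
	int i;

	/* base stats cover what per-queue callbacks cannot (e.g. freed queues) */
	dev->stat_ops->get_base_stats(dev, &rx_total, &tx_total);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		dev->stat_ops->get_queue_stats_rx(dev, i, &rx_q);
		rx_total.packets += rx_q.packets;
		rx_total.bytes += rx_q.bytes;
		rx_total.alloc_fail += rx_q.alloc_fail;
	}

So the device-level rx-alloc-fail value seen from user space is the per-ring sum of rx_skb_alloc_fail + rx_buf_alloc_fail, which now also includes page pool allocation failures.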
Reviewed-by: Praveen Kaligineedi Reviewed-by: Willem de Bruijn Signed-off-by: Harshitha Ramamurthy Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve_main.c | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 99a4827e98552d..13b9fe651d10e2 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2643,6 +2643,54 @@ static const struct netdev_queue_mgmt_ops gve_queue_mgmt_ops = { .ndo_queue_stop = gve_rx_queue_stop, }; +static void gve_get_rx_queue_stats(struct net_device *dev, int idx, + struct netdev_queue_stats_rx *rx_stats) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_rx_ring *rx = &priv->rx[idx]; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&rx->statss); + rx_stats->packets = rx->rpackets; + rx_stats->bytes = rx->rbytes; + rx_stats->alloc_fail = rx->rx_skb_alloc_fail + + rx->rx_buf_alloc_fail; + } while (u64_stats_fetch_retry(&rx->statss, start)); +} + +static void gve_get_tx_queue_stats(struct net_device *dev, int idx, + struct netdev_queue_stats_tx *tx_stats) +{ + struct gve_priv *priv = netdev_priv(dev); + struct gve_tx_ring *tx = &priv->tx[idx]; + unsigned int start; + + do { + start = u64_stats_fetch_begin(&tx->statss); + tx_stats->packets = tx->pkt_done; + tx_stats->bytes = tx->bytes_done; + } while (u64_stats_fetch_retry(&tx->statss, start)); +} + +static void gve_get_base_stats(struct net_device *dev, + struct netdev_queue_stats_rx *rx, + struct netdev_queue_stats_tx *tx) +{ + rx->packets = 0; + rx->bytes = 0; + rx->alloc_fail = 0; + + tx->packets = 0; + tx->bytes = 0; +} + +static const struct netdev_stat_ops gve_stat_ops = { + .get_queue_stats_rx = gve_get_rx_queue_stats, + .get_queue_stats_tx = gve_get_tx_queue_stats, + .get_base_stats = gve_get_base_stats, +}; + static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { int max_tx_queues, max_rx_queues; @@ -2698,6 +2746,7 @@ static int gve_probe(struct pci_dev *pdev, const struct pci_device_id *ent) dev->ethtool_ops = &gve_ethtool_ops; dev->netdev_ops = &gve_netdev_ops; dev->queue_mgmt_ops = &gve_queue_mgmt_ops; + dev->stat_ops = &gve_stat_ops; /* Set default and supported features. * From b54d4a3f4bec9cd89e887a161b700a9356d50c9a Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 12:58:47 +0000 Subject: [PATCH 31/34] BACKPORT: gve: change to use page_pool_put_full_page when recycling pages The driver currently uses page_pool_put_page() to recycle page pool pages. Since gve uses split pages, if the fragment being recycled is not the last fragment in the page, there is no dma sync operation. When the last fragment is recycled, dma sync is performed by page pool infra according to the value passed as dma_sync_size which right now is set to the size of fragment. But the correct thing to do is to dma sync the entire page when the last fragment is recycled. Hence change to using page_pool_put_full_page(). 
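For reference, page_pool_put_full_page() is a thin helper that passes -1 as dma_sync_size; the pool clamps that value to pool->p.max_len, so the entire mapped area is synced for device regardless of which fragment was released last. A sketch of roughly what the upstream helper in include/net/page_pool/helpers.h does (shown for illustration, not part of this patch):

	static inline void page_pool_put_full_page(struct page_pool *pool,
						   struct page *page,
						   bool allow_direct)
	{
		/* -1 means "sync up to pool->p.max_len" when the page is recycled */
		page_pool_put_page(pool, page, -1, allow_direct);
	}
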
Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c index 05bf1f80a79cfa..403f0f335ba669 100644 --- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -210,8 +210,7 @@ void gve_free_to_page_pool(struct gve_rx_ring *rx, if (!page) return; - page_pool_put_page(page->pp, page, buf_state->page_info.buf_size, - allow_direct); + page_pool_put_full_page(page->pp, page, allow_direct); buf_state->page_info.page = NULL; } From ff6b5a25ab1952aca0edd465ad9988cb2d8ea72a Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 17:05:02 +0000 Subject: [PATCH 32/34] BACKPORT: gve: unlink old napi when stopping a queue using queue API When a queue is stopped using the ndo queue API, before destroying its page pool, the associated NAPI instance needs to be unlinked to avoid warnings. Handle this by calling page_pool_disable_direct_recycling() when stopping a queue. Fixes: ebdfae0d377b ("gve: adopt page pool for DQ RDA mode") Reviewed-by: Praveen Kaligineedi Signed-off-by: Harshitha Ramamurthy Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 8ac0047f1ada11..f0674a44356708 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -109,10 +109,12 @@ static void gve_rx_reset_ring_dqo(struct gve_priv *priv, int idx) void gve_rx_stop_ring_dqo(struct gve_priv *priv, int idx) { int ntfy_idx = gve_rx_idx_to_ntfy(priv, idx); + struct gve_rx_ring *rx = &priv->rx[idx]; if (!gve_rx_was_added_to_block(priv, idx)) return; + page_pool_disable_direct_recycling(rx->dqo.page_pool); gve_remove_napi(priv, ntfy_idx); gve_rx_remove_from_block(priv, idx); gve_rx_reset_ring_dqo(priv, idx); From d8a5e1d359641ca1b92dc6421db22daa88fe1044 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Wed, 5 Mar 2025 17:41:22 +0000 Subject: [PATCH 33/34] BACKPORT: gve: convert to use netmem for DQO RDA mode Use netmem_ref instead of struct page to track pages for DQO RDA mode which will support devmem TCP. To achieve this, track netmem_ref in struct gve_rx_slot_page_info for DQO raw addressing mode while page is used for QPL mode. 
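netmem_ref is a tagged reference that wraps either a struct page or a net_iov (for example dma-buf backed memory), and helpers such as netmem_to_page(), netmem_is_net_iov(), netmem_address() and page_pool_get_dma_addr_netmem() let the RDA path below handle both. In particular, net_iov memory has no kernel mapping, which is why the later devmem patch has to drop packets whose headers did not land in a host header buffer. A sketch of the upstream netmem_address() helper from include/net/netmem.h (for illustration; not part of this patch):

	static inline void *netmem_address(netmem_ref netmem)
	{
		/* net_iov backed memory cannot be touched by the CPU */
		if (netmem_is_net_iov(netmem))
			return NULL;
		return page_address(netmem_to_page(netmem));
	}
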
Change-Id: I40da83100b765e4af60fe23d33e92963794e93fe Signed-off-by: Harshitha Ramamurthy Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve.h | 5 ++- .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 27 ++++++++------- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 34 ++++++++++++++----- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 216d6e157befbf..9ce9572e0299af 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -105,7 +105,10 @@ struct gve_rx_desc_queue { /* The page info for a single slot in the RX data queue */ struct gve_rx_slot_page_info { - struct page *page; + union { + struct page *page; /* QPL mode */ + netmem_ref netmem; /* RDA mode */ + }; void *page_address; u32 page_offset; /* offset to write to in page */ unsigned int buf_size; diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c index 403f0f335ba669..bcfceb1d5f2046 100644 --- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -205,32 +205,33 @@ void gve_free_to_page_pool(struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state, bool allow_direct) { - struct page *page = buf_state->page_info.page; + netmem_ref netmem = buf_state->page_info.netmem; - if (!page) + if (!netmem) return; - page_pool_put_full_page(page->pp, page, allow_direct); - buf_state->page_info.page = NULL; + page_pool_put_full_netmem(rx->dqo.page_pool, netmem, allow_direct); + buf_state->page_info.netmem = 0; } static int gve_alloc_from_page_pool(struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state) { struct gve_priv *priv = rx->gve; - struct page *page; + netmem_ref netmem; buf_state->page_info.buf_size = priv->data_buffer_size_dqo; - page = page_pool_alloc(rx->dqo.page_pool, - &buf_state->page_info.page_offset, - &buf_state->page_info.buf_size, GFP_ATOMIC); + netmem = page_pool_alloc_netmem(rx->dqo.page_pool, + &buf_state->page_info.page_offset, + &buf_state->page_info.buf_size, + GFP_ATOMIC); - if (!page) + if (!netmem) return -ENOMEM; - buf_state->page_info.page = page; - buf_state->page_info.page_address = page_address(page); - buf_state->addr = page_pool_get_dma_addr(page); + buf_state->page_info.netmem = netmem; + buf_state->page_info.page_address = netmem_address(netmem); + buf_state->addr = page_pool_get_dma_addr_netmem(netmem); return 0; } @@ -269,7 +270,7 @@ void gve_reuse_buffer(struct gve_rx_ring *rx, struct gve_rx_buf_state_dqo *buf_state) { if (rx->dqo.page_pool) { - buf_state->page_info.page = NULL; + buf_state->page_info.netmem = 0; gve_free_buf_state(rx, buf_state); } else { gve_dec_pagecnt_bias(&buf_state->page_info); diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index f0674a44356708..523597902a55b0 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -476,6 +476,24 @@ static int gve_rx_copy_ondemand(struct gve_rx_ring *rx, return 0; } +static void gve_skb_add_rx_frag(struct gve_rx_ring *rx, + struct gve_rx_buf_state_dqo *buf_state, + int num_frags, u16 buf_len) +{ + if (rx->dqo.qpl) { + skb_add_rx_frag(rx->ctx.skb_tail, num_frags, + buf_state->page_info.page, + buf_state->page_info.page_offset, + buf_len, buf_state->page_info.buf_size); + } else { + skb_add_rx_frag_netmem(rx->ctx.skb_tail, num_frags, + 
buf_state->page_info.netmem, + buf_state->page_info.page_offset, + buf_len, + buf_state->page_info.buf_size); + } +} + /* Chains multi skbs for single rx packet. * Returns 0 if buffer is appended, -1 otherwise. */ @@ -513,10 +531,7 @@ static int gve_rx_append_frags(struct napi_struct *napi, if (gve_rx_should_trigger_copy_ondemand(rx)) return gve_rx_copy_ondemand(rx, buf_state, buf_len); - skb_add_rx_frag(rx->ctx.skb_tail, num_frags, - buf_state->page_info.page, - buf_state->page_info.page_offset, - buf_len, buf_state->page_info.buf_size); + gve_skb_add_rx_frag(rx, buf_state, num_frags, buf_len); gve_reuse_buffer(rx, buf_state); return 0; } @@ -561,7 +576,12 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, /* Page might have not been used for awhile and was likely last written * by a different thread. */ - prefetch(buf_state->page_info.page); + if (rx->dqo.qpl) { + prefetch(buf_state->page_info.page); + } else { + if (!netmem_is_net_iov(buf_state->page_info.netmem)) + prefetch(netmem_to_page(buf_state->page_info.netmem)); + } /* Copy the header into the skb in the case of header split */ if (hsplit) { @@ -632,9 +652,7 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, if (rx->dqo.page_pool) skb_mark_for_recycle(rx->ctx.skb_head); - skb_add_rx_frag(rx->ctx.skb_head, 0, buf_state->page_info.page, - buf_state->page_info.page_offset, buf_len, - buf_state->page_info.buf_size); + gve_skb_add_rx_frag(rx, buf_state, 0, buf_len); gve_reuse_buffer(rx, buf_state); return 0; From 55337fd3b69e7accf235d96828228628b0e30072 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Thu, 6 Mar 2025 08:04:52 +0000 Subject: [PATCH 34/34] gve: tcp devmem implementation GVE driver changes for TCP devmem Signed-off-by: Mina Almasry Signed-off-by: Pranjal Shrivastava --- drivers/net/ethernet/google/gve/gve.h | 2 + .../ethernet/google/gve/gve_buffer_mgmt_dqo.c | 5 +++ drivers/net/ethernet/google/gve/gve_ethtool.c | 19 ++++++-- drivers/net/ethernet/google/gve/gve_rx_dqo.c | 44 ++++++++++++++++--- 4 files changed, 60 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 9ce9572e0299af..93d8dde2b07805 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -295,6 +295,8 @@ struct gve_rx_ring { u32 fill_cnt; /* free-running total number of descs and buffs posted */ u32 mask; /* masks the cnt and fill_cnt to the size of the ring */ u64 rx_hsplit_pkt; /* free-running packets with headers split */ + u64 rx_devmem_pkt; /* devmem packets processed */ + u64 rx_devmem_dropped; /* devmem pkts dropped */ u64 rx_copybreak_pkt; /* free-running count of copybreak packets */ u64 rx_copied_pkt; /* free-running total number of copied packets */ u64 rx_skb_alloc_fail; /* free-running count of skb alloc fails */ diff --git a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c index bcfceb1d5f2046..1d11b043fd820f 100644 --- a/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_buffer_mgmt_dqo.c @@ -250,6 +250,11 @@ struct page_pool *gve_rx_create_page_pool(struct gve_priv *priv, .max_len = PAGE_SIZE, .dma_dir = DMA_FROM_DEVICE, }; + + if (priv->header_split_enabled) { + pp.flags |= PP_FLAG_ALLOW_UNREADABLE_NETMEM; + pp.queue_idx = rx->q_num; + } return page_pool_create(&pp); } diff --git a/drivers/net/ethernet/google/gve/gve_ethtool.c 
b/drivers/net/ethernet/google/gve/gve_ethtool.c index a572f1e059342e..425fe753602a30 100644 --- a/drivers/net/ethernet/google/gve/gve_ethtool.c +++ b/drivers/net/ethernet/google/gve/gve_ethtool.c @@ -5,6 +5,7 @@ */ #include +#include #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" @@ -40,7 +41,8 @@ static u32 gve_get_msglevel(struct net_device *netdev) * as declared in enum xdp_action inside file uapi/linux/bpf.h . */ static const char gve_gstrings_main_stats[][ETH_GSTRING_LEN] = { - "rx_packets", "rx_hsplit_pkt", "tx_packets", "rx_bytes", + "rx_packets", "rx_hsplit_pkt", "rx_devmem_pkts", + "rx_devmem_dropped", "tx_packets", "rx_bytes", "tx_bytes", "rx_dropped", "tx_dropped", "tx_timeouts", "rx_skb_alloc_fail", "rx_buf_alloc_fail", "rx_desc_err_dropped_pkt", "rx_hsplit_unsplit_pkt", @@ -149,13 +151,14 @@ static void gve_get_ethtool_stats(struct net_device *netdev, struct ethtool_stats *stats, u64 *data) { - u64 tmp_rx_pkts, tmp_rx_hsplit_pkt, tmp_rx_bytes, tmp_rx_hsplit_bytes, + u64 tmp_rx_pkts, tmp_rx_hsplit_pkt, tmp_rx_devmem_pkt, + tmp_rx_devmem_dropped, tmp_rx_bytes, tmp_rx_hsplit_bytes, tmp_rx_skb_alloc_fail, tmp_rx_buf_alloc_fail, tmp_rx_desc_err_dropped_pkt, tmp_rx_hsplit_unsplit_pkt, tmp_tx_pkts, tmp_tx_bytes; u64 rx_buf_alloc_fail, rx_desc_err_dropped_pkt, rx_hsplit_unsplit_pkt, - rx_pkts, rx_hsplit_pkt, rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, - tx_dropped; + rx_pkts, rx_hsplit_pkt, rx_devmem_pkt, rx_devmem_dropped, + rx_skb_alloc_fail, rx_bytes, tx_pkts, tx_bytes, tx_dropped; int stats_idx, base_stats_idx, max_stats_idx; struct stats *report_stats; int *rx_qid_to_stats_idx; @@ -196,6 +199,7 @@ gve_get_ethtool_stats(struct net_device *netdev, } for (rx_pkts = 0, rx_bytes = 0, rx_hsplit_pkt = 0, + rx_devmem_pkt = 0, rx_devmem_dropped = 0, rx_skb_alloc_fail = 0, rx_buf_alloc_fail = 0, rx_desc_err_dropped_pkt = 0, rx_hsplit_unsplit_pkt = 0, ring = 0; @@ -208,6 +212,8 @@ gve_get_ethtool_stats(struct net_device *netdev, u64_stats_fetch_begin(&priv->rx[ring].statss); tmp_rx_pkts = rx->rpackets; tmp_rx_hsplit_pkt = rx->rx_hsplit_pkt; + tmp_rx_devmem_pkt = rx->rx_devmem_pkt; + tmp_rx_devmem_dropped = rx->rx_devmem_dropped; tmp_rx_bytes = rx->rbytes; tmp_rx_skb_alloc_fail = rx->rx_skb_alloc_fail; tmp_rx_buf_alloc_fail = rx->rx_buf_alloc_fail; @@ -219,6 +225,9 @@ gve_get_ethtool_stats(struct net_device *netdev, start)); rx_pkts += tmp_rx_pkts; rx_hsplit_pkt += tmp_rx_hsplit_pkt; + rx_devmem_pkt += tmp_rx_devmem_pkt; + rx_devmem_dropped += tmp_rx_devmem_dropped; + rx_bytes += tmp_rx_bytes; rx_skb_alloc_fail += tmp_rx_skb_alloc_fail; rx_buf_alloc_fail += tmp_rx_buf_alloc_fail; @@ -245,6 +254,8 @@ gve_get_ethtool_stats(struct net_device *netdev, i = 0; data[i++] = rx_pkts; data[i++] = rx_hsplit_pkt; + data[i++] = rx_devmem_pkt; + data[i++] = rx_devmem_dropped; data[i++] = tx_pkts; data[i++] = rx_bytes; data[i++] = tx_bytes; diff --git a/drivers/net/ethernet/google/gve/gve_rx_dqo.c b/drivers/net/ethernet/google/gve/gve_rx_dqo.c index 523597902a55b0..9abbacb32bbce8 100644 --- a/drivers/net/ethernet/google/gve/gve_rx_dqo.c +++ b/drivers/net/ethernet/google/gve/gve_rx_dqo.c @@ -14,6 +14,9 @@ #include #include #include +#include +#include +#include #include static void gve_rx_free_hdr_bufs(struct gve_priv *priv, struct gve_rx_ring *rx) @@ -583,6 +586,25 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, prefetch(netmem_to_page(buf_state->page_info.netmem)); } + if (!hsplit && !rx->ctx.skb_head && + netmem_is_net_iov(buf_state->page_info.netmem)) { + /* 
!hsplit indicates the packet is not split, and the header went + * to the packet buffer. If the packet buffer is a dma_buf + * page, those can't be easily mapped into the kernel space to + * access the header required to process the packet. + * + * In the future we may be able to map the dma_buf page to + * kernel space to access the header for dma_buf providers that + * support that, but for now, simply drop the packet. We expect + * the TCP packets that we care about to be header split + * anyway. + */ + rx->rx_devmem_dropped++; + + gve_free_buffer(rx, buf_state); + return -EFAULT; + } + /* Copy the header into the skb in the case of header split */ if (hsplit) { int unsplit = 0; @@ -609,9 +631,10 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, } /* Sync the portion of dma buffer for CPU to read. */ - dma_sync_single_range_for_cpu(&priv->pdev->dev, buf_state->addr, - buf_state->page_info.page_offset, - buf_len, DMA_FROM_DEVICE); + page_pool_dma_sync_netmem_for_cpu(rx->dqo.page_pool, + buf_state->page_info.netmem, + buf_state->page_info.page_offset, + buf_len); /* Append to current skb if one exists. */ if (rx->ctx.skb_head) { @@ -622,7 +645,10 @@ static int gve_rx_dqo(struct napi_struct *napi, struct gve_rx_ring *rx, return 0; } - if (eop && buf_len <= priv->rx_copybreak) { + /* We can't copy dma-buf pages. Ignore any copybreak setting. */ + if (eop && buf_len <= priv->rx_copybreak && + (!netmem_is_net_iov(buf_state->page_info.netmem) || !buf_len)) { + rx->ctx.skb_head = gve_rx_copy(priv->dev, napi, &buf_state->page_info, buf_len); if (unlikely(!rx->ctx.skb_head)) @@ -712,10 +738,16 @@ static int gve_rx_complete_skb(struct gve_rx_ring *rx, struct napi_struct *napi, return err; } - if (skb_headlen(rx->ctx.skb_head) == 0) + if (skb_headlen(rx->ctx.skb_head) == 0) { + if (!skb_frags_readable(napi_get_frags(napi))) + rx->rx_devmem_pkt++; napi_gro_frags(napi); - else + } + else { + if (!skb_frags_readable(rx->ctx.skb_head)) + rx->rx_devmem_pkt++; napi_gro_receive(napi, rx->ctx.skb_head); + } return 0; }
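
A closing note on how the devmem RX path above is observed by the rest of the stack: attaching a net_iov frag (as gve_skb_add_rx_frag() now does via skb_add_rx_frag_netmem()) is expected to mark the skb unreadable, and skb_frags_readable() is the check used before any CPU access to the payload. It is the same check the completion path uses to bump rx_devmem_pkt, and the reason the copybreak copy is skipped for these buffers. A minimal sketch of that relationship (illustrative only; example_* is a made-up name):

	/* Roughly: skb_frags_readable() reports whether every frag is plain host
	 * memory.  It turns false once a net_iov (dma-buf backed) frag is
	 * attached, so anything that would copy or parse payload with the CPU
	 * must bail out first.
	 */
	static bool example_payload_cpu_accessible(const struct sk_buff *skb)
	{
		return skb_frags_readable(skb);
	}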