Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue
Tony Nguyen says:
====================
ice: fix Rx data path for heavy 9k MTU traffic
Maciej Fijalkowski says:
This patchset fixes a pretty nasty issue that was reported by Red Hat
folks which occurred after ~30 minutes (this value varied; the point
is that it was not observed immediately but rather after a
considerably longer amount of time) when the ice driver was tortured with
jumbo frames via a mix of iperf traffic executed simultaneously with
wrk/nginx on client/server sides (HTTP and TCP workloads basically).
The reported splats spanned all the bad things that can
happen to the state of a page - refcount underflow, use-after-free, etc.
One of these looked as follows:
[ 2084.019891] BUG: Bad page state in process swapper/34 pfn:97fcd0
[ 2084.025990] page:00000000a60ee772 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x97fcd0
[ 2084.035462] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[ 2084.041990] raw: 0017ffffc0000000 dead000000000100 dead000000000122 0000000000000000
[ 2084.049730] raw: 0000000000000000 0000000000000000 ffffffffffffffff 0000000000000000
[ 2084.057468] page dumped because: nonzero _refcount
[ 2084.062260] Modules linked in: bonding tls sunrpc intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common i10nm_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm mgag200 irqd
[ 2084.137829] CPU: 34 PID: 0 Comm: swapper/34 Kdump: loaded Not tainted 5.14.0-427.37.1.el9_4.x86_64 #1
[ 2084.147039] Hardware name: Dell Inc. PowerEdge R750/0216NK, BIOS 1.13.2 12/19/2023
[ 2084.154604] Call Trace:
[ 2084.157058] <IRQ>
[ 2084.159080] dump_stack_lvl+0x34/0x48
[ 2084.162752] bad_page.cold+0x63/0x94
[ 2084.166333] check_new_pages+0xb3/0xe0
[ 2084.170083] rmqueue_bulk+0x2d2/0x9e0
[ 2084.173749] ? ktime_get+0x35/0xa0
[ 2084.177159] rmqueue_pcplist+0x13b/0x210
[ 2084.181081] rmqueue+0x7d3/0xd40
[ 2084.184316] ? xas_load+0x9/0xa0
[ 2084.187547] ? xas_find+0x183/0x1d0
[ 2084.191041] ? xa_find_after+0xd0/0x130
[ 2084.194879] ? intel_iommu_iotlb_sync_map+0x89/0xe0
[ 2084.199759] get_page_from_freelist+0x11f/0x530
[ 2084.204291] __alloc_pages+0xf2/0x250
[ 2084.207958] ice_alloc_rx_bufs+0xcc/0x1c0 [ice]
[ 2084.212543] ice_clean_rx_irq+0x631/0xa20 [ice]
[ 2084.217111] ice_napi_poll+0xdf/0x2a0 [ice]
[ 2084.221330] __napi_poll+0x27/0x170
[ 2084.224824] net_rx_action+0x233/0x2f0
[ 2084.228575] __do_softirq+0xc7/0x2ac
[ 2084.232155] __irq_exit_rcu+0xa1/0xc0
[ 2084.235821] common_interrupt+0x80/0xa0
[ 2084.239662] </IRQ>
[ 2084.241768] <TASK>
The fix is mostly about reverting what was done in commit 1dc1a7e7f4
("ice: Centrallize Rx buffer recycling"), followed by proper timing of
page_count() storage and then removing the ice_rx_buf::act related logic
(which was mostly introduced for the purposes of the cited commit).
Special thanks to Xu Du for providing reproducer and Jacob Keller for
initial extensive analysis.
* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue:
ice: stop storing XDP verdict within ice_rx_buf
ice: gather page_count()'s of each frag right before XDP prog call
ice: put Rx buffers after being done with current frame
====================
Link: https://patch.msgid.link/20250131185415.3741532-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
88be092224
3 changed files with 103 additions and 91 deletions
|
@ -527,15 +527,14 @@ err:
|
|||
* @xdp: xdp_buff used as input to the XDP program
|
||||
* @xdp_prog: XDP program to run
|
||||
* @xdp_ring: ring to be used for XDP_TX action
|
||||
* @rx_buf: Rx buffer to store the XDP action
|
||||
* @eop_desc: Last descriptor in packet to read metadata from
|
||||
*
|
||||
* Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
|
||||
*/
|
||||
static void
|
||||
static u32
|
||||
ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
|
||||
struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
|
||||
struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc)
|
||||
union ice_32b_rx_flex_desc *eop_desc)
|
||||
{
|
||||
unsigned int ret = ICE_XDP_PASS;
|
||||
u32 act;
|
||||
|
@ -574,7 +573,7 @@ out_failure:
|
|||
ret = ICE_XDP_CONSUMED;
|
||||
}
|
||||
exit:
|
||||
ice_set_rx_bufs_act(xdp, rx_ring, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -860,10 +859,8 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
|
|||
xdp_buff_set_frags_flag(xdp);
|
||||
}
|
||||
|
||||
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
|
||||
ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
|
||||
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS))
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
|
||||
rx_buf->page_offset, size);
|
||||
|
@ -924,7 +921,6 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
|
|||
struct ice_rx_buf *rx_buf;
|
||||
|
||||
rx_buf = &rx_ring->rx_buf[ntc];
|
||||
rx_buf->pgcnt = page_count(rx_buf->page);
|
||||
prefetchw(rx_buf->page);
|
||||
|
||||
if (!size)
|
||||
|
@ -940,6 +936,31 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
|
|||
return rx_buf;
|
||||
}
|
||||
|
||||
/**
|
||||
* ice_get_pgcnts - grab page_count() for gathered fragments
|
||||
* @rx_ring: Rx descriptor ring to store the page counts on
|
||||
*
|
||||
* This function is intended to be called right before running XDP
|
||||
* program so that the page recycling mechanism will be able to take
|
||||
* a correct decision regarding underlying pages; this is done in such
|
||||
* way as XDP program can change the refcount of page
|
||||
*/
|
||||
static void ice_get_pgcnts(struct ice_rx_ring *rx_ring)
|
||||
{
|
||||
u32 nr_frags = rx_ring->nr_frags + 1;
|
||||
u32 idx = rx_ring->first_desc;
|
||||
struct ice_rx_buf *rx_buf;
|
||||
u32 cnt = rx_ring->count;
|
||||
|
||||
for (int i = 0; i < nr_frags; i++) {
|
||||
rx_buf = &rx_ring->rx_buf[idx];
|
||||
rx_buf->pgcnt = page_count(rx_buf->page);
|
||||
|
||||
if (++idx == cnt)
|
||||
idx = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ice_build_skb - Build skb around an existing buffer
|
||||
* @rx_ring: Rx descriptor ring to transact packets on
|
||||
|
@ -1051,12 +1072,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
|
|||
rx_buf->page_offset + headlen, size,
|
||||
xdp->frame_sz);
|
||||
} else {
|
||||
/* buffer is unused, change the act that should be taken later
|
||||
* on; data was copied onto skb's linear part so there's no
|
||||
/* buffer is unused, restore biased page count in Rx buffer;
|
||||
* data was copied onto skb's linear part so there's no
|
||||
* need for adjusting page offset and we can reuse this buffer
|
||||
* as-is
|
||||
*/
|
||||
rx_buf->act = ICE_SKB_CONSUMED;
|
||||
rx_buf->pagecnt_bias++;
|
||||
}
|
||||
|
||||
if (unlikely(xdp_buff_has_frags(xdp))) {
|
||||
|
@ -1103,6 +1124,65 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
|
|||
rx_buf->page = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags
|
||||
* @rx_ring: Rx ring with all the auxiliary data
|
||||
* @xdp: XDP buffer carrying linear + frags part
|
||||
* @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage
|
||||
* @ntc: a current next_to_clean value to be stored at rx_ring
|
||||
* @verdict: return code from XDP program execution
|
||||
*
|
||||
* Walk through gathered fragments and satisfy internal page
|
||||
* recycle mechanism; we take here an action related to verdict
|
||||
* returned by XDP program;
|
||||
*/
|
||||
static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
|
||||
u32 *xdp_xmit, u32 ntc, u32 verdict)
|
||||
{
|
||||
u32 nr_frags = rx_ring->nr_frags + 1;
|
||||
u32 idx = rx_ring->first_desc;
|
||||
u32 cnt = rx_ring->count;
|
||||
u32 post_xdp_frags = 1;
|
||||
struct ice_rx_buf *buf;
|
||||
int i;
|
||||
|
||||
if (unlikely(xdp_buff_has_frags(xdp)))
|
||||
post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags;
|
||||
|
||||
for (i = 0; i < post_xdp_frags; i++) {
|
||||
buf = &rx_ring->rx_buf[idx];
|
||||
|
||||
if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) {
|
||||
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
||||
*xdp_xmit |= verdict;
|
||||
} else if (verdict & ICE_XDP_CONSUMED) {
|
||||
buf->pagecnt_bias++;
|
||||
} else if (verdict == ICE_XDP_PASS) {
|
||||
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
||||
}
|
||||
|
||||
ice_put_rx_buf(rx_ring, buf);
|
||||
|
||||
if (++idx == cnt)
|
||||
idx = 0;
|
||||
}
|
||||
/* handle buffers that represented frags released by XDP prog;
|
||||
* for these we keep pagecnt_bias as-is; refcount from struct page
|
||||
* has been decremented within XDP prog and we do not have to increase
|
||||
* the biased refcnt
|
||||
*/
|
||||
for (; i < nr_frags; i++) {
|
||||
buf = &rx_ring->rx_buf[idx];
|
||||
ice_put_rx_buf(rx_ring, buf);
|
||||
if (++idx == cnt)
|
||||
idx = 0;
|
||||
}
|
||||
|
||||
xdp->data = NULL;
|
||||
rx_ring->first_desc = ntc;
|
||||
rx_ring->nr_frags = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
|
||||
* @rx_ring: Rx descriptor ring to transact packets on
|
||||
|
@ -1120,15 +1200,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
|
|||
unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
|
||||
unsigned int offset = rx_ring->rx_offset;
|
||||
struct xdp_buff *xdp = &rx_ring->xdp;
|
||||
u32 cached_ntc = rx_ring->first_desc;
|
||||
struct ice_tx_ring *xdp_ring = NULL;
|
||||
struct bpf_prog *xdp_prog = NULL;
|
||||
u32 ntc = rx_ring->next_to_clean;
|
||||
u32 cached_ntu, xdp_verdict;
|
||||
u32 cnt = rx_ring->count;
|
||||
u32 xdp_xmit = 0;
|
||||
u32 cached_ntu;
|
||||
bool failure;
|
||||
u32 first;
|
||||
|
||||
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
|
||||
if (xdp_prog) {
|
||||
|
@ -1190,6 +1268,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
|
|||
xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
|
||||
xdp_buff_clear_frags_flag(xdp);
|
||||
} else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) {
|
||||
ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED);
|
||||
break;
|
||||
}
|
||||
if (++ntc == cnt)
|
||||
|
@ -1199,15 +1278,15 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
|
|||
if (ice_is_non_eop(rx_ring, rx_desc))
|
||||
continue;
|
||||
|
||||
ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc);
|
||||
if (rx_buf->act == ICE_XDP_PASS)
|
||||
ice_get_pgcnts(rx_ring);
|
||||
xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc);
|
||||
if (xdp_verdict == ICE_XDP_PASS)
|
||||
goto construct_skb;
|
||||
total_rx_bytes += xdp_get_buff_len(xdp);
|
||||
total_rx_pkts++;
|
||||
|
||||
xdp->data = NULL;
|
||||
rx_ring->first_desc = ntc;
|
||||
rx_ring->nr_frags = 0;
|
||||
ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);
|
||||
|
||||
continue;
|
||||
construct_skb:
|
||||
if (likely(ice_ring_uses_build_skb(rx_ring)))
|
||||
|
@ -1217,18 +1296,12 @@ construct_skb:
|
|||
/* exit if we failed to retrieve a buffer */
|
||||
if (!skb) {
|
||||
rx_ring->ring_stats->rx_stats.alloc_page_failed++;
|
||||
rx_buf->act = ICE_XDP_CONSUMED;
|
||||
if (unlikely(xdp_buff_has_frags(xdp)))
|
||||
ice_set_rx_bufs_act(xdp, rx_ring,
|
||||
ICE_XDP_CONSUMED);
|
||||
xdp->data = NULL;
|
||||
rx_ring->first_desc = ntc;
|
||||
rx_ring->nr_frags = 0;
|
||||
break;
|
||||
xdp_verdict = ICE_XDP_CONSUMED;
|
||||
}
|
||||
xdp->data = NULL;
|
||||
rx_ring->first_desc = ntc;
|
||||
rx_ring->nr_frags = 0;
|
||||
ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);
|
||||
|
||||
if (!skb)
|
||||
break;
|
||||
|
||||
stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
|
||||
if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
|
||||
|
@ -1257,23 +1330,6 @@ construct_skb:
|
|||
total_rx_pkts++;
|
||||
}
|
||||
|
||||
first = rx_ring->first_desc;
|
||||
while (cached_ntc != first) {
|
||||
struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc];
|
||||
|
||||
if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) {
|
||||
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
||||
xdp_xmit |= buf->act;
|
||||
} else if (buf->act & ICE_XDP_CONSUMED) {
|
||||
buf->pagecnt_bias++;
|
||||
} else if (buf->act == ICE_XDP_PASS) {
|
||||
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
|
||||
}
|
||||
|
||||
ice_put_rx_buf(rx_ring, buf);
|
||||
if (++cached_ntc >= cnt)
|
||||
cached_ntc = 0;
|
||||
}
|
||||
rx_ring->next_to_clean = ntc;
|
||||
/* return up to cleaned_count buffers to hardware */
|
||||
failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring));
|
||||
|
|
|
@ -201,7 +201,6 @@ struct ice_rx_buf {
|
|||
struct page *page;
|
||||
unsigned int page_offset;
|
||||
unsigned int pgcnt;
|
||||
unsigned int act;
|
||||
unsigned int pagecnt_bias;
|
||||
};
|
||||
|
||||
|
|
|
@ -5,49 +5,6 @@
|
|||
#define _ICE_TXRX_LIB_H_
|
||||
#include "ice.h"
|
||||
|
||||
/**
|
||||
* ice_set_rx_bufs_act - propagate Rx buffer action to frags
|
||||
* @xdp: XDP buffer representing frame (linear and frags part)
|
||||
* @rx_ring: Rx ring struct
|
||||
* act: action to store onto Rx buffers related to XDP buffer parts
|
||||
*
|
||||
* Set action that should be taken before putting Rx buffer from first frag
|
||||
* to the last.
|
||||
*/
|
||||
static inline void
|
||||
ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
|
||||
const unsigned int act)
|
||||
{
|
||||
u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
|
||||
u32 nr_frags = rx_ring->nr_frags + 1;
|
||||
u32 idx = rx_ring->first_desc;
|
||||
u32 cnt = rx_ring->count;
|
||||
struct ice_rx_buf *buf;
|
||||
|
||||
for (int i = 0; i < nr_frags; i++) {
|
||||
buf = &rx_ring->rx_buf[idx];
|
||||
buf->act = act;
|
||||
|
||||
if (++idx == cnt)
|
||||
idx = 0;
|
||||
}
|
||||
|
||||
/* adjust pagecnt_bias on frags freed by XDP prog */
|
||||
if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
|
||||
u32 delta = rx_ring->nr_frags - sinfo_frags;
|
||||
|
||||
while (delta) {
|
||||
if (idx == 0)
|
||||
idx = cnt - 1;
|
||||
else
|
||||
idx--;
|
||||
buf = &rx_ring->rx_buf[idx];
|
||||
buf->pagecnt_bias--;
|
||||
delta--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* ice_test_staterr - tests bits in Rx descriptor status and error fields
|
||||
* @status_err_n: Rx descriptor status_error0 or status_error1 bits
|
||||
|
|
Loading…
Add table
Reference in a new issue