linux/drivers/net/ethernet/google/gve/gve_desc.h
Willem de Bruijn 497dbb2b97 gve: Add optional metadata descriptor type GVE_TXD_MTD
Allow drivers to pass metadata along with packet data to the device.
Introduce a new metadata descriptor type

* GVE_TXD_MTD

This descriptor is optional. If present, it immediately follows the
packet descriptor and precedes the segment descriptor.

This descriptor may be repeated: multiple metadata descriptors may
follow. There are no immediate uses for this; it is future
proofing. At present, devices allow only one MTD descriptor.
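
For illustration, the intended on-ring order for one packet carrying
metadata might look like this (a sketch; the ring array and index are
hypothetical, not code from this patch):

  ring[i + 0] = pkt_desc;  /* GVE_TXD_STD or GVE_TXD_TSO */
  ring[i + 1] = mtd_desc;  /* optional GVE_TXD_MTD */
  ring[i + 2] = seg_desc;  /* GVE_TXD_SEG, for TSO packets */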

The lower four bits of the type_flags field encode GVE_TXD_MTD.
The upper four bits of the type_flags field encode a *sub*type.

Introduce one such metadata descriptor subtype

* GVE_MTD_SUBTYPE_PATH

This shares path information with the device for network failure
discovery and robust response:

Linux derives the IPv6 flowlabel and ECMP multipath hash from
sk->sk_txhash, and updates that field on error with sk_rethink_txhash.
Allow the host stack to do the same. Pass the tx_hash value if set.
Also communicate whether the path hash is set, or more exactly, what
its type is. Define two common types:

  GVE_MTD_PATH_HASH_NONE
  GVE_MTD_PATH_HASH_L4
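
As a minimal sketch, a transmit path might fill the descriptor like
this, assuming a local struct gve_tx_mtd_desc mtd and a socket buffer
skb (illustrative only, not code from this patch):

  mtd.type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH;
  if (skb->l4_hash) {
          mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4;
          mtd.path_hash = cpu_to_be32(skb->hash);
  } else {
          mtd.path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_NONE;
          mtd.path_hash = 0;
  }
  mtd.reserved0 = 0;
  mtd.reserved1 = 0;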

Concrete examples of error conditions that are resolved this way are
mentioned in the commits that add sk_rethink_txhash calls, such as
commit 7788174e87 ("tcp: change IPv6 flow-label upon receiving
spurious retransmission").

Experimental results mirror what the theory suggests: where the IPv6
flowlabel is included in path selection (e.g., LAG/ECMP), flowlabel
rotation on TCP timeout avoids the vast majority of TCP disconnects
that would otherwise have occurred during link failures in long-haul
backbones, when an alternative path is available.

Rotation can be applied in response to various bad-connection signals,
such as timeouts and spurious retransmissions. In aggregate, such
flow-level signals can help locate network issues. Define initial
common states:

  GVE_MTD_PATH_STATE_DEFAULT
  GVE_MTD_PATH_STATE_TIMEOUT
  GVE_MTD_PATH_STATE_CONGESTION
  GVE_MTD_PATH_STATE_RETRANSMIT
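
A hypothetical mapping of such signals onto these states (the
PATH_SIGNAL_* names are invented for illustration; this patch only
defines the constants):

  switch (signal) {
  case PATH_SIGNAL_RTO:
          state = GVE_MTD_PATH_STATE_TIMEOUT;
          break;
  case PATH_SIGNAL_ECN_CE:
          state = GVE_MTD_PATH_STATE_CONGESTION;
          break;
  case PATH_SIGNAL_SPURIOUS_RTX:
          state = GVE_MTD_PATH_STATE_RETRANSMIT;
          break;
  default:
          state = GVE_MTD_PATH_STATE_DEFAULT;
  }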

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David Awogbemila <awogbemila@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2021-12-16 10:41:54 +00:00


/* SPDX-License-Identifier: (GPL-2.0 OR MIT)
 * Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2019 Google, Inc.
 */

/* GVE Transmit Descriptor formats */

#ifndef _GVE_DESC_H_
#define _GVE_DESC_H_

#include <linux/build_bug.h>

/* A note on seg_addrs
 *
 * Base addresses encoded in seg_addr are not assumed to be physical
 * addresses. The ring format assumes these come from some linear address
 * space. This could be physical memory, kernel virtual memory, or user
 * virtual memory.
 * If raw dma addressing is not supported then gVNIC uses lists of registered
 * pages. Each queue is assumed to be associated with a single such linear
 * address space to ensure a consistent meaning for seg_addrs posted to its
 * rings.
 */

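/* Illustrative sketch (not part of this header's ABI): how a driver
 * might encode seg_addr under the two schemes above. raw_addressing,
 * dma_addr and qpl_offset are hypothetical names.
 *
 *	if (raw_addressing)
 *		desc.seg_addr = cpu_to_be64(dma_addr);
 *	else
 *		desc.seg_addr = cpu_to_be64(qpl_offset);
 */
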
struct gve_tx_pkt_desc {
	u8	type_flags;	/* desc type is lower 4 bits, flags upper */
	u8	l4_csum_offset;	/* relative offset of L4 csum word */
	u8	l4_hdr_offset;	/* Offset of start of L4 headers in packet */
	u8	desc_cnt;	/* Total descriptors for this packet */
	__be16	len;		/* Total length of this packet (in bytes) */
	__be16	seg_len;	/* Length of this descriptor's segment */
	__be64	seg_addr;	/* Base address (see note) of this segment */
} __packed;

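/* Illustrative example (a sketch, not the driver's exact code) of a
 * standard descriptor for a linear packet that wants L4 checksum
 * offload. That the offsets are in 2-byte units is an assumption by
 * analogy with l3_offset below; csum_offset, hdr_offset, skb and addr
 * are hypothetical locals.
 *
 *	pkt.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;
 *	pkt.l4_csum_offset = csum_offset >> 1;
 *	pkt.l4_hdr_offset = hdr_offset >> 1;
 *	pkt.desc_cnt = 1;
 *	pkt.len = cpu_to_be16(skb->len);
 *	pkt.seg_len = cpu_to_be16(skb->len);
 *	pkt.seg_addr = cpu_to_be64(addr);
 */
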
struct gve_tx_mtd_desc {
	u8	type_flags;	/* type is lower 4 bits, subtype upper */
	u8	path_state;	/* state is lower 4 bits, hash type upper */
	__be16	reserved0;
	__be32	path_hash;
	__be64	reserved1;
} __packed;

struct gve_tx_seg_desc {
	u8	type_flags;	/* type is lower 4 bits, flags upper */
	u8	l3_offset;	/* TSO: 2 byte units to start of IPH */
	__be16	reserved;
	__be16	mss;		/* TSO MSS */
	__be16	seg_len;
	__be64	seg_addr;
} __packed;

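/* Illustrative sketch of a TSO segment descriptor (not the driver's
 * exact code; skb, len, addr and ipv6 are hypothetical locals):
 *
 *	seg.type_flags = GVE_TXD_SEG | (ipv6 ? GVE_TXSF_IPV6 : 0);
 *	seg.l3_offset = skb_network_offset(skb) >> 1;
 *	seg.mss = cpu_to_be16(skb_shinfo(skb)->gso_size);
 *	seg.seg_len = cpu_to_be16(len);
 *	seg.seg_addr = cpu_to_be64(addr);
 */
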
/* GVE Transmit Descriptor Types */
#define GVE_TXD_STD	(0x0 << 4)	/* Std with Host Address */
#define GVE_TXD_TSO	(0x1 << 4)	/* TSO with Host Address */
#define GVE_TXD_SEG	(0x2 << 4)	/* Seg with Host Address */
#define GVE_TXD_MTD	(0x3 << 4)	/* Metadata */

/* GVE Transmit Descriptor Flags for Std Pkts */
#define GVE_TXF_L4CSUM	BIT(0)	/* Need csum offload */
#define GVE_TXF_TSTAMP	BIT(2)	/* Timestamp required */

/* GVE Transmit Descriptor Flags for TSO Segs */
#define GVE_TXSF_IPV6	BIT(1)	/* IPv6 TSO */

/* GVE Transmit Descriptor Options for MTD Segs */
#define GVE_MTD_SUBTYPE_PATH		0

#define GVE_MTD_PATH_STATE_DEFAULT	0
#define GVE_MTD_PATH_STATE_TIMEOUT	1
#define GVE_MTD_PATH_STATE_CONGESTION	2
#define GVE_MTD_PATH_STATE_RETRANSMIT	3

#define GVE_MTD_PATH_HASH_NONE		(0x0 << 4)
#define GVE_MTD_PATH_HASH_L4		(0x1 << 4)

/* GVE Receive Packet Descriptor */
/* The start of an ethernet packet comes 2 bytes into the rx buffer.
 * gVNIC adds this padding so that both the DMA access and the L3/4
 * protocol header access are aligned.
 */
#define GVE_RX_PAD 2

struct gve_rx_desc {
	u8	padding[48];
	__be32	rss_hash;	/* Receive-side scaling hash (Toeplitz for gVNIC) */
	__be16	mss;
	__be16	reserved;	/* Reserved to zero */
	u8	hdr_len;	/* Header length (L2-L4) including padding */
	u8	hdr_off;	/* 64-byte-scaled offset into RX_DATA entry */
	__sum16	csum;		/* 1's-complement partial checksum of L3+ bytes */
	__be16	len;		/* Length of the received packet */
	__be16	flags_seq;	/* Flags [15:3] and sequence number [2:0] (1-7) */
} __packed;
static_assert(sizeof(struct gve_rx_desc) == 64);

/* If the device supports raw dma addressing then the addr in data slot is
 * the dma address of the buffer.
 * If the device only supports registered segments then the addr is a byte
 * offset into the registered segment (an ordered list of pages) where the
 * buffer is.
 */
union gve_rx_data_slot {
	__be64 qpl_offset;
	__be64 addr;
};

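/* Illustrative (raw_addressing, dma_addr and offset are hypothetical):
 * the driver writes whichever member matches the negotiated mode.
 *
 *	if (raw_addressing)
 *		slot.addr = cpu_to_be64(dma_addr);
 *	else
 *		slot.qpl_offset = cpu_to_be64(offset);
 */
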
/* GVE Receive Packet Descriptor Seq No */
#define GVE_SEQNO(x) (be16_to_cpu(x) & 0x7)

/* GVE Receive Packet Descriptor Flags */
#define GVE_RXFLG(x)		cpu_to_be16(1 << (3 + (x)))
#define GVE_RXF_FRAG		GVE_RXFLG(3)	/* IP Fragment */
#define GVE_RXF_IPV4		GVE_RXFLG(4)	/* IPv4 */
#define GVE_RXF_IPV6		GVE_RXFLG(5)	/* IPv6 */
#define GVE_RXF_TCP		GVE_RXFLG(6)	/* TCP Packet */
#define GVE_RXF_UDP		GVE_RXFLG(7)	/* UDP Packet */
#define GVE_RXF_ERR		GVE_RXFLG(8)	/* Packet Error Detected */
#define GVE_RXF_PKT_CONT	GVE_RXFLG(10)	/* Multi Fragment RX packet */

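/* Illustrative RX poll sketch (not the driver's exact logic): a
 * descriptor is ready when its 3-bit sequence number matches the one
 * the driver expects next; expected_seqno is a hypothetical local.
 *
 *	if (GVE_SEQNO(desc->flags_seq) != expected_seqno)
 *		return;			(nothing new to process)
 *	if (desc->flags_seq & GVE_RXF_ERR)
 *		count the error and drop the packet;
 *	len = be16_to_cpu(desc->len) - GVE_RX_PAD;
 *	expected_seqno = gve_next_seqno(expected_seqno);
 */
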
/* GVE IRQ */
#define GVE_IRQ_ACK	BIT(31)
#define GVE_IRQ_MASK	BIT(30)
#define GVE_IRQ_EVENT	BIT(29)

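/* Illustrative use (irq_doorbell is a hypothetical iomem pointer): a
 * hard irq handler might mask the vector, with poll acking and
 * re-arming it once work is done.
 *
 *	iowrite32be(GVE_IRQ_MASK, irq_doorbell);
 *	iowrite32be(GVE_IRQ_ACK | GVE_IRQ_EVENT, irq_doorbell);
 */
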
static inline bool gve_needs_rss(__be16 flag)
{
	if (flag & GVE_RXF_FRAG)
		return false;
	if (flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
		return true;
	return false;
}

/* Sequence numbers occupy 3 bits and cycle through 1-7, skipping 0 */
static inline u8 gve_next_seqno(u8 seq)
{
	return (seq + 1) == 8 ? 1 : seq + 1;
}

#endif /* _GVE_DESC_H_ */