On 11/01/2019 01:34, Darrell Ball wrote: > thanks; not a full review Thanks, responses in-line.
> > On Wed, Jan 9, 2019 at 10:06 AM Tiago Lam <tiago....@intel.com > <mailto:tiago....@intel.com>> wrote: > > Previous commits have added support to the dp_packet API to handle > multi-segmented packets, where data is not stored contiguously in > memory. However, in some cases, it is inevitable and data must be > provided contiguously. Examples of such cases are when performing csums > over the entire packet data, or when write()'ing to a file descriptor > (for a tap interface, for example). For such cases, the dp_packet API > has been extended to provide a way to transform a multi-segmented > DPBUF_DPDK packet into a DPBUF_MALLOC system packet (at the expense of a > copy of memory). If the packet's data is already stored in memory > contigously then there's no need to convert the packet. > > Thus, the main use cases that were assuming that a dp_packet's data is > always held contiguously in memory were changed to make use of the new > "linear functions" in the dp_packet API when there's a need to traverse > the entire's packet data. Per the example above, when the packet's data > needs to be write() to the tap's file descriptor, or when the conntrack > module needs to verify a packet's checksum, the data is now linearized. > > Additionally, the miniflow_extract() function has been modified to check > if the respective packet headers don't span across multiple mbufs. This > requirement is needed to guarantee that callers can assume headers are > always in contiguous memory. > > Signed-off-by: Tiago Lam <tiago....@intel.com > <mailto:tiago....@intel.com>> > --- > lib/bfd.c | 3 +- > lib/conntrack-private.h | 4 +- > lib/conntrack.c | 43 +++++-- > lib/crc32c.c | 35 ++++-- > lib/crc32c.h | 2 + > lib/dp-packet.c | 18 +++ > lib/dp-packet.h | 267 > ++++++++++++++++++++++++++++++------------ > lib/dpif-netdev.c | 13 +- > lib/dpif-netlink.c | 5 + > lib/dpif.c | 9 ++ > lib/flow.c | 97 ++++++++++++--- > lib/flow.h | 2 +- > lib/mcast-snooping.c | 2 + > lib/netdev-bsd.c | 5 + > lib/netdev-dummy.c | 13 +- > lib/netdev-linux.c | 13 +- > lib/netdev-native-tnl.c | 26 ++-- > lib/odp-execute.c | 12 +- > lib/ovs-lldp.c | 3 +- > lib/packets.c | 85 ++++++++++++-- > lib/packets.h | 7 ++ > ofproto/ofproto-dpif-upcall.c | 20 +++- > ofproto/ofproto-dpif-xlate.c | 32 ++++- > tests/test-rstp.c | 8 +- > tests/test-stp.c | 8 +- > 25 files changed, 579 insertions(+), 153 deletions(-) > > diff --git a/lib/bfd.c b/lib/bfd.c > index cc8c685..12e076a 100644 > --- a/lib/bfd.c > +++ b/lib/bfd.c > @@ -721,7 +721,8 @@ bfd_process_packet(struct bfd *bfd, const struct > flow *flow, > if (!msg) { > VLOG_INFO_RL(&rl, "%s: Received too-short BFD control > message (only " > "%"PRIdPTR" bytes long, at least %d required).", > - bfd->name, (uint8_t *) dp_packet_tail(p) - l7, > + bfd->name, dp_packet_size(p) - > + (l7 - (uint8_t *) dp_packet_data(p)), > BFD_PACKET_LEN); > goto out; > } > diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h > index a344801..1be2df1 100644 > --- a/lib/conntrack-private.h > +++ b/lib/conntrack-private.h > @@ -159,8 +159,8 @@ tcp_payload_length(struct dp_packet *pkt) > { > const char *tcp_payload = dp_packet_get_tcp_payload(pkt); > if (tcp_payload) { > - return ((char *) dp_packet_tail(pkt) - > dp_packet_l2_pad_size(pkt) > - - tcp_payload); > + return dp_packet_l4_size(pkt) - > + (tcp_payload - (char *) dp_packet_l4(pkt)); > > > why was this change needed ? `dp_packet_get_tcp_payload()` already makes sure the payload is within the a segment, returning NULL otherwise. So, while this change itself isn't needed to make this code path correct, it helps with readability in the future now with the adoption with multi-segment mbufs. Above, the resulting addresses of `dp_packet_tail(pkt)` and `tcp_payload` may not be in the same segment when using multi-segment mbufs. > > > } else { > return 0; > } > diff --git a/lib/conntrack.c b/lib/conntrack.c > index 6f6021a..0dd2dcc 100644 > --- a/lib/conntrack.c > +++ b/lib/conntrack.c > @@ -636,12 +636,22 @@ reverse_pat_packet(struct dp_packet *pkt, > const struct conn *conn) > static void > reverse_nat_packet(struct dp_packet *pkt, const struct conn *conn) > { > - char *tail = dp_packet_tail(pkt); > - char pad = dp_packet_l2_pad_size(pkt); > + char *tail; > + char pad; > struct conn_key inner_key; > const char *inner_l4 = NULL; > - uint16_t orig_l3_ofs = pkt->l3_ofs; > - uint16_t orig_l4_ofs = pkt->l4_ofs; > + uint16_t orig_l3_ofs; > + uint16_t orig_l4_ofs; > + > + /* We need the whole packet to parse the packet below */ > + if (!dp_packet_is_linear(pkt)) { > + dp_packet_linearize(pkt); > + } > + > + tail = dp_packet_tail(pkt); > + pad = dp_packet_l2_pad_size(pkt); > + orig_l3_ofs = pkt->l3_ofs; > + orig_l4_ofs = pkt->l4_ofs; > > > if (conn->key.dl_type == htons(ETH_TYPE_IP)) { > struct ip_header *nh = dp_packet_l3(pkt); > @@ -1323,6 +1333,7 @@ conntrack_execute(struct conntrack *ct, struct > dp_packet_batch *pkt_batch, > write_ct_md(packet, zone, NULL, NULL, NULL); > continue; > } > + > > > pls don't make whitespace changes, especially for large patchsets Sure. Will remove. > > > process_one(ct, packet, &ctx, zone, force, commit, now, > setmark, > setlabel, nat_action_info, tp_src, tp_dst, helper); > } > @@ -1904,9 +1915,18 @@ static bool > conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, > ovs_be16 dl_type, > struct conn_lookup_ctx *ctx, uint16_t zone) > { > - const struct eth_header *l2 = dp_packet_eth(pkt); > - const struct ip_header *l3 = dp_packet_l3(pkt); > - const char *l4 = dp_packet_l4(pkt); > + const struct eth_header *l2; > + const struct ip_header *l3; > + const char *l4; > + > + /* We need the whole packet to parse the packet below */ > + if (!dp_packet_is_linear(pkt)) { > + dp_packet_linearize(pkt); > + } > + > + l2 = dp_packet_eth(pkt); > + l3 = dp_packet_l3(pkt); > + l4 = dp_packet_l4(pkt); > > > iirc, the minimum segment size is 2k > it seems a wrapper around csum_continue() is what is needed; is > packet_csum_continue() the right api ? Likely. The analysis and implementation of moving conntrack.c to be compliant with multi-segment mbufs hasn't been done for this patchset - other than "linearize the packet once it gets to conntrack". This is expected to have a performance penalty, if one uses the Userspace connection tracker *and enables* multi-segment mbufs. This is also mentioned in Documentation/topics/dpdk/jumbo-frames.rst. > > > > memset(ctx, 0, sizeof *ctx); > > @@ -3174,7 +3194,7 @@ handle_ftp_ctl(struct conntrack *ct, const > struct conn_lookup_ctx *ctx, > const struct conn *conn_for_expectation, > long long now, enum ftp_ctl_pkt ftp_ctl, bool nat) > { > - struct ip_header *l3_hdr = dp_packet_l3(pkt); > + struct ip_header *l3_hdr; > ovs_be32 v4_addr_rep = 0; > struct ct_addr v6_addr_rep; > size_t addr_offset_from_ftp_data_start = 0; > @@ -3183,6 +3203,13 @@ handle_ftp_ctl(struct conntrack *ct, const > struct conn_lookup_ctx *ctx, > bool do_seq_skew_adj = true; > enum ct_alg_mode mode = CT_FTP_MODE_ACTIVE; > > + /* We need the whole packet to parse the packet below */ > + if (!dp_packet_is_linear(pkt)) { > + dp_packet_linearize(pkt); > + } > + > + l3_hdr = dp_packet_l3(pkt); > + > if (detect_ftp_ctl_type(ctx, pkt) != ftp_ctl) { > return; > } > diff --git a/lib/crc32c.c b/lib/crc32c.c > index e8dd6ee..dd5bb9a 100644 > --- a/lib/crc32c.c > +++ b/lib/crc32c.c > @@ -132,28 +132,39 @@ static const uint32_t crc32Table[256] = { > 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L > }; > > -/* > - * Compute a CRC32c checksum as per the SCTP requirements in > RFC4960. This > - * includes beginning with a checksum of all ones, and returning > the negated > - * CRC. Unlike the RFC, we return the checksum in network byte-order. > - */ > -ovs_be32 > -crc32c(const uint8_t *data, size_t size) > +uint32_t > +crc32c_continue(uint32_t partial, const uint8_t *data, size_t size) > { > - uint32_t crc = 0xffffffffL; > - > while (size--) { > - crc = crc32Table[(crc ^ *data++) & 0xff] ^ (crc >> 8); > + partial = crc32Table[(partial ^ *data++) & 0xff] ^ (partial > >> 8); > } > > + return partial; > +} > + > +ovs_be32 > +crc32c_finish(uint32_t partial) > +{ > /* The result of this CRC calculation provides us a value in > the reverse > * byte-order as compared with our architecture. On big-endian > systems, > * this is opposite to our return type. So, to return a big-endian > * value, we must swap the byte-order. */ > #if defined(WORDS_BIGENDIAN) > - crc = uint32_byteswap(crc); > + crc = uint32_byteswap(partial); > #endif > > /* Our value is in network byte-order. OVS_FORCE keeps sparse > happy. */ > - return (OVS_FORCE ovs_be32) ~crc; > + return (OVS_FORCE ovs_be32) ~partial; > +} > + > +/* > + * Compute a CRC32c checksum as per the SCTP requirements in > RFC4960. This > + * includes beginning with a checksum of all ones, and returning > the negated > + * CRC. Unlike the RFC, we return the checksum in network byte-order. > + */ > +ovs_be32 > +crc32c(const uint8_t *data, size_t size) > +{ > + uint32_t crc = 0xffffffffL; > + return crc32c_finish(crc32c_continue(crc, data, size)); > } > diff --git a/lib/crc32c.h b/lib/crc32c.h > index 92c7d7f..17c8190 100644 > --- a/lib/crc32c.h > +++ b/lib/crc32c.h > @@ -20,6 +20,8 @@ > > #include "openvswitch/types.h" > > +uint32_t crc32c_continue(uint32_t partial, const uint8_t *data, > size_t size); > +ovs_be32 crc32c_finish(uint32_t partial); > ovs_be32 crc32c(const uint8_t *data, size_t); > > #endif /* crc32c.h */ > diff --git a/lib/dp-packet.c b/lib/dp-packet.c > index 4e4b8fb..32cc881 100644 > --- a/lib/dp-packet.c > +++ b/lib/dp-packet.c > @@ -103,6 +103,9 @@ void > dp_packet_init_dpdk(struct dp_packet *b) > { > b->source = DPBUF_DPDK; > +#ifdef DPDK_NETDEV > + b->mstate = NULL; > +#endif > } > > /* Initializes 'b' as an empty dp_packet with an initial capacity > of 'size' > @@ -120,6 +123,21 @@ dp_packet_uninit(struct dp_packet *b) > if (b) { > if (b->source == DPBUF_MALLOC) { > free(dp_packet_base(b)); > + > +#ifdef DPDK_NETDEV > + /* Packet has been "linearized" */ > + if (b->mstate) { > + b->source = DPBUF_DPDK; > + b->mbuf.buf_addr = b->mstate->addr; > + b->mbuf.buf_len = b->mstate->len; > + b->mbuf.data_off = b->mstate->off; > + > + free(b->mstate); > + b->mstate = NULL; > + > + free_dpdk_buf((struct dp_packet *) b); > + } > +#endif > } else if (b->source == DPBUF_DPDK) { > #ifdef DPDK_NETDEV > /* If this dp_packet was allocated by DPDK it must have > been > diff --git a/lib/dp-packet.h b/lib/dp-packet.h > index a003546..970aaf2 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -27,7 +27,6 @@ > > #include "netdev-dpdk.h" > #include "openvswitch/list.h" > -#include "packets.h" > #include "util.h" > #include "flow.h" > > @@ -46,6 +45,16 @@ enum OVS_PACKED_ENUM dp_packet_source { > > #define DP_PACKET_CONTEXT_SIZE 64 > > +#ifdef DPDK_NETDEV > +/* Struct to save data for when a DPBUF_DPDK packet is converted to > + * DPBUF_MALLOC. */ > +struct mbuf_state { > + void *addr; > + uint16_t len; > + uint16_t off; > +}; > +#endif > + > /* Buffer for holding packet data. A dp_packet is automatically > reallocated > * as necessary if it grows too large for the available memory. > * By default the packet type is set to Ethernet (PT_ETH). > @@ -53,6 +62,7 @@ enum OVS_PACKED_ENUM dp_packet_source { > struct dp_packet { > #ifdef DPDK_NETDEV > struct rte_mbuf mbuf; /* DPDK mbuf */ > + struct mbuf_state *mstate; /* Used when packet has been > "linearized" */ > #else > void *base_; /* First byte of allocated space. */ > uint16_t allocated_; /* Number of bytes allocated. */ > @@ -85,6 +95,9 @@ static inline void dp_packet_set_data(struct > dp_packet *, void *); > static inline void *dp_packet_base(const struct dp_packet *); > static inline void dp_packet_set_base(struct dp_packet *, void *); > > +static inline bool dp_packet_is_linear(const struct dp_packet *); > +static inline void dp_packet_linearize(struct dp_packet *); > + > static inline uint32_t dp_packet_size(const struct dp_packet *); > static inline void dp_packet_set_size(struct dp_packet *, uint32_t); > > @@ -101,6 +114,7 @@ static inline void *dp_packet_l2_5(const struct > dp_packet *); > static inline void dp_packet_set_l2_5(struct dp_packet *, void *); > static inline void *dp_packet_l3(const struct dp_packet *); > static inline void dp_packet_set_l3(struct dp_packet *, void *); > +static inline size_t dp_packet_l3_size(const struct dp_packet *); > static inline void *dp_packet_l4(const struct dp_packet *); > static inline void dp_packet_set_l4(struct dp_packet *, void *); > static inline size_t dp_packet_l4_size(const struct dp_packet *); > @@ -137,6 +151,11 @@ static inline void *dp_packet_at(const struct > dp_packet *, size_t offset, > size_t size); > static inline void *dp_packet_at_assert(const struct dp_packet *, > size_t offset, size_t size); > + > +static inline void > +dp_packet_copy_from_offset(const struct dp_packet *b, size_t offset, > + size_t size, void *buf); > + > #ifdef DPDK_NETDEV > static inline const struct rte_mbuf * > dp_packet_mbuf_from_offset(const struct dp_packet *b, size_t *offset); > @@ -175,21 +194,22 @@ void *dp_packet_steal_data(struct dp_packet *); > static inline bool dp_packet_equal(const struct dp_packet *, > const struct dp_packet *); > > +static inline ssize_t > +dp_packet_read_data(const struct dp_packet *b, size_t offset, > size_t size, > + void **ptr, void *buf); > + > + > > /* Frees memory that 'b' points to, as well as 'b' itself. */ > static inline void > dp_packet_delete(struct dp_packet *b) > { > if (b) { > - if (b->source == DPBUF_DPDK) { > - /* If this dp_packet was allocated by DPDK it must have > been > - * created as a dp_packet */ > - free_dpdk_buf((struct dp_packet*) b); > - return; > - } > - > dp_packet_uninit(b); > - free(b); > + > + if (b->source != DPBUF_DPDK) { > + free(b); > + } > } > } > > @@ -205,7 +225,9 @@ dp_packet_copy_common_members(struct dp_packet > *new_b, > } > > /* If 'b' contains at least 'offset + size' bytes of data, returns > a pointer to > - * byte 'offset'. Otherwise, returns a null pointer. */ > + * byte 'offset'. Otherwise, returns a null pointer. For DPDK > packets, this > + * means the 'offset' + 'size' must fall within the same mbuf (not > necessarily > + * the first mbuf), otherwise null is returned */ > static inline void * > dp_packet_at(const struct dp_packet *b, size_t offset, size_t size) > { > @@ -215,18 +237,15 @@ dp_packet_at(const struct dp_packet *b, size_t > offset, size_t size) > > #ifdef DPDK_NETDEV > if (b->source == DPBUF_DPDK) { > - struct rte_mbuf *buf = CONST_CAST(struct rte_mbuf *, &b->mbuf); > + const struct rte_mbuf *mbuf = dp_packet_mbuf_from_offset(b, > &offset); > > - while (buf && offset > buf->data_len) { > - offset -= buf->data_len; > - > - buf = buf->next; > + if (!mbuf || offset + size > mbuf->data_len) { > + return NULL; > } > > - return buf ? rte_pktmbuf_mtod_offset(buf, char *, offset) : > NULL; > + return rte_pktmbuf_mtod_offset(mbuf, char *, offset); > } > #endif > - > return (char *) dp_packet_data(b) + offset; > } > > @@ -325,20 +344,24 @@ dp_packet_pull(struct dp_packet *b, size_t size) > return data; > } > > -#ifdef DPDK_NETDEV > /* Similar to dp_packet_try_pull() but doesn't actually pull any > data, only > - * checks if it could and returns true or false accordingly. > - * > - * Valid for dp_packets carrying mbufs only. */ > + * checks if it could and returns 'true' or 'false', accordingly. > For DPDK > + * packets, 'true' is only returned in case the 'offset' + 'size' > falls within > + * the first mbuf, otherwise 'false' is returned */ > static inline bool > -dp_packet_mbuf_may_pull(const struct dp_packet *b, size_t size) { > - if (size > b->mbuf.data_len) { > +dp_packet_may_pull(const struct dp_packet *b, uint16_t offset, > size_t size) > +{ > + if (offset == UINT16_MAX) { > + return false; > + } > +#ifdef DPDK_NETDEV > + /* Offset needs to be within the first mbuf */ > + if (offset + size > b->mbuf.data_len) { > return false; > } > - > - return true; > -} > #endif > + return (offset + size > dp_packet_size(b)) ? false : true; > +} > > /* If 'b' has at least 'size' bytes of data, removes that many > bytes from the > * head end of 'b' and returns the first byte removed. Otherwise, > returns a > @@ -347,7 +370,7 @@ static inline void * > dp_packet_try_pull(struct dp_packet *b, size_t size) > { > #ifdef DPDK_NETDEV > - if (!dp_packet_mbuf_may_pull(b, size)) { > + if (!dp_packet_may_pull(b, 0, size)) { > return NULL; > } > #endif > @@ -356,6 +379,39 @@ dp_packet_try_pull(struct dp_packet *b, size_t > size) > ? dp_packet_pull(b, size) : NULL; > } > > +/* Reads 'size' bytes from 'offset' in 'b', linearly, to 'ptr', if > 'buf' is > + * NULL. Otherwise, if a 'buf' is provided, it must have 'size' > bytes, and the > + * data will be copied there, iff it is found to be non-linear. */ > +static inline ssize_t > +dp_packet_read_data(const struct dp_packet *b, size_t offset, > size_t size, > + void **ptr, void *buf) { > + /* Zero copy */ > + if ((*ptr = dp_packet_at(b, offset, size)) != NULL) { > + return 0; > + } > + > + /* Copy available linear data */ > + if (buf == NULL) { > +#ifdef DPDK_NETDEV > + size_t mofs = offset; > + const struct rte_mbuf *mbuf = dp_packet_mbuf_from_offset(b, > &mofs); > + *ptr = dp_packet_at(b, offset, mbuf->data_len - mofs); > + > + return size - (mbuf->data_len - mofs); > +#else > + /* Non-DPDK dp_packets should always hit the above condition */ > + ovs_assert(1); > +#endif > + } > + > + /* Copy all data */ > + > + *ptr = buf; > + dp_packet_copy_from_offset(b, offset, size, buf); > + > + return 0; > +} > + > static inline bool > dp_packet_is_eth(const struct dp_packet *b) > { > @@ -398,12 +454,6 @@ dp_packet_set_l2_pad_size(struct dp_packet *b, > uint8_t pad_size) > static inline void * > dp_packet_l2_5(const struct dp_packet *b) > { > -#ifdef DPDK_NETDEV > - if (!dp_packet_mbuf_may_pull(b, b->l2_5_ofs)) { > - return NULL; > - } > -#endif > - > return b->l2_5_ofs != UINT16_MAX > ? (char *) dp_packet_data(b) + b->l2_5_ofs > : NULL; > @@ -420,12 +470,6 @@ dp_packet_set_l2_5(struct dp_packet *b, void *l2_5) > static inline void * > dp_packet_l3(const struct dp_packet *b) > { > -#ifdef DPDK_NETDEV > - if (!dp_packet_mbuf_may_pull(b, b->l3_ofs)) { > - return NULL; > - } > -#endif > - > return b->l3_ofs != UINT16_MAX > ? (char *) dp_packet_data(b) + b->l3_ofs > : NULL; > @@ -437,15 +481,29 @@ dp_packet_set_l3(struct dp_packet *b, void *l3) > b->l3_ofs = l3 ? (char *) l3 - (char *) dp_packet_data(b) : > UINT16_MAX; > } > > -static inline void * > -dp_packet_l4(const struct dp_packet *b) > +/* Returns the size of the l3 header. Caller must make sure both > l3_ofs and > + * l4_ofs are set*/ > +static inline size_t > +dp_packet_l3h_size(const struct dp_packet *b) > { > -#ifdef DPDK_NETDEV > - if (!dp_packet_mbuf_may_pull(b, b->l4_ofs)) { > - return NULL; > + return b->l4_ofs - b->l3_ofs; > +} > + > +static inline size_t > +dp_packet_l3_size(const struct dp_packet *b) > +{ > + if (!dp_packet_may_pull(b, b->l3_ofs, 0)) { > + return 0; > } > -#endif > > + size_t l3_size = dp_packet_size(b) - b->l3_ofs; > + > + return l3_size - dp_packet_l2_pad_size(b); > +} > + > +static inline void * > +dp_packet_l4(const struct dp_packet *b) > +{ > return b->l4_ofs != UINT16_MAX > ? (char *) dp_packet_data(b) + b->l4_ofs > : NULL; > @@ -460,31 +518,13 @@ dp_packet_set_l4(struct dp_packet *b, void *l4) > static inline size_t > dp_packet_l4_size(const struct dp_packet *b) > { > -#ifdef DPDK_NETDEV > - if (b->source == DPBUF_DPDK) { > - if (!dp_packet_mbuf_may_pull(b, b->l4_ofs)) { > - return 0; > - } > - > - struct rte_mbuf *mbuf = CONST_CAST(struct rte_mbuf *, > &b->mbuf); > - size_t l4_size = mbuf->data_len - b->l4_ofs; > - > - mbuf = mbuf->next; > - while (mbuf) { > - l4_size += mbuf->data_len; > - > - mbuf = mbuf->next; > - } > + if (!dp_packet_may_pull(b, b->l4_ofs, 0)) { > + return 0; > + } > > - l4_size -= dp_packet_l2_pad_size(b); > + size_t l4_size = dp_packet_size(b) - b->l4_ofs; > > - return l4_size; > - } > -#endif > - return b->l4_ofs != UINT16_MAX > - ? (const char *)dp_packet_tail(b) - (const char > *)dp_packet_l4(b) > - - dp_packet_l2_pad_size(b) > - : 0; > + return l4_size - dp_packet_l2_pad_size(b); > } > > static inline const void * > @@ -497,7 +537,8 @@ dp_packet_get_tcp_payload(const struct dp_packet *b) > int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > > if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= > l4_size)) { > - return (const char *)tcp + tcp_len; > + tcp = dp_packet_at(b, b->l4_ofs, tcp_len); > + return (tcp == NULL) ? NULL : tcp + tcp_len; > } > } > return NULL; > @@ -507,21 +548,24 @@ static inline const void * > dp_packet_get_udp_payload(const struct dp_packet *b) > { > return OVS_LIKELY(dp_packet_l4_size(b) >= UDP_HEADER_LEN) > - ? (const char *)dp_packet_l4(b) + UDP_HEADER_LEN : NULL; > + ? (const char *) dp_packet_l4(b) + UDP_HEADER_LEN > + : NULL; > } > > static inline const void * > dp_packet_get_sctp_payload(const struct dp_packet *b) > { > return OVS_LIKELY(dp_packet_l4_size(b) >= SCTP_HEADER_LEN) > - ? (const char *)dp_packet_l4(b) + SCTP_HEADER_LEN : NULL; > + ? (const char *) dp_packet_l4(b) + SCTP_HEADER_LEN > + : NULL; > } > > static inline const void * > dp_packet_get_icmp_payload(const struct dp_packet *b) > { > return OVS_LIKELY(dp_packet_l4_size(b) >= ICMP_HEADER_LEN) > - ? (const char *)dp_packet_l4(b) + ICMP_HEADER_LEN : NULL; > + ? (const char *) dp_packet_l4(b) + ICMP_HEADER_LEN > + : NULL; > } > > static inline const void * > @@ -769,6 +813,7 @@ dp_packet_mbuf_init(struct dp_packet *p) > p->mbuf.ol_flags = p->mbuf.tx_offload = p->mbuf.packet_type = 0; > p->mbuf.nb_segs = 1; > p->mbuf.next = NULL; > + p->mstate = NULL; > } > > static inline bool > @@ -817,6 +862,66 @@ dp_packet_has_flow_mark(struct dp_packet *p, > uint32_t *mark) > return false; > } > > +static inline void > +dp_packet_copy_from_offset(const struct dp_packet *b, size_t offset, > + size_t size, void *buf) { > + if (dp_packet_is_linear(b)) { > + memcpy(buf, (char *)dp_packet_data(b) + offset, size); > + } else { > + const struct rte_mbuf *mbuf = dp_packet_mbuf_from_offset(b, > &offset); > + rte_pktmbuf_read(mbuf, offset, size, buf); > + } > +} > + > +static inline bool > +dp_packet_is_linear(const struct dp_packet *b) > +{ > + if (b->source == DPBUF_DPDK) { > + return rte_pktmbuf_is_contiguous(&b->mbuf); > > > check seems lightweight when RTE_LIBRTE_MBUF_DEBUG is off (by default) > > static inline int rte_pktmbuf_is_contiguous > <https://doc.dpdk.org/api/rte__mbuf_8h.html#a511a116ae4822037d4f1fb561aa4ffcf>(const > struct rte_mbuf <https://doc.dpdk.org/api/structrte__mbuf.html> *m) > 2100 { > 2101 __rte_mbuf_sanity_check > <https://doc.dpdk.org/api/rte__mbuf_8h.html#a93da89e4fcf908c0615bcc71c2998413>(m, > 1); > 2102 return !!(m->nb_segs > <https://doc.dpdk.org/api/structrte__mbuf.html#a44fb29a9283000c42a77b76e9ef0b070> > == 1); > 2103 } > > > #ifdef RTE_LIBRTE_MBUF_DEBUG > 867 > 869 #define __rte_mbuf_sanity_check(m, is_h) rte_mbuf_sanity_check(m, is_h) > 870 > 871 #else /* RTE_LIBRTE_MBUF_DEBUG */ > 872 > 874 > <https://doc.dpdk.org/api/rte__mbuf_8h.html#a93da89e4fcf908c0615bcc71c2998413> > #define > __rte_mbuf_sanity_check(m, is_h) do { } while (0) > > > > /* do some sanity checks on a mbuf: panic if it fails */ void > rte_mbuf_sanity_check(const struct rte_mbuf *m, int is_header) { const > struct rte_mbuf *m_seg; unsigned int nb_segs; if (m == NULL) > rte_panic("mbuf is NULL\n"); /* generic checks */ if (m->pool == NULL) > rte_panic("bad mbuf pool\n"); if (m->buf_iova == 0) rte_panic("bad IO > addr\n"); if (m->buf_addr == NULL) rte_panic("bad virt addr\n"); > uint16_t cnt = rte_mbuf_refcnt_read(m); if ((cnt == 0) || (cnt == > UINT16_MAX)) rte_panic("bad ref cnt\n"); /* nothing to check for > sub-segments */ if (is_header == 0) return; nb_segs = m->nb_segs; m_seg > = m; while (m_seg && nb_segs != 0) { m_seg = m_seg->next; nb_segs--; } > if (nb_segs != 0) rte_panic("bad nb_segs\n"); } > > > > + } > + > + return true; > +} > + > +/* Linearizes the data on packet 'b', by copying the data into > system's memory. > + * After this the packet is effectively a DPBUF_MALLOC packet. The > caller is > + * responsible * for ensuring 'b' needs linearization, by calling > + * dp_packet_is_linear(). > + * > + * This is an expensive operation which should only be performed as > a last > + * resort, when multi-segments are under use but data must be accessed > + * linearly. */ > +static inline void > +dp_packet_linearize(struct dp_packet *b) > +{ > + struct rte_mbuf *mbuf = CONST_CAST(struct rte_mbuf *, &b->mbuf); > + struct dp_packet *pkt = CONST_CAST(struct dp_packet *, b); > + uint32_t pkt_len = dp_packet_size(pkt); > + struct mbuf_state *mstate = NULL; > + void *dst = xmalloc(pkt_len); > + > + /* Copy packet's data to system's memory */ > + if (!rte_pktmbuf_read(mbuf, 0, pkt_len, dst)) { > + return; > + } > + > + /* Free all mbufs except for the first */ > + dp_packet_clear(pkt); > + > + /* Save mbuf's buf_addr to restore later */ > + mstate = xmalloc(sizeof(*mstate)); > + mstate->addr = pkt->mbuf.buf_addr; > + mstate->len = pkt->mbuf.buf_len; > + mstate->off = pkt->mbuf.data_off; > + pkt->mstate = mstate; > + > + /* Tranform DPBUF_DPDK packet into a DPBUF_MALLOC packet */ > + pkt->source = DPBUF_MALLOC; > + pkt->mbuf.buf_addr = dst; > + pkt->mbuf.buf_len = pkt_len; > + pkt->mbuf.data_off = 0; > + dp_packet_set_size(pkt, pkt_len); > +} > #else /* DPDK_NETDEV */ > static inline bool > dp_packet_equal(const struct dp_packet *a, const struct dp_packet *b) > @@ -945,6 +1050,24 @@ dp_packet_has_flow_mark(struct dp_packet *p > OVS_UNUSED, > { > return false; > } > + > +static inline void > +dp_packet_copy_from_offset(const struct dp_packet *b, size_t offset, > + size_t size, void *buf) > +{ > + memcpy(buf, (char *)dp_packet_data(b) + offset, size); > +} > + > +static inline bool > +dp_packet_is_linear(const struct dp_packet *b OVS_UNUSED) > +{ > + return true; > +} > + > +static inline void > +dp_packet_linearize(struct dp_packet *b OVS_UNUSED) > +{ > +} > #endif /* DPDK_NETDEV */ > > static inline void > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 1564db9..1f1188d 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5710,6 +5710,11 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread > *pmd, struct dp_packet *packet_, > .support = dp_netdev_support, > }; > > + /* Gather the whole data for printing the packet (if debug > enabled) */ > + if (!dp_packet_is_linear(packet_)) { > + dp_packet_linearize(packet_); > + } > + > ofpbuf_init(&key, 0); > odp_flow_key_from_flow(&odp_parms, &key); > packet_str = ofp_dp_packet_to_string(packet_); > @@ -5954,6 +5959,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + int error; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min); > @@ -6001,7 +6007,12 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > + error = miniflow_extract(packet, &key->mf); > + if (OVS_UNLIKELY(error)) { > + dp_packet_delete(packet); > + continue; > + } > + > key->len = 0; /* Not computed yet. */ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c > index 0347060..54f60db 100644 > --- a/lib/dpif-netlink.c > +++ b/lib/dpif-netlink.c > @@ -1849,6 +1849,11 @@ dpif_netlink_operate__(struct dpif_netlink *dpif, > } > n_ops = i; > } else { > + /* We will need to pass the whole to encode the > message */ > + if (!dp_packet_is_linear(op->execute.packet)) { > + dp_packet_linearize(op->execute.packet); > + } > + > dpif_netlink_encode_execute(dpif->dp_ifindex, > &op->execute, > &aux->request); > } > diff --git a/lib/dpif.c b/lib/dpif.c > index e35f111..b04c4a0 100644 > --- a/lib/dpif.c > +++ b/lib/dpif.c > @@ -1244,6 +1244,7 @@ dpif_execute_helper_cb(void *aux_, struct > dp_packet_batch *packets_, > execute.probe = false; > execute.mtu = 0; > aux->error = dpif_execute(aux->dpif, &execute); > + > log_execute_message(aux->dpif, &this_module, &execute, > true, aux->error); > > @@ -1407,6 +1408,7 @@ dpif_operate(struct dpif *dpif, struct dpif_op > **ops, size_t n_ops, > > case DPIF_OP_EXECUTE: > COVERAGE_INC(dpif_execute); > + > log_execute_message(dpif, &this_module, > &op->execute, > false, error); > break; > @@ -1834,6 +1836,13 @@ log_execute_message(const struct dpif *dpif, > uint64_t stub[1024 / 8]; > struct ofpbuf md = OFPBUF_STUB_INITIALIZER(stub); > > + /* We will need the whole data for logging */ > + struct dp_packet *p = CONST_CAST(struct dp_packet *, > + execute->packet); > + if (!dp_packet_is_linear(p)) { > + dp_packet_linearize(p); > + } > + > packet = ofp_packet_to_string(dp_packet_data(execute->packet), > dp_packet_size(execute->packet), > execute->packet->packet_type); > diff --git a/lib/flow.c b/lib/flow.c > index c60446f..9d3e788 100644 > --- a/lib/flow.c > +++ b/lib/flow.c > @@ -694,7 +694,7 @@ ipv6_sanity_check(const struct > ovs_16aligned_ip6_hdr *nh, size_t size) > > /* Caller is responsible for initializing 'dst' with enough storage for > * FLOW_U64S * 8 bytes. */ > -void > +int > miniflow_extract(struct dp_packet *packet, struct miniflow *dst) > { > const struct pkt_metadata *md = &packet->md; > @@ -816,6 +816,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > int ip_len; > uint16_t tot_len; > > + /* Check if header is in first mbuf, otherwise return error */ > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l3_ofs, sizeof > *nh)) { > + return -1; > + } > + } > + > if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len, > &tot_len))) { > goto out; > } > @@ -846,6 +853,12 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > ovs_be32 tc_flow; > uint16_t plen; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l3_ofs, sizeof > *nh)) { > + return -1; > + } > + } > + > if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) { > goto out; > } > @@ -889,6 +902,14 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (dl_type == htons(ETH_TYPE_ARP) || > dl_type == htons(ETH_TYPE_RARP)) { > struct eth_addr arp_buf[2]; > + > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l3_ofs, > + ARP_ETH_HEADER_LEN)) { > + return -1; > + } > + } > + > const struct arp_eth_header *arp = (const struct > arp_eth_header *) > data_try_pull(&data, &size, ARP_ETH_HEADER_LEN); > > @@ -936,6 +957,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= TCP_HEADER_LEN)) { > const struct tcp_header *tcp = data; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + TCP_HEADER_LEN)) { > + return -1; > + } > + } > + > miniflow_push_be32(mf, arp_tha.ea[2], 0); > miniflow_push_be32(mf, tcp_flags, > TCP_FLAGS_BE32(tcp->tcp_ctl)); > @@ -948,6 +976,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= UDP_HEADER_LEN)) { > const struct udp_header *udp = data; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + UDP_HEADER_LEN)) { > + return -1; > + } > + } > + > miniflow_push_be16(mf, tp_src, udp->udp_src); > miniflow_push_be16(mf, tp_dst, udp->udp_dst); > miniflow_push_be16(mf, ct_tp_src, ct_tp_src); > @@ -957,6 +992,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= SCTP_HEADER_LEN)) { > const struct sctp_header *sctp = data; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + SCTP_HEADER_LEN)) { > + return -1; > + } > + } > + > miniflow_push_be16(mf, tp_src, sctp->sctp_src); > miniflow_push_be16(mf, tp_dst, sctp->sctp_dst); > miniflow_push_be16(mf, ct_tp_src, ct_tp_src); > @@ -966,6 +1008,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= ICMP_HEADER_LEN)) { > const struct icmp_header *icmp = data; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + ICMP_HEADER_LEN)) { > + return -1; > + } > + } > + > miniflow_push_be16(mf, tp_src, htons(icmp->icmp_type)); > miniflow_push_be16(mf, tp_dst, htons(icmp->icmp_code)); > miniflow_push_be16(mf, ct_tp_src, ct_tp_src); > @@ -975,6 +1024,13 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= IGMP_HEADER_LEN)) { > const struct igmp_header *igmp = data; > > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + IGMP_HEADER_LEN)) { > + return -1; > + } > + } > + > miniflow_push_be16(mf, tp_src, htons(igmp->igmp_type)); > miniflow_push_be16(mf, tp_dst, htons(igmp->igmp_code)); > miniflow_push_be16(mf, ct_tp_src, ct_tp_src); > @@ -987,8 +1043,18 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > if (OVS_LIKELY(size >= sizeof(struct icmp6_hdr))) { > const struct in6_addr *nd_target; > struct eth_addr arp_buf[2]; > - const struct icmp6_hdr *icmp = data_pull(&data, &size, > - sizeof *icmp); > + const struct icmp6_hdr *icmp; > + > + if (!dp_packet_is_linear(packet)) { > + if (!dp_packet_may_pull(packet, packet->l4_ofs, > + sizeof *icmp)) { > + return -1; > + } > + } > + > + icmp = data_pull(&data, &size, sizeof *icmp); > + > + > if (parse_icmpv6(&data, &size, icmp, &nd_target, > arp_buf)) { > if (nd_target) { > miniflow_push_words(mf, nd_target, nd_target, > @@ -1011,6 +1077,7 @@ miniflow_extract(struct dp_packet *packet, > struct miniflow *dst) > } > out: > dst->map = mf.map; > + return 0; > } > > ovs_be16 > @@ -3007,7 +3074,8 @@ static void > flow_compose_l4_csum(struct dp_packet *p, const struct flow *flow, > uint32_t pseudo_hdr_csum) > { > - size_t l4_len = (char *) dp_packet_tail(p) - (char *) > dp_packet_l4(p); > + //size_t l4_len = (char *) dp_packet_tail(p) - (char *) > dp_packet_l4(p); > + size_t l4_len = dp_packet_l4_size(p); > > if (!(flow->nw_frag & FLOW_NW_FRAG_ANY) > || !(flow->nw_frag & FLOW_NW_FRAG_LATER)) { > @@ -3015,30 +3083,31 @@ flow_compose_l4_csum(struct dp_packet *p, > const struct flow *flow, > struct tcp_header *tcp = dp_packet_l4(p); > > tcp->tcp_csum = 0; > - tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum, > - tcp, l4_len)); > + tcp->tcp_csum = csum_finish( > + packet_csum_continue(p, pseudo_hdr_csum, p->l4_ofs, > l4_len)); > } else if (flow->nw_proto == IPPROTO_UDP) { > struct udp_header *udp = dp_packet_l4(p); > > udp->udp_csum = 0; > - udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum, > - udp, l4_len)); > + udp->udp_csum = csum_finish( > + packet_csum_continue(p, pseudo_hdr_csum, p->l4_ofs, > l4_len)); > } else if (flow->nw_proto == IPPROTO_ICMP) { > struct icmp_header *icmp = dp_packet_l4(p); > > icmp->icmp_csum = 0; > - icmp->icmp_csum = csum(icmp, l4_len); > + icmp->icmp_csum = packet_csum(p, p->l4_ofs, l4_len); > } else if (flow->nw_proto == IPPROTO_IGMP) { > struct igmp_header *igmp = dp_packet_l4(p); > > igmp->igmp_csum = 0; > - igmp->igmp_csum = csum(igmp, l4_len); > + igmp->igmp_csum = packet_csum(p, p->l4_ofs, l4_len); > } else if (flow->nw_proto == IPPROTO_ICMPV6) { > struct icmp6_hdr *icmp = dp_packet_l4(p); > > icmp->icmp6_cksum = 0; > icmp->icmp6_cksum = (OVS_FORCE uint16_t) > - csum_finish(csum_continue(pseudo_hdr_csum, icmp, > l4_len)); > + csum_finish(packet_csum_continue(p, > pseudo_hdr_csum, p->l4_ofs, > + l4_len)); > } > } > } > @@ -3064,12 +3133,12 @@ packet_expand(struct dp_packet *p, const > struct flow *flow, size_t size) > eth->eth_type = htons(dp_packet_size(p)); > } else if (dl_type_is_ip_any(flow->dl_type)) { > uint32_t pseudo_hdr_csum; > - size_t l4_len = (char *) dp_packet_tail(p) - (char *) > dp_packet_l4(p); > + size_t l4_len = dp_packet_l4_size(p); > > if (flow->dl_type == htons(ETH_TYPE_IP)) { > struct ip_header *ip = dp_packet_l3(p); > > - ip->ip_tot_len = htons(p->l4_ofs - p->l3_ofs + l4_len); > + ip->ip_tot_len = htons(dp_packet_l3_size(p)); > ip->ip_csum = 0; > ip->ip_csum = csum(ip, sizeof *ip); > > @@ -3153,7 +3222,7 @@ flow_compose(struct dp_packet *p, const struct > flow *flow, > l4_len = flow_compose_l4(p, flow, l7, l7_len); > > ip = dp_packet_l3(p); > - ip->ip_tot_len = htons(p->l4_ofs - p->l3_ofs + l4_len); > + ip->ip_tot_len = htons(dp_packet_l3_size(p)); > /* Checksum has already been zeroed by put_zeros call. */ > ip->ip_csum = csum(ip, sizeof *ip); > > diff --git a/lib/flow.h b/lib/flow.h > index 5ebdb1f..75b0062 100644 > --- a/lib/flow.h > +++ b/lib/flow.h > @@ -539,7 +539,7 @@ struct pkt_metadata; > /* The 'dst' must follow with buffer space for FLOW_U64S 64-bit units. > * 'dst->map' is ignored on input and set on output to indicate > which fields > * were extracted. */ > -void miniflow_extract(struct dp_packet *packet, struct miniflow *dst); > +int miniflow_extract(struct dp_packet *packet, struct miniflow *dst); > void miniflow_map_init(struct miniflow *, const struct flow *); > void flow_wc_map(const struct flow *, struct flowmap *); > size_t miniflow_alloc(struct miniflow *dsts[], size_t n, > diff --git a/lib/mcast-snooping.c b/lib/mcast-snooping.c > index 6730301..875b7a1 100644 > --- a/lib/mcast-snooping.c > +++ b/lib/mcast-snooping.c > @@ -455,6 +455,7 @@ mcast_snooping_add_report(struct mcast_snooping *ms, > if (!igmpv3) { > return 0; > } > + offset = (char *) igmpv3 - (char *) dp_packet_data(p); > ngrp = ntohs(igmpv3->ngrp); > offset += IGMPV3_HEADER_LEN; > while (ngrp--) { > @@ -507,6 +508,7 @@ mcast_snooping_add_mld(struct mcast_snooping *ms, > if (!mld) { > return 0; > } > + offset = (char *) mld - (char *) dp_packet_data(p); > ngrp = ntohs(mld->ngrp); > offset += MLD_HEADER_LEN; > addr = dp_packet_at(p, offset, sizeof(struct in6_addr)); > diff --git a/lib/netdev-bsd.c b/lib/netdev-bsd.c > index 46698d5..278c8a9 100644 > --- a/lib/netdev-bsd.c > +++ b/lib/netdev-bsd.c > @@ -700,6 +700,11 @@ netdev_bsd_send(struct netdev *netdev_, int qid > OVS_UNUSED, > } > > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + /* We need the whole data to send the packet on the device */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > const void *data = dp_packet_data(packet); > size_t size = dp_packet_size(packet); > > diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c > index 72b4f7a..c56c86b 100644 > --- a/lib/netdev-dummy.c > +++ b/lib/netdev-dummy.c > @@ -233,7 +233,13 @@ dummy_packet_stream_run(struct netdev_dummy > *dev, struct dummy_packet_stream *s) > > ASSIGN_CONTAINER(txbuf_node, ovs_list_front(&s->txq), > list_node); > txbuf = txbuf_node->pkt; > - retval = stream_send(s->stream, dp_packet_data(txbuf), > dp_packet_size(txbuf)); > + > + if (!dp_packet_is_linear(txbuf)) { > + dp_packet_linearize(txbuf); > + } > + > + retval = stream_send(s->stream, dp_packet_data(txbuf), > + dp_packet_size(txbuf)); > > if (retval > 0) { > dp_packet_pull(txbuf, retval); > @@ -1087,6 +1093,11 @@ netdev_dummy_send(struct netdev *netdev, int > qid OVS_UNUSED, > > struct dp_packet *packet; > DP_PACKET_BATCH_FOR_EACH(i, packet, batch) { > + /* We need the whole data to send the packet on the device */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > const void *buffer = dp_packet_data(packet); > size_t size = dp_packet_size(packet); > > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > index f86dcd0..fa79b2a 100644 > --- a/lib/netdev-linux.c > +++ b/lib/netdev-linux.c > @@ -1379,6 +1379,11 @@ netdev_linux_sock_batch_send(int sock, int > ifindex, > > struct dp_packet *packet; > DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + /* We need the whole data to send the packet on the device */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > iov[i].iov_base = dp_packet_data(packet); > iov[i].iov_len = dp_packet_size(packet); > mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, > @@ -1432,8 +1437,14 @@ netdev_linux_tap_batch_send(struct netdev > *netdev_, > ssize_t retval; > int error; > > + /* We need the whole data to send the packet on the device */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > do { > - retval = write(netdev->tap_fd, dp_packet_data(packet), > size); > + retval = write(netdev->tap_fd, dp_packet_data(packet), > + size); > error = retval < 0 ? errno : 0; > } while (error == EINTR); > > diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c > index 56baaa2..285b927 100644 > --- a/lib/netdev-native-tnl.c > +++ b/lib/netdev-native-tnl.c > @@ -65,7 +65,7 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet > *packet, struct flow_tnl *tnl, > void *nh; > struct ip_header *ip; > struct ovs_16aligned_ip6_hdr *ip6; > - void *l4; > + char *l4; > int l3_size; > > nh = dp_packet_l3(packet); > @@ -79,15 +79,15 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet > *packet, struct flow_tnl *tnl, > > *hlen = sizeof(struct eth_header); > > - l3_size = dp_packet_size(packet) - > - ((char *)nh - (char *)dp_packet_data(packet)); > + l3_size = dp_packet_l3_size(packet); > > if (IP_VER(ip->ip_ihl_ver) == 4) { > > ovs_be32 ip_src, ip_dst; > > if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(packet))) { > - if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) { > + if (packet_csum(packet, packet->l3_ofs, > + IP_IHL(ip->ip_ihl_ver) * 4)) { > VLOG_WARN_RL(&err_rl, "ip packet has invalid > checksum"); > return NULL; > } > @@ -196,10 +196,8 @@ udp_extract_tnl_md(struct dp_packet *packet, > struct flow_tnl *tnl, > csum = packet_csum_pseudoheader(dp_packet_l3(packet)); > } > > - csum = csum_continue(csum, udp, dp_packet_size(packet) - > - ((const unsigned char *)udp - > - (const unsigned char > *)dp_packet_eth(packet) > - )); > + csum = packet_csum_continue(packet, csum, packet->l4_ofs, > + dp_packet_l4_size(packet)); > if (csum_finish(csum)) { > return NULL; > } > @@ -236,7 +234,7 @@ netdev_tnl_push_udp_header(const struct netdev > *netdev OVS_UNUSED, > csum = > packet_csum_pseudoheader(netdev_tnl_ip_hdr(dp_packet_data(packet))); > } > > - csum = csum_continue(csum, udp, ip_tot_size); > + csum = packet_csum_continue(packet, csum, packet->l4_ofs, > ip_tot_size); > udp->udp_csum = csum_finish(csum); > > if (!udp->udp_csum) { > @@ -373,9 +371,8 @@ parse_gre_header(struct dp_packet *packet, > if (greh->flags & htons(GRE_CSUM)) { > ovs_be16 pkt_csum; > > - pkt_csum = csum(greh, dp_packet_size(packet) - > - ((const unsigned char *)greh - > - (const unsigned char > *)dp_packet_eth(packet))); > + pkt_csum = packet_csum(packet, packet->l4_ofs, > + dp_packet_l4_size(packet)); > if (pkt_csum) { > return -EINVAL; > } > @@ -448,8 +445,9 @@ netdev_gre_push_header(const struct netdev *netdev, > greh = netdev_tnl_push_ip_header(packet, data->header, > data->header_len, &ip_tot_size); > > if (greh->flags & htons(GRE_CSUM)) { > - ovs_be16 *csum_opt = (ovs_be16 *) (greh + 1); > - *csum_opt = csum(greh, ip_tot_size); > + greh = dp_packet_l4(packet); > + ovs_be16 *csum_opt = (ovs_be16 *) greh; > + *csum_opt = packet_csum(packet, packet->l4_ofs, ip_tot_size); > } > > if (greh->flags & htons(GRE_SEQ)) { > diff --git a/lib/odp-execute.c b/lib/odp-execute.c > index 3b6890e..2c33d6a 100644 > --- a/lib/odp-execute.c > +++ b/lib/odp-execute.c > @@ -231,8 +231,16 @@ static void > odp_set_nd(struct dp_packet *packet, const struct ovs_key_nd *key, > const struct ovs_key_nd *mask) > { > - const struct ovs_nd_msg *ns = dp_packet_l4(packet); > - const struct ovs_nd_lla_opt *lla_opt = > dp_packet_get_nd_payload(packet); > + const struct ovs_nd_msg *ns; > + const struct ovs_nd_lla_opt *lla_opt; > + > + /* To orocess neighbor discovery options, we need the whole > packet */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > + ns = dp_packet_l4(packet); > + lla_opt = dp_packet_get_nd_payload(packet); > > if (OVS_LIKELY(ns && lla_opt)) { > int bytes_remain = dp_packet_l4_size(packet) - sizeof(*ns); > diff --git a/lib/ovs-lldp.c b/lib/ovs-lldp.c > index 05c1dd4..39d677a 100644 > --- a/lib/ovs-lldp.c > +++ b/lib/ovs-lldp.c > @@ -668,7 +668,8 @@ lldp_process_packet(struct lldp *lldp, const > struct dp_packet *p) > { > if (lldp) { > lldpd_recv(lldp->lldpd, lldpd_first_hardware(lldp->lldpd), > - (char *) dp_packet_data(p), dp_packet_size(p)); > + (char *) dp_packet_data(p), > + dp_packet_size(p)); > } > } > > diff --git a/lib/packets.c b/lib/packets.c > index 2d6f77b..d95c8cc 100644 > --- a/lib/packets.c > +++ b/lib/packets.c > @@ -1007,12 +1007,20 @@ packet_rh_present(struct dp_packet *packet, > uint8_t *nexthdr) > const struct ovs_16aligned_ip6_hdr *nh; > size_t len; > size_t remaining; > - uint8_t *data = dp_packet_l3(packet); > + uint8_t *data; > > - remaining = packet->l4_ofs - packet->l3_ofs; > + remaining = dp_packet_l3h_size(packet); > if (remaining < sizeof *nh) { > return false; > } > + > + /* We will need the whole data for processing the headers below */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > > > dp_packet_linearize() could check for dp_packet_is_linear() and if > linear return early without doing anything Sure, that would be another way to go. It has been mentioned before, and this has become the final form because, 1. `dp_packet_linearize()` changes the address space of the packet and would be easy to misuse it, 2. I feel this gives a little more context, instead of a single `dp_packet_linearize()` on its own. > > > > + } > + > + data = dp_packet_l3(packet); > + > nh = ALIGNED_CAST(struct ovs_16aligned_ip6_hdr *, data); > data += sizeof *nh; > remaining -= sizeof *nh; > @@ -1254,12 +1262,12 @@ packet_set_sctp_port(struct dp_packet > *packet, ovs_be16 src, ovs_be16 dst) > > old_csum = get_16aligned_be32(&sh->sctp_csum); > put_16aligned_be32(&sh->sctp_csum, 0); > - old_correct_csum = crc32c((void *)sh, tp_len); > + old_correct_csum = packet_crc32c(packet, packet->l4_ofs, tp_len); > > sh->sctp_src = src; > sh->sctp_dst = dst; > > - new_csum = crc32c((void *)sh, tp_len); > + new_csum = packet_crc32c(packet, packet->l4_ofs, tp_len); > put_16aligned_be32(&sh->sctp_csum, old_csum ^ old_correct_csum > ^ new_csum); > } > > @@ -1293,6 +1301,11 @@ packet_set_nd(struct dp_packet *packet, const > struct in6_addr *target, > return; > } > > + /* To process neighbor discovery options, we need the whole > packet */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > ns = dp_packet_l4(packet); > opt = &ns->options[0]; > bytes_remain -= sizeof(*ns); > @@ -1515,8 +1528,8 @@ compose_nd_ns(struct dp_packet *b, const > struct eth_addr eth_src, > > ns->icmph.icmp6_cksum = 0; > icmp_csum = packet_csum_pseudoheader6(dp_packet_l3(b)); > - ns->icmph.icmp6_cksum = csum_finish( > - csum_continue(icmp_csum, ns, ND_MSG_LEN + ND_LLA_OPT_LEN)); > + ns->icmph.icmp6_cksum = csum_finish(packet_csum_continue( > + b, icmp_csum, b->l4_ofs, ND_MSG_LEN + ND_LLA_OPT_LEN)); > } > > /* Compose an IPv6 Neighbor Discovery Neighbor Advertisement > message. */ > @@ -1546,8 +1559,8 @@ compose_nd_na(struct dp_packet *b, > > na->icmph.icmp6_cksum = 0; > icmp_csum = packet_csum_pseudoheader6(dp_packet_l3(b)); > - na->icmph.icmp6_cksum = csum_finish(csum_continue( > - icmp_csum, na, ND_MSG_LEN + ND_LLA_OPT_LEN)); > + na->icmph.icmp6_cksum = csum_finish(packet_csum_continue( > + b, icmp_csum, b->l4_ofs, ND_MSG_LEN + ND_LLA_OPT_LEN)); > } > > /* Compose an IPv6 Neighbor Discovery Router Advertisement message with > @@ -1597,8 +1610,8 @@ compose_nd_ra(struct dp_packet *b, > > ra->icmph.icmp6_cksum = 0; > uint32_t icmp_csum = packet_csum_pseudoheader6(dp_packet_l3(b)); > - ra->icmph.icmp6_cksum = csum_finish(csum_continue( > - icmp_csum, ra, RA_MSG_LEN + ND_LLA_OPT_LEN + mtu_opt_len)); > + ra->icmph.icmp6_cksum = csum_finish(packet_csum_continue( > + b, icmp_csum, b->l4_ofs, RA_MSG_LEN + ND_LLA_OPT_LEN + > mtu_opt_len)); > } > > /* Append an IPv6 Neighbor Discovery Prefix Information option to a > @@ -1627,8 +1640,8 @@ packet_put_ra_prefix_opt(struct dp_packet *b, > struct ovs_ra_msg *ra = dp_packet_l4(b); > ra->icmph.icmp6_cksum = 0; > uint32_t icmp_csum = packet_csum_pseudoheader6(dp_packet_l3(b)); > - ra->icmph.icmp6_cksum = csum_finish(csum_continue( > - icmp_csum, ra, prev_l4_size + ND_PREFIX_OPT_LEN)); > + ra->icmph.icmp6_cksum = csum_finish(packet_csum_continue( > + b, icmp_csum, b->l4_ofs, prev_l4_size + ND_PREFIX_OPT_LEN)); > } > > uint32_t > @@ -1680,6 +1693,54 @@ packet_csum_upperlayer6(const struct > ovs_16aligned_ip6_hdr *ip6, > } > #endif > > > It is a proposed 'general' API; a function header is needed > same for other proposed apis It is there in packets.h, or is that not what you're referring to? Tiago. > > > > +uint32_t > +packet_csum_continue(const struct dp_packet *b, uint32_t partial, > + uint16_t offset, size_t n) > +{ > + char *ptr = NULL; > + size_t rem = 0; > + size_t size = 0; > + > + while (n > 1) { > + rem = dp_packet_read_data(b, offset, n, (void *)&ptr, NULL); > + > + size = n - rem; > + partial = csum_continue(partial, ptr, size); > + > + offset += size; > + n = rem; > + } > + > + return partial; > +} > + > +ovs_be16 > +packet_csum(const struct dp_packet *b, uint16_t offset, size_t n) > +{ > + return csum_finish(packet_csum_continue(b, 0, offset, n)); > +} > + > +ovs_be32 > +packet_crc32c(const struct dp_packet *b, uint16_t offset, size_t n) > +{ > + char *ptr = NULL; > + size_t rem = 0; > + size_t size = 0; > + uint32_t partial = 0xffffffffL; > + > + while (n > 1) { > + rem = dp_packet_read_data(b, offset, n, (void *)&ptr, NULL); > + > + size = n - rem; > + partial = crc32c_continue(partial, (uint8_t *) ptr, size); > + > + offset += size; > + n = rem; > + } > + > + return crc32c_finish(partial); > +} > + > void > IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6) > { > diff --git a/lib/packets.h b/lib/packets.h > index 09a0ac3..9e7f5a1 100644 > --- a/lib/packets.h > +++ b/lib/packets.h > @@ -1573,6 +1573,13 @@ void packet_put_ra_prefix_opt(struct dp_packet *, > ovs_be32 preferred_lifetime, > const ovs_be128 router_prefix); > uint32_t packet_csum_pseudoheader(const struct ip_header *); > +uint32_t > +packet_csum_continue(const struct dp_packet *b, uint32_t partial, > + uint16_t offset, size_t n); > +ovs_be16 > +packet_csum(const struct dp_packet *b, uint16_t offset, size_t n); > +ovs_be32 > +packet_crc32c(const struct dp_packet *b, uint16_t offset, size_t n); > void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6); > > #define DNS_HEADER_LEN 12 > diff --git a/ofproto/ofproto-dpif-upcall.c > b/ofproto/ofproto-dpif-upcall.c > index dc30824..ed1c1ac 100644 > --- a/ofproto/ofproto-dpif-upcall.c > +++ b/ofproto/ofproto-dpif-upcall.c > @@ -1417,12 +1417,18 @@ process_upcall(struct udpif *udpif, struct > upcall *upcall, > case SFLOW_UPCALL: > if (upcall->sflow) { > struct dpif_sflow_actions sflow_actions; > + struct dp_packet *p = CONST_CAST(struct dp_packet *, > packet); > > memset(&sflow_actions, 0, sizeof sflow_actions); > > actions_len = dpif_read_actions(udpif, upcall, flow, > upcall->type, > &sflow_actions); > - dpif_sflow_received(upcall->sflow, packet, flow, > + /* Gather the whole data */ > + if (!dp_packet_is_linear(p)) { > + dp_packet_linearize(p); > + } > + > + dpif_sflow_received(upcall->sflow, p, flow, > flow->in_port.odp_port, > &upcall->cookie, > actions_len > 0 ? &sflow_actions : > NULL); > } > @@ -1483,6 +1489,12 @@ process_upcall(struct udpif *udpif, struct > upcall *upcall, > > const struct frozen_state *state = &recirc_node->state; > > + /* Gather the whole data */ > + struct dp_packet *p = CONST_CAST(struct dp_packet *, > packet); > + if (!dp_packet_is_linear(p)) { > + dp_packet_linearize(p); > + } > + > struct ofproto_async_msg *am = xmalloc(sizeof *am); > *am = (struct ofproto_async_msg) { > .controller_id = cookie->controller.controller_id, > @@ -1490,9 +1502,9 @@ process_upcall(struct udpif *udpif, struct > upcall *upcall, > .pin = { > .up = { > .base = { > - .packet = xmemdup(dp_packet_data(packet), > - dp_packet_size(packet)), > - .packet_len = dp_packet_size(packet), > + .packet = xmemdup(dp_packet_data(p), > + dp_packet_size(p)), > + .packet_len = dp_packet_size(p), > .reason = cookie->controller.reason, > .table_id = state->table_id, > .cookie = get_32aligned_be64( > diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c > index 839fddd..b2c31b7 100644 > --- a/ofproto/ofproto-dpif-xlate.c > +++ b/ofproto/ofproto-dpif-xlate.c > @@ -1737,7 +1737,8 @@ stp_process_packet(const struct xport *xport, > const struct dp_packet *packet) > } > > if (dp_packet_try_pull(&payload, ETH_HEADER_LEN + > LLC_HEADER_LEN)) { > - stp_received_bpdu(sp, dp_packet_data(&payload), > dp_packet_size(&payload)); > + stp_received_bpdu(sp, dp_packet_data(&payload), > + dp_packet_size(&payload)); > } > } > > @@ -1788,7 +1789,8 @@ rstp_process_packet(const struct xport *xport, > const struct dp_packet *packet) > } > > if (dp_packet_try_pull(&payload, ETH_HEADER_LEN + > LLC_HEADER_LEN)) { > - rstp_port_received_bpdu(xport->rstp_port, > dp_packet_data(&payload), > + rstp_port_received_bpdu(xport->rstp_port, > + dp_packet_data(&payload), > dp_packet_size(&payload)); > } > } > @@ -2563,6 +2565,7 @@ update_mcast_snooping_table4__(const struct > xlate_ctx *ctx, > > offset = (char *) dp_packet_l4(packet) - (char *) > dp_packet_data(packet); > igmp = dp_packet_at(packet, offset, IGMP_HEADER_LEN); > + > if (!igmp || csum(igmp, dp_packet_l4_size(packet)) != 0) { > xlate_report_debug(ctx, OFT_DETAIL, > "multicast snooping received bad IGMP " > @@ -2975,6 +2978,13 @@ xlate_normal(struct xlate_ctx *ctx) > && is_ip_any(flow)) { > struct mcast_snooping *ms = ctx->xbridge->ms; > struct mcast_group *grp = NULL; > + struct dp_packet *p = CONST_CAST(struct dp_packet *, > + ctx->xin->packet); > + > + /* We will need the whole data for processing the packet > below */ > + if (p && !dp_packet_is_linear(p)) { > + dp_packet_linearize(p); > + } > > if (is_igmp(flow, wc)) { > /* > @@ -3279,7 +3289,8 @@ process_special(struct xlate_ctx *ctx, const > struct xport *xport) > const struct flow *flow = &ctx->xin->flow; > struct flow_wildcards *wc = ctx->wc; > const struct xbridge *xbridge = ctx->xbridge; > - const struct dp_packet *packet = ctx->xin->packet; > + struct dp_packet *packet = CONST_CAST(struct dp_packet *, > + ctx->xin->packet); > enum slow_path_reason slow; > > if (!xport) { > @@ -3291,6 +3302,11 @@ process_special(struct xlate_ctx *ctx, const > struct xport *xport) > slow = SLOW_CFM; > } else if (xport->bfd && bfd_should_process_flow(xport->bfd, > flow, wc)) { > if (packet) { > + /* Gather the whole data for further processing */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > bfd_process_packet(xport->bfd, flow, packet); > /* If POLL received, immediately sends FINAL back. */ > if (bfd_should_send_packet(xport->bfd)) { > @@ -3307,6 +3323,11 @@ process_special(struct xlate_ctx *ctx, const > struct xport *xport) > } else if ((xbridge->stp || xbridge->rstp) && > stp_should_process_flow(flow, wc)) { > if (packet) { > + /* Gather the whole data for further processing */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > xbridge->stp > ? stp_process_packet(xport, packet) > : rstp_process_packet(xport, packet); > @@ -3314,6 +3335,11 @@ process_special(struct xlate_ctx *ctx, const > struct xport *xport) > slow = SLOW_STP; > } else if (xport->lldp && lldp_should_process_flow(xport->lldp, > flow)) { > if (packet) { > + /* Gather the whole data for further processing */ > + if (!dp_packet_is_linear(packet)) { > + dp_packet_linearize(packet); > + } > + > lldp_process_packet(xport->lldp, packet); > } > slow = SLOW_LLDP; > diff --git a/tests/test-rstp.c b/tests/test-rstp.c > index 01aeaf8..48f1cfa 100644 > --- a/tests/test-rstp.c > +++ b/tests/test-rstp.c > @@ -86,8 +86,12 @@ send_bpdu(struct dp_packet *pkt, void *port_, > void *b_) > assert(port_no < b->n_ports); > lan = b->ports[port_no]; > if (lan) { > - const void *data = dp_packet_l3(pkt); > - size_t size = (char *) dp_packet_tail(pkt) - (char *) data; > + if (!dp_packet_is_linear(pkt)) { > + dp_packet_linearize(pkt); > + } > + > + const char *data = dp_packet_l3(pkt); > + size_t size = dp_packet_size(pkt) - pkt->l3_ofs; > int i; > > for (i = 0; i < lan->n_conns; i++) { > diff --git a/tests/test-stp.c b/tests/test-stp.c > index c85c99d..fd5bfad 100644 > --- a/tests/test-stp.c > +++ b/tests/test-stp.c > @@ -94,8 +94,12 @@ send_bpdu(struct dp_packet *pkt, int port_no, > void *b_) > assert(port_no < b->n_ports); > lan = b->ports[port_no]; > if (lan) { > - const void *data = dp_packet_l3(pkt); > - size_t size = (char *) dp_packet_tail(pkt) - (char *) data; > + if (!dp_packet_is_linear(pkt)) { > + dp_packet_linearize(pkt); > + } > + > + const char *data = dp_packet_l3(pkt); > + size_t size = dp_packet_size(pkt) - pkt->l3_ofs; > int i; > > for (i = 0; i < lan->n_conns; i++) { > -- > 2.7.4 > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev