From: Mark Kavanagh <mark.b.kavan...@intel.com>
There are numerous factors that must be considered when calculating
the size of an mbuf:
- the data portion of the mbuf must be sized in accordance With Rx
buffer alignment (typically 1024B). So, for example, in order to
successfully receive and capture a 1500B packet, mbufs with a
data portion of size 2048B must be used.
- in OvS, the elements that comprise an mbuf are:
* the dp packet, which includes a struct rte mbuf (704B)
* RTE_PKTMBUF_HEADROOM (128B)
* packet data (aligned to 1k, as previously described)
* RTE_PKTMBUF_TAILROOM (typically 0)
Some PMDs require that the total mbuf size (i.e. the total sum of all
of the above-listed components' lengths) is cache-aligned. To satisfy
this requirement, it may be necessary to round up the total mbuf size
with respect to cacheline size. In doing so, it's possible that the
dp_packet's data portion is inadvertently increased in size, such that
it no longer adheres to Rx buffer alignment. Consequently, the
following property of the mbuf no longer holds true:
mbuf.data_len == mbuf.buf_len - mbuf.data_off
This creates a problem in the case of multi-segment mbufs, where that
assumption is assumed to be true for all but the final segment in an
mbuf chain. Resolve this issue by adjusting the size of the mbuf's
private data portion, as opposed to the packet data portion when
aligning mbuf size to cachelines.
Fixes: 4be4d22 ("netdev-dpdk: clean up mbuf initialization")
Fixes: 31b88c9 ("netdev-dpdk: round up mbuf_size to cache_line_size")
CC: Santosh Shukla <santosh.shu...@caviumnetworks.com>
Signed-off-by: Mark Kavanagh <mark.b.kavan...@intel.com>
---
lib/netdev-dpdk.c | 48
+++++++++++++++++++++++++++++++-----------------
1 file changed, 31 insertions(+), 17 deletions(-)
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 2e2f568..468ab36 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -82,12 +82,6 @@ static struct vlog_rate_limit rl =
VLOG_RATE_LIMIT_INIT(5, 20);
+ (2 * VLAN_HEADER_LEN))
#define MTU_TO_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_LEN +
ETHER_CRC_LEN)
#define MTU_TO_MAX_FRAME_LEN(mtu) ((mtu) + ETHER_HDR_MAX_LEN)
-#define FRAME_LEN_TO_MTU(frame_len) ((frame_len) \
- - ETHER_HDR_LEN - ETHER_CRC_LEN)
-#define MBUF_SIZE(mtu)
ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) \
- + sizeof(struct
dp_packet) \
- + RTE_PKTMBUF_HEADROOM),
\
- RTE_CACHE_LINE_SIZE)
#define NETDEV_DPDK_MBUF_ALIGN 1024
#define NETDEV_DPDK_MAX_PKT_LEN 9728
@@ -493,7 +487,7 @@ is_dpdk_class(const struct netdev_class *class)
* behaviour, which reduces performance. To prevent this, use a
buffer size
* that is closest to 'mtu', but which satisfies the aforementioned
criteria.
*/
-static uint32_t
+static uint16_t
dpdk_buf_size(int mtu)
{
return ROUND_UP((MTU_TO_MAX_FRAME_LEN(mtu) +
RTE_PKTMBUF_HEADROOM),
@@ -585,7 +579,7 @@ dpdk_mp_do_not_free(struct rte_mempool *mp)
OVS_REQUIRES(dpdk_mp_mutex)
* - a new mempool was just created;
* - a matching mempool already exists. */
static struct rte_mempool *
-dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
+dpdk_mp_create(struct netdev_dpdk *dev, uint16_t mbuf_pkt_data_len)
{
char mp_name[RTE_MEMPOOL_NAMESIZE];
const char *netdev_name = netdev_get_name(&dev->up);
@@ -593,6 +587,7 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
uint32_t n_mbufs;
uint32_t hash = hash_string(netdev_name, 0);
struct rte_mempool *mp = NULL;
+ uint16_t mbuf_size, aligned_mbuf_size, mbuf_priv_data_len;
/*
* XXX: rough estimation of number of mbufs required for this
port:
@@ -611,13 +606,14 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
/* Full DPDK memory pool name must be unique and cannot be
* longer than RTE_MEMPOOL_NAMESIZE. */
int ret = snprintf(mp_name, RTE_MEMPOOL_NAMESIZE,
- "ovs%08x%02d%05d%07u",
- hash, socket_id, mtu, n_mbufs);
+ "ovs%08x%02d%05u%07u",
+ hash, socket_id, mbuf_pkt_data_len,
n_mbufs);
if (ret < 0 || ret >= RTE_MEMPOOL_NAMESIZE) {
VLOG_DBG("snprintf returned %d. "
"Failed to generate a mempool name for \"%s\". "
- "Hash:0x%x, socket_id: %d, mtu:%d, mbufs:%u.",
- ret, netdev_name, hash, socket_id, mtu,
n_mbufs);
+ "Hash:0x%x, socket_id: %d, pkt data room:%u,
mbufs:%u.",
+ ret, netdev_name, hash, socket_id,
mbuf_pkt_data_len,
+ n_mbufs);
break;
}
@@ -626,13 +622,31 @@ dpdk_mp_create(struct netdev_dpdk *dev, int mtu)
netdev_name, n_mbufs, socket_id,
dev->requested_n_rxq, dev->requested_n_txq);
+ mbuf_priv_data_len = sizeof(struct dp_packet) -
+ sizeof(struct rte_mbuf);
+ /* The size of the entire mbuf. */
+ mbuf_size = sizeof (struct dp_packet) +
+ mbuf_pkt_data_len +
RTE_PKTMBUF_HEADROOM;
+ /* mbuf size, rounded up to cacheline size. */
+ aligned_mbuf_size = ROUND_UP(mbuf_size, RTE_CACHE_LINE_SIZE);
+ /* If there is a size discrepancy, add padding to
mbuf_priv_data_len.
+ * This maintains mbuf size cache alignment, while also
honoring RX
+ * buffer alignment in the data portion of the mbuf. If this
adjustment
+ * is not made, there is a possiblity later on that for an
element of
+ * the mempool, buf, buf->data_len < (buf->buf_len -
buf->data_off).
+ * This is problematic in the case of multi-segment mbufs,
particularly
+ * when an mbuf segment needs to be resized (when
[push|popp]ing a VLAN
+ * header, for example.
+ */
+ mbuf_priv_data_len += (aligned_mbuf_size - mbuf_size);
+
mp = rte_pktmbuf_pool_create(mp_name, n_mbufs, MP_CACHE_SZ,
- sizeof (struct dp_packet) - sizeof (struct
rte_mbuf),
- MBUF_SIZE(mtu) - sizeof(struct dp_packet),
socket_id);
+ mbuf_priv_data_len,
+ mbuf_pkt_data_len + RTE_PKTMBUF_HEADROOM, socket_id);
if (mp) {
VLOG_DBG("Allocated \"%s\" mempool with %u mbufs",
- mp_name, n_mbufs);
+ mp_name, n_mbufs);