Re: [PATCH net-next] mlx4: reorganize struct mlx4_en_tx_ring

2016-11-24 Thread David Miller
From: Eric Dumazet 
Date: Tue, 22 Nov 2016 15:56:10 -0800

> From: Eric Dumazet 
> 
> Goal is to reorganize this critical structure to increase performance.
> 
> ndo_start_xmit() should only dirty one cache line, and access as few
> cache lines as possible.
> 
> Add sp_ (Slow Path) prefix to fields that are not used in fast path,
> to make clear what is going on.
> 
> After this patch pahole reports something much better, as all
> ndo_start_xmit() needed fields are packed into two cache lines instead
> of seven or eight
 ...
> Signed-off-by: Eric Dumazet 

Applied, thanks Eric.


Re: [PATCH net-next] mlx4: reorganize struct mlx4_en_tx_ring

2016-11-24 Thread Tariq Toukan

Hi Eric,
Thanks for your patch.

On 23/11/2016 1:56 AM, Eric Dumazet wrote:

From: Eric Dumazet 

Goal is to reorganize this critical structure to increase performance.

ndo_start_xmit() should only dirty one cache line, and access as few
cache lines as possible.

Add sp_ (Slow Path) prefix to fields that are not used in fast path,
to make clear what is going on.

After this patch pahole reports something much better, as all
ndo_start_xmit() needed fields are packed into two cache lines instead
of seven or eight

struct mlx4_en_tx_ring {
        u32                        last_nr_txbb;         /*     0   0x4 */
        u32                        cons;                 /*   0x4   0x4 */
        long unsigned int          wake_queue;           /*   0x8   0x8 */
        struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
        u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
        struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */

        /* XXX 24 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        u32                        prod;                 /*  0x40   0x4 */
        unsigned int               tx_dropped;           /*  0x44   0x4 */
        long unsigned int          bytes;                /*  0x48   0x8 */
        long unsigned int          packets;              /*  0x50   0x8 */
        long unsigned int          tx_csum;              /*  0x58   0x8 */
        long unsigned int          tso_packets;          /*  0x60   0x8 */
        long unsigned int          xmit_more;            /*  0x68   0x8 */
        struct mlx4_bf             bf;                   /*  0x70  0x18 */
        /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
        __be32                     doorbell_qpn;         /*  0x88   0x4 */
        __be32                     mr_key;               /*  0x8c   0x4 */
        u32                        size;                 /*  0x90   0x4 */
        u32                        size_mask;            /*  0x94   0x4 */
        u32                        full_size;            /*  0x98   0x4 */
        u32                        buf_size;             /*  0x9c   0x4 */
        void *                     buf;                  /*  0xa0   0x8 */
        struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
        int                        qpn;                  /*  0xb0   0x4 */
        u8                         queue_index;          /*  0xb4   0x1 */
        bool                       bf_enabled;           /*  0xb5   0x1 */
        bool                       bf_alloced;           /*  0xb6   0x1 */
        u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
        u8 *                       bounce_buf;           /*  0xb8   0x8 */
        /* --- cacheline 3 boundary (192 bytes) --- */
        long unsigned int          queue_stopped;        /*  0xc0   0x8 */
        struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
        /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
        struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
        /* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
        struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
        /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
        cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
        enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
        u16                        sp_stride;            /* 0x26c   0x2 */
        u16                        sp_cqn;               /* 0x26e   0x2 */

        /* size: 640, cachelines: 10, members: 36 */
        /* sum members: 600, holes: 1, sum holes: 24 */
        /* padding: 16 */
};

Instead of this silly placement :

struct mlx4_en_tx_ring {
        u32                        last_nr_txbb;         /*     0   0x4 */
        u32                        cons;                 /*   0x4   0x4 */
        long unsigned int          wake_queue;           /*   0x8   0x8 */

        /* XXX 48 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        u32                        prod;                 /*  0x40   0x4 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int          bytes;                /*  0x48   0x8 */
        long unsigned int          packets;              /*  0x50   0x8 */
        long unsigned int          tx_csum;              /*  0x58   0x8 */
        long unsigned int          tso_packets;          /*  0x60   0x8 */
        long unsigned int          xmit_more;            /*  0x68   0x8 */
        unsigned int               tx_dropped;           /*  0x70   0x4 */

        /* XXX 4 bytes hole, try to pack */

        struct mlx4_bf             bf;                   /*  0x78  0x18 */

[PATCH net-next] mlx4: reorganize struct mlx4_en_tx_ring

2016-11-22 Thread Eric Dumazet
From: Eric Dumazet 

Goal is to reorganize this critical structure to increase performance.

ndo_start_xmit() should only dirty one cache line, and access as few
cache lines as possible.

Add sp_ (Slow Path) prefix to fields that are not used in fast path,
to make clear what is going on.

After this patch pahole reports something much better, as all
ndo_start_xmit() needed fields are packed into two cache lines instead
of seven or eight

struct mlx4_en_tx_ring {
        u32                        last_nr_txbb;         /*     0   0x4 */
        u32                        cons;                 /*   0x4   0x4 */
        long unsigned int          wake_queue;           /*   0x8   0x8 */
        struct netdev_queue *      tx_queue;             /*  0x10   0x8 */
        u32                        (*free_tx_desc)(struct mlx4_en_priv *, struct mlx4_en_tx_ring *, int, u8, u64, int); /*  0x18   0x8 */
        struct mlx4_en_rx_ring *   recycle_ring;         /*  0x20   0x8 */

        /* XXX 24 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        u32                        prod;                 /*  0x40   0x4 */
        unsigned int               tx_dropped;           /*  0x44   0x4 */
        long unsigned int          bytes;                /*  0x48   0x8 */
        long unsigned int          packets;              /*  0x50   0x8 */
        long unsigned int          tx_csum;              /*  0x58   0x8 */
        long unsigned int          tso_packets;          /*  0x60   0x8 */
        long unsigned int          xmit_more;            /*  0x68   0x8 */
        struct mlx4_bf             bf;                   /*  0x70  0x18 */
        /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
        __be32                     doorbell_qpn;         /*  0x88   0x4 */
        __be32                     mr_key;               /*  0x8c   0x4 */
        u32                        size;                 /*  0x90   0x4 */
        u32                        size_mask;            /*  0x94   0x4 */
        u32                        full_size;            /*  0x98   0x4 */
        u32                        buf_size;             /*  0x9c   0x4 */
        void *                     buf;                  /*  0xa0   0x8 */
        struct mlx4_en_tx_info *   tx_info;              /*  0xa8   0x8 */
        int                        qpn;                  /*  0xb0   0x4 */
        u8                         queue_index;          /*  0xb4   0x1 */
        bool                       bf_enabled;           /*  0xb5   0x1 */
        bool                       bf_alloced;           /*  0xb6   0x1 */
        u8                         hwtstamp_tx_type;     /*  0xb7   0x1 */
        u8 *                       bounce_buf;           /*  0xb8   0x8 */
        /* --- cacheline 3 boundary (192 bytes) --- */
        long unsigned int          queue_stopped;        /*  0xc0   0x8 */
        struct mlx4_hwq_resources  sp_wqres;             /*  0xc8  0x58 */
        /* --- cacheline 4 boundary (256 bytes) was 32 bytes ago --- */
        struct mlx4_qp             sp_qp;                /* 0x120  0x30 */
        /* --- cacheline 5 boundary (320 bytes) was 16 bytes ago --- */
        struct mlx4_qp_context     sp_context;           /* 0x150  0xf8 */
        /* --- cacheline 9 boundary (576 bytes) was 8 bytes ago --- */
        cpumask_t                  sp_affinity_mask;     /* 0x248  0x20 */
        enum mlx4_qp_state         sp_qp_state;          /* 0x268   0x4 */
        u16                        sp_stride;            /* 0x26c   0x2 */
        u16                        sp_cqn;               /* 0x26e   0x2 */

        /* size: 640, cachelines: 10, members: 36 */
        /* sum members: 600, holes: 1, sum holes: 24 */
        /* padding: 16 */
};

Instead of this silly placement :

struct mlx4_en_tx_ring {
        u32                        last_nr_txbb;         /*     0   0x4 */
        u32                        cons;                 /*   0x4   0x4 */
        long unsigned int          wake_queue;           /*   0x8   0x8 */

        /* XXX 48 bytes hole, try to pack */

        /* --- cacheline 1 boundary (64 bytes) --- */
        u32                        prod;                 /*  0x40   0x4 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int          bytes;                /*  0x48   0x8 */
        long unsigned int          packets;              /*  0x50   0x8 */
        long unsigned int          tx_csum;              /*  0x58   0x8 */
        long unsigned int          tso_packets;          /*  0x60   0x8 */
        long unsigned int          xmit_more;            /*  0x68   0x8 */
        unsigned int               tx_dropped;           /*  0x70   0x4 */

        /* XXX 4 bytes hole, try to pack */

        struct mlx4_bf             bf;                   /*  0x78  0x18 */
/* --- cacheline 2 boundary (128 bytes) was 16 bytes ago --- */
long