Hi all,

We are using two Mellanox VFs with DPDK v22.11 and are seeing an issue
when the DPDK rte_proc_secondary process tries to transmit packets.
Note that the DPDK rte_proc_primary process is able to transmit packets
successfully. The issue seems to be in check_cqe, which always returns
MLX5_CQE_STATUS_HW_OWN in the secondary process.




admin@10-50-54-244:~$ lspci | grep "Mellanox"
00:07.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4 Virtual Function]
00:08.0 Ethernet controller: Mellanox Technologies MT27700 Family [ConnectX-4 Virtual Function]


In our application:

proc0 -> is DPDK rte_proc_primary which initializes the necessary
shared memory data structures.

proc1 -> is DPDK rte_proc_secondary which attaches to pre-initialized
shared memory.


proc0(rte_proc_primary) uses port0(*00:07.0*) to xmit packets out -
works fine as expected.

But proc1(rte_proc_secondary) uses port1(*00:08.0*) to xmit packets
out - this doesn't work, as the packets are not seen on the wire.


Code snippets referenced by the gdb output below:

mlx5_tx.c

180  */
181 void
182 mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
183               unsigned int olx __rte_unused)
184 {
185     unsigned int count = MLX5_TX_COMP_MAX_CQE;
186     volatile struct mlx5_cqe *last_cqe = NULL;
187     bool ring_doorbell = false;
188     int ret;
189
190     do {
191         volatile struct mlx5_cqe *cqe;
192
193         cqe = &txq->cqes[txq->cq_ci & txq->cqe_m];
194         ret = check_cqe(cqe, txq->cqe_s, txq->cq_ci);
195         if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
196             if (likely(ret != MLX5_CQE_STATUS_ERR)) {
197                 /* No new CQEs in completion queue. */
198                 MLX5_ASSERT(ret == MLX5_CQE_STATUS_HW_OWN);
199                 break;
200             }


mlx5_common.h

195 static __rte_always_inline enum mlx5_cqe_status
196 check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
197       const uint16_t ci)
198 {
199     const uint16_t idx = ci & cqes_n;
200     const uint8_t op_own = cqe->op_own;
201     const uint8_t op_owner = MLX5_CQE_OWNER(op_own);
202     const uint8_t op_code = MLX5_CQE_OPCODE(op_own);
203
204     if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
205         return MLX5_CQE_STATUS_HW_OWN;
206     rte_io_rmb();
207     if (unlikely(op_code == MLX5_CQE_RESP_ERR ||
208              op_code == MLX5_CQE_REQ_ERR))
209         return MLX5_CQE_STATUS_ERR;
210     return MLX5_CQE_STATUS_SW_OWN;
211 }

*proc1(non-working process):* we have noticed the cq_ci remains 0 and
doesn't increase.

Thread 1 "se_dp" hit Breakpoint 1, mlx5_tx_handle_completion
(txq=0x6000496c72c0, olx=127)
    at ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c:184
184     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
185     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
186     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
187     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
193     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
194     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
195     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) info locals
cqe = 0x60004962b000
count = 2
last_cqe = 0x0
ring_doorbell = false
ret = -2
(gdb) p *txq
$1 = {elts_head = 35, elts_tail = 0, elts_comp = 32, elts_s = 1024,
elts_m = 1023, wqe_ci = 35,
  wqe_pi = 0, wqe_s = 4096, wqe_m = 4095, wqe_comp = 32, wqe_thres =
512, cq_ci = 0, cq_pi = 1,
  cqe_s = 64, cqe_m = 63, elts_n = 10, cqe_n = 6, wqe_n = 12, tso_en =
1, tunnel_en = 0, swp_en = 0,
  vlan_en = 0, db_nc = 0, db_heu = 0, rt_timestamp = 0, wait_on_time =
0, fast_free = 0,
  inlen_send = 18, inlen_empw = 0, inlen_mode = 18, qp_num_8s =
340992, offloads = 32815, mr_ctrl = {
    dev_gen_ptr = 0x60004c2d62b4, cur_gen = 0, mru = 0, head = 0,
cache = {{start = 0, end = 0,
        lkey = 0}, {start = 0, end = 0, lkey = 0}, {start = 0, end =
0, lkey = 0}, {start = 0,
        end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start =
0, end = 0, lkey = 0}, {
        start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey =
0}}, cache_bh = {len = 1,
      size = 256, table = 0x6000496c5d40}}, wqes = 0x60004c255000,
wqes_end = 0x60004c295000,
  fcqs = 0x60004c295dc0, cqes = 0x60004962b000, qp_db =
0x60004c295004, cq_db = 0x60004962c000,
  port_id = 1, idx = 0, rt_timemask = 0, ts_mask = 0, ts_offset = -1,
sh = 0x60004b865880, stats = {
    opackets = 35, obytes = 2228, oerrors = 0}, stats_reset =
{opackets = 0, obytes = 0, oerrors = 0},
  uar_data = {db = 0x0}, elts = 0x6000496c7448}


and check_cqe always returns MLX5_CQE_STATUS_HW_OWN

(gdb)
194     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) s
check_cqe (ci=0, cqes_n=64, cqe=0x60004962b000) at
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h:199
199     
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h:
No such file or directory.
(gdb) n
200     in 
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
201     in 
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
202     in 
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb)
204     in 
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb) n
205     in 
../../../../../../service_engine/dpdk-2211/drivers/common/mlx5/mlx5_common.h
(gdb) info locals
idx = 0
op_own = 241 '\361'
op_owner = 1 '\001'
op_code = 15 '\017'

Because *check_cqe* returns *MLX5_CQE_STATUS_HW_OWN*, we break at
line 199 in *mlx5_tx_handle_completion*, and *ring_doorbell* remains
*false* forever.

Below are the logs from mlx5_txq_devx_obj_new which is called by
proc0(rte_proc_primary) for port 1

ppriv: 0x60004b8316c0 ,ppriv->uar_table: 0x60004b8316c8,
txq_ctrl->uar_mmap_offset:0,
ppriv->uar_table[txq_data->idx]:0x7f6b2d211800, txq_data->idx: 0,
txq_data->db_nc:0

and logs from txq_uar_init_secondary which gets called by
proc1(rte_proc_secondary) for port 1

priv: 0x60004b8352c0, priv->sh: 0x60004b865880, priv->sh->pppriv: 0x60004b8316c0

txq_ctrl:0x6000496c71c0 priv:0x60004b8352c0

primary_ppriv->uar_table: 0x60004b8316c8 ,uar_va:7f6b2d211800
offset:800 addr:0x7f6b3fe47800

ppriv:0x60004962a180 ppriv->uar_table[txq->idx]:0x7f6b3fe47800, txq->idx:0


Now for the working cases all the counters are incrementing as expected.

*proc0(rte_proc_primary - working case)*:  cq_ci, cq_pi and other
counters are as expected.

Thread 1 "se_dp" hit Breakpoint 1, mlx5_tx_handle_completion
(txq=0x60004b898940, olx=127) at
../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c:184
184     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) n
185     in ../../../../../../service_engine/dpdk-2211/drivers/net/mlx5/mlx5_tx.c
(gdb) p *txq
$2 = {elts_head = 960, elts_tail = 931, elts_comp = 931, elts_s =
1024, elts_m = 1023, wqe_ci = 960, wqe_pi = 930, wqe_s = 4096, wqe_m =
4095, wqe_comp = 931, wqe_thres = 512, cq_ci = 28, cq_pi = 28, cqe_s =
64,
  cqe_m = 63, elts_n = 10, cqe_n = 6, wqe_n = 12, tso_en = 1,
tunnel_en = 0, swp_en = 0, vlan_en = 0, db_nc = 0, db_heu = 0,
rt_timestamp = 0, wait_on_time = 0, fast_free = 0, inlen_send = 18,
inlen_empw = 0,
  inlen_mode = 18, qp_num_8s = 865280, offloads = 32815, mr_ctrl =
{dev_gen_ptr = 0x600049a000f4, cur_gen = 0, mru = 0, head = 0, cache =
{{start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {
        start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0},
{start = 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}, {start
= 0, end = 0, lkey = 0}, {start = 0, end = 0, lkey = 0}}, cache_bh = {
      len = 1, size = 256, table = 0x60004b8973c0}}, wqes =
0x600049655000, wqes_end = 0x600049695000, fcqs = 0x600049697100, cqes
= 0x600049696000, qp_db = 0x600049695004, cq_db = 0x600049697000,
port_id = 0,
  idx = 0, rt_timemask = 0, ts_mask = 0, ts_offset = -1, sh =
0x60004be00c40, stats = {opackets = 960, obytes = 73222, oerrors = 0},
stats_reset = {opackets = 0, obytes = 0, oerrors = 0}, uar_data = {db
= 0x0},
  elts = 0x60004b898ac8}
(gdb)


Few questions:

1. Why isn't the cq_ci counter increasing in proc1(rte_proc_secondary)?
Does it mean the mlx backend hardware is not consuming the packets?

2. Why is the check_cqe stuck at MLX5_CQE_STATUS_HW_OWN in
proc1(rte_proc_secondary) ?


Thanks,

Samar

-- 
This electronic communication and the information and any files transmitted 
with it, or attached to it, are confidential and are intended solely for 
the use of the individual or entity to whom it is addressed and may contain 
information that is confidential, legally privileged, protected by privacy 
laws, or otherwise restricted from disclosure to anyone else. If you are 
not the intended recipient or the person responsible for delivering the 
e-mail to the intended recipient, you are hereby notified that any use, 
copying, distributing, dissemination, forwarding, printing, or copying of 
this e-mail is strictly prohibited. If you received this e-mail in error, 
please return the e-mail to the sender, delete it from your computer, and 
destroy any printed copy of it.

Reply via email to