The first 8 bytes of a WQE are written to the hardware doorbell.
When compiling with -mcpu=cortex-a78ae, the compiler reorders the
64-bit load of these bytes before the 32-bit stores that construct them.
This happens because mlx5_doorbell_ring() receives the WQE value
by-value as a function argument, so the compiler evaluates the load
at the call site. The A78AE-specific instruction scheduler then hoists
this load above the WQE construction stores, as it fails to detect
the memory alias between the addresses used for the store and load.
The result is that stale data is written to the hardware doorbell,
corrupting the WQE and causing an infinite loop in the crypto
datapath.
Fix this by changing mlx5_doorbell_ring() to accept a pointer to the
WQE value instead of the value itself. A compiler barrier
(rte_compiler_barrier) is placed before the dereference to ensure all
WQE construction stores are visible before the value is read.
Fixes: 5dfa003db53f ("common/mlx5: fix post doorbell barrier")
Cc: [email protected]
Signed-off-by: Shani Peretz <[email protected]>
Acked-by: Bing Zhao <[email protected]>
---
drivers/common/mlx5/mlx5_common.h | 12 ++++++++----
drivers/compress/mlx5/mlx5_compress.c | 2 +-
drivers/crypto/mlx5/mlx5_crypto_gcm.c | 6 +++---
drivers/crypto/mlx5/mlx5_crypto_xts.c | 2 +-
drivers/net/mlx5/mlx5_flow_aso.c | 12 ++++++------
drivers/net/mlx5/mlx5_flow_quota.c | 2 +-
drivers/net/mlx5/mlx5_rxq.c | 3 ++-
drivers/net/mlx5/mlx5_tx.h | 4 ++--
drivers/net/mlx5/mlx5_txpp.c | 4 ++--
drivers/regex/mlx5/mlx5_regex_fastpath.c | 2 +-
drivers/vdpa/mlx5/mlx5_vdpa_event.c | 2 +-
11 files changed, 28 insertions(+), 23 deletions(-)
diff --git a/drivers/common/mlx5/mlx5_common.h
b/drivers/common/mlx5/mlx5_common.h
index 47fcc3fb06..7e8d6349e4 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -420,8 +420,8 @@ struct mlx5_uar {
*
* @param uar
* Pointer to UAR data structure.
- * @param val
- * value to write in big endian format.
+ * @param wqe
+ * Pointer to the first 64 bits of the WQE to write in big endian format.
* @param index
* Index of doorbell record.
* @param db_rec
@@ -430,9 +430,13 @@ struct mlx5_uar {
* Decide whether to flush the DB writing using a memory barrier.
*/
static __rte_always_inline void
-mlx5_doorbell_ring(struct mlx5_uar_data *uar, uint64_t val, uint32_t index,
- volatile uint32_t *db_rec, bool flash)
+mlx5_doorbell_ring(struct mlx5_uar_data *uar, volatile uint64_t *wqe,
+ uint32_t index, volatile uint32_t *db_rec, bool flash)
{
+ uint64_t val;
+
+ rte_compiler_barrier();
+ val = *wqe;
rte_io_wmb();
*db_rec = rte_cpu_to_be_32(index);
/* Ensure ordering between DB record actual update and UAR access. */
diff --git a/drivers/compress/mlx5/mlx5_compress.c
b/drivers/compress/mlx5/mlx5_compress.c
index e5325c6150..fa687b0eec 100644
--- a/drivers/compress/mlx5/mlx5_compress.c
+++ b/drivers/compress/mlx5/mlx5_compress.c
@@ -589,7 +589,7 @@ mlx5_compress_enqueue_burst(void *queue_pair, struct
rte_comp_op **ops,
qp->pi++;
} while (--remain);
qp->stats.enqueued_count += nb_ops;
- mlx5_doorbell_ring(&qp->priv->uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&qp->priv->uar.bf_db, (volatile uint64_t *)wqe,
qp->pi, &qp->qp.db_rec[MLX5_SND_DBR],
!qp->priv->uar.dbnc);
return nb_ops;
diff --git a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
index 89f32c7722..8875f9b71c 100644
--- a/drivers/crypto/mlx5/mlx5_crypto_gcm.c
+++ b/drivers/crypto/mlx5/mlx5_crypto_gcm.c
@@ -858,12 +858,12 @@ mlx5_crypto_gcm_enqueue_burst(void *queue_pair,
RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET);
/* Only when there are no pending SEND_EN WQEs in background. */
if (!umr_cnt && !qp->has_umr) {
- mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t
*)qp->wqe,
+ mlx5_doorbell_ring(&priv->uar.bf_db, (volatile uint64_t
*)qp->wqe,
qp->pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
!priv->uar.dbnc);
} else {
mlx5_crypto_gcm_build_send_en(qp);
- mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t
*)qp->umr_wqe,
+ mlx5_doorbell_ring(&priv->uar.bf_db, (volatile uint64_t
*)qp->umr_wqe,
qp->umr_pi,
&qp->umr_qp_obj.db_rec[MLX5_SND_DBR],
!priv->uar.dbnc);
qp->last_gga_pi = qp->pi;
@@ -1078,7 +1078,7 @@ mlx5_crypto_gcm_ipsec_enqueue_burst(void *queue_pair,
/* Update the last GGA cseg with COMP. */
((struct mlx5_wqe_cseg *)qp->wqe)->flags =
RTE_BE32(MLX5_COMP_ALWAYS << MLX5_COMP_MODE_OFFSET);
- mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->wqe,
+ mlx5_doorbell_ring(&priv->uar.bf_db, (volatile uint64_t *)qp->wqe,
qp->pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
!priv->uar.dbnc);
return nb_ops;
diff --git a/drivers/crypto/mlx5/mlx5_crypto_xts.c
b/drivers/crypto/mlx5/mlx5_crypto_xts.c
index 1c914caa85..e62641d154 100644
--- a/drivers/crypto/mlx5/mlx5_crypto_xts.c
+++ b/drivers/crypto/mlx5/mlx5_crypto_xts.c
@@ -353,7 +353,7 @@ mlx5_crypto_xts_enqueue_burst(void *queue_pair, struct
rte_crypto_op **ops,
qp->pi++;
} while (--remain);
qp->stats.enqueued_count += nb_ops;
- mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)qp->wqe,
+ mlx5_doorbell_ring(&priv->uar.bf_db, (volatile uint64_t *)qp->wqe,
qp->db_pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
!priv->uar.dbnc);
return nb_ops;
diff --git a/drivers/net/mlx5/mlx5_flow_aso.c b/drivers/net/mlx5/mlx5_flow_aso.c
index 5e2a81ef9c..3091cec411 100644
--- a/drivers/net/mlx5/mlx5_flow_aso.c
+++ b/drivers/net/mlx5/mlx5_flow_aso.c
@@ -469,7 +469,7 @@ mlx5_aso_sq_enqueue_burst(struct mlx5_dev_ctx_shared *sh,
uint16_t n)
} while (max);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
return sq->elts[start_head & mask].burst_size;
@@ -576,7 +576,7 @@ mlx5_aso_push_wqe(struct mlx5_dev_ctx_shared *sh,
{
if (sq->db_pi == sq->pi)
return;
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)sq->db,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)sq->db,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
@@ -885,7 +885,7 @@ mlx5_aso_mtr_sq_enqueue_single(struct mlx5_dev_ctx_shared
*sh,
sq->head++;
sq->pi += 2;/* Each WQE contains 2 WQEBB's. */
if (push) {
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
@@ -1338,7 +1338,7 @@ mlx5_aso_ct_sq_enqueue_single(struct mlx5_dev_ctx_shared
*sh,
sq->head++;
sq->pi += 2; /* Each WQE contains 2 WQEBB's. */
if (push) {
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
@@ -1470,7 +1470,7 @@ mlx5_aso_ct_sq_query_single(struct mlx5_dev_ctx_shared
*sh,
*/
sq->pi += 2;
if (push) {
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
@@ -1904,7 +1904,7 @@ mlx5_aso_cnt_sq_enqueue_burst(struct mlx5_hws_cnt_pool
*cpool,
} while (max);
wqe->general_cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
MLX5_COMP_MODE_OFFSET);
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
bursted_cnts = RTE_MIN((uint32_t)(sq->elts[0].burst_size * 4), n);
diff --git a/drivers/net/mlx5/mlx5_flow_quota.c
b/drivers/net/mlx5/mlx5_flow_quota.c
index d94167d0b0..72d2c545aa 100644
--- a/drivers/net/mlx5/mlx5_flow_quota.c
+++ b/drivers/net/mlx5/mlx5_flow_quota.c
@@ -312,7 +312,7 @@ mlx5_quota_cmd_wqe(struct rte_eth_dev *dev, struct
mlx5_quota *quota_obj,
sq->head++;
sq->pi += 2; /* Each WQE contains 2 WQEBB */
if (push) {
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, (volatile uint64_t *)wqe,
sq->pi, &sq->sq_obj.db_rec[MLX5_SND_DBR],
!sh->tx_uar.dbnc);
sq->db_pi = sq->pi;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 48d982a8c2..5e9bc0e666 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -1265,7 +1265,8 @@ mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
doorbell = (uint64_t)doorbell_hi << 32;
doorbell |= rxq->cqn;
- mlx5_doorbell_ring(&rxq->uar_data, rte_cpu_to_be_64(doorbell),
+ doorbell = rte_cpu_to_be_64(doorbell);
+ mlx5_doorbell_ring(&rxq->uar_data, &doorbell,
doorbell_hi, &rxq->cq_db[MLX5_CQ_ARM_DB], 0);
}
diff --git a/drivers/net/mlx5/mlx5_tx.h b/drivers/net/mlx5/mlx5_tx.h
index 0134a2e003..cd35a89ff8 100644
--- a/drivers/net/mlx5/mlx5_tx.h
+++ b/drivers/net/mlx5/mlx5_tx.h
@@ -313,7 +313,7 @@ mlx5_tx_bfreg(struct mlx5_txq_data *txq)
static __rte_always_inline void
mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
{
- mlx5_doorbell_ring(mlx5_tx_bfreg(txq), *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(mlx5_tx_bfreg(txq), (volatile uint64_t *)wqe,
txq->wqe_ci, txq->qp_db, 1);
}
@@ -3796,7 +3796,7 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict
txq,
* the next burst (after descriptor writing, at least).
*/
mlx5_doorbell_ring(mlx5_tx_bfreg(txq),
- *(volatile uint64_t *)loc.wqe_last, txq->wqe_ci,
+ (volatile uint64_t *)loc.wqe_last, txq->wqe_ci,
txq->qp_db, !txq->db_nc &&
(!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
/* Not all of the mbufs may be stored into elts yet. */
diff --git a/drivers/net/mlx5/mlx5_txpp.c b/drivers/net/mlx5/mlx5_txpp.c
index 0e99b58bde..ef94a5a7d5 100644
--- a/drivers/net/mlx5/mlx5_txpp.c
+++ b/drivers/net/mlx5/mlx5_txpp.c
@@ -171,7 +171,7 @@ mlx5_txpp_doorbell_rearm_queue(struct mlx5_dev_ctx_shared
*sh, uint16_t ci)
(wqe[ci & (wq->sq_size - 1)].ctrl[0]) | (ci - 1) << 8);
cs.w32[1] = wqe[ci & (wq->sq_size - 1)].ctrl[1];
/* Update SQ doorbell record with new SQ ci. */
- mlx5_doorbell_ring(&sh->tx_uar.bf_db, cs.w64, wq->sq_ci,
+ mlx5_doorbell_ring(&sh->tx_uar.bf_db, &cs.w64, wq->sq_ci,
wq->sq_obj.db_rec, !sh->tx_uar.dbnc);
}
@@ -480,7 +480,7 @@ mlx5_txpp_cq_arm(struct mlx5_dev_ctx_shared *sh)
uint64_t db_be =
rte_cpu_to_be_64(((uint64_t)db_hi << 32) | aq->cq_obj.cq->id);
- mlx5_doorbell_ring(&sh->tx_uar.cq_db, db_be, db_hi,
+ mlx5_doorbell_ring(&sh->tx_uar.cq_db, &db_be, db_hi,
&aq->cq_obj.db_rec[MLX5_CQ_ARM_DB], 0);
aq->arm_sn++;
}
diff --git a/drivers/regex/mlx5/mlx5_regex_fastpath.c
b/drivers/regex/mlx5/mlx5_regex_fastpath.c
index 3207bcbc60..52a0a93ad9 100644
--- a/drivers/regex/mlx5/mlx5_regex_fastpath.c
+++ b/drivers/regex/mlx5/mlx5_regex_fastpath.c
@@ -215,7 +215,7 @@ send_doorbell(struct mlx5_regex_priv *priv, struct
mlx5_regex_hw_qp *qp)
/* Or the fm_ce_se instead of set, avoid the fence be cleared. */
((struct mlx5_wqe_ctrl_seg *)wqe)->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE;
- mlx5_doorbell_ring(&priv->uar.bf_db, *(volatile uint64_t *)wqe,
+ mlx5_doorbell_ring(&priv->uar.bf_db, (volatile uint64_t *)wqe,
actual_pi, &qp->qp_obj.db_rec[MLX5_SND_DBR],
!priv->uar.dbnc);
}
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
index 43585a94b2..b9983a61ef 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -77,7 +77,7 @@ mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct
mlx5_vdpa_cq *cq)
uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq_obj.cq->id;
uint64_t db_be = rte_cpu_to_be_64(doorbell);
- mlx5_doorbell_ring(&priv->uar.cq_db, db_be, doorbell_hi,
+ mlx5_doorbell_ring(&priv->uar.cq_db, &db_be, doorbell_hi,
&cq->cq_obj.db_rec[MLX5_CQ_ARM_DB], 0);
cq->arm_sn++;
cq->armed = 1;
--
2.34.1