On Sun, Jun 12, 2022 at 07:40:55PM +0800, Jinhao Fan wrote:
> 
> > On Jun 10, 2022, at 1:27 AM, Klaus Jensen <i...@irrelevant.dk> wrote:
> > 
> > I'm ok with following the consensus here, but we all agree that this is
> > a blatant spec violation that ended up manifesting itself down the
> > stack, right?
> > 
> > So... if QEMU wants to be compliant here, I guess we could ask the
> > kernel to introduce a quirk for *compliant* controllers. Now, THAT would
> > be a first! Not sure if I am being serious or not here ;)
> 
> Hi all,
> 
> Is this our final decision?

What a mess...

The spec should have gone into more details on initializing the shadow and
event buffers if they really intended it to be run on a live queue.

Anyway, the following hack on top of your patch should allow the host to use
admin shadow queues, and also remain backward compatible for the "broken"
hosts, like Linux and SPDK.

---
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index a0a9208c0f..03d84feecf 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4267,7 +4267,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
-    if (sqid && n->dbbuf_dbs && n->dbbuf_eis) {
+    if (n->dbbuf_dbs && n->dbbuf_eis) {
         sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
         sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
     }
@@ -4632,7 +4632,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
-    if (cqid && n->dbbuf_dbs && n->dbbuf_eis) {
+    if (n->dbbuf_dbs && n->dbbuf_eis) {
         cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
         cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
     }
@@ -5805,7 +5805,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
     n->dbbuf_dbs = dbs_addr;
     n->dbbuf_eis = eis_addr;
 
-    for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
+    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
         NvmeSQueue *sq = n->sq[i];
         NvmeCQueue *cq = n->cq[i];
 
@@ -5813,12 +5813,16 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
             /* Submission queue tail pointer location, 2 * QID * stride */
             sq->db_addr = dbs_addr + 2 * i * stride;
             sq->ei_addr = eis_addr + 2 * i * stride;
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                    sizeof(sq->tail));
         }
 
         if (cq) {
             /* Completion queue head pointer location, (2 * QID + 1) * stride */
             cq->db_addr = dbs_addr + (2 * i + 1) * stride;
             cq->ei_addr = eis_addr + (2 * i + 1) * stride;
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                    sizeof(cq->head));
         }
     }
 
@@ -6479,8 +6483,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
-        if (!cq->db_addr) {
         cq->head = new_head;
+        if (cq->db_addr) {
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                    sizeof(cq->head));
         }
         if (start_sqs) {
             NvmeSQueue *sq;
@@ -6538,9 +6544,25 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 
         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
 
-        if (!sq->db_addr) {
         sq->tail = new_tail;
+        if (sq->db_addr) {
+            /*
+             * The spec states "the host shall also update the controller's
+             * corresponding doorbell property to match the value of that entry
+             * in the Shadow Doorbell buffer."
+             *
+             * Since this context is currently a VM trap, we can safely enforce
+             * the requirement from the device side in case the host is
+             * misbehaving.
+             *
+             * Note, we shouldn't have to do this, but various drivers
+             * including ones that run on Linux, are not updating Admin Queues,
+             * so we can't trust reading it for an appropriate sq tail.
+             */
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                    sizeof(sq->tail));
         }
+
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
--

Reply via email to