[PATCH v5 3/3] hw/ufs: Support for UFS logical unit

2023-07-18 Thread Jeuk Kim
From: Jeuk Kim 

This commit adds support for ufs logical unit.
The LU handles processing for the SCSI command,
unit descriptor query request.

This commit enables the UFS device to process
IO requests.

Signed-off-by: Jeuk Kim 
---
 hw/ufs/lu.c  | 1439 ++
 hw/ufs/meson.build   |2 +-
 hw/ufs/trace-events  |   25 +
 hw/ufs/ufs.c |  252 ++-
 hw/ufs/ufs.h |   43 ++
 include/scsi/constants.h |1 +
 6 files changed, 1755 insertions(+), 7 deletions(-)
 create mode 100644 hw/ufs/lu.c

diff --git a/hw/ufs/lu.c b/hw/ufs/lu.c
new file mode 100644
index 00..ab40f42190
--- /dev/null
+++ b/hw/ufs/lu.c
@@ -0,0 +1,1439 @@
+/*
+ * QEMU UFS Logical Unit
+ *
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All rights reserved.
+ *
+ * Written by Jeuk Kim 
+ *
+ * This code is licensed under the GNU GPL v2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/units.h"
+#include "qapi/error.h"
+#include "qemu/memalign.h"
+#include "hw/scsi/scsi.h"
+#include "scsi/constants.h"
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+#include "ufs.h"
+
+/*
+ * The code below handling SCSI commands is copied from hw/scsi/scsi-disk.c,
+ * with minor adjustments to make it work for UFS.
+ */
+
+#define SCSI_DMA_BUF_SIZE (128 * KiB)
+#define SCSI_MAX_INQUIRY_LEN 256
+#define SCSI_INQUIRY_DATA_SIZE 36
+#define SCSI_MAX_MODE_LEN 256
+
+typedef struct UfsSCSIReq {
+SCSIRequest req;
+/* Both sector and sector_count are in terms of BDRV_SECTOR_SIZE bytes.  */
+uint64_t sector;
+uint32_t sector_count;
+uint32_t buflen;
+bool started;
+bool need_fua_emulation;
+struct iovec iov;
+QEMUIOVector qiov;
+BlockAcctCookie acct;
+} UfsSCSIReq;
+
+static void ufs_scsi_free_request(SCSIRequest *req)
+{
+UfsSCSIReq *r = DO_UPCAST(UfsSCSIReq, req, req);
+
+qemu_vfree(r->iov.iov_base);
+}
+
+static void scsi_check_condition(UfsSCSIReq *r, SCSISense sense)
+{
+trace_ufs_scsi_check_condition(r->req.tag, sense.key, sense.asc,
+   sense.ascq);
+scsi_req_build_sense(&r->req, sense);
+scsi_req_complete(&r->req, CHECK_CONDITION);
+}
+
+static int ufs_scsi_emulate_vpd_page(SCSIRequest *req, uint8_t *outbuf,
+ uint32_t outbuf_len)
+{
+UfsHc *u = UFS(req->bus->qbus.parent);
+UfsLu *lu = DO_UPCAST(UfsLu, qdev, req->dev);
+uint8_t page_code = req->cmd.buf[2];
+int start, buflen = 0;
+
+if (outbuf_len < SCSI_INQUIRY_DATA_SIZE) {
+return -1;
+}
+
+outbuf[buflen++] = lu->qdev.type & 0x1f;
+outbuf[buflen++] = page_code;
+outbuf[buflen++] = 0x00;
+outbuf[buflen++] = 0x00;
+start = buflen;
+
+switch (page_code) {
+case 0x00: /* Supported page codes, mandatory */
+{
+trace_ufs_scsi_emulate_vpd_page_00(req->cmd.xfer);
+outbuf[buflen++] = 0x00; /* list of supported pages (this page) */
+if (u->params.serial) {
+outbuf[buflen++] = 0x80; /* unit serial number */
+}
+outbuf[buflen++] = 0x87; /* mode page policy */
+break;
+}
+case 0x80: /* Device serial number, optional */
+{
+int l;
+
+if (!u->params.serial) {
+trace_ufs_scsi_emulate_vpd_page_80_not_supported();
+return -1;
+}
+
+l = strlen(u->params.serial);
+if (l > SCSI_INQUIRY_DATA_SIZE) {
+l = SCSI_INQUIRY_DATA_SIZE;
+}
+
+trace_ufs_scsi_emulate_vpd_page_80(req->cmd.xfer);
+memcpy(outbuf + buflen, u->params.serial, l);
+buflen += l;
+break;
+}
+case 0x87: /* Mode Page Policy, mandatory */
+{
+trace_ufs_scsi_emulate_vpd_page_87(req->cmd.xfer);
+outbuf[buflen++] = 0x3f; /* apply to all mode pages and subpages */
+outbuf[buflen++] = 0xff;
+outbuf[buflen++] = 0; /* shared */
+outbuf[buflen++] = 0;
+break;
+}
+default:
+return -1;
+}
+/* done with EVPD */
+assert(buflen - start <= 255);
+outbuf[start - 1] = buflen - start;
+return buflen;
+}
+
+static int ufs_scsi_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf,
+uint32_t outbuf_len)
+{
+int buflen = 0;
+
+if (outbuf_len < SCSI_INQUIRY_DATA_SIZE) {
+return -1;
+}
+
+if (req->cmd.buf[1] & 0x1) {
+/* Vital product data */
+return ufs_scsi_emulate_vpd_page(req, outbuf, outbuf_len);
+}
+
+/* Standard INQUIRY data */
+if (req->cmd.buf[2] != 0) {
+return -1;
+}
+
+/* PAGE CODE == 0 */
+buflen = req->cmd.xfer;
+if (buflen > SCSI_MAX_INQUIRY_LEN) {
+buflen = SCSI_MAX_INQUIRY_LEN;
+}
+
+if (is_wlun(req->lun)) {
+outbuf[0] = TYPE_WLUN;
+} else {
+outbuf[0] = 0;
+}
+outbuf[1] = 0;
+
+strpadcpy((char *)&outbuf[16], 16, "QEMU UFS",

[PATCH v5 2/3] hw/ufs: Support for Query Transfer Requests

2023-07-18 Thread Jeuk Kim
From: Jeuk Kim 

This commit makes the UFS device support query
and nop out transfer requests.

The next patch would be support for UFS logical
unit and scsi command transfer request.

Signed-off-by: Jeuk Kim 
---
 hw/ufs/trace-events |   1 +
 hw/ufs/ufs.c| 980 +++-
 hw/ufs/ufs.h|  46 +++
 3 files changed, 1025 insertions(+), 2 deletions(-)

diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
index d1badcad10..665e1a942b 100644
--- a/hw/ufs/trace-events
+++ b/hw/ufs/trace-events
@@ -18,6 +18,7 @@ ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) 
"failed to read req upiu
 ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
 ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to write utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
 ufs_err_dma_write_rsp_upiu(uint32_t slot, uint64_t addr) "failed to write rsp 
upiu. UTRLDBR slot %"PRIu32", response upiu addr %"PRIu64""
+ufs_err_utrl_slot_error(uint32_t slot) "UTRLDBR slot %"PRIu32" is in error"
 ufs_err_utrl_slot_busy(uint32_t slot) "UTRLDBR slot %"PRIu32" is busy"
 ufs_err_unsupport_register_offset(uint32_t offset) "Register offset 
0x%"PRIx32" is not yet supported"
 ufs_err_invalid_register_offset(uint32_t offset) "Register offset 0x%"PRIx32" 
is invalid"
diff --git a/hw/ufs/ufs.c b/hw/ufs/ufs.c
index 101082a8a3..cd33af2cde 100644
--- a/hw/ufs/ufs.c
+++ b/hw/ufs/ufs.c
@@ -15,10 +15,221 @@
 #include "ufs.h"
 
 /* The QEMU-UFS device follows spec version 3.1 */
-#define UFS_SPEC_VER 0x0310
+#define UFS_SPEC_VER 0x0310
 #define UFS_MAX_NUTRS 32
 #define UFS_MAX_NUTMRS 8
 
+static MemTxResult ufs_addr_read(UfsHc *u, hwaddr addr, void *buf, int size)
+{
+hwaddr hi = addr + size - 1;
+
+if (hi < addr) {
+return MEMTX_DECODE_ERROR;
+}
+
+if (!FIELD_EX32(u->reg.cap, CAP, 64AS) && (hi >> 32)) {
+return MEMTX_DECODE_ERROR;
+}
+
+return pci_dma_read(PCI_DEVICE(u), addr, buf, size);
+}
+
+static MemTxResult ufs_addr_write(UfsHc *u, hwaddr addr, const void *buf,
+  int size)
+{
+hwaddr hi = addr + size - 1;
+if (hi < addr) {
+return MEMTX_DECODE_ERROR;
+}
+
+if (!FIELD_EX32(u->reg.cap, CAP, 64AS) && (hi >> 32)) {
+return MEMTX_DECODE_ERROR;
+}
+
+return pci_dma_write(PCI_DEVICE(u), addr, buf, size);
+}
+
+static void ufs_complete_req(UfsRequest *req, UfsReqResult req_result);
+
+static inline hwaddr ufs_get_utrd_addr(UfsHc *u, uint32_t slot)
+{
+hwaddr utrl_base_addr = (((hwaddr)u->reg.utrlbau) << 32) + u->reg.utrlba;
+hwaddr utrd_addr = utrl_base_addr + slot * sizeof(UtpTransferReqDesc);
+
+return utrd_addr;
+}
+
+static inline hwaddr ufs_get_req_upiu_base_addr(const UtpTransferReqDesc *utrd)
+{
+uint32_t cmd_desc_base_addr_lo =
+le32_to_cpu(utrd->command_desc_base_addr_lo);
+uint32_t cmd_desc_base_addr_hi =
+le32_to_cpu(utrd->command_desc_base_addr_hi);
+
+return (((hwaddr)cmd_desc_base_addr_hi) << 32) + cmd_desc_base_addr_lo;
+}
+
+static inline hwaddr ufs_get_rsp_upiu_base_addr(const UtpTransferReqDesc *utrd)
+{
+hwaddr req_upiu_base_addr = ufs_get_req_upiu_base_addr(utrd);
+uint32_t rsp_upiu_byte_off =
+le16_to_cpu(utrd->response_upiu_offset) * sizeof(uint32_t);
+return req_upiu_base_addr + rsp_upiu_byte_off;
+}
+
+static MemTxResult ufs_dma_read_utrd(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+hwaddr utrd_addr = ufs_get_utrd_addr(u, req->slot);
+MemTxResult ret;
+
+ret = ufs_addr_read(u, utrd_addr, &req->utrd, sizeof(req->utrd));
+if (ret) {
+trace_ufs_err_dma_read_utrd(req->slot, utrd_addr);
+}
+return ret;
+}
+
+static MemTxResult ufs_dma_read_req_upiu(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+hwaddr req_upiu_base_addr = ufs_get_req_upiu_base_addr(&req->utrd);
+UtpUpiuReq *req_upiu = &req->req_upiu;
+uint32_t copy_size;
+uint16_t data_segment_length;
+MemTxResult ret;
+
+/*
+ * To know the size of the req_upiu, we need to read the
+ * data_segment_length in the header first.
+ */
+ret = ufs_addr_read(u, req_upiu_base_addr, &req_upiu->header,
+sizeof(UtpUpiuHeader));
+if (ret) {
+trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr);
+return ret;
+}
+data_segment_length = be16_to_cpu(req_upiu->header.data_segment_length);
+
+copy_size = sizeof(UtpUpiuHeader) + UFS_TRANSACTION_SPECIFIC_FIELD_SIZE +
+data_segment_length;
+
+ret = ufs_addr_read(u, req_upiu_base_addr, &req->req_upiu, copy_size);
+if (ret) {
+trace_ufs_err_dma_read_req_upiu(req->slot, req_upiu_base_addr);
+}
+return ret;
+}
+
+static MemTxResult ufs_dma_read_prdt(UfsRequest *req)
+{
+UfsHc *u = req->hc;
+uint16_t prdt_len = le16_to_cpu(req->utrd.prd_table_length);
+uint16_t prdt_byte_off 

[PATCH v5 1/3] hw/ufs: Initial commit for emulated Universal-Flash-Storage

2023-07-18 Thread Jeuk Kim
From: Jeuk Kim 

Universal Flash Storage (UFS) is a high-performance mass storage device
with a serial interface. It is primarily used as a high-performance
data storage device for embedded applications.

This commit contains code for UFS device to be recognized
as a UFS PCI device.
Patches to handle UFS logical unit and Transfer Request will follow.

Signed-off-by: Jeuk Kim 
---
 MAINTAINERS  |6 +
 docs/specs/pci-ids.rst   |2 +
 hw/Kconfig   |1 +
 hw/meson.build   |1 +
 hw/ufs/Kconfig   |4 +
 hw/ufs/meson.build   |1 +
 hw/ufs/trace-events  |   32 ++
 hw/ufs/trace.h   |1 +
 hw/ufs/ufs.c |  278 ++
 hw/ufs/ufs.h |   42 ++
 include/block/ufs.h  | 1048 ++
 include/hw/pci/pci.h |1 +
 include/hw/pci/pci_ids.h |1 +
 meson.build  |1 +
 14 files changed, 1419 insertions(+)
 create mode 100644 hw/ufs/Kconfig
 create mode 100644 hw/ufs/meson.build
 create mode 100644 hw/ufs/trace-events
 create mode 100644 hw/ufs/trace.h
 create mode 100644 hw/ufs/ufs.c
 create mode 100644 hw/ufs/ufs.h
 create mode 100644 include/block/ufs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 12e59b6b27..0c8a955b42 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2256,6 +2256,12 @@ F: tests/qtest/nvme-test.c
 F: docs/system/devices/nvme.rst
 T: git git://git.infradead.org/qemu-nvme.git nvme-next
 
+ufs
+M: Jeuk Kim 
+S: Supported
+F: hw/ufs/*
+F: include/block/ufs.h
+
 megasas
 M: Hannes Reinecke 
 L: qemu-block@nongnu.org
diff --git a/docs/specs/pci-ids.rst b/docs/specs/pci-ids.rst
index e302bea484..d6707fa069 100644
--- a/docs/specs/pci-ids.rst
+++ b/docs/specs/pci-ids.rst
@@ -92,6 +92,8 @@ PCI devices (other than virtio):
   PCI PVPanic device (``-device pvpanic-pci``)
 1b36:0012
   PCI ACPI ERST device (``-device acpi-erst``)
+1b36:0013
+  PCI UFS device (``-device ufs``)
 
 All these devices are documented in :doc:`index`.
 
diff --git a/hw/Kconfig b/hw/Kconfig
index ba62ff6417..9ca7b38c31 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -38,6 +38,7 @@ source smbios/Kconfig
 source ssi/Kconfig
 source timer/Kconfig
 source tpm/Kconfig
+source ufs/Kconfig
 source usb/Kconfig
 source virtio/Kconfig
 source vfio/Kconfig
diff --git a/hw/meson.build b/hw/meson.build
index c7ac7d3d75..f01fac4617 100644
--- a/hw/meson.build
+++ b/hw/meson.build
@@ -37,6 +37,7 @@ subdir('smbios')
 subdir('ssi')
 subdir('timer')
 subdir('tpm')
+subdir('ufs')
 subdir('usb')
 subdir('vfio')
 subdir('virtio')
diff --git a/hw/ufs/Kconfig b/hw/ufs/Kconfig
new file mode 100644
index 00..b7b3392e85
--- /dev/null
+++ b/hw/ufs/Kconfig
@@ -0,0 +1,4 @@
+config UFS_PCI
+bool
+default y if PCI_DEVICES
+depends on PCI
diff --git a/hw/ufs/meson.build b/hw/ufs/meson.build
new file mode 100644
index 00..eb5164bde9
--- /dev/null
+++ b/hw/ufs/meson.build
@@ -0,0 +1 @@
+system_ss.add(when: 'CONFIG_UFS_PCI', if_true: files('ufs.c'))
diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
new file mode 100644
index 00..d1badcad10
--- /dev/null
+++ b/hw/ufs/trace-events
@@ -0,0 +1,32 @@
+# ufs.c
+ufs_irq_raise(void) "INTx"
+ufs_irq_lower(void) "INTx"
+ufs_mmio_read(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" 
data 0x%"PRIx64" size %d"
+ufs_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" 
data 0x%"PRIx64" size %d"
+ufs_process_db(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_process_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_complete_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_sendback_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_exec_nop_cmd(uint32_t slot) "UTRLDBR slot %"PRIu32""
+ufs_exec_scsi_cmd(uint32_t slot, uint8_t lun, uint8_t opcode) "slot %"PRIu32", 
lun 0x%"PRIx8", opcode 0x%"PRIx8""
+ufs_exec_query_cmd(uint32_t slot, uint8_t opcode) "slot %"PRIu32", opcode 
0x%"PRIx8""
+ufs_process_uiccmd(uint32_t uiccmd, uint32_t ucmdarg1, uint32_t ucmdarg2, 
uint32_t ucmdarg3) "uiccmd 0x%"PRIx32", ucmdarg1 0x%"PRIx32", ucmdarg2 
0x%"PRIx32", ucmdarg3 0x%"PRIx32""
+
+# error condition
+ufs_err_dma_read_utrd(uint32_t slot, uint64_t addr) "failed to read utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
+ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) "failed to read req 
upiu. UTRLDBR slot %"PRIu32", request upiu addr %"PRIu64""
+ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
+ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to write utrd. 
UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
+ufs_err_dma_write_rsp_upiu(uint32_t slot, uint64_t addr) "failed to write rsp 
upiu. UTRLDBR slot %"PRIu32", response upiu addr %"PRIu64""
+ufs_err_utrl_slot_busy(uint32_t slot) "UTRLDBR slot %"PRIu32" is busy"
+ufs_err_unsupport_register_offset(uint32_t offset) "Register offset 
0x%"PRIx32" is not yet supported"
+ufs_err_invalid_register_

[PATCH v5 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-18 Thread Jeuk Kim
Since v4:
Addressed review comment from Stefan Hajnoczi. The fixes are as
follows.
- Keep u->reg fields in host endian (Removed little-endian helper 
  functions from MemoryRegionOps)
- Remove unnecessary NULL checks for g_new and g_malloc0
- Replace DEFINE_PROP_DRIVE_IOTHREAD -> DEFINE_PROP_DRIVE

In case you were wondering, ufs and ufs-lu have been tested for the
following behaviours.
1. Checked ufs device recognition in Windows10 environment
2. Verified ufs device recognition in Ubuntu 22.04 environment
3. Verified io behaviour via fio in Ubuntu 22.04 environment
4. Verified query request via ufs-tools in Ubuntu 22.04 environment

Since v3:
- Replace softmmu_ss -> system_ss in meson

Since v2:
Addressed review comment from Stefan Hajnoczi. The main fixes are as
follows.
- Use of SPDX licence identifiers
- fixed endianness error
- removed memory leak
- fixed DMA error handling logic

Since v1:
- use macros of "hw/registerfields.h" (Addressed Philippe's review
  comments)

This patch series adds support for a new PCI-based UFS device.

The UFS pci device id (PCI_DEVICE_ID_REDHAT_UFS) is not registered
in the Linux kernel yet, so it does not work right away, but I confirmed
that it works with Linux when the UFS pci device id is registered.

I have also verified that it works with Windows 10.

Jeuk Kim (3):
  hw/ufs: Initial commit for emulated Universal-Flash-Storage
  hw/ufs: Support for Query Transfer Requests
  hw/ufs: Support for UFS logical unit

 MAINTAINERS  |6 +
 docs/specs/pci-ids.rst   |2 +
 hw/Kconfig   |1 +
 hw/meson.build   |1 +
 hw/ufs/Kconfig   |4 +
 hw/ufs/lu.c  | 1439 
 hw/ufs/meson.build   |1 +
 hw/ufs/trace-events  |   58 ++
 hw/ufs/trace.h   |1 +
 hw/ufs/ufs.c | 1494 ++
 hw/ufs/ufs.h |  131 
 include/block/ufs.h  | 1048 ++
 include/hw/pci/pci.h |1 +
 include/hw/pci/pci_ids.h |1 +
 include/scsi/constants.h |1 +
 meson.build  |1 +
 16 files changed, 4190 insertions(+)
 create mode 100644 hw/ufs/Kconfig
 create mode 100644 hw/ufs/lu.c
 create mode 100644 hw/ufs/meson.build
 create mode 100644 hw/ufs/trace-events
 create mode 100644 hw/ufs/trace.h
 create mode 100644 hw/ufs/ufs.c
 create mode 100644 hw/ufs/ufs.h
 create mode 100644 include/block/ufs.h

-- 
2.34.1




Re: PING: [PATCH v4 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-18 Thread Jeuk Kim

On 2023-07-19 오전 3:56, Stefan Hajnoczi wrote:

On Tue, Jul 11, 2023 at 07:31:02PM +0900, Jeuk Kim wrote:

Hi,
Any more reviews...?

Dear Stefan
If you don't mind, Could you give it "reviewed-by"?
And is there anything else I should do...?


Sorry for the late reply. I was on vacation and am working my way
through pending code review and bug reports.

I have started reviewing this series and should be finish on Wednesday
or Thursday.

Stefan


Thank you for your review.
I have seen your comments on PATCH v4.
I'll send you a patch v5 as soon as I fix the things you commented on.

Thanks again.
Jeuk



Re: [Libguestfs] [libnbd PATCH v3 04/22] states: Prepare to send 64-bit requests

2023-07-18 Thread Eric Blake
On Tue, May 30, 2023 at 01:18:22PM -0500, Eric Blake wrote:
> > > +  /* It is more convenient to manage PAYLOAD_LEN by what was negotiated
> > > +   * than to require the user to have to set it correctly.
> > > +   * TODO: Add new h->strict bit to allow intentional protocol violation
> > > +   * for interoperability testing.
> > > +   */
> > > +  if (h->extended_headers)
> > > +flags |= LIBNBD_CMD_FLAG_PAYLOAD_LEN;
> > > +  else
> > > +flags &= ~LIBNBD_CMD_FLAG_PAYLOAD_LEN;
> > 
> > Nice -- I wanted to ask for:
> > 
> > flags &= ~(uint32_t)LIBNBD_CMD_FLAG_PAYLOAD_LEN;
> > 
> > due to LIBNBD_CMD_FLAG_PAYLOAD_LEN having type "int".
> > 
> > However: in patch#3, what has type "int" is:
> > 
> > +#define NBD_CMD_FLAG_PAYLOAD_LEN (1<<5)
> > 
> > and here we have LIBNBD_CMD_FLAG_PAYLOAD_LEN instead -- and the latter
> > has type unsigned int already, from your recent commit 69eecae2c03a
> > ("api: Generate flag values as unsigned", 2022-11-11).
> 
> Still, worth a (separate) cleanup patch to nbd-protocol.h to prefer
> unsigned constants for the flag values where they are not generated.

I pushed a preliminary commit 65011cf6 for libnbd along those lines,
and will copy the same changes to nbd-protocol.h over to nbdkit
shortly.  v4 of this series will be rebased on that.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 3/3] hw/ufs: Support for UFS logical unit

2023-07-18 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 05:33:59PM +0900, Jeuk Kim wrote:
> +static Property ufs_lu_props[] = {
> +DEFINE_PROP_DRIVE_IOTHREAD("drive", UfsLu, qdev.conf.blk),

This device is not aware of IOThreads, so I think DEFINE_PROP_DRIVE()
should be used instead.


signature.asc
Description: PGP signature


Re: [PATCH v4 2/3] hw/ufs: Support for Query Transfer Requests

2023-07-18 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 05:33:58PM +0900, Jeuk Kim wrote:
> +static MemTxResult ufs_dma_read_prdt(UfsRequest *req)
> +{
> +UfsHc *u = req->hc;
> +uint16_t prdt_len = le16_to_cpu(req->utrd.prd_table_length);
> +uint16_t prdt_byte_off =
> +le16_to_cpu(req->utrd.prd_table_offset) * sizeof(uint32_t);
> +uint32_t prdt_size = prdt_len * sizeof(UfshcdSgEntry);
> +g_autofree UfshcdSgEntry *prd_entries = NULL;
> +hwaddr req_upiu_base_addr, prdt_base_addr;
> +int err;
> +
> +assert(!req->sg);
> +
> +if (prdt_len == 0) {
> +return MEMTX_OK;
> +}
> +
> +prd_entries = g_new(UfshcdSgEntry, prdt_size);
> +if (!prd_entries) {

g_new() never returns NULL. The process aborts if there is not enough
memory available.

Use g_try_new() if you want to handle memory allocation failure.

> +trace_ufs_err_memory_allocation();
> +return MEMTX_ERROR;
> +}
> +
> +req_upiu_base_addr = ufs_get_req_upiu_base_addr(&req->utrd);
> +prdt_base_addr = req_upiu_base_addr + prdt_byte_off;
> +
> +err = ufs_addr_read(u, prdt_base_addr, prd_entries, prdt_size);
> +if (err) {
> +trace_ufs_err_dma_read_prdt(req->slot, prdt_base_addr);
> +return err;
> +}
> +
> +req->sg = g_malloc0(sizeof(QEMUSGList));
> +if (!req->sg) {

g_malloc0() never returns NULL. The process aborts if there is not
enough memory available.



signature.asc
Description: PGP signature


Re: [PATCH v4 1/3] hw/ufs: Initial commit for emulated Universal-Flash-Storage

2023-07-18 Thread Stefan Hajnoczi
On Tue, Jul 04, 2023 at 05:33:57PM +0900, Jeuk Kim wrote:
> From: Jeuk Kim 
> 
> Universal Flash Storage (UFS) is a high-performance mass storage device
> with a serial interface. It is primarily used as a high-performance
> data storage device for embedded applications.
> 
> This commit contains code for UFS device to be recognized
> as a UFS PCI device.
> Patches to handle UFS logical unit and Transfer Request will follow.
> 
> Signed-off-by: Jeuk Kim 
> ---
>  MAINTAINERS  |6 +
>  docs/specs/pci-ids.rst   |2 +
>  hw/Kconfig   |1 +
>  hw/meson.build   |1 +
>  hw/ufs/Kconfig   |4 +
>  hw/ufs/meson.build   |1 +
>  hw/ufs/trace-events  |   33 ++
>  hw/ufs/trace.h   |1 +
>  hw/ufs/ufs.c |  304 +++
>  hw/ufs/ufs.h |   42 ++
>  include/block/ufs.h  | 1048 ++
>  include/hw/pci/pci.h |1 +
>  include/hw/pci/pci_ids.h |1 +
>  meson.build  |1 +
>  14 files changed, 1446 insertions(+)
>  create mode 100644 hw/ufs/Kconfig
>  create mode 100644 hw/ufs/meson.build
>  create mode 100644 hw/ufs/trace-events
>  create mode 100644 hw/ufs/trace.h
>  create mode 100644 hw/ufs/ufs.c
>  create mode 100644 hw/ufs/ufs.h
>  create mode 100644 include/block/ufs.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4feea49a6e..756aae8623 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2237,6 +2237,12 @@ F: tests/qtest/nvme-test.c
>  F: docs/system/devices/nvme.rst
>  T: git git://git.infradead.org/qemu-nvme.git nvme-next
>  
> +ufs
> +M: Jeuk Kim 
> +S: Supported
> +F: hw/ufs/*
> +F: include/block/ufs.h
> +
>  megasas
>  M: Hannes Reinecke 
>  L: qemu-block@nongnu.org
> diff --git a/docs/specs/pci-ids.rst b/docs/specs/pci-ids.rst
> index e302bea484..d6707fa069 100644
> --- a/docs/specs/pci-ids.rst
> +++ b/docs/specs/pci-ids.rst
> @@ -92,6 +92,8 @@ PCI devices (other than virtio):
>PCI PVPanic device (``-device pvpanic-pci``)
>  1b36:0012
>PCI ACPI ERST device (``-device acpi-erst``)
> +1b36:0013
> +  PCI UFS device (``-device ufs``)
>  
>  All these devices are documented in :doc:`index`.
>  
> diff --git a/hw/Kconfig b/hw/Kconfig
> index ba62ff6417..9ca7b38c31 100644
> --- a/hw/Kconfig
> +++ b/hw/Kconfig
> @@ -38,6 +38,7 @@ source smbios/Kconfig
>  source ssi/Kconfig
>  source timer/Kconfig
>  source tpm/Kconfig
> +source ufs/Kconfig
>  source usb/Kconfig
>  source virtio/Kconfig
>  source vfio/Kconfig
> diff --git a/hw/meson.build b/hw/meson.build
> index c7ac7d3d75..f01fac4617 100644
> --- a/hw/meson.build
> +++ b/hw/meson.build
> @@ -37,6 +37,7 @@ subdir('smbios')
>  subdir('ssi')
>  subdir('timer')
>  subdir('tpm')
> +subdir('ufs')
>  subdir('usb')
>  subdir('vfio')
>  subdir('virtio')
> diff --git a/hw/ufs/Kconfig b/hw/ufs/Kconfig
> new file mode 100644
> index 00..b7b3392e85
> --- /dev/null
> +++ b/hw/ufs/Kconfig
> @@ -0,0 +1,4 @@
> +config UFS_PCI
> +bool
> +default y if PCI_DEVICES
> +depends on PCI
> diff --git a/hw/ufs/meson.build b/hw/ufs/meson.build
> new file mode 100644
> index 00..eb5164bde9
> --- /dev/null
> +++ b/hw/ufs/meson.build
> @@ -0,0 +1 @@
> +system_ss.add(when: 'CONFIG_UFS_PCI', if_true: files('ufs.c'))
> diff --git a/hw/ufs/trace-events b/hw/ufs/trace-events
> new file mode 100644
> index 00..17793929b1
> --- /dev/null
> +++ b/hw/ufs/trace-events
> @@ -0,0 +1,33 @@
> +# ufs.c
> +ufs_irq_raise(void) "INTx"
> +ufs_irq_lower(void) "INTx"
> +ufs_mmio_read(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" 
> data 0x%"PRIx64" size %d"
> +ufs_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 
> 0x%"PRIx64" data 0x%"PRIx64" size %d"
> +ufs_process_db(uint32_t slot) "UTRLDBR slot %"PRIu32""
> +ufs_process_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
> +ufs_complete_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
> +ufs_sendback_req(uint32_t slot) "UTRLDBR slot %"PRIu32""
> +ufs_exec_nop_cmd(uint32_t slot) "UTRLDBR slot %"PRIu32""
> +ufs_exec_scsi_cmd(uint32_t slot, uint8_t lun, uint8_t opcode) "slot 
> %"PRIu32", lun 0x%"PRIx8", opcode 0x%"PRIx8""
> +ufs_exec_query_cmd(uint32_t slot, uint8_t opcode) "slot %"PRIu32", opcode 
> 0x%"PRIx8""
> +ufs_process_uiccmd(uint32_t uiccmd, uint32_t ucmdarg1, uint32_t ucmdarg2, 
> uint32_t ucmdarg3) "uiccmd 0x%"PRIx32", ucmdarg1 0x%"PRIx32", ucmdarg2 
> 0x%"PRIx32", ucmdarg3 0x%"PRIx32""
> +
> +# error condition
> +ufs_err_memory_allocation(void) "failed to allocate memory"
> +ufs_err_dma_read_utrd(uint32_t slot, uint64_t addr) "failed to read utrd. 
> UTRLDBR slot %"PRIu32", UTRD dma addr %"PRIu64""
> +ufs_err_dma_read_req_upiu(uint32_t slot, uint64_t addr) "failed to read req 
> upiu. UTRLDBR slot %"PRIu32", request upiu addr %"PRIu64""
> +ufs_err_dma_read_prdt(uint32_t slot, uint64_t addr) "failed to read prdt. 
> UTRLDBR slot %"PRIu32", prdt addr %"PRIu64""
> +ufs_err_dma_write_utrd(uint32_t slot, uint64_t addr) "failed to wr

Re: PING: [PATCH v4 0/3] hw/ufs: Add Universal Flash Storage (UFS) support

2023-07-18 Thread Stefan Hajnoczi
On Tue, Jul 11, 2023 at 07:31:02PM +0900, Jeuk Kim wrote:
> Hi,
> Any more reviews...?
> 
> Dear Stefan
> If you don't mind, Could you give it "reviewed-by"?
> And is there anything else I should do...?

Sorry for the late reply. I was on vacation and am working my way
through pending code review and bug reports.

I have started reviewing this series and should be finish on Wednesday
or Thursday.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 6/5] qemu-nbd: make verbose bool and local variable in main()

2023-07-18 Thread Eric Blake
On Mon, Jul 17, 2023 at 10:25:20PM +0200, Denis V. Lunev wrote:
> Pass 'verbose' to nbd_client_thread() inside NbdClientOpts which looks
> a little bit cleaner and make it bool as it is used as bool actually.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Eric Blake 
> CC: Vladimir Sementsov-Ogievskiy 
> ---
>  qemu-nbd.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)

Reviewed-by: Eric Blake 

I'll queue the series through my NBD tree.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 5/5] qemu-nbd: handle dup2() error when qemu-nbd finished setup process

2023-07-18 Thread Eric Blake
On Mon, Jul 17, 2023 at 04:55:44PM +0200, Denis V. Lunev wrote:
> Fail on error, we are in trouble.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Eric Blake 
> CC: Vladimir Sementsov-Ogievskiy 
> ---
>  qemu-nbd.c | 14 --
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/qemu-nbd.c b/qemu-nbd.c
> index f27613cb57..cd0e965705 100644
> --- a/qemu-nbd.c
> +++ b/qemu-nbd.c
> @@ -323,7 +323,12 @@ static void *nbd_client_thread(void *arg)
>  opts->device, srcpath);
>  } else {
>  /* Close stderr so that the qemu-nbd process exits.  */
> -dup2(STDOUT_FILENO, STDERR_FILENO);
> +int err = dup2(STDOUT_FILENO, STDERR_FILENO);
> +if (err < 0) {

Shorter to drop the temporary variable, and just do:

   if (dup2(...) < 0) {

> +error_report("Could not set stderr to /dev/null: %s",
> + strerror(errno));
> +exit(EXIT_FAILURE);
> +}
>  }

Either way,
Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 4/5] qemu-nbd: properly report error if qemu_daemon() is failed

2023-07-18 Thread Eric Blake
On Mon, Jul 17, 2023 at 04:55:43PM +0200, Denis V. Lunev wrote:
> errno has been overwritten by dup2() just below qemu_daemon() and thus
> improperly returned to the caller. Fix accordingly.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Eric Blake 
> CC: Vladimir Sementsov-Ogievskiy 
> ---
>  qemu-nbd.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)

Reviewed-by: Eric Blake 

> 
> diff --git a/qemu-nbd.c b/qemu-nbd.c
> index 4450cc826b..f27613cb57 100644
> --- a/qemu-nbd.c
> +++ b/qemu-nbd.c
> @@ -932,9 +932,12 @@ int main(int argc, char **argv)
>  error_report("Failed to fork: %s", strerror(errno));
>  exit(EXIT_FAILURE);
>  } else if (pid == 0) {
> +int saved_errno;
> +
>  close(stderr_fd[0]);
>  
>  ret = qemu_daemon(1, 0);
> +saved_errno = errno;/* dup2 will overwrite error below */

I would have written 'may', not 'will'; but that's a triviality not
worth changing.


-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 3/5] qemu-nbd: properly report error on error in dup2() after qemu_daemon()

2023-07-18 Thread Eric Blake
On Mon, Jul 17, 2023 at 04:55:42PM +0200, Denis V. Lunev wrote:
> We are trying to temporary redirect stderr of daemonized process to

temporarily

> a pipe to report a error and get failed. In that case we could not
> use error_report() helper, but should write the message directly into
> the problematic pipe.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Eric Blake 
> CC: Vladimir Sementsov-Ogievskiy 
> ---
>  qemu-nbd.c | 15 ++-
>  1 file changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/qemu-nbd.c b/qemu-nbd.c
> index 186ce9474c..4450cc826b 100644
> --- a/qemu-nbd.c
> +++ b/qemu-nbd.c
> @@ -937,7 +937,20 @@ int main(int argc, char **argv)
>  ret = qemu_daemon(1, 0);
>

errno from qemu_daemon...

>  /* Temporarily redirect stderr to the parent's pipe...  */
> -dup2(stderr_fd[1], STDERR_FILENO);
> +if (dup2(stderr_fd[1], STDERR_FILENO) < 0) {

...is still potentially lost here...

> +char str[256];
> +snprintf(str, sizeof(str),
> + "%s: Failed to link stderr to the pipe: %s\n",
> + g_get_prgname(), strerror(errno));
> +/*
> + * We are unable to use error_report() here as we need to get
> + * stderr pointed to the parent's pipe. Write to that pipe
> + * manually.
> + */
> +ret = write(stderr_fd[1], str, strlen(str));
> +exit(EXIT_FAILURE);
> +}
> +
>  if (ret < 0) {
>  error_report("Failed to daemonize: %s", strerror(errno));

...before use here.

Patch 4/5 addresses that, but I'm inclined to just swap the order of
the two patches when applying the series in my NBD tree.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Philippe Mathieu-Daudé

On 18/7/23 13:34, Klaus Jensen wrote:

On Jul 18 13:18, Philippe Mathieu-Daudé wrote:

On 18/7/23 12:35, Klaus Jensen wrote:

From: Klaus Jensen 

In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
doorbell buffers"), we fixed shadow doorbells for big-endian guests
running on little endian hosts. But I did not fix little-endian guests
on big-endian hosts. Fix this.

Solves issue #1765.

Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
Cc: qemu-sta...@nongnu.org
Reported-by: Thomas Huth 
Signed-off-by: Klaus Jensen 
---
   hw/nvme/ctrl.c | 18 +-
   1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8e8e870b9a80..dadc2dc7da10 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
   PCIDevice *pci = PCI_DEVICE(n);
   uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
   uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+uint32_t v;
   int i;
   /* Address should be page aligned */
@@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
   NvmeCQueue *cq = n->cq[i];
   if (sq) {
+v = cpu_to_le32(sq->tail);
+
   /*
* CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
* nvme_process_db() uses this hard-coded way to calculate
@@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
*/
   sq->db_addr = dbs_addr + (i << 3);
   sq->ei_addr = eis_addr + (i << 3);
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));


Or use the PCI DMA ldst API which does the swapping for you:

   stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);



Nice! That's definitely preferable. I'll queue that up for next, but
leave this patch as-is since it's been tested on a real big-endian host
and gotten its tags ;)


OK, then:

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Klaus Jensen
On Jul 18 13:18, Philippe Mathieu-Daudé wrote:
> On 18/7/23 12:35, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
> > doorbell buffers"), we fixed shadow doorbells for big-endian guests
> > running on little endian hosts. But I did not fix little-endian guests
> > on big-endian hosts. Fix this.
> > 
> > Solves issue #1765.
> > 
> > Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
> > Cc: qemu-sta...@nongnu.org
> > Reported-by: Thomas Huth 
> > Signed-off-by: Klaus Jensen 
> > ---
> >   hw/nvme/ctrl.c | 18 +-
> >   1 file changed, 13 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > index 8e8e870b9a80..dadc2dc7da10 100644
> > --- a/hw/nvme/ctrl.c
> > +++ b/hw/nvme/ctrl.c
> > @@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
> > NvmeRequest *req)
> >   PCIDevice *pci = PCI_DEVICE(n);
> >   uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
> >   uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
> > +uint32_t v;
> >   int i;
> >   /* Address should be page aligned */
> > @@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
> > NvmeRequest *req)
> >   NvmeCQueue *cq = n->cq[i];
> >   if (sq) {
> > +v = cpu_to_le32(sq->tail);
> > +
> >   /*
> >* CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
> >* nvme_process_db() uses this hard-coded way to calculate
> > @@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
> > NvmeRequest *req)
> >*/
> >   sq->db_addr = dbs_addr + (i << 3);
> >   sq->ei_addr = eis_addr + (i << 3);
> > -pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
> > +pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));
> 
> Or use the PCI DMA ldst API which does the swapping for you:
> 
>   stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
> 

Nice! That's definitely preferable. I'll queue that up for next, but
leave this patch as-is since it's been tested on a real big-endian host
and gotten its tags ;)


signature.asc
Description: PGP signature


Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Philippe Mathieu-Daudé

On 18/7/23 12:35, Klaus Jensen wrote:

From: Klaus Jensen 

In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
doorbell buffers"), we fixed shadow doorbells for big-endian guests
running on little endian hosts. But I did not fix little-endian guests
on big-endian hosts. Fix this.

Solves issue #1765.

Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
Cc: qemu-sta...@nongnu.org
Reported-by: Thomas Huth 
Signed-off-by: Klaus Jensen 
---
  hw/nvme/ctrl.c | 18 +-
  1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8e8e870b9a80..dadc2dc7da10 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  PCIDevice *pci = PCI_DEVICE(n);
  uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
  uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+uint32_t v;
  int i;
  
  /* Address should be page aligned */

@@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  NvmeCQueue *cq = n->cq[i];
  
  if (sq) {

+v = cpu_to_le32(sq->tail);
+
  /*
   * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
   * nvme_process_db() uses this hard-coded way to calculate
@@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
   */
  sq->db_addr = dbs_addr + (i << 3);
  sq->ei_addr = eis_addr + (i << 3);
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));


Or use the PCI DMA ldst API which does the swapping for you:

  stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);

  
  if (n->params.ioeventfd && sq->sqid != 0) {

  if (!nvme_init_sq_ioeventfd(sq)) {
@@ -6835,10 +6838,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  }
  
  if (cq) {

+v = cpu_to_le32(cq->head);
+
  /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
  cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
  cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));


  stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);

  
  if (n->params.ioeventfd && cq->cqid != 0) {

  if (!nvme_init_cq_ioeventfd(cq)) {
@@ -7587,7 +7592,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
  {
  PCIDevice *pci = PCI_DEVICE(n);
-uint32_t qid;
+uint32_t qid, v;
  
  if (unlikely(addr & ((1 << 2) - 1))) {

  NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
@@ -7654,7 +7659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  start_sqs = nvme_cq_full(cq) ? 1 : 0;
  cq->head = new_head;
  if (!qid && n->dbbuf_enabled) {
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+v = cpu_to_le32(cq->head);
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));


  stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);


  }
  if (start_sqs) {
  NvmeSQueue *sq;
@@ -7714,6 +7720,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  
  sq->tail = new_tail;

  if (!qid && n->dbbuf_enabled) {
+v = cpu_to_le32(sq->tail);
+
  /*
   * The spec states "the host shall also update the controller's
   * corresponding doorbell property to match the value of that 
entry
@@ -7727,7 +7735,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
   * including ones that run on Linux, are not updating Admin 
Queues,
   * so we can't trust reading it for an appropriate sq tail.
   */
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));


  stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);


  }
  
  qemu_bh_schedule(sq->bh);





Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Keith Busch
On Tue, Jul 18, 2023 at 12:35:12PM +0200, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
> doorbell buffers"), we fixed shadow doorbells for big-endian guests
> running on little endian hosts. But I did not fix little-endian guests
> on big-endian hosts. Fix this.
> 
> Solves issue #1765.
> 
> Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
> Cc: qemu-sta...@nongnu.org
> Reported-by: Thomas Huth 
> Signed-off-by: Klaus Jensen 

Looks good!

Reviewed-by: Keith Busch 



Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Thomas Huth

On 18/07/2023 12.35, Klaus Jensen wrote:

From: Klaus Jensen 

In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
doorbell buffers"), we fixed shadow doorbells for big-endian guests
running on little endian hosts. But I did not fix little-endian guests
on big-endian hosts. Fix this.

Solves issue #1765.

Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
Cc: qemu-sta...@nongnu.org
Reported-by: Thomas Huth 
Signed-off-by: Klaus Jensen 
---
  hw/nvme/ctrl.c | 18 +-
  1 file changed, 13 insertions(+), 5 deletions(-)


Thank you very much, this fixes 
tests/avocado/boot_linux_console.py:BootLinuxConsole.test_ppc_powernv8 on my 
s390x host:


Tested-by: Thomas Huth 




Re: [PATCH v2 0/7] iotests/parallels: Add new tests and fix old

2023-07-18 Thread Alexander Ivanov
This patchset also could be applied upon [PATCH v8 00/10] parallels: Add 
duplication check, repair at open, fix bugs.


On 7/1/23 12:11, Alexander Ivanov wrote:

This patchset should be applied on top of [PATCH v7 0/8] parallels: Add
duplication check, repair at open, fix bugs

Add out-of-image, leak and BAT entries duplication checks tests.

Old parallels images check test (131):  Refactor, fix cluster size and fix
after repairing was added to parallels_open().

v2:
5: Fix a typo.
7: Add a test for data_off check.

Alexander Ivanov (7):
   iotests: Add out-of-image check test for parallels format
   iotests: Add leak check test for parallels format
   iotests: Add test for BAT entries duplication check
   iotests: Refactor tests of parallels images checks (131)
   iotests: Fix cluster size in parallels images tests (131)
   iotests: Fix test 131 after repair was added to parallels_open()
   iotests: Add test for data_off check

  tests/qemu-iotests/131|  36 +++--
  tests/qemu-iotests/131.out|  59 +++
  tests/qemu-iotests/tests/parallels-checks | 145 ++
  tests/qemu-iotests/tests/parallels-checks.out |  75 +
  4 files changed, 264 insertions(+), 51 deletions(-)
  create mode 100755 tests/qemu-iotests/tests/parallels-checks
  create mode 100644 tests/qemu-iotests/tests/parallels-checks.out



--
Best regards,
Alexander Ivanov




[PATCH v8 07/10] parallels: Image repairing in parallels_open()

2023-07-18 Thread Alexander Ivanov
Repair an image at opening if the image is unclean or out-of-image
corruption was detected.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 70 +--
 1 file changed, 38 insertions(+), 32 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index a78238eadd..5100c8f903 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -951,7 +951,7 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 BDRVParallelsState *s = bs->opaque;
 ParallelsHeader ph;
 int ret, size, i;
-int64_t file_nb_sectors;
+int64_t file_nb_sectors, sector;
 QemuOpts *opts = NULL;
 Error *local_err = NULL;
 char *buf;
@@ -1024,11 +1024,6 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
  */
 s->header_size = size;
 }
-if (s->data_end > file_nb_sectors) {
-error_setg(errp, "Invalid image: incorrect data_off field");
-ret = -EINVAL;
-goto fail;
-}
 
 ret = bdrv_pread(bs->file, 0, s->header_size, s->header, 0);
 if (ret < 0) {
@@ -1036,33 +1031,8 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 s->bat_bitmap = (uint32_t *)(s->header + 1);
 
-for (i = 0; i < s->bat_size; i++) {
-int64_t off = bat2sect(s, i);
-if (off >= file_nb_sectors) {
-if (flags & BDRV_O_CHECK) {
-continue;
-}
-error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
-   "is larger than file size (%" PRIi64 ")",
-   off << BDRV_SECTOR_BITS, i,
-   file_nb_sectors << BDRV_SECTOR_BITS);
-ret = -EINVAL;
-goto fail;
-}
-if (off >= s->data_end) {
-s->data_end = off + s->tracks;
-}
-}
-
 if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
-/* Image was not closed correctly. The check is mandatory */
 s->header_unclean = true;
-if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
-error_setg(errp, "parallels: Image was not closed correctly; "
-   "cannot be opened read/write");
-ret = -EACCES;
-goto fail;
-}
 }
 
 opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, errp);
@@ -1123,10 +1093,40 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
bdrv_get_device_or_node_name(bs));
 ret = migrate_add_blocker(s->migration_blocker, errp);
 if (ret < 0) {
-error_free(s->migration_blocker);
+error_setg(errp, "Migration blocker error");
 goto fail;
 }
 qemu_co_mutex_init(&s->lock);
+
+for (i = 0; i < s->bat_size; i++) {
+sector = bat2sect(s, i);
+if (sector + s->tracks > s->data_end) {
+s->data_end = sector + s->tracks;
+}
+}
+
+/*
+ * We don't repair the image here if it's opened for checks. Also we don't
+ * want to change inactive images and can't change readonly images.
+ */
+if ((flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) || !(flags & BDRV_O_RDWR)) {
+return 0;
+}
+
+/*
+ * Repair the image if it's dirty or
+ * out-of-image corruption was detected.
+ */
+if (s->data_end > file_nb_sectors || s->header_unclean) {
+BdrvCheckResult res;
+ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Could not repair corrupted image");
+migrate_del_blocker(s->migration_blocker);
+goto fail;
+}
+}
+
 return 0;
 
 fail_format:
@@ -1134,6 +1134,12 @@ fail_format:
 fail_options:
 ret = -EINVAL;
 fail:
+/*
+ * "s" object was allocated by g_malloc0 so we can safely
+ * try to free its fields even they were not allocated.
+ */
+error_free(s->migration_blocker);
+g_free(s->bat_dirty_bmap);
 qemu_vfree(s->header);
 return ret;
 }
-- 
2.34.1




[PATCH v8 09/10] parallels: Add data_off check

2023-07-18 Thread Alexander Ivanov
data_off field of the parallels image header can be corrupted. Check if
this field greater than the header + BAT size and less than file size.

Signed-off-by: Alexander Ivanov 
---
 block/parallels.c | 80 +++
 1 file changed, 80 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index 3414807089..103acb7b40 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -450,6 +450,81 @@ static void parallels_check_unclean(BlockDriverState *bs,
 }
 }
 
+/*
+ * Returns true if data_off is correct, otherwise false. In both cases
+ * correct_offset is set to the proper value.
+ */
+static bool parallels_test_data_off(BDRVParallelsState *s,
+int64_t file_nb_sectors,
+uint32_t *correct_offset)
+{
+uint32_t data_off, min_off;
+bool old_magic;
+
+/*
+ * There are two slightly different image formats: with "WithoutFreeSpace"
+ * or "WithouFreSpacExt" magic words. Call the first one as "old magic".
+ * In such images data_off field can be zero. In this case the offset is
+ * calculated as the end of BAT table plus some padding to ensure sector
+ * size alignment.
+ */
+old_magic = !memcmp(s->header->magic, HEADER_MAGIC, 16);
+
+min_off = DIV_ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
+if (!old_magic) {
+min_off = ROUND_UP(min_off, s->cluster_size / BDRV_SECTOR_SIZE);
+}
+
+if (correct_offset) {
+*correct_offset = min_off;
+}
+
+data_off = le32_to_cpu(s->header->data_off);
+if (data_off == 0 && old_magic) {
+return true;
+}
+
+if (data_off < min_off || data_off > file_nb_sectors) {
+return false;
+}
+
+if (correct_offset) {
+*correct_offset = data_off;
+}
+
+return true;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_check_data_off(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t file_size;
+uint32_t data_off;
+
+file_size = bdrv_co_nb_sectors(bs->file->bs);
+if (file_size < 0) {
+res->check_errors++;
+return file_size;
+}
+
+if (parallels_test_data_off(s, file_size, &data_off)) {
+return 0;
+}
+
+res->corruptions++;
+if (fix & BDRV_FIX_ERRORS) {
+s->header->data_off = cpu_to_le32(data_off);
+res->corruptions_fixed++;
+}
+
+fprintf(stderr, "%s data_off field has incorrect value\n",
+fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
+
+return 0;
+}
+
 static int coroutine_fn GRAPH_RDLOCK
 parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
   BdrvCheckMode fix)
@@ -713,6 +788,11 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult 
*res,
 WITH_QEMU_LOCK_GUARD(&s->lock) {
 parallels_check_unclean(bs, res, fix);
 
+ret = parallels_check_data_off(bs, res, fix);
+if (ret < 0) {
+return ret;
+}
+
 ret = parallels_check_outside_image(bs, res, fix);
 if (ret < 0) {
 return ret;
-- 
2.34.1




Re: [PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Cédric Le Goater

On 7/18/23 12:35, Klaus Jensen wrote:

From: Klaus Jensen 

In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
doorbell buffers"), we fixed shadow doorbells for big-endian guests
running on little endian hosts. But I did not fix little-endian guests
on big-endian hosts. Fix this.

Solves issue #1765.


Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1765


Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
Cc: qemu-sta...@nongnu.org
Reported-by: Thomas Huth 
Signed-off-by: Klaus Jensen 


Tested-by: Cédric Le Goater 

with a PowerNV QEMU machine running on a s390x LPAR.

Thanks,

C.



---
  hw/nvme/ctrl.c | 18 +-
  1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8e8e870b9a80..dadc2dc7da10 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  PCIDevice *pci = PCI_DEVICE(n);
  uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
  uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+uint32_t v;
  int i;
  
  /* Address should be page aligned */

@@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  NvmeCQueue *cq = n->cq[i];
  
  if (sq) {

+v = cpu_to_le32(sq->tail);
+
  /*
   * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
   * nvme_process_db() uses this hard-coded way to calculate
@@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
   */
  sq->db_addr = dbs_addr + (i << 3);
  sq->ei_addr = eis_addr + (i << 3);
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));
  
  if (n->params.ioeventfd && sq->sqid != 0) {

  if (!nvme_init_sq_ioeventfd(sq)) {
@@ -6835,10 +6838,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  }
  
  if (cq) {

+v = cpu_to_le32(cq->head);
+
  /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
  cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
  cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));
  
  if (n->params.ioeventfd && cq->cqid != 0) {

  if (!nvme_init_cq_ioeventfd(cq)) {
@@ -7587,7 +7592,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
  {
  PCIDevice *pci = PCI_DEVICE(n);
-uint32_t qid;
+uint32_t qid, v;
  
  if (unlikely(addr & ((1 << 2) - 1))) {

  NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
@@ -7654,7 +7659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  start_sqs = nvme_cq_full(cq) ? 1 : 0;
  cq->head = new_head;
  if (!qid && n->dbbuf_enabled) {
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+v = cpu_to_le32(cq->head);
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));
  }
  if (start_sqs) {
  NvmeSQueue *sq;
@@ -7714,6 +7720,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  
  sq->tail = new_tail;

  if (!qid && n->dbbuf_enabled) {
+v = cpu_to_le32(sq->tail);
+
  /*
   * The spec states "the host shall also update the controller's
   * corresponding doorbell property to match the value of that 
entry
@@ -7727,7 +7735,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
   * including ones that run on Linux, are not updating Admin 
Queues,
   * so we can't trust reading it for an appropriate sq tail.
   */
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));
  }
  
  qemu_bh_schedule(sq->bh);





[PATCH v8 03/10] parallels: Check if data_end greater than the file size

2023-07-18 Thread Alexander Ivanov
Initially data_end is set to the data_off image header field and must not
be greater than the file size.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index 3c0dca3dbf..6a3d41373a 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -874,6 +874,11 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
  */
 s->header_size = size;
 }
+if (s->data_end > file_nb_sectors) {
+error_setg(errp, "Invalid image: incorrect data_off field");
+ret = -EINVAL;
+goto fail;
+}
 
 ret = bdrv_pread(bs->file, 0, s->header_size, s->header, 0);
 if (ret < 0) {
-- 
2.34.1




[PATCH v8 02/10] parallels: Incorrect data end calculation in parallels_open()

2023-07-18 Thread Alexander Ivanov
The BDRVParallelsState structure contains data_end field that is measured
in sectors. In parallels_open() initially this field is set by data_off
field from parallels image header.

According to the parallels format documentation, data_off field contains
an offset, in sectors, from the start of the file to the start of the
data area. For "WithoutFreeSpace" images: if data_off is zero, the offset
is calculated as the end of the BAT table plus some padding to ensure
sector size alignment.

The parallels_open() function has code for handling zero value in
data_off, but in the result data_end contains the offset in bytes.

Replace the alignment to sector size by division by sector size and fix
the comparision with s->header_size.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Hanna Czenczek 
---
 block/parallels.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index c7b2ed5a54..3c0dca3dbf 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -865,9 +865,9 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 s->data_end = le32_to_cpu(ph.data_off);
 if (s->data_end == 0) {
-s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
+s->data_end = DIV_ROUND_UP(size, BDRV_SECTOR_SIZE);
 }
-if (s->data_end < s->header_size) {
+if (s->data_end < (s->header_size >> BDRV_SECTOR_BITS)) {
 /*
  * There is not enough unused space to fit to block align between BAT
  * and actual data. We can't avoid read-modify-write...
-- 
2.34.1




[PATCH v8 08/10] parallels: Use bdrv_co_getlength() in parallels_check_outside_image()

2023-07-18 Thread Alexander Ivanov
bdrv_co_getlength() should be used in coroutine context. Replace
bdrv_getlength() by bdrv_co_getlength() in parallels_check_outside_image().

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/parallels.c b/block/parallels.c
index 5100c8f903..3414807089 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -500,7 +500,7 @@ parallels_check_leak(BlockDriverState *bs, BdrvCheckResult 
*res,
 int64_t size;
 int ret;
 
-size = bdrv_getlength(bs->file->bs);
+size = bdrv_co_getlength(bs->file->bs);
 if (size < 0) {
 res->check_errors++;
 return size;
-- 
2.34.1




[PATCH v8 04/10] parallels: Add "explicit" argument to parallels_check_leak()

2023-07-18 Thread Alexander Ivanov
In the on of the next patches we need to repair leaks without changing
leaks and leaks_fixed info in res. Also we don't want to print any warning
about leaks. Add "explicit" argument to skip info changing if the argument
is false.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 6a3d41373a..8bb5d115fc 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -488,7 +488,7 @@ parallels_check_outside_image(BlockDriverState *bs, 
BdrvCheckResult *res,
 
 static int coroutine_fn GRAPH_RDLOCK
 parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
- BdrvCheckMode fix)
+ BdrvCheckMode fix, bool explicit)
 {
 BDRVParallelsState *s = bs->opaque;
 int64_t size;
@@ -503,10 +503,13 @@ parallels_check_leak(BlockDriverState *bs, 
BdrvCheckResult *res,
 if (size > res->image_end_offset) {
 int64_t count;
 count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
-fprintf(stderr, "%s space leaked at the end of the image %" PRId64 
"\n",
-fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
-size - res->image_end_offset);
-res->leaks += count;
+if (explicit) {
+fprintf(stderr,
+"%s space leaked at the end of the image %" PRId64 "\n",
+fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR",
+size - res->image_end_offset);
+res->leaks += count;
+}
 if (fix & BDRV_FIX_LEAKS) {
 Error *local_err = NULL;
 
@@ -521,7 +524,9 @@ parallels_check_leak(BlockDriverState *bs, BdrvCheckResult 
*res,
 res->check_errors++;
 return ret;
 }
-res->leaks_fixed += count;
+if (explicit) {
+res->leaks_fixed += count;
+}
 }
 }
 
@@ -574,7 +579,7 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult 
*res,
 return ret;
 }
 
-ret = parallels_check_leak(bs, res, fix);
+ret = parallels_check_leak(bs, res, fix, true);
 if (ret < 0) {
 return ret;
 }
-- 
2.34.1




[PATCH v8 06/10] parallels: Add checking and repairing duplicate offsets in BAT

2023-07-18 Thread Alexander Ivanov
Cluster offsets must be unique among all the BAT entries. Find duplicate
offsets in the BAT and fix it by copying the content of the relevant
cluster to a newly allocated cluster and set the new cluster offset to the
duplicated entry.

Add host_cluster_index() helper to deduplicate the code.

When new clusters are allocated, the file size increases by 128 Mb. Call
parallels_check_leak() to fix this leak.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 144 ++
 1 file changed, 144 insertions(+)

diff --git a/block/parallels.c b/block/parallels.c
index f7b44cb433..a78238eadd 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, 
int64_t sector_num,
 return MIN(nb_sectors, ret);
 }
 
+static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
+{
+off -= s->data_start << BDRV_SECTOR_BITS;
+return off / s->cluster_size;
+}
+
 static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
 int nb_sectors, int *pnum)
 {
@@ -533,6 +539,139 @@ parallels_check_leak(BlockDriverState *bs, 
BdrvCheckResult *res,
 return 0;
 }
 
+static int coroutine_fn GRAPH_RDLOCK
+parallels_check_duplicate(BlockDriverState *bs, BdrvCheckResult *res,
+  BdrvCheckMode fix)
+{
+BDRVParallelsState *s = bs->opaque;
+int64_t host_off, host_sector, guest_sector;
+unsigned long *bitmap;
+uint32_t i, bitmap_size, cluster_index, bat_entry;
+int n, ret = 0;
+uint64_t *buf = NULL;
+bool fixed = false;
+
+/*
+ * Create a bitmap of used clusters.
+ * If a bit is set, there is a BAT entry pointing to this cluster.
+ * Loop through the BAT entries, check bits relevant to an entry offset.
+ * If bit is set, this entry is duplicated. Otherwise set the bit.
+ *
+ * We shouldn't worry about newly allocated clusters outside the image
+ * because they are created higher then any existing cluster pointed by
+ * a BAT entry.
+ */
+bitmap_size = host_cluster_index(s, res->image_end_offset);
+if (bitmap_size == 0) {
+return 0;
+}
+if (res->image_end_offset % s->cluster_size) {
+/* A not aligned image end leads to a bitmap shorter by 1 */
+bitmap_size++;
+}
+
+bitmap = bitmap_new(bitmap_size);
+
+buf = qemu_blockalign(bs, s->cluster_size);
+
+for (i = 0; i < s->bat_size; i++) {
+host_off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+if (host_off == 0) {
+continue;
+}
+
+cluster_index = host_cluster_index(s, host_off);
+assert(cluster_index < bitmap_size);
+if (!test_bit(cluster_index, bitmap)) {
+bitmap_set(bitmap, cluster_index, 1);
+continue;
+}
+
+/* this cluster duplicates another one */
+fprintf(stderr, "%s duplicate offset in BAT entry %u\n",
+fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
+
+res->corruptions++;
+
+if (!(fix & BDRV_FIX_ERRORS)) {
+continue;
+}
+
+/*
+ * Reset the entry and allocate a new cluster
+ * for the relevant guest offset. In this way we let
+ * the lower layer to place the new cluster properly.
+ * Copy the original cluster to the allocated one.
+ * But before save the old offset value for repairing
+ * if we have an error.
+ */
+bat_entry = s->bat_bitmap[i];
+parallels_set_bat_entry(s, i, 0);
+
+ret = bdrv_co_pread(bs->file, host_off, s->cluster_size, buf, 0);
+if (ret < 0) {
+res->check_errors++;
+goto out_repair_bat;
+}
+
+guest_sector = (i * (int64_t)s->cluster_size) >> BDRV_SECTOR_BITS;
+host_sector = allocate_clusters(bs, guest_sector, s->tracks, &n);
+if (host_sector < 0) {
+res->check_errors++;
+goto out_repair_bat;
+}
+host_off = host_sector << BDRV_SECTOR_BITS;
+
+ret = bdrv_co_pwrite(bs->file, host_off, s->cluster_size, buf, 0);
+if (ret < 0) {
+res->check_errors++;
+goto out_repair_bat;
+}
+
+if (host_off + s->cluster_size > res->image_end_offset) {
+res->image_end_offset = host_off + s->cluster_size;
+}
+
+/*
+ * In the future allocate_cluster() will reuse holed offsets
+ * inside the image. Keep the used clusters bitmap content
+ * consistent for the new allocated clusters too.
+ *
+ * Note, clusters allocated outside the current image are not
+ * considered, and the bitmap size doesn't change.
+ */
+cluster_index = host_cluster_index(s, host_off);
+if (cluster_index < bitmap_size) {
+bitmap_set(bitmap, cluster_index, 1);
+}
+
+fix

[PATCH v8 05/10] parallels: Add data_start field to BDRVParallelsState

2023-07-18 Thread Alexander Ivanov
In the next patch we will need the offset of the data area for host cluster
index calculation. Add this field and setting up code.

Signed-off-by: Alexander Ivanov 
Reviewed-by: Denis V. Lunev 
---
 block/parallels.c | 7 ---
 block/parallels.h | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 8bb5d115fc..f7b44cb433 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -868,10 +868,11 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 ret = -ENOMEM;
 goto fail;
 }
-s->data_end = le32_to_cpu(ph.data_off);
-if (s->data_end == 0) {
-s->data_end = DIV_ROUND_UP(size, BDRV_SECTOR_SIZE);
+s->data_start = le32_to_cpu(ph.data_off);
+if (s->data_start == 0) {
+s->data_start = DIV_ROUND_UP(size, BDRV_SECTOR_SIZE);
 }
+s->data_end = s->data_start;
 if (s->data_end < (s->header_size >> BDRV_SECTOR_BITS)) {
 /*
  * There is not enough unused space to fit to block align between BAT
diff --git a/block/parallels.h b/block/parallels.h
index f22f43f988..4e53e9572d 100644
--- a/block/parallels.h
+++ b/block/parallels.h
@@ -75,6 +75,7 @@ typedef struct BDRVParallelsState {
 uint32_t *bat_bitmap;
 unsigned int bat_size;
 
+int64_t  data_start;
 int64_t  data_end;
 uint64_t prealloc_size;
 ParallelsPreallocMode prealloc_mode;
-- 
2.34.1




[PATCH v8 00/10] parallels: Add duplication check, repair at open, fix bugs

2023-07-18 Thread Alexander Ivanov
Fix incorrect data end calculation in parallels_open().

Check if data_end greater than the file size.

Add change_info argument to parallels_check_leak().

Add checking and repairing duplicate offsets in BAT

Image repairing in parallels_open().

v8:
1: New patch. Fixed comments formatting.
2: Fixed a comment line length.
6: Fixed a typo in a label.
9: Patch was splitted to two patches. This patch only adds data_off check.
   Function parallels_test_data_off() was changed to return the correct
   offset in any case.
10: Added data_off repairing to parallels_open(),


v7:
3: Renamed "change_info" argument to "explicit", move info printing under
   explicit condition.
4: Different patch. Add data_start field to BDRVParallelsState for future
   host_cluster_index() function.
5: Prevously was 4th. Used s->data_start instead of s->header->data_off.
   Add bitmap size increasing if file length is not cluster aligned. Revert
   a couple conditions for better readability. Changed sectors calculation
   for better code transparency. Replaced sector variable by host_sector and
   guest_sector. Renamed off to host_off. Moved out_repare_bat: below return
   to get jumps on error path only.
6: Prevously was 5th.
7: New patch. Replace bdrv_getlength() by bdrv_co_getlength() in
   parallels_check_outside_image() because it is used in coroutine context.
8: New patch. Add data_off check.

v6:
2: Different patch. Refused to split image leak handling. Instead there is a
   patch with a data_end check.
3: Different patch. There is a patch with change_info argument.
4: Removed changing fprintf by qemu_log from this patchset. Previously 3rd
   patch became 4th. Replaced qemu_memalign() by qemu_blockalign(). Got
   rid of iovecs, replaced bdrv_co_pwritev() by bdrv_co_pwrite(). Added
   assert(cluster_index < bitmap_size). Now BAT changes are reverted if
   there was an error in the cluster copying process. Simplified a sector
   calculation.
5: Moved header magic check to the appropriate place. Added a
   migrate_del_blocker() call and s->bat_dirty_bmap freeing on error.

v5:
3: Fixed a byteorder bug, fixed zero-length image handling and fixed uint32
   truncation.

v4:
2,5: Rebased.

v3:
2: Added (size >= res->image_end_offset) assert and changed the comment in
   parallels_get_leak_size(). Changed error printing and leaks fixing order.
3: Removed highest_offset() helper, instead image_end_offset field is used.
5: Moved highest_offset() code to parallels_open() - now it is used only in
   this function. Fixed data_end update condition. Fixed a leak of
   s->migration_blocker.

v2:
2: Moved outsude parallels_check_leak() 2 helpers:
   parallels_get_leak_size() and parallels_fix_leak().
   
3: Used highest_offset() helper in parallels_check_leak(). Fixed a typo.
   Added comments. Replaced g_malloc() call by qemu_memalign(). Replaced
   bdrv_pread() call by bdrv_co_pread(). Got rid of keeping bytes and
   sectors in the same variable. Added setting the bitmap of the used
   clusters for a new allocated cluster if it isn't out of the bitmap.
   Moved the leak fix to the end of all the checks. Removed a dependence
   on image format for the duplicate check.
   
4 (old): Merged this patch to the previous.
4 (former 5): Fixed formatting.
5 (former 6): Fixed comments. Added O_INACTIVE check in the condition.
  Replaced inuse detection by header_unclean checking.
  Replaced playing with corutines by bdrv_check() usage.

Alexander Ivanov (10):
  parallels: Fix comments formatting inside parallels driver
  parallels: Incorrect data end calculation in parallels_open()
  parallels: Check if data_end greater than the file size
  parallels: Add "explicit" argument to parallels_check_leak()
  parallels: Add data_start field to BDRVParallelsState
  parallels: Add checking and repairing duplicate offsets in BAT
  parallels: Image repairing in parallels_open()
  parallels: Use bdrv_co_getlength() in parallels_check_outside_image()
  parallels: Add data_off check
  parallels: Add data_off repairing to parallels_open()

 block/parallels.c | 346 +++---
 block/parallels.h |   1 +
 2 files changed, 299 insertions(+), 48 deletions(-)

-- 
2.34.1




[PATCH v8 10/10] parallels: Add data_off repairing to parallels_open()

2023-07-18 Thread Alexander Ivanov
Place data_start/data_end calculation after reading the image header
to s->header. Set s->data_start to the offset calculated in
parallels_test_data_off(). Call bdrv_check() if data_off is incorrect.

Signed-off-by: Alexander Ivanov 
---
 block/parallels.c | 31 +--
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 103acb7b40..48c32d6821 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -1032,9 +1032,11 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 ParallelsHeader ph;
 int ret, size, i;
 int64_t file_nb_sectors, sector;
+uint32_t data_start;
 QemuOpts *opts = NULL;
 Error *local_err = NULL;
 char *buf;
+bool data_off_is_correct;
 
 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 if (ret < 0) {
@@ -1092,10 +1094,20 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 ret = -ENOMEM;
 goto fail;
 }
-s->data_start = le32_to_cpu(ph.data_off);
-if (s->data_start == 0) {
-s->data_start = DIV_ROUND_UP(size, BDRV_SECTOR_SIZE);
+
+ret = bdrv_pread(bs->file, 0, s->header_size, s->header, 0);
+if (ret < 0) {
+goto fail;
 }
+s->bat_bitmap = (uint32_t *)(s->header + 1);
+
+if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
+s->header_unclean = true;
+}
+
+data_off_is_correct = parallels_test_data_off(s, file_nb_sectors,
+  &data_start);
+s->data_start = data_start;
 s->data_end = s->data_start;
 if (s->data_end < (s->header_size >> BDRV_SECTOR_BITS)) {
 /*
@@ -1105,16 +1117,6 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 s->header_size = size;
 }
 
-ret = bdrv_pread(bs->file, 0, s->header_size, s->header, 0);
-if (ret < 0) {
-goto fail;
-}
-s->bat_bitmap = (uint32_t *)(s->header + 1);
-
-if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) {
-s->header_unclean = true;
-}
-
 opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, errp);
 if (!opts) {
 goto fail_options;
@@ -1197,7 +1199,8 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
  * Repair the image if it's dirty or
  * out-of-image corruption was detected.
  */
-if (s->data_end > file_nb_sectors || s->header_unclean) {
+if (s->data_end > file_nb_sectors || s->header_unclean
+|| !data_off_is_correct) {
 BdrvCheckResult res;
 ret = bdrv_check(bs, &res, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
 if (ret < 0) {
-- 
2.34.1




[PATCH v8 01/10] parallels: Fix comments formatting inside parallels driver

2023-07-18 Thread Alexander Ivanov
This patch is technically necessary as git patch rendering could result
in moving some code from one place to the another and that hits
checkpatch.pl warning. This problem specifically happens within next
series.

Signed-off-by: Alexander Ivanov 
---
 block/parallels.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index 18e34aef28..c7b2ed5a54 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -188,7 +188,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
 idx = sector_num / s->tracks;
 to_allocate = DIV_ROUND_UP(sector_num + *pnum, s->tracks) - idx;
 
-/* This function is called only by parallels_co_writev(), which will never
+/*
+ * This function is called only by parallels_co_writev(), which will never
  * pass a sector_num at or beyond the end of the image (because the block
  * layer never passes such a sector_num to that function). Therefore, idx
  * is always below s->bat_size.
@@ -196,7 +197,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
  * exceed the image end. Therefore, idx + to_allocate cannot exceed
  * s->bat_size.
  * Note that s->bat_size is an unsigned int, therefore idx + to_allocate
- * will always fit into a uint32_t. */
+ * will always fit into a uint32_t.
+ */
 assert(idx < s->bat_size && idx + to_allocate <= s->bat_size);
 
 space = to_allocate * s->tracks;
@@ -230,13 +232,15 @@ allocate_clusters(BlockDriverState *bs, int64_t 
sector_num,
 }
 }
 
-/* Try to read from backing to fill empty clusters
+/*
+ * Try to read from backing to fill empty clusters
  * FIXME: 1. previous write_zeroes may be redundant
  *2. most of data we read from backing will be rewritten by
  *   parallels_co_writev. On aligned-to-cluster write we do not 
need
  *   this read at all.
  *3. it would be good to combine write of data from backing and new
- *   data into one write call */
+ *   data into one write call.
+ */
 if (bs->backing) {
 int64_t nb_cow_sectors = to_allocate * s->tracks;
 int64_t nb_cow_bytes = nb_cow_sectors << BDRV_SECTOR_BITS;
@@ -864,8 +868,10 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE);
 }
 if (s->data_end < s->header_size) {
-/* there is not enough unused space to fit to block align between BAT
-   and actual data. We can't avoid read-modify-write... */
+/*
+ * There is not enough unused space to fit to block align between BAT
+ * and actual data. We can't avoid read-modify-write...
+ */
 s->header_size = size;
 }
 
-- 
2.34.1




[PATCH] hw/nvme: fix endianness issue for shadow doorbells

2023-07-18 Thread Klaus Jensen
From: Klaus Jensen 

In commit 2fda0726e514 ("hw/nvme: fix missing endian conversions for
doorbell buffers"), we fixed shadow doorbells for big-endian guests
running on little endian hosts. But I did not fix little-endian guests
on big-endian hosts. Fix this.

Solves issue #1765.

Fixes: 3f7fe8de3d49 ("hw/nvme: Implement shadow doorbell buffer support")
Cc: qemu-sta...@nongnu.org
Reported-by: Thomas Huth 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8e8e870b9a80..dadc2dc7da10 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6801,6 +6801,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 PCIDevice *pci = PCI_DEVICE(n);
 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+uint32_t v;
 int i;
 
 /* Address should be page aligned */
@@ -6818,6 +6819,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 NvmeCQueue *cq = n->cq[i];
 
 if (sq) {
+v = cpu_to_le32(sq->tail);
+
 /*
  * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
  * nvme_process_db() uses this hard-coded way to calculate
@@ -6825,7 +6828,7 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
  */
 sq->db_addr = dbs_addr + (i << 3);
 sq->ei_addr = eis_addr + (i << 3);
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));
 
 if (n->params.ioeventfd && sq->sqid != 0) {
 if (!nvme_init_sq_ioeventfd(sq)) {
@@ -6835,10 +6838,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 }
 
 if (cq) {
+v = cpu_to_le32(cq->head);
+
 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));
 
 if (n->params.ioeventfd && cq->cqid != 0) {
 if (!nvme_init_cq_ioeventfd(cq)) {
@@ -7587,7 +7592,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 {
 PCIDevice *pci = PCI_DEVICE(n);
-uint32_t qid;
+uint32_t qid, v;
 
 if (unlikely(addr & ((1 << 2) - 1))) {
 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
@@ -7654,7 +7659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 start_sqs = nvme_cq_full(cq) ? 1 : 0;
 cq->head = new_head;
 if (!qid && n->dbbuf_enabled) {
-pci_dma_write(pci, cq->db_addr, &cq->head, sizeof(cq->head));
+v = cpu_to_le32(cq->head);
+pci_dma_write(pci, cq->db_addr, &v, sizeof(cq->head));
 }
 if (start_sqs) {
 NvmeSQueue *sq;
@@ -7714,6 +7720,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 
 sq->tail = new_tail;
 if (!qid && n->dbbuf_enabled) {
+v = cpu_to_le32(sq->tail);
+
 /*
  * The spec states "the host shall also update the controller's
  * corresponding doorbell property to match the value of that entry
@@ -7727,7 +7735,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  * including ones that run on Linux, are not updating Admin Queues,
  * so we can't trust reading it for an appropriate sq tail.
  */
-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));
 }
 
 qemu_bh_schedule(sq->bh);
-- 
2.41.0