date:20240328

[RFC PATCH v2 3/6] cxl/core: add report option for cxl_mem_get_poison()

2024-03-28 Thread Shiyang Ruan via

The GMER only has a "Physical Address" field; it has no field that
indicates the length.  So, when a poison event is received, we can use the
GET_POISON_LIST command to get the poison list.  The driver already has
cxl_mem_get_poison(), so reuse it and add a 'bool report' parameter: when
it is set to true, each poison record is also passed to
cxl_mem_report_poison().

Signed-off-by: Shiyang Ruan 
---
 drivers/cxl/core/mbox.c   | 8 ++--
 drivers/cxl/core/memdev.c | 4 ++--
 drivers/cxl/core/region.c | 8 
 drivers/cxl/cxlmem.h  | 2 +-
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 31b1b8711256..19b46fb06ed6 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1309,7 +1309,7 @@ void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
 EXPORT_SYMBOL_NS_GPL(cxl_mem_report_poison, CXL);
 
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
-  struct cxl_region *cxlr)
+  struct cxl_region *cxlr, bool report)
 {
struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
struct cxl_mbox_poison_out *po;
@@ -1340,10 +1340,14 @@ int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 
offset, u64 len,
if (rc)
break;
 
-   for (int i = 0; i < le16_to_cpu(po->count); i++)
+   for (int i = 0; i < le16_to_cpu(po->count); i++) {
trace_cxl_poison(cxlmd, cxlr, &po->record[i],
 po->flags, po->overflow_ts,
 CXL_POISON_TRACE_LIST);
+   if (report)
+   cxl_mem_report_poison(cxlmd, cxlr,
+ &po->record[i]);
+   }
 
/* Protect against an uncleared _FLAG_MORE */
nr_records = nr_records + le16_to_cpu(po->count);
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index d4e259f3a7e9..e976141ca4a9 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -200,14 +200,14 @@ static int cxl_get_poison_by_memdev(struct cxl_memdev 
*cxlmd)
if (resource_size(&cxlds->pmem_res)) {
offset = cxlds->pmem_res.start;
length = resource_size(&cxlds->pmem_res);
-   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
if (rc)
return rc;
}
if (resource_size(&cxlds->ram_res)) {
offset = cxlds->ram_res.start;
length = resource_size(&cxlds->ram_res);
-   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
/*
 * Invalid Physical Address is not an error for
 * volatile addresses. Device support is optional.
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 5c186e0a39b9..e83c46cb4dea 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -2585,7 +2585,7 @@ static int cxl_get_poison_unmapped(struct cxl_memdev 
*cxlmd,
if (ctx->mode == CXL_DECODER_RAM) {
offset = ctx->offset;
length = resource_size(&cxlds->ram_res) - offset;
-   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
if (rc == -EFAULT)
rc = 0;
if (rc)
@@ -2603,7 +2603,7 @@ static int cxl_get_poison_unmapped(struct cxl_memdev 
*cxlmd,
return 0;
}
 
-   return cxl_mem_get_poison(cxlmd, offset, length, NULL);
+   return cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
 }
 
 static int poison_by_decoder(struct device *dev, void *arg)
@@ -2637,7 +2637,7 @@ static int poison_by_decoder(struct device *dev, void 
*arg)
if (cxled->skip) {
offset = cxled->dpa_res->start - cxled->skip;
length = cxled->skip;
-   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL);
+   rc = cxl_mem_get_poison(cxlmd, offset, length, NULL, false);
if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
rc = 0;
if (rc)
@@ -2646,7 +2646,7 @@ static int poison_by_decoder(struct device *dev, void 
*arg)
 
offset = cxled->dpa_res->start;
length = cxled->dpa_res->end - offset + 1;
-   rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region);
+   rc = cxl_mem_get_poison(cxlmd, offset, length, cxled->cxld.region, 
false);
if (rc == -EFAULT && cxled->mode == CXL_DECODER_RAM)
rc = 0;
if (rc)
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 82f80eb381fb..1f03130b9d6a 100644
--- a/drivers/c

[RFC PATCH v2 5/6] cxl: add definition for transaction types

2024-03-28 Thread Shiyang Ruan via

The transaction types are defined in the General Media Event Record and the
DRAM Event Record per CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43 and
Section 8.2.9.2.1.2; Table 8-44.  Add them for Event Record handler use.

Signed-off-by: Shiyang Ruan 
---
 include/linux/cxl-event.h | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..0a50754fc330 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -23,6 +23,19 @@ struct cxl_event_generic {
u8 data[CXL_EVENT_RECORD_DATA_LENGTH];
 } __packed;
 
+/*
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+enum cxl_event_transaction_type {
+   CXL_EVENT_TRANSACTION_UNKNOWN = 0X00,
+   CXL_EVENT_TRANSACTION_READ,
+   CXL_EVENT_TRANSACTION_WRITE,
+   CXL_EVENT_TRANSACTION_SCAN_MEDIA,
+   CXL_EVENT_TRANSACTION_INJECT_POISON,
+   CXL_EVENT_TRANSACTION_MEDIA_SCRUB,
+   CXL_EVENT_TRANSACTION_MEDIA_MANAGEMENT,
+};
+
 /*
  * General Media Event Record
  * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
@@ -33,7 +46,7 @@ struct cxl_event_gen_media {
__le64 phys_addr;
u8 descriptor;
u8 type;
-   u8 transaction_type;
+   u8 transaction_type;/* enum cxl_event_transaction_type */
u8 validity_flags[2];
u8 channel;
u8 rank;
@@ -52,7 +65,7 @@ struct cxl_event_dram {
__le64 phys_addr;
u8 descriptor;
u8 type;
-   u8 transaction_type;
+   u8 transaction_type;/* enum cxl_event_transaction_type */
u8 validity_flags[2];
u8 channel;
u8 rank;
-- 
2.34.1

[RFC PATCH v2 4/6] cxl/core: report poison when injecting from debugfs

2024-03-28 Thread Shiyang Ruan via

Poison injection from debugfs is silent too.  Add a call to
cxl_mem_report_poison() so that it can trigger memory_failure().

Signed-off-by: Shiyang Ruan 
---
 drivers/cxl/core/memdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index e976141ca4a9..b0dcbe6f1004 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -366,6 +366,7 @@ int cxl_inject_poison(struct cxl_memdev *cxlmd, u64 dpa)
.length = cpu_to_le32(1),
};
trace_cxl_poison(cxlmd, cxlr, &record, 0, 0, CXL_POISON_TRACE_INJECT);
+   cxl_mem_report_poison(cxlmd, cxlr, &record);
 out:
up_read(&cxl_dpa_rwsem);
up_read(&cxl_region_rwsem);
-- 
2.34.1

[RFC PATCH v2 0/6] cxl: add poison event handler

2024-03-28 Thread Shiyang Ruan via

Changes:
RFCv1 -> RFCv2:
1. update commit message of PATCH 1
2. use memory_failure_queue() instead of MCE
3. also report poison in debugfs when injecting poison
4. correct DPA->HPA logic:
 find memdev's endpoint decoder to find the region it belongs to
5. distinguish transaction_type of GMER, only handle POISON related
 event for now


Currently the driver only traces cxl events; poison injection (for both vmem
and pmem types) on a cxl memdev is silent.  The OS needs to be notified so
that it can handle the poisoned range in time.  Per the CXL spec, the device
error event can be signaled through FW-First and OS-First methods.

So, add poison event handler in OS-First method:
  - qemu:
- CXL device report POISON event to OS by MSI by sending GMER after
  injecting a poison record
  - CXL driver  <-- this patchset
a. parse the POISON event from GMER;
b. retrieve POISON list from memdev;
c. translate poisoned DPA to HPA;
d. enqueue poisoned PFN to memory_failure's work queue;


Shiyang Ruan (6):
  cxl/core: correct length of DPA field masks
  cxl/core: introduce cxl_mem_report_poison()
  cxl/core: add report option for cxl_mem_get_poison()
  cxl/core: report poison when injecting from debugfs
  cxl: add definition for transaction_type
  cxl/core: add poison injection event handler

 drivers/cxl/core/mbox.c   | 126 +-
 drivers/cxl/core/memdev.c |   5 +-
 drivers/cxl/core/region.c |   8 +--
 drivers/cxl/core/trace.h  |   6 +-
 drivers/cxl/cxlmem.h  |  13 ++--
 include/linux/cxl-event.h |  17 -
 6 files changed, 144 insertions(+), 31 deletions(-)

-- 
2.34.1

[RFC PATCH v2 1/6] cxl/core: correct length of DPA field masks

2024-03-28 Thread Shiyang Ruan via

The Physical Address field in the General Media Event Record/DRAM Event
Record is 64 bits wide, so the field masks should be defined as 64-bit too.
Otherwise, this causes cxl_general_media and cxl_dram tracepoints to
mask off the upper-32-bits of DPA addresses. The cxl_poison event is
unaffected.

If userspace was doing its own DPA-to-HPA translation this could lead to
incorrect page retirement decisions, but there is no known consumer
(like rasdaemon) of this event today.

Fixes: d54a531a430b ("cxl/mem: Trace General Media Event Record")
Cc: 
Cc: Dan Williams 
Cc: Davidlohr Bueso 
Cc: Jonathan Cameron 
Cc: Ira Weiny 
Signed-off-by: Shiyang Ruan 
---
 drivers/cxl/core/trace.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index e5f13260fc52..e2d1f296df97 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -253,11 +253,11 @@ TRACE_EVENT(cxl_generic_event,
  * DRAM Event Record
  * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
  */
-#define CXL_DPA_FLAGS_MASK 0x3F
+#define CXL_DPA_FLAGS_MASK 0x3FULL
 #define CXL_DPA_MASK   (~CXL_DPA_FLAGS_MASK)
 
-#define CXL_DPA_VOLATILE   BIT(0)
-#define CXL_DPA_NOT_REPAIRABLE BIT(1)
+#define CXL_DPA_VOLATILE   BIT_ULL(0)
+#define CXL_DPA_NOT_REPAIRABLE BIT_ULL(1)
 #define show_dpa_flags(flags)  __print_flags(flags, "|",  \
{ CXL_DPA_VOLATILE, "VOLATILE"  }, \
{ CXL_DPA_NOT_REPAIRABLE,   "NOT_REPAIRABLE"}  \
-- 
2.34.1

[RFC PATCH v2 6/6] cxl/core: add poison injection event handler

2024-03-28 Thread Shiyang Ruan via

Currently the driver only traces cxl events; poison injection (for both vmem
and pmem types) on a cxl memdev is silent.  The OS needs to be notified so
that it can handle the poisoned range in time.  Per the CXL spec, the device
error event can be signaled through FW-First and OS-First methods.

So, add poison event handler in OS-First method:
  - qemu:
- CXL device report POISON event to OS by MSI by sending GMER after
  injecting a poison record
  - CXL driver
a. parse the POISON event from GMER;<-- this patch
b. retrieve POISON list from memdev;
c. translate poisoned DPA to HPA;
d. enqueue poisoned PFN to memory_failure's work queue;

Signed-off-by: Shiyang Ruan 
---

the reply to Jonathan's comment in last version:
> I'm not 100% convinced this is necessary poison causing.  Also
> the text tells us we should see 'an appropriate event'.
> DRAM one seems likely to be chosen by some vendors.
I think it's right to use the DRAM Event Record for a volatile memdev, but
should poison on a persistent memdev also use the DRAM Event Record?
Its 'Physical Address' field has the 'Volatile' bit too, the same as the
General Media Event Record.  I am a bit confused about this.

---
 drivers/cxl/core/mbox.c | 100 ++--
 drivers/cxl/cxlmem.h|   8 ++--
 2 files changed, 91 insertions(+), 17 deletions(-)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 19b46fb06ed6..97ef45d808b8 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -837,25 +837,99 @@ int cxl_enumerate_cmds(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_enumerate_cmds, CXL);
 
-void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
-   enum cxl_event_log_type type,
-   enum cxl_event_type event_type,
-   const uuid_t *uuid, union cxl_event *evt)
+struct cxl_event_poison_context {
+   u64 dpa;
+   u64 length;
+};
+
+static int __cxl_report_poison(struct device *dev, void *arg)
+{
+   struct cxl_event_poison_context *ctx = arg;
+   struct cxl_endpoint_decoder *cxled;
+   struct cxl_memdev *cxlmd;
+
+   cxled = to_cxl_endpoint_decoder(dev);
+   if (!cxled || !cxled->dpa_res || !resource_size(cxled->dpa_res))
+   return 0;
+
+   if (cxled->mode == CXL_DECODER_MIXED) {
+   dev_dbg(dev, "poison list read unsupported in mixed mode\n");
+   return 0;
+   }
+
+   if (ctx->dpa > cxled->dpa_res->end || ctx->dpa < cxled->dpa_res->start)
+   return 0;
+
+   cxlmd = cxled_to_memdev(cxled);
+   cxl_mem_get_poison(cxlmd, ctx->dpa, ctx->length, cxled->cxld.region,
+  true);
+
+   return 1;
+}
+
+static void cxl_event_handle_poison(struct cxl_memdev *cxlmd,
+   struct cxl_event_gen_media *rec)
+{
+   struct cxl_port *port = cxlmd->endpoint;
+   u64 phys_addr = le64_to_cpu(rec->phys_addr);
+   struct cxl_event_poison_context ctx = {
+   .dpa = phys_addr & CXL_DPA_MASK,
+   };
+
+   /* No regions mapped to this memdev, that is to say no HPA is mapped */
+   if (!port || !is_cxl_endpoint(port) ||
+   cxl_num_decoders_committed(port) == 0)
+   return;
+
+   /*
+* Host Inject Poison may have a range of DPA, but the GMER only has
+* "Physical Address" field, no such one indicates length.  So it's
+* better to call cxl_mem_get_poison() to find this poison record.
+*/
+   ctx.length = phys_addr & CXL_DPA_VOLATILE ?
+   resource_size(&cxlmd->cxlds->ram_res) :
+   resource_size(&cxlmd->cxlds->pmem_res) - ctx.dpa;
+
+   device_for_each_child(&port->dev, &ctx, __cxl_report_poison);
+}
+
+static void cxl_event_handle_general_media(struct cxl_memdev *cxlmd,
+  enum cxl_event_log_type type,
+  struct cxl_event_gen_media *rec)
+{
+   if (type == CXL_EVENT_TYPE_FAIL) {
+   switch (rec->transaction_type) {
+   case CXL_EVENT_TRANSACTION_READ:
+   case CXL_EVENT_TRANSACTION_WRITE:
+   case CXL_EVENT_TRANSACTION_INJECT_POISON:
+   cxl_event_handle_poison(cxlmd, rec);
+   break;
+   default:
+   break;
+   }
+   }
+}
+
+void cxl_event_handle_record(struct cxl_memdev *cxlmd,
+enum cxl_event_log_type type,
+enum cxl_event_type event_type,
+const uuid_t *uuid, union cxl_event *evt)
 {
-   if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
+   if (event_type == CXL_CPER_EVENT_GEN_MEDIA) {
trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-   else if (event_type == CXL_CPER_EVENT_DRAM)
+   cxl_e

[RFC PATCH v2 2/6] cxl/core: introduce cxl_mem_report_poison()

2024-03-28 Thread Shiyang Ruan via

If poison is detected (reported from a cxl memdev), the OS should be notified
to handle it.  So, introduce this helper function for later use:
  1. translate DPA to HPA;
  2. enqueue records into memory_failure's work queue;

Signed-off-by: Shiyang Ruan 
---

Currently poison injection from debugfs always creates a 64-byte
record, which is fine.  But injection through qemu's QMP API,
qmp_cxl_inject_poison(), can create a poison record with a large length,
which may result in a great many calls to memory_failure_queue().
MEMORY_FAILURE_FIFO_SIZE is only 1 << 4, which does not seem to be enough.

---
 drivers/cxl/core/mbox.c | 18 ++
 drivers/cxl/cxlmem.h|  3 +++
 2 files changed, 21 insertions(+)

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 9adda4795eb7..31b1b8711256 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1290,6 +1290,24 @@ int cxl_set_timestamp(struct cxl_memdev_state *mds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_set_timestamp, CXL);
 
+void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
+  struct cxl_region *cxlr,
+  struct cxl_poison_record *poison)
+{
+   u64 dpa = le64_to_cpu(poison->address) & CXL_POISON_START_MASK;
+   u64 len = PAGE_ALIGN(le32_to_cpu(poison->length) * CXL_POISON_LEN_MULT);
+   u64 hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
+   unsigned long pfn = PHYS_PFN(hpa);
+   unsigned long pfn_end = pfn + len / PAGE_SIZE - 1;
+
+   if (!IS_ENABLED(CONFIG_MEMORY_FAILURE))
+   return;
+
+   for (; pfn <= pfn_end; pfn++)
+   memory_failure_queue(pfn, MF_ACTION_REQUIRED);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_mem_report_poison, CXL);
+
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
   struct cxl_region *cxlr)
 {
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 20fb3b35e89e..82f80eb381fb 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -828,6 +828,9 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
const uuid_t *uuid, union cxl_event *evt);
 int cxl_set_timestamp(struct cxl_memdev_state *mds);
 int cxl_poison_state_init(struct cxl_memdev_state *mds);
+void cxl_mem_report_poison(struct cxl_memdev *cxlmd,
+  struct cxl_region *cxlr,
+  struct cxl_poison_record *poison);
 int cxl_mem_get_poison(struct cxl_memdev *cxlmd, u64 offset, u64 len,
   struct cxl_region *cxlr);
 int cxl_trigger_poison_list(struct cxl_memdev *cxlmd);
-- 
2.34.1

[PATCH] linux-user/syscall: xtensa: fix target_msqid_ds and ipc_perm conversion

2024-03-28 Thread Max Filippov

- target_ipc_perm::mode and target_ipc_perm::__seq fields are 32-bit wide
  on xtensa and thus need to use tswap32
- target_msqid_ds::msg_*time field pairs are reversed on big-endian
  xtensa
Both issues result in incorrect conversion results on big-endian xtensa
targets, spotted by the libc-test http://nsz.repo.hu/git/?p=libc-test

Cc: qemu-sta...@nongnu.org
Fixes: a3da8be5126b ("target/xtensa: linux-user: fix sysv IPC structures")
Signed-off-by: Max Filippov 
---
 linux-user/syscall.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index e384e1424890..cb334e90d6f0 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -3758,12 +3758,13 @@ static inline abi_long target_to_host_ipc_perm(struct 
ipc_perm *host_ip,
 host_ip->gid = tswap32(target_ip->gid);
 host_ip->cuid = tswap32(target_ip->cuid);
 host_ip->cgid = tswap32(target_ip->cgid);
-#if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_PPC)
+#if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_PPC) || \
+defined(TARGET_XTENSA)
 host_ip->mode = tswap32(target_ip->mode);
 #else
 host_ip->mode = tswap16(target_ip->mode);
 #endif
-#if defined(TARGET_PPC)
+#if defined(TARGET_PPC) || defined(TARGET_XTENSA)
 host_ip->__seq = tswap32(target_ip->__seq);
 #else
 host_ip->__seq = tswap16(target_ip->__seq);
@@ -3786,12 +3787,13 @@ static inline abi_long 
host_to_target_ipc_perm(abi_ulong target_addr,
 target_ip->gid = tswap32(host_ip->gid);
 target_ip->cuid = tswap32(host_ip->cuid);
 target_ip->cgid = tswap32(host_ip->cgid);
-#if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_PPC)
+#if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_PPC) || \
+defined(TARGET_XTENSA)
 target_ip->mode = tswap32(host_ip->mode);
 #else
 target_ip->mode = tswap16(host_ip->mode);
 #endif
-#if defined(TARGET_PPC)
+#if defined(TARGET_PPC) || defined(TARGET_XTENSA)
 target_ip->__seq = tswap32(host_ip->__seq);
 #else
 target_ip->__seq = tswap16(host_ip->__seq);
@@ -4111,6 +4113,14 @@ static inline abi_long do_semtimedop(int semid,
 struct target_msqid_ds
 {
 struct target_ipc_perm msg_perm;
+#if defined(TARGET_XTENSA) && TARGET_BIG_ENDIAN
+abi_ulong __unused1;
+abi_ulong msg_stime;
+abi_ulong __unused2;
+abi_ulong msg_rtime;
+abi_ulong __unused3;
+abi_ulong msg_ctime;
+#else
 abi_ulong msg_stime;
 #if TARGET_ABI_BITS == 32
 abi_ulong __unused1;
@@ -4122,6 +4132,7 @@ struct target_msqid_ds
 abi_ulong msg_ctime;
 #if TARGET_ABI_BITS == 32
 abi_ulong __unused3;
+#endif
 #endif
 abi_ulong __msg_cbytes;
 abi_ulong msg_qnum;
-- 
2.39.2

Re: [PATCH 3/3] ffvat: Fix reading files with non-continuous clusters

2024-03-28 Thread Amjad Alsharafi

I noticed a typo in the commit message subject: 'ffvat' should be 'vvfat'.
I'll fix it in the next version.

On Thu, Mar 28, 2024 at 04:11:27AM +0800, Amjad Alsharafi wrote:
> When reading with `read_cluster` we get the `mapping` with
> `find_mapping_for_cluster` and then we call `open_file` for this
> mapping.
> The issue appears when it's the same file, but a second cluster that is
> not immediately after it, imagine clusters `500 -> 503`, this will give
> us 2 mappings one has the range `500..501` and another `503..504`, both
> point to the same file, but different offsets.
> 
> When we don't open the file since the path is the same, we won't assign
> `s->current_mapping` and thus accessing way out of bound of the file.
> 
> From our example above, after `open_file` (that didn't open anything) we
> will get the offset into the file with
> `s->cluster_size*(cluster_num-s->current_mapping->begin)`, which will
> give us `0x2000 * (504-500)`, which is out of bound for this mapping and
> will produce some issues.
> 
> Signed-off-by: Amjad Alsharafi 
> ---
>  block/vvfat.c | 21 ++---
>  1 file changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/block/vvfat.c b/block/vvfat.c
> index cb3ab81e29..87165abc26 100644
> --- a/block/vvfat.c
> +++ b/block/vvfat.c
> @@ -1360,15 +1360,22 @@ static int open_file(BDRVVVFATState* s,mapping_t* 
> mapping)
>  {
>  if(!mapping)
>  return -1;
> +int new_path = 1;
>  if(!s->current_mapping ||
> -strcmp(s->current_mapping->path,mapping->path)) {
> -/* open file */
> -int fd = qemu_open_old(mapping->path,
> +
> s->current_mapping->first_mapping_index!=mapping->first_mapping_index ||
> +(new_path = strcmp(s->current_mapping->path,mapping->path))) {
> +
> +if (new_path) {
> +/* open file */
> +int fd = qemu_open_old(mapping->path,
> O_RDONLY | O_BINARY | O_LARGEFILE);
> -if(fd<0)
> -return -1;
> -vvfat_close_current_file(s);
> -s->current_fd = fd;
> +if(fd<0)
> +return -1;
> +vvfat_close_current_file(s);
> +
> +s->current_fd = fd;
> +}
> +assert(s->current_fd);
>  s->current_mapping = mapping;
>  }
>  return 0;
> -- 
> 2.44.0
>

[PATCH v9 1/2] memory tier: dax/kmem: introduce an abstract layer for finding, allocating, and putting memory types

2024-03-28 Thread Ho-Ren (Jack) Chuang

Since different memory devices require finding, allocating, and putting
memory types, these common steps are abstracted in this patch,
enhancing the scalability and conciseness of the code.

Signed-off-by: Ho-Ren (Jack) Chuang 
Reviewed-by: "Huang, Ying" 
---
 drivers/dax/kmem.c   | 20 ++--
 include/linux/memory-tiers.h | 13 +
 mm/memory-tiers.c| 32 
 3 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 42ee360cf4e3..01399e5b53b2 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -55,21 +55,10 @@ static LIST_HEAD(kmem_memory_types);
 
 static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
 {
-   bool found = false;
struct memory_dev_type *mtype;
 
mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry(mtype, &kmem_memory_types, list) {
-   if (mtype->adistance == adist) {
-   found = true;
-   break;
-   }
-   }
-   if (!found) {
-   mtype = alloc_memory_type(adist);
-   if (!IS_ERR(mtype))
-   list_add(&mtype->list, &kmem_memory_types);
-   }
+   mtype = mt_find_alloc_memory_type(adist, &kmem_memory_types);
mutex_unlock(&kmem_memory_type_lock);
 
return mtype;
@@ -77,13 +66,8 @@ static struct memory_dev_type 
*kmem_find_alloc_memory_type(int adist)
 
 static void kmem_put_memory_types(void)
 {
-   struct memory_dev_type *mtype, *mtn;
-
mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
-   list_del(&mtype->list);
-   put_memory_type(mtype);
-   }
+   mt_put_memory_types(&kmem_memory_types);
mutex_unlock(&kmem_memory_type_lock);
 }
 
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 69e781900082..a44c03c2ba3a 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
 int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 const char *source);
 int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+   struct list_head 
*memory_types);
+void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct 
access_coordinate *perf, int *adis
 {
return -EIO;
 }
+
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head 
*memory_types)
+{
+   return NULL;
+}
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+
+}
 #endif /* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5..974af10cfdd8 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -623,6 +623,38 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head 
*memory_types)
+{
+   bool found = false;
+   struct memory_dev_type *mtype;
+
+   list_for_each_entry(mtype, memory_types, list) {
+   if (mtype->adistance == adist) {
+   found = true;
+   break;
+   }
+   }
+   if (!found) {
+   mtype = alloc_memory_type(adist);
+   if (!IS_ERR(mtype))
+   list_add(&mtype->list, memory_types);
+   }
+
+   return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+   struct memory_dev_type *mtype, *mtn;
+
+   list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+   list_del(&mtype->list);
+   put_memory_type(mtype);
+   }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
 static void dump_hmem_attrs(struct access_coordinate *coord, const char 
*prefix)
 {
pr_info(
-- 
Ho-Ren (Jack) Chuang

[PATCH v9 0/2] Improved Memory Tier Creation for CPUless NUMA Nodes

2024-03-28 Thread Ho-Ren (Jack) Chuang

When a memory device, such as CXL1.1 type3 memory, is emulated as
normal memory (E820_TYPE_RAM), the memory device is indistinguishable
from normal DRAM in terms of memory tiering with the current implementation.
The current memory tiering assigns all detected normal memory nodes
to the same DRAM tier. This results in normal memory devices with
different attributes being unable to be assigned to the correct memory tier,
leading to the inability to migrate pages between different types of memory.
https://lore.kernel.org/linux-mm/ph0pr08mb7955e9f08ccb64f23963b5c3a8...@ph0pr08mb7955.namprd08.prod.outlook.com/T/

This patchset automatically resolves the issues. It delays the initialization
of memory tiers for CPUless NUMA nodes until they obtain HMAT information
and after all devices are initialized at boot time, eliminating the need
for user intervention. If no HMAT is specified, it falls back to
using `default_dram_type`.

Example usecase:
We have CXL memory on the host, and we create VMs with a new system memory
device backed by host CXL memory. We inject CXL memory performance attributes
through QEMU, and the guest now sees memory nodes with performance attributes
in HMAT. With this change, we enable the guest kernel to construct
the correct memory tiering for the memory nodes.

-v9:
 * Address corner cases in `memory_tier_late_init`. Thank Ying's comments.
-v8:
 * Fix email format
 * 
https://lore.kernel.org/lkml/20240329004815.195476-1-horenchu...@bytedance.com/T/#u
-v7:
 * Add Reviewed-by: "Huang, Ying" 
-v6:
 Thanks to Ying's comments,
 * Move `default_dram_perf_lock` to the function's beginning for clarity
 * Fix double unlocking at v5
 * 
https://lore.kernel.org/lkml/20240327072729.3381685-1-horenchu...@bytedance.com/T/#u
-v5:
 Thanks to Ying's comments,
 * Add comments about what is protected by `default_dram_perf_lock`
 * Fix an uninitialized pointer mtype
 * Slightly shorten the time holding `default_dram_perf_lock`
 * Fix a deadlock bug in `mt_perf_to_adistance`
 * 
https://lore.kernel.org/lkml/20240327041646.3258110-1-horenchu...@bytedance.com/T/#u
-v4:
 Thanks to Ying's comments,
 * Remove redundant code
 * Reorganize patches accordingly
 * 
https://lore.kernel.org/lkml/20240322070356.315922-1-horenchu...@bytedance.com/T/#u
-v3:
 Thanks to Ying's comments,
 * Make the newly added code independent of HMAT
 * Upgrade set_node_memory_tier to support more cases
 * Put all non-driver-initialized memory types into default_memory_types
   instead of using hmat_memory_types
 * find_alloc_memory_type -> mt_find_alloc_memory_type
 * 
https://lore.kernel.org/lkml/20240320061041.3246828-1-horenchu...@bytedance.com/T/#u
-v2:
 Thanks to Ying's comments,
 * Rewrite cover letter & patch description
 * Rename functions, don't use _hmat
 * Abstract common functions into find_alloc_memory_type()
 * Use the expected way to use set_node_memory_tier instead of modifying it
 * 
https://lore.kernel.org/lkml/20240312061729.1997111-1-horenchu...@bytedance.com/T/#u
-v1:
 * 
https://lore.kernel.org/lkml/20240301082248.3456086-1-horenchu...@bytedance.com/T/#u

Ho-Ren (Jack) Chuang (2):
  memory tier: dax/kmem: introduce an abstract layer for finding,
allocating, and putting memory types
  memory tier: create CPUless memory tiers after obtaining HMAT info

 drivers/dax/kmem.c   |  20 +-
 include/linux/memory-tiers.h |  13 
 mm/memory-tiers.c| 125 ++-
 3 files changed, 124 insertions(+), 34 deletions(-)

-- 
Ho-Ren (Jack) Chuang

[PATCH v9 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-28 Thread Ho-Ren (Jack) Chuang

The current implementation treats emulated memory devices, such as
CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
(E820_TYPE_RAM). However, these emulated devices have different
characteristics than traditional DRAM, making it important to
distinguish them. Thus, we modify the tiered memory initialization process
to introduce a delay specifically for CPUless NUMA nodes. This delay
ensures that the memory tier initialization for these nodes is deferred
until HMAT information is obtained during the boot process. Finally,
demotion tables are recalculated at the end.

* late_initcall(memory_tier_late_init);
Some device drivers may have initialized memory tiers between
`memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
online memory nodes and configuring memory tiers. They should be excluded
in the late init.

* Handle cases where there is no HMAT when creating memory tiers
There is a scenario where a CPUless node does not provide HMAT information.
If no HMAT is specified, it falls back to using the default DRAM tier.

* Introduce another new lock `default_dram_perf_lock` for adist calculation
In the current implementation, iterating through CPUless nodes requires
holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
trying to acquire the same lock, leading to a potential deadlock.
Therefore, we propose introducing a standalone `default_dram_perf_lock` to
protect `default_dram_perf_*`. This approach not only avoids deadlock
but also prevents holding a large lock simultaneously.

* Upgrade `set_node_memory_tier` to support additional cases, including
  default DRAM, late CPUless, and hot-plugged initializations.
To cover hot-plugged memory nodes, `mt_calc_adistance()` and
`mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
handle cases where memtype is not initialized and where HMAT information is
available.

* Introduce `default_memory_types` for those memory types that are not
  initialized by device drivers.
Because late initialized memory and default DRAM memory need to be managed,
a default memory type is created for storing all memory types that are
not initialized by device drivers and as a fallback.

Signed-off-by: Ho-Ren (Jack) Chuang 
Signed-off-by: Hao Xiang 
Reviewed-by: "Huang, Ying" 
---
 mm/memory-tiers.c | 93 +++
 1 file changed, 77 insertions(+), 16 deletions(-)

diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 974af10cfdd8..9f8ae99e8e6e 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -36,6 +36,11 @@ struct node_memory_type_map {
 
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 struct memory_dev_type *default_dram_type;
 
@@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
 
 static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
 
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
 static bool default_dram_perf_error;
 static struct access_coordinate default_dram_perf;
 static int default_dram_perf_ref_nid = NUMA_NO_NODE;
@@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct 
memory_dev_type *mem
 static struct memory_tier *set_node_memory_tier(int node)
 {
struct memory_tier *memtier;
-   struct memory_dev_type *memtype;
+   struct memory_dev_type *mtype = default_dram_type;
+   int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
 
 
@@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
 
-   __init_node_memory_type(node, default_dram_type);
+   mt_calc_adistance(node, &adist);
+   if (node_memory_types[node].memtype == NULL) {
+   mtype = mt_find_alloc_memory_type(adist, &default_memory_types);
+   if (IS_ERR(mtype)) {
+   mtype = default_dram_type;
+   pr_info("Failed to allocate a memory type. Fall 
back.\n");
+   }
+   }
+
+   __init_node_memory_type(node, mtype);
 
-   memtype = node_memory_types[node].memtype;
-   node_set(node, memtype->nodes);
-   memtier = find_create_memory_tier(memtype);
+   mtype = node_memory_types[node].memtype;
+   node_set(node, mtype->nodes);
+   memtier = find_create_memory_tier(mtype);
if (!IS_ERR(memtier))
rcu_assign_pointer(pgdat->memtier, memtier);
return memtier;
@@ -655,6 +672,33 @@ void mt_put_memory_types(struct list_head *memory_types)
 }
 EXPORT_SYMBOL_GPL(mt_put_memory_types);
 
+/*
+ * This is invoked via `late_initcall()` to initialize memory

Re: Qemu Display Coacoa Patch Serie Qemu 9.0 RC1

2024-03-28 Thread Akihiko Odaki


On 2024/03/29 6:44, BALATON Zoltan wrote:

On Thu, 28 Mar 2024, Rene Engel wrote:
I wanted to discuss this topic with you again, there was already a 
patch series that worked well under Qemu with
Pegasos2/AmigaOneXe/Same460 and AmigaOs4.1. The option zoom-to-fit=on 
should be used to adjust all resolutions provided by the guest
system to the aspect ratio if there are no Virtio GPU drivers 
available that allow this.


In my opinion exactly this option zoom-to-fit=on makes this possible. 
If you don't want to use this option you still have the possibility
to deactivate it. In Qemu 9.0 RC1 not all resolutions are stretched 
like in previous patches e.g. 640x480/800x600/1024x720 etc. but this
is exactly what we need for the Pegasos2/AmigaOneXe/Same460 machine 
with AmigaOs4.1.


There seems to be a bit of confusion about how this zoom-to-fit option 
is implemented by different -display backends and I'm not sure what is 
the intended behaviour or how other -display backends handle it. Maybe a 
single option is not even enough to describe all possible preferences so 
another one i.e. keep-aspect=true|false may also be needed to cover all 
possible settings (don't zoom, zoom with aspect ratio kept, zoom to fit 
window even if that stretches the picture out of aspect ratio). For 9.0 
probably we should go for consistency with other backends now as adding 
new options is not possible during freeze and then resolve this afterwards.


It is indeed something that requires a new flag like zoom-interpolation.



There are also problems within the resolutions with the mouse pointer 
where the screen output flickers it currently affects all patch 
series. I would be happy if we could find a solution for all this.


The flicker may be due to the resize algorithm used by macOS not giving 
the same result always. To resolve it maybe yet another option may be 
needed to not zoom to full available window but try to keep the zoom 
factor some integer value to avoid fractional scaling but I'm not sure 
that's the best way to solve it.


Flickering sounds more like a bug. I appreciate if you can share some 
recording.


Regards,
Akihiko Odaki



Regards,
BALATON Zoltan

I'll leave you 2 videos so you can decide for yourself what would make 
the most sense. It shows once for me the working zoom behavior
which works very well and the behavior with Qemu 9.0Rc1 including new 
Cocoa patches.


Qemu zoom-to fit=on for all Screenmodes 
working: https://www.youtube.com/watch?v=dnJ3W8egAFY


Qemu 9.0. RC1 zoom-to fit=on not working for all 
Screenmodes: https://www.youtube.com/watch?v=Ddq68ViudrA

[PATCH] spapr: nested: use bitwise NOT operator for flags check

2024-03-28 Thread Harsh Prateek Bora

Check for flag bit in H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE needs to use
bitwise NOT operator to ensure no other flag bits are set.
Reported by Coverity as CID 1540008, 1540009.

Reported-by: Peter Maydell 
Signed-off-by: Harsh Prateek Bora 
---
 hw/ppc/spapr_nested.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ppc/spapr_nested.c b/hw/ppc/spapr_nested.c
index 936659b4c0..c02785756c 100644
--- a/hw/ppc/spapr_nested.c
+++ b/hw/ppc/spapr_nested.c
@@ -1511,7 +1511,7 @@ static target_ulong h_guest_getset_state(PowerPCCPU *cpu,
 if (flags & H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE) {
 gsr.flags |= GUEST_STATE_REQUEST_GUEST_WIDE;
 }
-if (flags & !H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE) {
+if (flags & ~H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE) {
 return H_PARAMETER; /* flag not supported yet */
 }
 
-- 
2.39.3

Re: [PULL 35/38] spapr: nested: Introduce H_GUEST_[GET|SET]_STATE hcalls.

2024-03-28 Thread Harsh Prateek Bora





On 3/28/24 20:55, Peter Maydell wrote:

On Wed, 27 Mar 2024 at 05:41, Harsh Prateek Bora  wrote:




On 3/26/24 21:32, Peter Maydell wrote:

On Tue, 12 Mar 2024 at 17:11, Nicholas Piggin  wrote:


From: Harsh Prateek Bora 

Introduce the nested PAPR hcalls:
  - H_GUEST_GET_STATE which is used to get state of a nested guest or
a guest VCPU. The value field for each element in the request is
destination to be updated to reflect current state on success.
  - H_GUEST_SET_STATE which is used to modify the state of a guest or
a guest VCPU. On success, guest (or its VCPU) state shall be
updated as per the value field for the requested element(s).

Reviewed-by: Nicholas Piggin 
Signed-off-by: Michael Neuling 
Signed-off-by: Harsh Prateek Bora 
Signed-off-by: Nicholas Piggin 


Hi; Coverity points out a problem with this code (CID 1540008, 1540009):




+static target_ulong h_guest_getset_state(PowerPCCPU *cpu,
+ SpaprMachineState *spapr,
+ target_ulong *args,
+ bool set)
+{
+target_ulong flags = args[0];
+target_ulong lpid = args[1];
+target_ulong vcpuid = args[2];
+target_ulong buf = args[3];
+target_ulong buflen = args[4];
+struct guest_state_request gsr;
+SpaprMachineStateNestedGuest *guest;
+
+guest = spapr_get_nested_guest(spapr, lpid);
+if (!guest) {
+return H_P2;
+}
+gsr.buf = buf;
+assert(buflen <= GSB_MAX_BUF_SIZE);
+gsr.len = buflen;
+gsr.flags = 0;
+if (flags & H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE) {


flags is a target_ulong, which means it might only be 32 bits.
But H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE has a bit set in the
upper 32 bits only. So Coverity complains about this condition
being always-zero and the body of the if being dead code.

What was the intention here?


Hi Peter,
Ideally this is intended to be running on a ppc64 where target_ulong
should be uint64_t. I guess same holds true for existing nested-hv code
as well.


Sorry, I'm afraid I misread the Coverity report here;
sorry for the confusion. The 32-vs-64 bits question is a red
herring.

What Coverity is actually pointing out is in this next bit:


+gsr.flags |= GUEST_STATE_REQUEST_GUEST_WIDE;
+}
+if (flags & !H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE) {


The C operator ! is the logical-NOT operator; since
H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE is a non-zero value
that means that !H_GUEST_GETSET_STATE_FLAG_GUEST_WIDE is 0;
so we're testing (flags & 0), which is always false, and this
is the if() body which is dead-code as a result.

Should this be the bitwise-NOT ~  (ie "if any flag other
than this one is set"), or should this be an else clause
to the previous if() (ie "if this flag is not set") ?


Oh, this should have been bitwise-NOT, I shall send a follow-up patch 
for the fix.


regards,
Harsh



+return H_PARAMETER; /* flag not supported yet */
+}
+
+if (set) {
+gsr.flags |= GUEST_STATE_REQUEST_SET;
+}
+return map_and_getset_state(cpu, guest, vcpuid, &gsr);
+}




thanks
-- PMM

Re: [PATCH v8] arm/kvm: Enable support for KVM_ARM_VCPU_PMU_V3_FILTER

2024-03-28 Thread Shaoqin Huang


Hi Daniel,

On 3/25/24 16:55, Daniel P. Berrangé wrote:

On Mon, Mar 25, 2024 at 01:35:58PM +0800, Shaoqin Huang wrote:

Hi Daniel,

Thanks for your reviewing. I see your comments in the v7.

I have some doubts about what you said about the QAPI. Do you want me to
convert the current design into the QAPI parsing like the
IOThreadVirtQueueMapping? And we need to add new json definition in the
qapi/ directory?


I have defined the QAPI for kvm-pmu-filter like below:

+##
+# @FilterAction:
+#
+# The Filter Action
+#
+# @a: Allow
+#
+# @d: Disallow
+#
+# Since: 9.0
+##
+{ 'enum': 'FilterAction',
+  'data': [ 'a', 'd' ] }
+
+##
+# @SingleFilter:
+#
+# Lazy
+#
+# @action: the action
+#
+# @start: the start
+#
+# @end: the end
+#
+# Since: 9.0
+##
+
+{ 'struct': 'SingleFilter',
+ 'data': { 'action': 'FilterAction', 'start': 'int', 'end': 'int' } }
+
+##
+# @KVMPMUFilter:
+#
+# Lazy
+#
+# @filter: the filter
+#
+# Since: 9.0
+##
+
+{ 'struct': 'KVMPMUFilter',
+  'data': { 'filter': ['SingleFilter'] }}

And I guess I can use it by adding code like below:

--- a/hw/core/qdev-properties-system.c
+++ b/hw/core/qdev-properties-system.c
@@ -1206,3 +1206,35 @@ const PropertyInfo 
qdev_prop_iothread_vq_mapping_list = {

 .set = set_iothread_vq_mapping_list,
 .release = release_iothread_vq_mapping_list,
 };
+
+/* --- kvm-pmu-filter ---*/
+
+static void get_kvm_pmu_filter(Object *obj, Visitor *v,
+const char *name, void *opaque, Error **errp)
+{
+KVMPMUFilter **prop_ptr = object_field_prop_ptr(obj, opaque);
+
+visit_type_KVMPMUFilter(v, name, prop_ptr, errp);
+}
+
+static void set_kvm_pmu_filter(Object *obj, Visitor *v,
+const char *name, void *opaque, Error **errp)
+{
+KVMPMUFilter **prop_ptr = object_field_prop_ptr(obj, opaque);
+KVMPMUFilter *list;
+
+printf("running the %s\n", __func__);
+if (!visit_type_KVMPMUFilter(v, name, &list, errp)) {
+return;
+}
+
+printf("The name is %s\n", name);
+*prop_ptr = list;
+}
+
+const PropertyInfo qdev_prop_kvm_pmu_filter = {
+.name = "KVMPMUFilter",
+.description = "der der",
+.get = get_kvm_pmu_filter,
+.set = set_kvm_pmu_filter,
+};

+#define DEFINE_PROP_KVM_PMU_FILTER(_name, _state, _field) \
+DEFINE_PROP(_name, _state, _field, qdev_prop_kvm_pmu_filter, \
+KVMPMUFilter *)

--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -2439,6 +2441,7 @@ static Property arm_cpu_properties[] = {
 mp_affinity, ARM64_AFFINITY_INVALID),
 DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
 DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1),
+DEFINE_PROP_KVM_PMU_FILTER("kvm-pmu-filter", ARMCPU, kvm_pmu_filter),
 DEFINE_PROP_END_OF_LIST()
 };

And I guess I can use the new json format input like below:

qemu-system-aarch64 \
-cpu host, '{"filter": [{"action": "a", "start": 0x10, "end": "0x11"}]}'

But it doesn't work. It seems that this is because the -cpu option doesn't 
support JSON-format parameters.


Maybe I'm wrong, so I want to double check: does the -cpu option 
support JSON format nowadays?


If the -cpu option doesn't support JSON format, how can I use the QAPI 
for the kvm-pmu-filter property?


Thanks,
Shaoqin



Yes, you would define a type in the qapi dir similar to how is
done for IOThreadVirtQueueMapping, and then you can use that
in the property setter method.


With regards,
Daniel


--
Shaoqin

RE: [PATCH v5 0/7] Live Migration With IAA

2024-03-28 Thread Liu, Yuan1

> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 11:22 PM
> To: Liu, Yuan1 
> Cc: faro...@suse.de; qemu-devel@nongnu.org; hao.xi...@bytedance.com;
> bryan.zh...@bytedance.com; Zou, Nanhai 
> Subject: Re: [PATCH v5 0/7] Live Migration With IAA
> 
> On Thu, Mar 28, 2024 at 03:02:30AM +, Liu, Yuan1 wrote:
> > Yes, I will support software fallback to ensure CI testing and users can
> > still use qpl compression without IAA hardware.
> >
> > Although the qpl software solution will have better performance than
> zlib,
> > I still don't think it has a greater advantage than zstd. I don't think
> there
> > is a need to add a migration option to configure the qpl software or
> hardware path.
> > So I will still only use QPL as an independent compression in the next
> version, and
> > no other migration options are needed.
> 
> That should be fine.
> 
> >
> > I will also add a guide to qpl-compression.rst about IAA permission
> issues and how to
> > determine whether the hardware path is available.
> 
> OK.
> 
> [...]
> 
> > > > Yes, I use iperf3 to check the bandwidth for one core, the bandwidth
> is
> > > 60Gbps.
> > > > [ ID] Interval   Transfer Bitrate Retr  Cwnd
> > > > [  5]   0.00-1.00   sec  7.00 GBytes  60.1 Gbits/sec0   2.87
> MBytes
> > > > [  5]   1.00-2.00   sec  7.05 GBytes  60.6 Gbits/sec0   2.87
> Mbytes
> > > >
> > > > And in the live migration test, a multifd thread's CPU utilization
> is
> > > almost 100%
> > >
> > > This 60Gbps per-channel is definitely impressive..
> > >
> > > Have you tried migration without multifd on your system? Would that
> also
> > > perform similarly v.s. 2 channels multifd?
> >
> > Simple Test result below:
> > VM Type: 16vCPU, 64G memory
> > Workload in VM: fill 56G memory with Silesia data and vCPUs are idle
> > Migration Configurations:
> > 1. migrate_set_parameter max-bandwidth 100G
> > 2. migrate_set_parameter downtime-limit 300
> > 3. migrate_set_capability multifd on (multiFD test case)
> > 4. migrate_set_parameter multifd-channels 2 (multiFD test case)
> >
> >   Totaltime (ms) Downtime (ms) Throughput (mbps) Pages-
> per-second
> > without Multifd 23580   307  21221 689588
> > Multifd 2  7657 198  654102221176
> 
> Thanks for the test results.
> 
> So I am guessing the migration overheads besides pushing the socket is
> high
> enough to make it drop drastically, even if in this case zero detection
> shouldn't play a major role considering most of guest mem is pre-filled.

Yes, for no multifd migration, besides the network stack overhead, the zero
page detection overhead (both of source and destination) is indeed very high.
Placing the zero page detection in multi-threads can reduce the performance 
degradation caused by the overhead of zero page detection.

I also think migration doesn't need to detect zero page by memcmp in all cases.
The benefit of zero page detection may be that the VM's memory determines that
there are a large number of 0 pages. 

My experience in this area may be insufficient; I am working with Hao and Bryan
to see if it is possible to use DSA hardware to accelerate this part (including
zero page detection and writing zero pages).

DSA is an accelerator for detecting memory, writing memory, and comparing memory
https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf

[PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-03-28 Thread Lei Wang

When using the post-copy preemption feature to perform post-copy live
migration, the below scenario could lead to a deadlock and the migration
will never finish:

 - Source connect() the preemption channel in postcopy_start().
 - Source and the destination side TCP stack finished the 3-way handshake
   thus the connection is successful.
 - The destination side main thread is handling the loading of the bulk RAM
   pages thus it doesn't start to handle the pending connection event in the
   event loop, and doesn't post the semaphore postcopy_qemufile_dst_done for
   the preemption thread.
 - The source side sends non-iterative device states, such as the virtio
   states.
 - The destination main thread starts to receive the virtio states, this
   process may lead to a page fault (e.g., virtio_load()->vring_avail_idx()
   may trigger a page fault since the avail ring page may not be received
   yet).
 - The page request is sent back to the source side. Source sends the page
   content to the destination side preemption thread.
 - Since the event has not arrived and the semaphore
   postcopy_qemufile_dst_done is not posted, the preemption thread in
   destination side is blocked, and cannot handle receiving the page.
 - The QEMU main load thread on the destination side is stuck at the page
   fault, and cannot yield and handle the connect() event for the
   preemption channel to unblock the preemption thread.
 - The postcopy will be stuck there forever since this is a deadlock.

The key point to reproduce this bug is that the source side is sending pages
at a rate faster than the destination handling, otherwise,
the qemu_get_be64() in ram_load_precopy() will have a chance to yield since
at that time there are no pending data in the buffer to get. This will make
this bug harder to be reproduced.

Fix this by yielding the load coroutine when receiving
MIG_CMD_POSTCOPY_LISTEN so the main event loop can handle the connection
event before loading the non-iterative devices state to avoid the deadlock
condition.

Signed-off-by: Lei Wang 
---
 migration/savevm.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/migration/savevm.c b/migration/savevm.c
index e386c5267f..8fd4dc92f2 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2445,6 +2445,11 @@ static int loadvm_process_command(QEMUFile *f)
 return loadvm_postcopy_handle_advise(mis, len);
 
 case MIG_CMD_POSTCOPY_LISTEN:
+if (migrate_postcopy_preempt() && qemu_in_coroutine()) {
+aio_co_schedule(qemu_get_current_aio_context(),
+qemu_coroutine_self());
+qemu_coroutine_yield();
+}
 return loadvm_postcopy_handle_listen(mis);
 
 case MIG_CMD_POSTCOPY_RUN:
-- 
2.39.3

Re: [RFC 0/2] disable the configuration interrupt for the unsupported device

2024-03-28 Thread Jason Wang

On Fri, Mar 29, 2024 at 11:02 AM Cindy Lu  wrote:
>
> On Thu, Mar 28, 2024 at 12:12 PM Jason Wang  wrote:
> >
> > On Wed, Mar 27, 2024 at 5:33 PM Cindy Lu  wrote:
> > >
> > > On Wed, Mar 27, 2024 at 5:12 PM Jason Wang  wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 4:28 PM Cindy Lu  wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 3:54 PM Jason Wang  
> > > > > wrote:
> > > > > >
> > > > > > On Wed, Mar 27, 2024 at 2:03 PM Cindy Lu  wrote:
> > > > > > >
> > > > > > > On Wed, Mar 27, 2024 at 11:05 AM Jason Wang  
> > > > > > > wrote:
> > > > > > > >
> > > > > > > > Hi Cindy:
> > > > > > > >
> > > > > > > > On Wed, Mar 27, 2024 at 9:29 AM Cindy Lu  
> > > > > > > > wrote:
> > > > > > > > >
> > > > > > > > > we need a crash in Non-standard image, here is the jira for 
> > > > > > > > > this https://issues.redhat.com/browse/RHEL-28522
> > > > > > > > > The root cause of the issue is that an IRQFD was used without 
> > > > > > > > > initialization..
> > > > > > > > >
> > > > > > > > > During the booting process of the Vyatta image, the behavior 
> > > > > > > > > of the called function in qemu is as follows:
> > > > > > > > >
> > > > > > > > > 1. vhost_net_stop() was called, this will call the function
> > > > > > > > > virtio_pci_set_guest_notifiers() with assgin= false, and
> > > > > > > > > virtio_pci_set_guest_notifiers(） will release the irqfd for 
> > > > > > > > > vector 0
> > > > > > > >
> > > > > > > > Before vhost_net_stop(), do we know which vector is used by 
> > > > > > > > which queue?
> > > > > > > >
> > > > > > > before this stop, vdev->config_verctor is get from
> > > > > > > virtio_pci_common_read/virtio_pci_common_write
> > > > > > > it was set to vector 0
> > > > > >
> > > > > > I basically meant if vector 0 is shared with some virtqueues here.
> > > > > >
> > > > > Really sorry for this, vq's vector is 1,2, and will not share with the
> > > > > configure vector
> > > > > > > > >
> > > > > > > > > 2. virtio_reset() was called -->set configure vector to 
> > > > > > > > > VIRTIO_NO_VECTORt
> > > > > > > > >
> > > > > > > > > 3.vhost_net_start() was called (at this time the configure 
> > > > > > > > > vector is
> > > > > > > > > still VIRTIO_NO_VECTOR) and call 
> > > > > > > > > virtio_pci_set_guest_notifiers() with
> > > > > > > > > assgin= true, so the irqfd for vector 0 was not "init" during 
> > > > > > > > > this process
> > > > > > > >
> > > > > > > > How does the configure vector differ from the virtqueue vector 
> > > > > > > > here?
> > > > > > > >
> > > > > > > All the vectors are VIRTIO_NO_VECTOR (including vq). any
> > > > > > > msix_fire_vector_notifier()
> > > > > > > been called will cause the crash at this time.
> > > > > >
> > > > > > Won't virtio_pci_set_guest_notifiers() will try to allocate irqfd 
> > > > > > when
> > > > > > the assignment is true?
> > > > > >
> > > > > It will allocate, but  the vector is VIRTIO_NO_VECTOR (0x)
> > > > >
> > > > > then it will called kvm_virtio_pci_vector_use_one()
> > > > >
> > > > > in this function, there is a check for
> > > > >
> > > > > if (vector >= msix_nr_vectors_allocated(dev))
> > > > >
> > > > > { return 0; }
> > > > >
> > > > > So it will return.
> > > >
> > > > How about let's just fix this?
> > > >
> > > > Btw, it's better to explain in detail like the above in the next 
> > > > version.
> > > >
> > > > Thanks
> > > >
> > > The problem is I think the behavior here is correct, The vector here is
> > >  VIRTIO_NO_VECTOR and we should return,
> >
> > So if I understand correctly, the configure vector is configured after
> > DRIVER_OK?
> >
> sorry I didn't get your point, Do you mean set_guest_notifiers()?,
> this was called during the system boot
>  but for the value of vdev->config_vector/vq vector, this is changed
> by virtio_pci_common_read/virtio_pci_common_write
> and these function will not check the process  DRIVER_OK.

I basically mean Qemu behave based on the guest's behaviour.

So what you've described looks like a guest trying to configure the
config vector after it sets DRIVER_OK. So Qemu tries to use the irqfd
without initialization.

> > Spec doesn't forbid this, this is something we need to support.
> >
> > It looks to me the correct fix is to kvm_virtio_pci_vector_use_one()
> > when guest is writing to msix_vector after DRIVER_OK?
> >
> if I understand correctly. do you mean
> when  function   virtio_pci_common_read/virtio_pci_common_write was called
> we need to check the number of  vdev->config_vector/vq vector, if this
> was changed and also DRIVER_OK was set
> then we need to call virtio_pci_set_guest_notifiers() to re-init the irqfd?

It is not re-init, as it has been freed.

A quick fix would be, call kvm_virtio_pci_vector_use/unuse_one() when
a guest assign/deassign a vector after DRIVER_OK.

Thanks

> Thanks
> cindy
> > Thanks
> >
> > > the fix could work maybe is we try get to know if this was changed
> > > from another value
> > > and use that one? this seems strange.
> > > Than

RE: [PATCH v1 3/6] intel_iommu: Add a framework to check and sync host IOMMU cap/ecap

2024-03-28 Thread Duan, Zhenzhong

Hi Michael,

>-Original Message-
>From: Michael S. Tsirkin 
>Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check and
>sync host IOMMU cap/ecap
>
>On Mon, Mar 18, 2024 at 02:20:50PM +0100, Eric Auger wrote:
>> Hi Michael,
>>
>> On 3/13/24 12:17, Michael S. Tsirkin wrote:
>> > On Wed, Mar 13, 2024 at 07:54:11AM +, Duan, Zhenzhong wrote:
>> >>
>> >>> -Original Message-
>> >>> From: Michael S. Tsirkin 
>> >>> Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to check
>and
>> >>> sync host IOMMU cap/ecap
>> >>>
>> >>> On Wed, Mar 13, 2024 at 02:52:39AM +, Duan, Zhenzhong wrote:
>>  Hi Michael,
>> 
>> > -Original Message-
>> > From: Michael S. Tsirkin 
>> > Subject: Re: [PATCH v1 3/6] intel_iommu: Add a framework to
>check and
>> > sync host IOMMU cap/ecap
>> >
>> > On Wed, Feb 28, 2024 at 05:44:29PM +0800, Zhenzhong Duan
>wrote:
>> >> From: Yi Liu 
>> >>
>> >> Add a framework to check and synchronize host IOMMU cap/ecap
>with
>> >> vIOMMU cap/ecap.
>> >>
>> >> The sequence will be:
>> >>
>> >> vtd_cap_init() initializes iommu->cap/ecap.
>> >> vtd_check_hdev() update iommu->cap/ecap based on host
>cap/ecap.
>> >> iommu->cap_frozen set when machine create done, iommu-
>>cap/ecap
>> > become readonly.
>> >> Implementation details for different backends will be in following
>> >>> patches.
>> >> Signed-off-by: Yi Liu 
>> >> Signed-off-by: Yi Sun 
>> >> Signed-off-by: Zhenzhong Duan 
>> >> ---
>> >>  include/hw/i386/intel_iommu.h |  1 +
>> >>  hw/i386/intel_iommu.c | 50
>> > ++-
>> >>  2 files changed, 50 insertions(+), 1 deletion(-)
>> >>
>> >> diff --git a/include/hw/i386/intel_iommu.h
>> > b/include/hw/i386/intel_iommu.h
>> >> index bbc7b96add..c71a133820 100644
>> >> --- a/include/hw/i386/intel_iommu.h
>> >> +++ b/include/hw/i386/intel_iommu.h
>> >> @@ -283,6 +283,7 @@ struct IntelIOMMUState {
>> >>
>> >>  uint64_t cap;   /* The value of capability reg */
>> >>  uint64_t ecap;  /* The value of extended 
>> >> capability reg
>*/
>> >> +bool cap_frozen;/* cap/ecap become read-only 
>> >> after
>> >>> frozen */
>> >>  uint32_t context_cache_gen; /* Should be in [1,MAX] */
>> >>  GHashTable *iotlb;  /* IOTLB */
>> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
>> >> index ffa1ad6429..a9f9dfd6a7 100644
>> >> --- a/hw/i386/intel_iommu.c
>> >> +++ b/hw/i386/intel_iommu.c
>> >> @@ -35,6 +35,8 @@
>> >>  #include "sysemu/kvm.h"
>> >>  #include "sysemu/dma.h"
>> >>  #include "sysemu/sysemu.h"
>> >> +#include "hw/vfio/vfio-common.h"
>> >> +#include "sysemu/iommufd.h"
>> >>  #include "hw/i386/apic_internal.h"
>> >>  #include "kvm/kvm_i386.h"
>> >>  #include "migration/vmstate.h"
>> >> @@ -3819,6 +3821,38 @@ VTDAddressSpace
>> > *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus,
>> >>  return vtd_dev_as;
>> >>  }
>> >>
>> >> +static int vtd_check_legacy_hdev(IntelIOMMUState *s,
>> >> + IOMMULegacyDevice *ldev,
>> >> + Error **errp)
>> >> +{
>> >> +return 0;
>> >> +}
>> >> +
>> >> +static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
>> >> +  IOMMUFDDevice *idev,
>> >> +  Error **errp)
>> >> +{
>> >> +return 0;
>> >> +}
>> >> +
>> >> +static int vtd_check_hdev(IntelIOMMUState *s,
>> >>> VTDHostIOMMUDevice
>> > *vtd_hdev,
>> >> +  Error **errp)
>> >> +{
>> >> +HostIOMMUDevice *base_dev = vtd_hdev->dev;
>> >> +IOMMUFDDevice *idev;
>> >> +
>> >> +if (base_dev->type == HID_LEGACY) {
>> >> +IOMMULegacyDevice *ldev = container_of(base_dev,
>> >> +   IOMMULegacyDevice, 
>> >> base);
>> >> +
>> >> +return vtd_check_legacy_hdev(s, ldev, errp);
>> >> +}
>> >> +
>> >> +idev = container_of(base_dev, IOMMUFDDevice, base);
>> >> +
>> >> +return vtd_check_iommufd_hdev(s, idev, errp);
>> >> +}
>> >> +
>> >>  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque,
>int
>> > devfn,
>> >>  HostIOMMUDevice *base_dev, Error 
>> >> **errp)
>> >>  {
>> >> @@ -3829,6 +3863,7 @@ static int
>> >>> vtd_dev_set_iommu_device(PCIBus
>> > *bus, void *opaque, int devfn,
>> >>  .devfn = devfn,
>> >>  };
>> >>  struct vtd_as_key *new_key;
>> >> +int ret;
>> >>
>> >>  assert(base_dev);
>> >>
>> >> @@ -3848,6 +3883,13 @@ static int
>

Re: [RFC 0/2] disable the configuration interrupt for the unsupported device

2024-03-28 Thread Cindy Lu

On Thu, Mar 28, 2024 at 12:12 PM Jason Wang  wrote:
>
> On Wed, Mar 27, 2024 at 5:33 PM Cindy Lu  wrote:
> >
> > On Wed, Mar 27, 2024 at 5:12 PM Jason Wang  wrote:
> > >
> > > On Wed, Mar 27, 2024 at 4:28 PM Cindy Lu  wrote:
> > > >
> > > > On Wed, Mar 27, 2024 at 3:54 PM Jason Wang  wrote:
> > > > >
> > > > > On Wed, Mar 27, 2024 at 2:03 PM Cindy Lu  wrote:
> > > > > >
> > > > > > On Wed, Mar 27, 2024 at 11:05 AM Jason Wang  
> > > > > > wrote:
> > > > > > >
> > > > > > > Hi Cindy:
> > > > > > >
> > > > > > > On Wed, Mar 27, 2024 at 9:29 AM Cindy Lu  wrote:
> > > > > > > >
> > > > > > > > we need a crash in Non-standard image, here is the jira for 
> > > > > > > > this https://issues.redhat.com/browse/RHEL-28522
> > > > > > > > The root cause of the issue is that an IRQFD was used without 
> > > > > > > > initialization..
> > > > > > > >
> > > > > > > > During the booting process of the Vyatta image, the behavior of 
> > > > > > > > the called function in qemu is as follows:
> > > > > > > >
> > > > > > > > 1. vhost_net_stop() was called, this will call the function
> > > > > > > > virtio_pci_set_guest_notifiers() with assgin= false, and
> > > > > > > > virtio_pci_set_guest_notifiers(） will release the irqfd for 
> > > > > > > > vector 0
> > > > > > >
> > > > > > > Before vhost_net_stop(), do we know which vector is used by which 
> > > > > > > queue?
> > > > > > >
> > > > > > before this stop, vdev->config_verctor is get from
> > > > > > virtio_pci_common_read/virtio_pci_common_write
> > > > > > it was set to vector 0
> > > > >
> > > > > I basically meant if vector 0 is shared with some virtqueues here.
> > > > >
> > > > Really sorry for this, vq's vector is 1,2, and will not share with the
> > > > configure vector
> > > > > > > >
> > > > > > > > 2. virtio_reset() was called -->set configure vector to 
> > > > > > > > VIRTIO_NO_VECTORt
> > > > > > > >
> > > > > > > > 3.vhost_net_start() was called (at this time the configure 
> > > > > > > > vector is
> > > > > > > > still VIRTIO_NO_VECTOR) and call 
> > > > > > > > virtio_pci_set_guest_notifiers() with
> > > > > > > > assgin= true, so the irqfd for vector 0 was not "init" during 
> > > > > > > > this process
> > > > > > >
> > > > > > > How does the configure vector differ from the virtqueue vector 
> > > > > > > here?
> > > > > > >
> > > > > > All the vectors are VIRTIO_NO_VECTOR (including vq). any
> > > > > > msix_fire_vector_notifier()
> > > > > > been called will cause the crash at this time.
> > > > >
> > > > > Won't virtio_pci_set_guest_notifiers() will try to allocate irqfd when
> > > > > the assignment is true?
> > > > >
> > > > It will allocate, but  the vector is VIRTIO_NO_VECTOR (0x)
> > > >
> > > > then it will called kvm_virtio_pci_vector_use_one()
> > > >
> > > > in this function, there is a check for
> > > >
> > > > if (vector >= msix_nr_vectors_allocated(dev))
> > > >
> > > > { return 0; }
> > > >
> > > > So it will return.
> > >
> > > How about let's just fix this?
> > >
> > > Btw, it's better to explain in detail like the above in the next version.
> > >
> > > Thanks
> > >
> > The problem is I think the behavior here is correct, The vector here is
> >  VIRTIO_NO_VECTOR and we should return,
>
> So if I understand correctly, the configure vector is configured after
> DRIVER_OK?
>
sorry I didn't get your point, Do you mean set_guest_notifiers()?,
this was called during the system boot
 but for the value of vdev->config_vector/vq vector, this is changed
by virtio_pci_common_read/virtio_pci_common_write
and these function will not check the process  DRIVER_OK.
> Spec doesn't forbid this, this is something we need to support.
>
> It looks to me the correct fix is to kvm_virtio_pci_vector_use_one()
> when guest is writing to msix_vector after DRIVER_OK?
>
if I understand correctly. do you mean
when  function   virtio_pci_common_read/virtio_pci_common_write was called
we need to check the number of  vdev->config_vector/vq vector, if this
was changed and also DRIVER_OK was set
then we need to call virtio_pci_set_guest_notifiers() to re-init the irqfd?
Thanks
cindy
> Thanks
>
> > the fix could work maybe is we try get to know if this was changed
> > from another value
> > and use that one? this seems strange.
> > Thanks
> > cindy
> > > >
> > > > > > So I think this should
> > > > > > be a bug in this guest image
> > > > >
> > > > > The point is Qemu should not crash even if the guest driver is buggy.
> > > > >
> > > > > It would be nice if we can have a qtest for this on top.
> > > > >
> > > > > Thanks
> > > > >
> > > > sure, got it, I have done the Qtest, and it passed
> > > > here is the result
> > > >
> > > > Ok: 794
> > > > Expected Fail:  0
> > > > Fail:   0
> > > > Unexpected Pass:0
> > > > Skipped:32
> > > > Timeout:0
> > > >
> > > > > > > >
> > > > > > > > 4. The system continues to boot and msix_fire_vector_notifier() 
> > > > > > > > was
> > > > > > > > cal

Re: [External] Re: [PATCH v8 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-28 Thread Ho-Ren (Jack) Chuang

On Thu, Mar 28, 2024 at 5:59 PM Huang, Ying  wrote:
>
> "Ho-Ren (Jack) Chuang"  writes:
>
> > The current implementation treats emulated memory devices, such as
> > CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
> > (E820_TYPE_RAM). However, these emulated devices have different
> > characteristics than traditional DRAM, making it important to
> > distinguish them. Thus, we modify the tiered memory initialization process
> > to introduce a delay specifically for CPUless NUMA nodes. This delay
> > ensures that the memory tier initialization for these nodes is deferred
> > until HMAT information is obtained during the boot process. Finally,
> > demotion tables are recalculated at the end.
> >
> > * late_initcall(memory_tier_late_init);
> > Some device drivers may have initialized memory tiers between
> > `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
> > online memory nodes and configuring memory tiers. They should be excluded
> > in the late init.
> >
> > * Handle cases where there is no HMAT when creating memory tiers
> > There is a scenario where a CPUless node does not provide HMAT information.
> > If no HMAT is specified, it falls back to using the default DRAM tier.
> >
> > * Introduce another new lock `default_dram_perf_lock` for adist calculation
> > In the current implementation, iterating through CPUless nodes requires
> > holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
> > trying to acquire the same lock, leading to a potential deadlock.
> > Therefore, we propose introducing a standalone `default_dram_perf_lock` to
> > protect `default_dram_perf_*`. This approach not only avoids deadlock
> > but also prevents holding a large lock simultaneously.
> >
> > * Upgrade `set_node_memory_tier` to support additional cases, including
> >   default DRAM, late CPUless, and hot-plugged initializations.
> > To cover hot-plugged memory nodes, `mt_calc_adistance()` and
> > `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
> > handle cases where memtype is not initialized and where HMAT information is
> > available.
> >
> > * Introduce `default_memory_types` for those memory types that are not
> >   initialized by device drivers.
> > Because late initialized memory and default DRAM memory need to be managed,
> > a default memory type is created for storing all memory types that are
> > not initialized by device drivers and as a fallback.
> >
> > Signed-off-by: Ho-Ren (Jack) Chuang 
> > Signed-off-by: Hao Xiang 
> > Reviewed-by: "Huang, Ying" 
> > ---
> >  mm/memory-tiers.c | 94 +++
> >  1 file changed, 78 insertions(+), 16 deletions(-)
> >
> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> > index 974af10cfdd8..e24fc3bebae4 100644
> > --- a/mm/memory-tiers.c
> > +++ b/mm/memory-tiers.c
> > @@ -36,6 +36,11 @@ struct node_memory_type_map {
> >
> >  static DEFINE_MUTEX(memory_tier_lock);
> >  static LIST_HEAD(memory_tiers);
> > +/*
> > + * The list is used to store all memory types that are not created
> > + * by a device driver.
> > + */
> > +static LIST_HEAD(default_memory_types);
> >  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> >  struct memory_dev_type *default_dram_type;
> >
> > @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion 
> > __read_mostly;
> >
> >  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
> >
> > +/* The lock is used to protect `default_dram_perf*` info and nid. */
> > +static DEFINE_MUTEX(default_dram_perf_lock);
> >  static bool default_dram_perf_error;
> >  static struct access_coordinate default_dram_perf;
> >  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> > @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, 
> > struct memory_dev_type *mem
> >  static struct memory_tier *set_node_memory_tier(int node)
> >  {
> >   struct memory_tier *memtier;
> > - struct memory_dev_type *memtype;
> > + struct memory_dev_type *mtype = default_dram_type;
> > + int adist = MEMTIER_ADISTANCE_DRAM;
> >   pg_data_t *pgdat = NODE_DATA(node);
> >
> >
> > @@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int 
> > node)
> >   if (!node_state(node, N_MEMORY))
> >   return ERR_PTR(-EINVAL);
> >
> > - __init_node_memory_type(node, default_dram_type);
> > + mt_calc_adistance(node, &adist);
> > + if (node_memory_types[node].memtype == NULL) {
> > + mtype = mt_find_alloc_memory_type(adist, 
> > &default_memory_types);
> > + if (IS_ERR(mtype)) {
> > + mtype = default_dram_type;
> > + pr_info("Failed to allocate a memory type. Fall 
> > back.\n");
> > + }
> > + }
> > +
> > + __init_node_memory_type(node, mtype);
> >
> > - memtype = node_memory_types[node].memtype;
> > - node_set(node, memtype->nodes);
> > - memtier = find_create_memory

RE: [PATCH v2 1/3] Hexagon (target/hexagon) Analyze reads before writes

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Thursday, February 1, 2024 4:34 AM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 1/3] Hexagon (target/hexagon) Analyze reads before
> writes
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> We divide gen_analyze_funcs.py into 3 phases
> Declare the operands
> Analyze the register reads
> Analyze the register writes
> 
> We also create special versions of ctx_log_*_read for new operands
> Check that the operand is written before the read
> 
> This is a precursor to improving the analysis for short-circuiting
> the packet semantics in a subsequent commit
> 
> Signed-off-by: Taylor Simpson 
> ---
>  target/hexagon/translate.h  | 26 +++-
>  target/hexagon/README   |  9 +++--
>  target/hexagon/gen_analyze_funcs.py | 34 ++--
>  target/hexagon/hex_common.py| 63 +++--
>  4 files changed, 83 insertions(+), 49 deletions(-)
> 
> diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
> index 4dd59c6726..f06d71fc53 100644
> --- a/target/hexagon/translate.h
> +++ b/target/hexagon/translate.h
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
> @@ -75,6 +75,8 @@ typedef struct DisasContext {
>  TCGv dczero_addr;
>  } DisasContext;
> 
> +bool is_gather_store_insn(DisasContext *ctx);
> +
>  static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
>  {
>  if (!test_bit(pnum, ctx->pregs_written)) {
> @@ -89,6 +91,12 @@ static inline void ctx_log_pred_read(DisasContext *ctx,
> int pnum)
>  set_bit(pnum, ctx->pregs_read);
>  }
> 
> +static inline void ctx_log_pred_read_new(DisasContext *ctx, int pnum)
> +{
> +g_assert(test_bit(pnum, ctx->pregs_written));
> +set_bit(pnum, ctx->pregs_read);
> +}
> +
>  static inline void ctx_log_reg_write(DisasContext *ctx, int rnum,
>   bool is_predicated)
>  {
> @@ -120,6 +128,12 @@ static inline void ctx_log_reg_read(DisasContext
> *ctx, int rnum)
>  set_bit(rnum, ctx->regs_read);
>  }
> 
> +static inline void ctx_log_reg_read_new(DisasContext *ctx, int rnum)
> +{
> +g_assert(test_bit(rnum, ctx->regs_written));
> +set_bit(rnum, ctx->regs_read);
> +}
> +
>  static inline void ctx_log_reg_read_pair(DisasContext *ctx, int rnum)
>  {
>  ctx_log_reg_read(ctx, rnum);
> @@ -171,6 +185,15 @@ static inline void ctx_log_vreg_read(DisasContext
> *ctx, int rnum)
>  set_bit(rnum, ctx->vregs_read);
>  }
> 
> +static inline void ctx_log_vreg_read_new(DisasContext *ctx, int rnum)
> +{
> +g_assert(is_gather_store_insn(ctx) ||
> + test_bit(rnum, ctx->vregs_updated) ||
> + test_bit(rnum, ctx->vregs_select) ||
> + test_bit(rnum, ctx->vregs_updated_tmp));
> +set_bit(rnum, ctx->vregs_read);
> +}
> +
>  static inline void ctx_log_vreg_read_pair(DisasContext *ctx, int rnum)
>  {
>  ctx_log_vreg_read(ctx, rnum ^ 0);
> @@ -205,7 +228,6 @@ extern TCGv hex_vstore_addr[VSTORES_MAX];
>  extern TCGv hex_vstore_size[VSTORES_MAX];
>  extern TCGv hex_vstore_pending[VSTORES_MAX];
> 
> -bool is_gather_store_insn(DisasContext *ctx);
>  void process_store(DisasContext *ctx, int slot_num);
> 
>  FIELD(PROBE_PKT_SCALAR_STORE_S0, MMU_IDX,   0, 2)
> diff --git a/target/hexagon/README b/target/hexagon/README
> index 746ebec378..c1d8c8d0ab 100644
> --- a/target/hexagon/README
> +++ b/target/hexagon/README
> @@ -183,10 +183,11 @@ when the override is present.
>  }
> 
>  We also generate an analyze_ function for each instruction.  Currently,
> -these functions record the writes to registers by calling ctx_log_*.  During
> -gen_start_packet, we invoke the analyze_ function for each instruction
> in
> -the packet, and we mark the implicit writes.  After the analysis is 
> performed,
> -we initialize the result register for each of the predicated assignments.
> +these functions record the reads and writes to registers by calling 
> ctx_log_*.
> +During gen_start_packet, we invoke the analyze_ function for each
> instruction in
> +the packet, and we mark the implicit writes.  The analysis determines if the
> packet
> +semantics can be short-circuited.  If not, we initialize the result register 
> for
> each
> +of the predicated assignments.
> 
>  In addition to instruction semantics, we use a generator to create the decode
>  tree.  This generation is a four step p

RE: [PATCH v5 5/7] migration/multifd: implement initialization of qpl compression

2024-03-28 Thread Liu, Yuan1

> -Original Message-
> From: Peter Xu 
> Sent: Thursday, March 28, 2024 11:16 PM
> To: Liu, Yuan1 
> Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com; Zou,
> Nanhai 
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Thu, Mar 28, 2024 at 02:32:37AM +, Liu, Yuan1 wrote:
> > > -Original Message-
> > > From: Peter Xu 
> > > Sent: Thursday, March 28, 2024 3:26 AM
> > > To: Liu, Yuan1 
> > > Cc: Daniel P. Berrangé ; faro...@suse.de; qemu-
> > > de...@nongnu.org; hao.xi...@bytedance.com; bryan.zh...@bytedance.com;
> Zou,
> > > Nanhai 
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > > > {
> > > > > > for (int i = 0; i < p->zero_num; i++) {
> > > > > > void *page = p->host + p->zero[i];
> > > > > > if (!buffer_is_zero(page, p->page_size)) {
> > > > > > memset(page, 0, p->page_size);
> > > > > > }
> > > > > > }
> > > > > > }
> > > >
> > > > It may not matter much (where I also see your below comments), but
> just
> > > to
> > > > mention another solution to avoid this read is that we can maintain
> > > > RAMBlock->receivedmap for precopy (especially, multifd, afaiu
> multifd
> > > > doesn't yet update this bitmap.. even if normal precopy does), then
> here
> > > > instead of scanning every time, maybe we can do:
> > > >
> > > >   /*
> > > >* If it's the 1st time receiving it, no need to clear it as it
> must
> > > be
> > > >* all zeros now.
> > > >*/
> > > >   if (bitmap_test(rb->receivedmap, page_offset)) {
> > > >   memset(page, 0, ...);
> > > >   } else {
> > > >   bitmap_set(rb->receivedmap, page_offset);
> > > >   }
> > > >
> > > > And we also always set the bit when !zero too.
> > > >
> > > > My rationale is that it's unlikely a zero page if it's sent once or
> more,
> > > > while OTOH for the 1st time we receive it, it must be a zero page,
> so no
> > > > need to scan for the 1st round.
> > >
> > > Thinking about this, I'm wondering whether we should have this
> regardless.
> > > IIUC now multifd will always require two page faults on destination
> for
> > > anonymous guest memories (I suppose shmem/hugetlb is fine as no zero
> page
> > > in those worlds).  Even though it should be faster than DMA faults, it
> > > still is unwanted.
> > >
> > > I'll take a note myself as todo to do some measurements in the future
> > > first.  However if anyone thinks that makes sense and want to have a
> look,
> > > please say so.  It'll be more than welcomed.
> >
> > Yes, I think this is a better improvement to avoid two page faults. I
> can test
> > the performance impact of this change on SVM-capable devices and give
> some data
> > later. As we saw before, the IOTLB flush occurs via COW, with the
> change, the
> > impact of the COW should be gone.
> >
> > If you need more testing and analysis on this, please let me know
> 
> Nothing more than that.  Just a heads up that Xiang used to mention a test
> case where Richard used to suggest dropping the zero check:
> 
> https://lore.kernel.org/r/CAAYibXib+TWnJpV22E=adncdBmwXJRqgRjJXK7X71J=bDfa
> x...@mail.gmail.com
> 
> AFAIU this should be resolved if we have the bitmap maintained, but we can
> double check.  IIUC that's exactly the case for an idle guest, in that
> case
> it should be even faster to skip the memcmp when bit clear.
> 
> If you're going to post the patches, feel free to post that as a
> standalone
> small series first, then that can be considered merge even earlier.
> 
> Thanks a lot for doing this.

Sure, I will prepare a separate patch for this, and we can have a better 
discussion
on concrete implementation and test results.

RE: [PATCH v2 2/3] Hexagon (target/hexagon) Enable more short-circuit packets (scalar core)

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Thursday, February 1, 2024 4:34 AM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 2/3] Hexagon (target/hexagon) Enable more short-circuit
> packets (scalar core)
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Look for read-after-write instead of overlap of reads and writes
> 
> Here is an example with overlap but no read-after-write:
> 0x000200fc:  0x38103876 {   R0 = add(R0,R1); R6 = add(R6,R7) }
> 
> BEFORE:
>   000200fc
>  mov_i32 loc2,$0x0
>  mov_i32 loc2,r0
>  add_i32 loc3,loc2,r1
>  mov_i32 loc2,loc3
>  mov_i32 loc4,$0x0
>  mov_i32 loc4,r6
>  add_i32 loc5,loc4,r7
>  mov_i32 loc4,loc5
>  mov_i32 r0,loc2
>  mov_i32 r6,loc4
> 
> AFTER:
>   000200fc
>  add_i32 loc2,r0,r1
>  mov_i32 r0,loc2
>  add_i32 loc3,r6,r7
>  mov_i32 r6,loc3
> 
> We can also short-circuit packets with .new values by reading from the
> real destination instead of the temporary.
> 0x00020100:  0x78005ff3 {   R19 = #0xff
> 0x00020104:  0x2002e204 if (cmp.eq(N19.new,R2)) jump:t PC+8 }
> 
> BEFORE:
>   00020100
>  mov_i32 pc,$0x20108
>  mov_i32 loc8,$0x0
>  mov_i32 loc8,$0xff
>  setcond_i32 loc10,loc8,r2,eq
>  mov_i32 loc6,loc10
>  mov_i32 r19,loc8
>  add_i32 pkt_cnt,pkt_cnt,$0x2
>  add_i32 insn_cnt,insn_cnt,$0x4
>  brcond_i32 loc6,$0x0,eq,$L1
>  goto_tb $0x0
>  mov_i32 pc,$0x20108
>  exit_tb $0x7fbb5440
>  set_label $L1
>  goto_tb $0x1
>  exit_tb $0x7fbb5441
>  set_label $L0
>  exit_tb $0x7fbb5443
> 
> AFTER:
>   00020100
>  mov_i32 pc,$0x20108
>  mov_i32 r19,$0xff
>  setcond_i32 loc7,r19,r2,eq
>  mov_i32 loc4,loc7
>  add_i32 pkt_cnt,pkt_cnt,$0x2
>  add_i32 insn_cnt,insn_cnt,$0x4
>  brcond_i32 loc4,$0x0,eq,$L1
>  goto_tb $0x0
>  mov_i32 pc,$0x20108
>  exit_tb $0x7f976440
>  set_label $L1
>  goto_tb $0x1
>  exit_tb $0x7f976441
>  set_label $L0
>  exit_tb $0x7f976443
> 
> Signed-off-by: Taylor Simpson 
> ---
>  target/hexagon/translate.h | 13 +++--
>  target/hexagon/translate.c | 21 -
>  2 files changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
> index f06d71fc53..d5e7f49ad8 100644
> --- a/target/hexagon/translate.h
> +++ b/target/hexagon/translate.h
> @@ -38,12 +38,10 @@ typedef struct DisasContext {
>  int reg_log[REG_WRITES_MAX];
>  int reg_log_idx;
>  DECLARE_BITMAP(regs_written, TOTAL_PER_THREAD_REGS);
> -DECLARE_BITMAP(regs_read, TOTAL_PER_THREAD_REGS);
>  DECLARE_BITMAP(predicated_regs, TOTAL_PER_THREAD_REGS);
>  int preg_log[PRED_WRITES_MAX];
>  int preg_log_idx;
>  DECLARE_BITMAP(pregs_written, NUM_PREGS);
> -DECLARE_BITMAP(pregs_read, NUM_PREGS);
>  uint8_t store_width[STORES_MAX];
>  bool s1_store_processed;
>  int future_vregs_idx;
> @@ -68,6 +66,7 @@ typedef struct DisasContext {
>  bool is_tight_loop;
>  bool short_circuit;
>  bool has_hvx_helper;
> +bool read_after_write;
>  TCGv new_value[TOTAL_PER_THREAD_REGS];
>  TCGv new_pred_value[NUM_PREGS];
>  TCGv pred_written;
> @@ -88,13 +87,14 @@ static inline void ctx_log_pred_write(DisasContext
> *ctx, int pnum)
> 
>  static inline void ctx_log_pred_read(DisasContext *ctx, int pnum)
>  {
> -set_bit(pnum, ctx->pregs_read);
> +if (test_bit(pnum, ctx->pregs_written)) {
> +ctx->read_after_write = true;
> +}
>  }
> 
>  static inline void ctx_log_pred_read_new(DisasContext *ctx, int pnum)
>  {
>  g_assert(test_bit(pnum, ctx->pregs_written));
> -set_bit(pnum, ctx->pregs_read);
>  }
> 
>  static inline void ctx_log_reg_write(DisasContext *ctx, int rnum,
> @@ -125,13 +125,14 @@ static inline void
> ctx_log_reg_write_pair(DisasContext *ctx, int rnum,
> 
>  static inline void ctx_log_reg_read(DisasContext *ctx, int rnum)
>  {
> -set_bit(rnum, ctx->regs_read);
> +if (test_bit(rnum, ctx->regs_written)) {
> +ctx->read_after_write = true;
> +}
>  }
> 
>  static inline void ctx_log_reg_read_new(DisasContext *ctx, int rnum)
>  {
>  g_assert(test_bit(rnum, ctx->regs_written));
> -set_bit(rnum, ctx->regs_read);
>  }
> 
>  static inline void ctx_log_reg_read_pair(DisasContext *ctx, int rnum)
> diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
> index 95579ae243..751ca71790 100644
> --- a/target/hexagon/translate.c
> +++ b/target/hexagon/translate.c
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *

RE: [PATCH v2 3/3] Hexagon (target/hexagon) Enable more short-circuit packets (HVX)

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Thursday, February 1, 2024 4:34 AM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 3/3] Hexagon (target/hexagon) Enable more short-circuit
> packets (HVX)
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Look for read-after-write instead of overlap of reads and writes
> 
> HVX instructions with helpers have pass-by-reference semantics, so
> we check for overlaps of reads and writes within the same instruction.
> 
> Signed-off-by: Taylor Simpson 
> ---
>  target/hexagon/translate.h  | 88 +++--
>  target/hexagon/translate.c  | 58 ++-
>  target/hexagon/gen_analyze_funcs.py | 19 ---
>  target/hexagon/hex_common.py| 45 ++-
>  4 files changed, 115 insertions(+), 95 deletions(-)
> 
> diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
> index d5e7f49ad8..00cc2bcd63 100644
> --- a/target/hexagon/translate.h
> +++ b/target/hexagon/translate.h
> @@ -50,23 +50,27 @@ typedef struct DisasContext {
>  int tmp_vregs_num[VECTOR_TEMPS_MAX];
>  int vreg_log[NUM_VREGS];
>  int vreg_log_idx;
> +DECLARE_BITMAP(vregs_written, NUM_VREGS);
> +DECLARE_BITMAP(insn_vregs_written, NUM_VREGS);
>  DECLARE_BITMAP(vregs_updated_tmp, NUM_VREGS);
>  DECLARE_BITMAP(vregs_updated, NUM_VREGS);
>  DECLARE_BITMAP(vregs_select, NUM_VREGS);
>  DECLARE_BITMAP(predicated_future_vregs, NUM_VREGS);
>  DECLARE_BITMAP(predicated_tmp_vregs, NUM_VREGS);
> -DECLARE_BITMAP(vregs_read, NUM_VREGS);
> +DECLARE_BITMAP(insn_vregs_read, NUM_VREGS);
>  int qreg_log[NUM_QREGS];
>  int qreg_log_idx;
> -DECLARE_BITMAP(qregs_read, NUM_QREGS);
> +DECLARE_BITMAP(qregs_written, NUM_QREGS);
> +DECLARE_BITMAP(insn_qregs_written, NUM_QREGS);
> +DECLARE_BITMAP(insn_qregs_read, NUM_QREGS);
>  bool pre_commit;
>  bool need_commit;
>  TCGCond branch_cond;
>  target_ulong branch_dest;
>  bool is_tight_loop;
>  bool short_circuit;
> -bool has_hvx_helper;
>  bool read_after_write;
> +bool has_hvx_overlap;
>  TCGv new_value[TOTAL_PER_THREAD_REGS];
>  TCGv new_pred_value[NUM_PREGS];
>  TCGv pred_written;
> @@ -146,10 +150,25 @@ intptr_t ctx_future_vreg_off(DisasContext *ctx, int
> regnum,
>  intptr_t ctx_tmp_vreg_off(DisasContext *ctx, int regnum,
>int num, bool alloc_ok);
> 
> +static inline void ctx_start_hvx_insn(DisasContext *ctx)
> +{
> +bitmap_zero(ctx->insn_vregs_written, NUM_VREGS);
> +bitmap_zero(ctx->insn_vregs_read, NUM_VREGS);
> +bitmap_zero(ctx->insn_qregs_written, NUM_QREGS);
> +bitmap_zero(ctx->insn_qregs_read, NUM_QREGS);
> +}
> +
>  static inline void ctx_log_vreg_write(DisasContext *ctx,
>int rnum, VRegWriteType type,
> -  bool is_predicated)
> +  bool is_predicated, bool has_helper)
>  {
> +if (has_helper) {
> +set_bit(rnum, ctx->insn_vregs_written);
> +if (test_bit(rnum, ctx->insn_vregs_read)) {
> +ctx->has_hvx_overlap = true;
> +}
> +}
> +set_bit(rnum, ctx->vregs_written);
>  if (type != EXT_TMP) {
>  if (!test_bit(rnum, ctx->vregs_updated)) {
>  ctx->vreg_log[ctx->vreg_log_idx] = rnum;
> @@ -175,42 +194,77 @@ static inline void ctx_log_vreg_write(DisasContext
> *ctx,
> 
>  static inline void ctx_log_vreg_write_pair(DisasContext *ctx,
> int rnum, VRegWriteType type,
> -   bool is_predicated)
> +   bool is_predicated, bool 
> has_helper)
>  {
> -ctx_log_vreg_write(ctx, rnum ^ 0, type, is_predicated);
> -ctx_log_vreg_write(ctx, rnum ^ 1, type, is_predicated);
> +ctx_log_vreg_write(ctx, rnum ^ 0, type, is_predicated, has_helper);
> +ctx_log_vreg_write(ctx, rnum ^ 1, type, is_predicated, has_helper);
>  }
> 
> -static inline void ctx_log_vreg_read(DisasContext *ctx, int rnum)
> +static inline void ctx_log_vreg_read(DisasContext *ctx, int rnum,
> + bool has_helper)
>  {
> -set_bit(rnum, ctx->vregs_read);
> +if (has_helper) {
> +set_bit(rnum, ctx->insn_vregs_read);
> +if (test_bit(rnum, ctx->insn_vregs_written)) {
> +ctx->has_hvx_overlap = true;
> +}
> +}
> +if (test_bit(rnum, ctx->vregs_written)) {
> +ctx->read_after_write = true;
> +}
>  }
> 
> -static inline void ctx_log_vreg_read_new(DisasContext *ctx, int rnum)
> +static inline void ctx_log_vreg_read_n

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-03-28 Thread Zhijian Li (Fujitsu)



On 28/03/2024 23:01, Peter Xu wrote:
> On Thu, Mar 28, 2024 at 11:18:04AM -0300, Fabiano Rosas wrote:
>> Philippe Mathieu-Daudé  writes:
>>
>>> The whole RDMA subsystem was deprecated in commit e9a54265f5
>>> ("hw/rdma: Deprecate the pvrdma device and the rdma subsystem")
>>> released in v8.2.
>>>
>>> Remove:
>>>   - RDMA handling from migration
>>>   - dependencies on libibumad, libibverbs and librdmacm
>>>
>>> Keep the RAM_SAVE_FLAG_HOOK definition since it might appear
>>> in old migration streams.
>>>
>>> Cc: Peter Xu 
>>> Cc: Li Zhijian 
>>> Acked-by: Fabiano Rosas 
>>> Signed-off-by: Philippe Mathieu-Daudé 
>>
>> Just to be clear, because people raised the point in the last version,
>> the first link in the deprecation commit links to a thread comprising
>> entirely of rdma migration patches. I don't see any ambiguity on whether
>> the deprecation was intended to include migration. There's even an ack
>> from Juan.
> 
> Yes I remember that's the plan.
> 
>>
>> So on the basis of not reverting the previous maintainer's decision, my
>> Ack stands here.
>>
>> We also had pretty obvious bugs ([1], [2]) in the past that would have
>> been caught if we had any kind of testing for the feature, so I can't
>> even say this thing works currently.
>>
>> @Peter Xu, @Li Zhijian, what are your thoughts on this?
> 
> Generally I definitely agree with such a removal sooner or later, as that's
> how deprecation works, and even after Juan's left I'm not aware of any
> other new RDMA users.  Personally, I'd slightly prefer postponing it one
> more release which might help a bit of our downstream maintenance, however
> I assume that's not a blocker either, as I think we can also manage it.
> 
> IMHO it's more important to know whether there are still users and whether
> they would still like to see it around. That's also one thing I notice that
> e9a54265f533f didn't yet get acks from RDMA users that we are aware, even
> if they're rare. According to [2] it could be that such user may only rely
> on the release versions of QEMU when it broke things.
> 
> So I'm copying Yu too (while Zhijian is already in the loop), just in case
> someone would like to stand up and speak.


I admit RDMA migration lacked testing (unit/CI tests), which led to a few
obvious bugs being noticed too late.
However I was a bit surprised when I saw the removal of the RDMA migration. I 
wasn't
aware that this feature has not been marked as deprecated(at least there is no
prompt to end-user).


> IMHO it's more important to know whether there are still users and whether
> they would still like to see it around.

Agree.
I didn't immediately express my opinion in V1 because I'm also consulting our
customers for this feature in the future.

Personally, I agree with Peter's idea that "I'd slightly prefer postponing it 
one
more release which might help a bit of our downstream maintenance"

Thanks
Zhijian

> 
> Thanks,
> 
>>
>> 1- https://lore.kernel.org/r/20230920090412.726725-1-lizhij...@fujitsu.com
>> 2- 
>> https://lore.kernel.org/r/cahecvy7hxswn4ow_kog+q+tn6f_kmeichevz1qgm-fbxbpp...@mail.gmail.com
>>
>

Re: [PATCH 12/19] migration: fix -Werror=maybe-uninitialized false-positives

2024-03-28 Thread Yong Huang

On Thu, Mar 28, 2024 at 6:23 PM  wrote:

> From: Marc-André Lureau 
>
> ../migration/dirtyrate.c:186:5: error: ‘records’ may be used uninitialized
> [-Werror=maybe-uninitialized]
> ../migration/dirtyrate.c:168:12: error: ‘gen_id’ may be used uninitialized
> [-Werror=maybe-uninitialized]
> ../migration/migration.c:2273:5: error: ‘file’ may be used uninitialized
> [-Werror=maybe-uninitialized]
>
> Signed-off-by: Marc-André Lureau 
> ---
>  migration/dirtyrate.c | 4 ++--
>  migration/migration.c | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
> index 1d2e85746f..22dd22922c 100644
> --- a/migration/dirtyrate.c
> +++ b/migration/dirtyrate.c
> @@ -144,12 +144,12 @@ int64_t vcpu_calculate_dirtyrate(int64_t
> calc_time_ms,
>   unsigned int flag,
>   bool one_shot)
>  {
> -DirtyPageRecord *records;
> +DirtyPageRecord *records = NULL;
>  int64_t init_time_ms;
>  int64_t duration;
>  int64_t dirtyrate;
>  int i = 0;
> -unsigned int gen_id;
> +unsigned int gen_id = 0;
>
>  retry:
>  init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> diff --git a/migration/migration.c b/migration/migration.c
> index 9fe8fd2afd..412138ea94 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2247,7 +2247,7 @@ static bool
> migrate_handle_rp_resume_ack(MigrationState *s,
>   */
>  static void migration_release_dst_files(MigrationState *ms)
>  {
> -QEMUFile *file;
> +QEMUFile *file = NULL;
>
>  WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
>  /*
> --
> 2.44.0
>
>
Reviewed-by: Hyman Huang 

Yong

-- 
Best regards

RE: [PATCH v2 1/9] Hexagon (target/hexagon) Add is_old/is_new to Register class

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 1/9] Hexagon (target/hexagon) Add is_old/is_new to
> Register class
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Signed-off-by: Taylor Simpson 
> Reviewed-by: Philippe Mathieu-Daudé 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/hex_common.py | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/target/hexagon/hex_common.py
> b/target/hexagon/hex_common.py
> index 195620c7ec..4bacef223f 100755
> --- a/target/hexagon/hex_common.py
> +++ b/target/hexagon/hex_common.py
> @@ -1,7 +1,7 @@
>  #!/usr/bin/env python3
> 
>  ##
> -##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> +##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>  ##
>  ##  This program is free software; you can redistribute it and/or modify
>  ##  it under the terms of the GNU General Public License as published by
> @@ -397,10 +397,18 @@ def is_readwrite(self):
>  class OldSource(Source):
>  def reg_tcg(self):
>  return f"{self.regtype}{self.regid}V"
> +def is_old(self):
> +return True
> +def is_new(self):
> +return False
> 
>  class NewSource(Source):
>  def reg_tcg(self):
>  return f"{self.regtype}{self.regid}N"
> +def is_old(self):
> +return False
> +def is_new(self):
> +return True
> 
>  class ReadWrite:
>  def reg_tcg(self):
> @@ -413,6 +421,10 @@ def is_read(self):
>  return True
>  def is_readwrite(self):
>  return True
> +def is_old(self):
> +return True
> +def is_new(self):
> +return False
> 
>  class GprDest(Register, Single, Dest):
>  def decl_tcg(self, f, tag, regno):
> --
> 2.34.1

RE: [PATCH v2 2/9] Hexagon (target/hexagon) Mark new_read_idx in trans functions

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 2/9] Hexagon (target/hexagon) Mark new_read_idx in
> trans functions
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Check that the value matches opcode_reginfo
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 


>  target/hexagon/insn.h   |  3 ++-
>  target/hexagon/decode.c |  2 ++
>  target/hexagon/mmvec/decode_ext_mmvec.c |  2 ++
>  target/hexagon/gen_trans_funcs.py   | 15 ++-
>  4 files changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/target/hexagon/insn.h b/target/hexagon/insn.h
> index 3e7a22c91e..36502bf056 100644
> --- a/target/hexagon/insn.h
> +++ b/target/hexagon/insn.h
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
> @@ -39,6 +39,7 @@ struct Instruction {
>  uint32_t slot:3;
>  uint32_t which_extended:1;/* If has an extender, which immediate */
>  uint32_t new_value_producer_slot:4;
> +int32_t new_read_idx;
> 
>  bool part1;  /*
>* cmp-jumps are split into two insns.
> diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
> index a40210ca1e..4595e30384 100644
> --- a/target/hexagon/decode.c
> +++ b/target/hexagon/decode.c
> @@ -131,6 +131,8 @@ decode_fill_newvalue_regno(Packet *packet)
>  use_regidx = strchr(opcode_reginfo[use_opcode], 's') -
>  opcode_reginfo[use_opcode];
>  }
> +g_assert(packet->insn[i].new_read_idx != -1 &&
> + packet->insn[i].new_read_idx == use_regidx);
> 
>  /*
>   * What's encoded at the N-field is the offset to who's producing
> diff --git a/target/hexagon/mmvec/decode_ext_mmvec.c
> b/target/hexagon/mmvec/decode_ext_mmvec.c
> index 202d84c7c0..e9007f5d71 100644
> --- a/target/hexagon/mmvec/decode_ext_mmvec.c
> +++ b/target/hexagon/mmvec/decode_ext_mmvec.c
> @@ -41,6 +41,8 @@ check_new_value(Packet *pkt)
>  GET_ATTRIB(use_opcode, A_STORE)) {
>  int use_regidx = strchr(opcode_reginfo[use_opcode], 's') -
>  opcode_reginfo[use_opcode];
> +g_assert(pkt->insn[i].new_read_idx != -1 &&
> + pkt->insn[i].new_read_idx == use_regidx);
>  /*
>   * What's encoded at the N-field is the offset to who's producing
>   * the value.
> diff --git a/target/hexagon/gen_trans_funcs.py
> b/target/hexagon/gen_trans_funcs.py
> index 53e844a44b..8acecdb993 100755
> --- a/target/hexagon/gen_trans_funcs.py
> +++ b/target/hexagon/gen_trans_funcs.py
> @@ -68,6 +68,7 @@ def mark_which_imm_extended(f, tag):
>  ## insn->regno[0] = args->Rd;
>  ## insn->regno[1] = args->Rs;
>  ## insn->regno[2] = args->Rt;
> +## insn->new_read_idx = -1;
>  ## return true;
>  ## }
>  ##
> @@ -84,14 +85,14 @@ def gen_trans_funcs(f):
>  insn->opcode = {tag};
>  """))
> 
> -regno = 0
> -for reg in regs:
> -reg_type = reg[0]
> -reg_id = reg[1]
> +new_read_idx = -1
> +for regno, (reg_type, reg_id, *_) in enumerate(regs):
> +reg = hex_common.get_register(tag, reg_type, reg_id)
>  f.write(code_fmt(f"""\
>  insn->regno[{regno}] = args->{reg_type}{reg_id};
>  """))
> -regno += 1
> +if reg.is_read() and reg.is_new():
> +new_read_idx = regno
> 
>  if len(imms) != 0:
>  mark_which_imm_extended(f, tag)
> @@ -112,6 +113,9 @@ def gen_trans_funcs(f):
>  insn->immed[{immno}] = args->{imm_type}{imm_letter};
>  """))
> 
> +f.write(code_fmt(f"""\
> +insn->new_read_idx = {new_read_idx};
> +"""))
>  f.write(textwrap.dedent(f"""\
>  return true;
>  {close_curly}
> @@ -120,5 +124,6 @@ def gen_trans_funcs(f):
> 
>  if __name__ == "__main__":
>  hex_common.read_semantics_file(sys.argv[1])
> +hex_common.init_registers()
>  with open(sys.argv[2], "w") as f:
>  gen_trans_funcs(f)
> --
> 2.34.1

RE: [PATCH v2 3/9] Hexagon (target/hexagon) Mark dest_idx in trans functions

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 3/9] Hexagon (target/hexagon) Mark dest_idx in trans
> functions
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Check that the value matches opcode_reginfo/opcode_wregs
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/insn.h   | 1 +
>  target/hexagon/decode.c | 2 ++
>  target/hexagon/mmvec/decode_ext_mmvec.c | 2 ++
>  target/hexagon/gen_trans_funcs.py   | 6 ++
>  4 files changed, 11 insertions(+)
> 
> diff --git a/target/hexagon/insn.h b/target/hexagon/insn.h
> index 36502bf056..a770379958 100644
> --- a/target/hexagon/insn.h
> +++ b/target/hexagon/insn.h
> @@ -40,6 +40,7 @@ struct Instruction {
>  uint32_t which_extended:1;/* If has an extender, which immediate */
>  uint32_t new_value_producer_slot:4;
>  int32_t new_read_idx;
> +int32_t dest_idx;
> 
>  bool part1;  /*
>* cmp-jumps are split into two insns.
> diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
> index 4595e30384..a4d8500fea 100644
> --- a/target/hexagon/decode.c
> +++ b/target/hexagon/decode.c
> @@ -184,6 +184,8 @@ decode_fill_newvalue_regno(Packet *packet)
> 
>  /* Now patch up the consumer with the register number */
>  dst_idx = dststr - opcode_reginfo[def_opcode];
> +g_assert(packet->insn[def_idx].dest_idx != -1 &&
> + packet->insn[def_idx].dest_idx == dst_idx);
>  packet->insn[i].regno[use_regidx] =
>  packet->insn[def_idx].regno[dst_idx];
>  /*
> diff --git a/target/hexagon/mmvec/decode_ext_mmvec.c
> b/target/hexagon/mmvec/decode_ext_mmvec.c
> index e9007f5d71..c1320406df 100644
> --- a/target/hexagon/mmvec/decode_ext_mmvec.c
> +++ b/target/hexagon/mmvec/decode_ext_mmvec.c
> @@ -86,6 +86,8 @@ check_new_value(Packet *pkt)
>  /* still not there, we have a bad packet */
>  g_assert_not_reached();
>  }
> +g_assert(pkt->insn[def_idx].dest_idx != -1 &&
> + pkt->insn[def_idx].dest_idx == dststr - reginfo);
>  int def_regnum = pkt->insn[def_idx].regno[dststr - reginfo];
>  /* Now patch up the consumer with the register number */
>  pkt->insn[i].regno[use_regidx] = def_regnum ^ def_oreg;
> diff --git a/target/hexagon/gen_trans_funcs.py
> b/target/hexagon/gen_trans_funcs.py
> index 8acecdb993..1201172dda 100755
> --- a/target/hexagon/gen_trans_funcs.py
> +++ b/target/hexagon/gen_trans_funcs.py
> @@ -69,6 +69,7 @@ def mark_which_imm_extended(f, tag):
>  ## insn->regno[1] = args->Rs;
>  ## insn->regno[2] = args->Rt;
>  ## insn->new_read_idx = -1;
> +## insn->dest_idx = 0;
>  ## return true;
>  ## }
>  ##
> @@ -86,6 +87,7 @@ def gen_trans_funcs(f):
>  """))
> 
>  new_read_idx = -1
> +dest_idx = -1
>  for regno, (reg_type, reg_id, *_) in enumerate(regs):
>  reg = hex_common.get_register(tag, reg_type, reg_id)
>  f.write(code_fmt(f"""\
> @@ -93,6 +95,9 @@ def gen_trans_funcs(f):
>  """))
>  if reg.is_read() and reg.is_new():
>  new_read_idx = regno
> +# dest_idx should be the first destination, so check for -1
> +if reg.is_written() and dest_idx == -1:
> +dest_idx = regno
> 
>  if len(imms) != 0:
>  mark_which_imm_extended(f, tag)
> @@ -115,6 +120,7 @@ def gen_trans_funcs(f):
> 
>  f.write(code_fmt(f"""\
>  insn->new_read_idx = {new_read_idx};
> +insn->dest_idx = {dest_idx};
>  """))
>  f.write(textwrap.dedent(f"""\
>  return true;
> --
> 2.34.1

RE: [PATCH v2 5/9] Hexagon (tests/tcg/hexagon) Test HVX .new read from high half of pair

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 5/9] Hexagon (tests/tcg/hexagon) Test HVX .new read
> from high half of pair
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Make sure the decoding of HVX .new is correctly handling this case
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  tests/tcg/hexagon/hvx_misc.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/tcg/hexagon/hvx_misc.c b/tests/tcg/hexagon/hvx_misc.c
> index b45170acd1..1fe14b5158 100644
> --- a/tests/tcg/hexagon/hvx_misc.c
> +++ b/tests/tcg/hexagon/hvx_misc.c
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2021-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2021-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
> @@ -231,6 +231,7 @@ static void test_masked_store(bool invert)
>  static void test_new_value_store(void)
>  {
>  void *p0 = buffer0;
> +void *p1 = buffer1;
>  void *pout = output;
> 
>  asm("{\n\t"
> @@ -242,6 +243,19 @@ static void test_new_value_store(void)
>  expect[0] = buffer0[0];
> 
>  check_output_w(__LINE__, 1);
> +
> +/* Test the .new read from the high half of a pair */
> +asm("v7 = vmem(%0 + #0)\n\t"
> +"v12 = vmem(%1 + #0)\n\t"
> +"{\n\t"
> +"v5:4 = vcombine(v12, v7)\n\t"
> +"vmem(%2 + #0) = v5.new\n\t"
> +"}\n\t"
> +: : "r"(p0), "r"(p1), "r"(pout) : "v4", "v5", "v7", "v12", "memory");
> +
> +expect[0] = buffer1[0];
> +
> +check_output_w(__LINE__, 1);
>  }
> 
>  static void test_max_temps()
> --
> 2.34.1

RE: [PATCH v2 4/9] Hexagon (target/hexagon) Mark has_pred_dest in trans functions

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 4/9] Hexagon (target/hexagon) Mark has_pred_dest in
> trans functions
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Check that the value matches opcode_wregs
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/insn.h | 1 +
>  target/hexagon/decode.c   | 3 +++
>  target/hexagon/gen_trans_funcs.py | 5 +
>  3 files changed, 9 insertions(+)
> 
> diff --git a/target/hexagon/insn.h b/target/hexagon/insn.h
> index a770379958..24dcf7fe9f 100644
> --- a/target/hexagon/insn.h
> +++ b/target/hexagon/insn.h
> @@ -41,6 +41,7 @@ struct Instruction {
>  uint32_t new_value_producer_slot:4;
>  int32_t new_read_idx;
>  int32_t dest_idx;
> +bool has_pred_dest;
> 
>  bool part1;  /*
>* cmp-jumps are split into two insns.
> diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
> index a4d8500fea..84a3899556 100644
> --- a/target/hexagon/decode.c
> +++ b/target/hexagon/decode.c
> @@ -366,6 +366,9 @@ static void decode_shuffle_for_execution(Packet
> *packet)
>  for (flag = false, i = 0; i < last_insn + 1; i++) {
>  int opcode = packet->insn[i].opcode;
> 
> +g_assert(packet->insn[i].has_pred_dest ==
> + (strstr(opcode_wregs[opcode], "Pd4") ||
> +  strstr(opcode_wregs[opcode], "Pe4")));
>  if ((strstr(opcode_wregs[opcode], "Pd4") ||
>   strstr(opcode_wregs[opcode], "Pe4")) &&
>  GET_ATTRIB(opcode, A_STORE) == 0) {
> diff --git a/target/hexagon/gen_trans_funcs.py
> b/target/hexagon/gen_trans_funcs.py
> index 1201172dda..9f86b4edbd 100755
> --- a/target/hexagon/gen_trans_funcs.py
> +++ b/target/hexagon/gen_trans_funcs.py
> @@ -70,6 +70,7 @@ def mark_which_imm_extended(f, tag):
>  ## insn->regno[2] = args->Rt;
>  ## insn->new_read_idx = -1;
>  ## insn->dest_idx = 0;
> +## insn->has_pred_dest = false;
>  ## return true;
>  ## }
>  ##
> @@ -88,6 +89,7 @@ def gen_trans_funcs(f):
> 
>  new_read_idx = -1
>  dest_idx = -1
> +has_pred_dest = "false"
>  for regno, (reg_type, reg_id, *_) in enumerate(regs):
>  reg = hex_common.get_register(tag, reg_type, reg_id)
>  f.write(code_fmt(f"""\
> @@ -98,6 +100,8 @@ def gen_trans_funcs(f):
>  # dest_idx should be the first destination, so check for -1
>  if reg.is_written() and dest_idx == -1:
>  dest_idx = regno
> +if reg_type == "P" and reg.is_written() and not reg.is_read():
> +has_pred_dest = "true"
> 
>  if len(imms) != 0:
>  mark_which_imm_extended(f, tag)
> @@ -121,6 +125,7 @@ def gen_trans_funcs(f):
>  f.write(code_fmt(f"""\
>  insn->new_read_idx = {new_read_idx};
>  insn->dest_idx = {dest_idx};
> +insn->has_pred_dest = {has_pred_dest};
>  """))
>  f.write(textwrap.dedent(f"""\
>  return true;
> --
> 2.34.1

RE: [PATCH v2 6/9] Hexagon (target/hexagon) Remove uses of op_regs_generated.h.inc

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 6/9] Hexagon (target/hexagon) Remove uses of
> op_regs_generated.h.inc
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/opcodes.h|  4 --
>  target/hexagon/decode.c | 57 +++--
>  target/hexagon/mmvec/decode_ext_mmvec.c | 34 +++
>  target/hexagon/opcodes.c| 28 
>  4 files changed, 13 insertions(+), 110 deletions(-)
> 
> diff --git a/target/hexagon/opcodes.h b/target/hexagon/opcodes.h
> index fa7e321950..0ee11bd445 100644
> --- a/target/hexagon/opcodes.h
> +++ b/target/hexagon/opcodes.h
> @@ -40,10 +40,6 @@ typedef enum {
> 
>  extern const char * const opcode_names[];
> 
> -extern const char * const opcode_reginfo[];
> -extern const char * const opcode_rregs[];
> -extern const char * const opcode_wregs[];
> -
>  typedef struct {
>  const char * const encoding;
>  const EncClass enc_class;
> diff --git a/target/hexagon/decode.c b/target/hexagon/decode.c
> index 84a3899556..23deba2426 100644
> --- a/target/hexagon/decode.c
> +++ b/target/hexagon/decode.c
> @@ -115,24 +115,13 @@ static void
>  decode_fill_newvalue_regno(Packet *packet)
>  {
>  int i, use_regidx, offset, def_idx, dst_idx;
> -uint16_t def_opcode, use_opcode;
> -char *dststr;
> 
>  for (i = 1; i < packet->num_insns; i++) {
>  if (GET_ATTRIB(packet->insn[i].opcode, A_DOTNEWVALUE) &&
>  !GET_ATTRIB(packet->insn[i].opcode, A_EXTENSION)) {
> -use_opcode = packet->insn[i].opcode;
> -
> -/* It's a store, so we're adjusting the Nt field */
> -if (GET_ATTRIB(use_opcode, A_STORE)) {
> -use_regidx = strchr(opcode_reginfo[use_opcode], 't') -
> -opcode_reginfo[use_opcode];
> -} else {/* It's a Jump, so we're adjusting the Ns field */
> -use_regidx = strchr(opcode_reginfo[use_opcode], 's') -
> -opcode_reginfo[use_opcode];
> -}
> -g_assert(packet->insn[i].new_read_idx != -1 &&
> - packet->insn[i].new_read_idx == use_regidx);
> +
> +g_assert(packet->insn[i].new_read_idx != -1);
> +use_regidx = packet->insn[i].new_read_idx;
> 
>  /*
>   * What's encoded at the N-field is the offset to who's producing
> @@ -153,39 +142,9 @@ decode_fill_newvalue_regno(Packet *packet)
>   */
>  g_assert(!((def_idx < 0) || (def_idx > (packet->num_insns - 
> 1))));
> 
> -/*
> - * packet->insn[def_idx] is the producer
> - * Figure out which type of destination it produces
> - * and the corresponding index in the reginfo
> - */
> -def_opcode = packet->insn[def_idx].opcode;
> -dststr = strstr(opcode_wregs[def_opcode], "Rd");
> -if (dststr) {
> -dststr = strchr(opcode_reginfo[def_opcode], 'd');
> -} else {
> -dststr = strstr(opcode_wregs[def_opcode], "Rx");
> -if (dststr) {
> -dststr = strchr(opcode_reginfo[def_opcode], 'x');
> -} else {
> -dststr = strstr(opcode_wregs[def_opcode], "Re");
> -if (dststr) {
> -dststr = strchr(opcode_reginfo[def_opcode], 'e');
> -} else {
> -dststr = strstr(opcode_wregs[def_opcode], "Ry");
> -if (dststr) {
> -dststr = strchr(opcode_reginfo[def_opcode], 'y');
> -} else {
> -g_assert_not_reached();
> -}
> -}
> -}
> -}
> -g_assert(dststr != NULL);
> -
>  /* Now patch up the consumer with the register number */
> -dst_idx = dststr - opcode_reginfo[def_opcode];
> -g_assert(packet->insn[def_idx].dest_idx != -1 &&
> - packet->insn[def_idx].dest_idx == dst_idx);
> +g_assert(packet->insn[def_idx].dest_idx != -1);
> +dst_idx = packet->insn[def_idx].dest_idx;
>  packet->insn[i].regno[use_regidx] =
>  packet->insn[def_idx].regno[dst_idx];
>  /*
> @@ -366,11 +325,7 @@ static void decode_shuffle_for_execution(Packet
> *packet)
>  for (flag = false, i = 0; i < last_insn + 1; i++) {
>  int o

RE: [PATCH v2 7/9] Hexagon (target/hexagon) Remove gen_op_regs.py

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 7/9] Hexagon (target/hexagon) Remove gen_op_regs.py
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/README |   1 -
>  target/hexagon/gen_op_regs.py | 125 --
>  target/hexagon/meson.build|  14 +---
>  3 files changed, 2 insertions(+), 138 deletions(-)
>  delete mode 100755 target/hexagon/gen_op_regs.py
> 
> diff --git a/target/hexagon/README b/target/hexagon/README
> index 746ebec378..065c05154d 100644
> --- a/target/hexagon/README
> +++ b/target/hexagon/README
> @@ -43,7 +43,6 @@ target/hexagon/gen_semantics.c.  This step produces
>  That file is consumed by the following python scripts to produce the 
> indicated
>  header files in /target/hexagon
>  gen_opcodes_def.py  -> opcodes_def_generated.h.inc
> -gen_op_regs.py  -> op_regs_generated.h.inc
>  gen_printinsn.py-> printinsn_generated.h.inc
>  gen_op_attribs.py   -> op_attribs_generated.h.inc
>  gen_helper_protos.py-> helper_protos_generated.h.inc
> diff --git a/target/hexagon/gen_op_regs.py b/target/hexagon/gen_op_regs.py
> deleted file mode 100755
> index 7b7b33895a..0000000000
> --- a/target/hexagon/gen_op_regs.py
> +++ /dev/null
> @@ -1,125 +0,0 @@
> -#!/usr/bin/env python3
> -
> -##
> -##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> -##
> -##  This program is free software; you can redistribute it and/or modify
> -##  it under the terms of the GNU General Public License as published by
> -##  the Free Software Foundation; either version 2 of the License, or
> -##  (at your option) any later version.
> -##
> -##  This program is distributed in the hope that it will be useful,
> -##  but WITHOUT ANY WARRANTY; without even the implied warranty of
> -##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> -##  GNU General Public License for more details.
> -##
> -##  You should have received a copy of the GNU General Public License
> -##  along with this program; if not, see <http://www.gnu.org/licenses/>.
> -##
> -
> -import sys
> -import re
> -import string
> -import hex_common
> -
> -
> -##
> -## Generate the register and immediate operands for each instruction
> -##
> -def calculate_regid_reg(tag):
> -def letter_inc(x):
> -return chr(ord(x) + 1)
> -
> -ordered_implregs = ["SP", "FP", "LR"]
> -srcdst_lett = "X"
> -src_lett = "S"
> -dst_lett = "D"
> -retstr = ""
> -mapdict = {}
> -for reg in ordered_implregs:
> -reg_rd = 0
> -reg_wr = 0
> -if ("A_IMPLICIT_WRITES_" + reg) in hex_common.attribdict[tag]:
> -reg_wr = 1
> -if reg_rd and reg_wr:
> -retstr += srcdst_lett
> -mapdict[srcdst_lett] = reg
> -srcdst_lett = letter_inc(srcdst_lett)
> -elif reg_rd:
> -retstr += src_lett
> -mapdict[src_lett] = reg
> -src_lett = letter_inc(src_lett)
> -elif reg_wr:
> -retstr += dst_lett
> -mapdict[dst_lett] = reg
> -dst_lett = letter_inc(dst_lett)
> -return retstr, mapdict
> -
> -
> -def calculate_regid_letters(tag):
> -retstr, mapdict = calculate_regid_reg(tag)
> -return retstr
> -
> -
> -def strip_reg_prefix(x):
> -y = x.replace("UREG.", "")
> -y = y.replace("MREG.", "")
> -return y.replace("GREG.", "")
> -
> -
> -def main():
> -hex_common.read_semantics_file(sys.argv[1])
> -hex_common.read_attribs_file(sys.argv[2])
> -hex_common.init_registers()
> -tagregs = hex_common.get_tagregs(full=True)
> -tagimms = hex_common.get_tagimms()
> -
> -with open(sys.argv[3], "w") as f:
> -for tag in hex_common.tags:
> -regs = tagregs[tag]
> -rregs = []
> -wregs = []
> -regids = ""
> -for regtype, regid, _, numregs in regs:
> -reg = hex_common.get_register(tag, regtype, regid)
> -if reg.is_read():
> -if regid[0] not in regids:
> -regids += regid[0]
> -rregs.append(regtype + regid + numregs)
> -if reg.is_written():
> -wregs.append(regtype + regid + numregs)
> -if regid[0] not in regids:
> -regids += regid[0]
> -for attrib in hex_common.attribdict[tag]:
> -if hex_com

RE: [PATCH v2 9/9] Hexagon (target/hexagon) Remove hex_common.read_attribs_file

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 9/9] Hexagon (target/hexagon) Remove
> hex_common.read_attribs_file
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> The attribinfo data structure is not used
> Adjust the command-line arguments to the python scripts
> Add hex_common.read_common_files for TCG/helper generation scripts
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/gen_analyze_funcs.py | 21 ++-
>  target/hexagon/gen_helper_funcs.py  | 21 ++-
>  target/hexagon/gen_helper_protos.py | 21 ++-
>  target/hexagon/gen_idef_parser_funcs.py |  5 ++--
>  target/hexagon/gen_op_attribs.py|  5 ++--
>  target/hexagon/gen_opcodes_def.py   |  4 +--
>  target/hexagon/gen_printinsn.py |  5 ++--
>  target/hexagon/gen_tcg_func_table.py|  5 ++--
>  target/hexagon/gen_tcg_funcs.py | 21 ++-
>  target/hexagon/hex_common.py| 35 +++--
>  target/hexagon/meson.build  | 31 +++---
>  11 files changed, 54 insertions(+), 120 deletions(-)
> 
> diff --git a/target/hexagon/gen_analyze_funcs.py
> b/target/hexagon/gen_analyze_funcs.py
> index a9af666cef..b73b4e2349 100755
> --- a/target/hexagon/gen_analyze_funcs.py
> +++ b/target/hexagon/gen_analyze_funcs.py
> @@ -1,7 +1,7 @@
>  #!/usr/bin/env python3
> 
>  ##
> -##  Copyright(c) 2022-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> +##  Copyright(c) 2022-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>  ##
>  ##  This program is free software; you can redistribute it and/or modify
>  ##  it under the terms of the GNU General Public License as published by
> @@ -67,24 +67,7 @@ def gen_analyze_func(f, tag, regs, imms):
> 
> 
>  def main():
> -hex_common.read_semantics_file(sys.argv[1])
> -hex_common.read_attribs_file(sys.argv[2])
> -hex_common.read_overrides_file(sys.argv[3])
> -hex_common.read_overrides_file(sys.argv[4])
> -## Whether or not idef-parser is enabled is
> -## determined by the number of arguments to
> -## this script:
> -##
> -##   5 args. -> not enabled,
> -##   6 args. -> idef-parser enabled.
> -##
> -## The 6:th arg. then holds a list of the successfully
> -## parsed instructions.
> -is_idef_parser_enabled = len(sys.argv) > 6
> -if is_idef_parser_enabled:
> -hex_common.read_idef_parser_enabled_file(sys.argv[5])
> -hex_common.calculate_attribs()
> -hex_common.init_registers()
> +hex_common.read_common_files()
>  tagregs = hex_common.get_tagregs()
>  tagimms = hex_common.get_tagimms()
> 
> diff --git a/target/hexagon/gen_helper_funcs.py
> b/target/hexagon/gen_helper_funcs.py
> index 9cc3d69c49..e9685bff2f 100755
> --- a/target/hexagon/gen_helper_funcs.py
> +++ b/target/hexagon/gen_helper_funcs.py
> @@ -1,7 +1,7 @@
>  #!/usr/bin/env python3
> 
>  ##
> -##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> +##  Copyright(c) 2019-2024 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>  ##
>  ##  This program is free software; you can redistribute it and/or modify
>  ##  it under the terms of the GNU General Public License as published by
> @@ -102,24 +102,7 @@ def gen_helper_function(f, tag, tagregs, tagimms):
> 
> 
>  def main():
> -hex_common.read_semantics_file(sys.argv[1])
> -hex_common.read_attribs_file(sys.argv[2])
> -hex_common.read_overrides_file(sys.argv[3])
> -hex_common.read_overrides_file(sys.argv[4])
> -## Whether or not idef-parser is enabled is
> -## determined by the number of arguments to
> -## this script:
> -##
> -##   5 args. -> not enabled,
> -##   6 args. -> idef-parser enabled.
> -##
> -## The 6:th arg. then holds a list of the successfully
> -## parsed instructions.
> -is_idef_parser_enabled = len(sys.argv) > 6
> -if is_idef_parser_enabled:
> -hex_common.read_idef_parser_enabled_file(sys.argv[5])
> -hex_common.calculate_attribs()
> -hex_common.init_registers()
> +hex_common.read_common_files()
>  tagregs = hex_common.get_tagregs()
>  tagimms = hex_common.get_tagimms()
> 
> diff --git a/target/hexagon/gen_helper_protos.py
> b/target/hexagon/gen_helper_protos.py
> index c82b0f54e4..4cc72a1581 100755
> --- a/target/hexagon/gen_helper_protos.py
> +++ b/target/hexagon/gen_helper_protos.py
> @@ -1,7 +1,7 @@
>  #!/usr/bin/env python3
> 
>  ##
> -##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> +##  Copyright(c) 2019-2024

RE: [PATCH v2 8/9] Hexagon (target/hexagon) Remove gen_shortcode.py

2024-03-28 Thread Brian Cain



> -Original Message-
> From: Taylor Simpson 
> Sent: Wednesday, March 6, 2024 9:23 PM
> To: qemu-devel@nongnu.org
> Cc: Brian Cain ; Matheus Bernardino (QUIC)
> ; Sid Manning ;
> Marco Liebel (QUIC) ;
> richard.hender...@linaro.org; phi...@linaro.org; a...@rev.ng; a...@rev.ng;
> ltaylorsimp...@gmail.com
> Subject: [PATCH v2 8/9] Hexagon (target/hexagon) Remove gen_shortcode.py
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> This data structure is not used
> 
> Signed-off-by: Taylor Simpson 
> ---

Reviewed-by: Brian Cain 

>  target/hexagon/opcodes.c|  7 
>  target/hexagon/README   |  1 -
>  target/hexagon/gen_shortcode.py | 63 -
>  target/hexagon/meson.build  | 10 --
>  4 files changed, 81 deletions(-)
>  delete mode 100755 target/hexagon/gen_shortcode.py
> 
> diff --git a/target/hexagon/opcodes.c b/target/hexagon/opcodes.c
> index 02ae9cf787..c8bde2f9e9 100644
> --- a/target/hexagon/opcodes.c
> +++ b/target/hexagon/opcodes.c
> @@ -37,13 +37,6 @@ const char * const opcode_names[] = {
>  };
> 
> 
> -const char * const opcode_short_semantics[] = {
> -#define DEF_SHORTCODE(TAG, SHORTCODE)  [TAG] = #SHORTCODE,
> -#include "shortcode_generated.h.inc"
> -#undef DEF_SHORTCODE
> -NULL
> -};
> -
>  DECLARE_BITMAP(opcode_attribs[XX_LAST_OPCODE], A_ZZ_LASTATTRIB);
> 
>  static void init_attribs(int tag, ...)
> diff --git a/target/hexagon/README b/target/hexagon/README
> index 065c05154d..65b4fcc0fa 100644
> --- a/target/hexagon/README
> +++ b/target/hexagon/README
> @@ -46,7 +46,6 @@ header files in /target/hexagon
>  gen_printinsn.py-> printinsn_generated.h.inc
>  gen_op_attribs.py   -> op_attribs_generated.h.inc
>  gen_helper_protos.py-> helper_protos_generated.h.inc
> -gen_shortcode.py-> shortcode_generated.h.inc
>  gen_tcg_funcs.py-> tcg_funcs_generated.c.inc
>  gen_tcg_func_table.py   -> tcg_func_table_generated.c.inc
>  gen_helper_funcs.py -> helper_funcs_generated.c.inc
> diff --git a/target/hexagon/gen_shortcode.py
> b/target/hexagon/gen_shortcode.py
> deleted file mode 100755
> index deb94446c4..0000000000
> --- a/target/hexagon/gen_shortcode.py
> +++ /dev/null
> @@ -1,63 +0,0 @@
> -#!/usr/bin/env python3
> -
> -##
> -##  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> -##
> -##  This program is free software; you can redistribute it and/or modify
> -##  it under the terms of the GNU General Public License as published by
> -##  the Free Software Foundation; either version 2 of the License, or
> -##  (at your option) any later version.
> -##
> -##  This program is distributed in the hope that it will be useful,
> -##  but WITHOUT ANY WARRANTY; without even the implied warranty of
> -##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> -##  GNU General Public License for more details.
> -##
> -##  You should have received a copy of the GNU General Public License
> -##  along with this program; if not, see <http://www.gnu.org/licenses/>.
> -##
> -
> -import sys
> -import re
> -import string
> -import hex_common
> -
> -
> -def gen_shortcode(f, tag):
> -f.write(f"DEF_SHORTCODE({tag}, {hex_common.semdict[tag]})\n")
> -
> -
> -def main():
> -hex_common.read_semantics_file(sys.argv[1])
> -hex_common.read_attribs_file(sys.argv[2])
> -hex_common.calculate_attribs()
> -tagregs = hex_common.get_tagregs()
> -tagimms = hex_common.get_tagimms()
> -
> -with open(sys.argv[3], "w") as f:
> -f.write("#ifndef DEF_SHORTCODE\n")
> -f.write("#define DEF_SHORTCODE(TAG,SHORTCODE)/* Nothing */\n")
> -f.write("#endif\n")
> -
> -for tag in hex_common.tags:
> -## Skip the priv instructions
> -if "A_PRIV" in hex_common.attribdict[tag]:
> -continue
> -## Skip the guest instructions
> -if "A_GUEST" in hex_common.attribdict[tag]:
> -continue
> -## Skip the diag instructions
> -if tag == "Y6_diag":
> -continue
> -if tag == "Y6_diag0":
> -continue
> -if tag == "Y6_diag1":
> -continue
> -
> -gen_shortcode(f, tag)
> -
> -f.write("#undef DEF_SHORTCODE\n")
> -
> -
> -if __name__ == "__main__":
> -main()
> diff --git a/target/hexagon/meson.build b/target/hexagon/meson.build
> index b3a0944d3b..988e7489ba 100644
> --- a/target/hexagon/meson.build
> +++ b/target/hexagon/meson.build
> @@ -42,21 +42,11 @@ hexagon_ss.add(semantics_generated)
>  #
>  # Step 2
>  # We use Python scripts to generate the following files
> -# shortcode_generated.h.inc
>  # tcg_func_table_generated.c.inc
>  # printinsn_generated.h.inc
>

Re: [PATCH v8 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-28 Thread Huang, Ying

"Ho-Ren (Jack) Chuang"  writes:

> The current implementation treats emulated memory devices, such as
> CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
> (E820_TYPE_RAM). However, these emulated devices have different
> characteristics than traditional DRAM, making it important to
> distinguish them. Thus, we modify the tiered memory initialization process
> to introduce a delay specifically for CPUless NUMA nodes. This delay
> ensures that the memory tier initialization for these nodes is deferred
> until HMAT information is obtained during the boot process. Finally,
> demotion tables are recalculated at the end.
>
> * late_initcall(memory_tier_late_init);
> Some device drivers may have initialized memory tiers between
> `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
> online memory nodes and configuring memory tiers. They should be excluded
> in the late init.
>
> * Handle cases where there is no HMAT when creating memory tiers
> There is a scenario where a CPUless node does not provide HMAT information.
> If no HMAT is specified, it falls back to using the default DRAM tier.
>
> * Introduce another new lock `default_dram_perf_lock` for adist calculation
> In the current implementation, iterating through CPUless nodes requires
> holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
> trying to acquire the same lock, leading to a potential deadlock.
> Therefore, we propose introducing a standalone `default_dram_perf_lock` to
> protect `default_dram_perf_*`. This approach not only avoids deadlock
> but also prevents holding a large lock simultaneously.
>
> * Upgrade `set_node_memory_tier` to support additional cases, including
>   default DRAM, late CPUless, and hot-plugged initializations.
> To cover hot-plugged memory nodes, `mt_calc_adistance()` and
> `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
> handle cases where memtype is not initialized and where HMAT information is
> available.
>
> * Introduce `default_memory_types` for those memory types that are not
>   initialized by device drivers.
> Because late initialized memory and default DRAM memory need to be managed,
> a default memory type is created for storing all memory types that are
> not initialized by device drivers and as a fallback.
>
> Signed-off-by: Ho-Ren (Jack) Chuang 
> Signed-off-by: Hao Xiang 
> Reviewed-by: "Huang, Ying" 
> ---
>  mm/memory-tiers.c | 94 +++
>  1 file changed, 78 insertions(+), 16 deletions(-)
>
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 974af10cfdd8..e24fc3bebae4 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -36,6 +36,11 @@ struct node_memory_type_map {
>  
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
> +/*
> + * The list is used to store all memory types that are not created
> + * by a device driver.
> + */
> +static LIST_HEAD(default_memory_types);
>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  struct memory_dev_type *default_dram_type;
>  
> @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
>  
>  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
>  
> +/* The lock is used to protect `default_dram_perf*` info and nid. */
> +static DEFINE_MUTEX(default_dram_perf_lock);
>  static bool default_dram_perf_error;
>  static struct access_coordinate default_dram_perf;
>  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, 
> struct memory_dev_type *mem
>  static struct memory_tier *set_node_memory_tier(int node)
>  {
>   struct memory_tier *memtier;
> - struct memory_dev_type *memtype;
> + struct memory_dev_type *mtype = default_dram_type;
> + int adist = MEMTIER_ADISTANCE_DRAM;
>   pg_data_t *pgdat = NODE_DATA(node);
>  
>  
> @@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int 
> node)
>   if (!node_state(node, N_MEMORY))
>   return ERR_PTR(-EINVAL);
>  
> - __init_node_memory_type(node, default_dram_type);
> + mt_calc_adistance(node, &adist);
> + if (node_memory_types[node].memtype == NULL) {
> + mtype = mt_find_alloc_memory_type(adist, &default_memory_types);
> + if (IS_ERR(mtype)) {
> + mtype = default_dram_type;
> + pr_info("Failed to allocate a memory type. Fall 
> back.\n");
> + }
> + }
> +
> + __init_node_memory_type(node, mtype);
>  
> - memtype = node_memory_types[node].memtype;
> - node_set(node, memtype->nodes);
> - memtier = find_create_memory_tier(memtype);
> + mtype = node_memory_types[node].memtype;
> + node_set(node, mtype->nodes);
> + memtier = find_create_memory_tier(mtype);
>   if (!IS_ERR(memtier))
>   rcu_assign_pointer(pgdat->memtier, memtier);
>   return me

[PATCH v8 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-03-28 Thread Ho-Ren (Jack) Chuang

The current implementation treats emulated memory devices, such as
CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory
(E820_TYPE_RAM). However, these emulated devices have different
characteristics than traditional DRAM, making it important to
distinguish them. Thus, we modify the tiered memory initialization process
to introduce a delay specifically for CPUless NUMA nodes. This delay
ensures that the memory tier initialization for these nodes is deferred
until HMAT information is obtained during the boot process. Finally,
demotion tables are recalculated at the end.

* late_initcall(memory_tier_late_init);
Some device drivers may have initialized memory tiers between
`memory_tier_init()` and `memory_tier_late_init()`, potentially bringing
online memory nodes and configuring memory tiers. They should be excluded
in the late init.

* Handle cases where there is no HMAT when creating memory tiers
There is a scenario where a CPUless node does not provide HMAT information.
If no HMAT is specified, it falls back to using the default DRAM tier.

* Introduce another new lock `default_dram_perf_lock` for adist calculation
In the current implementation, iterating through CPUless nodes requires
holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up
trying to acquire the same lock, leading to a potential deadlock.
Therefore, we propose introducing a standalone `default_dram_perf_lock` to
protect `default_dram_perf_*`. This approach not only avoids deadlock
but also prevents holding a large lock simultaneously.

* Upgrade `set_node_memory_tier` to support additional cases, including
  default DRAM, late CPUless, and hot-plugged initializations.
To cover hot-plugged memory nodes, `mt_calc_adistance()` and
`mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to
handle cases where memtype is not initialized and where HMAT information is
available.

* Introduce `default_memory_types` for those memory types that are not
  initialized by device drivers.
Because late initialized memory and default DRAM memory need to be managed,
a default memory type is created for storing all memory types that are
not initialized by device drivers and as a fallback.

Signed-off-by: Ho-Ren (Jack) Chuang 
Signed-off-by: Hao Xiang 
Reviewed-by: "Huang, Ying" 
---
 mm/memory-tiers.c | 94 +++
 1 file changed, 78 insertions(+), 16 deletions(-)

diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 974af10cfdd8..e24fc3bebae4 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -36,6 +36,11 @@ struct node_memory_type_map {
 
 static DEFINE_MUTEX(memory_tier_lock);
 static LIST_HEAD(memory_tiers);
+/*
+ * The list is used to store all memory types that are not created
+ * by a device driver.
+ */
+static LIST_HEAD(default_memory_types);
 static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
 struct memory_dev_type *default_dram_type;
 
@@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
 
 static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
 
+/* The lock is used to protect `default_dram_perf*` info and nid. */
+static DEFINE_MUTEX(default_dram_perf_lock);
 static bool default_dram_perf_error;
 static struct access_coordinate default_dram_perf;
 static int default_dram_perf_ref_nid = NUMA_NO_NODE;
@@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, struct 
memory_dev_type *mem
 static struct memory_tier *set_node_memory_tier(int node)
 {
struct memory_tier *memtier;
-   struct memory_dev_type *memtype;
+   struct memory_dev_type *mtype = default_dram_type;
+   int adist = MEMTIER_ADISTANCE_DRAM;
pg_data_t *pgdat = NODE_DATA(node);
 
 
@@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int node)
if (!node_state(node, N_MEMORY))
return ERR_PTR(-EINVAL);
 
-   __init_node_memory_type(node, default_dram_type);
+   mt_calc_adistance(node, &adist);
+   if (node_memory_types[node].memtype == NULL) {
+   mtype = mt_find_alloc_memory_type(adist, &default_memory_types);
+   if (IS_ERR(mtype)) {
+   mtype = default_dram_type;
+   pr_info("Failed to allocate a memory type. Fall 
back.\n");
+   }
+   }
+
+   __init_node_memory_type(node, mtype);
 
-   memtype = node_memory_types[node].memtype;
-   node_set(node, memtype->nodes);
-   memtier = find_create_memory_tier(memtype);
+   mtype = node_memory_types[node].memtype;
+   node_set(node, mtype->nodes);
+   memtier = find_create_memory_tier(mtype);
if (!IS_ERR(memtier))
rcu_assign_pointer(pgdat->memtier, memtier);
return memtier;
@@ -655,6 +672,34 @@ void mt_put_memory_types(struct list_head *memory_types)
 }
 EXPORT_SYMBOL_GPL(mt_put_memory_types);
 
+/*
+ * This is invoked via `late_initcall()` to initialize memory

[PATCH v8 1/2] memory tier: dax/kmem: introduce an abstract layer for finding, allocating, and putting memory types

2024-03-28 Thread Ho-Ren (Jack) Chuang

Since different memory devices require finding, allocating, and putting
memory types, these common steps are abstracted in this patch,
enhancing the scalability and conciseness of the code.

Signed-off-by: Ho-Ren (Jack) Chuang 
Reviewed-by: "Huang, Ying" 
---
 drivers/dax/kmem.c   | 20 ++--
 include/linux/memory-tiers.h | 13 +
 mm/memory-tiers.c| 32 
 3 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 42ee360cf4e3..01399e5b53b2 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -55,21 +55,10 @@ static LIST_HEAD(kmem_memory_types);
 
 static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
 {
-   bool found = false;
struct memory_dev_type *mtype;
 
mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry(mtype, &kmem_memory_types, list) {
-   if (mtype->adistance == adist) {
-   found = true;
-   break;
-   }
-   }
-   if (!found) {
-   mtype = alloc_memory_type(adist);
-   if (!IS_ERR(mtype))
-   list_add(&mtype->list, &kmem_memory_types);
-   }
+   mtype = mt_find_alloc_memory_type(adist, &kmem_memory_types);
mutex_unlock(&kmem_memory_type_lock);
 
return mtype;
@@ -77,13 +66,8 @@ static struct memory_dev_type 
*kmem_find_alloc_memory_type(int adist)
 
 static void kmem_put_memory_types(void)
 {
-   struct memory_dev_type *mtype, *mtn;
-
mutex_lock(&kmem_memory_type_lock);
-   list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
-   list_del(&mtype->list);
-   put_memory_type(mtype);
-   }
+   mt_put_memory_types(&kmem_memory_types);
mutex_unlock(&kmem_memory_type_lock);
 }
 
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 69e781900082..a44c03c2ba3a 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
 int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 const char *source);
 int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+   struct list_head 
*memory_types);
+void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct 
access_coordinate *perf, int *adis
 {
return -EIO;
 }
+
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head 
*memory_types)
+{
+   return NULL;
+}
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+
+}
 #endif /* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5..974af10cfdd8 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -623,6 +623,38 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head 
*memory_types)
+{
+   bool found = false;
+   struct memory_dev_type *mtype;
+
+   list_for_each_entry(mtype, memory_types, list) {
+   if (mtype->adistance == adist) {
+   found = true;
+   break;
+   }
+   }
+   if (!found) {
+   mtype = alloc_memory_type(adist);
+   if (!IS_ERR(mtype))
+   list_add(&mtype->list, memory_types);
+   }
+
+   return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+   struct memory_dev_type *mtype, *mtn;
+
+   list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+   list_del(&mtype->list);
+   put_memory_type(mtype);
+   }
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
 static void dump_hmem_attrs(struct access_coordinate *coord, const char 
*prefix)
 {
pr_info(
-- 
Ho-Ren (Jack) Chuang

[PATCH v8 0/2] Improved Memory Tier Creation for CPUless NUMA Nodes

2024-03-28 Thread Ho-Ren (Jack) Chuang

When a memory device, such as CXL1.1 type3 memory, is emulated as
normal memory (E820_TYPE_RAM), the memory device is indistinguishable
from normal DRAM in terms of memory tiering with the current implementation.
The current memory tiering assigns all detected normal memory nodes
to the same DRAM tier. This results in normal memory devices with
different attributes being unable to be assigned to the correct memory tier,
leading to the inability to migrate pages between different types of memory.
https://lore.kernel.org/linux-mm/ph0pr08mb7955e9f08ccb64f23963b5c3a8...@ph0pr08mb7955.namprd08.prod.outlook.com/T/

This patchset automatically resolves the issues. It delays the initialization
of memory tiers for CPUless NUMA nodes until they obtain HMAT information
and after all devices are initialized at boot time, eliminating the need
for user intervention. If no HMAT is specified, it falls back to
using `default_dram_type`.

Example usecase:
We have CXL memory on the host, and we create VMs with a new system memory
device backed by host CXL memory. We inject CXL memory performance attributes
through QEMU, and the guest now sees memory nodes with performance attributes
in HMAT. With this change, we enable the guest kernel to construct
the correct memory tiering for the memory nodes.

-v8:
 * Fix email format
-v7:
 * Add Reviewed-by: Huang, Ying 
-v6:
 Thanks to Ying's comments,
 * Move `default_dram_perf_lock` to the function's beginning for clarity
 * Fix double unlocking at v5
 * 
https://lore.kernel.org/lkml/20240327072729.3381685-1-horenchu...@bytedance.com/T/#u
-v5:
 Thanks to Ying's comments,
 * Add comments about what is protected by `default_dram_perf_lock`
 * Fix an uninitialized pointer mtype
 * Slightly shorten the time holding `default_dram_perf_lock`
 * Fix a deadlock bug in `mt_perf_to_adistance`
 * 
https://lore.kernel.org/lkml/20240327041646.3258110-1-horenchu...@bytedance.com/T/#u
-v4:
 Thanks to Ying's comments,
 * Remove redundant code
 * Reorganize patches accordingly
 * 
https://lore.kernel.org/lkml/20240322070356.315922-1-horenchu...@bytedance.com/T/#u
-v3:
 Thanks to Ying's comments,
 * Make the newly added code independent of HMAT
 * Upgrade set_node_memory_tier to support more cases
 * Put all non-driver-initialized memory types into default_memory_types
   instead of using hmat_memory_types
 * find_alloc_memory_type -> mt_find_alloc_memory_type
 * 
https://lore.kernel.org/lkml/20240320061041.3246828-1-horenchu...@bytedance.com/T/#u
-v2:
 Thanks to Ying's comments,
 * Rewrite cover letter & patch description
 * Rename functions, don't use _hmat
 * Abstract common functions into find_alloc_memory_type()
 * Use the expected way to use set_node_memory_tier instead of modifying it
 * 
https://lore.kernel.org/lkml/20240312061729.1997111-1-horenchu...@bytedance.com/T/#u
-v1:
 * 
https://lore.kernel.org/lkml/20240301082248.3456086-1-horenchu...@bytedance.com/T/#u


Ho-Ren (Jack) Chuang (2):
  memory tier: dax/kmem: introduce an abstract layer for finding,
allocating, and putting memory types
  memory tier: create CPUless memory tiers after obtaining HMAT info

 drivers/dax/kmem.c   |  20 +-
 include/linux/memory-tiers.h |  13 
 mm/memory-tiers.c| 126 ++-
 3 files changed, 125 insertions(+), 34 deletions(-)

-- 
Ho-Ren (Jack) Chuang

[PATCH v7 0/2] Improved Memory Tier Creation for CPUless NUMA Nodes

2024-03-28 Thread Ho-Ren (Jack) Chuang

When a memory device, such as CXL1.1 type3 memory, is emulated as
normal memory (E820_TYPE_RAM), the memory device is indistinguishable
from normal DRAM in terms of memory tiering with the current implementation.
The current memory tiering assigns all detected normal memory nodes
to the same DRAM tier. This results in normal memory devices with
different attributes being unable to be assigned to the correct memory tier,
leading to the inability to migrate pages between different types of memory.
https://lore.kernel.org/linux-mm/ph0pr08mb7955e9f08ccb64f23963b5c3a8...@ph0pr08mb7955.namprd08.prod.outlook.com/T/

This patchset automatically resolves the issues. It delays the initialization
of memory tiers for CPUless NUMA nodes until they obtain HMAT information
and after all devices are initialized at boot time, eliminating the need
for user intervention. If no HMAT is specified, it falls back to
using `default_dram_type`.

Example usecase:
We have CXL memory on the host, and we create VMs with a new system memory
device backed by host CXL memory. We inject CXL memory performance attributes
through QEMU, and the guest now sees memory nodes with performance attributes
in HMAT. With this change, we enable the guest kernel to construct
the correct memory tiering for the memory nodes.

-v7:
 * Add Reviewed-by: Huang, Ying 
-v6:
 Thanks to Ying's comments,
 * Move `default_dram_perf_lock` to the function's beginning for clarity
 * Fix double unlocking at v5
 * 
https://lore.kernel.org/lkml/20240327072729.3381685-1-horenchu...@bytedance.com/T/#u
-v5:
 Thanks to Ying's comments,
 * Add comments about what is protected by `default_dram_perf_lock`
 * Fix an uninitialized pointer mtype
 * Slightly shorten the time holding `default_dram_perf_lock`
 * Fix a deadlock bug in `mt_perf_to_adistance`
 * 
https://lore.kernel.org/lkml/20240327041646.3258110-1-horenchu...@bytedance.com/T/#u
-v4:
 Thanks to Ying's comments,
 * Remove redundant code
 * Reorganize patches accordingly
 * 
https://lore.kernel.org/lkml/20240322070356.315922-1-horenchu...@bytedance.com/T/#u
-v3:
 Thanks to Ying's comments,
 * Make the newly added code independent of HMAT
 * Upgrade set_node_memory_tier to support more cases
 * Put all non-driver-initialized memory types into default_memory_types
   instead of using hmat_memory_types
 * find_alloc_memory_type -> mt_find_alloc_memory_type
 * 
https://lore.kernel.org/lkml/20240320061041.3246828-1-horenchu...@bytedance.com/T/#u
-v2:
 Thanks to Ying's comments,
 * Rewrite cover letter & patch description
 * Rename functions, don't use _hmat
 * Abstract common functions into find_alloc_memory_type()
 * Use the expected way to use set_node_memory_tier instead of modifying it
 * 
https://lore.kernel.org/lkml/20240312061729.1997111-1-horenchu...@bytedance.com/T/#u
-v1:
 * 
https://lore.kernel.org/lkml/20240301082248.3456086-1-horenchu...@bytedance.com/T/#u

Ho-Ren (Jack) Chuang (2):
  memory tier: dax/kmem: introduce an abstract layer for finding,
allocating, and putting memory types
  memory tier: create CPUless memory tiers after obtaining HMAT info

 drivers/dax/kmem.c   |  20 +-
 include/linux/memory-tiers.h |  13 
 mm/memory-tiers.c| 126 ++-
 3 files changed, 125 insertions(+), 34 deletions(-)

-- 
Ho-Ren (Jack) Chuang

Re: [PATCH-for-9.1 05/21] target/m68k: Replace qemu_printf() by monitor_printf() in monitor

2024-03-28 Thread BALATON Zoltan


On Thu, 28 Mar 2024, Dr. David Alan Gilbert wrote:

* BALATON Zoltan (bala...@eik.bme.hu) wrote:

On Sun, 24 Mar 2024, Dr. David Alan Gilbert wrote:

* Philippe Mathieu-Daudé (phi...@linaro.org) wrote:

Replace qemu_printf() by monitor_printf() / monitor_puts() in monitor.

Signed-off-by: Philippe Mathieu-Daudé 
---
 target/m68k/cpu.h |   2 +-
 target/m68k/helper.c  | 126 +-
 target/m68k/monitor.c |   4 +-
 3 files changed, 67 insertions(+), 65 deletions(-)

diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
index 346427e144..4e4307956d 100644
--- a/target/m68k/cpu.h
+++ b/target/m68k/cpu.h
@@ -620,6 +620,6 @@ static inline void cpu_get_tb_cpu_state(CPUM68KState *env, 
vaddr *pc,
 }
 }

-void dump_mmu(CPUM68KState *env);
+void dump_mmu(Monitor *mon, CPUM68KState *env);

 #endif
diff --git a/target/m68k/helper.c b/target/m68k/helper.c
index 1a475f082a..310e26dfa1 100644
--- a/target/m68k/helper.c
+++ b/target/m68k/helper.c
@@ -25,7 +25,7 @@
 #include "exec/helper-proto.h"
 #include "gdbstub/helpers.h"
 #include "fpu/softfloat.h"
-#include "qemu/qemu-print.h"
+#include "monitor/monitor.h"

 #define SIGNBIT (1u << 31)

@@ -455,28 +455,30 @@ void m68k_switch_sp(CPUM68KState *env)
 #if !defined(CONFIG_USER_ONLY)
 /* MMU: 68040 only */

-static void print_address_zone(uint32_t logical, uint32_t physical,
+static void print_address_zone(Monitor *mon,
+   uint32_t logical, uint32_t physical,
uint32_t size, int attr)
 {
-qemu_printf("%08x - %08x -> %08x - %08x %c ",
-logical, logical + size - 1,
-physical, physical + size - 1,
-attr & 4 ? 'W' : '-');
+monitor_printf(mon, "%08x - %08x -> %08x - %08x %c ",
+   logical, logical + size - 1,
+   physical, physical + size - 1,
+   attr & 4 ? 'W' : '-');
 size >>= 10;
 if (size < 1024) {
-qemu_printf("(%d KiB)\n", size);
+monitor_printf(mon, "(%d KiB)\n", size);
 } else {
 size >>= 10;
 if (size < 1024) {
-qemu_printf("(%d MiB)\n", size);
+monitor_printf(mon, "(%d MiB)\n", size);
 } else {
 size >>= 10;
-qemu_printf("(%d GiB)\n", size);
+monitor_printf(mon, "(%d GiB)\n", size);
 }
 }
 }

-static void dump_address_map(CPUM68KState *env, uint32_t root_pointer)
+static void dump_address_map(Monitor *mon, CPUM68KState *env,
+ uint32_t root_pointer)
 {
 int i, j, k;
 int tic_size, tic_shift;
@@ -545,7 +547,7 @@ static void dump_address_map(CPUM68KState *env, uint32_t 
root_pointer)
 if (first_logical != 0x) {
 size = last_logical + (1 << tic_shift) -
first_logical;
-print_address_zone(first_logical,
+print_address_zone(mon, first_logical,
first_physical, size, last_attr);
 }
 first_logical = logical;
@@ -556,125 +558,125 @@ static void dump_address_map(CPUM68KState *env, 
uint32_t root_pointer)
 }
 if (first_logical != logical || (attr & 4) != (last_attr & 4)) {
 size = logical + (1 << tic_shift) - first_logical;
-print_address_zone(first_logical, first_physical, size, last_attr);
+print_address_zone(mon, first_logical, first_physical, size, 
last_attr);
 }
 }

 #define DUMP_CACHEFLAGS(a) \
 switch (a & M68K_DESC_CACHEMODE) { \
 case M68K_DESC_CM_WRTHRU: /* cacheable, write-through */ \
-qemu_printf("T"); \
+monitor_puts(mon, "T"); \
 break; \
 case M68K_DESC_CM_COPYBK: /* cacheable, copyback */ \
-qemu_printf("C"); \
+monitor_puts(mon, "C"); \
 break; \
 case M68K_DESC_CM_SERIAL: /* noncachable, serialized */ \
-qemu_printf("S"); \
+monitor_puts(mon, "S"); \
 break; \
 case M68K_DESC_CM_NCACHE: /* noncachable */ \
-qemu_printf("N"); \
+monitor_puts(mon, "N"); \
 break; \
 }

-static void dump_ttr(uint32_t ttr)
+static void dump_ttr(Monitor *mon, uint32_t ttr)
 {
 if ((ttr & M68K_TTR_ENABLED) == 0) {
-qemu_printf("disabled\n");
+monitor_puts(mon, "disabled\n");
 return;
 }
-qemu_printf("Base: 0x%08x Mask: 0x%08x Control: ",
-ttr & M68K_TTR_ADDR_BASE,
-(ttr & M68K_TTR_ADDR_MASK) << M68K_TTR_ADDR_MASK_SHIFT);
+monitor_printf(mon, "Base: 0x%08x Mask: 0x%08x Control: ",
+   ttr & M68K_TTR_ADDR_BASE,
+   (ttr & M68K_TTR_ADDR_MASK) << M68K_TTR_ADDR_MASK_SHIFT);
 switch (ttr & M68K_TTR_SFIELD) {
 case M68K_TTR_SFIELD_USER:
-qemu_printf("U");
+monitor_puts(mon, "U");
 break;
 case M68K_TTR_SFIELD_SUPER:
-qemu

Re: [PATCH-for-9.1 05/21] target/m68k: Replace qemu_printf() by monitor_printf() in monitor

2024-03-28 Thread Dr. David Alan Gilbert

* BALATON Zoltan (bala...@eik.bme.hu) wrote:
> On Sun, 24 Mar 2024, Dr. David Alan Gilbert wrote:
> > * Philippe Mathieu-Daudé (phi...@linaro.org) wrote:
> > > Replace qemu_printf() by monitor_printf() / monitor_puts() in monitor.
> > > 
> > > Signed-off-by: Philippe Mathieu-Daudé 
> > > ---
> > >  target/m68k/cpu.h |   2 +-
> > >  target/m68k/helper.c  | 126 +-
> > >  target/m68k/monitor.c |   4 +-
> > >  3 files changed, 67 insertions(+), 65 deletions(-)
> > > 
> > > diff --git a/target/m68k/cpu.h b/target/m68k/cpu.h
> > > index 346427e144..4e4307956d 100644
> > > --- a/target/m68k/cpu.h
> > > +++ b/target/m68k/cpu.h
> > > @@ -620,6 +620,6 @@ static inline void cpu_get_tb_cpu_state(CPUM68KState 
> > > *env, vaddr *pc,
> > >  }
> > >  }
> > > 
> > > -void dump_mmu(CPUM68KState *env);
> > > +void dump_mmu(Monitor *mon, CPUM68KState *env);
> > > 
> > >  #endif
> > > diff --git a/target/m68k/helper.c b/target/m68k/helper.c
> > > index 1a475f082a..310e26dfa1 100644
> > > --- a/target/m68k/helper.c
> > > +++ b/target/m68k/helper.c
> > > @@ -25,7 +25,7 @@
> > >  #include "exec/helper-proto.h"
> > >  #include "gdbstub/helpers.h"
> > >  #include "fpu/softfloat.h"
> > > -#include "qemu/qemu-print.h"
> > > +#include "monitor/monitor.h"
> > > 
> > >  #define SIGNBIT (1u << 31)
> > > 
> > > @@ -455,28 +455,30 @@ void m68k_switch_sp(CPUM68KState *env)
> > >  #if !defined(CONFIG_USER_ONLY)
> > >  /* MMU: 68040 only */
> > > 
> > > -static void print_address_zone(uint32_t logical, uint32_t physical,
> > > +static void print_address_zone(Monitor *mon,
> > > +   uint32_t logical, uint32_t physical,
> > > uint32_t size, int attr)
> > >  {
> > > -qemu_printf("%08x - %08x -> %08x - %08x %c ",
> > > -logical, logical + size - 1,
> > > -physical, physical + size - 1,
> > > -attr & 4 ? 'W' : '-');
> > > +monitor_printf(mon, "%08x - %08x -> %08x - %08x %c ",
> > > +   logical, logical + size - 1,
> > > +   physical, physical + size - 1,
> > > +   attr & 4 ? 'W' : '-');
> > >  size >>= 10;
> > >  if (size < 1024) {
> > > -qemu_printf("(%d KiB)\n", size);
> > > +monitor_printf(mon, "(%d KiB)\n", size);
> > >  } else {
> > >  size >>= 10;
> > >  if (size < 1024) {
> > > -qemu_printf("(%d MiB)\n", size);
> > > +monitor_printf(mon, "(%d MiB)\n", size);
> > >  } else {
> > >  size >>= 10;
> > > -qemu_printf("(%d GiB)\n", size);
> > > +monitor_printf(mon, "(%d GiB)\n", size);
> > >  }
> > >  }
> > >  }
> > > 
> > > -static void dump_address_map(CPUM68KState *env, uint32_t root_pointer)
> > > +static void dump_address_map(Monitor *mon, CPUM68KState *env,
> > > + uint32_t root_pointer)
> > >  {
> > >  int i, j, k;
> > >  int tic_size, tic_shift;
> > > @@ -545,7 +547,7 @@ static void dump_address_map(CPUM68KState *env, 
> > > uint32_t root_pointer)
> > >  if (first_logical != 0x) {
> > >  size = last_logical + (1 << tic_shift) -
> > > first_logical;
> > > -print_address_zone(first_logical,
> > > +print_address_zone(mon, first_logical,
> > > first_physical, size, 
> > > last_attr);
> > >  }
> > >  first_logical = logical;
> > > @@ -556,125 +558,125 @@ static void dump_address_map(CPUM68KState *env, 
> > > uint32_t root_pointer)
> > >  }
> > >  if (first_logical != logical || (attr & 4) != (last_attr & 4)) {
> > >  size = logical + (1 << tic_shift) - first_logical;
> > > -print_address_zone(first_logical, first_physical, size, 
> > > last_attr);
> > > +print_address_zone(mon, first_logical, first_physical, size, 
> > > last_attr);
> > >  }
> > >  }
> > > 
> > >  #define DUMP_CACHEFLAGS(a) \
> > >  switch (a & M68K_DESC_CACHEMODE) { \
> > >  case M68K_DESC_CM_WRTHRU: /* cacheable, write-through */ \
> > > -qemu_printf("T"); \
> > > +monitor_puts(mon, "T"); \
> > >  break; \
> > >  case M68K_DESC_CM_COPYBK: /* cacheable, copyback */ \
> > > -qemu_printf("C"); \
> > > +monitor_puts(mon, "C"); \
> > >  break; \
> > >  case M68K_DESC_CM_SERIAL: /* noncachable, serialized */ \
> > > -qemu_printf("S"); \
> > > +monitor_puts(mon, "S"); \
> > >  break; \
> > >  case M68K_DESC_CM_NCACHE: /* noncachable */ \
> > > -qemu_printf("N"); \
> > > +monitor_puts(mon, "N"); \
> > >  break; \
> > >  }
> > > 
> > > -static void dump_ttr(uint32_t ttr)
> > > +static void dump_ttr(Monitor *mon, uint32_t ttr)
> > >  {
> > >  if ((tt

Re: Qemu Display Coacoa Patch Serie Qemu 9.0 RC1

2024-03-28 Thread BALATON Zoltan


On Thu, 28 Mar 2024, Rene Engel wrote:

I wanted to discuss this topic with you again, there was already a patch series 
that worked well under Qemu with
Pegasos2/AmigaOneXe/Same460 and AmigaOs4.1. The option zoom-to-fit=on should be 
used to adjust all resolutions provided by the guest
system to the aspect ratio if there are no Virtio GPU drivers available that 
allow this. 

In my opinion exactly this option zoom-to-fit=on makes this possible. If you 
don't want to use this option you still have the possibility
to deactivate it. In Qemu 9.0 RC1 not all resolutions are stretched like in 
previous patches e.g. 640x480/800x600/1024x720 etc. but this
is exactly what we need for the Pegasos2/AmigaOneXe/Same460 machine with 
AmigaOs4.1. 


There seems to be a bit of confusion about how this zoom-to-fit option is 
implemented by different -display backends and I'm not sure what is the 
intended behaviour or how other -display backends handle it. Maybe a 
single option is not even enough to describe all possible preferences so 
another one i.e. keep-aspect=true|false may also be needed to cover all 
possible settings (don't zoom, zoom with aspect ratio kept, zoom to fit 
window even if that stretches the picture out of aspect ratio). For 9.0 
ptobsbly we should go for consistency with other backends now as adding 
new options is not possible during freeze and then resolve this 
afterwards.


There are also problems within the resolutions with the mouse pointer 
where the screen output flickers it currently affects all patch series. 
I would be happy if we could find a solution for all this. 


The flicker may be due to the resize algorithm used by macOS not giving 
the same result always. To resolve it maybe yet another option may be 
needed to not zoom to full available window but try to keep the zoom 
factor some integer value to avoid fractional scaling but I'm not sure 
that's the best way to solve it.


Regards,
BALATON Zoltan


I'll leave you 2 videos so you can decide for yourself what would make the most 
sense. It shows once for me the working zoom behavior
which works very well and the behavior with Qemu 9.0Rc1 including new Cocoa 
patches. 

Qemu zoom-to fit=on for all Screenmodes working: 
https://www.youtube.com/watch?v=dnJ3W8egAFY

Qemu 9.0. RC1 zoom-to fit=on not working for all Screenmodes: 
https://www.youtube.com/watch?v=Ddq68ViudrA

Re: [PATCH 2/3] target/hppa: mask offset bits in gva

2024-03-28 Thread Sven Schnelle

Richard Henderson  writes:

> On 3/23/24 22:09, Sven Schnelle wrote:
>> The CPU seems to mask a few bits in the offset when running
>> under HP-UX. ISR/IOR register contents for an address in
>> the processor HPA (0xfffa) on my C8000 and J6750:
>> running on Linux: 3fff c000fffa0500
>> running on HP-UX: 301f c000fffa0500
>> I haven't found how this is switched (guess some diag in the
>> firmware), but linux + seabios seems to handle that as well,
>> so lets mask out the additional bits.
>> Signed-off-by: Sven Schnelle 
>> [..]
> [..]
> Though my argument would suggest the mask should be 0xff for the
> 40-bit physical address, which is not what you see at all, so perhaps
> the thing is moot.  I am at a loss to explain why or how HP-UX gets a
> 7-bit hole in the ISR result.
>
> On the other hand, there are some not-well-documented shenanigans (aka
> implementation defined behaviour) between Figure H-8 and Figure H-11,
> where the 62-bit absolute address is expanded to a 64-bit logical
> physical address and then compacted to a 40-bit implementation
> physical address.
>
> We've already got hacks in place for this in hppa_abs_to_phys_pa2_w1,
> which just truncates everything down to 40 bits.  But that's probably
> not what the processor is really doing.

I looked into this again, and it's caused by Space-ID hashing. HP-UX asks
PDC/Firmware how many bits are used for the hashing. seabios returns
zero, in which case HP-UX uses a default mask of 0xf01f.
By modifying seabios, i can make HP-UX use the appropriate mask, but
switching off SpaceID hashing entirely is impossible. The reason why
the CPU doesn't strip the bits when running linux is that Linux switches
off Space-ID hashing early in the startup code (before mm gets
initialized).

My J6750 Firmware only returns two values: 0 when Space-ID hashing is
off, 0xfe0 when it is enabled. This is hardcoded in the firmware - the
only thing PDC checks is a bit in Debug Register 2, which enables
Space-ID hashing. 0xfe0 matches the 0xf01f... mask used by HP-UX
pretty well.

So if qemu wants to run 64 Bit HP-UX the proper way, i guess it needs
to implement Space-ID hashing.

[PATCH] gpio/pca955x: Update maintainer email address

2024-03-28 Thread Glenn Miles

It was noticed that my linux.vnet.ibm.com address does not
always work so dropping the vnet to see if that works better.

Signed-off-by: Glenn Miles 
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index a07af6b9d4..575ac2e05d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1545,7 +1545,7 @@ F: pc-bios/skiboot.lid
 F: tests/qtest/pnv*
 
 pca955x
-M: Glenn Miles 
+M: Glenn Miles 
 L: qemu-...@nongnu.org
 L: qemu-...@nongnu.org
 S: Odd Fixes
-- 
2.31.8

Re: [PATCH 15/19] migration: fix -Werror=maybe-uninitialized false-positive

2024-03-28 Thread Peter Xu

On Thu, Mar 28, 2024 at 02:20:48PM +0400, marcandre.lur...@redhat.com wrote:
> From: Marc-André Lureau 
> 
> ../migration/ram.c:1873:23: error: ‘dirty’ may be used uninitialized 
> [-Werror=maybe-uninitialized]
> 
> When 'block' != NULL, 'dirty' is initialized.
> 
> Signed-off-by: Marc-André Lureau 

Acked-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH 12/19] migration: fix -Werror=maybe-uninitialized false-positives

2024-03-28 Thread Peter Xu

On Thu, Mar 28, 2024 at 02:20:45PM +0400, marcandre.lur...@redhat.com wrote:
> From: Marc-André Lureau 
> 
> ../migration/dirtyrate.c:186:5: error: ‘records’ may be used uninitialized 
> [-Werror=maybe-uninitialized]
> ../migration/dirtyrate.c:168:12: error: ‘gen_id’ may be used uninitialized 
> [-Werror=maybe-uninitialized]
> ../migration/migration.c:2273:5: error: ‘file’ may be used uninitialized 
> [-Werror=maybe-uninitialized]
> 
> Signed-off-by: Marc-André Lureau 

Acked-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH 11/19] migration/block: fix -Werror=maybe-uninitialized false-positive

2024-03-28 Thread Peter Xu

On Thu, Mar 28, 2024 at 02:20:44PM +0400, marcandre.lur...@redhat.com wrote:
> From: Marc-André Lureau 
> 
> ../migration/block.c:966:16: error: ‘ret’ may be used uninitialized 
> [-Werror=maybe-uninitialized]
> 
> Given that "cluster_size" must be <= BLK_MIG_BLOCK_SIZE, the previous
> loop is entered at least once, so 'ret' is assigned a value in all conditions.
> 
> Signed-off-by: Marc-André Lureau 

Acked-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH v10 18/23] hw/intc/arm_gicv3: Handle icv_nmiar1_read() for icc_nmiar1_read()

2024-03-28 Thread Peter Maydell

On Mon, 25 Mar 2024 at 08:52, Jinjie Ruan  wrote:
>
> Implement icv_nmiar1_read() for icc_nmiar1_read(), so add definition for
> ICH_LR_EL2.NMI and ICH_AP1R_EL2.NMI bit.
>
> If FEAT_GICv3_NMI is supported, ich_ap_write() should consider 
> ICV_AP1R_EL1.NMI
> bit. In icv_activate_irq() and icv_eoir_write(), the ICV_AP1R_EL1.NMI bit
> should be set or clear according to the Non-maskable property. And the RPR
> priority should also update the NMI bit according to the APR priority NMI bit.
>
> By the way, add gicv3_icv_nmiar1_read trace event.
>
> If the hpp irq is a NMI, the icv iar read should return 1022 and trap for
> NMI again
>
> Signed-off-by: Jinjie Ruan 
> Reviewed-by: Richard Henderson 
> ---
> v10:
> - Rename ICH_AP1R_EL2_NMI to ICV_AP1R_EL1_NMI.
> - Add ICV_RPR_EL1_NMI definition.
> - Set ICV_RPR_EL1.NMI according to the ICV_AP1R_EL1.NMI in
>   ich_highest_active_virt_prio().
> v9:
> - Correct the INTID_NMI logic.
> v8:
> - Fix an unexpected interrupt bug when sending VNMI by running qemu VM.
> v7:
> - Add Reviewed-by.
> v6:
> - Implement icv_nmiar1_read().
> ---
>  hw/intc/arm_gicv3_cpuif.c | 79 +--
>  hw/intc/gicv3_internal.h  |  4 ++
>  hw/intc/trace-events  |  1 +
>  3 files changed, 73 insertions(+), 11 deletions(-)

I haven't done a full review of this yet, but it looks like some of
the parts that applied to physical interrupts apply here too, eg
 * don't do the RPR NMI bit handling in ich_highest_active_virt_prio(),
   deal with NMI in the callers
 * in the AP registers, set either NMI or a group-priority bit, not both
 * AP NMI bits are only in the 0 reg, so checking doesn't need to be
   inside the for loop

You'll also need to update hppvi_index() so it accounts for NMIs
when it's finding the highest priority interrupt in the list registers:
compare the HighestPriorityVirtualInterrupt() pseudocode function.

thanks
-- PMM

[PATCH v4 4/4] target/ppc: Add migration support for BHRB

2024-03-28 Thread Glenn Miles

Adds migration support for Branch History Rolling
Buffer (BHRB) internal state.

Signed-off-by: Glenn Miles 
Reviewed-by: Nicholas Piggin 
---

Changes from v3:
  - Rebased onto latest master branch

 target/ppc/machine.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/target/ppc/machine.c b/target/ppc/machine.c
index 6b6c31d903..731dd8df35 100644
--- a/target/ppc/machine.c
+++ b/target/ppc/machine.c
@@ -711,6 +711,26 @@ static const VMStateDescription vmstate_reservation = {
 }
 };
 
+#ifdef TARGET_PPC64
+static bool bhrb_needed(void *opaque)
+{
+PowerPCCPU *cpu = opaque;
+return (cpu->env.flags & POWERPC_FLAG_BHRB) != 0;
+}
+
+static const VMStateDescription vmstate_bhrb = {
+.name = "cpu/bhrb",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = bhrb_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINTTL(env.bhrb_offset, PowerPCCPU),
+VMSTATE_UINT64_ARRAY(env.bhrb, PowerPCCPU, BHRB_MAX_NUM_ENTRIES),
+VMSTATE_END_OF_LIST()
+}
+};
+#endif
+
 const VMStateDescription vmstate_ppc_cpu = {
 .name = "cpu",
 .version_id = 5,
@@ -756,6 +776,7 @@ const VMStateDescription vmstate_ppc_cpu = {
 #ifdef TARGET_PPC64
 &vmstate_tm,
 &vmstate_slb,
+&vmstate_bhrb,
 #endif /* TARGET_PPC64 */
 &vmstate_tlb6xx,
 &vmstate_tlbemb,
-- 
2.31.8

[PATCH v4 2/4] target/ppc: Add recording of taken branches to BHRB

2024-03-28 Thread Glenn Miles

This commit continues adding support for the Branch History
Rolling Buffer (BHRB) as is provided starting with the P8
processor and continuing with its successors.  This commit
is limited to the recording and filtering of taken branches.

The following changes were made:

  - Enabled functionality on P10 processors only due to
performance impact seen with P8 and P9 where it is not
disabled for non problem state branches.
  - Added a BHRB buffer for storing branch instruction and
target addresses for taken branches
  - Renamed gen_update_cfar to gen_update_branch_history and
added a 'target' parameter to hold the branch target
address and 'inst_type' parameter to use for filtering
  - Added TCG code to gen_update_branch_history that stores
data to the BHRB and updates the BHRB offset.
  - Added BHRB resource initialization and reset functions

Signed-off-by: Glenn Miles 
Reviewed-by: Nicholas Piggin 
---

Changes from v3:
  - Rebased on latest master branch
  - Fixed compile errors for non ppc64-softmmu targets
  - Fixed compile errors from compiling on 32 bit hosts

 target/ppc/cpu.h   | 17 +
 target/ppc/cpu_init.c  | 37 +-
 target/ppc/power8-pmu.c| 33 +
 target/ppc/power8-pmu.h|  7 ++
 target/ppc/translate.c | 98 --
 target/ppc/translate/branch-impl.c.inc |  2 +-
 6 files changed, 186 insertions(+), 8 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 3f6b93ca8f..7a62a82d03 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -550,6 +550,8 @@ FIELD(MSR, LE, MSR_LE, 1)
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)
 
 #define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */
+#define MMCRA_IFM_MASK  PPC_BITMASK(32, 33) /* BHRB Instruction Filtering */
+#define MMCRA_IFM_SHIFT PPC_BIT_NR(33)
 
 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
@@ -776,6 +778,8 @@ enum {
 POWERPC_FLAG_SMT  = 0x0040,
 /* Using "LPAR per core" mode  (as opposed to per-thread)*/
 POWERPC_FLAG_SMT_1LPAR = 0x0080,
+/* Has BHRB */
+POWERPC_FLAG_BHRB  = 0x0100,
 };
 
 /*
@@ -1217,6 +1221,9 @@ struct pnv_tod_tbst {
 #define PPC_CPU_OPCODES_LEN  0x40
 #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20
 
+#define BHRB_MAX_NUM_ENTRIES_LOG2 (5)
+#define BHRB_MAX_NUM_ENTRIES  (1 << BHRB_MAX_NUM_ENTRIES_LOG2)
+
 struct CPUArchState {
 /* Most commonly used resources during translated code execution first */
 target_ulong gpr[32];  /* general purpose registers */
@@ -1313,6 +1320,16 @@ struct CPUArchState {
 int dcache_line_size;
 int icache_line_size;
 
+#ifdef TARGET_PPC64
+/* Branch History Rolling Buffer (BHRB) resources */
+target_ulong bhrb_num_entries;
+intptr_t bhrb_base;
+target_ulong bhrb_filter;
+target_ulong bhrb_offset;
+target_ulong bhrb_offset_mask;
+uint64_t bhrb[BHRB_MAX_NUM_ENTRIES];
+#endif
+
 /* These resources are used during exception processing */
 /* CPU model definition */
 target_ulong msr_mask;
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 4e65335669..907cdde5a8 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6142,6 +6142,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 pcc->l1_icache_size = 0x8000;
 }
 
+static void bhrb_init_state(CPUPPCState *env, target_long num_entries_log2)
+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) {
+num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2;
+}
+env->bhrb_num_entries = 1 << num_entries_log2;
+env->bhrb_base = (intptr_t)&env->bhrb[0];
+env->bhrb_offset_mask = (env->bhrb_num_entries * sizeof(uint64_t)) - 1;
+}
+}
+
+static void bhrb_reset_state(CPUPPCState *env)
+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+env->bhrb_offset = 0;
+env->bhrb_filter = 0;
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+}
+
+#define POWER8_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER8(CPUPPCState *env)
 {
 /* Common Registers */
@@ -6183,6 +6205,8 @@ static void init_proc_POWER8(CPUPPCState *env)
 env->dcache_line_size = 128;
 env->icache_line_size = 128;
 
+bhrb_init_state(env, POWER8_BHRB_ENTRIES_LOG2);
+
 /* Allocate hardware IRQ controller */
 init_excp_POWER8(env);
 ppcPOWER7_irq_init(env_archcpu(env));
@@ -6307,6 +6331,7 @@ static struct ppc_radix_page_info POWER9_radix_page_info 
= {
 };
 #endif /* CONFIG_USER_ONLY */
 
+#define POWER9_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER9(CPUPPCState *env)
 {
 /* Common Registers */
@@ -6357,6 +6382,8 @@ static void init_proc_POWER9(CPUPPCState *env)
 env->dcache_line_size = 128;
 env->icache_line_size = 128;
 
+bhrb_init_state(env, POWER9_BHRB_ENTRIES_LOG2);
+
 /* Al

[PATCH v4 1/4] target/ppc: Add new hflags to support BHRB

2024-03-28 Thread Glenn Miles

This commit is preparatory to the addition of Branch History
Rolling Buffer (BHRB) functionality, which is being provided
today starting with the P8 processor.

BHRB uses several SPR register fields to control whether or not
a branch instruction's address (and sometimes target address)
should be recorded.  Checking each of these fields with each
branch instruction using jitted code would lead to a significant
decrease in performance.

Therefore, it was decided that BHRB configuration bits that are
not expected to change frequently should have their state summarized
in an hflag so that the amount of checking done by jitted code can
be reduced.

This commit contains the changes for summarizing the state of the
following register fields in the HFLAGS_BHRB_ENABLE hflag:

MMCR0[FCP] - Determines if BHRB recording is frozen in the
 problem state

MMCR0[FCPC] - A modifier for MMCR0[FCP]

MMCRA[BHRBRD] - Disables all BHRB recording for a thread

Signed-off-by: Glenn Miles 
Reviewed-by: Nicholas Piggin 
---

Changes from v3:
  - Rebased on latest master branch
  - Fixed compile errors from non ppc64-softmmu targets

 target/ppc/cpu.h |  5 +
 target/ppc/cpu_init.c|  4 ++--
 target/ppc/helper.h  |  1 +
 target/ppc/helper_regs.c | 37 
 target/ppc/machine.c |  2 +-
 target/ppc/power8-pmu-regs.c.inc |  5 +
 target/ppc/power8-pmu.c  | 15 +
 target/ppc/power8-pmu.h  |  4 ++--
 target/ppc/spr_common.h  |  1 +
 target/ppc/translate.c   |  2 ++
 10 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 67e6b2effd..3f6b93ca8f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -535,6 +535,8 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR0_FC56   PPC_BIT(59) /* PMC Freeze Counters 5-6 bit */
 #define MMCR0_PMC1CE PPC_BIT(48) /* MMCR0 PMC1 Condition Enabled */
 #define MMCR0_PMCjCE PPC_BIT(49) /* MMCR0 PMCj Condition Enabled */
+#define MMCR0_FCP    PPC_BIT(34) /* Freeze Counters/BHRB if PR=1 */
+#define MMCR0_FCPC   PPC_BIT(51) /* Condition for FCP bit */
 /* MMCR0 userspace r/w mask */
 #define MMCR0_UREG_MASK (MMCR0_FC | MMCR0_PMAO | MMCR0_PMAE)
 /* MMCR2 userspace r/w mask */
@@ -547,6 +549,8 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)
 
+#define MMCRA_BHRBRD PPC_BIT(26) /* BHRB Recording Disable */
+
 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
 #define MMCR1_PMC1SEL_START 32
@@ -799,6 +803,7 @@ enum {
 HFLAGS_PMCJCE = 17, /* MMCR0 PMCjCE bit */
 HFLAGS_PMC_OTHER = 18, /* PMC other than PMC5-6 is enabled */
 HFLAGS_INSN_CNT = 19, /* PMU instruction count enabled */
+HFLAGS_BHRB_ENABLE = 20, /* Summary flag for enabling BHRB */
 HFLAGS_VSX = 23, /* MSR_VSX if cpu has VSX */
 HFLAGS_VR = 25,  /* MSR_VR if cpu has VRE */
 
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 7e65f08147..4e65335669 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5152,7 +5152,7 @@ static void register_book3s_pmu_sup_sprs(CPUPPCState *env)
  KVM_REG_PPC_MMCR1, 0x00000000);
 spr_register_kvm(env, SPR_POWER_MMCRA, "MMCRA",
  SPR_NOACCESS, SPR_NOACCESS,
- &spr_read_generic, &spr_write_generic,
+ &spr_read_generic, &spr_write_MMCRA,
  KVM_REG_PPC_MMCRA, 0x00000000);
 spr_register_kvm(env, SPR_POWER_PMC1, "PMC1",
  SPR_NOACCESS, SPR_NOACCESS,
@@ -7194,7 +7194,7 @@ static void ppc_cpu_reset_hold(Object *obj)
 if (env->mmu_model != POWERPC_MMU_REAL) {
 ppc_tlb_invalidate_all(env);
 }
-pmu_mmcr01_updated(env);
+pmu_mmcr01a_updated(env);
 }
 
 /* clean any pending stop state */
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 86f97ee1e7..3df360efe9 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -30,6 +30,7 @@ DEF_HELPER_2(store_dawr0, void, env, tl)
 DEF_HELPER_2(store_dawrx0, void, env, tl)
 DEF_HELPER_2(store_mmcr0, void, env, tl)
 DEF_HELPER_2(store_mmcr1, void, env, tl)
+DEF_HELPER_2(store_mmcrA, void, env, tl)
 DEF_HELPER_3(store_pmc, void, env, i32, i64)
 DEF_HELPER_2(read_pmc, tl, env, i32)
 DEF_HELPER_2(insns_inc, void, env, i32)
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index 25258986e3..07a07ae720 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -47,6 +47,39 @@ void hreg_swap_gpr_tgpr(CPUPPCState *env)
 env->tgpr[3] = tmp;
 }
 
+#if defined(TARGET_PPC64)
+static bool hreg_check_bhrb_enable(CPUPPCState *env)
+{
+bool pr = !!(env->msr & (1 << MSR_PR));
+target_long mmcr0;
+bool fcp;
+boo

[PATCH v4 3/4] target/ppc: Add clrbhrb and mfbhrbe instructions

2024-03-28 Thread Glenn Miles

Add support for the clrbhrb and mfbhrbe instructions.

Since neither instruction is believed to be critical to
performance, both instructions were implemented using helper
functions.

Access to both instructions is controlled by bits in the
HFSCR (for privileged state) and MMCR0 (for problem state).
A new function, helper_mmcr0_facility_check, was added for
checking MMCR0[BHRBA] and raising a facility_unavailable exception
if required.

NOTE: For P8 and P9, due to a performance issue, branch history will
not be kept, but the instructions will be allowed to execute
as normal with the exception that the mfbhrbe instruction will
always return a zero value.

Signed-off-by: Glenn Miles 
Reviewed-by: Nicholas Piggin 
---

Changes from v3:
  - Rebased on latest master branch
  - Fixed compile errors for non ppc64-softmmu targets

 target/ppc/cpu.h |  2 ++
 target/ppc/helper.h  |  7 
 target/ppc/insn32.decode |  8 +
 target/ppc/misc_helper.c | 50 
 target/ppc/translate.c   |  2 ++
 target/ppc/translate/bhrb-impl.c.inc | 43 
 6 files changed, 112 insertions(+)
 create mode 100644 target/ppc/translate/bhrb-impl.c.inc

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 7a62a82d03..76e896fdda 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -537,6 +537,7 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR0_PMCjCE PPC_BIT(49) /* MMCR0 PMCj Condition Enabled */
 #define MMCR0_FCP    PPC_BIT(34) /* Freeze Counters/BHRB if PR=1 */
 #define MMCR0_FCPC   PPC_BIT(51) /* Condition for FCP bit */
+#define MMCR0_BHRBA_NR PPC_BIT_NR(42)/* BHRB Available */
 /* MMCR0 userspace r/w mask */
 #define MMCR0_UREG_MASK (MMCR0_FC | MMCR0_PMAO | MMCR0_PMAE)
 /* MMCR2 userspace r/w mask */
@@ -636,6 +637,7 @@ FIELD(MSR, LE, MSR_LE, 1)
 
 /* HFSCR bits */
 #define HFSCR_MSGP PPC_BIT(53) /* Privileged Message Send Facilities */
+#define HFSCR_BHRB PPC_BIT(59) /* BHRB Instructions */
 #define HFSCR_IC_MSGP  0xA
 
 #define DBCR0_ICMP (1 << 27)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 3df360efe9..8cdb322ed6 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -820,3 +820,10 @@ DEF_HELPER_4(DSCLIQ, void, env, fprp, fprp, i32)
 
 DEF_HELPER_1(tbegin, void, env)
 DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
+
+#if !defined(CONFIG_USER_ONLY)
+#if defined(TARGET_PPC64)
+DEF_HELPER_1(clrbhrb, void, env)
+DEF_HELPER_FLAGS_2(mfbhrbe, TCG_CALL_NO_WG, i64, env, i32)
+#endif
+#endif
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index eada59f59f..a343621cdd 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -998,3 +998,11 @@ MSGSND  01 - - . 0011001110 -   
@X_rb
 MSGCLRP 01 - - . 0010101110 -   @X_rb
 MSGSNDP 01 - - . 0010001110 -   @X_rb
 MSGSYNC 01 - - - 1101110110 -
+
+# Branch History Rolling Buffer (BHRB) Instructions
+
+&XFX_bhrbe  rt bhrbe
+@XFX_bhrbe  .. rt:5 bhrbe:10 .. -   &XFX_bhrbe
+
+MFBHRBE 01 . . . 0100101110 -   @XFX_bhrbe
+CLRBHRB 01 - - - 0110101110 -
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index 58e808dc96..6f419c9346 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -150,6 +150,17 @@ void helper_msr_facility_check(CPUPPCState *env, uint32_t 
bit,
 
 #if !defined(CONFIG_USER_ONLY)
 
+#ifdef TARGET_PPC64
+static void helper_mmcr0_facility_check(CPUPPCState *env, uint32_t bit,
+ uint32_t sprn, uint32_t cause)
+{
+if (FIELD_EX64(env->msr, MSR, PR) &&
+!(env->spr[SPR_POWER_MMCR0] & (1ULL << bit))) {
+raise_fu_exception(env, bit, sprn, cause, GETPC());
+}
+}
+#endif
+
 void helper_store_sdr1(CPUPPCState *env, target_ulong val)
 {
 if (env->spr[SPR_SDR1] != val) {
@@ -363,3 +374,42 @@ void helper_fixup_thrm(CPUPPCState *env)
 env->spr[i] = v;
 }
 }
+
+#if !defined(CONFIG_USER_ONLY)
+#if defined(TARGET_PPC64)
+void helper_clrbhrb(CPUPPCState *env)
+{
+helper_hfscr_facility_check(env, HFSCR_BHRB, "clrbhrb", FSCR_IC_BHRB);
+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA_NR, 0, FSCR_IC_BHRB);
+
+if (env->flags & POWERPC_FLAG_BHRB) {
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+}
+
+uint64_t helper_mfbhrbe(CPUPPCState *env, uint32_t bhrbe)
+{
+unsigned int index;
+
+helper_hfscr_facility_check(env, HFSCR_BHRB, "mfbhrbe", FSCR_IC_BHRB);
+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA_NR, 0, FSCR_IC_BHRB);
+
+if (!(env->flags & POWERPC_FLAG_BHRB) ||
+ (bhrbe >= env->bhrb_num_entries) ||
+ (env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) {
+return 0;
+}
+
+/*
+ * Note: bhrb_offset is the byte offset for writing the
+ * next entry (over the oldest

[PATCH v4 0/4] Add BHRB Facility Support

2024-03-28 Thread Glenn Miles

This is a series of patches for adding support for the Branch History
Rolling Buffer (BHRB) facility.  This was added to the Power ISA
starting with version 2.07.  Changes were subsequently made in version
3.1 to limit BHRB recording to instructions run in problem state only
and to add a control bit to disable recording (MMCRA[BHRBRD]).

Changes from previous version:
 - Rebased on latest master head (req'd changing cpu_env to tcg_env)
 - Fixed compiler errors for non ppc64-softmmu targets
 - Fixed compiler errors from compiling on 32-bit platforms

Glenn Miles (4):
  target/ppc: Add new hflags to support BHRB
  target/ppc: Add recording of taken branches to BHRB
  target/ppc: Add clrbhrb and mfbhrbe instructions
  target/ppc: Add migration support for BHRB

 target/ppc/cpu.h   |  24 ++
 target/ppc/cpu_init.c  |  41 +-
 target/ppc/helper.h|   8 ++
 target/ppc/helper_regs.c   |  37 +
 target/ppc/insn32.decode   |   8 ++
 target/ppc/machine.c   |  23 +-
 target/ppc/misc_helper.c   |  50 
 target/ppc/power8-pmu-regs.c.inc   |   5 ++
 target/ppc/power8-pmu.c|  48 +++-
 target/ppc/power8-pmu.h|  11 ++-
 target/ppc/spr_common.h|   1 +
 target/ppc/translate.c | 102 +++--
 target/ppc/translate/bhrb-impl.c.inc   |  43 +++
 target/ppc/translate/branch-impl.c.inc |   2 +-
 14 files changed, 386 insertions(+), 17 deletions(-)
 create mode 100644 target/ppc/translate/bhrb-impl.c.inc

-- 
2.31.8

Re: [PATCH-for-9.1 v2 2/3] migration: Remove RDMA protocol handling

2024-03-28 Thread Peter Xu

On Thu, Mar 28, 2024 at 04:22:27PM +0100, Thomas Huth wrote:
> Since e9a54265f5 was not very clear about rdma migration code, should we
> maybe rather add a separate deprecation note for the migration part, and add
> a proper warning message to the migration code in case someone tries to use
> it there, and then only remove the rdma migration code after two more
> releases?

Definitely a valid option to me.

So far RDMA isn't covered in tests (actually same to COLO, and I wonder our
position of COLO too in this case..), so unfortunately we don't even know
when it'll break just like before.

From other activities that I can see when new code comes, maintaining RDMA
code should be fairly manageable so far (and whoever will write new rdma
codes in those two releases will also need to take the maintainer's
role). We did it for those years, and we can keep that for two more
releases. Hopefully that can ring a louder alarm to the current users with
such warnings, so that people can either stick with old binaries, or invest
developer/test resources to the community.

Thanks,

-- 
Peter Xu

Re: [RFC PATCH-for-9.1 14/29] hw/i386/pc: Move pc_system_flash_create() to pc_pci_machine_initfn()

2024-03-28 Thread BALATON Zoltan


On Thu, 28 Mar 2024, Philippe Mathieu-Daudé wrote:

pc_system_flash_create() is only useful for PCI-based machines.
Move the call to the PCI-based init() handler.

Signed-off-by: Philippe Mathieu-Daudé 
---
hw/i386/pc.c   |  2 +-
hw/i386/pc_sysfw.c | 10 --
2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 5b96daa414..33724791fd 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1679,7 +1679,6 @@ static void pc_machine_initfn(Object *obj)
pcms->fd_bootchk = true;
pcms->default_bus_bypass_iommu = false;

-pc_system_flash_create(pcms);
pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
object_property_add_alias(OBJECT(pcms), "pcspk-audiodev",
  OBJECT(pcms->pcspk), "audiodev");
@@ -1694,6 +1693,7 @@ static void pc_pci_machine_initfn(Object *obj)

ppms->acpi_build_enabled = true;

+pc_system_flash_create(PC_MACHINE(obj));
cxl_machine_init(obj, &ppms->cxl_devices_state);

ppms->machine_done.notify = pc_pci_machine_done;
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 541dcaef71..167ff24fcb 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -93,12 +93,10 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,

void pc_system_flash_create(PCMachineState *pcms)
{
-if (pc_machine_is_pci_enabled(pcms)) {
-pcms->flash[0] = pc_pflash_create(pcms, "system.flash0",
-  "pflash0");
-pcms->flash[1] = pc_pflash_create(pcms, "system.flash1",
-  "pflash1");
-}
+assert(pc_machine_is_pci_enabled(pcms));
+
+pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0");
+pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1");
}


This could just be inlined as it's called once, then no need for assert 
and a separate function.


Regards,
BALATON Zoltan



void pc_system_flash_cleanup_unused(PCMachineState *pcms)

Re: [RFC PATCH-for-9.1 13/29] hw/i386/pc: Remove non-PCI code from pc_system_firmware_init()

2024-03-28 Thread BALATON Zoltan


On Thu, 28 Mar 2024, Philippe Mathieu-Daudé wrote:

x86_bios_rom_init() is the single non-PCI-machine call
from pc_system_firmware_init(). Extract it to the caller.

Signed-off-by: Philippe Mathieu-Daudé 
---
hw/i386/pc.c   | 6 +-
hw/i386/pc_sysfw.c | 5 +
2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f184808e3e..5b96daa414 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -956,7 +956,11 @@ void pc_memory_init(PCMachineState *pcms,
}

/* Initialize PC system firmware */
-pc_system_firmware_init(pcms, rom_memory);
+if (pci_enabled) {
+pc_system_firmware_init(pcms, rom_memory);
+} else {
+x86_bios_rom_init(machine, "bios.bin", rom_memory, true);
+}

option_rom_mr = g_malloc(sizeof(*option_rom_mr));
memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 862a082b0a..541dcaef71 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -202,10 +202,7 @@ void pc_system_firmware_init(PCMachineState *pcms,


Maybe also rename to pc_pci_firmware_init() to make it clear this is only 
for PCI PC machine now?


Regards,
BALATON Zoltan


int i;
BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];

-if (!pc_machine_is_pci_enabled(pcms)) {
-x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true);
-return;
-}
+assert(pc_machine_is_pci_enabled(pcms));

/* Map legacy -drive if=pflash to machine properties */
for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {

Re: [RFC PATCH-for-9.1 09/29] hw/i386/pc: Pass PCMachineState argument to acpi_setup()

2024-03-28 Thread BALATON Zoltan


On Thu, 28 Mar 2024, Philippe Mathieu-Daudé wrote:

acpi_setup() caller knows about the machine state, so pass
it as argument to avoid a qdev_get_machine() call.

We already resolved X86_MACHINE(pcms) as 'x86ms' so use the
latter.

Signed-off-by: Philippe Mathieu-Daudé 
---
hw/i386/acpi-build.h | 3 ++-
hw/i386/acpi-build.c | 5 ++---
hw/i386/pc.c | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/i386/acpi-build.h b/hw/i386/acpi-build.h
index 0dce155c8c..31de5bddbd 100644
--- a/hw/i386/acpi-build.h
+++ b/hw/i386/acpi-build.h
@@ -2,6 +2,7 @@
#ifndef HW_I386_ACPI_BUILD_H
#define HW_I386_ACPI_BUILD_H
#include "hw/acpi/acpi-defs.h"
+#include "hw/i386/pc.h"

extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio;

@@ -9,7 +10,7 @@ extern const struct AcpiGenericAddress x86_nvdimm_acpi_dsmio;
#define ACPI_PCIHP_SEJ_BASE 0x8
#define ACPI_PCIHP_BNMR_BASE 0x10

-void acpi_setup(void);
+void acpi_setup(PCMachineState *pcms);


This is changed to PcPciMachineState * in a following patch so can't you 
already introduce it here to avoid some churn?


Regards,
BALATON Zoltan


Object *acpi_get_i386_pci_host(void);

#endif
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 6e8e32e5d2..e702d5e9d2 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2749,9 +2749,8 @@ static const VMStateDescription vmstate_acpi_build = {
},
};

-void acpi_setup(void)
+void acpi_setup(PCMachineState *pcms)
{
-PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
X86MachineState *x86ms = X86_MACHINE(pcms);
AcpiBuildTables tables;
AcpiBuildState *build_state;
@@ -2771,7 +2770,7 @@ void acpi_setup(void)
return;
}

-if (!x86_machine_is_acpi_enabled(X86_MACHINE(pcms))) {
+if (!x86_machine_is_acpi_enabled(x86ms)) {
ACPI_BUILD_DPRINTF("ACPI disabled. Bailing out.\n");
return;
}
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 6d87d1d4c2..dfc0247bb6 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -601,7 +601,7 @@ void pc_machine_done(Notifier *notifier, void *data)
/* set the number of CPUs */
x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);

-acpi_setup();
+acpi_setup(pcms);
if (x86ms->fw_cfg) {
fw_cfg_build_smbios(pcms, x86ms->fw_cfg, pcms->smbios_entry_point_type);
fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);

Re: [PATCH] hw/intc/arm_gicv3: ICC_HPPIR* return SPURIOUS if int group is disabled

2024-03-28 Thread Richard Henderson


On 3/28/24 05:33, Peter Maydell wrote:

If the group of the highest priority pending interrupt is disabled
via ICC_IGRPEN*, the ICC_HPPIR* registers should return
INTID_SPURIOUS, not the interrupt ID.  (See the GIC architecture
specification pseudocode functions ICC_HPPIR1_EL1[] and
HighestPriorityPendingInterrupt().)

Make HPPIR reads honour the group disable, the way we already do
when determining whether to preempt in icc_hppi_can_preempt().

Cc:qemu-sta...@nongnu.org
Signed-off-by: Peter Maydell
---
Pre-existing bug which I happened to notice while working
on review of the FEAT_NMI patches. I don't suppose real world
code disables interrupt groups which it's actually using, which
is why nobody's noticed it. Still, it's a safe bugfix so might
as well go to stable too.
---
  hw/intc/arm_gicv3_cpuif.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)


Reviewed-by: Richard Henderson 

r~

[PATCH for-9.0] disas: Show opcodes for target_disas and monitor_disas

2024-03-28 Thread Richard Henderson

Fixes: 83b4613ba83 ("disas: introduce show_opcodes")
Signed-off-by: Richard Henderson 
---
 disas/disas-mon.c | 1 +
 disas/disas.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/disas/disas-mon.c b/disas/disas-mon.c
index 48ac492c6c..5d6d9aa02d 100644
--- a/disas/disas-mon.c
+++ b/disas/disas-mon.c
@@ -34,6 +34,7 @@ void monitor_disas(Monitor *mon, CPUState *cpu, uint64_t pc,
 disas_initialize_debug_target(&s, cpu);
 s.info.fprintf_func = disas_gstring_printf;
 s.info.stream = (FILE *)ds;  /* abuse this slot */
+s.info.show_opcodes = true;
 
 if (is_physical) {
 s.info.read_memory_func = physical_read_memory;
diff --git a/disas/disas.c b/disas/disas.c
index 17170d291e..7e3b0bb46c 100644
--- a/disas/disas.c
+++ b/disas/disas.c
@@ -211,6 +211,7 @@ void target_disas(FILE *out, CPUState *cpu, uint64_t code, 
size_t size)
 s.info.stream = out;
 s.info.buffer_vma = code;
 s.info.buffer_length = size;
+s.info.show_opcodes = true;
 
 if (s.info.cap_arch >= 0 && cap_disas_target(&s.info, code, size)) {
 return;
-- 
2.34.1

Re: [PATCH-for-9.1 v2 3/3] block/gluster: Remove RDMA protocol handling

2024-03-28 Thread Thomas Huth


On 28/03/2024 14.02, Philippe Mathieu-Daudé wrote:

GlusterFS+RDMA has been deprecated 8 years ago in commit
0552ff2465 ("block/gluster: deprecate rdma support"):

   gluster volfile server fetch happens through unix and/or tcp,
   it doesn't support volfile fetch over rdma. The rdma code may
   actually mislead, so to make sure things do not break, for now
   we fallback to tcp when requested for rdma, with a warning.

   If you are wondering how this worked all these days, its the
   gluster libgfapi code which handles anything other than unix
   transport as socket/tcp, sad but true.

Besides, the whole RDMA subsystem was deprecated in commit
e9a54265f5 ("hw/rdma: Deprecate the pvrdma device and the rdma
subsystem") released in v8.2.

Cc: Prasanna Kumar Kalever 
Signed-off-by: Philippe Mathieu-Daudé 
---
  docs/system/device-url-syntax.rst.inc  |  4 +--
  docs/system/qemu-block-drivers.rst.inc |  1 -
  block/gluster.c| 39 --
  3 files changed, 2 insertions(+), 42 deletions(-)


Reviewed-by: Thomas Huth

Re: [PATCH-for-9.1 v2 1/3] hw/rdma: Remove pvrdma device and rdmacm-mux helper

2024-03-28 Thread Thomas Huth


On 28/03/2024 14.02, Philippe Mathieu-Daudé wrote:

The whole RDMA subsystem was deprecated in commit e9a54265f5
("hw/rdma: Deprecate the pvrdma device and the rdma subsystem")
released in v8.2.

Remove:
  - PVRDMA device
  - generated vmw_pvrdma/ directory from linux-headers
  - rdmacm-mux tool from contrib/

Cc: Yuval Shaia 
Cc: Marcel Apfelbaum 
Signed-off-by: Philippe Mathieu-Daudé 
---


Reviewed-by: Thomas Huth

Re: [PATCH v5 1/2] Refactor common functions between POSIX and Windows implementation

2024-03-28 Thread Philippe Mathieu-Daudé


On 28/3/24 16:40, aidan_le...@selinc.com wrote:

From: aidaleuc 

Signed-off-by: aidaleuc 
---
  qga/commands-common-ssh.c | 50 ++
  qga/commands-common-ssh.h | 10 
  qga/commands-posix-ssh.c  | 51 +++
  qga/meson.build   |  1 +
  4 files changed, 64 insertions(+), 48 deletions(-)
  create mode 100644 qga/commands-common-ssh.c
  create mode 100644 qga/commands-common-ssh.h




diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c
index 236f80de44..c82ccfe629 100644
--- a/qga/commands-posix-ssh.c
+++ b/qga/commands-posix-ssh.c
@@ -9,12 +9,13 @@
  #include 
  #include 
  
+#include "commands-common-ssh.h"

  #include "qapi/error.h"
  #include "qga-qapi-commands.h"
  
  #ifdef QGA_BUILD_UNIT_TEST

-static struct passwd *
-test_get_passwd_entry(const gchar *user_name, GError **error)
+ static struct passwd *
+ test_get_passwd_entry(const gchar *user_name, GError **error)


Modulo this spurious change,

Reviewed-by: Philippe Mathieu-Daudé 

Thank you for your various iterations,

Phil.

Re: [PATCH for-9.0 0/2] migration: Two migration bug fixes

2024-03-28 Thread Peter Xu

On Thu, Mar 28, 2024 at 04:02:50PM +0200, Avihai Horon wrote:
> Hello,
> 
> This small series fixes two migration bugs I stumbled upon recently.
> Comments are welcome, thanks for reviewing.
> 
> Avihai Horon (2):
>   migration: Set migration error in migration_completion()
>   migration/postcopy: Ensure postcopy_start() sets errp if it fails

queued for 9.0-rc2, thanks.

-- 
Peter Xu

Re: [PATCH v2 1/1] docs: sbsa: update specs, add dt note

2024-03-28 Thread Peter Maydell

On Thu, 28 Mar 2024 at 16:39, Marcin Juszkiewicz
 wrote:
>
> Hardware of sbsa-ref board is nowadays defined by both BSA and SBSA
> specifications. Then BBR defines firmware interface.
>
> Added note about DeviceTree data passed from QEMU to firmware. It is
> very minimal and provides only data we use in firmware.
>
> Added NUMA information to list of things reported by DeviceTree.
>
> Signed-off-by: Marcin Juszkiewicz 
> ---



Applied to target-arm.next, thanks.

-- PMM

Re: [PATCH v2 1/1] docs: sbsa: update specs, add dt note

2024-03-28 Thread Leif Lindholm

On Thu, Mar 28, 2024 at 17:38:51 +0100, Marcin Juszkiewicz wrote:
> Hardware of sbsa-ref board is nowadays defined by both BSA and SBSA
> specifications. Then BBR defines firmware interface.
> 
> Added note about DeviceTree data passed from QEMU to firmware. It is
> very minimal and provides only data we use in firmware.
> 
> Added NUMA information to list of things reported by DeviceTree.
> 
> Signed-off-by: Marcin Juszkiewicz 

Reviewed-by: Leif Lindholm 

Thanks!

/
 Leif

> ---
>  docs/system/arm/sbsa.rst | 35 ++-
>  1 file changed, 26 insertions(+), 9 deletions(-)
> 
> diff --git a/docs/system/arm/sbsa.rst b/docs/system/arm/sbsa.rst
> index bca61608ff..2bf22a1d0b 100644
> --- a/docs/system/arm/sbsa.rst
> +++ b/docs/system/arm/sbsa.rst
> @@ -1,12 +1,16 @@
>  Arm Server Base System Architecture Reference board (``sbsa-ref``)
>  ==
>  
> -While the ``virt`` board is a generic board platform that doesn't match
> -any real hardware the ``sbsa-ref`` board intends to look like real
> -hardware. The `Server Base System Architecture
> -`_ defines a
> -minimum base line of hardware support and importantly how the firmware
> -reports that to any operating system.
> +The ``sbsa-ref`` board intends to look like real hardware (while the ``virt``
> +board is a generic board platform that doesn't match any real hardware).
> +
> +The hardware part is defined by two specifications:
> +
> +  - `Base System Architecture 
> `__ (BSA)
> +  - `Server Base System Architecture 
> `__ (SBSA)
> +
> +The `Arm Base Boot Requirements 
> `__ (BBR)
> +specification defines how the firmware reports that to any operating system.
>  
>  It is intended to be a machine for developing firmware and testing
>  standards compliance with operating systems.
> @@ -35,16 +39,29 @@ includes both internal hardware and parts affected by the 
> qemu command line
>  (i.e. CPUs and memory). As a result it must have a firmware specifically 
> built
>  to expect a certain hardware layout (as you would in a real machine).
>  
> +Note
> +
> +
> +QEMU provides the guest EL3 firmware with minimal information about hardware
> +platform using minimalistic devicetree. This is not a Linux devicetree. It is
> +not even a firmware devicetree.
> +
> +It is information passed from QEMU to describe the information a hardware
> +platform would have other mechanisms to discover at runtime, that are 
> affected
> +by the QEMU command line.
> +
> +Ultimately this devicetree may be replaced by IPC calls to an emulated SCP.
> +
>  DeviceTree information
>  ''
>  
> -The devicetree provided by the board model to the firmware is not intended
> -to be a complete compliant DT. It currently reports:
> +The devicetree reports:
>  
> - CPUs
> - memory
> - platform version
> - GIC addresses
> +   - NUMA node id for CPUs and memory
>  
>  Platform version
>  
> @@ -70,4 +87,4 @@ Platform version changes:
>GIC ITS information is present in devicetree.
>  
>  0.3
> -  The USB controller is an XHCI device, not EHCI
> +  The USB controller is an XHCI device, not EHCI.
> -- 
> 2.44.0
>

Re: [PATCH 1/1] docs: sbsa: update specs, add dt note

2024-03-28 Thread Marcin Juszkiewicz


W dniu 28.03.2024 o 16:43, Peter Maydell pisze:

On Tue, 26 Mar 2024 at 09:58, Marcin Juszkiewicz
 wrote:


Hardware of sbsa-ref board is nowadays defined by both BSA and SBSA
specifications. Then BBR defines firmware interface.

Added note about DeviceTree data passed from QEMU to firmware. It is
very minimal and provides only data we use in firmware.

Added NUMA information to list of things reported by DeviceTree.

Signed-off-by: Marcin Juszkiewicz 
---
  docs/system/arm/sbsa.rst | 37 -
  1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/docs/system/arm/sbsa.rst b/docs/system/arm/sbsa.rst
index bca61608ff..d4d1f2efe3 100644
--- a/docs/system/arm/sbsa.rst
+++ b/docs/system/arm/sbsa.rst



+Note
+
+
+QEMU provides us with minimal information about hardware platform using


s/us/the guest EL3 firmware/  (or whatever other term you want to
use to describe the guest software that reads the dt).


Thanks, fixed.


+minimalistic devicetree. This is not a Linux devicetree. It is not even a
+firmware devicetree.
+
+It is information passed from QEMU to describe the information a hardware
+platform would have other mechanisms to discover at runtime, that are affected
+by the QEMU command line.



Might want to say also
  Guest EL3 firmware does not pass this devicetree on to later
  components of the software stack.
?


This is a matter of what firmware stack QEMU user will run. TF-A (our 
current "guest EL3 firmware") passed devicetree to later components of 
the software stack. We just stopped using it in EDK2. But if someone 
would like to run U-Boot or other firmware then both SMC and DT will 
wait for them.



+
+Ultimately this devicetree will be replaced by IPC calls to an emulated SCP.
+And when we do that, we won't then have to rewrite Normal world firmware to
+cope.


I would drop the last sentence here, and use "may" instead of "will".


Done.


+
  DeviceTree information
  ''

-The devicetree provided by the board model to the firmware is not intended
-to be a complete compliant DT. It currently reports:
+The devicetree reports:

 - CPUs
 - memory
 - platform version
 - GIC addresses
+   - NUMA node id for CPUs and memory


Otherwise looks good to me, and the updates to the spec URLs
are particularly helpful. As a docs change I'd be happy
to take it into 9.0 (at least before rc2) if some other
sbsa-ref-knowledgeable person wants to either review or ack it.
(But it's also OK if it misses 9.0 and goes into 9.1.)


OK.

Re: [PATCH for-9.1 6/9] block/nbd: Use URI parsing code from glib

2024-03-28 Thread Richard W.M. Jones

On Thu, Mar 28, 2024 at 10:06:01AM -0500, Eric Blake wrote:
> Adjusting cc list to add upstream NBD and drop developers unrelated to
> this part of the qemu series...
> 
> On Thu, Mar 28, 2024 at 02:13:42PM +, Richard W.M. Jones wrote:
> > On Thu, Mar 28, 2024 at 03:06:03PM +0100, Thomas Huth wrote:
> > > Since version 2.66, glib has useful URI parsing functions, too.
> > > Use those instead of the QEMU-internal ones to be finally able
> > > to get rid of the latter. The g_uri_get_host() also takes care
> > > of removing the square brackets from IPv6 addresses, so we can
> > > drop that part of the QEMU code now, too.
> > > 
> 
> > >  
> > >  if (is_unix) {
> > >  /* nbd+unix:///export?socket=path */
> > > -if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) 
> > > {
> > > +const char *uri_socket = g_hash_table_lookup(qp, "socket");
> > > +if (uri_server || uri_port != -1 || !uri_socket) {
> > >  ret = -EINVAL;
> > >  goto out;
> > >  }
> 
> The spec for NBD URIs is at:
> 
> https://github.com/NetworkBlockDevice/nbd/blob/master/doc/uri.md
> 
> Should any of this spec mention case-insensitive concerns, such as
> whether 'NBD://' may be equivalent to 'nbd://', and whether
> 'nbd+unix:///?socket=x' is equivalent to 'nbd+unix:///?Socket=x'?
> Right now, I think that most implementations of NBD servers and
> clients happen to use case-sensitive parsing; but glib provides the
> option to do case-insensitive query parsing.

I haven't thought about this before, but do note that the NBD URI spec
defers to "IETF standards describing URIs" for all unanswered
questions.  RFC3986 does talk about this incidentally.  About the
scheme field it says (section 3.1):

   Although schemes are case-
   insensitive, the canonical form is lowercase and documents that
   specify schemes must do so with lowercase letters.  An implementation
   should accept uppercase letters as equivalent to lowercase in scheme
   names (e.g., allow "HTTP" as well as "http") for the sake of
   robustness but should only produce lowercase scheme names for
   consistency.

Hostname is also (obviously) case insensitive.  There's also a section
(6.2.3) which talks about normalization of URIs.

Overall it seems the intention of the RFC writer is that parsers
should handle any case; but when generating URIs (and for examples,
documentation etc) we should only generate lowercase.

libnbd absolutely does *not* get this right, eg:

  $ nbdinfo NBD://localhost
  nbdinfo: nbd_connect_uri: unknown NBD URI scheme: NBD: Invalid argument

so that's a bug too.

> If I read https://docs.gtk.org/glib/type_func.Uri.parse_params.html
> correctly, passing G_URI_PARAMS_CASE_INSENSITIVE (which you did not
> do) would mean that 'nbd+unix:///?socket=ignore&Socket=/for/real'
> would result in this g_hash_table_lookup finding only "Socket", not
> "socket".  Maybe it is worth an explicit addition to the NBD URI spec
> to mention that we intend to be case-sensitive (in the parts where it
> can be; I'm not sure if the schema part must be handled
> case-insensitively without re-reading the RFCs), and therefore that
> 'Socket=' does NOT have the same meaning as 'socket='.

We could mention this in the spec for clarity, but the current meaning
(as above) would be that case-insensitive parsing is recommended.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
libguestfs lets you edit virtual machines.  Supports shell scripting,
bindings from many languages.  http://libguestfs.org

[PATCH v2 1/1] docs: sbsa: update specs, add dt note

2024-03-28 Thread Marcin Juszkiewicz

Hardware of sbsa-ref board is nowadays defined by both BSA and SBSA
specifications. Then BBR defines firmware interface.

Added note about DeviceTree data passed from QEMU to firmware. It is
very minimal and provides only data we use in firmware.

Added NUMA information to list of things reported by DeviceTree.

Signed-off-by: Marcin Juszkiewicz 
---
 docs/system/arm/sbsa.rst | 35 ++-
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/docs/system/arm/sbsa.rst b/docs/system/arm/sbsa.rst
index bca61608ff..2bf22a1d0b 100644
--- a/docs/system/arm/sbsa.rst
+++ b/docs/system/arm/sbsa.rst
@@ -1,12 +1,16 @@
 Arm Server Base System Architecture Reference board (``sbsa-ref``)
 ==
 
-While the ``virt`` board is a generic board platform that doesn't match
-any real hardware the ``sbsa-ref`` board intends to look like real
-hardware. The `Server Base System Architecture
-`_ defines a
-minimum base line of hardware support and importantly how the firmware
-reports that to any operating system.
+The ``sbsa-ref`` board intends to look like real hardware (while the ``virt``
+board is a generic board platform that doesn't match any real hardware).
+
+The hardware part is defined by two specifications:
+
+  - `Base System Architecture 
`__ (BSA)
+  - `Server Base System Architecture 
`__ (SBSA)
+
+The `Arm Base Boot Requirements 
`__ (BBR)
+specification defines how the firmware reports that to any operating system.
 
 It is intended to be a machine for developing firmware and testing
 standards compliance with operating systems.
@@ -35,16 +39,29 @@ includes both internal hardware and parts affected by the 
qemu command line
 (i.e. CPUs and memory). As a result it must have a firmware specifically built
 to expect a certain hardware layout (as you would in a real machine).
 
+Note
+
+
+QEMU provides the guest EL3 firmware with minimal information about hardware
+platform using minimalistic devicetree. This is not a Linux devicetree. It is
+not even a firmware devicetree.
+
+It is information passed from QEMU to describe the information a hardware
+platform would have other mechanisms to discover at runtime, that are affected
+by the QEMU command line.
+
+Ultimately this devicetree may be replaced by IPC calls to an emulated SCP.
+
 DeviceTree information
 ''
 
-The devicetree provided by the board model to the firmware is not intended
-to be a complete compliant DT. It currently reports:
+The devicetree reports:
 
- CPUs
- memory
- platform version
- GIC addresses
+   - NUMA node id for CPUs and memory
 
 Platform version
 
@@ -70,4 +87,4 @@ Platform version changes:
   GIC ITS information is present in devicetree.
 
 0.3
-  The USB controller is an XHCI device, not EHCI
+  The USB controller is an XHCI device, not EHCI.
-- 
2.44.0

Re: [PATCH for-9.0 1/2] migration: Set migration error in migration_completion()

2024-03-28 Thread Cédric Le Goater


On 3/28/24 16:50, Avihai Horon wrote:


On 28/03/2024 17:21, Cédric Le Goater wrote:

External email: Use caution opening links or attachments


Hello Avihai,

On 3/28/24 15:02, Avihai Horon wrote:

After commit 9425ef3f990a ("migration: Use migrate_has_error() in
close_return_path_on_source()"), close_return_path_on_source() assumes
that migration error is set if an error occurs during migration.

This may not be true if migration errors in migration_completion(). For
example, if qemu_savevm_state_complete_precopy() errors, migration error
will not be set


Out of curiosity, could you describe a bit more the context ? Did
vfio_save_complete_precopy() fail ? why ?


Yep, vfio_save_complete_precopy() failed (but it failed while I was 
experimenting with an unofficial debug FW).



We should propagate errors of .save_live_complete_precopy() handlers as
it was done for .save_setup() handlers. For 9.1.


Agreed.




This in turn, will cause a migration hang bug, similar to the bug that
was fixed by commit 22b04245f0d5 ("migration: Join the return path
thread before releasing to_dst_file"), as shutdown() will not be issued
for the return-path channel.


yes, but this test :

    if (ret < 0) {
    goto fail;
    }

will skip the close_return_path_on_source() call. Won't it ? So I don't
understand how it can be an issue. Am I missing something ?


It will skip the close_return_path_on_source() call in migration_completion(), 
but there is another close_return_path_on_source() call in migrate_fd_cleanup().


OK. Found it. This is a code path I hadn't explored yet.

Acked-by: Cédric Le Goater 

Thanks,

C.







Fix it by ensuring migration error is set in case of error in
migration_completion().


Why didn't you add a reference to commit 9425ef3f990a ?


I thought this commit didn't introduce this bug, but looking again in the 
mailing list [1], it kinda did:
The hang bug was fully fixed by commit 22b04245f0d ("migration: Join the return path 
thread before releasing to_dst_file") and then 9425ef3f990a re-introduced the bug, 
but only for migration_completion() case.
So, you are right, a fixes line with 9425ef3f990a should be added.

Thanks.

[1] https://lore.kernel.org/all/20240226203122.22894-1-faro...@suse.de/





Signed-off-by: Avihai Horon 
---
  migration/migration.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 9fe8fd2afd7..b73ae3a72c4 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2799,6 +2799,7 @@ static void migration_completion(MigrationState *s)
  {
  int ret = 0;
  int current_active_state = s->state;
+    Error *local_err = NULL;

  if (s->state == MIGRATION_STATUS_ACTIVE) {
  ret = migration_completion_precopy(s, &current_active_state);
@@ -2832,6 +2833,15 @@ static void migration_completion(MigrationState *s)
  return;

  fail:
+    if (qemu_file_get_error_obj(s->to_dst_file, &local_err)) {
+    migrate_set_error(s, local_err);
+    error_free(local_err);
+    } else if (ret) {
+    error_setg_errno(&local_err, -ret, "Error in migration completion");


The 'ret = -1' case could be improved with error_setg(). As a followup.

Thanks,

C.





+    migrate_set_error(s, local_err);
+    error_free(local_err);
+    }
+
  migration_completion_failed(s, current_active_state);
  }

[RFC v2 0/5] virtio,vhost: Add VIRTIO_F_IN_ORDER support

2024-03-28 Thread Jonah Palmer

The goal of these patches is to add support to a variety of virtio and
vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature
indicates that all buffers are used by the device in the same order in
which they were made available by the driver.

These patches attempt to implement a generalized, non-device-specific
solution to support this feature.

The core feature behind this solution is a buffer mechanism in the form
of a VirtQueue's used_elems VirtQueueElement array. This allows devices
who always use buffers in-order by default to have a minimal overhead
impact. Devices that may not always use buffers in-order likely will
experience a performance hit. How large that performance hit is will
depend on how frequently elements are completed out-of-order.

A VirtQueue whose device uses this feature will use its used_elems
VirtQueueElement array to hold used VirtQueueElements. The index that
used elements are placed in used_elems is the same index on the
used/descriptor ring that would satisfy the in-order requirement. In
other words, used elements are placed in their in-order locations on
used_elems and are only written to the used/descriptor ring once the
elements on used_elems are able to continue their expected order.

To differentiate between a "used" and "unused" element on the used_elems
array (a "used" element being an element that was already written to the
used/descriptor ring and an "unused" element being an element that
wasn't), we use an element's in_num and out_num values. If the sum of
these two values is greater than 0, the element is considered unused. If
the sum is 0, then the element is considered used and invalid. When we
find an order and write the element to the used/descriptor ring, we set
these two values to 0 to indicate that it's been used.

---
v2: Use a VirtQueue's used_elems array as a buffer mechanism

v1: Implement custom GLib GHashTable as a buffer mechanism

Jonah Palmer (5):
  virtio: Initialize sequence variables
  virtio: In-order support for split VQs
  virtio: In-order support for packed VQs
  vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
  virtio: Add VIRTIO_F_IN_ORDER property definition

 hw/block/vhost-user-blk.c|   1 +
 hw/net/vhost_net.c   |   2 +
 hw/scsi/vhost-scsi.c |   1 +
 hw/scsi/vhost-user-scsi.c|   1 +
 hw/virtio/vhost-user-fs.c|   1 +
 hw/virtio/vhost-user-vsock.c |   1 +
 hw/virtio/virtio.c   | 118 +++
 include/hw/virtio/virtio.h   |   5 +-
 net/vhost-vdpa.c |   1 +
 9 files changed, 119 insertions(+), 12 deletions(-)

-- 
2.39.3

[RFC v2 2/5] virtio: In-order support for split VQs

2024-03-28 Thread Jonah Palmer

Implements VIRTIO_F_IN_ORDER feature support for virtio devices using
the split virtqueue layout.

For a virtio device that has negotiated the VIRTIO_F_IN_ORDER feature
whose virtqueues use a split virtqueue layout, it's essential that
used VirtQueueElements are written to the used ring in-order.

For a device that uses this in-order feature, its VirtQueue's used_elems
array is used to hold processed VirtQueueElements until they can be
presented to the driver in-order.

In the split virtqueue case, we check to see if the element was the next
expected element to be written to the used ring. If it's not, nothing
gets written to the used ring and we're done. If it is, the element is
written to the used ring and then we check to see if the next expected
element continues the order. This process is repeated until we're unable
to continue the order.

If no elements were written to the used ring, no update to the used
ring's index is needed.

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 50 ++
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 069d96df99..19d3d43816 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -856,16 +856,38 @@ static void virtqueue_split_fill(VirtQueue *vq, const 
VirtQueueElement *elem,
 unsigned int len, unsigned int idx)
 {
 VRingUsedElem uelem;
+uint16_t uelem_idx;
 
 if (unlikely(!vq->vring.used)) {
 return;
 }
 
-idx = (idx + vq->used_idx) % vq->vring.num;
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+/* Write element(s) to used ring if they're in-order */
+while (true) {
+uelem_idx = vq->used_seq_idx % vq->vring.num;
 
-uelem.id = elem->index;
-uelem.len = len;
-vring_used_write(vq, &uelem, idx);
+/* Stop if element has been used */
+if (vq->used_elems[uelem_idx].in_num +
+vq->used_elems[uelem_idx].out_num <= 0) {
+break;
+}
+uelem.id = vq->used_elems[uelem_idx].index;
+uelem.len = vq->used_elems[uelem_idx].len;
+vring_used_write(vq, &uelem, uelem_idx);
+
+/* Mark this element as used */
+vq->used_elems[uelem_idx].in_num = 0;
+vq->used_elems[uelem_idx].out_num = 0;
+vq->used_seq_idx++;
+}
+} else {
+idx = (idx + vq->used_idx) % vq->vring.num;
+
+uelem.id = elem->index;
+uelem.len = len;
+vring_used_write(vq, &uelem, idx);
+}
 }
 
 static void virtqueue_packed_fill(VirtQueue *vq, const VirtQueueElement *elem,
@@ -918,6 +940,8 @@ static void virtqueue_packed_fill_desc(VirtQueue *vq,
 void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
 unsigned int len, unsigned int idx)
 {
+uint16_t seq_idx;
+
 trace_virtqueue_fill(vq, elem, len, idx);
 
 virtqueue_unmap_sg(vq, elem, len);
@@ -926,6 +950,16 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement 
*elem,
 return;
 }
 
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+seq_idx = elem->seq_idx % vq->vring.num;
+
+vq->used_elems[seq_idx].index = elem->index;
+vq->used_elems[seq_idx].len = elem->len;
+vq->used_elems[seq_idx].ndescs = elem->ndescs;
+vq->used_elems[seq_idx].in_num = elem->in_num;
+vq->used_elems[seq_idx].out_num = elem->out_num;
+}
+
 if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
 virtqueue_packed_fill(vq, elem, len, idx);
 } else {
@@ -944,6 +978,14 @@ static void virtqueue_split_flush(VirtQueue *vq, unsigned 
int count)
 
 /* Make sure buffer is written before we update index. */
 smp_wmb();
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+count = (vq->used_seq_idx - vq->used_idx) % vq->vring.num;
+
+/* No in-order elements were written, nothing to update */
+if (!count) {
+return;
+}
+}
 trace_virtqueue_flush(vq, count);
 old = vq->used_idx;
 new = old + count;
-- 
2.39.3

[RFC v2 3/5] virtio: In-order support for packed VQs

2024-03-28 Thread Jonah Palmer

Implements VIRTIO_F_IN_ORDER feature support for virtio devices using
the packed virtqueue layout.

For a virtio device that has negotiated the VIRTIO_F_IN_ORDER feature
whose virtqueues use a packed virtqueue layout, it's essential that used
VirtQueueElements are written to the descriptor ring in-order.

In the packed virtqueue case, since we already write to the virtqueue's
used_elems array at the start of virtqueue_fill, we don't need to call
virtqueue_packed_fill. Furthermore, due to change in behavior of the
used_elems array and not knowing how many unused in-order elements
exist, separate logic is required for the flushing operation of packed
virtqueues.

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 50 +++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 19d3d43816..dc2eabd18b 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -960,7 +960,8 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement 
*elem,
 vq->used_elems[seq_idx].out_num = elem->out_num;
 }
 
-if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED)) {
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_RING_PACKED) &&
+!virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
 virtqueue_packed_fill(vq, elem, len, idx);
 } else {
 virtqueue_split_fill(vq, elem, len, idx);
@@ -997,18 +998,53 @@ static void virtqueue_split_flush(VirtQueue *vq, unsigned 
int count)
 
 static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
 {
-unsigned int i, ndescs = 0;
+unsigned int i, j, uelem_idx, ndescs = 0;
 
 if (unlikely(!vq->vring.desc)) {
 return;
 }
 
-for (i = 1; i < count; i++) {
-virtqueue_packed_fill_desc(vq, &vq->used_elems[i], i, false);
-ndescs += vq->used_elems[i].ndescs;
+if (virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER)) {
+/* First expected element is used, nothing to do */
+if (vq->used_elems[vq->used_idx].in_num +
+vq->used_elems[vq->used_idx].out_num <= 0) {
+return;
+}
+
+j = vq->used_idx;
+
+for (i = j + 1; ; i++) {
+uelem_idx = i % vq->vring.num;
+
+/* Stop if element has been used */
+if (vq->used_elems[uelem_idx].in_num +
+vq->used_elems[uelem_idx].out_num <= 0) {
+break;
+}
+
+virtqueue_packed_fill_desc(vq, &vq->used_elems[uelem_idx],
+   uelem_idx, false);
+ndescs += vq->used_elems[uelem_idx].ndescs;
+
+/* Mark this element as used */
+vq->used_elems[uelem_idx].in_num = 0;
+vq->used_elems[uelem_idx].out_num = 0;
+}
+
+/* Mark first expected element as used */
+vq->used_elems[vq->used_idx].in_num = 0;
+vq->used_elems[vq->used_idx].out_num = 0;
+} else {
+j = 0;
+
+for (i = 1; i < count; i++) {
+virtqueue_packed_fill_desc(vq, &vq->used_elems[i], i, false);
+ndescs += vq->used_elems[i].ndescs;
+}
 }
-virtqueue_packed_fill_desc(vq, &vq->used_elems[0], 0, true);
-ndescs += vq->used_elems[0].ndescs;
+
+virtqueue_packed_fill_desc(vq, &vq->used_elems[j], j, true);
+ndescs += vq->used_elems[j].ndescs;
 
 vq->inuse -= ndescs;
 vq->used_idx += ndescs;
-- 
2.39.3

[RFC v2 4/5] vhost, vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits

2024-03-28 Thread Jonah Palmer

Add support for the VIRTIO_F_IN_ORDER feature across a variety of vhost
devices.

The inclusion of VIRTIO_F_IN_ORDER in the feature bits arrays for these
devices ensures that the backend is capable of offering and providing
support for this feature, and that it can be disabled if the backend
does not support it.

Acked-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 hw/block/vhost-user-blk.c| 1 +
 hw/net/vhost_net.c   | 2 ++
 hw/scsi/vhost-scsi.c | 1 +
 hw/scsi/vhost-user-scsi.c| 1 +
 hw/virtio/vhost-user-fs.c| 1 +
 hw/virtio/vhost-user-vsock.c | 1 +
 net/vhost-vdpa.c | 1 +
 7 files changed, 8 insertions(+)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 6a856ad51a..d176ed857e 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -51,6 +51,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index fd1a93701a..eb0b1c06e5 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -48,6 +48,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_HASH_REPORT,
 VHOST_INVALID_FEATURE_BIT
 };
@@ -76,6 +77,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_RSS,
 VIRTIO_NET_F_HASH_REPORT,
 VIRTIO_NET_F_GUEST_USO4,
diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index ae26bc19a4..40e7630191 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -38,6 +38,7 @@ static const int kernel_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c
index a63b1f4948..1d59951ab7 100644
--- a/hw/scsi/vhost-user-scsi.c
+++ b/hw/scsi/vhost-user-scsi.c
@@ -36,6 +36,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_SCSI_F_HOTPLUG,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index cca2cd41be..9243dbb128 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -33,6 +33,7 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_IOMMU_PLATFORM,
 VIRTIO_F_RING_RESET,
+VIRTIO_F_IN_ORDER,
 
 VHOST_INVALID_FEATURE_BIT
 };
diff --git a/hw/virtio/vhost-user-vsock.c b/hw/virtio/vhost-user-vsock.c
index 9431b9792c..cc7e4e47b4 100644
--- a/hw/virtio/vhost-user-vsock.c
+++ b/hw/virtio/vhost-user-vsock.c
@@ -21,6 +21,7 @@ static const int user_feature_bits[] = {
 VIRTIO_RING_F_INDIRECT_DESC,
 VIRTIO_RING_F_EVENT_IDX,
 VIRTIO_F_NOTIFY_ON_EMPTY,
+VIRTIO_F_IN_ORDER,
 VHOST_INVALID_FEATURE_BIT
 };
 
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 85e73dd6a7..ed3185acfa 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -62,6 +62,7 @@ const int vdpa_feature_bits[] = {
 VIRTIO_F_RING_PACKED,
 VIRTIO_F_RING_RESET,
 VIRTIO_F_VERSION_1,
+VIRTIO_F_IN_ORDER,
 VIRTIO_NET_F_CSUM,
 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,
 VIRTIO_NET_F_CTRL_MAC_ADDR,
-- 
2.39.3

[RFC v2 1/5] virtio: Initialize sequence variables

2024-03-28 Thread Jonah Palmer

Initialize sequence variables for VirtQueue and VirtQueueElement
structures. A VirtQueue's sequence variables are initialized when a
VirtQueue is being created or reset. A VirtQueueElement's sequence
variable is initialized when a VirtQueueElement is being initialized.
These variables will be used to support the VIRTIO_F_IN_ORDER feature.

A VirtQueue's used_seq_idx represents the next expected index in a
sequence of VirtQueueElements to be processed (put on the used ring).
The next VirtQueueElement added to the used ring must match this
sequence number before additional elements can be safely added to the
used ring. It's also particularly useful for helping find the number of
new elements added to the used ring.

A VirtQueue's current_seq_idx represents the current sequence index.
This value is essentially a counter where the value is assigned to a new
VirtQueueElement and then incremented. Given its uint16_t type, this
sequence number can be between 0 and 65,535.

A VirtQueueElement's seq_idx represents the sequence number assigned to
the VirtQueueElement when it was created. This value must match with the
VirtQueue's used_seq_idx before the element can be put on the used ring
by the device.

Signed-off-by: Jonah Palmer 
---
 hw/virtio/virtio.c | 18 ++
 include/hw/virtio/virtio.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index fb6b4ccd83..069d96df99 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -132,6 +132,10 @@ struct VirtQueue
 uint16_t used_idx;
 bool used_wrap_counter;
 
+/* In-Order sequence indices */
+uint16_t used_seq_idx;
+uint16_t current_seq_idx;
+
 /* Last used index value we have signalled on */
 uint16_t signalled_used;
 
@@ -1621,6 +1625,11 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t 
sz)
 elem->in_sg[i] = iov[out_num + i];
 }
 
+/* Assign sequence index for in-order processing */
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+elem->seq_idx = vq->current_seq_idx++;
+}
+
 vq->inuse++;
 
 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
@@ -1760,6 +1769,11 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
 vq->shadow_avail_idx = vq->last_avail_idx;
 vq->shadow_avail_wrap_counter = vq->last_avail_wrap_counter;
 
+/* Assign sequence index for in-order processing */
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+elem->seq_idx = vq->current_seq_idx++;
+}
+
 trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
 done:
 address_space_cache_destroy(&indirect_desc_cache);
@@ -2087,6 +2101,8 @@ static void __virtio_queue_reset(VirtIODevice *vdev, 
uint32_t i)
 vdev->vq[i].notification = true;
 vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
 vdev->vq[i].inuse = 0;
+vdev->vq[i].used_seq_idx = 0;
+vdev->vq[i].current_seq_idx = 0;
 virtio_virtqueue_reset_region_cache(&vdev->vq[i]);
 }
 
@@ -2334,6 +2350,8 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int 
queue_size,
 vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
 vdev->vq[i].handle_output = handle_output;
 vdev->vq[i].used_elems = g_new0(VirtQueueElement, queue_size);
+vdev->vq[i].used_seq_idx = 0;
+vdev->vq[i].current_seq_idx = 0;
 
 return &vdev->vq[i];
 }
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index b3c74a1bca..910b2a3427 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -75,6 +75,7 @@ typedef struct VirtQueueElement
 hwaddr *out_addr;
 struct iovec *in_sg;
 struct iovec *out_sg;
+uint16_t seq_idx;
 } VirtQueueElement;
 
 #define VIRTIO_QUEUE_MAX 1024
-- 
2.39.3

[RFC v2 5/5] virtio: Add VIRTIO_F_IN_ORDER property definition

2024-03-28 Thread Jonah Palmer

Extend the virtio device property definitions to include the
VIRTIO_F_IN_ORDER feature.

The default state of this feature is disabled, allowing it to be
explicitly enabled where it's supported.

Acked-by: Eugenio Pérez 
Signed-off-by: Jonah Palmer 
---
 include/hw/virtio/virtio.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 910b2a3427..dd0ba6e57f 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -385,7 +385,9 @@ typedef struct VirtIORNGConf VirtIORNGConf;
 DEFINE_PROP_BIT64("packed", _state, _field, \
   VIRTIO_F_RING_PACKED, false), \
 DEFINE_PROP_BIT64("queue_reset", _state, _field, \
-  VIRTIO_F_RING_RESET, true)
+  VIRTIO_F_RING_RESET, true), \
+DEFINE_PROP_BIT64("in_order", _state, _field, \
+  VIRTIO_F_IN_ORDER, false)
 
 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n);
 bool virtio_queue_enabled_legacy(VirtIODevice *vdev, int n);
-- 
2.39.3

Re: [PATCH v2 6/6] tests/qtest: Add tests for the STM32L4x5 USART

2024-03-28 Thread Peter Maydell

On Mon, 25 Mar 2024 at 06:19, Thomas Huth  wrote:
> We are now using timeouts from the meson test harness in meson.build, too,
> see the slow_qtests[] at the beginning of that file.
> You seem to be using a 10 minutes timeout in your test below
> (usart_wait_for_flag() function), but you didn't adjust the meson timeout
> setting in meson.build, so this does not quite match...
> How long does your test take on a very loaded machine (with --enable-debug
> used)? If it could take more than 30 seconds, you need to adjust the timeout
> in meson.build, too. If it is running very fast, you should likely adjust
> the 10 minutes timeout in usart_wait_for_flag() to < 30 seconds instead to
> match the meson timeout setting.

I'd forgotten about the meson harness timeout.

tests/qtest/microbit-test.c also has a 10 minute timeout but
isn't listed as a "slow qtest" (that's the pattern I suggested
Arnaud follow for this test).

If the meson test harness now handles timeouts, should we write
this kind of test to not have a timeout at all, so it simply
waits indefinitely for the UART to become ready after writing
data to the socket connected to the chardev? Or are there
scenarios where the test gets run but not via the meson harness
and where we would want to still have a timeout?
(For running the test executable by hand for debugging I think
hanging indefinitely is fine and arguably more helpful than
timing out and stopping.)

thanks
-- PMM

Re: [PATCH v2 0/6] hw/char: Implement the STM32L4x5 USART, UART and LPUART

2024-03-28 Thread Peter Maydell

On Sun, 24 Mar 2024 at 16:56, Arnaud Minier
 wrote:
>
> This patch adds the STM32L4x5 USART
> (Universal Synchronous/Asynchronous Receiver/Transmitter)
> device and is part of a series implementing the
> STM32L4x5 with a few peripherals.
>
> It implements the necessary functionalities to receive/send
> characters over the serial port, which are useful to
> communicate with the program currently running.
>
> Many thanks Peter for your review, I think I addressed almost
> everything.
> I'm just unsure about how to handle the waiting time in the tests.
> I understand your concerns about the unreliability of using the wallclock
> time but I don't understand how using clock_step() would make it
> more reliable. We will always be waiting on something
> that is out of our control (i.e. the OS).
> I increased the delay from 5s to 10min to match the microbit test
> and added a comment (I paraphrased your comment, is that okay ?).

I think I was slightly confused between two things. For
a lot of qtests we do want to use clock_step() and not
have wallclock-based delays, but we can only do that where
the thing we're waiting for is purely simulation time
(i.e. where we triggered a change via a qtest write and
then want to look for the result via a qtest read).
Where we're triggering something via a different OS
pathway (e.g. here where we write to the socket that's
backing the chardev connected to the UART and then look
at the UART registers) we do need a wallclock delay.

I recommend you follow Thomas's suggestions about timeouts
in his comments on patch 6; I'd forgotten we have a
meson timeout now too.

thanks
-- PMM

Re: [PATCH v2 5/6] hw/arm: Add the USART to the stm32l4x5 SoC

2024-03-28 Thread Peter Maydell

On Sun, 24 Mar 2024 at 16:57, Arnaud Minier
 wrote:
>
> Add the USART to the SoC and connect it to the other implemented devices.
>
> Signed-off-by: Arnaud Minier 
> Signed-off-by: Inès Varhol 
> ---
>  docs/system/arm/b-l475e-iot01a.rst |  2 +-
>  hw/arm/Kconfig |  1 +
>  hw/arm/stm32l4x5_soc.c | 82 +++---
>  include/hw/arm/stm32l4x5_soc.h | 13 +
>  4 files changed, 91 insertions(+), 7 deletions(-)
>
> diff --git a/docs/system/arm/b-l475e-iot01a.rst 
> b/docs/system/arm/b-l475e-iot01a.rst
> index 0afef8e4f4..a76c9976c5 100644
> --- a/docs/system/arm/b-l475e-iot01a.rst
> +++ b/docs/system/arm/b-l475e-iot01a.rst
> @@ -19,13 +19,13 @@ Currently B-L475E-IOT01A machine's only supports the 
> following devices:
>  - STM32L4x5 SYSCFG (System configuration controller)
>  - STM32L4x5 RCC (Reset and clock control)
>  - STM32L4x5 GPIOs (General-purpose I/Os)
> +- STM32L4x5 USARTs, UARTs and LPUART (Serial ports)
>
>  Missing devices
>  """
>
>  The B-L475E-IOT01A does *not* support the following devices:
>
> -- Serial ports (UART)
>  - Analog to Digital Converter (ADC)
>  - SPI controller
>  - Timer controller (TIMER)
> diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
> index 893a7bff66..098d043375 100644
> --- a/hw/arm/Kconfig
> +++ b/hw/arm/Kconfig
> @@ -477,6 +477,7 @@ config STM32L4X5_SOC
>  select STM32L4X5_SYSCFG
>  select STM32L4X5_RCC
>  select STM32L4X5_GPIO
> +select STM32L4X5_USART
>
>  config XLNX_ZYNQMP_ARM
>  bool
> diff --git a/hw/arm/stm32l4x5_soc.c b/hw/arm/stm32l4x5_soc.c
> index 40e294f838..ae0868dcab 100644
> --- a/hw/arm/stm32l4x5_soc.c
> +++ b/hw/arm/stm32l4x5_soc.c
> @@ -28,6 +28,7 @@
>  #include "sysemu/sysemu.h"
>  #include "hw/or-irq.h"
>  #include "hw/arm/stm32l4x5_soc.h"
> +#include "hw/char/stm32l4x5_usart.h"
>  #include "hw/gpio/stm32l4x5_gpio.h"
>  #include "hw/qdev-clock.h"
>  #include "hw/misc/unimp.h"
> @@ -116,6 +117,22 @@ static const struct {
>  { 0x48001C00, 0x000F, 0x, 0x },
>  };
>
> +static const hwaddr usart_addr[] = {
> +0x40013800, /* "USART1", 0x400 */
> +0x40004400, /* "USART2", 0x400 */
> +0x40004800, /* "USART3", 0x400 */
> +};
> +static const hwaddr uart_addr[] = {
> +0x40004C00, /* "UART4" , 0x400 */
> +0x40005000  /* "UART5" , 0x400 */
> +};
> +
> +#define LPUART_BASE_ADDRESS 0x40008000
> +
> +static const int usart_irq[] = { 37, 38, 39 };
> +static const int uart_irq[] = { 52, 53 };
> +#define LPUART_IRQ 70
> +
>  static void stm32l4x5_soc_initfn(Object *obj)
>  {
>  Stm32l4x5SocState *s = STM32L4X5_SOC(obj);
> @@ -132,6 +149,18 @@ static void stm32l4x5_soc_initfn(Object *obj)
>  g_autofree char *name = g_strdup_printf("gpio%c", 'a' + i);
>  object_initialize_child(obj, name, &s->gpio[i], TYPE_STM32L4X5_GPIO);
>  }
> +
> +for (int i = 0; i < STM_NUM_USARTS; i++) {
> +object_initialize_child(obj, "usart[*]", &s->usart[i],
> +TYPE_STM32L4X5_USART);
> +}
> +
> +for (int i = 0; i < STM_NUM_UARTS; i++) {
> +object_initialize_child(obj, "uart[*]", &s->uart[i],
> +TYPE_STM32L4X5_UART);
> +}
> +object_initialize_child(obj, "lpuart1", &s->lpuart,
> +TYPE_STM32L4X5_LPUART);
>  }
>
>  static void stm32l4x5_soc_realize(DeviceState *dev_soc, Error **errp)
> @@ -279,6 +308,53 @@ static void stm32l4x5_soc_realize(DeviceState *dev_soc, 
> Error **errp)
>  sysbus_mmio_map(busdev, 0, RCC_BASE_ADDRESS);
>  sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, RCC_IRQ));
>
> +/* USART devices */
> +for (int i = 0; i < STM_NUM_USARTS; i++) {
> +dev = DEVICE(&(s->usart[i]));
> +qdev_prop_set_chr(dev, "chardev", serial_hd(i));
> +g_autofree char *name = g_strdup_printf("usart%d-out", i + 1);

Variable declarations should always be at the start of a code block.
Similarly below.

> +qdev_connect_clock_in(dev, "clk",
> +qdev_get_clock_out(DEVICE(&(s->rcc)), name));
> +busdev = SYS_BUS_DEVICE(dev);
> +if (!sysbus_realize(busdev, errp)) {
> +return;
> +}
> +sysbus_mmio_map(busdev, 0, usart_addr[i]);
> +sysbus_connect_irq(busdev, 0, qdev_get_gpio_in(armv7m, 
> usart_irq[i]));
> +}
> +
> +/*
> + * TODO: Connect the USARTs, UARTs and LPUART to the EXTI once the EXTI
> + * can handle other gpio-in than the gpios. (e.g. Direct Lines for the 
> usarts)
> + */
> +
> +/* UART devices */
> +for (int i = 0; i < STM_NUM_UARTS; i++) {
> +dev = DEVICE(&(s->uart[i]));
> +qdev_prop_set_chr(dev, "chardev", serial_hd(STM_NUM_USARTS + i));
> +g_autofree char *name = g_strdup_printf("uart%d-out", STM_NUM_USARTS 
> + i + 1);
> +qdev_connect_clock_in(dev, "clk",
> +qdev_get_clock_out(DEVICE(&(s->rcc)), name));
> +busdev = SYS_BUS_DEVICE(dev)

Re: [PATCH v2 4/6] hw/char/stm32l4x5_usart: Add options for serial parameters setting

2024-03-28 Thread Peter Maydell

On Sun, 24 Mar 2024 at 16:57, Arnaud Minier
 wrote:
>
> Add a function to change the settings of the
> serial connection.
>
> Signed-off-by: Arnaud Minier 
> Signed-off-by: Inès Varhol 
> ---
>  hw/char/stm32l4x5_usart.c | 98 +++
>  hw/char/trace-events  |  1 +
>  2 files changed, 99 insertions(+)
>
> diff --git a/hw/char/stm32l4x5_usart.c b/hw/char/stm32l4x5_usart.c
> index ec8c2f6e63..b4d11dd826 100644
> --- a/hw/char/stm32l4x5_usart.c
> +++ b/hw/char/stm32l4x5_usart.c
> @@ -267,6 +267,92 @@ static void 
> usart_cancel_transmit(Stm32l4x5UsartBaseState *s)
>  }
>  }
>
> +static void stm32l4x5_update_params(Stm32l4x5UsartBaseState *s)
> +{
> +int speed, parity, data_bits, stop_bits;
> +uint32_t value, usart_div;
> +QEMUSerialSetParams ssp;
> +
> +/* Select the parity type */
> +if (s->cr1 & R_CR1_PCE_MASK) {
> +if (s->cr1 & R_CR1_PS_MASK) {
> +parity = 'O';
> +} else {
> +parity = 'E';
> +}
> +} else {
> +parity = 'N';
> +}
> +
> +/* Select the number of stop bits */
> +switch (FIELD_EX32(s->cr2, CR2, STOP)) {
> +case 0:
> +stop_bits = 1;
> +break;
> +case 2:
> +stop_bits = 2;
> +break;
> +default:
> +qemu_log_mask(LOG_UNIMP,
> +"UNIMPLEMENTED: fractionnal stop bits; CR2[13:12] = %x",

%x without a leading 0x is a bit odd. In this case since
the possible values are 0-3 it doesn't make a difference,
but maybe better to use %u ?

> +FIELD_EX32(s->cr2, CR2, STOP));
> +return;
> +}
> +
> +/* Select the length of the word */
> +switch ((FIELD_EX32(s->cr1, CR1, M1) << 1) | FIELD_EX32(s->cr1, CR1, 
> M0)) {
> +case 0:
> +data_bits = 8;
> +break;
> +case 1:
> +data_bits = 9;
> +break;
> +case 2:
> +data_bits = 7;
> +break;
> +default:
> +qemu_log_mask(LOG_GUEST_ERROR,
> +"UNDEFINED: invalid word length, CR1.M = 0b11");
> +return;
> +}
> +
> +/* Select the baud rate */
> +value = FIELD_EX32(s->brr, BRR, BRR);
> +if (value < 16) {
> +qemu_log_mask(LOG_GUEST_ERROR,
> +"UNDEFINED: BRR lesser than 16: %u", value);

"less than"

> +return;
> +}
> +
> +if (FIELD_EX32(s->cr1, CR1, OVER8) == 0) {
> +/*
> + * Oversampling by 16
> + * BRR = USARTDIV
> + */
> +usart_div = value;
> +} else {
> +/*
> + * Oversampling by 8
> + * - BRR[2:0] = USARTDIV[3:0] shifted 1 bit to the right.
> + * - BRR[3] must be kept cleared.
> + * - BRR[15:4] = USARTDIV[15:4]
> + * - The frequency is multiplied by 2
> + */
> +usart_div = ((value & 0xFFF0) | ((value & 0x0007) << 1)) / 2;
> +}
> +
> +speed = clock_get_hz(s->clk) / usart_div;
> +
> +ssp.speed = speed;
> +ssp.parity= parity;
> +ssp.data_bits = data_bits;
> +ssp.stop_bits = stop_bits;
> +
> +qemu_chr_fe_ioctl(&s->chr, CHR_IOCTL_SERIAL_SET_PARAMS, &ssp);
> +
> +trace_stm32l4x5_usart_update_params(speed, parity, data_bits, stop_bits);
> +}

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM

[RFC PATCH-for-9.1 13/29] hw/i386/pc: Remove non-PCI code from pc_system_firmware_init()

2024-03-28 Thread Philippe Mathieu-Daudé

x86_bios_rom_init() is the single non-PCI-machine call
from pc_system_firmware_init(). Extract it to the caller.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc.c   | 6 +-
 hw/i386/pc_sysfw.c | 5 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f184808e3e..5b96daa414 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -956,7 +956,11 @@ void pc_memory_init(PCMachineState *pcms,
 }
 
 /* Initialize PC system firmware */
-pc_system_firmware_init(pcms, rom_memory);
+if (pci_enabled) {
+pc_system_firmware_init(pcms, rom_memory);
+} else {
+x86_bios_rom_init(machine, "bios.bin", rom_memory, true);
+}
 
 option_rom_mr = g_malloc(sizeof(*option_rom_mr));
 memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE,
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 862a082b0a..541dcaef71 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -202,10 +202,7 @@ void pc_system_firmware_init(PCMachineState *pcms,
 int i;
 BlockBackend *pflash_blk[ARRAY_SIZE(pcms->flash)];
 
-if (!pc_machine_is_pci_enabled(pcms)) {
-x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, true);
-return;
-}
+assert(pc_machine_is_pci_enabled(pcms));
 
 /* Map legacy -drive if=pflash to machine properties */
 for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
-- 
2.41.0

Re: [PATCH v2 3/6] hw/char/stm32l4x5_usart: Enable serial read and write

2024-03-28 Thread Peter Maydell

On Sun, 24 Mar 2024 at 16:57, Arnaud Minier
 wrote:
>
> Implement the ability to read and write characters to the
> usart using the serial port.
>
> The character transmission is based on the
> cmsdk-apb-uart implementation.
>
> Signed-off-by: Arnaud Minier 
> Signed-off-by: Inès Varhol 


> +/* Try to send tx data, and arrange to be called back later if
> + * we can't (ie the char backend is busy/blocking).
> + */

Coding style wants the opening "/*" on a line of its own.

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM

[RFC PATCH-for-9.1 08/29] hw/i386/pc: Move CXLState to PcPciMachineState

2024-03-28 Thread Philippe Mathieu-Daudé

CXL depends on PCIe, which isn't available on non-PCI
machines such as the ISA-only PC one.
Move CXLState to PcPciMachineState, and move the CXL
specific calls to pc_pci_machine_initfn() and
pc_pci_machine_done().

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/i386/pc.h |  3 ++-
 hw/i386/acpi-build.c | 14 +++---
 hw/i386/pc.c | 39 ---
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 24c8e17e62..a97493d29d 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -54,13 +54,14 @@ typedef struct PCMachineState {
 hwaddr memhp_io_base;
 
 SGXEPCState sgx_epc;
-CXLState cxl_devices_state;
 } PCMachineState;
 
 typedef struct PcPciMachineState {
 PCMachineState parent_obj;
 
 Notifier machine_done;
+
+CXLState cxl_devices_state;
 } PcPciMachineState;
 
 #define PC_MACHINE_ACPI_DEVICE_PROP "acpi-device"
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index b9890886f6..6e8e32e5d2 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -1554,6 +1554,11 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
 crs_range_set_init(&crs_range_set);
 bus = PC_MACHINE(machine)->pcibus;
 if (bus) {
+PcPciMachineState *ppms;
+
+assert(pc_machine_is_pci_enabled(pcms));
+ppms = PC_PCI_MACHINE(machine);
+
 QLIST_FOREACH(bus, &bus->child, sibling) {
 uint8_t bus_num = pci_bus_num(bus);
 uint8_t numa_node = pci_bus_numa_node(bus);
@@ -1607,7 +1612,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker,
 
 /* Handle the ranges for the PXB expanders */
 if (pci_bus_is_cxl(bus)) {
-MemoryRegion *mr = &pcms->cxl_devices_state.host_mr;
+MemoryRegion *mr = &ppms->cxl_devices_state.host_mr;
 uint64_t base = mr->addr;
 
 cxl_present = true;
@@ -2513,6 +2518,8 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
 Object *vmgenid_dev;
 char *oem_id;
 char *oem_table_id;
+bool pci_enabled = pc_machine_is_pci_enabled(pcms);
+PcPciMachineState *ppms = pci_enabled ? PC_PCI_MACHINE(pcms) : NULL;
 
 acpi_get_pm_info(machine, &pm);
 acpi_get_misc_info(&misc);
@@ -2640,9 +2647,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
   machine->nvdimms_state, machine->ram_slots,
   x86ms->oem_id, x86ms->oem_table_id);
 }
-if (pcms->cxl_devices_state.is_enabled) {
+if (ppms && ppms->cxl_devices_state.is_enabled) {
 cxl_build_cedt(table_offsets, tables_blob, tables->linker,
-   x86ms->oem_id, x86ms->oem_table_id, 
&pcms->cxl_devices_state);
+   x86ms->oem_id, x86ms->oem_table_id,
+   &ppms->cxl_devices_state);
 }
 
 acpi_add_table(table_offsets, tables_blob);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f9226f7115..6d87d1d4c2 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -598,13 +598,6 @@ void pc_machine_done(Notifier *notifier, void *data)
 PCMachineState, machine_done);
 X86MachineState *x86ms = X86_MACHINE(pcms);
 
-cxl_hook_up_pxb_registers(pcms->pcibus, &pcms->cxl_devices_state,
-  &error_fatal);
-
-if (pcms->cxl_devices_state.is_enabled) {
-cxl_fmws_link_targets(&pcms->cxl_devices_state, &error_fatal);
-}
-
 /* set the number of CPUs */
 x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 
@@ -626,6 +619,13 @@ static void pc_pci_machine_done(Notifier *notifier, void 
*data)
 PCMachineState *pcms = PC_MACHINE(ppms);
 X86MachineState *x86ms = X86_MACHINE(pcms);
 
+cxl_hook_up_pxb_registers(pcms->pcibus, &ppms->cxl_devices_state,
+  &error_fatal);
+
+if (ppms->cxl_devices_state.is_enabled) {
+cxl_fmws_link_targets(&ppms->cxl_devices_state, &error_fatal);
+}
+
 fw_cfg_add_extra_pci_roots(pcms->pcibus, x86ms->fw_cfg);
 }
 
@@ -719,13 +719,14 @@ static uint64_t pc_get_cxl_range_start(PCMachineState 
*pcms)
 
 static uint64_t pc_get_cxl_range_end(PCMachineState *pcms)
 {
+PcPciMachineState *ppms = PC_PCI_MACHINE(pcms);
 uint64_t start = pc_get_cxl_range_start(pcms) + MiB;
 
-if (pcms->cxl_devices_state.fixed_windows) {
+if (ppms->cxl_devices_state.fixed_windows) {
 GList *it;
 
 start = ROUND_UP(start, 256 * MiB);
-for (it = pcms->cxl_devices_state.fixed_windows; it; it = it->next) {
+for (it = ppms->cxl_devices_state.fixed_windows; it; it = it->next) {
 CXLFixedWindow *fw = it->data;
 start += fw->size;
 }
@@ -823,6 +824,7 @@ void pc_memory_init(PCMachineState *pcms,
 hwaddr cxl_base, cxl_resv_end = 0;
 X86CPU *cpu = X86_CPU(first_cpu);
 bool pci_enabled = pc_machine_is_pci_enabled(pc

Re: [PATCH v2 2/6] hw/char: Implement STM32L4x5 USART skeleton

2024-03-28 Thread Peter Maydell

On Sun, 24 Mar 2024 at 16:56, Arnaud Minier
 wrote:
>
> Add the basic infrastructure (register read/write, type...)
> to implement the STM32L4x5 USART.
>
> Also create different types for the USART, UART and LPUART
> of the STM32L4x5 to deduplicate code and enable the
> implementation of different behaviors depending on the type.
>
> Signed-off-by: Arnaud Minier 
> Signed-off-by: Inès Varhol 
> ---
>  MAINTAINERS   |   1 +
>  hw/char/Kconfig   |   3 +
>  hw/char/meson.build   |   1 +
>  hw/char/stm32l4x5_usart.c | 395 ++
>  hw/char/trace-events  |   4 +
>  include/hw/char/stm32l4x5_usart.h |  66 +
>  6 files changed, 470 insertions(+)
>  create mode 100644 hw/char/stm32l4x5_usart.c
>  create mode 100644 include/hw/char/stm32l4x5_usart.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 409d7db4d4..deba4a54ce 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1128,6 +1128,7 @@ M: Inès Varhol 
>  L: qemu-...@nongnu.org
>  S: Maintained
>  F: hw/arm/stm32l4x5_soc.c
> +F: hw/char/stm32l4x5_usart.c
>  F: hw/misc/stm32l4x5_exti.c
>  F: hw/misc/stm32l4x5_syscfg.c
>  F: hw/misc/stm32l4x5_rcc.c
> diff --git a/hw/char/Kconfig b/hw/char/Kconfig
> index 6b6cf2fc1d..4fd74ea878 100644
> --- a/hw/char/Kconfig
> +++ b/hw/char/Kconfig
> @@ -41,6 +41,9 @@ config VIRTIO_SERIAL
>  config STM32F2XX_USART
>  bool
>
> +config STM32L4X5_USART
> +bool
> +
>  config CMSDK_APB_UART
>  bool
>
> diff --git a/hw/char/meson.build b/hw/char/meson.build
> index 006d20f1e2..e5b13b6958 100644
> --- a/hw/char/meson.build
> +++ b/hw/char/meson.build
> @@ -31,6 +31,7 @@ system_ss.add(when: 'CONFIG_RENESAS_SCI', if_true: 
> files('renesas_sci.c'))
>  system_ss.add(when: 'CONFIG_SIFIVE_UART', if_true: files('sifive_uart.c'))
>  system_ss.add(when: 'CONFIG_SH_SCI', if_true: files('sh_serial.c'))
>  system_ss.add(when: 'CONFIG_STM32F2XX_USART', if_true: 
> files('stm32f2xx_usart.c'))
> +system_ss.add(when: 'CONFIG_STM32L4X5_USART', if_true: 
> files('stm32l4x5_usart.c'))
>  system_ss.add(when: 'CONFIG_MCHP_PFSOC_MMUART', if_true: 
> files('mchp_pfsoc_mmuart.c'))
>  system_ss.add(when: 'CONFIG_HTIF', if_true: files('riscv_htif.c'))
>  system_ss.add(when: 'CONFIG_GOLDFISH_TTY', if_true: files('goldfish_tty.c'))
> diff --git a/hw/char/stm32l4x5_usart.c b/hw/char/stm32l4x5_usart.c
> new file mode 100644
> index 00..46e69bb096
> --- /dev/null
> +++ b/hw/char/stm32l4x5_usart.c
> @@ -0,0 +1,395 @@
> +/*
> + * STM32L4X5 USART (Universal Synchronous Asynchronous Receiver Transmitter)
> + *
> + * Copyright (c) 2023 Arnaud Minier 
> + * Copyright (c) 2023 Inès Varhol 
> + *
> + * SPDX-License-Identifier: GPL-2.0-or-later
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + * The STM32L4X5 USART is heavily inspired by the stm32f2xx_usart
> + * by Alistair Francis.
> + * The reference used is the STMicroElectronics RM0351 Reference manual
> + * for STM32L4x5 and STM32L4x6 advanced Arm ® -based 32-bit MCUs.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/log.h"
> +#include "qemu/module.h"
> +#include "qapi/error.h"
> +#include "chardev/char-fe.h"
> +#include "chardev/char-serial.h"
> +#include "migration/vmstate.h"
> +#include "hw/char/stm32l4x5_usart.h"
> +#include "hw/clock.h"
> +#include "hw/irq.h"
> +#include "hw/qdev-clock.h"
> +#include "hw/qdev-properties.h"
> +#include "hw/qdev-properties-system.h"
> +#include "hw/registerfields.h"
> +#include "trace.h"
> +
> +
> +REG32(CR1, 0x00)
> +FIELD(CR1, M1, 28, 1)/* Word length (part 2, see M0)*/

Missing space before "*/"

> +static const TypeInfo stm32l4x5_usart_types[] = {
> +{
> +.name   = TYPE_STM32L4X5_USART_BASE,
> +.parent = TYPE_SYS_BUS_DEVICE,
> +.instance_size  = sizeof(Stm32l4x5UsartBaseState),
> +.instance_init  = stm32l4x5_usart_base_init,
> +.class_init = stm32l4x5_usart_base_class_init,

This should also have
.abstract = true,

so you can't create an instance of this class, only of
the specific subclasses.

> +}, {
> +.name   = TYPE_STM32L4X5_USART,
> +.parent = TYPE_STM32L4X5_USART_BASE,
> +.class_init = stm32l4x5_usart_class_init,
> +}, {
> +.name   = TYPE_STM32L4X5_UART,
> +.parent = TYPE_STM32L4X5_USART_BASE,
> +.class_init = stm32l4x5_uart_class_init,
> +}, {
> +.name   = TYPE_STM32L4X5_LPUART,
> +.parent = TYPE_STM32L4X5_USART_BASE,
> +.class_init = stm32l4x5_lpuart_class_init,
> +}
> +};
> +

Otherwise
Reviewed-by: Peter Maydell 

thanks
-- PMM

[RFC PATCH-for-9.1 28/29] hw/i386/pc: Rename pc_init1() -> pc_piix_init()

2024-03-28 Thread Philippe Mathieu-Daudé

pc_init1() is specific to the isapc and i440fx/piix machines,
rename it as pc_piix_init(). Expose it in "hw/i386/pc.h" to
be able to call it externally (see next patch).

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/i386/pc.h | 1 +
 hw/i386/pc_piix.c| 8 
 hw/isa/piix.c| 2 +-
 hw/pci-host/i440fx.c | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 6510914803..9a11835b7e 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -163,6 +163,7 @@ void pc_basic_device_init(struct PCMachineState *pcms,
   bool create_fdctrl,
   uint32_t hpet_irqs);
 void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus);
+void pc_piix_init(MachineState *machine, const char *pci_type);
 
 void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
 
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4f07476cfa..4a3ae72fe4 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -111,7 +111,7 @@ static bool gigabyte_align(PCMachineState *pcms)
 }
 
 /* PC hardware initialisation */
-static void pc_init1(MachineState *machine, const char *pci_type)
+void pc_piix_init(MachineState *machine, const char *pci_type)
 {
 PCMachineState *pcms = PC_MACHINE(machine);
 PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
@@ -437,7 +437,7 @@ static void pc_set_south_bridge(Object *obj, int value, 
Error **errp)
 #ifdef CONFIG_ISAPC
 static void pc_init_isa(MachineState *machine)
 {
-pc_init1(machine, NULL);
+pc_piix_init(machine, NULL);
 }
 #endif
 
@@ -447,7 +447,7 @@ static void pc_xen_hvm_init_pci(MachineState *machine)
 const char *pci_type = xen_igd_gfx_pt_enabled() ?
 TYPE_IGD_PASSTHROUGH_I440FX_PCI_DEVICE : 
TYPE_I440FX_PCI_DEVICE;
 
-pc_init1(machine, pci_type);
+pc_piix_init(machine, pci_type);
 }
 
 static void pc_xen_hvm_init(MachineState *machine)
@@ -472,7 +472,7 @@ static void pc_xen_hvm_init(MachineState *machine)
 if (compat) { \
 compat(machine); \
 } \
-pc_init1(machine, TYPE_I440FX_PCI_DEVICE); \
+pc_piix_init(machine, TYPE_I440FX_PCI_DEVICE); \
 } \
 DEFINE_PC_MACHINE(suffix, name, pc_init_##suffix, optionfn, \
   TYPE_PC_PCI_MACHINE)
diff --git a/hw/isa/piix.c b/hw/isa/piix.c
index 2d30711b17..14dc9e78be 100644
--- a/hw/isa/piix.c
+++ b/hw/isa/piix.c
@@ -432,7 +432,7 @@ static void pci_piix_class_init(ObjectClass *klass, void 
*data)
 k->class_id = PCI_CLASS_BRIDGE_ISA;
 /*
  * Reason: part of PIIX southbridge, needs to be wired up by e.g.
- * pc_piix.c's pc_init1()
+ * pc_piix.c's pc_piix_init()
  */
 dc->user_creatable = false;
 device_class_set_props(dc, pci_piix_props);
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index add99e4f76..9f47d5507a 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -374,7 +374,7 @@ static void i440fx_pcihost_class_init(ObjectClass *klass, 
void *data)
 dc->realize = i440fx_pcihost_realize;
 dc->fw_name = "pci";
 device_class_set_props(dc, i440fx_props);
-/* Reason: needs to be wired up by pc_init1 */
+/* Reason: needs to be wired up by pc_piix_init */
 dc->user_creatable = false;
 
 object_class_property_add(klass, PCI_HOST_PROP_PCI_HOLE_START, "uint32",
-- 
2.41.0

[RFC PATCH-for-9.1 15/29] hw/i386/pc: Move FW/pflash related fields to PcPciMachineState

2024-03-28 Thread Philippe Mathieu-Daudé

Only PCI-based machines use the set of parallel flash devices.
Move the fields from PCMachineState to PcPciMachineState.
Directly pass a PcPciMachineState argument to the
pc_system_flash/fw methods.

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/i386/pc.h | 10 
 hw/i386/pc.c | 25 +-
 hw/i386/pc_piix.c|  3 ++-
 hw/i386/pc_sysfw.c   | 60 +++-
 4 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 67f8f4730b..668347c248 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -30,7 +30,6 @@ typedef struct PCMachineState {
 /* Pointers to devices and objects: */
 PCIBus *pcibus;
 I2CBus *smbus;
-PFlashCFI01 *flash[2];
 ISADevice *pcspk;
 DeviceState *iommu;
 BusState *idebus[MAX_IDE_BUS];
@@ -47,7 +46,6 @@ typedef struct PCMachineState {
 bool i8042_enabled;
 bool default_bus_bypass_iommu;
 bool fd_bootchk;
-uint64_t max_fw_size;
 
 /* ACPI Memory hotplug IO base address */
 hwaddr memhp_io_base;
@@ -61,7 +59,9 @@ typedef struct PcPciMachineState {
 Notifier machine_done;
 
 bool acpi_build_enabled;
+uint64_t max_fw_size;
 
+PFlashCFI01 *flash[2];
 CXLState cxl_devices_state;
 } PcPciMachineState;
 
@@ -184,9 +184,9 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
 #define TYPE_PORT92 "port92"
 
 /* pc_sysfw.c */
-void pc_system_flash_create(PCMachineState *pcms);
-void pc_system_flash_cleanup_unused(PCMachineState *pcms);
-void pc_system_firmware_init(PCMachineState *pcms, MemoryRegion *rom_memory);
+void pc_system_flash_create(PcPciMachineState *ppms);
+void pc_system_flash_cleanup_unused(PcPciMachineState *ppms);
+void pc_system_firmware_init(PcPciMachineState *ppms, MemoryRegion 
*rom_memory);
 bool pc_system_ovmf_table_find(const char *entry, uint8_t **data,
int *data_len);
 void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size);
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 33724791fd..5753a3ff0b 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -957,7 +957,7 @@ void pc_memory_init(PCMachineState *pcms,
 
 /* Initialize PC system firmware */
 if (pci_enabled) {
-pc_system_firmware_init(pcms, rom_memory);
+pc_system_firmware_init(ppms, rom_memory);
 } else {
 x86_bios_rom_init(machine, "bios.bin", rom_memory, true);
 }
@@ -1617,8 +1617,8 @@ static void pc_machine_get_max_fw_size(Object *obj, 
Visitor *v,
const char *name, void *opaque,
Error **errp)
 {
-PCMachineState *pcms = PC_MACHINE(obj);
-uint64_t value = pcms->max_fw_size;
+PcPciMachineState *ppms = PC_PCI_MACHINE(obj);
+uint64_t value = ppms->max_fw_size;
 
 visit_type_size(v, name, &value, errp);
 }
@@ -1627,7 +1627,7 @@ static void pc_machine_set_max_fw_size(Object *obj, 
Visitor *v,
const char *name, void *opaque,
Error **errp)
 {
-PCMachineState *pcms = PC_MACHINE(obj);
+PcPciMachineState *ppms = PC_PCI_MACHINE(obj);
 uint64_t value;
 
 if (!visit_type_size(v, name, &value, errp)) {
@@ -1651,7 +1651,7 @@ static void pc_machine_set_max_fw_size(Object *obj, 
Visitor *v,
 return;
 }
 
-pcms->max_fw_size = value;
+ppms->max_fw_size = value;
 }
 
 
@@ -1672,7 +1672,6 @@ static void pc_machine_initfn(Object *obj)
 pcms->smbus_enabled = true;
 pcms->sata_enabled = true;
 pcms->i8042_enabled = true;
-pcms->max_fw_size = 8 * MiB;
 #ifdef CONFIG_HPET
 pcms->hpet_enabled = true;
 #endif
@@ -1692,8 +1691,9 @@ static void pc_pci_machine_initfn(Object *obj)
 PcPciMachineState *ppms = PC_PCI_MACHINE(obj);
 
 ppms->acpi_build_enabled = true;
+ppms->max_fw_size = 8 * MiB;
 
-pc_system_flash_create(PC_MACHINE(obj));
+pc_system_flash_create(ppms);
 cxl_machine_init(obj, &ppms->cxl_devices_state);
 
 ppms->machine_done.notify = pc_pci_machine_done;
@@ -1815,12 +1815,6 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 pc_machine_get_default_bus_bypass_iommu,
 pc_machine_set_default_bus_bypass_iommu);
 
-object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size",
-pc_machine_get_max_fw_size, pc_machine_set_max_fw_size,
-NULL, NULL);
-object_class_property_set_description(oc, PC_MACHINE_MAX_FW_SIZE,
-"Maximum combined firmware size");
-
 object_class_property_add(oc, PC_MACHINE_SMBIOS_EP, "str",
 pc_machine_get_smbios_ep, pc_machine_set_smbios_ep,
 NULL, NULL);
@@ -1834,6 +1828,11 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 
 static void pc_pci_machine_class_init(ObjectClass *oc, void *data)
 {
+object_class_property_add(oc, PC_MACHINE_MAX_FW_SIZE, "size",
+

[RFC PATCH-for-9.1 07/29] hw/i386/pc: Call fw_cfg_add_extra_pci_roots() in pc_pci_machine_done()

2024-03-28 Thread Philippe Mathieu-Daudé

fw_cfg_add_extra_pci_roots() expects a PCI bus, which only
PCI-based machines have. No need to call it on the ISA-only
machine. Move it to the PCI-specific machine_done handler.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a16bb1554c..f9226f7115 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -608,8 +608,6 @@ void pc_machine_done(Notifier *notifier, void *data)
 /* set the number of CPUs */
 x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 
-fw_cfg_add_extra_pci_roots(pcms->pcibus, x86ms->fw_cfg);
-
 acpi_setup();
 if (x86ms->fw_cfg) {
 fw_cfg_build_smbios(pcms, x86ms->fw_cfg, 
pcms->smbios_entry_point_type);
@@ -623,6 +621,12 @@ void pc_machine_done(Notifier *notifier, void *data)
 
 static void pc_pci_machine_done(Notifier *notifier, void *data)
 {
+PcPciMachineState *ppms = container_of(notifier,
+   PcPciMachineState, machine_done);
+PCMachineState *pcms = PC_MACHINE(ppms);
+X86MachineState *x86ms = X86_MACHINE(pcms);
+
+fw_cfg_add_extra_pci_roots(pcms->pcibus, x86ms->fw_cfg);
 }
 
 /* setup pci memory address space mapping into system address space */
-- 
2.41.0

[RFC PATCH-for-9.1 24/29] hw/i386/fw_cfg: Inline smbios_legacy_mode()

2024-03-28 Thread Philippe Mathieu-Daudé

All PCI-based machines have the smbios_legacy_mode
field set to %false. Simplify by using an inlined
helper checking whether the machine is PCI-based or
not.

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/i386/pc.h | 1 -
 hw/i386/fw_cfg.c | 8 ++--
 hw/i386/pc_piix.c| 2 --
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 6a6a8df005..6510914803 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -86,7 +86,6 @@ typedef struct PCMachineClass {
 int default_cpu_version;
 
 /* SMBIOS compat: */
-bool smbios_legacy_mode;
 SmbiosEntryPointType default_smbios_ep_type;
 
 /* RAM / address space compat: */
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index f60390ed56..ffa60a4a33 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -54,6 +54,11 @@ static bool smbios_defaults(PCMachineState *pcms)
 return pc_machine_is_pci_enabled(pcms);
 }
 
+static bool smbios_legacy_mode(PCMachineState *pcms)
+{
+return !pc_machine_is_pci_enabled(pcms);
+}
+
 void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
  SmbiosEntryPointType ep_type)
 {
@@ -62,7 +67,6 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 struct smbios_phys_mem_area *mem_array;
 unsigned i, array_count;
 MachineState *ms = MACHINE(pcms);
-PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
 MachineClass *mc = MACHINE_GET_CLASS(pcms);
 X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
 
@@ -74,7 +78,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 /* tell smbios about cpuid version and features */
 smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
 
-if (pcmc->smbios_legacy_mode) {
+if (smbios_legacy_mode(pcms)) {
 smbios_tables = smbios_get_table_legacy(&smbios_tables_len,
 &error_fatal);
 fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES,
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 70dc8686f9..4f07476cfa 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -823,12 +823,10 @@ DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL,
 #ifdef CONFIG_ISAPC
 static void isapc_machine_options(MachineClass *m)
 {
-PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
 m->desc = "ISA-only PC";
 m->max_cpus = 1;
 m->option_rom_has_mr = true;
 m->rom_file_has_mr = false;
-pcmc->smbios_legacy_mode = true;
 m->default_nic = "ne2k_isa";
 m->default_cpu_type = X86_CPU_TYPE_NAME("486");
 m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
-- 
2.41.0

[RFC PATCH-for-9.1 14/29] hw/i386/pc: Move pc_system_flash_create() to pc_pci_machine_initfn()

2024-03-28 Thread Philippe Mathieu-Daudé

pc_system_flash_create() is only useful for PCI-based machines.
Move the call to the PCI-based init() handler.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc.c   |  2 +-
 hw/i386/pc_sysfw.c | 10 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 5b96daa414..33724791fd 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1679,7 +1679,6 @@ static void pc_machine_initfn(Object *obj)
 pcms->fd_bootchk = true;
 pcms->default_bus_bypass_iommu = false;
 
-pc_system_flash_create(pcms);
 pcms->pcspk = isa_new(TYPE_PC_SPEAKER);
 object_property_add_alias(OBJECT(pcms), "pcspk-audiodev",
   OBJECT(pcms->pcspk), "audiodev");
@@ -1694,6 +1693,7 @@ static void pc_pci_machine_initfn(Object *obj)
 
 ppms->acpi_build_enabled = true;
 
+pc_system_flash_create(PC_MACHINE(obj));
 cxl_machine_init(obj, &ppms->cxl_devices_state);
 
 ppms->machine_done.notify = pc_pci_machine_done;
diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 541dcaef71..167ff24fcb 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -93,12 +93,10 @@ static PFlashCFI01 *pc_pflash_create(PCMachineState *pcms,
 
 void pc_system_flash_create(PCMachineState *pcms)
 {
-if (pc_machine_is_pci_enabled(pcms)) {
-pcms->flash[0] = pc_pflash_create(pcms, "system.flash0",
-  "pflash0");
-pcms->flash[1] = pc_pflash_create(pcms, "system.flash1",
-  "pflash1");
-}
+assert(pc_machine_is_pci_enabled(pcms));
+
+pcms->flash[0] = pc_pflash_create(pcms, "system.flash0", "pflash0");
+pcms->flash[1] = pc_pflash_create(pcms, "system.flash1", "pflash1");
 }
 
 void pc_system_flash_cleanup_unused(PCMachineState *pcms)
-- 
2.41.0

[RFC PATCH-for-9.1 22/29] hw/i386/fw_cfg: Define fw_cfg_build_smbios() stub

2024-03-28 Thread Philippe Mathieu-Daudé

We are going to refactor fw_cfg_build_smbios() in the
next patches. In order to avoid too much #ifdef'ry in
it, define a stub.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg-smbios-stub.c | 15 +++
 hw/i386/fw_cfg.c |  4 ++--
 hw/i386/meson.build  |  1 +
 3 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 hw/i386/fw_cfg-smbios-stub.c

diff --git a/hw/i386/fw_cfg-smbios-stub.c b/hw/i386/fw_cfg-smbios-stub.c
new file mode 100644
index 00..37dbfdee7c
--- /dev/null
+++ b/hw/i386/fw_cfg-smbios-stub.c
@@ -0,0 +1,15 @@
+/*
+ * QEMU fw_cfg/SMBIOS stubs
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ * SPDX-FileCopyrightText: 2024 Linaro Ltd.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/firmware/smbios.h"
+#include "fw_cfg.h"
+
+void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
+ SmbiosEntryPointType ep_type)
+{
+}
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index ecc4047a4b..14a7dfbdc9 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -48,10 +48,10 @@ const char *fw_cfg_arch_key_name(uint16_t key)
 return NULL;
 }
 
+#ifdef CONFIG_SMBIOS
 void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
  SmbiosEntryPointType ep_type)
 {
-#ifdef CONFIG_SMBIOS
 uint8_t *smbios_tables, *smbios_anchor;
 size_t smbios_tables_len, smbios_anchor_len;
 struct smbios_phys_mem_area *mem_array;
@@ -100,8 +100,8 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-anchor",
 smbios_anchor, smbios_anchor_len);
 }
-#endif
 }
+#endif
 
 FWCfgState *fw_cfg_arch_create(MachineState *ms,
   uint16_t boot_cpus,
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index d8b70ef3e9..1a6e731196 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -6,6 +6,7 @@ i386_ss.add(files(
   'multiboot.c',
   'x86.c',
 ))
+i386_ss.add(when: 'CONFIG_SMBIOS', if_false: files('fw_cfg-smbios-stub.c'))
 
 i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: files('x86-iommu.c'),
   if_false: files('x86-iommu-stub.c'))
-- 
2.41.0

[RFC PATCH-for-9.1 19/29] hw/i386/pc: Pass PcPciMachineState argument to CXL helpers

2024-03-28 Thread Philippe Mathieu-Daudé

Since CXL helpers expect a PCI-based machine, we
can directly pass them a PcPciMachineState argument.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index e36d76656b..d8e91d18b8 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -705,14 +705,14 @@ static void pc_get_device_memory_range(PCMachineState 
*pcms,
 *device_mem_size = size;
 }
 
-static uint64_t pc_get_cxl_range_start(PCMachineState *pcms)
+static uint64_t pc_get_cxl_range_start(PcPciMachineState *ppms)
 {
+PCMachineState *pcms = PC_MACHINE(ppms);
 MachineState *ms = MACHINE(pcms);
 hwaddr cxl_base;
 ram_addr_t size;
 
-if (has_reserved_memory(pcms) &&
-(ms->ram_size < ms->maxram_size)) {
+if ((ms->ram_size < ms->maxram_size)) {
 pc_get_device_memory_range(pcms, &cxl_base, &size);
 cxl_base += size;
 } else {
@@ -722,10 +722,9 @@ static uint64_t pc_get_cxl_range_start(PCMachineState 
*pcms)
 return cxl_base;
 }
 
-static uint64_t pc_get_cxl_range_end(PCMachineState *pcms)
+static uint64_t pc_get_cxl_range_end(PcPciMachineState *ppms)
 {
-PcPciMachineState *ppms = PC_PCI_MACHINE(pcms);
-uint64_t start = pc_get_cxl_range_start(pcms) + MiB;
+uint64_t start = pc_get_cxl_range_start(ppms) + MiB;
 
 if (ppms->cxl_devices_state.fixed_windows) {
 GList *it;
@@ -937,7 +936,7 @@ void pc_memory_init(PCMachineState *pcms,
 MemoryRegion *mr = &ppms->cxl_devices_state.host_mr;
 hwaddr cxl_size = MiB;
 
-cxl_base = pc_get_cxl_range_start(pcms);
+cxl_base = pc_get_cxl_range_start(ppms);
 memory_region_init(mr, OBJECT(machine), "cxl_host_reg", cxl_size);
 memory_region_add_subregion(system_memory, cxl_base, mr);
 cxl_resv_end = cxl_base + cxl_size;
@@ -1027,7 +1026,7 @@ uint64_t pc_pci_hole64_start(void)
 ram_addr_t size = 0;
 
 if (ppms->cxl_devices_state.is_enabled) {
-hole64_start = pc_get_cxl_range_end(pcms);
+hole64_start = pc_get_cxl_range_end(ppms);
 } else if (has_reserved_memory(pcms) && (ms->ram_size < ms->maxram_size)) {
 pc_get_device_memory_range(pcms, &hole64_start, &size);
 if (!pcmc->broken_reserved_end) {
-- 
2.41.0

[RFC PATCH-for-9.1 29/29] hw/i386/pc: Move ISA-only PC machine to pc_isa.c

2024-03-28 Thread Philippe Mathieu-Daudé

Extract the ISA-only PC machine code from pc_piix.c
to a new file, pc_isa.c.

Signed-off-by: Philippe Mathieu-Daudé 
---
 MAINTAINERS |  1 +
 hw/i386/pc_isa.c| 33 +
 hw/i386/pc_piix.c   | 23 ---
 hw/i386/meson.build |  1 +
 4 files changed, 35 insertions(+), 23 deletions(-)
 create mode 100644 hw/i386/pc_isa.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a07af6b9d4..a68fa813b8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1812,6 +1812,7 @@ M: Marcel Apfelbaum 
 S: Supported
 F: include/hw/i386/
 F: hw/i386/
+X: hw/i386/pc_isa.c
 F: hw/pci-host/i440fx.c
 F: hw/pci-host/q35.c
 F: hw/pci-host/pam.c
diff --git a/hw/i386/pc_isa.c b/hw/i386/pc_isa.c
new file mode 100644
index 00..a98c75f3ae
--- /dev/null
+++ b/hw/i386/pc_isa.c
@@ -0,0 +1,33 @@
+/*
+ * QEMU ISA PC System Emulator
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object.h"
+#include "hw/boards.h"
+#include "hw/i386/pc.h"
+#include "hw/char/parallel-isa.h"
+#include "target/i386/cpu-qom.h"
+
+static void pc_init_isa(MachineState *machine)
+{
+pc_piix_init(machine, NULL);
+}
+
+static void isapc_machine_options(MachineClass *m)
+{
+m->desc = "ISA-only PC";
+m->max_cpus = 1;
+m->option_rom_has_mr = true;
+m->rom_file_has_mr = false;
+m->default_nic = "ne2k_isa";
+m->default_cpu_type = X86_CPU_TYPE_NAME("486");
+m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
+}
+
+DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa,
+  isapc_machine_options, TYPE_PC_MACHINE);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 4a3ae72fe4..f94221ab92 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -434,13 +434,6 @@ static void pc_set_south_bridge(Object *obj, int value, 
Error **errp)
  * hw_compat_*, pc_compat_*, or * pc_*_machine_options().
  */
 
-#ifdef CONFIG_ISAPC
-static void pc_init_isa(MachineState *machine)
-{
-pc_piix_init(machine, NULL);
-}
-#endif
-
 #ifdef CONFIG_XEN
 static void pc_xen_hvm_init_pci(MachineState *machine)
 {
@@ -820,22 +813,6 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
 DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL,
   pc_i440fx_2_4_machine_options)
 
-#ifdef CONFIG_ISAPC
-static void isapc_machine_options(MachineClass *m)
-{
-m->desc = "ISA-only PC";
-m->max_cpus = 1;
-m->option_rom_has_mr = true;
-m->rom_file_has_mr = false;
-m->default_nic = "ne2k_isa";
-m->default_cpu_type = X86_CPU_TYPE_NAME("486");
-m->no_parallel = !module_object_class_by_name(TYPE_ISA_PARALLEL);
-}
-
-DEFINE_PC_MACHINE(isapc, "isapc", pc_init_isa,
-  isapc_machine_options, TYPE_PC_MACHINE);
-#endif
-
 #ifdef CONFIG_XEN
 static void xenfv_4_2_machine_options(MachineClass *m)
 {
diff --git a/hw/i386/meson.build b/hw/i386/meson.build
index 1a6e731196..0576fc6541 100644
--- a/hw/i386/meson.build
+++ b/hw/i386/meson.build
@@ -12,6 +12,7 @@ i386_ss.add(when: 'CONFIG_X86_IOMMU', if_true: 
files('x86-iommu.c'),
   if_false: files('x86-iommu-stub.c'))
 i386_ss.add(when: 'CONFIG_AMD_IOMMU', if_true: files('amd_iommu.c'),
   if_false: files('amd_iommu-stub.c'))
+i386_ss.add(when: 'CONFIG_ISAPC', if_true: files('pc_isa.c'))
 i386_ss.add(when: 'CONFIG_I440FX', if_true: files('pc_piix.c'))
 i386_ss.add(when: 'CONFIG_MICROVM', if_true: files('microvm.c', 
'acpi-microvm.c', 'microvm-dt.c'))
 i386_ss.add(when: 'CONFIG_Q35', if_true: files('pc_q35.c'))
-- 
2.41.0

[RFC PATCH-for-9.1 27/29] hw/i386/pc: Call fw_cfg_build_smbios_legacy() in pc_machine_done()

2024-03-28 Thread Philippe Mathieu-Daudé

Keep fw_cfg_build_smbios() for PCI-based machines, and call
fw_cfg_build_smbios_legacy() directly from pc_machine_done().

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg.c | 10 --
 hw/i386/pc.c | 12 +++-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index be37e28f46..92e058446f 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -49,11 +49,6 @@ const char *fw_cfg_arch_key_name(uint16_t key)
 }
 
 #ifdef CONFIG_SMBIOS
-static bool smbios_legacy_mode(PCMachineState *pcms)
-{
-return !pc_machine_is_pci_enabled(pcms);
-}
-
 void fw_cfg_build_smbios_legacy(PCMachineState *pcms, FWCfgState *fw_cfg)
 {
 uint8_t *smbios_tables;
@@ -81,11 +76,6 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 MachineClass *mc = MACHINE_GET_CLASS(pcms);
 X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
 
-if (smbios_legacy_mode(pcms)) {
-fw_cfg_build_smbios_legacy(pcms, fw_cfg);
-return;
-}
-
 /* These values are guest ABI, do not change */
 smbios_set_defaults("QEMU", mc->desc, mc->name);
 
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 7a758a2e84..7d06a088cf 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -591,6 +591,11 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int 
level)
 }
 }
 
+static bool smbios_legacy_mode(PCMachineState *pcms)
+{
+return !pc_machine_is_pci_enabled(pcms);
+}
+
 static
 void pc_machine_done(Notifier *notifier, void *data)
 {
@@ -602,7 +607,12 @@ void pc_machine_done(Notifier *notifier, void *data)
 x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 
 if (x86ms->fw_cfg) {
-fw_cfg_build_smbios(pcms, x86ms->fw_cfg, 
pcms->smbios_entry_point_type);
+if (smbios_legacy_mode(pcms)) {
+fw_cfg_build_smbios_legacy(pcms, x86ms->fw_cfg);
+} else {
+fw_cfg_build_smbios(pcms, x86ms->fw_cfg,
+pcms->smbios_entry_point_type);
+}
 fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
 /* update FW_CFG_NB_CPUS to account for -device added CPUs */
 fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
-- 
2.41.0

[RFC PATCH-for-9.1 25/29] hw/i386/fw_cfg: Replace smbios_defaults() by !smbios_legacy_mode()

2024-03-28 Thread Philippe Mathieu-Daudé

smbios_defaults() and smbios_legacy_mode() are logical
opposites. Simplify by using only the latter.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index ffa60a4a33..df05fe060c 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -49,11 +49,6 @@ const char *fw_cfg_arch_key_name(uint16_t key)
 }
 
 #ifdef CONFIG_SMBIOS
-static bool smbios_defaults(PCMachineState *pcms)
-{
-return pc_machine_is_pci_enabled(pcms);
-}
-
 static bool smbios_legacy_mode(PCMachineState *pcms)
 {
 return !pc_machine_is_pci_enabled(pcms);
@@ -70,7 +65,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 MachineClass *mc = MACHINE_GET_CLASS(pcms);
 X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
 
-if (smbios_defaults(pcms)) {
+if (!smbios_legacy_mode(pcms)) {
 /* These values are guest ABI, do not change */
 smbios_set_defaults("QEMU", mc->desc, mc->name);
 }
-- 
2.41.0

[RFC PATCH-for-9.1 11/29] hw/i386/pc: Move acpi_setup() call to pc_pci_machine_done()

2024-03-28 Thread Philippe Mathieu-Daudé

acpi_setup() returns early if acpi_build_enabled is not set:

  2752 void acpi_setup(PCMachineState *pcms)
  2753 {
   ...
  2768 if (!pcms->acpi_build_enabled) {
  2769 ACPI_BUILD_DPRINTF("ACPI build disabled. Bailing out.\n");
  2770 return;
  2771 }

acpi_build_enabled is only set on PCI-based machines, so it
is pointless to call acpi_setup() on non-PCI machines such as the
ISA-only machine. Move the call to pc_pci_machine_done().

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/pc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index f0dc04e2fc..47fe3a7c02 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -601,7 +601,6 @@ void pc_machine_done(Notifier *notifier, void *data)
 /* set the number of CPUs */
 x86_rtc_set_cpus_count(x86ms->rtc, x86ms->boot_cpus);
 
-acpi_setup(pcms);
 if (x86ms->fw_cfg) {
 fw_cfg_build_smbios(pcms, x86ms->fw_cfg, 
pcms->smbios_entry_point_type);
 fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
@@ -627,6 +626,8 @@ static void pc_pci_machine_done(Notifier *notifier, void 
*data)
 }
 
 fw_cfg_add_extra_pci_roots(pcms->pcibus, x86ms->fw_cfg);
+
+acpi_setup(pcms);
 }
 
 /* setup pci memory address space mapping into system address space */
-- 
2.41.0

[RFC PATCH-for-9.1 21/29] hw/i386/fw_cfg: Include missing 'qapi-types-machine.h' header

2024-03-28 Thread Philippe Mathieu-Daudé

"fw_cfg.h" declares fw_cfg_build_smbios() which use
SmbiosEntryPointType, itself declared in "qapi-types-machine.h".

  void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
   SmbiosEntryPointType ep_type);
   

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
index 92e310f5fd..7a426119f8 100644
--- a/hw/i386/fw_cfg.h
+++ b/hw/i386/fw_cfg.h
@@ -12,6 +12,7 @@
 #include "hw/boards.h"
 #include "hw/i386/pc.h"
 #include "hw/nvram/fw_cfg.h"
+#include "qapi/qapi-types-machine.h"
 
 #define FW_CFG_IO_BASE 0x510
 
-- 
2.41.0

[RFC PATCH-for-9.1 26/29] hw/i386/fw_cfg: Factor fw_cfg_build_smbios_legacy() out

2024-03-28 Thread Philippe Mathieu-Daudé

Factor fw_cfg_build_smbios_legacy() out of
fw_cfg_build_smbios().

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/i386/fw_cfg.h |  1 +
 hw/i386/fw_cfg-smbios-stub.c |  4 
 hw/i386/fw_cfg.c | 33 ++---
 3 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
index 7a426119f8..25ce86ec1b 100644
--- a/hw/i386/fw_cfg.h
+++ b/hw/i386/fw_cfg.h
@@ -24,6 +24,7 @@
 FWCfgState *fw_cfg_arch_create(MachineState *ms,
uint16_t boot_cpus,
uint16_t apic_id_limit);
+void fw_cfg_build_smbios_legacy(PCMachineState *pcms, FWCfgState *fw_cfg);
 void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
  SmbiosEntryPointType ep_type);
 void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
diff --git a/hw/i386/fw_cfg-smbios-stub.c b/hw/i386/fw_cfg-smbios-stub.c
index 37dbfdee7c..da00ffc9ae 100644
--- a/hw/i386/fw_cfg-smbios-stub.c
+++ b/hw/i386/fw_cfg-smbios-stub.c
@@ -13,3 +13,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
  SmbiosEntryPointType ep_type)
 {
 }
+
+void fw_cfg_build_smbios_legacy(PCMachineState *pcms, FWCfgState *fw_cfg)
+{
+}
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index df05fe060c..be37e28f46 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -54,6 +54,22 @@ static bool smbios_legacy_mode(PCMachineState *pcms)
 return !pc_machine_is_pci_enabled(pcms);
 }
 
+void fw_cfg_build_smbios_legacy(PCMachineState *pcms, FWCfgState *fw_cfg)
+{
+uint8_t *smbios_tables;
+size_t smbios_tables_len;
+MachineState *ms = MACHINE(pcms);
+X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
+
+/* tell smbios about cpuid version and features */
+smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
+
+smbios_tables = smbios_get_table_legacy(&smbios_tables_len,
+&error_fatal);
+fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES,
+ smbios_tables, smbios_tables_len);
+}
+
 void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
  SmbiosEntryPointType ep_type)
 {
@@ -65,22 +81,17 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState 
*fw_cfg,
 MachineClass *mc = MACHINE_GET_CLASS(pcms);
 X86CPU *cpu = X86_CPU(ms->possible_cpus->cpus[0].cpu);
 
-if (!smbios_legacy_mode(pcms)) {
-/* These values are guest ABI, do not change */
-smbios_set_defaults("QEMU", mc->desc, mc->name);
+if (smbios_legacy_mode(pcms)) {
+fw_cfg_build_smbios_legacy(pcms, fw_cfg);
+return;
 }
 
+/* These values are guest ABI, do not change */
+smbios_set_defaults("QEMU", mc->desc, mc->name);
+
 /* tell smbios about cpuid version and features */
 smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
 
-if (smbios_legacy_mode(pcms)) {
-smbios_tables = smbios_get_table_legacy(&smbios_tables_len,
-&error_fatal);
-fw_cfg_add_bytes(fw_cfg, FW_CFG_SMBIOS_ENTRIES,
- smbios_tables, smbios_tables_len);
-return;
-}
-
 /* build the array of physical mem area from e820 table */
 mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries());
 for (i = 0, array_count = 0; i < e820_get_num_entries(); i++) {
-- 
2.41.0

[RFC PATCH-for-9.1 17/29] hw/i386/pc: Inline gigabyte_align()

2024-03-28 Thread Philippe Mathieu-Daudé

All PCI-based machines have the gigabyte_align field
set to %true. Simplify by using an inlined helper
checking whether the machine is PCI-based or not.

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/i386/pc.h |  9 -
 hw/i386/pc.c |  1 -
 hw/i386/pc_piix.c| 16 +---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 2db2aa03d3..758dd5f29b 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -76,14 +76,6 @@ typedef struct PcPciMachineState {
 
 /**
  * PCMachineClass:
- *
- * Compat fields:
- *
- * @gigabyte_align: Make sure that guest addresses aligned at
- *  1Gbyte boundaries get mapped to host
- *  addresses aligned at 1Gbyte boundaries. This
- *  way we can use 1GByte pages in the host.
- *
  */
 typedef struct PCMachineClass {
 X86MachineClass parent_class;
@@ -99,7 +91,6 @@ typedef struct PCMachineClass {
 SmbiosEntryPointType default_smbios_ep_type;
 
 /* RAM / address space compat: */
-bool gigabyte_align;
 bool has_reserved_memory;
 bool broken_reserved_end;
 bool enforce_amd_1tb_hole;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index dd44df0470..093a7c35f7 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1752,7 +1752,6 @@ static void pc_machine_class_init(ObjectClass *oc, void 
*data)
 HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc);
 
 pcmc->smbios_defaults = true;
-pcmc->gigabyte_align = true;
 pcmc->has_reserved_memory = true;
 pcmc->enforce_amd_1tb_hole = true;
 pcmc->pvh_enabled = true;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 2043a7022a..0bc14da768 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -99,6 +99,17 @@ static void piix_intx_routing_notifier_xen(PCIDevice *dev)
 }
 }
 
+/*
+ * gigabyte_align: Make sure that guest addresses aligned at
+ * 1Gbyte boundaries get mapped to host
+ * addresses aligned at 1Gbyte boundaries.
+ * This way we can use 1GByte pages in the host.
+ */
+static bool gigabyte_align(PCMachineState *pcms)
+{
+return pc_machine_is_pci_enabled(pcms);
+}
+
 /* PC hardware initialisation */
 static void pc_init1(MachineState *machine, const char *pci_type)
 {
@@ -130,7 +141,7 @@ static void pc_init1(MachineState *machine, const char 
*pci_type)
  *  - Then, to gigabyte align the memory, we move the split to 3G
  *(lowmem = 0xc000).  But only in case we have to split in
  *the first place, i.e. ram_size is larger than (traditional)
- *lowmem.  And for new machine types (gigabyte_align = true)
+ *lowmem.  And for new machine types (gigabyte_align() = true)
  *only, for live migration compatibility reasons.
  *
  *  - Next the max-ram-below-4g option was added, which allowed to
@@ -160,7 +171,7 @@ static void pc_init1(MachineState *machine, const char 
*pci_type)
 }
 lowmem = pcms->max_ram_below_4g;
 if (machine->ram_size >= pcms->max_ram_below_4g) {
-if (pcmc->gigabyte_align) {
+if (gigabyte_align(pcms)) {
 if (lowmem > 0xc000) {
 lowmem = 0xc000;
 }
@@ -818,7 +829,6 @@ static void isapc_machine_options(MachineClass *m)
 m->option_rom_has_mr = true;
 m->rom_file_has_mr = false;
 pcmc->smbios_defaults = false;
-pcmc->gigabyte_align = false;
 pcmc->smbios_legacy_mode = true;
 pcmc->has_reserved_memory = false;
 m->default_nic = "ne2k_isa";
-- 
2.41.0

1 2 3 >

1 - 100 of 241 matches

Mail list logo