[PATCH v6 8/8] xen/x86: use INFO level for nodes without memory log message

2022-06-09 Thread Wei Chen
Previously, Xen used KERN_WARNING for the log message printed
when it found a node without memory, suggesting a BIOS bug or
mis-configured hardware. But this warning is bogus: in a NUMA
setup, a node may legitimately contain only processors and
0 bytes of memory. So it is unreasonable to warn about BIOS or
hardware problems merely because a node with 0 bytes of memory
was detected.

So in this patch, we remove the warning and keep only an
info-level message to inform users that there are one or more
nodes with 0 bytes of memory in the system.

Signed-off-by: Wei Chen 
Reviewed-by: Jan Beulich 
---
v3 -> v4:
1. Remove full stop and use lower-case for node.
2. Add Rb.
v2 -> v3:
new commit.
---
 xen/arch/x86/srat.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 3d02520a5a..b62a152911 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -555,8 +555,7 @@ int __init acpi_scan_nodes(paddr_t start, paddr_t end)
uint64_t size = nodes[i].end - nodes[i].start;
 
if ( size == 0 )
-   printk(KERN_WARNING "SRAT: Node %u has no memory. "
-  "BIOS Bug or mis-configured hardware?\n", i);
+   printk(KERN_INFO "SRAT: node %u has no memory\n", i);
 
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
-- 
2.25.1




[PATCH v6 6/8] xen/x86: use paddr_t for addresses in NUMA node structure

2022-06-09 Thread Wei Chen
The NUMA node structure "struct node" uses u64 for the node
memory range. To let other architectures reuse this NUMA node
code, replace u64 with paddr_t, and use pfn_to_paddr and
paddr_to_pfn in place of explicit shift operations. The related
PRIx64 specifiers in print messages are replaced by PRIpaddr at
the same time. Some being-phased-out types such as u64 in the
lines we touch are also converted to uint64_t or unsigned long.
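
For reference, a minimal sketch of what the two conversion
helpers do (illustration only, with simplified types; the real
Xen definitions live in the per-arch headers):

    /* Sketch, not the Xen headers: the helpers hide the page-shift
     * arithmetic that the old code open-coded. */
    typedef unsigned long paddr_t;  /* simplified; Xen defines this per arch */

    #define PAGE_SHIFT 12           /* 4K pages */

    #define pfn_to_paddr(pfn) ((paddr_t)(pfn) << PAGE_SHIFT)
    #define paddr_to_pfn(pa)  ((unsigned long)((pa) >> PAGE_SHIFT))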

Tested-by: Jiamei Xie 
Signed-off-by: Wei Chen 
Acked-by: Jan Beulich 
---
v3 -> v4:
1. Add Tb.
v2 -> v3:
1. Use uint64_t for size in acpi_scan_nodes, to make it
   consistent with numa_emulation.
2. Add Tb.
v1 -> v2:
1. Drop useless cast.
2. Use initializers of the variables.
3. Replace u64 by uint64_t.
4. Use unsigned long for start_pfn and end_pfn.
---
 xen/arch/x86/include/asm/numa.h |  8 
 xen/arch/x86/numa.c | 32 +++-
 xen/arch/x86/srat.c | 25 +
 3 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/xen/arch/x86/include/asm/numa.h b/xen/arch/x86/include/asm/numa.h
index 5d8385f2e1..c32ccffde3 100644
--- a/xen/arch/x86/include/asm/numa.h
+++ b/xen/arch/x86/include/asm/numa.h
@@ -18,7 +18,7 @@ extern cpumask_t node_to_cpumask[];
 #define node_to_cpumask(node)(node_to_cpumask[node])
 
 struct node { 
-   u64 start,end; 
+   paddr_t start, end;
 };
 
 extern int compute_hash_shift(struct node *nodes, int numnodes,
@@ -38,7 +38,7 @@ extern void numa_set_node(int cpu, nodeid_t node);
 extern nodeid_t setup_node(unsigned int pxm);
 extern void srat_detect_node(int cpu);
 
-extern void setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end);
+extern void setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end);
 extern nodeid_t apicid_to_node[];
 extern void init_cpu_to_node(void);
 
@@ -76,9 +76,9 @@ static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr)
 NODE_DATA(nid)->node_spanned_pages)
 #define arch_want_default_dmazone() (num_online_nodes() > 1)
 
-extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
+extern int valid_numa_range(paddr_t start, paddr_t end, nodeid_t node);
 
-void srat_parse_regions(u64 addr);
+void srat_parse_regions(paddr_t addr);
 extern u8 __node_distance(nodeid_t a, nodeid_t b);
 unsigned int arch_get_dma_bitsize(void);
 
diff --git a/xen/arch/x86/numa.c b/xen/arch/x86/numa.c
index 680b7d9002..627ae8aa95 100644
--- a/xen/arch/x86/numa.c
+++ b/xen/arch/x86/numa.c
@@ -162,12 +162,10 @@ int __init compute_hash_shift(struct node *nodes, int numnodes,
 return shift;
 }
 /* initialize NODE_DATA given nodeid and start/end */
-void __init setup_node_bootmem(nodeid_t nodeid, u64 start, u64 end)
-{ 
-unsigned long start_pfn, end_pfn;
-
-start_pfn = start >> PAGE_SHIFT;
-end_pfn = end >> PAGE_SHIFT;
+void __init setup_node_bootmem(nodeid_t nodeid, paddr_t start, paddr_t end)
+{
+unsigned long start_pfn = paddr_to_pfn(start);
+unsigned long end_pfn = paddr_to_pfn(end);
 
 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
@@ -198,11 +196,12 @@ void __init numa_init_array(void)
 static int numa_fake __initdata = 0;
 
 /* Numa emulation */
-static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+ unsigned long end_pfn)
 {
 int i;
 struct node nodes[MAX_NUMNODES];
-u64 sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+uint64_t sz = pfn_to_paddr(end_pfn - start_pfn) / numa_fake;
 
 /* Kludge needed for the hash function */
 if ( hweight64(sz) > 1 )
@@ -218,9 +217,9 @@ static int __init numa_emulation(u64 start_pfn, u64 end_pfn)
 memset(&nodes,0,sizeof(nodes));
 for ( i = 0; i < numa_fake; i++ )
 {
-nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+nodes[i].start = pfn_to_paddr(start_pfn) + i*sz;
[...]
(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
!test_bit(i, memblk_hotplug);
 
-   printk("%sSRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with 
itself (%"PRIx64"-%"PRIx64")\n",
+   printk("%sSRAT: PXM %u (%"PRIpaddr"-%"PRIpaddr") overlaps with 
itself (%"PRIpaddr"-%"PRIpaddr")\n",
   mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
   node_memblk_range[i].start, node_memblk_range[i].end);
if (mismatch) {
@@ -327,7 +327,7 @@ acpi_numa_memory_affinity_init(const struct acpi_srat_mem_affinity *ma)
}
} else {
printk(KERN_ERR
-  "SRAT: PXM %u (%"PRIx64"-%"PRIx64") overlaps with 

[PATCH v6 5/8] xen/arm: use !CONFIG_NUMA to keep fake NUMA API

2022-06-09 Thread Wei Chen
We introduced CONFIG_NUMA in a previous patch, and at the
current stage this option is enabled only on x86. A follow-up
patch will enable it for Arm. But we still want users to be
able to disable CONFIG_NUMA via Kconfig. In that case, keeping
the fake NUMA API lets the Arm code keep working with the
NUMA-aware memory allocator and scheduler.

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Reviewed-by: Stefano Stabellini 
---
v3 -> v4:
no change
v2 -> v3:
Add Tb.
v1 -> v2:
No change.
---
 xen/arch/arm/include/asm/numa.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/xen/arch/arm/include/asm/numa.h b/xen/arch/arm/include/asm/numa.h
index e4c4d89192..268a9db055 100644
--- a/xen/arch/arm/include/asm/numa.h
+++ b/xen/arch/arm/include/asm/numa.h
@@ -5,6 +5,8 @@
 
 typedef u8 nodeid_t;
 
+#ifndef CONFIG_NUMA
+
 /* Fake one node for now. See also node_online_map. */
 #define cpu_to_node(cpu) 0
 #define node_to_cpumask(node)   (cpu_online_map)
@@ -24,6 +26,9 @@ extern mfn_t first_valid_mfn;
 #define node_spanned_pages(nid) (max_page - mfn_x(first_valid_mfn))
 #define node_start_pfn(nid) (mfn_x(first_valid_mfn))
 #define __node_distance(a, b) (20)
+
+#endif
+
 #define arch_want_default_dmazone() (false)
 
 #endif /* __ARCH_ARM_NUMA_H */
-- 
2.25.1




[PATCH v6 7/8] xen/x86: add detection of memory interleaves for different nodes

2022-06-09 Thread Wei Chen
One NUMA node may contain several memory blocks. In the current
Xen code, Xen maintains a per-node memory range that covers all
of the node's memory blocks. This creates a problem: if the gap
between two of a node's memory blocks contains memory blocks
that belong to other nodes (remote memory blocks), the node's
memory range will be expanded to cover those remote blocks too.

A node's memory range containing other nodes' memory is
obviously unreasonable. It means the current NUMA code can only
support nodes without interleaved memory blocks. However, on a
physical machine the address ranges of multiple nodes can be
interleaved.

So in this patch, we add code to detect memory interleaving
between different nodes. NUMA initialization fails, and error
messages are printed, when Xen detects such a hardware
configuration.

As we have already checked the node's range earlier, for a
non-empty node the "nd->end == end && nd->start == start" check
is unnecessary. So we remove it from conflicting_memblks as
well.
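
For instance, with node 0 blocks at [0, 1G) and [2G, 3G) and a
node 1 block at [1G, 2G), node 0's covering range [0, 3G) fully
contains node 1's block; that is exactly the case rejected
below. A self-contained sketch of the two checks (hypothetical
helper name; the real logic is in conflicting_memblks in the
diff):

    #include <stdint.h>

    typedef uint64_t paddr_t;

    enum conflicts { NO_CONFLICT, OVERLAP, INTERLEAVE };

    /*
     * [blk_start, blk_end) is an already-recorded block of node blk_nid;
     * [start, end) is the new block of node nid; [nd_start, nd_end) is
     * the range node nid would cover after adding the new block.
     */
    static enum conflicts check_one_memblk(unsigned int blk_nid,
                                           paddr_t blk_start, paddr_t blk_end,
                                           unsigned int nid,
                                           paddr_t start, paddr_t end,
                                           paddr_t nd_start, paddr_t nd_end)
    {
        /* Overlap: the new block intersects the recorded block. */
        if ( blk_end > start && blk_start < end )
            return OVERLAP;

        /*
         * Interleave: another node's block lies entirely inside the
         * range this node would expand to. Partial intersections were
         * already caught by the overlap check above.
         */
        if ( blk_nid != nid && blk_start >= nd_start && blk_end <= nd_end )
            return INTERLEAVE;

        return NO_CONFLICT;
    }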

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Reviewed-by: Jan Beulich 
---
v5 -> v6:
1. Use comma to replace dash for [start, end].
2. Add Rb.
v4 -> v5:
1. Remove "nd->end == end && nd->start == start" from
   conflicting_memblks.
2. Use case NO_CONFLICT instead of "default".
3. Correct wrong "node" to "pxm" in print message.
4. Remove unnecessary "else" to reduce the indent depth.
5. Convert all ranges to proper mathematical interval
   representation.
6. Fix code-style comments.
v3 -> v4:
1. Drop "ERR" prefix for enumeration, and remove init value.
2. Use "switch case" for enumeration, and add "default:"
3. Use "PXM" in log messages.
4. Use unsigned int for node memory block id.
5. Fix some code-style comments.
6. Use "nd->end" in node range expansion check.
v2 -> v3:
1. Merge the check code from a separate function to
   conflicting_memblks. This will reduce the loop
   times of node memory blocks.
2. Use an enumeration to indicate conflict check status.
3. Use a pointer to get conflict memory block id.
v1 -> v2:
1. Update the description to say that what we're after is no
   memory interleaving between different nodes.
2. Only update node range when it passes the interleave check.
3. Don't use full upper-case for "node".
---
 xen/arch/x86/srat.c | 139 
 1 file changed, 101 insertions(+), 38 deletions(-)

diff --git a/xen/arch/x86/srat.c b/xen/arch/x86/srat.c
index 8ffe43bdfe..3d02520a5a 100644
--- a/xen/arch/x86/srat.c
+++ b/xen/arch/x86/srat.c
@@ -42,6 +42,12 @@ static struct node node_memblk_range[NR_NODE_MEMBLKS];
 static nodeid_t memblk_nodeid[NR_NODE_MEMBLKS];
 static __initdata DECLARE_BITMAP(memblk_hotplug, NR_NODE_MEMBLKS);
 
+enum conflicts {
+   NO_CONFLICT,
+   OVERLAP,
+   INTERLEAVE,
+};
+
 static inline bool node_found(unsigned idx, unsigned pxm)
 {
return ((pxm2node[idx].pxm == pxm) &&
@@ -119,20 +125,45 @@ int valid_numa_range(paddr_t start, paddr_t end, nodeid_t node)
return 0;
 }
 
-static __init int conflicting_memblks(paddr_t start, paddr_t end)
+static
+enum conflicts __init conflicting_memblks(nodeid_t nid, paddr_t start,
+ paddr_t end, paddr_t nd_start,
+ paddr_t nd_end, unsigned int *mblkid)
 {
-   int i;
+   unsigned int i;
 
+   /*
+* Scan all recorded nodes' memory blocks to check conflicts:
+* Overlap or interleave.
+*/
for (i = 0; i < num_node_memblks; i++) {
struct node *nd = &node_memblk_range[i];
+
+   *mblkid = i;
+
+   /* Skip 0 bytes node memory block. */
if (nd->start == nd->end)
continue;
+   /*
+* Use memblk range to check memblk overlaps, include the
+* self-overlap case. As nd's range is non-empty, the special
+* case "nd->end == end && nd->start == start" also can be 
covered.
+*/
if (nd->end > start && nd->start < end)
-   return i;
-   if (nd->end == end && nd->start == start)
-   return i;
+   return OVERLAP;
+
+   /*
+* Use node memory range to check whether new range contains
+* memory from other nodes - interleave check. We just need
+* to check full contains situation. Because overlaps have
+* been checked above.
+*/
+   if (nid != memblk_nodeid[i] &&
+   nd->start >= nd_start && nd->end <= nd_end)
+   return INTERLEAVE;
}
-   return -1;
+
+   return NO_CONFLICT;
 }
 
 static __init void cutoff_node(int i, paddr_t start, paddr_t end)
@@ -275,10 +306,12 @@ acpi_numa_processor_affinity_init(const struct acpi_srat_cpu_affinity *pa)
 void __init
 acpi_numa_memory_affinity_init(co

[PATCH v6 3/8] xen: introduce an arch helper for default dma zone status

2022-06-09 Thread Wei Chen
In the current code, when Xen runs on a multi-node NUMA system,
it sets dma_bitsize in end_boot_allocator to reserve some
low-address memory as a DMA zone.

The implementation has some x86 implications, because on x86
memory starts from 0. On a multi-node NUMA system, if a single
node contains the majority or all of the DMA memory, x86
prefers to hand out memory from non-local allocations rather
than exhausting the DMA memory ranges. Hence x86 uses
dma_bitsize to set aside a largely arbitrary amount of memory
for the DMA zone; allocations from the DMA zone happen only
after all other nodes' memory is exhausted.

But these implications are not shared across all architectures.
Arm, for example, cannot guarantee the availability of memory
below a certain boundary for DMA-limited devices either, but it
currently doesn't need a reserved DMA zone in Xen, because Xen
itself has no DMA devices. As for guests, Xen on Arm only
allows Dom0 to perform DMA operations without an IOMMU, and at
boot time Xen tries to allocate Dom0's memory below 4GB, or
within the range limited by dma_bitsize. For DomU, even though
Xen can pass devices through to DomU without an IOMMU, Xen on
Arm doesn't guarantee their DMA operations. So Xen on Arm
doesn't need a reserved DMA zone to provide DMA memory for
guests.

In this patch, we introduce an arch_want_default_dmazone helper
that lets each architecture decide whether it needs to set
dma_bitsize for DMA zone reservation.

As a side benefit, an x86 Xen built with CONFIG_PV=n could
probably leverage this new helper to not trigger DMA zone
reservation at all.
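
A runnable sketch of the resulting shape (the stub values here
are made up for illustration; the real definitions are in each
architecture's asm/numa.h, and the call site is in
end_boot_allocator, as the diff below shows):

    #include <stdio.h>

    static unsigned int dma_bitsize;                               /* 0 = unset */
    static unsigned int num_online_nodes(void) { return 2; }      /* pretend */
    static unsigned int arch_get_dma_bitsize(void) { return 30; } /* arbitrary */

    /* x86 policy; Arm's asm/numa.h defines this as (false) instead. */
    #define arch_want_default_dmazone() (num_online_nodes() > 1)

    int main(void)
    {
        /* Mirrors the end_boot_allocator() call site in the diff below. */
        if ( !dma_bitsize && arch_want_default_dmazone() )
            dma_bitsize = arch_get_dma_bitsize();

        printf("dma_bitsize = %u\n", dma_bitsize);  /* prints 30 here */
        return 0;
    }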

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Acked-by: Jan Beulich 
---
v3 -> v4:
1. Add Acked-by.
v2 -> v3:
1. Add Tb.
2. Rename arch_have_default_dmazone to arch_want_default_dmazone.
v1 -> v2:
1. Extend the description of Arm's workaround for reserved DMA
   allocations to avoid the same discussion every time.
2. Use a macro to define arch_have_default_dmazone, because
   it's a little hard to make the x86 version a static inline.
   Using a macro also avoids adding __init to this function.
3. Change arch_have_default_dmazone's return value from
   unsigned int to bool.
4. Un-addressed comment: make x86's arch_have_default_dmazone
   a static inline. If we moved arch_have_default_dmazone to
   x86/asm/numa.h, it would depend on nodemask.h to provide
   num_online_nodes, but nodemask.h needs numa.h to provide
   MAX_NUMNODES, causing a circular dependency. And this
   function is only used in end_boot_allocator, during Xen
   initialization, so compared to the churn inlining would
   introduce, it doesn't gain much.
---
 xen/arch/arm/include/asm/numa.h | 1 +
 xen/arch/x86/include/asm/numa.h | 1 +
 xen/common/page_alloc.c | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/xen/arch/arm/include/asm/numa.h b/xen/arch/arm/include/asm/numa.h
index 31a6de4e23..e4c4d89192 100644
--- a/xen/arch/arm/include/asm/numa.h
+++ b/xen/arch/arm/include/asm/numa.h
@@ -24,6 +24,7 @@ extern mfn_t first_valid_mfn;
 #define node_spanned_pages(nid) (max_page - mfn_x(first_valid_mfn))
 #define node_start_pfn(nid) (mfn_x(first_valid_mfn))
 #define __node_distance(a, b) (20)
+#define arch_want_default_dmazone() (false)
 
 #endif /* __ARCH_ARM_NUMA_H */
 /*
diff --git a/xen/arch/x86/include/asm/numa.h b/xen/arch/x86/include/asm/numa.h
index bada2c0bb9..5d8385f2e1 100644
--- a/xen/arch/x86/include/asm/numa.h
+++ b/xen/arch/x86/include/asm/numa.h
@@ -74,6 +74,7 @@ static inline __attribute__((pure)) nodeid_t phys_to_nid(paddr_t addr)
 #define node_spanned_pages(nid)(NODE_DATA(nid)->node_spanned_pages)
 #define node_end_pfn(nid)   (NODE_DATA(nid)->node_start_pfn + \
 NODE_DATA(nid)->node_spanned_pages)
+#define arch_want_default_dmazone() (num_online_nodes() > 1)
 
 extern int valid_numa_range(u64 start, u64 end, nodeid_t node);
 
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ea59cd1a4a..000ae6b972 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1889,7 +1889,7 @@ void __init end_boot_allocator(void)
 }
 nr_bootmem_regions = 0;
 
-if ( !dma_bitsize && (num_online_nodes() > 1) )
+if ( !dma_bitsize && arch_want_default_dmazone() )
 dma_bitsize = arch_get_dma_bitsize();
 
 printk("Domain heap initialised");
-- 
2.25.1




[PATCH v6 2/8] xen/arm: Keep memory nodes in device tree when Xen boots from EFI

2022-06-09 Thread Wei Chen
In the current code, when Xen boots from EFI it deletes all
memory nodes from the device tree. This works fine today,
because Xen can get the memory map from the EFI system table.
However, the EFI system table cannot completely replace the
device tree's memory nodes: it doesn't contain memory NUMA
information, and Xen depends on ACPI SRAT or device tree memory
nodes to parse the NUMA mapping of memory blocks. So in an
EFI + DTB boot, Xen has no way to obtain the numa-node-id of
memory blocks any more. This makes device tree based NUMA
support impossible for Xen in an EFI + DTB boot.

So in this patch, we keep the memory nodes in the device tree
for the NUMA code to parse their numa-node-id later.

As a side effect, if we still parsed boot memory information in
early_scan_node, bootinfo.mem would accumulate the memory
ranges from the memory nodes twice. So we have to prevent
early_scan_node from parsing memory nodes in an EFI boot.
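
For background, this is roughly what the later NUMA parsing
needs from each memory node, and why the nodes must survive; a
hypothetical helper (not part of this patch) reading the
numa-node-id property via libfdt:

    #include <libfdt.h>

    /* Hypothetical helper, for illustration only: fetch the NUMA node
     * id of a DT memory node. If the EFI path had deleted the memory
     * nodes, there would be nothing left to query. */
    static int memory_node_to_nid(const void *fdt, int node)
    {
        int len;
        const fdt32_t *prop = fdt_getprop(fdt, node, "numa-node-id", &len);

        if ( !prop || len < (int)sizeof(*prop) )
            return -1;  /* no NUMA information for this node */

        return (int)fdt32_to_cpu(*prop);
    }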

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Reviewed-by: Stefano Stabellini 
---
v3 -> v4:
1. No change.
v2 -> v3:
1. Add Rb.
v1 -> v2:
1. Move this patch from later to early of this series.
2. Refine commit message.
---
 xen/arch/arm/bootfdt.c  |  8 +++-
 xen/arch/arm/efi/efi-boot.h | 25 -
 2 files changed, 7 insertions(+), 26 deletions(-)

diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
index 29671c8df0..ec81a45de9 100644
--- a/xen/arch/arm/bootfdt.c
+++ b/xen/arch/arm/bootfdt.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -367,7 +368,12 @@ static int __init early_scan_node(const void *fdt,
 {
 int rc = 0;
 
-if ( device_tree_node_matches(fdt, node, "memory") )
+/*
+ * If Xen has been booted via UEFI, the memory banks are
+ * populated. So we should skip the parsing.
+ */
+if ( !efi_enabled(EFI_BOOT) &&
+ device_tree_node_matches(fdt, node, "memory") )
 rc = process_memory_node(fdt, node, name, depth,
  address_cells, size_cells, &bootinfo.mem);
 else if ( depth == 1 && !dt_node_cmp(name, "reserved-memory") )
diff --git a/xen/arch/arm/efi/efi-boot.h b/xen/arch/arm/efi/efi-boot.h
index e452b687d8..59d93c24a1 100644
--- a/xen/arch/arm/efi/efi-boot.h
+++ b/xen/arch/arm/efi/efi-boot.h
@@ -231,33 +231,8 @@ EFI_STATUS __init fdt_add_uefi_nodes(EFI_SYSTEM_TABLE *sys_table,
 int status;
 u32 fdt_val32;
 u64 fdt_val64;
-int prev;
 int num_rsv;
 
-/*
- * Delete any memory nodes present.  The EFI memory map is the only
- * memory description provided to Xen.
- */
-prev = 0;
-for (;;)
-{
-const char *type;
-int len;
-
-node = fdt_next_node(fdt, prev, NULL);
-if ( node < 0 )
-break;
-
-type = fdt_getprop(fdt, node, "device_type", &len);
-if ( type && strncmp(type, "memory", len) == 0 )
-{
-fdt_del_node(fdt, node);
-continue;
-}
-
-prev = node;
-}
-
/*
 * Delete all memory reserve map entries. When booting via UEFI,
 * kernel will use the UEFI memory map to find reserved regions.
-- 
2.25.1




[PATCH v6 4/8] xen: decouple NUMA from ACPI in Kconfig

2022-06-09 Thread Wei Chen
The current Xen code only implements x86 ACPI-based NUMA
support, so in the Xen Kconfig system NUMA is equivalent to
ACPI_NUMA: x86 selects NUMA by default, and CONFIG_ACPI_NUMA is
hardcoded in config.h.

In a follow-up patch, we will introduce support for NUMA using
the device tree. That means we will have two NUMA
implementations, so in this patch we decouple NUMA from
ACPI-based NUMA in Kconfig, making NUMA a common feature that
device tree based NUMA can select as well.

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Reviewed-by: Jan Beulich 
---
v3 -> v4:
no change.
v2 -> v3:
Add Tb.
v1 -> v2:
No change.
---
 xen/arch/x86/Kconfig  | 2 +-
 xen/arch/x86/include/asm/config.h | 1 -
 xen/common/Kconfig| 3 +++
 xen/drivers/acpi/Kconfig  | 3 ++-
 xen/drivers/acpi/Makefile | 2 +-
 5 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index 06d6fbc864..1e31edc99f 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -6,6 +6,7 @@ config X86
def_bool y
select ACPI
select ACPI_LEGACY_TABLES_LOOKUP
+   select ACPI_NUMA
select ALTERNATIVE_CALL
select ARCH_SUPPORTS_INT128
select CORE_PARKING
@@ -26,7 +27,6 @@ config X86
select HAS_UBSAN
select HAS_VPCI if HVM
select NEEDS_LIBELF
-   select NUMA
 
 config ARCH_DEFCONFIG
string
diff --git a/xen/arch/x86/include/asm/config.h 
b/xen/arch/x86/include/asm/config.h
index de20642524..07bcd15831 100644
--- a/xen/arch/x86/include/asm/config.h
+++ b/xen/arch/x86/include/asm/config.h
@@ -31,7 +31,6 @@
 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
 #define CONFIG_X86_L1_CACHE_SHIFT 7
 
-#define CONFIG_ACPI_NUMA 1
 #define CONFIG_ACPI_SRAT 1
 #define CONFIG_ACPI_CSTATE 1
 
diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index d921c74d61..d65add3fc6 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -70,6 +70,9 @@ config MEM_ACCESS
 config NEEDS_LIBELF
bool
 
+config NUMA
+   bool
+
 config STATIC_MEMORY
bool "Static Allocation Support (UNSUPPORTED)" if UNSUPPORTED
depends on ARM
diff --git a/xen/drivers/acpi/Kconfig b/xen/drivers/acpi/Kconfig
index b64d3731fb..e3f3d8f4b1 100644
--- a/xen/drivers/acpi/Kconfig
+++ b/xen/drivers/acpi/Kconfig
@@ -5,5 +5,6 @@ config ACPI
 config ACPI_LEGACY_TABLES_LOOKUP
bool
 
-config NUMA
+config ACPI_NUMA
bool
+   select NUMA
diff --git a/xen/drivers/acpi/Makefile b/xen/drivers/acpi/Makefile
index 4f8e97228e..2fc5230253 100644
--- a/xen/drivers/acpi/Makefile
+++ b/xen/drivers/acpi/Makefile
@@ -3,7 +3,7 @@ obj-y += utilities/
 obj-$(CONFIG_X86) += apei/
 
 obj-bin-y += tables.init.o
-obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_ACPI_NUMA) += numa.o
 obj-y += osl.o
 obj-$(CONFIG_HAS_CPUFREQ) += pmstat.o
 
-- 
2.25.1




[PATCH v6 0/8] Device tree based NUMA support for Arm - Part#1

2022-06-09 Thread Wei Chen
(The Arm device tree based NUMA support patch set contains 35
patches. In order to make stuff easier for reviewers, I split
them into 3 parts:
1. Preparation. I have re-sorted the patch series. And moved
   independent patches to the head of the series.
2. Move generically usable code from x86 to common.
3. Add new code to support Arm.

This series only contains the first part patches.)

Xen's memory allocation and scheduler modules are NUMA aware.
But so far only x86 has implemented the architecture APIs to
support NUMA. Arm has been providing a set of fake architecture
APIs to stay compatible with the NUMA-aware memory allocator
and scheduler.

Arm systems worked well as single-node NUMA systems with these
fake APIs, because we didn't have multi-node NUMA systems on
Arm. But in recent years, more and more Arm devices support
multi-node NUMA.

So now we have a new problem: when Xen runs on these Arm
devices, it still treats them as single-node SMP systems. The
NUMA affinity capability of Xen's memory allocator and
scheduler becomes meaningless, because they rely on input data
that does not reflect the real NUMA layout.

Xen still assumes the access time for all of the memory is the
same for all CPUs. However, Xen may allocate memory to a VM
from different NUMA nodes with different access speeds. This
difference can be amplified by workloads inside the VM, causing
performance instability and timeouts.

So in this patch series, we implement a set of NUMA APIs that
use the device tree to describe the NUMA layout. We reuse most
of the x86 NUMA code to create and maintain the mapping between
memory and CPUs, and the distance matrix between any two NUMA
nodes. Apart from ACPI and some x86-specific code, the rest has
been moved to common. In the next stage, when we implement
ACPI-based NUMA for Arm64, we may move the ACPI NUMA code to
common too, but for now we keep it x86-only.

This patch series has been tested and boots well on one Arm64
NUMA machine and one HPE x86 NUMA machine.

---
Part1 v5->v6:
1. Use comma to replace dash for "[start, end]".
Part1 v4->v5:
1. Remove "nd->end == end && nd->start == start" from
   conflicting_memblks.
2. Use case NO_CONFLICT instead of "default".
3. Correct wrong "node" to "pxm" in print message.
4. Remove unnecessary "else" to reduce the indent depth.
5. Convert all ranges to proper mathematical interval
   representation.
6. Fix code-style comments.
Part1 v3->v4:
1. Add indentation to align the ln and test invocations in
   the EFI common makefile.
2. Drop "ERR" prefix for node conflict check enumeration,
   and remove init value.
3. Use "switch case" for enumeration, and add "default:"
4. Use "PXM" in log messages.
5. Use unsigned int for node memory block id.
6. Fix some code-style comments.
7. Use "nd->end" in node range expansion check.
Part1 v2->v3:
1. Rework EFI stub patch:
   1.1. Add existing-file check: if a regular stub file exists,
        the common/stub files' links will be ignored.
   1.2. Keep stub.c in x86/efi to include common/efi/stub.c
   1.3. Restore efi_compat_xxx stub functions to x86/efi.c.
Other architectures will not use efi_compat_xxx.
   1.4. Remove ARM_EFI dependency from ARM_64.
   1.5. Add comment for adding stub.o to EFIOBJ-y.
   1.6. Merge patch#2 and patch#3 to one patch.
2. Rename arch_have_default_dmazone to arch_want_default_dmazone
3. Use uint64_t for size in acpi_scan_nodes, to make it
   consistent with numa_emulation.
4. Merge the interleaves checking code from a separate function
   to conflicting_memblks.
5. Use INFO level for nodes without memory log message.
6. Move "xen/x86: Use ASSERT instead of VIRTUAL_BUG_ON for
   phys_to_nid" to part#2.
Part1 v1->v2:
1. Move independent patches from later to early of this series.
2. Drop the copy of EFI stub.c from Arm. Share common codes of
   x86 EFI stub for Arm.
3. Use CONFIG_ARM_EFI to replace CONFIG_EFI and remove help text
   and make CONFIG_ARM_EFI invisible.
4. Use ASSERT to replace VIRTUAL_BUG_ON in phys_to_nid.
5. Move MAX_NUMNODES from xen/numa.h to asm/numa.h for x86.
6. Extend the description of Arm's workaround for reserve DMA
   allocations to avoid the same discussion every time for
   arch_have_default_dmazone.
7. Update commit messages.

Wei Chen (8):
  xen: reuse x86 EFI stub functions for Arm
  xen/arm: Keep memory nodes in device tree when Xen boots from EFI
  xen: introduce an arch helper for default dma zone status
  xen: decouple NUMA from ACPI in Kconfig
  xen/arm: use !CONFIG_NUMA to keep fake NUMA API
  xen/x86: use paddr_t for addresses in NUMA node structure
  xen/x86: add detection of memory interleaves for different nodes
  xen/x86: use INFO level for nodes without memory log message

 xen/arch/arm/Kconfig  |   4 +
 xen/arch/arm/Makefile |   2 +-
 xen/arch/arm/bootfdt.c|   8 +-
 xen/arch/arm/efi/Makefile |   8 ++
 xen/arch/arm/efi/efi-boot.h   |  25 -
 xen/arch/arm/include/asm/numa.h   |   6 ++
 xen/arc

[PATCH v6 1/8] xen: reuse x86 EFI stub functions for Arm

2022-06-09 Thread Wei Chen
x86 uses compiler feature testing to decide whether the EFI
build is enabled. When the EFI build is disabled, x86 uses an
efi/stub.c file in place of efi/runtime.c for the build
objects. Following this idea, we introduce a stub file for Arm,
but use CONFIG_ARM_EFI to decide whether the EFI build is
enabled.

Most of the functions in the x86 EFI stub.c can be reused by
other architectures, like Arm. So we move them to common and
keep the x86-specific functions in x86/efi/stub.c.

To avoid symbol-link conflict errors when linking the common
stub files into x86/efi, we add a regular-file check to the EFI
stub files' link rule. With this check we can bypass the
linking for stub files that already exist in x86/efi.

As there are no Arm-specific EFI stub functions at the current
stage, Arm can still use the existing symbol-link method for
the EFI stub files.
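
Condensed, the shape of xen/arch/x86/efi/stub.c after this
patch is (taken from the diff below, shown here only to
illustrate the reuse pattern):

    /* Shared stubs (efi_enabled(), efi_get_time(), ...) now come from
     * the common file, pulled in textually: */
    #include "../../../common/efi/stub.c"

    /* x86-specific stubs stay local to this file: */
    void __init efi_init_memory(void) { }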

Signed-off-by: Wei Chen 
Tested-by: Jiamei Xie 
Acked-by: Jan Beulich 
---
v4 -> v5:
1. Add acked-by.
v3 -> v4:
1. Add indentation to align the ln and test invocations.
v2 -> v3:
1. Add existing-file check: if a regular stub file exists,
   the common/stub files' link will be ignored.
2. Keep stub.c in x86/efi to include common/efi/stub.c
3. Restore efi_compat_xxx stub functions to x86/efi.c.
   Other architectures will not use efi_compat_xxx.
4. Remove ARM_EFI dependency from ARM_64.
5. Add comment for adding stub.o to EFIOBJ-y.
6. Merge patch#2 and patch#3 to one patch.
v1 -> v2:
1. Drop the copy of stub.c from Arm EFI.
2. Share common codes of x86 EFI stub for other architectures.
3. Use CONFIG_ARM_EFI to replace CONFIG_EFI
4. Remove help text and make CONFIG_ARM_EFI invisible.
5. Merge one following patch:
   xen/arm: introduce a stub file for non-EFI architectures
6. Use the common stub.c instead of creating new one.
---
 xen/arch/arm/Kconfig |  4 
 xen/arch/arm/Makefile|  2 +-
 xen/arch/arm/efi/Makefile|  8 
 xen/arch/x86/efi/stub.c  | 32 +---
 xen/common/efi/efi-common.mk |  3 ++-
 xen/common/efi/stub.c| 32 
 6 files changed, 48 insertions(+), 33 deletions(-)
 create mode 100644 xen/common/efi/stub.c

diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index ecfa6822e4..8a16d43bd5 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -6,6 +6,7 @@ config ARM_64
def_bool y
depends on !ARM_32
select 64BIT
+   select ARM_EFI
select HAS_FAST_MULTIPLY
 
 config ARM
@@ -33,6 +34,9 @@ config ACPI
  Advanced Configuration and Power Interface (ACPI) support for Xen is
  an alternative to device tree on ARM64.
 
+config ARM_EFI
+   bool
+
 config GICV3
bool "GICv3 driver"
depends on ARM_64 && !NEW_VGIC
diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index 1d862351d1..bb7a6151c1 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -1,6 +1,5 @@
 obj-$(CONFIG_ARM_32) += arm32/
 obj-$(CONFIG_ARM_64) += arm64/
-obj-$(CONFIG_ARM_64) += efi/
 obj-$(CONFIG_ACPI) += acpi/
 obj-$(CONFIG_HAS_PCI) += pci/
 ifneq ($(CONFIG_NO_PLAT),y)
@@ -20,6 +19,7 @@ obj-y += domain.o
 obj-y += domain_build.init.o
 obj-y += domctl.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-y += efi/
 obj-y += gic.o
 obj-y += gic-v2.o
 obj-$(CONFIG_GICV3) += gic-v3.o
diff --git a/xen/arch/arm/efi/Makefile b/xen/arch/arm/efi/Makefile
index 4313c39066..dffe72e589 100644
--- a/xen/arch/arm/efi/Makefile
+++ b/xen/arch/arm/efi/Makefile
@@ -1,4 +1,12 @@
 include $(srctree)/common/efi/efi-common.mk
 
+ifeq ($(CONFIG_ARM_EFI),y)
 obj-y += $(EFIOBJ-y)
 obj-$(CONFIG_ACPI) +=  efi-dom0.init.o
+else
+# Add stub.o to EFIOBJ-y to re-use the clean-files in
+# efi-common.mk. Otherwise the link of stub.c in arm/efi
+# will not be cleaned in "make clean".
+EFIOBJ-y += stub.o
+obj-y += stub.o
+endif
diff --git a/xen/arch/x86/efi/stub.c b/xen/arch/x86/efi/stub.c
index 9984932626..f2365bc041 100644
--- a/xen/arch/x86/efi/stub.c
+++ b/xen/arch/x86/efi/stub.c
@@ -1,7 +1,5 @@
 #include 
-#include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -10,6 +8,7 @@
 #include 
 #include 
 #include 
+#include "../../../common/efi/stub.c"
 
 /*
  * Here we are in EFI stub. EFI calls are not supported due to lack
@@ -45,11 +44,6 @@ void __init noreturn efi_multiboot2(EFI_HANDLE ImageHandle,
 unreachable();
 }
 
-bool efi_enabled(unsigned int feature)
-{
-return false;
-}
-
 void __init efi_init_memory(void) { }
 
 bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
@@ -62,32 +56,8 @@ bool efi_boot_mem_unused(unsigned long *start, unsigned long *end)
 
 void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t l4e) { }
 
-bool efi_rs_using_pgtables(void)
-{
-return false;
-}
-
-unsigned long efi_get_time(void)
-{
-BUG();
-return 0;
-}
-
-void efi_halt_system(void) { }
-void efi_reset_system(bool warm) { }
-
-int efi_get_info(uint32_t idx, union xenpf_efi_info *info)
-{
-return -ENOSYS;
-}
-
 int efi_compat_get_i

[qemu-mainline test] 170902: FAIL

2022-06-09 Thread osstest service owner
flight 170902 qemu-mainline real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170902/

Failures and problems with tests :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-rtds broken  in 170891

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-xl-rtds 5 host-install(5) broken in 170891 pass in 170902
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 16 guest-saverestore.2 fail 
in 170891 pass in 170902
 test-amd64-amd64-libvirt-vhd 19 guest-start/debian.repeat fail in 170891 pass 
in 170902
 test-amd64-amd64-xl-qcow221 guest-start/debian.repeat  fail pass in 170891

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170884
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170884
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170884
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 170884
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170884
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170884
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 170884
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170884
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-check 

[xen-4.16-testing test] 170901: regressions - FAIL

2022-06-09 Thread osstest service owner
flight 170901 xen-4.16-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170901/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-i386-xl-shadow 8 xen-boot fail REGR. vs. 170871
 test-amd64-coresched-i386-xl  8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemut-debianhvm-i386-xsm  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-xl-xsm8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemut-stubdom-debianhvm-amd64-xsm 8 xen-boot fail REGR. vs. 
170871
 test-amd64-i386-pair 12 xen-boot/src_hostfail REGR. vs. 170871
 test-amd64-i386-pair 13 xen-boot/dst_hostfail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-debianhvm-amd64  8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-freebsd10-i386  8 xen-boot   fail REGR. vs. 170871
 test-amd64-i386-libvirt   8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-livepatch 8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-qemuu-rhel6hvm-intel  8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-debianhvm-i386-xsm  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-xl-qemut-win7-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-qemut-rhel6hvm-intel  8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-win7-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 8 xen-boot fail REGR. vs. 
170871
 test-amd64-i386-libvirt-xsm   8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemut-debianhvm-amd64  8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-migrupgrade  13 xen-boot/dst_hostfail REGR. vs. 170871
 test-amd64-i386-qemut-rhel6hvm-amd  8 xen-boot   fail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-ovmf-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-xl-pvshim 8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-libvirt-pair 12 xen-boot/src_hostfail REGR. vs. 170871
 test-amd64-i386-libvirt-pair 13 xen-boot/dst_hostfail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-debianhvm-amd64-shadow 8 xen-boot fail REGR. vs. 
170871
 test-amd64-i386-xl-qemut-ws16-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-xl8 xen-boot fail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-ws16-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-freebsd10-amd64  8 xen-boot  fail REGR. vs. 170871
 test-amd64-i386-qemuu-rhel6hvm-amd  8 xen-boot   fail REGR. vs. 170871
 test-xtf-amd64-amd64-2   87 xtf/test-pv32pae-xsa-188 fail REGR. vs. 170871
 test-xtf-amd64-amd64-2   88 leak-check/check fail REGR. vs. 170871
 test-armhf-armhf-xl-credit1  10 host-ping-check-xen  fail REGR. vs. 170871
 test-amd64-i386-xl-qemuu-dmrestrict-amd64-dmrestrict 8 xen-boot fail REGR. vs. 
170871

Tests which did not succeed, but are not blocking:
 test-amd64-i386-xl-vhd8 xen-bootfail blocked in 170871
 test-amd64-i386-libvirt-raw   8 xen-bootfail blocked in 170871
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170871
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170871
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 170871
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170871
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170871
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170871
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 170871
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170871
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfai

[GIT PULL] xen: branch for v5.19-rc2

2022-06-09 Thread Juergen Gross
Linus,

Please git pull the following tag:

 git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git 
for-linus-5.19a-rc2-tag

xen: branch for v5.19-rc2

It contains:
- a small cleanup removing "export" of an __init function
- a small series adding a new infrastructure for platform flags
- a series adding generic virtio support for Xen guests (frontend side)

Thanks.

Juergen

 .../devicetree/bindings/iommu/xen,grant-dma.yaml   |  39 +++
 MAINTAINERS|   8 +
 arch/arm/include/asm/xen/xen-ops.h |   2 +
 arch/arm/mm/dma-mapping.c  |   7 +-
 arch/arm/xen/enlighten.c   |   2 +
 arch/arm64/include/asm/xen/xen-ops.h   |   2 +
 arch/arm64/mm/dma-mapping.c|   7 +-
 arch/s390/Kconfig  |   1 -
 arch/s390/mm/init.c|  13 +-
 arch/x86/Kconfig   |   1 -
 arch/x86/mm/mem_encrypt.c  |   7 -
 arch/x86/mm/mem_encrypt_amd.c  |   4 +
 arch/x86/xen/enlighten_hvm.c   |   2 +
 arch/x86/xen/enlighten_pv.c|   2 +
 drivers/virtio/Kconfig |   6 -
 drivers/virtio/virtio.c|   5 +-
 drivers/xen/Kconfig|  20 ++
 drivers/xen/Makefile   |   2 +
 drivers/xen/grant-dma-iommu.c  |  78 +
 drivers/xen/grant-dma-ops.c| 346 +
 drivers/xen/grant-table.c  | 251 ---
 drivers/xen/xlate_mmu.c|   1 -
 include/asm-generic/Kbuild |   1 +
 include/asm-generic/platform-feature.h |   8 +
 include/linux/platform-feature.h   |  19 ++
 include/linux/virtio_config.h  |   9 -
 include/xen/arm/xen-ops.h  |  18 ++
 include/xen/grant_table.h  |   4 +
 include/xen/xen-ops.h  |  13 +
 include/xen/xen.h  |   8 +
 kernel/Makefile|   2 +-
 kernel/platform-feature.c  |  27 ++
 32 files changed, 830 insertions(+), 85 deletions(-)

Juergen Gross (5):
  kernel: add platform_has() infrastructure
  virtio: replace arch_has_restricted_virtio_memory_access()
  xen/grants: support allocating consecutive grants
  xen/grant-dma-ops: Add option to restrict memory access under Xen
  xen/virtio: Enable restricted memory access using Xen grant mappings

Masahiro Yamada (1):
  xen: unexport __init-annotated xen_xlate_map_ballooned_pages()

Oleksandr Tyshchenko (5):
  arm/xen: Introduce xen_setup_dma_ops()
  dt-bindings: Add xen,grant-dma IOMMU description for xen-grant DMA ops
  xen/grant-dma-iommu: Introduce stub IOMMU driver
  xen/grant-dma-ops: Retrieve the ID of backend's domain for DT devices
  arm/xen: Assign xen-grant DMA ops for xen-grant DMA devices



Re: [PATCH 04/36] cpuidle,intel_idle: Fix CPUIDLE_FLAG_IRQ_ENABLE

2022-06-09 Thread Jacob Pan
Hi Peter,

On Wed, 08 Jun 2022 16:27:27 +0200, Peter Zijlstra 
wrote:

> Commit c227233ad64c ("intel_idle: enable interrupts before C1 on
> Xeons") wrecked intel_idle in two ways:
> 
>  - must not have tracing in idle functions
>  - must return with IRQs disabled
> 
> Additionally, it added a branch for no good reason.
> 
> Fixes: c227233ad64c ("intel_idle: enable interrupts before C1 on Xeons")
> Signed-off-by: Peter Zijlstra (Intel) 
> ---
>  drivers/idle/intel_idle.c |   48
> +++--- 1 file changed, 37
> insertions(+), 11 deletions(-)
> 
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -129,21 +137,37 @@ static unsigned int mwait_substates __in
>   *
>   * Must be called under local_irq_disable().
>   */
nit: this comment is no longer true, right?

> +
> -static __cpuidle int intel_idle(struct cpuidle_device *dev,
> - struct cpuidle_driver *drv, int index)
> +static __always_inline int __intel_idle(struct cpuidle_device *dev,
> + struct cpuidle_driver *drv, int
> index) {
>   struct cpuidle_state *state = &drv->states[index];
>   unsigned long eax = flg2MWAIT(state->flags);
>   unsigned long ecx = 1; /* break on interrupt flag */
>  
> - if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE)
> - local_irq_enable();
> -
>   mwait_idle_with_hints(eax, ecx);
>  
>   return index;
>  }
>  
> +static __cpuidle int intel_idle(struct cpuidle_device *dev,
> + struct cpuidle_driver *drv, int index)
> +{
> + return __intel_idle(dev, drv, index);
> +}
> +
> +static __cpuidle int intel_idle_irq(struct cpuidle_device *dev,
> + struct cpuidle_driver *drv, int
> index) +{
> + int ret;
> +
> + raw_local_irq_enable();
> + ret = __intel_idle(dev, drv, index);
> + raw_local_irq_disable();
> +
> + return ret;
> +}
> +
>  /**
>   * intel_idle_s2idle - Ask the processor to enter the given idle state.
>   * @dev: cpuidle device of the target CPU.
> @@ -1801,6 +1824,9 @@ static void __init intel_idle_init_cstat
>   /* Structure copy. */
>   drv->states[drv->state_count] =
> cpuidle_state_table[cstate]; 
> + if (cpuidle_state_table[cstate].flags &
> CPUIDLE_FLAG_IRQ_ENABLE)
> + drv->states[drv->state_count].enter =
> intel_idle_irq; +
>   if ((disabled_states_mask & BIT(drv->state_count)) ||
>   ((icpu->use_acpi || force_use_acpi) &&
>intel_idle_off_by_default(mwait_hint) &&
> 
> 


Thanks,

Jacob



[xen-unstable test] 170897: regressions - FAIL

2022-06-09 Thread osstest service owner
flight 170897 xen-unstable real [real]
flight 170907 xen-unstable real-retest [real]
http://logs.test-lab.xenproject.org/osstest/logs/170897/
http://logs.test-lab.xenproject.org/osstest/logs/170907/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-qemut-debianhvm-i386-xsm  8 xen-boot fail REGR. vs. 170890
 test-amd64-amd64-xl-qemut-stubdom-debianhvm-amd64-xsm 8 xen-boot fail REGR. 
vs. 170890

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-xl-qemut-debianhvm-amd64 20 guest-start/debianhvm.repeat fail 
pass in 170907-retest

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 170890
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170890
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170890
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170890
 test-amd64-i386-xl-qemut-ws16-amd64 19 guest-stop fail like 170890
 test-amd64-i386-xl-qemut-win7-amd64 19 guest-stop fail like 170890
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170890
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170890
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 170890
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 170890
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 170890
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170890
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-supp

[PATCH] add more MISRA C rules to docs/misra/rules.rst

2022-06-09 Thread Stefano Stabellini
Add the new MISRA C rules agreed by the MISRA C working group to
docs/misra/rules.rst.

Signed-off-by: Stefano Stabellini 

---

I added the rules that we agreed upon this morning together with all the
notes we discussed, in particular:

- macros as macro parameters at invocation time for Rule 5.3
  (see the sketch below)
- the clarification of Rule 9.1
- gnu_inline exception for Rule 8.10
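
As a concrete illustration of the Rule 5.3 note (hypothetical
code, not from the Xen tree):

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int limit;  /* file-scope identifier */

    int example(int var0, int var1, int var2)
    {
        /* Allowed per the note above: a macro used as a macro
         * parameter at invocation time. */
        int v = MAX(var0, MIN(var1, var2));

        /* Would violate Rule 5.3: this "limit" hides the
         * file-scope "limit". */
        int limit = v;

        return limit;
    }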


diff --git a/docs/misra/rules.rst b/docs/misra/rules.rst
index 6ccff07765..5c28836bc8 100644
--- a/docs/misra/rules.rst
+++ b/docs/misra/rules.rst
@@ -89,6 +89,28 @@ existing codebase are work-in-progress.
(xen/include/public/) are allowed to retain longer identifiers
for backward compatibility.
 
+   * - `Rule 5.2 
`_
+ - Required
+ - Identifiers declared in the same scope and name space shall be
+   distinct
+ - The Xen characters limit for identifiers is 40. Public headers
+   (xen/include/public/) are allowed to retain longer identifiers
+   for backward compatibility.
+
+   * - `Rule 5.3 
`_
+ - Required
+ - An identifier declared in an inner scope shall not hide an
+   identifier declared in an outer scope
+ - Using macros as macro parameters at invocation time is allowed,
+   e.g. MAX(var0, MIN(var1, var2))
+
+   * - `Rule 5.4 
`_
+ - Required
+ - Macro identifiers shall be distinct
+ - The Xen characters limit for macro identifiers is 40. Public
+   headers (xen/include/public/) are allowed to retain longer
+   identifiers for backward compatibility.
+
* - `Rule 6.2 
`_
  - Required
  - Single-bit named bit fields shall not be of a signed type
@@ -123,8 +145,75 @@ existing codebase are work-in-progress.
declarations of objects and functions that have internal linkage
  -
 
+   * - `Rule 8.10 
`_
+ - Required
+ - An inline function shall be declared with the static storage class
+ - gnu_inline (without static) is allowed.
+
* - `Rule 8.12 
`_
  - Required
  - Within an enumerator list the value of an implicitly-specified
enumeration constant shall be unique
  -
+
+   * - `Rule 9.1 
`_
+ - Mandatory
+ - The value of an object with automatic storage duration shall not
+   be read before it has been set
+ - Rule clarification: do not use variables before they are
+   initialized. An explicit initializer is not necessarily required.
+   Try reducing the scope of the variable. If an explicit
+   initializer is added, consider initializing the variable to a
+   poison value.
+
+   * - `Rule 9.2 
`_
+ - Required
+ - The initializer for an aggregate or union shall be enclosed in
+   braces
+ -
+
+   * - `Rule 13.6 
`_
+ - Mandatory
+ - The operand of the sizeof operator shall not contain any
+   expression which has potential side effects
+ -
+
+   * - `Rule 14.1 
`_
+ - Required
+ - A loop counter shall not have essentially floating type
+ -
+
+   * - `Rule 16.7 
`_
+ - Required
+ - A switch-expression shall not have essentially Boolean type
+ -
+
+   * - `Rule 17.3 
`_
+ - Mandatory
+ - A function shall not be declared implicitly
+ -
+
+   * - `Rule 17.4 
`_
+ - Mandatory
+ - All exit paths from a function with non-void return type shall
+   have an explicit return statement with an expression
+ -
+
+   * - `Rule 20.7 
`_
+ - Required
+ - Expressions resulting from the expansion of macro parameters
+   shall be enclosed in parentheses
+ -
+
+   * - `Rule 20.13 
`_
+ - Required
+ - A line whose first token is # shall be a valid preprocessing
+   directive
+ -
+
+   * - `Rule 20.14 


[linux-5.4 test] 170895: tolerable FAIL - PUSHED

2022-06-09 Thread osstest service owner
flight 170895 linux-5.4 real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170895/

Failures :-/ but no regressions.

Tests which are failing intermittently (not blocking):
 test-armhf-armhf-xl-credit1 18 guest-start/debian.repeat fail in 170887 pass 
in 170895
 test-armhf-armhf-xl-credit2  14 guest-startfail pass in 170887

Tests which did not succeed, but are not blocking:
 test-armhf-armhf-xl-credit2 15 migrate-support-check fail in 170887 never pass
 test-armhf-armhf-xl-credit2 16 saverestore-support-check fail in 170887 never 
pass
 test-armhf-armhf-xl-multivcpu 18 guest-start/debian.repeatfail like 170724
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170724
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170736
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 170736
 test-amd64-i386-xl-qemut-win7-amd64 19 guest-stop fail like 170736
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 170736
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170736
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170736
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170736
 test-amd64-i386-xl-qemut-ws16-amd64 19 guest-stop fail like 170736
 test-armhf-armhf-xl-rtds 18 guest-start/debian.repeatfail  like 170736
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170736
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 170736
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 170736
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check fail never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 te

[linux-linus test] 170894: regressions - FAIL

2022-06-09 Thread osstest service owner
flight 170894 linux-linus real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170894/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-arm64-arm64-examine  8 reboot   fail REGR. vs. 170714
 test-amd64-amd64-libvirt  8 xen-boot fail REGR. vs. 170714
 test-amd64-amd64-xl-pvhv2-intel  8 xen-boot  fail REGR. vs. 170714
 test-amd64-amd64-libvirt-pair 12 xen-boot/src_host   fail REGR. vs. 170714
 test-amd64-amd64-libvirt-pair 13 xen-boot/dst_host   fail REGR. vs. 170714
 test-amd64-amd64-freebsd12-amd64  8 xen-boot fail REGR. vs. 170714
 test-amd64-amd64-qemuu-nested-intel  8 xen-boot  fail REGR. vs. 170714
 test-arm64-arm64-xl-seattle   8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-xl-credit2   8 xen-boot fail REGR. vs. 170714
 test-amd64-amd64-libvirt-qcow2  8 xen-boot   fail REGR. vs. 170714
 test-amd64-amd64-libvirt-raw  8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-xl   8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-xl-credit1   8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-xl-vhd   8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-libvirt-raw  8 xen-boot fail REGR. vs. 170714
 test-amd64-amd64-xl-pvshim8 xen-boot fail REGR. vs. 170714
 test-arm64-arm64-xl-xsm   8 xen-boot fail REGR. vs. 170714
 test-amd64-amd64-examine-bios  8 reboot  fail REGR. vs. 170714
 test-amd64-amd64-examine-uefi  8 reboot  fail REGR. vs. 170714
 test-amd64-amd64-examine  8 reboot   fail REGR. vs. 170714
 test-arm64-arm64-libvirt-xsm  8 xen-boot fail REGR. vs. 170714

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds 20 guest-localmigrate/x10   fail REGR. vs. 170714

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 170714
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170714
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170714
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 170714
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170714
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170714
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170714
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170714
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass

version targeted for testing:
 linux                6bfb56e93bcef41859c2d5ab234ffd80b691be35
baseline version:
 linux                d6ecaa0024485effd065124fe774de2e22095f2d

Last test of basis   170714  2022-05-24 03:27:44 Z   16 days
Failing since170716  2022-05-24 11:12:06 Z   16 days   43 attempts
Testing same since   170894  2022-06-09 05:18:39 Z0 days1 attempts

-

[PATCH v5 2/5] grub-mkconfig linux_xen: Fix quadratic algorithm for sorting menu items

2022-06-09 Thread Mathieu Desnoyers
The current implementation of the 20_linux_xen script sorts its menu
items in bash with a quadratic algorithm, calling "sed", "sort",
"head", and "grep" to compare versions between individual lines. This
is annoyingly slow for kernel developers, who can easily end up with
50-100 kernels in their boot partition.

This fix is ported from the 10_linux script, which has a similar
quadratic code pattern.

[ Note: this is untested. I would be grateful if anyone with a Xen
  environment could test it before it is merged. ]

Signed-off-by: Mathieu Desnoyers 
Cc: xen-devel@lists.xenproject.org
---
Changes since v4:
- Combine sed -e '...' -e '...' into sed -e '...; ...'
---
 util/grub.d/20_linux_xen.in | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/util/grub.d/20_linux_xen.in b/util/grub.d/20_linux_xen.in
index 51a983926..4382303c1 100644
--- a/util/grub.d/20_linux_xen.in
+++ b/util/grub.d/20_linux_xen.in
@@ -237,11 +237,17 @@ esac
 # yet, so it's empty. In a submenu it will be equal to '\t' (one tab).
 submenu_indentation=""
 
+# Perform a reverse version sort on the entire xen_list and linux_list.
+# Temporarily replace the '.old' suffix by ' 1' and append ' 2' for all
+# other files to order the '.old' files after their non-old counterpart
+# in reverse-sorted order.
+
+reverse_sorted_xen_list=$(echo ${xen_list} | tr ' ' '\n' | sed -e 's/\.old$/ 1/; / 1$/! s/$/ 2/' | version_sort -r | sed -e 's/ 1$/.old/; s/ 2$//')
+reverse_sorted_linux_list=$(echo ${linux_list} | tr ' ' '\n' | sed -e 's/\.old$/ 1/; / 1$/! s/$/ 2/' | version_sort -r | sed -e 's/ 1$/.old/; s/ 2$//')
+
 is_top_level=true
 
-while [ "x${xen_list}" != "x" ] ; do
-list="${linux_list}"
-current_xen=`version_find_latest $xen_list`
+for current_xen in ${reverse_sorted_xen_list}; do
 xen_basename=`basename ${current_xen}`
 xen_dirname=`dirname ${current_xen}`
 rel_xen_dirname=`make_system_path_relative_to_its_root $xen_dirname`
@@ -273,8 +279,7 @@ while [ "x${xen_list}" != "x" ] ; do
fi
 done
 
-while [ "x$list" != "x" ] ; do
-   linux=`version_find_latest $list`
+for linux in ${reverse_sorted_linux_list}; do
gettext_printf "Found linux image: %s\n" "$linux" >&2
basename=`basename $linux`
dirname=`dirname $linux`
@@ -351,13 +356,10 @@ while [ "x${xen_list}" != "x" ] ; do
	linux_entry "${OS}" "${version}" "${xen_version}" recovery \
	    "${GRUB_CMDLINE_LINUX_RECOVERY} ${GRUB_CMDLINE_LINUX}" "${GRUB_CMDLINE_XEN}"
fi
-
-   list=`echo $list | tr ' ' '\n' | fgrep -vx "$linux" | tr '\n' ' '`
 done
 if [ x"$is_top_level" != xtrue ]; then
echo '  }'
 fi
-xen_list=`echo $xen_list | tr ' ' '\n' | fgrep -vx "$current_xen" | tr '\n' ' '`
 done
 
 # If at least one kernel was found, then we need to
-- 
2.30.2




Re: [RFC PATCH 2/2] xen/grant-table: Use unpopulated DMAable pages instead of real RAM ones

2022-06-09 Thread Oleksandr



On 04.06.22 00:19, Stefano Stabellini wrote:


Hello Stefano

Thank you for having a look and sorry for the late response.


On Tue, 17 May 2022, Oleksandr Tyshchenko wrote:

From: Oleksandr Tyshchenko 

Depends on CONFIG_XEN_UNPOPULATED_ALLOC. If enabled, unpopulated
DMAable (contiguous) pages will be allocated for grant mappings
instead of ballooning out real RAM pages.

TODO: Fallback to real RAM pages if xen_alloc_unpopulated_dma_pages()
fails.

Signed-off-by: Oleksandr Tyshchenko 
---
  drivers/xen/grant-table.c | 27 +++
  1 file changed, 27 insertions(+)

diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 8ac..2bb4392 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -864,6 +864,25 @@ EXPORT_SYMBOL_GPL(gnttab_free_pages);
   */
  int gnttab_dma_alloc_pages(struct gnttab_dma_alloc_args *args)
  {
+#ifdef CONFIG_XEN_UNPOPULATED_ALLOC
+   int ret;

This is an alternative implementation of the same function.


Currently, yes.



  If we are
going to use #ifdef, then I would #ifdef the entire function, rather
than just the body. Otherwise within the function body we can use
IS_ENABLED.



Good point. Note, there is one thing still missing in the current
patch, which is described in the TODO.


"Fallback to real RAM pages if xen_alloc_unpopulated_dma_pages() 
fails."  So I will likely use IS_ENABLED within the function body.


If CONFIG_XEN_UNPOPULATED_ALLOC is enabled, then gnttab_dma_alloc_pages() 
will try xen_alloc_unpopulated_dma_pages() first and, if that fails, 
fall back to allocating RAM pages and ballooning them out.


One point is not entirely clear to me. If we use a fallback in 
gnttab_dma_alloc_pages() then we must use a fallback in 
gnttab_dma_free_pages() as well; we cannot use 
xen_free_unpopulated_dma_pages() for real RAM pages. The question is how 
to pass this information to gnttab_dma_free_pages(). The first idea 
which comes to mind is to add a flag to struct gnttab_dma_alloc_args...
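
[Editor's sketch: a runnable toy model of the flag idea mentioned above.
Everything here is invented for illustration: in particular, the
'unpopulated' member is an assumption about how the real
struct gnttab_dma_alloc_args could be extended, and the two backends
stand in for xen_alloc_unpopulated_dma_pages() and the ballooned-RAM
fallback.]

#include <stdbool.h>
#include <stdio.h>

struct alloc_args {
    bool unpopulated; /* hypothetical flag: which backend provided the pages */
};

static int backend_unpopulated(void) { return -1; } /* pretend this path fails */
static int backend_balloon(void)     { return 0;  } /* fallback succeeds */

static int do_alloc(struct alloc_args *args)
{
    int ret = backend_unpopulated();

    args->unpopulated = (ret == 0); /* remember for the free path */
    if (ret)
        ret = backend_balloon();
    return ret;
}

static void do_free(const struct alloc_args *args)
{
    puts(args->unpopulated ? "free via the unpopulated-DMA path"
                           : "free via the balloon path");
}

int main(void)
{
    struct alloc_args args;

    if (!do_alloc(&args))
        do_free(&args); /* prints "free via the balloon path" */
    return 0;
}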






+   ret = xen_alloc_unpopulated_dma_pages(args->dev, args->nr_pages,
+   args->pages);
+   if (ret < 0)
+   return ret;
+
+   ret = gnttab_pages_set_private(args->nr_pages, args->pages);
+   if (ret < 0) {
+   gnttab_dma_free_pages(args);

it should be xen_free_unpopulated_dma_pages()?


Besides calling the xen_free_unpopulated_dma_pages(), we also need to 
call gnttab_pages_clear_private() here, this is what 
gnttab_dma_free_pages() is doing.


I can change it to call both functions instead:

    gnttab_pages_clear_private(args->nr_pages, args->pages);
    xen_free_unpopulated_dma_pages(args->dev, args->nr_pages, args->pages);

Shall I?






+   return ret;
+   }
+
+   args->vaddr = page_to_virt(args->pages[0]);
+   args->dev_bus_addr = page_to_phys(args->pages[0]);

There are two things to note here.

The first thing to note is that normally we would call pfn_to_bfn to
retrieve the dev_bus_addr of a page because pfn_to_bfn takes into
account foreign mappings. However, these are freshly allocated pages
without foreign mappings, so page_to_phys/dma should be sufficient.


agree





The second has to do with physical addresses and DMA addresses. The
functions are called gnttab_dma_alloc_pages and
xen_alloc_unpopulated_dma_pages which make you think we are retrieving a
DMA address here. However, to get a DMA address we need to call
page_to_dma rather than page_to_phys.

page_to_dma takes into account special offsets that some devices have
when accessing memory. There are real cases on ARM where the physical
address != DMA address, e.g. RPi4.


I got it. Now I am in doubt whether it would be better to name the API:

xen_alloc_unpopulated_cma_pages()

or

xen_alloc_unpopulated_contiguous_pages()

What do you think?




However, to call page_to_dma you need to specify as first argument the
DMA-capable device that is expected to use those pages for DMA (e.g. an
ethernet device or a MMC controller.) While the args->dev we have in
gnttab_dma_alloc_pages is the gntdev_miscdev.


agree

As I understand it, at this time it is not known exactly which device 
these pages will ultimately be used with.


For now, it is only known that these pages to be used by userspace PV 
backend for grant mappings.





So this interface cannot actually be used to allocate memory that is
supposed to be DMA-able by a DMA-capable device, such as an ethernet
device.


agree




But I think that should be fine because the memory is meant to be used
by a userspace PV backend for grant mappings. If any of those mappings
end up being used for actual DMA in the kernel they should go through the
drivers/xen/swiotlb-xen.c and xen_phys_to_dma should be called, which
ends up calling page_to_dma as appropriate.

It would be good to double-check that the above is correct and, if so,
maybe add a short in-code comment about it:

/*
  * These are not actually DMA addresses but regular p

[xen-unstable-smoke test] 170900: tolerable all pass - PUSHED

2022-06-09 Thread osstest service owner
flight 170900 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170900/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  c1c9cae3a9633054b177c5de21ad7268162b2f2c
baseline version:
 xen  59fbdf8a3667ce42c1cf70c94c3bcd0451afd4d8

Last test of basis   170899  2022-06-09 09:00:28 Z0 days
Testing same since   170900  2022-06-09 13:01:48 Z0 days1 attempts


People who touched revisions under test:
  Andrew Cooper 
  George Dunlap 

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   59fbdf8a36..c1c9cae3a9  c1c9cae3a9633054b177c5de21ad7268162b2f2c -> smoke



[libvirt test] 170892: regressions - FAIL

2022-06-09 Thread osstest service owner
flight 170892 libvirt real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170892/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 build-armhf-libvirt   6 libvirt-buildfail REGR. vs. 151777
 build-amd64-libvirt   6 libvirt-buildfail REGR. vs. 151777
 build-i386-libvirt6 libvirt-buildfail REGR. vs. 151777
 build-arm64-libvirt   6 libvirt-buildfail REGR. vs. 151777

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt  1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-pair  1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 1 build-check(1) blocked n/a
 test-amd64-amd64-libvirt-vhd  1 build-check(1)   blocked  n/a
 test-amd64-amd64-libvirt-xsm  1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt   1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-pair  1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 1 build-check(1) blocked n/a
 test-amd64-i386-libvirt-raw   1 build-check(1)   blocked  n/a
 test-amd64-i386-libvirt-xsm   1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt  1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt-qcow2  1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt-raw  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt-raw  1 build-check(1)   blocked  n/a
 test-arm64-arm64-libvirt-xsm  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt  1 build-check(1)   blocked  n/a
 test-armhf-armhf-libvirt-qcow2  1 build-check(1)   blocked  n/a

version targeted for testing:
 libvirt  2177de7b6e35499584731e6f4869903aa553022b
baseline version:
 libvirt  2c846fa6bcc11929c9fb857a22430fb9945654ad

Last test of basis   151777  2020-07-10 04:19:19 Z  699 days
Failing since151818  2020-07-11 04:18:52 Z  698 days  680 attempts
Testing same since   170892  2022-06-09 04:20:18 Z0 days1 attempts


People who touched revisions under test:
Adolfo Jayme Barrientos 
  Aleksandr Alekseev 
  Aleksei Zakharov 
  Amneesh Singh 
  Andika Triwidada 
  Andrea Bolognani 
  Andrew Melnychenko 
  Ani Sinha 
  Balázs Meskó 
  Barrett Schonefeld 
  Bastian Germann 
  Bastien Orivel 
  BiaoXiang Ye 
  Bihong Yu 
  Binfeng Wu 
  Bjoern Walk 
  Boris Fiuczynski 
  Brad Laue 
  Brian Turek 
  Bruno Haible 
  Chris Mayo 
  Christian Borntraeger 
  Christian Ehrhardt 
  Christian Kirbach 
  Christian Schoenebeck 
  Christophe Fergeau 
  Claudio Fontana 
  Cole Robinson 
  Collin Walling 
  Cornelia Huck 
  Cédric Bosdonnat 
  Côme Borsoi 
  Daniel Henrique Barboza 
  Daniel Letai 
  Daniel P. Berrange 
  Daniel P. Berrangé 
  Didik Supriadi 
  dinglimin 
  Divya Garg 
  Dmitrii Shcherbakov 
  Dmytro Linkin 
  Eiichi Tsukata 
  Emilio Herrera 
  Eric Farman 
  Erik Skultety 
  Fabian Affolter 
  Fabian Freyer 
  Fabiano Fidêncio 
  Fangge Jin 
  Farhan Ali 
  Fedora Weblate Translation 
  Franck Ridel 
  Gavi Teitz 
  gongwei 
  Guoyi Tu
  Göran Uddeborg 
  Halil Pasic 
  Han Han 
  Hao Wang 
  Haonan Wang 
  Hela Basa 
  Helmut Grohne 
  Hiroki Narukawa 
  Hyman Huang(黄勇) 
  Ian Wienand 
  Ioanna Alifieraki 
  Ivan Teterevkov 
  Jakob Meng 
  Jamie Strandboge 
  Jamie Strandboge 
  Jan Kuparinen 
  jason lee 
  Jean-Baptiste Holcroft 
  Jia Zhou 
  Jianan Gao 
  Jim Fehlig 
  Jin Yan 
  Jing Qi 
  Jinsheng Zhang 
  Jiri Denemark 
  Joachim Falk 
  John Ferlan 
  John Levon 
  John Levon 
  Jonathan Watt 
  Jonathon Jongsma 
  Julio Faracco 
  Justin Gatzen 
  Ján Tomko 
  Kashyap Chamarthy 
  Kevin Locke 
  Kim InSoo 
  Koichi Murase 
  Kristina Hanicova 
  Laine Stump 
  Laszlo Ersek 
  Lee Yarwood 
  Lei Yang 
  Lena Voytek 
  Liang Yan 
  Liang Yan 
  Liao Pingfang 
  Lin Ma 
  Lin Ma 
  Lin Ma 
  Liu Yiding 
  Lubomir Rintel 
  Luke Yue 
  Luyao Zhong 
  luzhipeng 
  Marc Hartmayer 
  Marc-André Lureau 
  Marek Marczykowski-Górecki 
  Markus Schade 
  Martin Kletzander 
  Martin Pitt 
  Masayoshi Mizuma 
  Matej Cepl 
  Matt Coleman 
  Matt Coleman 
  Mauro Matteo Cascella 
  Max Goodhart 
  Maxim Nestratov 
  Meina Li 
  Michal Privoznik 
  Michał Smyk 
  Milo Casagrande 
  Moshe Levi 
  Moteen Shah 
  Moteen Shah 
  Muha Aliss 
  Nathan 
  Neal Gompa 
  Nick Chevsky 
  Nick Shyrokovskiy 
  Nickys Music Group 
  Nico Pache 
  Nicolas Lécureuil 
  Nicolas Lécureuil 
  Nikolay Shirokovskiy 
  Nikolay Shirokovskiy 
  Nikolay Shirokovskiy 
  Niteesh Dubey 
  Olaf Hering 
  Olesya Gerasimenko 
  Or Ozeri 
  Orion Poplawski 
  Pany 
  Paolo Bonzini 
  Patrick Magauran 
  Paulo de Rezende Pinatti 
  Pavel Hrdina 
  Peng Liang 
  Peng Liang 
  Peter Krempa 

Re: [PATCH v2] x86emul/test: encourage compiler to use more embedded broadcast

2022-06-09 Thread Andrew Cooper
On 09/06/2022 16:35, Jan Beulich wrote:
> For one it was an oversight to leave dup_{hi,lo}() undefined for 512-bit
> vector size. And then in FMA testing we can also arrange for the
> compiler to (hopefully) recognize broadcasting potential. Plus we can
> replace the broadcast(1) use in the addsub() surrogate with inline
> assembly explicitly using embedded broadcast (even gcc12 still doesn't
> support broadcast for any of the addsub/subadd builtins).
>
> Signed-off-by: Jan Beulich 

Acked-by: Andrew Cooper 


[PATCH] EFI: strip xen.efi when putting it on the EFI partition

2022-06-09 Thread Jan Beulich
With debug info retained, xen.efi can be quite large. Unlike for xen.gz
there's no intermediate step (mkelf32 there) involved which would strip
debug info kind of as a side effect. While the installing of xen.efi on
the EFI partition is an optional step (intended to be a courtesy to the
developer), adjust it also for the purpose of documenting what distros
would be expected to do during boot loader configuration (which is what
would normally put xen.efi into the EFI partition).

Model the control over stripping after Linux's module installation,
except that the stripped executable is constructed in the build area
instead of in the destination location. This is to conserve on space
used there - EFI partitions tend to be only a few hundred Mb in size.

Signed-off-by: Jan Beulich 
---
GNU strip 2.38 appears to have issues when acting on a PE binary:
- file name symbols are also stripped; while there is a separate
  --keep-file-symbols option (which I would have thought to be on by
  default anyway), its use so far makes no difference,
- the string table grows in size, when one would expect it to retain its
  size (or shrink),
- linker version is changed in and timestamp zapped from the header.
Older GNU strip (observed with 2.35.1) doesn't work at all ("Data
Directory size (1c) exceeds space left in section (8)").

Future GNU strip is going to honor --keep-file-symbols (and will also
have the other issues fixed). Question is whether we should use that
option (for the symbol table as a whole to make sense), or whether
instead we should (by default) strip the symbol table as well.

--- a/xen/Makefile
+++ b/xen/Makefile
@@ -465,6 +465,22 @@ endif
 .PHONY: _build
 _build: $(TARGET)$(CONFIG_XEN_INSTALL_SUFFIX)
 
+# Strip
+#
+# INSTALL_EFI_STRIP, if defined, will cause xen.efi to be stripped before it
+# is installed. If INSTALL_EFI_STRIP is '1', then the default option
+# --strip-debug will be used. Otherwise, INSTALL_EFI_STRIP value will be used
+# as the option(s) to the strip command.
+ifdef INSTALL_EFI_STRIP
+
+ifeq ($(INSTALL_EFI_STRIP),1)
+efi-strip-opt := --strip-debug
+else
+efi-strip-opt := $(INSTALL_EFI_STRIP)
+endif
+
+endif
+
 .PHONY: _install
 _install: D=$(DESTDIR)
 _install: T=$(notdir $(TARGET))
@@ -489,6 +505,9 @@ _install: $(TARGET)$(CONFIG_XEN_INSTALL_
	ln -sf $(T)-$(XEN_FULLVERSION).efi $(D)$(EFI_DIR)/$(T)-$(XEN_VERSION).efi; \
	ln -sf $(T)-$(XEN_FULLVERSION).efi $(D)$(EFI_DIR)/$(T).efi; \
	if [ -n '$(EFI_MOUNTPOINT)' -a -n '$(EFI_VENDOR)' ]; then \
+		$(if $(efi-strip-opt), \
+		     $(STRIP) $(efi-strip-opt) -p -o $(TARGET).efi.stripped $(TARGET).efi && \
+		     $(INSTALL_DATA) $(TARGET).efi.stripped $(D)$(EFI_MOUNTPOINT)/efi/$(EFI_VENDOR)/$(T)-$(XEN_FULLVERSION).efi ||) \
		$(INSTALL_DATA) $(TARGET).efi $(D)$(EFI_MOUNTPOINT)/efi/$(EFI_VENDOR)/$(T)-$(XEN_FULLVERSION).efi; \
	elif [ "$(D)" = "$(patsubst $(shell cd $(XEN_ROOT) && pwd)/%,%,$(D))" ]; then \
		echo 'EFI installation only partially done (EFI_VENDOR not set)' >&2; \



Re: [PATCH] x86/mm: further simplify cleanup_page_mappings()

2022-06-09 Thread Andrew Cooper
On 09/06/2022 16:39, Jan Beulich wrote:
> With the removal of update_xen_mappings() there's no need anymore for a
> 2nd error code variable to transiently hold the IOMMU unmap return
> value.
>
> Signed-off-by: Jan Beulich 

Oh - I'd not even spotted that simplification.

Acked-by: Andrew Cooper 

> ---
> I have to admit that I was tempted to get rid of PAGE_ORDER_4K at this
> occasion, as it feels awkward to me to have such in clearly x86-only
> code.

Happy for that to go too.

~Andrew


[PATCH] x86/mm: further simplify cleanup_page_mappings()

2022-06-09 Thread Jan Beulich
With the removal of update_xen_mappings() there's no need anymore for a
2nd error code variable to transiently hold the IOMMU unmap return
value.

Signed-off-by: Jan Beulich 
---
I have to admit that I was tempted to get rid of PAGE_ORDER_4K at this
occasion, as it feels awkward to me to have such in clearly x86-only
code.

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2470,12 +2470,7 @@ static int cleanup_page_mappings(struct
 struct domain *d = page_get_owner(page);
 
 if ( d && unlikely(need_iommu_pt_sync(d)) && is_pv_domain(d) )
-{
-int rc2 = iommu_legacy_unmap(d, _dfn(mfn), 1u << PAGE_ORDER_4K);
-
-if ( !rc )
-rc = rc2;
-}
+rc = iommu_legacy_unmap(d, _dfn(mfn), 1u << PAGE_ORDER_4K);
 
 if ( likely(!is_special_page(page)) )
 {



[PATCH v2] x86emul/test: encourage compiler to use more embedded broadcast

2022-06-09 Thread Jan Beulich
For one it was an oversight to leave dup_{hi,lo}() undefined for 512-bit
vector size. And then in FMA testing we can also arrange for the
compiler to (hopefully) recognize broadcasting potential. Plus we can
replace the broadcast(1) use in the addsub() surrogate with inline
assembly explicitly using embedded broadcast (even gcc12 still doesn't
support broadcast for any of the addsub/subadd builtins).

Signed-off-by: Jan Beulich 
---
v2: Also alter addsub() surrogate.

--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -912,6 +912,13 @@ static inline vec_t movlhps(vec_t x, vec
 })
 #  endif
 # endif
+#elif VEC_SIZE == 64
+# if FLOAT_SIZE == 4
+#  define dup_hi(x) B(movshdup, _mask, x, undef(), ~0)
+#  define dup_lo(x) B(movsldup, _mask, x, undef(), ~0)
+# elif FLOAT_SIZE == 8
+#  define dup_lo(x) B(movddup, _mask, x, undef(), ~0)
+# endif
 #endif
 #if VEC_SIZE == 16 && defined(__SSSE3__) && !defined(__AVX512VL__)
 # if INT_SIZE == 1
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -49,8 +49,10 @@ float
 # define ELEM_SIZE FLOAT_SIZE
 # if FLOAT_SIZE == 4
 #  define MODE SF
+#  define ELEM_SFX "s"
 # elif FLOAT_SIZE == 8
 #  define MODE DF
+#  define ELEM_SFX "d"
 # endif
 #endif
 #ifndef VEC_SIZE
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -56,13 +56,27 @@ ENTRY(fma_test);
 #endif
 
 #if defined(fmaddsub) && !defined(addsub)
-# define addsub(x, y) fmaddsub(x, broadcast(1), y)
+# ifdef __AVX512F__
+#  define addsub(x, y) ({ \
+vec_t t_; \
+typeof(t_[0]) one_ = 1; \
+asm ( "vfmaddsub231p" ELEM_SFX " %2%{1to%c4%}, %1, %0" \
+  : "=v" (t_) \
+  : "v" (x), "m" (one_), "0" (y), "i" (ELEM_COUNT) ); \
+t_; \
+})
+# else
+#  define addsub(x, y) fmaddsub(x, broadcast(1), y)
+# endif
 #endif
 
 int fma_test(void)
 {
 unsigned int i;
 vec_t x, y, z, src, inv, one;
+#ifdef __AVX512F__
+typeof(one[0]) one_ = 1;
+#endif
 
 for ( i = 0; i < ELEM_COUNT; ++i )
 {
@@ -71,6 +85,10 @@ int fma_test(void)
 one[i] = 1;
 }
 
+#ifdef __AVX512F__
+# define one one_
+#endif
+
 x = (src + one) * inv;
 y = (src - one) * inv;
 touch(src);
@@ -93,22 +111,28 @@ int fma_test(void)
 x = src + inv;
 y = src - inv;
 touch(inv);
+touch(one);
 z = src * one + inv;
 if ( !eq(x, z) ) return __LINE__;
 
 touch(inv);
+touch(one);
 z = -src * one - inv;
 if ( !eq(-x, z) ) return __LINE__;
 
 touch(inv);
+touch(one);
 z = src * one - inv;
 if ( !eq(y, z) ) return __LINE__;
 
 touch(inv);
+touch(one);
 z = -src * one + inv;
 if ( !eq(-y, z) ) return __LINE__;
 touch(inv);
 
+#undef one
+
 #if defined(addsub) && defined(fmaddsub)
 x = addsub(src * inv, one);
 y = addsub(src * inv, -one);
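
[Editor's aside: a minimal illustration of what "embedded broadcast"
means, not taken from the patch. Built with e.g. gcc -O2 -mavx512f, a
compiler may fold the scalar load below into the add as a {1to16}
memory operand instead of emitting a separate vbroadcastss; running it
requires an AVX512F-capable CPU.]

#include <immintrin.h>
#include <stdio.h>

/* Add the scalar *s to every lane of a 512-bit float vector. */
static __m512 add_scalar(__m512 x, const float *s)
{
    return _mm512_add_ps(x, _mm512_set1_ps(*s));
}

int main(void)
{
    float one = 1.0f, out[16];

    _mm512_storeu_ps(out, add_scalar(_mm512_set1_ps(2.0f), &one));
    printf("%f\n", out[0]); /* prints 3.000000 */
    return 0;
}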



[PATCH] MAINTAINERS: drop XSM maintainer

2022-06-09 Thread Jan Beulich
While mail hasn't been bouncing, Daniel has not been responding to patch
submissions or otherwise interacting with the community for several
years. Move maintainership to THE REST in kind of an unusual way, with
the goal to avoid
- orphaning the component,
- repeating all THE REST members here,
- removing the entry altogether.

Signed-off-by: Jan Beulich 
---
We hope this to be transient, with a new maintainer to be established
sooner rather than later.

I realize the way I'm expressing this may upset scripts/*_maintainer*.pl,
so I'd welcome any better alternative suggestion.

--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -648,7 +648,7 @@ F:  xen/common/trace.c
 F: xen/include/xen/trace.h
 
 XSM/FLASK
-M: Daniel De Graaf 
+M: THE REST (see below)
 R: Daniel P. Smith 
 S: Supported
 F: tools/flask/



Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Bertrand Marquis
Hi Anthony,

> On 9 Jun 2022, at 13:55, Anthony PERARD  wrote:
> 
> On Thu, Jun 09, 2022 at 11:51:20AM +, Bertrand Marquis wrote:
>> Hi,
>> 
>>> On 9 Jun 2022, at 11:26, Jan Beulich  wrote:
>>> 
>>> On 09.06.2022 12:16, Bertrand Marquis wrote:
 With this change, compiling for x86 is now ending up in:
 CHK include/headers99.chk
 make[9]: execvp: /bin/sh: Argument list too long
 make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
 
 Not quite sure yet why but I wanted to signal it early as other might be 
 impacted.
 
 Arm and arm64 builds are not impacted.
>>> 
>>> Hmm, that patch has passed the smoke push gate already, so there likely is
>>> more to it than there being an unconditional issue. I did build-test this
>>> before pushing, and I've just re-tested on a 2nd system without seeing an
>>> issue.
>> 
>> I have the problem only when building using Yocto, I did a normal build and 
>> the
>> issue is not coming.
>> 
> 
> Will the following patch help?

Yes it does, thanks a lot.

You can add my:
Reviewed-by: Bertrand Marquis 

Cheers
Bertrand

> 
> 
> From 0f32f749304b233c0d5574dc6b14f66e8709feba Mon Sep 17 00:00:00 2001
> From: Anthony PERARD 
> Date: Thu, 9 Jun 2022 13:42:52 +0100
> Subject: [XEN PATCH] build,include: rework shell script for headers++.chk
> 
> The command line generated for headers++.chk by make is quite long,
> and in some environments it is too long. This issue has been seen in
> a Yocto build environment.
> 
> Error messages:
>make[9]: execvp: /bin/sh: Argument list too long
>make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
> 
> Rework so that we do the foreach loop in shell rather than make, to
> reduce the command line size by a lot. We also need a way to get
> header prerequisites for some public headers, so we use a switch "case"
> in shell to be able to do some simple pattern matching. Variables
> alone in POSIX shell don't allow working with associative arrays or
> variables with "/" in their names.
> 
> Reported-by: Bertrand Marquis 
> Fixes: 28e13c7f43 ("build: xen/include: use if_changed")
> Signed-off-by: Anthony PERARD 
> ---
> xen/include/Makefile | 17 +
> 1 file changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/include/Makefile b/xen/include/Makefile
> index 6d9bcc19b0..ca5e868f38 100644
> --- a/xen/include/Makefile
> +++ b/xen/include/Makefile
> @@ -158,13 +158,22 @@ define cmd_headerscxx_chk
>   touch $@.new; \
>   exit 0;   \
>   fi;   \
> - $(foreach i, $(filter %.h,$^),\
> - echo "#include "\"$(i)\"  \
> + get_prereq() {\
> + case $$1 in   \
> + $(foreach i, $(filter %.h,$^),\
> + $(if $($(patsubst $(srctree)/%,%,$i)-prereq), \
> + $(patsubst $(srctree)/%,%,$i)$(close) \
> + echo "$(foreach j, $($(patsubst $(srctree)/%,%,$i)-prereq),   \
> + -include c$(j))";;))  \
> + esac; \
> + };\
> + for i in $(filter %.h,$^); do \
> + echo "#include "\"$$i\"   \
>   | $(CXX) -x c++ -std=gnu++98 -Wall -Werror -D__XEN_TOOLS__\
> -include stdint.h -include $(srcdir)/public/xen.h   \
> -   $(foreach j, $($(patsubst $(srctree)/%,%,$i)-prereq), -include c$(j)) \
> +   `get_prereq $$i`\
> -S -o /dev/null -   \
> - || exit $$?; echo $(i) >> $@.new;) \
> + || exit $$?; echo $$i >> $@.new; done;\
>   mv $@.new $@
> endef
> 
> 
> 
> 
> -- 
> Anthony PERARD




[PULL 3/3] include/hw/ide: Unexport pci_piix3_xen_ide_unplug()

2022-06-09 Thread Anthony PERARD
From: Bernhard Beschow 

This function was declared in a generic and public header, implemented
in a device-specific source file but only used in xen_platform. Given its
'aux' parameter, this function is more xen-specific than piix-specific.
Also, the hardcoded magic constants seem to be generic and related to
PCIIDEState and IDEBus rather than piix.

Therefore, move this function to xen_platform, unexport it, and drop the
"piix3" in the function name as well.

Signed-off-by: Bernhard Beschow 
Reviewed-by: Paul Durrant 
Acked-by: Anthony PERARD 
Reviewed-by: Philippe Mathieu-Daudé 
Message-Id: <20220513180957.90514-4-shen...@gmail.com>
Signed-off-by: Anthony PERARD 
---
 hw/i386/xen/xen_platform.c | 48 +-
 hw/ide/piix.c  | 46 
 include/hw/ide.h   |  3 ---
 3 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 72028449ba..a64265cca0 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -26,6 +26,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "hw/ide.h"
+#include "hw/ide/pci.h"
 #include "hw/pci/pci.h"
 #include "hw/xen/xen_common.h"
 #include "migration/vmstate.h"
@@ -134,6 +135,51 @@ static void pci_unplug_nics(PCIBus *bus)
 pci_for_each_device(bus, 0, unplug_nic, NULL);
 }
 
+/*
+ * The Xen HVM unplug protocol [1] specifies a mechanism to allow guests to
+ * request unplug of 'aux' disks (which is stated to mean all IDE disks,
+ * except the primary master).
+ *
+ * NOTE: The semantics of what happens if unplug of all disks and 'aux' disks
+ *   is simultaneously requested is not clear. The implementation assumes
+ *   that an 'all' request overrides an 'aux' request.
+ *
+ * [1] 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/misc/hvm-emulated-unplug.pandoc
+ */
+static void pci_xen_ide_unplug(DeviceState *dev, bool aux)
+{
+PCIIDEState *pci_ide;
+int i;
+IDEDevice *idedev;
+IDEBus *idebus;
+BlockBackend *blk;
+
+pci_ide = PCI_IDE(dev);
+
+for (i = aux ? 1 : 0; i < 4; i++) {
+idebus = &pci_ide->bus[i / 2];
+blk = idebus->ifs[i % 2].blk;
+
+if (blk && idebus->ifs[i % 2].drive_kind != IDE_CD) {
+if (!(i % 2)) {
+idedev = idebus->master;
+} else {
+idedev = idebus->slave;
+}
+
+blk_drain(blk);
+blk_flush(blk);
+
+blk_detach_dev(blk, DEVICE(idedev));
+idebus->ifs[i % 2].blk = NULL;
+idedev->conf.blk = NULL;
+monitor_remove_blk(blk);
+blk_unref(blk);
+}
+}
+qdev_reset_all(dev);
+}
+
 static void unplug_disks(PCIBus *b, PCIDevice *d, void *opaque)
 {
 uint32_t flags = *(uint32_t *)opaque;
@@ -147,7 +193,7 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void 
*opaque)
 
 switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) {
 case PCI_CLASS_STORAGE_IDE:
-pci_piix3_xen_ide_unplug(DEVICE(d), aux);
+pci_xen_ide_unplug(DEVICE(d), aux);
 break;
 
 case PCI_CLASS_STORAGE_SCSI:
diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index bc1b37512a..9a9b28078e 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -173,52 +173,6 @@ static void pci_piix_ide_realize(PCIDevice *dev, Error 
**errp)
 }
 }
 
-/*
- * The Xen HVM unplug protocol [1] specifies a mechanism to allow guests to
- * request unplug of 'aux' disks (which is stated to mean all IDE disks,
- * except the primary master).
- *
- * NOTE: The semantics of what happens if unplug of all disks and 'aux' disks
- *   is simultaneously requested is not clear. The implementation assumes
- *   that an 'all' request overrides an 'aux' request.
- *
- * [1] 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/misc/hvm-emulated-unplug.pandoc
- */
-int pci_piix3_xen_ide_unplug(DeviceState *dev, bool aux)
-{
-PCIIDEState *pci_ide;
-int i;
-IDEDevice *idedev;
-IDEBus *idebus;
-BlockBackend *blk;
-
-pci_ide = PCI_IDE(dev);
-
-for (i = aux ? 1 : 0; i < 4; i++) {
-idebus = &pci_ide->bus[i / 2];
-blk = idebus->ifs[i % 2].blk;
-
-if (blk && idebus->ifs[i % 2].drive_kind != IDE_CD) {
-if (!(i % 2)) {
-idedev = idebus->master;
-} else {
-idedev = idebus->slave;
-}
-
-blk_drain(blk);
-blk_flush(blk);
-
-blk_detach_dev(blk, DEVICE(idedev));
-idebus->ifs[i % 2].blk = NULL;
-idedev->conf.blk = NULL;
-monitor_remove_blk(blk);
-blk_unref(blk);
-}
-}
-qdev_reset_all(dev);
-return 0;
-}
-
 static void pci_piix_ide_exitfn(PCIDevice *dev)
 {
 PCIIDEState *d = PCI_IDE(dev);
diff --git a/include/hw/ide.h b/include/hw/ide.h
index c5ce5da4f4..60f1f4f714 100644
--- a/include/hw/ide.h
+++ b/
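
[Editor's note: a standalone model of the unit-number mapping used by
pci_xen_ide_unplug() above: i / 2 selects the IDE bus and i % 2 the
master/slave position, so starting at i = 1 for an 'aux' request skips
the primary master. The strings are illustrative only.]

#include <stdio.h>

int main(void)
{
    static const char *const bus[] = { "primary", "secondary" };
    static const char *const pos[] = { "master", "slave" };
    int aux = 1; /* 'aux': unplug all IDE disks except the primary master */
    int i;

    for (i = aux ? 1 : 0; i < 4; i++)
        printf("unplug %s %s\n", bus[i / 2], pos[i % 2]);
    return 0;
}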

[qemu-mainline test] 170891: regressions - trouble: broken/fail/pass

2022-06-09 Thread osstest service owner
flight 170891 qemu-mainline real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170891/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-xl-rtds broken
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 16 guest-saverestore.2 fail REGR. vs. 170884
 test-amd64-amd64-libvirt-vhd 19 guest-start/debian.repeat fail REGR. vs. 170884

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds  5 host-install(5)broken REGR. vs. 170884

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170884
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170884
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170884
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 170884
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170884
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170884
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 170884
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170884
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check fail never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check fail never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass

version targeted for t

Re: [PATCH v3 2/3] ui: Deliver refresh rate via QemuUIInfo

2022-06-09 Thread Gerd Hoffmann
  Hi,

> > > > (1) add refresh_rate
> > > > (2) update users one by one
> > > > (3) finally drop update_interval when no user is left.
> > > > 
> > > > thanks,
> > > > Gerd
> > > > 
> > > 
> > > I think 1 and 3 should have to be done once since refresh_rate and
> > > update_interval would interfere with each other otherwise.
> > 
> > Well, between 1 and 3 both old and new API are active.  Shouldn't be
> > much of a problem because the GraphicHwOps implementations are using
> > only the one or the other.
> > 
> > take care,
> >Gerd
> > 
> 
> The only GraphicHwOps implementation updated with this change is xenfb.
> xenfb can be switched to use refresh_rate in step 1 or 3.
> 
> Switching to use refresh_rate in step 1 would break the refresh rate
> propagation until all host displays are updated to set refresh_rate instead
> of calling update_interval.

Well, the host display update would need splitting into two pieces too:
first add refresh_rate, then later drop update_interval, to make the
update scheme work without temporary breakage.

That sounds increasingly like over-engineering it, though, so I guess I'll
just queue up the patches as-is.

thanks,
  Gerd




Re: [PATCH 2/2] xen/heap: pass order to free_heap_pages() in heap init

2022-06-09 Thread Jan Beulich
On 09.06.2022 10:30, Julien Grall wrote:
> From: Hongyan Xia 
> 
> The idea is to split the range into multiple aligned power-of-2 regions,
> each of which needs only a single call to free_heap_pages(). We check the
> least significant set bit of the start address and use its bit index as
> the order of this increment. This makes sure that each increment is both
> power-of-2 sized and properly aligned, so it can be safely passed to
> free_heap_pages(). Of course, the order also needs to be sanity-checked
> against the upper bound and MAX_ORDER.
> 
> Testing in a nested environment on c5.metal with various amounts
> of RAM. Time for end_boot_allocator() to complete:
>           Before    After
> - 90GB:  1426 ms   166 ms
> -  8GB:   124 ms    12 ms
> -  4GB:    60 ms     6 ms
> 
> Signed-off-by: Hongyan Xia 
> Signed-off-by: Julien Grall 
> ---
>  xen/common/page_alloc.c | 39 +--
>  1 file changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index a1938df1406c..bf852cfc11ea 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1779,16 +1779,28 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>  
>  /*
>   * init_contig_heap_pages() is intended to only take pages from the same
> - * NUMA node.
> + * NUMA node and zone.
> + *
> + * For the latter, it is always true for !CONFIG_SEPARATE_XENHEAP since
> + * free_heap_pages() can only take power-of-two ranges which never cross
> + * zone boundaries. But for separate xenheap which is manually defined,
> + * it is possible for a power-of-two range to cross zones, so we need to
> + * check that as well.
>   */
> -static bool is_contig_page(struct page_info *pg, unsigned int nid)
> +static bool is_contig_page(struct page_info *pg, unsigned int nid,
> +   unsigned int zone)
>  {
> +#ifdef CONFIG_SEPARATE_XENHEAP
> +if ( zone != page_to_zone(pg) )
> +return false;
> +#endif
> +
> +return (nid == (phys_to_nid(page_to_maddr(pg))));
>  }
>  
>  /*
>   * This function should only be called with valid pages from the same NUMA
> - * node.
> + * node and the same zone.
>   *
>   * Callers should use is_contig_page() first to check if all the pages
>   * in a range are contiguous.
> @@ -1817,8 +1829,22 @@ static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
>  
>  while ( s < e )
>  {
> -free_heap_pages(mfn_to_page(_mfn(s)), 0, need_scrub);
> -s += 1UL;
> +/*
> + * For s == 0, we simply use the largest increment by checking the
> + * index of the MSB set. For s != 0, we also need to ensure that the

"The MSB" reads as if it were not in line with the code; at least I would,
short of it being spelled out, assume it can only be the page's address
which is meant (as is the case for LSB below). But it's the MSB of the
range's size.

> + * chunk is properly sized to end at power-of-two alignment. We do this
> + * by checking the LSB set and use its index as the increment. Both
> + * cases need to be guarded by MAX_ORDER.
> + *
> + * Note that the value of ffsl() and flsl() starts from 1 so we need
> + * to decrement it by 1.
> + */
> +int inc_order = min(MAX_ORDER, flsl(e - s) - 1);
> +
> +if ( s )
> +inc_order = min(inc_order, ffsl(s) - 1);
> +free_heap_pages(mfn_to_page(_mfn(s)), inc_order, need_scrub);
> +s += (1UL << inc_order);
>  }
>  }
>  
> @@ -1856,12 +1882,13 @@ static void init_heap_pages(
>  for ( i = 0; i < nr_pages; )
>  {
>  unsigned int nid = phys_to_nid(page_to_maddr(pg));
> +unsigned int zone = page_to_zone(pg);
>  unsigned long left = nr_pages - i;
>  unsigned long contig_pages;
>  
>  for ( contig_pages = 1; contig_pages < left; contig_pages++ )
>  {
> -if ( !is_contig_page(pg + contig_pages, nid) )
> +if ( !is_contig_page(pg + contig_pages, nid, zone) )
>  break;
>  }

Coming back to your reply to my comment on patch 1: I think this
addition of the node check is actually an argument for inlining the
function's body here (and then dropping the function). That way the
separate-Xen-heap aspect is visible at the place where it matters,
rather than requiring an indirection via looking at the helper
function (and leaving a dead parameter in the opposite case). But as
said - I'm not going to insist as long as the helper function has a
suitable name (representing what it does and not misguiding anyone
with the common "contig"-means-addresses implication).

Jan
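
[Editor's sketch: a self-contained model of the chunking loop discussed
above. free_heap_pages() is replaced by a printf, __builtin_clzl() and
__builtin_ctzl() stand in for Xen's flsl()/ffsl(), the MAX_ORDER value
is an assumption, and a 64-bit unsigned long is assumed.]

#include <stdio.h>

#define MAX_ORDER 18 /* assumed value for the sketch */

static void init_range(unsigned long s, unsigned long e)
{
    while (s < e)
    {
        /* Largest order that still fits in the remaining size (MSB index). */
        unsigned int order = 63 - __builtin_clzl(e - s);

        if (order > MAX_ORDER)
            order = MAX_ORDER;

        /* For s != 0, the chunk must also respect s's alignment (LSB index). */
        if (s && __builtin_ctzl(s) < order)
            order = __builtin_ctzl(s);

        printf("free_heap_pages(mfn %#lx, order %u)\n", s, order);
        s += 1UL << order;
    }
}

int main(void)
{
    init_range(0x1003, 0x20000); /* arbitrary example range */
    return 0;
}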



Re: [PATCH 1/2] xen/heap: Split init_heap_pages() in two

2022-06-09 Thread Julien Grall

Hi,

On 09/06/2022 14:12, Jan Beulich wrote:

On 09.06.2022 14:33, Julien Grall wrote:

On 09/06/2022 13:09, Jan Beulich wrote:

On 09.06.2022 10:30, Julien Grall wrote:

From: Julien Grall 

At the moment, init_heap_pages() will call free_heap_pages() page
by page. To reduce the time to initialize the heap, we will want
to provide multiple pages at the same time.

init_heap_pages() is now split in two parts:
 - init_heap_pages(): will break down the range into multiple sets
   of contiguous pages. For now, the criterion is that the pages should
   belong to the same NUMA node.
 - init_contig_pages(): will initialize a set of contiguous pages.
   For now the pages are still passed one by one to free_heap_pages().


Hmm, the common use of "contiguous" is to describe address correlation.
Therefore I'm afraid I'd like to see "contig" avoided when you mean
"same node". Perhaps init_node_pages()?


After the next patch, it will not only be the same node, it will also be
the same zone at least. Also, in the future, I would like to
re-submit David Woodhouse's patch to exclude broken pages (see [1]).

Therefore, I think the name init_node_pages() would not be suitable.
Please suggest a different name.


_init_heap_pages() then, as a helper of init_heap_pages()?


I am fine with your proposed name. That said...




--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1778,16 +1778,55 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
   }
   
   /*

- * Hand the specified arbitrary page range to the specified heap zone
- * checking the node_id of the previous page.  If they differ and the
- * latter is not on a MAX_ORDER boundary, then we reserve the page by
- * not freeing it to the buddy allocator.
+ * init_contig_heap_pages() is intended to only take pages from the same
+ * NUMA node.
*/
+static bool is_contig_page(struct page_info *pg, unsigned int nid)
+{
+return (nid == (phys_to_nid(page_to_maddr(pg))));
+}


If such a helper is indeed needed, then I think it absolutely wants
pg to be pointer-to-const. And imo it would also help readability if
the extra pair of parentheses around the nested function calls was
omitted. Given the naming concern, though, I wonder whether this
wouldn't better be open-coded in the one place it is used. Actually
naming is quite odd here beyond what I'd said further up: "Is this
page contiguous?" Such a question requires two pages, i.e. "Are these
two pages contiguous?" What you want to know is "Is this page on the
given node?"


There will be more checks in the future (see the next patch). I created a 
helper because it reduces the size of the loop in init_heap_pages(). I 
would be OK to fold it in if you strongly prefer that.


I don't "strongly" prefer that; I'd also be okay with a suitably named
helper. Just that I can't seem to come up with any good name.


... I am not sure what could be a suitable name for this helper. I will 
have a look at how bad the folded version looks.





+/*
+ * This function should only be called with valid pages from the same NUMA
+ * node.
+ *
+ * Callers should use is_contig_page() first to check if all the pages
+ * in a range are contiguous.
+ */
+static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
+   bool need_scrub)
+{
+unsigned long s, e;
+unsigned int nid = phys_to_nid(page_to_maddr(pg));
+
+s = mfn_x(page_to_mfn(pg));
+e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
+if ( unlikely(!avail[nid]) )
+{
+bool use_tail = !(s & ((1UL << MAX_ORDER) - 1)) &&


IS_ALIGNED(s, 1UL << MAX_ORDER) to "describe" what's meant?


This is existing code and it is quite complex. So I would prefer that we
avoid simplifying and moving the code in the same patch. I would be happy
to write a follow-up patch to switch to IS_ALIGNED().


I do realize it's code you move, but I can accept your desire to merely
move the code without any cleanup. Personally I think that rather than a
follow-up patch (which doesn't help the reviewing of this one) such an
adjustment would better be a prereq one.


I will look for a prereq.
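
[Editor's aside: the open-coded mask test and the suggested IS_ALIGNED()
form are equivalent for power-of-two alignments; a tiny standalone
check, with the macro inlined here purely for illustration.]

#include <assert.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

int main(void)
{
    unsigned long s;

    for (s = 0; s < (1UL << 10); s++)
        assert(!(s & ((1UL << 8) - 1)) == IS_ALIGNED(s, 1UL << 8));
    return 0;
}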




@@ -1812,35 +1851,24 @@ static void init_heap_pages(
   spin_unlock(&heap_lock);
   
   if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )

-idle_scrub = true;
+need_scrub = true;
   
-for ( i = 0; i < nr_pages; i++ )

+for ( i = 0; i < nr_pages; )
   {
-unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
+unsigned int nid = phys_to_nid(page_to_maddr(pg));
+unsigned long left = nr_pages - i;
+unsigned long contig_pages;
   
-if ( unlikely(!avail[nid]) )

+for ( contig_pages = 1; contig_pages < left; contig_pages++ )
   {
-unsigned long s = mfn_x(page_to_mfn(pg + i));
-unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 
1));
-bool use_tail = (nid == ph

Re: [PATCH 1/2] xen/heap: Split init_heap_pages() in two

2022-06-09 Thread Jan Beulich
On 09.06.2022 14:33, Julien Grall wrote:
> On 09/06/2022 13:09, Jan Beulich wrote:
>> On 09.06.2022 10:30, Julien Grall wrote:
>>> From: Julien Grall 
>>>
>>> At the moment, init_heap_pages() will call free_heap_pages() page
>>> by page. To reduce the time to initialize the heap, we will want
>>> to provide multiple pages at the same time.
>>>
>>> init_heap_pages() is now split in two parts:
>>>  - init_heap_pages(): will break down the range into multiple sets
>>>    of contiguous pages. For now, the criterion is that the pages should
>>>    belong to the same NUMA node.
>>>  - init_contig_pages(): will initialize a set of contiguous pages.
>>>    For now the pages are still passed one by one to free_heap_pages().
>>
>> Hmm, the common use of "contiguous" is to describe address correlation.
>> Therefore I'm afraid I'd like to see "contig" avoided when you mean
>> "same node". Perhaps init_node_pages()?
> 
> After the next patch, it will not only be the same node, it will also be 
> the same zone at least. Also, in the future, I would like to 
> re-submit David Woodhouse's patch to exclude broken pages (see [1]).
> 
> Therefore, I think the name init_node_pages() would not be suitable. 
> Please suggest a different name.

_init_heap_pages() then, as a helper of init_heap_pages()?

>>> --- a/xen/common/page_alloc.c
>>> +++ b/xen/common/page_alloc.c
>>> @@ -1778,16 +1778,55 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>>>   }
>>>   
>>>   /*
>>> - * Hand the specified arbitrary page range to the specified heap zone
>>> - * checking the node_id of the previous page.  If they differ and the
>>> - * latter is not on a MAX_ORDER boundary, then we reserve the page by
>>> - * not freeing it to the buddy allocator.
>>> + * init_contig_heap_pages() is intended to only take pages from the same
>>> + * NUMA node.
>>>*/
>>> +static bool is_contig_page(struct page_info *pg, unsigned int nid)
>>> +{
>>> +return (nid == (phys_to_nid(page_to_maddr(pg))));
>>> +}
>>
>> If such a helper is indeed needed, then I think it absolutely wants
>> pg to be pointer-to-const. And imo it would also help readability if
>> the extra pair of parentheses around the nested function calls was
>> omitted. Given the naming concern, though, I wonder whether this
>> wouldn't better be open-coded in the one place it is used. Actually
>> naming is quite odd here beyond what I'd said further up: "Is this
>> page contiguous?" Such a question requires two pages, i.e. "Are these
>> two pages contiguous?" What you want to know is "Is this page on the
>> given node?"
> 
> There will be more checks in the future (see the next patch). I created a 
> helper because it reduces the size of the loop in init_heap_pages(). I 
> would be OK to fold it in if you strongly prefer that.

I don't "strongly" prefer that; I'd also be okay with a suitably named
helper. Just that I can't seem to come up with any good name.

>>> +/*
>>> + * This function should only be called with valid pages from the same NUMA
>>> + * node.
>>> + *
>>> + * Callers should use is_contig_page() first to check if all the pages
>>> + * in a range are contiguous.
>>> + */
>>> +static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
>>> +   bool need_scrub)
>>> +{
>>> +unsigned long s, e;
>>> +unsigned int nid = phys_to_nid(page_to_maddr(pg));
>>> +
>>> +s = mfn_x(page_to_mfn(pg));
>>> +e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
>>> +if ( unlikely(!avail[nid]) )
>>> +{
>>> +bool use_tail = !(s & ((1UL << MAX_ORDER) - 1)) &&
>>
>> IS_ALIGNED(s, 1UL << MAX_ORDER) to "describe" what's meant?
> 
> This is existing code and it is quite complex. So I would prefer that we 
> avoid simplifying and moving the code in the same patch. I would be happy 
> to write a follow-up patch to switch to IS_ALIGNED().

I do realize it's code you move, but I can accept your desire to merely
move the code without any cleanup. Personally I think that rather than a
follow-up patch (which doesn't help the reviewing of this one) such an
adjustment would better be a prereq one.

>>> @@ -1812,35 +1851,24 @@ static void init_heap_pages(
>>>   spin_unlock(&heap_lock);
>>>   
>>>   if ( system_state < SYS_STATE_active && opt_bootscrub == 
>>> BOOTSCRUB_IDLE )
>>> -idle_scrub = true;
>>> +need_scrub = true;
>>>   
>>> -for ( i = 0; i < nr_pages; i++ )
>>> +for ( i = 0; i < nr_pages; )
>>>   {
>>> -unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
>>> +unsigned int nid = phys_to_nid(page_to_maddr(pg));
>>> +unsigned long left = nr_pages - i;
>>> +unsigned long contig_pages;
>>>   
>>> -if ( unlikely(!avail[nid]) )
>>> +for ( contig_pages = 1; contig_pages < left; contig_pages++ )
>>>   {
>>> -unsigned long s = mfn_x(page_to_mfn(pg + i));
>>> -unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));

Re: [PATCH 24/36] printk: Remove trace_.*_rcuidle() usage

2022-06-09 Thread Petr Mladek
On Thu 2022-06-09 12:02:04, Peter Zijlstra wrote:
> On Thu, Jun 09, 2022 at 11:16:46AM +0200, Petr Mladek wrote:
> > On Wed 2022-06-08 16:27:47, Peter Zijlstra wrote:
> > > The problem, per commit fc98c3c8c9dc ("printk: use rcuidle console
> > > tracepoint"), was printk usage from the cpuidle path where RCU was
> > > already disabled.
> > > 
> > Does this "prevent" calling printk() in a safe way in code with
> > RCU disabled?
> 
> On x86_64, yes. Other architectures, less so.
> 
> Specifically, the objtool noinstr validation pass will warn at build
> time (DEBUG_ENTRY=y) if any noinstr/cpuidle code does a call to
> non-vetted code like printk().
> 
> At the same time; there's a few hacks that allow WARN to work, but
> mostly if you hit WARN in entry/noinstr you get to keep the pieces in
> any case.
> 
> On other architecture we'll need to rely on runtime coverage with
> PROVE_RCU. That is, if a splat like in the above mentioned commit
> happens again, we'll need to fix it by adjusting the callchain, not by
> mucking about with RCU state.

Makes sense. Feel free to use for this patch:

Acked-by: Petr Mladek 

> > Therefore if this patch allows removing some tricky tracing
> > code then it might be worth it. But if trace_console_rcuidle()
> > variant is still going to be available then I would keep using it.
> 
> My ultimate goal is to delete trace_.*_rcuidle() and RCU_NONIDLE()
> entirely. We're close, but not quite there yet.

I keep my fingers crossed.

Best Regards,
Petr



Re: [PATCH 24/36] printk: Remove trace_.*_rcuidle() usage

2022-06-09 Thread Petr Mladek
On Thu 2022-06-09 20:30:58, Sergey Senozhatsky wrote:
> My emails are getting rejected... Let me try web-interface

Bad day for mail sending. I have problems as well ;-)

> Kudos to Petr for the questions and thanks to PeterZ for the answers.
> 
> On Thu, Jun 9, 2022 at 7:02 PM Peter Zijlstra  wrote:
> > This is the tracepoint used to spool all of printk into ftrace, I
> > suspect there's users, but I haven't used it myself.
> 
> I'm somewhat curious whether we can actually remove that trace event.

Good question.

Well, I think that it might be useful. It allows one to see trace and
printk messages together.

It was ugly when it was in the console code. The new location
in vprintk_store() even allows it to be "correctly" sorted
(by timestamp) against other tracing messages.

Best Regards,
Petr



Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Anthony PERARD
On Thu, Jun 09, 2022 at 11:51:20AM +, Bertrand Marquis wrote:
> Hi,
> 
> > On 9 Jun 2022, at 11:26, Jan Beulich  wrote:
> > 
> > On 09.06.2022 12:16, Bertrand Marquis wrote:
> >> With this change, compiling for x86 is now ending up in:
> >> CHK include/headers99.chk
> >> make[9]: execvp: /bin/sh: Argument list too long
> >> make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
> >> 
> >> Not quite sure yet why but I wanted to signal it early as others might be
> >> impacted.
> >> 
> >> Arm and arm64 builds are not impacted.
> > 
> > Hmm, that patch has passed the smoke push gate already, so there likely is
> > more to it than there being an unconditional issue. I did build-test this
> > before pushing, and I've just re-tested on a 2nd system without seeing an
> > issue.
> 
> I have the problem only when building using Yocto, I did a normal build and 
> the
> issue is not coming.
> 

Will the following patch help?


>From 0f32f749304b233c0d5574dc6b14f66e8709feba Mon Sep 17 00:00:00 2001
From: Anthony PERARD 
Date: Thu, 9 Jun 2022 13:42:52 +0100
Subject: [XEN PATCH] build,include: rework shell script for headers++.chk

The command line generated for headers++.chk by make is quite long,
and in some environments it is too long. This issue has been seen in
a Yocto build environment.

Error messages:
make[9]: execvp: /bin/sh: Argument list too long
make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127

Rework so that we do the foreach loop in shell rather than make, to
reduce the command line size by a lot. We also need a way to get
header prerequisites for some public headers, so we use a switch "case"
in shell to be able to do some simple pattern matching. Variables
alone in POSIX shell don't allow working with associative arrays or
variable names containing "/".

Reported-by: Bertrand Marquis 
Fixes: 28e13c7f43 ("build: xen/include: use if_changed")
Signed-off-by: Anthony PERARD 
---
 xen/include/Makefile | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/xen/include/Makefile b/xen/include/Makefile
index 6d9bcc19b0..ca5e868f38 100644
--- a/xen/include/Makefile
+++ b/xen/include/Makefile
@@ -158,13 +158,22 @@ define cmd_headerscxx_chk
touch $@.new; \
exit 0;   \
fi;   \
-   $(foreach i, $(filter %.h,$^),\
-   echo "#include "\"$(i)\"  \
+   get_prereq() {\
+   case $$1 in   \
+   $(foreach i, $(filter %.h,$^),\
+   $(if $($(patsubst $(srctree)/%,%,$i)-prereq), \
+   $(patsubst $(srctree)/%,%,$i)$(close) \
+   echo "$(foreach j, $($(patsubst $(srctree)/%,%,$i)-prereq),   \
+   -include c$(j))";;))  \
+   esac; \
+   };\
+   for i in $(filter %.h,$^); do \
+   echo "#include "\"$$i\"   \
| $(CXX) -x c++ -std=gnu++98 -Wall -Werror -D__XEN_TOOLS__\
  -include stdint.h -include $(srcdir)/public/xen.h   \
- $(foreach j, $($(patsubst $(srctree)/%,%,$i)-prereq), -include c$(j)) \
+ `get_prereq $$i`\
  -S -o /dev/null -   \
-   || exit $$?; echo $(i) >> $@.new;) \
+   || exit $$?; echo $$i >> $@.new; done;\
mv $@.new $@
 endef
 



-- 
Anthony PERARD



Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Bertrand Marquis
Hi,

> On 9 Jun 2022, at 13:16, Jan Beulich  wrote:
> 
> On 09.06.2022 13:51, Bertrand Marquis wrote:
>>> On 9 Jun 2022, at 11:26, Jan Beulich  wrote:
>>> On 09.06.2022 12:16, Bertrand Marquis wrote:
>>>>> On 1 Jun 2022, at 17:59, Anthony PERARD  wrote:
>>>>>
>>>>> Use "define" for the headers*_chk commands as otherwise the "#"
>>>>> is interpreted as a comment and make can't find the end of
>>>>> $(foreach,).
>>>>>
>>>>> Adding several .PRECIOUS as without them `make` deletes the
>>>>> intermediate targets. This is an issue because the macro $(if_changed,)
>>>>> checks if the target exists in order to decide whether to recreate the
>>>>> target.
>>>>>
>>>>> Removing the call to `mkdir` from the commands. Those aren't needed
>>>>> anymore because a rune in Rules.mk creates the directory for each
>>>>> $(targets).
>>>>>
>>>>> Remove "export PYTHON" as it is already exported.
>>>>
>>>> With this change, compiling for x86 is now ending up in:
>>>> CHK include/headers99.chk
>>>> make[9]: execvp: /bin/sh: Argument list too long
>>>> make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
>>>>
>>>> Not quite sure yet why but I wanted to signal it early as others might be
>>>> impacted.
>>>>
>>>> Arm and arm64 builds are not impacted.
>>> 
>>> Hmm, that patch has passed the smoke push gate already, so there likely is
>>> more to it than there being an unconditional issue. I did build-test this
>>> before pushing, and I've just re-tested on a 2nd system without seeing an
>>> issue.
>> 
>> I have the problem only when building using Yocto, I did a normal build and 
>> the
>> issue is not coming.
>> 
>> Doing a verbose compilation I have this (sorry for the long lines):
>> 
>> for i in include/public/vcpu.h include/public/errno.h include/public/kexec.h 
>> include/public/argo.h include/public/xen.h include/public/nmi.h 
>> include/public/xencomm.h include/public/xenoprof.h 
>> include/public/device_tree_defs.h include/public/version.h 
>> include/public/memory.h include/public/features.h include/public/sched.h 
>> include/public/xen-compat.h include/public/callback.h 
>> include/public/vm_event.h include/public/grant_table.h 
>> include/public/physdev.h include/public/tmem.h include/public/hypfs.h 
>> include/public/platform.h include/public/pmu.h include/public/elfnote.h 
>> include/public/trace.h include/public/event_channel.h 
>> include/public/io/vscsiif.h include/public/io/kbdif.h 
>> include/public/io/protocols.h include/public/io/ring.h 
>> include/public/io/displif.h include/public/io/fsif.h 
>> include/public/io/blkif.h include/public/io/console.h 
>> include/public/io/sndif.h include/public/io/fbif.h 
>> include/public/io/libxenvchan.h include/public/io/netif.h 
>> include/public/io/usbif.h include/public/io/pciif.h 
>> include/public/io/tpmif.h include/public/io/xs_wire.h 
>> include/public/io/xenbus.h include/public/io/cameraif.h 
>> include/public/hvm/pvdrivers.h include/public/hvm/e820.h 
>> include/public/hvm/hvm_xs_strings.h include/public/hvm/dm_op.h 
>> include/public/hvm/ioreq.h include/public/hvm/hvm_info_table.h 
>> include/public/hvm/hvm_vcpu.h include/public/hvm/hvm_op.h 
>> include/public/hvm/params.h; do x86_64-poky-linux-gcc  
>> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>>   -x c -ansi -Wall -Werror -include stdint.h -S -o /dev/null $i || exit 1; 
>> echo $i; done >include/headers.chk.new; mv include/headers.chk.new 
>> include/headers.chk
>> |   rm -f include/headers99.chk.new;  echo "#include 
>> "\"include/public/io/9pfs.h\" | x86_64-poky-linux-gcc  
>> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>>   -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
>> /dev/null - || exit $?; echo include/public/io/9pfs.h >> 
>> include/headers99.chk.new;  echo "#include "\"include/public/io/pvcalls.h\" 
>> | x86_64-poky-linux-gcc  
>> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>>   -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
>> /dev/null - || exit $?; echo include/public/io/pvcalls.h >> 
>> include/headers99.chk.new; mv include/headers99.chk.new include/headers99.chk
>> | make[9]: execvp: /bin/sh: Argument list too long
>> | make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
>> | make[9]: *** Waiting for unfinished jobs
>> 
>> So the command passed to the sub shell by make is quite long.
>> 
>> No idea why this comes out only when building in Yocto but I will dig a bit.
> 
> Maybe Yocto has an unusually low limit on command arguments' total size?
> The whole thing is just over 2500 chars, which doesn't look to be unusually
> long for Unix-like environments.
> 

Actually the command to generate headers++.chk is 15294 characters when

Re: [PATCH 1/2] xen/heap: Split init_heap_pages() in two

2022-06-09 Thread Julien Grall

Hi Jan,

On 09/06/2022 13:09, Jan Beulich wrote:
> On 09.06.2022 10:30, Julien Grall wrote:
>> From: Julien Grall 
>>
>> At the moment, init_heap_pages() will call free_heap_pages() page
>> by page. To reduce the time to initialize the heap, we will want
>> to provide multiple pages at the same time.
>>
>> init_heap_pages() is now split in two parts:
>>  - init_heap_pages(): will break down the range in multiple set
>>    of contiguous pages. For now, the criteria is the pages should
>>    belong to the same NUMA node.
>>  - init_contig_pages(): will initialize a set of contiguous pages.
>>    For now the pages are still passed one by one to free_heap_pages().
> 
> Hmm, the common use of "contiguous" is to describe address correlation.
> Therefore I'm afraid I'd like to see "contig" avoided when you mean
> "same node". Perhaps init_node_pages()?

After the next patch, it will not only be the same node, it will also be
the same zone at least. Also, in the future, I would like to re-submit
David Woodhouse's patch to exclude broken pages (see [1]).

Therefore, I think the name init_node_pages() would not be suitable.
Please suggest a different name.

>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -1778,16 +1778,55 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>>  }
>>  
>>  /*
>> - * Hand the specified arbitrary page range to the specified heap zone
>> - * checking the node_id of the previous page.  If they differ and the
>> - * latter is not on a MAX_ORDER boundary, then we reserve the page by
>> - * not freeing it to the buddy allocator.
>> + * init_contig_heap_pages() is intended to only take pages from the same
>> + * NUMA node.
>>   */
>> +static bool is_contig_page(struct page_info *pg, unsigned int nid)
>> +{
>> +    return (nid == (phys_to_nid(page_to_maddr(pg))));
>> +}
> 
> If such a helper is indeed needed, then I think it absolutely wants
> pg to be pointer-to-const. And imo it would also help readability if
> the extra pair of parentheses around the nested function calls was
> omitted. Given the naming concern, though, I wonder whether this
> wouldn't better be open-coded in the one place it is used. Actually
> naming is quite odd here beyond what I'd said further up: "Is this
> page contiguous?" Such a question requires two pages, i.e. "Are these
> two pages contiguous?" What you want to know is "Is this page on the
> given node?"

There will be more checks in the future (see next patch). I created a
helper because it reduces the size of the loop in init_heap_pages(). I
would be OK to fold it if you strongly prefer that.

>> +/*
>> + * This function should only be called with valid pages from the same NUMA
>> + * node.
>> + *
>> + * Callers should use is_contig_page() first to check if all the pages
>> + * in a range are contiguous.
>> + */
>> +static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
> 
> const again?

I will have a look.

>> +                                   bool need_scrub)
>> +{
>> +    unsigned long s, e;
>> +    unsigned int nid = phys_to_nid(page_to_maddr(pg));
>> +
>> +    s = mfn_x(page_to_mfn(pg));
>> +    e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
>> +    if ( unlikely(!avail[nid]) )
>> +    {
>> +        bool use_tail = !(s & ((1UL << MAX_ORDER) - 1)) &&
> 
> IS_ALIGNED(s, 1UL << MAX_ORDER) to "describe" what's meant?

This is existing code and it is quite complex. So I would prefer that we
avoid simplifying and moving the code in the same patch. I would be happy
to write a follow-up patch to switch to IS_ALIGNED().

>> +                        (find_first_set_bit(e) <= find_first_set_bit(s));
>> +        unsigned long n;
>> +
>> +        n = init_node_heap(nid, s, nr_pages, &use_tail);
>> +        BUG_ON(n > nr_pages);
>> +        if ( use_tail )
>> +            e -= n;
>> +        else
>> +            s += n;
>> +    }
>> +
>> +    while ( s < e )
>> +    {
>> +        free_heap_pages(mfn_to_page(_mfn(s)), 0, need_scrub);
>> +        s += 1UL;
> 
> Nit (I realize the next patch will replace this anyway): Just ++s? Or
> at least a plain 1 without UL suffix?

I will switch to s++.

>> @@ -1812,35 +1851,24 @@ static void init_heap_pages(
>>      spin_unlock(&heap_lock);
>>  
>>      if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
>> -        idle_scrub = true;
>> +        need_scrub = true;
>>  
>> -    for ( i = 0; i < nr_pages; i++ )
>> +    for ( i = 0; i < nr_pages; )
>>      {
>> -        unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
>> +        unsigned int nid = phys_to_nid(page_to_maddr(pg));
>> +        unsigned long left = nr_pages - i;
>> +        unsigned long contig_pages;
>>  
>> -        if ( unlikely(!avail[nid]) )
>> +        for ( contig_pages = 1; contig_pages < left; contig_pages++ )
>>          {
>> -            unsigned long s = mfn_x(page_to_mfn(pg + i));
>> -            unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
>> -            bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
>> -                            !(s & ((1UL << MAX_ORDER) - 1)) &&
>> -                            (find_first_set_bit(e) <= find_first_set_bit(s));

[xen-unstable-smoke test] 170899: tolerable all pass - PUSHED

2022-06-09 Thread osstest service owner
flight 170899 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170899/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 15 migrate-support-check        fail  never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-check        fail  never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-check    fail  never pass
 test-armhf-armhf-xl      15 migrate-support-check        fail  never pass
 test-armhf-armhf-xl      16 saverestore-support-check    fail  never pass

version targeted for testing:
 xen  59fbdf8a3667ce42c1cf70c94c3bcd0451afd4d8
baseline version:
 xen  f3185c165d28901c3222becfc8be547263c53745

Last test of basis   170889  2022-06-08 19:01:56 Z    0 days
Testing same since   170899  2022-06-09 09:00:28 Z    0 days    1 attempts


People who touched revisions under test:
  George Dunlap 
  Jan Beulich 
  Julien Grall 
  Julien Grall  # Arm

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   f3185c165d..59fbdf8a36  59fbdf8a3667ce42c1cf70c94c3bcd0451afd4d8 -> smoke



Re: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread George Dunlap


> On 9 Jun 2022, at 12:24, Jan Beulich  wrote:
> 
> On 09.06.2022 13:11, Roberto Bagnara wrote:
>> On 07/06/22 04:17, Stefano Stabellini wrote:
>>> # Rule 9.1 "The value of an object with automatic storage duration shall 
>>> not be read before it has been set"
>>> 
>>> The question is whether -Wuninitialized already covers this case or not.
>>> I think it does.
>>> 
>>> Eclair is reporting a few issues where variables are "possibly
>>> uninitialized". We should ask Roberto about them, I don't think they are
>>> actual errors? More like extra warnings?
>> 
>> No, -Wuninitialized is not reliable, as it has plenty of (well known)
>> false negatives. This is typical of compilers, for which the generation
>> of warnings is only a secondary objective. I wrote about that here:
>> 
>> https://www.bugseng.com/blog/compiler-warnings-use-them-dont-trust-them
>> 
>> On the specifics:
>> 
>> $ cat p.c
>> int foo (int b)
>> {
>> int a;
>> 
>> if (b)
>> {
>> a = 1;
>> }
>> 
>> return a;
>> }
>> 

> I understand what you're saying, yet I'd like to point out that adding
> initializers "blindly" may give a false sense of code correctness.
> Among other things it takes away the chance for tools to point out
> possible issues. Plus some tools warn about stray initializers ...

Right — if you always set “int a=0;”, then you’re getting a known value; but if 
your algorithm relies on it being something specific (and not zero), then it’s 
not clear the resulting software is actually more reliable.  If you don’t 
initialise it, there’s at least a chance the compiler will be able to tell you 
that you made a mistake; if you explicitly initialise it, then it’s all on you.

 -George
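
To make the trade-off concrete, a hypothetical variant of the foo() example
from earlier in the thread: the initializer silences -Wmaybe-uninitialized,
but if 0 is not a meaningful value for the algorithm, the mistake merely
becomes invisible to the compiler.

int foo_initialized(int b)
{
    int a = 0;   /* compiles warning-free ... */

    if (b)
    {
        a = 1;
    }

    return a;    /* ... but the b == 0 path now silently returns 0 */
}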




Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Jan Beulich
On 09.06.2022 13:51, Bertrand Marquis wrote:
>> On 9 Jun 2022, at 11:26, Jan Beulich  wrote:
>> On 09.06.2022 12:16, Bertrand Marquis wrote:
>>>> On 1 Jun 2022, at 17:59, Anthony PERARD  wrote:
>>>>
>>>> Use "define" for the headers*_chk commands as otherwise the "#"
>>>> is interpreted as a comment and make can't find the end of
>>>> $(foreach,).
>>>>
>>>> Adding several .PRECIOUS as without them `make` deletes the
>>>> intermediate targets. This is an issue because the macro $(if_changed,)
>>>> checks if the target exists in order to decide whether to recreate the
>>>> target.
>>>>
>>>> Removing the call to `mkdir` from the commands. Those aren't needed
>>>> anymore because a rune in Rules.mk creates the directory for each
>>>> $(targets).
>>>>
>>>> Remove "export PYTHON" as it is already exported.
>>>
>>> With this change, compiling for x86 is now ending up in:
>>> CHK include/headers99.chk
>>> make[9]: execvp: /bin/sh: Argument list too long
>>> make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
>>>
>>> Not quite sure yet why but I wanted to signal it early as other might be 
>>> impacted.
>>>
>>> Arm and arm64 builds are not impacted.
>>
>> Hmm, that patch has passed the smoke push gate already, so there likely is
>> more to it than there being an unconditional issue. I did build-test this
>> before pushing, and I've just re-tested on a 2nd system without seeing an
>> issue.
> 
> I have the problem only when building using Yocto, I did a normal build and 
> the
> issue is not coming.
> 
> Doing a verbose compilation I have this (sorry for the long lines):
> 
>  for i in include/public/vcpu.h include/public/errno.h include/public/kexec.h 
> include/public/argo.h include/public/xen.h include/public/nmi.h 
> include/public/xencomm.h include/public/xenoprof.h 
> include/public/device_tree_defs.h include/public/version.h 
> include/public/memory.h include/public/features.h include/public/sched.h 
> include/public/xen-compat.h include/public/callback.h 
> include/public/vm_event.h include/public/grant_table.h 
> include/public/physdev.h include/public/tmem.h include/public/hypfs.h 
> include/public/platform.h include/public/pmu.h include/public/elfnote.h 
> include/public/trace.h include/public/event_channel.h 
> include/public/io/vscsiif.h include/public/io/kbdif.h 
> include/public/io/protocols.h include/public/io/ring.h 
> include/public/io/displif.h include/public/io/fsif.h 
> include/public/io/blkif.h include/public/io/console.h 
> include/public/io/sndif.h include/public/io/fbif.h 
> include/public/io/libxenvchan.h include/public/io/netif.h 
> include/public/io/usbif.h include/public/io/pciif.h include/public/io/tpmif.h 
> include/public/io/xs_wire.h include/public/io/xenbus.h 
> include/public/io/cameraif.h include/public/hvm/pvdrivers.h 
> include/public/hvm/e820.h include/public/hvm/hvm_xs_strings.h 
> include/public/hvm/dm_op.h include/public/hvm/ioreq.h 
> include/public/hvm/hvm_info_table.h include/public/hvm/hvm_vcpu.h 
> include/public/hvm/hvm_op.h include/public/hvm/params.h; do 
> x86_64-poky-linux-gcc  
> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>   -x c -ansi -Wall -Werror -include stdint.h -S -o /dev/null $i || exit 1; 
> echo $i; done >include/headers.chk.new; mv include/headers.chk.new 
> include/headers.chk
> |   rm -f include/headers99.chk.new;  echo "#include 
> "\"include/public/io/9pfs.h\" | x86_64-poky-linux-gcc  
> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>   -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
> /dev/null - || exit $?; echo include/public/io/9pfs.h >> 
> include/headers99.chk.new;  echo "#include "\"include/public/io/pvcalls.h\" | 
> x86_64-poky-linux-gcc  
> --sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
>   -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
> /dev/null - || exit $?; echo include/public/io/pvcalls.h >> 
> include/headers99.chk.new; mv include/headers99.chk.new include/headers99.chk
> | make[9]: execvp: /bin/sh: Argument list too long
> | make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
> | make[9]: *** Waiting for unfinished jobs
> 
> So the command passed to the sub shell by make is quite long.
> 
> No idea why this comes out only when building in Yocto but I will dig a bit.

Maybe Yocto has an unusually low limit on command arguments' total size?
The whole thing is just over 2500 chars, which doesn't look to be unusually
long for Unix-like environments.

Jan
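
For what it's worth, the limit in question can be inspected directly. A
small stand-alone sketch (not part of the patch): execve() fails with
E2BIG, which make reports as "Argument list too long", when the command
line plus environment exceed this value.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* The kernel-enforced ceiling on argv + envp for execve(). */
    long arg_max = sysconf(_SC_ARG_MAX);

    printf("ARG_MAX: %ld bytes\n", arg_max);
    return 0;
}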



Re: [PATCH v3 2/3] ui: Deliver refresh rate via QemuUIInfo

2022-06-09 Thread Akihiko Odaki

On 2022/06/09 21:02, Gerd Hoffmann wrote:
> On Thu, Jun 09, 2022 at 08:45:41PM +0900, Akihiko Odaki wrote:
>> On 2022/06/09 19:28, Gerd Hoffmann wrote:
>>>> --- a/include/ui/console.h
>>>> +++ b/include/ui/console.h
>>>> @@ -139,6 +139,7 @@ typedef struct QemuUIInfo {
>>>>     int       yoff;
>>>>     uint32_t  width;
>>>>     uint32_t  height;
>>>> +    uint32_t  refresh_rate;
>>>> } QemuUIInfo;
>>>>
>>>> /* cursor data format is 32bit RGBA */
>>>> @@ -426,7 +427,6 @@ typedef struct GraphicHwOps {
>>>>     void (*gfx_update)(void *opaque);
>>>>     bool gfx_update_async; /* if true, calls graphic_hw_update_done() */
>>>>     void (*text_update)(void *opaque, console_ch_t *text);
>>>> -    void (*update_interval)(void *opaque, uint64_t interval);
>>>>     void (*ui_info)(void *opaque, uint32_t head, QemuUIInfo *info);
>>>>     void (*gl_block)(void *opaque, bool block);
>>>> } GraphicHwOps;
>>>
>>> So you are dropping update_interval, which isn't mentioned in the commit
>>> message at all.  Also this patch is rather big.  I'd suggest:
>>>
>>> (1) add refresh_rate
>>> (2) update users one by one
>>> (3) finally drop update_interval when no user is left.
>>>
>>> thanks,
>>>   Gerd
>>
>> I think 1 and 3 should have to be done at once since refresh_rate and
>> update_interval would interfere with each other otherwise.
>
> Well, between 1 and 3 both old and new API are active.  Shouldn't be
> much of a problem because the GraphicHwOps implementations are using
> only the one or the other.
>
> take care,
>   Gerd

The only GraphicHwOps implementation updated with this change is xenfb.
xenfb can be switched to use refresh_rate in step 1 or 3.

Switching to use refresh_rate in step 1 would break the refresh rate
propagation until all host displays are updated to set refresh_rate
instead of calling update_interval.

Switching to use refresh_rate in step 3 would break the refresh rate
propagation when a host display is updated to set refresh_rate instead
of calling update_interval but xenfb does not use refresh_rate.

Regards,
Akihiko Odaki
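
To illustrate Gerd's point that both APIs can coexist during the
transition, a hypothetical sketch; the helper name and the Hz conversion
are illustrative only and not QEMU code, though the two callbacks match
the declarations in the diff above:

static void propagate_refresh(const GraphicHwOps *ops, void *opaque,
                              QemuUIInfo *info, uint64_t interval_ms)
{
    if (ops->update_interval) {
        /* Device still implements the old callback. */
        ops->update_interval(opaque, interval_ms);
    } else if (ops->ui_info) {
        /* Device has been converted to the new QemuUIInfo field. */
        info->refresh_rate = interval_ms ? 1000 / interval_ms : 0;
        ops->ui_info(opaque, 0, info);
    }
}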



Re: [PATCH 1/2] xen/heap: Split init_heap_pages() in two

2022-06-09 Thread Jan Beulich
On 09.06.2022 10:30, Julien Grall wrote:
> From: Julien Grall 
> 
> At the moment, init_heap_pages() will call free_heap_pages() page
> by page. To reduce the time to initialize the heap, we will want
> to provide multiple pages at the same time.
> 
> init_heap_pages() is now split in two parts:
> - init_heap_pages(): will break down the range in multiple set
>   of contiguous pages. For now, the criteria is the pages should
>   belong to the same NUMA node.
> - init_contig_pages(): will initialize a set of contiguous pages.
>   For now the pages are still passed one by one to free_heap_pages().

Hmm, the common use of "contiguous" is to describe address correlation.
Therefore I'm afraid I'd like to see "contig" avoided when you mean
"same node". Perhaps init_node_pages()?

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -1778,16 +1778,55 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>  }
>  
>  /*
> - * Hand the specified arbitrary page range to the specified heap zone
> - * checking the node_id of the previous page.  If they differ and the
> - * latter is not on a MAX_ORDER boundary, then we reserve the page by
> - * not freeing it to the buddy allocator.
> + * init_contig_heap_pages() is intended to only take pages from the same
> + * NUMA node.
>   */
> +static bool is_contig_page(struct page_info *pg, unsigned int nid)
> +{
> +return (nid == (phys_to_nid(page_to_maddr(pg))));
> +}

If such a helper is indeed needed, then I think it absolutely wants
pg to be pointer-to-const. And imo it would also help readability if
the extra pair of parentheses around the nested function calls was
omitted. Given the naming concern, though, I wonder whether this
wouldn't better be open-coded in the one place it is used. Actually
naming is quite odd here beyond what I'd said further up: "Is this
page contiguous?" Such a question requires two pages, i.e. "Are these
two pages contiguous?" What you want to know is "Is this page on the
given node?"

> +/*
> + * This function should only be called with valid pages from the same NUMA
> + * node.
> + *
> + * Callers should use is_contig_page() first to check if all the pages
> + * in a range are contiguous.
> + */
> +static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,

const again?

> +   bool need_scrub)
> +{
> +unsigned long s, e;
> +unsigned int nid = phys_to_nid(page_to_maddr(pg));
> +
> +s = mfn_x(page_to_mfn(pg));
> +e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
> +if ( unlikely(!avail[nid]) )
> +{
> +bool use_tail = !(s & ((1UL << MAX_ORDER) - 1)) &&

IS_ALIGNED(s, 1UL << MAX_ORDER) to "describe" what's meant?

> +(find_first_set_bit(e) <= find_first_set_bit(s));
> +unsigned long n;
> +
> +n = init_node_heap(nid, s, nr_pages, &use_tail);
> +BUG_ON(n > nr_pages);
> +if ( use_tail )
> +e -= n;
> +else
> +s += n;
> +}
> +
> +while ( s < e )
> +{
> +free_heap_pages(mfn_to_page(_mfn(s)), 0, need_scrub);
> +s += 1UL;

Nit (I realize the next patch will replace this anyway): Just ++s? Or
at least a plain 1 without UL suffix?

> @@ -1812,35 +1851,24 @@ static void init_heap_pages(
>  spin_unlock(&heap_lock);
>  
>  if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
> -idle_scrub = true;
> +need_scrub = true;
>  
> -for ( i = 0; i < nr_pages; i++ )
> +for ( i = 0; i < nr_pages; )
>  {
> -unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
> +unsigned int nid = phys_to_nid(page_to_maddr(pg));
> +unsigned long left = nr_pages - i;
> +unsigned long contig_pages;
>  
> -if ( unlikely(!avail[nid]) )
> +for ( contig_pages = 1; contig_pages < left; contig_pages++ )
>  {
> -unsigned long s = mfn_x(page_to_mfn(pg + i));
> -unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
> -bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
> -!(s & ((1UL << MAX_ORDER) - 1)) &&
> -(find_first_set_bit(e) <= find_first_set_bit(s));
> -unsigned long n;
> -
> -n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
> -   &use_tail);
> -BUG_ON(i + n > nr_pages);
> -if ( n && !use_tail )
> -{
> -i += n - 1;
> -continue;
> -}
> -if ( i + n == nr_pages )
> +if ( !is_contig_page(pg + contig_pages, nid) )
>  break;
> -nr_pages -= n;
>  }

Isn't doing this page by page in a loop quite inefficient? Can't you
simply obtain the end of the node's range covering the first page, and
then adjust "left" accordingly?
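
One possible reading of that suggestion, as a rough and untested sketch;
node_start_pfn() and node_spanned_pages() are used illustratively, and the
caller is assumed to pass an mfn that lies within the node:

static unsigned long pages_on_first_node(unsigned long mfn,
                                         unsigned long left,
                                         unsigned int nid)
{
    /* Clamp the batch to the node covering the first page, instead of
     * probing every page individually. */
    unsigned long node_end = node_start_pfn(nid) + node_spanned_pages(nid);

    return min(left, node_end - mfn);
}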

Xen Security Advisory 401 v2 (CVE-2022-26362) - x86 pv: Race condition in typeref acquisition

2022-06-09 Thread Xen . org security team
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA256

Xen Security Advisory CVE-2022-26362 / XSA-401
   version 2

 x86 pv: Race condition in typeref acquisition

UPDATES IN VERSION 2
====================


Update 4.16 and 4.15 baselines.

Public release.

ISSUE DESCRIPTION
=================

Xen maintains a type reference count for pages, in addition to a regular
reference count.  This scheme is used to maintain invariants required
for Xen's safety, e.g. PV guests may not have direct writeable access to
pagetables; updates need auditing by Xen.

Unfortunately, the logic for acquiring a type reference has a race
condition, whereby a safety TLB flush is issued too early and creates a
window where the guest can re-establish the read/write mapping before
writeability is prohibited.

IMPACT
======

Malicious x86 PV guest administrators may be able to escalate privilege
so as to control the whole system.

VULNERABLE SYSTEMS
==================

All versions of Xen are vulnerable.

Only x86 PV guests can trigger this vulnerability.

To exploit the vulnerability, there needs to be an undue delay at just
the wrong moment in _get_page_type().  The degree to which an x86 PV
guest can practically control this race condition is unknown.

MITIGATION
==========

Not running x86 PV guests will avoid the vulnerability.

CREDITS
=======

This issue was discovered by Jann Horn of Google Project Zero.

RESOLUTION
==========

Applying the appropriate attached patches resolves this issue.

Note that patches for released versions are generally prepared to
apply to the stable branches, and may not apply cleanly to the most
recent release tarball.  Downstreams are encouraged to update to the
tip of the stable branch before applying these patches.

xsa401/xsa401-?.patch   xen-unstable
xsa401/xsa401-4.16-?.patch  Xen 4.16.x - Xen 4.14.x
xsa401/xsa401-4.13-?.patch  Xen 4.13.x

$ sha256sum xsa401* xsa401*/*
d442bc0946eaa4c325226fd0805ab81eba6a68b68cffb9b03d9552edea86b118  xsa401.meta
074b57204f828cbd004c2d024b02a41af5d5bf3547d407af27249dca95eca13a  
xsa401/xsa401-1.patch
a095b39b203d501f9c9d4974638cd4d5e2d7a18daee7a7a61e2010dea477e212  
xsa401/xsa401-2.patch
99af3efc91d2dbf4fd54cc9f454b87bd76edbc85abd1a20bdad0bd22acabf466  
xsa401/xsa401-4.13-1.patch
bb997094052edbbbdd0dc9f3a0454508eb737556e2449ec6a0bc649deb921e4f  
xsa401/xsa401-4.13-2.patch
d336b31cb91466942e4fb8b44783bb2f0be4995076e70e0e78cdf992147cf72a  
xsa401/xsa401-4.16-1.patch
b380a76d67957b602ff3c9a3faaa4d9b422834d6ee3ab72432a6d07ddbc6  
xsa401/xsa401-4.16-2.patch
$

DEPLOYMENT DURING EMBARGO
=========================

Deployment of the patches and/or mitigations described above (or
others which are substantially similar) is permitted during the
embargo, even on public-facing systems with untrusted guest users and
administrators.

But: Distribution of updated software is prohibited (except to other
members of the predisclosure list).

Predisclosure list members who wish to deploy significantly different
patches and/or mitigations, please contact the Xen Project Security
Team.


(Note: this during-embargo deployment notice is retained in
post-embargo publicly released Xen Project advisories, even though it
is then no longer applicable.  This is to enable the community to have
oversight of the Xen Project Security Team's decisionmaking.)

For more information about permissible uses of embargoed information,
consult the Xen Project community's agreed Security Policy:
  http://www.xenproject.org/security-policy.html
-----BEGIN PGP SIGNATURE-----

iQFABAEBCAAqFiEEI+MiLBRfRHX6gGCng/4UyVfoK9kFAmKh4lsMHHBncEB4ZW4u
b3JnAAoJEIP+FMlX6CvZcoAH/ijbKKkQet6frag9HVfDHZtcb6N7yIxMUioVOu9t
tNhg4LdJJnnrCqXmJdXygZTYwIZufQGQOxMR3b66+6MJyz0JIL7XExqnLJs6mDsO
GFcvsxoGLYSdsBTVtGQgLpEPxwgkblKUQuwokz3K3kdxcHJmJceZitvaDdrycw8M
kRZ22qHUbFWTSOKZNe5t9t0x/4xwdyM4dYElAmuN4Ej1cQhhXG/Gbl+acZexS+cz
TFEbIS5G/j6EgaCpBSP5XCoUn2LlyswRxBllGh0kpaLrJRH4CX3E/KHBSdPMkWoP
3HyQF3o+WYvpWUGXVaAREaR+WxlsAwmQJUxpO64O4Y4IUEY=
=UGgq
-----END PGP SIGNATURE-----


xsa401.meta
Description: Binary data


xsa401/xsa401-1.patch
Description: Binary data


xsa401/xsa401-2.patch
Description: Binary data


xsa401/xsa401-4.13-1.patch
Description: Binary data


xsa401/xsa401-4.13-2.patch
Description: Binary data


xsa401/xsa401-4.16-1.patch
Description: Binary data


xsa401/xsa401-4.16-2.patch
Description: Binary data


Re: [PATCH v3 2/3] ui: Deliver refresh rate via QemuUIInfo

2022-06-09 Thread Gerd Hoffmann
On Thu, Jun 09, 2022 at 08:45:41PM +0900, Akihiko Odaki wrote:
> On 2022/06/09 19:28, Gerd Hoffmann wrote:
> > > --- a/include/ui/console.h
> > > +++ b/include/ui/console.h
> > > @@ -139,6 +139,7 @@ typedef struct QemuUIInfo {
> > >   int   yoff;
> > >   uint32_t  width;
> > >   uint32_t  height;
> > > +uint32_t  refresh_rate;
> > >   } QemuUIInfo;
> > >   /* cursor data format is 32bit RGBA */
> > > @@ -426,7 +427,6 @@ typedef struct GraphicHwOps {
> > >   void (*gfx_update)(void *opaque);
> > >   bool gfx_update_async; /* if true, calls graphic_hw_update_done() */
> > >   void (*text_update)(void *opaque, console_ch_t *text);
> > > -void (*update_interval)(void *opaque, uint64_t interval);
> > >   void (*ui_info)(void *opaque, uint32_t head, QemuUIInfo *info);
> > >   void (*gl_block)(void *opaque, bool block);
> > >   } GraphicHwOps;
> > 
> > So you are dropping update_interval, which isn't mentioned in the commit
> > message at all.  Also this patch is rather big.  I'd suggest:
> > 
> > (1) add refresh_rate
> > (2) update users one by one
> > (3) finally drop update_interval when no user is left.
> > 
> > thanks,
> >Gerd
> > 
> 
> I think 1 and 3 should have to be done at once since refresh_rate and
> update_interval would interfere with each other otherwise.

Well, between 1 and 3 both old and new API are active.  Shouldn't be
much of a problem because the GraphicHwOps implementations are using
only the one or the other.

take care,
  Gerd




Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Bertrand Marquis
Hi,

> On 9 Jun 2022, at 11:26, Jan Beulich  wrote:
> 
> On 09.06.2022 12:16, Bertrand Marquis wrote:
>>> On 1 Jun 2022, at 17:59, Anthony PERARD  wrote:
>>> 
>>> Use "define" for the headers*_chk commands as otherwise the "#"
>>> is interpreted as a comment and make can't find the end of
>>> $(foreach,).
>>> 
>>> Adding several .PRECIOUS as without them `make` deletes the
>>> intermediate targets. This is an issue because the macro $(if_changed,)
>>> checks if the target exists in order to decide whether to recreate the
>>> target.
>>> 
>>> Removing the call to `mkdir` from the commands. Those aren't needed
>>> anymore because a rune in Rules.mk creates the directory for each
>>> $(targets).
>>> 
>>> Remove "export PYTHON" as it is already exported.
>> 
>> With this change, compiling for x86 is now ending up in:
>> CHK include/headers99.chk
>> make[9]: execvp: /bin/sh: Argument list too long
>> make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
>> 
>> Not quite sure yet why but I wanted to signal it early as others might be
>> impacted.
>> 
>> Arm and arm64 builds are not impacted.
> 
> Hmm, that patch has passed the smoke push gate already, so there likely is
> more to it than there being an unconditional issue. I did build-test this
> before pushing, and I've just re-tested on a 2nd system without seeing an
> issue.

I have the problem only when building using Yocto, I did a normal build and the
issue is not coming.

Doing a verbose compilation I have this (sorry for the long lines):

 for i in include/public/vcpu.h include/public/errno.h include/public/kexec.h 
include/public/argo.h include/public/xen.h include/public/nmi.h 
include/public/xencomm.h include/public/xenoprof.h 
include/public/device_tree_defs.h include/public/version.h 
include/public/memory.h include/public/features.h include/public/sched.h 
include/public/xen-compat.h include/public/callback.h include/public/vm_event.h 
include/public/grant_table.h include/public/physdev.h include/public/tmem.h 
include/public/hypfs.h include/public/platform.h include/public/pmu.h 
include/public/elfnote.h include/public/trace.h include/public/event_channel.h 
include/public/io/vscsiif.h include/public/io/kbdif.h 
include/public/io/protocols.h include/public/io/ring.h 
include/public/io/displif.h include/public/io/fsif.h include/public/io/blkif.h 
include/public/io/console.h include/public/io/sndif.h include/public/io/fbif.h 
include/public/io/libxenvchan.h include/public/io/netif.h 
include/public/io/usbif.h include/public/io/pciif.h include/public/io/tpmif.h 
include/public/io/xs_wire.h include/public/io/xenbus.h 
include/public/io/cameraif.h include/public/hvm/pvdrivers.h 
include/public/hvm/e820.h include/public/hvm/hvm_xs_strings.h 
include/public/hvm/dm_op.h include/public/hvm/ioreq.h 
include/public/hvm/hvm_info_table.h include/public/hvm/hvm_vcpu.h 
include/public/hvm/hvm_op.h include/public/hvm/params.h; do 
x86_64-poky-linux-gcc  
--sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
  -x c -ansi -Wall -Werror -include stdint.h -S -o /dev/null $i || exit 1; echo 
$i; done >include/headers.chk.new; mv include/headers.chk.new 
include/headers.chk
|   rm -f include/headers99.chk.new;  echo "#include 
"\"include/public/io/9pfs.h\" | x86_64-poky-linux-gcc  
--sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
  -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
/dev/null - || exit $?; echo include/public/io/9pfs.h >> 
include/headers99.chk.new;  echo "#include "\"include/public/io/pvcalls.h\" | 
x86_64-poky-linux-gcc  
--sysroot=/home/bermar01/Development/xen-dev/build/profile-qemu-x86_64.prj/tmp/work/core2-64-poky-linux/xen/4.17+git1-r0/recipe-sysroot
  -x c -std=c99 -Wall -Werror -include stdint.h  -include string.h -S -o 
/dev/null - || exit $?; echo include/public/io/pvcalls.h >> 
include/headers99.chk.new; mv include/headers99.chk.new include/headers99.chk
| make[9]: execvp: /bin/sh: Argument list too long
| make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
| make[9]: *** Waiting for unfinished jobs

So the command passed to the sub shell by make is quite long.

No idea why this comes out only when building in Yocto but I will dig a bit.

> 
> Also please remember to trim your replies.
> 

Will do.

Bertrand




Re: [PATCH v3 2/3] ui: Deliver refresh rate via QemuUIInfo

2022-06-09 Thread Akihiko Odaki

On 2022/06/09 19:28, Gerd Hoffmann wrote:
>> --- a/include/ui/console.h
>> +++ b/include/ui/console.h
>> @@ -139,6 +139,7 @@ typedef struct QemuUIInfo {
>>      int       yoff;
>>      uint32_t  width;
>>      uint32_t  height;
>> +    uint32_t  refresh_rate;
>>  } QemuUIInfo;
>>  
>>  /* cursor data format is 32bit RGBA */
>> @@ -426,7 +427,6 @@ typedef struct GraphicHwOps {
>>      void (*gfx_update)(void *opaque);
>>      bool gfx_update_async; /* if true, calls graphic_hw_update_done() */
>>      void (*text_update)(void *opaque, console_ch_t *text);
>> -    void (*update_interval)(void *opaque, uint64_t interval);
>>      void (*ui_info)(void *opaque, uint32_t head, QemuUIInfo *info);
>>      void (*gl_block)(void *opaque, bool block);
>>  } GraphicHwOps;
> 
> So you are dropping update_interval, which isn't mentioned in the commit
> message at all.  Also this patch is rather big.  I'd suggest:
> 
> (1) add refresh_rate
> (2) update users one by one
> (3) finally drop update_interval when no user is left.
> 
> thanks,
>    Gerd

I think 1 and 3 should have to be done at once since refresh_rate and
update_interval would interfere with each other otherwise. Does that
make sense?

Regards,
Akihiko Odaki



Re: [PATCH 24/36] printk: Remove trace_.*_rcuidle() usage

2022-06-09 Thread Sergey Senozhatsky
My emails are getting rejected... Let me try web-interface

Kudos to Petr for the questions and thanks to PeterZ for the answers.

On Thu, Jun 9, 2022 at 7:02 PM Peter Zijlstra  wrote:
> This is the tracepoint used to spool all of printk into ftrace, I
> suspect there's users, but I haven't used it myself.

I'm somewhat curious whether we can actually remove that trace event.



Re: MISRA C meeting tomorrow, was: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread Jan Beulich
On 09.06.2022 13:17, Roberto Bagnara wrote:
> On 09/06/22 09:04, Jan Beulich wrote:
>> On 09.06.2022 03:20, Stefano Stabellini wrote:
>>> Finally, for Rule 13.2, I updated the link to ECLAIR's results. There
>>> are a lot more violations than just 4, but I don't know if they are
>>> valid or false positives.
>>
>> I've picked just the one case in xen/common/efi/ebmalloc.c to check,
>> and it says "possibly". That's because evaluation of function call
>> arguments involves the calling of (in this case two) further
>> functions. If those functions had side effects (which apparently the
>> tool can't figure), there would indeed be a problem.
>>
>> The (Arm based) count of almost 10k violations is clearly a concern.
>> I don't consider it even remotely reasonable to add 10k comments, no
>> matter how brief, to cover all the false positives.
> 
> Again, the MISRA approach is a preventive one.
> If you have reasons you want to write
> 
> f(g(), h());
> 
> then declare g() and h() as pure (or const, if they are const).
> E.g.:
> 
> #if COMPILER_SUPPORTS_PURE
> #define PURE __attribute__((pure))
> #else
> #define PURE
> #endif
> 
> int g(void) PURE;
> int h(void) PURE;
> 
> It's good documentation, it improves compiler diagnostics,
> and it satisfies Rule 13.2.

But such attributes first of all should be correct. They wouldn't be
in the case I've looked at (involving two __virt_to_maddr() invocations),
as the underlying va_to_par() isn't pure. Still in the normal case the
sequence of calls made is irrelevant to the overall result.

As to improving compiler diagnostics: It has been my experience that
pure and const are largely ignored when used on inline functions. The
compiler rather looks at the inline-expanded code to judge. (But it has
been a couple of years back that I last checked, so things may have
changed since then.)

Jan



Re: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread Jan Beulich
On 09.06.2022 13:11, Roberto Bagnara wrote:
> On 07/06/22 04:17, Stefano Stabellini wrote:
>  > # Rule 9.1 "The value of an object with automatic storage duration shall 
> not be read before it has been set"
>  >
>  > The question is whether -Wuninitalised already covers this case or not.
>  > I think it does.
>  >
>  > Eclair is reporting a few issues where variables are "possibly
>  > uninitialized". We should ask Roberto about them, I don't think they are
>  > actual errors? More like extra warnings?
> 
> No, -Wuninitialized is not reliable, as it has plenty of (well known)
> false negatives.  This is typical of compilers, for which the generation
> of warnings is only a secondary objective.  I wrote about that here:
> 
>https://www.bugseng.com/blog/compiler-warnings-use-them-dont-trust-them
> 
> On the specifics:
> 
> $ cat p.c
> int foo (int b)
> {
>  int a;
> 
>  if (b)
>  {
>  a = 1;
>  }
> 
>  return a;
> }
> 
> $ gcc -c -W -Wall -Wmaybe-uninitialized -O3 p.c
> $ gcc -c -W -Wall -Wuninitialized -O3 p.c
> $
> 
> Note that the example is less contrived than you might think.
> See, JF Bastien's talk at 2019 LLVM Developers' Meeting:
> 
>https://www.youtube.com/watch?v=I-XUHPimq3o
> 
> More generally, you can only embrace MISRA if you agree on
> its preventive nature, which is radically different from
> the "bug finding" approach.  The point is rather simple:
> 
> 1) static analysis alone cannot guarantee correctness;
> 2) peer review is unavoidable;
> 3) testing is unavoidable.
> 
> In order to effectively conduct a peer review, you cannot
> afford being distracted every minute by the thought
> "is this initialized?  where is it initialized?  with which
> value is it initialized?"
> In a MISRA setting, you want that the answer to such questions
> is immediately clear to anyone.
> In contrast, if you embrace bug finding (that is, checkers with
> false negatives like the ones implemented by compilers),
> you will miss instances that you may miss also with testing
> (testing a program with UB does not give reliable results);
> and you will likely miss them with peer review, unless you
> can spend a lot of time and resources in the activity.
> 
> The checker implemented by ECLAIR for Rule 9.1 embodies this
> principle: if it says "violation", then it is a definite
> violation;  if it says "caution", then maybe there is no
> UB, but a human will have to spend more than 30 seconds
> in order to convince herself that there is no UB.
> 
> I understand this may sound frustrating to virtuoso programmers,
> and there are many of them in the open source world.
> But the truth is that virtuosity in programming is not a good
> thing for safety-related development.   For safety you want
> code that is simple and straightforward to reason about.

I understand what you're saying, yet I'd like to point out that adding
initializers "blindly" may give a false sense of code correctness.
Among other things it takes away the chance for tools to point out
possible issues. Plus some tools warn about stray initializers ...

Jan



Re: [PATCH v5 7/8] xen/x86: add detection of memory interleaves for different nodes

2022-06-09 Thread Jan Beulich
On 06.06.2022 06:09, Wei Chen wrote:
> v4 -> v5:
> 1. Remove "nd->end == end && nd->start == start" from
>conflicting_memblks.
> 2. Use case NO_CONFLICT instead of "default".
> 3. Correct wrong "node" to "pxm" in print message.
> 4. Remove unnecessary "else" to remove the indent depth.
> 5. Convert all ranges to proper mathematical interval
>representation.

As to this:

> @@ -310,44 +343,74 @@ acpi_numa_memory_affinity_init(const struct 
> acpi_srat_mem_affinity *ma)
>   bad_srat();
>   return;
>   }
> +
> + /*
> +  * For the node that already has some memory blocks, we will
> +  * expand the node memory range temporarily to check memory
> +  * interleaves with other nodes. We will not use this node
> +  * temp memory range to check overlaps, because it will mask
> +  * the overlaps in same node.
> +  *
> +  * Node with 0 bytes memory doesn't need this expansion.
> +  */
> + nd_start = start;
> + nd_end = end;
> + nd = &nodes[node];
> + if (nd->start != nd->end) {
> + if (nd_start > nd->start)
> + nd_start = nd->start;
> +
> + if (nd_end < nd->end)
> + nd_end = nd->end;
> + }
> +
>   /* It is fine to add this area to the nodes data it will be used later*/
> - i = conflicting_memblks(start, end);
> - if (i < 0)
> - /* everything fine */;
> - else if (memblk_nodeid[i] == node) {
> - bool mismatch = !(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
> - !test_bit(i, memblk_hotplug);
> -
> - printk("%sSRAT: PXM %u (%"PRIpaddr"-%"PRIpaddr") overlaps with 
> itself (%"PRIpaddr"-%"PRIpaddr")\n",
> -mismatch ? KERN_ERR : KERN_WARNING, pxm, start, end,
> -node_memblk_range[i].start, node_memblk_range[i].end);
> - if (mismatch) {
> - bad_srat();
> - return;
> + switch (conflicting_memblks(node, start, end, nd_start, nd_end, &i)) {
> + case OVERLAP:
> + if (memblk_nodeid[i] == node) {
> + bool mismatch = !(ma->flags &
> +   ACPI_SRAT_MEM_HOT_PLUGGABLE) !=
> + !test_bit(i, memblk_hotplug);
> +
> + printk("%sSRAT: PXM %u [%"PRIpaddr"-%"PRIpaddr"] 
> overlaps with itself [%"PRIpaddr"-%"PRIpaddr"]\n",

As said when discussing v4, mathematical representation is [start,end].
Please properly use a comma instead of a dash here and below plus ...

> +mismatch ? KERN_ERR : KERN_WARNING, pxm, start,
> +end - 1, node_memblk_range[i].start,
> +node_memblk_range[i].end - 1);
> + if (mismatch) {
> + bad_srat();
> + return;
> + }
> + break;
>   }
> - } else {
> +
> + printk(KERN_ERR
> +"SRAT: PXM %u [%"PRIpaddr"-%"PRIpaddr"] overlaps with 
> PXM %u [%"PRIpaddr"-%"PRIpaddr"]\n",
> +pxm, start, end - 1, node_to_pxm(memblk_nodeid[i]),
> +node_memblk_range[i].start,
> +node_memblk_range[i].end - 1);
> + bad_srat();
> + return;
> +
> + case INTERLEAVE:
>   printk(KERN_ERR
> -"SRAT: PXM %u (%"PRIpaddr"-%"PRIpaddr") overlaps with 
> PXM %u (%"PRIpaddr"-%"PRIpaddr")\n",
> -pxm, start, end, node_to_pxm(memblk_nodeid[i]),
> -node_memblk_range[i].start, node_memblk_range[i].end);
> +"SRAT: PXM %u: [%"PRIpaddr"-%"PRIpaddr"] interleaves 
> with PXM %u memblk [%"PRIpaddr"-%"PRIpaddr"]\n",
> +pxm, nd_start, nd_end - 1, node_to_pxm(memblk_nodeid[i]),
> +node_memblk_range[i].start, node_memblk_range[i].end - 
> 1);
>   bad_srat();
>   return;
> +
> + case NO_CONFLICT:
> + break;
>   }
> +
>   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)) {
> - struct node *nd = &nodes[node];
> -
> - if (!node_test_and_set(node, memory_nodes_parsed)) {
> - nd->start = start;
> - nd->end = end;
> - } else {
> - if (start < nd->start)
> - nd->start = start;
> - if (nd->end < end)
> - nd->end = end;
> - }
> + node_set(node, memory_nodes_parsed);
> + nd->start = nd_start;
> + nd->end = nd_end;
>   }
> - printk(KERN_INFO "SRAT: Node %u PXM %u %"PRIpaddr"-%"PRIpaddr"%s\n",
> -node, pxm, start, end,
> +
> + printk(KERN_INFO "SRAT: Node %u PXM %u [%"PRIpaddr"-%"PRIpaddr"]%s\n",
> 
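
On the notation point: the ranges are half-open internally, while the log
messages print closed intervals. A minimal illustration of the requested
convention (comma-separated closed interval), with a hypothetical helper:

#include <stdio.h>
#include <inttypes.h>

/* Internal range is half-open [start, end); log it as the closed
 * mathematical interval [start, end - 1]. */
static void log_range(uint64_t start, uint64_t end)
{
    printf("SRAT: range [%" PRIx64 ", %" PRIx64 "]\n", start, end - 1);
}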

Re: MISRA C meeting tomorrow, was: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread Roberto Bagnara

On 09/06/22 09:04, Jan Beulich wrote:
> On 09.06.2022 03:20, Stefano Stabellini wrote:
>> Finally, for Rule 13.2, I updated the link to ECLAIR's results. There
>> are a lot more violations than just 4, but I don't know if they are
>> valid or false positives.
> 
> I've picked just the one case in xen/common/efi/ebmalloc.c to check,
> and it says "possibly". That's because evaluation of function call
> arguments involves the calling of (in this case two) further
> functions. If those functions had side effects (which apparently the
> tool can't figure), there would indeed be a problem.
> 
> The (Arm based) count of almost 10k violations is clearly a concern.
> I don't consider it even remotely reasonable to add 10k comments, no
> matter how brief, to cover all the false positives.

Again, the MISRA approach is a preventive one.
If you have reasons you want to write

   f(g(), h());

then declare g() and h() as pure (or const, if they are const).
E.g.:

#if COMPILER_SUPPORTS_PURE
#define PURE __attribute__((pure))
#else
#define PURE
#endif

int g(void) PURE;
int h(void) PURE;

It's good documentation, it improves compiler diagnostics,
and it satisfies Rule 13.2.
Kind regards,

   Roberto
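
To spell out what Rule 13.2 guards against, a small hypothetical example
(not Xen code): with impure arguments, the unspecified evaluation order of
function-call arguments changes the result.

static int counter;

static int g(void) { return ++counter; }     /* side effect */
static int h(void) { return counter * 10; }  /* observes g()'s effect */

extern int f(int x, int y);

int demo(void)
{
    /* Evaluating g() first yields f(1, 10); h() first yields f(1, 0).
     * Attributes that (truthfully) promise purity rule this out. */
    return f(g(), h());
}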





Re: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread Roberto Bagnara

On 07/06/22 04:17, Stefano Stabellini wrote:
> # Rule 9.1 "The value of an object with automatic storage duration shall not be 
read before it has been set"
>
> The question is whether -Wuninitialized already covers this case or not.
> I think it does.
>
> Eclair is reporting a few issues where variables are "possibly
> uninitialized". We should ask Roberto about them, I don't think they are
> actual errors? More like extra warnings?

No, -Wuninitialized is not reliable, as it has plenty of (well known)
false negatives.  This is typical of compilers, for which the generation
of warnings is only a secondary objective.  I wrote about that here:

  https://www.bugseng.com/blog/compiler-warnings-use-them-dont-trust-them

On the specifics:

$ cat p.c
int foo (int b)
{
int a;

if (b)
{
a = 1;
}

return a;
}

$ gcc -c -W -Wall -Wmaybe-uninitialized -O3 p.c
$ gcc -c -W -Wall -Wuninitialized -O3 p.c
$

Note that the example is less contrived than you might think.
See, JF Bastien's talk at 2019 LLVM Developers' Meeting:

  https://www.youtube.com/watch?v=I-XUHPimq3o

More generally, you can only embrace MISRA if you agree on
its preventive nature, which is radically different from
the "bug finding" approach.  The point is rather simple:

1) static analysis alone cannot guarantee correctness;
2) peer review is unavoidable;
3) testing is unavoidable.

In order to effectively conduct a peer review, you cannot
afford being distracted every minute by the thought
"is this initialized?  where is it initialized?  with which
value is it initialized?"
In a MISRA setting, you want that the answer to such questions
is immediately clear to anyone.
In contrast, if you embrace bug finding (that is, checkers with
false negatives like the ones implemented by compilers),
you will miss instances that you may miss also with testing
(testing a program with UB does not give reliable results);
and you will likely miss them with peer review, unless you
can spend a lot of time and resources in the activity.

The checker implemented by ECLAIR for Rule 9.1 embodies this
principle: if it says "violation", then it is a definite
violation;  if it says "caution", then maybe there is no
UB, but a human will have to spend more than 30 seconds
in order to convince herself that there is no UB.

I understand this may sound frustrating to virtuoso programmers,
and there are many of them in the open source world.
But the truth is that virtuosity in programming is not a good
thing for safety-related development.   For safety you want
code that is simple and straightforward to reason about.
Kind regards,

   Roberto






Re: [PATCH 33/36] cpuidle,omap3: Use WFI for omap3_pm_idle()

2022-06-09 Thread Peter Zijlstra
On Wed, Jun 08, 2022 at 06:28:33PM +0200, Arnd Bergmann wrote:
> On Wed, Jun 8, 2022 at 4:27 PM Peter Zijlstra  wrote:
> >
> > arch_cpu_idle() is a very simple idle interface and exposes only a
> > single idle state and is expected to not require RCU and not do any
> > tracing/instrumentation.
> >
> > As such, omap_sram_idle() is not a valid implementation. Replace it
> > with the simple (shallow) omap3_do_wfi() call. Leaving the more
> > complicated idle states for the cpuidle driver.
> >
> > Signed-off-by: Peter Zijlstra (Intel) 
> 
> I see similar code in omap2:
> 
> omap2_pm_idle()
>  -> omap2_enter_full_retention()
>  -> omap2_sram_suspend()
> 
> Is that code path safe to use without RCU or does it need a similar change?

It needs a similar change, clearly I was running on fumes to not have
found that when grepping around the omap code :/
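
For context, the shallow state being kept is essentially a bare WFI; a
minimal illustrative sketch for Arm (the real omap3_do_wfi() does more
than this):

static void shallow_idle_sketch(void)
{
    /* Single shallow idle state: no tracing, no RCU usage, just wait
     * for an interrupt. */
    asm volatile("wfi" ::: "memory");
}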



Re: [PATCH 24/36] printk: Remove trace_.*_rcuidle() usage

2022-06-09 Thread Petr Mladek
Sending again. The previous attempt was rejected by several
recipients. It was caused by mail server changes on my side.
I am sorry for spamming those who got the 1st mail already.

On Wed 2022-06-08 16:27:47, Peter Zijlstra wrote:
> The problem, per commit fc98c3c8c9dc ("printk: use rcuidle console
> tracepoint"), was printk usage from the cpuidle path where RCU was
> already disabled.
> 
> Per the patches earlier in this series, this is no longer the case.

My understanding is that this series greatly reduces the amount
of code called with RCU disabled. As a result, the particular printk()
call mentioned by commit fc98c3c8c9dc ("printk: use rcuidle console
tracepoint") is now called with RCU enabled. Hence this particular
problem is now fixed in a better way.

But is this true in general?
Does this "prevent" calling printk() in a safe way in code with
RCU disabled?

I am not sure if anyone cares. printk() is best-effort
functionality because of the console code anyway. Also, I wonder
if anyone uses this trace_console().

Therefore, if this patch allows removing some tricky tracing
code then it might be worth it. But if the trace_console_rcuidle()
variant is still going to be available, then I would keep using it.

Best Regards,
Petr

> Signed-off-by: Peter Zijlstra (Intel) 
> ---
>  kernel/printk/printk.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -2238,7 +2238,7 @@ static u16 printk_sprint(char *text, u16
>   }
>   }
>  
> - trace_console_rcuidle(text, text_len);
> + trace_console(text, text_len);
>  
>   return text_len;
>  }
> 



Re: [PATCH 33/36] cpuidle,omap3: Use WFI for omap3_pm_idle()

2022-06-09 Thread Peter Zijlstra
On Thu, Jun 09, 2022 at 10:39:22AM +0300, Tony Lindgren wrote:
> * Arnd Bergmann  [220608 18:18]:
> > On Wed, Jun 8, 2022 at 4:27 PM Peter Zijlstra  wrote:
> > >
> > > arch_cpu_idle() is a very simple idle interface and exposes only a
> > > single idle state and is expected to not require RCU and not do any
> > > tracing/instrumentation.
> > >
> > > As such, omap_sram_idle() is not a valid implementation. Replace it
> > > with the simple (shallow) omap3_do_wfi() call. Leaving the more
> > > complicated idle states for the cpuidle driver.
> 
> Agreed it makes sense to limit deeper idle states to cpuidle. Hopefully
> there is some informative splat for attempting to use arch_cpu_idle()
> for deeper idle states :)

The arch_cpu_idle() interface doesn't allow one to express a desire for
deeper states. I'm not sure how anyone could even attempt this.

But given that what OMAP needs in order to go deeper involves things
that require RCU, and combined with the follow-up patches that rip out
all the trace_.*_rcuidle() hackery from the power and clock domain code,
PROVE_RCU should scream if anybody were to attempt it.



Re: [PATCH 24/36] printk: Remove trace_.*_rcuidle() usage

2022-06-09 Thread Peter Zijlstra
On Thu, Jun 09, 2022 at 11:16:46AM +0200, Petr Mladek wrote:
> On Wed 2022-06-08 16:27:47, Peter Zijlstra wrote:
> > The problem, per commit fc98c3c8c9dc ("printk: use rcuidle console
> > tracepoint"), was printk usage from the cpuidle path where RCU was
> > already disabled.
> > 
> > Per the patches earlier in this series, this is no longer the case.
> 
> My understanding is that this series greatly reduces the amount
> of code called with RCU disabled. As a result, the particular printk()
> call mentioned by commit fc98c3c8c9dc ("printk: use rcuidle console
> tracepoint") is now called with RCU enabled. Hence this particular
> problem is now fixed in a better way.
> 
> But is this true in general?
> Does this "prevent" calling printk() in a safe way in code with
> RCU disabled?

On x86_64, yes. Other architectures, less so.

Specifically, the objtool noinstr validation pass will warn at build
time (DEBUG_ENTRY=y) if any noinstr/cpuidle code does a call to
non-vetted code like printk().
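(Assuming the usual objtool output format, such a warning would look
along the lines of:

  vmlinux.o: warning: objtool: some_idle_func+0x1c: call to printk() leaves .noinstr.text section

with the symbol and offset here being made up for illustration.)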

At the same time, there are a few hacks that allow WARN to work, but
mostly if you hit WARN in entry/noinstr you get to keep the pieces in
any case.

On other architecture we'll need to rely on runtime coverage with
PROVE_RCU. That is, if a splat like in the above mentioned commit
happens again, we'll need to fix it by adjusting the callchain, not by
mucking about with RCU state.

> I am not sure if anyone cares. printk() is best-effort
> functionality because of the console code anyway. Also, I wonder
> if anyone uses this trace_console().

This is the tracepoint used to spool all of printk into ftrace, I
suspect there's users, but I haven't used it myself.
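(For reference: assuming the usual tracefs layout, that tracepoint
shows up as events/printk/console, so something like

  echo 1 > /sys/kernel/debug/tracing/events/printk/console/enable

would start spooling printk output into the trace buffer.)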

> Therefore, if this patch allows removing some tricky tracing
> code then it might be worth it. But if the trace_console_rcuidle()
> variant is still going to be available, then I would keep using it.

My ultimate goal is to delete trace_.*_rcuidle() and RCU_NONIDLE()
entirely. We're close, but not quite there yet.



Re: [PATCH v3 2/3] ui: Deliver refresh rate via QemuUIInfo

2022-06-09 Thread Gerd Hoffmann
> --- a/include/ui/console.h
> +++ b/include/ui/console.h
> @@ -139,6 +139,7 @@ typedef struct QemuUIInfo {
>  int   yoff;
>  uint32_t  width;
>  uint32_t  height;
> +uint32_t  refresh_rate;
>  } QemuUIInfo;
>  
>  /* cursor data format is 32bit RGBA */
> @@ -426,7 +427,6 @@ typedef struct GraphicHwOps {
>  void (*gfx_update)(void *opaque);
>  bool gfx_update_async; /* if true, calls graphic_hw_update_done() */
>  void (*text_update)(void *opaque, console_ch_t *text);
> -void (*update_interval)(void *opaque, uint64_t interval);
>  void (*ui_info)(void *opaque, uint32_t head, QemuUIInfo *info);
>  void (*gl_block)(void *opaque, bool block);
>  } GraphicHwOps;

So you are dropping update_interval, which isn't mentioned in the commit
message at all.  Also this patch is rather big.  I'd suggest:

(1) add refresh_rate
(2) update users one by one
(3) finally drop update_interval when no user is left.

thanks,
  Gerd




Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Jan Beulich
On 09.06.2022 12:16, Bertrand Marquis wrote:
>> On 1 Jun 2022, at 17:59, Anthony PERARD  wrote:
>>
>> Use "define" for the headers*_chk commands as otherwise the "#"
>> is interpreted as a comment and make can't find the end of
>> $(foreach,).
>>
>> Adding several .PRECIOUS as without them `make` deletes the
>> intermediate targets. This is an issue because the macro $(if_changed,)
>> checks whether the target exists in order to decide whether to recreate the
>> target.
>>
>> Removing the call to `mkdir` from the commands. Those aren't needed
>> anymore because a rune in Rules.mk creates the directory for each
>> $(targets).
>>
>> Remove "export PYTHON" as it is already exported.
> 
> With this change, compiling for x86 is now ending up in:
> CHK include/headers99.chk
> make[9]: execvp: /bin/sh: Argument list too long
> make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127
> 
> Not quite sure yet why, but I wanted to signal it early as others might be
> impacted.
> 
> Arm and arm64 builds are not impacted.

Hmm, that patch has passed the smoke push gate already, so there likely is
more to it than there being an unconditional issue. I did build-test this
before pushing, and I've just re-tested on a 2nd system without seeing an
issue.

Also please remember to trim your replies.

Jan



[PATCH v6 06/12] IOMMU/x86: prefill newly allocated page tables

2022-06-09 Thread Jan Beulich
Page tables are used for two purposes after allocation: They either
start out all empty, or they are filled to replace a superpage.
Subsequently, to replace all empty or fully contiguous page tables,
contiguous sub-regions will be recorded within individual page tables.
Install the initial set of markers immediately after allocation. Make
sure to retain these markers when further populating a page table in
preparation for it to replace a superpage.

The markers are simply 4-bit fields holding the order value of
contiguous entries. To demonstrate this, if a page table had just 16
entries, this would be the initial (fully contiguous) set of markers:

index  0 1 2 3 4 5 6 7 8 9 A B C D E F
marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0

"Contiguous" here means not only present entries with successively
increasing MFNs, each one suitably aligned for its slot, and identical
attributes, but also a respective number of all non-present (zero except
for the markers) entries.
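(As an aside, not part of this patch: for a fully contiguous table the
marker of each entry is just the number of clear low bits of its index,
with entry 0 holding the base-2 log of the table size. A tiny
standalone snippet reproducing the row above:

#include <stdio.h>

int main(void)
{
    unsigned int i;

    for ( i = 0; i < 16; ++i )
        printf("%u ", i ? __builtin_ctz(i) : 4); /* 4 == log2(16) */
    printf("\n"); /* prints: 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0 */

    return 0;
}

For the real 512-entry tables entry 0 would instead hold 9, i.e.
CONTIG_LEVEL_SHIFT.)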

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
An alternative to the ASSERT()s added to set_iommu_ptes_present() would
be to make the function less general-purpose; it's used in a single
place only after all (i.e. it might as well be folded into its only
caller).

While in VT-d's comment ahead of struct dma_pte I'm adjusting the
description of the high bits, I'd like to note that the description of
some of the lower bits isn't correct either. Yet I don't think adjusting
that belongs here.
---
v6: Use sizeof().
v5: Assert next_mfn is suitably aligned in set_iommu_ptes_present(). Use
CONTIG_LEVEL_SHIFT in favor of PAGE_SHIFT-3.
v4: Add another comment referring to pt-contig-markers.h. Re-base.
v3: Add comments. Re-base.
v2: New.

--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -146,7 +146,8 @@ void iommu_free_domid(domid_t domid, uns
 
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
-struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
+struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd,
+   uint64_t contig_mask);
 void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
--- a/xen/drivers/passthrough/amd/iommu-defs.h
+++ b/xen/drivers/passthrough/amd/iommu-defs.h
@@ -446,11 +446,13 @@ union amd_iommu_x2apic_control {
 #define IOMMU_PAGE_TABLE_U32_PER_ENTRY (IOMMU_PAGE_TABLE_ENTRY_SIZE / 4)
 #define IOMMU_PAGE_TABLE_ALIGNMENT 4096
 
+#define IOMMU_PTE_CONTIG_MASK   0x1e /* The ign0 field below. */
+
 union amd_iommu_pte {
 uint64_t raw;
 struct {
 bool pr:1;
-unsigned int ign0:4;
+unsigned int ign0:4; /* Covered by IOMMU_PTE_CONTIG_MASK. */
 bool a:1;
 bool d:1;
 unsigned int ign1:2;
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -21,6 +21,8 @@
 
 #include "iommu.h"
 
+#include <asm/pt-contig-markers.h>
+
 /* Given pfn and page table level, return pde index */
 static unsigned int pfn_to_pde_idx(unsigned long pfn, unsigned int level)
 {
@@ -113,9 +115,23 @@ static void set_iommu_ptes_present(unsig
 return;
 }
 
+ASSERT(!(next_mfn & (page_sz - 1)));
+
 while ( nr_ptes-- )
 {
-set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
+ASSERT(!pde->next_level);
+ASSERT(!pde->u);
+
+if ( pde > table )
+ASSERT(pde->ign0 == find_first_set_bit(pde - table));
+else
+ASSERT(pde->ign0 == CONTIG_LEVEL_SHIFT);
+
+pde->iw = iw;
+pde->ir = ir;
+pde->fc = true; /* See set_iommu_pde_present(). */
+pde->mfn = next_mfn;
+pde->pr = true;
 
 ++pde;
 next_mfn += page_sz;
@@ -295,7 +311,7 @@ static int iommu_pde_from_dfn(struct dom
 mfn = next_table_mfn;
 
 /* allocate lower level page table */
-table = iommu_alloc_pgtable(hd);
+table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
 if ( table == NULL )
 {
 AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
@@ -325,7 +341,7 @@ static int iommu_pde_from_dfn(struct dom
 
 if ( next_table_mfn == 0 )
 {
-table = iommu_alloc_pgtable(hd);
+table = iommu_alloc_pgtable(hd, IOMMU_PTE_CONTIG_MASK);
 if ( table == NULL )
 {
 AMD_IOMMU_ERROR("cannot allocate I/O page table\n");
@@ -726,7 +742,7 @@ static int fill_qpt(union amd_iommu_pte
  * page table pages, and the resulting allocations are always
  * zeroed.
  */
-pgs[level] = iommu_alloc_pgtable(hd);
+pgs[level] = iommu_alloc_pgtable(hd, 0);
 if ( !pgs[level] )
 {
 rc = -ENOMEM;

[PATCH v6 11/12] IOMMU/x86: add perf counters for page table splitting / coalescing

2022-06-09 Thread Jan Beulich
Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
v3: New.

--- a/xen/arch/x86/include/asm/perfc_defn.h
+++ b/xen/arch/x86/include/asm/perfc_defn.h
@@ -125,4 +125,7 @@ PERFCOUNTER(realmode_exits,  "vmexit
 
 PERFCOUNTER(pauseloop_exits, "vmexits from Pause-Loop Detection")
 
+PERFCOUNTER(iommu_pt_shatters,"IOMMU page table shatters")
+PERFCOUNTER(iommu_pt_coalesces,   "IOMMU page table coalesces")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -345,6 +345,8 @@ static int iommu_pde_from_dfn(struct dom
  level, PTE_kind_table);
 
 *flush_flags |= IOMMU_FLUSHF_modified;
+
+perfc_incr(iommu_pt_shatters);
 }
 
 /* Install lower level page table for non-present entries */
@@ -477,6 +479,7 @@ int cf_check amd_iommu_map_page(
   flags & IOMMUF_readable, &contig);
 *flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all;
 iommu_queue_free_pgtable(hd, pg);
+perfc_incr(iommu_pt_coalesces);
 }
 
 spin_unlock(&hd->arch.mapping_lock);
@@ -543,6 +546,7 @@ int cf_check amd_iommu_unmap_page(
 clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
 *flush_flags |= IOMMU_FLUSHF_all;
 iommu_queue_free_pgtable(hd, pg);
+perfc_incr(iommu_pt_coalesces);
 }
 }
 
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -404,6 +404,8 @@ static uint64_t addr_to_dma_page_maddr(s
 
 if ( flush_flags )
 *flush_flags |= IOMMU_FLUSHF_modified;
+
+perfc_incr(iommu_pt_shatters);
 }
 
 write_atomic(&pte->val, new_pte.val);
@@ -857,6 +859,7 @@ static int dma_pte_clear_one(struct doma
 
 *flush_flags |= IOMMU_FLUSHF_all;
 iommu_queue_free_pgtable(hd, pg);
+perfc_incr(iommu_pt_coalesces);
 }
 
 spin_unlock(&hd->arch.mapping_lock);
@@ -2239,6 +2242,7 @@ static int __must_check cf_check intel_i
 
 *flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all;
 iommu_queue_free_pgtable(hd, pg);
+perfc_incr(iommu_pt_coalesces);
 }
 
 spin_unlock(&hd->arch.mapping_lock);




[PATCH v6 12/12] VT-d: fold dma_pte_clear_one() into its only caller

2022-06-09 Thread Jan Beulich
This way intel_iommu_unmap_page() ends up quite a bit more similar to
intel_iommu_map_page().

No functional change intended.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
v5: Re-base over changes earlier in the series.
v4: New.

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -806,75 +806,6 @@ static void queue_free_pt(struct domain_
 iommu_queue_free_pgtable(hd, mfn_to_page(mfn));
 }
 
-/* clear one page's page table */
-static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
- unsigned int order,
- unsigned int *flush_flags)
-{
-struct domain_iommu *hd = dom_iommu(domain);
-struct dma_pte *page = NULL, *pte = NULL, old;
-u64 pg_maddr;
-unsigned int level = (order / LEVEL_STRIDE) + 1;
-
-spin_lock(&hd->arch.mapping_lock);
-/* get target level pte */
-pg_maddr = addr_to_dma_page_maddr(domain, addr, level, flush_flags, false);
-if ( pg_maddr < PAGE_SIZE )
-{
-spin_unlock(&hd->arch.mapping_lock);
-return pg_maddr ? -ENOMEM : 0;
-}
-
-page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-pte = &page[address_level_offset(addr, level)];
-
-if ( !dma_pte_present(*pte) )
-{
-spin_unlock(&hd->arch.mapping_lock);
-unmap_vtd_domain_page(page);
-return 0;
-}
-
-old = *pte;
-dma_clear_pte(*pte);
-iommu_sync_cache(pte, sizeof(*pte));
-
-while ( pt_update_contig_markers(&page->val,
- address_level_offset(addr, level),
- level, PTE_kind_null) &&
-++level < min_pt_levels )
-{
-struct page_info *pg = maddr_to_page(pg_maddr);
-
-unmap_vtd_domain_page(page);
-
-pg_maddr = addr_to_dma_page_maddr(domain, addr, level, flush_flags,
-  false);
-BUG_ON(pg_maddr < PAGE_SIZE);
-
-page = map_vtd_domain_page(pg_maddr);
-pte = &page[address_level_offset(addr, level)];
-dma_clear_pte(*pte);
-iommu_sync_cache(pte, sizeof(*pte));
-
-*flush_flags |= IOMMU_FLUSHF_all;
-iommu_queue_free_pgtable(hd, pg);
-perfc_incr(iommu_pt_coalesces);
-}
-
-spin_unlock(&hd->arch.mapping_lock);
-
-unmap_vtd_domain_page(page);
-
-*flush_flags |= IOMMU_FLUSHF_modified;
-
-if ( order && !dma_pte_superpage(old) )
-queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)),
-  order / LEVEL_STRIDE);
-
-return 0;
-}
-
 static int iommu_set_root_entry(struct vtd_iommu *iommu)
 {
 u32 sts;
@@ -2264,11 +2195,17 @@ static int __must_check cf_check intel_i
 static int __must_check cf_check intel_iommu_unmap_page(
 struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags)
 {
+struct domain_iommu *hd = dom_iommu(d);
+daddr_t addr = dfn_to_daddr(dfn);
+struct dma_pte *page = NULL, *pte = NULL, old;
+uint64_t pg_maddr;
+unsigned int level = (order / LEVEL_STRIDE) + 1;
+
 /*
  * While really we could unmap at any granularity, for now we assume unmaps
  * are issued by common code only at the same granularity as maps.
  */
-ASSERT((dom_iommu(d)->platform_ops->page_sizes >> order) & PAGE_SIZE_4K);
+ASSERT((hd->platform_ops->page_sizes >> order) & PAGE_SIZE_4K);
 
 /* Do nothing if VT-d shares EPT page table */
 if ( iommu_use_hap_pt(d) )
@@ -2278,7 +2215,62 @@ static int __must_check cf_check intel_i
 if ( iommu_hwdom_passthrough && is_hardware_domain(d) )
 return 0;
 
-return dma_pte_clear_one(d, dfn_to_daddr(dfn), order, flush_flags);
+spin_lock(&hd->arch.mapping_lock);
+/* get target level pte */
+pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false);
+if ( pg_maddr < PAGE_SIZE )
+{
+spin_unlock(&hd->arch.mapping_lock);
+return pg_maddr ? -ENOMEM : 0;
+}
+
+page = map_vtd_domain_page(pg_maddr);
+pte = &page[address_level_offset(addr, level)];
+
+if ( !dma_pte_present(*pte) )
+{
+spin_unlock(&hd->arch.mapping_lock);
+unmap_vtd_domain_page(page);
+return 0;
+}
+
+old = *pte;
+dma_clear_pte(*pte);
+iommu_sync_cache(pte, sizeof(*pte));
+
+while ( pt_update_contig_markers(&page->val,
+ address_level_offset(addr, level),
+ level, PTE_kind_null) &&
+++level < min_pt_levels )
+{
+struct page_info *pg = maddr_to_page(pg_maddr);
+
+unmap_vtd_domain_page(page);
+
+pg_maddr = addr_to_dma_page_maddr(d, addr, level, flush_flags, false);
+BUG_ON(pg_maddr < PAGE_SIZE);
+
+page = map_vtd_domain_page(pg_maddr);
+pte = &page[address_level_offset(addr, level)];
+dma_clear_pte(*pte);
+iommu_sync_cache(pte, sizeof(*pte));

[PATCH v6 02/12] IOMMU/x86: new command line option to suppress use of superpage mappings

2022-06-09 Thread Jan Beulich
Before actually enabling their use, provide a means to suppress it in
case of problems. Note that using the option can also affect the sharing
of page tables in the VT-d / EPT combination: If EPT would use large
page mappings but the option is in effect, page table sharing would be
suppressed (to properly fulfill the admin request).
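Since the new sub-option is a boolean (see the parse_iommu_param()
hunk below), suppression would, assuming the usual parse_boolean()
conventions, be requested on the Xen command line as e.g.

  iommu=no-superpages

or equivalently "iommu=superpages=0".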

Requested-by: Roger Pau Monné 
Signed-off-by: Jan Beulich 
---
v6: New.

--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -1405,7 +1405,7 @@ detection of systems known to misbehave
 
 ### iommu
= List of [ <boolean>, verbose, debug, force, required, quarantine[=scratch-page],
-sharept, intremap, intpost, crash-disable,
+sharept, superpages, intremap, intpost, crash-disable,
 snoop, qinval, igfx, amd-iommu-perdev-intremap,
 dom0-{passthrough,strict} ]
 
@@ -1481,6 +1481,12 @@ boolean (e.g. `iommu=no`) can override t
 
 This option is ignored on ARM, and the pagetables are always shared.
 
+*   The `superpages` boolean controls whether superpage mappings may be used
+in IOMMU page tables.  If using this option is necessary to fix an issue,
+please report a bug.
+
+This option is only valid on x86.
+
 *   The `intremap` boolean controls the Interrupt Remapping sub-feature, and
 is active by default on compatible hardware.  On x86 systems, the first
 generation of IOMMUs only supported DMA remapping, and Interrupt Remapping
--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -132,7 +132,7 @@ extern bool untrusted_msi;
 int pi_update_irte(const struct pi_desc *pi_desc, const struct pirq *pirq,
const uint8_t gvec);
 
-extern bool iommu_non_coherent;
+extern bool iommu_non_coherent, iommu_superpages;
 
 static inline void iommu_sync_cache(const void *addr, unsigned int size)
 {
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -88,6 +88,8 @@ static int __init cf_check parse_iommu_p
 iommu_igfx = val;
 else if ( (val = parse_boolean("qinval", s, ss)) >= 0 )
 iommu_qinval = val;
+else if ( (val = parse_boolean("superpages", s, ss)) >= 0 )
+iommu_superpages = val;
 #endif
 else if ( (val = parse_boolean("verbose", s, ss)) >= 0 )
 iommu_verbose = val;
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2213,7 +2213,8 @@ static bool __init vtd_ept_page_compatib
 if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 ) 
 return false;
 
-return (ept_has_2mb(ept_cap) && opt_hap_2mb) <= cap_sps_2mb(vtd_cap) &&
+return iommu_superpages &&
+   (ept_has_2mb(ept_cap) && opt_hap_2mb) <= cap_sps_2mb(vtd_cap) &&
(ept_has_1gb(ept_cap) && opt_hap_1gb) <= cap_sps_1gb(vtd_cap);
 }
 
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -31,6 +31,7 @@
 const struct iommu_init_ops *__initdata iommu_init_ops;
 struct iommu_ops __ro_after_init iommu_ops;
 bool __read_mostly iommu_non_coherent;
+bool __initdata iommu_superpages = true;
 
 enum iommu_intremap __read_mostly iommu_intremap = iommu_intremap_full;
 
@@ -104,8 +105,13 @@ int __init iommu_hardware_setup(void)
 mask_IO_APIC_setup(ioapic_entries);
 }
 
+if ( !iommu_superpages )
+iommu_ops.page_sizes &= PAGE_SIZE_4K;
+
 rc = iommu_init_ops->setup();
 
+ASSERT(iommu_superpages || iommu_ops.page_sizes == PAGE_SIZE_4K);
+
 if ( ioapic_entries )
 {
 restore_IO_APIC_setup(ioapic_entries, rc);




Re: [PATCH v2] xen: Add MISRA support to cppcheck make rule

2022-06-09 Thread Bertrand Marquis
Hi Jan,

> On 9 Jun 2022, at 11:12, Jan Beulich  wrote:
> 
> On 09.06.2022 11:34, Bertrand Marquis wrote:
>> cppcheck MISRA addon can be used to check for non compliance to some of
>> the MISRA standard rules.
>> 
>> Add a CPPCHECK_MISRA variable that can be set to "y" using make command
>> line to generate a cppcheck report including cppcheck misra checks.
>> 
>> When MISRA checking is enabled, a file with a text description suitable
>> for cppcheck misra addon is generated out of Xen documentation file
>> which lists the rules followed by Xen (docs/misra/rules.rst).
>> 
>> By default MISRA checking is turned off.
>> 
>> While adding cppcheck-misra files to gitignore, also fix the missing /
>> for htmlreport gitignore
>> 
>> Signed-off-by: Bertrand Marquis 
>> ---
>> Changes in v2:
>> - fix missing / for htmlreport
>> - use wildcard for cppcheck-misra remove and gitignore
>> - fix comment in makefile
>> - fix dependencies for generation of json and txt file
>> ---
>> .gitignore | 3 +-
>> xen/Makefile | 29 ++-
>> xen/tools/convert_misra_doc.py | 139 +
>> 3 files changed, 168 insertions(+), 3 deletions(-)
>> create mode 100755 xen/tools/convert_misra_doc.py
>> 
>> diff --git a/.gitignore b/.gitignore
>> index 18ef56a780..b106caa7a9 100644
>> --- a/.gitignore
>> +++ b/.gitignore
>> @@ -297,6 +297,7 @@ xen/.banner
>> xen/.config
>> xen/.config.old
>> xen/.xen.elf32
>> +xen/cppcheck-misra.*
> 
> As said on v1, this wants to be added further down, while ...
> 
>> xen/xen-cppcheck.xml
> 
> ... this line wants moving down at this occasion or in a separate
> change.
> 
>> xen/System.map
>> xen/arch/x86/boot/mkelf32
>> @@ -318,7 +319,7 @@ xen/arch/*/efi/runtime.c
>> xen/arch/*/include/asm/asm-offsets.h
>> xen/common/config_data.S
>> xen/common/config.gz
>> -xen/cppcheck-htmlreport
>> +xen/cppcheck-htmlreport/
>> xen/include/headers*.chk
>> xen/include/compat/*
>> xen/include/config/
> 
> xen/cppcheck-misra.* wants to go alongside the line you adjust, while
> xen/xen-cppcheck.xml belongs yet further down.

Sorry I forgot that part in my v2.
I will do all the fixes, including the xen-cppcheck.xml one, in a v3 shortly.

Cheers
Bertrand




[PATCH v6 10/12] VT-d: replace all-contiguous page tables by superpage mappings

2022-06-09 Thread Jan Beulich
When a page table ends up with all contiguous entries (including all
identical attributes), it can be replaced by a superpage entry at the
next higher level. The page table itself can then be scheduled for
freeing.

The adjustment to LEVEL_MASK is merely to avoid leaving a latent trap
for whenever we (and obviously hardware) start supporting 512G mappings:
at that point the plain int constant would get shifted by more than 31
bits, which is undefined behaviour, whereas PTE_NUM - 1UL keeps the
computation in unsigned long.

Note that cache sync-ing is likely more strict than necessary. This is
both to be on the safe side as well as to maintain the pattern of all
updates of (potentially) live tables being accompanied by a flush (if so
needed).

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
Unlike the freeing of all-empty page tables, this causes quite a bit of
back and forth for PV domains, due to their mapping/unmapping of pages
when they get converted to/from being page tables. It may therefore be
worth considering delaying re-coalescing a little, to avoid doing so
when the superpage would otherwise get split again pretty soon. But I
think this would better be the subject of a separate change anyway.

Of course this could also be helped by more "aware" kernel side
behavior: They could avoid immediately mapping freed page tables
writable again, in anticipation of re-using that same page for another
page table elsewhere.
---
v4: Re-base over changes earlier in the series.
v3: New.

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -2211,14 +2211,35 @@ static int __must_check cf_check intel_i
  * While the (ab)use of PTE_kind_table here allows to save some work in
  * the function, the main motivation for it is that it avoids a so far
  * unexplained hang during boot (while preparing Dom0) on a Westmere
- * based laptop.
+ * based laptop.  This also has the intended effect of terminating the
+ * loop when super pages aren't supported anymore at the next level.
  */
-pt_update_contig_markers(&page->val,
- address_level_offset(dfn_to_daddr(dfn), level),
- level,
- (hd->platform_ops->page_sizes &
-  (1UL << level_to_offset_bits(level + 1))
-  ? PTE_kind_leaf : PTE_kind_table));
+while ( pt_update_contig_markers(&page->val,
+ address_level_offset(dfn_to_daddr(dfn), level),
+ level,
+ (hd->platform_ops->page_sizes &
+  (1UL << level_to_offset_bits(level + 1))
+   ? PTE_kind_leaf : PTE_kind_table)) )
+{
+struct page_info *pg = maddr_to_page(pg_maddr);
+
+unmap_vtd_domain_page(page);
+
+new.val &= ~(LEVEL_MASK << level_to_offset_bits(level));
+dma_set_pte_superpage(new);
+
+pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), ++level,
+  flush_flags, false);
+BUG_ON(pg_maddr < PAGE_SIZE);
+
+page = map_vtd_domain_page(pg_maddr);
+pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
+*pte = new;
+iommu_sync_cache(pte, sizeof(*pte));
+
+*flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all;
+iommu_queue_free_pgtable(hd, pg);
+}
 
 spin_unlock(&hd->arch.mapping_lock);
 unmap_vtd_domain_page(page);
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -232,7 +232,7 @@ struct context_entry {
 
 /* page table handling */
 #define LEVEL_STRIDE   (9)
-#define LEVEL_MASK ((1 << LEVEL_STRIDE) - 1)
+#define LEVEL_MASK (PTE_NUM - 1UL)
 #define PTE_NUM(1 << LEVEL_STRIDE)
 #define level_to_agaw(val) ((val) - 2)
 #define agaw_to_level(val) ((val) + 2)




[PATCH v6 09/12] AMD/IOMMU: replace all-contiguous page tables by superpage mappings

2022-06-09 Thread Jan Beulich
When a page table ends up with all contiguous entries (including all
identical attributes), it can be replaced by a superpage entry at the
next higher level. The page table itself can then be scheduled for
freeing.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
---
Unlike the freeing of all-empty page tables, this causes quite a bit of
back and forth for PV domains, due to their mapping/unmapping of pages
when they get converted to/from being page tables. It may therefore be
worth considering delaying re-coalescing a little, to avoid doing so
when the superpage would otherwise get split again pretty soon. But I
think this would better be the subject of a separate change anyway.

Of course this could also be helped by more "aware" kernel side
behavior: They could avoid immediately mapping freed page tables
writable again, in anticipation of re-using that same page for another
page table elsewhere.
---
v4: Re-base over changes earlier in the series.
v3: New.

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -81,7 +81,8 @@ static union amd_iommu_pte set_iommu_pte
  unsigned long dfn,
  unsigned long next_mfn,
  unsigned int level,
- bool iw, bool ir)
+ bool iw, bool ir,
+ bool *contig)
 {
 union amd_iommu_pte *table, *pde, old;
 
@@ -94,11 +95,15 @@ static union amd_iommu_pte set_iommu_pte
  old.iw != iw || old.ir != ir )
 {
 set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
-pt_update_contig_markers(&table->raw, pfn_to_pde_idx(dfn, level),
- level, PTE_kind_leaf);
+*contig = pt_update_contig_markers(&table->raw,
+   pfn_to_pde_idx(dfn, level),
+   level, PTE_kind_leaf);
 }
 else
+{
 old.pr = false; /* signal "no change" to the caller */
+*contig = false;
+}
 
 unmap_domain_page(table);
 
@@ -409,6 +414,7 @@ int cf_check amd_iommu_map_page(
 {
 struct domain_iommu *hd = dom_iommu(d);
 unsigned int level = (IOMMUF_order(flags) / PTE_PER_TABLE_SHIFT) + 1;
+bool contig;
 int rc;
 unsigned long pt_mfn = 0;
 union amd_iommu_pte old;
@@ -452,8 +458,26 @@ int cf_check amd_iommu_map_page(
 
 /* Install mapping */
 old = set_iommu_pte_present(pt_mfn, dfn_x(dfn), mfn_x(mfn), level,
-(flags & IOMMUF_writable),
-(flags & IOMMUF_readable));
+flags & IOMMUF_writable,
+flags & IOMMUF_readable, &contig);
+
+while ( unlikely(contig) && ++level < hd->arch.amd.paging_mode )
+{
+struct page_info *pg = mfn_to_page(_mfn(pt_mfn));
+unsigned long next_mfn;
+
+if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn, flush_flags,
+false) )
+BUG();
+BUG_ON(!pt_mfn);
+
+next_mfn = mfn_x(mfn) & (~0UL << (PTE_PER_TABLE_SHIFT * (level - 1)));
+set_iommu_pte_present(pt_mfn, dfn_x(dfn), next_mfn, level,
+  flags & IOMMUF_writable,
+  flags & IOMMUF_readable, &contig);
+*flush_flags |= IOMMU_FLUSHF_modified | IOMMU_FLUSHF_all;
+iommu_queue_free_pgtable(hd, pg);
+}
 
 spin_unlock(&hd->arch.mapping_lock);
 




[PATCH v6 08/12] VT-d: free all-empty page tables

2022-06-09 Thread Jan Beulich
When a page table ends up with no present entries left, it can be
replaced by a non-present entry at the next higher level. The page table
itself can then be scheduled for freeing.

Note that while its output isn't used there yet,
pt_update_contig_markers() right away needs to be called in all places
where entries get updated, not just the one where entries get cleared.

Note further that while pt_update_contig_markers() updates perhaps
several PTEs within the table, since these are changes to "avail" bits
only I do not think that cache flushing would be needed afterwards. Such
cache flushing (of entire pages, unless adding yet more logic to be more
selective) would be quite noticeable performance-wise (very prominent
during Dom0 boot).

Also note that cache sync-ing is likely more strict than necessary. This
is both to be on the safe side as well as to maintain the pattern of all
updates of (potentially) live tables being accompanied by a flush (if so
needed).

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
v4: Re-base over changes earlier in the series.
v3: Properly bound loop. Re-base over changes earlier in the series.
v2: New.
---
The hang during boot on my Latitude E6410 (see the respective code
comment) was pretty close after iommu_enable_translation(). No errors,
no watchdog would kick in, just sometimes the first few pixel lines of
the next log message's (XEN) prefix would have made it out to the screen
(and there's no serial there). It took a lot of experimenting until I
figured out the workaround (which I consider ugly, but halfway acceptable).
I've been trying hard to make sure the workaround wouldn't be masking a
real issue, yet I'm still wary of it possibly doing so ... My best guess
at this point is that on these old IOMMUs the ignored bits 52...61
aren't really ignored for present entries, but also aren't "reserved"
enough to trigger faults. This guess is from having tried to set other
bits in this range (unconditionally, and with the workaround here in
place), which yielded the same behavior.

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -43,6 +43,9 @@
 #include "vtd.h"
 #include "../ats.h"
 
+#define CONTIG_MASK DMA_PTE_CONTIG_MASK
+#include <asm/pt-contig-markers.h>
+
 /* dom_io is used as a sentinel for quarantined devices */
 #define QUARANTINE_SKIP(d, pgd_maddr) ((d) == dom_io && !(pgd_maddr))
 #define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
@@ -405,6 +408,9 @@ static uint64_t addr_to_dma_page_maddr(s
 
 write_atomic(&pte->val, new_pte.val);
 iommu_sync_cache(pte, sizeof(struct dma_pte));
+pt_update_contig_markers(&parent->val,
+ address_level_offset(addr, level),
+ level, PTE_kind_table);
 }
 
 if ( --level == target )
@@ -829,9 +835,31 @@ static int dma_pte_clear_one(struct doma
 
 old = *pte;
 dma_clear_pte(*pte);
+iommu_sync_cache(pte, sizeof(*pte));
+
+while ( pt_update_contig_markers(&page->val,
+ address_level_offset(addr, level),
+ level, PTE_kind_null) &&
+++level < min_pt_levels )
+{
+struct page_info *pg = maddr_to_page(pg_maddr);
+
+unmap_vtd_domain_page(page);
+
+pg_maddr = addr_to_dma_page_maddr(domain, addr, level, flush_flags,
+  false);
+BUG_ON(pg_maddr < PAGE_SIZE);
+
+page = map_vtd_domain_page(pg_maddr);
+pte = &page[address_level_offset(addr, level)];
+dma_clear_pte(*pte);
+iommu_sync_cache(pte, sizeof(*pte));
+
+*flush_flags |= IOMMU_FLUSHF_all;
+iommu_queue_free_pgtable(hd, pg);
+}
 
 spin_unlock(&hd->arch.mapping_lock);
-iommu_sync_cache(pte, sizeof(struct dma_pte));
 
 unmap_vtd_domain_page(page);
 
@@ -2177,8 +2205,21 @@ static int __must_check cf_check intel_i
 }
 
 *pte = new;
-
 iommu_sync_cache(pte, sizeof(struct dma_pte));
+
+/*
+ * While the (ab)use of PTE_kind_table here allows to save some work in
+ * the function, the main motivation for it is that it avoids a so far
+ * unexplained hang during boot (while preparing Dom0) on a Westmere
+ * based laptop.
+ */
+pt_update_contig_markers(&page->val,
+ address_level_offset(dfn_to_daddr(dfn), level),
+ level,
+ (hd->platform_ops->page_sizes &
+  (1UL << level_to_offset_bits(level + 1))
+  ? PTE_kind_leaf : PTE_kind_table));
+
 spin_unlock(&hd->arch.mapping_lock);
 unmap_vtd_domain_page(page);
 




[PATCH v6 07/12] AMD/IOMMU: free all-empty page tables

2022-06-09 Thread Jan Beulich
When a page table ends up with no present entries left, it can be
replaced by a non-present entry at the next higher level. The page table
itself can then be scheduled for freeing.

Note that while its output isn't used there yet,
pt_update_contig_markers() right away needs to be called in all places
where entries get updated, not just the one where entries get cleared.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
---
v5: Re-base over changes earlier in the series.
v4: Re-base over changes earlier in the series.
v3: Re-base over changes earlier in the series.
v2: New.

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -21,6 +21,7 @@
 
 #include "iommu.h"
 
+#define CONTIG_MASK IOMMU_PTE_CONTIG_MASK
 #include <asm/pt-contig-markers.h>
 
 /* Given pfn and page table level, return pde index */
@@ -35,16 +36,20 @@ static unsigned int pfn_to_pde_idx(unsig
 
 static union amd_iommu_pte clear_iommu_pte_present(unsigned long l1_mfn,
unsigned long dfn,
-   unsigned int level)
+   unsigned int level,
+   bool *free)
 {
 union amd_iommu_pte *table, *pte, old;
+unsigned int idx = pfn_to_pde_idx(dfn, level);
 
 table = map_domain_page(_mfn(l1_mfn));
-pte = &table[pfn_to_pde_idx(dfn, level)];
+pte = &table[idx];
 old = *pte;
 
 write_atomic(&pte->raw, 0);
 
+*free = pt_update_contig_markers(&table->raw, idx, level, PTE_kind_null);
+
 unmap_domain_page(table);
 
 return old;
@@ -87,7 +92,11 @@ static union amd_iommu_pte set_iommu_pte
 if ( !old.pr || old.next_level ||
  old.mfn != next_mfn ||
  old.iw != iw || old.ir != ir )
+{
 set_iommu_pde_present(pde, next_mfn, 0, iw, ir);
+pt_update_contig_markers(&table->raw, pfn_to_pde_idx(dfn, level),
+ level, PTE_kind_leaf);
+}
 else
 old.pr = false; /* signal "no change" to the caller */
 
@@ -326,6 +335,9 @@ static int iommu_pde_from_dfn(struct dom
 smp_wmb();
 set_iommu_pde_present(pde, next_table_mfn, next_level, true,
   true);
+pt_update_contig_markers(&next_table_vaddr->raw,
+ pfn_to_pde_idx(dfn, level),
+ level, PTE_kind_table);
 
 *flush_flags |= IOMMU_FLUSHF_modified;
 }
@@ -351,6 +363,9 @@ static int iommu_pde_from_dfn(struct dom
 next_table_mfn = mfn_x(page_to_mfn(table));
 set_iommu_pde_present(pde, next_table_mfn, next_level, true,
   true);
+pt_update_contig_markers(&next_table_vaddr->raw,
+ pfn_to_pde_idx(dfn, level),
+ level, PTE_kind_table);
 }
 else /* should never reach here */
 {
@@ -487,8 +502,24 @@ int cf_check amd_iommu_unmap_page(
 
 if ( pt_mfn )
 {
+bool free;
+
 /* Mark PTE as 'page not present'. */
-old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level);
+old = clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
+
+while ( unlikely(free) && ++level < hd->arch.amd.paging_mode )
+{
+struct page_info *pg = mfn_to_page(_mfn(pt_mfn));
+
+if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn,
+flush_flags, false) )
+BUG();
+BUG_ON(!pt_mfn);
+
+clear_iommu_pte_present(pt_mfn, dfn_x(dfn), level, &free);
+*flush_flags |= IOMMU_FLUSHF_all;
+iommu_queue_free_pgtable(hd, pg);
+}
 }
 
 spin_unlock(&hd->arch.mapping_lock);




[PATCH v6 05/12] x86: introduce helper for recording degree of contiguity in page tables

2022-06-09 Thread Jan Beulich
This is a re-usable helper (kind of a template) which gets introduced
without users so that the individual subsequent patches introducing such
users can get committed independently of one another.

See the comment at the top of the new file. To demonstrate the effect,
if a page table had just 16 entries, this would be the set of markers
for a page table with fully contiguous mappings:

index  0 1 2 3 4 5 6 7 8 9 A B C D E F
marker 4 0 1 0 2 0 1 0 3 0 1 0 2 0 1 0

"Contiguous" here means not only present entries with successively
increasing MFNs, each one suitably aligned for its slot, but also a
respective number of all non-present entries.

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
---
v5: Bail early from step 1 if possible. Arrange for consumers who are
just after CONTIG_{LEVEL_SHIFT,NR}. Extend comment.
v3: Rename function and header. Introduce IS_CONTIG().
v2: New.

--- /dev/null
+++ b/xen/arch/x86/include/asm/pt-contig-markers.h
@@ -0,0 +1,110 @@
+#ifndef __ASM_X86_PT_CONTIG_MARKERS_H
+#define __ASM_X86_PT_CONTIG_MARKERS_H
+
+/*
+ * Short of having function templates in C, the function defined below is
+ * intended to be used by multiple parties interested in recording the
+ * degree of contiguity in mappings by a single page table.
+ *
+ * Scheme: Every entry records the order of contiguous successive entries,
+ * up to the maximum order covered by that entry (which is the number of
+ * clear low bits in its index, with entry 0 being the exception using
+ * the base-2 logarithm of the number of entries in a single page table).
+ * While a few entries need touching upon update, knowing whether the
+ * table is fully contiguous (and can hence be replaced by a higher level
+ * leaf entry) is then possible by simply looking at entry 0's marker.
+ *
+ * Prereqs:
+ * - CONTIG_MASK needs to be #define-d, to a value having at least 4
+ *   contiguous bits (ignored by hardware), before including this file (or
+ *   else only CONTIG_LEVEL_SHIFT and CONTIG_NR will become available),
+ * - page tables to be passed to the helper need to be initialized with
+ *   correct markers,
+ * - not-present entries need to be entirely clear except for the marker.
+ */
+
+/* This is the same for all anticipated users, so doesn't need passing in. */
+#define CONTIG_LEVEL_SHIFT 9
+#define CONTIG_NR  (1 << CONTIG_LEVEL_SHIFT)
+
+#ifdef CONTIG_MASK
+
+#include <xen/bitops.h>
+#include <xen/lib.h>
+#include <xen/page-size.h>
+
+#define GET_MARKER(e) MASK_EXTR(e, CONTIG_MASK)
+#define SET_MARKER(e, m) \
+((void)((e) = ((e) & ~CONTIG_MASK) | MASK_INSR(m, CONTIG_MASK)))
+
+#define IS_CONTIG(kind, pt, i, idx, shift, b) \
+((kind) == PTE_kind_leaf \
+ ? (((pt)[i] ^ (pt)[idx]) & ~CONTIG_MASK) == (1ULL << ((b) + (shift))) \
+ : !((pt)[i] & ~CONTIG_MASK))
+
+enum PTE_kind {
+PTE_kind_null,
+PTE_kind_leaf,
+PTE_kind_table,
+};
+
+static bool pt_update_contig_markers(uint64_t *pt, unsigned int idx,
+ unsigned int level, enum PTE_kind kind)
+{
+unsigned int b, i = idx;
+unsigned int shift = (level - 1) * CONTIG_LEVEL_SHIFT + PAGE_SHIFT;
+
+ASSERT(idx < CONTIG_NR);
+ASSERT(!(pt[idx] & CONTIG_MASK));
+
+/* Step 1: Reduce markers in lower numbered entries. */
+while ( i )
+{
+b = find_first_set_bit(i);
+i &= ~(1U << b);
+if ( GET_MARKER(pt[i]) <= b )
+break;
+SET_MARKER(pt[i], b);
+}
+
+/* An intermediate table is never contiguous with anything. */
+if ( kind == PTE_kind_table )
+return false;
+
+/*
+ * Present entries need in-sync index and address to be a candidate
+ * for being contiguous: What we're after is whether ultimately the
+ * intermediate table can be replaced by a superpage.
+ */
+if ( kind != PTE_kind_null &&
+ idx != ((pt[idx] >> shift) & (CONTIG_NR - 1)) )
+return false;
+
+/* Step 2: Check higher numbered entries for contiguity. */
+for ( b = 0; b < CONTIG_LEVEL_SHIFT && !(idx & (1U << b)); ++b )
+{
+i = idx | (1U << b);
+if ( !IS_CONTIG(kind, pt, i, idx, shift, b) || GET_MARKER(pt[i]) != b )
+break;
+}
+
+/* Step 3: Update markers in this and lower numbered entries. */
+for ( ; SET_MARKER(pt[idx], b), b < CONTIG_LEVEL_SHIFT; ++b )
+{
+i = idx ^ (1U << b);
+if ( !IS_CONTIG(kind, pt, i, idx, shift, b) || GET_MARKER(pt[i]) != b )
+break;
+idx &= ~(1U << b);
+}
+
+return b == CONTIG_LEVEL_SHIFT;
+}
+
+#undef IS_CONTIG
+#undef SET_MARKER
+#undef GET_MARKER
+#undef CONTIG_MASK
+
+#endif /* CONTIG_MASK */
+
+#endif /* __ASM_X86_PT_CONTIG_MARKERS_H */
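To illustrate the intended consumption pattern (a hypothetical sketch
only; the real users arrive in subsequent patches, setting CONTIG_MASK
to IOMMU_PTE_CONTIG_MASK resp. DMA_PTE_CONTIG_MASK):

/* A PTE format where e.g. bits 1..4 are ignored by hardware: */
#define CONTIG_MASK 0x1e
#include <asm/pt-contig-markers.h>

    /* When clearing an entry (marker included, as per the ASSERT()): */
    write_atomic(&pt[idx], 0);
    if ( pt_update_contig_markers(pt, idx, level, PTE_kind_null) )
        /* Table is now all empty: it can be replaced by a non-present
           entry at the next higher level and be queued for freeing. */;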




[PATCH v6 04/12] VT-d: allow use of superpage mappings

2022-06-09 Thread Jan Beulich
... depending on feature availability (and absence of quirks).

Also make the page table dumping function aware of superpages.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
Reviewed-by: Roger Pau Monné 
---
v6: Re-base over addition of "iommu=no-superpages" command line option.
v5: In intel_iommu_{,un}map_page() assert page order is supported.
v4: Change type of queue_free_pt()'s 1st parameter. Re-base.
v3: Rename queue_free_pt()'s last parameter. Replace "level > 1" checks
where possible. Tighten assertion.

--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -779,18 +779,37 @@ static int __must_check cf_check iommu_f
 return ret;
 }
 
+static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level)
+{
+if ( level > 1 )
+{
+struct dma_pte *pt = map_domain_page(mfn);
+unsigned int i;
+
+for ( i = 0; i < PTE_NUM; ++i )
+if ( dma_pte_present(pt[i]) && !dma_pte_superpage(pt[i]) )
+queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(pt[i])),
+  level - 1);
+
+unmap_domain_page(pt);
+}
+
+iommu_queue_free_pgtable(hd, mfn_to_page(mfn));
+}
+
 /* clear one page's page table */
 static int dma_pte_clear_one(struct domain *domain, daddr_t addr,
  unsigned int order,
  unsigned int *flush_flags)
 {
 struct domain_iommu *hd = dom_iommu(domain);
-struct dma_pte *page = NULL, *pte = NULL;
+struct dma_pte *page = NULL, *pte = NULL, old;
 u64 pg_maddr;
+unsigned int level = (order / LEVEL_STRIDE) + 1;
 
 spin_lock(&hd->arch.mapping_lock);
-/* get last level pte */
-pg_maddr = addr_to_dma_page_maddr(domain, addr, 1, flush_flags, false);
+/* get target level pte */
+pg_maddr = addr_to_dma_page_maddr(domain, addr, level, flush_flags, false);
 if ( pg_maddr < PAGE_SIZE )
 {
 spin_unlock(&hd->arch.mapping_lock);
@@ -798,7 +817,7 @@ static int dma_pte_clear_one(struct doma
 }
 
 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-pte = page + address_level_offset(addr, 1);
+pte = &page[address_level_offset(addr, level)];
 
 if ( !dma_pte_present(*pte) )
 {
@@ -807,14 +826,20 @@ static int dma_pte_clear_one(struct doma
 return 0;
 }
 
+old = *pte;
 dma_clear_pte(*pte);
-*flush_flags |= IOMMU_FLUSHF_modified;
 
 spin_unlock(&hd->arch.mapping_lock);
 iommu_sync_cache(pte, sizeof(struct dma_pte));
 
 unmap_vtd_domain_page(page);
 
+*flush_flags |= IOMMU_FLUSHF_modified;
+
+if ( order && !dma_pte_superpage(old) )
+queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)),
+  order / LEVEL_STRIDE);
+
 return 0;
 }
 
@@ -2092,8 +2117,12 @@ static int __must_check cf_check intel_i
 struct domain_iommu *hd = dom_iommu(d);
 struct dma_pte *page, *pte, old, new = {};
 u64 pg_maddr;
+unsigned int level = (IOMMUF_order(flags) / LEVEL_STRIDE) + 1;
 int rc = 0;
 
+ASSERT((hd->platform_ops->page_sizes >> IOMMUF_order(flags)) &
+   PAGE_SIZE_4K);
+
 /* Do nothing if VT-d shares EPT page table */
 if ( iommu_use_hap_pt(d) )
 return 0;
@@ -2116,7 +2145,7 @@ static int __must_check cf_check intel_i
 return 0;
 }
 
-pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), 1, flush_flags,
+pg_maddr = addr_to_dma_page_maddr(d, dfn_to_daddr(dfn), level, flush_flags,
   true);
 if ( pg_maddr < PAGE_SIZE )
 {
@@ -2125,13 +2154,15 @@ static int __must_check cf_check intel_i
 }
 
 page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
-pte = &page[dfn_x(dfn) & LEVEL_MASK];
+pte = &page[address_level_offset(dfn_to_daddr(dfn), level)];
 old = *pte;
 
 dma_set_pte_addr(new, mfn_to_maddr(mfn));
 dma_set_pte_prot(new,
  ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
  ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
+if ( IOMMUF_order(flags) )
+dma_set_pte_superpage(new);
 
 /* Set the SNP on leaf page table if Snoop Control available */
 if ( iommu_snoop )
@@ -2152,14 +2183,26 @@ static int __must_check cf_check intel_i
 
 *flush_flags |= IOMMU_FLUSHF_added;
 if ( dma_pte_present(old) )
+{
 *flush_flags |= IOMMU_FLUSHF_modified;
 
+if ( IOMMUF_order(flags) && !dma_pte_superpage(old) )
+queue_free_pt(hd, maddr_to_mfn(dma_pte_addr(old)),
+  IOMMUF_order(flags) / LEVEL_STRIDE);
+}
+
 return rc;
 }
 
 static int __must_check cf_check intel_iommu_unmap_page(
 struct domain *d, dfn_t dfn, unsigned int order, unsigned int *flush_flags)
 {
+/*
+ * While really we could unmap at any granularity, for now we assume unmaps
+ * are issued by common code only at the same granularity as maps.
+ */
+ASSERT((dom_iommu(d)->platform_ops->page_sizes >> order) & PAGE_SIZE_4K);

[PATCH v6 03/12] AMD/IOMMU: allow use of superpage mappings

2022-06-09 Thread Jan Beulich
No separate feature flags exist which would control availability of
these; the only restriction is HATS (establishing the maximum number of
page table levels in general), and even that has a lower bound of 4.
Thus we can unconditionally announce 2M and 1G mappings. (Via non-
default page sizes the implementation in principle permits arbitrary
size mappings, but these require multiple identical leaf PTEs to be
written, which isn't all that different from having to write multiple
consecutive PTEs with increasing frame numbers. IMO that's therefore
beneficial only on hardware where suitable TLBs exist; I'm unaware of
such hardware.)

Note that in principle 512G and 256T mappings could also be supported
right away, but the freeing of page tables (to be introduced in
subsequent patches) when replacing a sufficiently populated tree with a
single huge page would need suitable preemption, which will require
extra work.
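For reference, with 512-entry page tables each level covers
PTE_PER_TABLE_SHIFT == 9 further address bits, so the
(order / PTE_PER_TABLE_SHIFT) + 1 computation used below works out as:

order  0 (4k) -> level 1
order  9 (2M) -> level 2
order 18 (1G) -> level 3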

Signed-off-by: Jan Beulich 
Reviewed-by: Roger Pau Monné 
---
v5: Drop PAGE_SIZE_512G. In amd_iommu_{,un}map_page() assert page order
is supported.
v4: Change type of queue_free_pt()'s 1st parameter. Re-base.
v3: Rename queue_free_pt()'s last parameter. Replace "level > 1" checks
where possible.

--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -32,12 +32,13 @@ static unsigned int pfn_to_pde_idx(unsig
 }
 
 static union amd_iommu_pte clear_iommu_pte_present(unsigned long l1_mfn,
-   unsigned long dfn)
+   unsigned long dfn,
+   unsigned int level)
 {
 union amd_iommu_pte *table, *pte, old;
 
 table = map_domain_page(_mfn(l1_mfn));
-pte = &table[pfn_to_pde_idx(dfn, 1)];
+pte = &table[pfn_to_pde_idx(dfn, level)];
 old = *pte;
 
 write_atomic(&pte->raw, 0);
@@ -351,15 +352,39 @@ static int iommu_pde_from_dfn(struct dom
 return 0;
 }
 
+static void queue_free_pt(struct domain_iommu *hd, mfn_t mfn, unsigned int level)
+{
+if ( level > 1 )
+{
+union amd_iommu_pte *pt = map_domain_page(mfn);
+unsigned int i;
+
+for ( i = 0; i < PTE_PER_TABLE_SIZE; ++i )
+if ( pt[i].pr && pt[i].next_level )
+{
+ASSERT(pt[i].next_level < level);
+queue_free_pt(hd, _mfn(pt[i].mfn), pt[i].next_level);
+}
+
+unmap_domain_page(pt);
+}
+
+iommu_queue_free_pgtable(hd, mfn_to_page(mfn));
+}
+
 int cf_check amd_iommu_map_page(
 struct domain *d, dfn_t dfn, mfn_t mfn, unsigned int flags,
 unsigned int *flush_flags)
 {
 struct domain_iommu *hd = dom_iommu(d);
+unsigned int level = (IOMMUF_order(flags) / PTE_PER_TABLE_SHIFT) + 1;
 int rc;
 unsigned long pt_mfn = 0;
 union amd_iommu_pte old;
 
+ASSERT((hd->platform_ops->page_sizes >> IOMMUF_order(flags)) &
+   PAGE_SIZE_4K);
+
 spin_lock(&hd->arch.mapping_lock);
 
 /*
@@ -384,7 +409,7 @@ int cf_check amd_iommu_map_page(
 return rc;
 }
 
-if ( iommu_pde_from_dfn(d, dfn_x(dfn), 1, &pt_mfn, flush_flags, true) ||
+if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn, flush_flags, true) ||
  !pt_mfn )
 {
 spin_unlock(&hd->arch.mapping_lock);
@@ -394,8 +419,8 @@ int cf_check amd_iommu_map_page(
 return -EFAULT;
 }
 
-/* Install 4k mapping */
-old = set_iommu_pte_present(pt_mfn, dfn_x(dfn), mfn_x(mfn), 1,
+/* Install mapping */
+old = set_iommu_pte_present(pt_mfn, dfn_x(dfn), mfn_x(mfn), level,
 (flags & IOMMUF_writable),
 (flags & IOMMUF_readable));
 
@@ -403,8 +428,13 @@ int cf_check amd_iommu_map_page(
 
 *flush_flags |= IOMMU_FLUSHF_added;
 if ( old.pr )
+{
 *flush_flags |= IOMMU_FLUSHF_modified;
 
+if ( IOMMUF_order(flags) && old.next_level )
+queue_free_pt(hd, _mfn(old.mfn), old.next_level);
+}
+
 return 0;
 }
 
@@ -413,8 +443,15 @@ int cf_check amd_iommu_unmap_page(
 {
 unsigned long pt_mfn = 0;
 struct domain_iommu *hd = dom_iommu(d);
+unsigned int level = (order / PTE_PER_TABLE_SHIFT) + 1;
 union amd_iommu_pte old = {};
 
+/*
+ * While really we could unmap at any granularity, for now we assume unmaps
+ * are issued by common code only at the same granularity as maps.
+ */
+ASSERT((hd->platform_ops->page_sizes >> order) & PAGE_SIZE_4K);
+
 spin_lock(&hd->arch.mapping_lock);
 
 if ( !hd->arch.amd.root_table )
@@ -423,7 +460,7 @@ int cf_check amd_iommu_unmap_page(
 return 0;
 }
 
-if ( iommu_pde_from_dfn(d, dfn_x(dfn), 1, &pt_mfn, flush_flags, false) )
+if ( iommu_pde_from_dfn(d, dfn_x(dfn), level, &pt_mfn, flush_flags, false) )
)
 {
 spin_unlock(&hd->arch.mapping_lock);
 AMD_IOMMU_ERROR("invalid IO pagetable entry dfn = %"PRI_dfn"\n",
@@ -43

Re: [XEN PATCH 1/4] build: xen/include: use if_changed

2022-06-09 Thread Bertrand Marquis
Hi Anthony,

> On 1 Jun 2022, at 17:59, Anthony PERARD  wrote:
> 
> Use "define" for the headers*_chk commands as otherwise the "#"
> is interpreted as a comment and make can't find the end of
> $(foreach,).
> 
> Adding several .PRECIOUS as without them `make` deletes the
> intermediate targets. This is an issue because the macro $(if_changed,)
> checks whether the target exists in order to decide whether to recreate the
> target.
> 
> Removing the call to `mkdir` from the commands. Those aren't needed
> anymore because a rune in Rules.mk creates the directory for each
> $(targets).
> 
> Remove "export PYTHON" as it is already exported.

With this change, compiling for x86 is now ending up in:
CHK include/headers99.chk
make[9]: execvp: /bin/sh: Argument list too long
make[9]: *** [include/Makefile:181: include/headers++.chk] Error 127

Not quite sure yet why, but I wanted to signal it early as others might be
impacted.

Arm and arm64 builds are not impacted.

Cheers
Bertrand

> 
> Signed-off-by: Anthony PERARD 
> ---
> xen/include/Makefile | 108 ++-
> 1 file changed, 76 insertions(+), 32 deletions(-)
> 
> diff --git a/xen/include/Makefile b/xen/include/Makefile
> index 03baf10efb..6d9bcc19b0 100644
> --- a/xen/include/Makefile
> +++ b/xen/include/Makefile
> @@ -45,38 +45,65 @@ public-$(CONFIG_ARM) := $(wildcard 
> $(srcdir)/public/arch-arm/*.h $(srcdir)/publi
> .PHONY: all
> all: $(addprefix $(obj)/,$(headers-y))
> 
> -$(obj)/compat/%.h: $(obj)/compat/%.i $(srcdir)/Makefile 
> $(srctree)/tools/compat-build-header.py
> - $(PYTHON) $(srctree)/tools/compat-build-header.py <$< $(patsubst 
> $(obj)/%,%,$@) >>$@.new; \
> - mv -f $@.new $@
> -
> -$(obj)/compat/%.i: $(obj)/compat/%.c $(srcdir)/Makefile
> - $(CPP) $(filter-out -Wa$(comma)% -include 
> %/include/xen/config.h,$(XEN_CFLAGS)) $(cppflags-y) -o $@ $<
> -
> -$(obj)/compat/%.c: $(src)/public/%.h $(srcdir)/xlat.lst $(srcdir)/Makefile 
> $(srctree)/tools/compat-build-source.py
> - mkdir -p $(@D)
> - $(PYTHON) $(srctree)/tools/compat-build-source.py $(srcdir)/xlat.lst 
> <$< >$@.new
> - mv -f $@.new $@
> -
> -$(obj)/compat/.xlat/%.h: $(obj)/compat/%.h $(obj)/compat/.xlat/%.lst 
> $(srctree)/tools/get-fields.sh $(srcdir)/Makefile
> - export PYTHON=$(PYTHON); \
> - while read what name; do \
> - $(SHELL) $(srctree)/tools/get-fields.sh "$$what" compat_$$name 
> $< || exit $$?; \
> - done <$(patsubst $(obj)/compat/%,$(obj)/compat/.xlat/%,$(basename 
> $<)).lst >$@.new
> - mv -f $@.new $@
> +quiet_cmd_compat_h = GEN $@
> +cmd_compat_h = \
> +$(PYTHON) $(srctree)/tools/compat-build-header.py <$< $(patsubst 
> $(obj)/%,%,$@) >>$@.new; \
> +mv -f $@.new $@
> +
> +quiet_cmd_compat_i = CPP $@
> +cmd_compat_i = $(CPP) $(filter-out -Wa$(comma)% -include 
> %/include/xen/config.h,$(XEN_CFLAGS)) $(cppflags-y) -o $@ $<
> +
> +quiet_cmd_compat_c = GEN $@
> +cmd_compat_c = \
> +   $(PYTHON) $(srctree)/tools/compat-build-source.py $(srcdir)/xlat.lst <$< 
> >$@.new; \
> +   mv -f $@.new $@
> +
> +quiet_cmd_xlat_headers = GEN $@
> +cmd_xlat_headers = \
> +while read what name; do \
> +$(SHELL) $(srctree)/tools/get-fields.sh "$$what" compat_$$name $< || 
> exit $$?; \
> +done <$(patsubst $(obj)/compat/%,$(obj)/compat/.xlat/%,$(basename 
> $<)).lst >$@.new; \
> +mv -f $@.new $@
> +
> +targets += $(headers-y)
> +$(obj)/compat/%.h: $(obj)/compat/%.i $(srctree)/tools/compat-build-header.py 
> FORCE
> + $(call if_changed,compat_h)
> +
> +.PRECIOUS: $(obj)/compat/%.i
> +targets += $(patsubst %.h, %.i, $(headers-y))
> +$(obj)/compat/%.i: $(obj)/compat/%.c FORCE
> + $(call if_changed,compat_i)
> +
> +.PRECIOUS: $(obj)/compat/%.c
> +targets += $(patsubst %.h, %.c, $(headers-y))
> +$(obj)/compat/%.c: $(src)/public/%.h $(srcdir)/xlat.lst 
> $(srctree)/tools/compat-build-source.py FORCE
> + $(call if_changed,compat_c)
> +
> +targets += $(patsubst compat/%, compat/.xlat/%, $(headers-y))
> +$(obj)/compat/.xlat/%.h: $(obj)/compat/%.h $(obj)/compat/.xlat/%.lst 
> $(srctree)/tools/get-fields.sh FORCE
> + $(call if_changed,xlat_headers)
> +
> +quiet_cmd_xlat_lst = GEN $@
> +cmd_xlat_lst = \
> + grep -v '^[[:blank:]]*$(pound)' $< | sed -ne 
> 's,@arch@,$(compat-arch-y),g' -re 's,[[:blank:]]+$*\.h[[:blank:]]*$$,,p' 
> >$@.new; \
> + $(call move-if-changed,$@.new,$@)
> 
> .PRECIOUS: $(obj)/compat/.xlat/%.lst
> -$(obj)/compat/.xlat/%.lst: $(srcdir)/xlat.lst $(srcdir)/Makefile
> - mkdir -p $(@D)
> - grep -v '^[[:blank:]]*#' $< | sed -ne 's,@arch@,$(compat-arch-y),g' -re 
> 's,[[:blank:]]+$*\.h[[:blank:]]*$$,,p' >$@.new
> - $(call move-if-changed,$@.new,$@)
> +targets += $(patsubst compat/%.h, compat/.xlat/%.lst, $(headers-y))
> +$(obj)/compat/.xlat/%.lst: $(srcdir)/xlat.lst FORCE
> + $(call if_changed,xlat_lst)
> 
> xlat-y := $(shell sed -ne 's,@arch@,$(compat-arch-y),g' -re 
> 's,^[?!][[:blank:]]+[^[:blank:]]+[[:blank:]]+,,p' $(

[PATCH v6 01/12] IOMMU/x86: support freeing of pagetables

2022-06-09 Thread Jan Beulich
For vendor specific code to support superpages we need to be able to
deal with a superpage mapping replacing an intermediate page table (or
hierarchy thereof). Consequently an iommu_alloc_pgtable() counterpart is
needed to free individual page tables while a domain is still alive.
Since the freeing needs to be deferred until after a suitable IOTLB
flush was performed, released page tables get queued for processing by a
tasklet.

Signed-off-by: Jan Beulich 
---
I was considering whether to use a softirq-tasklet instead. This would
have the benefit of avoiding extra scheduling operations, but come with
the risk of the freeing happening prematurely because of a
process_pending_softirqs() somewhere.
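
As a purely illustrative aside (not part of the patch): the intended call
pattern for a vendor driver replacing an intermediate table with a
superpage would be roughly the sketch below. Only iommu_queue_free_pgtable()
comes from this patch; the entry type and the other names are hypothetical.

/*
 * Hypothetical caller: install a superpage entry over an existing
 * intermediate table, then queue the old table for deferred freeing.
 */
static void set_superpage_entry(struct domain_iommu *hd, uint64_t *pte,
                                uint64_t new_val, struct page_info *old_pt)
{
    write_atomic(pte, new_val);          /* replace the non-leaf entry */
    /*
     * The old table is freed by the per-CPU tasklet, i.e. only after the
     * necessary IOTLB flush is assumed to have happened.
     */
    iommu_queue_free_pgtable(hd, old_pt);
}
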
---
v6: Extend comment on the use of process_pending_softirqs().
v5: Fix CPU_UP_PREPARE for BIGMEM. Schedule tasklet in CPU_DOWN_FAILED
when list is not empty. Skip all processing in CPU_DEAD when list is
empty.
v4: Change type of iommu_queue_free_pgtable()'s 1st parameter. Re-base.
v3: Call process_pending_softirqs() from free_queued_pgtables().

--- a/xen/arch/x86/include/asm/iommu.h
+++ b/xen/arch/x86/include/asm/iommu.h
@@ -147,6 +147,7 @@ void iommu_free_domid(domid_t domid, uns
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
 struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
 /*
--- a/xen/drivers/passthrough/x86/iommu.c
+++ b/xen/drivers/passthrough/x86/iommu.c
@@ -12,6 +12,7 @@
  * this program; If not, see .
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -551,6 +552,103 @@ struct page_info *iommu_alloc_pgtable(st
 return pg;
 }
 
+/*
+ * Intermediate page tables which get replaced by large pages may only be
+ * freed after a suitable IOTLB flush. Hence such pages get queued on a
+ * per-CPU list, with a per-CPU tasklet processing the list on the assumption
+ * that the necessary IOTLB flush will have occurred by the time tasklets get
+ * to run. (List and tasklet being per-CPU has the benefit of accesses not
+ * requiring any locking.)
+ */
+static DEFINE_PER_CPU(struct page_list_head, free_pgt_list);
+static DEFINE_PER_CPU(struct tasklet, free_pgt_tasklet);
+
+static void free_queued_pgtables(void *arg)
+{
+struct page_list_head *list = arg;
+struct page_info *pg;
+unsigned int done = 0;
+
+while ( (pg = page_list_remove_head(list)) )
+{
+free_domheap_page(pg);
+
+/*
+ * Just to be on the safe side, check for processing softirqs every
+ * once in a while.  Generally it is expected that parties queuing
+ * pages for freeing will find a need for preemption before too many
+ * pages can be queued.  Granularity of checking is somewhat arbitrary.
+ */
+if ( !(++done & 0x1ff) )
+ process_pending_softirqs();
+}
+}
+
+void iommu_queue_free_pgtable(struct domain_iommu *hd, struct page_info *pg)
+{
+unsigned int cpu = smp_processor_id();
+
+spin_lock(&hd->arch.pgtables.lock);
+page_list_del(pg, &hd->arch.pgtables.list);
+spin_unlock(&hd->arch.pgtables.lock);
+
+page_list_add_tail(pg, &per_cpu(free_pgt_list, cpu));
+
+tasklet_schedule(&per_cpu(free_pgt_tasklet, cpu));
+}
+
+static int cf_check cpu_callback(
+struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+unsigned int cpu = (unsigned long)hcpu;
+struct page_list_head *list = &per_cpu(free_pgt_list, cpu);
+struct tasklet *tasklet = &per_cpu(free_pgt_tasklet, cpu);
+
+switch ( action )
+{
+case CPU_DOWN_PREPARE:
+tasklet_kill(tasklet);
+break;
+
+case CPU_DEAD:
+if ( !page_list_empty(list) )
+{
+page_list_splice(list, &this_cpu(free_pgt_list));
+INIT_PAGE_LIST_HEAD(list);
+tasklet_schedule(&this_cpu(free_pgt_tasklet));
+}
+break;
+
+case CPU_UP_PREPARE:
+INIT_PAGE_LIST_HEAD(list);
+fallthrough;
+case CPU_DOWN_FAILED:
+tasklet_init(tasklet, free_queued_pgtables, list);
+if ( !page_list_empty(list) )
+tasklet_schedule(tasklet);
+break;
+}
+
+return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_nfb = {
+.notifier_call = cpu_callback,
+};
+
+static int __init cf_check bsp_init(void)
+{
+if ( iommu_enabled )
+{
+cpu_callback(&cpu_nfb, CPU_UP_PREPARE,
+ (void *)(unsigned long)smp_processor_id());
+register_cpu_notifier(&cpu_nfb);
+}
+
+return 0;
+}
+presmp_initcall(bsp_init);
+
 bool arch_iommu_use_permitted(const struct domain *d)
 {
 /*




[PATCH v6 00/12] IOMMU: superpage support when not sharing pagetables

2022-06-09 Thread Jan Beulich
For a long time we've been rather inefficient with IOMMU page table
management when not sharing page tables, i.e. in particular for PV (and
further specifically also for PV Dom0) and AMD (where nowadays we never
share page tables). While up to about 3.5 years ago AMD code had logic
to un-shatter page mappings, that logic was ripped out for being buggy
(XSA-275 plus follow-on).

This series enables use of large pages in AMD and Intel (VT-d) code;
Arm is presently not in need of any enabling as pagetables are always
shared there. It also augments PV Dom0 creation with suitable explicit
IOMMU mapping calls to facilitate use of large pages there. Depending
on the amount of memory handed to Dom0 this improves booting time
(latency until Dom0 actually starts) quite a bit; subsequent shattering
of some of the large pages may of course consume some of the saved time.

Known fallout has been spelled out here:
https://lists.xen.org/archives/html/xen-devel/2021-08/msg00781.html

I'm inclined to say "of course" there are also a few seemingly unrelated
changes included here, which I just came to consider necessary or at
least desirable (in part for having been in need of adjustment for a
long time) along the way.

See individual patches for details on the v6 changes.

01: IOMMU/x86: support freeing of pagetables
02: IOMMU/x86: new command line option to suppress use of superpage mappings
03: AMD/IOMMU: allow use of superpage mappings
04: VT-d: allow use of superpage mappings
05: x86: introduce helper for recording degree of contiguity in page tables
06: IOMMU/x86: prefill newly allocate page tables
07: AMD/IOMMU: free all-empty page tables
08: VT-d: free all-empty page tables
09: AMD/IOMMU: replace all-contiguous page tables by superpage mappings
10: VT-d: replace all-contiguous page tables by superpage mappings
11: IOMMU/x86: add perf counters for page table splitting / coalescing
12: VT-d: fold dma_pte_clear_one() into its only caller

While not directly related (except that making this mode work properly
here was a fair part of the overall work), at this occasion I'd also
like to renew my proposal to make "iommu=dom0-strict" the default going
forward. It already is not only the default, but the only possible mode
for PVH Dom0.

Jan



Re: [PATCH v2] xen: Add MISRA support to cppcheck make rule

2022-06-09 Thread Jan Beulich
On 09.06.2022 11:34, Bertrand Marquis wrote:
> The cppcheck MISRA addon can be used to check for non-compliance with
> some of the MISRA standard rules.
> 
> Add a CPPCHECK_MISRA variable that can be set to "y" using make command
> line to generate a cppcheck report including cppcheck misra checks.
> 
> When MISRA checking is enabled, a file with a text description suitable
> for the cppcheck misra addon is generated out of the Xen documentation
> file which lists the rules followed by Xen (docs/misra/rules.rst).
> 
> By default MISRA checking is turned off.
> 
> While adding the cppcheck-misra files to gitignore, also fix the missing /
> for the htmlreport gitignore entry.
> 
> Signed-off-by: Bertrand Marquis 
> ---
> Changes in v2:
> - fix missing / for htmlreport
> - use wildcard for cppcheck-misra remove and gitignore
> - fix comment in makefile
> - fix dependencies for generation of json and txt file
> ---
>  .gitignore |   3 +-
>  xen/Makefile   |  29 ++-
>  xen/tools/convert_misra_doc.py | 139 +
>  3 files changed, 168 insertions(+), 3 deletions(-)
>  create mode 100755 xen/tools/convert_misra_doc.py
> 
> diff --git a/.gitignore b/.gitignore
> index 18ef56a780..b106caa7a9 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -297,6 +297,7 @@ xen/.banner
>  xen/.config
>  xen/.config.old
>  xen/.xen.elf32
> +xen/cppcheck-misra.*

As said on v1, this wants to be added further down, while ...

>  xen/xen-cppcheck.xml

... this line wants moving down at this occasion or in a separate
change.

>  xen/System.map
>  xen/arch/x86/boot/mkelf32
> @@ -318,7 +319,7 @@ xen/arch/*/efi/runtime.c
>  xen/arch/*/include/asm/asm-offsets.h
>  xen/common/config_data.S
>  xen/common/config.gz
> -xen/cppcheck-htmlreport
> +xen/cppcheck-htmlreport/
>  xen/include/headers*.chk
>  xen/include/compat/*
>  xen/include/config/

xen/cppcheck-misra.* wants to go alongside the line you adjust, while
xen/xen-cppcheck.xml belongs yet further down.

Jan



Re: [PATCH v2 3/3] x86/vmx: implement Notify VM Exit

2022-06-09 Thread Roger Pau Monné
On Thu, Jun 09, 2022 at 03:39:33PM +0800, Xiaoyao Li wrote:
> On 6/9/2022 3:04 PM, Tian, Kevin wrote:
> > +Chenyi/Xiaoyao who worked on the KVM support. Presumably
> > similar opens have been discussed in KVM hence they have the
> > right background to comment here.
> > 
> > > From: Roger Pau Monne 
> > > Sent: Thursday, May 26, 2022 7:12 PM
> > > 
> > > Under certain conditions guests can get the CPU stuck in an unbounded
> > > loop without the possibility of an interrupt window to occur on
> > > instruction boundary.  This was the case with the scenarios described
> > > in XSA-156.
> > > 
> > > Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
> > > if no interrupt window occurs for a specified amount of time.  Note
> > > that using the Notify VM Exit avoids having to trap #AC and #DB
> > > exceptions, as Xen is guaranteed to get a VM Exit even if the guest
> > > puts the CPU in a loop without an interrupt window, as such disable
> > > the intercepts if the feature is available and enabled.
> > > 
> > > Setting the notify VM exit window to 0 is safe because there's a
> > > threshold added by the hardware in order to have a sane window value.
> > > 
> > > Suggested-by: Andrew Cooper 
> > > Signed-off-by: Roger Pau Monné 
> > > ---
> > > Changes since v1:
> > >   - Properly update debug state when using notify VM exit.
> > >   - Reword commit message.
> > > ---
> > > This change enables the notify VM exit by default, KVM however doesn't
> > > seem to enable it by default, and there's the following note in the
> > > commit message:
> > > 
> > > "- There's a possibility, however small, that a notify VM exit happens
> > > with VM_CONTEXT_INVALID set in exit qualification. In this case, the
> > > vcpu can no longer run. To avoid killing a well-behaved guest, set
> > > notify window as -1 to disable this feature by default."
> > > 
> > > It's not obviously clear to me whether the comment was meant to be:
> > > "There's a possibility, however small, that a notify VM exit _wrongly_
> > > happens with VM_CONTEXT_INVALID".
> > > 
> > > It's also not clear whether such wrong hardware behavior only affects
> > > a specific set of hardware,
> 
> I'm not sure what you mean by a specific set of hardware.
> 
> We make it default off in KVM just in case future silicon wrongly sets
> the VM_CONTEXT_INVALID bit, because we make the policy that the VM cannot
> continue running in that case.
> 
> In the worst case, if some future silicon happens to have this kind of
> silly bug, then existing product kernels all suffer the possibility of
> their VMs being killed because the feature is on by default.

That's IMO a weird policy.  If there's such behavior in any hardware
platform I would assume Intel would issue an erratum, and then we would
just avoid using the feature on affected hardware (like we do with
other hardware features when they have errata).

If we applied the same logic to all new Intel features we wouldn't use
any of them.  At least in Xen there are already combinations of vmexit
conditions that will lead to the guest being killed.
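
For context, the policy being debated boils down to a handler of roughly
this shape (an illustrative sketch, not necessarily the patch's exact
code; "v" is the current vCPU and the constant names are approximations):

case EXIT_REASON_NOTIFY:
{
    unsigned long qual;

    __vmread(EXIT_QUALIFICATION, &qual);
    /* Only a corrupt VM context is fatal; otherwise the guest resumes. */
    if ( unlikely(qual & NOTIFY_VM_CONTEXT_INVALID) )
        domain_crash(v->domain);
    break;
}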

> > > in a way that we could avoid enabling
> > > notify VM exit there.
> > > 
> > > There's a discussion in one of the Linux patches that 128K might be
> > > the safer value in order to prevent false positives, but I have no
> > > formal confirmation about this.  Maybe our Intel maintainers can
> > > provide some more feedback on a suitable notify VM exit window
> > > value.
> 
> The 128k is the internal threshold for SPR silicon. The internal threshold
> is tuned by Intel for each silicon, to make sure it's big enough to avoid
> false positives even when the user sets vmcs.notify_window to 0.
> 
> However, it varies for different processor generations.
> 
> Which value is suitable is hard to say; it depends on how soon the VMM
> wants to intercept the VM. Anyway, Intel ensures that even value 0 is safe.

Ideally we need a fixed default value that's guaranteed to work on all
possible hardware that supports the feature, or alternatively a way to
calculate a sane default window based on the hardware platform.

Could we get some wording added to the ISE regarding 0 being a
suitable default value to use because hardware will add a threshold
internally to make the value safe?

Thanks, Roger.



[PATCH v2] xen: Add MISRA support to cppcheck make rule

2022-06-09 Thread Bertrand Marquis
The cppcheck MISRA addon can be used to check for non-compliance with
some of the MISRA standard rules.

Add a CPPCHECK_MISRA variable that can be set to "y" using make command
line to generate a cppcheck report including cppcheck misra checks.

When MISRA checking is enabled, a file with a text description suitable
for the cppcheck misra addon is generated out of the Xen documentation
file which lists the rules followed by Xen (docs/misra/rules.rst).

By default MISRA checking is turned off.

While adding the cppcheck-misra files to gitignore, also fix the
missing / for the htmlreport gitignore entry.

Signed-off-by: Bertrand Marquis 
---
Changes in v2:
- fix missing / for htmlreport
- use wildcard for cppcheck-misra remove and gitignore
- fix comment in makefile
- fix dependencies for generation of json and txt file
---
 .gitignore |   3 +-
 xen/Makefile   |  29 ++-
 xen/tools/convert_misra_doc.py | 139 +
 3 files changed, 168 insertions(+), 3 deletions(-)
 create mode 100755 xen/tools/convert_misra_doc.py

diff --git a/.gitignore b/.gitignore
index 18ef56a780..b106caa7a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -297,6 +297,7 @@ xen/.banner
 xen/.config
 xen/.config.old
 xen/.xen.elf32
+xen/cppcheck-misra.*
 xen/xen-cppcheck.xml
 xen/System.map
 xen/arch/x86/boot/mkelf32
@@ -318,7 +319,7 @@ xen/arch/*/efi/runtime.c
 xen/arch/*/include/asm/asm-offsets.h
 xen/common/config_data.S
 xen/common/config.gz
-xen/cppcheck-htmlreport
+xen/cppcheck-htmlreport/
 xen/include/headers*.chk
 xen/include/compat/*
 xen/include/config/
diff --git a/xen/Makefile b/xen/Makefile
index 82f5310b12..a4dce29efd 100644
--- a/xen/Makefile
+++ b/xen/Makefile
@@ -548,7 +548,7 @@ _clean:
rm -f include/asm $(TARGET) $(TARGET).gz $(TARGET).efi 
$(TARGET).efi.map $(TARGET)-syms $(TARGET)-syms.map
rm -f asm-offsets.s arch/*/include/asm/asm-offsets.h
rm -f .banner .allconfig.tmp include/xen/compile.h
-   rm -f xen-cppcheck.xml
+   rm -f cppcheck-misra.* xen-cppcheck.xml
 
 .PHONY: _distclean
 _distclean: clean
@@ -642,6 +642,10 @@ CPPCHECK_HTMLREPORT ?= cppcheck-htmlreport
 # build directory. This can be changed by giving a directory in this variable.
 CPPCHECK_HTMLREPORT_OUTDIR ?= cppcheck-htmlreport
 
+# By default we do not check misra rules, to enable pass "CPPCHECK_MISRA=y" to
+# make command line.
+CPPCHECK_MISRA ?= n
+
 # Compile flags to pass to cppcheck:
 # - include directories and defines Xen Makefile is passing (from CFLAGS)
 # - include config.h as this is passed directly to the compiler.
@@ -666,6 +670,15 @@ CPPCHECKFILES := $(wildcard $(patsubst 
$(objtree)/%.o,$(srctree)/%.c, \
  $(filter-out $(objtree)/tools/%, \
 $(shell find $(objtree) -name "*.o"))))
 
+# Headers and files required to run cppcheck on a file
+CPPCHECKDEPS := $(objtree)/include/generated/autoconf.h \
+$(objtree)/include/generated/compiler-def.h
+
+ifeq ($(CPPCHECK_MISRA),y)
+CPPCHECKFLAGS += --addon=cppcheck-misra.json
+CPPCHECKDEPS += cppcheck-misra.json
+endif
+
 quiet_cmd_cppcheck_xml = CPPCHECK $(patsubst $(srctree)/%,%,$<)
 cmd_cppcheck_xml = $(CPPCHECK) -v -q --xml $(CPPCHECKFLAGS) \
--output-file=$@ $<
@@ -690,7 +703,7 @@ ifeq ($(CPPCHECKFILES),)
 endif
$(call if_changed,merge_cppcheck_reports)
 
-$(objtree)/%.c.cppcheck: $(srctree)/%.c 
$(objtree)/include/generated/autoconf.h 
$(objtree)/include/generated/compiler-def.h | cppcheck-version
+$(objtree)/%.c.cppcheck: $(srctree)/%.c $(CPPCHECKDEPS) | cppcheck-version
$(call if_changed,cppcheck_xml)
 
 cppcheck-version:
@@ -703,6 +716,18 @@ cppcheck-version:
exit 1; \
fi
 
+# List of Misra rules to respect is written inside a doc.
+# In order to have some helpful text in the cppcheck output, generate a text
+# file containing the rules identifier, classification and text from the Xen
+# documentation file. Also generate a json file with the right arguments for
+# cppcheck in json format including the list of rules to ignore.
+#
+cppcheck-misra.txt: $(XEN_ROOT)/docs/misra/rules.rst 
$(srctree)/tools/convert_misra_doc.py
+   $(Q)$(srctree)/tools/convert_misra_doc.py -i $< -o $@ -j $(@:.txt=.json)
+
+# convert_misra_doc is generating both files.
+cppcheck-misra.json: cppcheck-misra.txt
+
 # Put this in generated headers this way it is cleaned by include/Makefile
 $(objtree)/include/generated/compiler-def.h:
$(Q)$(CC) -dM -E -o $@ - < /dev/null
diff --git a/xen/tools/convert_misra_doc.py b/xen/tools/convert_misra_doc.py
new file mode 100755
index 00..47133a33a6
--- /dev/null
+++ b/xen/tools/convert_misra_doc.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python
+
+"""
+This script is converting the misra documentation RST file into a text file
+that can be used as text-rules for cppcheck.
+Usage:
+convert_misra_doc.py -i INPUT [-o OUTPUT] [-j JSON]
+
+INPUT  - RST file containing the list of misra 

Re: [PATCH v6 2/9] xen: do not free reserved memory into heap

2022-06-09 Thread Julien Grall

Hi,

On 09/06/2022 06:54, Penny Zheng wrote:




-Original Message-
From: Julien Grall 
Sent: Tuesday, June 7, 2022 5:13 PM
To: Penny Zheng ; xen-devel@lists.xenproject.org
Cc: Wei Chen ; Stefano Stabellini
; Bertrand Marquis ;
Volodymyr Babchuk ; Andrew Cooper
; George Dunlap ;
Jan Beulich ; Wei Liu 
Subject: Re: [PATCH v6 2/9] xen: do not free reserved memory into heap

Hi Penny,



Hi Julien


On 07/06/2022 08:30, Penny Zheng wrote:

Pages used as guest RAM for a static domain shall be reserved to this
domain only.
So in case reserved pages are used for another purpose, users shall
not free them back to the heap, even when the last ref gets dropped.

free_staticmem_pages will be called by free_heap_pages at runtime when
a static domain frees memory, so let's drop the __init flag.

Signed-off-by: Penny Zheng 
---
v6 changes:
- adapt to PGC_static
- remove #ifdef aroud function declaration
---
v5 changes:
- In order to avoid stub functions, we #define PGC_staticmem to
non-zero only when CONFIG_STATIC_MEMORY
- use "unlikely()" around pg->count_info & PGC_staticmem
- remove pointless "if", since mark_page_free() is going to set
count_info to PGC_state_free and by consequence clear PGC_staticmem
- move #define PGC_staticmem 0 to mm.h
---
v4 changes:
- no changes
---
v3 changes:
- fix possible racy issue in free_staticmem_pages()
- introduce a stub free_staticmem_pages() for the
!CONFIG_STATIC_MEMORY case
- move the change to free_heap_pages() to cover other potential call
sites
- fix the indentation
---
v2 changes:
- new commit
---
   xen/arch/arm/include/asm/mm.h |  4 +++-
   xen/common/page_alloc.c   | 12 +---
   xen/include/xen/mm.h  |  2 --
   3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/xen/arch/arm/include/asm/mm.h
b/xen/arch/arm/include/asm/mm.h index fbff11c468..7442893e77 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -108,9 +108,11 @@ struct page_info
 /* Page is Xen heap? */
   #define _PGC_xen_heap PG_shift(2)
   #define PGC_xen_heap  PG_mask(1, 2)
-  /* Page is static memory */


NITpicking: You added this comment in patch #1 and are now removing the
space. Any reason to drop the space?


+#ifdef CONFIG_STATIC_MEMORY


I think this change ought to be explained in the commit message. AFAIU, this is
necessary to allow the compiler to remove code and avoid linking issues. Is
that correct?


+/* Page is static memory */
   #define _PGC_staticPG_shift(3)
   #define PGC_static PG_mask(1, 3)
+#endif
   /* ... */
   /* Page is broken? */
   #define _PGC_broken   PG_shift(7)
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index
9e5c757847..6876869fa6 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1443,6 +1443,13 @@ static void free_heap_pages(

   ASSERT(order <= MAX_ORDER);

+if ( unlikely(pg->count_info & PGC_static) )
+{
+/* Pages of static memory shall not go back to the heap. */
+free_staticmem_pages(pg, 1UL << order, need_scrub);

I can't remember whether I asked this before (I couldn't find a thread).

free_staticmem_pages() doesn't seem to be protected by any lock. So how do
you prevent the concurrent access to the page info with the acquire part?


True, last time you suggested that rsv_page_list needs to be protected
with a spinlock (much like d->page_alloc_lock). I hadn't thought it
through thoroughly, sorry about that.
So for the freeing part, I shall take the lock in arch_free_heap_page(),
where we insert the page into the rsv_page_list, and release the lock at
the end of free_staticmem_pages().


In general, a lock is better locked/unlocked in the same function
because that is easier to verify. However, I am not sure that extending
the locking from d->page_alloc_lock up after free_heap_pages() is right.


The first reason being that they are other callers of free_heap_pages(). 
So now all the callers of the helpers would need to know whether they 
need to help d->page_alloc_lock.


Secondly, free_staticmem_pages() is meant to be the reverse of
prepare_staticmem_pages(). We should prevent both of them from being
called concurrently. It sounds strange to use the d->page_alloc_lock to
protect it (a page technically doesn't belong to a domain at this point).


To me it looks like we want to add the pages to the rsv_page_list
*after* the pages have been freed. That way we know that all the pages
on that list have been marked as freed (i.e. free_staticmem_pages()
completed).


In addition to that, we would need the code in free_staticmem_pages()
to be protected by the heap_lock (at least so it matches
prepare_staticmem_pages()).
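
(A minimal sketch of that suggestion, assuming the existing heap_lock,
mark_page_free() and scrub_one_page() helpers in page_alloc.c; not a
complete implementation:)

void free_staticmem_pages(struct page_info *pg, unsigned long nr_mfns,
                          bool need_scrub)
{
    unsigned long i;

    /* Match prepare_staticmem_pages(): update page state under heap_lock. */
    spin_lock(&heap_lock);
    for ( i = 0; i < nr_mfns; i++ )
    {
        mark_page_free(&pg[i], mfn_add(page_to_mfn(pg), i));
        if ( need_scrub )
            scrub_one_page(&pg[i]);
    }
    spin_unlock(&heap_lock);
}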


Any thoughts?

Cheers,

--
Julien Grall



[PATCH 0/2] xen/mm: Optimize init_heap_pages()

2022-06-09 Thread Julien Grall
From: Julien Grall 

Hi all,

As part of the Live-Update work, we noticed that a big part of Xen boot
is spent adding pages to the heap. For instance, when running Xen in a
nested environment on a c5.metal, it takes ~1.5s.

This small series reworks init_heap_pages() to give the pages
to free_heap_pages() in chunks rather than one by one.

With this approach, the time spent to init the heap is down
to 166 ms in the setup mentioned above.

There is potentially one more optimization that would further reduce
the time spent: the new approach accesses the page information multiple
times, in separate loops that can potentially be large.

Cheers,

Hongyan Xia (1):
  xen/heap: pass order to free_heap_pages() in heap init

Julien Grall (1):
  xen/heap: Split init_heap_pages() in two

 xen/common/page_alloc.c | 109 ++--
 1 file changed, 82 insertions(+), 27 deletions(-)

-- 
2.32.0




[PATCH 1/2] xen/heap: Split init_heap_pages() in two

2022-06-09 Thread Julien Grall
From: Julien Grall 

At the moment, init_heap_pages() will call free_heap_pages() page
by page. To reduce the time to initialize the heap, we will want
to provide multiple pages at the same time.

init_heap_pages() is now split into two parts:
- init_heap_pages(): will break down the range into multiple sets
  of contiguous pages. For now, the criterion is that the pages should
  belong to the same NUMA node.
- init_contig_heap_pages(): will initialize a set of contiguous pages.
  For now the pages are still passed one by one to free_heap_pages().

Note that the comment before init_heap_pages() is heavily outdated and
does not reflect the current code. So update it.

This patch is a merge/rework of patches from David Woodhouse and
Hongyan Xia.

Signed-off-by: Julien Grall 



Interestingly, I was expecting this patch to perform worse. However,
from testing there is a small increase in perf.

That said, I split the patch because it keeps refactoring and
optimization separated.
---
 xen/common/page_alloc.c | 82 +++--
 1 file changed, 55 insertions(+), 27 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 3e6504283f1e..a1938df1406c 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1778,16 +1778,55 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
 }
 
 /*
- * Hand the specified arbitrary page range to the specified heap zone
- * checking the node_id of the previous page.  If they differ and the
- * latter is not on a MAX_ORDER boundary, then we reserve the page by
- * not freeing it to the buddy allocator.
+ * init_contig_heap_pages() is intended to only take pages from the same
+ * NUMA node.
  */
+static bool is_contig_page(struct page_info *pg, unsigned int nid)
+{
+return (nid == (phys_to_nid(page_to_maddr(pg))));
+}
+
+/*
+ * This function should only be called with valid pages from the same NUMA
+ * node.
+ *
+ * Callers should use is_contig_page() first to check if all the pages
+ * in a range are contiguous.
+ */
+static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
+   bool need_scrub)
+{
+unsigned long s, e;
+unsigned int nid = phys_to_nid(page_to_maddr(pg));
+
+s = mfn_x(page_to_mfn(pg));
+e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
+if ( unlikely(!avail[nid]) )
+{
+bool use_tail = !(s & ((1UL << MAX_ORDER) - 1)) &&
+(find_first_set_bit(e) <= find_first_set_bit(s));
+unsigned long n;
+
+n = init_node_heap(nid, s, nr_pages, &use_tail);
+BUG_ON(n > nr_pages);
+if ( use_tail )
+e -= n;
+else
+s += n;
+}
+
+while ( s < e )
+{
+free_heap_pages(mfn_to_page(_mfn(s)), 0, need_scrub);
+s += 1UL;
+}
+}
+
 static void init_heap_pages(
 struct page_info *pg, unsigned long nr_pages)
 {
 unsigned long i;
-bool idle_scrub = false;
+bool need_scrub = scrub_debug;
 
 /*
  * Keep MFN 0 away from the buddy allocator to avoid crossing zone
@@ -1812,35 +1851,24 @@ static void init_heap_pages(
 spin_unlock(&heap_lock);
 
 if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
-idle_scrub = true;
+need_scrub = true;
 
-for ( i = 0; i < nr_pages; i++ )
+for ( i = 0; i < nr_pages; )
 {
-unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
+unsigned int nid = phys_to_nid(page_to_maddr(pg));
+unsigned long left = nr_pages - i;
+unsigned long contig_pages;
 
-if ( unlikely(!avail[nid]) )
+for ( contig_pages = 1; contig_pages < left; contig_pages++ )
 {
-unsigned long s = mfn_x(page_to_mfn(pg + i));
-unsigned long e = mfn_x(mfn_add(page_to_mfn(pg + nr_pages - 1), 1));
-bool use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
-!(s & ((1UL << MAX_ORDER) - 1)) &&
-(find_first_set_bit(e) <= find_first_set_bit(s));
-unsigned long n;
-
-n = init_node_heap(nid, mfn_x(page_to_mfn(pg + i)), nr_pages - i,
-   &use_tail);
-BUG_ON(i + n > nr_pages);
-if ( n && !use_tail )
-{
-i += n - 1;
-continue;
-}
-if ( i + n == nr_pages )
+if ( !is_contig_page(pg + contig_pages, nid) )
 break;
-nr_pages -= n;
 }
 
-free_heap_pages(pg + i, 0, scrub_debug || idle_scrub);
+init_contig_heap_pages(pg, contig_pages, need_scrub);
+
+pg += contig_pages;
+i += contig_pages;
 }
 }
 
-- 
2.32.0




[PATCH 2/2] xen/heap: pass order to free_heap_pages() in heap init

2022-06-09 Thread Julien Grall
From: Hongyan Xia 

The idea is to split the range into multiple aligned power-of-2 regions,
each needing only a single free_heap_pages() call. We check the least
significant set bit of the start address and use its bit index as the
order of this increment. This makes sure that each increment is both
power-of-2 and properly aligned, which can be safely passed to
free_heap_pages(). Of course, the order also needs to be sanity checked
against the upper bound and MAX_ORDER.
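
(To make the rule concrete, here is a stand-alone sketch, not part of the
patch, that walks a range the same way; the GCC builtins stand in for
Xen's flsl()/ffsl():)

#include <stdio.h>

#define MAX_ORDER 18 /* illustrative bound */

int main(void)
{
    unsigned long s = 0x3, e = 0x10; /* example range of page frames */

    while ( s < e )
    {
        /* Largest chunk that fits: highest set bit of the remainder... */
        int order = 63 - __builtin_clzl(e - s);      /* flsl(e - s) - 1 */

        if ( order > MAX_ORDER )
            order = MAX_ORDER;
        /* ...further limited by the alignment of s (its lowest set bit). */
        if ( s && __builtin_ctzl(s) < order )
            order = __builtin_ctzl(s);               /* ffsl(s) - 1 */

        printf("free %#lx..%#lx (order %d)\n", s, s + (1UL << order) - 1,
               order);
        s += 1UL << order;
    }

    return 0;
}

For s = 0x3 and e = 0x10 this frees chunks of order 0, 2 and 3, exactly
the aligned power-of-2 decomposition described above.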

Testing on a nested environment on c5.metal with various amount
of RAM. Time for end_boot_allocator() to complete:
        Before    After
- 90GB: 1426 ms   166 ms
-  8GB:  124 ms    12 ms
-  4GB:   60 ms     6 ms

Signed-off-by: Hongyan Xia 
Signed-off-by: Julien Grall 
---
 xen/common/page_alloc.c | 39 +--
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index a1938df1406c..bf852cfc11ea 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -1779,16 +1779,28 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
 
 /*
  * init_contig_heap_pages() is intended to only take pages from the same
- * NUMA node.
+ * NUMA node and zone.
+ *
+ * For the latter, it is always true for !CONFIG_SEPARATE_XENHEAP since
+ * free_heap_pages() can only take power-of-two ranges which never cross
+ * zone boundaries. But for separate xenheap which is manually defined,
+ * it is possible for a power-of-two range to cross zones, so we need to
+ * check that as well.
  */
-static bool is_contig_page(struct page_info *pg, unsigned int nid)
+static bool is_contig_page(struct page_info *pg, unsigned int nid,
+   unsigned int zone)
 {
+#ifdef CONFIG_SEPARATE_XENHEAP
+if ( zone != page_to_zone(pg) )
+return false;
+#endif
+
return (nid == (phys_to_nid(page_to_maddr(pg))));
 }
 
 /*
  * This function should only be called with valid pages from the same NUMA
- * node.
+ * node and the same zone.
  *
  * Callers should use is_contig_page() first to check if all the pages
  * in a range are contiguous.
@@ -1817,8 +1829,22 @@ static void init_contig_heap_pages(struct page_info *pg, unsigned long nr_pages,
 
 while ( s < e )
 {
-free_heap_pages(mfn_to_page(_mfn(s)), 0, need_scrub);
-s += 1UL;
+/*
+ * For s == 0, we simply use the largest increment by checking the
+ * index of the MSB set. For s != 0, we also need to ensure that the
+ * chunk is properly sized to end at power-of-two alignment. We do this
+ * by checking the LSB set and use its index as the increment. Both
+ * cases need to be guarded by MAX_ORDER.
+ *
+ * Note that the value of ffsl() and flsl() starts from 1 so we need
+ * to decrement it by 1.
+ */
+int inc_order = min(MAX_ORDER, flsl(e - s) - 1);
+
+if ( s )
+inc_order = min(inc_order, ffsl(s) - 1);
+free_heap_pages(mfn_to_page(_mfn(s)), inc_order, need_scrub);
+s += (1UL << inc_order);
 }
 }
 
@@ -1856,12 +1882,13 @@ static void init_heap_pages(
 for ( i = 0; i < nr_pages; )
 {
 unsigned int nid = phys_to_nid(page_to_maddr(pg));
+unsigned int zone = page_to_zone(pg);
 unsigned long left = nr_pages - i;
 unsigned long contig_pages;
 
 for ( contig_pages = 1; contig_pages < left; contig_pages++ )
 {
-if ( !is_contig_page(pg + contig_pages, nid) )
+if ( !is_contig_page(pg + contig_pages, nid, zone) )
 break;
 }
 
-- 
2.32.0




[xen-unstable test] 170890: tolerable FAIL - PUSHED

2022-06-09 Thread osstest service owner
flight 170890 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/170890/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-rtds 20 guest-localmigrate/x10   fail  like 170877
 test-amd64-amd64-xl-qemut-win7-amd64 19 guest-stopfail like 170877
 test-armhf-armhf-libvirt 16 saverestore-support-checkfail  like 170877
 test-amd64-amd64-qemuu-nested-amd 20 debian-hvm-install/l1/l2 fail like 170877
 test-amd64-amd64-xl-qemuu-ws16-amd64 19 guest-stopfail like 170877
 test-amd64-i386-xl-qemut-ws16-amd64 19 guest-stop fail like 170877
 test-amd64-i386-xl-qemut-win7-amd64 19 guest-stop fail like 170877
 test-armhf-armhf-libvirt-qcow2 15 saverestore-support-check   fail like 170877
 test-armhf-armhf-libvirt-raw 15 saverestore-support-checkfail  like 170877
 test-amd64-i386-xl-qemuu-win7-amd64 19 guest-stop fail like 170877
 test-amd64-amd64-xl-qemut-ws16-amd64 19 guest-stopfail like 170877
 test-amd64-i386-xl-qemuu-ws16-amd64 19 guest-stop fail like 170877
 test-amd64-amd64-xl-qemuu-win7-amd64 19 guest-stopfail like 170877
 test-amd64-amd64-libvirt 15 migrate-support-checkfail   never pass
 test-amd64-i386-xl-pvshim14 guest-start  fail   never pass
 test-amd64-i386-libvirt-xsm  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  15 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 13 migrate-support-check 
fail never pass
 test-arm64-arm64-xl  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  16 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 14 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-raw  14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 14 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-raw 15 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  14 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  16 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  15 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-qcow2 14 migrate-support-checkfail never pass
 test-armhf-armhf-xl-vhd  14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  15 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 14 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 15 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 16 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  15 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  16 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 15 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 16 saverestore-support-checkfail never pass

version targeted for testing:
 xen  7ac12e3634cc3ed92


Re: [PATCH 33/36] cpuidle,omap3: Use WFI for omap3_pm_idle()

2022-06-09 Thread Tony Lindgren
* Arnd Bergmann  [220608 18:18]:
> On Wed, Jun 8, 2022 at 4:27 PM Peter Zijlstra  wrote:
> >
> > arch_cpu_idle() is a very simple idle interface and exposes only a
> > single idle state and is expected to not require RCU and not do any
> > tracing/instrumentation.
> >
> > As such, omap_sram_idle() is not a valid implementation. Replace it
> > with the simple (shallow) omap3_do_wfi() call. Leaving the more
> > complicated idle states for the cpuidle driver.

Agreed it makes sense to limit deeper idle states to cpuidle. Hopefully
there is some informative splat when attempting to use arch_cpu_idle()
for deeper idle states :)
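
(For reference, the quoted replacement amounts to roughly this minimal
sketch, using only the names from the description above:)

/* Shallow arch idle: just WFI; deeper C-states are left to cpuidle. */
static void omap3_pm_idle(void)
{
    omap3_do_wfi();
}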

> I see similar code in omap2:
> 
> omap2_pm_idle()
>  -> omap2_enter_full_retention()
>  -> omap2_sram_suspend()
> 
> Is that code path safe to use without RCU or does it need a similar change?

Seems like a similar change should be done for omap2. Then anybody who
cares to implement a minimal cpuidle support can do so.

Regards,

Tony



Re: [PATCH v2 3/3] x86/vmx: implement Notify VM Exit

2022-06-09 Thread Xiaoyao Li

On 6/9/2022 3:04 PM, Tian, Kevin wrote:

+Chenyi/Xiaoyao who worked on the KVM support. Presumably
similar opens have been discussed in KVM hence they have the
right background to comment here.


From: Roger Pau Monne 
Sent: Thursday, May 26, 2022 7:12 PM

Under certain conditions guests can get the CPU stuck in an unbounded
loop without the possibility of an interrupt window to occur on
instruction boundary.  This was the case with the scenarios described
in XSA-156.

Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
if no interrupt window occurs for a specified amount of time.  Note
that using the Notify VM Exit avoids having to trap #AC and #DB
exceptions, as Xen is guaranteed to get a VM Exit even if the guest
puts the CPU in a loop without an interrupt window, as such disable
the intercepts if the feature is available and enabled.

Setting the notify VM exit window to 0 is safe because there's a
threshold added by the hardware in order to have a sane window value.

Suggested-by: Andrew Cooper 
Signed-off-by: Roger Pau Monné 
---
Changes since v1:
  - Properly update debug state when using notify VM exit.
  - Reword commit message.
---
This change enables the notify VM exit by default, KVM however doesn't
seem to enable it by default, and there's the following note in the
commit message:

"- There's a possibility, however small, that a notify VM exit happens
with VM_CONTEXT_INVALID set in exit qualification. In this case, the
vcpu can no longer run. To avoid killing a well-behaved guest, set
notify window as -1 to disable this feature by default."

It's not obviously clear to me whether the comment was meant to be:
"There's a possibility, however small, that a notify VM exit _wrongly_
happens with VM_CONTEXT_INVALID".

It's also not clear whether such wrong hardware behavior only affects
a specific set of hardware, 


I'm not sure what you mean by a specific set of hardware.

We make it default off in KVM just in case future silicon wrongly sets
the VM_CONTEXT_INVALID bit, because we make the policy that the VM
cannot continue running in that case.


In the worst case, if some future silicon happens to have this kind of
silly bug, then existing product kernels all suffer the possibility of
their VMs being killed because the feature is on by default.



in a way that we could avoid enabling
notify VM exit there.

There's a discussion in one of the Linux patches that 128K might be
the safer value in order to prevent false positives, but I have no
formal confirmation about this.  Maybe our Intel maintainers can
provide some more feedback on a suitable notify VM exit window
value.


The 128k is the internal threshold for SPR silicon. The internal
threshold is tuned by Intel for each silicon, to make sure it's big
enough to avoid false positives even when the user sets
vmcs.notify_window to 0.


However, it varies for different processor generations.

Which value is suitable is hard to say; it depends on how soon the VMM
wants to intercept the VM. Anyway, Intel ensures that even value 0 is
safe.




I've tested with 0 (the proposed default in the patch) and I don't
seem to be able to trigger notify VM exits under normal guest
operation.  Note that even in that case the guest won't be destroyed
unless the context is corrupt.
---






Re: MISRA C meeting tomorrow, was: MOVING COMMUNITY CALL Call for agenda items for 9 June Community Call @ 1500 UTC

2022-06-09 Thread Jan Beulich
On 09.06.2022 03:20, Stefano Stabellini wrote:
> Finally, for Rule 13.2, I updated the link to ECLAIR's results. There
> are a lot more violations than just 4, but I don't know if they are
> valid or false positives.

I've picked just the one case in xen/common/efi/ebmalloc.c to check,
and it says "possibly". That's because evaluating the function call
arguments involves calling (in this case two) further functions. If
those functions had side effects (which the tool apparently can't
figure out), there would indeed be a problem.
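
(A minimal example of the pattern in question, with hypothetical
functions:)

int f(void); /* if f() or g() had persistent side effects... */
int g(void);
void h(int, int);

void example(void)
{
    /*
     * ...then the unspecified evaluation order of the arguments would
     * make this call a Rule 13.2 violation.
     */
    h(f(), g());
}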

The (Arm based) count of almost 10k violations is clearly a concern.
I don't consider it even remotely reasonable to add 10k comments, no
matter how brief, to cover all the false positives.

Jan




RE: [PATCH v2 3/3] x86/vmx: implement Notify VM Exit

2022-06-09 Thread Tian, Kevin
+Chenyi/Xiaoyao who worked on the KVM support. Presumably
similar opens have been discussed in KVM hence they have the
right background to comment here.

> From: Roger Pau Monne 
> Sent: Thursday, May 26, 2022 7:12 PM
> 
> Under certain conditions guests can get the CPU stuck in an unbounded
> loop without the possibility of an interrupt window to occur on
> instruction boundary.  This was the case with the scenarios described
> in XSA-156.
> 
> Make use of the Notify VM Exit mechanism, that will trigger a VM Exit
> if no interrupt window occurs for a specified amount of time.  Note
> that using the Notify VM Exit avoids having to trap #AC and #DB
> exceptions, as Xen is guaranteed to get a VM Exit even if the guest
> puts the CPU in a loop without an interrupt window, as such disable
> the intercepts if the feature is available and enabled.
> 
> Setting the notify VM exit window to 0 is safe because there's a
> threshold added by the hardware in order to have a sane window value.
> 
> Suggested-by: Andrew Cooper 
> Signed-off-by: Roger Pau Monné 
> ---
> Changes since v1:
>  - Properly update debug state when using notify VM exit.
>  - Reword commit message.
> ---
> This change enables the notify VM exit by default, KVM however doesn't
> seem to enable it by default, and there's the following note in the
> commit message:
> 
> "- There's a possibility, however small, that a notify VM exit happens
>with VM_CONTEXT_INVALID set in exit qualification. In this case, the
>vcpu can no longer run. To avoid killing a well-behaved guest, set
>notify window as -1 to disable this feature by default."
> 
> It's not obviously clear to me whether the comment was meant to be:
> "There's a possibility, however small, that a notify VM exit _wrongly_
> happens with VM_CONTEXT_INVALID".
> 
> It's also not clear whether such wrong hardware behavior only affects
> a specific set of hardware, in a way that we could avoid enabling
> notify VM exit there.
> 
> There's a discussion in one of the Linux patches that 128K might be
> the safer value in order to prevent false positives, but I have no
> formal confirmation about this.  Maybe our Intel maintainers can
> provide some more feedback on a suitable notify VM exit window
> value.
> 
> I've tested with 0 (the proposed default in the patch) and I don't
> seem to be able to trigger notify VM exits under normal guest
> operation.  Note that even in that case the guest won't be destroyed
> unless the context is corrupt.
> ---
>  docs/misc/xen-command-line.pandoc   | 11 +
>  xen/arch/x86/hvm/vmx/vmcs.c | 19 +++
>  xen/arch/x86/hvm/vmx/vmx.c  | 32 +++--
>  xen/arch/x86/include/asm/hvm/vmx/vmcs.h |  4 
>  xen/arch/x86/include/asm/hvm/vmx/vmx.h  |  6 +
>  xen/arch/x86/include/asm/perfc_defn.h   |  3 ++-
>  6 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-
> command-line.pandoc
> index 1dc7e1ca07..ccf8bf5806 100644
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -2544,6 +2544,17 @@ guest will notify Xen that it has failed to acquire a
> spinlock.
>  ,  and  must be integers. The values will be
>  encoded in guest CPUID 0x4002 if viridian enlightenments are enabled.
> 
> +### vm-notify-window (Intel)
> +> `= `
> +
> +> Default: `0`
> +
> +Specify the value of the VM Notify window used to detect locked VMs. Set
> to -1
> +to disable the feature.  Value is in units of crystal clock cycles.
> +
> +Note the hardware might add a threshold to the provided value in order to
> make
> +it safe, and hence using 0 is fine.
> +
>  ### vpid (Intel)
>  > `= `
> 
> diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
> index d388e6729c..6cb2c6c6b7 100644
> --- a/xen/arch/x86/hvm/vmx/vmcs.c
> +++ b/xen/arch/x86/hvm/vmx/vmcs.c
> @@ -67,6 +67,9 @@ integer_param("ple_gap", ple_gap);
>  static unsigned int __read_mostly ple_window = 4096;
>  integer_param("ple_window", ple_window);
> 
> +static unsigned int __ro_after_init vm_notify_window;
> +integer_param("vm-notify-window", vm_notify_window);
> +
>  static bool __read_mostly opt_ept_pml = true;
>  static s8 __read_mostly opt_ept_ad = -1;
>  int8_t __read_mostly opt_ept_exec_sp = -1;
> @@ -210,6 +213,7 @@ static void __init vmx_display_features(void)
>  P(cpu_has_vmx_pml, "Page Modification Logging");
>  P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
>  P(cpu_has_vmx_bus_lock_detection, "Bus Lock Detection");
> +P(cpu_has_vmx_notify_vm_exiting, "Notify VM Exit");
>  #undef P
> 
>  if ( !printed )
> @@ -329,6 +333,8 @@ static int vmx_init_vmcs_config(bool bsp)
>  opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
>  if ( opt_ept_pml )
>  opt |= SECONDARY_EXEC_ENABLE_PML;
> +if ( vm_notify_window != ~0u )
> +opt |= SECONDARY_EXEC_NOTIFY_VM_EXITING;
> 
>  /*
>   * "API